Merge branch 'linus' into release

Conflicts: arch/x86/kernel/cpu/cpufreq/longhaul.c Signed-off-by: Len Brown <len.brown@intel.com>
author: Len Brown <len.brown@intel.com> 2009-04-05 02:14:15 -0400
committer: Len Brown <len.brown@intel.com> 2009-04-05 02:14:15 -0400
commit: 478c6a43fcbc6c11609f8cee7c7b57223907754f (patch)
tree: a7f7952099da60d33032aed6de9c0c56c9f8779e /fs/ocfs2
parent: 8a3f257c704e02aee9869decd069a806b45be3f1 (diff)
parent: 6bb597507f9839b13498781e481f5458aea33620 (diff)
34 files changed, 4396 insertions, 694 deletions
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e5..fbeaec762103 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
                                return PTR_ERR(acl);
                }
                if (!acl)
-                        inode->i_mode &= ~current->fs->umask;
+                        inode->i_mode &= ~current_umask();
        }
        if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
                struct posix_acl *clone;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19e3a96aa02c..678a067d9251 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
        .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 };
+/* Store @blkno, converted to little-endian, in the dx root's dr_last_eb_blk. */
+static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
+                                          u64 blkno)
+{
+        struct ocfs2_dx_root_block *root = et->et_object;
+
+        root->dr_last_eb_blk = cpu_to_le64(blkno);
+}
+/* Return the dx root's dr_last_eb_blk converted to CPU byte order. */
+static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+        struct ocfs2_dx_root_block *root = et->et_object;
+        u64 last_eb_blk = le64_to_cpu(root->dr_last_eb_blk);
+
+        return last_eb_blk;
+}
+/*
+ * Add @clusters to the little-endian dr_clusters counter of the dx root
+ * attached to @et. @inode is not used here.
+ */
+static void ocfs2_dx_root_update_clusters(struct inode *inode,
+                                          struct ocfs2_extent_tree *et,
+                                          u32 clusters)
+{
+        struct ocfs2_dx_root_block *root = et->et_object;
+
+        le32_add_cpu(&root->dr_clusters, clusters);
+}
+/*
+ * Sanity check for a dx root extent tree: BUG()s if the object attached
+ * to @et fails OCFS2_IS_VALID_DX_ROOT(), otherwise returns 0. @inode is
+ * not used here.
+ */
+static int ocfs2_dx_root_sanity_check(struct inode *inode,
+                                      struct ocfs2_extent_tree *et)
+{
+        struct ocfs2_dx_root_block *dx_root = et->et_object;
+        BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
+        return 0;
+}
+/* Point @et's root extent list at the dr_list embedded in the dx root. */
+static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
+{
+        struct ocfs2_dx_root_block *root = et->et_object;
+
+        et->et_root_el = &root->dr_list;
+}
+/*
+ * Extent tree operations for trees rooted in a directory index root
+ * block (struct ocfs2_dx_root_block).
+ */
+static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+        .eo_set_last_eb_blk     = ocfs2_dx_root_set_last_eb_blk,
+        .eo_get_last_eb_blk     = ocfs2_dx_root_get_last_eb_blk,
+        .eo_update_clusters     = ocfs2_dx_root_update_clusters,
+        .eo_sanity_check        = ocfs2_dx_root_sanity_check,
+        .eo_fill_root_el        = ocfs2_dx_root_fill_root_el,
+};
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
                                     struct inode *inode,
                                     struct buffer_head *bh,
@@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                 &ocfs2_xattr_value_et_ops);
 }
+/*
+ * Initialize @et for the directory index root held in @bh. This hands
+ * __ocfs2_init_extent_tree() the dx root journal access function
+ * (ocfs2_journal_access_dr), a NULL fifth argument and the dx root
+ * operations table.
+ */
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+                                    struct inode *inode,
+                                    struct buffer_head *bh)
+{
+        __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
+                                 NULL, &ocfs2_dx_root_et_ops);
+}
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                            u64 new_last_eb_blk)
 {
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index cceff5c37f47..353254ba29e1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                        struct inode *inode,
                                        struct ocfs2_xattr_value_buf *vb);
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+                                    struct inode *inode,
+                                    struct buffer_head *bh);
 /*
 * Read an extent block into *bh.  If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8e1709a679b7..b2c52b3a1484 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 }
 const struct address_space_operations ocfs2_aops = {
-        .readpage       = ocfs2_readpage,
+        .readpage               = ocfs2_readpage,
-        .readpages      = ocfs2_readpages,
+        .readpages              = ocfs2_readpages,
-        .writepage      = ocfs2_writepage,
+        .writepage              = ocfs2_writepage,
-        .write_begin    = ocfs2_write_begin,
+        .write_begin            = ocfs2_write_begin,
-        .write_end      = ocfs2_write_end,
+        .write_end              = ocfs2_write_end,
-        .bmap           = ocfs2_bmap,
+        .bmap                   = ocfs2_bmap,
-        .sync_page      = block_sync_page,
+        .sync_page              = block_sync_page,
-        .direct_IO      = ocfs2_direct_IO,
+        .direct_IO              = ocfs2_direct_IO,
-        .invalidatepage = ocfs2_invalidatepage,
+        .invalidatepage         = ocfs2_invalidatepage,
-        .releasepage    = ocfs2_releasepage,
+        .releasepage            = ocfs2_releasepage,
-        .migratepage    = buffer_migrate_page,
+        .migratepage            = buffer_migrate_page,
+        .is_partially_uptodate  = block_is_partially_uptodate,
 };
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 04697ba7f73e..4f85eceab376 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,7 @@
 #include <linux/random.h>
 #include <linux/crc32.h>
 #include <linux/time.h>
+#include <linux/debugfs.h>
 #include "heartbeat.h"
 #include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+#define O2HB_DEBUG_DIR                  "o2hb"
+#define O2HB_DEBUG_LIVENODES            "livenodes"
+static struct dentry *o2hb_debug_dir;
+static struct dentry *o2hb_debug_livenodes;
 static LIST_HEAD(o2hb_all_regions);
 static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
        return 0;
 }
-void o2hb_init(void)
+#ifdef CONFIG_DEBUG_FS
+/*
+ * debugfs open handler: snapshot the live node map into a freshly
+ * allocated page as a space-separated list of node numbers followed by
+ * a newline. The buffer is stashed in file->private_data and its length
+ * is stored as the inode size so the read handler knows how much data
+ * is available.
+ *
+ * Returns 0 on success or -ENOMEM if the page cannot be allocated.
+ */
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+        unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+        char *buf;
+        int i = -1;
+        int out = 0;
+
+        buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;
+
+        o2hb_fill_node_map(map, sizeof(map));
+
+        /*
+         * scnprintf() returns the number of bytes actually stored, so
+         * 'out' can never run past the end of the page the way the
+         * would-be length returned by snprintf() could.
+         */
+        while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+                out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+        out += scnprintf(buf + out, PAGE_SIZE - out, "\n");
+
+        i_size_write(inode, out);
+        file->private_data = buf;
+
+        return 0;
+}
+/* debugfs release handler: free the buffer held in file->private_data. */
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+        kfree(file->private_data);
+        return 0;
+}
+/*
+ * debugfs read handler: copy up to @nbytes from the buffer held in
+ * file->private_data to @buf, starting at *@ppos. The amount of valid
+ * data is taken from the size of the inode backing the file.
+ */
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+                                 size_t nbytes, loff_t *ppos)
+{
+        return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+                                       i_size_read(file->f_mapping->host));
+}
+#else
+/* !CONFIG_DEBUG_FS stub: nothing to set up, always succeeds. */
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+        return 0;
+}
+/* !CONFIG_DEBUG_FS stub: nothing to free, always succeeds. */
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+        return 0;
+}
+/* !CONFIG_DEBUG_FS stub: there is never any data, so report 0 bytes read. */
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+                               size_t nbytes, loff_t *ppos)
+{
+        return 0;
+}
+#endif  /* CONFIG_DEBUG_FS */
+/* File operations for the heartbeat "livenodes" debugfs file. */
+static struct file_operations o2hb_debug_fops = {
+        .open =         o2hb_debug_open,
+        .release =      o2hb_debug_release,
+        .read =         o2hb_debug_read,
+        .llseek =       generic_file_llseek,
+};
+/*
+ * Tear down the heartbeat debugfs file and directory.
+ *
+ * debugfs_remove() ignores a NULL dentry, so no guards are needed. The
+ * pointers are cleared afterwards so that a stale dentry can never be
+ * handed to debugfs_remove() a second time.
+ */
+void o2hb_exit(void)
+{
+        debugfs_remove(o2hb_debug_livenodes);
+        o2hb_debug_livenodes = NULL;
+
+        debugfs_remove(o2hb_debug_dir);
+        o2hb_debug_dir = NULL;
+}
+int o2hb_init(void)
 {
        int i;
@@ -918,6 +994,24 @@ void o2hb_init(void)
        INIT_LIST_HEAD(&o2hb_node_events);
        memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+        o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+        if (!o2hb_debug_dir) {
+                mlog_errno(-ENOMEM);
+                return -ENOMEM;
+        }
+        o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
+                                                   S_IFREG|S_IRUSR,
+                                                   o2hb_debug_dir, NULL,
+                                                   &o2hb_debug_fops);
+        if (!o2hb_debug_livenodes) {
+                mlog_errno(-ENOMEM);
+                debugfs_remove(o2hb_debug_dir);
+                return -ENOMEM;
+        }
+        return 0;
 }
 /* if we're already in a callback then we're already serialized by the sem */
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index e511339886b3..2f1649253b49 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
                              struct o2hb_callback_func *hc);
 void o2hb_fill_node_map(unsigned long *map,
                        unsigned bytes);
-void o2hb_init(void);
+void o2hb_exit(void);
+int o2hb_init(void);
 int o2hb_check_node_heartbeating(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 70e8fa9e2539..7ee6188bc79a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
        o2cb_sys_shutdown();
        o2net_exit();
+        o2hb_exit();
 }
 static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
        cluster_print_version();
-        o2hb_init();
+        ret = o2hb_init();
+        if (ret)
+                goto out;
        ret = o2net_init();
        if (ret)
-                goto out;
+                goto out_o2hb;
        ret = o2net_register_hb_callbacks();
        if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
        o2net_unregister_hb_callbacks();
 out_o2net:
        o2net_exit();
+out_o2hb:
+        o2hb_exit();
 out:
        return ret;
 }
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index e9d7c2038c0f..7d604480557a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -455,7 +455,7 @@ out_move:
        d_move(dentry, target);
 }
-struct dentry_operations ocfs2_dentry_ops = {
+const struct dentry_operations ocfs2_dentry_ops = {
        .d_revalidate           = ocfs2_dentry_revalidate,
        .d_iput                 = ocfs2_dentry_iput,
 };
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index d06e16c06640..faa12e75f98d 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -26,7 +26,7 @@
 #ifndef OCFS2_DCACHE_H
 #define OCFS2_DCACHE_H
-extern struct dentry_operations ocfs2_dentry_ops;
+extern const struct dentry_operations ocfs2_dentry_ops;
 struct ocfs2_dentry_lock {
        /* Use count of dentry lock */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2c4098cf337..e71160cda110 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/quotaops.h>
+#include <linux/sort.h>
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -58,6 +59,7 @@
 #include "namei.h"
 #include "suballoc.h"
 #include "super.h"
+#include "sysfile.h"
 #include "uptodate.h"
 #include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
-static int ocfs2_extend_dir(struct ocfs2_super *osb,
-                            struct inode *dir,
-                            struct buffer_head *parent_fe_bh,
-                            unsigned int blocks_wanted,
-                            struct buffer_head **new_de_bh);
 static int ocfs2_do_extend_dir(struct super_block *sb,
                               handle_t *handle,
                               struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                               struct ocfs2_alloc_context *data_ac,
                               struct ocfs2_alloc_context *meta_ac,
                               struct buffer_head **new_bh);
+static int ocfs2_dir_indexed(struct inode *inode);
 /*
 * These are distinct checks because future versions of the file system will
 * want to have a trailing dirent structure independent of indexing.
 */
-static int ocfs2_dir_has_trailer(struct inode *dir)
+static int ocfs2_supports_dir_trailer(struct inode *dir)
 {
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;
-        return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+        return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
 }
-static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+/*
+ * "new" here refers to the point at which we're creating a new
+ * directory via "mkdir()", but also when we're expanding an inline
+ * directory. In either case, we don't yet have the indexing bit set
+ * on the directory, so the standard checks will fail when metaecc
+ * is turned off. Only directory-initialization type functions should
+ * use this then. Everything else wants ocfs2_supports_dir_trailer()
+ */
+static int ocfs2_new_dir_wants_trailer(struct inode *dir)
 {
-        return ocfs2_meta_ecc(osb);
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        return ocfs2_meta_ecc(osb) ||
+                ocfs2_supports_indexed_dirs(osb);
 }
 static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 {
        unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
-        if (!ocfs2_dir_has_trailer(dir))
+        if (!ocfs2_supports_dir_trailer(dir))
                return 0;
        if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 }
 static void ocfs2_init_dir_trailer(struct inode *inode,
-                                   struct buffer_head *bh)
+                                   struct buffer_head *bh, u16 rec_len)
 {
        struct ocfs2_dir_block_trailer *trailer;
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
                        cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
        trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
        trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
+        trailer->db_free_rec_len = cpu_to_le16(rec_len);
+}
+/*
+ * Link an unindexed block with a dir trailer structure into the index free
+ * list. This function will modify dirdata_bh, but assumes you've already
+ * passed it to the journal.
+ */
+static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
+                                     struct buffer_head *dx_root_bh,
+                                     struct buffer_head *dirdata_bh)
+{
+        struct ocfs2_dx_root_block *root;
+        struct ocfs2_dir_block_trailer *blk_trailer;
+        int status;
+
+        /* The index root is about to change, so get journal write access. */
+        status = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status) {
+                mlog_errno(status);
+                return status;
+        }
+
+        root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        blk_trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+
+        /* Push this block onto the head of the root's free list. */
+        blk_trailer->db_free_next = root->dr_free_blk;
+        root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+
+        ocfs2_journal_dirty(handle, dx_root_bh);
+
+        return status;
+}
+/* Returns non-zero when the lookup result carries no previous leaf buffer. */
+static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
+{
+        return !res->dl_prev_leaf_bh;
+}
+/*
+ * Drop the buffer head references held by a directory lookup result:
+ * the dx root, the leaf block, the dx leaf and the previous leaf.
+ */
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
+{
+        brelse(res->dl_dx_root_bh);
+        brelse(res->dl_leaf_bh);
+        brelse(res->dl_dx_leaf_bh);
+        brelse(res->dl_prev_leaf_bh);
+}
+/* Returns 1 if the inode has OCFS2_INDEXED_DIR_FL set, 0 otherwise. */
+static int ocfs2_dir_indexed(struct inode *inode)
+{
+        return !!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL);
+}
+/*
+ * Returns non-zero (the raw flag bit, not normalized to 1) when the dx
+ * root has OCFS2_DX_FLAG_INLINE set.
+ */
+static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
+{
+        return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
+}
+/*
+ * Hashing code adapted from ext3
+ */
+#define DELTA 0x9E3779B9
+/*
+ * Mix the four input words in[0..3] into the running hash state.
+ * Sixteen rounds are run; only buf[0] and buf[1] are read and updated,
+ * buf[2] and buf[3] are left untouched.
+ */
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+        __u32   sum = 0;
+        __u32   b0 = buf[0], b1 = buf[1];
+        __u32   a = in[0], b = in[1], c = in[2], d = in[3];
+        int     n = 16;
+        do {
+                sum += DELTA;
+                b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+                b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+        } while (--n);
+        buf[0] += b0;
+        buf[1] += b1;
+}
+/*
+ * Pack up to num * 4 bytes of @msg into @num 32-bit words at @buf, four
+ * bytes per word. A partially filled word starts from a padding value
+ * derived from @len, and any words left over are set to that padding
+ * value.
+ *
+ * NOTE(review): msg[i] is a plain char, so bytes >= 0x80 presumably
+ * contribute differently depending on the platform's char signedness -
+ * confirm before relying on identical hashes across architectures.
+ */
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+        __u32   pad, val;
+        int     i;
+        /* Replicate the low bits of len into every byte of the pad word. */
+        pad = (__u32)len | ((__u32)len << 8);
+        pad |= pad << 16;
+        val = pad;
+        /* Never consume more input than fits in num words. */
+        if (len > num*4)
+                len = num * 4;
+        for (i = 0; i < len; i++) {
+                if ((i % 4) == 0)
+                        val = pad;
+                val = msg[i] + (val << 8);
+                /* Every fourth byte completes a word: emit it. */
+                if ((i % 4) == 3) {
+                        *buf++ = val;
+                        val = pad;
+                        num--;
+                }
+        }
+        /* Emit the trailing partial word (or a pad word), then fill with pad. */
+        if (--num >= 0)
+                *buf++ = val;
+        while (--num >= 0)
+                *buf++ = pad;
+}
+/*
+ * Compute the index hash of the @len byte @name and store it in
+ * @hinfo->major_hash / @hinfo->minor_hash.
+ *
+ * "." and ".." always hash to 0. Every other name starts from the
+ * per-superblock seed (osb_dx_seed) and is folded in 16 bytes at a time
+ * via str2hashbuf() and TEA_transform().
+ */
+static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
+                                   struct ocfs2_dx_hinfo *hinfo)
+{
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        const char      *p;
+        __u32           in[8], buf[4];
+        /*
+         * XXX: Is this really necessary, if the index is never looked
+         * at by readdir? Is a hash value of '0' a bad idea?
+         */
+        if ((len == 1 && !strncmp(".", name, 1)) ||
+            (len == 2 && !strncmp("..", name, 2))) {
+                buf[0] = buf[1] = 0;
+                goto out;
+        }
+#ifdef OCFS2_DEBUG_DX_DIRS
+        /*
+         * This makes it very easy to debug indexing problems. We
+         * should never allow this to be selected without hand editing
+         * this file though.
+         */
+        buf[0] = buf[1] = len;
+        goto out;
+#endif
+        memcpy(buf, osb->osb_dx_seed, sizeof(buf));
+        p = name;
+        /* Consume the name in 16-byte (four word) chunks. */
+        while (len > 0) {
+                str2hashbuf(p, len, in, 4);
+                TEA_transform(buf, in);
+                len -= 16;
+                p += 16;
+        }
+out:
+        hinfo->major_hash = buf[0];
+        hinfo->minor_hash = buf[1];
 }
 /*
@@ -312,6 +470,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
 }
 /*
+ * Validate a directory trailer.
+ *
+ * We check the trailer here rather than in ocfs2_validate_dir_block()
+ * because that function doesn't have the inode to test.
+ */
+/*
+ * Check the trailer of the directory block in @bh against @dir: the
+ * trailer signature must be valid, db_blkno must match the block's own
+ * number and db_parent_dinode must match the directory's inode block.
+ *
+ * Returns 0 if the trailer is consistent, or -EINVAL after reporting
+ * the problem through ocfs2_error().
+ */
+static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
+{
+        int rc = 0;
+        struct ocfs2_dir_block_trailer *trailer;
+
+        trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
+        if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+                rc = -EINVAL;
+                ocfs2_error(dir->i_sb,
+                            "Invalid dirblock #%llu: "
+                            "signature = %.*s\n",
+                            (unsigned long long)bh->b_blocknr, 7,
+                            trailer->db_signature);
+                goto out;
+        }
+        if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
+                rc = -EINVAL;
+                ocfs2_error(dir->i_sb,
+                            "Directory block #%llu has an invalid "
+                            "db_blkno of %llu",
+                            (unsigned long long)bh->b_blocknr,
+                            (unsigned long long)le64_to_cpu(trailer->db_blkno));
+                goto out;
+        }
+        if (le64_to_cpu(trailer->db_parent_dinode) !=
+            OCFS2_I(dir)->ip_blkno) {
+                rc = -EINVAL;
+                /* Report the offending parent pointer, not db_blkno. */
+                ocfs2_error(dir->i_sb,
+                            "Directory block #%llu on dinode "
+                            "#%llu has an invalid parent_dinode "
+                            "of %llu",
+                            (unsigned long long)bh->b_blocknr,
+                            (unsigned long long)OCFS2_I(dir)->ip_blkno,
+                            (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
+                goto out;
+        }
+out:
+        return rc;
+}
+/*
 * This function forces all errors to -EIO for consistency with its
 * predecessor, ocfs2_bread().  We haven't audited what returning the
 * real error codes would do to callers.  We log the real codes with
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 {
        int rc = 0;
        struct buffer_head *tmp = *bh;
-        struct ocfs2_dir_block_trailer *trailer;
        rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
                                    ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
                goto out;
        }
-        /*
-         * We check the trailer here rather than in
-         * ocfs2_validate_dir_block() because that function doesn't have
-         * the inode to test.
-         */
        if (!(flags & OCFS2_BH_READAHEAD) &&
-            ocfs2_dir_has_trailer(inode)) {
+            ocfs2_supports_dir_trailer(inode)) {
-                trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+                rc = ocfs2_check_dir_trailer(inode, tmp);
-                if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+                if (rc) {
-                        rc = -EINVAL;
+                        if (!*bh)
-                        ocfs2_error(inode->i_sb,
+                                brelse(tmp);
-                                    "Invalid dirblock #%llu: "
+                        mlog_errno(rc);
-                                    "signature = %.*s\n",
-                                    (unsigned long long)tmp->b_blocknr, 7,
-                                    trailer->db_signature);
-                        goto out;
-                }
-                if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
-                        rc = -EINVAL;
-                        ocfs2_error(inode->i_sb,
-                                    "Directory block #%llu has an invalid "
-                                    "db_blkno of %llu",
-                                    (unsigned long long)tmp->b_blocknr,
-                                    (unsigned long long)le64_to_cpu(trailer->db_blkno));
-                        goto out;
-                }
-                if (le64_to_cpu(trailer->db_parent_dinode) !=
-                    OCFS2_I(inode)->ip_blkno) {
-                        rc = -EINVAL;
-                        ocfs2_error(inode->i_sb,
-                                    "Directory block #%llu on dinode "
-                                    "#%llu has an invalid parent_dinode "
-                                    "of %llu",
-                                    (unsigned long long)tmp->b_blocknr,
-                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                                    (unsigned long long)le64_to_cpu(trailer->db_blkno));
                        goto out;
                }
        }
@@ -379,6 +553,141 @@ out:
        return rc ? -EIO : 0;
 }
+/*
+ * Read the block at 'phys' which belongs to this directory
+ * inode. This function does no virtual->physical block translation -
+ * what's passed in is assumed to be a valid directory block.
+ */
+static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
+                                       struct buffer_head **bh)
+{
+        struct buffer_head *blk_bh = *bh;
+        int status;
+
+        status = ocfs2_read_block(dir, phys, &blk_bh,
+                                  ocfs2_validate_dir_block);
+        if (status) {
+                mlog_errno(status);
+                return status;
+        }
+
+        if (ocfs2_supports_dir_trailer(dir)) {
+                status = ocfs2_check_dir_trailer(dir, blk_bh);
+                if (status) {
+                        /* Only drop the buffer if the caller did not supply it. */
+                        if (!*bh)
+                                brelse(blk_bh);
+                        mlog_errno(status);
+                        return status;
+                }
+        }
+
+        /* Hand a newly read buffer back to the caller. */
+        if (!*bh)
+                *bh = blk_bh;
+
+        return status;
+}
+/*
+ * Validation callback for a directory index root block. The buffer must
+ * already be uptodate. Returns the ocfs2_validate_meta_ecc() error if
+ * the checksum fails, -EINVAL if the signature is wrong, and 0
+ * otherwise.
+ */
+static int ocfs2_validate_dx_root(struct super_block *sb,
+                                  struct buffer_head *bh)
+{
+        int ret;
+        struct ocfs2_dx_root_block *dx_root;
+        BUG_ON(!buffer_uptodate(bh));
+        dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
+        ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
+        if (ret) {
+                mlog(ML_ERROR,
+                     "Checksum failed for dir index root block %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+                return ret;
+        }
+        if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
+                ocfs2_error(sb,
+                            "Dir Index Root # %llu has bad signature %.*s",
+                            (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+                            7, dx_root->dr_signature);
+                return -EINVAL;
+        }
+        return 0;
+}
+/*
+ * Read and validate the index root block named by @di->i_dx_root. If
+ * *@dx_root_bh is NULL on entry, it is set to the buffer that was read.
+ */
+static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
+                              struct buffer_head **dx_root_bh)
+{
+        struct buffer_head *root_bh = *dx_root_bh;
+        int status;
+
+        status = ocfs2_read_block(dir, le64_to_cpu(di->i_dx_root), &root_bh,
+                                  ocfs2_validate_dx_root);
+        if (status)
+                return status;
+
+        /* A bh obtained by ocfs2_read_block() is passed up to the caller. */
+        if (!*dx_root_bh)
+                *dx_root_bh = root_bh;
+
+        return 0;
+}
+/*
+ * Validation callback for a directory index leaf block. The buffer must
+ * already be uptodate. Returns the ocfs2_validate_meta_ecc() error if
+ * the checksum fails, -EROFS if the signature is wrong, and 0 otherwise.
+ *
+ * NOTE(review): the bad-signature code here is -EROFS while
+ * ocfs2_validate_dx_root() returns -EINVAL for the same condition -
+ * confirm whether the difference is intentional.
+ */
+static int ocfs2_validate_dx_leaf(struct super_block *sb,
+                                  struct buffer_head *bh)
+{
+        int ret;
+        struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
+        BUG_ON(!buffer_uptodate(bh));
+        ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
+        if (ret) {
+                mlog(ML_ERROR,
+                     "Checksum failed for dir index leaf block %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+                return ret;
+        }
+        if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
+                ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
+                            7, dx_leaf->dl_signature);
+                return -EROFS;
+        }
+        return 0;
+}
+/*
+ * Read and validate the index leaf block at @blkno. If *@dx_leaf_bh is
+ * NULL on entry, it is set to the buffer that was read.
+ */
+static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
+                              struct buffer_head **dx_leaf_bh)
+{
+        struct buffer_head *leaf_bh = *dx_leaf_bh;
+        int status;
+
+        status = ocfs2_read_block(dir, blkno, &leaf_bh,
+                                  ocfs2_validate_dx_leaf);
+        if (status)
+                return status;
+
+        /* A bh obtained by ocfs2_read_block() is passed up to the caller. */
+        if (!*dx_leaf_bh)
+                *dx_leaf_bh = leaf_bh;
+
+        return 0;
+}
+/*
+ * Read a series of dx_leaf blocks. This expects all buffer_head
+ * pointers to be NULL on function entry.
+ */
+static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
+                                struct buffer_head **dx_leaf_bhs)
+{
+        int ret;
+        /* Read @num blocks starting at @start, validating each as a dx leaf. */
+        ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
+                                ocfs2_validate_dx_leaf);
+        if (ret)
+                mlog_errno(ret);
+        return ret;
+}
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
                                               struct inode *dir,
                                               struct ocfs2_dir_entry **res_dir)
@@ -480,39 +789,340 @@ cleanup_and_exit:
        return ret;
 }
+/*
+ * Find the extent record of a directory index btree which covers
+ * 'major_hash'.
+ *
+ * If 'el' is not a leaf list, ocfs2_find_leaf() is used to get to the
+ * leaf extent block first. The leaf's records are then scanned from
+ * the last used one backwards for the first record whose e_cpos is
+ * less than or equal to 'major_hash'.
+ *
+ * On success zero is returned and each non-NULL out-parameter is
+ * filled from that record: 'ret_cpos' (e_cpos), 'ret_phys_blkno'
+ * (e_blkno) and 'ret_clen' (e_leaf_clusters). -EROFS is returned,
+ * after reporting through ocfs2_error(), when the leaf has a non-zero
+ * tree depth or when no record matches. Errors from ocfs2_find_leaf()
+ * are passed through.
+ */
+static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
+                                   struct ocfs2_extent_list *el,
+                                   u32 major_hash,
+                                   u32 *ret_cpos,
+                                   u64 *ret_phys_blkno,
+                                   unsigned int *ret_clen)
+{
+        int ret = 0, i, found;
+        struct buffer_head *eb_bh = NULL;
+        struct ocfs2_extent_block *eb;
+        struct ocfs2_extent_rec *rec = NULL;
+        if (el->l_tree_depth) {
+                ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+                el = &eb->h_list;
+                if (el->l_tree_depth) {
+                        ocfs2_error(inode->i_sb,
+                                    "Inode %lu has non zero tree depth in "
+                                    "btree tree block %llu\n", inode->i_ino,
+                                    (unsigned long long)eb_bh->b_blocknr);
+                        ret = -EROFS;
+                        goto out;
+                }
+        }
+        found = 0;
+        for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+                rec = &el->l_recs[i];
+                if (le32_to_cpu(rec->e_cpos) <= major_hash) {
+                        found = 1;
+                        break;
+                }
+        }
+        if (!found) {
+                /*
+                 * 'rec' is still NULL if the list held no records at
+                 * all, so it must not be dereferenced for the error
+                 * message in that case.
+                 */
+                if (rec)
+                        ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+                                    "record (%u, %u, 0) in btree", inode->i_ino,
+                                    le32_to_cpu(rec->e_cpos),
+                                    ocfs2_rec_clusters(el, rec));
+                else
+                        ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+                                    "extent list in btree", inode->i_ino);
+                ret = -EROFS;
+                goto out;
+        }
+        if (ret_phys_blkno)
+                *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
+        if (ret_cpos)
+                *ret_cpos = le32_to_cpu(rec->e_cpos);
+        if (ret_clen)
+                *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
+out:
+        brelse(eb_bh);
+        return ret;
+}
+/*
+ * Returns the index of the block, counted from the start of its
+ * cluster, which this minor hash belongs to.
+ */
+static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+                                                   u32 minor_hash)
+{
+        unsigned int blk_idx = minor_hash & osb->osb_dx_mask;
+        return blk_idx;
+}
+/* Same as __ocfs2_dx_dir_hash_idx(), using the minor hash of 'hinfo'. */
+static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+                                          struct ocfs2_dx_hinfo *hinfo)
+{
+        u32 minor = hinfo->minor_hash;
+        return __ocfs2_dx_dir_hash_idx(osb, minor);
+}
+/*
+ * Translate the hash in 'hinfo' into the physical block which should
+ * hold its index entry.
+ *
+ * The extent record covering the major hash is looked up first. The
+ * major hash then selects the cluster inside that record (the last
+ * cluster if the hash lies at or past the end of the record), and the
+ * lower bits of the minor hash select the block inside the cluster.
+ *
+ * 'ret_cpos' and 'ret_phys_blkno' are optional and may be NULL.
+ */
+static int ocfs2_dx_dir_lookup(struct inode *inode,
+                               struct ocfs2_extent_list *el,
+                               struct ocfs2_dx_hinfo *hinfo,
+                               u32 *ret_cpos,
+                               u64 *ret_phys_blkno)
+{
+        int status;
+        unsigned int uninitialized_var(clen);
+        unsigned int cluster_off;
+        u32 uninitialized_var(cpos);
+        u64 uninitialized_var(blkno);
+        u32 major = hinfo->major_hash;
+        status = ocfs2_dx_dir_lookup_rec(inode, el, major, &cpos, &blkno,
+                                         &clen);
+        if (status) {
+                mlog_errno(status);
+                return status;
+        }
+        /* A hash beyond the end of the record lands in its last cluster. */
+        if (major >= cpos + clen)
+                cluster_off = clen - 1;
+        else
+                cluster_off = major - cpos;
+        blkno += ocfs2_clusters_to_blocks(inode->i_sb, cluster_off);
+        cpos += cluster_off;
+        /*
+         * This is the cluster which should hold our entry. The lower
+         * bits of the hash give the exact block to search, counted
+         * from the start of that cluster.
+         */
+        blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
+        if (ret_phys_blkno)
+                *ret_phys_blkno = blkno;
+        if (ret_cpos)
+                *ret_cpos = cpos;
+        return 0;
+}
+/*
+ * Look 'name' up through the directory index rooted at 'dx_root'.
+ *
+ * The name is hashed into res->dl_hinfo. The index entries scanned are
+ * those stored in the root block when the root is inline, otherwise
+ * those of the dx_leaf block which the hash maps to. Each index entry
+ * whose major and minor hash both match names an unindexed directory
+ * block, which is read and searched for the name itself.
+ *
+ * Returns zero with the dl_leaf_bh, dl_entry, dl_dx_leaf_bh and
+ * dl_dx_entry fields of 'res' filled in, -ENOENT if the name was not
+ * found, or another negative error. On any non-zero return the dx leaf
+ * and dirent buffer_heads are released here.
+ */
+static int ocfs2_dx_dir_search(const char *name, int namelen,
+                               struct inode *dir,
+                               struct ocfs2_dx_root_block *dx_root,
+                               struct ocfs2_dir_lookup_result *res)
+{
+        int ret, idx, num_used, found = 0;
+        u64 uninitialized_var(phys);
+        struct buffer_head *leaf_bh = NULL;
+        struct buffer_head *dirent_bh = NULL;
+        struct ocfs2_dx_leaf *dx_leaf;
+        struct ocfs2_dx_entry *dx_entry = NULL;
+        struct ocfs2_dir_entry *dirent = NULL;
+        struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
+        struct ocfs2_dx_entry_list *entries;
+        ocfs2_dx_dir_name_hash(dir, name, namelen, hinfo);
+        if (ocfs2_dx_root_inline(dx_root)) {
+                /* The index entries live in the root block itself. */
+                entries = &dx_root->dr_entries;
+        } else {
+                ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, hinfo,
+                                          NULL, &phys);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
+                     "returns: %llu\n",
+                     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+                     namelen, name, hinfo->major_hash, hinfo->minor_hash,
+                     (unsigned long long)phys);
+                ret = ocfs2_read_dx_leaf(dir, phys, &leaf_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                dx_leaf = (struct ocfs2_dx_leaf *) leaf_bh->b_data;
+                mlog(0, "leaf info: num_used: %d, count: %d\n",
+                     le16_to_cpu(dx_leaf->dl_list.de_num_used),
+                     le16_to_cpu(dx_leaf->dl_list.de_count));
+                entries = &dx_leaf->dl_list;
+        }
+        /*
+         * An empty entry list is legal, so it is not treated as an
+         * error; the loop below simply does not run.
+         */
+        num_used = le16_to_cpu(entries->de_num_used);
+        for (idx = 0; idx < num_used; idx++) {
+                dx_entry = &entries->de_entries[idx];
+                if (le32_to_cpu(dx_entry->dx_major_hash) != hinfo->major_hash ||
+                    le32_to_cpu(dx_entry->dx_minor_hash) != hinfo->minor_hash)
+                        continue;
+                /*
+                 * The hashes match, so search the unindexed leaf block
+                 * this entry points at. There is no guarantee that the
+                 * name is actually in there.
+                 */
+                ret = ocfs2_read_dir_block_direct(dir,
+                                          le64_to_cpu(dx_entry->dx_dirent_blk),
+                                          &dirent_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                /*
+                 * XXX: The unindexed block should be checked here,
+                 * before it is used.
+                 */
+                found = ocfs2_search_dirblock(dirent_bh, dir, name, namelen,
+                                              0, dirent_bh->b_data,
+                                              dir->i_sb->s_blocksize, &dirent);
+                if (found == 1)
+                        break;
+                if (found == -1) {
+                        /* A bad directory entry was found. */
+                        ret = -EIO;
+                        mlog_errno(ret);
+                        goto out;
+                }
+                /* Not in this block; drop it and try the next match. */
+                brelse(dirent_bh);
+                dirent_bh = NULL;
+        }
+        if (found <= 0) {
+                ret = -ENOENT;
+                goto out;
+        }
+        res->dl_leaf_bh = dirent_bh;
+        res->dl_entry = dirent;
+        res->dl_dx_leaf_bh = leaf_bh;
+        res->dl_dx_entry = dx_entry;
+        ret = 0;
+out:
+        if (ret) {
+                brelse(leaf_bh);
+                brelse(dirent_bh);
+        }
+        return ret;
+}
+/*
+ * Find 'name' in an indexed directory.
+ *
+ * Reads the directory's inode block and its dx root block, then
+ * searches the index with ocfs2_dx_dir_search(). On success the dx
+ * root buffer_head is stored in lookup->dl_dx_root_bh instead of being
+ * released here. -ENOENT is returned without being logged.
+ */
+static int ocfs2_find_entry_dx(const char *name, int namelen,
+                               struct inode *dir,
+                               struct ocfs2_dir_lookup_result *lookup)
+{
+        int status;
+        struct buffer_head *inode_bh = NULL;
+        struct buffer_head *root_bh = NULL;
+        struct ocfs2_dinode *di;
+        struct ocfs2_dx_root_block *dx_root;
+        status = ocfs2_read_inode_block(dir, &inode_bh);
+        if (status) {
+                mlog_errno(status);
+                goto out;
+        }
+        di = (struct ocfs2_dinode *)inode_bh->b_data;
+        status = ocfs2_read_dx_root(dir, di, &root_bh);
+        if (status) {
+                mlog_errno(status);
+                goto out;
+        }
+        dx_root = (struct ocfs2_dx_root_block *) root_bh->b_data;
+        status = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
+        if (status == 0) {
+                /* The root buffer now belongs to the lookup result. */
+                lookup->dl_dx_root_bh = root_bh;
+                root_bh = NULL;
+        } else if (status != -ENOENT) {
+                mlog_errno(status);
+        }
+out:
+        brelse(inode_bh);
+        brelse(root_bh);
+        return status;
+}
 /*
 * Try to find an entry of the provided name within 'dir'.
 *
- * If nothing was found, NULL is returned. Otherwise, a buffer_head
+ * If nothing was found, -ENOENT is returned. Otherwise, zero is
- * and pointer to the dir entry are passed back.
+ * returned and the struct 'res' will contain information useful to
+ * other directory manipulation functions.
 *
 * Caller can NOT assume anything about the contents of the
- * buffer_head - it is passed back only so that it can be passed into
+ * buffer_heads - they are passed back only so that they can be passed
- * any one of the manipulation functions (add entry, delete entry,
+ * into any one of the manipulation functions (add entry, delete
- * etc). As an example, bh in the extent directory case is a data
+ * entry, etc). As an example, bh in the extent directory case is a
- * block, in the inline-data case it actually points to an inode.
+ * data block, in the inline-data case it actually points to an inode,
+ * in the indexed directory case, multiple buffers are involved.
 */
-struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
+int ocfs2_find_entry(const char *name, int namelen,
-                                     struct inode *dir,
+                     struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
-                                     struct ocfs2_dir_entry **res_dir)
 {
-        *res_dir = NULL;
+        struct buffer_head *bh;
+        struct ocfs2_dir_entry *res_dir = NULL;
+        if (ocfs2_dir_indexed(dir))
+                return ocfs2_find_entry_dx(name, namelen, dir, lookup);
+        /*
+         * The unindexed dir code only uses part of the lookup
+         * structure, so there's no reason to push it down further
+         * than this.
+         */
        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-                return ocfs2_find_entry_id(name, namelen, dir, res_dir);
+                bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
+        else
+                bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
+        if (bh == NULL)
+                return -ENOENT;
-        return ocfs2_find_entry_el(name, namelen, dir, res_dir);
+        lookup->dl_leaf_bh = bh;
+        lookup->dl_entry = res_dir;
+        return 0;
 }
 /*
 * Update inode number and type of a previously found directory entry.
 */
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-                       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+                       struct ocfs2_dir_lookup_result *res,
                       struct inode *new_entry_inode)
 {
        int ret;
        ocfs2_journal_access_func access = ocfs2_journal_access_db;
+        struct ocfs2_dir_entry *de = res->dl_entry;
+        struct buffer_head *de_bh = res->dl_leaf_bh;
        /*
         * The same code works fine for both inline-data and extent
@@ -538,6 +1148,10 @@ out:
        return ret;
 }
+/*
+ * __ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
 static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
                                struct ocfs2_dir_entry *de_del,
                                struct buffer_head *bh, char *first_de,
@@ -587,6 +1201,181 @@ bail:
        return status;
 }
+/*
+ * Size of the hole offered by a dirent: the whole record if its inode
+ * field is zero, otherwise whatever rec_len leaves over beyond
+ * OCFS2_DIR_REC_LEN() of the name.
+ */
+static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
+{
+        unsigned int hole = le16_to_cpu(de->rec_len);
+        if (le64_to_cpu(de->inode) != 0)
+                hole -= OCFS2_DIR_REC_LEN(de->name_len);
+        return hole;
+}
+/*
+ * Walk the dirents of a directory block, up to the trailer offset, and
+ * return the size of the largest hole found. Zero is returned when
+ * that hole is smaller than OCFS2_DIR_MIN_REC_LEN.
+ */
+static int ocfs2_find_max_rec_len(struct super_block *sb,
+                                  struct buffer_head *dirblock_bh)
+{
+        int hole, best = 0;
+        int trailer_off;
+        char *cur = dirblock_bh->b_data;
+        char *trailer, *end;
+        struct ocfs2_dir_entry *de;
+        trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
+        trailer_off = ocfs2_dir_trailer_blk_off(sb);
+        end = cur + trailer_off;
+        do {
+                de = (struct ocfs2_dir_entry *)cur;
+                if (cur != trailer) {
+                        hole = ocfs2_figure_dirent_hole(de);
+                        if (hole > best)
+                                best = hole;
+                }
+                /* Step to the next dirent. */
+                cur += le16_to_cpu(de->rec_len);
+        } while (cur < end);
+        return (best >= OCFS2_DIR_MIN_REC_LEN) ? best : 0;
+}
+/*
+ * Remove the entry at 'index' from 'entry_list'. Later entries are
+ * moved down by one slot, the vacated last slot is zeroed and
+ * de_num_used is decremented.
+ */
+static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
+                                       int index)
+{
+        int num_used = le16_to_cpu(entry_list->de_num_used);
+        int last = num_used - 1;
+        /* Nothing needs to move when the last used entry is removed. */
+        if (num_used != 1 && index != last)
+                memmove(&entry_list->de_entries[index],
+                        &entry_list->de_entries[index + 1],
+                        (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
+        memset(&entry_list->de_entries[last], 0,
+               sizeof(struct ocfs2_dx_entry));
+        entry_list->de_num_used = cpu_to_le16(last);
+}
+/*
+ * Remove the entry described by 'lookup' from an indexed directory.
+ *
+ * Both the dirent in the unindexed leaf block (lookup->dl_leaf_bh) and
+ * its index entry (lookup->dl_dx_entry) are removed, the entry count
+ * in the dx root is decremented and the free space accounting in the
+ * leaf block's trailer is updated. If the trailer recorded no free
+ * space beforehand, the block is also put at the head of the free list
+ * kept in the dx root.
+ *
+ * Returns zero on success, -EIO if lookup->dl_dx_entry does not point
+ * at a used slot of the entry list, or the error of a failed journal
+ * access or dirent removal.
+ */
+static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
+                                 struct ocfs2_dir_lookup_result *lookup)
+{
+        int ret, index, max_rec_len, add_to_free_list = 0;
+        struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+        struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+        struct ocfs2_dx_leaf *dx_leaf;
+        struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
+        struct ocfs2_dir_block_trailer *trailer;
+        struct ocfs2_dx_root_block *dx_root;
+        struct ocfs2_dx_entry_list *entry_list;
+        /*
+         * This function gets a bit messy because we might have to
+         * modify the root block, regardless of whether the indexed
+         * entries are stored inline.
+         */
+        /*
+         * *Only* set 'entry_list' here, based on where we're looking
+         * for the indexed entries. Later, we might still want to
+         * journal both blocks, based on free list state.
+         */
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        if (ocfs2_dx_root_inline(dx_root)) {
+                entry_list = &dx_root->dr_entries;
+        } else {
+                dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
+                entry_list = &dx_leaf->dl_list;
+        }
+        /* Neither of these is a disk corruption - that should have
+         * been caught by lookup, before we got here. */
+        BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
+        BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
+        /*
+         * Pointer subtraction yields a signed element index, so a
+         * pointer lying before the array is rejected as well as one
+         * past the used entries.
+         */
+        index = dx_entry - entry_list->de_entries;
+        if (index < 0 || index >= le16_to_cpu(entry_list->de_num_used)) {
+                mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
+                     (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
+                     entry_list, dx_entry);
+                return -EIO;
+        }
+        /*
+         * We know that removal of this dirent will leave enough room
+         * for a new one, so add this block to the free list if it
+         * isn't already there.
+         */
+        trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+        if (trailer->db_free_rec_len == 0)
+                add_to_free_list = 1;
+        /*
+         * Add the block holding our index into the journal before
+         * removing the unindexed entry. If we get an error return
+         * from __ocfs2_delete_entry(), then it hasn't removed the
+         * entry yet. Likewise, successful return means we *must*
+         * remove the indexed entry.
+         *
+         * We're also careful to journal the root tree block here as
+         * the entry count needs to be updated. Also, we might be
+         * adding to the start of the free list.
+         */
+        ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        if (!ocfs2_dx_root_inline(dx_root)) {
+                ret = ocfs2_journal_access_dl(handle, dir,
+                                              lookup->dl_dx_leaf_bh,
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        mlog(0, "Dir %llu: delete entry at index: %d\n",
+             (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+        ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
+                                   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /* Record the largest hole the leaf block has after the removal. */
+        max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+        trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+        if (add_to_free_list) {
+                /* Link the block in at the head of the root's free list. */
+                trailer->db_free_next = dx_root->dr_free_blk;
+                dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
+                ocfs2_journal_dirty(handle, dx_root_bh);
+        }
+        /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
+        ocfs2_journal_dirty(handle, leaf_bh);
+        le32_add_cpu(&dx_root->dr_num_entries, -1);
+        ocfs2_journal_dirty(handle, dx_root_bh);
+        ocfs2_dx_list_remove_entry(entry_list, index);
+        if (!ocfs2_dx_root_inline(dx_root))
+                ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
+out:
+        return ret;
+}
 static inline int ocfs2_delete_entry_id(handle_t *handle,
                                        struct inode *dir,
                                        struct ocfs2_dir_entry *de_del,
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
 }
 /*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
+ * Delete a directory entry. Hide the details of directory
- * previous entry
+ * implementation from the caller.
 */
 int ocfs2_delete_entry(handle_t *handle,
                       struct inode *dir,
-                       struct ocfs2_dir_entry *de_del,
+                       struct ocfs2_dir_lookup_result *res)
-                       struct buffer_head *bh)
 {
+        if (ocfs2_dir_indexed(dir))
+                return ocfs2_delete_entry_dx(handle, dir, res);
        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-                return ocfs2_delete_entry_id(handle, dir, de_del, bh);
+                return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
+                                             res->dl_leaf_bh);
-        return ocfs2_delete_entry_el(handle, dir, de_del, bh);
+        return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
+                                     res->dl_leaf_bh);
 }
 /*
@@ -663,18 +1456,166 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
        return 0;
 }
+/*
+ * Copy 'dx_new_entry' into the slot following the last used entry of
+ * 'dx_leaf' and bump de_num_used. No capacity check is made here.
+ */
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+                                          struct ocfs2_dx_entry *dx_new_entry)
+{
+        struct ocfs2_dx_entry_list *list = &dx_leaf->dl_list;
+        int tail = le16_to_cpu(list->de_num_used);
+        list->de_entries[tail] = *dx_new_entry;
+        le16_add_cpu(&list->de_num_used, 1);
+}
+/*
+ * Fill the slot following the last used entry of 'entry_list' with the
+ * hashes from 'hinfo' and the dirent block number, then bump
+ * de_num_used. The slot is zeroed first. No capacity check is made
+ * here.
+ */
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+                                       struct ocfs2_dx_hinfo *hinfo,
+                                       u64 dirent_blk)
+{
+        int slot = le16_to_cpu(entry_list->de_num_used);
+        struct ocfs2_dx_entry *new_entry = &entry_list->de_entries[slot];
+        memset(new_entry, 0, sizeof(*new_entry));
+        new_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
+        new_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
+        new_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
+        le16_add_cpu(&entry_list->de_num_used, 1);
+}
+/*
+ * Insert an index entry for 'hinfo' / 'dirent_blk' into the dx_leaf
+ * block 'dx_leaf_bh'. Journal write access is taken on the block
+ * before it is changed and the block is dirtied afterwards. Returns
+ * zero, or the error of the journal access.
+ */
+static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
+                                      struct ocfs2_dx_hinfo *hinfo,
+                                      u64 dirent_blk,
+                                      struct buffer_head *dx_leaf_bh)
+{
+        struct ocfs2_dx_leaf *leaf;
+        int status = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status) {
+                mlog_errno(status);
+                return status;
+        }
+        leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+        ocfs2_dx_entry_list_insert(&leaf->dl_list, hinfo, dirent_blk);
+        ocfs2_journal_dirty(handle, dx_leaf_bh);
+        return 0;
+}
+/*
+ * Insert an index entry into the entry list stored inside the dx root
+ * block. 'dir' and 'handle' are not used here; no journal calls are
+ * made by this function.
+ */
+static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
+                                        struct ocfs2_dx_hinfo *hinfo,
+                                        u64 dirent_blk,
+                                        struct ocfs2_dx_root_block *dx_root)
+{
+        struct ocfs2_dx_entry_list *inline_list = &dx_root->dr_entries;
+        ocfs2_dx_entry_list_insert(inline_list, hinfo, dirent_blk);
+}
+/*
+ * Add an index entry for the dirent block lookup->dl_leaf_bh, using
+ * the hash in lookup->dl_hinfo.
+ *
+ * The entry goes into the dx root block when the root is inline,
+ * otherwise into lookup->dl_dx_leaf_bh. The root block is journaled in
+ * both cases because its entry count is incremented.
+ */
+static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
+                               struct ocfs2_dir_lookup_result *lookup)
+{
+        int status;
+        struct buffer_head *root_bh = lookup->dl_dx_root_bh;
+        struct ocfs2_dx_root_block *dx_root;
+        status = ocfs2_journal_access_dr(handle, dir, root_bh,
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status) {
+                mlog_errno(status);
+                return status;
+        }
+        dx_root = (struct ocfs2_dx_root_block *)root_bh->b_data;
+        if (!ocfs2_dx_root_inline(dx_root)) {
+                status = __ocfs2_dx_dir_leaf_insert(dir, handle,
+                                                    &lookup->dl_hinfo,
+                                                    lookup->dl_leaf_bh->b_blocknr,
+                                                    lookup->dl_dx_leaf_bh);
+                if (status)
+                        return status;
+        } else {
+                ocfs2_dx_inline_root_insert(dir, handle,
+                                            &lookup->dl_hinfo,
+                                            lookup->dl_leaf_bh->b_blocknr,
+                                            dx_root);
+        }
+        le32_add_cpu(&dx_root->dr_num_entries, 1);
+        ocfs2_journal_dirty(handle, root_bh);
+        return 0;
+}
+/*
+ * Unlink lookup->dl_leaf_bh from the directory's free list.
+ *
+ * Whatever pointed at this block - the dx root if
+ * ocfs2_free_list_at_root() says so, otherwise the trailer of
+ * lookup->dl_prev_leaf_bh - is made to point at the block's successor.
+ * The block's own free list fields are then zeroed and both buffers
+ * are dirtied. No journal access is taken here.
+ */
+static void ocfs2_remove_block_from_free_list(struct inode *dir,
+                                       handle_t *handle,
+                                       struct ocfs2_dir_lookup_result *lookup)
+{
+        struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+        struct buffer_head *prev_bh;
+        struct ocfs2_dir_block_trailer *trailer;
+        trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+        if (!ocfs2_free_list_at_root(lookup)) {
+                struct ocfs2_dir_block_trailer *prev_trailer;
+                prev_bh = lookup->dl_prev_leaf_bh;
+                prev_trailer = ocfs2_trailer_from_bh(prev_bh, dir->i_sb);
+                prev_trailer->db_free_next = trailer->db_free_next;
+        } else {
+                struct ocfs2_dx_root_block *dx_root;
+                prev_bh = lookup->dl_dx_root_bh;
+                dx_root = (struct ocfs2_dx_root_block *)prev_bh->b_data;
+                dx_root->dr_free_blk = trailer->db_free_next;
+        }
+        trailer->db_free_rec_len = cpu_to_le16(0);
+        trailer->db_free_next = cpu_to_le64(0);
+        ocfs2_journal_dirty(handle, prev_bh);
+        ocfs2_journal_dirty(handle, leaf_bh);
+}
+/*
+ * Bring the free space accounting of lookup->dl_leaf_bh up to date.
+ *
+ * This expects that a journal write has been reserved on
+ * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
+ */
+static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
+                                   struct ocfs2_dir_lookup_result *lookup)
+{
+        struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+        struct ocfs2_dir_block_trailer *trailer;
+        /* Walk the leaf block to learn what the new free rec_len is. */
+        int max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+        if (!max_rec_len) {
+                /* No usable hole is left, so take the block off the list. */
+                ocfs2_remove_block_from_free_list(dir, handle, lookup);
+                return;
+        }
+        /*
+         * The block still has room and stays on the free list; only
+         * the rec len accounting in its trailer needs updating.
+         */
+        trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+        trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+        ocfs2_journal_dirty(handle, leaf_bh);
+}
 /* we don't always have a dentry for what we want to add, so people
 * like orphan dir can call this instead.
 *
- * If you pass me insert_bh, I'll skip the search of the other dir
+ * The lookup context must have been filled from
- * blocks and put the record in there.
+ * ocfs2_prepare_dir_for_insert.
 */
 int __ocfs2_add_entry(handle_t *handle,
                      struct inode *dir,
                      const char *name, int namelen,
                      struct inode *inode, u64 blkno,
                      struct buffer_head *parent_fe_bh,
-                      struct buffer_head *insert_bh)
+                      struct ocfs2_dir_lookup_result *lookup)
 {
        unsigned long offset;
        unsigned short rec_len;
@@ -683,6 +1624,7 @@ int __ocfs2_add_entry(handle_t *handle,
        struct super_block *sb = dir->i_sb;
        int retval, status;
        unsigned int size = sb->s_blocksize;
+        struct buffer_head *insert_bh = lookup->dl_leaf_bh;
        char *data_start = insert_bh->b_data;
        mlog_entry_void();
@@ -690,7 +1632,31 @@ int __ocfs2_add_entry(handle_t *handle,
        if (!namelen)
                return -EINVAL;
-        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+        if (ocfs2_dir_indexed(dir)) {
+                struct buffer_head *bh;
+                /*
+                 * An indexed dir may require that we update the free space
+                 * list. Reserve a write to the previous node in the list so
+                 * that we don't fail later.
+                 *
+                 * XXX: This can be either a dx_root_block, or an unindexed
+                 * directory tree leaf block.
+                 */
+                if (ocfs2_free_list_at_root(lookup)) {
+                        bh = lookup->dl_dx_root_bh;
+                        retval = ocfs2_journal_access_dr(handle, dir, bh,
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
+                } else {
+                        bh = lookup->dl_prev_leaf_bh;
+                        retval = ocfs2_journal_access_db(handle, dir, bh,
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
+                }
+                if (retval) {
+                        mlog_errno(retval);
+                        return retval;
+                }
+        } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
                data_start = di->id2.i_data.id_data;
                size = i_size_read(dir);
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
                                status = ocfs2_journal_access_di(handle, dir,
                                                                 insert_bh,
                                                                 OCFS2_JOURNAL_ACCESS_WRITE);
-                        else
+                        else {
                                status = ocfs2_journal_access_db(handle, dir,
                                                                 insert_bh,
-                                                                 OCFS2_JOURNAL_ACCESS_WRITE);
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                if (ocfs2_dir_indexed(dir)) {
+                                        status = ocfs2_dx_dir_insert(dir,
+                                                                handle,
+                                                                lookup);
+                                        if (status) {
+                                                mlog_errno(status);
+                                                goto bail;
+                                        }
+                                }
+                        }
                        /* By now the buffer is marked for journaling */
                        offset += le16_to_cpu(de->rec_len);
                        if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
                        de->name_len = namelen;
                        memcpy(de->name, name, namelen);
+                        if (ocfs2_dir_indexed(dir))
+                                ocfs2_recalc_free_list(dir, handle, lookup);
                        dir->i_version++;
                        status = ocfs2_journal_dirty(handle, insert_bh);
                        retval = 0;
@@ -870,6 +1851,10 @@ out:
        return 0;
 }
+/*
+ * NOTE: This function can be called against unindexed directories,
+ * and indexed ones.
+ */
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
                                    u64 *f_version,
                                    loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
                             int namelen,
                             u64 *blkno,
                             struct inode *inode,
-                             struct buffer_head **dirent_bh,
+                             struct ocfs2_dir_lookup_result *lookup)
-                             struct ocfs2_dir_entry **dirent)
 {
        int status = -ENOENT;
-        mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
+        mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
-                   namelen, name, blkno, inode, dirent_bh, dirent);
+             (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
+        status = ocfs2_find_entry(name, namelen, inode, lookup);
-        if (!*dirent_bh || !*dirent) {
+        if (status)
-                status = -ENOENT;
                goto leave;
-        }
-        *blkno = le64_to_cpu((*dirent)->inode);
+        *blkno = le64_to_cpu(lookup->dl_entry->inode);
        status = 0;
 leave:
-        if (status < 0) {
-                *dirent = NULL;
-                brelse(*dirent_bh);
-                *dirent_bh = NULL;
-        }
-        mlog_exit(status);
        return status;
 }
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
                               int namelen, u64 *blkno)
 {
        int ret;
-        struct buffer_head *bh = NULL;
+        struct ocfs2_dir_lookup_result lookup = { NULL, };
-        struct ocfs2_dir_entry *dirent = NULL;
-        ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
+        ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
-        brelse(bh);
+        ocfs2_free_dir_lookup_result(&lookup);
        return ret;
 }
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
                              int namelen)
 {
        int ret;
-        struct buffer_head *dirent_bh = NULL;
+        struct ocfs2_dir_lookup_result lookup = { NULL, };
-        struct ocfs2_dir_entry *dirent = NULL;
        mlog_entry("dir %llu, name '%.*s'\n",
                   (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
        ret = -EEXIST;
-        dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
+        if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
-        if (dirent_bh)
                goto bail;
        ret = 0;
 bail:
-        brelse(dirent_bh);
+        ocfs2_free_dir_lookup_result(&lookup);
        mlog_exit(ret);
        return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
        unsigned seen_dot;
        unsigned seen_dot_dot;
        unsigned seen_other;
+        unsigned dx_dir;
 };
 static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
                                   loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
        /*
         * Check the positions of "." and ".." records to be sure
         * they're in the correct place.
+         *
+         * Indexed directories don't need to proceed past the first
+         * two entries, so we end the scan after seeing '..'. Despite
+         * that, we allow the scan to proceed in the event that we
+         * have a corrupted indexed directory (no dot or dot dot
+         * entries). This allows us to double check for existing
+         * entries which might not have been found in the index.
         */
        if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
                p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
        if (name_len == 2 && !strncmp("..", name, 2) &&
            pos == OCFS2_DIR_REC_LEN(1)) {
                p->seen_dot_dot = 1;
+                if (p->dx_dir && p->seen_dot)
+                        return 1;
                return 0;
        }
        p->seen_other = 1;
        return 1;
 }
+/*
+ * Emptiness pre-check for an indexed directory.
+ *
+ * Marks @priv as describing an indexed directory (dx_dir = 1), reads the
+ * inode block and the dx root block, and sets priv->seen_other when the
+ * root's dr_num_entries is anything other than 2.
+ *
+ * Returns 0 on success, or the error from ocfs2_read_inode_block() /
+ * ocfs2_read_dx_root(). Both buffers are released before returning.
+ */
+static int ocfs2_empty_dir_dx(struct inode *inode,
+                              struct ocfs2_empty_dir_priv *priv)
+{
+        int ret;
+        struct buffer_head *di_bh = NULL;
+        struct buffer_head *dx_root_bh = NULL;
+        struct ocfs2_dinode *di;
+        struct ocfs2_dx_root_block *dx_root;
+        /* Set before any I/O, so it stays set even if a read below fails. */
+        priv->dx_dir = 1;
+        ret = ocfs2_read_inode_block(inode, &di_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        di = (struct ocfs2_dinode *)di_bh->b_data;
+        ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        /* Anything but exactly two indexed entries counts as "other". */
+        if (le32_to_cpu(dx_root->dr_num_entries) != 2)
+                priv->seen_other = 1;
+out:
+        brelse(di_bh);
+        brelse(dx_root_bh);
+        return ret;
+}
 /*
 * routine to check that the specified directory is empty (for rmdir)
 *
 * Returns 1 if dir is empty, zero otherwise.
+ *
+ * XXX: This is a performance problem for unindexed directories.
 */
 int ocfs2_empty_dir(struct inode *inode)
 {
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
        memset(&priv, 0, sizeof(priv));
+        if (ocfs2_dir_indexed(inode)) {
+                ret = ocfs2_empty_dir_dx(inode, &priv);
+                if (ret)
+                        mlog_errno(ret);
+                /*
+                 * We still run ocfs2_dir_foreach to get the checks
+                 * for "." and "..".
+                 */
+        }
        ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
        if (ret)
                mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
                                 struct inode *parent,
                                 struct inode *inode,
                                 struct buffer_head *fe_bh,
-                                 struct ocfs2_alloc_context *data_ac)
+                                 struct ocfs2_alloc_context *data_ac,
+                                 struct buffer_head **ret_new_bh)
 {
        int status;
        unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
        mlog_entry_void();
-        if (ocfs2_supports_dir_trailer(osb))
+        if (ocfs2_new_dir_wants_trailer(inode))
                size = ocfs2_dir_trailer_blk_off(parent->i_sb);
        status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
        memset(new_bh->b_data, 0, osb->sb->s_blocksize);
        de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
-        if (ocfs2_supports_dir_trailer(osb))
+        if (ocfs2_new_dir_wants_trailer(inode)) {
-                ocfs2_init_dir_trailer(inode, new_bh);
+                int size = le16_to_cpu(de->rec_len);
+                /*
+                 * Figure out the size of the hole left over after
+                 * insertion of '.' and '..'. The trailer wants this
+                 * information.
+                 */
+                size -= OCFS2_DIR_REC_LEN(2);
+                size -= sizeof(struct ocfs2_dir_block_trailer);
+                ocfs2_init_dir_trailer(inode, new_bh, size);
+        }
        status = ocfs2_journal_dirty(handle, new_bh);
        if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
        }
        status = 0;
+        if (ret_new_bh) {
+                *ret_new_bh = new_bh;
+                new_bh = NULL;
+        }
 bail:
        brelse(new_bh);
@@ -1336,20 +2384,427 @@ bail:
        return status;
 }
+/*
+ * Allocate a dx root block for @dir and link it from the inode.
+ *
+ * Claims one metadata block from @meta_ac, formats it as a dx root
+ * (signature, suballoc info, generation, block numbers, @num_entries),
+ * then records the new block in di->i_dx_root and sets
+ * OCFS2_INDEXED_DIR_FL on both the in-memory and on-disk inode.
+ *
+ * @dirdata_bh:     directory data block whose trailer is consulted to
+ *                  decide the initial value of dr_free_blk.
+ * @dx_inline:      non-zero to set OCFS2_DX_FLAG_INLINE and size the
+ *                  root's entry list; zero to size its extent list.
+ * @ret_dx_root_bh: receives the new root buffer once the inode has been
+ *                  updated; the reference is handed over to the caller.
+ *
+ * Returns 0 on success or a non-zero error code.
+ */
+static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
+                                     handle_t *handle, struct inode *dir,
+                                     struct buffer_head *di_bh,
+                                     struct buffer_head *dirdata_bh,
+                                     struct ocfs2_alloc_context *meta_ac,
+                                     int dx_inline, u32 num_entries,
+                                     struct buffer_head **ret_dx_root_bh)
+{
+        int ret;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+        u16 dr_suballoc_bit;
+        u64 dr_blkno;
+        unsigned int num_bits;
+        struct buffer_head *dx_root_bh = NULL;
+        struct ocfs2_dx_root_block *dx_root;
+        struct ocfs2_dir_block_trailer *trailer =
+                ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+        ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
+                                   &num_bits, &dr_blkno);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        mlog(0, "Dir %llu, attach new index block: %llu\n",
+             (unsigned long long)OCFS2_I(dir)->ip_blkno,
+             (unsigned long long)dr_blkno);
+        dx_root_bh = sb_getblk(osb->sb, dr_blkno);
+        if (dx_root_bh == NULL) {
+                ret = -EIO;
+                goto out;
+        }
+        ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
+        ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
+        }
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        memset(dx_root, 0, osb->sb->s_blocksize);
+        strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
+        dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
+        dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
+        dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
+        dx_root->dr_blkno = cpu_to_le64(dr_blkno);
+        dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
+        dx_root->dr_num_entries = cpu_to_le32(num_entries);
+        /*
+         * Point dr_free_blk at the data block only if its trailer
+         * reports a non-zero free record length; otherwise leave it 0.
+         */
+        if (le16_to_cpu(trailer->db_free_rec_len))
+                dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+        else
+                dx_root->dr_free_blk = cpu_to_le64(0);
+        if (dx_inline) {
+                dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
+                dx_root->dr_entries.de_count =
+                        cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
+        } else {
+                dx_root->dr_list.l_count =
+                        cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+        }
+        /*
+         * NOTE(review): a failure here is only logged; ret is overwritten
+         * by the journal access call that follows.
+         */
+        ret = ocfs2_journal_dirty(handle, dx_root_bh);
+        if (ret)
+                mlog_errno(ret);
+        ret = ocfs2_journal_access_di(handle, dir, di_bh,
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        di->i_dx_root = cpu_to_le64(dr_blkno);
+        OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
+        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+        /*
+         * NOTE(review): if this fails the error is returned, but the
+         * root buffer is still handed to the caller below.
+         */
+        ret = ocfs2_journal_dirty(handle, di_bh);
+        if (ret)
+                mlog_errno(ret);
+        /* Transfer our reference; clear the local so 'out' won't drop it. */
+        *ret_dx_root_bh = dx_root_bh;
+        dx_root_bh = NULL;
+out:
+        brelse(dx_root_bh);
+        return ret;
+}
+/*
+ * Format @num_dx_leaves consecutive blocks, starting at @start_blk, as
+ * empty dx leaf blocks.
+ *
+ * Each block's buffer head is stored in @dx_leaves[i] as soon as it is
+ * obtained, then journaled (CREATE), zeroed and stamped with the leaf
+ * signature, fs generation, its own block number and the per-leaf
+ * entry capacity.
+ *
+ * Returns 0 on success, -EIO if sb_getblk() fails, or the error from
+ * ocfs2_journal_access_dl(). Buffers already placed in @dx_leaves are
+ * not released here on failure.
+ */
+static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
+                                       handle_t *handle, struct inode *dir,
+                                       struct buffer_head **dx_leaves,
+                                       int num_dx_leaves, u64 start_blk)
+{
+        int ret, i;
+        struct ocfs2_dx_leaf *dx_leaf;
+        struct buffer_head *bh;
+        for (i = 0; i < num_dx_leaves; i++) {
+                bh = sb_getblk(osb->sb, start_blk + i);
+                if (bh == NULL) {
+                        ret = -EIO;
+                        goto out;
+                }
+                /* Recorded before journaling, so it stays set on error. */
+                dx_leaves[i] = bh;
+                ocfs2_set_new_buffer_uptodate(dir, bh);
+                ret = ocfs2_journal_access_dl(handle, dir, bh,
+                                              OCFS2_JOURNAL_ACCESS_CREATE);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
+                memset(dx_leaf, 0, osb->sb->s_blocksize);
+                strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
+                dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
+                dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
+                dx_leaf->dl_list.de_count =
+                        cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
+                mlog(0,
+                     "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
+                     (unsigned long long)OCFS2_I(dir)->ip_blkno,
+                     (unsigned long long)bh->b_blocknr,
+                     le16_to_cpu(dx_leaf->dl_list.de_count));
+                /* NOTE(review): return value is not checked here. */
+                ocfs2_journal_dirty(handle, bh);
+        }
+        ret = 0;
+out:
+        return ret;
+}
+/*
+ * Allocates and formats a new cluster for use in an indexed dir
+ * leaf. This version will not do the extent insert, so that it can be
+ * used by operations which need careful ordering.
+ *
+ * One cluster is claimed from @data_ac and its blocks are formatted as
+ * dx leaves into @dx_leaves. On success the starting block number of
+ * the cluster is stored in @ret_phys_blkno. @cpos is not referenced in
+ * this function.
+ *
+ * Returns 0 on success, or the error from the cluster claim or from
+ * ocfs2_dx_dir_format_cluster().
+ */
+static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
+                                      u32 cpos, handle_t *handle,
+                                      struct ocfs2_alloc_context *data_ac,
+                                      struct buffer_head **dx_leaves,
+                                      int num_dx_leaves, u64 *ret_phys_blkno)
+{
+        int ret;
+        u32 phys, num;
+        u64 phys_blkno;
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        /*
+         * XXX: For create, this should claim cluster for the index
+         * *before* the unindexed insert so that we have a better
+         * chance of contiguousness as the directory grows in number
+         * of entries.
+         */
+        ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /*
+         * Format the new cluster first. That way, we're inserting
+         * valid data.
+         */
+        phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
+        ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
+                                          num_dx_leaves, phys_blkno);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /* Only written on success; left untouched on any error path. */
+        *ret_phys_blkno = phys_blkno;
+out:
+        return ret;
+}
+/*
+ * Allocate and format one new dx leaf cluster, then insert it into the
+ * extent tree @et at logical cluster offset @cpos.
+ *
+ * The formatted leaf buffers are returned through @dx_leaves. @data_ac
+ * supplies the cluster; @meta_ac is passed to ocfs2_insert_extent().
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int ocfs2_dx_dir_new_cluster(struct inode *dir,
+                                    struct ocfs2_extent_tree *et,
+                                    u32 cpos, handle_t *handle,
+                                    struct ocfs2_alloc_context *data_ac,
+                                    struct ocfs2_alloc_context *meta_ac,
+                                    struct buffer_head **dx_leaves,
+                                    int num_dx_leaves)
+{
+        int ret;
+        u64 phys_blkno;
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
+                                         num_dx_leaves, &phys_blkno);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /* Insert a single-cluster extent (length 1, flags 0) at cpos. */
+        ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
+                                  meta_ac);
+        if (ret)
+                mlog_errno(ret);
+out:
+        return ret;
+}
+/*
+ * Allocate a zeroed array of buffer_head pointers with one slot per
+ * block in a single cluster of @sb.
+ *
+ * If @ret_num_leaves is non-NULL and the allocation succeeds, it is set
+ * to the number of slots. Returns the array, or NULL if kcalloc() fails
+ * (in which case @ret_num_leaves is left untouched).
+ */
+static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
+                                                        int *ret_num_leaves)
+{
+        int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
+        struct buffer_head **dx_leaves;
+        dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
+                            GFP_NOFS);
+        if (dx_leaves && ret_num_leaves)
+                *ret_num_leaves = num_dx_leaves;
+        return dx_leaves;
+}
+/*
+ * Populate a brand new directory on a filesystem with indexed
+ * directories.
+ *
+ * Builds the first directory block via ocfs2_fill_new_dir_el(), attaches
+ * an inline dx root pre-set to two entries, and then inserts the hashes
+ * of "." and ".." into the root's entry list, both pointing at the
+ * first directory block.
+ *
+ * Returns 0 on success, or the error from ocfs2_fill_new_dir_el() or
+ * ocfs2_dx_dir_attach_index(). Both buffers obtained along the way are
+ * released before returning.
+ */
+static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
+                                 handle_t *handle,
+                                 struct inode *parent,
+                                 struct inode *inode,
+                                 struct buffer_head *di_bh,
+                                 struct ocfs2_alloc_context *data_ac,
+                                 struct ocfs2_alloc_context *meta_ac)
+{
+        int ret;
+        struct buffer_head *leaf_bh = NULL;
+        struct buffer_head *dx_root_bh = NULL;
+        struct ocfs2_dx_hinfo hinfo;
+        struct ocfs2_dx_root_block *dx_root;
+        struct ocfs2_dx_entry_list *entry_list;
+        /*
+         * Our strategy is to create the directory as though it were
+         * unindexed, then add the index block. This works with very
+         * little complication since the state of a new directory is a
+         * very well known quantity.
+         *
+         * Essentially, we have two dirents ("." and ".."), in the 1st
+         * block which need indexing. These are easily inserted into
+         * the index block.
+         */
+        ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
+                                    data_ac, &leaf_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /* dx_inline = 1, num_entries = 2 (for "." and ".."). */
+        ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
+                                        meta_ac, 1, 2, &dx_root_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        entry_list = &dx_root->dr_entries;
+        /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
+        ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
+        ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+        ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
+        ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+out:
+        brelse(dx_root_bh);
+        brelse(leaf_bh);
+        return ret;
+}
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                       handle_t *handle,
                       struct inode *parent,
                       struct inode *inode,
                       struct buffer_head *fe_bh,
-                       struct ocfs2_alloc_context *data_ac)
+                       struct ocfs2_alloc_context *data_ac,
+                       struct ocfs2_alloc_context *meta_ac)
 {
        BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
+        if (ocfs2_supports_indexed_dirs(osb))
+                return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
+                                             data_ac, meta_ac);
        return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
-                                     data_ac);
+                                     data_ac, NULL);
+}
+/*
+ * Index every live dirent of one directory block into the dx leaves.
+ *
+ * Walks @dirent_bh record by record. Each entry with a non-zero name
+ * length and inode is hashed, routed to a leaf in @dx_leaves by
+ * ocfs2_dx_dir_hash_idx(), and inserted there with the block number of
+ * @dirent_bh. @num_dx_entries is incremented once per inserted entry.
+ *
+ * Returns 0 on success (including when the block holds no live
+ * entries), or the error from __ocfs2_dx_dir_leaf_insert().
+ */
+static int ocfs2_dx_dir_index_block(struct inode *dir,
+                                    handle_t *handle,
+                                    struct buffer_head **dx_leaves,
+                                    int num_dx_leaves,
+                                    u32 *num_dx_entries,
+                                    struct buffer_head *dirent_bh)
+{
+        /*
+         * ret must start at 0: if every record is skipped via 'inc',
+         * nothing else assigns it before it is returned.
+         */
+        int ret = 0, namelen, i;
+        char *de_buf, *limit;
+        struct ocfs2_dir_entry *de;
+        struct buffer_head *dx_leaf_bh;
+        struct ocfs2_dx_hinfo hinfo;
+        u64 dirent_blk = dirent_bh->b_blocknr;
+        de_buf = dirent_bh->b_data;
+        limit = de_buf + dir->i_sb->s_blocksize;
+        while (de_buf < limit) {
+                de = (struct ocfs2_dir_entry *)de_buf;
+                namelen = de->name_len;
+                /* Skip unused records; they carry nothing to index. */
+                if (!namelen || !de->inode)
+                        goto inc;
+                ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
+                i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
+                dx_leaf_bh = dx_leaves[i];
+                ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
+                                                 dirent_blk, dx_leaf_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                *num_dx_entries = *num_dx_entries + 1;
+inc:
+                de_buf += le16_to_cpu(de->rec_len);
+        }
+out:
+        return ret;
+}
+/*
+ * Index every live dirent of @dirent_bh directly into the entry list
+ * embedded in the dx root block, bumping dr_num_entries once per
+ * inserted entry.
+ *
+ * XXX: This expects dx_root_bh to already be part of the transaction.
+ */
+static void ocfs2_dx_dir_index_root_block(struct inode *dir,
+                                         struct buffer_head *dx_root_bh,
+                                         struct buffer_head *dirent_bh)
+{
+        char *de_buf, *limit;
+        struct ocfs2_dx_root_block *dx_root;
+        struct ocfs2_dir_entry *de;
+        struct ocfs2_dx_hinfo hinfo;
+        u64 dirent_blk = dirent_bh->b_blocknr;
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        de_buf = dirent_bh->b_data;
+        limit = de_buf + dir->i_sb->s_blocksize;
+        while (de_buf < limit) {
+                de = (struct ocfs2_dir_entry *)de_buf;
+                /* Skip unused records; they carry nothing to index. */
+                if (!de->name_len || !de->inode)
+                        goto inc;
+                ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
+                mlog(0,
+                     "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
+                     (unsigned long long)dir->i_ino, hinfo.major_hash,
+                     hinfo.minor_hash,
+                     le16_to_cpu(dx_root->dr_entries.de_num_used),
+                     de->name_len, de->name);
+                ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
+                                           dirent_blk);
+                le32_add_cpu(&dx_root->dr_num_entries, 1);
+inc:
+                de_buf += le16_to_cpu(de->rec_len);
+        }
+}
+/*
+ * Count the number of inline directory entries in di_bh and compare
+ * them against the number of entries we can hold in an inline dx root
+ * block.
+ *
+ * The walk covers the inode's inline data area up to i_size_read(dir).
+ * Returns non-zero when the count of live entries is strictly less
+ * than ocfs2_dx_entries_per_root(), zero otherwise.
+ */
+static int ocfs2_new_dx_should_be_inline(struct inode *dir,
+                                         struct buffer_head *di_bh)
+{
+        int dirent_count = 0;
+        char *de_buf, *limit;
+        struct ocfs2_dir_entry *de;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+        de_buf = di->id2.i_data.id_data;
+        limit = de_buf + i_size_read(dir);
+        while (de_buf < limit) {
+                de = (struct ocfs2_dir_entry *)de_buf;
+                /* Only records with both a name and an inode are counted. */
+                if (de->name_len && de->inode)
+                        dirent_count++;
+                de_buf += le16_to_cpu(de->rec_len);
+        }
+        /* We are careful to leave room for one extra record. */
+        return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
 }
 /*
@@ -1358,18 +2813,26 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 * expansion from an inline directory to one with extents. The first dir block
 * in that case is taken from the inline data portion of the inode block.
 *
+ * This will also return the largest amount of contiguous space for a dirent
+ * in the block. That value is *not* necessarily the last dirent, even after
+ * expansion. The directory indexing code wants this value for free space
+ * accounting. We do this here since we're already walking the entire dir
+ * block.
+ *
 * We add the dir trailer if this filesystem wants it.
 */
-static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
+static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-                                     struct super_block *sb)
+                                             struct inode *dir)
 {
+        struct super_block *sb = dir->i_sb;
        struct ocfs2_dir_entry *de;
        struct ocfs2_dir_entry *prev_de;
        char *de_buf, *limit;
        unsigned int new_size = sb->s_blocksize;
-        unsigned int bytes;
+        unsigned int bytes, this_hole;
+        unsigned int largest_hole = 0;
-        if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+        if (ocfs2_new_dir_wants_trailer(dir))
                new_size = ocfs2_dir_trailer_blk_off(sb);
        bytes = new_size - old_size;
@@ -1378,12 +2841,26 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
        de_buf = start;
        de = (struct ocfs2_dir_entry *)de_buf;
        do {
+                this_hole = ocfs2_figure_dirent_hole(de);
+                if (this_hole > largest_hole)
+                        largest_hole = this_hole;
                prev_de = de;
                de_buf += le16_to_cpu(de->rec_len);
                de = (struct ocfs2_dir_entry *)de_buf;
        } while (de_buf < limit);
        le16_add_cpu(&prev_de->rec_len, bytes);
+        /* We need to double check this after modification of the final
+         * dirent. */
+        this_hole = ocfs2_figure_dirent_hole(prev_de);
+        if (this_hole > largest_hole)
+                largest_hole = this_hole;
+        if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+                return largest_hole;
+        return 0;
 }
 /*
@@ -1396,29 +2873,61 @@ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
 */
 static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                                   unsigned int blocks_wanted,
+                                   struct ocfs2_dir_lookup_result *lookup,
                                   struct buffer_head **first_block_bh)
 {
-        u32 alloc, bit_off, len;
+        u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
        struct super_block *sb = dir->i_sb;
-        int ret, credits = ocfs2_inline_to_extents_credits(sb);
+        int ret, i, num_dx_leaves = 0, dx_inline = 0,
-        u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
+                credits = ocfs2_inline_to_extents_credits(sb);
+        u64 dx_insert_blkno, blkno,
+                bytes = blocks_wanted << sb->s_blocksize_bits;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
        struct ocfs2_inode_info *oi = OCFS2_I(dir);
        struct ocfs2_alloc_context *data_ac;
+        struct ocfs2_alloc_context *meta_ac = NULL;
        struct buffer_head *dirdata_bh = NULL;
+        struct buffer_head *dx_root_bh = NULL;
+        struct buffer_head **dx_leaves = NULL;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        handle_t *handle;
        struct ocfs2_extent_tree et;
-        int did_quota = 0;
+        struct ocfs2_extent_tree dx_et;
+        int did_quota = 0, bytes_allocated = 0;
        ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
        alloc = ocfs2_clusters_for_bytes(sb, bytes);
+        dx_alloc = 0;
+        if (ocfs2_supports_indexed_dirs(osb)) {
+                credits += ocfs2_add_dir_index_credits(sb);
+                dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
+                if (!dx_inline) {
+                        /* Add one more cluster for an index leaf */
+                        dx_alloc++;
+                        dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
+                                                                &num_dx_leaves);
+                        if (!dx_leaves) {
+                                ret = -ENOMEM;
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                }
+                /* This gets us the dx_root */
+                ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
        /*
-         * We should never need more than 2 clusters for this -
+         * We should never need more than 2 clusters for the unindexed
-         * maximum dirent size is far less than one block. In fact,
+         * tree - maximum dirent size is far less than one block. In
-         * the only time we'd need more than one cluster is if
+         * fact, the only time we'd need more than one cluster is if
         * blocksize == clustersize and the dirent won't fit in the
         * extra space that the expansion to a single block gives. As
         * of today, that only happens on 4k/4k file systems.
@@ -1435,7 +2944,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        /*
         * Prepare for worst case allocation scenario of two separate
-         * extents.
+         * extents in the unindexed tree.
         */
        if (alloc == 2)
                credits += OCFS2_SUBALLOC_ALLOC;
@@ -1448,11 +2957,29 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        }
        if (vfs_dq_alloc_space_nodirty(dir,
-                                ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+                                ocfs2_clusters_to_bytes(osb->sb,
+                                                        alloc + dx_alloc))) {
                ret = -EDQUOT;
                goto out_commit;
        }
        did_quota = 1;
+        if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+                /*
+                 * Allocate our index cluster first, to maximize the
+                 * possibility that unindexed leaves grow
+                 * contiguously.
+                 */
+                ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
+                                                 dx_leaves, num_dx_leaves,
+                                                 &dx_insert_blkno);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out_commit;
+                }
+                bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+        }
        /*
         * Try to claim as many clusters as the bitmap can give though
         * if we only get one now, that's enough to continue. The rest
@@ -1463,6 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                mlog_errno(ret);
                goto out_commit;
        }
+        bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
        /*
         * Operations are carefully ordered so that we set up the new
@@ -1489,9 +3017,16 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
        memset(dirdata_bh->b_data + i_size_read(dir), 0,
               sb->s_blocksize - i_size_read(dir));
-        ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
+        i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
-        if (ocfs2_supports_dir_trailer(osb))
+        if (ocfs2_new_dir_wants_trailer(dir)) {
-                ocfs2_init_dir_trailer(dir, dirdata_bh);
+                /*
+                 * Prepare the dir trailer up front. It will otherwise look
+                 * like a valid dirent. Even if inserting the index fails
+                 * (unlikely), then all we'll have done is given first dir
+                 * block a small amount of fragmentation.
+                 */
+                ocfs2_init_dir_trailer(dir, dirdata_bh, i);
+        }
        ret = ocfs2_journal_dirty(handle, dirdata_bh);
        if (ret) {
@@ -1499,6 +3034,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                goto out_commit;
        }
+        if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+                /*
+                 * Dx dirs with an external cluster need to do this up
+                 * front. Inline dx roots get handled later, after
+                 * we've allocated our root block. We get passed back
+                 * a total number of items so that dr_num_entries can
+                 * be correctly set once the dx_root has been
+                 * allocated.
+                 */
+                ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
+                                               num_dx_leaves, &num_dx_entries,
+                                               dirdata_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out_commit;
+                }
+        }
        /*
         * Set extent, i_size, etc on the directory. After this, the
         * inode should contain the same exact dirents as before and
@@ -1551,6 +3104,27 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                goto out_commit;
        }
+        if (ocfs2_supports_indexed_dirs(osb)) {
+                ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
+                                                dirdata_bh, meta_ac, dx_inline,
+                                                num_dx_entries, &dx_root_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out_commit;
+                }
+                if (dx_inline) {
+                        ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
+                                                      dirdata_bh);
+                } else {
+                        ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
+                        ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
+                                                  dx_insert_blkno, 1, 0, NULL);
+                        if (ret)
+                                mlog_errno(ret);
+                }
+        }
        /*
         * We asked for two clusters, but only got one in the 1st
         * pass. Claim the 2nd cluster as a separate extent.
@@ -1570,15 +3144,32 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                        mlog_errno(ret);
                        goto out_commit;
                }
+                bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
        }
        *first_block_bh = dirdata_bh;
        dirdata_bh = NULL;
+        if (ocfs2_supports_indexed_dirs(osb)) {
+                unsigned int off;
+                if (!dx_inline) {
+                        /*
+                         * We need to return the correct block within the
+                         * cluster which should hold our entry.
+                         */
+                        off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+                                                    &lookup->dl_hinfo);
+                        get_bh(dx_leaves[off]);
+                        lookup->dl_dx_leaf_bh = dx_leaves[off];
+                }
+                lookup->dl_dx_root_bh = dx_root_bh;
+                dx_root_bh = NULL;
+        }
 out_commit:
        if (ret < 0 && did_quota)
-                vfs_dq_free_space_nodirty(dir,
+                vfs_dq_free_space_nodirty(dir, bytes_allocated);
-                        ocfs2_clusters_to_bytes(osb->sb, 2));
        ocfs2_commit_trans(osb, handle);
 out_sem:
@@ -1587,8 +3178,17 @@ out_sem:
 out:
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
+        if (meta_ac)
+                ocfs2_free_alloc_context(meta_ac);
+        if (dx_leaves) {
+                for (i = 0; i < num_dx_leaves; i++)
+                        brelse(dx_leaves[i]);
+                kfree(dx_leaves);
+        }
        brelse(dirdata_bh);
+        brelse(dx_root_bh);
        return ret;
 }
@@ -1658,11 +3258,14 @@ bail:
 * is to be turned into an extent based one. The size of the dirent to
 * insert might be larger than the space gained by growing to just one
 * block, so we may have to grow the inode by two blocks in that case.
+ *
+ * If the directory is already indexed, lookup->dl_dx_root_bh must be provided.
 */
 static int ocfs2_extend_dir(struct ocfs2_super *osb,
                            struct inode *dir,
                            struct buffer_head *parent_fe_bh,
                            unsigned int blocks_wanted,
+                            struct ocfs2_dir_lookup_result *lookup,
                            struct buffer_head **new_de_bh)
 {
        int status = 0;
@@ -1677,17 +3280,29 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
        struct ocfs2_dir_entry * de;
        struct super_block *sb = osb->sb;
        struct ocfs2_extent_tree et;
+        struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
        mlog_entry_void();
        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+                /*
+                 * This would be a code error as an inline directory should
+                 * never have an index root.
+                 */
+                BUG_ON(dx_root_bh);
                status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
-                                                 blocks_wanted, &new_bh);
+                                                 blocks_wanted, lookup,
+                                                 &new_bh);
                if (status) {
                        mlog_errno(status);
                        goto bail;
                }
+                /* Expansion from inline to an indexed directory will
+                 * have given us this. */
+                dx_root_bh = lookup->dl_dx_root_bh;
                if (blocks_wanted == 1) {
                        /*
                         * If the new dirent will fit inside the space
@@ -1751,6 +3366,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
        }
 do_extend:
+        if (ocfs2_dir_indexed(dir))
+                credits++; /* For attaching the new dirent block to the
+                            * dx_root */
        down_write(&OCFS2_I(dir)->ip_alloc_sem);
        drop_alloc_sem = 1;
@@ -1781,9 +3400,19 @@ do_extend:
        de = (struct ocfs2_dir_entry *) new_bh->b_data;
        de->inode = 0;
-        if (ocfs2_dir_has_trailer(dir)) {
+        if (ocfs2_supports_dir_trailer(dir)) {
                de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
-                ocfs2_init_dir_trailer(dir, new_bh);
+                ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
+                if (ocfs2_dir_indexed(dir)) {
+                        status = ocfs2_dx_dir_link_trailer(dir, handle,
+                                                           dx_root_bh, new_bh);
+                        if (status) {
+                                mlog_errno(status);
+                                goto bail;
+                        }
+                }
        } else {
                de->rec_len = cpu_to_le16(sb->s_blocksize);
        }
@@ -1839,7 +3468,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
         * This calculates how many free bytes we'd have in block zero, should
         * this function force expansion to an extent tree.
         */
-        if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+        if (ocfs2_new_dir_wants_trailer(dir))
                free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
        else
                free_space = dir->i_sb->s_blocksize - i_size_read(dir);
@@ -1970,12 +3599,766 @@ bail:
        return status;
 }
+/*
+ * Comparison callback passed to sort() by ocfs2_dx_dir_rebalance().
+ *
+ * a and b point at struct ocfs2_dx_entry records. They are ordered by
+ * major hash first and minor hash second, both converted from
+ * little-endian before comparing.
+ *
+ * Returns 1 if a sorts after b, -1 if a sorts before b, and 0 when both
+ * hashes are equal.
+ */
+static int dx_leaf_sort_cmp(const void *a, const void *b)
+{
+        const struct ocfs2_dx_entry *entry1 = a;
+        const struct ocfs2_dx_entry *entry2 = b;
+        u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
+        u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
+        u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
+        u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
+        if (major_hash1 > major_hash2)
+                return 1;
+        if (major_hash1 < major_hash2)
+                return -1;
+        /*
+         * It is not strictly necessary to sort by minor hash: the split
+         * logic in ocfs2_dx_dir_find_leaf_split() only looks at the
+         * major hash. It is used here purely as a tie-breaker.
+         */
+        if (minor_hash1 > minor_hash2)
+                return 1;
+        if (minor_hash1 < minor_hash2)
+                return -1;
+        return 0;
+}
+/*
+ * Swap callback passed to sort() by ocfs2_dx_dir_rebalance().
+ *
+ * Exchanges the two struct ocfs2_dx_entry records at a and b by plain
+ * struct assignment through a stack temporary. BUGs if sort() reports
+ * an element size other than sizeof(struct ocfs2_dx_entry).
+ */
+static void dx_leaf_sort_swap(void *a, void *b, int size)
+{
+        struct ocfs2_dx_entry *entry1 = a;
+        struct ocfs2_dx_entry *entry2 = b;
+        struct ocfs2_dx_entry tmp;
+        BUG_ON(size != sizeof(*entry1));
+        tmp = *entry1;
+        *entry1 = *entry2;
+        *entry2 = tmp;
+}
+/*
+ * Return 1 if every used entry in dx_leaf carries the same major hash,
+ * 0 as soon as two neighbouring entries differ.
+ *
+ * Only adjacent entries are compared, which is sufficient to establish
+ * that all of them are equal. A leaf with zero or one used entries
+ * returns 1 because the loop body never runs.
+ */
+static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
+{
+        struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+        int i, num = le16_to_cpu(dl_list->de_num_used);
+        for (i = 0; i < (num - 1); i++) {
+                if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
+                    le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
+                        return 0;
+        }
+        return 1;
+}
+/*
+ * Find the optimal value to split this leaf on. This expects the leaf
+ * entries to be in sorted order.
+ *
+ * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
+ * the hash we want to insert.
+ *
+ * This function is only concerned with the major hash - that which
+ * determines which cluster an item belongs to.
+ *
+ * Returns 0 with the chosen value stored in *split_hash, or -ENOSPC
+ * (leaving *split_hash untouched) when every entry in the leaf shares
+ * its major hash with insert_hash, so that no split could make room.
+ */
+static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
+                                        u32 leaf_cpos, u32 insert_hash,
+                                        u32 *split_hash)
+{
+        struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+        int i, num_used = le16_to_cpu(dl_list->de_num_used);
+        int allsame;
+        /*
+         * There are a couple of rare, but nasty corner cases we have to
+         * check for here. All of them involve a leaf where all values
+         * have the same hash, which is what we look for first.
+         *
+         * Most of the time, all of the above is false, and we simply
+         * pick the median value for a split.
+         */
+        allsame = ocfs2_dx_leaf_same_major(dx_leaf);
+        if (allsame) {
+                u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
+                if (val == insert_hash) {
+                        /*
+                         * No matter where we would choose to split,
+                         * the new entry would want to occupy the same
+                         * block as these. Since there's no space left
+                         * in their existing block, we know there
+                         * won't be space after the split.
+                         */
+                        return -ENOSPC;
+                }
+                if (val == leaf_cpos) {
+                        /*
+                         * Because val is the same as leaf_cpos (which
+                         * is the smallest value this leaf can have),
+                         * yet is not equal to insert_hash, then we
+                         * know that insert_hash *must* be larger than
+                         * val (and leaf_cpos). At least cpos+1 in value.
+                         *
+                         * We also know then, that there cannot be an
+                         * adjacent extent (otherwise we'd be looking
+                         * at it). Choosing this value gives us a
+                         * chance to get some contiguousness.
+                         */
+                        *split_hash = leaf_cpos + 1;
+                        return 0;
+                }
+                if (val > insert_hash) {
+                        /*
+                         * val can not be the same as insert hash, and
+                         * also must be larger than leaf_cpos. Also,
+                         * we know that there can't be a leaf between
+                         * cpos and val, otherwise the entries with
+                         * hash 'val' would be there.
+                         */
+                        *split_hash = val;
+                        return 0;
+                }
+                /*
+                 * None of the above: val differs from leaf_cpos and is
+                 * smaller than insert_hash. Splitting at insert_hash
+                 * leaves the existing entries below the split point.
+                 */
+                *split_hash = insert_hash;
+                return 0;
+        }
+        /*
+         * Since the records are sorted and the checks above
+         * guaranteed that not all records in this block are the same,
+         * we simply travel forward, from the median, and pick the 1st
+         * record whose value is larger than leaf_cpos.
+         */
+        for (i = (num_used / 2); i < num_used; i++)
+                if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
+                    leaf_cpos)
+                        break;
+        BUG_ON(i == num_used); /* Should be impossible */
+        *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
+        return 0;
+}
+/*
+ * Transfer all entries in orig_dx_leaves whose major hash is equal to or
+ * larger than split_hash into new_dx_leaves. We use a temporary
+ * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
+ *
+ * Since the block offset inside a leaf (cluster) is a constant mask
+ * of minor_hash, we can optimize - an item at block offset X within
+ * the original cluster, will be at offset X within the new cluster.
+ *
+ * orig_dx_leaves and new_dx_leaves are parallel arrays of num_dx_leaves
+ * buffers. tmp_dx_leaf must be able to hold one full block
+ * (dir->i_sb->s_blocksize bytes). Each original/new block pair is marked
+ * dirty in handle once it has been processed; the return values of
+ * ocfs2_journal_dirty() are not checked here.
+ */
+static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
+                                       handle_t *handle,
+                                       struct ocfs2_dx_leaf *tmp_dx_leaf,
+                                       struct buffer_head **orig_dx_leaves,
+                                       struct buffer_head **new_dx_leaves,
+                                       int num_dx_leaves)
+{
+        int i, j, num_used;
+        u32 major_hash;
+        struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
+        struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+        struct ocfs2_dx_entry *dx_entry;
+        tmp_list = &tmp_dx_leaf->dl_list;
+        for (i = 0; i < num_dx_leaves; i++) {
+                orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
+                orig_list = &orig_dx_leaf->dl_list;
+                new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
+                new_list = &new_dx_leaf->dl_list;
+                num_used = le16_to_cpu(orig_list->de_num_used);
+                /*
+                 * Build the surviving contents in the scratch buffer:
+                 * copy the whole original block, then empty its entry
+                 * list so the entries that stay can be appended to it.
+                 */
+                memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
+                tmp_list->de_num_used = cpu_to_le16(0);
+                memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
+                for (j = 0; j < num_used; j++) {
+                        dx_entry = &orig_list->de_entries[j];
+                        major_hash = le32_to_cpu(dx_entry->dx_major_hash);
+                        if (major_hash >= split_hash)
+                                ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
+                                                              dx_entry);
+                        else
+                                ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
+                                                              dx_entry);
+                }
+                /* Replace the original block with the filtered copy. */
+                memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
+                ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
+                ocfs2_journal_dirty(handle, new_dx_leaves[i]);
+        }
+}
+/*
+ * Journal credits requested by ocfs2_dx_dir_rebalance(): the number of
+ * blocks in two clusters, plus the extend credits computed for dx_root's
+ * extent list, plus the quota transaction credits.
+ *
+ * NOTE(review): the two clusters are presumably the original leaf
+ * cluster and the newly allocated one - confirm.
+ */
+static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
+                                          struct ocfs2_dx_root_block *dx_root)
+{
+        int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
+        credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
+        credits += ocfs2_quota_trans_credits(osb->sb);
+        return credits;
+}
+/*
+ * Find the median value in dx_leaf_bh and allocate a new leaf to move
+ * half our entries into.
+ *
+ * dx_leaf_bh is the full leaf block the insert hashed to; leaf_cpos is
+ * the cpos of the leaf being split and leaf_blkno its block number.
+ * hinfo carries the hash of the name about to be inserted; only its
+ * major hash is used here.
+ *
+ * The leaf is sorted in place, a split hash is chosen, one new cluster
+ * is allocated at that hash, and every entry in the original cluster
+ * whose major hash is >= the split hash is moved across.
+ *
+ * Returns 0 on success or a negative errno. -ENOSPC is returned when
+ * dr_clusters is already UINT_MAX, when no split point can make room
+ * (see ocfs2_dx_dir_find_leaf_split()), or when a callee reports it.
+ */
+static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
+                                  struct buffer_head *dx_root_bh,
+                                  struct buffer_head *dx_leaf_bh,
+                                  struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
+                                  u64 leaf_blkno)
+{
+        struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+        int credits, ret, i, num_used, did_quota = 0;
+        u32 cpos, split_hash, insert_hash = hinfo->major_hash;
+        u64 orig_leaves_start;
+        int num_dx_leaves;
+        struct buffer_head **orig_dx_leaves = NULL;
+        struct buffer_head **new_dx_leaves = NULL;
+        struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
+        struct ocfs2_extent_tree et;
+        handle_t *handle = NULL;
+        struct ocfs2_dx_root_block *dx_root;
+        struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
+        mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
+             (unsigned long long)OCFS2_I(dir)->ip_blkno,
+             (unsigned long long)leaf_blkno, insert_hash);
+        ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        /*
+         * XXX: This is a rather large limit. We should use a more
+         * realistic value.
+         */
+        if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
+                return -ENOSPC;
+        /*
+         * Only a completely full leaf is expected here. A leaf that
+         * still has unused slots is reported as an I/O error.
+         */
+        num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+        if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
+                mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
+                     "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
+                     (unsigned long long)leaf_blkno, num_used);
+                ret = -EIO;
+                goto out;
+        }
+        /* This first allocation is also what sets num_dx_leaves. */
+        orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+        if (!orig_dx_leaves) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
+        if (!new_dx_leaves) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        /* -ENOSPC from the allocators is passed up without being logged. */
+        ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
+        if (ret) {
+                if (ret != -ENOSPC)
+                        mlog_errno(ret);
+                goto out;
+        }
+        credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
+        handle = ocfs2_start_trans(osb, credits);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                handle = NULL;
+                mlog_errno(ret);
+                goto out;
+        }
+        /*
+         * Charge quota for the one new cluster. It is given back at
+         * out_commit if anything below fails.
+         */
+        if (vfs_dq_alloc_space_nodirty(dir,
+                                       ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
+        did_quota = 1;
+        ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        /*
+         * This block is changing anyway, so we can sort it in place.
+         */
+        sort(dx_leaf->dl_list.de_entries, num_used,
+             sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
+             dx_leaf_sort_swap);
+        ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
+                                           &split_hash);
+        if (ret) {
+                mlog_errno(ret);
+                goto  out_commit;
+        }
+        mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
+             leaf_cpos, split_hash, insert_hash);
+        /*
+         * We have to carefully order operations here. There are items
+         * which want to be in the new cluster before insert, but in
+         * order to put those items in the new cluster, we alter the
+         * old cluster. A failure to insert gets nasty.
+         *
+         * So, start by reserving writes to the old
+         * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
+         * the new cluster for us, before inserting it. The insert
+         * won't happen if there's an error before that. Once the
+         * insert is done then, we can transfer from one leaf into the
+         * other without fear of hitting any error.
+         */
+        /*
+         * The leaf transfer wants some scratch space so that we don't
+         * wind up doing a bunch of expensive memmove().
+         */
+        tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
+        if (!tmp_dx_leaf) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        /* Read every block of the cluster that contains leaf_blkno. */
+        orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
+        ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
+                                   orig_dx_leaves);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        for (i = 0; i < num_dx_leaves; i++) {
+                ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out_commit;
+                }
+        }
+        /* The new cluster is inserted at the split hash. */
+        cpos = split_hash;
+        ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+                                       data_ac, meta_ac, new_dx_leaves,
+                                       num_dx_leaves);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
+                                   orig_dx_leaves, new_dx_leaves, num_dx_leaves);
+out_commit:
+        if (ret < 0 && did_quota)
+                vfs_dq_free_space_nodirty(dir,
+                                ocfs2_clusters_to_bytes(dir->i_sb, 1));
+        ocfs2_commit_trans(osb, handle);
+out:
+        /*
+         * num_dx_leaves is valid whenever either array is non-NULL,
+         * since new_dx_leaves is only allocated after orig_dx_leaves.
+         * NOTE(review): releasing slots that were never filled relies
+         * on ocfs2_dx_dir_kmalloc_leaves() zeroing the array - confirm.
+         */
+        if (orig_dx_leaves || new_dx_leaves) {
+                for (i = 0; i < num_dx_leaves; i++) {
+                        if (orig_dx_leaves)
+                                brelse(orig_dx_leaves[i]);
+                        if (new_dx_leaves)
+                                brelse(new_dx_leaves[i]);
+                }
+                kfree(orig_dx_leaves);
+                kfree(new_dx_leaves);
+        }
+        if (meta_ac)
+                ocfs2_free_alloc_context(meta_ac);
+        if (data_ac)
+                ocfs2_free_alloc_context(data_ac);
+        kfree(tmp_dx_leaf);
+        return ret;
+}
+/*
+ * Locate the dx leaf block that lookup->dl_hinfo hashes to and make sure
+ * it has room for one more entry.
+ *
+ * If the leaf is full, its cluster is rebalanced once via
+ * ocfs2_dx_dir_rebalance() and the lookup is repeated. On success the
+ * leaf buffer is handed to the caller in lookup->dl_dx_leaf_bh.
+ *
+ * Returns 0 on success or a negative errno; -ENOSPC if the leaf is
+ * still full after a rebalance. di_bh, name and namelen are not used
+ * by the body.
+ */
+static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
+                                   struct buffer_head *di_bh,
+                                   struct buffer_head *dx_root_bh,
+                                   const char *name, int namelen,
+                                   struct ocfs2_dir_lookup_result *lookup)
+{
+        int ret, rebalanced = 0;
+        struct ocfs2_dx_root_block *dx_root;
+        struct buffer_head *dx_leaf_bh = NULL;
+        struct ocfs2_dx_leaf *dx_leaf;
+        u64 blkno;
+        u32 leaf_cpos;
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+restart_search:
+        ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
+                                  &leaf_cpos, &blkno);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+        if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
+            le16_to_cpu(dx_leaf->dl_list.de_count)) {
+                if (rebalanced) {
+                        /*
+                         * Rebalancing should have provided us with
+                         * space in an appropriate leaf.
+                         *
+                         * XXX: Is this an abnormal condition then?
+                         * Should we print a message here?
+                         */
+                        ret = -ENOSPC;
+                        goto out;
+                }
+                ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
+                                             &lookup->dl_hinfo, leaf_cpos,
+                                             blkno);
+                if (ret) {
+                        if (ret != -ENOSPC)
+                                mlog_errno(ret);
+                        goto out;
+                }
+                /*
+                 * Restart the lookup. The rebalance might have
+                 * changed which block our item fits into. Mark our
+                 * progress, so we only execute this once.
+                 */
+                brelse(dx_leaf_bh);
+                dx_leaf_bh = NULL;
+                rebalanced = 1;
+                goto restart_search;
+        }
+        /*
+         * Hand our reference to the caller; clearing the local pointer
+         * keeps the brelse() below from dropping it.
+         */
+        lookup->dl_dx_leaf_bh = dx_leaf_bh;
+        dx_leaf_bh = NULL;
+out:
+        brelse(dx_leaf_bh);
+        return ret;
+}
+/*
+ * Walk the chain of directory blocks that starts at dx_root->dr_free_blk
+ * and is linked through each block trailer's db_free_next, looking for
+ * the first block whose db_free_rec_len can hold a dirent for a name of
+ * namelen bytes.
+ *
+ * On success, lookup->dl_leaf_bh is the matching block and
+ * lookup->dl_prev_leaf_bh is the block visited just before it (NULL if
+ * the match was the first block in the chain).
+ *
+ * Returns 0 on success, -ENOSPC if the chain is empty or is exhausted
+ * without a match, or the error from reading a block.
+ */
+static int ocfs2_search_dx_free_list(struct inode *dir,
+                                     struct buffer_head *dx_root_bh,
+                                     int namelen,
+                                     struct ocfs2_dir_lookup_result *lookup)
+{
+        int ret = -ENOSPC;
+        struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
+        struct ocfs2_dir_block_trailer *db;
+        u64 next_block;
+        int rec_len = OCFS2_DIR_REC_LEN(namelen);
+        struct ocfs2_dx_root_block *dx_root;
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        next_block = le64_to_cpu(dx_root->dr_free_blk);
+        while (next_block) {
+                /*
+                 * Slide the window forward: drop the block before last
+                 * and remember the last one as the predecessor.
+                 */
+                brelse(prev_leaf_bh);
+                prev_leaf_bh = leaf_bh;
+                leaf_bh = NULL;
+                ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+                if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
+                        /*
+                         * Both references move to the lookup result;
+                         * clear the locals so they are not released
+                         * at out.
+                         */
+                        lookup->dl_leaf_bh = leaf_bh;
+                        lookup->dl_prev_leaf_bh = prev_leaf_bh;
+                        leaf_bh = NULL;
+                        prev_leaf_bh = NULL;
+                        break;
+                }
+                next_block = le64_to_cpu(db->db_free_next);
+        }
+        /* next_block is only zero here if no block matched. */
+        if (!next_block)
+                ret = -ENOSPC;
+out:
+        brelse(leaf_bh);
+        brelse(prev_leaf_bh);
+        return ret;
+}
+/*
+ * Convert a dx_root that stores its entries in dr_entries (flagged
+ * OCFS2_DX_FLAG_INLINE) into one that refers to a cluster of leaf
+ * blocks through the extent list in dr_list.
+ *
+ * One cluster is reserved and allocated, every entry in the root is
+ * appended to the leaf block selected by its minor hash, the inline
+ * flag is cleared, the extent list is initialised and the new cluster
+ * is inserted at cpos 0.
+ *
+ * Returns 0 on success or a negative errno (-EDQUOT if the quota charge
+ * for the new cluster is refused).
+ */
+static int ocfs2_expand_inline_dx_root(struct inode *dir,
+                                       struct buffer_head *dx_root_bh)
+{
+        int ret, num_dx_leaves, i, j, did_quota = 0;
+        struct buffer_head **dx_leaves = NULL;
+        struct ocfs2_extent_tree et;
+        u64 insert_blkno;
+        struct ocfs2_alloc_context *data_ac = NULL;
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        handle_t *handle = NULL;
+        struct ocfs2_dx_root_block *dx_root;
+        struct ocfs2_dx_entry_list *entry_list;
+        struct ocfs2_dx_entry *dx_entry;
+        struct ocfs2_dx_leaf *target_leaf;
+        ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+        if (!dx_leaves) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto out;
+        }
+        if (vfs_dq_alloc_space_nodirty(dir,
+                                       ocfs2_clusters_to_bytes(osb->sb, 1))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
+        did_quota = 1;
+        /*
+         * We do this up front, before the allocation, so that a
+         * failure to add the dx_root_bh to the journal won't result
+         * in us losing clusters.
+         */
+        ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
+                                         num_dx_leaves, &insert_blkno);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        /*
+         * Transfer the entries from our dx_root into the appropriate
+         * block
+         */
+        dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+        entry_list = &dx_root->dr_entries;
+        for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+                dx_entry = &entry_list->de_entries[i];
+                j = __ocfs2_dx_dir_hash_idx(osb,
+                                            le32_to_cpu(dx_entry->dx_minor_hash));
+                target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
+                ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
+                /* Each leaf has been passed to the journal already
+                 * via __ocfs2_dx_dir_new_cluster() */
+        }
+        /*
+         * The entries have been copied out above, so the root can now
+         * be switched over: clear the inline flag, zero everything from
+         * dr_list to the end of the block and set up an empty extent
+         * list there.
+         * NOTE(review): assumes dr_entries lies inside the zeroed
+         * region - confirm against the struct layout.
+         */
+        dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
+        memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
+               offsetof(struct ocfs2_dx_root_block, dr_list));
+        dx_root->dr_list.l_count =
+                cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+        /* This should never fail considering we start with an empty
+         * dx_root. */
+        ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+        ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
+                                  insert_blkno, 1, 0, NULL);
+        if (ret)
+                mlog_errno(ret);
+        /*
+         * From here on the quota charge is kept even if ret is an
+         * error, because clearing did_quota disables the free below.
+         * NOTE(review): presumably because the cluster has already
+         * been allocated at this point - confirm.
+         */
+        did_quota = 0;
+        ocfs2_journal_dirty(handle, dx_root_bh);
+out_commit:
+        if (ret < 0 && did_quota)
+                vfs_dq_free_space_nodirty(dir,
+                                          ocfs2_clusters_to_bytes(dir->i_sb, 1));
+        ocfs2_commit_trans(osb, handle);
+out:
+        if (data_ac)
+                ocfs2_free_alloc_context(data_ac);
+        if (dx_leaves) {
+                for (i = 0; i < num_dx_leaves; i++)
+                        brelse(dx_leaves[i]);
+                kfree(dx_leaves);
+        }
+        return ret;
+}
+/*
+ * Check whether the entry list stored in the dx_root block itself
+ * (dr_entries) has at least one unused slot.
+ *
+ * Returns 0 if de_num_used is below de_count, -ENOSPC otherwise.
+ */
+static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
+{
+        struct ocfs2_dx_root_block *dx_root;
+        struct ocfs2_dx_entry_list *entry_list;
+        dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+        entry_list = &dx_root->dr_entries;
+        if (le16_to_cpu(entry_list->de_num_used) >=
+            le16_to_cpu(entry_list->de_count))
+                return -ENOSPC;
+        return 0;
+}
+/*
+ * Prepare an indexed directory for insertion of a new name.
+ *
+ * Reads the dx root referenced by di_bh, fails with -ENOSPC when
+ * dr_num_entries has reached OCFS2_DX_ENTRIES_MAX, expands an inline
+ * root that has no free slot, and then locates room in the unindexed
+ * tree, extending the directory by one block if the free list has none.
+ *
+ * Returns 0 or a negative errno. Once the free list search has not
+ * failed fatally, the dx root buffer is stored in lookup->dl_dx_root_bh
+ * and is no longer released here, even if ocfs2_extend_dir() fails
+ * afterwards. On any earlier exit it is released on the way out.
+ */
+static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
+                                           struct buffer_head *di_bh,
+                                           const char *name,
+                                           int namelen,
+                                           struct ocfs2_dir_lookup_result *lookup)
+{
+        int ret, free_dx_root = 1;
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        struct buffer_head *dx_root_bh = NULL;
+        struct buffer_head *leaf_bh = NULL;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+        struct ocfs2_dx_root_block *dx_root;
+        ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        /* Hard cap on the number of indexed entries. */
+        if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
+                ret = -ENOSPC;
+                mlog_errno(ret);
+                goto out;
+        }
+        if (ocfs2_dx_root_inline(dx_root)) {
+                ret = ocfs2_inline_dx_has_space(dx_root_bh);
+                /* Inline root still has a free slot: skip the
+                 * ocfs2_find_dir_space_dx() call below. */
+                if (ret == 0)
+                        goto search_el;
+                /*
+                 * We ran out of room in the root block. Expand it to
+                 * an extent, then allow ocfs2_find_dir_space_dx to do
+                 * the rest.
+                 */
+                ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        /*
+         * Insert preparation for an indexed directory is split into two
+         * steps. The call to find_dir_space_dx reserves room in the index for
+         * an additional item. If we run out of space there, it's a real error
+         * we can't continue on.
+         */
+        ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
+                                      namelen, lookup);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+search_el:
+        /*
+         * Next, we need to find space in the unindexed tree. This call
+         * searches using the free space linked list. If the unindexed tree
+         * lacks sufficient space, we'll expand it below. The expansion code
+         * is smart enough to add any new blocks to the free space list.
+         */
+        ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
+        /* -ENOSPC is not fatal here; it is handled by extending the
+         * directory further down. */
+        if (ret && ret != -ENOSPC) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /* Do this up here - ocfs2_extend_dir might need the dx_root */
+        lookup->dl_dx_root_bh = dx_root_bh;
+        /* The buffer now lives in the lookup result; do not brelse it
+         * at "out". */
+        free_dx_root = 0;
+        if (ret == -ENOSPC) {
+                ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                /*
+                 * We make the assumption here that new leaf blocks are added
+                 * to the front of our free list.
+                 */
+                lookup->dl_prev_leaf_bh = NULL;
+                lookup->dl_leaf_bh = leaf_bh;
+        }
+out:
+        if (free_dx_root)
+                brelse(dx_root_bh);
+        return ret;
+}
+/*
+ * Get a directory ready for insert. Any directory allocation required
+ * happens here. Success returns zero, and enough context in the dir
+ * lookup result that ocfs2_add_entry() will be able to complete the task
+ * with minimal performance impact.
+ */
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
                                 struct inode *dir,
                                 struct buffer_head *parent_fe_bh,
                                 const char *name,
                                 int namelen,
-                                 struct buffer_head **ret_de_bh)
+                                 struct ocfs2_dir_lookup_result *lookup)
 {
        int ret;
        unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
        mlog(0, "getting ready to insert namelen %d into dir %llu\n",
             namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
-        *ret_de_bh = NULL;
        if (!namelen) {
                ret = -EINVAL;
                mlog_errno(ret);
                goto out;
        }
+        /*
+         * Do this up front to reduce confusion.
+         *
+         * The directory might start inline, then be turned into an
+         * indexed one, in which case we'd need to hash deep inside
+         * ocfs2_find_dir_space_id(). Since
+         * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+         * done, there seems no point in spreading out the calls. We
+         * can optimize away the case where the file system doesn't
+         * support indexing.
+         */
+        if (ocfs2_supports_indexed_dirs(osb))
+                ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+        if (ocfs2_dir_indexed(dir)) {
+                ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+                                                      name, namelen, lookup);
+                if (ret)
+                        mlog_errno(ret);
+                goto out;
+        }
        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
                ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
                                              namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
                BUG_ON(bh);
                ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
-                                       &bh);
+                                       lookup, &bh);
                if (ret) {
                        if (ret != -ENOSPC)
                                mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
                BUG_ON(!bh);
        }
-        *ret_de_bh = bh;
+        lookup->dl_leaf_bh = bh;
        bh = NULL;
 out:
        brelse(bh);
        return ret;
 }
+/*
+ * Detach the dx root block from a directory inode and free its bit in
+ * the extent allocator.
+ *
+ * Within a single transaction this clears OCFS2_INDEXED_DIR_FL (in
+ * memory and in the on-disk inode), zeroes i_dx_root, and frees the bit
+ * recorded in dr_suballoc_bit from the EXTENT_ALLOC_SYSTEM_INODE of slot
+ * dr_suballoc_slot. The allocator inode's i_mutex and cluster lock are
+ * held around the transaction.
+ *
+ * Returns 0 or a negative errno. -ENOMEM is reported when the allocator
+ * inode cannot be obtained. If freeing the bit fails, the inode changes
+ * have already been marked dirty in the same handle.
+ */
+static int ocfs2_dx_dir_remove_index(struct inode *dir,
+                                     struct buffer_head *di_bh,
+                                     struct buffer_head *dx_root_bh)
+{
+        int ret;
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+        struct ocfs2_dx_root_block *dx_root;
+        struct inode *dx_alloc_inode = NULL;
+        struct buffer_head *dx_alloc_bh = NULL;
+        handle_t *handle;
+        u64 blk;
+        u16 bit;
+        u64 bg_blkno;
+        dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+        dx_alloc_inode = ocfs2_get_system_file_inode(osb,
+                                        EXTENT_ALLOC_SYSTEM_INODE,
+                                        le16_to_cpu(dx_root->dr_suballoc_slot));
+        if (!dx_alloc_inode) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        /* Lock order: i_mutex, then the inode cluster lock, then the
+         * transaction. The labels below unwind in reverse. */
+        mutex_lock(&dx_alloc_inode->i_mutex);
+        ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_mutex;
+        }
+        handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto out_unlock;
+        }
+        ret = ocfs2_journal_access_di(handle, dir, di_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        /* Drop the indexed flag and the root pointer from the inode. */
+        OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
+        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+        di->i_dx_root = cpu_to_le64(0ULL);
+        ocfs2_journal_dirty(handle, di_bh);
+        /* Locate the allocation group holding the root block's bit. */
+        blk = le64_to_cpu(dx_root->dr_blkno);
+        bit = le16_to_cpu(dx_root->dr_suballoc_bit);
+        bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+        ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
+                                       bit, bg_blkno, 1);
+        if (ret)
+                mlog_errno(ret);
+out_commit:
+        ocfs2_commit_trans(osb, handle);
+out_unlock:
+        ocfs2_inode_unlock(dx_alloc_inode, 1);
+out_mutex:
+        mutex_unlock(&dx_alloc_inode->i_mutex);
+        brelse(dx_alloc_bh);
+out:
+        iput(dx_alloc_inode);
+        return ret;
+}
+/*
+ * Tear down the index of a directory.
+ *
+ * Returns 0 immediately if the directory is not indexed. Otherwise,
+ * unless the dx root is inline, it repeatedly looks up the record
+ * covering major_hash (starting from UINT_MAX) and removes that range
+ * from the dx root's extent tree. It steps downward while dr_clusters
+ * is non-zero and stops after removing the record at cpos 0. The dx
+ * root itself is then removed via ocfs2_dx_dir_remove_index() and
+ * dropped from the cache.
+ *
+ * Returns 0 or a negative errno. Every exit through "out" schedules a
+ * truncate log flush and runs the collected deallocations.
+ */
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
+{
+        int ret;
+        unsigned int uninitialized_var(clen);
+        u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
+        u64 uninitialized_var(blkno);
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        struct buffer_head *dx_root_bh = NULL;
+        struct ocfs2_dx_root_block *dx_root;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+        struct ocfs2_cached_dealloc_ctxt dealloc;
+        struct ocfs2_extent_tree et;
+        ocfs2_init_dealloc_ctxt(&dealloc);
+        if (!ocfs2_dir_indexed(dir))
+                return 0;
+        ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+        /* An inline root has no extent tree to tear down. */
+        if (ocfs2_dx_root_inline(dx_root))
+                goto remove_index;
+        ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+        /* XXX: What if dr_clusters is too large? */
+        while (le32_to_cpu(dx_root->dr_clusters)) {
+                ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
+                                              major_hash, &cpos, &blkno, &clen);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
+                ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
+                                               &dealloc);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                /* Nothing can precede cpos 0; also avoids wrapping
+                 * cpos - 1 below. */
+                if (cpos == 0)
+                        break;
+                /* Next pass targets the record just before this one. */
+                major_hash = cpos - 1;
+        }
+remove_index:
+        ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        ocfs2_remove_from_cache(dir, dx_root_bh);
+out:
+        ocfs2_schedule_truncate_log_flush(osb, 1);
+        ocfs2_run_deallocs(osb, &dealloc);
+        brelse(dx_root_bh);
+        return ret;
+}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index c511e2e18e9f..e683f3deb645 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,44 +26,70 @@
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H
-struct buffer_head *ocfs2_find_entry(const char *name,
+struct ocfs2_dx_hinfo {
-                                     int namelen,
+        u32     major_hash;
-                                     struct inode *dir,
+        u32     minor_hash;
-                                     struct ocfs2_dir_entry **res_dir);
+};
+/*
+ * State passed between the directory lookup/prepare calls and the calls
+ * that subsequently modify the directory (see the prototypes in this
+ * header that take a struct ocfs2_dir_lookup_result *).
+ * NOTE(review): ocfs2_free_dir_lookup_result() is declared just below;
+ * presumably it releases the buffer heads held here - confirm in dir.c.
+ */
+struct ocfs2_dir_lookup_result {
+        struct buffer_head              *dl_leaf_bh;    /* Unindexed leaf
+                                                         * block */
+        struct ocfs2_dir_entry          *dl_entry;      /* Target dirent in
+                                                         * unindexed leaf */
+        struct buffer_head              *dl_dx_root_bh; /* Root of indexed
+                                                         * tree */
+        struct buffer_head              *dl_dx_leaf_bh; /* Indexed leaf block */
+        struct ocfs2_dx_entry           *dl_dx_entry;   /* Target dx_entry in
+                                                         * indexed leaf */
+        struct ocfs2_dx_hinfo           dl_hinfo;       /* Name hash results */
+        struct buffer_head              *dl_prev_leaf_bh;/* Previous entry in
+                                                          * dir free space
+                                                          * list. NULL if
+                                                          * previous entry is
+                                                          * dx root block. */
+};
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+int ocfs2_find_entry(const char *name, int namelen,
+                     struct inode *dir,
+                     struct ocfs2_dir_lookup_result *lookup);
 int ocfs2_delete_entry(handle_t *handle,
                       struct inode *dir,
-                       struct ocfs2_dir_entry *de_del,
+                       struct ocfs2_dir_lookup_result *res);
-                       struct buffer_head *bh);
 int __ocfs2_add_entry(handle_t *handle,
                      struct inode *dir,
                      const char *name, int namelen,
                      struct inode *inode, u64 blkno,
                      struct buffer_head *parent_fe_bh,
-                      struct buffer_head *insert_bh);
+                      struct ocfs2_dir_lookup_result *lookup);
 static inline int ocfs2_add_entry(handle_t *handle,
                                  struct dentry *dentry,
                                  struct inode *inode, u64 blkno,
                                  struct buffer_head *parent_fe_bh,
-                                  struct buffer_head *insert_bh)
+                                  struct ocfs2_dir_lookup_result *lookup)
 {
        return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
                                 dentry->d_name.name, dentry->d_name.len,
-                                 inode, blkno, parent_fe_bh, insert_bh);
+                                 inode, blkno, parent_fe_bh, lookup);
 }
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-                       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+                       struct ocfs2_dir_lookup_result *res,
                       struct inode *new_entry_inode);
 int ocfs2_check_dir_for_entry(struct inode *dir,
                              const char *name,
                              int namelen);
 int ocfs2_empty_dir(struct inode *inode);
 int ocfs2_find_files_on_disk(const char *name,
                             int namelen,
                             u64 *blkno,
                             struct inode *inode,
-                             struct buffer_head **dirent_bh,
+                             struct ocfs2_dir_lookup_result *res);
-                             struct ocfs2_dir_entry **dirent);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
                               int namelen, u64 *blkno);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
                                 struct buffer_head *parent_fe_bh,
                                 const char *name,
                                 int namelen,
-                                 struct buffer_head **ret_de_bh);
+                                 struct ocfs2_dir_lookup_result *lookup);
 struct ocfs2_alloc_context;
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                       handle_t *handle,
                       struct inode *parent,
                       struct inode *inode,
                       struct buffer_head *fe_bh,
-                       struct ocfs2_alloc_context *data_ac);
+                       struct ocfs2_alloc_context *data_ac,
+                       struct ocfs2_alloc_context *meta_ac);
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
                                                            void *data);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bb53714813ab..0102be35980c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -52,16 +52,12 @@
 enum dlm_mle_type {
        DLM_MLE_BLOCK,
        DLM_MLE_MASTER,
-        DLM_MLE_MIGRATION
+        DLM_MLE_MIGRATION,
-};
+        DLM_MLE_NUM_TYPES
-struct dlm_lock_name {
-        u8 len;
-        u8 name[DLM_LOCKID_NAME_MAX];
 };
 struct dlm_master_list_entry {
-        struct list_head list;
+        struct hlist_node master_hash_node;
        struct list_head hb_events;
        struct dlm_ctxt *dlm;
        spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
        enum dlm_mle_type type;
        struct o2hb_callback_func mle_hb_up;
        struct o2hb_callback_func mle_hb_down;
-        union {
+        struct dlm_lock_resource *mleres;
-                struct dlm_lock_resource *res;
+        unsigned char mname[DLM_LOCKID_NAME_MAX];
-                struct dlm_lock_name name;
+        unsigned int mnamelen;
-        } u;
+        unsigned int mnamehash;
 };
 enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
        unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        struct dlm_recovery_ctxt reco;
        spinlock_t master_lock;
-        struct list_head master_list;
+        struct hlist_head **master_hash;
        struct list_head mle_hb_events;
        /* these give a really vague idea of the system load */
-        atomic_t local_resources;
+        atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
-        atomic_t remote_resources;
+        atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
-        atomic_t unknown_resources;
+        atomic_t res_tot_count;
+        atomic_t res_cur_count;
        struct dlm_debug_ctxt *dlm_debug_ctxt;
        struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
        return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
 }
+/*
+ * Map hash value i to its bucket head in dlm->master_hash. The table is
+ * indexed first by page ((i / DLM_BUCKETS_PER_PAGE) wrapped to
+ * DLM_HASH_PAGES) and then by the bucket slot within that page.
+ */
+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+                                                 unsigned i)
+{
+        unsigned page = (i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES;
+        unsigned slot = i % DLM_BUCKETS_PER_PAGE;
+        return &dlm->master_hash[page][slot];
+}
 /* these keventd work queue items are for less-frequently
 * called functions that cannot be directly called from the
 * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
                                              unsigned int len);
 int dlm_is_host_down(int errno);
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-                              struct dlm_lock_resource *res,
-                              u8 owner);
 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
                                                 const char *lockid,
                                                 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
                                          DLM_LOCK_RES_MIGRATING));
 }
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
 /* create/destroy slab caches */
 int dlm_init_master_caches(void);
 void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
        return bit;
 }
+/*
+ * Record owner as the owner of res. The caller must hold res->spinlock
+ * (asserted). The dlm argument is not used in this body.
+ */
+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+                                         struct dlm_lock_resource *res,
+                                         u8 owner)
+{
+        assert_spin_locked(&res->spinlock);
+        res->owner = owner;
+}
+/*
+ * Switch res to a new owner, doing nothing when owner already matches
+ * res->owner. The caller must hold res->spinlock (asserted).
+ */
+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+                                            struct dlm_lock_resource *res,
+                                            u8 owner)
+{
+        assert_spin_locked(&res->spinlock);
+        if (res->owner == owner)
+                return;
+        dlm_set_lockres_owner(dlm, res, owner);
+}
 #endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index b32f60a5acfb..df52f706f669 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
 static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 {
        int out = 0;
-        unsigned int namelen;
-        const char *name;
        char *mle_type;
-        if (mle->type != DLM_MLE_MASTER) {
-                namelen = mle->u.name.len;
-                name = mle->u.name.name;
-        } else {
-                namelen = mle->u.res->lockname.len;
-                name = mle->u.res->lockname.name;
-        }
        if (mle->type == DLM_MLE_BLOCK)
                mle_type = "BLK";
        else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
        else
                mle_type = "MIG";
-        out += stringify_lockname(name, namelen, buf + out, len - out);
+        out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
        out += snprintf(buf + out, len - out,
                        "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
                        mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
 static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
        struct dlm_master_list_entry *mle;
-        int out = 0;
+        struct hlist_head *bucket;
-        unsigned long total = 0;
+        struct hlist_node *list;
+        int i, out = 0;
+        unsigned long total = 0, longest = 0, bktcnt;
        out += snprintf(db->buf + out, db->len - out,
                        "Dumping MLEs for Domain: %s\n", dlm->name);
        spin_lock(&dlm->master_lock);
-        list_for_each_entry(mle, &dlm->master_list, list) {
+        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-                ++total;
+                bucket = dlm_master_hash(dlm, i);
-                if (db->len - out < 200)
+                hlist_for_each(list, bucket) {
-                        continue;
+                        mle = hlist_entry(list, struct dlm_master_list_entry,
-                out += dump_mle(mle, db->buf + out, db->len - out);
+                                          master_hash_node);
+                        ++total;
+                        ++bktcnt;
+                        if (db->len - out < 200)
+                                continue;
+                        out += dump_mle(mle, db->buf + out, db->len - out);
+                }
+                longest = max(longest, bktcnt);
+                bktcnt = 0;
        }
        spin_unlock(&dlm->master_lock);
        out += snprintf(db->buf + out, db->len - out,
-                        "Total on list: %ld\n", total);
+                        "Total: %ld, Longest: %ld\n", total, longest);
        return out;
 }
@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
        int out = 0;
        struct dlm_reco_node_data *node;
        char *state;
-        int lres, rres, ures, tres;
+        int cur_mles = 0, tot_mles = 0;
+        int i;
-        lres = atomic_read(&dlm->local_resources);
-        rres = atomic_read(&dlm->remote_resources);
-        ures = atomic_read(&dlm->unknown_resources);
-        tres = lres + rres + ures;
        spin_lock(&dlm->spinlock);
@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                                 db->buf + out, db->len - out);
        out += snprintf(db->buf + out, db->len - out, "\n");
-        /* Mastered Resources Total: xxx  Locally: xxx  Remotely: ... */
+        /* Lock Resources: xxx (xxx) */
+        out += snprintf(db->buf + out, db->len - out,
+                        "Lock Resources: %d (%d)\n",
+                        atomic_read(&dlm->res_cur_count),
+                        atomic_read(&dlm->res_tot_count));
+        for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+                tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+        for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+                cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+        /* MLEs: xxx (xxx) */
+        out += snprintf(db->buf + out, db->len - out,
+                        "MLEs: %d (%d)\n", cur_mles, tot_mles);
+        /*  Blocking: xxx (xxx) */
+        out += snprintf(db->buf + out, db->len - out,
+                        "  Blocking: %d (%d)\n",
+                        atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+                        atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+        /*  Mastery: xxx (xxx) */
+        out += snprintf(db->buf + out, db->len - out,
+                        "  Mastery: %d (%d)\n",
+                        atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+                        atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+        /*  Migration: xxx (xxx) */
        out += snprintf(db->buf + out, db->len - out,
-                        "Mastered Resources Total: %d  Locally: %d  "
+                        "  Migration: %d (%d)\n",
-                        "Remotely: %d  Unknown: %d\n",
+                        atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
-                        tres, lres, rres, ures);
+                        atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
        /* Lists: Dirty=Empty  Purge=InUse  PendingASTs=Empty  ... */
        out += snprintf(db->buf + out, db->len - out,
                        "Lists: Dirty=%s  Purge=%s  PendingASTs=%s  "
-                        "PendingBASTs=%s  Master=%s\n",
+                        "PendingBASTs=%s\n",
                        (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
                        (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
                        (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
-                        (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
+                        (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
-                        (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
        /* Purge Count: xxx  Refs: xxx */
        out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d8d578f45613..4d9e6b288dd8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
        if (dlm->lockres_hash)
                dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+        if (dlm->master_hash)
+                dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
        if (dlm->name)
                kfree(dlm->name);
@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        for (i = 0; i < DLM_HASH_BUCKETS; i++)
                INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
+        dlm->master_hash = (struct hlist_head **)
+                                dlm_alloc_pagevec(DLM_HASH_PAGES);
+        if (!dlm->master_hash) {
+                mlog_errno(-ENOMEM);
+                dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+                kfree(dlm->name);
+                kfree(dlm);
+                dlm = NULL;
+                goto leave;
+        }
+        for (i = 0; i < DLM_HASH_BUCKETS; i++)
+                INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
        strcpy(dlm->name, domain);
        dlm->key = key;
        dlm->node_num = o2nm_this_node();
        ret = dlm_create_debugfs_subroot(dlm);
        if (ret < 0) {
+                dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
                dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
                kfree(dlm->name);
                kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        init_waitqueue_head(&dlm->reco.event);
        init_waitqueue_head(&dlm->ast_wq);
        init_waitqueue_head(&dlm->migration_wq);
-        INIT_LIST_HEAD(&dlm->master_list);
        INIT_LIST_HEAD(&dlm->mle_hb_events);
        dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
        dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-        atomic_set(&dlm->local_resources, 0);
-        atomic_set(&dlm->remote_resources, 0);
+        atomic_set(&dlm->res_tot_count, 0);
-        atomic_set(&dlm->unknown_resources, 0);
+        atomic_set(&dlm->res_cur_count, 0);
+        for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+                atomic_set(&dlm->mle_tot_count[i], 0);
+                atomic_set(&dlm->mle_cur_count[i], 0);
+        }
        spin_lock_init(&dlm->work_lock);
        INIT_LIST_HEAD(&dlm->work_list);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0a2813947853..f8b653fcd4dd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
                                const char *name,
                                unsigned int namelen)
 {
-        struct dlm_lock_resource *res;
        if (dlm != mle->dlm)
                return 0;
-        if (mle->type == DLM_MLE_BLOCK ||
+        if (namelen != mle->mnamelen ||
-            mle->type == DLM_MLE_MIGRATION) {
+            memcmp(name, mle->mname, namelen) != 0)
-                if (namelen != mle->u.name.len ||
+                return 0;
-                    memcmp(name, mle->u.name.name, namelen)!=0)
-                        return 0;
-        } else {
-                res = mle->u.res;
-                if (namelen != res->lockname.len ||
-                    memcmp(res->lockname.name, name, namelen) != 0)
-                        return 0;
-        }
        return 1;
 }
@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
        mle->dlm = dlm;
        mle->type = type;
-        INIT_LIST_HEAD(&mle->list);
+        INIT_HLIST_NODE(&mle->master_hash_node);
        INIT_LIST_HEAD(&mle->hb_events);
        memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
        spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
        mle->new_master = O2NM_MAX_NODES;
        mle->inuse = 0;
+        BUG_ON(mle->type != DLM_MLE_BLOCK &&
+               mle->type != DLM_MLE_MASTER &&
+               mle->type != DLM_MLE_MIGRATION);
        if (mle->type == DLM_MLE_MASTER) {
                BUG_ON(!res);
-                mle->u.res = res;
+                mle->mleres = res;
-        } else if (mle->type == DLM_MLE_BLOCK) {
+                memcpy(mle->mname, res->lockname.name, res->lockname.len);
-                BUG_ON(!name);
+                mle->mnamelen = res->lockname.len;
-                memcpy(mle->u.name.name, name, namelen);
+                mle->mnamehash = res->lockname.hash;
-                mle->u.name.len = namelen;
+        } else {
-        } else /* DLM_MLE_MIGRATION */ {
                BUG_ON(!name);
-                memcpy(mle->u.name.name, name, namelen);
+                mle->mleres = NULL;
-                mle->u.name.len = namelen;
+                memcpy(mle->mname, name, namelen);
+                mle->mnamelen = namelen;
+                mle->mnamehash = dlm_lockid_hash(name, namelen);
        }
+        atomic_inc(&dlm->mle_tot_count[mle->type]);
+        atomic_inc(&dlm->mle_cur_count[mle->type]);
        /* copy off the node_map and register hb callbacks on our copy */
        memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
        memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
        __dlm_mle_attach_hb_events(dlm, mle);
 }
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+        assert_spin_locked(&dlm->spinlock);
+        assert_spin_locked(&dlm->master_lock);
+        if (!hlist_unhashed(&mle->master_hash_node))
+                hlist_del_init(&mle->master_hash_node);
+}
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+        struct hlist_head *bucket;
+        assert_spin_locked(&dlm->master_lock);
+        bucket = dlm_master_hash(dlm, mle->mnamehash);
+        hlist_add_head(&mle->master_hash_node, bucket);
+}
 /* returns 1 if found, 0 if not */
 static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
                        char *name, unsigned int namelen)
 {
        struct dlm_master_list_entry *tmpmle;
+        struct hlist_head *bucket;
+        struct hlist_node *list;
+        unsigned int hash;
        assert_spin_locked(&dlm->master_lock);
-        list_for_each_entry(tmpmle, &dlm->master_list, list) {
+        hash = dlm_lockid_hash(name, namelen);
+        bucket = dlm_master_hash(dlm, hash);
+        hlist_for_each(list, bucket) {
+                tmpmle = hlist_entry(list, struct dlm_master_list_entry,
+                                     master_hash_node);
                if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
                        continue;
                dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
        mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
        dlm = mle->dlm;
-        if (mle->type != DLM_MLE_MASTER) {
-                mlog(0, "calling mle_release for %.*s, type %d\n",
-                     mle->u.name.len, mle->u.name.name, mle->type);
-        } else {
-                mlog(0, "calling mle_release for %.*s, type %d\n",
-                     mle->u.res->lockname.len,
-                     mle->u.res->lockname.name, mle->type);
-        }
        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&dlm->master_lock);
+        mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
+             mle->type);
        /* remove from list if not already */
-        if (!list_empty(&mle->list))
+        __dlm_unlink_mle(dlm, mle);
-                list_del_init(&mle->list);
        /* detach the mle from the domain node up/down events */
        __dlm_mle_detach_hb_events(dlm, mle);
+        atomic_dec(&dlm->mle_cur_count[mle->type]);
        /* NOTE: kfree under spinlock here.
         * if this is bad, we can move this to a freelist. */
        kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
                kmem_cache_destroy(dlm_lockres_cache);
 }
-static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
-                                  struct dlm_lock_resource *res,
-                                  u8 owner)
-{
-        assert_spin_locked(&res->spinlock);
-        mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
-        if (owner == dlm->node_num)
-                atomic_inc(&dlm->local_resources);
-        else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-                atomic_inc(&dlm->unknown_resources);
-        else
-                atomic_inc(&dlm->remote_resources);
-        res->owner = owner;
-}
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-                              struct dlm_lock_resource *res, u8 owner)
-{
-        assert_spin_locked(&res->spinlock);
-        if (owner == res->owner)
-                return;
-        if (res->owner == dlm->node_num)
-                atomic_dec(&dlm->local_resources);
-        else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-                atomic_dec(&dlm->unknown_resources);
-        else
-                atomic_dec(&dlm->remote_resources);
-        dlm_set_lockres_owner(dlm, res, owner);
-}
 static void dlm_lockres_release(struct kref *kref)
 {
        struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
        }
        spin_unlock(&dlm->track_lock);
+        atomic_dec(&dlm->res_cur_count);
        dlm_put(dlm);
        if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        kref_init(&res->refs);
+        atomic_inc(&dlm->res_tot_count);
+        atomic_inc(&dlm->res_cur_count);
        /* just for consistency */
        spin_lock(&res->spinlock);
        dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
                alloc_mle = NULL;
                dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
                set_bit(dlm->node_num, mle->maybe_map);
-                list_add(&mle->list, &dlm->master_list);
+                __dlm_insert_mle(dlm, mle);
                /* still holding the dlm spinlock, check the recovery map
                 * to see if there are any nodes that still need to be 
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
                                                     res->lockname.len,
                                                     res->lockname.name);
                                                mle->type = DLM_MLE_MASTER;
-                                                mle->u.res = res;
+                                                mle->mleres = res;
                                        }
                                }
                        }
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
        BUG_ON(mle->type == DLM_MLE_MIGRATION);
-        if (mle->type != DLM_MLE_MASTER) {
+        request.namelen = (u8)mle->mnamelen;
-                request.namelen = mle->u.name.len;
+        memcpy(request.name, mle->mname, request.namelen);
-                memcpy(request.name, mle->u.name.name, request.namelen);
-        } else {
-                request.namelen = mle->u.res->lockname.len;
-                memcpy(request.name, mle->u.res->lockname.name,
-                        request.namelen);
-        }
 again:
        ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
                // "add the block.\n");
                dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
                set_bit(request->node_idx, mle->maybe_map);
-                list_add(&mle->list, &dlm->master_list);
+                __dlm_insert_mle(dlm, mle);
                response = DLM_MASTER_RESP_NO;
        } else {
                // mlog(0, "mle was found\n");
@@ -1967,7 +1949,7 @@ ok:
                             assert->node_idx, rr, extra_ref, mle->inuse);
                        dlm_print_one_mle(mle);
                }
-                list_del_init(&mle->list);
+                __dlm_unlink_mle(dlm, mle);
                __dlm_mle_detach_hb_events(dlm, mle);
                __dlm_put_mle(mle);
                if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
                        tmp->master = master;
                        atomic_set(&tmp->woken, 1);
                        wake_up(&tmp->wq);
-                        /* remove it from the list so that only one
+                        /* remove it so that only one mle will be found */
-                         * mle will be found */
+                        __dlm_unlink_mle(dlm, tmp);
-                        list_del_init(&tmp->list);
-                        /* this was obviously WRONG.  mle is uninited here.  should be tmp. */
                        __dlm_mle_detach_hb_events(dlm, tmp);
                        ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
                        mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
        mle->master = master;
        /* do this for consistency with other mle types */
        set_bit(new_master, mle->maybe_map);
-        list_add(&mle->list, &dlm->master_list);
+        __dlm_insert_mle(dlm, mle);
        return ret;
 }
+/*
-void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+ * Sets the owner of the lockres, associated to the mle, to UNKNOWN
+ */
+static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
+                                        struct dlm_master_list_entry *mle)
 {
-        struct dlm_master_list_entry *mle, *next;
        struct dlm_lock_resource *res;
-        unsigned int hash;
-        mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+        /* Find the lockres associated to the mle and set its owner to UNK */
-top:
+        res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
-        assert_spin_locked(&dlm->spinlock);
+                                   mle->mnamehash);
+        if (res) {
+                spin_unlock(&dlm->master_lock);
-        /* clean the master list */
+                /* move lockres onto recovery list */
-        spin_lock(&dlm->master_lock);
+                spin_lock(&res->spinlock);
-        list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
+                dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
-                BUG_ON(mle->type != DLM_MLE_BLOCK &&
+                dlm_move_lockres_to_recovery_list(dlm, res);
-                       mle->type != DLM_MLE_MASTER &&
+                spin_unlock(&res->spinlock);
-                       mle->type != DLM_MLE_MIGRATION);
+                dlm_lockres_put(res);
-                /* MASTER mles are initiated locally.  the waiting
-                 * process will notice the node map change
-                 * shortly.  let that happen as normal. */
-                if (mle->type == DLM_MLE_MASTER)
-                        continue;
+                /* about to get rid of mle, detach from heartbeat */
+                __dlm_mle_detach_hb_events(dlm, mle);
-                /* BLOCK mles are initiated by other nodes.
+                /* dump the mle */
-                 * need to clean up if the dead node would have
+                spin_lock(&dlm->master_lock);
-                 * been the master. */
+                __dlm_put_mle(mle);
-                if (mle->type == DLM_MLE_BLOCK) {
+                spin_unlock(&dlm->master_lock);
-                        int bit;
+        }
-                        spin_lock(&mle->spinlock);
+        return res;
-                        bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+}
-                        if (bit != dead_node) {
-                                mlog(0, "mle found, but dead node %u would "
-                                     "not have been master\n", dead_node);
-                                spin_unlock(&mle->spinlock);
-                        } else {
-                                /* must drop the refcount by one since the
-                                 * assert_master will never arrive.  this
-                                 * may result in the mle being unlinked and
-                                 * freed, but there may still be a process
-                                 * waiting in the dlmlock path which is fine. */
-                                mlog(0, "node %u was expected master\n",
-                                     dead_node);
-                                atomic_set(&mle->woken, 1);
-                                spin_unlock(&mle->spinlock);
-                                wake_up(&mle->wq);
-                                /* do not need events any longer, so detach 
-                                 * from heartbeat */
-                                __dlm_mle_detach_hb_events(dlm, mle);
-                                __dlm_put_mle(mle);
-                        }
-                        continue;
-                }
-                /* everything else is a MIGRATION mle */
+static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
+                                    struct dlm_master_list_entry *mle)
-                /* the rule for MIGRATION mles is that the master
+{
-                 * becomes UNKNOWN if *either* the original or
+        __dlm_mle_detach_hb_events(dlm, mle);
-                 * the new master dies.  all UNKNOWN lockreses
-                 * are sent to whichever node becomes the recovery
-                 * master.  the new master is responsible for
-                 * determining if there is still a master for
-                 * this lockres, or if he needs to take over
-                 * mastery.  either way, this node should expect
-                 * another message to resolve this. */
-                if (mle->master != dead_node &&
-                    mle->new_master != dead_node)
-                        continue;
-                /* if we have reached this point, this mle needs to
+        spin_lock(&mle->spinlock);
-                 * be removed from the list and freed. */
+        __dlm_unlink_mle(dlm, mle);
+        atomic_set(&mle->woken, 1);
+        spin_unlock(&mle->spinlock);
-                /* remove from the list early.  NOTE: unlinking
+        wake_up(&mle->wq);
-                 * list_head while in list_for_each_safe */
+}
-                __dlm_mle_detach_hb_events(dlm, mle);
-                spin_lock(&mle->spinlock);
+static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
-                list_del_init(&mle->list);
+                                struct dlm_master_list_entry *mle, u8 dead_node)
+{
+        int bit;
+        BUG_ON(mle->type != DLM_MLE_BLOCK);
+        spin_lock(&mle->spinlock);
+        bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+        if (bit != dead_node) {
+                mlog(0, "mle found, but dead node %u would not have been "
+                     "master\n", dead_node);
+                spin_unlock(&mle->spinlock);
+        } else {
+                /* Must drop the refcount by one since the assert_master will
+                 * never arrive. This may result in the mle being unlinked and
+                 * freed, but there may still be a process waiting in the
+                 * dlmlock path which is fine. */
+                mlog(0, "node %u was expected master\n", dead_node);
                atomic_set(&mle->woken, 1);
                spin_unlock(&mle->spinlock);
                wake_up(&mle->wq);
-                mlog(0, "%s: node %u died during migration from "
+                /* Do not need events any longer, so detach from heartbeat */
-                     "%u to %u!\n", dlm->name, dead_node,
+                __dlm_mle_detach_hb_events(dlm, mle);
-                     mle->master, mle->new_master);
+                __dlm_put_mle(mle);
-                /* if there is a lockres associated with this
+        }
-                 * mle, find it and set its owner to UNKNOWN */
+}
-                hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
-                res = __dlm_lookup_lockres(dlm, mle->u.name.name,
-                                           mle->u.name.len, hash);
-                if (res) {
-                        /* unfortunately if we hit this rare case, our
-                         * lock ordering is messed.  we need to drop
-                         * the master lock so that we can take the
-                         * lockres lock, meaning that we will have to
-                         * restart from the head of list. */
-                        spin_unlock(&dlm->master_lock);
-                        /* move lockres onto recovery list */
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
-                        spin_lock(&res->spinlock);
+{
-                        dlm_set_lockres_owner(dlm, res,
+        struct dlm_master_list_entry *mle;
-                                        DLM_LOCK_RES_OWNER_UNKNOWN);
+        struct dlm_lock_resource *res;
-                        dlm_move_lockres_to_recovery_list(dlm, res);
+        struct hlist_head *bucket;
-                        spin_unlock(&res->spinlock);
+        struct hlist_node *list;
-                        dlm_lockres_put(res);
+        unsigned int i;
-                        /* about to get rid of mle, detach from heartbeat */
+        mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
-                        __dlm_mle_detach_hb_events(dlm, mle);
+top:
+        assert_spin_locked(&dlm->spinlock);
-                        /* dump the mle */
+        /* clean the master list */
-                        spin_lock(&dlm->master_lock);
+        spin_lock(&dlm->master_lock);
-                        __dlm_put_mle(mle);
+        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-                        spin_unlock(&dlm->master_lock);
+                bucket = dlm_master_hash(dlm, i);
+                hlist_for_each(list, bucket) {
+                        mle = hlist_entry(list, struct dlm_master_list_entry,
+                                          master_hash_node);
+                        BUG_ON(mle->type != DLM_MLE_BLOCK &&
+                               mle->type != DLM_MLE_MASTER &&
+                               mle->type != DLM_MLE_MIGRATION);
+                        /* MASTER mles are initiated locally. The waiting
+                         * process will notice the node map change shortly.
+                         * Let that happen as normal. */
+                        if (mle->type == DLM_MLE_MASTER)
+                                continue;
+                        /* BLOCK mles are initiated by other nodes. Need to
+                         * clean up if the dead node would have been the
+                         * master. */
+                        if (mle->type == DLM_MLE_BLOCK) {
+                                dlm_clean_block_mle(dlm, mle, dead_node);
+                                continue;
+                        }
-                        /* restart */
+                        /* Everything else is a MIGRATION mle */
-                        goto top;
-                }
+                        /* The rule for MIGRATION mles is that the master
+                         * becomes UNKNOWN if *either* the original or the new
+                         * master dies. All UNKNOWN lockres' are sent to
+                         * whichever node becomes the recovery master. The new
+                         * master is responsible for determining if there is
+                         * still a master for this lockres, or if he needs to
+                         * take over mastery. Either way, this node should
+                         * expect another message to resolve this. */
+                        if (mle->master != dead_node &&
+                            mle->new_master != dead_node)
+                                continue;
+                        /* If we have reached this point, this mle needs to be
+                         * removed from the list and freed. */
+                        dlm_clean_migration_mle(dlm, mle);
+                        mlog(0, "%s: node %u died during migration from "
+                             "%u to %u!\n", dlm->name, dead_node, mle->master,
+                             mle->new_master);
+                        /* If we find a lockres associated with the mle, we've
+                         * hit this rare case that messes up our lock ordering.
+                         * If so, we need to drop the master lock so that we can
+                         * take the lockres lock, meaning that we will have to
+                         * restart from the head of list. */
+                        res = dlm_reset_mleres_owner(dlm, mle);
+                        if (res)
+                                /* restart */
+                                goto top;
-                /* this may be the last reference */
+                        /* This may be the last reference */
-                __dlm_put_mle(mle);
+                        __dlm_put_mle(mle);
+                }
        }
        spin_unlock(&dlm->master_lock);
 }
 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                         u8 old_master)
 {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d490b66ad9d7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
        spin_lock(&res->spinlock);
        if (!__dlm_lockres_unused(res)) {
-                spin_unlock(&res->spinlock);
                mlog(0, "%s:%.*s: tried to purge but not unused\n",
                     dlm->name, res->lockname.len, res->lockname.name);
-                return -ENOTEMPTY;
+                __dlm_print_one_lock_resource(res);
+                spin_unlock(&res->spinlock);
+                BUG();
        }
+        if (res->state & DLM_LOCK_RES_MIGRATING) {
+                mlog(0, "%s:%.*s: Delay dropref as this lockres is "
+                     "being remastered\n", dlm->name, res->lockname.len,
+                     res->lockname.name);
+                /* Re-add the lockres to the end of the purge list */
+                if (!list_empty(&res->purge)) {
+                        list_del_init(&res->purge);
+                        list_add_tail(&res->purge, &dlm->purge_list);
+                }
+                spin_unlock(&res->spinlock);
+                return 0;
+        }
        master = (res->owner == dlm->node_num);
        if (!master)
                res->state |= DLM_LOCK_RES_DROPPING_REF;
        spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7219a86d34cc..e15fc7d50827 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
        .flags          = 0,
 };
+static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+        .flags          = 0,
+};
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
        .get_osb        = ocfs2_get_dentry_osb,
        .post_unlock    = ocfs2_dentry_post_unlock,
@@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
                                   &ocfs2_rename_lops, osb);
 }
+static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
+                                         struct ocfs2_super *osb)
+{
+        /* nfs_sync lockres doesn't come from a slab so we call init
+         * once on it manually.  */
+        ocfs2_lock_res_init_once(res);
+        ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
+        ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
+                                   &ocfs2_nfs_sync_lops, osb);
+}
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
                              struct ocfs2_file_private *fp)
 {
@@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
                ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
 }
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
+{
+        int status;
+        struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+        if (ocfs2_is_hard_readonly(osb))
+                return -EROFS;
+        if (ocfs2_mount_local(osb))
+                return 0;
+        status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
+                                    0, 0);
+        if (status < 0)
+                mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+        return status;
+}
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
+{
+        struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+        if (!ocfs2_mount_local(osb))
+                ocfs2_cluster_unlock(osb, lockres,
+                                     ex ? LKM_EXMODE : LKM_PRMODE);
+}
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
        int ret;
@@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 local:
        ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
        ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+        ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
        osb->cconn = conn;
@@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
        ocfs2_lock_res_free(&osb->osb_super_lockres);
        ocfs2_lock_res_free(&osb->osb_rename_lockres);
+        ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
        ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
        osb->cconn = NULL;
@@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
 {
        ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
        ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+        ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
 }
 int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3f8d9986b8e0..e1fd5721cd7f 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
                        int ex);
 int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 2f27b332d8b3..de3da8eb558c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -31,6 +31,7 @@
 #include "ocfs2.h"
+#include "alloc.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "dcache.h"
@@ -38,6 +39,7 @@
 #include "inode.h"
 #include "buffer_head_io.h"
+#include "suballoc.h"
 struct ocfs2_inode_handle
 {
@@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
                struct ocfs2_inode_handle *handle)
 {
        struct inode *inode;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        u64 blkno = handle->ih_blkno;
+        int status, set;
        struct dentry *result;
        mlog_entry("(0x%p, 0x%p)\n", sb, handle);
-        if (handle->ih_blkno == 0) {
+        if (blkno == 0) {
-                mlog_errno(-ESTALE);
+                mlog(0, "nfs wants inode with blkno: 0\n");
-                return ERR_PTR(-ESTALE);
+                result = ERR_PTR(-ESTALE);
+                goto bail;
+        }
+        inode = ocfs2_ilookup(sb, blkno);
+        /*
+         * If the inode exists in memory, we only need to check it's
+         * generation number
+         */
+        if (inode)
+                goto check_gen;
+        /*
+         * This will synchronize us against ocfs2_delete_inode() on
+         * all nodes
+         */
+        status = ocfs2_nfs_sync_lock(osb, 1);
+        if (status < 0) {
+                mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+                goto check_err;
+        }
+        status = ocfs2_test_inode_bit(osb, blkno, &set);
+        if (status < 0) {
+                if (status == -EINVAL) {
+                        /*
+                         * The blkno NFS gave us doesn't even show up
+                         * as an inode, we return -ESTALE to be
+                         * nice
+                         */
+                        mlog(0, "test inode bit failed %d\n", status);
+                        status = -ESTALE;
+                } else {
+                        mlog(ML_ERROR, "test inode bit failed %d\n", status);
+                }
+                goto unlock_nfs_sync;
+        }
+        /* If the inode allocator bit is clear, this inode must be stale */
+        if (!set) {
+                mlog(0, "inode %llu suballoc bit is clear\n", blkno);
+                status = -ESTALE;
+                goto unlock_nfs_sync;
        }
-        inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
+        inode = ocfs2_iget(osb, blkno, 0, 0);
-        if (IS_ERR(inode))
+unlock_nfs_sync:
-                return (void *)inode;
+        ocfs2_nfs_sync_unlock(osb, 1);
+check_err:
+        if (status < 0) {
+                if (status == -ESTALE) {
+                        mlog(0, "stale inode ino: %llu generation: %u\n",
+                             blkno, handle->ih_generation);
+                }
+                result = ERR_PTR(status);
+                goto bail;
+        }
+        if (IS_ERR(inode)) {
+                mlog_errno(PTR_ERR(inode));
+                result = (void *)inode;
+                goto bail;
+        }
+check_gen:
        if (handle->ih_generation != inode->i_generation) {
                iput(inode);
-                return ERR_PTR(-ESTALE);
+                mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
+                     handle->ih_generation);
+                result = ERR_PTR(-ESTALE);
+                goto bail;
        }
        result = d_obtain_alias(inode);
        if (!IS_ERR(result))
                result->d_op = &ocfs2_dentry_ops;
+        else
+                mlog_errno(PTR_ERR(result));
+bail:
        mlog_exit_ptr(result);
        return result;
 }
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 229e707bc050..10e1fa87396a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "dir.h"
 #include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
                oi->ip_attr |= OCFS2_DIRSYNC_FL;
 }
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
+{
+        struct ocfs2_find_inode_args args;
+        args.fi_blkno = blkno;
+        args.fi_flags = 0;
+        args.fi_ino = ino_from_blkno(sb, blkno);
+        args.fi_sysfile_type = 0;
+        return ilookup5(sb, blkno, ocfs2_find_actor, &args);
+}
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
                         int sysfile_type)
 {
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
                     (unsigned long long)le64_to_cpu(fe->i_blkno));
-        inode->i_nlink = le16_to_cpu(fe->i_links_count);
+        inode->i_nlink = ocfs2_read_links_count(fe);
        if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
        ocfs2_set_inode_flags(inode);
+        OCFS2_I(inode)->ip_last_used_slot = 0;
+        OCFS2_I(inode)->ip_last_used_group = 0;
        mlog_exit_void();
 }
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
        handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
-                                        ocfs2_quota_trans_credits(inode->i_sb));
+                                   ocfs2_quota_trans_credits(inode->i_sb));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
                goto bail_unlock_dir;
        }
+        /* Remove any dir index tree */
+        if (S_ISDIR(inode->i_mode)) {
+                status = ocfs2_dx_dir_truncate(inode, di_bh);
+                if (status) {
+                        mlog_errno(status);
+                        goto bail_unlock_dir;
+                }
+        }
        /*Free extended attribute resources associated with this inode.*/
        status = ocfs2_xattr_remove(inode, di_bh);
        if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
                goto bail;
        }
+        /*
+         * Synchronize us against ocfs2_get_dentry. We take this in
+         * shared mode so that all nodes can still concurrently
+         * process deletes.
+         */
+        status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
+        if (status < 0) {
+                mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
+                ocfs2_cleanup_delete_inode(inode, 0);
+                goto bail_unblock;
+        }
        /* Lock down the inode. This gives us an up to date view of
         * it's metadata (for verification), and allows us to
         * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
                if (status != -ENOENT)
                        mlog_errno(status);
                ocfs2_cleanup_delete_inode(inode, 0);
-                goto bail_unblock;
+                goto bail_unlock_nfs_sync;
        }
        /* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
 bail_unlock_inode:
        ocfs2_inode_unlock(inode, 1);
        brelse(di_bh);
+bail_unlock_nfs_sync:
+        ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
 bail_unblock:
        status = sigprocmask(SIG_SETMASK, &oldset, NULL);
        if (status < 0)
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
        spin_unlock(&OCFS2_I(inode)->ip_lock);
        fe->i_size = cpu_to_le64(i_size_read(inode));
-        fe->i_links_count = cpu_to_le16(inode->i_nlink);
+        ocfs2_set_links_count(fe, inode->i_nlink);
        fe->i_uid = cpu_to_le32(inode->i_uid);
        fe->i_gid = cpu_to_le32(inode->i_gid);
        fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
        OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
        ocfs2_set_inode_flags(inode);
        i_size_write(inode, le64_to_cpu(fe->i_size));
-        inode->i_nlink = le16_to_cpu(fe->i_links_count);
+        inode->i_nlink = ocfs2_read_links_count(fe);
        inode->i_uid = le32_to_cpu(fe->i_uid);
        inode->i_gid = le32_to_cpu(fe->i_gid);
        inode->i_mode = le16_to_cpu(fe->i_mode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index eb3c302b38d3..ea71525aad41 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
        struct inode                    vfs_inode;
        struct jbd2_inode               ip_jinode;
+        /* Only valid if the inode is a directory. */
+        u32                             ip_last_used_slot;
+        u64                             ip_last_used_group;
 };
 /*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_SYSFILE           0x1
 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY   0x2
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
                         int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 57d7d25a2b9a..a20a0f1e37fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                                 int slot);
 static int ocfs2_commit_thread(void *arg);
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+                                            int slot_num,
+                                            struct ocfs2_dinode *la_dinode,
+                                            struct ocfs2_dinode *tl_dinode,
+                                            struct ocfs2_quota_recovery *qrec);
 static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
 {
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
        return __ocfs2_wait_on_mount(osb, 1);
 }
 /*
- * The recovery_list is a simple linked list of node numbers to recover.
+ * This replay_map is to track online/offline slots, so we could recover
- * It is protected by the recovery_lock.
+ * offline slots during recovery and mount
 */
-struct ocfs2_recovery_map {
+enum ocfs2_replay_state {
-        unsigned int rm_used;
+        REPLAY_UNNEEDED = 0,    /* Replay is not needed, so ignore this map */
-        unsigned int *rm_entries;
+        REPLAY_NEEDED,          /* Replay slots marked in rm_replay_slots */
+        REPLAY_DONE             /* Replay was already queued */
 };
+/*
+ * Map of slots whose recovery completion should be queued.
+ *
+ * rm_slots        - number of entries in rm_replay_slots
+ * rm_state        - whether the map should be acted upon (see
+ *                   enum ocfs2_replay_state)
+ * rm_replay_slots - one byte per slot; non-zero marks a slot that was
+ *                   found offline when the map was computed
+ */
+struct ocfs2_replay_map {
+        unsigned int rm_slots;
+        enum ocfs2_replay_state rm_state;
+        /* C99 flexible array member; storage is sized at allocation time. */
+        unsigned char rm_replay_slots[];
+};
+/*
+ * Move the replay map to @state.  Does nothing when no map has been
+ * allocated or when the map has already reached REPLAY_DONE.
+ */
+void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+{
+        struct ocfs2_replay_map *map = osb->replay_map;
+        /* Once the replay has been queued the state is final. */
+        if (map && map->rm_state != REPLAY_DONE)
+                map->rm_state = state;
+}
+/*
+ * Allocate osb->replay_map and flag every slot for which
+ * ocfs2_slot_to_node_num_locked() reports -ENOENT (treated as offline).
+ *
+ * Returns 0 on success or when a map already exists, -ENOMEM if the
+ * map cannot be allocated.
+ */
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
+{
+        struct ocfs2_replay_map *replay_map;
+        unsigned int i;
+        int node_num;
+        /* If replay map is already set, we don't do it again */
+        if (osb->replay_map)
+                return 0;
+        /* Header plus one byte per slot; kzalloc leaves every flag clear. */
+        replay_map = kzalloc(sizeof(*replay_map) + osb->max_slots,
+                             GFP_KERNEL);
+        if (!replay_map) {
+                mlog_errno(-ENOMEM);
+                return -ENOMEM;
+        }
+        spin_lock(&osb->osb_lock);
+        replay_map->rm_slots = osb->max_slots;
+        replay_map->rm_state = REPLAY_UNNEEDED;
+        /* set rm_replay_slots for offline slot(s) */
+        for (i = 0; i < replay_map->rm_slots; i++) {
+                if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
+                        replay_map->rm_replay_slots[i] = 1;
+        }
+        /* Publish the fully initialised map while still holding osb_lock. */
+        osb->replay_map = replay_map;
+        spin_unlock(&osb->osb_lock);
+        return 0;
+}
+/*
+ * Queue recovery completion for every slot flagged in the replay map,
+ * then mark the map REPLAY_DONE.  Only acts when a map exists and its
+ * state is REPLAY_NEEDED.
+ */
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+{
+        struct ocfs2_replay_map *map = osb->replay_map;
+        int slot;
+        if (!map || map->rm_state != REPLAY_NEEDED)
+                return;
+        for (slot = 0; slot < map->rm_slots; slot++) {
+                /* Unflagged slots need no replay. */
+                if (!map->rm_replay_slots[slot])
+                        continue;
+                ocfs2_queue_recovery_completion(osb->journal, slot, NULL,
+                                                NULL, NULL);
+        }
+        map->rm_state = REPLAY_DONE;
+}
+/* Release the replay map, if one exists, and clear osb's pointer to it. */
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+{
+        if (osb->replay_map) {
+                kfree(osb->replay_map);
+                osb->replay_map = NULL;
+        }
+}
 int ocfs2_recovery_init(struct ocfs2_super *osb)
 {
        struct ocfs2_recovery_map *rm;
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
        },
 };
+/*
+ * Journal triggers for struct ocfs2_dx_root_block buffers; ot_offset
+ * is the position of the block's dr_check field.
+ */
+static struct ocfs2_triggers dr_triggers = {
+        .ot_offset      = offsetof(struct ocfs2_dx_root_block, dr_check),
+        .ot_triggers    = {
+                .t_abort  = ocfs2_abort_trigger,
+                .t_commit = ocfs2_commit_trigger,
+        },
+};
+/*
+ * Journal triggers for struct ocfs2_dx_leaf buffers; ot_offset is the
+ * position of the leaf's dl_check field.
+ */
+static struct ocfs2_triggers dl_triggers = {
+        .ot_offset      = offsetof(struct ocfs2_dx_leaf, dl_check),
+        .ot_triggers    = {
+                .t_abort  = ocfs2_abort_trigger,
+                .t_commit = ocfs2_commit_trigger,
+        },
+};
 static int __ocfs2_journal_access(handle_t *handle,
                                  struct inode *inode,
                                  struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
                                      type);
 }
+/*
+ * Journal access for an ocfs2_dx_root_block buffer: forwards to
+ * __ocfs2_journal_access() with dr_triggers and returns its status.
+ */
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type)
+{
+        struct ocfs2_triggers *triggers = &dr_triggers;
+        return __ocfs2_journal_access(handle, inode, bh, triggers, type);
+}
+/*
+ * Journal access for an ocfs2_dx_leaf buffer: forwards to
+ * __ocfs2_journal_access() with dl_triggers and returns its status.
+ */
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type)
+{
+        struct ocfs2_triggers *triggers = &dl_triggers;
+        return __ocfs2_journal_access(handle, inode, bh, triggers, type);
+}
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
                         struct buffer_head *bh, int type)
 {
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 }
 /* Called by the mount code to queue recovery the last part of
- * recovery for it's own slot. */
+ * recovery for its own and offline slot(s). */
 void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 {
        struct ocfs2_journal *journal = osb->journal;
-        if (osb->dirty) {
+        /* No need to queue up our truncate_log as regular cleanup will catch
-                /* No need to queue up our truncate_log as regular
+         * that */
-                 * cleanup will catch that. */
+        ocfs2_queue_recovery_completion(journal, osb->slot_num,
-                ocfs2_queue_recovery_completion(journal,
+                                        osb->local_alloc_copy, NULL, NULL);
-                                                osb->slot_num,
+        ocfs2_schedule_truncate_log_flush(osb, 0);
-                                                osb->local_alloc_copy,
-                                                NULL,
-                                                NULL);
-                ocfs2_schedule_truncate_log_flush(osb, 0);
-                osb->local_alloc_copy = NULL;
+        osb->local_alloc_copy = NULL;
-                osb->dirty = 0;
+        osb->dirty = 0;
-        }
+        /* queue to recover orphan slots for all offline slots */
+        ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+        ocfs2_queue_replay_slots(osb);
+        ocfs2_free_replay_slots(osb);
 }
 void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
                goto bail;
        }
+        status = ocfs2_compute_replay_slots(osb);
+        if (status < 0)
+                mlog_errno(status);
+        /* queue recovery for our own slot */
+        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+                                        NULL, NULL);
        spin_lock(&osb->osb_lock);
        while (rm->rm_used) {
                /* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
        ocfs2_super_unlock(osb, 1);
-        /* We always run recovery on our own orphan dir - the dead
+        /* queue recovery for offline slots */
-         * node(s) may have disallowd a previos inode delete. Re-processing
+        ocfs2_queue_replay_slots(osb);
-         * is therefore required. */
-        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-                                        NULL, NULL);
 bail:
        mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
                goto restart;
        }
+        ocfs2_free_replay_slots(osb);
        osb->recovery_thread_task = NULL;
        mb(); /* sync with ocfs2_recovery_thread_running */
        wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
                goto done;
        }
+        /* we need to run complete recovery for offline orphan slots */
+        ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
        mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
             node_num, slot_num,
             MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 172850a9a12a..619dd7f6c053 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
 struct ocfs2_super;
 struct ocfs2_dinode;
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+struct ocfs2_recovery_map {
+        unsigned int rm_used;
+        unsigned int *rm_entries;
+};
 struct ocfs2_journal {
        enum ocfs2_journal_state   j_state;    /* Journals current state   */
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
 int ocfs2_recovery_init(struct ocfs2_super *osb);
 void ocfs2_recovery_exit(struct ocfs2_super *osb);
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
 /*
 *  Journal Control:
 *  Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 /* dirblock */
 int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
                            struct buffer_head *bh, int type);
+/* ocfs2_dx_root_block */
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* ocfs2_dx_leaf */
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
 /* Anything that has no ecc */
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
                         struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
 }
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) */
+ * bitmap block for the new bit) dx_root update for free list */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+/*
+ * Credits for adding a directory index: 1 block for the index, 2 allocs
+ * (data, metadata) and 1 cluster's worth of blocks for the initial
+ * extent.
+ */
+static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
+{
+        int credits = 1;                                /* index block */
+        credits += 2 * OCFS2_SUBALLOC_ALLOC;            /* data + metadata */
+        credits += ocfs2_clusters_to_blocks(sb, 1);     /* initial extent */
+        return credits;
+}
-/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
+/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
- * group descriptor + mkdir/symlink blocks + quota update */
+ * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
-static inline int ocfs2_mknod_credits(struct super_block *sb)
+ * blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
+                                      int xattr_credits)
 {
-        return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+        int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
+        if (is_dir)
+                dir_credits += ocfs2_add_dir_index_credits(sb);
+        return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
               ocfs2_quota_trans_credits(sb);
 }
@@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir */
+ * update on dir + index leaf + dx root update for free list */
 static inline int ocfs2_link_credits(struct super_block *sb)
 {
-        return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+        return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
               ocfs2_quota_trans_credits(sb);
 }
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
- * dir inode link */
+ * dir inode link + dir inode index leaf + dir index root */
 static inline int ocfs2_unlink_credits(struct super_block *sb)
 {
        /* The quota update from ocfs2_link_credits is unused here... */
-        return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+        return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
 }
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
- * inode alloc group descriptor */
+ * inode alloc group descriptor + orphan dir index leaf */
-#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
 /* dinode update, old dir dinode update, new dir dinode update, old
 * dir dir entry, new dir dir entry, dir entry update for renaming
- * directory + target unlink */
+ * directory + target unlink + 3 x dir index leaves */
 static inline int ocfs2_rename_credits(struct super_block *sb)
 {
-        return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+        return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
 }
 /* global bitmap dinode, group desc., relinked group,
@@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
                                          + OCFS2_INODE_UPDATE_CREDITS \
                                          + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
+/* inode update, removal of dx root block from allocator */
+#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS +      \
+                                      OCFS2_SUBALLOC_FREE)
+/*
+ * Sum of: 1 + OCFS2_SUBALLOC_ALLOC, one cluster's worth of blocks, and
+ * the quota transaction credits for @sb.
+ */
+static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
+{
+        return 1 + OCFS2_SUBALLOC_ALLOC +
+                ocfs2_clusters_to_blocks(sb, 1) +
+                ocfs2_quota_trans_credits(sb);
+}
 /*
 * Please note that the caller must make sure that root_el is the root
 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-        int blocks = ocfs2_mknod_credits(sb);
+        int blocks = ocfs2_mknod_credits(sb, 0, 0);
        /* links can be longer than one block so we may update many
         * within our single allocated extent. */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec70cdbe77fc..bac7e6abaf47 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
-#include <linux/debugfs.h>
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
                                          struct inode *local_alloc_inode);
-#ifdef CONFIG_OCFS2_FS_STATS
-static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
-{
-        file->private_data = inode->i_private;
-        return 0;
-}
-#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
-#define LA_DEBUG_VER    1
-static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
-                                   size_t count, loff_t *ppos)
-{
-        static DEFINE_MUTEX(la_debug_mutex);
-        struct ocfs2_super *osb = file->private_data;
-        int written, ret;
-        char *buf = osb->local_alloc_debug_buf;
-        mutex_lock(&la_debug_mutex);
-        memset(buf, 0, LA_DEBUG_BUF_SZ);
-        written = snprintf(buf, LA_DEBUG_BUF_SZ,
-                           "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
-                           LA_DEBUG_VER,
-                           (unsigned long long)osb->la_last_gd,
-                           osb->local_alloc_default_bits,
-                           osb->local_alloc_bits, osb->local_alloc_state);
-        ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
-        mutex_unlock(&la_debug_mutex);
-        return ret;
-}
-static const struct file_operations ocfs2_la_debug_fops = {
-        .open =         ocfs2_la_debug_open,
-        .read =         ocfs2_la_debug_read,
-};
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-        osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
-        if (!osb->local_alloc_debug_buf)
-                return;
-        osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
-                                                     S_IFREG|S_IRUSR,
-                                                     osb->osb_debug_root,
-                                                     osb,
-                                                     &ocfs2_la_debug_fops);
-        if (!osb->local_alloc_debug) {
-                kfree(osb->local_alloc_debug_buf);
-                osb->local_alloc_debug_buf = NULL;
-        }
-}
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-        if (osb->local_alloc_debug)
-                debugfs_remove(osb->local_alloc_debug);
-        if (osb->local_alloc_debug_buf)
-                kfree(osb->local_alloc_debug_buf);
-        osb->local_alloc_debug_buf = NULL;
-        osb->local_alloc_debug = NULL;
-}
-#else   /* CONFIG_OCFS2_FS_STATS */
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-        return;
-}
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-        return;
-}
-#endif
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
        return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
        mlog_entry_void();
-        ocfs2_init_la_debug(osb);
        if (osb->local_alloc_bits == 0)
                goto bail;
@@ -299,9 +218,6 @@ bail:
        if (inode)
                iput(inode);
-        if (status < 0)
-                ocfs2_shutdown_la_debug(osb);
        mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
        mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
        cancel_delayed_work(&osb->la_enable_wq);
        flush_workqueue(ocfs2_wq);
-        ocfs2_shutdown_la_debug(osb);
        if (osb->local_alloc_state == OCFS2_LA_UNUSED)
                goto out;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713ea..b606496b72ec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
        return ret;
 }
-static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+        struct page *page = vmf->page;
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
        struct buffer_head *di_bh = NULL;
        sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
        ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
        if (ret2 < 0)
                mlog_errno(ret2);
+        if (ret)
+                ret = VM_FAULT_SIGBUS;
        return ret;
 }
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4b11762f249e..2220f93f668b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
                                    struct inode **ret_orphan_dir,
                                    struct inode *inode,
                                    char *name,
-                                    struct buffer_head **de_bh);
+                                    struct ocfs2_dir_lookup_result *lookup);
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
                            handle_t *handle,
                            struct inode *inode,
                            struct ocfs2_dinode *fe,
                            char *name,
-                            struct buffer_head *de_bh,
+                            struct ocfs2_dir_lookup_result *lookup,
                            struct inode *orphan_dir_inode);
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
        struct ocfs2_super *osb;
        struct ocfs2_dinode *dirfe;
        struct buffer_head *new_fe_bh = NULL;
-        struct buffer_head *de_bh = NULL;
        struct inode *inode = NULL;
        struct ocfs2_alloc_context *inode_ac = NULL;
        struct ocfs2_alloc_context *data_ac = NULL;
-        struct ocfs2_alloc_context *xattr_ac = NULL;
+        struct ocfs2_alloc_context *meta_ac = NULL;
        int want_clusters = 0;
+        int want_meta = 0;
        int xattr_credits = 0;
        struct ocfs2_security_xattr_info si = {
                .enable = 1,
        };
        int did_quota_inode = 0;
+        struct ocfs2_dir_lookup_result lookup = { NULL, };
        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
                   (unsigned long)dev, dentry->d_name.len,
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
                return status;
        }
-        if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+        if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
                status = -EMLINK;
                goto leave;
        }
        dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-        if (!dirfe->i_links_count) {
+        if (!ocfs2_read_links_count(dirfe)) {
                /* can't make a file in a deleted directory. */
                status = -ENOENT;
                goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
        /* get a spot inside the dir. */
        status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
                                              dentry->d_name.name,
-                                              dentry->d_name.len, &de_bh);
+                                              dentry->d_name.len, &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
        /* calculate meta data/clusters for setting security and acl xattr */
        status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
-                                        &si, &want_clusters,
+                                       &si, &want_clusters,
-                                        &xattr_credits, &xattr_ac);
+                                       &xattr_credits, &want_meta);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
        /* Reserve a cluster if creating an extent based directory. */
-        if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+        if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
                want_clusters += 1;
+                /* Dir indexing requires extra space as well */
+                if (ocfs2_supports_indexed_dirs(osb))
+                        want_meta++;
+        }
+        status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
+        if (status < 0) {
+                if (status != -ENOSPC)
+                        mlog_errno(status);
+                goto leave;
+        }
        status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
        if (status < 0) {
                if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
                goto leave;
        }
-        handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
+        handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
-                                   xattr_credits);
+                                                            S_ISDIR(mode),
+                                                            xattr_credits));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
        if (S_ISDIR(mode)) {
                status = ocfs2_fill_new_dir(osb, handle, dir, inode,
-                                            new_fe_bh, data_ac);
+                                            new_fe_bh, data_ac, meta_ac);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
                        mlog_errno(status);
                        goto leave;
                }
-                le16_add_cpu(&dirfe->i_links_count, 1);
+                ocfs2_add_links_count(dirfe, 1);
                status = ocfs2_journal_dirty(handle, parent_fe_bh);
                if (status < 0) {
                        mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
        }
        status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
-                                xattr_ac, data_ac);
+                                meta_ac, data_ac);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
        if (si.enable) {
                status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
-                                                 xattr_ac, data_ac);
+                                                 meta_ac, data_ac);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
        status = ocfs2_add_entry(handle, dentry, inode,
                                 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
-                                 de_bh);
+                                 &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -423,11 +437,12 @@ leave:
                mlog(0, "Disk is full\n");
        brelse(new_fe_bh);
-        brelse(de_bh);
        brelse(parent_fe_bh);
        kfree(si.name);
        kfree(si.value);
+        ocfs2_free_dir_lookup_result(&lookup);
        if ((status < 0) && inode) {
                clear_nlink(inode);
                iput(inode);
@@ -439,8 +454,8 @@ leave:
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
-        if (xattr_ac)
+        if (meta_ac)
-                ocfs2_free_alloc_context(xattr_ac);
+                ocfs2_free_alloc_context(meta_ac);
        mlog_exit(status);
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        struct ocfs2_extent_list *fel;
        u64 fe_blkno = 0;
        u16 suballoc_bit;
+        u16 feat;
        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
                   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        *new_fe_bh = NULL;
-        status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
+        status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
-                                       &fe_blkno);
+                                       inode_ac, &suballoc_bit, &fe_blkno);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        fe->i_mode = cpu_to_le16(inode->i_mode);
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
                fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-        fe->i_links_count = cpu_to_le16(inode->i_nlink);
+        ocfs2_set_links_count(fe, inode->i_nlink);
        fe->i_last_eb_blk = 0;
        strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        fe->i_dtime = 0;
        /*
-         * If supported, directories start with inline data.
+         * If supported, directories start with inline data. If inline
+         * isn't supported, but indexing is, we start them as indexed.
         */
+        feat = le16_to_cpu(fe->i_dyn_features);
        if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
-                u16 feat = le16_to_cpu(fe->i_dyn_features);
                fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
                fe->id2.i_data.id_count = cpu_to_le16(
@@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
        int err;
        struct buffer_head *fe_bh = NULL;
        struct buffer_head *parent_fe_bh = NULL;
-        struct buffer_head *de_bh = NULL;
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        struct ocfs2_dir_lookup_result lookup = { NULL, };
        mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
                   old_dentry->d_name.len, old_dentry->d_name.name,
@@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
        err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
                                           dentry->d_name.name,
-                                           dentry->d_name.len, &de_bh);
+                                           dentry->d_name.len, &lookup);
        if (err < 0) {
                mlog_errno(err);
                goto out;
@@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
        }
        fe = (struct ocfs2_dinode *) fe_bh->b_data;
-        if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+        if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
                err = -EMLINK;
                goto out_unlock_inode;
        }
@@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
        inc_nlink(inode);
        inode->i_ctime = CURRENT_TIME;
-        fe->i_links_count = cpu_to_le16(inode->i_nlink);
+        ocfs2_set_links_count(fe, inode->i_nlink);
        fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
        fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
        err = ocfs2_journal_dirty(handle, fe_bh);
        if (err < 0) {
-                le16_add_cpu(&fe->i_links_count, -1);
+                ocfs2_add_links_count(fe, -1);
                drop_nlink(inode);
                mlog_errno(err);
                goto out_commit;
@@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
        err = ocfs2_add_entry(handle, dentry, inode,
                              OCFS2_I(inode)->ip_blkno,
-                              parent_fe_bh, de_bh);
+                              parent_fe_bh, &lookup);
        if (err) {
-                le16_add_cpu(&fe->i_links_count, -1);
+                ocfs2_add_links_count(fe, -1);
                drop_nlink(inode);
                mlog_errno(err);
                goto out_commit;
@@ -714,10 +731,11 @@ out_unlock_inode:
 out:
        ocfs2_inode_unlock(dir, 1);
-        brelse(de_bh);
        brelse(fe_bh);
        brelse(parent_fe_bh);
+        ocfs2_free_dir_lookup_result(&lookup);
        mlog_exit(err);
        return err;
@@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
        struct buffer_head *fe_bh = NULL;
        struct buffer_head *parent_node_bh = NULL;
        handle_t *handle = NULL;
-        struct ocfs2_dir_entry *dirent = NULL;
-        struct buffer_head *dirent_bh = NULL;
        char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
-        struct buffer_head *orphan_entry_bh = NULL;
+        struct ocfs2_dir_lookup_result lookup = { NULL, };
+        struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
        mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
                   dentry->d_name.len, dentry->d_name.name);
@@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
        }
        status = ocfs2_find_files_on_disk(dentry->d_name.name,
-                                          dentry->d_name.len, &blkno,
+                                          dentry->d_name.len, &blkno, dir,
-                                          dir, &dirent_bh, &dirent);
+                                          &lookup);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
        child_locked = 1;
        if (S_ISDIR(inode->i_mode)) {
-                if (!ocfs2_empty_dir(inode)) {
+                if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
-                        status = -ENOTEMPTY;
-                        goto leave;
-                } else if (inode->i_nlink != 2) {
                        status = -ENOTEMPTY;
                        goto leave;
                }
@@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
        if (inode_is_unlinkable(inode)) {
                status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
-                                                  orphan_name,
+                                                  orphan_name, &orphan_insert);
-                                                  &orphan_entry_bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
        if (inode_is_unlinkable(inode)) {
                status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
-                                          orphan_entry_bh, orphan_dir);
+                                          &orphan_insert, orphan_dir);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
        }
        /* delete the name from the parent dir */
-        status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+        status = ocfs2_delete_entry(handle, dir, &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
        if (S_ISDIR(inode->i_mode))
                drop_nlink(inode);
        drop_nlink(inode);
-        fe->i_links_count = cpu_to_le16(inode->i_nlink);
+        ocfs2_set_links_count(fe, inode->i_nlink);
        status = ocfs2_journal_dirty(handle, fe_bh);
        if (status < 0) {
@@ -916,9 +929,10 @@ leave:
        }
        brelse(fe_bh);
-        brelse(dirent_bh);
        brelse(parent_node_bh);
-        brelse(orphan_entry_bh);
+        ocfs2_free_dir_lookup_result(&orphan_insert);
+        ocfs2_free_dir_lookup_result(&lookup);
        mlog_exit(status);
@@ -1004,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir,
                        struct inode *new_dir,
                        struct dentry *new_dentry)
 {
-        int status = 0, rename_lock = 0, parents_locked = 0;
+        int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
-        int old_child_locked = 0, new_child_locked = 0;
+        int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
        struct inode *old_inode = old_dentry->d_inode;
        struct inode *new_inode = new_dentry->d_inode;
        struct inode *orphan_dir = NULL;
@@ -1020,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir,
        handle_t *handle = NULL;
        struct buffer_head *old_dir_bh = NULL;
        struct buffer_head *new_dir_bh = NULL;
-        struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
-                *new_de = NULL;
-        struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
-        struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
-                                                    // this is the 1st dirent bh
        nlink_t old_dir_nlink = old_dir->i_nlink;
        struct ocfs2_dinode *old_di;
+        struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
+        struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
+        struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
+        struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+        struct ocfs2_dir_lookup_result target_insert = { NULL, };
        /* At some point it might be nice to break this function up a
         * bit. */
@@ -1108,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir,
        if (S_ISDIR(old_inode->i_mode)) {
                u64 old_inode_parent;
+                update_dot_dot = 1;
                status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
-                                                  old_inode, &old_inode_de_bh,
+                                                  old_inode,
-                                                  &old_inode_dot_dot_de);
+                                                  &old_inode_dot_dot_res);
                if (status) {
                        status = -EIO;
                        goto bail;
@@ -1122,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir,
                }
                if (!new_inode && new_dir != old_dir &&
-                    new_dir->i_nlink >= OCFS2_LINK_MAX) {
+                    new_dir->i_nlink >= ocfs2_link_max(osb)) {
                        status = -EMLINK;
                        goto bail;
                }
@@ -1151,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir,
         * to delete it */
        status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
                                          new_dentry->d_name.len,
-                                          &newfe_blkno, new_dir, &new_de_bh,
+                                          &newfe_blkno, new_dir,
-                                          &new_de);
+                                          &target_lookup_res);
        /* The only error we allow here is -ENOENT because the new
         * file not existing is perfectly valid. */
        if ((status < 0) && (status != -ENOENT)) {
@@ -1161,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir,
                mlog_errno(status);
                goto bail;
        }
+        if (status == 0)
+                target_exists = 1;
-        if (!new_de && new_inode) {
+        if (!target_exists && new_inode) {
                /*
                 * Target was unlinked by another node while we were
                 * waiting to get to ocfs2_rename(). There isn't
@@ -1175,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir,
        /* In case we need to overwrite an existing file, we blow it
         * away first */
-        if (new_de) {
+        if (target_exists) {
                /* VFS didn't think there existed an inode here, but
                 * someone else in the cluster must have raced our
                 * rename to create one. Today we error cleanly, in
@@ -1216,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir,
                newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
-                mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu "
+                mlog(0, "aha rename over existing... new_blkno=%llu "
-                     "newfebh=%p bhblocknr=%llu\n", new_de,
+                     "newfebh=%p bhblocknr=%llu\n",
                     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
                     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
@@ -1225,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir,
                        status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
                                                          new_inode,
                                                          orphan_name,
-                                                          &orphan_entry_bh);
+                                                          &orphan_insert);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
@@ -1243,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir,
                status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
                                                      new_dentry->d_name.name,
                                                      new_dentry->d_name.len,
-                                                      &insert_entry_bh);
+                                                      &target_insert);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1258,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir,
                goto bail;
        }
-        if (new_de) {
+        if (target_exists) {
                if (S_ISDIR(new_inode->i_mode)) {
-                        if (!ocfs2_empty_dir(new_inode) ||
+                        if (new_inode->i_nlink != 2 ||
-                            new_inode->i_nlink != 2) {
+                            !ocfs2_empty_dir(new_inode)) {
                                status = -ENOTEMPTY;
                                goto bail;
                        }
@@ -1274,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir,
                }
                if (S_ISDIR(new_inode->i_mode) ||
-                    (newfe->i_links_count == cpu_to_le16(1))){
+                    (ocfs2_read_links_count(newfe) == 1)) {
                        status = ocfs2_orphan_add(osb, handle, new_inode,
                                                  newfe, orphan_name,
-                                                  orphan_entry_bh, orphan_dir);
+                                                  &orphan_insert, orphan_dir);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
@@ -1285,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir,
                }
                /* change the dirent to point to the correct inode */
-                status = ocfs2_update_entry(new_dir, handle, new_de_bh,
+                status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
-                                            new_de, old_inode);
+                                            old_inode);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1294,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir,
                new_dir->i_version++;
                if (S_ISDIR(new_inode->i_mode))
-                        newfe->i_links_count = 0;
+                        ocfs2_set_links_count(newfe, 0);
                else
-                        le16_add_cpu(&newfe->i_links_count, -1);
+                        ocfs2_add_links_count(newfe, -1);
                status = ocfs2_journal_dirty(handle, newfe_bh);
                if (status < 0) {
@@ -1307,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir,
                /* if the name was not found in new_dir, add it now */
                status = ocfs2_add_entry(handle, new_dentry, old_inode,
                                         OCFS2_I(old_inode)->ip_blkno,
-                                         new_dir_bh, insert_entry_bh);
+                                         new_dir_bh, &target_insert);
        }
        old_inode->i_ctime = CURRENT_TIME;
@@ -1334,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir,
         * because the insert might have changed the type of directory
         * we're dealing with.
         */
-        old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
+        status = ocfs2_find_entry(old_dentry->d_name.name,
-                                     old_dentry->d_name.len,
+                                  old_dentry->d_name.len, old_dir,
-                                     old_dir, &old_de);
+                                  &old_entry_lookup);
-        if (!old_de_bh) {
+        if (status)
-                status = -EIO;
                goto bail;
-        }
-        status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+        status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1353,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir,
                new_inode->i_ctime = CURRENT_TIME;
        }
        old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
-        if (old_inode_de_bh) {
-                status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh,
+        if (update_dot_dot) {
-                                            old_inode_dot_dot_de, new_dir);
+                status = ocfs2_update_entry(old_inode, handle,
+                                            &old_inode_dot_dot_res, new_dir);
                old_dir->i_nlink--;
                if (new_inode) {
                        new_inode->i_nlink--;
@@ -1391,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir,
                } else {
                        struct ocfs2_dinode *fe;
                        status = ocfs2_journal_access_di(handle, old_dir,
-                                                         old_dir_bh,
+                                                      old_dir_bh,
-                                                         OCFS2_JOURNAL_ACCESS_WRITE);
+                                                      OCFS2_JOURNAL_ACCESS_WRITE);
                        fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
-                        fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
+                        ocfs2_set_links_count(fe, old_dir->i_nlink);
                        status = ocfs2_journal_dirty(handle, old_dir_bh);
                }
        }
        ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
        status = 0;
 bail:
@@ -1429,13 +1444,17 @@ bail:
        if (new_inode)
                iput(new_inode);
+        ocfs2_free_dir_lookup_result(&target_lookup_res);
+        ocfs2_free_dir_lookup_result(&old_entry_lookup);
+        ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
+        ocfs2_free_dir_lookup_result(&orphan_insert);
+        ocfs2_free_dir_lookup_result(&target_insert);
        brelse(newfe_bh);
        brelse(old_inode_bh);
        brelse(old_dir_bh);
        brelse(new_dir_bh);
-        brelse(new_de_bh);
-        brelse(old_de_bh);
-        brelse(old_inode_de_bh);
        brelse(orphan_entry_bh);
        brelse(insert_entry_bh);
@@ -1558,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir,
        struct inode *inode = NULL;
        struct super_block *sb;
        struct buffer_head *new_fe_bh = NULL;
-        struct buffer_head *de_bh = NULL;
        struct buffer_head *parent_fe_bh = NULL;
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_dinode *dirfe;
@@ -1572,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir,
                .enable = 1,
        };
        int did_quota = 0, did_quota_inode = 0;
+        struct ocfs2_dir_lookup_result lookup = { NULL, };
        mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
                   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1592,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir,
        }
        dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-        if (!dirfe->i_links_count) {
+        if (!ocfs2_read_links_count(dirfe)) {
                /* can't make a file in a deleted directory. */
                status = -ENOENT;
                goto bail;
@@ -1605,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir,
        status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
                                              dentry->d_name.name,
-                                              dentry->d_name.len, &de_bh);
+                                              dentry->d_name.len, &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1744,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir,
        status = ocfs2_add_entry(handle, dentry, inode,
                                 le64_to_cpu(fe->i_blkno), parent_fe_bh,
-                                 de_bh);
+                                 &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1772,9 +1791,9 @@ bail:
        brelse(new_fe_bh);
        brelse(parent_fe_bh);
-        brelse(de_bh);
        kfree(si.name);
        kfree(si.value);
+        ocfs2_free_dir_lookup_result(&lookup);
        if (inode_ac)
                ocfs2_free_alloc_context(inode_ac);
        if (data_ac)
@@ -1826,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
                                    struct inode **ret_orphan_dir,
                                    struct inode *inode,
                                    char *name,
-                                    struct buffer_head **de_bh)
+                                    struct ocfs2_dir_lookup_result *lookup)
 {
        struct inode *orphan_dir_inode;
        struct buffer_head *orphan_dir_bh = NULL;
@@ -1857,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
        status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
                                              orphan_dir_bh, name,
-                                              OCFS2_ORPHAN_NAMELEN, de_bh);
+                                              OCFS2_ORPHAN_NAMELEN, lookup);
        if (status < 0) {
                ocfs2_inode_unlock(orphan_dir_inode, 1);
@@ -1884,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
                            struct inode *inode,
                            struct ocfs2_dinode *fe,
                            char *name,
-                            struct buffer_head *de_bh,
+                            struct ocfs2_dir_lookup_result *lookup,
                            struct inode *orphan_dir_inode)
 {
        struct buffer_head *orphan_dir_bh = NULL;
@@ -1910,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
         * underneath us... */
        orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
        if (S_ISDIR(inode->i_mode))
-                le16_add_cpu(&orphan_fe->i_links_count, 1);
+                ocfs2_add_links_count(orphan_fe, 1);
-        orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+        orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
        status = ocfs2_journal_dirty(handle, orphan_dir_bh);
        if (status < 0) {
@@ -1922,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
        status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
                                   OCFS2_ORPHAN_NAMELEN, inode,
                                   OCFS2_I(inode)->ip_blkno,
-                                   orphan_dir_bh, de_bh);
+                                   orphan_dir_bh, lookup);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1955,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
        char name[OCFS2_ORPHAN_NAMELEN + 1];
        struct ocfs2_dinode *orphan_fe;
        int status = 0;
-        struct buffer_head *target_de_bh = NULL;
+        struct ocfs2_dir_lookup_result lookup = { NULL, };
-        struct ocfs2_dir_entry *target_de = NULL;
        mlog_entry_void();
@@ -1971,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
             OCFS2_ORPHAN_NAMELEN);
        /* find it's spot in the orphan directory */
-        target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
+        status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
-                                        orphan_dir_inode, &target_de);
+                                  &lookup);
-        if (!target_de_bh) {
+        if (status) {
-                status = -ENOENT;
                mlog_errno(status);
                goto leave;
        }
        /* remove it from the orphan directory */
-        status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
+        status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
-                                    target_de_bh);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1997,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
        /* do the i_nlink dance! :) */
        orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
        if (S_ISDIR(inode->i_mode))
-                le16_add_cpu(&orphan_fe->i_links_count, -1);
+                ocfs2_add_links_count(orphan_fe, -1);
-        orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+        orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
        status = ocfs2_journal_dirty(handle, orphan_dir_bh);
        if (status < 0) {
@@ -2007,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
        }
 leave:
-        brelse(target_de_bh);
+        ocfs2_free_dir_lookup_result(&lookup);
        mlog_exit(status);
        return status;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 946d3c34b90b..1386281950db 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
 struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
+struct ocfs2_replay_map;
 struct ocfs2_quota_recovery;
 struct ocfs2_dentry_lock;
 struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
        atomic_t vol_state;
        struct mutex recovery_lock;
        struct ocfs2_recovery_map *recovery_map;
+        struct ocfs2_replay_map *replay_map;
        struct task_struct *recovery_thread_task;
        int disable_recovery;
        wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
        u64 la_last_gd;
-#ifdef CONFIG_OCFS2_FS_STATS
-        struct dentry *local_alloc_debug;
-        char *local_alloc_debug_buf;
-#endif
        /* Next three fields are for local node slot recovery during
         * mount. */
        int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
        struct ocfs2_cluster_connection *cconn;
        struct ocfs2_lock_res osb_super_lockres;
        struct ocfs2_lock_res osb_rename_lockres;
+        struct ocfs2_lock_res osb_nfs_sync_lockres;
        struct ocfs2_dlm_debug *osb_dlm_debug;
        struct dentry *osb_debug_root;
+        struct dentry *osb_ctxt;
        wait_queue_head_t recovery_event;
@@ -344,6 +343,12 @@ struct ocfs2_super
        /* used to protect metaecc calculation check of xattr. */
        spinlock_t osb_xattr_lock;
+        unsigned int                    osb_dx_mask;
+        u32                             osb_dx_seed[4];
+        /* the group we used to allocate inodes. */
+        u64                             osb_inode_alloc_group;
 };
 #define OCFS2_SB(sb)        ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
        return 0;
 }
+static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
+{       /* Returns 1 if osb has the INDEXED_DIRS incompat bit set, else 0. */
+        if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
+                return 1;
+        return 0;
+}
+static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
+{       /* Link-count ceiling for this volume; callers compare nlink to it. */
+        if (ocfs2_supports_indexed_dirs(osb))
+                return OCFS2_DX_LINK_MAX;       /* indexed-dirs volumes */
+        return OCFS2_LINK_MAX;                  /* all other volumes */
+}
+static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
+{       /* Returns di's link count converted to CPU byte order. */
+        u32 nlink = le16_to_cpu(di->i_links_count);     /* low 16 bits */
+        u32 hi = le16_to_cpu(di->i_links_count_hi);     /* high 16 bits */
+        if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
+                nlink |= (hi << OCFS2_LINKS_HI_SHIFT);  /* hi used only if flag set */
+        return nlink;
+}
+static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
+{       /* Stores nlink into di as two little-endian 16-bit halves. */
+        u16 lo, hi;
+        lo = nlink;                             /* truncates to the low 16 bits */
+        hi = nlink >> OCFS2_LINKS_HI_SHIFT;     /* remaining upper bits */
+        di->i_links_count = cpu_to_le16(lo);
+        di->i_links_count_hi = cpu_to_le16(hi); /* written without a flag check */
+}
+static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
+{       /* Read-modify-write of di's link count: adds n (may be negative). */
+        u32 links = ocfs2_read_links_count(di);
+        links += n;                             /* n is converted to unsigned here */
+        ocfs2_set_links_count(di, links);
+}
 /* set / clear functions because cluster events can make these happen
 * in parallel so we want the transitions to be atomic. this also
 * means that any future flags osb_flags must be protected by spinlock
@@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_DIR_TRAILER(ptr)                                 \
        (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
+#define OCFS2_IS_VALID_DX_ROOT(ptr)                                     \
+        (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE)) /* 1 iff dr_signature matches */
+#define OCFS2_IS_VALID_DX_LEAF(ptr)                                     \
+        (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) /* 1 iff dl_signature matches */
 static inline unsigned long ino_from_blkno(struct super_block *sb,
                                           u64 blkno)
 {
@@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
        return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
 }
+static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
+                                               u64 blocks)
+{       /* NOTE(review): presumably the first block of blocks' cluster; confirm. */
+        int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
+        unsigned int clusters;
+        clusters = ocfs2_blocks_to_clusters(sb, blocks); /* helper defined elsewhere */
+        return (u64)clusters << bits;   /* widened to u64 before shifting */
+}
 static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
                                                u64 bytes)
 {
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 2332ef740f4f..7ab6e9e5e77c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -66,6 +66,8 @@
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 #define OCFS2_XATTR_BLOCK_SIGNATURE     "XATTR01"
 #define OCFS2_DIR_TRAILER_SIGNATURE     "DIRTRL1"
+#define OCFS2_DX_ROOT_SIGNATURE         "DXDIR01"
+#define OCFS2_DX_LEAF_SIGNATURE         "DXLEAF1"
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)                       \
@@ -95,7 +97,8 @@
                                         | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
                                         | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
                                         | OCFS2_FEATURE_INCOMPAT_XATTR \
-                                         | OCFS2_FEATURE_INCOMPAT_META_ECC)
+                                         | OCFS2_FEATURE_INCOMPAT_META_ECC \
+                                         | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP    (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
                                         | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
                                         | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -151,6 +154,9 @@
 /* Support for extended attributes */
 #define OCFS2_FEATURE_INCOMPAT_XATTR            0x0200
+/* Support for indexed directories */
+#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS     0x0400
 /* Metadata checksum and error correction */
 #define OCFS2_FEATURE_INCOMPAT_META_ECC         0x0800
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 #define OCFS2_DIR_REC_LEN(name_len)     (((name_len) + OCFS2_DIR_MEMBER_LEN + \
                                          OCFS2_DIR_ROUND) & \
                                         ~OCFS2_DIR_ROUND)
+#define OCFS2_DIR_MIN_REC_LEN   OCFS2_DIR_REC_LEN(1)    /* record length for a 1-byte name */
 #define OCFS2_LINK_MAX          32000
+#define OCFS2_DX_LINK_MAX       ((1U << 31) - 1U)       /* limit used when indexed dirs are supported */
+#define OCFS2_LINKS_HI_SHIFT    16                      /* bit offset of i_links_count_hi in the link count */
+#define OCFS2_DX_ENTRIES_MAX    (0xffffffffU)
 #define S_SHIFT                 12
 static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
 /*B8*/  __le16 s_xattr_inline_size;     /* extended attribute inline size
                                           for this fs*/
        __le16 s_reserved0;
-        __le32 s_reserved1;
+        __le32 s_dx_seed[3];            /* seed[0-2] for dx dir hash.
-/*C0*/  __le64 s_reserved2[16];         /* Fill out superblock */
+                                         * s_uuid_hash serves as seed[3]. */
+/*C0*/  __le64 s_reserved2[15];         /* Fill out superblock */
 /*140*/
        /*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
                                           belongs to */
        __le16 i_suballoc_bit;          /* Bit offset in suballocator
                                           block group */
-/*10*/  __le16 i_reserved0;
+/*10*/  __le16 i_links_count_hi;        /* High 16 bits of links count */
        __le16 i_xattr_inline_size;
        __le32 i_clusters;              /* Cluster count */
        __le32 i_uid;                   /* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
        __le16 i_dyn_features;
        __le64 i_xattr_loc;
 /*80*/  struct ocfs2_block_check i_check;       /* Error checking */
-/*88*/  __le64 i_reserved2[6];
+/*88*/  __le64 i_dx_root;               /* Pointer to dir index root block */
+        __le64 i_reserved2[5];
 /*B8*/  union {
                __le64 i_pad1;          /* Generic way to refer to this
                                           64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
 /*40*/
 };
+/*
+ * A directory entry in the indexed tree. We don't store the full name here,
+ * but instead provide a pointer to the full dirent in the unindexed tree.
+ *
+ * NOTE(review): earlier wording said that name_len is also stored here to
+ * cut down on leaf block searches on collisions, but this structure has no
+ * name_len member - only the two hash words and the dirent block pointer.
+ * Confirm whether the field was dropped on purpose.
+ */
+struct ocfs2_dx_entry {
+        __le32          dx_major_hash;  /* Used to find logical
+                                         * cluster in index */
+        __le32          dx_minor_hash;  /* Lower bits used to find
+                                         * block in cluster */
+        __le64          dx_dirent_blk;  /* Physical block in unindexed
+                                         * tree holding this dirent. */
+};
+/*
+ * Header for a packed array of ocfs2_dx_entry records. It is embedded both
+ * in the dx root block (dr_entries) and in each dx leaf (dl_list). Because
+ * it ends in the variable-length de_entries array, it has to be the last
+ * member of whatever structure contains it.
+ */
+struct ocfs2_dx_entry_list {
+        __le32          de_reserved;
+        __le16          de_count;       /* Maximum number of entries
+                                         * possible in de_entries */
+        __le16          de_num_used;    /* Current number of
+                                         * de_entries entries */
+        struct  ocfs2_dx_entry          de_entries[0];  /* Indexed dir entries
+                                                         * in a packed array of
+                                                         * length de_num_used */
+};
+#define OCFS2_DX_FLAG_INLINE    0x01
+/*
+ * A directory indexing block. Each indexed directory has one of these,
+ * pointed to by ocfs2_dinode.
+ *
+ * This block stores an indexed btree root, and a set of free space
+ * start-of-list pointers.
+ *
+ * The tail of the block is a union: dr_list (extent list for the index
+ * btree) and dr_entries (entries kept directly in the root block) occupy
+ * the same bytes, so only one of them is meaningful at a time.
+ * NOTE(review): presumably OCFS2_DX_FLAG_INLINE in dr_flags selects
+ * dr_entries - confirm against the users of dr_flags.
+ */
+struct ocfs2_dx_root_block {
+        __u8            dr_signature[8];        /* Signature for verification */
+        struct ocfs2_block_check dr_check;      /* Error checking */
+        __le16          dr_suballoc_slot;       /* Slot suballocator this
+                                                 * block belongs to. */
+        __le16          dr_suballoc_bit;        /* Bit offset in suballocator
+                                                 * block group */
+        __le32          dr_fs_generation;       /* Must match super block */
+        __le64          dr_blkno;               /* Offset on disk, in blocks */
+        __le64          dr_last_eb_blk;         /* Pointer to last
+                                                 * extent block */
+        __le32          dr_clusters;            /* Clusters allocated
+                                                 * to the indexed tree. */
+        __u8            dr_flags;               /* OCFS2_DX_FLAG_* flags */
+        __u8            dr_reserved0;
+        __le16          dr_reserved1;
+        __le64          dr_dir_blkno;           /* Pointer to parent inode */
+        __le32          dr_num_entries;         /* Total number of
+                                                 * names stored in
+                                                 * this directory.*/
+        __le32          dr_reserved2;
+        __le64          dr_free_blk;            /* Pointer to head of free
+                                                 * unindexed block list. */
+        __le64          dr_reserved3[15];
+        union {
+                struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
+                                                   * bits for maximum space
+                                                   * efficiency. */
+                struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
+                                                        * entries. We grow out
+                                                        * to extents if this
+                                                        * gets too big. */
+        };
+};
+/*
+ * The header of a leaf block in the indexed tree.
+ *
+ * dl_list comes last because it ends in the variable-length de_entries
+ * array; the entries fill the remainder of the block after this header.
+ */
+struct ocfs2_dx_leaf {
+        __u8            dl_signature[8];/* Signature for verification */
+        struct ocfs2_block_check dl_check;      /* Error checking */
+        __le64          dl_blkno;       /* Offset on disk, in blocks */
+        __le32          dl_fs_generation;/* Must match super block */
+        __le32          dl_reserved0;
+        __le64          dl_reserved1;
+        struct ocfs2_dx_entry_list      dl_list;
+};
 /*
 * On disk allocator group structure for OCFS2
 */
@@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
        return size / sizeof(struct ocfs2_extent_rec);
 }
+/*
+ * Number of extent records that fit in a dx root block: the bytes left in
+ * the block after everything preceding dr_list.l_recs, divided by the size
+ * of one extent record.
+ */
+static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
+{
+        int avail = sb->s_blocksize -
+                offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
+
+        return avail / sizeof(struct ocfs2_extent_rec);
+}
 static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
 {
        int size;
@@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
        return size / sizeof(struct ocfs2_extent_rec);
 }
+/*
+ * Number of ocfs2_dx_entry records that fit in one dx leaf block: the bytes
+ * left after the leaf header up to dl_list.de_entries, divided by the entry
+ * size.
+ */
+static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
+{
+        int avail = sb->s_blocksize -
+                offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
+
+        return avail / sizeof(struct ocfs2_dx_entry);
+}
+/*
+ * Number of ocfs2_dx_entry records that can be kept directly in the dx root
+ * block: the bytes left after everything preceding dr_entries.de_entries,
+ * divided by the entry size.
+ */
+static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
+{
+        int avail = sb->s_blocksize -
+                offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
+
+        return avail / sizeof(struct ocfs2_dx_entry);
+}
 static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
 {
        u16 size;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index eb6f50c9ceca..a53ce87481bf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_OPEN,
        OCFS2_LOCK_TYPE_FLOCK,
        OCFS2_LOCK_TYPE_QINFO,
+        OCFS2_LOCK_TYPE_NFS_SYNC,
        OCFS2_NUM_LOCK_TYPES
 };
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_QINFO:
                        c = 'Q';
                        break;
+                case OCFS2_LOCK_TYPE_NFS_SYNC:
+                        c = 'Y';
+                        break;
                default:
                        c = '\0';
        }
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a69628603e18..b4ca5911caaf 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -48,7 +48,8 @@
 #include "buffer_head_io.h"
 #define NOT_ALLOC_NEW_GROUP             0
-#define ALLOC_NEW_GROUP                 1
+#define ALLOC_NEW_GROUP                 0x1
+#define ALLOC_GROUPS_FROM_GLOBAL        0x2
 #define OCFS2_MAX_INODES_TO_STEAL       1024
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                                   struct inode *alloc_inode,
                                   struct buffer_head *bh,
-                                   u64 max_block);
+                                   u64 max_block,
+                                   u64 *last_alloc_group,
+                                   int flags);
 static int ocfs2_cluster_group_search(struct inode *inode,
                                      struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
                                                u16 *bg_bit_off);
 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
                                             u32 bits_wanted, u64 max_block,
+                                             int flags,
                                             struct ocfs2_alloc_context **ac);
 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                                   struct inode *alloc_inode,
                                   struct buffer_head *bh,
-                                   u64 max_block)
+                                   u64 max_block,
+                                   u64 *last_alloc_group,
+                                   int flags)
 {
        int status, credits;
        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
        cl = &fe->id2.i_chain;
        status = ocfs2_reserve_clusters_with_limit(osb,
                                                   le16_to_cpu(cl->cl_cpg),
-                                                   max_block, &ac);
+                                                   max_block, flags, &ac);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                goto bail;
        }
+        if (last_alloc_group && *last_alloc_group != 0) {
+                mlog(0, "use old allocation group %llu for block group alloc\n",
+                     (unsigned long long)*last_alloc_group);
+                ac->ac_last_group = *last_alloc_group;
+        }
        status = ocfs2_claim_clusters(osb,
                                      handle,
                                      ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
        alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
        status = 0;
+        /* save the new last alloc group so that the caller can cache it. */
+        if (last_alloc_group)
+                *last_alloc_group = ac->ac_last_group;
 bail:
        if (handle)
                ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                                       struct ocfs2_alloc_context *ac,
                                       int type,
                                       u32 slot,
-                                       int alloc_new_group)
+                                       u64 *last_alloc_group,
+                                       int flags)
 {
        int status;
        u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                        goto bail;
                }
-                if (alloc_new_group != ALLOC_NEW_GROUP) {
+                if (!(flags & ALLOC_NEW_GROUP)) {
                        mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
                             "and we don't alloc a new group for it.\n",
                             slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                }
                status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
-                                                 ac->ac_max_block);
+                                                 ac->ac_max_block,
+                                                 last_alloc_group, flags);
                if (status < 0) {
                        if (status != -ENOSPC)
                                mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
        status = ocfs2_reserve_suballoc_bits(osb, (*ac),
                                             EXTENT_ALLOC_SYSTEM_INODE,
-                                             slot, ALLOC_NEW_GROUP);
+                                             slot, NULL, ALLOC_NEW_GROUP);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
                status = ocfs2_reserve_suballoc_bits(osb, ac,
                                                     INODE_ALLOC_SYSTEM_INODE,
-                                                     slot, NOT_ALLOC_NEW_GROUP);
+                                                     slot, NULL,
+                                                     NOT_ALLOC_NEW_GROUP);
                if (status >= 0) {
                        ocfs2_set_inode_steal_slot(osb, slot);
                        break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 {
        int status;
        s16 slot = ocfs2_get_inode_steal_slot(osb);
+        u64 alloc_group;
        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
        if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
                goto inode_steal;
        atomic_set(&osb->s_num_inodes_stolen, 0);
+        alloc_group = osb->osb_inode_alloc_group;
        status = ocfs2_reserve_suballoc_bits(osb, *ac,
                                             INODE_ALLOC_SYSTEM_INODE,
-                                             osb->slot_num, ALLOC_NEW_GROUP);
+                                             osb->slot_num,
+                                             &alloc_group,
+                                             ALLOC_NEW_GROUP |
+                                             ALLOC_GROUPS_FROM_GLOBAL);
        if (status >= 0) {
                status = 0;
+                spin_lock(&osb->osb_lock);
+                osb->osb_inode_alloc_group = alloc_group;
+                spin_unlock(&osb->osb_lock);
+                mlog(0, "after reservation, new allocation group is "
+                     "%llu\n", (unsigned long long)alloc_group);
                /*
                 * Some inodes must be freed by us, so try to allocate
                 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
        status = ocfs2_reserve_suballoc_bits(osb, ac,
                                             GLOBAL_BITMAP_SYSTEM_INODE,
-                                             OCFS2_INVALID_SLOT,
+                                             OCFS2_INVALID_SLOT, NULL,
                                             ALLOC_NEW_GROUP);
        if (status < 0 && status != -ENOSPC) {
                mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
 * things a bit. */
 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
                                             u32 bits_wanted, u64 max_block,
+                                             int flags,
                                             struct ocfs2_alloc_context **ac)
 {
        int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
        (*ac)->ac_max_block = max_block;
        status = -ENOSPC;
-        if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
+        if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
+            ocfs2_alloc_should_use_local(osb, bits_wanted)) {
                status = ocfs2_reserve_local_alloc_bits(osb,
                                                        bits_wanted,
                                                        *ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
                           u32 bits_wanted,
                           struct ocfs2_alloc_context **ac)
 {
-        return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
+        return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
+                                                 ALLOC_NEW_GROUP, ac);
 }
 /*
@@ -1618,8 +1651,41 @@ bail:
        return status;
 }
+/*
+ * Seed ac->ac_last_group so that a new inode is allocated from a specific
+ * group, chosen in this order:
+ *
+ *  1. the group the parent dir last allocated from, if one is recorded and
+ *     it was recorded for the slot this context allocates from;
+ *  2. otherwise, the group holding the parent dir's own inode, if the parent
+ *     belongs to the slot this context allocates from.
+ *
+ * In every other case ac_last_group is left untouched. The slot comparisons
+ * are deliberate: ac_last_group must never be set to a group descriptor
+ * from a different (unlocked) slot.
+ */
+static void ocfs2_init_inode_ac_group(struct inode *dir,
+                                      struct buffer_head *parent_fe_bh,
+                                      struct ocfs2_alloc_context *ac)
+{
+        struct ocfs2_dinode *parent_di =
+                (struct ocfs2_dinode *)parent_fe_bh->b_data;
+
+        if (OCFS2_I(dir)->ip_last_used_group &&
+            OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) {
+                ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
+                return;
+        }
+
+        if (le16_to_cpu(parent_di->i_suballoc_slot) != ac->ac_alloc_slot)
+                return;
+
+        ac->ac_last_group = ocfs2_which_suballoc_group(
+                                le64_to_cpu(parent_di->i_blkno),
+                                le16_to_cpu(parent_di->i_suballoc_bit));
+}
+/*
+ * Record, on the parent directory's in-memory inode, the slot and the group
+ * this allocation context ended up using.
+ */
+static inline void ocfs2_save_inode_ac_group(struct inode *dir,
+                                             struct ocfs2_alloc_context *ac)
+{
+        OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
+        OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
+}
 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
                          handle_t *handle,
+                          struct inode *dir,
+                          struct buffer_head *parent_fe_bh,
                          struct ocfs2_alloc_context *ac,
                          u16 *suballoc_bit,
                          u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
        BUG_ON(ac->ac_bits_wanted != 1);
        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
        status = ocfs2_claim_suballoc_bits(osb,
                                           ac,
                                           handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
        *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
        ac->ac_bits_given++;
+        ocfs2_save_inode_ac_group(dir, ac);
        status = 0;
 bail:
        mlog_exit(status);
@@ -2116,3 +2185,162 @@ out:
        return ret;
 }
+/*
+ * Read the inode specified by blkno to get suballoc_slot and
+ * suballoc_bit.
+ *
+ * The block is read from disk with ocfs2_read_blocks_sync() and must pass
+ * OCFS2_IS_VALID_DINODE(). A suballoc slot other than OCFS2_INVALID_SLOT
+ * must also be below osb->max_slots.
+ *
+ * @suballoc_slot and @suballoc_bit are optional; a NULL pointer skips that
+ * output.
+ *
+ * Returns 0 on success, the read error on I/O failure, or -EINVAL if the
+ * block is not a valid dinode or carries an out-of-range slot.
+ */
+static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
+                                       u16 *suballoc_slot, u16 *suballoc_bit)
+{
+        int status;
+        struct buffer_head *inode_bh = NULL;
+        struct ocfs2_dinode *inode_fe;
+
+        /* u64 is not "unsigned long long" on every arch; cast for %llu. */
+        mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
+
+        /* dirty read disk */
+        status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
+        if (status < 0) {
+                mlog(ML_ERROR, "read block %llu failed %d\n",
+                     (unsigned long long)blkno, status);
+                goto bail;
+        }
+
+        inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
+        if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
+                mlog(ML_ERROR, "invalid inode %llu requested\n",
+                     (unsigned long long)blkno);
+                status = -EINVAL;
+                goto bail;
+        }
+
+        if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
+            (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
+                mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
+                     (unsigned long long)blkno,
+                     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
+                status = -EINVAL;
+                goto bail;
+        }
+
+        if (suballoc_slot)
+                *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
+        if (suballoc_bit)
+                *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+
+bail:
+        /* inode_bh is still NULL if the read failed; brelse() copes. */
+        brelse(inode_bh);
+
+        mlog_exit(status);
+        return status;
+}
+/*
+ * test whether bit is SET in allocator bitmap or not.  on success, 0
+ * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
+ * is returned and *res is meaningless.  Call this after you have
+ * cluster locked against suballoc, or you may get a result based on
+ * non-up2date contents
+ *
+ * @alloc_bh holds the suballocator's dinode; its chain list bounds the
+ * valid range of @bit. The group descriptor covering (@blkno, @bit) is
+ * read through ocfs2_read_group_descriptor().
+ */
+static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
+                                   struct inode *suballoc,
+                                   struct buffer_head *alloc_bh, u64 blkno,
+                                   u16 bit, int *res)
+{
+        struct ocfs2_dinode *alloc_fe;
+        struct ocfs2_group_desc *group;
+        struct buffer_head *group_bh = NULL;
+        u64 bg_blkno;
+        int status;
+
+        /* u64 is not "unsigned long long" on every arch; cast for %llu. */
+        mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
+                   (unsigned int)bit);
+
+        alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
+        if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
+                mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
+                     (unsigned int)bit,
+                     ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
+                status = -EINVAL;
+                goto bail;
+        }
+
+        bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+        status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
+                                             &group_bh);
+        if (status < 0) {
+                mlog(ML_ERROR, "read group %llu failed %d\n",
+                     (unsigned long long)bg_blkno, status);
+                goto bail;
+        }
+
+        group = (struct ocfs2_group_desc *) group_bh->b_data;
+        *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
+
+bail:
+        /* group_bh is still NULL on the early-exit paths; brelse() copes. */
+        brelse(group_bh);
+
+        mlog_exit(status);
+        return status;
+}
+/*
+ * Test if the bit representing this inode (blkno) is set in the
+ * suballocator.
+ *
+ * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
+ *
+ * In the event of failure, a negative value is returned and *res is
+ * meaningless.
+ *
+ * Callers must make sure to hold nfs_sync_lock to prevent
+ * ocfs2_delete_inode() on another node from accessing the same
+ * suballocator concurrently.
+ *
+ * Steps: read the inode block to learn its suballoc slot and bit, look up
+ * that slot's inode allocator, take its i_mutex and cluster lock, and test
+ * the bit in the matching group descriptor.
+ */
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
+{
+        int status;
+        u16 suballoc_bit = 0, suballoc_slot = 0;
+        struct inode *inode_alloc_inode;
+        struct buffer_head *alloc_bh = NULL;
+
+        /* u64 is not "unsigned long long" on every arch; cast for %llu. */
+        mlog_entry("blkno: %llu", (unsigned long long)blkno);
+
+        status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
+                                             &suballoc_bit);
+        if (status < 0) {
+                mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
+                goto bail;
+        }
+
+        inode_alloc_inode =
+                ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+                                            suballoc_slot);
+        if (!inode_alloc_inode) {
+                /* the error code could be inaccurate, but we are not able to
+                 * get the correct one. */
+                status = -EINVAL;
+                mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
+                     (u32)suballoc_slot);
+                goto bail;
+        }
+
+        mutex_lock(&inode_alloc_inode->i_mutex);
+        status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
+        if (status < 0) {
+                mutex_unlock(&inode_alloc_inode->i_mutex);
+                /*
+                 * Drop the allocator inode reference here as well; the
+                 * success path below releases it with iput(), and skipping
+                 * it on this path would leak the reference.
+                 */
+                iput(inode_alloc_inode);
+                mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
+                     (u32)suballoc_slot, status);
+                goto bail;
+        }
+
+        status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
+                                         blkno, suballoc_bit, res);
+        if (status < 0)
+                mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
+
+        ocfs2_inode_unlock(inode_alloc_inode, 0);
+        mutex_unlock(&inode_alloc_inode->i_mutex);
+
+        iput(inode_alloc_inode);
+        brelse(alloc_bh);
+bail:
+        mlog_exit(status);
+        return status;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e3c13c77f9e8..8c9a78a43164 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
                         u64 *blkno_start);
 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
                          handle_t *handle,
+                          struct inode *dir,
+                          struct buffer_head *parent_fe_bh,
                          struct ocfs2_alloc_context *ac,
                          u16 *suballoc_bit,
                          u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
                          u32 clusters_to_add, u32 extents_to_split,
                          struct ocfs2_alloc_context **data_ac,
                          struct ocfs2_alloc_context **meta_ac);
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7ac83a81ee55..79ff8d9d37e0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
        {Opt_err, NULL}
 };
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Format a human-readable snapshot of the superblock state into @buf.
+ *
+ * @osb: the mounted volume to describe
+ * @buf: destination buffer
+ * @len: size of @buf in bytes
+ *
+ * Returns the number of characters actually stored in @buf (excluding the
+ * trailing NUL). scnprintf() is used instead of snprintf() so that the
+ * running total can never exceed @len: snprintf() returns the length the
+ * output *would* have had, which on truncation would push "out" past the
+ * end of the buffer, make "len - out" negative, and report a size larger
+ * than the buffer to the reader.
+ *
+ * Thread pointers are NULL-checked before task_pid_nr() because this dump
+ * can be requested while the corresponding thread is not running; -1 is
+ * printed in that case, as is already done for the recovery thread.
+ */
+static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
+{
+        int out = 0;
+        int i;
+        struct ocfs2_cluster_connection *cconn = osb->cconn;
+        struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => Id: %-s  Uuid: %-s  Gen: 0x%X  Label: %-s\n",
+                         "Device", osb->dev_str, osb->uuid_str,
+                         osb->fs_generation, osb->vol_label);
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => State: %d  Flags: 0x%lX\n", "Volume",
+                         atomic_read(&osb->vol_state), osb->osb_flags);
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => Block: %lu  Cluster: %d\n", "Sizes",
+                         osb->sb->s_blocksize, osb->s_clustersize);
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => Compat: 0x%X  Incompat: 0x%X  "
+                         "ROcompat: 0x%X\n",
+                         "Features", osb->s_feature_compat,
+                         osb->s_feature_incompat, osb->s_feature_ro_compat);
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => Opts: 0x%lX  AtimeQuanta: %u\n", "Mount",
+                         osb->s_mount_opt, osb->s_atime_quantum);
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => Stack: %s  Name: %*s  Version: %d.%d\n",
+                         "Cluster",
+                         (*osb->osb_cluster_stack == '\0' ?
+                          "o2cb" : osb->osb_cluster_stack),
+                         cconn->cc_namelen, cconn->cc_name,
+                         cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
+
+        spin_lock(&osb->dc_task_lock);
+        out += scnprintf(buf + out, len - out,
+                         "%10s => Pid: %d  Count: %lu  WakeSeq: %lu  "
+                         "WorkSeq: %lu\n", "DownCnvt",
+                         (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
+                         osb->blocked_lock_count,
+                         osb->dc_wake_sequence, osb->dc_work_sequence);
+        spin_unlock(&osb->dc_task_lock);
+
+        spin_lock(&osb->osb_lock);
+        out += scnprintf(buf + out, len - out, "%10s => Pid: %d  Nodes:",
+                         "Recovery",
+                         (osb->recovery_thread_task ?
+                          task_pid_nr(osb->recovery_thread_task) : -1));
+        if (rm->rm_used == 0)
+                out += scnprintf(buf + out, len - out, " None\n");
+        else {
+                for (i = 0; i < rm->rm_used; i++)
+                        out += scnprintf(buf + out, len - out, " %d",
+                                         rm->rm_entries[i]);
+                out += scnprintf(buf + out, len - out, "\n");
+        }
+        spin_unlock(&osb->osb_lock);
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
+                         (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
+                         osb->osb_commit_interval,
+                         atomic_read(&osb->needs_checkpoint));
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => State: %d  NumTxns: %d  TxnId: %lu\n",
+                         "Journal", osb->journal->j_state,
+                         atomic_read(&osb->journal->j_num_trans),
+                         osb->journal->j_trans_id);
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => GlobalAllocs: %d  LocalAllocs: %d  "
+                         "SubAllocs: %d  LAWinMoves: %d  SAExtends: %d\n",
+                         "Stats",
+                         atomic_read(&osb->alloc_stats.bitmap_data),
+                         atomic_read(&osb->alloc_stats.local_data),
+                         atomic_read(&osb->alloc_stats.bg_allocs),
+                         atomic_read(&osb->alloc_stats.moves),
+                         atomic_read(&osb->alloc_stats.bg_extends));
+
+        out += scnprintf(buf + out, len - out,
+                         "%10s => State: %u  Descriptor: %llu  Size: %u bits  "
+                         "Default: %u bits\n",
+                         "LocalAlloc", osb->local_alloc_state,
+                         (unsigned long long)osb->la_last_gd,
+                         osb->local_alloc_bits, osb->local_alloc_default_bits);
+
+        spin_lock(&osb->osb_lock);
+        out += scnprintf(buf + out, len - out,
+                         "%10s => Slot: %d  NumStolen: %d\n", "Steal",
+                         osb->s_inode_steal_slot,
+                         atomic_read(&osb->s_num_inodes_stolen));
+        spin_unlock(&osb->osb_lock);
+
+        out += scnprintf(buf + out, len - out, "%10s => %3s  %10s\n",
+                         "Slots", "Num", "RecoGen");
+
+        /* One row per slot; the slot this node occupies is marked with '*'. */
+        for (i = 0; i < osb->max_slots; ++i) {
+                out += scnprintf(buf + out, len - out,
+                                 "%10s  %c %3d  %10d\n",
+                                 " ",
+                                 (i == osb->slot_num ? '*' : ' '),
+                                 i, osb->slot_recovery_generations[i]);
+        }
+
+        return out;
+}
+/*
+ * open() for the debug file: allocate one page, fill it with the current
+ * ocfs2_osb_dump() text, publish the text length as the inode size and hang
+ * the page off file->private_data for the read and release handlers.
+ *
+ * Returns 0 on success or -ENOMEM if the page cannot be allocated.
+ */
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+        struct ocfs2_super *osb = inode->i_private;
+        char *page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+
+        if (page == NULL)
+                return -ENOMEM;
+
+        i_size_write(inode, ocfs2_osb_dump(osb, page, PAGE_SIZE));
+        file->private_data = page;
+
+        return 0;
+}
+/* release() for the debug file: free the buffer stored in private_data. */
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+        void *dump = file->private_data;
+
+        kfree(dump);
+        return 0;
+}
+/*
+ * read() for the debug file: copy out of the buffer in private_data, using
+ * the backing inode's size as the amount of valid text.
+ */
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+                                size_t nbytes, loff_t *ppos)
+{
+        struct inode *host = file->f_mapping->host;
+
+        return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+                                       i_size_read(host));
+}
+#else
+/*
+ * !CONFIG_DEBUG_FS stubs: same signatures as the real handlers so the
+ * file_operations table still builds, but each one does nothing and
+ * returns 0.
+ */
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+        return 0;
+}
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+        return 0;
+}
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+                                size_t nbytes, loff_t *ppos)
+{
+        return 0;
+}
+#endif  /* CONFIG_DEBUG_FS */
+/*
+ * File operations for the per-volume debug state file. The table is never
+ * modified after initialisation, so it is declared const.
+ */
+static const struct file_operations ocfs2_osb_debug_fops = {
+        .open =         ocfs2_osb_debug_open,
+        .release =      ocfs2_debug_release,
+        .read =         ocfs2_debug_read,
+        .llseek =       generic_file_llseek,
+};
 /*
 * write_super and sync_fs ripped right out of ext3.
 */
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
                goto read_super_error;
        }
+        osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
+                                            osb->osb_debug_root,
+                                            osb,
+                                            &ocfs2_osb_debug_fops);
+        if (!osb->osb_ctxt) {
+                status = -EINVAL;
+                mlog_errno(status);
+                goto read_super_error;
+        }
        status = ocfs2_mount_volume(sb);
        if (osb->root_inode)
                inode = igrab(osb->root_inode);
@@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        osb = OCFS2_SB(sb);
        BUG_ON(!osb);
+        debugfs_remove(osb->osb_ctxt);
        ocfs2_disable_quotas(osb);
        ocfs2_shutdown_local_alloc(osb);
@@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
        bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
        sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
+        osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
+        for (i = 0; i < 3; i++)
+                osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
+        osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
        osb->sb = sb;
        /* Save off for ocfs2_rw_direct */
        osb->s_sectsize_bits = blksize_bits(sector_size);
@@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
         * lock, and it's marked as dirty, set the bit in the recover
         * map and launch a recovery thread for it. */
        status = ocfs2_mark_dead_nodes(osb);
+        if (status < 0) {
+                mlog_errno(status);
+                goto finally;
+        }
+        status = ocfs2_compute_replay_slots(osb);
        if (status < 0)
                mlog_errno(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2563df89fc2a..15631019dc63 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
                          struct ocfs2_security_xattr_info *si,
                          int *want_clusters,
                          int *xattr_credits,
-                          struct ocfs2_alloc_context **xattr_ac)
+                          int *want_meta)
 {
        int ret = 0;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
        if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
            (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
            (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
-                ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+                *want_meta = *want_meta + 1;
-                if (ret) {
-                        mlog_errno(ret);
-                        return ret;
-                }
                *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
        }
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 5a1ebc789f7e..1ca7e9a1b7bc 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
                             int *, int *, struct ocfs2_alloc_context **);
 int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
                          int, struct ocfs2_security_xattr_info *,
-                          int *, int *, struct ocfs2_alloc_context **);
+                          int *, int *, int *);
 /*
 * xattrs can live inside an inode, as part of an external xattr block,
author	Len Brown <len.brown@intel.com>	2009-04-05 02:14:15 -0400
committer	Len Brown <len.brown@intel.com>	2009-04-05 02:14:15 -0400
commit	478c6a43fcbc6c11609f8cee7c7b57223907754f (patch)
tree	a7f7952099da60d33032aed6de9c0c56c9f8779e /fs/ocfs2
parent	8a3f257c704e02aee9869decd069a806b45be3f1 (diff)
parent	6bb597507f9839b13498781e481f5458aea33620 (diff)