Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git

author: David Woodhouse <dwmw2@infradead.org> 2008-02-03 02:29:41 -0500
committer: David Woodhouse <dwmw2@infradead.org> 2008-02-03 02:30:32 -0500
commit: c1f3ee120bb61045b1c0a3ead620d1d65af47130 (patch)
tree: 908430bf2b47fe8e96ac623ae7ab6dd5698d0938 /fs/ocfs2
parent: e619a75ff6201b567a539e787aa9af9bc63a3187 (diff)
parent: 9135f1901ee6449dfe338adf6e40e9c2025b8150 (diff)
51 files changed, 2024 insertions, 1589 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b0..4d4ce48bb42c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -19,16 +19,17 @@ ocfs2-objs := \
        ioctl.o                 \
        journal.o               \
        localalloc.o            \
+        locks.o                 \
        mmap.o                  \
        namei.o                 \
+        resize.o                \
        slot_map.o              \
        suballoc.o              \
        super.o                 \
        symlink.o               \
        sysfile.o               \
        uptodate.o              \
-        ver.o                   \
+        ver.o
-        vote.o
 obj-$(CONFIG_OCFS2_FS) += cluster/
 obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4ba7f0bdc248..e6df06ac6405 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2389,6 +2389,18 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
                        goto out;
                }
+                /*
+                 * Caller might still want to make changes to the
+                 * tree root, so re-add it to the journal here.
+                 */
+                ret = ocfs2_journal_access(handle, inode,
+                                           path_root_bh(left_path),
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
                ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
                                                right_path, subtree_root,
                                                dealloc, &deleted);
@@ -3289,16 +3301,6 @@ static int ocfs2_insert_path(struct inode *inode,
        int ret, subtree_index;
        struct buffer_head *leaf_bh = path_leaf_bh(right_path);
-        /*
-         * Pass both paths to the journal. The majority of inserts
-         * will be touching all components anyway.
-         */
-        ret = ocfs2_journal_access_path(inode, handle, right_path);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out;
-        }
        if (left_path) {
                int credits = handle->h_buffer_credits;
@@ -3323,6 +3325,16 @@ static int ocfs2_insert_path(struct inode *inode,
                }
        }
+        /*
+         * Pass both paths to the journal. The majority of inserts
+         * will be touching all components anyway.
+         */
+        ret = ocfs2_journal_access_path(inode, handle, right_path);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
+        }
        if (insert->ins_split != SPLIT_NONE) {
                /*
                 * We could call ocfs2_insert_at_leaf() for some types
@@ -3331,6 +3343,17 @@ static int ocfs2_insert_path(struct inode *inode,
                 */
                ocfs2_split_record(inode, left_path, right_path,
                                   insert_rec, insert->ins_split);
+                /*
+                 * Split might have modified either leaf and we don't
+                 * have a guarantee that the later edge insert will
+                 * dirty this for us.
+                 */
+                if (left_path)
+                        ret = ocfs2_journal_dirty(handle,
+                                                  path_leaf_bh(left_path));
+                        if (ret)
+                                mlog_errno(ret);
        } else
                ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
                                     insert, inode);
@@ -3430,6 +3453,17 @@ static int ocfs2_do_insert_extent(struct inode *inode,
                        mlog_errno(ret);
                        goto out;
                }
+                /*
+                 * ocfs2_rotate_tree_right() might have extended the
+                 * transaction without re-journaling our tree root.
+                 */
+                ret = ocfs2_journal_access(handle, inode, di_bh,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
        } else if (type->ins_appending == APPEND_TAIL
                   && type->ins_contig != CONTIG_LEFT) {
                ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
@@ -3941,12 +3975,12 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 {
        int ret = 0;
        struct ocfs2_extent_list *el = path_leaf_el(path);
-        struct buffer_head *eb_bh, *last_eb_bh = NULL;
+        struct buffer_head *last_eb_bh = NULL;
        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
        struct ocfs2_merge_ctxt ctxt;
        struct ocfs2_extent_list *rightmost_el;
-        if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
+        if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
                ret = -EIO;
                mlog_errno(ret);
                goto out;
@@ -3960,14 +3994,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
                goto out;
        }
-        eb_bh = path_leaf_bh(path);
-        ret = ocfs2_journal_access(handle, inode, eb_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret) {
-                mlog_errno(ret);
-                goto out;
-        }
        ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
                                                            split_index,
                                                            split_rec);
@@ -4029,8 +4055,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
                        mlog_errno(ret);
        }
-        ocfs2_journal_dirty(handle, eb_bh);
 out:
        brelse(last_eb_bh);
        return ret;
@@ -4707,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
        mutex_lock(&data_alloc_inode->i_mutex);
-        status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
+        status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto out_mutex;
@@ -4729,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 out_unlock:
        brelse(data_alloc_bh);
-        ocfs2_meta_unlock(data_alloc_inode, 1);
+        ocfs2_inode_unlock(data_alloc_inode, 1);
 out_mutex:
        mutex_unlock(&data_alloc_inode->i_mutex);
@@ -5053,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
        mutex_lock(&inode->i_mutex);
-        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret) {
                mlog_errno(ret);
                goto out_mutex;
@@ -5094,7 +5118,7 @@ out_journal:
        ocfs2_commit_trans(osb, handle);
 out_unlock:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        brelse(di_bh);
 out_mutex:
        mutex_unlock(&inode->i_mutex);
@@ -6093,8 +6117,6 @@ start:
        mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
             clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
-        BUG_ON(clusters_to_del == 0);
        mutex_lock(&tl_inode->i_mutex);
        tl_sem = 1;
        /* ocfs2_truncate_log_needs_flush guarantees us at least one
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c69c1b300155..bc7b4cbbe8ec 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -26,6 +26,7 @@
 #include <asm/byteorder.h>
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/mpage.h>
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 {
        int err = 0;
        unsigned int ext_flags;
-        u64 p_blkno, past_eof;
+        u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
+        u64 p_blkno, count, past_eof;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
+        err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
                                          &ext_flags);
        if (err) {
                mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
                goto bail;
        }
+        if (max_blocks < count)
+                count = max_blocks;
        /*
         * ocfs2 never allocates in this function - the only time we
         * need to use BH_New is when we're extending i_size on a file
@@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
        if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
                map_bh(bh_result, inode->i_sb, p_blkno);
+        bh_result->b_size = count << inode->i_blkbits;
        if (!ocfs2_sparse_alloc(osb)) {
                if (p_blkno == 0) {
                        err = -EIO;
@@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
                           struct buffer_head *di_bh)
 {
        void *kaddr;
-        unsigned int size;
+        loff_t size;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
        if (size > PAGE_CACHE_SIZE ||
            size > ocfs2_max_inline_data(inode->i_sb)) {
                ocfs2_error(inode->i_sb,
-                            "Inode %llu has with inline data has bad size: %u",
+                            "Inode %llu has with inline data has bad size: %Lu",
-                            (unsigned long long)OCFS2_I(inode)->ip_blkno, size);
+                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                            (unsigned long long)size);
                return -EROFS;
        }
@@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
        mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
-        ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
+        ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
        if (ret != 0) {
                if (ret == AOP_TRUNCATED_PAGE)
                        unlock = 0;
@@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
        if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
                ret = AOP_TRUNCATED_PAGE;
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        /*
@@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page)
                goto out_alloc;
        }
-        ret = ocfs2_data_lock_with_page(inode, 0, page);
-        if (ret != 0) {
-                if (ret == AOP_TRUNCATED_PAGE)
-                        unlock = 0;
-                mlog_errno(ret);
-                goto out_alloc;
-        }
        if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                ret = ocfs2_readpage_inline(inode, page);
        else
                ret = block_read_full_page(page, ocfs2_get_block);
        unlock = 0;
-        ocfs2_data_unlock(inode, 0);
 out_alloc:
        up_read(&OCFS2_I(inode)->ip_alloc_sem);
-out_meta_unlock:
+out_inode_unlock:
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
 out:
        if (unlock)
                unlock_page(page);
@@ -331,6 +330,62 @@ out:
        return ret;
 }
+/*
+ * This is used only for read-ahead. Failures or difficult-to-handle
+ * situations are safe to ignore.
+ *
+ * Right now, we don't bother with BH_Boundary - in-inode extent lists
+ * are quite large (243 extents on 4k blocks), so most inodes don't
+ * grow out to a tree. If need be, detecting boundary extents could
+ * trivially be added in a future version of ocfs2_get_block().
+ */
+static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
+                           struct list_head *pages, unsigned nr_pages)
+{
+        int ret, err = -EIO;
+        struct inode *inode = mapping->host;
+        struct ocfs2_inode_info *oi = OCFS2_I(inode);
+        loff_t start;
+        struct page *last;
+        /*
+         * Use the nonblocking flag for the dlm code to avoid page
+         * lock inversion, but don't bother with retrying.
+         */
+        ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
+        if (ret)
+                return err;
+        if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+                ocfs2_inode_unlock(inode, 0);
+                return err;
+        }
+        /*
+         * Don't bother with inline-data. There isn't anything
+         * to read-ahead in that case anyway...
+         */
+        if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                goto out_unlock;
+        /*
+         * Check whether a remote node truncated this file - we just
+         * drop out in that case as it's not worth handling here.
+         */
+        last = list_entry(pages->prev, struct page, lru);
+        start = (loff_t)last->index << PAGE_CACHE_SHIFT;
+        if (start >= i_size_read(inode))
+                goto out_unlock;
+        err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
+out_unlock:
+        up_read(&oi->ip_alloc_sem);
+        ocfs2_inode_unlock(inode, 0);
+        return err;
+}
 /* Note: Because we don't support holes, our allocation has
 * already happened (allocation writes zeros to the file data)
 * so we don't have to worry about ordered writes in
@@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
         * accessed concurrently from multiple nodes.
         */
        if (!INODE_JOURNAL(inode)) {
-                err = ocfs2_meta_lock(inode, NULL, 0);
+                err = ocfs2_inode_lock(inode, NULL, 0);
                if (err) {
                        if (err != -ENOENT)
                                mlog_errno(err);
@@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
        if (!INODE_JOURNAL(inode)) {
                up_read(&OCFS2_I(inode)->ip_alloc_sem);
-                ocfs2_meta_unlock(inode, 0);
+                ocfs2_inode_unlock(inode, 0);
        }
        if (err) {
@@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw,
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;
-        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-                /*
-                 * We get PR data locks even for O_DIRECT.  This
-                 * allows concurrent O_DIRECT I/O but doesn't let
-                 * O_DIRECT with extending and buffered zeroing writes
-                 * race.  If they did race then the buffered zeroing
-                 * could be written back after the O_DIRECT I/O.  It's
-                 * one thing to tell people not to mix buffered and
-                 * O_DIRECT writes, but expecting them to understand
-                 * that file extension is also an implicit buffered
-                 * write is too much.  By getting the PR we force
-                 * writeback of the buffered zeroing before
-                 * proceeding.
-                 */
-                ret = ocfs2_data_lock(inode, 0);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-                ocfs2_data_unlock(inode, 0);
-        }
        ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
                                            inode->i_sb->s_bdev, iov, offset,
                                            nr_segs, 
                                            ocfs2_direct_IO_get_blocks,
                                            ocfs2_dio_end_io);
-out:
        mlog_exit(ret);
        return ret;
 }
@@ -729,6 +762,27 @@ static void ocfs2_clear_page_regions(struct page *page,
 }
 /*
+ * Nonsparse file systems fully allocate before we get to the write
+ * code. This prevents ocfs2_write() from tagging the write as an
+ * allocating one, which means ocfs2_map_page_blocks() might try to
+ * read in the blocks at the tail of our file. Avoid reading them by
+ * testing i_size against each block offset.
+ */
+static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
+                                 unsigned int block_start)
+{
+        u64 offset = page_offset(page) + block_start;
+        if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+                return 1;
+        if (i_size_read(inode) > offset)
+                return 1;
+        return 0;
+}
+/*
 * Some of this taken from block_prepare_write(). We already have our
 * mapping by now though, and the entire write will be allocating or
 * it won't, so not much need to use BH_New.
@@ -781,6 +835,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
                                set_buffer_uptodate(bh);
                } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                           !buffer_new(bh) &&
+                           ocfs2_should_read_blk(inode, page, block_start) &&
                           (block_start < from || block_end > to)) {
                        ll_rw_block(READ, 1, &bh);
                        *wait_bh++=bh;
@@ -1492,7 +1547,7 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
 {
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-        if (new_size < le16_to_cpu(di->id2.i_data.id_count))
+        if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
                return 1;
        return 0;
 }
@@ -1732,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
        struct buffer_head *di_bh = NULL;
        struct inode *inode = mapping->host;
-        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret) {
                mlog_errno(ret);
                return ret;
@@ -1747,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
         */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
-        ret = ocfs2_data_lock(inode, 1);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_fail;
-        }
        ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
                                       fsdata, di_bh, NULL);
        if (ret) {
                mlog_errno(ret);
-                goto out_fail_data;
+                goto out_fail;
        }
        brelse(di_bh);
        return 0;
-out_fail_data:
-        ocfs2_data_unlock(inode, 1);
 out_fail:
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        brelse(di_bh);
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        return ret;
 }
@@ -1886,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
        ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
-        ocfs2_data_unlock(inode, 1);
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        return ret;
 }
 const struct address_space_operations ocfs2_aops = {
        .readpage       = ocfs2_readpage,
+        .readpages      = ocfs2_readpages,
        .writepage      = ocfs2_writepage,
        .write_begin    = ocfs2_write_begin,
        .write_end      = ocfs2_write_end,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f6..f136639f5b41 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
                 * information for this bh as it's not marked locally
                 * uptodate. */
                ret = -EIO;
-                brelse(bh);
+                put_bh(bh);
        }
        mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                                 * for this bh as it's not marked locally
                                 * uptodate. */
                                status = -EIO;
-                                brelse(bh);
+                                put_bh(bh);
                                bhs[i] = NULL;
                                continue;
                        }
@@ -280,3 +280,64 @@ bail:
        mlog_exit(status);
        return status;
 }
+/* Check whether the blkno is the super block or one of the backups. */
+static void ocfs2_check_super_or_backup(struct super_block *sb,
+                                        sector_t blkno)
+{
+        int i;
+        u64 backup_blkno;
+        if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
+                return;
+        for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+                backup_blkno = ocfs2_backup_super_blkno(sb, i);
+                if (backup_blkno == blkno)
+                        return;
+        }
+        BUG();
+}
+/*
+ * Writing the super block and its backups doesn't need to collaborate with
+ * the journal, so we don't need to lock ip_io_mutex and the inode doesn't
+ * need to be passed into this function.
+ */
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+                                struct buffer_head *bh)
+{
+        int ret = 0;
+        mlog_entry_void();
+        BUG_ON(buffer_jbd(bh));
+        ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
+        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
+                ret = -EROFS;
+                goto out;
+        }
+        lock_buffer(bh);
+        set_buffer_uptodate(bh);
+        /* remove from dirty list before I/O. */
+        clear_buffer_dirty(bh);
+        get_bh(bh); /* for end_buffer_write_sync() */
+        bh->b_end_io = end_buffer_write_sync;
+        submit_bh(WRITE, bh);
+        wait_on_buffer(bh);
+        if (!buffer_uptodate(bh)) {
+                ret = -EIO;
+                put_bh(bh);
+        }
+out:
+        mlog_exit(ret);
+        return ret;
+}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac3..c2e78614c3e5 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super          *osb,
                      int                  flags,
                      struct inode        *inode);
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+                                struct buffer_head *bh);
 #define OCFS2_BH_CACHED            1
 #define OCFS2_BH_READAHEAD         8
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9cc7c0418b70..f02ccb34604d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -267,7 +267,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
                current_page = cs / spp;
                page = reg->hr_slot_data[current_page];
-                vec_len = min(PAGE_CACHE_SIZE,
+                vec_len = min(PAGE_CACHE_SIZE - vec_start,
                              (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
                mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 35397dd5ecdb..e511339886b3 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -35,7 +35,7 @@
 #define O2HB_LIVE_THRESHOLD        2
 /* number of equal samples to be seen as dead */
 extern unsigned int o2hb_dead_threshold;
-#define O2HB_DEFAULT_DEAD_THRESHOLD        7
+#define O2HB_DEFAULT_DEAD_THRESHOLD        31
 /* Otherwise MAX_WRITE_TIMEOUT will be zero... */
 #define O2HB_MIN_DEAD_THRESHOLD   2
 #define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index a4882c8df945..23c732f27529 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -146,7 +146,7 @@ static struct kset mlog_kset = {
        .kobj   = {.ktype = &mlog_ktype},
 };
-int mlog_sys_init(struct kset *o2cb_subsys)
+int mlog_sys_init(struct kset *o2cb_kset)
 {
        int i = 0;
@@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
        mlog_attr_ptrs[i] = NULL;
        kobject_set_name(&mlog_kset.kobj, "logmask");
-        kobj_set_kset_s(&mlog_kset, *o2cb_subsys);
+        mlog_kset.kobj.kset = o2cb_kset;
        return kset_register(&mlog_kset);
 }
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index cd046060114e..597e064bb94f 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -212,7 +212,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 #define mlog_errno(st) do {                                             \
        int _st = (st);                                                 \
        if (_st != -ERESTARTSYS && _st != -EINTR &&                     \
-            _st != AOP_TRUNCATED_PAGE)                                  \
+            _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC)                \
                mlog(ML_ERROR, "status = %lld\n", (long long)_st);      \
 } while (0)
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 64f6f378fd09..0c095ce7723d 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -28,96 +28,55 @@
 #include <linux/module.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
+#include <linux/fs.h>
 #include "ocfs2_nodemanager.h"
 #include "masklog.h"
 #include "sys.h"
-struct o2cb_attribute {
-        struct attribute        attr;
-        ssize_t (*show)(char *buf);
-        ssize_t (*store)(const char *buf, size_t count);
-};
-#define O2CB_ATTR(_name, _mode, _show, _store)  \
-struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
-#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
-static ssize_t o2cb_interface_revision_show(char *buf)
+static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
+                            char *buf)
 {
        return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
 }
+static struct kobj_attribute attr_version =
-static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
+        __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
 static struct attribute *o2cb_attrs[] = {
-        &o2cb_attr_interface_revision.attr,
+        &attr_version.attr,
        NULL,
 };
-static ssize_t
+static struct attribute_group o2cb_attr_group = {
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
+        .attrs = o2cb_attrs,
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
-           const char * buffer, size_t count);
-static struct sysfs_ops o2cb_sysfs_ops = {
-        .show   = o2cb_show,
-        .store  = o2cb_store,
 };
-static struct kobj_type o2cb_subsys_type = {
+static struct kset *o2cb_kset;
-        .default_attrs  = o2cb_attrs,
-        .sysfs_ops      = &o2cb_sysfs_ops,
-};
-/* gives us o2cb_subsys */
-static decl_subsys(o2cb, NULL, NULL);
-static ssize_t
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
-{
-        struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-        struct kset *sbs = to_kset(kobj);
-        BUG_ON(sbs != &o2cb_subsys);
-        if (o2cb_attr->show)
-                return o2cb_attr->show(buffer);
-        return -EIO;
-}
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
-             const char * buffer, size_t count)
-{
-        struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-        struct kset *sbs = to_kset(kobj);
-        BUG_ON(sbs != &o2cb_subsys);
-        if (o2cb_attr->store)
-                return o2cb_attr->store(buffer, count);
-        return -EIO;
-}
 void o2cb_sys_shutdown(void)
 {
        mlog_sys_shutdown();
-        subsystem_unregister(&o2cb_subsys);
+        kset_unregister(o2cb_kset);
 }
 int o2cb_sys_init(void)
 {
        int ret;
-        o2cb_subsys.kobj.ktype = &o2cb_subsys_type;
+        o2cb_kset = kset_create_and_add("o2cb", NULL, NULL);
-        ret = subsystem_register(&o2cb_subsys);
+        if (!o2cb_kset)
+                return -ENOMEM;
+        ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
        if (ret)
-                return ret;
+                goto error;
-        ret = mlog_sys_init(&o2cb_subsys);
+        ret = mlog_sys_init(o2cb_kset);
        if (ret)
-                subsystem_unregister(&o2cb_subsys);
+                goto error;
+        return 0;
+error:
+        kset_unregister(o2cb_kset);
        return ret;
 }
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 685c18065c82..ee50c9610e7f 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -58,6 +58,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/kref.h>
+#include <linux/net.h>
 #include <net/tcp.h>
 #include <asm/uaccess.h>
@@ -71,14 +72,6 @@
 #include "tcp_internal.h"
-/* 
- * The linux network stack isn't sparse endian clean.. It has macros like
- * ntohs() which perform the endian checks and structs like sockaddr_in
- * which aren't annotated.  So __force is found here to get the build
- * clean.  When they emerge from the dark ages and annotate the code
- * we can remove these.
- */
 #define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
 #define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num,    \
                          NIPQUAD(sc->sc_node->nd_ipv4_address),        \
@@ -616,8 +609,7 @@ static void o2net_shutdown_sc(struct work_struct *work)
                del_timer_sync(&sc->sc_idle_timeout);
                o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
                sc_put(sc);
-                sc->sc_sock->ops->shutdown(sc->sc_sock,
+                kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
-                                           RCV_SHUTDOWN|SEND_SHUTDOWN);
        }
        /* not fatal so failed connects before the other guy has our
@@ -1500,7 +1492,7 @@ static void o2net_start_connect(struct work_struct *work)
        myaddr.sin_family = AF_INET;
        myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
-        myaddr.sin_port = (__force u16)htons(0); /* any port */
+        myaddr.sin_port = htons(0); /* any port */
        ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
                              sizeof(myaddr));
@@ -1701,11 +1693,11 @@ static int o2net_accept_one(struct socket *sock)
        if (ret < 0)
                goto out;
-        node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
+        node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
        if (node == NULL) {
                mlog(ML_NOTICE, "attempt to connect from unknown node at "
                     "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
-                     ntohs((__force __be16)sin.sin_port));
+                     ntohs(sin.sin_port));
                ret = -EINVAL;
                goto out;
        }
@@ -1714,7 +1706,7 @@ static int o2net_accept_one(struct socket *sock)
                mlog(ML_NOTICE, "unexpected connect attempted from a lower "
                     "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
                     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
-                     ntohs((__force __be16)sin.sin_port), node->nd_num);
+                     ntohs(sin.sin_port), node->nd_num);
                ret = -EINVAL;
                goto out;
        }
@@ -1725,7 +1717,7 @@ static int o2net_accept_one(struct socket *sock)
                mlog(ML_CONN, "attempt to connect from node '%s' at "
                     "%u.%u.%u.%u:%d but it isn't heartbeating\n",
                     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
-                     ntohs((__force __be16)sin.sin_port));
+                     ntohs(sin.sin_port));
                ret = -EINVAL;
                goto out;
        }
@@ -1742,7 +1734,7 @@ static int o2net_accept_one(struct socket *sock)
                mlog(ML_NOTICE, "attempt to connect from node '%s' at "
                     "%u.%u.%u.%u:%d but it already has an open connection\n",
                     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
-                     ntohs((__force __be16)sin.sin_port));
+                     ntohs(sin.sin_port));
                goto out;
        }
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index da880fc215f0..f36f66aab3dd 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
 /* same as hb delay, we're waiting for another node to recognize our hb */
 #define O2NET_RECONNECT_DELAY_MS_DEFAULT        2000
-#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT        5000
+#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT        2000
-#define O2NET_IDLE_TIMEOUT_MS_DEFAULT           10000
+#define O2NET_IDLE_TIMEOUT_MS_DEFAULT           30000
 /* TODO: figure this out.... */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 9606111fe89d..b2e832aca567 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
 * locking semantics of the file system using the protocol.  It should 
 * be somewhere else, I'm sure, but right now it isn't.
 *
+ * New in version 10:
+ *      - Meta/data locks combined
+ *
+ * New in version 9:
+ *      - All votes removed
+ *
 * New in version 8:
 *      - Replace delete inode votes with a cluster lock
 *
@@ -60,7 +66,7 @@
 *      - full 64 bit i_size in the metadata lock lvbs
 *      - introduction of "rw" lock and pushing meta/data locking down
 */
-#define O2NET_PROTOCOL_VERSION 8ULL
+#define O2NET_PROTOCOL_VERSION 10ULL
 struct o2net_handshake {
        __be64  protocol_version;
        __be64  connector_id;
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
index 7286c48bb30d..a56eee6abad3 100644
--- a/fs/ocfs2/cluster/ver.c
+++ b/fs/ocfs2/cluster/ver.c
@@ -28,7 +28,7 @@
 #include "ver.h"
-#define CLUSTER_BUILD_VERSION "1.3.3"
+#define CLUSTER_BUILD_VERSION "1.5.0"
 #define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 3094ddb7a254..b1cc7c381e88 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
 /*
 * Walk the inode alias list, and find a dentry which has a given
 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
- * is looking for a dentry_lock reference. The vote thread is looking
+ * is looking for a dentry_lock reference. The downconvert thread is
- * to unhash aliases, so we allow it to skip any that already have
+ * looking to unhash aliases, so we allow it to skip any that already
- * that property.
+ * have that property.
 */
 struct dentry *ocfs2_find_local_alias(struct inode *inode,
                                      u64 parent_blkno,
@@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
        dl->dl_count = 0;
        /*
         * Does this have to happen below, for all attaches, in case
-         * the struct inode gets blown away by votes?
+         * the struct inode gets blown away by the downconvert thread?
         */
        dl->dl_inode = igrab(inode);
        dl->dl_parent_blkno = parent_blkno;
@@ -318,9 +318,9 @@ out_attach:
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
                                   struct ocfs2_dentry_lock *dl)
 {
+        iput(dl->dl_inode);
        ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
        ocfs2_lock_res_free(&dl->dl_lockres);
-        iput(dl->dl_inode);
        kfree(dl);
 }
@@ -344,12 +344,24 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
        struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
-        mlog_bug_on_msg(!dl && !(dentry->d_flags & DCACHE_DISCONNECTED),
+        if (!dl) {
-                        "dentry: %.*s\n", dentry->d_name.len,
+                /*
-                        dentry->d_name.name);
+                 * No dentry lock is ok if we're disconnected or
+                 * unhashed.
+                 */
+                if (!(dentry->d_flags & DCACHE_DISCONNECTED) &&
+                    !d_unhashed(dentry)) {
+                        unsigned long long ino = 0ULL;
+                        if (inode)
+                                ino = (unsigned long long)OCFS2_I(inode)->ip_blkno;
+                        mlog(ML_ERROR, "Dentry is missing cluster lock. "
+                             "inode: %llu, d_flags: 0x%x, d_name: %.*s\n",
+                             ino, dentry->d_flags, dentry->d_name.len,
+                             dentry->d_name.name);
+                }
-        if (!dl)
                goto out;
+        }
        mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n",
                        dentry->d_name.len, dentry->d_name.name,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 6a2f143e269c..6b0107f21344 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -208,9 +208,9 @@ out:
        return NULL;
 }
-struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
+static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
-                                        struct inode *dir,
+                                               struct inode *dir,
-                                        struct ocfs2_dir_entry **res_dir)
+                                               struct ocfs2_dir_entry **res_dir)
 {
        struct super_block *sb;
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
        mlog_entry("dirino=%llu\n",
                   (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+        error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
        if (lock_level && error >= 0) {
                /* We release EX lock which used to update atime
                 * and get PR lock again to reduce contention
                 * on commonly accessed directories. */
-                ocfs2_meta_unlock(inode, 1);
+                ocfs2_inode_unlock(inode, 1);
                lock_level = 0;
-                error = ocfs2_meta_lock(inode, NULL, 0);
+                error = ocfs2_inode_lock(inode, NULL, 0);
        }
        if (error < 0) {
                if (error != -ENOENT)
@@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
        error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
                                      dirent, filldir, NULL);
-        ocfs2_meta_unlock(inode, lock_level);
+        ocfs2_inode_unlock(inode, lock_level);
 bail_nolock:
        mlog_exit(error);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
index d2be3ad841f9..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -28,7 +28,7 @@
 #include "dlmfsver.h"
-#define DLM_BUILD_VERSION "1.3.3"
+#define DLM_BUILD_VERSION "1.5.0"
 #define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 62e4a7daa286..a54d33d95ada 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -908,7 +908,7 @@ lookup:
                 * but they might own this lockres.  wait on them. */
                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
                if (bit < O2NM_MAX_NODES) {
-                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
                             "recover before lock mastery can begin\n",
                             dlm->name, namelen, (char *)lockid, bit);
                        wait_on_recovery = 1;
@@ -962,7 +962,7 @@ redo_request:
                spin_lock(&dlm->spinlock);
                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
                if (bit < O2NM_MAX_NODES) {
-                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+                        mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
                             "recover before lock mastery can begin\n",
                             dlm->name, namelen, (char *)lockid, bit);
                        wait_on_recovery = 1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2fde7bf91434..91f747b8a538 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
                }
        }
+        /* Clean up join state on node death. */
+        if (dlm->joining_node == idx) {
+                mlog(0, "Clearing join state for node %u\n", idx);
+                __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+        }
        /* check to see if the node is already considered dead */
        if (!test_bit(idx, dlm->live_nodes_map)) {
                mlog(0, "for domain %s, node %d is already dead. "
@@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
        clear_bit(idx, dlm->live_nodes_map);
-        /* Clean up join state on node death. */
-        if (dlm->joining_node == idx) {
-                mlog(0, "Clearing join state for node %u\n", idx);
-                __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
-        }
        /* make sure local cleanup occurs before the heartbeat events */
        if (!test_bit(idx, dlm->recovery_map))
                dlm_do_local_recovery_cleanup(dlm, idx);
@@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
        if (!dlm_grab(dlm))
                return;
+        /*
+         * This will notify any dlm users that a node in our domain
+         * went away without notifying us first.
+         */
+        if (test_bit(idx, dlm->domain_map))
+                dlm_fire_domain_eviction_callbacks(dlm, idx);
        spin_lock(&dlm->spinlock);
        __dlm_hb_node_down(dlm, idx);
        spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
index 7ef2653f8f41..dfc0da4d158d 100644
--- a/fs/ocfs2/dlm/dlmver.c
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -28,7 +28,7 @@
 #include "dlmver.h"
-#define DLM_BUILD_VERSION "1.3.3"
+#define DLM_BUILD_VERSION "1.5.0"
 #define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 41c76ff2fcfb..3867244fb144 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
-#include "vote.h"
 #include "buffer_head_io.h"
@@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
 /*
 * Return value from ->downconvert_worker functions.
@@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
        struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
        /*
-         * Optionally called in the downconvert (or "vote") thread
+         * Optionally called in the downconvert thread after a
-         * after a successful downconvert. The lockres will not be
+         * successful downconvert. The lockres will not be referenced
-         * referenced after this callback is called, so it is safe to
+         * after this callback is called, so it is safe to free
-         * free memory, etc.
+         * memory, etc.
         *
         * The exact semantics of when this is called are controlled
         * by ->downconvert_worker()
@@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
        .flags          = 0,
 };
-static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
        .get_osb        = ocfs2_get_inode_osb,
        .check_downconvert = ocfs2_check_meta_downconvert,
        .set_lvb        = ocfs2_set_meta_lvb,
-        .flags          = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
-};
-static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
-        .get_osb        = ocfs2_get_inode_osb,
        .downconvert_worker = ocfs2_data_convert_worker,
-        .flags          = 0,
+        .flags          = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
 };
 static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
        .flags          = 0,
 };
+static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
+        .get_osb        = ocfs2_get_file_osb,
+        .flags          = 0,
+};
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
        return lockres->l_type == OCFS2_LOCK_TYPE_META ||
-                lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
                lockres->l_type == OCFS2_LOCK_TYPE_RW ||
                lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
@@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
                "resource %s: %s\n", dlm_errname(_stat), _func, \
                _lockres->l_name, dlm_errmsg(_stat));           \
 } while (0)
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+static int ocfs2_downconvert_thread(void *arg);
-                                 struct ocfs2_lock_res *lockres);
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
-static int ocfs2_meta_lock_update(struct inode *inode,
+                                        struct ocfs2_lock_res *lockres);
+static int ocfs2_inode_lock_update(struct inode *inode,
                                  struct buffer_head **bh);
 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
 static inline int ocfs2_highest_compat_lock_level(int level);
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+                                      int new_level);
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+                                  struct ocfs2_lock_res *lockres,
+                                  int new_level,
+                                  int lvb);
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+                                        struct ocfs2_lock_res *lockres);
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+                                struct ocfs2_lock_res *lockres);
 static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
                                  u64 blkno,
@@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
                        ops = &ocfs2_inode_rw_lops;
                        break;
                case OCFS2_LOCK_TYPE_META:
-                        ops = &ocfs2_inode_meta_lops;
+                        ops = &ocfs2_inode_inode_lops;
-                        break;
-                case OCFS2_LOCK_TYPE_DATA:
-                        ops = &ocfs2_inode_data_lops;
                        break;
                case OCFS2_LOCK_TYPE_OPEN:
                        ops = &ocfs2_inode_open_lops;
@@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
        return OCFS2_SB(inode->i_sb);
 }
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
+{
+        struct ocfs2_file_private *fp = lockres->l_priv;
+        return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
+}
 static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
 {
        __be64 inode_blkno_be;
@@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
                                   &ocfs2_rename_lops, osb);
 }
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+                              struct ocfs2_file_private *fp)
+{
+        struct inode *inode = fp->fp_file->f_mapping->host;
+        struct ocfs2_inode_info *oi = OCFS2_I(inode);
+        ocfs2_lock_res_init_once(lockres);
+        ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
+                              inode->i_generation, lockres->l_name);
+        ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
+                                   OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
+                                   fp);
+        lockres->l_flags |= OCFS2_LOCK_NOCACHE;
+}
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
        mlog_entry_void();
@@ -670,7 +700,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
 {
        mlog_entry_void();
-        BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
+        BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
        BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
        if (lockres->l_requested > LKM_NLMODE &&
@@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
             lockres->l_name, level, lockres->l_level,
             ocfs2_lock_type_string(lockres->l_type));
+        /*
+         * We can skip the bast for locks which don't enable caching -
+         * they'll be dropped at the earliest possible time anyway.
+         */
+        if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
+                return;
        spin_lock_irqsave(&lockres->l_lock, flags);
        needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
        if (needs_downconvert)
@@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
        wake_up(&lockres->l_event);
-        ocfs2_kick_vote_thread(osb);
+        ocfs2_wake_downconvert_thread(osb);
 }
 static void ocfs2_locking_ast(void *opaque)
@@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
 }
+static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
+                                             struct ocfs2_lock_res *lockres)
+{
+        int ret;
+        ret = wait_for_completion_interruptible(&mw->mw_complete);
+        if (ret)
+                lockres_remove_mask_waiter(lockres, mw);
+        else
+                ret = mw->mw_status;
+        /* Re-arm the completion in case we want to wait on it again */
+        INIT_COMPLETION(mw->mw_complete);
+        return ret;
+}
 static int ocfs2_cluster_lock(struct ocfs2_super *osb,
                              struct ocfs2_lock_res *lockres,
                              int level,
@@ -980,18 +1032,6 @@ again:
                goto unlock;
        }
-        if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
-                /* lock has not been created yet. */
-                spin_unlock_irqrestore(&lockres->l_lock, flags);
-                ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-                goto again;
-        }
        if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
            !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
                /* is the lock is currently blocked on behalf of
@@ -1006,7 +1046,14 @@ again:
                        mlog(ML_ERROR, "lockres %s has action %u pending\n",
                             lockres->l_name, lockres->l_action);
-                lockres->l_action = OCFS2_AST_CONVERT;
+                if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+                        lockres->l_action = OCFS2_AST_ATTACH;
+                        lkm_flags &= ~LKM_CONVERT;
+                } else {
+                        lockres->l_action = OCFS2_AST_CONVERT;
+                        lkm_flags |= LKM_CONVERT;
+                }
                lockres->l_requested = level;
                lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -1021,7 +1068,7 @@ again:
                status = dlmlock(osb->dlm,
                                 level,
                                 &lockres->l_lksb,
-                                 lkm_flags|LKM_CONVERT,
+                                 lkm_flags,
                                 lockres->l_name,
                                 OCFS2_LOCK_ID_MAX_LEN - 1,
                                 ocfs2_locking_ast,
@@ -1094,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
        mlog_entry_void();
        spin_lock_irqsave(&lockres->l_lock, flags);
        ocfs2_dec_holders(lockres, level);
-        ocfs2_vote_on_unlock(osb, lockres);
+        ocfs2_downconvert_on_unlock(osb, lockres);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
        mlog_exit_void();
 }
@@ -1152,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
         * We don't want to use LKM_LOCAL on a meta data lock as they
         * don't use a generation in their lock names.
         */
-        ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
+        ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
-        if (ret) {
-                mlog_errno(ret);
-                goto bail;
-        }
-        ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
        if (ret) {
                mlog_errno(ret);
                goto bail;
@@ -1316,76 +1357,221 @@ out:
        mlog_exit_void();
 }
-int ocfs2_data_lock_full(struct inode *inode,
+static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
-                         int write,
+                                     int level)
-                         int arg_flags)
 {
-        int status = 0, level;
+        int ret;
-        struct ocfs2_lock_res *lockres;
+        struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        unsigned long flags;
+        struct ocfs2_mask_waiter mw;
-        BUG_ON(!inode);
+        ocfs2_init_mask_waiter(&mw);
-        mlog_entry_void();
+retry_cancel:
+        spin_lock_irqsave(&lockres->l_lock, flags);
+        if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+                ret = ocfs2_prepare_cancel_convert(osb, lockres);
+                if (ret) {
+                        spin_unlock_irqrestore(&lockres->l_lock, flags);
+                        ret = ocfs2_cancel_convert(osb, lockres);
+                        if (ret < 0) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        goto retry_cancel;
+                }
+                lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+                spin_unlock_irqrestore(&lockres->l_lock, flags);
-        mlog(0, "inode %llu take %s DATA lock\n",
+                ocfs2_wait_for_mask(&mw);
-             (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                goto retry_cancel;
-             write ? "EXMODE" : "PRMODE");
+        }
-        /* We'll allow faking a readonly data lock for
+        ret = -ERESTARTSYS;
-         * rodevices. */
+        /*
-        if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
+         * We may still have gotten the lock, in which case there's no
-                if (write) {
+         * point to restarting the syscall.
-                        status = -EROFS;
+         */
-                        mlog_errno(status);
+        if (lockres->l_level == level)
+                ret = 0;
+        mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
+             lockres->l_flags, lockres->l_level, lockres->l_action);
+        spin_unlock_irqrestore(&lockres->l_lock, flags);
+out:
+        return ret;
+}
+/*
+ * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
+ * flock() calls. The locking approach this requires is sufficiently
+ * different from all other cluster lock types that we implement a
+ * separate path to the "low-level" dlm calls. In particular:
+ *
+ * - No optimization of lock levels is done - we take at exactly
+ *   what's been requested.
+ *
+ * - No lock caching is employed. We immediately downconvert to
+ *   no-lock at unlock time. This also means flock locks never go on
+ *   the blocking list.
+ *
+ * - Since userspace can trivially deadlock itself with flock, we make
+ *   sure to allow cancellation of a misbehaving application's flock()
+ *   request.
+ *
+ * - Access to any flock lockres doesn't require concurrency, so we
+ *   can simplify the code by requiring the caller to guarantee
+ *   serialization of dlmglue flock calls.
+ */
+int ocfs2_file_lock(struct file *file, int ex, int trylock)
+{
+        int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
+        unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
+        unsigned long flags;
+        struct ocfs2_file_private *fp = file->private_data;
+        struct ocfs2_lock_res *lockres = &fp->fp_flock;
+        struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+        struct ocfs2_mask_waiter mw;
+        ocfs2_init_mask_waiter(&mw);
+        if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
+            (lockres->l_level > LKM_NLMODE)) {
+                mlog(ML_ERROR,
+                     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
+                     "level: %u\n", lockres->l_name, lockres->l_flags,
+                     lockres->l_level);
+                return -EINVAL;
+        }
+        spin_lock_irqsave(&lockres->l_lock, flags);
+        if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+                lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+                spin_unlock_irqrestore(&lockres->l_lock, flags);
+                /*
+                 * Get the lock at NLMODE to start - that way we
+                 * can cancel the upconvert request if need be.
+                 */
+                ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
                }
-                goto out;
+                ret = ocfs2_wait_for_mask(&mw);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                spin_lock_irqsave(&lockres->l_lock, flags);
        }
-        if (ocfs2_mount_local(osb))
+        lockres->l_action = OCFS2_AST_CONVERT;
-                goto out;
+        lkm_flags |= LKM_CONVERT;
+        lockres->l_requested = level;
+        lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
-        lockres = &OCFS2_I(inode)->ip_data_lockres;
+        lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+        spin_unlock_irqrestore(&lockres->l_lock, flags);
-        level = write ? LKM_EXMODE : LKM_PRMODE;
+        ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
+                      lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
+                      ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
+        if (ret != DLM_NORMAL) {
+                if (trylock && ret == DLM_NOTQUEUED)
+                        ret = -EAGAIN;
+                else {
+                        ocfs2_log_dlm_error("dlmlock", ret, lockres);
+                        ret = -EINVAL;
+                }
-        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
+                ocfs2_recover_from_dlm_error(lockres, 1);
-                                    0, arg_flags);
+                lockres_remove_mask_waiter(lockres, &mw);
-        if (status < 0 && status != -EAGAIN)
+                goto out;
-                mlog_errno(status);
+        }
+        ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
+        if (ret == -ERESTARTSYS) {
+                /*
+                 * Userspace can cause deadlock itself with
+                 * flock(). Current behavior locally is to allow the
+                 * deadlock, but abort the system call if a signal is
+                 * received. We follow this example, otherwise a
+                 * poorly written program could sit in kernel until
+                 * reboot.
+                 *
+                 * Handling this is a bit more complicated for Ocfs2
+                 * though. We can't exit this function with an
+                 * outstanding lock request, so a cancel convert is
+                 * required. We intentionally overwrite 'ret' - if the
+                 * cancel fails and the lock was granted, it's easier
+                 * to just bubble success back up to the user.
+                 */
+                ret = ocfs2_flock_handle_signal(lockres, level);
+        }
 out:
-        mlog_exit(status);
-        return status;
+        mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
+             lockres->l_name, ex, trylock, ret);
+        return ret;
 }
-/* see ocfs2_meta_lock_with_page() */
+void ocfs2_file_unlock(struct file *file)
-int ocfs2_data_lock_with_page(struct inode *inode,
-                              int write,
-                              struct page *page)
 {
        int ret;
+        unsigned long flags;
+        struct ocfs2_file_private *fp = file->private_data;
+        struct ocfs2_lock_res *lockres = &fp->fp_flock;
+        struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+        struct ocfs2_mask_waiter mw;
-        ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
+        ocfs2_init_mask_waiter(&mw);
-        if (ret == -EAGAIN) {
-                unlock_page(page);
+        if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
-                if (ocfs2_data_lock(inode, write) == 0)
+                return;
-                        ocfs2_data_unlock(inode, write);
-                ret = AOP_TRUNCATED_PAGE;
+        if (lockres->l_level == LKM_NLMODE)
+                return;
+        mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
+             lockres->l_name, lockres->l_flags, lockres->l_level,
+             lockres->l_action);
+        spin_lock_irqsave(&lockres->l_lock, flags);
+        /*
+         * Fake a blocking ast for the downconvert code.
+         */
+        lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+        lockres->l_blocking = LKM_EXMODE;
+        ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+        lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+        spin_unlock_irqrestore(&lockres->l_lock, flags);
+        ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
+        if (ret) {
+                mlog_errno(ret);
+                return;
        }
-        return ret;
+        ret = ocfs2_wait_for_mask(&mw);
+        if (ret)
+                mlog_errno(ret);
 }
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
-                                 struct ocfs2_lock_res *lockres)
+                                        struct ocfs2_lock_res *lockres)
 {
        int kick = 0;
        mlog_entry_void();
        /* If we know that another node is waiting on our lock, kick
-         * the vote thread * pre-emptively when we reach a release
+         * the downconvert thread pre-emptively when we reach a release
         * condition. */
        if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
                switch(lockres->l_blocking) {
@@ -1403,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
        }
        if (kick)
-                ocfs2_kick_vote_thread(osb);
+                ocfs2_wake_downconvert_thread(osb);
-        mlog_exit_void();
-}
-void ocfs2_data_unlock(struct inode *inode,
-                       int write)
-{
-        int level = write ? LKM_EXMODE : LKM_PRMODE;
-        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        mlog_entry_void();
-        mlog(0, "inode %llu drop %s DATA lock\n",
-             (unsigned long long)OCFS2_I(inode)->ip_blkno,
-             write ? "EXMODE" : "PRMODE");
-        if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
-            !ocfs2_mount_local(osb))
-                ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
        mlog_exit_void();
 }
@@ -1447,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
 /* Call this with the lockres locked. I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
- * values is supposed to be blocked in ocfs2_meta_lock right now. */
+ * values is supposed to be blocked in ocfs2_inode_lock right now. */
 static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 {
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+        struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
        struct ocfs2_meta_lvb *lvb;
        mlog_entry_void();
@@ -1501,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
 static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 {
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+        struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
        struct ocfs2_meta_lvb *lvb;
        mlog_entry_void();
@@ -1609,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
 }
 /* may or may not return a bh if it went to disk. */
-static int ocfs2_meta_lock_update(struct inode *inode,
+static int ocfs2_inode_lock_update(struct inode *inode,
                                  struct buffer_head **bh)
 {
        int status = 0;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+        struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
        struct ocfs2_dinode *fe;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1726,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
 * returns < 0 error if the callback will never be called, otherwise
 * the result of the lock will be communicated via the callback.
 */
-int ocfs2_meta_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full(struct inode *inode,
                         struct buffer_head **ret_bh,
                         int ex,
                         int arg_flags)
@@ -1761,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
                wait_event(osb->recovery_event,
                           ocfs2_node_map_is_empty(osb, &osb->recovery_map));
-        lockres = &OCFS2_I(inode)->ip_meta_lockres;
+        lockres = &OCFS2_I(inode)->ip_inode_lockres;
        level = ex ? LKM_EXMODE : LKM_PRMODE;
        dlm_flags = 0;
        if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1800,11 +1966,11 @@ local:
        }
        /* This is fun. The caller may want a bh back, or it may
-         * not. ocfs2_meta_lock_update definitely wants one in, but
+         * not. ocfs2_inode_lock_update definitely wants one in, but
         * may or may not read one, depending on what's in the
         * LVB. The result of all of this is that we've *only* gone to
         * disk if we have to, so the complexity is worthwhile. */
-        status = ocfs2_meta_lock_update(inode, &local_bh);
+        status = ocfs2_inode_lock_update(inode, &local_bh);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1826,7 +1992,7 @@ bail:
                        *ret_bh = NULL;
                }
                if (acquired)
-                        ocfs2_meta_unlock(inode, ex);
+                        ocfs2_inode_unlock(inode, ex);
        }
        if (local_bh)
@@ -1837,19 +2003,20 @@ bail:
 }
 /*
- * This is working around a lock inversion between tasks acquiring DLM locks
+ * This is working around a lock inversion between tasks acquiring DLM
- * while holding a page lock and the vote thread which blocks dlm lock acquiry
+ * locks while holding a page lock and the downconvert thread which
- * while acquiring page locks.
+ * blocks dlm lock acquisition while acquiring page locks.
 *
 * ** These _with_page variantes are only intended to be called from aop
 * methods that hold page locks and return a very specific *positive* error
 * code that aop methods pass up to the VFS -- test for errors with != 0. **
 *
- * The DLM is called such that it returns -EAGAIN if it would have blocked
+ * The DLM is called such that it returns -EAGAIN if it would have
- * waiting for the vote thread.  In that case we unlock our page so the vote
+ * blocked waiting for the downconvert thread.  In that case we unlock
- * thread can make progress.  Once we've done this we have to return
+ * our page so the downconvert thread can make progress.  Once we've
- * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
+ * done this we have to return AOP_TRUNCATED_PAGE so the aop method
- * into the VFS who will then immediately retry the aop call.
+ * that called us can bubble that back up into the VFS who will then
+ * immediately retry the aop call.
 *
 * We do a blocking lock and immediate unlock before returning, though, so that
 * the lock has a great chance of being cached on this node by the time the VFS
@@ -1857,32 +2024,32 @@ bail:
 * ping locks back and forth, but that's a risk we're willing to take to avoid
 * the lock inversion simply.
 */
-int ocfs2_meta_lock_with_page(struct inode *inode,
+int ocfs2_inode_lock_with_page(struct inode *inode,
                              struct buffer_head **ret_bh,
                              int ex,
                              struct page *page)
 {
        int ret;
-        ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
+        ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
        if (ret == -EAGAIN) {
                unlock_page(page);
-                if (ocfs2_meta_lock(inode, ret_bh, ex) == 0)
+                if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
-                        ocfs2_meta_unlock(inode, ex);
+                        ocfs2_inode_unlock(inode, ex);
                ret = AOP_TRUNCATED_PAGE;
        }
        return ret;
 }
-int ocfs2_meta_lock_atime(struct inode *inode,
+int ocfs2_inode_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
                          int *level)
 {
        int ret;
        mlog_entry_void();
-        ret = ocfs2_meta_lock(inode, NULL, 0);
+        ret = ocfs2_inode_lock(inode, NULL, 0);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
@@ -1895,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
        if (ocfs2_should_update_atime(inode, vfsmnt)) {
                struct buffer_head *bh = NULL;
-                ocfs2_meta_unlock(inode, 0);
+                ocfs2_inode_unlock(inode, 0);
-                ret = ocfs2_meta_lock(inode, &bh, 1);
+                ret = ocfs2_inode_lock(inode, &bh, 1);
                if (ret < 0) {
                        mlog_errno(ret);
                        return ret;
@@ -1913,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
        return ret;
 }
-void ocfs2_meta_unlock(struct inode *inode,
+void ocfs2_inode_unlock(struct inode *inode,
                       int ex)
 {
        int level = ex ? LKM_EXMODE : LKM_PRMODE;
-        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog_entry_void();
@@ -2325,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
                goto bail;
        }
-        /* launch vote thread */
+        /* launch downconvert thread */
-        osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
+        osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
-        if (IS_ERR(osb->vote_task)) {
+        if (IS_ERR(osb->dc_task)) {
-                status = PTR_ERR(osb->vote_task);
+                status = PTR_ERR(osb->dc_task);
-                osb->vote_task = NULL;
+                osb->dc_task = NULL;
                mlog_errno(status);
                goto bail;
        }
@@ -2358,8 +2525,8 @@ local:
 bail:
        if (status < 0) {
                ocfs2_dlm_shutdown_debug(osb);
-                if (osb->vote_task)
+                if (osb->dc_task)
-                        kthread_stop(osb->vote_task);
+                        kthread_stop(osb->dc_task);
        }
        mlog_exit(status);
@@ -2374,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
        ocfs2_drop_osb_locks(osb);
-        if (osb->vote_task) {
+        if (osb->dc_task) {
-                kthread_stop(osb->vote_task);
+                kthread_stop(osb->dc_task);
-                osb->vote_task = NULL;
+                osb->dc_task = NULL;
        }
        ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2532,7 +2699,7 @@ out:
 /* Mark the lockres as being dropped. It will no longer be
 * queued if blocking, but we still may have to wait on it
- * being dequeued from the vote thread before we can consider
+ * being dequeued from the downconvert thread before we can consider
 * it safe to drop. 
 *
 * You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2595,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
        status = err;
        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-                              &OCFS2_I(inode)->ip_data_lockres);
+                              &OCFS2_I(inode)->ip_inode_lockres);
-        if (err < 0)
-                mlog_errno(err);
-        if (err < 0 && !status)
-                status = err;
-        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-                              &OCFS2_I(inode)->ip_meta_lockres);
        if (err < 0)
                mlog_errno(err);
        if (err < 0 && !status)
@@ -2855,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
        inode = ocfs2_lock_res_inode(lockres);
        mapping = inode->i_mapping;
+        /*
+         * The mapping work below (unmap, writeback, wait) is for
+         * regular file data, so skip it for any other inode type.
+         */
+        if (!S_ISREG(inode->i_mode))
+                goto out;
        /*
         * We need this before the filemap_fdatawrite() so that it can
         * transfer the dirty bit from the PTE to the
@@ -2880,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
                filemap_fdatawait(mapping);
        }
+out:
        return UNBLOCK_CONTINUE;
 }
@@ -2908,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
 /*
 * Does the final reference drop on our dentry lock. Right now this
- * happens in the vote thread, but we could choose to simplify the
+ * happens in the downconvert thread, but we could choose to simplify the
 * dlmglue API and push these off to the ocfs2_wq in the future.
 */
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3047,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
        mlog(0, "lockres %s blocked.\n", lockres->l_name);
        /* Detect whether a lock has been marked as going away while
-         * the vote thread was processing other things. A lock can
+         * the downconvert thread was processing other things. A lock can
         * still be marked with OCFS2_LOCK_FREEING after this check,
         * but short circuiting here will still save us some
         * performance. */
@@ -3096,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
        lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
-        spin_lock(&osb->vote_task_lock);
+        spin_lock(&osb->dc_task_lock);
        if (list_empty(&lockres->l_blocked_list)) {
                list_add_tail(&lockres->l_blocked_list,
                              &osb->blocked_lock_list);
                osb->blocked_lock_count++;
        }
-        spin_unlock(&osb->vote_task_lock);
+        spin_unlock(&osb->dc_task_lock);
        mlog_exit_void();
 }
+/*
+ * Run one pass over osb->blocked_lock_list for the downconvert thread.
+ *
+ * The number of entries to handle is sampled from
+ * osb->blocked_lock_count once, under dc_task_lock, and at most that
+ * many lock resources are taken off the head of the list and handed
+ * to ocfs2_process_blocked_lock().
+ */
+static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
+{
+        unsigned long processed;
+        struct ocfs2_lock_res *lockres;
+        mlog_entry_void();
+        spin_lock(&osb->dc_task_lock);
+        /* grab this early so we know to try again if a state change and
+         * wake happens part-way through our work  */
+        osb->dc_work_sequence = osb->dc_wake_sequence;
+        /* Bound this pass by the count seen at entry. */
+        processed = osb->blocked_lock_count;
+        while (processed) {
+                BUG_ON(list_empty(&osb->blocked_lock_list));
+                lockres = list_entry(osb->blocked_lock_list.next,
+                                     struct ocfs2_lock_res, l_blocked_list);
+                list_del_init(&lockres->l_blocked_list);
+                osb->blocked_lock_count--;
+                /* dc_task_lock is not held while the lockres is processed. */
+                spin_unlock(&osb->dc_task_lock);
+                BUG_ON(!processed);
+                processed--;
+                ocfs2_process_blocked_lock(osb, lockres);
+                spin_lock(&osb->dc_task_lock);
+        }
+        spin_unlock(&osb->dc_task_lock);
+        mlog_exit_void();
+}
+/*
+ * Returns 1 if osb->blocked_lock_list is empty, 0 otherwise. The list
+ * is inspected with dc_task_lock held.
+ */
+static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
+{
+        int empty = 0;
+        spin_lock(&osb->dc_task_lock);
+        if (list_empty(&osb->blocked_lock_list))
+                empty = 1;
+        spin_unlock(&osb->dc_task_lock);
+        return empty;
+}
+/*
+ * Returns 1 if osb->dc_wake_sequence differs from
+ * osb->dc_work_sequence, 0 otherwise. Both counters are compared
+ * with dc_task_lock held.
+ */
+static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
+{
+        int should_wake = 0;
+        spin_lock(&osb->dc_task_lock);
+        if (osb->dc_work_sequence != osb->dc_wake_sequence)
+                should_wake = 1;
+        spin_unlock(&osb->dc_task_lock);
+        return should_wake;
+}
+/*
+ * Thread function for the downconvert thread; 'arg' is the
+ * struct ocfs2_super it serves.
+ *
+ * Sleeps on osb->dc_event until a wake is pending or a stop has been
+ * requested, then runs one ocfs2_downconvert_thread_do_work() pass.
+ * Clears osb->dc_task before returning. Always returns 0.
+ */
+int ocfs2_downconvert_thread(void *arg)
+{
+        int status = 0;
+        struct ocfs2_super *osb = arg;
+        /* only quit once we've been asked to stop and there is no more
+         * work available */
+        while (!(kthread_should_stop() &&
+                ocfs2_downconvert_thread_lists_empty(osb))) {
+                wait_event_interruptible(osb->dc_event,
+                                         ocfs2_downconvert_thread_should_wake(osb) ||
+                                         kthread_should_stop());
+                mlog(0, "downconvert_thread: awoken\n");
+                ocfs2_downconvert_thread_do_work(osb);
+        }
+        osb->dc_task = NULL;
+        return status;
+}
+/*
+ * Request a pass of the downconvert thread: bump
+ * osb->dc_wake_sequence under dc_task_lock, then wake any waiter on
+ * osb->dc_event.
+ */
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
+{
+        spin_lock(&osb->dc_task_lock);
+        /* make sure the downconvert thread gets a swipe at whatever
+         * changes the caller may have made to the blocked lock state */
+        osb->dc_wake_sequence++;
+        spin_unlock(&osb->dc_task_lock);
+        wake_up(&osb->dc_event);
+}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 87a785e41205..5f17243ba501 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,12 +49,12 @@ struct ocfs2_meta_lvb {
        __be32       lvb_reserved2;
 };
-/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY        (0x01)
 /* Instruct the dlm not to queue ourselves on the other node. */
 #define OCFS2_META_LOCK_NOQUEUE         (0x02)
-/* don't block waiting for the vote thread, instead return -EAGAIN */
+/* don't block waiting for the downconvert thread, instead return -EAGAIN */
 #define OCFS2_LOCK_NONBLOCK             (0x04)
 int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
                               struct inode *inode);
 void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
                                u64 parent, struct inode *inode);
+struct ocfs2_file_private;
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+                              struct ocfs2_file_private *fp);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
-int ocfs2_data_lock_full(struct inode *inode,
-                         int write,
-                         int arg_flags);
-#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
-int ocfs2_data_lock_with_page(struct inode *inode,
-                              int write,
-                              struct page *page);
-void ocfs2_data_unlock(struct inode *inode,
-                       int write);
 int ocfs2_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
 int ocfs2_open_lock(struct inode *inode);
 int ocfs2_try_open_lock(struct inode *inode, int write);
 void ocfs2_open_unlock(struct inode *inode);
-int ocfs2_meta_lock_atime(struct inode *inode,
+int ocfs2_inode_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
                          int *level);
-int ocfs2_meta_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full(struct inode *inode,
                         struct buffer_head **ret_bh,
                         int ex,
                         int arg_flags);
-int ocfs2_meta_lock_with_page(struct inode *inode,
+int ocfs2_inode_lock_with_page(struct inode *inode,
                              struct buffer_head **ret_bh,
                              int ex,
                              struct page *page);
 /* 99% of the time we don't want to supply any additional flags --
 * those are for very specific cases only. */
-#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0)
+#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
-void ocfs2_meta_unlock(struct inode *inode,
+void ocfs2_inode_unlock(struct inode *inode,
                       int ex);
 int ocfs2_super_lock(struct ocfs2_super *osb,
                     int ex);
@@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
+int ocfs2_file_lock(struct file *file, int ex, int trylock);
+void ocfs2_file_unlock(struct file *file);
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
                               struct ocfs2_lock_res *lockres);
-/* for the vote thread */
+/* for the downconvert thread */
 void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
                                struct ocfs2_lock_res *lockres);
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index ff257628af16..1942e09f6ee5 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
        *var = cpu_to_le64(le64_to_cpu(*var) + val);
 }
-static inline void le32_and_cpu(__le32 *var, u32 val)
-{
-        *var = cpu_to_le32(le32_to_cpu(*var) & val);
-}
 static inline void be32_add_cpu(__be32 *var, u32 val)
 {
        *var = cpu_to_be32(be32_to_cpu(*var) + val);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 535bfa9568a4..67527cebf214 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
                return ERR_PTR(-ESTALE);
        }
-        inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0);
+        inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
        if (IS_ERR(inode))
                return (void *)inode;
@@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
        mlog(0, "find parent of directory %llu\n",
             (unsigned long long)OCFS2_I(dir)->ip_blkno);
-        status = ocfs2_meta_lock(dir, NULL, 0);
+        status = ocfs2_inode_lock(dir, NULL, 0);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
                goto bail_unlock;
        }
-        inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
+        inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
        if (IS_ERR(inode)) {
                mlog(ML_ERROR, "Unable to create inode %llu\n",
                     (unsigned long long)blkno);
@@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
        parent->d_op = &ocfs2_dentry_ops;
 bail_unlock:
-        ocfs2_meta_unlock(dir, 0);
+        ocfs2_inode_unlock(dir, 0);
 bail:
        mlog_exit_ptr(parent);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f92fe91ff260..ed5d5232e85d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -51,6 +51,7 @@
 #include "inode.h"
 #include "ioctl.h"
 #include "journal.h"
+#include "locks.h"
 #include "mmap.h"
 #include "suballoc.h"
 #include "super.h"
@@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
        return sync_mapping_buffers(inode->i_mapping);
 }
+/*
+ * Allocate and initialise the per-open ocfs2_file_private for 'file'
+ * and store it in file->private_data.
+ *
+ * 'inode' is not used in the body. Returns 0 on success or -ENOMEM
+ * if the allocation fails, in which case file->private_data is left
+ * untouched.
+ */
+static int ocfs2_init_file_private(struct inode *inode, struct file *file)
+{
+        struct ocfs2_file_private *fp;
+        /* Zeroed allocation; only the fields below are set explicitly. */
+        fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
+        if (!fp)
+                return -ENOMEM;
+        fp->fp_file = file;
+        mutex_init(&fp->fp_mutex);
+        ocfs2_file_lock_res_init(&fp->fp_flock, fp);
+        file->private_data = fp;
+        return 0;
+}
+/*
+ * Tear down the ocfs2_file_private hanging off file->private_data:
+ * drop and free its fp_flock lock resource, free the structure and
+ * clear file->private_data. Does nothing if private_data is NULL.
+ */
+static void ocfs2_free_file_private(struct inode *inode, struct file *file)
+{
+        struct ocfs2_file_private *fp = file->private_data;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        if (fp) {
+                ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
+                ocfs2_lock_res_free(&fp->fp_flock);
+                kfree(fp);
+                file->private_data = NULL;
+        }
+}
 static int ocfs2_file_open(struct inode *inode, struct file *file)
 {
        int status;
@@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
        oi->ip_open_count++;
        spin_unlock(&oi->ip_lock);
-        status = 0;
+        status = ocfs2_init_file_private(inode, file);
+        if (status) {
+                /*
+                 * We want to set open count back if we're failing the
+                 * open.
+                 */
+                spin_lock(&oi->ip_lock);
+                oi->ip_open_count--;
+                spin_unlock(&oi->ip_lock);
+        }
 leave:
        mlog_exit(status);
        return status;
@@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
                oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
        spin_unlock(&oi->ip_lock);
+        ocfs2_free_file_private(inode, file);
        mlog_exit(0);
        return 0;
 }
+/*
+ * Directory open: only sets up the per-open private data. Returns
+ * whatever ocfs2_init_file_private() returns.
+ */
+static int ocfs2_dir_open(struct inode *inode, struct file *file)
+{
+        return ocfs2_init_file_private(inode, file);
+}
+/*
+ * Directory release: frees the per-open private data set up by
+ * ocfs2_dir_open(). Always returns 0.
+ */
+static int ocfs2_dir_release(struct inode *inode, struct file *file)
+{
+        ocfs2_free_file_private(inode, file);
+        return 0;
+}
 static int ocfs2_sync_file(struct file *file,
                           struct dentry *dentry,
                           int datasync)
@@ -382,28 +436,23 @@ static int ocfs2_truncate_file(struct inode *inode,
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
-        /* This forces other nodes to sync and drop their pages. Do
+        /*
-         * this even if we have a truncate without allocation change -
+         * The inode lock forced other nodes to sync and drop their
-         * ocfs2 cluster sizes can be much greater than page size, so
+         * pages, which (correctly) happens even if we have a truncate
-         * we have to truncate them anyway.  */
+         * without allocation change - ocfs2 cluster sizes can be much
-        status = ocfs2_data_lock(inode, 1);
+         * greater than page size, so we have to truncate them
-        if (status < 0) {
+         * anyway.
-                up_write(&OCFS2_I(inode)->ip_alloc_sem);
+         */
-                mlog_errno(status);
-                goto bail;
-        }
        unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
        truncate_inode_pages(inode->i_mapping, new_i_size);
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
                status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
-                                               i_size_read(inode), 0);
+                                               i_size_read(inode), 1);
                if (status)
                        mlog_errno(status);
-                goto bail_unlock_data;
+                goto bail_unlock_sem;
        }
        /* alright, we're going to need to do a full blown alloc size
@@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode,
        status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
        if (status < 0) {
                mlog_errno(status);
-                goto bail_unlock_data;
+                goto bail_unlock_sem;
        }
        status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
        if (status < 0) {
                mlog_errno(status);
-                goto bail_unlock_data;
+                goto bail_unlock_sem;
        }
        status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
        if (status < 0) {
                mlog_errno(status);
-                goto bail_unlock_data;
+                goto bail_unlock_sem;
        }
        /* TODO: orphan dir cleanup here. */
-bail_unlock_data:
+bail_unlock_sem:
-        ocfs2_data_unlock(inode, 1);
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
 bail:
@@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
        mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
             "clusters_to_add = %u, extents_to_split = %u\n",
-             (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+             (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
             le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
        num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@@ -760,7 +807,7 @@ restarted_transaction:
             le32_to_cpu(fe->i_clusters),
             (unsigned long long)le64_to_cpu(fe->i_size));
        mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
-             OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+             OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 leave:
        if (handle) {
@@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode,
                             struct buffer_head *di_bh,
                             u64 new_i_size)
 {
-        int ret = 0, data_locked = 0;
+        int ret = 0;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        BUG_ON(!di_bh);
@@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode,
            && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
                goto out_update_size;
-        /* 
-         * protect the pages that ocfs2_zero_extend is going to be
-         * pulling into the page cache.. we do this before the
-         * metadata extend so that we don't get into the situation
-         * where we've extended the metadata but can't get the data
-         * lock to zero.
-         */
-        ret = ocfs2_data_lock(inode, 1);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out;
-        }
-        data_locked = 1;
        /*
         * The alloc sem blocks people in read/write from reading our
         * allocation until we're done changing it. We depend on
@@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode,
                        up_write(&oi->ip_alloc_sem);
                        mlog_errno(ret);
-                        goto out_unlock;
+                        goto out;
                }
        }
@@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode,
        if (ret < 0) {
                mlog_errno(ret);
-                goto out_unlock;
+                goto out;
        }
 out_update_size:
@@ -999,10 +1032,6 @@ out_update_size:
        if (ret < 0)
                mlog_errno(ret);
-out_unlock:
-        if (data_locked)
-                ocfs2_data_unlock(inode, 1);
 out:
        return ret;
 }
@@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
                }
        }
-        status = ocfs2_meta_lock(inode, &bh, 1);
+        status = ocfs2_inode_lock(inode, &bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
        if (size_change)
                ocfs2_rw_unlock(inode, 1);
@@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
        mlog_entry_void();
-        ret = ocfs2_meta_lock(inode, NULL, 0);
+        ret = ocfs2_inode_lock(inode, NULL, 0);
        if (ret) {
                if (ret != -ENOENT)
                        mlog_errno(ret);
@@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
        ret = generic_permission(inode, mask, NULL);
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
 out:
        mlog_exit(ret);
        return ret;
@@ -1521,6 +1550,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_cached_dealloc_ctxt dealloc;
+        struct address_space *mapping = inode->i_mapping;
        ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1529,10 +1559,20 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
                ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
-                                            byte_start + byte_len, 1);
+                                            byte_start + byte_len, 0);
-                if (ret)
+                if (ret) {
                        mlog_errno(ret);
-                return ret;
+                        goto out;
+                }
+                /*
+                 * There's no need to get fancy with the page cache
+                 * truncate of an inline-data inode. We're talking
+                 * about less than a page here, which will be cached
+                 * in the dinode buffer anyway.
+                 */
+                unmap_mapping_range(mapping, 0, 0, 0);
+                truncate_inode_pages(mapping, 0);
+                goto out;
        }
        trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
@@ -1619,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
                goto out;
        }
-        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret) {
                mlog_errno(ret);
                goto out_rw_unlock;
@@ -1627,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
                ret = -EPERM;
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        switch (sr->l_whence) {
@@ -1641,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
                break;
        default:
                ret = -EINVAL;
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        sr->l_whence = 0;
@@ -1652,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
            || (sr->l_start + llen) < 0
            || (sr->l_start + llen) > max_off) {
                ret = -EINVAL;
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        size = sr->l_start + sr->l_len;
        if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
                if (sr->l_len <= 0) {
                        ret = -EINVAL;
-                        goto out_meta_unlock;
+                        goto out_inode_unlock;
                }
        }
@@ -1667,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
                ret = __ocfs2_write_remove_suid(inode, di_bh);
                if (ret) {
                        mlog_errno(ret);
-                        goto out_meta_unlock;
+                        goto out_inode_unlock;
                }
        }
@@ -1693,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        if (ret) {
                mlog_errno(ret);
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        /*
@@ -1703,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
-                goto out_meta_unlock;
+                goto out_inode_unlock;
        }
        if (change_size && i_size_read(inode) < size)
@@ -1716,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        ocfs2_commit_trans(osb, handle);
-out_meta_unlock:
+out_inode_unlock:
        brelse(di_bh);
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 out_rw_unlock:
        ocfs2_rw_unlock(inode, 1);
@@ -1788,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
         * if we need to make modifications here.
         */
        for(;;) {
-                ret = ocfs2_meta_lock(inode, NULL, meta_level);
+                ret = ocfs2_inode_lock(inode, NULL, meta_level);
                if (ret < 0) {
                        meta_level = -1;
                        mlog_errno(ret);
@@ -1806,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                 * set inode->i_size at the end of a write. */
                if (should_remove_suid(dentry)) {
                        if (meta_level == 0) {
-                                ocfs2_meta_unlock(inode, meta_level);
+                                ocfs2_inode_unlock(inode, meta_level);
                                meta_level = 1;
                                continue;
                        }
@@ -1875,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                *ppos = saved_pos;
 out_unlock:
-        ocfs2_meta_unlock(inode, meta_level);
+        ocfs2_inode_unlock(inode, meta_level);
 out:
        return ret;
@@ -1891,9 +1931,11 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        ssize_t written = 0;
        size_t ocount;          /* original count */
        size_t count;           /* after file limit checks */
-        loff_t *ppos = &iocb->ki_pos;
+        loff_t old_size, *ppos = &iocb->ki_pos;
+        u32 old_clusters;
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_path.dentry->d_inode;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog_entry("(0x%p, %u, '%.*s')\n", file,
                   (unsigned int)nr_segs,
@@ -1949,6 +1991,13 @@ relock:
                goto relock;
        }
+        /*
+         * To later detect whether a journal commit for sync writes is
+         * necessary, we sample i_size, and cluster count here.
+         */
+        old_size = i_size_read(inode);
+        old_clusters = OCFS2_I(inode)->ip_clusters;
        /* communicate with ocfs2_dio_end_io */
        ocfs2_iocb_set_rw_locked(iocb, rw_level);
@@ -1978,6 +2027,21 @@ out_dio:
        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
+        if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
+                /*
+                 * The generic write paths have handled getting data
+                 * to disk, but since we don't make use of the dirty
+                 * inode list, a manual journal commit is necessary
+                 * here.
+                 */
+                if (old_size != i_size_read(inode) ||
+                    old_clusters != OCFS2_I(inode)->ip_clusters) {
+                        ret = journal_force_commit(osb->journal->j_journal);
+                        if (ret < 0)
+                                written = ret;
+                }
+        }
        /* 
         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
         * function pointer which is called when o_direct io completes so that
@@ -2064,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
        /*
         * See the comment in ocfs2_file_aio_read()
         */
-        ret = ocfs2_meta_lock(inode, NULL, 0);
+        ret = ocfs2_inode_lock(inode, NULL, 0);
        if (ret < 0) {
                mlog_errno(ret);
                goto bail;
        }
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
        ret = generic_file_splice_read(in, ppos, pipe, len, flags);
@@ -2125,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
         * like i_size. This allows the checks down below
         * generic_file_aio_read() a chance of actually working. 
         */
-        ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+        ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
        if (ret < 0) {
                mlog_errno(ret);
                goto bail;
        }
-        ocfs2_meta_unlock(inode, lock_level);
+        ocfs2_inode_unlock(inode, lock_level);
        ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
        if (ret == -EINVAL)
@@ -2169,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
 };
 const struct file_operations ocfs2_fops = {
+        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .mmap           = ocfs2_mmap,
@@ -2181,16 +2246,21 @@ const struct file_operations ocfs2_fops = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
        .splice_write   = ocfs2_file_splice_write,
 };
 const struct file_operations ocfs2_dops = {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = ocfs2_readdir,
        .fsync          = ocfs2_sync_file,
+        .release        = ocfs2_dir_release,
+        .open           = ocfs2_dir_open,
        .ioctl          = ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+        .flock          = ocfs2_flock,
 };
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 066f14add3a8..048ddcaf5c80 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
 extern const struct inode_operations ocfs2_special_file_iops;
 struct ocfs2_alloc_context;
+struct ocfs2_file_private {
+        struct file             *fp_file;
+        struct mutex            fp_mutex;
+        struct ocfs2_lock_res   fp_flock;
+};
 enum ocfs2_alloc_restarted {
        RESTART_NONE = 0,
        RESTART_TRANS,
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c4c36171240d..c0efd9489fe8 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,9 +30,6 @@
 #include <linux/highmem.h>
 #include <linux/kmod.h>
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
 #include <dlm/dlmapi.h>
 #define MLOG_MASK_PREFIX ML_SUPER
@@ -44,13 +41,9 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
-#include "vote.h"
 #include "buffer_head_io.h"
-#define OCFS2_HB_NODE_DOWN_PRI     (0x0000002)
-#define OCFS2_HB_NODE_UP_PRI       OCFS2_HB_NODE_DOWN_PRI
 static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
                                            int bit);
 static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
 void ocfs2_init_node_maps(struct ocfs2_super *osb)
 {
        spin_lock_init(&osb->node_map_lock);
-        ocfs2_node_map_init(&osb->mounted_map);
        ocfs2_node_map_init(&osb->recovery_map);
-        ocfs2_node_map_init(&osb->umount_map);
        ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
@@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num,
                return;
        }
-        if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
-                /* If a node is in the umount map, then we've been
-                 * expecting him to go down and we know ahead of time
-                 * that recovery is not necessary. */
-                ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-                return;
-        }
        ocfs2_recovery_thread(osb, node_num);
-        ocfs2_remove_node_from_vote_queues(osb, node_num);
-}
-static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
-                                  int node_num,
-                                  void *data)
-{
-        ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
 }
 /* Called from the dlm when it's about to evict a node. We may also
@@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
        ocfs2_do_node_down(node_num, osb);
 }
-static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
-                                int node_num,
-                                void *data)
-{
-        struct ocfs2_super *osb = data;
-        BUG_ON(osb->node_num == node_num);
-        mlog(0, "node up event for %d\n", node_num);
-        ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
 void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
 {
-        o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
-                            ocfs2_hb_node_down_cb, osb,
-                            OCFS2_HB_NODE_DOWN_PRI);
-        o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
-                            ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
        /* Not exactly a heartbeat callback, but leads to essentially
         * the same path so we set it up here. */
        dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
                              osb);
 }
-/* Most functions here are just stubs for now... */
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
-{
-        int status;
-        if (ocfs2_mount_local(osb))
-                return 0;
-        status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
-        if (status < 0) {
-                mlog_errno(status);
-                goto bail;
-        }
-        status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
-        if (status < 0) {
-                mlog_errno(status);
-                o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
-        }
-bail:
-        return status;
-}
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
-{
-        if (ocfs2_mount_local(osb))
-                return;
-        o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
-        o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
-}
 void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
 {
        int ret;
@@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
        spin_lock(&osb->node_map_lock);
-        __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
        if (!test_bit(num, osb->recovery_map.map)) {
            __ocfs2_node_map_set_bit(&osb->recovery_map, num);
            set = 1;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e4..56859211888a 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,8 +29,6 @@
 void ocfs2_init_node_maps(struct ocfs2_super *osb);
 void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
 void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
 /* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 1d5e0cb0fda1..7e9e4c79aec7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,7 +49,6 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
-#include "vote.h"
 #include "buffer_head_io.h"
@@ -58,8 +57,11 @@ struct ocfs2_find_inode_args
        u64             fi_blkno;
        unsigned long   fi_ino;
        unsigned int    fi_flags;
+        unsigned int    fi_sysfile_type;
 };
+static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
 static int ocfs2_read_locked_inode(struct inode *inode,
                                   struct ocfs2_find_inode_args *args);
 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
                oi->ip_attr |= OCFS2_DIRSYNC_FL;
 }
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
+                         int sysfile_type)
 {
        struct inode *inode = NULL;
        struct super_block *sb = osb->sb;
@@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
        args.fi_blkno = blkno;
        args.fi_flags = flags;
        args.fi_ino = ino_from_blkno(sb, blkno);
+        args.fi_sysfile_type = sysfile_type;
        inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
                             ocfs2_init_locked_inode, &args);
@@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
        inode->i_ino = args->fi_ino;
        OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+        if (args->fi_sysfile_type != 0)
+                lockdep_set_class(&inode->i_mutex,
+                        &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
        mlog_exit(0);
        return 0;
@@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                 */
                BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
-                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
                                          OCFS2_LOCK_TYPE_META, 0, inode);
                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                                  OCFS2_LOCK_TYPE_RW, inode->i_generation,
                                  inode);
-        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
-                                  OCFS2_LOCK_TYPE_DATA, inode->i_generation,
-                                  inode);
        ocfs2_set_inode_flags(inode);
        status = 0;
@@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
        if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
                generation = osb->fs_generation;
-        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
                                  OCFS2_LOCK_TYPE_META,
                                  generation, inode);
@@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
                        mlog_errno(status);
                        return status;
                }
-                status = ocfs2_meta_lock(inode, NULL, 0);
+                status = ocfs2_inode_lock(inode, NULL, 0);
                if (status) {
                        make_bad_inode(inode);
                        mlog_errno(status);
@@ -455,8 +458,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
        status = -EINVAL;
        fe = (struct ocfs2_dinode *) bh->b_data;
        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
+                mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
+                     (unsigned long long)args->fi_blkno, 7,
                     fe->i_signature);
                goto bail;
        }
@@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 bail:
        if (can_lock)
-                ocfs2_meta_unlock(inode, 0);
+                ocfs2_inode_unlock(inode, 0);
        if (status < 0)
                make_bad_inode(inode);
@@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
        mutex_lock(&inode_alloc_inode->i_mutex);
-        status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1);
+        status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
        if (status < 0) {
                mutex_unlock(&inode_alloc_inode->i_mutex);
@@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
        di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
-        le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+        di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
        status = ocfs2_journal_dirty(handle, di_bh);
        if (status < 0) {
@@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 bail_commit:
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
-        ocfs2_meta_unlock(inode_alloc_inode, 1);
+        ocfs2_inode_unlock(inode_alloc_inode, 1);
        mutex_unlock(&inode_alloc_inode->i_mutex);
        brelse(inode_alloc_bh);
 bail:
@@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
         * delete_inode operation. We do this now to avoid races with
         * recovery completion on other nodes. */
        mutex_lock(&orphan_dir_inode->i_mutex);
-        status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+        status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
        if (status < 0) {
                mutex_unlock(&orphan_dir_inode->i_mutex);
@@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
        }
        /* we do this while holding the orphan dir lock because we
-         * don't want recovery being run from another node to vote for
+         * don't want recovery being run from another node to try an
-         * an inode delete on us -- this will result in two nodes
+         * inode delete underneath us -- this will result in two nodes
         * truncating the same file! */
        status = ocfs2_truncate_for_delete(osb, inode, di_bh);
        if (status < 0) {
@@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
                mlog_errno(status);
 bail_unlock_dir:
-        ocfs2_meta_unlock(orphan_dir_inode, 1);
+        ocfs2_inode_unlock(orphan_dir_inode, 1);
        mutex_unlock(&orphan_dir_inode->i_mutex);
        brelse(orphan_dir_bh);
 bail:
@@ -744,7 +747,7 @@ bail:
 }
 /* There is a series of simple checks that should be done before a
- * vote is even considered. Encapsulate those in this function. */
+ * trylock is even considered. Encapsulate those in this function. */
 static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 {
        int ret = 0;
@@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
                goto bail;
        }
-        /* If we're coming from process_vote we can't go into our own
+        /* If we're coming from downconvert_thread we can't go into our own
         * voting [hello, deadlock city!], so unforuntately we just
         * have to skip deleting this guy. That's OK though because
         * the node who's doing the actual deleting should handle it
         * anyway. */
-        if (current == osb->vote_task) {
+        if (current == osb->dc_task) {
                mlog(0, "Skipping delete of %lu because we're currently "
-                     "in process_vote\n", inode->i_ino);
+                     "in downconvert\n", inode->i_ino);
                goto bail;
        }
@@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
                goto bail_unlock;
        }
-        /* If we have voted "yes" on the wipe of this inode for
+        /* If we have allowed wipe of this inode for another node, it
-         * another node, it will be marked here so we can safely skip
+         * will be marked here so we can safely skip it. Recovery will
-         * it. Recovery will cleanup any inodes we might inadvertantly
+         * cleanup any inodes we might inadvertently skip here. */
-         * skip here. */
        if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
                mlog(0, "Skipping delete of %lu because another node "
                     "has done this for us.\n", inode->i_ino);
@@ -863,7 +865,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
        status = ocfs2_try_open_lock(inode, 1);
        if (status == -EAGAIN) {
                status = 0;
-                mlog(0, "Skipping delete of %llu because it is in use on"
+                mlog(0, "Skipping delete of %llu because it is in use on "
                     "other nodes\n", (unsigned long long)oi->ip_blkno);
                goto bail;
        }
@@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode)
        /* Lock down the inode. This gives us an up to date view of
         * it's metadata (for verification), and allows us to
-         * serialize delete_inode votes. 
+         * serialize delete_inode on multiple nodes.
         *
         * Even though we might be doing a truncate, we don't take the
         * allocation lock here as it won't be needed - nobody will
         * have the file open.
         */
-        status = ocfs2_meta_lock(inode, &di_bh, 1);
+        status = ocfs2_inode_lock(inode, &di_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode)
         * before we go ahead and wipe the inode. */
        status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
        if (!wipe || status < 0) {
-                /* Error and inode busy vote both mean we won't be
+                /* Error and remote inode busy both mean we won't be
                 * removing the inode, so they take almost the same
                 * path. */
                if (status < 0)
                        mlog_errno(status);
-                /* Someone in the cluster has voted to not wipe this
+                /* Someone in the cluster has disallowed a wipe of
-                 * inode, or it was never completely orphaned. Write
+                 * this inode, or it was never completely
-                 * out the pages and exit now. */
+                 * orphaned. Write out the pages and exit now. */
                ocfs2_cleanup_delete_inode(inode, 1);
                goto bail_unlock_inode;
        }
@@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode)
        OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
 bail_unlock_inode:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        brelse(di_bh);
 bail_unblock:
        status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode)
        mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
                        "Inode=%lu\n", inode->i_ino);
-        /* For remove delete_inode vote, we hold open lock before,
+        /* To prevent remote deletes we hold open lock before, now it
-         * now it is time to unlock PR and EX open locks. */
+         * is time to unlock PR and EX open locks. */
        ocfs2_open_unlock(inode);
        /* Do these before all the other work so that we don't bounce
-         * the vote thread while waiting to destroy the locks. */
+         * the downconvert thread while waiting to destroy the locks. */
        ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
-        ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
+        ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
-        ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
        ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
        /* We very well may get a clear_inode before all an inodes
@@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode)
                mlog_errno(status);
        ocfs2_lock_res_free(&oi->ip_rw_lockres);
-        ocfs2_lock_res_free(&oi->ip_meta_lockres);
+        ocfs2_lock_res_free(&oi->ip_inode_lockres);
-        ocfs2_lock_res_free(&oi->ip_data_lockres);
        ocfs2_lock_res_free(&oi->ip_open_lockres);
        ocfs2_metadata_cache_purge(inode);
@@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
        }
        spin_unlock(&OCFS2_I(inode)->ip_lock);
-        /* Let ocfs2_meta_lock do the work of updating our struct
+        /* Let ocfs2_inode_lock do the work of updating our struct
         * inode for us. */
-        status = ocfs2_meta_lock(inode, NULL, 0);
+        status = ocfs2_inode_lock(inode, NULL, 0);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
                goto bail;
        }
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
 bail:
        mlog_exit(status);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 70e881c55536..390a85596aa0 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,8 +34,7 @@ struct ocfs2_inode_info
        u64                     ip_blkno;
        struct ocfs2_lock_res           ip_rw_lockres;
-        struct ocfs2_lock_res           ip_meta_lockres;
+        struct ocfs2_lock_res           ip_inode_lockres;
-        struct ocfs2_lock_res           ip_data_lockres;
        struct ocfs2_lock_res           ip_open_lockres;
        /* protects allocation changes on this inode. */
@@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
 void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
-#define OCFS2_FI_FLAG_SYSFILE           0x4
+#define OCFS2_FI_FLAG_SYSFILE           0x1
-#define OCFS2_FI_FLAG_ORPHAN_RECOVERY   0x8
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY   0x2
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
+                         int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
 int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 87dcece7e1b5..5177fba5162b 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -20,6 +20,7 @@
 #include "ocfs2_fs.h"
 #include "ioctl.h"
+#include "resize.h"
 #include <linux/ext2_fs.h>
@@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
        int status;
-        status = ocfs2_meta_lock(inode, NULL, 0);
+        status = ocfs2_inode_lock(inode, NULL, 0);
        if (status < 0) {
                mlog_errno(status);
                return status;
        }
        ocfs2_get_inode_flags(OCFS2_I(inode));
        *flags = OCFS2_I(inode)->ip_attr;
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
        mlog_exit(status);
        return status;
@@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
        mutex_lock(&inode->i_mutex);
-        status = ocfs2_meta_lock(inode, &bh, 1);
+        status = ocfs2_inode_lock(inode, &bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 bail:
        mutex_unlock(&inode->i_mutex);
@@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
        unsigned int cmd, unsigned long arg)
 {
        unsigned int flags;
+        int new_clusters;
        int status;
        struct ocfs2_space_resv sr;
+        struct ocfs2_new_group_input input;
        switch (cmd) {
        case OCFS2_IOC_GETFLAGS:
@@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
                        return -EFAULT;
                return ocfs2_change_file_space(filp, cmd, &sr);
+        case OCFS2_IOC_GROUP_EXTEND:
+                if (!capable(CAP_SYS_RESOURCE))
+                        return -EPERM;
+                if (get_user(new_clusters, (int __user *)arg))
+                        return -EFAULT;
+                return ocfs2_group_extend(inode, new_clusters);
+        case OCFS2_IOC_GROUP_ADD:
+        case OCFS2_IOC_GROUP_ADD64:
+                if (!capable(CAP_SYS_RESOURCE))
+                        return -EPERM;
+                if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
+                        return -EFAULT;
+                return ocfs2_group_add(inode, &input);
        default:
                return -ENOTTY;
        }
@@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case OCFS2_IOC_RESVSP64:
        case OCFS2_IOC_UNRESVSP:
        case OCFS2_IOC_UNRESVSP64:
+        case OCFS2_IOC_GROUP_EXTEND:
+        case OCFS2_IOC_GROUP_ADD:
+        case OCFS2_IOC_GROUP_ADD64:
                break;
        default:
                return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f9d01e25298d..f31c7e8c19c3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -44,7 +44,6 @@
 #include "localalloc.h"
 #include "slot_map.h"
 #include "super.h"
-#include "vote.h"
 #include "sysfile.h"
 #include "buffer_head_io.h"
@@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
        mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
             journal->j_trans_id, flushed);
-        ocfs2_kick_vote_thread(osb);
+        ocfs2_wake_downconvert_thread(osb);
        wake_up(&journal->j_checkpointed);
 finally:
        mlog_exit(status);
@@ -174,6 +173,12 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 * transaction. extend_trans will either extend the current handle by
 * nblocks, or commit it and start a new one with nblocks credits.
 *
+ * This might call journal_restart() which will commit dirty buffers
+ * and then restart the transaction. Before calling
+ * ocfs2_extend_trans(), any changed blocks should have been
+ * dirtied. After calling it, all blocks which need to be changed must
+ * go through another set of journal_access/journal_dirty calls.
+ *
 * WARNING: This will not release any semaphores or disk locks taken
 * during the transaction, so make sure they were taken *before*
 * start_trans or we'll have ordering deadlocks.
@@ -193,11 +198,15 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
        mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+#ifdef OCFS2_DEBUG_FS
+        status = 1;
+#else
        status = journal_extend(handle, nblocks);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
+#endif
        if (status > 0) {
                mlog(0, "journal_extend failed, trying journal_restart\n");
@@ -304,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
        return err;
 }
-#define OCFS2_DEFAULT_COMMIT_INTERVAL   (HZ * 5)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL   (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
 {
        journal_t *journal = osb->journal->j_journal;
+        unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+        if (osb->osb_commit_interval)
+                commit_interval = osb->osb_commit_interval;
        spin_lock(&journal->j_state_lock);
-        journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+        journal->j_commit_interval = commit_interval;
        if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
                journal->j_flags |= JFS_BARRIER;
        else
@@ -327,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
        struct ocfs2_dinode *di = NULL;
        struct buffer_head *bh = NULL;
        struct ocfs2_super *osb;
-        int meta_lock = 0;
+        int inode_lock = 0;
        mlog_entry_void();
@@ -357,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
        /* Skip recovery waits here - journal inode metadata never
         * changes in a live cluster so it can be considered an
         * exception to the rule. */
-        status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+        status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
        if (status < 0) {
                if (status != -ERESTARTSYS)
                        mlog(ML_ERROR, "Could not get lock on journal!\n");
                goto done;
        }
-        meta_lock = 1;
+        inode_lock = 1;
        di = (struct ocfs2_dinode *)bh->b_data;
        if (inode->i_size <  OCFS2_MIN_JOURNAL_SIZE) {
@@ -404,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
        status = 0;
 done:
        if (status < 0) {
-                if (meta_lock)
+                if (inode_lock)
-                        ocfs2_meta_unlock(inode, 1);
+                        ocfs2_inode_unlock(inode, 1);
                if (bh != NULL)
                        brelse(bh);
                if (inode) {
@@ -534,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
        OCFS2_I(inode)->ip_open_count--;
        /* unlock our journal */
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
        brelse(journal->j_bh);
        journal->j_bh = NULL;
@@ -873,8 +886,8 @@ restart:
        ocfs2_super_unlock(osb, 1);
        /* We always run recovery on our own orphan dir - the dead
-         * node(s) may have voted "no" on an inode delete earlier. A
+         * node(s) may have disallowed a previous inode delete. Re-processing
-         * revote is therefore required. */
+         * is therefore required. */
        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
                                        NULL);
@@ -963,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        }
        SET_INODE_JOURNAL(inode);
-        status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+        status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
        if (status < 0) {
-                mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
+                mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
                if (status != -ERESTARTSYS)
                        mlog(ML_ERROR, "Could not lock journal!\n");
                goto done;
@@ -1037,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 done:
        /* drop the lock on this nodes journal */
        if (got_lock)
-                ocfs2_meta_unlock(inode, 1);
+                ocfs2_inode_unlock(inode, 1);
        if (inode)
                iput(inode);
@@ -1152,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
        SET_INODE_JOURNAL(inode);
        flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
-        status = ocfs2_meta_lock_full(inode, NULL, 1, flags);
+        status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
        if (status < 0) {
                if (status != -EAGAIN)
                        mlog_errno(status);
                goto bail;
        }
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 bail:
        if (inode)
                iput(inode);
@@ -1231,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
        /* Skip bad inodes so that recovery can continue */
        iter = ocfs2_iget(p->osb, ino,
-                          OCFS2_FI_FLAG_ORPHAN_RECOVERY);
+                          OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
        if (IS_ERR(iter))
                return 0;
@@ -1267,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
        }       
        mutex_lock(&orphan_dir_inode->i_mutex);
-        status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0);
+        status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
        if (status < 0) {
                mlog_errno(status);
                goto out;
@@ -1277,12 +1290,13 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
                                   ocfs2_orphan_filldir);
        if (status) {
                mlog_errno(status);
-                goto out;
+                goto out_cluster;
        }
        *head = priv.head;
-        ocfs2_meta_unlock(orphan_dir_inode, 0);
+out_cluster:
+        ocfs2_inode_unlock(orphan_dir_inode, 0);
 out:
        mutex_unlock(&orphan_dir_inode->i_mutex);
        iput(orphan_dir_inode);
@@ -1369,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                iter = oi->ip_next_orphan;
                spin_lock(&oi->ip_lock);
-                /* Delete voting may have set these on the assumption
+                /* The remote delete code may have set these on the
-                 * that the other node would wipe them successfully.
+                 * assumption that the other node would wipe them
-                 * If they are still in the node's orphan dir, we need
+                 * successfully.  If they are still in the node's
-                 * to reset that state. */
+                 * orphan dir, we need to reset that state. */
                oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
                /* Set the proper information to get us going into
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4b32e0961568..220f3e818e78 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -278,6 +278,12 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* simple file updates like chmod, etc. */
 #define OCFS2_INODE_UPDATE_CREDITS 1
+/* group extend. inode update and last group update. */
+#define OCFS2_GROUP_EXTEND_CREDITS      (OCFS2_INODE_UPDATE_CREDITS + 1)
+/* group add. inode update and the new group update. */
+#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
 /* get one bit out of a suballocator: dinode + group descriptor +
 * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index d272847d5a07..add1ffdc5c6c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
                                          struct inode *local_alloc_inode);
-/*
- * Determine how large our local alloc window should be, in bits.
- *
- * These values (and the behavior in ocfs2_alloc_should_use_local) have
- * been chosen so that most allocations, including new block groups go
- * through local alloc.
- */
 static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
 {
-        BUG_ON(osb->s_clustersize_bits < 12);
+        BUG_ON(osb->s_clustersize_bits > 20);  /* keeps the shift count below non-negative */
-        return 2048 >> (osb->s_clustersize_bits - 12);
+        /*
+         * Size local alloc windows by the megabyte: scale
+         * osb->local_alloc_size by 2^(20 - s_clustersize_bits), i.e. the
+         * number of clusters in one megabyte, to get the window size in
+         * bits.  Presumably local_alloc_size is in megabytes and one bit
+         * maps to one cluster -- TODO confirm where local_alloc_size is set.
+         */
+        return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
 }
 /*
@@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
 int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 {
        int la_bits = ocfs2_local_alloc_window_bits(osb);
+        int ret = 0;    /* 1: allocate from the local alloc, 0: don't */
        if (osb->local_alloc_state != OCFS2_LA_ENABLED)
-                return 0;
+                goto bail;      /* local alloc is not in the enabled state */
        /* la_bits should be at least twice the size (in clusters) of
         * a new block group. We want to be sure block group
         * allocations go through the local alloc, so allow an
         * allocation to take up to half the bitmap. */
        if (bits > (la_bits / 2))
-                return 0;
+                goto bail;      /* request is larger than half the window */
-        return 1;
+        ret = 1;
+bail:   /* single exit so that every decision, yes or no, is logged */
+        mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
+             osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+        return ret;
 }
 int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
        mlog_entry_void();
+        if (ocfs2_mount_local(osb))
+                goto bail;
+        if (osb->local_alloc_size == 0)
+                goto bail;
+        if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
+                mlog(ML_NOTICE, "Requested local alloc window %d is larger "
+                     "than max possible %u. Using defaults.\n",
+                     ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
+                osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+        }
        /* read the alloc off disk */
        inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
                                            osb->slot_num);
@@ -181,6 +193,9 @@ bail:
        if (inode)
                iput(inode);
+        mlog(0, "Local alloc window bits = %d\n",
+             ocfs2_local_alloc_window_bits(osb));
        mlog_exit(status);
        return status;
 }
@@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
        mutex_lock(&main_bm_inode->i_mutex);
-        status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
+        status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto out_mutex;
@@ -286,7 +301,7 @@ out_unlock:
        if (main_bm_bh)
                brelse(main_bm_bh);
-        ocfs2_meta_unlock(main_bm_inode, 1);
+        ocfs2_inode_unlock(main_bm_inode, 1);
 out_mutex:
        mutex_unlock(&main_bm_inode->i_mutex);
@@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
        mutex_lock(&main_bm_inode->i_mutex);
-        status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
+        status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto out_mutex;
@@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
        ocfs2_commit_trans(osb, handle);
 out_unlock:
-        ocfs2_meta_unlock(main_bm_inode, 1);
+        ocfs2_inode_unlock(main_bm_inode, 1);
 out_mutex:
        mutex_unlock(&main_bm_inode->i_mutex);
@@ -484,6 +499,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
        alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+#ifdef OCFS2_DEBUG_FS
        if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
            ocfs2_local_alloc_count_bits(alloc)) {
                ocfs2_error(osb->sb, "local alloc inode %llu says it has "
@@ -494,6 +510,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
                status = -EIO;
                goto bail;
        }
+#endif
        free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
                le32_to_cpu(alloc->id1.bitmap1.i_used);
@@ -519,6 +536,9 @@ bail:
                iput(local_alloc_inode);
        }
+        mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
+             status);
        mlog_exit(status);
        return status;
 }
@@ -712,9 +732,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
        void *bitmap;
        struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
-        mlog_entry("total = %u, COUNT = %u, used = %u\n",
+        mlog_entry("total = %u, used = %u\n",
                   le32_to_cpu(alloc->id1.bitmap1.i_total),
-                   ocfs2_local_alloc_count_bits(alloc),
                   le32_to_cpu(alloc->id1.bitmap1.i_used));
        if (!alloc->id1.bitmap1.i_total) {
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 000000000000..203f87143877
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,125 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * locks.c
+ *
+ * Userspace file locking support
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/fs.h>
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "locks.h"
+static int ocfs2_do_flock(struct file *file, struct inode *inode,
+                          int cmd, struct file_lock *fl)
+{       /*
+         * Take the flock described by @fl on @file, first dropping any
+         * lock this file already holds at a different level.
+         * fp->fp_mutex is held from the first look at the lock state
+         * until return.  @inode is not referenced in this function.
+         *
+         * Returns 0 on success (including when the requested level is
+         * already held), -EWOULDBLOCK when a trylock request gets
+         * -EAGAIN from ocfs2_file_lock(), or otherwise the error
+         * returned by ocfs2_file_lock() or flock_lock_file_wait().
+         */
+        int ret = 0, level = 0, trylock = 0;
+        struct ocfs2_file_private *fp = file->private_data;
+        struct ocfs2_lock_res *lockres = &fp->fp_flock;
+        if (fl->fl_type == F_WRLCK)     /* level 1 for a write lock, else 0 */
+                level = 1;
+        if (!IS_SETLKW(cmd))    /* anything but a SETLKW request is a trylock */
+                trylock = 1;
+        mutex_lock(&fp->fp_mutex);
+        if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+            lockres->l_level > LKM_NLMODE) {    /* presumably: a lock is already held */
+                int old_level = 0;
+                if (lockres->l_level == LKM_EXMODE)     /* map held mode to 0/1 like 'level' */
+                        old_level = 1;
+                if (level == old_level)
+                        goto out;       /* already at the requested level; ret is still 0 */
+                /*
+                 * Converting an existing lock is not guaranteed to be
+                 * atomic, so we can get away with simply unlocking
+                 * here and allowing the lock code to try at the new
+                 * level.
+                 *
+                 * Both the flock_lock_file_wait() state and the
+                 * ocfs2 file lock are dropped; the return value of the
+                 * F_UNLCK call is not checked.
+                 */
+                flock_lock_file_wait(file,
+                                     &(struct file_lock){.fl_type = F_UNLCK});
+                ocfs2_file_unlock(file);
+        }
+        ret = ocfs2_file_lock(file, level, trylock);
+        if (ret) {
+                if (ret == -EAGAIN && trylock)  /* failed trylock: translate, don't log */
+                        ret = -EWOULDBLOCK;
+                else
+                        mlog_errno(ret);
+                goto out;
+        }
+        ret = flock_lock_file_wait(file, fl);
+out:
+        mutex_unlock(&fp->fp_mutex);
+        return ret;
+}
+static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
+{       /*
+         * Release the flock on @file: under fp->fp_mutex, call
+         * ocfs2_file_unlock() and then hand the caller's @fl (an
+         * F_UNLCK request when called from ocfs2_flock()) to
+         * flock_lock_file_wait(), whose result is returned.  @cmd is
+         * not referenced in this function.
+         */
+        int ret;
+        struct ocfs2_file_private *fp = file->private_data;
+        mutex_lock(&fp->fp_mutex);
+        ocfs2_file_unlock(file);
+        ret = flock_lock_file_wait(file, fl);
+        mutex_unlock(&fp->fp_mutex);
+        return ret;
+}
+/*
+ * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
+ *
+ * Handle a flock-style lock request on @file.  Only requests with
+ * FL_FLOCK set are accepted, and inodes for which __mandatory_lock()
+ * is true are refused; both cases return -ENOLCK.  When the
+ * OCFS2_MOUNT_LOCALFLOCKS mount option is set or ocfs2_mount_local()
+ * is true, the request goes straight to flock_lock_file_wait().
+ * Otherwise F_UNLCK requests are passed to ocfs2_do_funlock() and all
+ * other lock types to ocfs2_do_flock(); their return value is passed
+ * back to the caller unchanged.
+ */
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+        struct inode *inode = file->f_mapping->host;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        if (!(fl->fl_flags & FL_FLOCK))
+                return -ENOLCK;
+        if (__mandatory_lock(inode))
+                return -ENOLCK;
+        if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+            ocfs2_mount_local(osb))
+                return flock_lock_file_wait(file, fl);
+        if (fl->fl_type == F_UNLCK)
+                return ocfs2_do_funlock(file, cmd, fl);
+        else
+                return ocfs2_do_flock(file, inode, cmd, fl);
+}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/locks.h
index 9ea46f62de31..9743ef2324ec 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/locks.h
@@ -1,9 +1,9 @@
 /* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
- * vote.h
+ * locks.h
 *
- * description here
+ * Function prototypes for Userspace file locking support
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
@@ -23,26 +23,9 @@
 * Boston, MA 021110-1307, USA.
 */
+#ifndef OCFS2_LOCKS_H
+#define OCFS2_LOCKS_H
-#ifndef VOTE_H
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
-#define VOTE_H
-int ocfs2_vote_thread(void *arg);
+#endif /* OCFS2_LOCKS_H */
-static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
-{
-        spin_lock(&osb->vote_task_lock);
-        /* make sure the voting thread gets a swipe at whatever changes
-         * the caller may have made to the voting state */
-        osb->vote_wake_sequence++;
-        spin_unlock(&osb->vote_task_lock);
-        wake_up(&osb->vote_event);
-}
-int ocfs2_request_mount_vote(struct ocfs2_super *osb);
-int ocfs2_request_umount_vote(struct ocfs2_super *osb);
-int ocfs2_register_net_handlers(struct ocfs2_super *osb);
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
-                                        int node_num);
-#endif
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 98756156d298..3dc18d67557c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
         * node. Taking the data lock will also ensure that we don't
         * attempt page truncation as part of a downconvert.
         */
-        ret = ocfs2_meta_lock(inode, &di_bh, 1);
+        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
         */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
-        ret = ocfs2_data_lock(inode, 1);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out_meta_unlock;
-        }
        ret = __ocfs2_page_mkwrite(inode, di_bh, page);
-        ocfs2_data_unlock(inode, 1);
-out_meta_unlock:
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        brelse(di_bh);
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 out:
        ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
        int ret = 0, lock_level = 0;
-        ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
+        ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
                                    file->f_vfsmnt, &lock_level);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
-        ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
+        ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
 out:
        vma->vm_ops = &ocfs2_file_vm_ops;
        vma->vm_flags |= VM_CAN_NONLINEAR;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 729259016c18..ae9ad9587516 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,7 +60,6 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
-#include "vote.h"
 #include "buffer_head_io.h"
@@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
        mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
             dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
-        status = ocfs2_meta_lock(dir, NULL, 0);
+        status = ocfs2_inode_lock(dir, NULL, 0);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
        if (status < 0)
                goto bail_add;
-        inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
+        inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
        if (IS_ERR(inode)) {
                ret = ERR_PTR(-EACCES);
                goto bail_unlock;
@@ -176,8 +175,8 @@ bail_unlock:
        /* Don't drop the cluster lock until *after* the d_add --
         * unlink on another node will message us to remove that
         * dentry under this lock so otherwise we can race this with
-         * the vote thread and have a stale dentry. */
+         * the downconvert thread and have a stale dentry. */
-        ocfs2_meta_unlock(dir, 0);
+        ocfs2_inode_unlock(dir, 0);
 bail:
@@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
        /* get our super block */
        osb = OCFS2_SB(dir->i_sb);
-        status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+        status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -323,7 +322,7 @@ leave:
        if (handle)
                ocfs2_commit_trans(osb, handle);
-        ocfs2_meta_unlock(dir, 1);
+        ocfs2_inode_unlock(dir, 1);
        if (status == -ENOSPC)
                mlog(0, "Disk is full\n");
@@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
        if (S_ISDIR(inode->i_mode))
                return -EPERM;
-        err = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+        err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
        if (err < 0) {
                if (err != -ENOENT)
                        mlog_errno(err);
@@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out;
        }
-        err = ocfs2_meta_lock(inode, &fe_bh, 1);
+        err = ocfs2_inode_lock(inode, &fe_bh, 1);
        if (err < 0) {
                if (err != -ENOENT)
                        mlog_errno(err);
@@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
 out_commit:
        ocfs2_commit_trans(osb, handle);
 out_unlock_inode:
-        ocfs2_meta_unlock(inode, 1);
+        ocfs2_inode_unlock(inode, 1);
 out:
-        ocfs2_meta_unlock(dir, 1);
+        ocfs2_inode_unlock(dir, 1);
        if (de_bh)
                brelse(de_bh);
@@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
                return -EPERM;
        }
-        status = ocfs2_meta_lock(dir, &parent_node_bh, 1);
+        status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
                goto leave;
        }
-        status = ocfs2_meta_lock(inode, &fe_bh, 1);
+        status = ocfs2_inode_lock(inode, &fe_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
        status = ocfs2_remote_dentry_delete(dentry);
        if (status < 0) {
-                /* This vote should succeed under all normal
+                /* This remote delete should succeed under all normal
                 * circumstances. */
                mlog_errno(status);
                goto leave;
@@ -841,13 +840,13 @@ leave:
                ocfs2_commit_trans(osb, handle);
        if (child_locked)
-                ocfs2_meta_unlock(inode, 1);
+                ocfs2_inode_unlock(inode, 1);
-        ocfs2_meta_unlock(dir, 1);
+        ocfs2_inode_unlock(dir, 1);
        if (orphan_dir) {
                /* This was locked for us in ocfs2_prepare_orphan_dir() */
-                ocfs2_meta_unlock(orphan_dir, 1);
+                ocfs2_inode_unlock(orphan_dir, 1);
                mutex_unlock(&orphan_dir->i_mutex);
                iput(orphan_dir);
        }
@@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
                        inode1 = tmpinode;
                }
                /* lock id2 */
-                status = ocfs2_meta_lock(inode2, bh2, 1);
+                status = ocfs2_inode_lock(inode2, bh2, 1);
                if (status < 0) {
                        if (status != -ENOENT)
                                mlog_errno(status);
@@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
        }
        /* lock id1 */
-        status = ocfs2_meta_lock(inode1, bh1, 1);
+        status = ocfs2_inode_lock(inode1, bh1, 1);
        if (status < 0) {
                /*
                 * An error return must mean that no cluster locks
                 * were held on function exit.
                 */
                if (oi1->ip_blkno != oi2->ip_blkno)
-                        ocfs2_meta_unlock(inode2, 1);
+                        ocfs2_inode_unlock(inode2, 1);
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -937,10 +936,10 @@ bail:
 static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
 {
-        ocfs2_meta_unlock(inode1, 1);
+        ocfs2_inode_unlock(inode1, 1);  /*
+                                         * Level 1 is the same value
+                                         * ocfs2_double_lock() passes to
+                                         * ocfs2_inode_lock() for both inodes.
+                                         */
        if (inode1 != inode2)
-                ocfs2_meta_unlock(inode2, 1);
+                ocfs2_inode_unlock(inode2, 1);  /* skipped when both arguments are the same inode */
 }
 static int ocfs2_rename(struct inode *old_dir,
@@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir,
        /*
         * Aside from allowing a meta data update, the locking here
-         * also ensures that the vote thread on other nodes won't have
+         * also ensures that the downconvert thread on other nodes
-         * to concurrently downconvert the inode and the dentry locks.
+         * won't have to concurrently downconvert the inode and the
+         * dentry locks.
         */
-        status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1);
+        status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1105,9 +1105,16 @@ static int ocfs2_rename(struct inode *old_dir,
                goto bail;
        }
-        if (!new_de && new_inode)
+        if (!new_de && new_inode) {
-                mlog(ML_ERROR, "inode %lu does not exist in it's parent "
+                /*
-                     "directory!", new_inode->i_ino);
+                 * Target was unlinked by another node while we were
+                 * waiting to get to ocfs2_rename(). There isn't
+                 * anything we can do here to help the situation, so
+                 * bubble up the appropriate error.
+                 */
+                status = -ENOENT;
+                goto bail;
+        }
        /* In case we need to overwrite an existing file, we blow it
         * away first */
@@ -1136,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
                        goto bail;
                }
-                status = ocfs2_meta_lock(new_inode, &newfe_bh, 1);
+                status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
                if (status < 0) {
                        if (status != -ENOENT)
                                mlog_errno(status);
@@ -1348,14 +1355,14 @@ bail:
                ocfs2_double_unlock(old_dir, new_dir);
        if (old_child_locked)
-                ocfs2_meta_unlock(old_inode, 1);
+                ocfs2_inode_unlock(old_inode, 1);
        if (new_child_locked)
-                ocfs2_meta_unlock(new_inode, 1);
+                ocfs2_inode_unlock(new_inode, 1);
        if (orphan_dir) {
                /* This was locked for us in ocfs2_prepare_orphan_dir() */
-                ocfs2_meta_unlock(orphan_dir, 1);
+                ocfs2_inode_unlock(orphan_dir, 1);
                mutex_unlock(&orphan_dir->i_mutex);
                iput(orphan_dir);
        }
@@ -1523,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
        credits = ocfs2_calc_symlink_credits(sb);
        /* lock the parent directory */
-        status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+        status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -1650,7 +1657,7 @@ bail:
        if (handle)
                ocfs2_commit_trans(osb, handle);
-        ocfs2_meta_unlock(dir, 1);
+        ocfs2_inode_unlock(dir, 1);
        if (new_fe_bh)
                brelse(new_fe_bh);
@@ -1728,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
        mutex_lock(&orphan_dir_inode->i_mutex);
-        status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+        status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1738,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
                                              orphan_dir_bh, name,
                                              OCFS2_ORPHAN_NAMELEN, de_bh);
        if (status < 0) {
-                ocfs2_meta_unlock(orphan_dir_inode, 1);
+                ocfs2_inode_unlock(orphan_dir_inode, 1);
                mlog_errno(status);
                goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 60a23e1906b0..d08480580470 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
                                               * about to be
                                               * dropped. */
 #define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
+#define OCFS2_LOCK_NOCACHE       (0x00000200) /* don't use a holder count */
 struct ocfs2_lock_res_ops;
@@ -170,6 +171,7 @@ enum ocfs2_mount_options
        OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
        OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
        OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+        OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
 };
 #define OCFS2_OSB_SOFT_RO       0x0001
@@ -189,9 +191,7 @@ struct ocfs2_super
        struct ocfs2_slot_info *slot_info;
        spinlock_t node_map_lock;
-        struct ocfs2_node_map mounted_map;
        struct ocfs2_node_map recovery_map;
-        struct ocfs2_node_map umount_map;
        u64 root_blkno;
        u64 system_dir_blkno;
@@ -231,7 +231,9 @@ struct ocfs2_super
        wait_queue_head_t checkpoint_event;
        atomic_t needs_checkpoint;
        struct ocfs2_journal *journal;
+        unsigned long osb_commit_interval;
+        int local_alloc_size;
        enum ocfs2_local_alloc_state local_alloc_state;
        struct buffer_head *local_alloc_bh;
        u64 la_last_gd;
@@ -254,28 +256,21 @@ struct ocfs2_super
        wait_queue_head_t recovery_event;
-        spinlock_t vote_task_lock;
+        spinlock_t dc_task_lock;
-        struct task_struct *vote_task;
+        struct task_struct *dc_task;
-        wait_queue_head_t vote_event;
+        wait_queue_head_t dc_event;
-        unsigned long vote_wake_sequence;
+        unsigned long dc_wake_sequence;
-        unsigned long vote_work_sequence;
+        unsigned long dc_work_sequence;
+        /*
+         * Any thread can add locks to the list, but the downconvert
+         * thread is the only one allowed to remove locks. Any change
+         * to this rule requires updating
+         * ocfs2_downconvert_thread_do_work().
+         */
        struct list_head blocked_lock_list;
        unsigned long blocked_lock_count;
-        struct list_head vote_list;
-        int vote_count;
-        u32 net_key;
-        spinlock_t net_response_lock;
-        unsigned int net_response_ids;
-        struct list_head net_response_list;
-        struct o2hb_callback_func osb_hb_up;
-        struct o2hb_callback_func osb_hb_down;
-        struct list_head        osb_net_handlers;
        wait_queue_head_t               osb_mount_event;
        /* Truncate log info */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 6ef876759a73..3633edd3982f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,6 +231,20 @@ struct ocfs2_space_resv {
 #define OCFS2_IOC_RESVSP64      _IOW ('X', 42, struct ocfs2_space_resv)
 #define OCFS2_IOC_UNRESVSP64    _IOW ('X', 43, struct ocfs2_space_resv)
+/*
+ * Used to pass group descriptor data when online resize is done.
+ * This is the argument type of the OCFS2_IOC_GROUP_ADD and
+ * OCFS2_IOC_GROUP_ADD64 ioctls.
+ */
+struct ocfs2_new_group_input {
+        __u64 group;            /* Group descriptor's blkno. */
+        __u32 clusters;         /* Total number of clusters in this group */
+        __u32 frees;            /* Total free clusters in this group */
+        __u16 chain;            /* Chain for this group */
+        __u16 reserved1;
+        __u32 reserved2;
+};
+#define OCFS2_IOC_GROUP_EXTEND  _IOW('o', 1, int)
+#define OCFS2_IOC_GROUP_ADD     _IOW('o', 2,struct ocfs2_new_group_input)
+#define OCFS2_IOC_GROUP_ADD64   _IOW('o', 3,struct ocfs2_new_group_input)
 /*
 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
 */
@@ -256,6 +270,14 @@ struct ocfs2_space_resv {
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE          (4 * 1024 * 1024)
+/*
+ * Default local alloc size (in megabytes)
+ *
+ * The value chosen should be such that most allocations, including new
+ * block groups, use local alloc.
+ */
+#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE  8
 struct ocfs2_system_inode_info {
        char    *si_name;
        int     si_iflags;
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4ca02b1c38ac..86f3e3799c2b 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_RW,
        OCFS2_LOCK_TYPE_DENTRY,
        OCFS2_LOCK_TYPE_OPEN,
+        OCFS2_LOCK_TYPE_FLOCK,
        OCFS2_NUM_LOCK_TYPES
 };
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_OPEN:
                        c = 'O';
                        break;
+                case OCFS2_LOCK_TYPE_FLOCK:
+                        c = 'F';
+                        break;
                default:
                        c = '\0';
        }
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
        [OCFS2_LOCK_TYPE_RW] = "Write/Read",
        [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
        [OCFS2_LOCK_TYPE_OPEN] = "Open",
+        [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
 };
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 000000000000..37835ffcb039
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,634 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.c
+ *
+ * volume resize.
+ * Inspired by ext3/resize.c.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "resize.h"
+/*
+ * Find the backup superblocks that fall inside the clusters being added
+ * to the last group, i.e. inside
+ * [first_new_cluster, first_new_cluster + new_clusters), and set
+ * (set != 0) or clear (set == 0) their bits in the group bitmap.
+ *
+ * Backup superblock locations in the part of the group that already
+ * belonged to the volume are deliberately skipped: counting them again
+ * would make the caller subtract them from the free counts a second
+ * time, and clearing them on rollback would mark a cluster that is in
+ * use as free.  Locations past the end of the new range are not part of
+ * the volume yet and are skipped as well.
+ *
+ * Return how many backups were found in the new clusters.
+ */
+static u16 ocfs2_calc_new_backup_super(struct inode *inode,
+                                       struct ocfs2_group_desc *gd,
+                                       int new_clusters,
+                                       u32 first_new_cluster,
+                                       u16 cl_cpg,
+                                       int set)
+{
+        int i;
+        u16 backups = 0;
+        u32 cluster;
+        u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
+        /* One past the last new cluster; 64 bit so the sum cannot wrap. */
+        u64 end_cluster = (u64)first_new_cluster + new_clusters;
+        mlog_entry_void();
+        for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+                blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+                cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+                gd_blkno = ocfs2_which_cluster_group(inode, cluster);
+                if (gd_blkno < lgd_blkno)
+                        continue;
+                else if (gd_blkno > lgd_blkno)
+                        break;
+                /* Already part of the volume before this resize. */
+                if (cluster < first_new_cluster)
+                        continue;
+                /* Beyond the clusters this resize adds. */
+                if (cluster >= end_cluster)
+                        break;
+                if (set)
+                        ocfs2_set_bit(cluster % cl_cpg,
+                                      (unsigned long *)gd->bg_bitmap);
+                else
+                        ocfs2_clear_bit(cluster % cl_cpg,
+                                        (unsigned long *)gd->bg_bitmap);
+                backups++;
+        }
+        mlog_exit_void();
+        return backups;
+}
+/*
+ * Grow the last group descriptor (group_bh) by new_clusters clusters and
+ * update the bitmap inode (bm_bh) to match, using the caller's
+ * transaction handle.
+ *
+ * The group descriptor is updated and dirtied first, then the chain
+ * record, the inode totals, i_clusters and i_size.  If a journal call
+ * fails after the group descriptor was modified, the in-memory changes
+ * to the group descriptor are undone at out_rollback.
+ *
+ * Returns 0 on success or the negative error from the journal calls.
+ */
+static int ocfs2_update_last_group_and_inode(handle_t *handle,
+                                             struct inode *bm_inode,
+                                             struct buffer_head *bm_bh,
+                                             struct buffer_head *group_bh,
+                                             u32 first_new_cluster,
+                                             int new_clusters)
+{
+        int ret = 0;
+        struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
+        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
+        struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+        struct ocfs2_chain_rec *cr;
+        struct ocfs2_group_desc *group;
+        u16 chain, num_bits, backups = 0;
+        u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
+        u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+        mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
+                   new_clusters, first_new_cluster);
+        ret = ocfs2_journal_access(handle, bm_inode, group_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out;
+        }
+        group = (struct ocfs2_group_desc *)group_bh->b_data;
+        /* update the group first. */
+        num_bits = new_clusters * cl_bpc;
+        le16_add_cpu(&group->bg_bits, num_bits);
+        le16_add_cpu(&group->bg_free_bits_count, num_bits);
+        /*
+         * Check whether any new backup superblocks exist in this
+         * group and update the group bitmap accordingly.
+         */
+        if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+                                     OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+                backups = ocfs2_calc_new_backup_super(bm_inode,
+                                                     group,
+                                                     new_clusters,
+                                                     first_new_cluster,
+                                                     cl_cpg, 1);
+                le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
+        }
+        ret = ocfs2_journal_dirty(handle, group_bh);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_rollback;
+        }
+        /* update the inode accordingly. */
+        ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_rollback;
+        }
+        chain = le16_to_cpu(group->bg_chain);
+        cr = (&cl->cl_recs[chain]);
+        le32_add_cpu(&cr->c_total, num_bits);
+        le32_add_cpu(&cr->c_free, num_bits);
+        le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
+        le32_add_cpu(&fe->i_clusters, new_clusters);
+        if (backups) {
+                le32_add_cpu(&cr->c_free, -1 * backups);
+                le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
+        }
+        spin_lock(&OCFS2_I(bm_inode)->ip_lock);
+        OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+        /*
+         * Widen before shifting: the byte count of the added clusters
+         * does not fit in an int once it reaches 2GB.
+         */
+        le64_add_cpu(&fe->i_size,
+                     (u64)new_clusters << osb->s_clustersize_bits);
+        spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
+        i_size_write(bm_inode, le64_to_cpu(fe->i_size));
+        ocfs2_journal_dirty(handle, bm_bh);
+out_rollback:
+        if (ret < 0) {
+                /* Undo the in-memory changes made to the group descriptor. */
+                ocfs2_calc_new_backup_super(bm_inode,
+                                            group,
+                                            new_clusters,
+                                            first_new_cluster,
+                                            cl_cpg, 0);
+                le16_add_cpu(&group->bg_free_bits_count, backups);
+                le16_add_cpu(&group->bg_bits, -1 * num_bits);
+                le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
+        }
+out:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Copy the superblock contents in 'data' (one filesystem block) to every
+ * backup superblock location that lies inside a volume of 'clusters'
+ * clusters.  Each copy gets its own block number stored in i_blkno
+ * before it is written.
+ *
+ * Stops at the first read or write failure and returns that negative
+ * error; returns 0 if every backup inside the volume was written.
+ */
+static int update_backups(struct inode * inode, u32 clusters, char *data)
+{
+        int i, ret = 0;
+        u32 cluster;
+        u64 blkno;
+        struct buffer_head *backup = NULL;
+        struct ocfs2_dinode *backup_di = NULL;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        /* calculate the real backups we need to update. */
+        for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+                blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+                cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+                /*
+                 * Valid cluster indexes are 0 .. clusters - 1, so a
+                 * backup at index 'clusters' is already off the volume.
+                 */
+                if (cluster >= clusters)
+                        break;
+                ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        break;
+                }
+                memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
+                backup_di = (struct ocfs2_dinode *)backup->b_data;
+                backup_di->i_blkno = cpu_to_le64(blkno);
+                ret = ocfs2_write_super_or_backup(osb, backup);
+                brelse(backup);
+                backup = NULL;
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        break;
+                }
+        }
+        return ret;
+}
+/*
+ * Add new_clusters to the cluster count stored in the superblock, write
+ * the superblock out and, when the volume has the backup superblock
+ * feature, refresh the backups from the updated block.
+ *
+ * Failures are not propagated to the caller; they are logged and a
+ * warning asking for fsck.ocfs2 is printed.
+ */
+static void ocfs2_update_super_and_backups(struct inode *inode,
+                                           int new_clusters)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct buffer_head *sb_bh = NULL;
+        struct ocfs2_dinode *sb_di;
+        u32 total_clusters;
+        int err;
+        /* The superblock is updated last; a failed write is tolerated. */
+        err = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
+                               &sb_bh, 0, NULL);
+        if (err < 0) {
+                mlog_errno(err);
+                goto done;
+        }
+        sb_di = (struct ocfs2_dinode *)sb_bh->b_data;
+        le32_add_cpu(&sb_di->i_clusters, new_clusters);
+        total_clusters = le32_to_cpu(sb_di->i_clusters);
+        err = ocfs2_write_super_or_backup(osb, sb_bh);
+        if (err < 0) {
+                mlog_errno(err);
+                goto done;
+        }
+        if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
+                err = update_backups(inode, total_clusters, sb_bh->b_data);
+done:
+        brelse(sb_bh);
+        if (err)
+                printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
+                        " during fs resize. This condition is not fatal,"
+                        " but fsck.ocfs2 should be run to fix it\n",
+                        osb->dev_str);
+}
+/*
+ * Extend the filesystem by new_clusters clusters.  This entry point is
+ * only used to grow the volume towards the end of the last existing
+ * group; a request that does not fit in that group is rejected with
+ * -EINVAL.
+ *
+ * Returns 0 on success (or when new_clusters is 0), -EROFS on a
+ * read-only mount, -EINVAL for a negative or oversized request, or a
+ * negative error from the locking, read or journal calls.
+ */
+int ocfs2_group_extend(struct inode * inode, int new_clusters)
+{
+        int ret;
+        handle_t *handle;
+        struct buffer_head *main_bm_bh = NULL;
+        struct buffer_head *group_bh = NULL;
+        struct inode *main_bm_inode = NULL;
+        struct ocfs2_dinode *fe = NULL;
+        struct ocfs2_group_desc *group = NULL;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        u16 cl_bpc, cl_cpg;
+        u32 first_new_cluster, group_clusters;
+        u64 lgd_blkno;
+        mlog_entry_void();
+        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+                return -EROFS;
+        if (new_clusters < 0)
+                return -EINVAL;
+        else if (new_clusters == 0)
+                return 0;
+        main_bm_inode = ocfs2_get_system_file_inode(osb,
+                                                    GLOBAL_BITMAP_SYSTEM_INODE,
+                                                    OCFS2_INVALID_SLOT);
+        if (!main_bm_inode) {
+                ret = -EINVAL;
+                mlog_errno(ret);
+                goto out;
+        }
+        mutex_lock(&main_bm_inode->i_mutex);
+        ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_mutex;
+        }
+        fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+        if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+                                 ocfs2_group_bitmap_size(osb->sb) * 8) {
+                mlog(ML_ERROR, "The disk is too old and small. "
+                     "Force to do offline resize.");
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        if (!OCFS2_IS_VALID_DINODE(fe)) {
+                OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
+                ret = -EIO;
+                goto out_unlock;
+        }
+        /* The first cluster to add is the one right after the current end. */
+        first_new_cluster = le32_to_cpu(fe->i_clusters);
+        lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
+                                              first_new_cluster - 1);
+        ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
+                               main_bm_inode);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_unlock;
+        }
+        group = (struct ocfs2_group_desc *)group_bh->b_data;
+        ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_unlock;
+        }
+        cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+        cl_cpg = le16_to_cpu(fe->id2.i_chain.cl_cpg);
+        group_clusters = le16_to_cpu(group->bg_bits) / cl_bpc;
+        /*
+         * Compare against the room left in the group instead of adding
+         * new_clusters to the current size: the addition is done in int
+         * and can wrap for a huge new_clusters, letting the request
+         * slip past this check.
+         */
+        if (group_clusters > cl_cpg ||
+            (u32)new_clusters > cl_cpg - group_clusters) {
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        mlog(0, "extend the last group at %llu, new clusters = %d\n",
+             (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
+        handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
+        if (IS_ERR(handle)) {
+                mlog_errno(PTR_ERR(handle));
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        /* update the last group descriptor and inode. */
+        ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
+                                                main_bm_bh, group_bh,
+                                                first_new_cluster,
+                                                new_clusters);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
+out_commit:
+        ocfs2_commit_trans(osb, handle);
+out_unlock:
+        brelse(group_bh);
+        brelse(main_bm_bh);
+        ocfs2_inode_unlock(main_bm_inode, 1);
+out_mutex:
+        mutex_unlock(&main_bm_inode->i_mutex);
+        iput(main_bm_inode);
+out:
+        mlog_exit_void();
+        return ret;
+}
+/*
+ * Sanity check the group descriptor in group_bh against the bitmap
+ * inode 'di' and against what the caller passed in 'input'.
+ *
+ * Checks, in order: descriptor signature, parent dinode pointer, bit
+ * count against the per-group maximum, free count against bit count,
+ * bit count against the bitmap size, chain number, and the cluster and
+ * free counts against 'input'.  The first failing check is logged.
+ *
+ * Returns 0 if every check passes, -EIO otherwise.
+ */
+static int ocfs2_check_new_group(struct inode *inode,
+                                 struct ocfs2_dinode *di,
+                                 struct ocfs2_new_group_input *input,
+                                 struct buffer_head *group_bh)
+{
+        int ret;
+        struct ocfs2_group_desc *gd;
+        u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
+        unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
+                                le16_to_cpu(di->id2.i_chain.cl_bpc);
+        gd = (struct ocfs2_group_desc *)group_bh->b_data;
+        ret = -EIO;
+        if (!OCFS2_IS_VALID_GROUP_DESC(gd))
+                mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno));
+        else if (di->i_blkno != gd->bg_parent_dinode)
+                mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
+                     "pointer (%llu, expected %llu)\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+                     (unsigned long long)le64_to_cpu(di->i_blkno));
+        else if (le16_to_cpu(gd->bg_bits) > max_bits)
+                mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_bits));
+        else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
+                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+                     "claims that %u are free\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_bits),
+                     le16_to_cpu(gd->bg_free_bits_count));
+        else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
+                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+                     "max bitmap bits of %u\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_bits),
+                     8 * le16_to_cpu(gd->bg_size));
+        else if (le16_to_cpu(gd->bg_chain) != input->chain)
+                mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
+                     "while input has %u set.\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_chain), input->chain);
+        else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
+                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+                     "input has %u clusters set\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_bits), input->clusters);
+        else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
+                /* Report the value that was actually compared. */
+                mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
+                     "but it should have %u set\n",
+                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                     le16_to_cpu(gd->bg_free_bits_count),
+                     input->frees * cl_bpc);
+        else
+                ret = 0;
+        return ret;
+}
+/*
+ * Validate a group-add request against the bitmap inode 'di' before
+ * anything is modified.  Each rejected condition logs its own message.
+ *
+ * Returns 0 when the request is acceptable, -EINVAL when 'input' is
+ * inconsistent with the volume, or the error returned by
+ * ocfs2_check_new_group() when the descriptor itself fails its checks.
+ */
+static int ocfs2_verify_group_and_input(struct inode *inode,
+                                        struct ocfs2_dinode *di,
+                                        struct ocfs2_new_group_input *input,
+                                        struct buffer_head *group_bh)
+{
+        struct ocfs2_chain_list *chains = &di->id2.i_chain;
+        u16 nr_chains = le16_to_cpu(chains->cl_count);
+        u16 per_group = le16_to_cpu(chains->cl_cpg);
+        u16 next_rec = le16_to_cpu(chains->cl_next_free_rec);
+        u32 group_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+                                                     input->group);
+        u32 cur_clusters = le32_to_cpu(di->i_clusters);
+        int ret;
+        if (group_cluster < cur_clusters) {
+                mlog(ML_ERROR, "add a group which is in the current volume.\n");
+                return -EINVAL;
+        }
+        if (input->chain >= nr_chains) {
+                mlog(ML_ERROR, "input chain exceeds the limit.\n");
+                return -EINVAL;
+        }
+        if (next_rec != nr_chains && next_rec != input->chain) {
+                mlog(ML_ERROR,
+                     "the add group should be in chain %u\n", next_rec);
+                return -EINVAL;
+        }
+        /* Unsigned wrap-around means the new total does not fit in 32 bits. */
+        if (cur_clusters + input->clusters < cur_clusters) {
+                mlog(ML_ERROR, "add group's clusters overflow.\n");
+                return -EINVAL;
+        }
+        if (input->clusters > per_group) {
+                mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
+                return -EINVAL;
+        }
+        if (input->frees > input->clusters) {
+                mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
+                return -EINVAL;
+        }
+        if (cur_clusters % per_group != 0) {
+                mlog(ML_ERROR,
+                     "the last group isn't full. Use group extend first.\n");
+                return -EINVAL;
+        }
+        if (input->group != ocfs2_which_cluster_group(inode, group_cluster)) {
+                mlog(ML_ERROR, "group blkno is invalid\n");
+                return -EINVAL;
+        }
+        ret = ocfs2_check_new_group(inode, di, input, group_bh);
+        if (ret)
+                mlog(ML_ERROR, "group descriptor check failed.\n");
+        return ret;
+}
+/*
+ * Add a new group descriptor to global_bitmap.
+ *
+ * The descriptor at input->group is read from disk and validated
+ * against the bitmap inode and against 'input'.  It is then linked at
+ * the head of chain input->chain, and the chain record, inode totals,
+ * i_clusters and i_size are updated in one transaction.  The
+ * superblock and its backups are updated last.
+ *
+ * Returns 0 on success, -EROFS on a read-only mount, or a negative
+ * error from the validation, locking, read or journal calls.
+ */
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
+{
+        int ret;
+        handle_t *handle;
+        struct buffer_head *main_bm_bh = NULL;
+        struct inode *main_bm_inode = NULL;
+        struct ocfs2_dinode *fe = NULL;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct buffer_head *group_bh = NULL;
+        struct ocfs2_group_desc *group = NULL;
+        struct ocfs2_chain_list *cl;
+        struct ocfs2_chain_rec *cr;
+        u16 cl_bpc;
+        mlog_entry_void();
+        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+                return -EROFS;
+        main_bm_inode = ocfs2_get_system_file_inode(osb,
+                                                    GLOBAL_BITMAP_SYSTEM_INODE,
+                                                    OCFS2_INVALID_SLOT);
+        if (!main_bm_inode) {
+                ret = -EINVAL;
+                mlog_errno(ret);
+                goto out;
+        }
+        mutex_lock(&main_bm_inode->i_mutex);
+        ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_mutex;
+        }
+        fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+        if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+                                 ocfs2_group_bitmap_size(osb->sb) * 8) {
+                mlog(ML_ERROR, "The disk is too old and small."
+                     " Force to do offline resize.");
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
+        if (ret < 0) {
+                mlog(ML_ERROR, "Can't read the group descriptor # %llu "
+                     "from the device.", (unsigned long long)input->group);
+                goto out_unlock;
+        }
+        ocfs2_set_new_buffer_uptodate(inode, group_bh);
+        ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_unlock;
+        }
+        mlog(0, "Add a new group  %llu in chain = %u, length = %u\n",
+             (unsigned long long)input->group, input->chain, input->clusters);
+        handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
+        if (IS_ERR(handle)) {
+                mlog_errno(PTR_ERR(handle));
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+        cl = &fe->id2.i_chain;
+        cr = &cl->cl_recs[input->chain];
+        ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        /* Link the new group in front of the chain's current head. */
+        group = (struct ocfs2_group_desc *)group_bh->b_data;
+        group->bg_next_group = cr->c_blkno;
+        ret = ocfs2_journal_dirty(handle, group_bh);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        /* First group of a previously unused chain: start from a clean record. */
+        if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
+                le16_add_cpu(&cl->cl_next_free_rec, 1);
+                memset(cr, 0, sizeof(struct ocfs2_chain_rec));
+        }
+        /* input->group is in CPU byte order; c_blkno is an on-disk field. */
+        cr->c_blkno = cpu_to_le64(input->group);
+        le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
+        le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
+        le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
+        le32_add_cpu(&fe->id1.bitmap1.i_used,
+                     (input->clusters - input->frees) * cl_bpc);
+        le32_add_cpu(&fe->i_clusters, input->clusters);
+        ocfs2_journal_dirty(handle, main_bm_bh);
+        spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
+        OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+        /*
+         * Widen before shifting: a 32 bit shift would truncate the byte
+         * count of the new group once it reaches 4GB.
+         */
+        le64_add_cpu(&fe->i_size,
+                     (u64)input->clusters << osb->s_clustersize_bits);
+        spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
+        i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
+        ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
+out_commit:
+        ocfs2_commit_trans(osb, handle);
+out_unlock:
+        brelse(group_bh);
+        brelse(main_bm_bh);
+        ocfs2_inode_unlock(main_bm_inode, 1);
+out_mutex:
+        mutex_unlock(&main_bm_inode->i_mutex);
+        iput(main_bm_inode);
+out:
+        mlog_exit_void();
+        return ret;
+}
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
new file mode 100644
index 000000000000..f38841abf10b
--- /dev/null
+++ b/fs/ocfs2/resize.h
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef OCFS2_RESIZE_H
+#define OCFS2_RESIZE_H
+int ocfs2_group_extend(struct inode * inode, int new_clusters);
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
+#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index af4882b62cfa..3a50ce555e64 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
                              s16 slot_num,
                              s16 node_num);
-/* Use the slot information we've collected to create a map of mounted
- * nodes. Should be holding an EX on super block. assumes slot info is
- * up to date. Note that we call this *after* we find a slot, so our
- * own node should be set in the map too... */
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
-{
-        int i;
-        struct ocfs2_slot_info *si = osb->slot_info;
-        spin_lock(&si->si_lock);
-        for (i = 0; i < si->si_size; i++)
-                if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
-                        ocfs2_node_map_set_bit(osb, &osb->mounted_map,
-                                              si->si_global_node_nums[i]);
-        spin_unlock(&si->si_lock);
-}
 /* post the slot information on disk into our slot_info struct. */
 void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 {
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031b..1025872aaade 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
 void ocfs2_clear_slot(struct ocfs2_slot_info *si,
                      s16 slot_num);
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
 static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
                                      int slot_num)
 {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8f09f5235e3a..7e397e2c25dd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
                                                   u64 bg_blkno,
                                                   u16 bg_bit_off);
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
-                                            u32 cluster);
 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
                                                u64 data_blkno,
                                                u64 *bg_blkno,
@@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
        if (inode) {
                if (ac->ac_which != OCFS2_AC_USE_LOCAL)
-                        ocfs2_meta_unlock(inode, 1);
+                        ocfs2_inode_unlock(inode, 1);
                mutex_unlock(&inode->i_mutex);
@@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 }
 /* somewhat more expensive than our other checks, so use sparingly. */
-static int ocfs2_check_group_descriptor(struct super_block *sb,
+int ocfs2_check_group_descriptor(struct super_block *sb,
-                                        struct ocfs2_dinode *di,
+                                 struct ocfs2_dinode *di,
-                                        struct ocfs2_group_desc *gd)
+                                 struct ocfs2_group_desc *gd)
 {
        unsigned int max_bits;
@@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
        mutex_lock(&alloc_inode->i_mutex);
-        status = ocfs2_meta_lock(alloc_inode, &bh, 1);
+        status = ocfs2_inode_lock(alloc_inode, &bh, 1);
        if (status < 0) {
                mutex_unlock(&alloc_inode->i_mutex);
                iput(alloc_inode);
@@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
 /* given a cluster offset, calculate which block group it belongs to
 * and return that block offset. */
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
-                                            u32 cluster)
 {
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        u32 group_no;
@@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
                if (min_clusters > (osb->bitmap_cpg - 1)) {
                        /* The only paths asking for contiguousness
                         * should know about this already. */
-                        mlog(ML_ERROR, "minimum allocation requested exceeds "
+                        mlog(ML_ERROR, "minimum allocation requested %u exceeds "
-                                       "group bitmap size!");
+                             "group bitmap size %u!\n", min_clusters,
+                             osb->bitmap_cpg);
                        status = -ENOSPC;
                        goto bail;
                }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index cafe93703095..8799033bb459 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
                                      struct ocfs2_alloc_context *ac);
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
+/* somewhat more expensive than our other checks, so use sparingly. */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+                                 struct ocfs2_dinode *di,
+                                 struct ocfs2_group_desc *gd);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index be562ac3e89c..01fe40ee5ea9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,7 +65,6 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "ver.h"
-#include "vote.h"
 #include "buffer_head_io.h"
@@ -84,9 +83,11 @@ MODULE_LICENSE("GPL");
 struct mount_options
 {
+        unsigned long   commit_interval;
        unsigned long   mount_opt;
        unsigned int    atime_quantum;
        signed short    slot;
+        unsigned int    localalloc_opt;
 };
 static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -150,6 +151,9 @@ enum {
        Opt_data_writeback,
        Opt_atime_quantum,
        Opt_slot,
+        Opt_commit,
+        Opt_localalloc,
+        Opt_localflocks,
        Opt_err,
 };
@@ -165,6 +169,9 @@ static match_table_t tokens = {
        {Opt_data_writeback, "data=writeback"},
        {Opt_atime_quantum, "atime_quantum=%u"},
        {Opt_slot, "preferred_slot=%u"},
+        {Opt_commit, "commit=%u"},
+        {Opt_localalloc, "localalloc=%d"},
+        {Opt_localflocks, "localflocks"},
        {Opt_err, NULL}
 };
@@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
        mlog_entry_void();
-        new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE);
+        new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
        if (IS_ERR(new)) {
                status = PTR_ERR(new);
                mlog_errno(status);
@@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
        }
        osb->root_inode = new;
-        new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE);
+        new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
        if (IS_ERR(new)) {
                status = PTR_ERR(new);
                mlog_errno(status);
@@ -438,14 +445,16 @@ unlock_osb:
        }
        if (!ret) {
-                if (!ocfs2_is_hard_readonly(osb))
-                        ocfs2_set_journal_params(osb);
                /* Only save off the new mount options in case of a successful
                 * remount. */
                osb->s_mount_opt = parsed_options.mount_opt;
                osb->s_atime_quantum = parsed_options.atime_quantum;
                osb->preferred_slot = parsed_options.slot;
+                if (parsed_options.commit_interval)
+                        osb->osb_commit_interval = parsed_options.commit_interval;
+                if (!ocfs2_is_hard_readonly(osb))
+                        ocfs2_set_journal_params(osb);
        }
 out:
        return ret;
@@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        osb->s_mount_opt = parsed_options.mount_opt;
        osb->s_atime_quantum = parsed_options.atime_quantum;
        osb->preferred_slot = parsed_options.slot;
+        osb->osb_commit_interval = parsed_options.commit_interval;
+        osb->local_alloc_size = parsed_options.localalloc_opt;
        sb->s_magic = OCFS2_SUPER_MAGIC;
@@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb,
        mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
                   options ? options : "(none)");
+        mopt->commit_interval = 0;
        mopt->mount_opt = 0;
        mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
        mopt->slot = OCFS2_INVALID_SLOT;
+        mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
        if (!options) {
                status = 1;
@@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb,
                        if (option)
                                mopt->slot = (s16)option;
                        break;
+                case Opt_commit:
+                        option = 0;
+                        if (match_int(&args[0], &option)) {
+                                status = 0;
+                                goto bail;
+                        }
+                        if (option < 0)
+                                return 0;
+                        if (option == 0)
+                                option = JBD_DEFAULT_MAX_COMMIT_AGE;
+                        mopt->commit_interval = HZ * option;
+                        break;
+                case Opt_localalloc:
+                        option = 0;
+                        if (match_int(&args[0], &option)) {
+                                status = 0;
+                                goto bail;
+                        }
+                        if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+                                mopt->localalloc_opt = option;
+                        break;
+                case Opt_localflocks:
+                        /*
+                         * Changing this during remount could race
+                         * flock() requests, or "unbalance" existing
+                         * ones (e.g., a lock is taken in one mode but
+                         * dropped in the other). If users care enough
+                         * to flip locking modes during remount, we
+                         * could add a "local" flag to individual
+                         * flock structures for proper tracking of
+                         * state.
+                         */
+                        if (!is_remount)
+                                mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
+                        break;
                default:
                        mlog(ML_ERROR,
                             "Unrecognized mount option \"%s\" "
@@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
                seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+        if (osb->osb_commit_interval)
+                seq_printf(s, ",commit=%u",
+                           (unsigned) (osb->osb_commit_interval / HZ));
+        if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+                seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+        if (opts & OCFS2_MOUNT_LOCALFLOCKS)
+                seq_printf(s, ",localflocks,");
        return 0;
 }
@@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
                goto bail;
        }
-        status = ocfs2_meta_lock(inode, &bh, 0);
+        status = ocfs2_inode_lock(inode, &bh, 0);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
        brelse(bh);
-        ocfs2_meta_unlock(inode, 0);
+        ocfs2_inode_unlock(inode, 0);
        status = 0;
 bail:
        if (inode)
@@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
        oi->ip_clusters = 0;
        ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
-        ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
+        ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
-        ocfs2_lock_res_init_once(&oi->ip_data_lockres);
        ocfs2_lock_res_init_once(&oi->ip_open_lockres);
        ocfs2_metadata_cache_init(&oi->vfs_inode);
@@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb)
                goto leave;
        }
-        status = ocfs2_register_hb_callbacks(osb);
-        if (status < 0) {
-                mlog_errno(status);
-                goto leave;
-        }
        status = ocfs2_dlm_init(osb);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        /* requires vote_thread to be running. */
-        status = ocfs2_register_net_handlers(osb);
-        if (status < 0) {
-                mlog_errno(status);
-                goto leave;
-        }
        status = ocfs2_super_lock(osb, 1);
        if (status < 0) {
                mlog_errno(status);
@@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
                goto leave;
        }
-        ocfs2_populate_mounted_map(osb);
        /* load all node-local system inodes */
        status = ocfs2_init_local_system_inodes(osb);
        if (status < 0) {
@@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
        if (ocfs2_mount_local(osb))
                goto leave;
-        /* This should be sent *after* we recovered our journal as it
-         * will cause other nodes to unmark us as needing
-         * recovery. However, we need to send it *before* dropping the
-         * super block lock as otherwise their recovery threads might
-         * try to clean us up while we're live! */
-        status = ocfs2_request_mount_vote(osb);
-        if (status < 0)
-                mlog_errno(status);
 leave:
        if (unlock_super)
                ocfs2_super_unlock(osb, 1);
@@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
                        mlog_errno(tmp);
                        return;
                }
-                tmp = ocfs2_request_umount_vote(osb);
-                if (tmp < 0)
-                        mlog_errno(tmp);
        }
        if (osb->slot_num != OCFS2_INVALID_SLOT)
@@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        ocfs2_release_system_inodes(osb);
-        if (osb->dlm) {
+        if (osb->dlm)
-                ocfs2_unregister_net_handlers(osb);
                ocfs2_dlm_shutdown(osb);
-        }
-        ocfs2_clear_hb_callbacks(osb);
        debugfs_remove(osb->osb_debug_root);
@@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
        int i, cbits, bbits;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
        struct inode *inode = NULL;
-        struct buffer_head *bitmap_bh = NULL;
        struct ocfs2_journal *journal;
        __le32 uuid_net_key;
        struct ocfs2_super *osb;
@@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
        osb->s_sectsize_bits = blksize_bits(sector_size);
        BUG_ON(!osb->s_sectsize_bits);
-        osb->net_response_ids = 0;
-        spin_lock_init(&osb->net_response_lock);
-        INIT_LIST_HEAD(&osb->net_response_list);
-        INIT_LIST_HEAD(&osb->osb_net_handlers);
        init_waitqueue_head(&osb->recovery_event);
-        spin_lock_init(&osb->vote_task_lock);
+        spin_lock_init(&osb->dc_task_lock);
-        init_waitqueue_head(&osb->vote_event);
+        init_waitqueue_head(&osb->dc_event);
-        osb->vote_work_sequence = 0;
+        osb->dc_work_sequence = 0;
-        osb->vote_wake_sequence = 0;
+        osb->dc_wake_sequence = 0;
        INIT_LIST_HEAD(&osb->blocked_lock_list);
        osb->blocked_lock_count = 0;
-        INIT_LIST_HEAD(&osb->vote_list);
        spin_lock_init(&osb->osb_lock);
        atomic_set(&osb->alloc_stats.moves, 0);
@@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
        }
        memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
-        osb->net_key = le32_to_cpu(uuid_net_key);
        strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
        osb->vol_label[63] = '\0';
@@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
        }
        osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
-        /* We don't have a cluster lock on the bitmap here because
-         * we're only interested in static information and the extra
-         * complexity at mount time isn't worht it. Don't pass the
-         * inode in to the read function though as we don't want it to
-         * be put in the cache. */
-        status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
-                                  NULL);
        iput(inode);
-        if (status < 0) {
-                mlog_errno(status);
-                goto bail;
-        }
-        di = (struct ocfs2_dinode *) bitmap_bh->b_data;
+        osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
-        osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
-        brelse(bitmap_bh);
-        mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
-             (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
        status = ocfs2_init_slot_info(osb);
        if (status < 0) {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fd2e846e3e6f..ab713ebdd546 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
                goto bail;
        }
-        inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE);
+        inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
        if (IS_ERR(inode)) {
                mlog_errno(PTR_ERR(inode));
                inode = NULL;
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
index 5405ce121c99..e2488f4128a2 100644
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
@@ -29,7 +29,7 @@
 #include "ver.h"
-#define OCFS2_BUILD_VERSION "1.3.3"
+#define OCFS2_BUILD_VERSION "1.5.0"
 #define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index c05358538f2b..000000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,756 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * vote.c
- *
- * description here
- *
- * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/kthread.h>
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-#include <dlm/dlmapi.h>
-#define MLOG_MASK_PREFIX ML_VOTE
-#include <cluster/masklog.h>
-#include "ocfs2.h"
-#include "alloc.h"
-#include "dlmglue.h"
-#include "extent_map.h"
-#include "heartbeat.h"
-#include "inode.h"
-#include "journal.h"
-#include "slot_map.h"
-#include "vote.h"
-#include "buffer_head_io.h"
-#define OCFS2_MESSAGE_TYPE_VOTE     (0x1)
-#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
-struct ocfs2_msg_hdr
-{
-        __be32 h_response_id; /* used to lookup message handle on sending
-                            * node. */
-        __be32 h_request;
-        __be64 h_blkno;
-        __be32 h_generation;
-        __be32 h_node_num;    /* node sending this particular message. */
-};
-struct ocfs2_vote_msg
-{
-        struct ocfs2_msg_hdr v_hdr;
-        __be32 v_reserved1;
-} __attribute__ ((packed));
-/* Responses are given these values to maintain backwards
- * compatibility with older ocfs2 versions */
-#define OCFS2_RESPONSE_OK               (0)
-#define OCFS2_RESPONSE_BUSY             (-16)
-#define OCFS2_RESPONSE_BAD_MSG          (-22)
-struct ocfs2_response_msg
-{
-        struct ocfs2_msg_hdr r_hdr;
-        __be32 r_response;
-} __attribute__ ((packed));
-struct ocfs2_vote_work {
-        struct list_head   w_list;
-        struct ocfs2_vote_msg w_msg;
-};
-enum ocfs2_vote_request {
-        OCFS2_VOTE_REQ_INVALID = 0,
-        OCFS2_VOTE_REQ_MOUNT,
-        OCFS2_VOTE_REQ_UMOUNT,
-        OCFS2_VOTE_REQ_LAST
-};
-static inline int ocfs2_is_valid_vote_request(int request)
-{
-        return OCFS2_VOTE_REQ_INVALID < request &&
-                request < OCFS2_VOTE_REQ_LAST;
-}
-typedef void (*ocfs2_net_response_callback)(void *priv,
-                                            struct ocfs2_response_msg *resp);
-struct ocfs2_net_response_cb {
-        ocfs2_net_response_callback     rc_cb;
-        void                            *rc_priv;
-};
-struct ocfs2_net_wait_ctxt {
-        struct list_head        n_list;
-        u32                     n_response_id;
-        wait_queue_head_t       n_event;
-        struct ocfs2_node_map   n_node_map;
-        int                     n_response; /* an agreggate response. 0 if
-                                             * all nodes are go, < 0 on any
-                                             * negative response from any
-                                             * node or network error. */
-        struct ocfs2_net_response_cb *n_callback;
-};
-static void ocfs2_process_mount_request(struct ocfs2_super *osb,
-                                        unsigned int node_num)
-{
-        mlog(0, "MOUNT vote from node %u\n", node_num);
-        /* The other node only sends us this message when he has an EX
-         * on the superblock, so our recovery threads (if having been
-         * launched) are waiting on it.*/
-        ocfs2_recovery_map_clear(osb, node_num);
-        ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
-        /* We clear the umount map here because a node may have been
-         * previously mounted, safely unmounted but never stopped
-         * heartbeating - in which case we'd have a stale entry. */
-        ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
-static void ocfs2_process_umount_request(struct ocfs2_super *osb,
-                                         unsigned int node_num)
-{
-        mlog(0, "UMOUNT vote from node %u\n", node_num);
-        ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
-        ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
-}
-static void ocfs2_process_vote(struct ocfs2_super *osb,
-                               struct ocfs2_vote_msg *msg)
-{
-        int net_status, vote_response;
-        unsigned int node_num;
-        u64 blkno;
-        enum ocfs2_vote_request request;
-        struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
-        struct ocfs2_response_msg response;
-        /* decode the network mumbo jumbo into local variables. */
-        request = be32_to_cpu(hdr->h_request);
-        blkno = be64_to_cpu(hdr->h_blkno);
-        node_num = be32_to_cpu(hdr->h_node_num);
-        mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
-             request, (unsigned long long)blkno, node_num);
-        if (!ocfs2_is_valid_vote_request(request)) {
-                mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
-                     request, node_num);
-                vote_response = OCFS2_RESPONSE_BAD_MSG;
-                goto respond;
-        }
-        vote_response = OCFS2_RESPONSE_OK;
-        switch (request) {
-        case OCFS2_VOTE_REQ_UMOUNT:
-                ocfs2_process_umount_request(osb, node_num);
-                goto respond;
-        case OCFS2_VOTE_REQ_MOUNT:
-                ocfs2_process_mount_request(osb, node_num);
-                goto respond;
-        default:
-                /* avoids a gcc warning */
-                break;
-        }
-respond:
-        /* Response struture is small so we just put it on the stack
-         * and stuff it inline. */
-        memset(&response, 0, sizeof(struct ocfs2_response_msg));
-        response.r_hdr.h_response_id = hdr->h_response_id;
-        response.r_hdr.h_blkno = hdr->h_blkno;
-        response.r_hdr.h_generation = hdr->h_generation;
-        response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
-        response.r_response = cpu_to_be32(vote_response);
-        net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
-                                        osb->net_key,
-                                        &response,
-                                        sizeof(struct ocfs2_response_msg),
-                                        node_num,
-                                        NULL);
-        /* We still want to error print for ENOPROTOOPT here. The
-         * sending node shouldn't have unregistered his net handler
-         * without sending an unmount vote 1st */
-        if (net_status < 0
-            && net_status != -ETIMEDOUT
-            && net_status != -ENOTCONN)
-                mlog(ML_ERROR, "message to node %u fails with error %d!\n",
-                     node_num, net_status);
-}
-static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
-{
-        unsigned long processed;
-        struct ocfs2_lock_res *lockres;
-        struct ocfs2_vote_work *work;
-        mlog_entry_void();
-        spin_lock(&osb->vote_task_lock);
-        /* grab this early so we know to try again if a state change and
-         * wake happens part-way through our work  */
-        osb->vote_work_sequence = osb->vote_wake_sequence;
-        processed = osb->blocked_lock_count;
-        while (processed) {
-                BUG_ON(list_empty(&osb->blocked_lock_list));
-                lockres = list_entry(osb->blocked_lock_list.next,
-                                     struct ocfs2_lock_res, l_blocked_list);
-                list_del_init(&lockres->l_blocked_list);
-                osb->blocked_lock_count--;
-                spin_unlock(&osb->vote_task_lock);
-                BUG_ON(!processed);
-                processed--;
-                ocfs2_process_blocked_lock(osb, lockres);
-                spin_lock(&osb->vote_task_lock);
-        }
-        while (osb->vote_count) {
-                BUG_ON(list_empty(&osb->vote_list));
-                work = list_entry(osb->vote_list.next,
-                                  struct ocfs2_vote_work, w_list);
-                list_del(&work->w_list);
-                osb->vote_count--;
-                spin_unlock(&osb->vote_task_lock);
-                ocfs2_process_vote(osb, &work->w_msg);
-                kfree(work);
-                spin_lock(&osb->vote_task_lock);
-        }
-        spin_unlock(&osb->vote_task_lock);
-        mlog_exit_void();
-}
-static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
-{
-        int empty = 0;
-        spin_lock(&osb->vote_task_lock);
-        if (list_empty(&osb->blocked_lock_list) &&
-            list_empty(&osb->vote_list))
-                empty = 1;
-        spin_unlock(&osb->vote_task_lock);
-        return empty;
-}
-static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
-{
-        int should_wake = 0;
-        spin_lock(&osb->vote_task_lock);
-        if (osb->vote_work_sequence != osb->vote_wake_sequence)
-                should_wake = 1;
-        spin_unlock(&osb->vote_task_lock);
-        return should_wake;
-}
-int ocfs2_vote_thread(void *arg)
-{
-        int status = 0;
-        struct ocfs2_super *osb = arg;
-        /* only quit once we've been asked to stop and there is no more
-         * work available */
-        while (!(kthread_should_stop() &&
-                 ocfs2_vote_thread_lists_empty(osb))) {
-                wait_event_interruptible(osb->vote_event,
-                                         ocfs2_vote_thread_should_wake(osb) ||
-                                         kthread_should_stop());
-                mlog(0, "vote_thread: awoken\n");
-                ocfs2_vote_thread_do_work(osb);
-        }
-        osb->vote_task = NULL;
-        return status;
-}
-static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
-{
-        struct ocfs2_net_wait_ctxt *w;
-        w = kzalloc(sizeof(*w), GFP_NOFS);
-        if (!w) {
-                mlog_errno(-ENOMEM);
-                goto bail;
-        }
-        INIT_LIST_HEAD(&w->n_list);
-        init_waitqueue_head(&w->n_event);
-        ocfs2_node_map_init(&w->n_node_map);
-        w->n_response_id = response_id;
-        w->n_callback = NULL;
-bail:
-        return w;
-}
-static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
-{
-        unsigned int ret;
-        spin_lock(&osb->net_response_lock);
-        ret = ++osb->net_response_ids;
-        spin_unlock(&osb->net_response_lock);
-        return ret;
-}
-static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
-                                        struct ocfs2_net_wait_ctxt *w)
-{
-        spin_lock(&osb->net_response_lock);
-        list_del(&w->n_list);
-        spin_unlock(&osb->net_response_lock);
-}
-static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
-                                      struct ocfs2_net_wait_ctxt *w)
-{
-        spin_lock(&osb->net_response_lock);
-        list_add_tail(&w->n_list,
-                      &osb->net_response_list);
-        spin_unlock(&osb->net_response_lock);
-}
-static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
-                                        struct ocfs2_net_wait_ctxt *w,
-                                        int node_num)
-{
-        assert_spin_locked(&osb->net_response_lock);
-        ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
-        if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
-                wake_up(&w->n_event);
-}
-/* Intended to be called from the node down callback, we fake remove
- * the node from all our response contexts */
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
-                                        int node_num)
-{
-        struct list_head *p;
-        struct ocfs2_net_wait_ctxt *w = NULL;
-        spin_lock(&osb->net_response_lock);
-        list_for_each(p, &osb->net_response_list) {
-                w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
-                __ocfs2_mark_node_responded(osb, w, node_num);
-        }
-        spin_unlock(&osb->net_response_lock);
-}
-static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
-                                struct ocfs2_vote_msg *request,
-                                unsigned int response_id,
-                                int *response,
-                                struct ocfs2_net_response_cb *callback)
-{
-        int status, i, remote_err;
-        struct ocfs2_net_wait_ctxt *w = NULL;
-        int dequeued = 0;
-        mlog_entry_void();
-        w = ocfs2_new_net_wait_ctxt(response_id);
-        if (!w) {
-                status = -ENOMEM;
-                mlog_errno(status);
-                goto bail;
-        }
-        w->n_callback = callback;
-        /* we're pretty much ready to go at this point, and this fills
-         * in n_response which we need anyway... */
-        ocfs2_queue_net_wait_ctxt(osb, w);
-        i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
-        while (i != O2NM_INVALID_NODE_NUM) {
-                if (i != osb->node_num) {
-                        mlog(0, "trying to send request to node %i\n", i);
-                        ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
-                        remote_err = 0;
-                        status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
-                                                    osb->net_key,
-                                                    request,
-                                                    sizeof(*request),
-                                                    i,
-                                                    &remote_err);
-                        if (status == -ETIMEDOUT) {
-                                mlog(0, "remote node %d timed out!\n", i);
-                                status = -EAGAIN;
-                                goto bail;
-                        }
-                        if (remote_err < 0) {
-                                status = remote_err;
-                                mlog(0, "remote error %d on node %d!\n",
-                                     remote_err, i);
-                                mlog_errno(status);
-                                goto bail;
-                        }
-                        if (status < 0) {
-                                mlog_errno(status);
-                                goto bail;
-                        }
-                }
-                i++;
-                i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
-                mlog(0, "next is %d, i am %d\n", i, osb->node_num);
-        }
-        mlog(0, "done sending, now waiting on responses...\n");
-        wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
-        ocfs2_dequeue_net_wait_ctxt(osb, w);
-        dequeued = 1;
-        *response = w->n_response;
-        status = 0;
-bail:
-        if (w) {
-                if (!dequeued)
-                        ocfs2_dequeue_net_wait_ctxt(osb, w);
-                kfree(w);
-        }
-        mlog_exit(status);
-        return status;
-}
-static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
-                                                      u64 blkno,
-                                                      unsigned int generation,
-                                                      enum ocfs2_vote_request type)
-{
-        struct ocfs2_vote_msg *request;
-        struct ocfs2_msg_hdr *hdr;
-        BUG_ON(!ocfs2_is_valid_vote_request(type));
-        request = kzalloc(sizeof(*request), GFP_NOFS);
-        if (!request) {
-                mlog_errno(-ENOMEM);
-        } else {
-                hdr = &request->v_hdr;
-                hdr->h_node_num = cpu_to_be32(osb->node_num);
-                hdr->h_request = cpu_to_be32(type);
-                hdr->h_blkno = cpu_to_be64(blkno);
-                hdr->h_generation = cpu_to_be32(generation);
-        }
-        return request;
-}
-/* Complete the buildup of a new vote request and process the
- * broadcast return value. */
-static int ocfs2_do_request_vote(struct ocfs2_super *osb,
-                                 struct ocfs2_vote_msg *request,
-                                 struct ocfs2_net_response_cb *callback)
-{
-        int status, response = -EBUSY;
-        unsigned int response_id;
-        struct ocfs2_msg_hdr *hdr;
-        response_id = ocfs2_new_response_id(osb);
-        hdr = &request->v_hdr;
-        hdr->h_response_id = cpu_to_be32(response_id);
-        status = ocfs2_broadcast_vote(osb, request, response_id, &response,
-                                      callback);
-        if (status < 0) {
-                mlog_errno(status);
-                goto bail;
-        }
-        status = response;
-bail:
-        return status;
-}
-int ocfs2_request_mount_vote(struct ocfs2_super *osb)
-{
-        int status;
-        struct ocfs2_vote_msg *request = NULL;
-        request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
-        if (!request) {
-                status = -ENOMEM;
-                goto bail;
-        }
-        status = -EAGAIN;
-        while (status == -EAGAIN) {
-                if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
-                    signal_pending(current)) {
-                        status = -ERESTARTSYS;
-                        goto bail;
-                }
-                if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
-                                           osb->node_num)) {
-                        status = 0;
-                        goto bail;
-                }
-                status = ocfs2_do_request_vote(osb, request, NULL);
-        }
-bail:
-        kfree(request);
-        return status;
-}
-int ocfs2_request_umount_vote(struct ocfs2_super *osb)
-{
-        int status;
-        struct ocfs2_vote_msg *request = NULL;
-        request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
-        if (!request) {
-                status = -ENOMEM;
-                goto bail;
-        }
-        status = -EAGAIN;
-        while (status == -EAGAIN) {
-                /* Do not check signals on this vote... We really want
-                 * this one to go all the way through. */
-                if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
-                                           osb->node_num)) {
-                        status = 0;
-                        goto bail;
-                }
-                status = ocfs2_do_request_vote(osb, request, NULL);
-        }
-bail:
-        kfree(request);
-        return status;
-}
-/* TODO: This should eventually be a hash table! */
-static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
-                                                               u32 response_id)
-{
-        struct list_head *p;
-        struct ocfs2_net_wait_ctxt *w = NULL;
-        list_for_each(p, &osb->net_response_list) {
-                w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
-                if (response_id == w->n_response_id)
-                        break;
-                w = NULL;
-        }
-        return w;
-}
-/* Translate response codes into local node errno values */
-static inline int ocfs2_translate_response(int response)
-{
-        int ret;
-        switch (response) {
-        case OCFS2_RESPONSE_OK:
-                ret = 0;
-                break;
-        case OCFS2_RESPONSE_BUSY:
-                ret = -EBUSY;
-                break;
-        default:
-                ret = -EINVAL;
-        }
-        return ret;
-}
-static int ocfs2_handle_response_message(struct o2net_msg *msg,
-                                         u32 len,
-                                         void *data, void **ret_data)
-{
-        unsigned int response_id, node_num;
-        int response_status;
-        struct ocfs2_super *osb = data;
-        struct ocfs2_response_msg *resp;
-        struct ocfs2_net_wait_ctxt * w;
-        struct ocfs2_net_response_cb *resp_cb;
-        resp = (struct ocfs2_response_msg *) msg->buf;
-        response_id = be32_to_cpu(resp->r_hdr.h_response_id);
-        node_num = be32_to_cpu(resp->r_hdr.h_node_num);
-        response_status = 
-                ocfs2_translate_response(be32_to_cpu(resp->r_response));
-        mlog(0, "received response message:\n");
-        mlog(0, "h_response_id = %u\n", response_id);
-        mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
-        mlog(0, "h_blkno = %llu\n",
-             (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
-        mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
-        mlog(0, "h_node_num = %u\n", node_num);
-        mlog(0, "r_response = %d\n", response_status);
-        spin_lock(&osb->net_response_lock);
-        w = __ocfs2_find_net_wait_ctxt(osb, response_id);
-        if (!w) {
-                mlog(0, "request not found!\n");
-                goto bail;
-        }
-        resp_cb = w->n_callback;
-        if (response_status && (!w->n_response)) {
-                /* we only really need one negative response so don't
-                 * set it twice. */
-                w->n_response = response_status;
-        }
-        if (resp_cb) {
-                spin_unlock(&osb->net_response_lock);
-                resp_cb->rc_cb(resp_cb->rc_priv, resp);
-                spin_lock(&osb->net_response_lock);
-        }
-        __ocfs2_mark_node_responded(osb, w, node_num);
-bail:
-        spin_unlock(&osb->net_response_lock);
-        return 0;
-}
-static int ocfs2_handle_vote_message(struct o2net_msg *msg,
-                                     u32 len,
-                                     void *data, void **ret_data)
-{
-        int status;
-        struct ocfs2_super *osb = data;
-        struct ocfs2_vote_work *work;
-        work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
-        if (!work) {
-                status = -ENOMEM;
-                mlog_errno(status);
-                goto bail;
-        }
-        INIT_LIST_HEAD(&work->w_list);
-        memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
-        mlog(0, "scheduling vote request:\n");
-        mlog(0, "h_response_id = %u\n",
-             be32_to_cpu(work->w_msg.v_hdr.h_response_id));
-        mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
-        mlog(0, "h_blkno = %llu\n",
-             (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
-        mlog(0, "h_generation = %u\n",
-             be32_to_cpu(work->w_msg.v_hdr.h_generation));
-        mlog(0, "h_node_num = %u\n",
-             be32_to_cpu(work->w_msg.v_hdr.h_node_num));
-        spin_lock(&osb->vote_task_lock);
-        list_add_tail(&work->w_list, &osb->vote_list);
-        osb->vote_count++;
-        spin_unlock(&osb->vote_task_lock);
-        ocfs2_kick_vote_thread(osb);
-        status = 0;
-bail:
-        return status;
-}
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
-{
-        if (!osb->net_key)
-                return;
-        o2net_unregister_handler_list(&osb->osb_net_handlers);
-        if (!list_empty(&osb->net_response_list))
-                mlog(ML_ERROR, "net response list not empty!\n");
-        osb->net_key = 0;
-}
-int ocfs2_register_net_handlers(struct ocfs2_super *osb)
-{
-        int status = 0;
-        if (ocfs2_mount_local(osb))
-                return 0;
-        status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
-                                        osb->net_key,
-                                        sizeof(struct ocfs2_response_msg),
-                                        ocfs2_handle_response_message,
-                                        osb, NULL, &osb->osb_net_handlers);
-        if (status) {
-                mlog_errno(status);
-                goto bail;
-        }
-        status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
-                                        osb->net_key,
-                                        sizeof(struct ocfs2_vote_msg),
-                                        ocfs2_handle_vote_message,
-                                        osb, NULL, &osb->osb_net_handlers);
-        if (status) {
-                mlog_errno(status);
-                goto bail;
-        }
-bail:
-        if (status < 0)
-                ocfs2_unregister_net_handlers(osb);
-        return status;
-}
author	David Woodhouse <dwmw2@infradead.org>	2008-02-03 02:29:41 -0500
committer	David Woodhouse <dwmw2@infradead.org>	2008-02-03 02:30:32 -0500
commit	c1f3ee120bb61045b1c0a3ead620d1d65af47130 (patch)
tree	908430bf2b47fe8e96ac623ae7ab6dd5698d0938 /fs/ocfs2
parent	e619a75ff6201b567a539e787aa9af9bc63a3187 (diff)
parent	9135f1901ee6449dfe338adf6e40e9c2025b8150 (diff)