diff options
Diffstat (limited to 'fs')
163 files changed, 8777 insertions, 4533 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index c509123bea49..028ae38ecc52 100644 --- a/fs/Kconfig +++ b/fs/Kconfig | |||
| @@ -444,6 +444,32 @@ config OCFS2_FS | |||
| 444 | For more information on OCFS2, see the file | 444 | For more information on OCFS2, see the file |
| 445 | <file:Documentation/filesystems/ocfs2.txt>. | 445 | <file:Documentation/filesystems/ocfs2.txt>. |
| 446 | 446 | ||
| 447 | config OCFS2_FS_O2CB | ||
| 448 | tristate "O2CB Kernelspace Clustering" | ||
| 449 | depends on OCFS2_FS | ||
| 450 | default y | ||
| 451 | help | ||
| 452 | OCFS2 includes a simple kernelspace clustering package, the OCFS2 | ||
| 453 | Cluster Base. It only requires a very small userspace component | ||
| 454 | to configure it. This comes with the standard ocfs2-tools package. | ||
| 455 | O2CB is limited to maintaining a cluster for OCFS2 file systems. | ||
| 456 | It cannot manage any other cluster applications. | ||
| 457 | |||
| 458 | It is always safe to say Y here, as the clustering method is | ||
| 459 | run-time selectable. | ||
| 460 | |||
| 461 | config OCFS2_FS_USERSPACE_CLUSTER | ||
| 462 | tristate "OCFS2 Userspace Clustering" | ||
| 463 | depends on OCFS2_FS && DLM | ||
| 464 | default y | ||
| 465 | help | ||
| 466 | This option will allow OCFS2 to use userspace clustering services | ||
| 467 | in conjunction with the DLM in fs/dlm. If you are using a | ||
| 468 | userspace cluster manager, say Y here. | ||
| 469 | |||
| 470 | It is safe to say Y, as the clustering method is run-time | ||
| 471 | selectable. | ||
| 472 | |||
| 447 | config OCFS2_DEBUG_MASKLOG | 473 | config OCFS2_DEBUG_MASKLOG |
| 448 | bool "OCFS2 logging support" | 474 | bool "OCFS2 logging support" |
| 449 | depends on OCFS2_FS | 475 | depends on OCFS2_FS |
diff --git a/fs/afs/main.c b/fs/afs/main.c index 0f60f6b35769..2d3e5d4fb9f7 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c | |||
| @@ -22,7 +22,7 @@ MODULE_LICENSE("GPL"); | |||
| 22 | 22 | ||
| 23 | unsigned afs_debug; | 23 | unsigned afs_debug; |
| 24 | module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO); | 24 | module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO); |
| 25 | MODULE_PARM_DESC(afs_debug, "AFS debugging mask"); | 25 | MODULE_PARM_DESC(debug, "AFS debugging mask"); |
| 26 | 26 | ||
| 27 | static char *rootcell; | 27 | static char *rootcell; |
| 28 | 28 | ||
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 3e8683dbb13f..a99d46f3b26e 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c | |||
| @@ -835,7 +835,7 @@ ext2_xattr_cache_insert(struct buffer_head *bh) | |||
| 835 | struct mb_cache_entry *ce; | 835 | struct mb_cache_entry *ce; |
| 836 | int error; | 836 | int error; |
| 837 | 837 | ||
| 838 | ce = mb_cache_entry_alloc(ext2_xattr_cache); | 838 | ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); |
| 839 | if (!ce) | 839 | if (!ce) |
| 840 | return -ENOMEM; | 840 | return -ENOMEM; |
| 841 | error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); | 841 | error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); |
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index a6ea4d6a8bb2..42856541e9a5 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c | |||
| @@ -1126,7 +1126,7 @@ ext3_xattr_cache_insert(struct buffer_head *bh) | |||
| 1126 | struct mb_cache_entry *ce; | 1126 | struct mb_cache_entry *ce; |
| 1127 | int error; | 1127 | int error; |
| 1128 | 1128 | ||
| 1129 | ce = mb_cache_entry_alloc(ext3_xattr_cache); | 1129 | ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS); |
| 1130 | if (!ce) { | 1130 | if (!ce) { |
| 1131 | ea_bdebug(bh, "out of memory"); | 1131 | ea_bdebug(bh, "out of memory"); |
| 1132 | return; | 1132 | return; |
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d7962139c010..e9054c1c7d93 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
| @@ -1386,7 +1386,7 @@ ext4_xattr_cache_insert(struct buffer_head *bh) | |||
| 1386 | struct mb_cache_entry *ce; | 1386 | struct mb_cache_entry *ce; |
| 1387 | int error; | 1387 | int error; |
| 1388 | 1388 | ||
| 1389 | ce = mb_cache_entry_alloc(ext4_xattr_cache); | 1389 | ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); |
| 1390 | if (!ce) { | 1390 | if (!ce) { |
| 1391 | ea_bdebug(bh, "out of memory"); | 1391 | ea_bdebug(bh, "out of memory"); |
| 1392 | return; | 1392 | return; |
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index de8e64c03f73..7f7947e3dfbb 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | config GFS2_FS | 1 | config GFS2_FS |
| 2 | tristate "GFS2 file system support" | 2 | tristate "GFS2 file system support" |
| 3 | depends on EXPERIMENTAL | 3 | depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) |
| 4 | select FS_POSIX_ACL | 4 | select FS_POSIX_ACL |
| 5 | select CRC32 | 5 | select CRC32 |
| 6 | help | 6 | help |
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 8fff11058cee..e2350df02a07 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | obj-$(CONFIG_GFS2_FS) += gfs2.o | 1 | obj-$(CONFIG_GFS2_FS) += gfs2.o |
| 2 | gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ | 2 | gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ |
| 3 | glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ | 3 | glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ |
| 4 | mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ | 4 | mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ |
| 5 | ops_fstype.o ops_inode.o ops_super.o quota.o \ | 5 | ops_fstype.o ops_inode.o ops_super.o quota.o \ |
| 6 | recovery.o rgrp.o super.o sys.o trans.o util.o | 6 | recovery.o rgrp.o super.o sys.o trans.o util.o |
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1047a8c7226a..3e9bd46f27e3 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c | |||
| @@ -116,7 +116,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, | |||
| 116 | goto out; | 116 | goto out; |
| 117 | 117 | ||
| 118 | er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); | 118 | er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); |
| 119 | er.er_data = kmalloc(er.er_data_len, GFP_KERNEL); | 119 | er.er_data = kmalloc(er.er_data_len, GFP_NOFS); |
| 120 | error = -ENOMEM; | 120 | error = -ENOMEM; |
| 121 | if (!er.er_data) | 121 | if (!er.er_data) |
| 122 | goto out; | 122 | goto out; |
| @@ -222,7 +222,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) | |||
| 222 | return error; | 222 | return error; |
| 223 | } | 223 | } |
| 224 | 224 | ||
| 225 | clone = posix_acl_clone(acl, GFP_KERNEL); | 225 | clone = posix_acl_clone(acl, GFP_NOFS); |
| 226 | error = -ENOMEM; | 226 | error = -ENOMEM; |
| 227 | if (!clone) | 227 | if (!clone) |
| 228 | goto out; | 228 | goto out; |
| @@ -272,7 +272,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) | |||
| 272 | if (!acl) | 272 | if (!acl) |
| 273 | return gfs2_setattr_simple(ip, attr); | 273 | return gfs2_setattr_simple(ip, attr); |
| 274 | 274 | ||
| 275 | clone = posix_acl_clone(acl, GFP_KERNEL); | 275 | clone = posix_acl_clone(acl, GFP_NOFS); |
| 276 | error = -ENOMEM; | 276 | error = -ENOMEM; |
| 277 | if (!clone) | 277 | if (!clone) |
| 278 | goto out; | 278 | goto out; |
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e9456ebd3bb6..c19184f2e70e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | * keep it small. | 33 | * keep it small. |
| 34 | */ | 34 | */ |
| 35 | struct metapath { | 35 | struct metapath { |
| 36 | struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; | ||
| 36 | __u16 mp_list[GFS2_MAX_META_HEIGHT]; | 37 | __u16 mp_list[GFS2_MAX_META_HEIGHT]; |
| 37 | }; | 38 | }; |
| 38 | 39 | ||
| @@ -135,9 +136,10 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) | |||
| 135 | /* Get a free block, fill it with the stuffed data, | 136 | /* Get a free block, fill it with the stuffed data, |
| 136 | and write it out to disk */ | 137 | and write it out to disk */ |
| 137 | 138 | ||
| 139 | unsigned int n = 1; | ||
| 140 | block = gfs2_alloc_block(ip, &n); | ||
| 138 | if (isdir) { | 141 | if (isdir) { |
| 139 | block = gfs2_alloc_meta(ip); | 142 | gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); |
| 140 | |||
| 141 | error = gfs2_dir_get_new_buffer(ip, block, &bh); | 143 | error = gfs2_dir_get_new_buffer(ip, block, &bh); |
| 142 | if (error) | 144 | if (error) |
| 143 | goto out_brelse; | 145 | goto out_brelse; |
| @@ -145,8 +147,6 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) | |||
| 145 | dibh, sizeof(struct gfs2_dinode)); | 147 | dibh, sizeof(struct gfs2_dinode)); |
| 146 | brelse(bh); | 148 | brelse(bh); |
| 147 | } else { | 149 | } else { |
| 148 | block = gfs2_alloc_data(ip); | ||
| 149 | |||
| 150 | error = gfs2_unstuffer_page(ip, dibh, block, page); | 150 | error = gfs2_unstuffer_page(ip, dibh, block, page); |
| 151 | if (error) | 151 | if (error) |
| 152 | goto out_brelse; | 152 | goto out_brelse; |
| @@ -161,12 +161,11 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) | |||
| 161 | 161 | ||
| 162 | if (ip->i_di.di_size) { | 162 | if (ip->i_di.di_size) { |
| 163 | *(__be64 *)(di + 1) = cpu_to_be64(block); | 163 | *(__be64 *)(di + 1) = cpu_to_be64(block); |
| 164 | ip->i_di.di_blocks++; | 164 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
| 165 | gfs2_set_inode_blocks(&ip->i_inode); | 165 | di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); |
| 166 | di->di_blocks = cpu_to_be64(ip->i_di.di_blocks); | ||
| 167 | } | 166 | } |
| 168 | 167 | ||
| 169 | ip->i_di.di_height = 1; | 168 | ip->i_height = 1; |
| 170 | di->di_height = cpu_to_be16(1); | 169 | di->di_height = cpu_to_be16(1); |
| 171 | 170 | ||
| 172 | out_brelse: | 171 | out_brelse: |
| @@ -176,114 +175,13 @@ out: | |||
| 176 | return error; | 175 | return error; |
| 177 | } | 176 | } |
| 178 | 177 | ||
| 179 | /** | ||
| 180 | * calc_tree_height - Calculate the height of a metadata tree | ||
| 181 | * @ip: The GFS2 inode | ||
| 182 | * @size: The proposed size of the file | ||
| 183 | * | ||
| 184 | * Work out how tall a metadata tree needs to be in order to accommodate a | ||
| 185 | * file of a particular size. If size is less than the current size of | ||
| 186 | * the inode, then the current size of the inode is used instead of the | ||
| 187 | * supplied one. | ||
| 188 | * | ||
| 189 | * Returns: the height the tree should be | ||
| 190 | */ | ||
| 191 | |||
| 192 | static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size) | ||
| 193 | { | ||
| 194 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
| 195 | u64 *arr; | ||
| 196 | unsigned int max, height; | ||
| 197 | |||
| 198 | if (ip->i_di.di_size > size) | ||
| 199 | size = ip->i_di.di_size; | ||
| 200 | |||
| 201 | if (gfs2_is_dir(ip)) { | ||
| 202 | arr = sdp->sd_jheightsize; | ||
| 203 | max = sdp->sd_max_jheight; | ||
| 204 | } else { | ||
| 205 | arr = sdp->sd_heightsize; | ||
| 206 | max = sdp->sd_max_height; | ||
| 207 | } | ||
| 208 | |||
| 209 | for (height = 0; height < max; height++) | ||
| 210 | if (arr[height] >= size) | ||
| 211 | break; | ||
| 212 | |||
| 213 | return height; | ||
| 214 | } | ||
| 215 | |||
| 216 | /** | ||
| 217 | * build_height - Build a metadata tree of the requested height | ||
| 218 | * @ip: The GFS2 inode | ||
| 219 | * @height: The height to build to | ||
| 220 | * | ||
| 221 | * | ||
| 222 | * Returns: errno | ||
| 223 | */ | ||
| 224 | |||
| 225 | static int build_height(struct inode *inode, unsigned height) | ||
| 226 | { | ||
| 227 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 228 | unsigned new_height = height - ip->i_di.di_height; | ||
| 229 | struct buffer_head *dibh; | ||
| 230 | struct buffer_head *blocks[GFS2_MAX_META_HEIGHT]; | ||
| 231 | struct gfs2_dinode *di; | ||
| 232 | int error; | ||
| 233 | __be64 *bp; | ||
| 234 | u64 bn; | ||
| 235 | unsigned n; | ||
| 236 | |||
| 237 | if (height <= ip->i_di.di_height) | ||
| 238 | return 0; | ||
| 239 | |||
| 240 | error = gfs2_meta_inode_buffer(ip, &dibh); | ||
| 241 | if (error) | ||
| 242 | return error; | ||
| 243 | |||
| 244 | for(n = 0; n < new_height; n++) { | ||
| 245 | bn = gfs2_alloc_meta(ip); | ||
| 246 | blocks[n] = gfs2_meta_new(ip->i_gl, bn); | ||
| 247 | gfs2_trans_add_bh(ip->i_gl, blocks[n], 1); | ||
| 248 | } | ||
| 249 | |||
| 250 | n = 0; | ||
| 251 | bn = blocks[0]->b_blocknr; | ||
| 252 | if (new_height > 1) { | ||
| 253 | for(; n < new_height-1; n++) { | ||
| 254 | gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, | ||
| 255 | GFS2_FORMAT_IN); | ||
| 256 | gfs2_buffer_clear_tail(blocks[n], | ||
| 257 | sizeof(struct gfs2_meta_header)); | ||
| 258 | bp = (__be64 *)(blocks[n]->b_data + | ||
| 259 | sizeof(struct gfs2_meta_header)); | ||
| 260 | *bp = cpu_to_be64(blocks[n+1]->b_blocknr); | ||
| 261 | brelse(blocks[n]); | ||
| 262 | blocks[n] = NULL; | ||
| 263 | } | ||
| 264 | } | ||
| 265 | gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN); | ||
| 266 | gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header), | ||
| 267 | dibh, sizeof(struct gfs2_dinode)); | ||
| 268 | brelse(blocks[n]); | ||
| 269 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
| 270 | di = (struct gfs2_dinode *)dibh->b_data; | ||
| 271 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | ||
| 272 | *(__be64 *)(di + 1) = cpu_to_be64(bn); | ||
| 273 | ip->i_di.di_height += new_height; | ||
| 274 | ip->i_di.di_blocks += new_height; | ||
| 275 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 276 | di->di_height = cpu_to_be16(ip->i_di.di_height); | ||
| 277 | di->di_blocks = cpu_to_be64(ip->i_di.di_blocks); | ||
| 278 | brelse(dibh); | ||
| 279 | return error; | ||
| 280 | } | ||
| 281 | 178 | ||
| 282 | /** | 179 | /** |
| 283 | * find_metapath - Find path through the metadata tree | 180 | * find_metapath - Find path through the metadata tree |
| 284 | * @ip: The inode pointer | 181 | * @sdp: The superblock |
| 285 | * @mp: The metapath to return the result in | 182 | * @mp: The metapath to return the result in |
| 286 | * @block: The disk block to look up | 183 | * @block: The disk block to look up |
| 184 | * @height: The pre-calculated height of the metadata tree | ||
| 287 | * | 185 | * |
| 288 | * This routine returns a struct metapath structure that defines a path | 186 | * This routine returns a struct metapath structure that defines a path |
| 289 | * through the metadata of inode "ip" to get to block "block". | 187 | * through the metadata of inode "ip" to get to block "block". |
| @@ -338,21 +236,29 @@ static int build_height(struct inode *inode, unsigned height) | |||
| 338 | * | 236 | * |
| 339 | */ | 237 | */ |
| 340 | 238 | ||
| 341 | static void find_metapath(struct gfs2_inode *ip, u64 block, | 239 | static void find_metapath(const struct gfs2_sbd *sdp, u64 block, |
| 342 | struct metapath *mp) | 240 | struct metapath *mp, unsigned int height) |
| 343 | { | 241 | { |
| 344 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
| 345 | u64 b = block; | ||
| 346 | unsigned int i; | 242 | unsigned int i; |
| 347 | 243 | ||
| 348 | for (i = ip->i_di.di_height; i--;) | 244 | for (i = height; i--;) |
| 349 | mp->mp_list[i] = do_div(b, sdp->sd_inptrs); | 245 | mp->mp_list[i] = do_div(block, sdp->sd_inptrs); |
| 246 | |||
| 247 | } | ||
| 350 | 248 | ||
| 249 | static inline unsigned int zero_metapath_length(const struct metapath *mp, | ||
| 250 | unsigned height) | ||
| 251 | { | ||
| 252 | unsigned int i; | ||
| 253 | for (i = 0; i < height - 1; i++) { | ||
| 254 | if (mp->mp_list[i] != 0) | ||
| 255 | return i; | ||
| 256 | } | ||
| 257 | return height; | ||
| 351 | } | 258 | } |
| 352 | 259 | ||
| 353 | /** | 260 | /** |
| 354 | * metapointer - Return pointer to start of metadata in a buffer | 261 | * metapointer - Return pointer to start of metadata in a buffer |
| 355 | * @bh: The buffer | ||
| 356 | * @height: The metadata height (0 = dinode) | 262 | * @height: The metadata height (0 = dinode) |
| 357 | * @mp: The metapath | 263 | * @mp: The metapath |
| 358 | * | 264 | * |
| @@ -361,93 +267,302 @@ static void find_metapath(struct gfs2_inode *ip, u64 block, | |||
| 361 | * metadata tree. | 267 | * metadata tree. |
| 362 | */ | 268 | */ |
| 363 | 269 | ||
| 364 | static inline __be64 *metapointer(struct buffer_head *bh, int *boundary, | 270 | static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) |
| 365 | unsigned int height, const struct metapath *mp) | ||
| 366 | { | 271 | { |
| 272 | struct buffer_head *bh = mp->mp_bh[height]; | ||
| 367 | unsigned int head_size = (height > 0) ? | 273 | unsigned int head_size = (height > 0) ? |
| 368 | sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); | 274 | sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); |
| 369 | __be64 *ptr; | 275 | return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; |
| 370 | *boundary = 0; | ||
| 371 | ptr = ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; | ||
| 372 | if (ptr + 1 == (__be64 *)(bh->b_data + bh->b_size)) | ||
| 373 | *boundary = 1; | ||
| 374 | return ptr; | ||
| 375 | } | 276 | } |
| 376 | 277 | ||
| 377 | /** | 278 | /** |
| 378 | * lookup_block - Get the next metadata block in metadata tree | 279 | * lookup_metapath - Walk the metadata tree to a specific point |
| 379 | * @ip: The GFS2 inode | 280 | * @ip: The inode |
| 380 | * @bh: Buffer containing the pointers to metadata blocks | ||
| 381 | * @height: The height of the tree (0 = dinode) | ||
| 382 | * @mp: The metapath | 281 | * @mp: The metapath |
| 383 | * @create: Non-zero if we may create a new meatdata block | ||
| 384 | * @new: Used to indicate if we did create a new metadata block | ||
| 385 | * @block: the returned disk block number | ||
| 386 | * | 282 | * |
| 387 | * Given a metatree, complete to a particular height, checks to see if the next | 283 | * Assumes that the inode's buffer has already been looked up and |
| 388 | * height of the tree exists. If not the next height of the tree is created. | 284 | * hooked onto mp->mp_bh[0] and that the metapath has been initialised |
| 389 | * The block number of the next height of the metadata tree is returned. | 285 | * by find_metapath(). |
| 286 | * | ||
| 287 | * If this function encounters part of the tree which has not been | ||
| 288 | * allocated, it returns the current height of the tree at the point | ||
| 289 | * at which it found the unallocated block. Blocks which are found are | ||
| 290 | * added to the mp->mp_bh[] list. | ||
| 390 | * | 291 | * |
| 292 | * Returns: error or height of metadata tree | ||
| 391 | */ | 293 | */ |
| 392 | 294 | ||
| 393 | static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh, | 295 | static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) |
| 394 | unsigned int height, struct metapath *mp, int create, | ||
| 395 | int *new, u64 *block) | ||
| 396 | { | 296 | { |
| 397 | int boundary; | 297 | unsigned int end_of_metadata = ip->i_height - 1; |
| 398 | __be64 *ptr = metapointer(bh, &boundary, height, mp); | 298 | unsigned int x; |
| 299 | __be64 *ptr; | ||
| 300 | u64 dblock; | ||
| 301 | int ret; | ||
| 399 | 302 | ||
| 400 | if (*ptr) { | 303 | for (x = 0; x < end_of_metadata; x++) { |
| 401 | *block = be64_to_cpu(*ptr); | 304 | ptr = metapointer(x, mp); |
| 402 | return boundary; | 305 | dblock = be64_to_cpu(*ptr); |
| 403 | } | 306 | if (!dblock) |
| 307 | return x + 1; | ||
| 404 | 308 | ||
| 405 | *block = 0; | 309 | ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]); |
| 310 | if (ret) | ||
| 311 | return ret; | ||
| 312 | } | ||
| 406 | 313 | ||
| 407 | if (!create) | 314 | return ip->i_height; |
| 408 | return 0; | 315 | } |
| 409 | 316 | ||
| 410 | if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip)) | 317 | static inline void release_metapath(struct metapath *mp) |
| 411 | *block = gfs2_alloc_data(ip); | 318 | { |
| 412 | else | 319 | int i; |
| 413 | *block = gfs2_alloc_meta(ip); | ||
| 414 | 320 | ||
| 415 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 321 | for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { |
| 322 | if (mp->mp_bh[i] == NULL) | ||
| 323 | break; | ||
| 324 | brelse(mp->mp_bh[i]); | ||
| 325 | } | ||
| 326 | } | ||
| 416 | 327 | ||
| 417 | *ptr = cpu_to_be64(*block); | 328 | /** |
| 418 | ip->i_di.di_blocks++; | 329 | * gfs2_extent_length - Returns length of an extent of blocks |
| 419 | gfs2_set_inode_blocks(&ip->i_inode); | 330 | * @start: Start of the buffer |
| 331 | * @len: Length of the buffer in bytes | ||
| 332 | * @ptr: Current position in the buffer | ||
| 333 | * @limit: Max extent length to return (0 = unlimited) | ||
| 334 | * @eob: Set to 1 if we hit "end of block" | ||
| 335 | * | ||
| 336 | * If the first block is zero (unallocated) it will return the number of | ||
| 337 | * unallocated blocks in the extent, otherwise it will return the number | ||
| 338 | * of contiguous blocks in the extent. | ||
| 339 | * | ||
| 340 | * Returns: The length of the extent (minimum of one block) | ||
| 341 | */ | ||
| 420 | 342 | ||
| 421 | *new = 1; | 343 | static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) |
| 422 | return 0; | 344 | { |
| 345 | const __be64 *end = (start + len); | ||
| 346 | const __be64 *first = ptr; | ||
| 347 | u64 d = be64_to_cpu(*ptr); | ||
| 348 | |||
| 349 | *eob = 0; | ||
| 350 | do { | ||
| 351 | ptr++; | ||
| 352 | if (ptr >= end) | ||
| 353 | break; | ||
| 354 | if (limit && --limit == 0) | ||
| 355 | break; | ||
| 356 | if (d) | ||
| 357 | d++; | ||
| 358 | } while(be64_to_cpu(*ptr) == d); | ||
| 359 | if (ptr >= end) | ||
| 360 | *eob = 1; | ||
| 361 | return (ptr - first); | ||
| 423 | } | 362 | } |
| 424 | 363 | ||
| 425 | static inline void bmap_lock(struct inode *inode, int create) | 364 | static inline void bmap_lock(struct gfs2_inode *ip, int create) |
| 426 | { | 365 | { |
| 427 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 428 | if (create) | 366 | if (create) |
| 429 | down_write(&ip->i_rw_mutex); | 367 | down_write(&ip->i_rw_mutex); |
| 430 | else | 368 | else |
| 431 | down_read(&ip->i_rw_mutex); | 369 | down_read(&ip->i_rw_mutex); |
| 432 | } | 370 | } |
| 433 | 371 | ||
| 434 | static inline void bmap_unlock(struct inode *inode, int create) | 372 | static inline void bmap_unlock(struct gfs2_inode *ip, int create) |
| 435 | { | 373 | { |
| 436 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 437 | if (create) | 374 | if (create) |
| 438 | up_write(&ip->i_rw_mutex); | 375 | up_write(&ip->i_rw_mutex); |
| 439 | else | 376 | else |
| 440 | up_read(&ip->i_rw_mutex); | 377 | up_read(&ip->i_rw_mutex); |
| 441 | } | 378 | } |
| 442 | 379 | ||
| 380 | static inline __be64 *gfs2_indirect_init(struct metapath *mp, | ||
| 381 | struct gfs2_glock *gl, unsigned int i, | ||
| 382 | unsigned offset, u64 bn) | ||
| 383 | { | ||
| 384 | __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + | ||
| 385 | ((i > 1) ? sizeof(struct gfs2_meta_header) : | ||
| 386 | sizeof(struct gfs2_dinode))); | ||
| 387 | BUG_ON(i < 1); | ||
| 388 | BUG_ON(mp->mp_bh[i] != NULL); | ||
| 389 | mp->mp_bh[i] = gfs2_meta_new(gl, bn); | ||
| 390 | gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); | ||
| 391 | gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); | ||
| 392 | gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); | ||
| 393 | ptr += offset; | ||
| 394 | *ptr = cpu_to_be64(bn); | ||
| 395 | return ptr; | ||
| 396 | } | ||
| 397 | |||
| 398 | enum alloc_state { | ||
| 399 | ALLOC_DATA = 0, | ||
| 400 | ALLOC_GROW_DEPTH = 1, | ||
| 401 | ALLOC_GROW_HEIGHT = 2, | ||
| 402 | /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ | ||
| 403 | }; | ||
| 404 | |||
| 405 | /** | ||
| 406 | * gfs2_bmap_alloc - Build a metadata tree of the requested height | ||
| 407 | * @inode: The GFS2 inode | ||
| 408 | * @lblock: The logical starting block of the extent | ||
| 409 | * @bh_map: This is used to return the mapping details | ||
| 410 | * @mp: The metapath | ||
| 411 | * @sheight: The starting height (i.e. whats already mapped) | ||
| 412 | * @height: The height to build to | ||
| 413 | * @maxlen: The max number of data blocks to alloc | ||
| 414 | * | ||
| 415 | * In this routine we may have to alloc: | ||
| 416 | * i) Indirect blocks to grow the metadata tree height | ||
| 417 | * ii) Indirect blocks to fill in lower part of the metadata tree | ||
| 418 | * iii) Data blocks | ||
| 419 | * | ||
| 420 | * The function is in two parts. The first part works out the total | ||
| 421 | * number of blocks which we need. The second part does the actual | ||
| 422 | * allocation asking for an extent at a time (if enough contiguous free | ||
| 423 | * blocks are available, there will only be one request per bmap call) | ||
| 424 | * and uses the state machine to initialise the blocks in order. | ||
| 425 | * | ||
| 426 | * Returns: errno on error | ||
| 427 | */ | ||
| 428 | |||
| 429 | static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | ||
| 430 | struct buffer_head *bh_map, struct metapath *mp, | ||
| 431 | const unsigned int sheight, | ||
| 432 | const unsigned int height, | ||
| 433 | const unsigned int maxlen) | ||
| 434 | { | ||
| 435 | struct gfs2_inode *ip = GFS2_I(inode); | ||
| 436 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
| 437 | struct buffer_head *dibh = mp->mp_bh[0]; | ||
| 438 | u64 bn, dblock = 0; | ||
| 439 | unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; | ||
| 440 | unsigned dblks = 0; | ||
| 441 | unsigned ptrs_per_blk; | ||
| 442 | const unsigned end_of_metadata = height - 1; | ||
| 443 | int eob = 0; | ||
| 444 | enum alloc_state state; | ||
| 445 | __be64 *ptr; | ||
| 446 | __be64 zero_bn = 0; | ||
| 447 | |||
| 448 | BUG_ON(sheight < 1); | ||
| 449 | BUG_ON(dibh == NULL); | ||
| 450 | |||
| 451 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
| 452 | |||
| 453 | if (height == sheight) { | ||
| 454 | struct buffer_head *bh; | ||
| 455 | /* Bottom indirect block exists, find unalloced extent size */ | ||
| 456 | ptr = metapointer(end_of_metadata, mp); | ||
| 457 | bh = mp->mp_bh[end_of_metadata]; | ||
| 458 | dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, | ||
| 459 | &eob); | ||
| 460 | BUG_ON(dblks < 1); | ||
| 461 | state = ALLOC_DATA; | ||
| 462 | } else { | ||
| 463 | /* Need to allocate indirect blocks */ | ||
| 464 | ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; | ||
| 465 | dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); | ||
| 466 | if (height == ip->i_height) { | ||
| 467 | /* Writing into existing tree, extend tree down */ | ||
| 468 | iblks = height - sheight; | ||
| 469 | state = ALLOC_GROW_DEPTH; | ||
| 470 | } else { | ||
| 471 | /* Building up tree height */ | ||
| 472 | state = ALLOC_GROW_HEIGHT; | ||
| 473 | iblks = height - ip->i_height; | ||
| 474 | zmpl = zero_metapath_length(mp, height); | ||
| 475 | iblks -= zmpl; | ||
| 476 | iblks += height; | ||
| 477 | } | ||
| 478 | } | ||
| 479 | |||
| 480 | /* start of the second part of the function (state machine) */ | ||
| 481 | |||
| 482 | blks = dblks + iblks; | ||
| 483 | i = sheight; | ||
| 484 | do { | ||
| 485 | n = blks - alloced; | ||
| 486 | bn = gfs2_alloc_block(ip, &n); | ||
| 487 | alloced += n; | ||
| 488 | if (state != ALLOC_DATA || gfs2_is_jdata(ip)) | ||
| 489 | gfs2_trans_add_unrevoke(sdp, bn, n); | ||
| 490 | switch (state) { | ||
| 491 | /* Growing height of tree */ | ||
| 492 | case ALLOC_GROW_HEIGHT: | ||
| 493 | if (i == 1) { | ||
| 494 | ptr = (__be64 *)(dibh->b_data + | ||
| 495 | sizeof(struct gfs2_dinode)); | ||
| 496 | zero_bn = *ptr; | ||
| 497 | } | ||
| 498 | for (; i - 1 < height - ip->i_height && n > 0; i++, n--) | ||
| 499 | gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); | ||
| 500 | if (i - 1 == height - ip->i_height) { | ||
| 501 | i--; | ||
| 502 | gfs2_buffer_copy_tail(mp->mp_bh[i], | ||
| 503 | sizeof(struct gfs2_meta_header), | ||
| 504 | dibh, sizeof(struct gfs2_dinode)); | ||
| 505 | gfs2_buffer_clear_tail(dibh, | ||
| 506 | sizeof(struct gfs2_dinode) + | ||
| 507 | sizeof(__be64)); | ||
| 508 | ptr = (__be64 *)(mp->mp_bh[i]->b_data + | ||
| 509 | sizeof(struct gfs2_meta_header)); | ||
| 510 | *ptr = zero_bn; | ||
| 511 | state = ALLOC_GROW_DEPTH; | ||
| 512 | for(i = zmpl; i < height; i++) { | ||
| 513 | if (mp->mp_bh[i] == NULL) | ||
| 514 | break; | ||
| 515 | brelse(mp->mp_bh[i]); | ||
| 516 | mp->mp_bh[i] = NULL; | ||
| 517 | } | ||
| 518 | i = zmpl; | ||
| 519 | } | ||
| 520 | if (n == 0) | ||
| 521 | break; | ||
| 522 | /* Branching from existing tree */ | ||
| 523 | case ALLOC_GROW_DEPTH: | ||
| 524 | if (i > 1 && i < height) | ||
| 525 | gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); | ||
| 526 | for (; i < height && n > 0; i++, n--) | ||
| 527 | gfs2_indirect_init(mp, ip->i_gl, i, | ||
| 528 | mp->mp_list[i-1], bn++); | ||
| 529 | if (i == height) | ||
| 530 | state = ALLOC_DATA; | ||
| 531 | if (n == 0) | ||
| 532 | break; | ||
| 533 | /* Tree complete, adding data blocks */ | ||
| 534 | case ALLOC_DATA: | ||
| 535 | BUG_ON(n > dblks); | ||
| 536 | BUG_ON(mp->mp_bh[end_of_metadata] == NULL); | ||
| 537 | gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); | ||
| 538 | dblks = n; | ||
| 539 | ptr = metapointer(end_of_metadata, mp); | ||
| 540 | dblock = bn; | ||
| 541 | while (n-- > 0) | ||
| 542 | *ptr++ = cpu_to_be64(bn++); | ||
| 543 | break; | ||
| 544 | } | ||
| 545 | } while (state != ALLOC_DATA); | ||
| 546 | |||
| 547 | ip->i_height = height; | ||
| 548 | gfs2_add_inode_blocks(&ip->i_inode, alloced); | ||
| 549 | gfs2_dinode_out(ip, mp->mp_bh[0]->b_data); | ||
| 550 | map_bh(bh_map, inode->i_sb, dblock); | ||
| 551 | bh_map->b_size = dblks << inode->i_blkbits; | ||
| 552 | set_buffer_new(bh_map); | ||
| 553 | return 0; | ||
| 554 | } | ||
| 555 | |||
| 443 | /** | 556 | /** |
| 444 | * gfs2_block_map - Map a block from an inode to a disk block | 557 | * gfs2_block_map - Map a block from an inode to a disk block |
| 445 | * @inode: The inode | 558 | * @inode: The inode |
| 446 | * @lblock: The logical block number | 559 | * @lblock: The logical block number |
| 447 | * @bh_map: The bh to be mapped | 560 | * @bh_map: The bh to be mapped |
| 561 | * @create: True if its ok to alloc blocks to satify the request | ||
| 448 | * | 562 | * |
| 449 | * Find the block number on the current device which corresponds to an | 563 | * Sets buffer_mapped() if successful, sets buffer_boundary() if a |
| 450 | * inode's block. If the block had to be created, "new" will be set. | 564 | * read of metadata will be required before the next block can be |
| 565 | * mapped. Sets buffer_new() if new blocks were allocated. | ||
| 451 | * | 566 | * |
| 452 | * Returns: errno | 567 | * Returns: errno |
| 453 | */ | 568 | */ |
| @@ -457,97 +572,78 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, | |||
| 457 | { | 572 | { |
| 458 | struct gfs2_inode *ip = GFS2_I(inode); | 573 | struct gfs2_inode *ip = GFS2_I(inode); |
| 459 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 574 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
| 460 | struct buffer_head *bh; | 575 | unsigned int bsize = sdp->sd_sb.sb_bsize; |
| 461 | unsigned int bsize; | 576 | const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; |
| 462 | unsigned int height; | 577 | const u64 *arr = sdp->sd_heightsize; |
| 463 | unsigned int end_of_metadata; | 578 | __be64 *ptr; |
| 464 | unsigned int x; | ||
| 465 | int error = 0; | ||
| 466 | int new = 0; | ||
| 467 | u64 dblock = 0; | ||
| 468 | int boundary; | ||
| 469 | unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; | ||
| 470 | struct metapath mp; | ||
| 471 | u64 size; | 579 | u64 size; |
| 472 | struct buffer_head *dibh = NULL; | 580 | struct metapath mp; |
| 581 | int ret; | ||
| 582 | int eob; | ||
| 583 | unsigned int len; | ||
| 584 | struct buffer_head *bh; | ||
| 585 | u8 height; | ||
| 473 | 586 | ||
| 474 | BUG_ON(maxlen == 0); | 587 | BUG_ON(maxlen == 0); |
| 475 | 588 | ||
| 476 | if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip))) | 589 | memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); |
| 477 | return 0; | 590 | bmap_lock(ip, create); |
| 478 | |||
| 479 | bmap_lock(inode, create); | ||
| 480 | clear_buffer_mapped(bh_map); | 591 | clear_buffer_mapped(bh_map); |
| 481 | clear_buffer_new(bh_map); | 592 | clear_buffer_new(bh_map); |
| 482 | clear_buffer_boundary(bh_map); | 593 | clear_buffer_boundary(bh_map); |
| 483 | bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize; | 594 | if (gfs2_is_dir(ip)) { |
| 484 | size = (lblock + 1) * bsize; | 595 | bsize = sdp->sd_jbsize; |
| 485 | 596 | arr = sdp->sd_jheightsize; | |
| 486 | if (size > ip->i_di.di_size) { | ||
| 487 | height = calc_tree_height(ip, size); | ||
| 488 | if (ip->i_di.di_height < height) { | ||
| 489 | if (!create) | ||
| 490 | goto out_ok; | ||
| 491 | |||
| 492 | error = build_height(inode, height); | ||
| 493 | if (error) | ||
| 494 | goto out_fail; | ||
| 495 | } | ||
| 496 | } | 597 | } |
| 497 | 598 | ||
| 498 | find_metapath(ip, lblock, &mp); | 599 | ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]); |
| 499 | end_of_metadata = ip->i_di.di_height - 1; | 600 | if (ret) |
| 500 | error = gfs2_meta_inode_buffer(ip, &bh); | 601 | goto out; |
| 501 | if (error) | ||
| 502 | goto out_fail; | ||
| 503 | dibh = bh; | ||
| 504 | get_bh(dibh); | ||
| 505 | 602 | ||
| 506 | for (x = 0; x < end_of_metadata; x++) { | 603 | height = ip->i_height; |
| 507 | lookup_block(ip, bh, x, &mp, create, &new, &dblock); | 604 | size = (lblock + 1) * bsize; |
| 508 | brelse(bh); | 605 | while (size > arr[height]) |
| 509 | if (!dblock) | 606 | height++; |
| 510 | goto out_ok; | 607 | find_metapath(sdp, lblock, &mp, height); |
| 608 | ret = 1; | ||
| 609 | if (height > ip->i_height || gfs2_is_stuffed(ip)) | ||
| 610 | goto do_alloc; | ||
| 611 | ret = lookup_metapath(ip, &mp); | ||
| 612 | if (ret < 0) | ||
| 613 | goto out; | ||
| 614 | if (ret != ip->i_height) | ||
| 615 | goto do_alloc; | ||
| 616 | ptr = metapointer(ip->i_height - 1, &mp); | ||
| 617 | if (*ptr == 0) | ||
| 618 | goto do_alloc; | ||
| 619 | map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr)); | ||
| 620 | bh = mp.mp_bh[ip->i_height - 1]; | ||
| 621 | len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob); | ||
| 622 | bh_map->b_size = (len << inode->i_blkbits); | ||
| 623 | if (eob) | ||
| 624 | set_buffer_boundary(bh_map); | ||
| 625 | ret = 0; | ||
| 626 | out: | ||
| 627 | release_metapath(&mp); | ||
| 628 | bmap_unlock(ip, create); | ||
| 629 | return ret; | ||
| 511 | 630 | ||
| 512 | error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh); | 631 | do_alloc: |
| 513 | if (error) | 632 | /* All allocations are done here, firstly check create flag */ |
| 514 | goto out_fail; | 633 | if (!create) { |
| 634 | BUG_ON(gfs2_is_stuffed(ip)); | ||
| 635 | ret = 0; | ||
| 636 | goto out; | ||
| 515 | } | 637 | } |
| 516 | 638 | ||
| 517 | boundary = lookup_block(ip, bh, end_of_metadata, &mp, create, &new, &dblock); | 639 | /* At this point ret is the tree depth of already allocated blocks */ |
| 518 | if (dblock) { | 640 | ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen); |
| 519 | map_bh(bh_map, inode->i_sb, dblock); | 641 | goto out; |
| 520 | if (boundary) | ||
| 521 | set_buffer_boundary(bh_map); | ||
| 522 | if (new) { | ||
| 523 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
| 524 | gfs2_dinode_out(ip, dibh->b_data); | ||
| 525 | set_buffer_new(bh_map); | ||
| 526 | goto out_brelse; | ||
| 527 | } | ||
| 528 | while(--maxlen && !buffer_boundary(bh_map)) { | ||
| 529 | u64 eblock; | ||
| 530 | |||
| 531 | mp.mp_list[end_of_metadata]++; | ||
| 532 | boundary = lookup_block(ip, bh, end_of_metadata, &mp, 0, &new, &eblock); | ||
| 533 | if (eblock != ++dblock) | ||
| 534 | break; | ||
| 535 | bh_map->b_size += (1 << inode->i_blkbits); | ||
| 536 | if (boundary) | ||
| 537 | set_buffer_boundary(bh_map); | ||
| 538 | } | ||
| 539 | } | ||
| 540 | out_brelse: | ||
| 541 | brelse(bh); | ||
| 542 | out_ok: | ||
| 543 | error = 0; | ||
| 544 | out_fail: | ||
| 545 | if (dibh) | ||
| 546 | brelse(dibh); | ||
| 547 | bmap_unlock(inode, create); | ||
| 548 | return error; | ||
| 549 | } | 642 | } |
| 550 | 643 | ||
| 644 | /* | ||
| 645 | * Deprecated: do not use in new code | ||
| 646 | */ | ||
| 551 | int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) | 647 | int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) |
| 552 | { | 648 | { |
| 553 | struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 }; | 649 | struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 }; |
| @@ -558,7 +654,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi | |||
| 558 | BUG_ON(!dblock); | 654 | BUG_ON(!dblock); |
| 559 | BUG_ON(!new); | 655 | BUG_ON(!new); |
| 560 | 656 | ||
| 561 | bh.b_size = 1 << (inode->i_blkbits + 5); | 657 | bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); |
| 562 | ret = gfs2_block_map(inode, lblock, &bh, create); | 658 | ret = gfs2_block_map(inode, lblock, &bh, create); |
| 563 | *extlen = bh.b_size >> inode->i_blkbits; | 659 | *extlen = bh.b_size >> inode->i_blkbits; |
| 564 | *dblock = bh.b_blocknr; | 660 | *dblock = bh.b_blocknr; |
| @@ -621,7 +717,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
| 621 | if (error) | 717 | if (error) |
| 622 | goto out; | 718 | goto out; |
| 623 | 719 | ||
| 624 | if (height < ip->i_di.di_height - 1) | 720 | if (height < ip->i_height - 1) |
| 625 | for (; top < bottom; top++, first = 0) { | 721 | for (; top < bottom; top++, first = 0) { |
| 626 | if (!*top) | 722 | if (!*top) |
| 627 | continue; | 723 | continue; |
| @@ -679,7 +775,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
| 679 | sm->sm_first = 0; | 775 | sm->sm_first = 0; |
| 680 | } | 776 | } |
| 681 | 777 | ||
| 682 | metadata = (height != ip->i_di.di_height - 1); | 778 | metadata = (height != ip->i_height - 1); |
| 683 | if (metadata) | 779 | if (metadata) |
| 684 | revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; | 780 | revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; |
| 685 | 781 | ||
| @@ -713,7 +809,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
| 713 | else | 809 | else |
| 714 | goto out; /* Nothing to do */ | 810 | goto out; /* Nothing to do */ |
| 715 | 811 | ||
| 716 | gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); | 812 | gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); |
| 717 | 813 | ||
| 718 | for (x = 0; x < rlist.rl_rgrps; x++) { | 814 | for (x = 0; x < rlist.rl_rgrps; x++) { |
| 719 | struct gfs2_rgrpd *rgd; | 815 | struct gfs2_rgrpd *rgd; |
| @@ -760,10 +856,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
| 760 | } | 856 | } |
| 761 | 857 | ||
| 762 | *p = 0; | 858 | *p = 0; |
| 763 | if (!ip->i_di.di_blocks) | 859 | gfs2_add_inode_blocks(&ip->i_inode, -1); |
| 764 | gfs2_consist_inode(ip); | ||
| 765 | ip->i_di.di_blocks--; | ||
| 766 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 767 | } | 860 | } |
| 768 | if (bstart) { | 861 | if (bstart) { |
| 769 | if (metadata) | 862 | if (metadata) |
| @@ -804,19 +897,16 @@ static int do_grow(struct gfs2_inode *ip, u64 size) | |||
| 804 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 897 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
| 805 | struct gfs2_alloc *al; | 898 | struct gfs2_alloc *al; |
| 806 | struct buffer_head *dibh; | 899 | struct buffer_head *dibh; |
| 807 | unsigned int h; | ||
| 808 | int error; | 900 | int error; |
| 809 | 901 | ||
| 810 | al = gfs2_alloc_get(ip); | 902 | al = gfs2_alloc_get(ip); |
| 903 | if (!al) | ||
| 904 | return -ENOMEM; | ||
| 811 | 905 | ||
| 812 | error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 906 | error = gfs2_quota_lock_check(ip); |
| 813 | if (error) | 907 | if (error) |
| 814 | goto out; | 908 | goto out; |
| 815 | 909 | ||
| 816 | error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); | ||
| 817 | if (error) | ||
| 818 | goto out_gunlock_q; | ||
| 819 | |||
| 820 | al->al_requested = sdp->sd_max_height + RES_DATA; | 910 | al->al_requested = sdp->sd_max_height + RES_DATA; |
| 821 | 911 | ||
| 822 | error = gfs2_inplace_reserve(ip); | 912 | error = gfs2_inplace_reserve(ip); |
| @@ -829,34 +919,25 @@ static int do_grow(struct gfs2_inode *ip, u64 size) | |||
| 829 | if (error) | 919 | if (error) |
| 830 | goto out_ipres; | 920 | goto out_ipres; |
| 831 | 921 | ||
| 922 | error = gfs2_meta_inode_buffer(ip, &dibh); | ||
| 923 | if (error) | ||
| 924 | goto out_end_trans; | ||
| 925 | |||
| 832 | if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { | 926 | if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { |
| 833 | if (gfs2_is_stuffed(ip)) { | 927 | if (gfs2_is_stuffed(ip)) { |
| 834 | error = gfs2_unstuff_dinode(ip, NULL); | 928 | error = gfs2_unstuff_dinode(ip, NULL); |
| 835 | if (error) | 929 | if (error) |
| 836 | goto out_end_trans; | 930 | goto out_brelse; |
| 837 | } | ||
| 838 | |||
| 839 | h = calc_tree_height(ip, size); | ||
| 840 | if (ip->i_di.di_height < h) { | ||
| 841 | down_write(&ip->i_rw_mutex); | ||
| 842 | error = build_height(&ip->i_inode, h); | ||
| 843 | up_write(&ip->i_rw_mutex); | ||
| 844 | if (error) | ||
| 845 | goto out_end_trans; | ||
| 846 | } | 931 | } |
| 847 | } | 932 | } |
| 848 | 933 | ||
| 849 | ip->i_di.di_size = size; | 934 | ip->i_di.di_size = size; |
| 850 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 935 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
| 851 | |||
| 852 | error = gfs2_meta_inode_buffer(ip, &dibh); | ||
| 853 | if (error) | ||
| 854 | goto out_end_trans; | ||
| 855 | |||
| 856 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 936 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
| 857 | gfs2_dinode_out(ip, dibh->b_data); | 937 | gfs2_dinode_out(ip, dibh->b_data); |
| 858 | brelse(dibh); | ||
| 859 | 938 | ||
| 939 | out_brelse: | ||
| 940 | brelse(dibh); | ||
| 860 | out_end_trans: | 941 | out_end_trans: |
| 861 | gfs2_trans_end(sdp); | 942 | gfs2_trans_end(sdp); |
| 862 | out_ipres: | 943 | out_ipres: |
| @@ -986,7 +1067,8 @@ out: | |||
| 986 | 1067 | ||
| 987 | static int trunc_dealloc(struct gfs2_inode *ip, u64 size) | 1068 | static int trunc_dealloc(struct gfs2_inode *ip, u64 size) |
| 988 | { | 1069 | { |
| 989 | unsigned int height = ip->i_di.di_height; | 1070 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
| 1071 | unsigned int height = ip->i_height; | ||
| 990 | u64 lblock; | 1072 | u64 lblock; |
| 991 | struct metapath mp; | 1073 | struct metapath mp; |
| 992 | int error; | 1074 | int error; |
| @@ -994,10 +1076,11 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) | |||
| 994 | if (!size) | 1076 | if (!size) |
| 995 | lblock = 0; | 1077 | lblock = 0; |
| 996 | else | 1078 | else |
| 997 | lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift; | 1079 | lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; |
| 998 | 1080 | ||
| 999 | find_metapath(ip, lblock, &mp); | 1081 | find_metapath(sdp, lblock, &mp, ip->i_height); |
| 1000 | gfs2_alloc_get(ip); | 1082 | if (!gfs2_alloc_get(ip)) |
| 1083 | return -ENOMEM; | ||
| 1001 | 1084 | ||
| 1002 | error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 1085 | error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); |
| 1003 | if (error) | 1086 | if (error) |
| @@ -1037,10 +1120,8 @@ static int trunc_end(struct gfs2_inode *ip) | |||
| 1037 | goto out; | 1120 | goto out; |
| 1038 | 1121 | ||
| 1039 | if (!ip->i_di.di_size) { | 1122 | if (!ip->i_di.di_size) { |
| 1040 | ip->i_di.di_height = 0; | 1123 | ip->i_height = 0; |
| 1041 | ip->i_di.di_goal_meta = | 1124 | ip->i_goal = ip->i_no_addr; |
| 1042 | ip->i_di.di_goal_data = | ||
| 1043 | ip->i_no_addr; | ||
| 1044 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 1125 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
| 1045 | } | 1126 | } |
| 1046 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1127 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
| @@ -1197,10 +1278,9 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, | |||
| 1197 | unsigned int len, int *alloc_required) | 1278 | unsigned int len, int *alloc_required) |
| 1198 | { | 1279 | { |
| 1199 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 1280 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
| 1200 | u64 lblock, lblock_stop, dblock; | 1281 | struct buffer_head bh; |
| 1201 | u32 extlen; | 1282 | unsigned int shift; |
| 1202 | int new = 0; | 1283 | u64 lblock, lblock_stop, size; |
| 1203 | int error = 0; | ||
| 1204 | 1284 | ||
| 1205 | *alloc_required = 0; | 1285 | *alloc_required = 0; |
| 1206 | 1286 | ||
| @@ -1214,6 +1294,8 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, | |||
| 1214 | return 0; | 1294 | return 0; |
| 1215 | } | 1295 | } |
| 1216 | 1296 | ||
| 1297 | *alloc_required = 1; | ||
| 1298 | shift = sdp->sd_sb.sb_bsize_shift; | ||
| 1217 | if (gfs2_is_dir(ip)) { | 1299 | if (gfs2_is_dir(ip)) { |
| 1218 | unsigned int bsize = sdp->sd_jbsize; | 1300 | unsigned int bsize = sdp->sd_jbsize; |
| 1219 | lblock = offset; | 1301 | lblock = offset; |
| @@ -1221,27 +1303,25 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, | |||
| 1221 | lblock_stop = offset + len + bsize - 1; | 1303 | lblock_stop = offset + len + bsize - 1; |
| 1222 | do_div(lblock_stop, bsize); | 1304 | do_div(lblock_stop, bsize); |
| 1223 | } else { | 1305 | } else { |
| 1224 | unsigned int shift = sdp->sd_sb.sb_bsize_shift; | ||
| 1225 | u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift; | 1306 | u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift; |
| 1226 | lblock = offset >> shift; | 1307 | lblock = offset >> shift; |
| 1227 | lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; | 1308 | lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; |
| 1228 | if (lblock_stop > end_of_file) { | 1309 | if (lblock_stop > end_of_file) |
| 1229 | *alloc_required = 1; | ||
| 1230 | return 0; | 1310 | return 0; |
| 1231 | } | ||
| 1232 | } | 1311 | } |
| 1233 | 1312 | ||
| 1234 | for (; lblock < lblock_stop; lblock += extlen) { | 1313 | size = (lblock_stop - lblock) << shift; |
| 1235 | error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen); | 1314 | do { |
| 1236 | if (error) | 1315 | bh.b_state = 0; |
| 1237 | return error; | 1316 | bh.b_size = size; |
| 1238 | 1317 | gfs2_block_map(&ip->i_inode, lblock, &bh, 0); | |
| 1239 | if (!dblock) { | 1318 | if (!buffer_mapped(&bh)) |
| 1240 | *alloc_required = 1; | ||
| 1241 | return 0; | 1319 | return 0; |
| 1242 | } | 1320 | size -= bh.b_size; |
| 1243 | } | 1321 | lblock += (bh.b_size >> ip->i_inode.i_blkbits); |
| 1322 | } while(size > 0); | ||
| 1244 | 1323 | ||
| 1324 | *alloc_required = 0; | ||
| 1245 | return 0; | 1325 | return 0; |
| 1246 | } | 1326 | } |
| 1247 | 1327 | ||
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c34709512b19..eed040d8ba3a 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c | |||
| @@ -159,6 +159,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, | |||
| 159 | unsigned int o; | 159 | unsigned int o; |
| 160 | int copied = 0; | 160 | int copied = 0; |
| 161 | int error = 0; | 161 | int error = 0; |
| 162 | int new = 0; | ||
| 162 | 163 | ||
| 163 | if (!size) | 164 | if (!size) |
| 164 | return 0; | 165 | return 0; |
| @@ -183,7 +184,6 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, | |||
| 183 | while (copied < size) { | 184 | while (copied < size) { |
| 184 | unsigned int amount; | 185 | unsigned int amount; |
| 185 | struct buffer_head *bh; | 186 | struct buffer_head *bh; |
| 186 | int new = 0; | ||
| 187 | 187 | ||
| 188 | amount = size - copied; | 188 | amount = size - copied; |
| 189 | if (amount > sdp->sd_sb.sb_bsize - o) | 189 | if (amount > sdp->sd_sb.sb_bsize - o) |
| @@ -757,7 +757,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, | |||
| 757 | 757 | ||
| 758 | if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { | 758 | if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { |
| 759 | struct gfs2_leaf *leaf; | 759 | struct gfs2_leaf *leaf; |
| 760 | unsigned hsize = 1 << ip->i_di.di_depth; | 760 | unsigned hsize = 1 << ip->i_depth; |
| 761 | unsigned index; | 761 | unsigned index; |
| 762 | u64 ln; | 762 | u64 ln; |
| 763 | if (hsize * sizeof(u64) != ip->i_di.di_size) { | 763 | if (hsize * sizeof(u64) != ip->i_di.di_size) { |
| @@ -765,7 +765,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, | |||
| 765 | return ERR_PTR(-EIO); | 765 | return ERR_PTR(-EIO); |
| 766 | } | 766 | } |
| 767 | 767 | ||
| 768 | index = name->hash >> (32 - ip->i_di.di_depth); | 768 | index = name->hash >> (32 - ip->i_depth); |
| 769 | error = get_first_leaf(ip, index, &bh); | 769 | error = get_first_leaf(ip, index, &bh); |
| 770 | if (error) | 770 | if (error) |
| 771 | return ERR_PTR(error); | 771 | return ERR_PTR(error); |
| @@ -803,14 +803,15 @@ got_dent: | |||
| 803 | static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) | 803 | static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) |
| 804 | { | 804 | { |
| 805 | struct gfs2_inode *ip = GFS2_I(inode); | 805 | struct gfs2_inode *ip = GFS2_I(inode); |
| 806 | u64 bn = gfs2_alloc_meta(ip); | 806 | unsigned int n = 1; |
| 807 | u64 bn = gfs2_alloc_block(ip, &n); | ||
| 807 | struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); | 808 | struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); |
| 808 | struct gfs2_leaf *leaf; | 809 | struct gfs2_leaf *leaf; |
| 809 | struct gfs2_dirent *dent; | 810 | struct gfs2_dirent *dent; |
| 810 | struct qstr name = { .name = "", .len = 0, .hash = 0 }; | 811 | struct qstr name = { .name = "", .len = 0, .hash = 0 }; |
| 811 | if (!bh) | 812 | if (!bh) |
| 812 | return NULL; | 813 | return NULL; |
| 813 | 814 | gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); | |
| 814 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 815 | gfs2_trans_add_bh(ip->i_gl, bh, 1); |
| 815 | gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); | 816 | gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); |
| 816 | leaf = (struct gfs2_leaf *)bh->b_data; | 817 | leaf = (struct gfs2_leaf *)bh->b_data; |
| @@ -905,12 +906,11 @@ static int dir_make_exhash(struct inode *inode) | |||
| 905 | *lp = cpu_to_be64(bn); | 906 | *lp = cpu_to_be64(bn); |
| 906 | 907 | ||
| 907 | dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; | 908 | dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; |
| 908 | dip->i_di.di_blocks++; | 909 | gfs2_add_inode_blocks(&dip->i_inode, 1); |
| 909 | gfs2_set_inode_blocks(&dip->i_inode); | ||
| 910 | dip->i_di.di_flags |= GFS2_DIF_EXHASH; | 910 | dip->i_di.di_flags |= GFS2_DIF_EXHASH; |
| 911 | 911 | ||
| 912 | for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; | 912 | for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; |
| 913 | dip->i_di.di_depth = y; | 913 | dip->i_depth = y; |
| 914 | 914 | ||
| 915 | gfs2_dinode_out(dip, dibh->b_data); | 915 | gfs2_dinode_out(dip, dibh->b_data); |
| 916 | 916 | ||
| @@ -941,7 +941,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) | |||
| 941 | int x, moved = 0; | 941 | int x, moved = 0; |
| 942 | int error; | 942 | int error; |
| 943 | 943 | ||
| 944 | index = name->hash >> (32 - dip->i_di.di_depth); | 944 | index = name->hash >> (32 - dip->i_depth); |
| 945 | error = get_leaf_nr(dip, index, &leaf_no); | 945 | error = get_leaf_nr(dip, index, &leaf_no); |
| 946 | if (error) | 946 | if (error) |
| 947 | return error; | 947 | return error; |
| @@ -952,7 +952,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) | |||
| 952 | return error; | 952 | return error; |
| 953 | 953 | ||
| 954 | oleaf = (struct gfs2_leaf *)obh->b_data; | 954 | oleaf = (struct gfs2_leaf *)obh->b_data; |
| 955 | if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) { | 955 | if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) { |
| 956 | brelse(obh); | 956 | brelse(obh); |
| 957 | return 1; /* can't split */ | 957 | return 1; /* can't split */ |
| 958 | } | 958 | } |
| @@ -967,10 +967,10 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) | |||
| 967 | bn = nbh->b_blocknr; | 967 | bn = nbh->b_blocknr; |
| 968 | 968 | ||
| 969 | /* Compute the start and len of leaf pointers in the hash table. */ | 969 | /* Compute the start and len of leaf pointers in the hash table. */ |
| 970 | len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth)); | 970 | len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); |
| 971 | half_len = len >> 1; | 971 | half_len = len >> 1; |
| 972 | if (!half_len) { | 972 | if (!half_len) { |
| 973 | printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index); | 973 | printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); |
| 974 | gfs2_consist_inode(dip); | 974 | gfs2_consist_inode(dip); |
| 975 | error = -EIO; | 975 | error = -EIO; |
| 976 | goto fail_brelse; | 976 | goto fail_brelse; |
| @@ -997,7 +997,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) | |||
| 997 | kfree(lp); | 997 | kfree(lp); |
| 998 | 998 | ||
| 999 | /* Compute the divider */ | 999 | /* Compute the divider */ |
| 1000 | divider = (start + half_len) << (32 - dip->i_di.di_depth); | 1000 | divider = (start + half_len) << (32 - dip->i_depth); |
| 1001 | 1001 | ||
| 1002 | /* Copy the entries */ | 1002 | /* Copy the entries */ |
| 1003 | dirent_first(dip, obh, &dent); | 1003 | dirent_first(dip, obh, &dent); |
| @@ -1021,13 +1021,13 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) | |||
| 1021 | 1021 | ||
| 1022 | new->de_inum = dent->de_inum; /* No endian worries */ | 1022 | new->de_inum = dent->de_inum; /* No endian worries */ |
| 1023 | new->de_type = dent->de_type; /* No endian worries */ | 1023 | new->de_type = dent->de_type; /* No endian worries */ |
| 1024 | nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1); | 1024 | be16_add_cpu(&nleaf->lf_entries, 1); |
| 1025 | 1025 | ||
| 1026 | dirent_del(dip, obh, prev, dent); | 1026 | dirent_del(dip, obh, prev, dent); |
| 1027 | 1027 | ||
| 1028 | if (!oleaf->lf_entries) | 1028 | if (!oleaf->lf_entries) |
| 1029 | gfs2_consist_inode(dip); | 1029 | gfs2_consist_inode(dip); |
| 1030 | oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1); | 1030 | be16_add_cpu(&oleaf->lf_entries, -1); |
| 1031 | 1031 | ||
| 1032 | if (!prev) | 1032 | if (!prev) |
| 1033 | prev = dent; | 1033 | prev = dent; |
| @@ -1044,8 +1044,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) | |||
| 1044 | error = gfs2_meta_inode_buffer(dip, &dibh); | 1044 | error = gfs2_meta_inode_buffer(dip, &dibh); |
| 1045 | if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { | 1045 | if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { |
| 1046 | gfs2_trans_add_bh(dip->i_gl, dibh, 1); | 1046 | gfs2_trans_add_bh(dip->i_gl, dibh, 1); |
| 1047 | dip->i_di.di_blocks++; | 1047 | gfs2_add_inode_blocks(&dip->i_inode, 1); |
| 1048 | gfs2_set_inode_blocks(&dip->i_inode); | ||
| 1049 | gfs2_dinode_out(dip, dibh->b_data); | 1048 | gfs2_dinode_out(dip, dibh->b_data); |
| 1050 | brelse(dibh); | 1049 | brelse(dibh); |
| 1051 | } | 1050 | } |
| @@ -1082,7 +1081,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) | |||
| 1082 | int x; | 1081 | int x; |
| 1083 | int error = 0; | 1082 | int error = 0; |
| 1084 | 1083 | ||
| 1085 | hsize = 1 << dip->i_di.di_depth; | 1084 | hsize = 1 << dip->i_depth; |
| 1086 | if (hsize * sizeof(u64) != dip->i_di.di_size) { | 1085 | if (hsize * sizeof(u64) != dip->i_di.di_size) { |
| 1087 | gfs2_consist_inode(dip); | 1086 | gfs2_consist_inode(dip); |
| 1088 | return -EIO; | 1087 | return -EIO; |
| @@ -1090,7 +1089,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) | |||
| 1090 | 1089 | ||
| 1091 | /* Allocate both the "from" and "to" buffers in one big chunk */ | 1090 | /* Allocate both the "from" and "to" buffers in one big chunk */ |
| 1092 | 1091 | ||
| 1093 | buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL); | 1092 | buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); |
| 1094 | 1093 | ||
| 1095 | for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { | 1094 | for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { |
| 1096 | error = gfs2_dir_read_data(dip, (char *)buf, | 1095 | error = gfs2_dir_read_data(dip, (char *)buf, |
| @@ -1125,7 +1124,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) | |||
| 1125 | 1124 | ||
| 1126 | error = gfs2_meta_inode_buffer(dip, &dibh); | 1125 | error = gfs2_meta_inode_buffer(dip, &dibh); |
| 1127 | if (!gfs2_assert_withdraw(sdp, !error)) { | 1126 | if (!gfs2_assert_withdraw(sdp, !error)) { |
| 1128 | dip->i_di.di_depth++; | 1127 | dip->i_depth++; |
| 1129 | gfs2_dinode_out(dip, dibh->b_data); | 1128 | gfs2_dinode_out(dip, dibh->b_data); |
| 1130 | brelse(dibh); | 1129 | brelse(dibh); |
| 1131 | } | 1130 | } |
| @@ -1370,16 +1369,16 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, | |||
| 1370 | int error = 0; | 1369 | int error = 0; |
| 1371 | unsigned depth = 0; | 1370 | unsigned depth = 0; |
| 1372 | 1371 | ||
| 1373 | hsize = 1 << dip->i_di.di_depth; | 1372 | hsize = 1 << dip->i_depth; |
| 1374 | if (hsize * sizeof(u64) != dip->i_di.di_size) { | 1373 | if (hsize * sizeof(u64) != dip->i_di.di_size) { |
| 1375 | gfs2_consist_inode(dip); | 1374 | gfs2_consist_inode(dip); |
| 1376 | return -EIO; | 1375 | return -EIO; |
| 1377 | } | 1376 | } |
| 1378 | 1377 | ||
| 1379 | hash = gfs2_dir_offset2hash(*offset); | 1378 | hash = gfs2_dir_offset2hash(*offset); |
| 1380 | index = hash >> (32 - dip->i_di.di_depth); | 1379 | index = hash >> (32 - dip->i_depth); |
| 1381 | 1380 | ||
| 1382 | lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); | 1381 | lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); |
| 1383 | if (!lp) | 1382 | if (!lp) |
| 1384 | return -ENOMEM; | 1383 | return -ENOMEM; |
| 1385 | 1384 | ||
| @@ -1405,7 +1404,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, | |||
| 1405 | if (error) | 1404 | if (error) |
| 1406 | break; | 1405 | break; |
| 1407 | 1406 | ||
| 1408 | len = 1 << (dip->i_di.di_depth - depth); | 1407 | len = 1 << (dip->i_depth - depth); |
| 1409 | index = (index & ~(len - 1)) + len; | 1408 | index = (index & ~(len - 1)) + len; |
| 1410 | } | 1409 | } |
| 1411 | 1410 | ||
| @@ -1444,7 +1443,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, | |||
| 1444 | 1443 | ||
| 1445 | error = -ENOMEM; | 1444 | error = -ENOMEM; |
| 1446 | /* 96 is max number of dirents which can be stuffed into an inode */ | 1445 | /* 96 is max number of dirents which can be stuffed into an inode */ |
| 1447 | darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL); | 1446 | darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); |
| 1448 | if (darr) { | 1447 | if (darr) { |
| 1449 | g.pdent = darr; | 1448 | g.pdent = darr; |
| 1450 | g.offset = 0; | 1449 | g.offset = 0; |
| @@ -1549,7 +1548,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) | |||
| 1549 | u32 index; | 1548 | u32 index; |
| 1550 | u64 bn; | 1549 | u64 bn; |
| 1551 | 1550 | ||
| 1552 | index = name->hash >> (32 - ip->i_di.di_depth); | 1551 | index = name->hash >> (32 - ip->i_depth); |
| 1553 | error = get_first_leaf(ip, index, &obh); | 1552 | error = get_first_leaf(ip, index, &obh); |
| 1554 | if (error) | 1553 | if (error) |
| 1555 | return error; | 1554 | return error; |
| @@ -1579,8 +1578,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) | |||
| 1579 | if (error) | 1578 | if (error) |
| 1580 | return error; | 1579 | return error; |
| 1581 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 1580 | gfs2_trans_add_bh(ip->i_gl, bh, 1); |
| 1582 | ip->i_di.di_blocks++; | 1581 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
| 1583 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 1584 | gfs2_dinode_out(ip, bh->b_data); | 1582 | gfs2_dinode_out(ip, bh->b_data); |
| 1585 | brelse(bh); | 1583 | brelse(bh); |
| 1586 | return 0; | 1584 | return 0; |
| @@ -1616,7 +1614,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, | |||
| 1616 | dent->de_type = cpu_to_be16(type); | 1614 | dent->de_type = cpu_to_be16(type); |
| 1617 | if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { | 1615 | if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { |
| 1618 | leaf = (struct gfs2_leaf *)bh->b_data; | 1616 | leaf = (struct gfs2_leaf *)bh->b_data; |
| 1619 | leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1); | 1617 | be16_add_cpu(&leaf->lf_entries, 1); |
| 1620 | } | 1618 | } |
| 1621 | brelse(bh); | 1619 | brelse(bh); |
| 1622 | error = gfs2_meta_inode_buffer(ip, &bh); | 1620 | error = gfs2_meta_inode_buffer(ip, &bh); |
| @@ -1641,7 +1639,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, | |||
| 1641 | continue; | 1639 | continue; |
| 1642 | if (error < 0) | 1640 | if (error < 0) |
| 1643 | break; | 1641 | break; |
| 1644 | if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) { | 1642 | if (ip->i_depth < GFS2_DIR_MAX_DEPTH) { |
| 1645 | error = dir_double_exhash(ip); | 1643 | error = dir_double_exhash(ip); |
| 1646 | if (error) | 1644 | if (error) |
| 1647 | break; | 1645 | break; |
| @@ -1785,13 +1783,13 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) | |||
| 1785 | u64 leaf_no; | 1783 | u64 leaf_no; |
| 1786 | int error = 0; | 1784 | int error = 0; |
| 1787 | 1785 | ||
| 1788 | hsize = 1 << dip->i_di.di_depth; | 1786 | hsize = 1 << dip->i_depth; |
| 1789 | if (hsize * sizeof(u64) != dip->i_di.di_size) { | 1787 | if (hsize * sizeof(u64) != dip->i_di.di_size) { |
| 1790 | gfs2_consist_inode(dip); | 1788 | gfs2_consist_inode(dip); |
| 1791 | return -EIO; | 1789 | return -EIO; |
| 1792 | } | 1790 | } |
| 1793 | 1791 | ||
| 1794 | lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); | 1792 | lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); |
| 1795 | if (!lp) | 1793 | if (!lp) |
| 1796 | return -ENOMEM; | 1794 | return -ENOMEM; |
| 1797 | 1795 | ||
| @@ -1817,7 +1815,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) | |||
| 1817 | if (error) | 1815 | if (error) |
| 1818 | goto out; | 1816 | goto out; |
| 1819 | leaf = (struct gfs2_leaf *)bh->b_data; | 1817 | leaf = (struct gfs2_leaf *)bh->b_data; |
| 1820 | len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth)); | 1818 | len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); |
| 1821 | brelse(bh); | 1819 | brelse(bh); |
| 1822 | 1820 | ||
| 1823 | error = lc(dip, index, len, leaf_no, data); | 1821 | error = lc(dip, index, len, leaf_no, data); |
| @@ -1866,15 +1864,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, | |||
| 1866 | 1864 | ||
| 1867 | memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); | 1865 | memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); |
| 1868 | 1866 | ||
| 1869 | ht = kzalloc(size, GFP_KERNEL); | 1867 | ht = kzalloc(size, GFP_NOFS); |
| 1870 | if (!ht) | 1868 | if (!ht) |
| 1871 | return -ENOMEM; | 1869 | return -ENOMEM; |
| 1872 | 1870 | ||
| 1873 | gfs2_alloc_get(dip); | 1871 | if (!gfs2_alloc_get(dip)) { |
| 1872 | error = -ENOMEM; | ||
| 1873 | goto out; | ||
| 1874 | } | ||
| 1874 | 1875 | ||
| 1875 | error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 1876 | error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); |
| 1876 | if (error) | 1877 | if (error) |
| 1877 | goto out; | 1878 | goto out_put; |
| 1878 | 1879 | ||
| 1879 | error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); | 1880 | error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); |
| 1880 | if (error) | 1881 | if (error) |
| @@ -1894,7 +1895,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, | |||
| 1894 | l_blocks++; | 1895 | l_blocks++; |
| 1895 | } | 1896 | } |
| 1896 | 1897 | ||
| 1897 | gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); | 1898 | gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); |
| 1898 | 1899 | ||
| 1899 | for (x = 0; x < rlist.rl_rgrps; x++) { | 1900 | for (x = 0; x < rlist.rl_rgrps; x++) { |
| 1900 | struct gfs2_rgrpd *rgd; | 1901 | struct gfs2_rgrpd *rgd; |
| @@ -1921,11 +1922,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, | |||
| 1921 | brelse(bh); | 1922 | brelse(bh); |
| 1922 | 1923 | ||
| 1923 | gfs2_free_meta(dip, blk, 1); | 1924 | gfs2_free_meta(dip, blk, 1); |
| 1924 | 1925 | gfs2_add_inode_blocks(&dip->i_inode, -1); | |
| 1925 | if (!dip->i_di.di_blocks) | ||
| 1926 | gfs2_consist_inode(dip); | ||
| 1927 | dip->i_di.di_blocks--; | ||
| 1928 | gfs2_set_inode_blocks(&dip->i_inode); | ||
| 1929 | } | 1926 | } |
| 1930 | 1927 | ||
| 1931 | error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); | 1928 | error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); |
| @@ -1952,8 +1949,9 @@ out_rlist: | |||
| 1952 | gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); | 1949 | gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); |
| 1953 | out_qs: | 1950 | out_qs: |
| 1954 | gfs2_quota_unhold(dip); | 1951 | gfs2_quota_unhold(dip); |
| 1955 | out: | 1952 | out_put: |
| 1956 | gfs2_alloc_put(dip); | 1953 | gfs2_alloc_put(dip); |
| 1954 | out: | ||
| 1957 | kfree(ht); | 1955 | kfree(ht); |
| 1958 | return error; | 1956 | return error; |
| 1959 | } | 1957 | } |
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index bee99704ea10..e3f76f451b0a 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c | |||
| @@ -277,10 +277,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, | |||
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | *dataptrs = 0; | 279 | *dataptrs = 0; |
| 280 | if (!ip->i_di.di_blocks) | 280 | gfs2_add_inode_blocks(&ip->i_inode, -1); |
| 281 | gfs2_consist_inode(ip); | ||
| 282 | ip->i_di.di_blocks--; | ||
| 283 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 284 | } | 281 | } |
| 285 | if (bstart) | 282 | if (bstart) |
| 286 | gfs2_free_meta(ip, bstart, blen); | 283 | gfs2_free_meta(ip, bstart, blen); |
| @@ -321,6 +318,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, | |||
| 321 | int error; | 318 | int error; |
| 322 | 319 | ||
| 323 | al = gfs2_alloc_get(ip); | 320 | al = gfs2_alloc_get(ip); |
| 321 | if (!al) | ||
| 322 | return -ENOMEM; | ||
| 324 | 323 | ||
| 325 | error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 324 | error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); |
| 326 | if (error) | 325 | if (error) |
| @@ -449,7 +448,7 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, | |||
| 449 | unsigned int x; | 448 | unsigned int x; |
| 450 | int error = 0; | 449 | int error = 0; |
| 451 | 450 | ||
| 452 | bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); | 451 | bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); |
| 453 | if (!bh) | 452 | if (!bh) |
| 454 | return -ENOMEM; | 453 | return -ENOMEM; |
| 455 | 454 | ||
| @@ -582,10 +581,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) | |||
| 582 | { | 581 | { |
| 583 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 582 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
| 584 | struct gfs2_ea_header *ea; | 583 | struct gfs2_ea_header *ea; |
| 584 | unsigned int n = 1; | ||
| 585 | u64 block; | 585 | u64 block; |
| 586 | 586 | ||
| 587 | block = gfs2_alloc_meta(ip); | 587 | block = gfs2_alloc_block(ip, &n); |
| 588 | 588 | gfs2_trans_add_unrevoke(sdp, block, 1); | |
| 589 | *bhp = gfs2_meta_new(ip->i_gl, block); | 589 | *bhp = gfs2_meta_new(ip->i_gl, block); |
| 590 | gfs2_trans_add_bh(ip->i_gl, *bhp, 1); | 590 | gfs2_trans_add_bh(ip->i_gl, *bhp, 1); |
| 591 | gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); | 591 | gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); |
| @@ -597,8 +597,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) | |||
| 597 | ea->ea_flags = GFS2_EAFLAG_LAST; | 597 | ea->ea_flags = GFS2_EAFLAG_LAST; |
| 598 | ea->ea_num_ptrs = 0; | 598 | ea->ea_num_ptrs = 0; |
| 599 | 599 | ||
| 600 | ip->i_di.di_blocks++; | 600 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
| 601 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 602 | 601 | ||
| 603 | return 0; | 602 | return 0; |
| 604 | } | 603 | } |
| @@ -642,15 +641,15 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, | |||
| 642 | struct buffer_head *bh; | 641 | struct buffer_head *bh; |
| 643 | u64 block; | 642 | u64 block; |
| 644 | int mh_size = sizeof(struct gfs2_meta_header); | 643 | int mh_size = sizeof(struct gfs2_meta_header); |
| 644 | unsigned int n = 1; | ||
| 645 | 645 | ||
| 646 | block = gfs2_alloc_meta(ip); | 646 | block = gfs2_alloc_block(ip, &n); |
| 647 | 647 | gfs2_trans_add_unrevoke(sdp, block, 1); | |
| 648 | bh = gfs2_meta_new(ip->i_gl, block); | 648 | bh = gfs2_meta_new(ip->i_gl, block); |
| 649 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 649 | gfs2_trans_add_bh(ip->i_gl, bh, 1); |
| 650 | gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); | 650 | gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); |
| 651 | 651 | ||
| 652 | ip->i_di.di_blocks++; | 652 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
| 653 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 654 | 653 | ||
| 655 | copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize : | 654 | copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize : |
| 656 | data_len; | 655 | data_len; |
| @@ -684,15 +683,13 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
| 684 | int error; | 683 | int error; |
| 685 | 684 | ||
| 686 | al = gfs2_alloc_get(ip); | 685 | al = gfs2_alloc_get(ip); |
| 686 | if (!al) | ||
| 687 | return -ENOMEM; | ||
| 687 | 688 | ||
| 688 | error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 689 | error = gfs2_quota_lock_check(ip); |
| 689 | if (error) | 690 | if (error) |
| 690 | goto out; | 691 | goto out; |
| 691 | 692 | ||
| 692 | error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); | ||
| 693 | if (error) | ||
| 694 | goto out_gunlock_q; | ||
| 695 | |||
| 696 | al->al_requested = blks; | 693 | al->al_requested = blks; |
| 697 | 694 | ||
| 698 | error = gfs2_inplace_reserve(ip); | 695 | error = gfs2_inplace_reserve(ip); |
| @@ -966,9 +963,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
| 966 | gfs2_trans_add_bh(ip->i_gl, indbh, 1); | 963 | gfs2_trans_add_bh(ip->i_gl, indbh, 1); |
| 967 | } else { | 964 | } else { |
| 968 | u64 blk; | 965 | u64 blk; |
| 969 | 966 | unsigned int n = 1; | |
| 970 | blk = gfs2_alloc_meta(ip); | 967 | blk = gfs2_alloc_block(ip, &n); |
| 971 | 968 | gfs2_trans_add_unrevoke(sdp, blk, 1); | |
| 972 | indbh = gfs2_meta_new(ip->i_gl, blk); | 969 | indbh = gfs2_meta_new(ip->i_gl, blk); |
| 973 | gfs2_trans_add_bh(ip->i_gl, indbh, 1); | 970 | gfs2_trans_add_bh(ip->i_gl, indbh, 1); |
| 974 | gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); | 971 | gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); |
| @@ -978,8 +975,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
| 978 | *eablk = cpu_to_be64(ip->i_di.di_eattr); | 975 | *eablk = cpu_to_be64(ip->i_di.di_eattr); |
| 979 | ip->i_di.di_eattr = blk; | 976 | ip->i_di.di_eattr = blk; |
| 980 | ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; | 977 | ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; |
| 981 | ip->i_di.di_blocks++; | 978 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
| 982 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 983 | 979 | ||
| 984 | eablk++; | 980 | eablk++; |
| 985 | } | 981 | } |
| @@ -1210,7 +1206,7 @@ static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, | |||
| 1210 | unsigned int x; | 1206 | unsigned int x; |
| 1211 | int error; | 1207 | int error; |
| 1212 | 1208 | ||
| 1213 | bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); | 1209 | bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS); |
| 1214 | if (!bh) | 1210 | if (!bh) |
| 1215 | return -ENOMEM; | 1211 | return -ENOMEM; |
| 1216 | 1212 | ||
| @@ -1347,7 +1343,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) | |||
| 1347 | else | 1343 | else |
| 1348 | goto out; | 1344 | goto out; |
| 1349 | 1345 | ||
| 1350 | gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); | 1346 | gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); |
| 1351 | 1347 | ||
| 1352 | for (x = 0; x < rlist.rl_rgrps; x++) { | 1348 | for (x = 0; x < rlist.rl_rgrps; x++) { |
| 1353 | struct gfs2_rgrpd *rgd; | 1349 | struct gfs2_rgrpd *rgd; |
| @@ -1387,10 +1383,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) | |||
| 1387 | } | 1383 | } |
| 1388 | 1384 | ||
| 1389 | *eablk = 0; | 1385 | *eablk = 0; |
| 1390 | if (!ip->i_di.di_blocks) | 1386 | gfs2_add_inode_blocks(&ip->i_inode, -1); |
| 1391 | gfs2_consist_inode(ip); | ||
| 1392 | ip->i_di.di_blocks--; | ||
| 1393 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 1394 | } | 1387 | } |
| 1395 | if (bstart) | 1388 | if (bstart) |
| 1396 | gfs2_free_meta(ip, bstart, blen); | 1389 | gfs2_free_meta(ip, bstart, blen); |
| @@ -1442,10 +1435,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) | |||
| 1442 | gfs2_free_meta(ip, ip->i_di.di_eattr, 1); | 1435 | gfs2_free_meta(ip, ip->i_di.di_eattr, 1); |
| 1443 | 1436 | ||
| 1444 | ip->i_di.di_eattr = 0; | 1437 | ip->i_di.di_eattr = 0; |
| 1445 | if (!ip->i_di.di_blocks) | 1438 | gfs2_add_inode_blocks(&ip->i_inode, -1); |
| 1446 | gfs2_consist_inode(ip); | ||
| 1447 | ip->i_di.di_blocks--; | ||
| 1448 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 1449 | 1439 | ||
| 1450 | error = gfs2_meta_inode_buffer(ip, &dibh); | 1440 | error = gfs2_meta_inode_buffer(ip, &dibh); |
| 1451 | if (!error) { | 1441 | if (!error) { |
| @@ -1474,6 +1464,8 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) | |||
| 1474 | int error; | 1464 | int error; |
| 1475 | 1465 | ||
| 1476 | al = gfs2_alloc_get(ip); | 1466 | al = gfs2_alloc_get(ip); |
| 1467 | if (!al) | ||
| 1468 | return -ENOMEM; | ||
| 1477 | 1469 | ||
| 1478 | error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 1470 | error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); |
| 1479 | if (error) | 1471 | if (error) |
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 7175a4d06435..d636b3e80f5d 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 3 | * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * This copyrighted material is made available to anyone wishing to use, | 5 | * This copyrighted material is made available to anyone wishing to use, |
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | 6 | * modify, copy, or redistribute it subject to the terms and conditions |
| @@ -35,7 +35,6 @@ | |||
| 35 | #include "glock.h" | 35 | #include "glock.h" |
| 36 | #include "glops.h" | 36 | #include "glops.h" |
| 37 | #include "inode.h" | 37 | #include "inode.h" |
| 38 | #include "lm.h" | ||
| 39 | #include "lops.h" | 38 | #include "lops.h" |
| 40 | #include "meta_io.h" | 39 | #include "meta_io.h" |
| 41 | #include "quota.h" | 40 | #include "quota.h" |
| @@ -183,7 +182,8 @@ static void glock_free(struct gfs2_glock *gl) | |||
| 183 | struct gfs2_sbd *sdp = gl->gl_sbd; | 182 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 184 | struct inode *aspace = gl->gl_aspace; | 183 | struct inode *aspace = gl->gl_aspace; |
| 185 | 184 | ||
| 186 | gfs2_lm_put_lock(sdp, gl->gl_lock); | 185 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 186 | sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); | ||
| 187 | 187 | ||
| 188 | if (aspace) | 188 | if (aspace) |
| 189 | gfs2_aspace_put(aspace); | 189 | gfs2_aspace_put(aspace); |
| @@ -197,7 +197,7 @@ static void glock_free(struct gfs2_glock *gl) | |||
| 197 | * | 197 | * |
| 198 | */ | 198 | */ |
| 199 | 199 | ||
| 200 | void gfs2_glock_hold(struct gfs2_glock *gl) | 200 | static void gfs2_glock_hold(struct gfs2_glock *gl) |
| 201 | { | 201 | { |
| 202 | atomic_inc(&gl->gl_ref); | 202 | atomic_inc(&gl->gl_ref); |
| 203 | } | 203 | } |
| @@ -293,6 +293,16 @@ static void glock_work_func(struct work_struct *work) | |||
| 293 | gfs2_glock_put(gl); | 293 | gfs2_glock_put(gl); |
| 294 | } | 294 | } |
| 295 | 295 | ||
| 296 | static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 297 | void **lockp) | ||
| 298 | { | ||
| 299 | int error = -EIO; | ||
| 300 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 301 | error = sdp->sd_lockstruct.ls_ops->lm_get_lock( | ||
| 302 | sdp->sd_lockstruct.ls_lockspace, name, lockp); | ||
| 303 | return error; | ||
| 304 | } | ||
| 305 | |||
| 296 | /** | 306 | /** |
| 297 | * gfs2_glock_get() - Get a glock, or create one if one doesn't exist | 307 | * gfs2_glock_get() - Get a glock, or create one if one doesn't exist |
| 298 | * @sdp: The GFS2 superblock | 308 | * @sdp: The GFS2 superblock |
| @@ -338,8 +348,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, | |||
| 338 | gl->gl_ip = 0; | 348 | gl->gl_ip = 0; |
| 339 | gl->gl_ops = glops; | 349 | gl->gl_ops = glops; |
| 340 | gl->gl_req_gh = NULL; | 350 | gl->gl_req_gh = NULL; |
| 341 | gl->gl_req_bh = NULL; | ||
| 342 | gl->gl_vn = 0; | ||
| 343 | gl->gl_stamp = jiffies; | 351 | gl->gl_stamp = jiffies; |
| 344 | gl->gl_tchange = jiffies; | 352 | gl->gl_tchange = jiffies; |
| 345 | gl->gl_object = NULL; | 353 | gl->gl_object = NULL; |
| @@ -595,11 +603,12 @@ static void run_queue(struct gfs2_glock *gl) | |||
| 595 | blocked = rq_mutex(gh); | 603 | blocked = rq_mutex(gh); |
| 596 | } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { | 604 | } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { |
| 597 | blocked = rq_demote(gl); | 605 | blocked = rq_demote(gl); |
| 598 | if (gl->gl_waiters2 && !blocked) { | 606 | if (test_bit(GLF_WAITERS2, &gl->gl_flags) && |
| 607 | !blocked) { | ||
| 599 | set_bit(GLF_DEMOTE, &gl->gl_flags); | 608 | set_bit(GLF_DEMOTE, &gl->gl_flags); |
| 600 | gl->gl_demote_state = LM_ST_UNLOCKED; | 609 | gl->gl_demote_state = LM_ST_UNLOCKED; |
| 601 | } | 610 | } |
| 602 | gl->gl_waiters2 = 0; | 611 | clear_bit(GLF_WAITERS2, &gl->gl_flags); |
| 603 | } else if (!list_empty(&gl->gl_waiters3)) { | 612 | } else if (!list_empty(&gl->gl_waiters3)) { |
| 604 | gh = list_entry(gl->gl_waiters3.next, | 613 | gh = list_entry(gl->gl_waiters3.next, |
| 605 | struct gfs2_holder, gh_list); | 614 | struct gfs2_holder, gh_list); |
| @@ -710,7 +719,7 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, | |||
| 710 | } else if (gl->gl_demote_state != LM_ST_UNLOCKED && | 719 | } else if (gl->gl_demote_state != LM_ST_UNLOCKED && |
| 711 | gl->gl_demote_state != state) { | 720 | gl->gl_demote_state != state) { |
| 712 | if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) | 721 | if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) |
| 713 | gl->gl_waiters2 = 1; | 722 | set_bit(GLF_WAITERS2, &gl->gl_flags); |
| 714 | else | 723 | else |
| 715 | gl->gl_demote_state = LM_ST_UNLOCKED; | 724 | gl->gl_demote_state = LM_ST_UNLOCKED; |
| 716 | } | 725 | } |
| @@ -743,6 +752,43 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) | |||
| 743 | } | 752 | } |
| 744 | 753 | ||
| 745 | /** | 754 | /** |
| 755 | * drop_bh - Called after a lock module unlock completes | ||
| 756 | * @gl: the glock | ||
| 757 | * @ret: the return status | ||
| 758 | * | ||
| 759 | * Doesn't wake up the process waiting on the struct gfs2_holder (if any) | ||
| 760 | * Doesn't drop the reference on the glock the top half took out | ||
| 761 | * | ||
| 762 | */ | ||
| 763 | |||
| 764 | static void drop_bh(struct gfs2_glock *gl, unsigned int ret) | ||
| 765 | { | ||
| 766 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 767 | struct gfs2_holder *gh = gl->gl_req_gh; | ||
| 768 | |||
| 769 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | ||
| 770 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); | ||
| 771 | gfs2_assert_warn(sdp, !ret); | ||
| 772 | |||
| 773 | state_change(gl, LM_ST_UNLOCKED); | ||
| 774 | |||
| 775 | if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) { | ||
| 776 | spin_lock(&gl->gl_spin); | ||
| 777 | gh->gh_error = 0; | ||
| 778 | spin_unlock(&gl->gl_spin); | ||
| 779 | gfs2_glock_xmote_th(gl, gl->gl_req_gh); | ||
| 780 | gfs2_glock_put(gl); | ||
| 781 | return; | ||
| 782 | } | ||
| 783 | |||
| 784 | spin_lock(&gl->gl_spin); | ||
| 785 | gfs2_demote_wake(gl); | ||
| 786 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 787 | spin_unlock(&gl->gl_spin); | ||
| 788 | gfs2_glock_put(gl); | ||
| 789 | } | ||
| 790 | |||
| 791 | /** | ||
| 746 | * xmote_bh - Called after the lock module is done acquiring a lock | 792 | * xmote_bh - Called after the lock module is done acquiring a lock |
| 747 | * @gl: The glock in question | 793 | * @gl: The glock in question |
| 748 | * @ret: the int returned from the lock module | 794 | * @ret: the int returned from the lock module |
| @@ -754,25 +800,19 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) | |||
| 754 | struct gfs2_sbd *sdp = gl->gl_sbd; | 800 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 755 | const struct gfs2_glock_operations *glops = gl->gl_ops; | 801 | const struct gfs2_glock_operations *glops = gl->gl_ops; |
| 756 | struct gfs2_holder *gh = gl->gl_req_gh; | 802 | struct gfs2_holder *gh = gl->gl_req_gh; |
| 757 | int prev_state = gl->gl_state; | ||
| 758 | int op_done = 1; | 803 | int op_done = 1; |
| 759 | 804 | ||
| 805 | if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) { | ||
| 806 | drop_bh(gl, ret); | ||
| 807 | return; | ||
| 808 | } | ||
| 809 | |||
| 760 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | 810 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); |
| 761 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); | 811 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); |
| 762 | gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); | 812 | gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); |
| 763 | 813 | ||
| 764 | state_change(gl, ret & LM_OUT_ST_MASK); | 814 | state_change(gl, ret & LM_OUT_ST_MASK); |
| 765 | 815 | ||
| 766 | if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) { | ||
| 767 | if (glops->go_inval) | ||
| 768 | glops->go_inval(gl, DIO_METADATA); | ||
| 769 | } else if (gl->gl_state == LM_ST_DEFERRED) { | ||
| 770 | /* We might not want to do this here. | ||
| 771 | Look at moving to the inode glops. */ | ||
| 772 | if (glops->go_inval) | ||
| 773 | glops->go_inval(gl, 0); | ||
| 774 | } | ||
| 775 | |||
| 776 | /* Deal with each possible exit condition */ | 816 | /* Deal with each possible exit condition */ |
| 777 | 817 | ||
| 778 | if (!gh) { | 818 | if (!gh) { |
| @@ -782,7 +822,6 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) | |||
| 782 | } else { | 822 | } else { |
| 783 | spin_lock(&gl->gl_spin); | 823 | spin_lock(&gl->gl_spin); |
| 784 | if (gl->gl_state != gl->gl_demote_state) { | 824 | if (gl->gl_state != gl->gl_demote_state) { |
| 785 | gl->gl_req_bh = NULL; | ||
| 786 | spin_unlock(&gl->gl_spin); | 825 | spin_unlock(&gl->gl_spin); |
| 787 | gfs2_glock_drop_th(gl); | 826 | gfs2_glock_drop_th(gl); |
| 788 | gfs2_glock_put(gl); | 827 | gfs2_glock_put(gl); |
| @@ -793,6 +832,14 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) | |||
| 793 | } | 832 | } |
| 794 | } else { | 833 | } else { |
| 795 | spin_lock(&gl->gl_spin); | 834 | spin_lock(&gl->gl_spin); |
| 835 | if (ret & LM_OUT_CONV_DEADLK) { | ||
| 836 | gh->gh_error = 0; | ||
| 837 | set_bit(GLF_CONV_DEADLK, &gl->gl_flags); | ||
| 838 | spin_unlock(&gl->gl_spin); | ||
| 839 | gfs2_glock_drop_th(gl); | ||
| 840 | gfs2_glock_put(gl); | ||
| 841 | return; | ||
| 842 | } | ||
| 796 | list_del_init(&gh->gh_list); | 843 | list_del_init(&gh->gh_list); |
| 797 | gh->gh_error = -EIO; | 844 | gh->gh_error = -EIO; |
| 798 | if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | 845 | if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| @@ -824,7 +871,6 @@ out: | |||
| 824 | if (op_done) { | 871 | if (op_done) { |
| 825 | spin_lock(&gl->gl_spin); | 872 | spin_lock(&gl->gl_spin); |
| 826 | gl->gl_req_gh = NULL; | 873 | gl->gl_req_gh = NULL; |
| 827 | gl->gl_req_bh = NULL; | ||
| 828 | clear_bit(GLF_LOCK, &gl->gl_flags); | 874 | clear_bit(GLF_LOCK, &gl->gl_flags); |
| 829 | spin_unlock(&gl->gl_spin); | 875 | spin_unlock(&gl->gl_spin); |
| 830 | } | 876 | } |
| @@ -835,6 +881,17 @@ out: | |||
| 835 | gfs2_holder_wake(gh); | 881 | gfs2_holder_wake(gh); |
| 836 | } | 882 | } |
| 837 | 883 | ||
| 884 | static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, | ||
| 885 | unsigned int cur_state, unsigned int req_state, | ||
| 886 | unsigned int flags) | ||
| 887 | { | ||
| 888 | int ret = 0; | ||
| 889 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 890 | ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, | ||
| 891 | req_state, flags); | ||
| 892 | return ret; | ||
| 893 | } | ||
| 894 | |||
| 838 | /** | 895 | /** |
| 839 | * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock | 896 | * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock |
| 840 | * @gl: The glock in question | 897 | * @gl: The glock in question |
| @@ -856,6 +913,8 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) | |||
| 856 | 913 | ||
| 857 | if (glops->go_xmote_th) | 914 | if (glops->go_xmote_th) |
| 858 | glops->go_xmote_th(gl); | 915 | glops->go_xmote_th(gl); |
| 916 | if (state == LM_ST_DEFERRED && glops->go_inval) | ||
| 917 | glops->go_inval(gl, DIO_METADATA); | ||
| 859 | 918 | ||
| 860 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | 919 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); |
| 861 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); | 920 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); |
| @@ -863,7 +922,6 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) | |||
| 863 | gfs2_assert_warn(sdp, state != gl->gl_state); | 922 | gfs2_assert_warn(sdp, state != gl->gl_state); |
| 864 | 923 | ||
| 865 | gfs2_glock_hold(gl); | 924 | gfs2_glock_hold(gl); |
| 866 | gl->gl_req_bh = xmote_bh; | ||
| 867 | 925 | ||
| 868 | lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); | 926 | lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); |
| 869 | 927 | ||
| @@ -876,49 +934,13 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) | |||
| 876 | xmote_bh(gl, lck_ret); | 934 | xmote_bh(gl, lck_ret); |
| 877 | } | 935 | } |
| 878 | 936 | ||
| 879 | /** | 937 | static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, |
| 880 | * drop_bh - Called after a lock module unlock completes | 938 | unsigned int cur_state) |
| 881 | * @gl: the glock | ||
| 882 | * @ret: the return status | ||
| 883 | * | ||
| 884 | * Doesn't wake up the process waiting on the struct gfs2_holder (if any) | ||
| 885 | * Doesn't drop the reference on the glock the top half took out | ||
| 886 | * | ||
| 887 | */ | ||
| 888 | |||
| 889 | static void drop_bh(struct gfs2_glock *gl, unsigned int ret) | ||
| 890 | { | 939 | { |
| 891 | struct gfs2_sbd *sdp = gl->gl_sbd; | 940 | int ret = 0; |
| 892 | const struct gfs2_glock_operations *glops = gl->gl_ops; | 941 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 893 | struct gfs2_holder *gh = gl->gl_req_gh; | 942 | ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); |
| 894 | 943 | return ret; | |
| 895 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | ||
| 896 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); | ||
| 897 | gfs2_assert_warn(sdp, !ret); | ||
| 898 | |||
| 899 | state_change(gl, LM_ST_UNLOCKED); | ||
| 900 | |||
| 901 | if (glops->go_inval) | ||
| 902 | glops->go_inval(gl, DIO_METADATA); | ||
| 903 | |||
| 904 | if (gh) { | ||
| 905 | spin_lock(&gl->gl_spin); | ||
| 906 | list_del_init(&gh->gh_list); | ||
| 907 | gh->gh_error = 0; | ||
| 908 | spin_unlock(&gl->gl_spin); | ||
| 909 | } | ||
| 910 | |||
| 911 | spin_lock(&gl->gl_spin); | ||
| 912 | gfs2_demote_wake(gl); | ||
| 913 | gl->gl_req_gh = NULL; | ||
| 914 | gl->gl_req_bh = NULL; | ||
| 915 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 916 | spin_unlock(&gl->gl_spin); | ||
| 917 | |||
| 918 | gfs2_glock_put(gl); | ||
| 919 | |||
| 920 | if (gh) | ||
| 921 | gfs2_holder_wake(gh); | ||
| 922 | } | 944 | } |
| 923 | 945 | ||
| 924 | /** | 946 | /** |
| @@ -935,13 +957,14 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl) | |||
| 935 | 957 | ||
| 936 | if (glops->go_xmote_th) | 958 | if (glops->go_xmote_th) |
| 937 | glops->go_xmote_th(gl); | 959 | glops->go_xmote_th(gl); |
| 960 | if (glops->go_inval) | ||
| 961 | glops->go_inval(gl, DIO_METADATA); | ||
| 938 | 962 | ||
| 939 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); | 963 | gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); |
| 940 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); | 964 | gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); |
| 941 | gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); | 965 | gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); |
| 942 | 966 | ||
| 943 | gfs2_glock_hold(gl); | 967 | gfs2_glock_hold(gl); |
| 944 | gl->gl_req_bh = drop_bh; | ||
| 945 | 968 | ||
| 946 | ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); | 969 | ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); |
| 947 | 970 | ||
| @@ -964,16 +987,17 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl) | |||
| 964 | static void do_cancels(struct gfs2_holder *gh) | 987 | static void do_cancels(struct gfs2_holder *gh) |
| 965 | { | 988 | { |
| 966 | struct gfs2_glock *gl = gh->gh_gl; | 989 | struct gfs2_glock *gl = gh->gh_gl; |
| 990 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 967 | 991 | ||
| 968 | spin_lock(&gl->gl_spin); | 992 | spin_lock(&gl->gl_spin); |
| 969 | 993 | ||
| 970 | while (gl->gl_req_gh != gh && | 994 | while (gl->gl_req_gh != gh && |
| 971 | !test_bit(HIF_HOLDER, &gh->gh_iflags) && | 995 | !test_bit(HIF_HOLDER, &gh->gh_iflags) && |
| 972 | !list_empty(&gh->gh_list)) { | 996 | !list_empty(&gh->gh_list)) { |
| 973 | if (gl->gl_req_bh && !(gl->gl_req_gh && | 997 | if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { |
| 974 | (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { | ||
| 975 | spin_unlock(&gl->gl_spin); | 998 | spin_unlock(&gl->gl_spin); |
| 976 | gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock); | 999 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 1000 | sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); | ||
| 977 | msleep(100); | 1001 | msleep(100); |
| 978 | spin_lock(&gl->gl_spin); | 1002 | spin_lock(&gl->gl_spin); |
| 979 | } else { | 1003 | } else { |
| @@ -1041,7 +1065,6 @@ static int glock_wait_internal(struct gfs2_holder *gh) | |||
| 1041 | 1065 | ||
| 1042 | spin_lock(&gl->gl_spin); | 1066 | spin_lock(&gl->gl_spin); |
| 1043 | gl->gl_req_gh = NULL; | 1067 | gl->gl_req_gh = NULL; |
| 1044 | gl->gl_req_bh = NULL; | ||
| 1045 | clear_bit(GLF_LOCK, &gl->gl_flags); | 1068 | clear_bit(GLF_LOCK, &gl->gl_flags); |
| 1046 | run_queue(gl); | 1069 | run_queue(gl); |
| 1047 | spin_unlock(&gl->gl_spin); | 1070 | spin_unlock(&gl->gl_spin); |
| @@ -1428,6 +1451,14 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) | |||
| 1428 | gfs2_glock_dq_uninit(&ghs[x]); | 1451 | gfs2_glock_dq_uninit(&ghs[x]); |
| 1429 | } | 1452 | } |
| 1430 | 1453 | ||
| 1454 | static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) | ||
| 1455 | { | ||
| 1456 | int error = -EIO; | ||
| 1457 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 1458 | error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); | ||
| 1459 | return error; | ||
| 1460 | } | ||
| 1461 | |||
| 1431 | /** | 1462 | /** |
| 1432 | * gfs2_lvb_hold - attach a LVB from a glock | 1463 | * gfs2_lvb_hold - attach a LVB from a glock |
| 1433 | * @gl: The glock in question | 1464 | * @gl: The glock in question |
| @@ -1463,12 +1494,15 @@ int gfs2_lvb_hold(struct gfs2_glock *gl) | |||
| 1463 | 1494 | ||
| 1464 | void gfs2_lvb_unhold(struct gfs2_glock *gl) | 1495 | void gfs2_lvb_unhold(struct gfs2_glock *gl) |
| 1465 | { | 1496 | { |
| 1497 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 1498 | |||
| 1466 | gfs2_glock_hold(gl); | 1499 | gfs2_glock_hold(gl); |
| 1467 | gfs2_glmutex_lock(gl); | 1500 | gfs2_glmutex_lock(gl); |
| 1468 | 1501 | ||
| 1469 | gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); | 1502 | gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); |
| 1470 | if (atomic_dec_and_test(&gl->gl_lvb_count)) { | 1503 | if (atomic_dec_and_test(&gl->gl_lvb_count)) { |
| 1471 | gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb); | 1504 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) |
| 1505 | sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); | ||
| 1472 | gl->gl_lvb = NULL; | 1506 | gl->gl_lvb = NULL; |
| 1473 | gfs2_glock_put(gl); | 1507 | gfs2_glock_put(gl); |
| 1474 | } | 1508 | } |
| @@ -1534,8 +1568,7 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) | |||
| 1534 | gl = gfs2_glock_find(sdp, &async->lc_name); | 1568 | gl = gfs2_glock_find(sdp, &async->lc_name); |
| 1535 | if (gfs2_assert_warn(sdp, gl)) | 1569 | if (gfs2_assert_warn(sdp, gl)) |
| 1536 | return; | 1570 | return; |
| 1537 | if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) | 1571 | xmote_bh(gl, async->lc_ret); |
| 1538 | gl->gl_req_bh(gl, async->lc_ret); | ||
| 1539 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | 1572 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) |
| 1540 | gfs2_glock_put(gl); | 1573 | gfs2_glock_put(gl); |
| 1541 | up_read(&gfs2_umount_flush_sem); | 1574 | up_read(&gfs2_umount_flush_sem); |
| @@ -1594,10 +1627,10 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) | |||
| 1594 | gfs2_glock_hold(gl); | 1627 | gfs2_glock_hold(gl); |
| 1595 | list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); | 1628 | list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); |
| 1596 | atomic_inc(&sdp->sd_reclaim_count); | 1629 | atomic_inc(&sdp->sd_reclaim_count); |
| 1597 | } | 1630 | spin_unlock(&sdp->sd_reclaim_lock); |
| 1598 | spin_unlock(&sdp->sd_reclaim_lock); | 1631 | wake_up(&sdp->sd_reclaim_wq); |
| 1599 | 1632 | } else | |
| 1600 | wake_up(&sdp->sd_reclaim_wq); | 1633 | spin_unlock(&sdp->sd_reclaim_lock); |
| 1601 | } | 1634 | } |
| 1602 | 1635 | ||
| 1603 | /** | 1636 | /** |
| @@ -1897,7 +1930,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) | |||
| 1897 | print_dbg(gi, " gl_owner = -1\n"); | 1930 | print_dbg(gi, " gl_owner = -1\n"); |
| 1898 | print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip); | 1931 | print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip); |
| 1899 | print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); | 1932 | print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); |
| 1900 | print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); | ||
| 1901 | print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); | 1933 | print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); |
| 1902 | print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); | 1934 | print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); |
| 1903 | print_dbg(gi, " reclaim = %s\n", | 1935 | print_dbg(gi, " reclaim = %s\n", |
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2f9c6d136b37..cdad3e6f8150 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h | |||
| @@ -32,24 +32,23 @@ | |||
| 32 | #define GLR_TRYFAILED 13 | 32 | #define GLR_TRYFAILED 13 |
| 33 | #define GLR_CANCELED 14 | 33 | #define GLR_CANCELED 14 |
| 34 | 34 | ||
| 35 | static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) | 35 | static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) |
| 36 | { | 36 | { |
| 37 | struct gfs2_holder *gh; | 37 | struct gfs2_holder *gh; |
| 38 | int locked = 0; | ||
| 39 | struct pid *pid; | 38 | struct pid *pid; |
| 40 | 39 | ||
| 41 | /* Look in glock's list of holders for one with current task as owner */ | 40 | /* Look in glock's list of holders for one with current task as owner */ |
| 42 | spin_lock(&gl->gl_spin); | 41 | spin_lock(&gl->gl_spin); |
| 43 | pid = task_pid(current); | 42 | pid = task_pid(current); |
| 44 | list_for_each_entry(gh, &gl->gl_holders, gh_list) { | 43 | list_for_each_entry(gh, &gl->gl_holders, gh_list) { |
| 45 | if (gh->gh_owner_pid == pid) { | 44 | if (gh->gh_owner_pid == pid) |
| 46 | locked = 1; | 45 | goto out; |
| 47 | break; | ||
| 48 | } | ||
| 49 | } | 46 | } |
| 47 | gh = NULL; | ||
| 48 | out: | ||
| 50 | spin_unlock(&gl->gl_spin); | 49 | spin_unlock(&gl->gl_spin); |
| 51 | 50 | ||
| 52 | return locked; | 51 | return gh; |
| 53 | } | 52 | } |
| 54 | 53 | ||
| 55 | static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) | 54 | static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) |
| @@ -79,7 +78,6 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) | |||
| 79 | int gfs2_glock_get(struct gfs2_sbd *sdp, | 78 | int gfs2_glock_get(struct gfs2_sbd *sdp, |
| 80 | u64 number, const struct gfs2_glock_operations *glops, | 79 | u64 number, const struct gfs2_glock_operations *glops, |
| 81 | int create, struct gfs2_glock **glp); | 80 | int create, struct gfs2_glock **glp); |
| 82 | void gfs2_glock_hold(struct gfs2_glock *gl); | ||
| 83 | int gfs2_glock_put(struct gfs2_glock *gl); | 81 | int gfs2_glock_put(struct gfs2_glock *gl); |
| 84 | void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, | 82 | void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, |
| 85 | struct gfs2_holder *gh); | 83 | struct gfs2_holder *gh); |
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c663b7a0f410..d31badadef8f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * This copyrighted material is made available to anyone wishing to use, | 5 | * This copyrighted material is made available to anyone wishing to use, |
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | 6 | * modify, copy, or redistribute it subject to the terms and conditions |
| @@ -126,7 +126,13 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags) | |||
| 126 | return; | 126 | return; |
| 127 | 127 | ||
| 128 | gfs2_meta_inval(gl); | 128 | gfs2_meta_inval(gl); |
| 129 | gl->gl_vn++; | 129 | if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex)) |
| 130 | gl->gl_sbd->sd_rindex_uptodate = 0; | ||
| 131 | else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) { | ||
| 132 | struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; | ||
| 133 | |||
| 134 | rgd->rd_flags &= ~GFS2_RDF_UPTODATE; | ||
| 135 | } | ||
| 130 | } | 136 | } |
| 131 | 137 | ||
| 132 | /** | 138 | /** |
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 525dcae352d6..9c2c0b90b22a 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 3 | * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * This copyrighted material is made available to anyone wishing to use, | 5 | * This copyrighted material is made available to anyone wishing to use, |
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | 6 | * modify, copy, or redistribute it subject to the terms and conditions |
| @@ -44,7 +44,6 @@ struct gfs2_log_header_host { | |||
| 44 | 44 | ||
| 45 | struct gfs2_log_operations { | 45 | struct gfs2_log_operations { |
| 46 | void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); | 46 | void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); |
| 47 | void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); | ||
| 48 | void (*lo_before_commit) (struct gfs2_sbd *sdp); | 47 | void (*lo_before_commit) (struct gfs2_sbd *sdp); |
| 49 | void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); | 48 | void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); |
| 50 | void (*lo_before_scan) (struct gfs2_jdesc *jd, | 49 | void (*lo_before_scan) (struct gfs2_jdesc *jd, |
| @@ -70,7 +69,6 @@ struct gfs2_bitmap { | |||
| 70 | }; | 69 | }; |
| 71 | 70 | ||
| 72 | struct gfs2_rgrp_host { | 71 | struct gfs2_rgrp_host { |
| 73 | u32 rg_flags; | ||
| 74 | u32 rg_free; | 72 | u32 rg_free; |
| 75 | u32 rg_dinodes; | 73 | u32 rg_dinodes; |
| 76 | u64 rg_igeneration; | 74 | u64 rg_igeneration; |
| @@ -87,17 +85,17 @@ struct gfs2_rgrpd { | |||
| 87 | u32 rd_data; /* num of data blocks in rgrp */ | 85 | u32 rd_data; /* num of data blocks in rgrp */ |
| 88 | u32 rd_bitbytes; /* number of bytes in data bitmaps */ | 86 | u32 rd_bitbytes; /* number of bytes in data bitmaps */ |
| 89 | struct gfs2_rgrp_host rd_rg; | 87 | struct gfs2_rgrp_host rd_rg; |
| 90 | u64 rd_rg_vn; | ||
| 91 | struct gfs2_bitmap *rd_bits; | 88 | struct gfs2_bitmap *rd_bits; |
| 92 | unsigned int rd_bh_count; | 89 | unsigned int rd_bh_count; |
| 93 | struct mutex rd_mutex; | 90 | struct mutex rd_mutex; |
| 94 | u32 rd_free_clone; | 91 | u32 rd_free_clone; |
| 95 | struct gfs2_log_element rd_le; | 92 | struct gfs2_log_element rd_le; |
| 96 | u32 rd_last_alloc_data; | 93 | u32 rd_last_alloc; |
| 97 | u32 rd_last_alloc_meta; | ||
| 98 | struct gfs2_sbd *rd_sbd; | 94 | struct gfs2_sbd *rd_sbd; |
| 99 | unsigned long rd_flags; | 95 | unsigned char rd_flags; |
| 100 | #define GFS2_RDF_CHECK 0x0001 /* Need to check for unlinked inodes */ | 96 | #define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ |
| 97 | #define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ | ||
| 98 | #define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */ | ||
| 101 | }; | 99 | }; |
| 102 | 100 | ||
| 103 | enum gfs2_state_bits { | 101 | enum gfs2_state_bits { |
| @@ -168,6 +166,8 @@ enum { | |||
| 168 | GLF_DIRTY = 5, | 166 | GLF_DIRTY = 5, |
| 169 | GLF_DEMOTE_IN_PROGRESS = 6, | 167 | GLF_DEMOTE_IN_PROGRESS = 6, |
| 170 | GLF_LFLUSH = 7, | 168 | GLF_LFLUSH = 7, |
| 169 | GLF_WAITERS2 = 8, | ||
| 170 | GLF_CONV_DEADLK = 9, | ||
| 171 | }; | 171 | }; |
| 172 | 172 | ||
| 173 | struct gfs2_glock { | 173 | struct gfs2_glock { |
| @@ -187,18 +187,15 @@ struct gfs2_glock { | |||
| 187 | struct list_head gl_holders; | 187 | struct list_head gl_holders; |
| 188 | struct list_head gl_waiters1; /* HIF_MUTEX */ | 188 | struct list_head gl_waiters1; /* HIF_MUTEX */ |
| 189 | struct list_head gl_waiters3; /* HIF_PROMOTE */ | 189 | struct list_head gl_waiters3; /* HIF_PROMOTE */ |
| 190 | int gl_waiters2; /* GIF_DEMOTE */ | ||
| 191 | 190 | ||
| 192 | const struct gfs2_glock_operations *gl_ops; | 191 | const struct gfs2_glock_operations *gl_ops; |
| 193 | 192 | ||
| 194 | struct gfs2_holder *gl_req_gh; | 193 | struct gfs2_holder *gl_req_gh; |
| 195 | gfs2_glop_bh_t gl_req_bh; | ||
| 196 | 194 | ||
| 197 | void *gl_lock; | 195 | void *gl_lock; |
| 198 | char *gl_lvb; | 196 | char *gl_lvb; |
| 199 | atomic_t gl_lvb_count; | 197 | atomic_t gl_lvb_count; |
| 200 | 198 | ||
| 201 | u64 gl_vn; | ||
| 202 | unsigned long gl_stamp; | 199 | unsigned long gl_stamp; |
| 203 | unsigned long gl_tchange; | 200 | unsigned long gl_tchange; |
| 204 | void *gl_object; | 201 | void *gl_object; |
| @@ -213,6 +210,8 @@ struct gfs2_glock { | |||
| 213 | struct delayed_work gl_work; | 210 | struct delayed_work gl_work; |
| 214 | }; | 211 | }; |
| 215 | 212 | ||
| 213 | #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ | ||
| 214 | |||
| 216 | struct gfs2_alloc { | 215 | struct gfs2_alloc { |
| 217 | /* Quota stuff */ | 216 | /* Quota stuff */ |
| 218 | 217 | ||
| @@ -241,14 +240,9 @@ enum { | |||
| 241 | 240 | ||
| 242 | struct gfs2_dinode_host { | 241 | struct gfs2_dinode_host { |
| 243 | u64 di_size; /* number of bytes in file */ | 242 | u64 di_size; /* number of bytes in file */ |
| 244 | u64 di_blocks; /* number of blocks in file */ | ||
| 245 | u64 di_goal_meta; /* rgrp to alloc from next */ | ||
| 246 | u64 di_goal_data; /* data block goal */ | ||
| 247 | u64 di_generation; /* generation number for NFS */ | 243 | u64 di_generation; /* generation number for NFS */ |
| 248 | u32 di_flags; /* GFS2_DIF_... */ | 244 | u32 di_flags; /* GFS2_DIF_... */ |
| 249 | u16 di_height; /* height of metadata */ | ||
| 250 | /* These only apply to directories */ | 245 | /* These only apply to directories */ |
| 251 | u16 di_depth; /* Number of bits in the table */ | ||
| 252 | u32 di_entries; /* The number of entries in the directory */ | 246 | u32 di_entries; /* The number of entries in the directory */ |
| 253 | u64 di_eattr; /* extended attribute block number */ | 247 | u64 di_eattr; /* extended attribute block number */ |
| 254 | }; | 248 | }; |
| @@ -265,9 +259,10 @@ struct gfs2_inode { | |||
| 265 | struct gfs2_holder i_iopen_gh; | 259 | struct gfs2_holder i_iopen_gh; |
| 266 | struct gfs2_holder i_gh; /* for prepare/commit_write only */ | 260 | struct gfs2_holder i_gh; /* for prepare/commit_write only */ |
| 267 | struct gfs2_alloc *i_alloc; | 261 | struct gfs2_alloc *i_alloc; |
| 268 | u64 i_last_rg_alloc; | 262 | u64 i_goal; /* goal block for allocations */ |
| 269 | |||
| 270 | struct rw_semaphore i_rw_mutex; | 263 | struct rw_semaphore i_rw_mutex; |
| 264 | u8 i_height; | ||
| 265 | u8 i_depth; | ||
| 271 | }; | 266 | }; |
| 272 | 267 | ||
| 273 | /* | 268 | /* |
| @@ -490,9 +485,9 @@ struct gfs2_sbd { | |||
| 490 | u32 sd_qc_per_block; | 485 | u32 sd_qc_per_block; |
| 491 | u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ | 486 | u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ |
| 492 | u32 sd_max_height; /* Max height of a file's metadata tree */ | 487 | u32 sd_max_height; /* Max height of a file's metadata tree */ |
| 493 | u64 sd_heightsize[GFS2_MAX_META_HEIGHT]; | 488 | u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; |
| 494 | u32 sd_max_jheight; /* Max height of journaled file's meta tree */ | 489 | u32 sd_max_jheight; /* Max height of journaled file's meta tree */ |
| 495 | u64 sd_jheightsize[GFS2_MAX_META_HEIGHT]; | 490 | u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; |
| 496 | 491 | ||
| 497 | struct gfs2_args sd_args; /* Mount arguments */ | 492 | struct gfs2_args sd_args; /* Mount arguments */ |
| 498 | struct gfs2_tune sd_tune; /* Filesystem tuning structure */ | 493 | struct gfs2_tune sd_tune; /* Filesystem tuning structure */ |
| @@ -533,7 +528,7 @@ struct gfs2_sbd { | |||
| 533 | 528 | ||
| 534 | /* Resource group stuff */ | 529 | /* Resource group stuff */ |
| 535 | 530 | ||
| 536 | u64 sd_rindex_vn; | 531 | int sd_rindex_uptodate; |
| 537 | spinlock_t sd_rindex_spin; | 532 | spinlock_t sd_rindex_spin; |
| 538 | struct mutex sd_rindex_mutex; | 533 | struct mutex sd_rindex_mutex; |
| 539 | struct list_head sd_rindex_list; | 534 | struct list_head sd_rindex_list; |
| @@ -637,9 +632,6 @@ struct gfs2_sbd { | |||
| 637 | 632 | ||
| 638 | /* Counters */ | 633 | /* Counters */ |
| 639 | 634 | ||
| 640 | atomic_t sd_glock_count; | ||
| 641 | atomic_t sd_glock_held_count; | ||
| 642 | atomic_t sd_inode_count; | ||
| 643 | atomic_t sd_reclaimed; | 635 | atomic_t sd_reclaimed; |
| 644 | 636 | ||
| 645 | char sd_fsname[GFS2_FSNAME_LEN]; | 637 | char sd_fsname[GFS2_FSNAME_LEN]; |
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 37725ade3c51..3a9ef526c308 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * This copyrighted material is made available to anyone wishing to use, | 5 | * This copyrighted material is made available to anyone wishing to use, |
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | 6 | * modify, copy, or redistribute it subject to the terms and conditions |
| @@ -149,7 +149,8 @@ void gfs2_set_iop(struct inode *inode) | |||
| 149 | } else if (S_ISLNK(mode)) { | 149 | } else if (S_ISLNK(mode)) { |
| 150 | inode->i_op = &gfs2_symlink_iops; | 150 | inode->i_op = &gfs2_symlink_iops; |
| 151 | } else { | 151 | } else { |
| 152 | inode->i_op = &gfs2_dev_iops; | 152 | inode->i_op = &gfs2_file_iops; |
| 153 | init_special_inode(inode, inode->i_mode, inode->i_rdev); | ||
| 153 | } | 154 | } |
| 154 | 155 | ||
| 155 | unlock_new_inode(inode); | 156 | unlock_new_inode(inode); |
| @@ -248,12 +249,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) | |||
| 248 | { | 249 | { |
| 249 | struct gfs2_dinode_host *di = &ip->i_di; | 250 | struct gfs2_dinode_host *di = &ip->i_di; |
| 250 | const struct gfs2_dinode *str = buf; | 251 | const struct gfs2_dinode *str = buf; |
| 252 | u16 height, depth; | ||
| 251 | 253 | ||
| 252 | if (ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)) { | 254 | if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) |
| 253 | if (gfs2_consist_inode(ip)) | 255 | goto corrupt; |
| 254 | gfs2_dinode_print(ip); | ||
| 255 | return -EIO; | ||
| 256 | } | ||
| 257 | ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); | 256 | ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); |
| 258 | ip->i_inode.i_mode = be32_to_cpu(str->di_mode); | 257 | ip->i_inode.i_mode = be32_to_cpu(str->di_mode); |
| 259 | ip->i_inode.i_rdev = 0; | 258 | ip->i_inode.i_rdev = 0; |
| @@ -275,8 +274,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) | |||
| 275 | ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); | 274 | ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); |
| 276 | di->di_size = be64_to_cpu(str->di_size); | 275 | di->di_size = be64_to_cpu(str->di_size); |
| 277 | i_size_write(&ip->i_inode, di->di_size); | 276 | i_size_write(&ip->i_inode, di->di_size); |
| 278 | di->di_blocks = be64_to_cpu(str->di_blocks); | 277 | gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); |
| 279 | gfs2_set_inode_blocks(&ip->i_inode); | ||
| 280 | ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); | 278 | ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); |
| 281 | ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); | 279 | ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); |
| 282 | ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); | 280 | ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); |
| @@ -284,15 +282,20 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) | |||
| 284 | ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); | 282 | ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); |
| 285 | ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); | 283 | ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); |
| 286 | 284 | ||
| 287 | di->di_goal_meta = be64_to_cpu(str->di_goal_meta); | 285 | ip->i_goal = be64_to_cpu(str->di_goal_meta); |
| 288 | di->di_goal_data = be64_to_cpu(str->di_goal_data); | ||
| 289 | di->di_generation = be64_to_cpu(str->di_generation); | 286 | di->di_generation = be64_to_cpu(str->di_generation); |
| 290 | 287 | ||
| 291 | di->di_flags = be32_to_cpu(str->di_flags); | 288 | di->di_flags = be32_to_cpu(str->di_flags); |
| 292 | gfs2_set_inode_flags(&ip->i_inode); | 289 | gfs2_set_inode_flags(&ip->i_inode); |
| 293 | di->di_height = be16_to_cpu(str->di_height); | 290 | height = be16_to_cpu(str->di_height); |
| 294 | 291 | if (unlikely(height > GFS2_MAX_META_HEIGHT)) | |
| 295 | di->di_depth = be16_to_cpu(str->di_depth); | 292 | goto corrupt; |
| 293 | ip->i_height = (u8)height; | ||
| 294 | |||
| 295 | depth = be16_to_cpu(str->di_depth); | ||
| 296 | if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) | ||
| 297 | goto corrupt; | ||
| 298 | ip->i_depth = (u8)depth; | ||
| 296 | di->di_entries = be32_to_cpu(str->di_entries); | 299 | di->di_entries = be32_to_cpu(str->di_entries); |
| 297 | 300 | ||
| 298 | di->di_eattr = be64_to_cpu(str->di_eattr); | 301 | di->di_eattr = be64_to_cpu(str->di_eattr); |
| @@ -300,6 +303,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) | |||
| 300 | gfs2_set_aops(&ip->i_inode); | 303 | gfs2_set_aops(&ip->i_inode); |
| 301 | 304 | ||
| 302 | return 0; | 305 | return 0; |
| 306 | corrupt: | ||
| 307 | if (gfs2_consist_inode(ip)) | ||
| 308 | gfs2_dinode_print(ip); | ||
| 309 | return -EIO; | ||
| 303 | } | 310 | } |
| 304 | 311 | ||
| 305 | /** | 312 | /** |
| @@ -337,13 +344,15 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip) | |||
| 337 | struct gfs2_rgrpd *rgd; | 344 | struct gfs2_rgrpd *rgd; |
| 338 | int error; | 345 | int error; |
| 339 | 346 | ||
| 340 | if (ip->i_di.di_blocks != 1) { | 347 | if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { |
| 341 | if (gfs2_consist_inode(ip)) | 348 | if (gfs2_consist_inode(ip)) |
| 342 | gfs2_dinode_print(ip); | 349 | gfs2_dinode_print(ip); |
| 343 | return -EIO; | 350 | return -EIO; |
| 344 | } | 351 | } |
| 345 | 352 | ||
| 346 | al = gfs2_alloc_get(ip); | 353 | al = gfs2_alloc_get(ip); |
| 354 | if (!al) | ||
| 355 | return -ENOMEM; | ||
| 347 | 356 | ||
| 348 | error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 357 | error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); |
| 349 | if (error) | 358 | if (error) |
| @@ -487,7 +496,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, | |||
| 487 | return dir; | 496 | return dir; |
| 488 | } | 497 | } |
| 489 | 498 | ||
| 490 | if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) { | 499 | if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) { |
| 491 | error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); | 500 | error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); |
| 492 | if (error) | 501 | if (error) |
| 493 | return ERR_PTR(error); | 502 | return ERR_PTR(error); |
| @@ -818,7 +827,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, | |||
| 818 | int error; | 827 | int error; |
| 819 | 828 | ||
| 820 | munge_mode_uid_gid(dip, &mode, &uid, &gid); | 829 | munge_mode_uid_gid(dip, &mode, &uid, &gid); |
| 821 | gfs2_alloc_get(dip); | 830 | if (!gfs2_alloc_get(dip)) |
| 831 | return -ENOMEM; | ||
| 822 | 832 | ||
| 823 | error = gfs2_quota_lock(dip, uid, gid); | 833 | error = gfs2_quota_lock(dip, uid, gid); |
| 824 | if (error) | 834 | if (error) |
| @@ -853,6 +863,8 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, | |||
| 853 | int error; | 863 | int error; |
| 854 | 864 | ||
| 855 | al = gfs2_alloc_get(dip); | 865 | al = gfs2_alloc_get(dip); |
| 866 | if (!al) | ||
| 867 | return -ENOMEM; | ||
| 856 | 868 | ||
| 857 | error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 869 | error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); |
| 858 | if (error) | 870 | if (error) |
| @@ -1219,7 +1231,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) | |||
| 1219 | 1231 | ||
| 1220 | x = ip->i_di.di_size + 1; | 1232 | x = ip->i_di.di_size + 1; |
| 1221 | if (x > *len) { | 1233 | if (x > *len) { |
| 1222 | *buf = kmalloc(x, GFP_KERNEL); | 1234 | *buf = kmalloc(x, GFP_NOFS); |
| 1223 | if (!*buf) { | 1235 | if (!*buf) { |
| 1224 | error = -ENOMEM; | 1236 | error = -ENOMEM; |
| 1225 | goto out_brelse; | 1237 | goto out_brelse; |
| @@ -1391,21 +1403,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) | |||
| 1391 | str->di_gid = cpu_to_be32(ip->i_inode.i_gid); | 1403 | str->di_gid = cpu_to_be32(ip->i_inode.i_gid); |
| 1392 | str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); | 1404 | str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); |
| 1393 | str->di_size = cpu_to_be64(di->di_size); | 1405 | str->di_size = cpu_to_be64(di->di_size); |
| 1394 | str->di_blocks = cpu_to_be64(di->di_blocks); | 1406 | str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); |
| 1395 | str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); | 1407 | str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); |
| 1396 | str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); | 1408 | str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); |
| 1397 | str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); | 1409 | str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); |
| 1398 | 1410 | ||
| 1399 | str->di_goal_meta = cpu_to_be64(di->di_goal_meta); | 1411 | str->di_goal_meta = cpu_to_be64(ip->i_goal); |
| 1400 | str->di_goal_data = cpu_to_be64(di->di_goal_data); | 1412 | str->di_goal_data = cpu_to_be64(ip->i_goal); |
| 1401 | str->di_generation = cpu_to_be64(di->di_generation); | 1413 | str->di_generation = cpu_to_be64(di->di_generation); |
| 1402 | 1414 | ||
| 1403 | str->di_flags = cpu_to_be32(di->di_flags); | 1415 | str->di_flags = cpu_to_be32(di->di_flags); |
| 1404 | str->di_height = cpu_to_be16(di->di_height); | 1416 | str->di_height = cpu_to_be16(ip->i_height); |
| 1405 | str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && | 1417 | str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && |
| 1406 | !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? | 1418 | !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? |
| 1407 | GFS2_FORMAT_DE : 0); | 1419 | GFS2_FORMAT_DE : 0); |
| 1408 | str->di_depth = cpu_to_be16(di->di_depth); | 1420 | str->di_depth = cpu_to_be16(ip->i_depth); |
| 1409 | str->di_entries = cpu_to_be32(di->di_entries); | 1421 | str->di_entries = cpu_to_be32(di->di_entries); |
| 1410 | 1422 | ||
| 1411 | str->di_eattr = cpu_to_be64(di->di_eattr); | 1423 | str->di_eattr = cpu_to_be64(di->di_eattr); |
| @@ -1423,15 +1435,13 @@ void gfs2_dinode_print(const struct gfs2_inode *ip) | |||
| 1423 | printk(KERN_INFO " no_addr = %llu\n", | 1435 | printk(KERN_INFO " no_addr = %llu\n", |
| 1424 | (unsigned long long)ip->i_no_addr); | 1436 | (unsigned long long)ip->i_no_addr); |
| 1425 | printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); | 1437 | printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); |
| 1426 | printk(KERN_INFO " di_blocks = %llu\n", | 1438 | printk(KERN_INFO " blocks = %llu\n", |
| 1427 | (unsigned long long)di->di_blocks); | 1439 | (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); |
| 1428 | printk(KERN_INFO " di_goal_meta = %llu\n", | 1440 | printk(KERN_INFO " i_goal = %llu\n", |
| 1429 | (unsigned long long)di->di_goal_meta); | 1441 | (unsigned long long)ip->i_goal); |
| 1430 | printk(KERN_INFO " di_goal_data = %llu\n", | ||
| 1431 | (unsigned long long)di->di_goal_data); | ||
| 1432 | printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); | 1442 | printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); |
| 1433 | printk(KERN_INFO " di_height = %u\n", di->di_height); | 1443 | printk(KERN_INFO " i_height = %u\n", ip->i_height); |
| 1434 | printk(KERN_INFO " di_depth = %u\n", di->di_depth); | 1444 | printk(KERN_INFO " i_depth = %u\n", ip->i_depth); |
| 1435 | printk(KERN_INFO " di_entries = %u\n", di->di_entries); | 1445 | printk(KERN_INFO " di_entries = %u\n", di->di_entries); |
| 1436 | printk(KERN_INFO " di_eattr = %llu\n", | 1446 | printk(KERN_INFO " di_eattr = %llu\n", |
| 1437 | (unsigned long long)di->di_eattr); | 1447 | (unsigned long long)di->di_eattr); |
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index d44650662615..580da454b38f 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h | |||
| @@ -10,9 +10,11 @@ | |||
| 10 | #ifndef __INODE_DOT_H__ | 10 | #ifndef __INODE_DOT_H__ |
| 11 | #define __INODE_DOT_H__ | 11 | #define __INODE_DOT_H__ |
| 12 | 12 | ||
| 13 | #include "util.h" | ||
| 14 | |||
| 13 | static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) | 15 | static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) |
| 14 | { | 16 | { |
| 15 | return !ip->i_di.di_height; | 17 | return !ip->i_height; |
| 16 | } | 18 | } |
| 17 | 19 | ||
| 18 | static inline int gfs2_is_jdata(const struct gfs2_inode *ip) | 20 | static inline int gfs2_is_jdata(const struct gfs2_inode *ip) |
| @@ -37,13 +39,25 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip) | |||
| 37 | return S_ISDIR(ip->i_inode.i_mode); | 39 | return S_ISDIR(ip->i_inode.i_mode); |
| 38 | } | 40 | } |
| 39 | 41 | ||
| 40 | static inline void gfs2_set_inode_blocks(struct inode *inode) | 42 | static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) |
| 43 | { | ||
| 44 | inode->i_blocks = blocks << | ||
| 45 | (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); | ||
| 46 | } | ||
| 47 | |||
| 48 | static inline u64 gfs2_get_inode_blocks(const struct inode *inode) | ||
| 41 | { | 49 | { |
| 42 | struct gfs2_inode *ip = GFS2_I(inode); | 50 | return inode->i_blocks >> |
| 43 | inode->i_blocks = ip->i_di.di_blocks << | ||
| 44 | (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); | 51 | (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); |
| 45 | } | 52 | } |
| 46 | 53 | ||
| 54 | static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) | ||
| 55 | { | ||
| 56 | gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change)); | ||
| 57 | change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK); | ||
| 58 | inode->i_blocks += change; | ||
| 59 | } | ||
| 60 | |||
| 47 | static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, | 61 | static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, |
| 48 | u64 no_formal_ino) | 62 | u64 no_formal_ino) |
| 49 | { | 63 | { |
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c deleted file mode 100644 index cfcc39b86a53..000000000000 --- a/fs/gfs2/lm.c +++ /dev/null | |||
| @@ -1,210 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
| 3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
| 4 | * | ||
| 5 | * This copyrighted material is made available to anyone wishing to use, | ||
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
| 7 | * of the GNU General Public License version 2. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/slab.h> | ||
| 11 | #include <linux/spinlock.h> | ||
| 12 | #include <linux/completion.h> | ||
| 13 | #include <linux/buffer_head.h> | ||
| 14 | #include <linux/delay.h> | ||
| 15 | #include <linux/gfs2_ondisk.h> | ||
| 16 | #include <linux/lm_interface.h> | ||
| 17 | |||
| 18 | #include "gfs2.h" | ||
| 19 | #include "incore.h" | ||
| 20 | #include "glock.h" | ||
| 21 | #include "lm.h" | ||
| 22 | #include "super.h" | ||
| 23 | #include "util.h" | ||
| 24 | |||
| 25 | /** | ||
| 26 | * gfs2_lm_mount - mount a locking protocol | ||
| 27 | * @sdp: the filesystem | ||
| 28 | * @args: mount arguements | ||
| 29 | * @silent: if 1, don't complain if the FS isn't a GFS2 fs | ||
| 30 | * | ||
| 31 | * Returns: errno | ||
| 32 | */ | ||
| 33 | |||
| 34 | int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) | ||
| 35 | { | ||
| 36 | char *proto = sdp->sd_proto_name; | ||
| 37 | char *table = sdp->sd_table_name; | ||
| 38 | int flags = 0; | ||
| 39 | int error; | ||
| 40 | |||
| 41 | if (sdp->sd_args.ar_spectator) | ||
| 42 | flags |= LM_MFLAG_SPECTATOR; | ||
| 43 | |||
| 44 | fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); | ||
| 45 | |||
| 46 | error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, | ||
| 47 | gfs2_glock_cb, sdp, | ||
| 48 | GFS2_MIN_LVB_SIZE, flags, | ||
| 49 | &sdp->sd_lockstruct, &sdp->sd_kobj); | ||
| 50 | if (error) { | ||
| 51 | fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n", | ||
| 52 | proto, table, sdp->sd_args.ar_hostdata); | ||
| 53 | goto out; | ||
| 54 | } | ||
| 55 | |||
| 56 | if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || | ||
| 57 | gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || | ||
| 58 | gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= | ||
| 59 | GFS2_MIN_LVB_SIZE)) { | ||
| 60 | gfs2_unmount_lockproto(&sdp->sd_lockstruct); | ||
| 61 | goto out; | ||
| 62 | } | ||
| 63 | |||
| 64 | if (sdp->sd_args.ar_spectator) | ||
| 65 | snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); | ||
| 66 | else | ||
| 67 | snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, | ||
| 68 | sdp->sd_lockstruct.ls_jid); | ||
| 69 | |||
| 70 | fs_info(sdp, "Joined cluster. Now mounting FS...\n"); | ||
| 71 | |||
| 72 | if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && | ||
| 73 | !sdp->sd_args.ar_ignore_local_fs) { | ||
| 74 | sdp->sd_args.ar_localflocks = 1; | ||
| 75 | sdp->sd_args.ar_localcaching = 1; | ||
| 76 | } | ||
| 77 | |||
| 78 | out: | ||
| 79 | return error; | ||
| 80 | } | ||
| 81 | |||
| 82 | void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) | ||
| 83 | { | ||
| 84 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 85 | sdp->sd_lockstruct.ls_ops->lm_others_may_mount( | ||
| 86 | sdp->sd_lockstruct.ls_lockspace); | ||
| 87 | } | ||
| 88 | |||
| 89 | void gfs2_lm_unmount(struct gfs2_sbd *sdp) | ||
| 90 | { | ||
| 91 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 92 | gfs2_unmount_lockproto(&sdp->sd_lockstruct); | ||
| 93 | } | ||
| 94 | |||
| 95 | int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) | ||
| 96 | { | ||
| 97 | va_list args; | ||
| 98 | |||
| 99 | if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
| 100 | return 0; | ||
| 101 | |||
| 102 | va_start(args, fmt); | ||
| 103 | vprintk(fmt, args); | ||
| 104 | va_end(args); | ||
| 105 | |||
| 106 | fs_err(sdp, "about to withdraw this file system\n"); | ||
| 107 | BUG_ON(sdp->sd_args.ar_debug); | ||
| 108 | |||
| 109 | fs_err(sdp, "telling LM to withdraw\n"); | ||
| 110 | gfs2_withdraw_lockproto(&sdp->sd_lockstruct); | ||
| 111 | fs_err(sdp, "withdrawn\n"); | ||
| 112 | dump_stack(); | ||
| 113 | |||
| 114 | return -1; | ||
| 115 | } | ||
| 116 | |||
| 117 | int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 118 | void **lockp) | ||
| 119 | { | ||
| 120 | int error = -EIO; | ||
| 121 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 122 | error = sdp->sd_lockstruct.ls_ops->lm_get_lock( | ||
| 123 | sdp->sd_lockstruct.ls_lockspace, name, lockp); | ||
| 124 | return error; | ||
| 125 | } | ||
| 126 | |||
| 127 | void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock) | ||
| 128 | { | ||
| 129 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 130 | sdp->sd_lockstruct.ls_ops->lm_put_lock(lock); | ||
| 131 | } | ||
| 132 | |||
| 133 | unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, | ||
| 134 | unsigned int cur_state, unsigned int req_state, | ||
| 135 | unsigned int flags) | ||
| 136 | { | ||
| 137 | int ret = 0; | ||
| 138 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 139 | ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, | ||
| 140 | req_state, flags); | ||
| 141 | return ret; | ||
| 142 | } | ||
| 143 | |||
| 144 | unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, | ||
| 145 | unsigned int cur_state) | ||
| 146 | { | ||
| 147 | int ret = 0; | ||
| 148 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 149 | ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); | ||
| 150 | return ret; | ||
| 151 | } | ||
| 152 | |||
| 153 | void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock) | ||
| 154 | { | ||
| 155 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 156 | sdp->sd_lockstruct.ls_ops->lm_cancel(lock); | ||
| 157 | } | ||
| 158 | |||
| 159 | int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) | ||
| 160 | { | ||
| 161 | int error = -EIO; | ||
| 162 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 163 | error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); | ||
| 164 | return error; | ||
| 165 | } | ||
| 166 | |||
| 167 | void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb) | ||
| 168 | { | ||
| 169 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 170 | sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb); | ||
| 171 | } | ||
| 172 | |||
| 173 | int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 174 | struct file *file, struct file_lock *fl) | ||
| 175 | { | ||
| 176 | int error = -EIO; | ||
| 177 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 178 | error = sdp->sd_lockstruct.ls_ops->lm_plock_get( | ||
| 179 | sdp->sd_lockstruct.ls_lockspace, name, file, fl); | ||
| 180 | return error; | ||
| 181 | } | ||
| 182 | |||
| 183 | int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 184 | struct file *file, int cmd, struct file_lock *fl) | ||
| 185 | { | ||
| 186 | int error = -EIO; | ||
| 187 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 188 | error = sdp->sd_lockstruct.ls_ops->lm_plock( | ||
| 189 | sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl); | ||
| 190 | return error; | ||
| 191 | } | ||
| 192 | |||
| 193 | int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 194 | struct file *file, struct file_lock *fl) | ||
| 195 | { | ||
| 196 | int error = -EIO; | ||
| 197 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 198 | error = sdp->sd_lockstruct.ls_ops->lm_punlock( | ||
| 199 | sdp->sd_lockstruct.ls_lockspace, name, file, fl); | ||
| 200 | return error; | ||
| 201 | } | ||
| 202 | |||
| 203 | void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, | ||
| 204 | unsigned int message) | ||
| 205 | { | ||
| 206 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 207 | sdp->sd_lockstruct.ls_ops->lm_recovery_done( | ||
| 208 | sdp->sd_lockstruct.ls_lockspace, jid, message); | ||
| 209 | } | ||
| 210 | |||
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h deleted file mode 100644 index 21cdc30ee08c..000000000000 --- a/fs/gfs2/lm.h +++ /dev/null | |||
| @@ -1,42 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
| 3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
| 4 | * | ||
| 5 | * This copyrighted material is made available to anyone wishing to use, | ||
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
| 7 | * of the GNU General Public License version 2. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #ifndef __LM_DOT_H__ | ||
| 11 | #define __LM_DOT_H__ | ||
| 12 | |||
| 13 | struct gfs2_sbd; | ||
| 14 | |||
| 15 | #define GFS2_MIN_LVB_SIZE 32 | ||
| 16 | |||
| 17 | int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent); | ||
| 18 | void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp); | ||
| 19 | void gfs2_lm_unmount(struct gfs2_sbd *sdp); | ||
| 20 | int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) | ||
| 21 | __attribute__ ((format(printf, 2, 3))); | ||
| 22 | int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 23 | void **lockp); | ||
| 24 | void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock); | ||
| 25 | unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, | ||
| 26 | unsigned int cur_state, unsigned int req_state, | ||
| 27 | unsigned int flags); | ||
| 28 | unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, | ||
| 29 | unsigned int cur_state); | ||
| 30 | void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock); | ||
| 31 | int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp); | ||
| 32 | void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb); | ||
| 33 | int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 34 | struct file *file, struct file_lock *fl); | ||
| 35 | int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 36 | struct file *file, int cmd, struct file_lock *fl); | ||
| 37 | int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 38 | struct file *file, struct file_lock *fl); | ||
| 39 | void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, | ||
| 40 | unsigned int message); | ||
| 41 | |||
| 42 | #endif /* __LM_DOT_H__ */ | ||
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index 542a797ac89a..cf7ea8abec87 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c | |||
| @@ -137,7 +137,8 @@ static inline unsigned int make_flags(struct gdlm_lock *lp, | |||
| 137 | 137 | ||
| 138 | /* Conversion deadlock avoidance by DLM */ | 138 | /* Conversion deadlock avoidance by DLM */ |
| 139 | 139 | ||
| 140 | if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) && | 140 | if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) && |
| 141 | !test_bit(LFL_FORCE_PROMOTE, &lp->flags) && | ||
| 141 | !(lkf & DLM_LKF_NOQUEUE) && | 142 | !(lkf & DLM_LKF_NOQUEUE) && |
| 142 | cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) | 143 | cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) |
| 143 | lkf |= DLM_LKF_CONVDEADLK; | 144 | lkf |= DLM_LKF_CONVDEADLK; |
| @@ -164,7 +165,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, | |||
| 164 | { | 165 | { |
| 165 | struct gdlm_lock *lp; | 166 | struct gdlm_lock *lp; |
| 166 | 167 | ||
| 167 | lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL); | 168 | lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS); |
| 168 | if (!lp) | 169 | if (!lp) |
| 169 | return -ENOMEM; | 170 | return -ENOMEM; |
| 170 | 171 | ||
| @@ -382,7 +383,7 @@ static int gdlm_add_lvb(struct gdlm_lock *lp) | |||
| 382 | { | 383 | { |
| 383 | char *lvb; | 384 | char *lvb; |
| 384 | 385 | ||
| 385 | lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL); | 386 | lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); |
| 386 | if (!lvb) | 387 | if (!lvb) |
| 387 | return -ENOMEM; | 388 | return -ENOMEM; |
| 388 | 389 | ||
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index 9e8265d28377..58fcf8c5bf39 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h | |||
| @@ -183,5 +183,10 @@ int gdlm_plock_get(void *, struct lm_lockname *, struct file *, | |||
| 183 | struct file_lock *); | 183 | struct file_lock *); |
| 184 | int gdlm_punlock(void *, struct lm_lockname *, struct file *, | 184 | int gdlm_punlock(void *, struct lm_lockname *, struct file *, |
| 185 | struct file_lock *); | 185 | struct file_lock *); |
| 186 | |||
| 187 | /* mount.c */ | ||
| 188 | |||
| 189 | extern const struct lm_lockops gdlm_ops; | ||
| 190 | |||
| 186 | #endif | 191 | #endif |
| 187 | 192 | ||
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c index a0e7eda643ed..36a225850bd8 100644 --- a/fs/gfs2/locking/dlm/main.c +++ b/fs/gfs2/locking/dlm/main.c | |||
| @@ -11,8 +11,6 @@ | |||
| 11 | 11 | ||
| 12 | #include "lock_dlm.h" | 12 | #include "lock_dlm.h" |
| 13 | 13 | ||
| 14 | extern struct lm_lockops gdlm_ops; | ||
| 15 | |||
| 16 | static int __init init_lock_dlm(void) | 14 | static int __init init_lock_dlm(void) |
| 17 | { | 15 | { |
| 18 | int error; | 16 | int error; |
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index a87b09839761..8479da47049c 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c | |||
| @@ -12,8 +12,6 @@ | |||
| 12 | 12 | ||
| 13 | #include "lock_dlm.h" | 13 | #include "lock_dlm.h" |
| 14 | 14 | ||
| 15 | extern struct lm_lockops gdlm_ops; | ||
| 16 | |||
| 17 | static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf) | 15 | static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf) |
| 18 | { | 16 | { |
| 19 | return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name); | 17 | return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name); |
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index 521694fc19d6..e53db6fd28ab 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c | |||
| @@ -135,7 +135,15 @@ static void process_complete(struct gdlm_lock *lp) | |||
| 135 | lp->lksb.sb_status, lp->lockname.ln_type, | 135 | lp->lksb.sb_status, lp->lockname.ln_type, |
| 136 | (unsigned long long)lp->lockname.ln_number, | 136 | (unsigned long long)lp->lockname.ln_number, |
| 137 | lp->flags); | 137 | lp->flags); |
| 138 | return; | 138 | if (lp->lksb.sb_status == -EDEADLOCK && |
| 139 | lp->ls->fsflags & LM_MFLAG_CONV_NODROP) { | ||
| 140 | lp->req = lp->cur; | ||
| 141 | acb.lc_ret |= LM_OUT_CONV_DEADLK; | ||
| 142 | if (lp->cur == DLM_LOCK_IV) | ||
| 143 | lp->lksb.sb_lkid = 0; | ||
| 144 | goto out; | ||
| 145 | } else | ||
| 146 | return; | ||
| 139 | } | 147 | } |
| 140 | 148 | ||
| 141 | /* | 149 | /* |
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c index d3b8ce6fbbe3..284a5ece8d94 100644 --- a/fs/gfs2/locking/nolock/main.c +++ b/fs/gfs2/locking/nolock/main.c | |||
| @@ -140,7 +140,7 @@ static int nolock_hold_lvb(void *lock, char **lvbp) | |||
| 140 | struct nolock_lockspace *nl = lock; | 140 | struct nolock_lockspace *nl = lock; |
| 141 | int error = 0; | 141 | int error = 0; |
| 142 | 142 | ||
| 143 | *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL); | 143 | *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS); |
| 144 | if (!*lvbp) | 144 | if (!*lvbp) |
| 145 | error = -ENOMEM; | 145 | error = -ENOMEM; |
| 146 | 146 | ||
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 161ab6f2058e..548264b1836d 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c | |||
| @@ -769,8 +769,8 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) | |||
| 769 | sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; | 769 | sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; |
| 770 | gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); | 770 | gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); |
| 771 | reserved = calc_reserved(sdp); | 771 | reserved = calc_reserved(sdp); |
| 772 | gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); | ||
| 772 | unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; | 773 | unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; |
| 773 | gfs2_assert_withdraw(sdp, unused >= 0); | ||
| 774 | atomic_add(unused, &sdp->sd_log_blks_free); | 774 | atomic_add(unused, &sdp->sd_log_blks_free); |
| 775 | gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= | 775 | gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= |
| 776 | sdp->sd_jdesc->jd_blocks); | 776 | sdp->sd_jdesc->jd_blocks); |
| @@ -779,6 +779,21 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) | |||
| 779 | gfs2_log_unlock(sdp); | 779 | gfs2_log_unlock(sdp); |
| 780 | } | 780 | } |
| 781 | 781 | ||
| 782 | static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) | ||
| 783 | { | ||
| 784 | struct list_head *head = &tr->tr_list_buf; | ||
| 785 | struct gfs2_bufdata *bd; | ||
| 786 | |||
| 787 | gfs2_log_lock(sdp); | ||
| 788 | while (!list_empty(head)) { | ||
| 789 | bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); | ||
| 790 | list_del_init(&bd->bd_list_tr); | ||
| 791 | tr->tr_num_buf--; | ||
| 792 | } | ||
| 793 | gfs2_log_unlock(sdp); | ||
| 794 | gfs2_assert_warn(sdp, !tr->tr_num_buf); | ||
| 795 | } | ||
| 796 | |||
| 782 | /** | 797 | /** |
| 783 | * gfs2_log_commit - Commit a transaction to the log | 798 | * gfs2_log_commit - Commit a transaction to the log |
| 784 | * @sdp: the filesystem | 799 | * @sdp: the filesystem |
| @@ -790,7 +805,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) | |||
| 790 | void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) | 805 | void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) |
| 791 | { | 806 | { |
| 792 | log_refund(sdp, tr); | 807 | log_refund(sdp, tr); |
| 793 | lops_incore_commit(sdp, tr); | 808 | buf_lo_incore_commit(sdp, tr); |
| 794 | 809 | ||
| 795 | sdp->sd_vfs->s_dirt = 1; | 810 | sdp->sd_vfs->s_dirt = 1; |
| 796 | up_read(&sdp->sd_log_flush_lock); | 811 | up_read(&sdp->sd_log_flush_lock); |
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index fae59d69d01a..4390f6f4047d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c | |||
| @@ -152,21 +152,6 @@ out: | |||
| 152 | unlock_buffer(bd->bd_bh); | 152 | unlock_buffer(bd->bd_bh); |
| 153 | } | 153 | } |
| 154 | 154 | ||
| 155 | static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) | ||
| 156 | { | ||
| 157 | struct list_head *head = &tr->tr_list_buf; | ||
| 158 | struct gfs2_bufdata *bd; | ||
| 159 | |||
| 160 | gfs2_log_lock(sdp); | ||
| 161 | while (!list_empty(head)) { | ||
| 162 | bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); | ||
| 163 | list_del_init(&bd->bd_list_tr); | ||
| 164 | tr->tr_num_buf--; | ||
| 165 | } | ||
| 166 | gfs2_log_unlock(sdp); | ||
| 167 | gfs2_assert_warn(sdp, !tr->tr_num_buf); | ||
| 168 | } | ||
| 169 | |||
| 170 | static void buf_lo_before_commit(struct gfs2_sbd *sdp) | 155 | static void buf_lo_before_commit(struct gfs2_sbd *sdp) |
| 171 | { | 156 | { |
| 172 | struct buffer_head *bh; | 157 | struct buffer_head *bh; |
| @@ -419,8 +404,10 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, | |||
| 419 | blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); | 404 | blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); |
| 420 | 405 | ||
| 421 | error = gfs2_revoke_add(sdp, blkno, start); | 406 | error = gfs2_revoke_add(sdp, blkno, start); |
| 422 | if (error < 0) | 407 | if (error < 0) { |
| 408 | brelse(bh); | ||
| 423 | return error; | 409 | return error; |
| 410 | } | ||
| 424 | else if (error) | 411 | else if (error) |
| 425 | sdp->sd_found_revokes++; | 412 | sdp->sd_found_revokes++; |
| 426 | 413 | ||
| @@ -737,7 +724,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) | |||
| 737 | 724 | ||
| 738 | const struct gfs2_log_operations gfs2_buf_lops = { | 725 | const struct gfs2_log_operations gfs2_buf_lops = { |
| 739 | .lo_add = buf_lo_add, | 726 | .lo_add = buf_lo_add, |
| 740 | .lo_incore_commit = buf_lo_incore_commit, | ||
| 741 | .lo_before_commit = buf_lo_before_commit, | 727 | .lo_before_commit = buf_lo_before_commit, |
| 742 | .lo_after_commit = buf_lo_after_commit, | 728 | .lo_after_commit = buf_lo_after_commit, |
| 743 | .lo_before_scan = buf_lo_before_scan, | 729 | .lo_before_scan = buf_lo_before_scan, |
| @@ -763,7 +749,6 @@ const struct gfs2_log_operations gfs2_rg_lops = { | |||
| 763 | 749 | ||
| 764 | const struct gfs2_log_operations gfs2_databuf_lops = { | 750 | const struct gfs2_log_operations gfs2_databuf_lops = { |
| 765 | .lo_add = databuf_lo_add, | 751 | .lo_add = databuf_lo_add, |
| 766 | .lo_incore_commit = buf_lo_incore_commit, | ||
| 767 | .lo_before_commit = databuf_lo_before_commit, | 752 | .lo_before_commit = databuf_lo_before_commit, |
| 768 | .lo_after_commit = databuf_lo_after_commit, | 753 | .lo_after_commit = databuf_lo_after_commit, |
| 769 | .lo_scan_elements = databuf_lo_scan_elements, | 754 | .lo_scan_elements = databuf_lo_scan_elements, |
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 41a00df75587..3c0b2737658a 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * This copyrighted material is made available to anyone wishing to use, | 5 | * This copyrighted material is made available to anyone wishing to use, |
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | 6 | * modify, copy, or redistribute it subject to the terms and conditions |
| @@ -57,15 +57,6 @@ static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) | |||
| 57 | le->le_ops->lo_add(sdp, le); | 57 | le->le_ops->lo_add(sdp, le); |
| 58 | } | 58 | } |
| 59 | 59 | ||
| 60 | static inline void lops_incore_commit(struct gfs2_sbd *sdp, | ||
| 61 | struct gfs2_trans *tr) | ||
| 62 | { | ||
| 63 | int x; | ||
| 64 | for (x = 0; gfs2_log_ops[x]; x++) | ||
| 65 | if (gfs2_log_ops[x]->lo_incore_commit) | ||
| 66 | gfs2_log_ops[x]->lo_incore_commit(sdp, tr); | ||
| 67 | } | ||
| 68 | |||
| 69 | static inline void lops_before_commit(struct gfs2_sbd *sdp) | 60 | static inline void lops_before_commit(struct gfs2_sbd *sdp) |
| 70 | { | 61 | { |
| 71 | int x; | 62 | int x; |
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 9c7765c12d62..053e2ebbbd50 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c | |||
| @@ -89,6 +89,12 @@ static int __init init_gfs2_fs(void) | |||
| 89 | if (!gfs2_bufdata_cachep) | 89 | if (!gfs2_bufdata_cachep) |
| 90 | goto fail; | 90 | goto fail; |
| 91 | 91 | ||
| 92 | gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd", | ||
| 93 | sizeof(struct gfs2_rgrpd), | ||
| 94 | 0, 0, NULL); | ||
| 95 | if (!gfs2_rgrpd_cachep) | ||
| 96 | goto fail; | ||
| 97 | |||
| 92 | error = register_filesystem(&gfs2_fs_type); | 98 | error = register_filesystem(&gfs2_fs_type); |
| 93 | if (error) | 99 | if (error) |
| 94 | goto fail; | 100 | goto fail; |
| @@ -108,6 +114,9 @@ fail_unregister: | |||
| 108 | fail: | 114 | fail: |
| 109 | gfs2_glock_exit(); | 115 | gfs2_glock_exit(); |
| 110 | 116 | ||
| 117 | if (gfs2_rgrpd_cachep) | ||
| 118 | kmem_cache_destroy(gfs2_rgrpd_cachep); | ||
| 119 | |||
| 111 | if (gfs2_bufdata_cachep) | 120 | if (gfs2_bufdata_cachep) |
| 112 | kmem_cache_destroy(gfs2_bufdata_cachep); | 121 | kmem_cache_destroy(gfs2_bufdata_cachep); |
| 113 | 122 | ||
| @@ -133,6 +142,7 @@ static void __exit exit_gfs2_fs(void) | |||
| 133 | unregister_filesystem(&gfs2_fs_type); | 142 | unregister_filesystem(&gfs2_fs_type); |
| 134 | unregister_filesystem(&gfs2meta_fs_type); | 143 | unregister_filesystem(&gfs2meta_fs_type); |
| 135 | 144 | ||
| 145 | kmem_cache_destroy(gfs2_rgrpd_cachep); | ||
| 136 | kmem_cache_destroy(gfs2_bufdata_cachep); | 146 | kmem_cache_destroy(gfs2_bufdata_cachep); |
| 137 | kmem_cache_destroy(gfs2_inode_cachep); | 147 | kmem_cache_destroy(gfs2_inode_cachep); |
| 138 | kmem_cache_destroy(gfs2_glock_cachep); | 148 | kmem_cache_destroy(gfs2_glock_cachep); |
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index ac772b6d9dbb..90a04a6e3789 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 3 | * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * This copyrighted material is made available to anyone wishing to use, | 5 | * This copyrighted material is made available to anyone wishing to use, |
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | 6 | * modify, copy, or redistribute it subject to the terms and conditions |
| @@ -21,7 +21,6 @@ | |||
| 21 | #include <linux/gfs2_ondisk.h> | 21 | #include <linux/gfs2_ondisk.h> |
| 22 | #include <linux/lm_interface.h> | 22 | #include <linux/lm_interface.h> |
| 23 | #include <linux/backing-dev.h> | 23 | #include <linux/backing-dev.h> |
| 24 | #include <linux/pagevec.h> | ||
| 25 | 24 | ||
| 26 | #include "gfs2.h" | 25 | #include "gfs2.h" |
| 27 | #include "incore.h" | 26 | #include "incore.h" |
| @@ -104,11 +103,9 @@ static int gfs2_writepage_common(struct page *page, | |||
| 104 | loff_t i_size = i_size_read(inode); | 103 | loff_t i_size = i_size_read(inode); |
| 105 | pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; | 104 | pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; |
| 106 | unsigned offset; | 105 | unsigned offset; |
| 107 | int ret = -EIO; | ||
| 108 | 106 | ||
| 109 | if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) | 107 | if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) |
| 110 | goto out; | 108 | goto out; |
| 111 | ret = 0; | ||
| 112 | if (current->journal_info) | 109 | if (current->journal_info) |
| 113 | goto redirty; | 110 | goto redirty; |
| 114 | /* Is the page fully outside i_size? (truncate in progress) */ | 111 | /* Is the page fully outside i_size? (truncate in progress) */ |
| @@ -280,7 +277,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, | |||
| 280 | int i; | 277 | int i; |
| 281 | int ret; | 278 | int ret; |
| 282 | 279 | ||
| 283 | ret = gfs2_trans_begin(sdp, nrblocks, 0); | 280 | ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); |
| 284 | if (ret < 0) | 281 | if (ret < 0) |
| 285 | return ret; | 282 | return ret; |
| 286 | 283 | ||
| @@ -510,23 +507,26 @@ static int __gfs2_readpage(void *file, struct page *page) | |||
| 510 | static int gfs2_readpage(struct file *file, struct page *page) | 507 | static int gfs2_readpage(struct file *file, struct page *page) |
| 511 | { | 508 | { |
| 512 | struct gfs2_inode *ip = GFS2_I(page->mapping->host); | 509 | struct gfs2_inode *ip = GFS2_I(page->mapping->host); |
| 513 | struct gfs2_holder gh; | 510 | struct gfs2_holder *gh; |
| 514 | int error; | 511 | int error; |
| 515 | 512 | ||
| 516 | gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh); | 513 | gh = gfs2_glock_is_locked_by_me(ip->i_gl); |
| 517 | error = gfs2_glock_nq_atime(&gh); | 514 | if (!gh) { |
| 518 | if (unlikely(error)) { | 515 | gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS); |
| 516 | if (!gh) | ||
| 517 | return -ENOBUFS; | ||
| 518 | gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh); | ||
| 519 | unlock_page(page); | 519 | unlock_page(page); |
| 520 | goto out; | 520 | error = gfs2_glock_nq_atime(gh); |
| 521 | if (likely(error != 0)) | ||
| 522 | goto out; | ||
| 523 | return AOP_TRUNCATED_PAGE; | ||
| 521 | } | 524 | } |
| 522 | error = __gfs2_readpage(file, page); | 525 | error = __gfs2_readpage(file, page); |
| 523 | gfs2_glock_dq(&gh); | 526 | gfs2_glock_dq(gh); |
| 524 | out: | 527 | out: |
| 525 | gfs2_holder_uninit(&gh); | 528 | gfs2_holder_uninit(gh); |
| 526 | if (error == GLR_TRYFAILED) { | 529 | kfree(gh); |
| 527 | yield(); | ||
| 528 | return AOP_TRUNCATED_PAGE; | ||
| 529 | } | ||
| 530 | return error; | 530 | return error; |
| 531 | } | 531 | } |
| 532 | 532 | ||
| @@ -648,15 +648,15 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
| 648 | 648 | ||
| 649 | if (alloc_required) { | 649 | if (alloc_required) { |
| 650 | al = gfs2_alloc_get(ip); | 650 | al = gfs2_alloc_get(ip); |
| 651 | if (!al) { | ||
| 652 | error = -ENOMEM; | ||
| 653 | goto out_unlock; | ||
| 654 | } | ||
| 651 | 655 | ||
| 652 | error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 656 | error = gfs2_quota_lock_check(ip); |
| 653 | if (error) | 657 | if (error) |
| 654 | goto out_alloc_put; | 658 | goto out_alloc_put; |
| 655 | 659 | ||
| 656 | error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); | ||
| 657 | if (error) | ||
| 658 | goto out_qunlock; | ||
| 659 | |||
| 660 | al->al_requested = data_blocks + ind_blocks; | 660 | al->al_requested = data_blocks + ind_blocks; |
| 661 | error = gfs2_inplace_reserve(ip); | 661 | error = gfs2_inplace_reserve(ip); |
| 662 | if (error) | 662 | if (error) |
| @@ -828,7 +828,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, | |||
| 828 | unsigned int to = from + len; | 828 | unsigned int to = from + len; |
| 829 | int ret; | 829 | int ret; |
| 830 | 830 | ||
| 831 | BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0); | 831 | BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); |
| 832 | 832 | ||
| 833 | ret = gfs2_meta_inode_buffer(ip, &dibh); | 833 | ret = gfs2_meta_inode_buffer(ip, &dibh); |
| 834 | if (unlikely(ret)) { | 834 | if (unlikely(ret)) { |
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index 793e334d098e..4a5e676b4420 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c | |||
| @@ -43,7 +43,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) | |||
| 43 | struct gfs2_holder d_gh; | 43 | struct gfs2_holder d_gh; |
| 44 | struct gfs2_inode *ip = NULL; | 44 | struct gfs2_inode *ip = NULL; |
| 45 | int error; | 45 | int error; |
| 46 | int had_lock=0; | 46 | int had_lock = 0; |
| 47 | 47 | ||
| 48 | if (inode) { | 48 | if (inode) { |
| 49 | if (is_bad_inode(inode)) | 49 | if (is_bad_inode(inode)) |
| @@ -54,7 +54,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) | |||
| 54 | if (sdp->sd_args.ar_localcaching) | 54 | if (sdp->sd_args.ar_localcaching) |
| 55 | goto valid; | 55 | goto valid; |
| 56 | 56 | ||
| 57 | had_lock = gfs2_glock_is_locked_by_me(dip->i_gl); | 57 | had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); |
| 58 | if (!had_lock) { | 58 | if (!had_lock) { |
| 59 | error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); | 59 | error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); |
| 60 | if (error) | 60 | if (error) |
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 334c7f85351b..990d9f4bc463 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c | |||
| @@ -204,8 +204,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, | |||
| 204 | inode = gfs2_inode_lookup(sb, DT_UNKNOWN, | 204 | inode = gfs2_inode_lookup(sb, DT_UNKNOWN, |
| 205 | inum->no_addr, | 205 | inum->no_addr, |
| 206 | 0, 0); | 206 | 0, 0); |
| 207 | if (!inode) | ||
| 208 | goto fail; | ||
| 209 | if (IS_ERR(inode)) { | 207 | if (IS_ERR(inode)) { |
| 210 | error = PTR_ERR(inode); | 208 | error = PTR_ERR(inode); |
| 211 | goto fail; | 209 | goto fail; |
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index f4842f2548cd..e1b7d525a066 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c | |||
| @@ -30,7 +30,6 @@ | |||
| 30 | #include "glock.h" | 30 | #include "glock.h" |
| 31 | #include "glops.h" | 31 | #include "glops.h" |
| 32 | #include "inode.h" | 32 | #include "inode.h" |
| 33 | #include "lm.h" | ||
| 34 | #include "log.h" | 33 | #include "log.h" |
| 35 | #include "meta_io.h" | 34 | #include "meta_io.h" |
| 36 | #include "quota.h" | 35 | #include "quota.h" |
| @@ -39,6 +38,7 @@ | |||
| 39 | #include "util.h" | 38 | #include "util.h" |
| 40 | #include "eaops.h" | 39 | #include "eaops.h" |
| 41 | #include "ops_address.h" | 40 | #include "ops_address.h" |
| 41 | #include "ops_inode.h" | ||
| 42 | 42 | ||
| 43 | /** | 43 | /** |
| 44 | * gfs2_llseek - seek to a location in a file | 44 | * gfs2_llseek - seek to a location in a file |
| @@ -369,12 +369,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) | |||
| 369 | if (al == NULL) | 369 | if (al == NULL) |
| 370 | goto out_unlock; | 370 | goto out_unlock; |
| 371 | 371 | ||
| 372 | ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 372 | ret = gfs2_quota_lock_check(ip); |
| 373 | if (ret) | 373 | if (ret) |
| 374 | goto out_alloc_put; | 374 | goto out_alloc_put; |
| 375 | ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); | ||
| 376 | if (ret) | ||
| 377 | goto out_quota_unlock; | ||
| 378 | al->al_requested = data_blocks + ind_blocks; | 375 | al->al_requested = data_blocks + ind_blocks; |
| 379 | ret = gfs2_inplace_reserve(ip); | 376 | ret = gfs2_inplace_reserve(ip); |
| 380 | if (ret) | 377 | if (ret) |
| @@ -596,6 +593,36 @@ static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) | |||
| 596 | return generic_setlease(file, arg, fl); | 593 | return generic_setlease(file, arg, fl); |
| 597 | } | 594 | } |
| 598 | 595 | ||
| 596 | static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 597 | struct file *file, struct file_lock *fl) | ||
| 598 | { | ||
| 599 | int error = -EIO; | ||
| 600 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 601 | error = sdp->sd_lockstruct.ls_ops->lm_plock_get( | ||
| 602 | sdp->sd_lockstruct.ls_lockspace, name, file, fl); | ||
| 603 | return error; | ||
| 604 | } | ||
| 605 | |||
| 606 | static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 607 | struct file *file, int cmd, struct file_lock *fl) | ||
| 608 | { | ||
| 609 | int error = -EIO; | ||
| 610 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 611 | error = sdp->sd_lockstruct.ls_ops->lm_plock( | ||
| 612 | sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl); | ||
| 613 | return error; | ||
| 614 | } | ||
| 615 | |||
| 616 | static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, | ||
| 617 | struct file *file, struct file_lock *fl) | ||
| 618 | { | ||
| 619 | int error = -EIO; | ||
| 620 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 621 | error = sdp->sd_lockstruct.ls_ops->lm_punlock( | ||
| 622 | sdp->sd_lockstruct.ls_lockspace, name, file, fl); | ||
| 623 | return error; | ||
| 624 | } | ||
| 625 | |||
| 599 | /** | 626 | /** |
| 600 | * gfs2_lock - acquire/release a posix lock on a file | 627 | * gfs2_lock - acquire/release a posix lock on a file |
| 601 | * @file: the file pointer | 628 | * @file: the file pointer |
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4bee6aa845e4..ef9c6c4f80f6 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 3 | * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * This copyrighted material is made available to anyone wishing to use, | 5 | * This copyrighted material is made available to anyone wishing to use, |
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | 6 | * modify, copy, or redistribute it subject to the terms and conditions |
| @@ -26,7 +26,6 @@ | |||
| 26 | #include "glock.h" | 26 | #include "glock.h" |
| 27 | #include "glops.h" | 27 | #include "glops.h" |
| 28 | #include "inode.h" | 28 | #include "inode.h" |
| 29 | #include "lm.h" | ||
| 30 | #include "mount.h" | 29 | #include "mount.h" |
| 31 | #include "ops_fstype.h" | 30 | #include "ops_fstype.h" |
| 32 | #include "ops_dentry.h" | 31 | #include "ops_dentry.h" |
| @@ -363,6 +362,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp) | |||
| 363 | return rc; | 362 | return rc; |
| 364 | } | 363 | } |
| 365 | 364 | ||
| 365 | static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) | ||
| 366 | { | ||
| 367 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 368 | sdp->sd_lockstruct.ls_ops->lm_others_may_mount( | ||
| 369 | sdp->sd_lockstruct.ls_lockspace); | ||
| 370 | } | ||
| 371 | |||
| 366 | static int init_journal(struct gfs2_sbd *sdp, int undo) | 372 | static int init_journal(struct gfs2_sbd *sdp, int undo) |
| 367 | { | 373 | { |
| 368 | struct gfs2_holder ji_gh; | 374 | struct gfs2_holder ji_gh; |
| @@ -542,7 +548,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) | |||
| 542 | } | 548 | } |
| 543 | ip = GFS2_I(sdp->sd_rindex); | 549 | ip = GFS2_I(sdp->sd_rindex); |
| 544 | set_bit(GLF_STICKY, &ip->i_gl->gl_flags); | 550 | set_bit(GLF_STICKY, &ip->i_gl->gl_flags); |
| 545 | sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1; | 551 | sdp->sd_rindex_uptodate = 0; |
| 546 | 552 | ||
| 547 | /* Read in the quota inode */ | 553 | /* Read in the quota inode */ |
| 548 | sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); | 554 | sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); |
| @@ -705,6 +711,69 @@ fail: | |||
| 705 | } | 711 | } |
| 706 | 712 | ||
| 707 | /** | 713 | /** |
| 714 | * gfs2_lm_mount - mount a locking protocol | ||
| 715 | * @sdp: the filesystem | ||
| 716 | * @args: mount arguments | ||
| 717 | * @silent: if 1, don't complain if the FS isn't a GFS2 fs | ||
| 718 | * | ||
| 719 | * Returns: errno | ||
| 720 | */ | ||
| 721 | |||
| 722 | static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) | ||
| 723 | { | ||
| 724 | char *proto = sdp->sd_proto_name; | ||
| 725 | char *table = sdp->sd_table_name; | ||
| 726 | int flags = LM_MFLAG_CONV_NODROP; | ||
| 727 | int error; | ||
| 728 | |||
| 729 | if (sdp->sd_args.ar_spectator) | ||
| 730 | flags |= LM_MFLAG_SPECTATOR; | ||
| 731 | |||
| 732 | fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); | ||
| 733 | |||
| 734 | error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, | ||
| 735 | gfs2_glock_cb, sdp, | ||
| 736 | GFS2_MIN_LVB_SIZE, flags, | ||
| 737 | &sdp->sd_lockstruct, &sdp->sd_kobj); | ||
| 738 | if (error) { | ||
| 739 | fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n", | ||
| 740 | proto, table, sdp->sd_args.ar_hostdata); | ||
| 741 | goto out; | ||
| 742 | } | ||
| 743 | |||
| 744 | if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || | ||
| 745 | gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || | ||
| 746 | gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= | ||
| 747 | GFS2_MIN_LVB_SIZE)) { | ||
| 748 | gfs2_unmount_lockproto(&sdp->sd_lockstruct); | ||
| 749 | goto out; | ||
| 750 | } | ||
| 751 | |||
| 752 | if (sdp->sd_args.ar_spectator) | ||
| 753 | snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); | ||
| 754 | else | ||
| 755 | snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, | ||
| 756 | sdp->sd_lockstruct.ls_jid); | ||
| 757 | |||
| 758 | fs_info(sdp, "Joined cluster. Now mounting FS...\n"); | ||
| 759 | |||
| 760 | if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && | ||
| 761 | !sdp->sd_args.ar_ignore_local_fs) { | ||
| 762 | sdp->sd_args.ar_localflocks = 1; | ||
| 763 | sdp->sd_args.ar_localcaching = 1; | ||
| 764 | } | ||
| 765 | |||
| 766 | out: | ||
| 767 | return error; | ||
| 768 | } | ||
| 769 | |||
| 770 | void gfs2_lm_unmount(struct gfs2_sbd *sdp) | ||
| 771 | { | ||
| 772 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 773 | gfs2_unmount_lockproto(&sdp->sd_lockstruct); | ||
| 774 | } | ||
| 775 | |||
| 776 | /** | ||
| 708 | * fill_super - Read in superblock | 777 | * fill_super - Read in superblock |
| 709 | * @sb: The VFS superblock | 778 | * @sb: The VFS superblock |
| 710 | * @data: Mount options | 779 | * @data: Mount options |
| @@ -874,7 +943,6 @@ static struct super_block* get_gfs2_sb(const char *dev_name) | |||
| 874 | { | 943 | { |
| 875 | struct kstat stat; | 944 | struct kstat stat; |
| 876 | struct nameidata nd; | 945 | struct nameidata nd; |
| 877 | struct file_system_type *fstype; | ||
| 878 | struct super_block *sb = NULL, *s; | 946 | struct super_block *sb = NULL, *s; |
| 879 | int error; | 947 | int error; |
| 880 | 948 | ||
| @@ -886,8 +954,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name) | |||
| 886 | } | 954 | } |
| 887 | error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat); | 955 | error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat); |
| 888 | 956 | ||
| 889 | fstype = get_fs_type("gfs2"); | 957 | list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) { |
| 890 | list_for_each_entry(s, &fstype->fs_supers, s_instances) { | ||
| 891 | if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || | 958 | if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || |
| 892 | (S_ISDIR(stat.mode) && | 959 | (S_ISDIR(stat.mode) && |
| 893 | s == nd.path.dentry->d_inode->i_sb)) { | 960 | s == nd.path.dentry->d_inode->i_sb)) { |
| @@ -931,7 +998,6 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, | |||
| 931 | error = PTR_ERR(new); | 998 | error = PTR_ERR(new); |
| 932 | goto error; | 999 | goto error; |
| 933 | } | 1000 | } |
| 934 | module_put(fs_type->owner); | ||
| 935 | new->s_flags = flags; | 1001 | new->s_flags = flags; |
| 936 | strlcpy(new->s_id, sb->s_id, sizeof(new->s_id)); | 1002 | strlcpy(new->s_id, sb->s_id, sizeof(new->s_id)); |
| 937 | sb_set_blocksize(new, sb->s_blocksize); | 1003 | sb_set_blocksize(new, sb->s_blocksize); |
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index e87412902bed..2686ad4c0029 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c | |||
| @@ -200,15 +200,15 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, | |||
| 200 | 200 | ||
| 201 | if (alloc_required) { | 201 | if (alloc_required) { |
| 202 | struct gfs2_alloc *al = gfs2_alloc_get(dip); | 202 | struct gfs2_alloc *al = gfs2_alloc_get(dip); |
| 203 | if (!al) { | ||
| 204 | error = -ENOMEM; | ||
| 205 | goto out_gunlock; | ||
| 206 | } | ||
| 203 | 207 | ||
| 204 | error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 208 | error = gfs2_quota_lock_check(dip); |
| 205 | if (error) | 209 | if (error) |
| 206 | goto out_alloc; | 210 | goto out_alloc; |
| 207 | 211 | ||
| 208 | error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); | ||
| 209 | if (error) | ||
| 210 | goto out_gunlock_q; | ||
| 211 | |||
| 212 | al->al_requested = sdp->sd_max_dirres; | 212 | al->al_requested = sdp->sd_max_dirres; |
| 213 | 213 | ||
| 214 | error = gfs2_inplace_reserve(dip); | 214 | error = gfs2_inplace_reserve(dip); |
| @@ -716,15 +716,15 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
| 716 | 716 | ||
| 717 | if (alloc_required) { | 717 | if (alloc_required) { |
| 718 | struct gfs2_alloc *al = gfs2_alloc_get(ndip); | 718 | struct gfs2_alloc *al = gfs2_alloc_get(ndip); |
| 719 | if (!al) { | ||
| 720 | error = -ENOMEM; | ||
| 721 | goto out_gunlock; | ||
| 722 | } | ||
| 719 | 723 | ||
| 720 | error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | 724 | error = gfs2_quota_lock_check(ndip); |
| 721 | if (error) | 725 | if (error) |
| 722 | goto out_alloc; | 726 | goto out_alloc; |
| 723 | 727 | ||
| 724 | error = gfs2_quota_check(ndip, ndip->i_inode.i_uid, ndip->i_inode.i_gid); | ||
| 725 | if (error) | ||
| 726 | goto out_gunlock_q; | ||
| 727 | |||
| 728 | al->al_requested = sdp->sd_max_dirres; | 728 | al->al_requested = sdp->sd_max_dirres; |
| 729 | 729 | ||
| 730 | error = gfs2_inplace_reserve(ndip); | 730 | error = gfs2_inplace_reserve(ndip); |
| @@ -898,7 +898,7 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 898 | int error; | 898 | int error; |
| 899 | int unlock = 0; | 899 | int unlock = 0; |
| 900 | 900 | ||
| 901 | if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) { | 901 | if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { |
| 902 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); | 902 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); |
| 903 | if (error) | 903 | if (error) |
| 904 | return error; | 904 | return error; |
| @@ -953,7 +953,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) | |||
| 953 | if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) | 953 | if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) |
| 954 | ogid = ngid = NO_QUOTA_CHANGE; | 954 | ogid = ngid = NO_QUOTA_CHANGE; |
| 955 | 955 | ||
| 956 | gfs2_alloc_get(ip); | 956 | if (!gfs2_alloc_get(ip)) |
| 957 | return -ENOMEM; | ||
| 957 | 958 | ||
| 958 | error = gfs2_quota_lock(ip, nuid, ngid); | 959 | error = gfs2_quota_lock(ip, nuid, ngid); |
| 959 | if (error) | 960 | if (error) |
| @@ -981,8 +982,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) | |||
| 981 | brelse(dibh); | 982 | brelse(dibh); |
| 982 | 983 | ||
| 983 | if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { | 984 | if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { |
| 984 | gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid); | 985 | u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); |
| 985 | gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid); | 986 | gfs2_quota_change(ip, -blocks, ouid, ogid); |
| 987 | gfs2_quota_change(ip, blocks, nuid, ngid); | ||
| 986 | } | 988 | } |
| 987 | 989 | ||
| 988 | out_end_trans: | 990 | out_end_trans: |
| @@ -1064,7 +1066,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
| 1064 | int error; | 1066 | int error; |
| 1065 | int unlock = 0; | 1067 | int unlock = 0; |
| 1066 | 1068 | ||
| 1067 | if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) { | 1069 | if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { |
| 1068 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); | 1070 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); |
| 1069 | if (error) | 1071 | if (error) |
| 1070 | return error; | 1072 | return error; |
| @@ -1148,16 +1150,6 @@ const struct inode_operations gfs2_file_iops = { | |||
| 1148 | .removexattr = gfs2_removexattr, | 1150 | .removexattr = gfs2_removexattr, |
| 1149 | }; | 1151 | }; |
| 1150 | 1152 | ||
| 1151 | const struct inode_operations gfs2_dev_iops = { | ||
| 1152 | .permission = gfs2_permission, | ||
| 1153 | .setattr = gfs2_setattr, | ||
| 1154 | .getattr = gfs2_getattr, | ||
| 1155 | .setxattr = gfs2_setxattr, | ||
| 1156 | .getxattr = gfs2_getxattr, | ||
| 1157 | .listxattr = gfs2_listxattr, | ||
| 1158 | .removexattr = gfs2_removexattr, | ||
| 1159 | }; | ||
| 1160 | |||
| 1161 | const struct inode_operations gfs2_dir_iops = { | 1153 | const struct inode_operations gfs2_dir_iops = { |
| 1162 | .create = gfs2_create, | 1154 | .create = gfs2_create, |
| 1163 | .lookup = gfs2_lookup, | 1155 | .lookup = gfs2_lookup, |
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h index fd8cee231e1d..14b4b797622a 100644 --- a/fs/gfs2/ops_inode.h +++ b/fs/gfs2/ops_inode.h | |||
| @@ -15,7 +15,6 @@ | |||
| 15 | extern const struct inode_operations gfs2_file_iops; | 15 | extern const struct inode_operations gfs2_file_iops; |
| 16 | extern const struct inode_operations gfs2_dir_iops; | 16 | extern const struct inode_operations gfs2_dir_iops; |
| 17 | extern const struct inode_operations gfs2_symlink_iops; | 17 | extern const struct inode_operations gfs2_symlink_iops; |
| 18 | extern const struct inode_operations gfs2_dev_iops; | ||
| 19 | extern const struct file_operations gfs2_file_fops; | 18 | extern const struct file_operations gfs2_file_fops; |
| 20 | extern const struct file_operations gfs2_dir_fops; | 19 | extern const struct file_operations gfs2_dir_fops; |
| 21 | extern const struct file_operations gfs2_file_fops_nolock; | 20 | extern const struct file_operations gfs2_file_fops_nolock; |
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 5e524217944a..2278c68b7e35 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c | |||
| @@ -25,7 +25,6 @@ | |||
| 25 | #include "incore.h" | 25 | #include "incore.h" |
| 26 | #include "glock.h" | 26 | #include "glock.h" |
| 27 | #include "inode.h" | 27 | #include "inode.h" |
| 28 | #include "lm.h" | ||
| 29 | #include "log.h" | 28 | #include "log.h" |
| 30 | #include "mount.h" | 29 | #include "mount.h" |
| 31 | #include "ops_super.h" | 30 | #include "ops_super.h" |
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a08dabd6ce90..56aaf915c59a 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c | |||
| @@ -94,7 +94,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, | |||
| 94 | struct gfs2_quota_data *qd; | 94 | struct gfs2_quota_data *qd; |
| 95 | int error; | 95 | int error; |
| 96 | 96 | ||
| 97 | qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL); | 97 | qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS); |
| 98 | if (!qd) | 98 | if (!qd) |
| 99 | return -ENOMEM; | 99 | return -ENOMEM; |
| 100 | 100 | ||
| @@ -616,16 +616,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, | |||
| 616 | s64 value; | 616 | s64 value; |
| 617 | int err = -EIO; | 617 | int err = -EIO; |
| 618 | 618 | ||
| 619 | if (gfs2_is_stuffed(ip)) { | 619 | if (gfs2_is_stuffed(ip)) |
| 620 | struct gfs2_alloc *al = NULL; | ||
| 621 | al = gfs2_alloc_get(ip); | ||
| 622 | /* just request 1 blk */ | ||
| 623 | al->al_requested = 1; | ||
| 624 | gfs2_inplace_reserve(ip); | ||
| 625 | gfs2_unstuff_dinode(ip, NULL); | 620 | gfs2_unstuff_dinode(ip, NULL); |
| 626 | gfs2_inplace_release(ip); | 621 | |
| 627 | gfs2_alloc_put(ip); | ||
| 628 | } | ||
| 629 | page = grab_cache_page(mapping, index); | 622 | page = grab_cache_page(mapping, index); |
| 630 | if (!page) | 623 | if (!page) |
| 631 | return -ENOMEM; | 624 | return -ENOMEM; |
| @@ -690,14 +683,14 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) | |||
| 690 | unsigned int qx, x; | 683 | unsigned int qx, x; |
| 691 | struct gfs2_quota_data *qd; | 684 | struct gfs2_quota_data *qd; |
| 692 | loff_t offset; | 685 | loff_t offset; |
| 693 | unsigned int nalloc = 0; | 686 | unsigned int nalloc = 0, blocks; |
| 694 | struct gfs2_alloc *al = NULL; | 687 | struct gfs2_alloc *al = NULL; |
| 695 | int error; | 688 | int error; |
| 696 | 689 | ||
| 697 | gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), | 690 | gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), |
| 698 | &data_blocks, &ind_blocks); | 691 | &data_blocks, &ind_blocks); |
| 699 | 692 | ||
| 700 | ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL); | 693 | ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); |
| 701 | if (!ghs) | 694 | if (!ghs) |
| 702 | return -ENOMEM; | 695 | return -ENOMEM; |
| 703 | 696 | ||
| @@ -727,30 +720,33 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) | |||
| 727 | nalloc++; | 720 | nalloc++; |
| 728 | } | 721 | } |
| 729 | 722 | ||
| 730 | if (nalloc) { | 723 | al = gfs2_alloc_get(ip); |
| 731 | al = gfs2_alloc_get(ip); | 724 | if (!al) { |
| 725 | error = -ENOMEM; | ||
| 726 | goto out_gunlock; | ||
| 727 | } | ||
| 728 | /* | ||
| 729 | * 1 blk for unstuffing inode if stuffed. We add this extra | ||
| 730 | * block to the reservation unconditionally. If the inode | ||
| 731 | * doesn't need unstuffing, the block will be released to the | ||
| 732 | * rgrp since it won't be allocated during the transaction | ||
| 733 | */ | ||
| 734 | al->al_requested = 1; | ||
| 735 | /* +1 in the end for block requested above for unstuffing */ | ||
| 736 | blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; | ||
| 732 | 737 | ||
| 733 | al->al_requested = nalloc * (data_blocks + ind_blocks); | 738 | if (nalloc) |
| 739 | al->al_requested += nalloc * (data_blocks + ind_blocks); | ||
| 740 | error = gfs2_inplace_reserve(ip); | ||
| 741 | if (error) | ||
| 742 | goto out_alloc; | ||
| 734 | 743 | ||
| 735 | error = gfs2_inplace_reserve(ip); | 744 | if (nalloc) |
| 736 | if (error) | 745 | blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; |
| 737 | goto out_alloc; | 746 | |
| 738 | 747 | error = gfs2_trans_begin(sdp, blocks, 0); | |
| 739 | error = gfs2_trans_begin(sdp, | 748 | if (error) |
| 740 | al->al_rgd->rd_length + | 749 | goto out_ipres; |
| 741 | num_qd * data_blocks + | ||
| 742 | nalloc * ind_blocks + | ||
| 743 | RES_DINODE + num_qd + | ||
| 744 | RES_STATFS, 0); | ||
| 745 | if (error) | ||
| 746 | goto out_ipres; | ||
| 747 | } else { | ||
| 748 | error = gfs2_trans_begin(sdp, | ||
| 749 | num_qd * data_blocks + | ||
| 750 | RES_DINODE + num_qd, 0); | ||
| 751 | if (error) | ||
| 752 | goto out_gunlock; | ||
| 753 | } | ||
| 754 | 750 | ||
| 755 | for (x = 0; x < num_qd; x++) { | 751 | for (x = 0; x < num_qd; x++) { |
| 756 | qd = qda[x]; | 752 | qd = qda[x]; |
| @@ -769,11 +765,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) | |||
| 769 | out_end_trans: | 765 | out_end_trans: |
| 770 | gfs2_trans_end(sdp); | 766 | gfs2_trans_end(sdp); |
| 771 | out_ipres: | 767 | out_ipres: |
| 772 | if (nalloc) | 768 | gfs2_inplace_release(ip); |
| 773 | gfs2_inplace_release(ip); | ||
| 774 | out_alloc: | 769 | out_alloc: |
| 775 | if (nalloc) | 770 | gfs2_alloc_put(ip); |
| 776 | gfs2_alloc_put(ip); | ||
| 777 | out_gunlock: | 771 | out_gunlock: |
| 778 | gfs2_glock_dq_uninit(&i_gh); | 772 | gfs2_glock_dq_uninit(&i_gh); |
| 779 | out: | 773 | out: |
| @@ -1124,12 +1118,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) | |||
| 1124 | error = -ENOMEM; | 1118 | error = -ENOMEM; |
| 1125 | 1119 | ||
| 1126 | sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, | 1120 | sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, |
| 1127 | sizeof(unsigned char *), GFP_KERNEL); | 1121 | sizeof(unsigned char *), GFP_NOFS); |
| 1128 | if (!sdp->sd_quota_bitmap) | 1122 | if (!sdp->sd_quota_bitmap) |
| 1129 | return error; | 1123 | return error; |
| 1130 | 1124 | ||
| 1131 | for (x = 0; x < sdp->sd_quota_chunks; x++) { | 1125 | for (x = 0; x < sdp->sd_quota_chunks; x++) { |
| 1132 | sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL); | 1126 | sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS); |
| 1133 | if (!sdp->sd_quota_bitmap[x]) | 1127 | if (!sdp->sd_quota_bitmap[x]) |
| 1134 | goto fail; | 1128 | goto fail; |
| 1135 | } | 1129 | } |
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index a8be1417051f..3b7f4b0e5dfe 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h | |||
| @@ -32,4 +32,21 @@ int gfs2_quota_init(struct gfs2_sbd *sdp); | |||
| 32 | void gfs2_quota_scan(struct gfs2_sbd *sdp); | 32 | void gfs2_quota_scan(struct gfs2_sbd *sdp); |
| 33 | void gfs2_quota_cleanup(struct gfs2_sbd *sdp); | 33 | void gfs2_quota_cleanup(struct gfs2_sbd *sdp); |
| 34 | 34 | ||
| 35 | static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) | ||
| 36 | { | ||
| 37 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
| 38 | int ret; | ||
| 39 | if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) | ||
| 40 | return 0; | ||
| 41 | ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); | ||
| 42 | if (ret) | ||
| 43 | return ret; | ||
| 44 | if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) | ||
| 45 | return 0; | ||
| 46 | ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); | ||
| 47 | if (ret) | ||
| 48 | gfs2_quota_unlock(ip); | ||
| 49 | return ret; | ||
| 50 | } | ||
| 51 | |||
| 35 | #endif /* __QUOTA_DOT_H__ */ | 52 | #endif /* __QUOTA_DOT_H__ */ |
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 6fb07d67ca8a..2888e4b4b1c5 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c | |||
| @@ -20,7 +20,6 @@ | |||
| 20 | #include "bmap.h" | 20 | #include "bmap.h" |
| 21 | #include "glock.h" | 21 | #include "glock.h" |
| 22 | #include "glops.h" | 22 | #include "glops.h" |
| 23 | #include "lm.h" | ||
| 24 | #include "lops.h" | 23 | #include "lops.h" |
| 25 | #include "meta_io.h" | 24 | #include "meta_io.h" |
| 26 | #include "recovery.h" | 25 | #include "recovery.h" |
| @@ -69,7 +68,7 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) | |||
| 69 | return 0; | 68 | return 0; |
| 70 | } | 69 | } |
| 71 | 70 | ||
| 72 | rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL); | 71 | rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS); |
| 73 | if (!rr) | 72 | if (!rr) |
| 74 | return -ENOMEM; | 73 | return -ENOMEM; |
| 75 | 74 | ||
| @@ -150,7 +149,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, | |||
| 150 | struct gfs2_log_header_host *head) | 149 | struct gfs2_log_header_host *head) |
| 151 | { | 150 | { |
| 152 | struct buffer_head *bh; | 151 | struct buffer_head *bh; |
| 153 | struct gfs2_log_header_host lh; | 152 | struct gfs2_log_header_host uninitialized_var(lh); |
| 154 | const u32 nothing = 0; | 153 | const u32 nothing = 0; |
| 155 | u32 hash; | 154 | u32 hash; |
| 156 | int error; | 155 | int error; |
| @@ -425,6 +424,16 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea | |||
| 425 | return error; | 424 | return error; |
| 426 | } | 425 | } |
| 427 | 426 | ||
| 427 | |||
| 428 | static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, | ||
| 429 | unsigned int message) | ||
| 430 | { | ||
| 431 | if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) | ||
| 432 | sdp->sd_lockstruct.ls_ops->lm_recovery_done( | ||
| 433 | sdp->sd_lockstruct.ls_lockspace, jid, message); | ||
| 434 | } | ||
| 435 | |||
| 436 | |||
| 428 | /** | 437 | /** |
| 429 | * gfs2_recover_journal - recovery a given journal | 438 | * gfs2_recover_journal - recovery a given journal |
| 430 | * @jd: the struct gfs2_jdesc describing the journal | 439 | * @jd: the struct gfs2_jdesc describing the journal |
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3552110b2e5f..7e8f0b1d6c6e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 3 | * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * This copyrighted material is made available to anyone wishing to use, | 5 | * This copyrighted material is made available to anyone wishing to use, |
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | 6 | * modify, copy, or redistribute it subject to the terms and conditions |
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
| 15 | #include <linux/gfs2_ondisk.h> | 15 | #include <linux/gfs2_ondisk.h> |
| 16 | #include <linux/lm_interface.h> | 16 | #include <linux/lm_interface.h> |
| 17 | #include <linux/prefetch.h> | ||
| 17 | 18 | ||
| 18 | #include "gfs2.h" | 19 | #include "gfs2.h" |
| 19 | #include "incore.h" | 20 | #include "incore.h" |
| @@ -33,6 +34,16 @@ | |||
| 33 | #define BFITNOENT ((u32)~0) | 34 | #define BFITNOENT ((u32)~0) |
| 34 | #define NO_BLOCK ((u64)~0) | 35 | #define NO_BLOCK ((u64)~0) |
| 35 | 36 | ||
| 37 | #if BITS_PER_LONG == 32 | ||
| 38 | #define LBITMASK (0x55555555UL) | ||
| 39 | #define LBITSKIP55 (0x55555555UL) | ||
| 40 | #define LBITSKIP00 (0x00000000UL) | ||
| 41 | #else | ||
| 42 | #define LBITMASK (0x5555555555555555UL) | ||
| 43 | #define LBITSKIP55 (0x5555555555555555UL) | ||
| 44 | #define LBITSKIP00 (0x0000000000000000UL) | ||
| 45 | #endif | ||
| 46 | |||
| 36 | /* | 47 | /* |
| 37 | * These routines are used by the resource group routines (rgrp.c) | 48 | * These routines are used by the resource group routines (rgrp.c) |
| 38 | * to keep track of block allocation. Each block is represented by two | 49 | * to keep track of block allocation. Each block is represented by two |
| @@ -53,7 +64,8 @@ static const char valid_change[16] = { | |||
| 53 | }; | 64 | }; |
| 54 | 65 | ||
| 55 | static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, | 66 | static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, |
| 56 | unsigned char old_state, unsigned char new_state); | 67 | unsigned char old_state, unsigned char new_state, |
| 68 | unsigned int *n); | ||
| 57 | 69 | ||
| 58 | /** | 70 | /** |
| 59 | * gfs2_setbit - Set a bit in the bitmaps | 71 | * gfs2_setbit - Set a bit in the bitmaps |
| @@ -64,26 +76,32 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, | |||
| 64 | * | 76 | * |
| 65 | */ | 77 | */ |
| 66 | 78 | ||
| 67 | static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, | 79 | static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, |
| 68 | unsigned int buflen, u32 block, | 80 | unsigned char *buf2, unsigned int offset, |
| 69 | unsigned char new_state) | 81 | unsigned int buflen, u32 block, |
| 82 | unsigned char new_state) | ||
| 70 | { | 83 | { |
| 71 | unsigned char *byte, *end, cur_state; | 84 | unsigned char *byte1, *byte2, *end, cur_state; |
| 72 | unsigned int bit; | 85 | const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; |
| 73 | 86 | ||
| 74 | byte = buffer + (block / GFS2_NBBY); | 87 | byte1 = buf1 + offset + (block / GFS2_NBBY); |
| 75 | bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; | 88 | end = buf1 + offset + buflen; |
| 76 | end = buffer + buflen; | ||
| 77 | 89 | ||
| 78 | gfs2_assert(rgd->rd_sbd, byte < end); | 90 | BUG_ON(byte1 >= end); |
| 79 | 91 | ||
| 80 | cur_state = (*byte >> bit) & GFS2_BIT_MASK; | 92 | cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; |
| 81 | 93 | ||
| 82 | if (valid_change[new_state * 4 + cur_state]) { | 94 | if (unlikely(!valid_change[new_state * 4 + cur_state])) { |
| 83 | *byte ^= cur_state << bit; | ||
| 84 | *byte |= new_state << bit; | ||
| 85 | } else | ||
| 86 | gfs2_consist_rgrpd(rgd); | 95 | gfs2_consist_rgrpd(rgd); |
| 96 | return; | ||
| 97 | } | ||
| 98 | *byte1 ^= (cur_state ^ new_state) << bit; | ||
| 99 | |||
| 100 | if (buf2) { | ||
| 101 | byte2 = buf2 + offset + (block / GFS2_NBBY); | ||
| 102 | cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; | ||
| 103 | *byte2 ^= (cur_state ^ new_state) << bit; | ||
| 104 | } | ||
| 87 | } | 105 | } |
| 88 | 106 | ||
| 89 | /** | 107 | /** |
| @@ -94,10 +112,12 @@ static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, | |||
| 94 | * | 112 | * |
| 95 | */ | 113 | */ |
| 96 | 114 | ||
| 97 | static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, | 115 | static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, |
| 98 | unsigned int buflen, u32 block) | 116 | const unsigned char *buffer, |
| 117 | unsigned int buflen, u32 block) | ||
| 99 | { | 118 | { |
| 100 | unsigned char *byte, *end, cur_state; | 119 | const unsigned char *byte, *end; |
| 120 | unsigned char cur_state; | ||
| 101 | unsigned int bit; | 121 | unsigned int bit; |
| 102 | 122 | ||
| 103 | byte = buffer + (block / GFS2_NBBY); | 123 | byte = buffer + (block / GFS2_NBBY); |
| @@ -126,47 +146,66 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, | |||
| 126 | * Return: the block number (bitmap buffer scope) that was found | 146 | * Return: the block number (bitmap buffer scope) that was found |
| 127 | */ | 147 | */ |
| 128 | 148 | ||
| 129 | static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal, | 149 | static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal, |
| 130 | unsigned char old_state) | 150 | u8 old_state) |
| 131 | { | 151 | { |
| 132 | unsigned char *byte; | 152 | const u8 *byte, *start, *end; |
| 133 | u32 blk = goal; | 153 | int bit, startbit; |
| 134 | unsigned int bit, bitlong; | 154 | u32 g1, g2, misaligned; |
| 135 | unsigned long *plong, plong55; | 155 | unsigned long *plong; |
| 136 | 156 | unsigned long lskipval; | |
| 137 | byte = buffer + (goal / GFS2_NBBY); | 157 | |
| 138 | plong = (unsigned long *)(buffer + (goal / GFS2_NBBY)); | 158 | lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55; |
| 139 | bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; | 159 | g1 = (goal / GFS2_NBBY); |
| 140 | bitlong = bit; | 160 | start = buffer + g1; |
| 141 | #if BITS_PER_LONG == 32 | 161 | byte = start; |
| 142 | plong55 = 0x55555555; | 162 | end = buffer + buflen; |
| 143 | #else | 163 | g2 = ALIGN(g1, sizeof(unsigned long)); |
| 144 | plong55 = 0x5555555555555555; | 164 | plong = (unsigned long *)(buffer + g2); |
| 145 | #endif | 165 | startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; |
| 146 | while (byte < buffer + buflen) { | 166 | misaligned = g2 - g1; |
| 147 | 167 | if (!misaligned) | |
| 148 | if (bitlong == 0 && old_state == 0 && *plong == plong55) { | 168 | goto ulong_aligned; |
| 149 | plong++; | 169 | /* parse the bitmap a byte at a time */ |
| 150 | byte += sizeof(unsigned long); | 170 | misaligned: |
| 151 | blk += sizeof(unsigned long) * GFS2_NBBY; | 171 | while (byte < end) { |
| 152 | continue; | 172 | if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) { |
| 173 | return goal + | ||
| 174 | (((byte - start) * GFS2_NBBY) + | ||
| 175 | ((bit - startbit) >> 1)); | ||
| 153 | } | 176 | } |
| 154 | if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) | ||
| 155 | return blk; | ||
| 156 | bit += GFS2_BIT_SIZE; | 177 | bit += GFS2_BIT_SIZE; |
| 157 | if (bit >= 8) { | 178 | if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) { |
| 158 | bit = 0; | 179 | bit = 0; |
| 159 | byte++; | 180 | byte++; |
| 181 | misaligned--; | ||
| 182 | if (!misaligned) { | ||
| 183 | plong = (unsigned long *)byte; | ||
| 184 | goto ulong_aligned; | ||
| 185 | } | ||
| 160 | } | 186 | } |
| 161 | bitlong += GFS2_BIT_SIZE; | ||
| 162 | if (bitlong >= sizeof(unsigned long) * 8) { | ||
| 163 | bitlong = 0; | ||
| 164 | plong++; | ||
| 165 | } | ||
| 166 | |||
| 167 | blk++; | ||
| 168 | } | 187 | } |
| 188 | return BFITNOENT; | ||
| 169 | 189 | ||
| 190 | /* parse the bitmap an unsigned long at a time */ | ||
| 191 | ulong_aligned: | ||
| 192 | /* Stop at "end - 1" or else prefetch can go past the end and segfault. | ||
| 193 | We could "if" it but we'd lose some of the performance gained. | ||
| 194 | This way will only slow down searching the very last 4/8 bytes | ||
| 195 | depending on architecture. I've experimented with several ways | ||
| 196 | of writing this section such as using an else before the goto | ||
| 197 | but this one seems to be the fastest. */ | ||
| 198 | while ((unsigned char *)plong < end - 1) { | ||
| 199 | prefetch(plong + 1); | ||
| 200 | if (((*plong) & LBITMASK) != lskipval) | ||
| 201 | break; | ||
| 202 | plong++; | ||
| 203 | } | ||
| 204 | if ((unsigned char *)plong < end) { | ||
| 205 | byte = (const u8 *)plong; | ||
| 206 | misaligned += sizeof(unsigned long) - 1; | ||
| 207 | goto misaligned; | ||
| 208 | } | ||
| 170 | return BFITNOENT; | 209 | return BFITNOENT; |
| 171 | } | 210 | } |
| 172 | 211 | ||
| @@ -179,14 +218,14 @@ static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal, | |||
| 179 | * Returns: The number of bits | 218 | * Returns: The number of bits |
| 180 | */ | 219 | */ |
| 181 | 220 | ||
| 182 | static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer, | 221 | static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer, |
| 183 | unsigned int buflen, unsigned char state) | 222 | unsigned int buflen, u8 state) |
| 184 | { | 223 | { |
| 185 | unsigned char *byte = buffer; | 224 | const u8 *byte = buffer; |
| 186 | unsigned char *end = buffer + buflen; | 225 | const u8 *end = buffer + buflen; |
| 187 | unsigned char state1 = state << 2; | 226 | const u8 state1 = state << 2; |
| 188 | unsigned char state2 = state << 4; | 227 | const u8 state2 = state << 4; |
| 189 | unsigned char state3 = state << 6; | 228 | const u8 state3 = state << 6; |
| 190 | u32 count = 0; | 229 | u32 count = 0; |
| 191 | 230 | ||
| 192 | for (; byte < end; byte++) { | 231 | for (; byte < end; byte++) { |
| @@ -353,7 +392,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp) | |||
| 353 | } | 392 | } |
| 354 | 393 | ||
| 355 | kfree(rgd->rd_bits); | 394 | kfree(rgd->rd_bits); |
| 356 | kfree(rgd); | 395 | kmem_cache_free(gfs2_rgrpd_cachep, rgd); |
| 357 | } | 396 | } |
| 358 | } | 397 | } |
| 359 | 398 | ||
| @@ -516,7 +555,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, | |||
| 516 | return error; | 555 | return error; |
| 517 | } | 556 | } |
| 518 | 557 | ||
| 519 | rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS); | 558 | rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); |
| 520 | error = -ENOMEM; | 559 | error = -ENOMEM; |
| 521 | if (!rgd) | 560 | if (!rgd) |
| 522 | return error; | 561 | return error; |
| @@ -539,7 +578,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, | |||
| 539 | return error; | 578 | return error; |
| 540 | 579 | ||
| 541 | rgd->rd_gl->gl_object = rgd; | 580 | rgd->rd_gl->gl_object = rgd; |
| 542 | rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; | 581 | rgd->rd_flags &= ~GFS2_RDF_UPTODATE; |
| 543 | rgd->rd_flags |= GFS2_RDF_CHECK; | 582 | rgd->rd_flags |= GFS2_RDF_CHECK; |
| 544 | return error; | 583 | return error; |
| 545 | } | 584 | } |
| @@ -575,7 +614,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip) | |||
| 575 | } | 614 | } |
| 576 | } | 615 | } |
| 577 | 616 | ||
| 578 | sdp->sd_rindex_vn = ip->i_gl->gl_vn; | 617 | sdp->sd_rindex_uptodate = 1; |
| 579 | return 0; | 618 | return 0; |
| 580 | } | 619 | } |
| 581 | 620 | ||
| @@ -609,7 +648,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) | |||
| 609 | } | 648 | } |
| 610 | } | 649 | } |
| 611 | 650 | ||
| 612 | sdp->sd_rindex_vn = ip->i_gl->gl_vn; | 651 | sdp->sd_rindex_uptodate = 1; |
| 613 | return 0; | 652 | return 0; |
| 614 | } | 653 | } |
| 615 | 654 | ||
| @@ -642,9 +681,9 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) | |||
| 642 | return error; | 681 | return error; |
| 643 | 682 | ||
| 644 | /* Read new copy from disk if we don't have the latest */ | 683 | /* Read new copy from disk if we don't have the latest */ |
| 645 | if (sdp->sd_rindex_vn != gl->gl_vn) { | 684 | if (!sdp->sd_rindex_uptodate) { |
| 646 | mutex_lock(&sdp->sd_rindex_mutex); | 685 | mutex_lock(&sdp->sd_rindex_mutex); |
| 647 | if (sdp->sd_rindex_vn != gl->gl_vn) { | 686 | if (!sdp->sd_rindex_uptodate) { |
| 648 | error = gfs2_ri_update(ip); | 687 | error = gfs2_ri_update(ip); |
| 649 | if (error) | 688 | if (error) |
| 650 | gfs2_glock_dq_uninit(ri_gh); | 689 | gfs2_glock_dq_uninit(ri_gh); |
| @@ -655,21 +694,31 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) | |||
| 655 | return error; | 694 | return error; |
| 656 | } | 695 | } |
| 657 | 696 | ||
| 658 | static void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf) | 697 | static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) |
| 659 | { | 698 | { |
| 660 | const struct gfs2_rgrp *str = buf; | 699 | const struct gfs2_rgrp *str = buf; |
| 700 | struct gfs2_rgrp_host *rg = &rgd->rd_rg; | ||
| 701 | u32 rg_flags; | ||
| 661 | 702 | ||
| 662 | rg->rg_flags = be32_to_cpu(str->rg_flags); | 703 | rg_flags = be32_to_cpu(str->rg_flags); |
| 704 | if (rg_flags & GFS2_RGF_NOALLOC) | ||
| 705 | rgd->rd_flags |= GFS2_RDF_NOALLOC; | ||
| 706 | else | ||
| 707 | rgd->rd_flags &= ~GFS2_RDF_NOALLOC; | ||
| 663 | rg->rg_free = be32_to_cpu(str->rg_free); | 708 | rg->rg_free = be32_to_cpu(str->rg_free); |
| 664 | rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); | 709 | rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); |
| 665 | rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); | 710 | rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); |
| 666 | } | 711 | } |
| 667 | 712 | ||
| 668 | static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf) | 713 | static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) |
| 669 | { | 714 | { |
| 670 | struct gfs2_rgrp *str = buf; | 715 | struct gfs2_rgrp *str = buf; |
| 716 | struct gfs2_rgrp_host *rg = &rgd->rd_rg; | ||
| 717 | u32 rg_flags = 0; | ||
| 671 | 718 | ||
| 672 | str->rg_flags = cpu_to_be32(rg->rg_flags); | 719 | if (rgd->rd_flags & GFS2_RDF_NOALLOC) |
| 720 | rg_flags |= GFS2_RGF_NOALLOC; | ||
| 721 | str->rg_flags = cpu_to_be32(rg_flags); | ||
| 673 | str->rg_free = cpu_to_be32(rg->rg_free); | 722 | str->rg_free = cpu_to_be32(rg->rg_free); |
| 674 | str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); | 723 | str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); |
| 675 | str->__pad = cpu_to_be32(0); | 724 | str->__pad = cpu_to_be32(0); |
| @@ -726,9 +775,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) | |||
| 726 | } | 775 | } |
| 727 | } | 776 | } |
| 728 | 777 | ||
| 729 | if (rgd->rd_rg_vn != gl->gl_vn) { | 778 | if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { |
| 730 | gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data); | 779 | gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); |
| 731 | rgd->rd_rg_vn = gl->gl_vn; | 780 | rgd->rd_flags |= GFS2_RDF_UPTODATE; |
| 732 | } | 781 | } |
| 733 | 782 | ||
| 734 | spin_lock(&sdp->sd_rindex_spin); | 783 | spin_lock(&sdp->sd_rindex_spin); |
| @@ -840,7 +889,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) | |||
| 840 | struct gfs2_sbd *sdp = rgd->rd_sbd; | 889 | struct gfs2_sbd *sdp = rgd->rd_sbd; |
| 841 | int ret = 0; | 890 | int ret = 0; |
| 842 | 891 | ||
| 843 | if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC) | 892 | if (rgd->rd_flags & GFS2_RDF_NOALLOC) |
| 844 | return 0; | 893 | return 0; |
| 845 | 894 | ||
| 846 | spin_lock(&sdp->sd_rindex_spin); | 895 | spin_lock(&sdp->sd_rindex_spin); |
| @@ -866,13 +915,15 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) | |||
| 866 | u32 goal = 0, block; | 915 | u32 goal = 0, block; |
| 867 | u64 no_addr; | 916 | u64 no_addr; |
| 868 | struct gfs2_sbd *sdp = rgd->rd_sbd; | 917 | struct gfs2_sbd *sdp = rgd->rd_sbd; |
| 918 | unsigned int n; | ||
| 869 | 919 | ||
| 870 | for(;;) { | 920 | for(;;) { |
| 871 | if (goal >= rgd->rd_data) | 921 | if (goal >= rgd->rd_data) |
| 872 | break; | 922 | break; |
| 873 | down_write(&sdp->sd_log_flush_lock); | 923 | down_write(&sdp->sd_log_flush_lock); |
| 924 | n = 1; | ||
| 874 | block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, | 925 | block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, |
| 875 | GFS2_BLKST_UNLINKED); | 926 | GFS2_BLKST_UNLINKED, &n); |
| 876 | up_write(&sdp->sd_log_flush_lock); | 927 | up_write(&sdp->sd_log_flush_lock); |
| 877 | if (block == BFITNOENT) | 928 | if (block == BFITNOENT) |
| 878 | break; | 929 | break; |
| @@ -904,24 +955,20 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) | |||
| 904 | static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, | 955 | static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, |
| 905 | u64 rglast) | 956 | u64 rglast) |
| 906 | { | 957 | { |
| 907 | struct gfs2_rgrpd *rgd = NULL; | 958 | struct gfs2_rgrpd *rgd; |
| 908 | 959 | ||
| 909 | spin_lock(&sdp->sd_rindex_spin); | 960 | spin_lock(&sdp->sd_rindex_spin); |
| 910 | 961 | ||
| 911 | if (list_empty(&sdp->sd_rindex_recent_list)) | 962 | if (rglast) { |
| 912 | goto out; | 963 | list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { |
| 913 | 964 | if (rgrp_contains_block(rgd, rglast)) | |
| 914 | if (!rglast) | 965 | goto out; |
| 915 | goto first; | 966 | } |
| 916 | |||
| 917 | list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { | ||
| 918 | if (rgd->rd_addr == rglast) | ||
| 919 | goto out; | ||
| 920 | } | 967 | } |
| 921 | 968 | rgd = NULL; | |
| 922 | first: | 969 | if (!list_empty(&sdp->sd_rindex_recent_list)) |
| 923 | rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd, | 970 | rgd = list_entry(sdp->sd_rindex_recent_list.next, |
| 924 | rd_recent); | 971 | struct gfs2_rgrpd, rd_recent); |
| 925 | out: | 972 | out: |
| 926 | spin_unlock(&sdp->sd_rindex_spin); | 973 | spin_unlock(&sdp->sd_rindex_spin); |
| 927 | return rgd; | 974 | return rgd; |
| @@ -1067,7 +1114,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) | |||
| 1067 | 1114 | ||
| 1068 | /* Try recently successful rgrps */ | 1115 | /* Try recently successful rgrps */ |
| 1069 | 1116 | ||
| 1070 | rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc); | 1117 | rgd = recent_rgrp_first(sdp, ip->i_goal); |
| 1071 | 1118 | ||
| 1072 | while (rgd) { | 1119 | while (rgd) { |
| 1073 | rg_locked = 0; | 1120 | rg_locked = 0; |
| @@ -1151,8 +1198,6 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) | |||
| 1151 | } | 1198 | } |
| 1152 | 1199 | ||
| 1153 | out: | 1200 | out: |
| 1154 | ip->i_last_rg_alloc = rgd->rd_addr; | ||
| 1155 | |||
| 1156 | if (begin) { | 1201 | if (begin) { |
| 1157 | recent_rgrp_add(rgd); | 1202 | recent_rgrp_add(rgd); |
| 1158 | rgd = gfs2_rgrpd_get_next(rgd); | 1203 | rgd = gfs2_rgrpd_get_next(rgd); |
| @@ -1275,6 +1320,7 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) | |||
| 1275 | * @goal: the goal block within the RG (start here to search for avail block) | 1320 | * @goal: the goal block within the RG (start here to search for avail block) |
| 1276 | * @old_state: GFS2_BLKST_XXX the before-allocation state to find | 1321 | * @old_state: GFS2_BLKST_XXX the before-allocation state to find |
| 1277 | * @new_state: GFS2_BLKST_XXX the after-allocation block state | 1322 | * @new_state: GFS2_BLKST_XXX the after-allocation block state |
| 1323 | * @n: The extent length | ||
| 1278 | * | 1324 | * |
| 1279 | * Walk rgrp's bitmap to find bits that represent a block in @old_state. | 1325 | * Walk rgrp's bitmap to find bits that represent a block in @old_state. |
| 1280 | * Add the found bitmap buffer to the transaction. | 1326 | * Add the found bitmap buffer to the transaction. |
| @@ -1290,13 +1336,17 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) | |||
| 1290 | */ | 1336 | */ |
| 1291 | 1337 | ||
| 1292 | static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, | 1338 | static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, |
| 1293 | unsigned char old_state, unsigned char new_state) | 1339 | unsigned char old_state, unsigned char new_state, |
| 1340 | unsigned int *n) | ||
| 1294 | { | 1341 | { |
| 1295 | struct gfs2_bitmap *bi = NULL; | 1342 | struct gfs2_bitmap *bi = NULL; |
| 1296 | u32 length = rgd->rd_length; | 1343 | const u32 length = rgd->rd_length; |
| 1297 | u32 blk = 0; | 1344 | u32 blk = 0; |
| 1298 | unsigned int buf, x; | 1345 | unsigned int buf, x; |
| 1346 | const unsigned int elen = *n; | ||
| 1347 | const u8 *buffer; | ||
| 1299 | 1348 | ||
| 1349 | *n = 0; | ||
| 1300 | /* Find bitmap block that contains bits for goal block */ | 1350 | /* Find bitmap block that contains bits for goal block */ |
| 1301 | for (buf = 0; buf < length; buf++) { | 1351 | for (buf = 0; buf < length; buf++) { |
| 1302 | bi = rgd->rd_bits + buf; | 1352 | bi = rgd->rd_bits + buf; |
| @@ -1317,12 +1367,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, | |||
| 1317 | for (x = 0; x <= length; x++) { | 1367 | for (x = 0; x <= length; x++) { |
| 1318 | /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone | 1368 | /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone |
| 1319 | bitmaps, so we must search the originals for that. */ | 1369 | bitmaps, so we must search the originals for that. */ |
| 1370 | buffer = bi->bi_bh->b_data + bi->bi_offset; | ||
| 1320 | if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) | 1371 | if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) |
| 1321 | blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset, | 1372 | buffer = bi->bi_clone + bi->bi_offset; |
| 1322 | bi->bi_len, goal, old_state); | 1373 | |
| 1323 | else | 1374 | blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state); |
| 1324 | blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset, | ||
| 1325 | bi->bi_len, goal, old_state); | ||
| 1326 | if (blk != BFITNOENT) | 1375 | if (blk != BFITNOENT) |
| 1327 | break; | 1376 | break; |
| 1328 | 1377 | ||
| @@ -1333,12 +1382,23 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, | |||
| 1333 | } | 1382 | } |
| 1334 | 1383 | ||
| 1335 | if (blk != BFITNOENT && old_state != new_state) { | 1384 | if (blk != BFITNOENT && old_state != new_state) { |
| 1385 | *n = 1; | ||
| 1336 | gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); | 1386 | gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); |
| 1337 | gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, | 1387 | gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, |
| 1338 | bi->bi_len, blk, new_state); | 1388 | bi->bi_len, blk, new_state); |
| 1339 | if (bi->bi_clone) | 1389 | goal = blk; |
| 1340 | gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset, | 1390 | while (*n < elen) { |
| 1341 | bi->bi_len, blk, new_state); | 1391 | goal++; |
| 1392 | if (goal >= (bi->bi_len * GFS2_NBBY)) | ||
| 1393 | break; | ||
| 1394 | if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != | ||
| 1395 | GFS2_BLKST_FREE) | ||
| 1396 | break; | ||
| 1397 | gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, | ||
| 1398 | bi->bi_offset, bi->bi_len, goal, | ||
| 1399 | new_state); | ||
| 1400 | (*n)++; | ||
| 1401 | } | ||
| 1342 | } | 1402 | } |
| 1343 | 1403 | ||
| 1344 | return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; | 1404 | return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; |
| @@ -1393,7 +1453,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, | |||
| 1393 | bi->bi_len); | 1453 | bi->bi_len); |
| 1394 | } | 1454 | } |
| 1395 | gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); | 1455 | gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); |
| 1396 | gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, | 1456 | gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset, |
| 1397 | bi->bi_len, buf_blk, new_state); | 1457 | bi->bi_len, buf_blk, new_state); |
| 1398 | } | 1458 | } |
| 1399 | 1459 | ||
| @@ -1401,13 +1461,13 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, | |||
| 1401 | } | 1461 | } |
| 1402 | 1462 | ||
| 1403 | /** | 1463 | /** |
| 1404 | * gfs2_alloc_data - Allocate a data block | 1464 | * gfs2_alloc_block - Allocate a block |
| 1405 | * @ip: the inode to allocate the data block for | 1465 | * @ip: the inode to allocate the block for |
| 1406 | * | 1466 | * |
| 1407 | * Returns: the allocated block | 1467 | * Returns: the allocated block |
| 1408 | */ | 1468 | */ |
| 1409 | 1469 | ||
| 1410 | u64 gfs2_alloc_data(struct gfs2_inode *ip) | 1470 | u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) |
| 1411 | { | 1471 | { |
| 1412 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 1472 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
| 1413 | struct gfs2_alloc *al = ip->i_alloc; | 1473 | struct gfs2_alloc *al = ip->i_alloc; |
| @@ -1415,77 +1475,31 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip) | |||
| 1415 | u32 goal, blk; | 1475 | u32 goal, blk; |
| 1416 | u64 block; | 1476 | u64 block; |
| 1417 | 1477 | ||
| 1418 | if (rgrp_contains_block(rgd, ip->i_di.di_goal_data)) | 1478 | if (rgrp_contains_block(rgd, ip->i_goal)) |
| 1419 | goal = ip->i_di.di_goal_data - rgd->rd_data0; | 1479 | goal = ip->i_goal - rgd->rd_data0; |
| 1420 | else | 1480 | else |
| 1421 | goal = rgd->rd_last_alloc_data; | 1481 | goal = rgd->rd_last_alloc; |
| 1422 | 1482 | ||
| 1423 | blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); | 1483 | blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); |
| 1424 | BUG_ON(blk == BFITNOENT); | 1484 | BUG_ON(blk == BFITNOENT); |
| 1425 | rgd->rd_last_alloc_data = blk; | ||
| 1426 | 1485 | ||
| 1486 | rgd->rd_last_alloc = blk; | ||
| 1427 | block = rgd->rd_data0 + blk; | 1487 | block = rgd->rd_data0 + blk; |
| 1428 | ip->i_di.di_goal_data = block; | 1488 | ip->i_goal = block; |
| 1429 | 1489 | ||
| 1430 | gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); | 1490 | gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n); |
| 1431 | rgd->rd_rg.rg_free--; | 1491 | rgd->rd_rg.rg_free -= *n; |
| 1432 | 1492 | ||
| 1433 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1493 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
| 1434 | gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); | 1494 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
| 1435 | 1495 | ||
| 1436 | al->al_alloced++; | 1496 | al->al_alloced += *n; |
| 1437 | 1497 | ||
| 1438 | gfs2_statfs_change(sdp, 0, -1, 0); | 1498 | gfs2_statfs_change(sdp, 0, -*n, 0); |
| 1439 | gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid); | 1499 | gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); |
| 1440 | 1500 | ||
| 1441 | spin_lock(&sdp->sd_rindex_spin); | 1501 | spin_lock(&sdp->sd_rindex_spin); |
| 1442 | rgd->rd_free_clone--; | 1502 | rgd->rd_free_clone -= *n; |
| 1443 | spin_unlock(&sdp->sd_rindex_spin); | ||
| 1444 | |||
| 1445 | return block; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | /** | ||
| 1449 | * gfs2_alloc_meta - Allocate a metadata block | ||
| 1450 | * @ip: the inode to allocate the metadata block for | ||
| 1451 | * | ||
| 1452 | * Returns: the allocated block | ||
| 1453 | */ | ||
| 1454 | |||
| 1455 | u64 gfs2_alloc_meta(struct gfs2_inode *ip) | ||
| 1456 | { | ||
| 1457 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
| 1458 | struct gfs2_alloc *al = ip->i_alloc; | ||
| 1459 | struct gfs2_rgrpd *rgd = al->al_rgd; | ||
| 1460 | u32 goal, blk; | ||
| 1461 | u64 block; | ||
| 1462 | |||
| 1463 | if (rgrp_contains_block(rgd, ip->i_di.di_goal_meta)) | ||
| 1464 | goal = ip->i_di.di_goal_meta - rgd->rd_data0; | ||
| 1465 | else | ||
| 1466 | goal = rgd->rd_last_alloc_meta; | ||
| 1467 | |||
| 1468 | blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); | ||
| 1469 | BUG_ON(blk == BFITNOENT); | ||
| 1470 | rgd->rd_last_alloc_meta = blk; | ||
| 1471 | |||
| 1472 | block = rgd->rd_data0 + blk; | ||
| 1473 | ip->i_di.di_goal_meta = block; | ||
| 1474 | |||
| 1475 | gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); | ||
| 1476 | rgd->rd_rg.rg_free--; | ||
| 1477 | |||
| 1478 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | ||
| 1479 | gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); | ||
| 1480 | |||
| 1481 | al->al_alloced++; | ||
| 1482 | |||
| 1483 | gfs2_statfs_change(sdp, 0, -1, 0); | ||
| 1484 | gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid); | ||
| 1485 | gfs2_trans_add_unrevoke(sdp, block); | ||
| 1486 | |||
| 1487 | spin_lock(&sdp->sd_rindex_spin); | ||
| 1488 | rgd->rd_free_clone--; | ||
| 1489 | spin_unlock(&sdp->sd_rindex_spin); | 1503 | spin_unlock(&sdp->sd_rindex_spin); |
| 1490 | 1504 | ||
| 1491 | return block; | 1505 | return block; |
| @@ -1505,12 +1519,13 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) | |||
| 1505 | struct gfs2_rgrpd *rgd = al->al_rgd; | 1519 | struct gfs2_rgrpd *rgd = al->al_rgd; |
| 1506 | u32 blk; | 1520 | u32 blk; |
| 1507 | u64 block; | 1521 | u64 block; |
| 1522 | unsigned int n = 1; | ||
| 1508 | 1523 | ||
| 1509 | blk = rgblk_search(rgd, rgd->rd_last_alloc_meta, | 1524 | blk = rgblk_search(rgd, rgd->rd_last_alloc, |
| 1510 | GFS2_BLKST_FREE, GFS2_BLKST_DINODE); | 1525 | GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); |
| 1511 | BUG_ON(blk == BFITNOENT); | 1526 | BUG_ON(blk == BFITNOENT); |
| 1512 | 1527 | ||
| 1513 | rgd->rd_last_alloc_meta = blk; | 1528 | rgd->rd_last_alloc = blk; |
| 1514 | 1529 | ||
| 1515 | block = rgd->rd_data0 + blk; | 1530 | block = rgd->rd_data0 + blk; |
| 1516 | 1531 | ||
| @@ -1519,12 +1534,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) | |||
| 1519 | rgd->rd_rg.rg_dinodes++; | 1534 | rgd->rd_rg.rg_dinodes++; |
| 1520 | *generation = rgd->rd_rg.rg_igeneration++; | 1535 | *generation = rgd->rd_rg.rg_igeneration++; |
| 1521 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1536 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
| 1522 | gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); | 1537 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
| 1523 | 1538 | ||
| 1524 | al->al_alloced++; | 1539 | al->al_alloced++; |
| 1525 | 1540 | ||
| 1526 | gfs2_statfs_change(sdp, 0, -1, +1); | 1541 | gfs2_statfs_change(sdp, 0, -1, +1); |
| 1527 | gfs2_trans_add_unrevoke(sdp, block); | 1542 | gfs2_trans_add_unrevoke(sdp, block, 1); |
| 1528 | 1543 | ||
| 1529 | spin_lock(&sdp->sd_rindex_spin); | 1544 | spin_lock(&sdp->sd_rindex_spin); |
| 1530 | rgd->rd_free_clone--; | 1545 | rgd->rd_free_clone--; |
| @@ -1553,7 +1568,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) | |||
| 1553 | rgd->rd_rg.rg_free += blen; | 1568 | rgd->rd_rg.rg_free += blen; |
| 1554 | 1569 | ||
| 1555 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1570 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
| 1556 | gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); | 1571 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
| 1557 | 1572 | ||
| 1558 | gfs2_trans_add_rg(rgd); | 1573 | gfs2_trans_add_rg(rgd); |
| 1559 | 1574 | ||
| @@ -1581,7 +1596,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) | |||
| 1581 | rgd->rd_rg.rg_free += blen; | 1596 | rgd->rd_rg.rg_free += blen; |
| 1582 | 1597 | ||
| 1583 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1598 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
| 1584 | gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); | 1599 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
| 1585 | 1600 | ||
| 1586 | gfs2_trans_add_rg(rgd); | 1601 | gfs2_trans_add_rg(rgd); |
| 1587 | 1602 | ||
| @@ -1601,7 +1616,7 @@ void gfs2_unlink_di(struct inode *inode) | |||
| 1601 | if (!rgd) | 1616 | if (!rgd) |
| 1602 | return; | 1617 | return; |
| 1603 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1618 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
| 1604 | gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); | 1619 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
| 1605 | gfs2_trans_add_rg(rgd); | 1620 | gfs2_trans_add_rg(rgd); |
| 1606 | } | 1621 | } |
| 1607 | 1622 | ||
| @@ -1621,7 +1636,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) | |||
| 1621 | rgd->rd_rg.rg_free++; | 1636 | rgd->rd_rg.rg_free++; |
| 1622 | 1637 | ||
| 1623 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1638 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
| 1624 | gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); | 1639 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
| 1625 | 1640 | ||
| 1626 | gfs2_statfs_change(sdp, 0, +1, -1); | 1641 | gfs2_statfs_change(sdp, 0, +1, -1); |
| 1627 | gfs2_trans_add_rg(rgd); | 1642 | gfs2_trans_add_rg(rgd); |
| @@ -1699,8 +1714,7 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, | |||
| 1699 | * | 1714 | * |
| 1700 | */ | 1715 | */ |
| 1701 | 1716 | ||
| 1702 | void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, | 1717 | void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) |
| 1703 | int flags) | ||
| 1704 | { | 1718 | { |
| 1705 | unsigned int x; | 1719 | unsigned int x; |
| 1706 | 1720 | ||
| @@ -1708,7 +1722,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, | |||
| 1708 | GFP_NOFS | __GFP_NOFAIL); | 1722 | GFP_NOFS | __GFP_NOFAIL); |
| 1709 | for (x = 0; x < rlist->rl_rgrps; x++) | 1723 | for (x = 0; x < rlist->rl_rgrps; x++) |
| 1710 | gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, | 1724 | gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, |
| 1711 | state, flags, | 1725 | state, 0, |
| 1712 | &rlist->rl_ghs[x]); | 1726 | &rlist->rl_ghs[x]); |
| 1713 | } | 1727 | } |
| 1714 | 1728 | ||
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 149bb161f4b6..3181c7e624bf 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 4 | * | 4 | * |
| 5 | * This copyrighted material is made available to anyone wishing to use, | 5 | * This copyrighted material is made available to anyone wishing to use, |
| 6 | * modify, copy, or redistribute it subject to the terms and conditions | 6 | * modify, copy, or redistribute it subject to the terms and conditions |
| @@ -46,8 +46,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip); | |||
| 46 | 46 | ||
| 47 | unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); | 47 | unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); |
| 48 | 48 | ||
| 49 | u64 gfs2_alloc_data(struct gfs2_inode *ip); | 49 | u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n); |
| 50 | u64 gfs2_alloc_meta(struct gfs2_inode *ip); | ||
| 51 | u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); | 50 | u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); |
| 52 | 51 | ||
| 53 | void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); | 52 | void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); |
| @@ -64,8 +63,7 @@ struct gfs2_rgrp_list { | |||
| 64 | 63 | ||
| 65 | void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, | 64 | void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, |
| 66 | u64 block); | 65 | u64 block); |
| 67 | void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, | 66 | void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); |
| 68 | int flags); | ||
| 69 | void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); | 67 | void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); |
| 70 | u64 gfs2_ri_total(struct gfs2_sbd *sdp); | 68 | u64 gfs2_ri_total(struct gfs2_sbd *sdp); |
| 71 | 69 | ||
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ef0562c3bc71..7aeacbc65f35 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c | |||
| @@ -210,7 +210,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) | |||
| 210 | struct page *page; | 210 | struct page *page; |
| 211 | struct bio *bio; | 211 | struct bio *bio; |
| 212 | 212 | ||
| 213 | page = alloc_page(GFP_KERNEL); | 213 | page = alloc_page(GFP_NOFS); |
| 214 | if (unlikely(!page)) | 214 | if (unlikely(!page)) |
| 215 | return -ENOBUFS; | 215 | return -ENOBUFS; |
| 216 | 216 | ||
| @@ -218,7 +218,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) | |||
| 218 | ClearPageDirty(page); | 218 | ClearPageDirty(page); |
| 219 | lock_page(page); | 219 | lock_page(page); |
| 220 | 220 | ||
| 221 | bio = bio_alloc(GFP_KERNEL, 1); | 221 | bio = bio_alloc(GFP_NOFS, 1); |
| 222 | if (unlikely(!bio)) { | 222 | if (unlikely(!bio)) { |
| 223 | __free_page(page); | 223 | __free_page(page); |
| 224 | return -ENOBUFS; | 224 | return -ENOBUFS; |
| @@ -316,6 +316,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) | |||
| 316 | sdp->sd_heightsize[x] = space; | 316 | sdp->sd_heightsize[x] = space; |
| 317 | } | 317 | } |
| 318 | sdp->sd_max_height = x; | 318 | sdp->sd_max_height = x; |
| 319 | sdp->sd_heightsize[x] = ~0; | ||
| 319 | gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); | 320 | gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); |
| 320 | 321 | ||
| 321 | sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - | 322 | sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - |
| @@ -334,6 +335,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) | |||
| 334 | sdp->sd_jheightsize[x] = space; | 335 | sdp->sd_jheightsize[x] = space; |
| 335 | } | 336 | } |
| 336 | sdp->sd_max_jheight = x; | 337 | sdp->sd_max_jheight = x; |
| 338 | sdp->sd_jheightsize[x] = ~0; | ||
| 337 | gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); | 339 | gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); |
| 338 | 340 | ||
| 339 | return 0; | 341 | return 0; |
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 60a870e430be..44361ecc44f7 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h | |||
| @@ -17,6 +17,7 @@ void gfs2_tune_init(struct gfs2_tune *gt); | |||
| 17 | int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent); | 17 | int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent); |
| 18 | int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); | 18 | int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); |
| 19 | int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector); | 19 | int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector); |
| 20 | void gfs2_lm_unmount(struct gfs2_sbd *sdp); | ||
| 20 | 21 | ||
| 21 | static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) | 22 | static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) |
| 22 | { | 23 | { |
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index eaa3b7b2f99e..9ab9fc85ecd0 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c | |||
| @@ -20,7 +20,6 @@ | |||
| 20 | 20 | ||
| 21 | #include "gfs2.h" | 21 | #include "gfs2.h" |
| 22 | #include "incore.h" | 22 | #include "incore.h" |
| 23 | #include "lm.h" | ||
| 24 | #include "sys.h" | 23 | #include "sys.h" |
| 25 | #include "super.h" | 24 | #include "super.h" |
| 26 | #include "glock.h" | 25 | #include "glock.h" |
| @@ -328,15 +327,9 @@ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ | |||
| 328 | } \ | 327 | } \ |
| 329 | static struct counters_attr counters_attr_##name = __ATTR_RO(name) | 328 | static struct counters_attr counters_attr_##name = __ATTR_RO(name) |
| 330 | 329 | ||
| 331 | COUNTERS_ATTR(glock_count, "%u\n"); | ||
| 332 | COUNTERS_ATTR(glock_held_count, "%u\n"); | ||
| 333 | COUNTERS_ATTR(inode_count, "%u\n"); | ||
| 334 | COUNTERS_ATTR(reclaimed, "%u\n"); | 330 | COUNTERS_ATTR(reclaimed, "%u\n"); |
| 335 | 331 | ||
| 336 | static struct attribute *counters_attrs[] = { | 332 | static struct attribute *counters_attrs[] = { |
| 337 | &counters_attr_glock_count.attr, | ||
| 338 | &counters_attr_glock_held_count.attr, | ||
| 339 | &counters_attr_inode_count.attr, | ||
| 340 | &counters_attr_reclaimed.attr, | 333 | &counters_attr_reclaimed.attr, |
| 341 | NULL, | 334 | NULL, |
| 342 | }; | 335 | }; |
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 73e5d92a657c..f677b8a83f0c 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c | |||
| @@ -146,30 +146,25 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) | |||
| 146 | lops_add(sdp, &bd->bd_le); | 146 | lops_add(sdp, &bd->bd_le); |
| 147 | } | 147 | } |
| 148 | 148 | ||
| 149 | void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno) | 149 | void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) |
| 150 | { | 150 | { |
| 151 | struct gfs2_bufdata *bd; | 151 | struct gfs2_bufdata *bd, *tmp; |
| 152 | int found = 0; | 152 | struct gfs2_trans *tr = current->journal_info; |
| 153 | unsigned int n = len; | ||
| 153 | 154 | ||
| 154 | gfs2_log_lock(sdp); | 155 | gfs2_log_lock(sdp); |
| 155 | 156 | list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) { | |
| 156 | list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) { | 157 | if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { |
| 157 | if (bd->bd_blkno == blkno) { | ||
| 158 | list_del_init(&bd->bd_le.le_list); | 158 | list_del_init(&bd->bd_le.le_list); |
| 159 | gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); | 159 | gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); |
| 160 | sdp->sd_log_num_revoke--; | 160 | sdp->sd_log_num_revoke--; |
| 161 | found = 1; | 161 | kmem_cache_free(gfs2_bufdata_cachep, bd); |
| 162 | break; | 162 | tr->tr_num_revoke_rm++; |
| 163 | if (--n == 0) | ||
| 164 | break; | ||
| 163 | } | 165 | } |
| 164 | } | 166 | } |
| 165 | |||
| 166 | gfs2_log_unlock(sdp); | 167 | gfs2_log_unlock(sdp); |
| 167 | |||
| 168 | if (found) { | ||
| 169 | struct gfs2_trans *tr = current->journal_info; | ||
| 170 | kmem_cache_free(gfs2_bufdata_cachep, bd); | ||
| 171 | tr->tr_num_revoke_rm++; | ||
| 172 | } | ||
| 173 | } | 168 | } |
| 174 | 169 | ||
| 175 | void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) | 170 | void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) |
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index e826f0dab80a..edf9d4bd908e 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h | |||
| @@ -32,7 +32,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp); | |||
| 32 | 32 | ||
| 33 | void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); | 33 | void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); |
| 34 | void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); | 34 | void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); |
| 35 | void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); | 35 | void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); |
| 36 | void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); | 36 | void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); |
| 37 | 37 | ||
| 38 | #endif /* __TRANS_DOT_H__ */ | 38 | #endif /* __TRANS_DOT_H__ */ |
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 424a0774eda8..d31e355c61fb 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c | |||
| @@ -19,12 +19,12 @@ | |||
| 19 | #include "gfs2.h" | 19 | #include "gfs2.h" |
| 20 | #include "incore.h" | 20 | #include "incore.h" |
| 21 | #include "glock.h" | 21 | #include "glock.h" |
| 22 | #include "lm.h" | ||
| 23 | #include "util.h" | 22 | #include "util.h" |
| 24 | 23 | ||
| 25 | struct kmem_cache *gfs2_glock_cachep __read_mostly; | 24 | struct kmem_cache *gfs2_glock_cachep __read_mostly; |
| 26 | struct kmem_cache *gfs2_inode_cachep __read_mostly; | 25 | struct kmem_cache *gfs2_inode_cachep __read_mostly; |
| 27 | struct kmem_cache *gfs2_bufdata_cachep __read_mostly; | 26 | struct kmem_cache *gfs2_bufdata_cachep __read_mostly; |
| 27 | struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; | ||
| 28 | 28 | ||
| 29 | void gfs2_assert_i(struct gfs2_sbd *sdp) | 29 | void gfs2_assert_i(struct gfs2_sbd *sdp) |
| 30 | { | 30 | { |
| @@ -32,6 +32,28 @@ void gfs2_assert_i(struct gfs2_sbd *sdp) | |||
| 32 | sdp->sd_fsname); | 32 | sdp->sd_fsname); |
| 33 | } | 33 | } |
| 34 | 34 | ||
| 35 | int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) | ||
| 36 | { | ||
| 37 | va_list args; | ||
| 38 | |||
| 39 | if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
| 40 | return 0; | ||
| 41 | |||
| 42 | va_start(args, fmt); | ||
| 43 | vprintk(fmt, args); | ||
| 44 | va_end(args); | ||
| 45 | |||
| 46 | fs_err(sdp, "about to withdraw this file system\n"); | ||
| 47 | BUG_ON(sdp->sd_args.ar_debug); | ||
| 48 | |||
| 49 | fs_err(sdp, "telling LM to withdraw\n"); | ||
| 50 | gfs2_withdraw_lockproto(&sdp->sd_lockstruct); | ||
| 51 | fs_err(sdp, "withdrawn\n"); | ||
| 52 | dump_stack(); | ||
| 53 | |||
| 54 | return -1; | ||
| 55 | } | ||
| 56 | |||
| 35 | /** | 57 | /** |
| 36 | * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false | 58 | * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false |
| 37 | * Returns: -1 if this call withdrew the machine, | 59 | * Returns: -1 if this call withdrew the machine, |
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 28938a46cf47..509c5d60bd80 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h | |||
| @@ -147,6 +147,7 @@ gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__); | |||
| 147 | extern struct kmem_cache *gfs2_glock_cachep; | 147 | extern struct kmem_cache *gfs2_glock_cachep; |
| 148 | extern struct kmem_cache *gfs2_inode_cachep; | 148 | extern struct kmem_cache *gfs2_inode_cachep; |
| 149 | extern struct kmem_cache *gfs2_bufdata_cachep; | 149 | extern struct kmem_cache *gfs2_bufdata_cachep; |
| 150 | extern struct kmem_cache *gfs2_rgrpd_cachep; | ||
| 150 | 151 | ||
| 151 | static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, | 152 | static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, |
| 152 | unsigned int *p) | 153 | unsigned int *p) |
| @@ -163,6 +164,7 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) | |||
| 163 | 164 | ||
| 164 | void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, | 165 | void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, |
| 165 | unsigned int bit, int new_value); | 166 | unsigned int bit, int new_value); |
| 167 | int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); | ||
| 166 | 168 | ||
| 167 | #endif /* __UTIL_DOT_H__ */ | 169 | #endif /* __UTIL_DOT_H__ */ |
| 168 | 170 | ||
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index f9c5dd6f4b64..dcc2734e0b5d 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c | |||
| @@ -129,7 +129,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, | |||
| 129 | struct inode *inode = mapping->host; | 129 | struct inode *inode = mapping->host; |
| 130 | struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); | 130 | struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); |
| 131 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 131 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
| 132 | uint32_t pageofs = pos & (PAGE_CACHE_SIZE - 1); | 132 | uint32_t pageofs = index << PAGE_CACHE_SHIFT; |
| 133 | int ret = 0; | 133 | int ret = 0; |
| 134 | 134 | ||
| 135 | pg = __grab_cache_page(mapping, index); | 135 | pg = __grab_cache_page(mapping, index); |
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index e1985066b1c6..2bc7d8aa5740 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c | |||
| @@ -2172,7 +2172,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, | |||
| 2172 | } | 2172 | } |
| 2173 | 2173 | ||
| 2174 | /* update the free count for this dmap */ | 2174 | /* update the free count for this dmap */ |
| 2175 | dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); | 2175 | le32_add_cpu(&dp->nfree, -nblocks); |
| 2176 | 2176 | ||
| 2177 | BMAP_LOCK(bmp); | 2177 | BMAP_LOCK(bmp); |
| 2178 | 2178 | ||
| @@ -2316,7 +2316,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, | |||
| 2316 | 2316 | ||
| 2317 | /* update the free count for this dmap. | 2317 | /* update the free count for this dmap. |
| 2318 | */ | 2318 | */ |
| 2319 | dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); | 2319 | le32_add_cpu(&dp->nfree, nblocks); |
| 2320 | 2320 | ||
| 2321 | BMAP_LOCK(bmp); | 2321 | BMAP_LOCK(bmp); |
| 2322 | 2322 | ||
| @@ -3226,7 +3226,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, | |||
| 3226 | } | 3226 | } |
| 3227 | 3227 | ||
| 3228 | /* update the free count for this dmap */ | 3228 | /* update the free count for this dmap */ |
| 3229 | dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); | 3229 | le32_add_cpu(&dp->nfree, -nblocks); |
| 3230 | 3230 | ||
| 3231 | /* reconstruct summary tree */ | 3231 | /* reconstruct summary tree */ |
| 3232 | dbInitDmapTree(dp); | 3232 | dbInitDmapTree(dp); |
| @@ -3660,9 +3660,8 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) | |||
| 3660 | goto initTree; | 3660 | goto initTree; |
| 3661 | } | 3661 | } |
| 3662 | } else { | 3662 | } else { |
| 3663 | dp->nblocks = | 3663 | le32_add_cpu(&dp->nblocks, nblocks); |
| 3664 | cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks); | 3664 | le32_add_cpu(&dp->nfree, nblocks); |
| 3665 | dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); | ||
| 3666 | } | 3665 | } |
| 3667 | 3666 | ||
| 3668 | /* word number containing start block number */ | 3667 | /* word number containing start block number */ |
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h index 11e6d471b364..1a6eb41569bc 100644 --- a/fs/jfs/jfs_dmap.h +++ b/fs/jfs/jfs_dmap.h | |||
| @@ -61,7 +61,7 @@ | |||
| 61 | * determine the maximum free string for four (lower level) nodes | 61 | * determine the maximum free string for four (lower level) nodes |
| 62 | * of the tree. | 62 | * of the tree. |
| 63 | */ | 63 | */ |
| 64 | static __inline signed char TREEMAX(signed char *cp) | 64 | static inline signed char TREEMAX(signed char *cp) |
| 65 | { | 65 | { |
| 66 | signed char tmp1, tmp2; | 66 | signed char tmp1, tmp2; |
| 67 | 67 | ||
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 9bf29f771737..734ec916beaf 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c | |||
| @@ -1019,8 +1019,7 @@ int diFree(struct inode *ip) | |||
| 1019 | /* update the free inode counts at the iag, ag and | 1019 | /* update the free inode counts at the iag, ag and |
| 1020 | * map level. | 1020 | * map level. |
| 1021 | */ | 1021 | */ |
| 1022 | iagp->nfreeinos = | 1022 | le32_add_cpu(&iagp->nfreeinos, 1); |
| 1023 | cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1); | ||
| 1024 | imap->im_agctl[agno].numfree += 1; | 1023 | imap->im_agctl[agno].numfree += 1; |
| 1025 | atomic_inc(&imap->im_numfree); | 1024 | atomic_inc(&imap->im_numfree); |
| 1026 | 1025 | ||
| @@ -1219,9 +1218,8 @@ int diFree(struct inode *ip) | |||
| 1219 | /* update the number of free inodes and number of free extents | 1218 | /* update the number of free inodes and number of free extents |
| 1220 | * for the iag. | 1219 | * for the iag. |
| 1221 | */ | 1220 | */ |
| 1222 | iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - | 1221 | le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1)); |
| 1223 | (INOSPEREXT - 1)); | 1222 | le32_add_cpu(&iagp->nfreeexts, 1); |
| 1224 | iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1); | ||
| 1225 | 1223 | ||
| 1226 | /* update the number of free inodes and backed inodes | 1224 | /* update the number of free inodes and backed inodes |
| 1227 | * at the ag and inode map level. | 1225 | * at the ag and inode map level. |
| @@ -2124,7 +2122,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) | |||
| 2124 | /* update the free inode count at the iag, ag, inode | 2122 | /* update the free inode count at the iag, ag, inode |
| 2125 | * map levels. | 2123 | * map levels. |
| 2126 | */ | 2124 | */ |
| 2127 | iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1); | 2125 | le32_add_cpu(&iagp->nfreeinos, -1); |
| 2128 | imap->im_agctl[agno].numfree -= 1; | 2126 | imap->im_agctl[agno].numfree -= 1; |
| 2129 | atomic_dec(&imap->im_numfree); | 2127 | atomic_dec(&imap->im_numfree); |
| 2130 | 2128 | ||
| @@ -2378,9 +2376,8 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) | |||
| 2378 | /* update the free inode and free extent counts for the | 2376 | /* update the free inode and free extent counts for the |
| 2379 | * iag. | 2377 | * iag. |
| 2380 | */ | 2378 | */ |
| 2381 | iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + | 2379 | le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1)); |
| 2382 | (INOSPEREXT - 1)); | 2380 | le32_add_cpu(&iagp->nfreeexts, -1); |
| 2383 | iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1); | ||
| 2384 | 2381 | ||
| 2385 | /* update the free and backed inode counts for the ag. | 2382 | /* update the free and backed inode counts for the ag. |
| 2386 | */ | 2383 | */ |
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index a000aaa75136..5a61ebf2cbcc 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c | |||
| @@ -905,8 +905,7 @@ int xtInsert(tid_t tid, /* transaction id */ | |||
| 905 | XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); | 905 | XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); |
| 906 | 906 | ||
| 907 | /* advance next available entry index */ | 907 | /* advance next available entry index */ |
| 908 | p->header.nextindex = | 908 | le16_add_cpu(&p->header.nextindex, 1); |
| 909 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
| 910 | 909 | ||
| 911 | /* Don't log it if there are no links to the file */ | 910 | /* Don't log it if there are no links to the file */ |
| 912 | if (!test_cflag(COMMIT_Nolink, ip)) { | 911 | if (!test_cflag(COMMIT_Nolink, ip)) { |
| @@ -997,8 +996,7 @@ xtSplitUp(tid_t tid, | |||
| 997 | split->addr); | 996 | split->addr); |
| 998 | 997 | ||
| 999 | /* advance next available entry index */ | 998 | /* advance next available entry index */ |
| 1000 | sp->header.nextindex = | 999 | le16_add_cpu(&sp->header.nextindex, 1); |
| 1001 | cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1); | ||
| 1002 | 1000 | ||
| 1003 | /* Don't log it if there are no links to the file */ | 1001 | /* Don't log it if there are no links to the file */ |
| 1004 | if (!test_cflag(COMMIT_Nolink, ip)) { | 1002 | if (!test_cflag(COMMIT_Nolink, ip)) { |
| @@ -1167,9 +1165,7 @@ xtSplitUp(tid_t tid, | |||
| 1167 | JFS_SBI(ip->i_sb)->nbperpage, rcbn); | 1165 | JFS_SBI(ip->i_sb)->nbperpage, rcbn); |
| 1168 | 1166 | ||
| 1169 | /* advance next available entry index. */ | 1167 | /* advance next available entry index. */ |
| 1170 | sp->header.nextindex = | 1168 | le16_add_cpu(&sp->header.nextindex, 1); |
| 1171 | cpu_to_le16(le16_to_cpu(sp->header.nextindex) + | ||
| 1172 | 1); | ||
| 1173 | 1169 | ||
| 1174 | /* Don't log it if there are no links to the file */ | 1170 | /* Don't log it if there are no links to the file */ |
| 1175 | if (!test_cflag(COMMIT_Nolink, ip)) { | 1171 | if (!test_cflag(COMMIT_Nolink, ip)) { |
| @@ -1738,8 +1734,7 @@ int xtExtend(tid_t tid, /* transaction id */ | |||
| 1738 | XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); | 1734 | XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); |
| 1739 | 1735 | ||
| 1740 | /* advance next available entry index */ | 1736 | /* advance next available entry index */ |
| 1741 | p->header.nextindex = | 1737 | le16_add_cpu(&p->header.nextindex, 1); |
| 1742 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
| 1743 | } | 1738 | } |
| 1744 | 1739 | ||
| 1745 | /* get back old entry */ | 1740 | /* get back old entry */ |
| @@ -1905,8 +1900,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", | |||
| 1905 | XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); | 1900 | XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); |
| 1906 | 1901 | ||
| 1907 | /* advance next available entry index */ | 1902 | /* advance next available entry index */ |
| 1908 | p->header.nextindex = | 1903 | le16_add_cpu(&p->header.nextindex, 1); |
| 1909 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
| 1910 | } | 1904 | } |
| 1911 | 1905 | ||
| 1912 | /* get back old XAD */ | 1906 | /* get back old XAD */ |
| @@ -2567,8 +2561,7 @@ int xtAppend(tid_t tid, /* transaction id */ | |||
| 2567 | XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); | 2561 | XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); |
| 2568 | 2562 | ||
| 2569 | /* advance next available entry index */ | 2563 | /* advance next available entry index */ |
| 2570 | p->header.nextindex = | 2564 | le16_add_cpu(&p->header.nextindex, 1); |
| 2571 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
| 2572 | 2565 | ||
| 2573 | xtlck->lwm.offset = | 2566 | xtlck->lwm.offset = |
| 2574 | (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; | 2567 | (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; |
| @@ -2631,8 +2624,7 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) | |||
| 2631 | * delete the entry from the leaf page | 2624 | * delete the entry from the leaf page |
| 2632 | */ | 2625 | */ |
| 2633 | nextindex = le16_to_cpu(p->header.nextindex); | 2626 | nextindex = le16_to_cpu(p->header.nextindex); |
| 2634 | p->header.nextindex = | 2627 | le16_add_cpu(&p->header.nextindex, -1); |
| 2635 | cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1); | ||
| 2636 | 2628 | ||
| 2637 | /* | 2629 | /* |
| 2638 | * if the leaf page bocome empty, free the page | 2630 | * if the leaf page bocome empty, free the page |
| @@ -2795,9 +2787,7 @@ xtDeleteUp(tid_t tid, struct inode *ip, | |||
| 2795 | (nextindex - index - | 2787 | (nextindex - index - |
| 2796 | 1) << L2XTSLOTSIZE); | 2788 | 1) << L2XTSLOTSIZE); |
| 2797 | 2789 | ||
| 2798 | p->header.nextindex = | 2790 | le16_add_cpu(&p->header.nextindex, -1); |
| 2799 | cpu_to_le16(le16_to_cpu(p->header.nextindex) - | ||
| 2800 | 1); | ||
| 2801 | jfs_info("xtDeleteUp(entry): 0x%lx[%d]", | 2791 | jfs_info("xtDeleteUp(entry): 0x%lx[%d]", |
| 2802 | (ulong) parent->bn, index); | 2792 | (ulong) parent->bn, index); |
| 2803 | } | 2793 | } |
diff --git a/fs/locks.c b/fs/locks.c index d83fab1b77b5..43c0af21a0c5 100644 --- a/fs/locks.c +++ b/fs/locks.c | |||
| @@ -1801,17 +1801,21 @@ again: | |||
| 1801 | if (error) | 1801 | if (error) |
| 1802 | goto out; | 1802 | goto out; |
| 1803 | 1803 | ||
| 1804 | for (;;) { | 1804 | if (filp->f_op && filp->f_op->lock != NULL) |
| 1805 | error = vfs_lock_file(filp, cmd, file_lock, NULL); | 1805 | error = filp->f_op->lock(filp, cmd, file_lock); |
| 1806 | if (error != -EAGAIN || cmd == F_SETLK) | 1806 | else { |
| 1807 | break; | 1807 | for (;;) { |
| 1808 | error = wait_event_interruptible(file_lock->fl_wait, | 1808 | error = posix_lock_file(filp, file_lock, NULL); |
| 1809 | !file_lock->fl_next); | 1809 | if (error != -EAGAIN || cmd == F_SETLK) |
| 1810 | if (!error) | 1810 | break; |
| 1811 | continue; | 1811 | error = wait_event_interruptible(file_lock->fl_wait, |
| 1812 | !file_lock->fl_next); | ||
| 1813 | if (!error) | ||
| 1814 | continue; | ||
| 1812 | 1815 | ||
| 1813 | locks_delete_block(file_lock); | 1816 | locks_delete_block(file_lock); |
| 1814 | break; | 1817 | break; |
| 1818 | } | ||
| 1815 | } | 1819 | } |
| 1816 | 1820 | ||
| 1817 | /* | 1821 | /* |
| @@ -1925,17 +1929,21 @@ again: | |||
| 1925 | if (error) | 1929 | if (error) |
| 1926 | goto out; | 1930 | goto out; |
| 1927 | 1931 | ||
| 1928 | for (;;) { | 1932 | if (filp->f_op && filp->f_op->lock != NULL) |
| 1929 | error = vfs_lock_file(filp, cmd, file_lock, NULL); | 1933 | error = filp->f_op->lock(filp, cmd, file_lock); |
| 1930 | if (error != -EAGAIN || cmd == F_SETLK64) | 1934 | else { |
| 1931 | break; | 1935 | for (;;) { |
| 1932 | error = wait_event_interruptible(file_lock->fl_wait, | 1936 | error = posix_lock_file(filp, file_lock, NULL); |
| 1933 | !file_lock->fl_next); | 1937 | if (error != -EAGAIN || cmd == F_SETLK64) |
| 1934 | if (!error) | 1938 | break; |
| 1935 | continue; | 1939 | error = wait_event_interruptible(file_lock->fl_wait, |
| 1940 | !file_lock->fl_next); | ||
| 1941 | if (!error) | ||
| 1942 | continue; | ||
| 1936 | 1943 | ||
| 1937 | locks_delete_block(file_lock); | 1944 | locks_delete_block(file_lock); |
| 1938 | break; | 1945 | break; |
| 1946 | } | ||
| 1939 | } | 1947 | } |
| 1940 | 1948 | ||
| 1941 | /* | 1949 | /* |
diff --git a/fs/mbcache.c b/fs/mbcache.c index eb31b73e7d69..ec88ff3d04a9 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c | |||
| @@ -399,11 +399,11 @@ mb_cache_destroy(struct mb_cache *cache) | |||
| 399 | * if no more memory was available. | 399 | * if no more memory was available. |
| 400 | */ | 400 | */ |
| 401 | struct mb_cache_entry * | 401 | struct mb_cache_entry * |
| 402 | mb_cache_entry_alloc(struct mb_cache *cache) | 402 | mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) |
| 403 | { | 403 | { |
| 404 | struct mb_cache_entry *ce; | 404 | struct mb_cache_entry *ce; |
| 405 | 405 | ||
| 406 | ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL); | 406 | ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); |
| 407 | if (ce) { | 407 | if (ce) { |
| 408 | atomic_inc(&cache->c_entry_count); | 408 | atomic_inc(&cache->c_entry_count); |
| 409 | INIT_LIST_HEAD(&ce->e_lru_list); | 409 | INIT_LIST_HEAD(&ce->e_lru_list); |
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 4d4ce48bb42c..f6956de56fdb 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile | |||
| @@ -2,7 +2,12 @@ EXTRA_CFLAGS += -Ifs/ocfs2 | |||
| 2 | 2 | ||
| 3 | EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES | 3 | EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES |
| 4 | 4 | ||
| 5 | obj-$(CONFIG_OCFS2_FS) += ocfs2.o | 5 | obj-$(CONFIG_OCFS2_FS) += \ |
| 6 | ocfs2.o \ | ||
| 7 | ocfs2_stackglue.o | ||
| 8 | |||
| 9 | obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o | ||
| 10 | obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o | ||
| 6 | 11 | ||
| 7 | ocfs2-objs := \ | 12 | ocfs2-objs := \ |
| 8 | alloc.o \ | 13 | alloc.o \ |
| @@ -31,5 +36,10 @@ ocfs2-objs := \ | |||
| 31 | uptodate.o \ | 36 | uptodate.o \ |
| 32 | ver.o | 37 | ver.o |
| 33 | 38 | ||
| 39 | ocfs2_stackglue-objs := stackglue.o | ||
| 40 | ocfs2_stack_o2cb-objs := stack_o2cb.o | ||
| 41 | ocfs2_stack_user-objs := stack_user.o | ||
| 42 | |||
| 43 | # cluster/ is always needed when OCFS2_FS for masklog support | ||
| 34 | obj-$(CONFIG_OCFS2_FS) += cluster/ | 44 | obj-$(CONFIG_OCFS2_FS) += cluster/ |
| 35 | obj-$(CONFIG_OCFS2_FS) += dlm/ | 45 | obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 447206eb5c2e..41f84c92094f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
| @@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, | |||
| 1029 | BUG_ON(!next_free); | 1029 | BUG_ON(!next_free); |
| 1030 | 1030 | ||
| 1031 | /* The tree code before us didn't allow enough room in the leaf. */ | 1031 | /* The tree code before us didn't allow enough room in the leaf. */ |
| 1032 | if (el->l_next_free_rec == el->l_count && !has_empty) | 1032 | BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); |
| 1033 | BUG(); | ||
| 1034 | 1033 | ||
| 1035 | /* | 1034 | /* |
| 1036 | * The easiest way to approach this is to just remove the | 1035 | * The easiest way to approach this is to just remove the |
| @@ -1450,6 +1449,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, | |||
| 1450 | * - When our insert into the right path leaf is at the leftmost edge | 1449 | * - When our insert into the right path leaf is at the leftmost edge |
| 1451 | * and requires an update of the path immediately to it's left. This | 1450 | * and requires an update of the path immediately to it's left. This |
| 1452 | * can occur at the end of some types of rotation and appending inserts. | 1451 | * can occur at the end of some types of rotation and appending inserts. |
| 1452 | * - When we've adjusted the last extent record in the left path leaf and the | ||
| 1453 | * 1st extent record in the right path leaf during cross extent block merge. | ||
| 1453 | */ | 1454 | */ |
| 1454 | static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, | 1455 | static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, |
| 1455 | struct ocfs2_path *left_path, | 1456 | struct ocfs2_path *left_path, |
| @@ -2712,24 +2713,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, | |||
| 2712 | } | 2713 | } |
| 2713 | } | 2714 | } |
| 2714 | 2715 | ||
| 2716 | static int ocfs2_get_right_path(struct inode *inode, | ||
| 2717 | struct ocfs2_path *left_path, | ||
| 2718 | struct ocfs2_path **ret_right_path) | ||
| 2719 | { | ||
| 2720 | int ret; | ||
| 2721 | u32 right_cpos; | ||
| 2722 | struct ocfs2_path *right_path = NULL; | ||
| 2723 | struct ocfs2_extent_list *left_el; | ||
| 2724 | |||
| 2725 | *ret_right_path = NULL; | ||
| 2726 | |||
| 2727 | /* This function shouldn't be called for non-trees. */ | ||
| 2728 | BUG_ON(left_path->p_tree_depth == 0); | ||
| 2729 | |||
| 2730 | left_el = path_leaf_el(left_path); | ||
| 2731 | BUG_ON(left_el->l_next_free_rec != left_el->l_count); | ||
| 2732 | |||
| 2733 | ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, | ||
| 2734 | &right_cpos); | ||
| 2735 | if (ret) { | ||
| 2736 | mlog_errno(ret); | ||
| 2737 | goto out; | ||
| 2738 | } | ||
| 2739 | |||
| 2740 | /* This function shouldn't be called for the rightmost leaf. */ | ||
| 2741 | BUG_ON(right_cpos == 0); | ||
| 2742 | |||
| 2743 | right_path = ocfs2_new_path(path_root_bh(left_path), | ||
| 2744 | path_root_el(left_path)); | ||
| 2745 | if (!right_path) { | ||
| 2746 | ret = -ENOMEM; | ||
| 2747 | mlog_errno(ret); | ||
| 2748 | goto out; | ||
| 2749 | } | ||
| 2750 | |||
| 2751 | ret = ocfs2_find_path(inode, right_path, right_cpos); | ||
| 2752 | if (ret) { | ||
| 2753 | mlog_errno(ret); | ||
| 2754 | goto out; | ||
| 2755 | } | ||
| 2756 | |||
| 2757 | *ret_right_path = right_path; | ||
| 2758 | out: | ||
| 2759 | if (ret) | ||
| 2760 | ocfs2_free_path(right_path); | ||
| 2761 | return ret; | ||
| 2762 | } | ||
| 2763 | |||
| 2715 | /* | 2764 | /* |
| 2716 | * Remove split_rec clusters from the record at index and merge them | 2765 | * Remove split_rec clusters from the record at index and merge them |
| 2717 | * onto the beginning of the record at index + 1. | 2766 | * onto the beginning of the record "next" to it. |
| 2767 | * For index < l_count - 1, the next means the extent rec at index + 1. | ||
| 2768 | * For index == l_count - 1, the "next" means the 1st extent rec of the | ||
| 2769 | * next extent block. | ||
| 2718 | */ | 2770 | */ |
| 2719 | static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, | 2771 | static int ocfs2_merge_rec_right(struct inode *inode, |
| 2720 | handle_t *handle, | 2772 | struct ocfs2_path *left_path, |
| 2721 | struct ocfs2_extent_rec *split_rec, | 2773 | handle_t *handle, |
| 2722 | struct ocfs2_extent_list *el, int index) | 2774 | struct ocfs2_extent_rec *split_rec, |
| 2775 | int index) | ||
| 2723 | { | 2776 | { |
| 2724 | int ret; | 2777 | int ret, next_free, i; |
| 2725 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); | 2778 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); |
| 2726 | struct ocfs2_extent_rec *left_rec; | 2779 | struct ocfs2_extent_rec *left_rec; |
| 2727 | struct ocfs2_extent_rec *right_rec; | 2780 | struct ocfs2_extent_rec *right_rec; |
| 2781 | struct ocfs2_extent_list *right_el; | ||
| 2782 | struct ocfs2_path *right_path = NULL; | ||
| 2783 | int subtree_index = 0; | ||
| 2784 | struct ocfs2_extent_list *el = path_leaf_el(left_path); | ||
| 2785 | struct buffer_head *bh = path_leaf_bh(left_path); | ||
| 2786 | struct buffer_head *root_bh = NULL; | ||
| 2728 | 2787 | ||
| 2729 | BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); | 2788 | BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); |
| 2730 | |||
| 2731 | left_rec = &el->l_recs[index]; | 2789 | left_rec = &el->l_recs[index]; |
| 2732 | right_rec = &el->l_recs[index + 1]; | 2790 | |
| 2791 | if (index == le16_to_cpu(el->l_next_free_rec - 1) && | ||
| 2792 | le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { | ||
| 2793 | /* we meet with a cross extent block merge. */ | ||
| 2794 | ret = ocfs2_get_right_path(inode, left_path, &right_path); | ||
| 2795 | if (ret) { | ||
| 2796 | mlog_errno(ret); | ||
| 2797 | goto out; | ||
| 2798 | } | ||
| 2799 | |||
| 2800 | right_el = path_leaf_el(right_path); | ||
| 2801 | next_free = le16_to_cpu(right_el->l_next_free_rec); | ||
| 2802 | BUG_ON(next_free <= 0); | ||
| 2803 | right_rec = &right_el->l_recs[0]; | ||
| 2804 | if (ocfs2_is_empty_extent(right_rec)) { | ||
| 2805 | BUG_ON(le16_to_cpu(next_free) <= 1); | ||
| 2806 | right_rec = &right_el->l_recs[1]; | ||
| 2807 | } | ||
| 2808 | |||
| 2809 | BUG_ON(le32_to_cpu(left_rec->e_cpos) + | ||
| 2810 | le16_to_cpu(left_rec->e_leaf_clusters) != | ||
| 2811 | le32_to_cpu(right_rec->e_cpos)); | ||
| 2812 | |||
| 2813 | subtree_index = ocfs2_find_subtree_root(inode, | ||
| 2814 | left_path, right_path); | ||
| 2815 | |||
| 2816 | ret = ocfs2_extend_rotate_transaction(handle, subtree_index, | ||
| 2817 | handle->h_buffer_credits, | ||
| 2818 | right_path); | ||
| 2819 | if (ret) { | ||
| 2820 | mlog_errno(ret); | ||
| 2821 | goto out; | ||
| 2822 | } | ||
| 2823 | |||
| 2824 | root_bh = left_path->p_node[subtree_index].bh; | ||
| 2825 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | ||
| 2826 | |||
| 2827 | ret = ocfs2_journal_access(handle, inode, root_bh, | ||
| 2828 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 2829 | if (ret) { | ||
| 2830 | mlog_errno(ret); | ||
| 2831 | goto out; | ||
| 2832 | } | ||
| 2833 | |||
| 2834 | for (i = subtree_index + 1; | ||
| 2835 | i < path_num_items(right_path); i++) { | ||
| 2836 | ret = ocfs2_journal_access(handle, inode, | ||
| 2837 | right_path->p_node[i].bh, | ||
| 2838 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 2839 | if (ret) { | ||
| 2840 | mlog_errno(ret); | ||
| 2841 | goto out; | ||
| 2842 | } | ||
| 2843 | |||
| 2844 | ret = ocfs2_journal_access(handle, inode, | ||
| 2845 | left_path->p_node[i].bh, | ||
| 2846 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 2847 | if (ret) { | ||
| 2848 | mlog_errno(ret); | ||
| 2849 | goto out; | ||
| 2850 | } | ||
| 2851 | } | ||
| 2852 | |||
| 2853 | } else { | ||
| 2854 | BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); | ||
| 2855 | right_rec = &el->l_recs[index + 1]; | ||
| 2856 | } | ||
| 2733 | 2857 | ||
| 2734 | ret = ocfs2_journal_access(handle, inode, bh, | 2858 | ret = ocfs2_journal_access(handle, inode, bh, |
| 2735 | OCFS2_JOURNAL_ACCESS_WRITE); | 2859 | OCFS2_JOURNAL_ACCESS_WRITE); |
| @@ -2751,30 +2875,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, | |||
| 2751 | if (ret) | 2875 | if (ret) |
| 2752 | mlog_errno(ret); | 2876 | mlog_errno(ret); |
| 2753 | 2877 | ||
| 2878 | if (right_path) { | ||
| 2879 | ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); | ||
| 2880 | if (ret) | ||
| 2881 | mlog_errno(ret); | ||
| 2882 | |||
| 2883 | ocfs2_complete_edge_insert(inode, handle, left_path, | ||
| 2884 | right_path, subtree_index); | ||
| 2885 | } | ||
| 2886 | out: | ||
| 2887 | if (right_path) | ||
| 2888 | ocfs2_free_path(right_path); | ||
| 2889 | return ret; | ||
| 2890 | } | ||
| 2891 | |||
| 2892 | static int ocfs2_get_left_path(struct inode *inode, | ||
| 2893 | struct ocfs2_path *right_path, | ||
| 2894 | struct ocfs2_path **ret_left_path) | ||
| 2895 | { | ||
| 2896 | int ret; | ||
| 2897 | u32 left_cpos; | ||
| 2898 | struct ocfs2_path *left_path = NULL; | ||
| 2899 | |||
| 2900 | *ret_left_path = NULL; | ||
| 2901 | |||
| 2902 | /* This function shouldn't be called for non-trees. */ | ||
| 2903 | BUG_ON(right_path->p_tree_depth == 0); | ||
| 2904 | |||
| 2905 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, | ||
| 2906 | right_path, &left_cpos); | ||
| 2907 | if (ret) { | ||
| 2908 | mlog_errno(ret); | ||
| 2909 | goto out; | ||
| 2910 | } | ||
| 2911 | |||
| 2912 | /* This function shouldn't be called for the leftmost leaf. */ | ||
| 2913 | BUG_ON(left_cpos == 0); | ||
| 2914 | |||
| 2915 | left_path = ocfs2_new_path(path_root_bh(right_path), | ||
| 2916 | path_root_el(right_path)); | ||
| 2917 | if (!left_path) { | ||
| 2918 | ret = -ENOMEM; | ||
| 2919 | mlog_errno(ret); | ||
| 2920 | goto out; | ||
| 2921 | } | ||
| 2922 | |||
| 2923 | ret = ocfs2_find_path(inode, left_path, left_cpos); | ||
| 2924 | if (ret) { | ||
| 2925 | mlog_errno(ret); | ||
| 2926 | goto out; | ||
| 2927 | } | ||
| 2928 | |||
| 2929 | *ret_left_path = left_path; | ||
| 2754 | out: | 2930 | out: |
| 2931 | if (ret) | ||
| 2932 | ocfs2_free_path(left_path); | ||
| 2755 | return ret; | 2933 | return ret; |
| 2756 | } | 2934 | } |
| 2757 | 2935 | ||
| 2758 | /* | 2936 | /* |
| 2759 | * Remove split_rec clusters from the record at index and merge them | 2937 | * Remove split_rec clusters from the record at index and merge them |
| 2760 | * onto the tail of the record at index - 1. | 2938 | * onto the tail of the record "before" it. |
| 2939 | * For index > 0, the "before" means the extent rec at index - 1. | ||
| 2940 | * | ||
| 2941 | * For index == 0, the "before" means the last record of the previous | ||
| 2942 | * extent block. And there is also a situation that we may need to | ||
| 2943 | * remove the rightmost leaf extent block in the right_path and change | ||
| 2944 | * the right path to indicate the new rightmost path. | ||
| 2761 | */ | 2945 | */ |
| 2762 | static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, | 2946 | static int ocfs2_merge_rec_left(struct inode *inode, |
| 2947 | struct ocfs2_path *right_path, | ||
| 2763 | handle_t *handle, | 2948 | handle_t *handle, |
| 2764 | struct ocfs2_extent_rec *split_rec, | 2949 | struct ocfs2_extent_rec *split_rec, |
| 2765 | struct ocfs2_extent_list *el, int index) | 2950 | struct ocfs2_cached_dealloc_ctxt *dealloc, |
| 2951 | int index) | ||
| 2766 | { | 2952 | { |
| 2767 | int ret, has_empty_extent = 0; | 2953 | int ret, i, subtree_index = 0, has_empty_extent = 0; |
| 2768 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); | 2954 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); |
| 2769 | struct ocfs2_extent_rec *left_rec; | 2955 | struct ocfs2_extent_rec *left_rec; |
| 2770 | struct ocfs2_extent_rec *right_rec; | 2956 | struct ocfs2_extent_rec *right_rec; |
| 2957 | struct ocfs2_extent_list *el = path_leaf_el(right_path); | ||
| 2958 | struct buffer_head *bh = path_leaf_bh(right_path); | ||
| 2959 | struct buffer_head *root_bh = NULL; | ||
| 2960 | struct ocfs2_path *left_path = NULL; | ||
| 2961 | struct ocfs2_extent_list *left_el; | ||
| 2771 | 2962 | ||
| 2772 | BUG_ON(index <= 0); | 2963 | BUG_ON(index < 0); |
| 2773 | 2964 | ||
| 2774 | left_rec = &el->l_recs[index - 1]; | ||
| 2775 | right_rec = &el->l_recs[index]; | 2965 | right_rec = &el->l_recs[index]; |
| 2776 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | 2966 | if (index == 0) { |
| 2777 | has_empty_extent = 1; | 2967 | /* we meet with a cross extent block merge. */ |
| 2968 | ret = ocfs2_get_left_path(inode, right_path, &left_path); | ||
| 2969 | if (ret) { | ||
| 2970 | mlog_errno(ret); | ||
| 2971 | goto out; | ||
| 2972 | } | ||
| 2973 | |||
| 2974 | left_el = path_leaf_el(left_path); | ||
| 2975 | BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != | ||
| 2976 | le16_to_cpu(left_el->l_count)); | ||
| 2977 | |||
| 2978 | left_rec = &left_el->l_recs[ | ||
| 2979 | le16_to_cpu(left_el->l_next_free_rec) - 1]; | ||
| 2980 | BUG_ON(le32_to_cpu(left_rec->e_cpos) + | ||
| 2981 | le16_to_cpu(left_rec->e_leaf_clusters) != | ||
| 2982 | le32_to_cpu(split_rec->e_cpos)); | ||
| 2983 | |||
| 2984 | subtree_index = ocfs2_find_subtree_root(inode, | ||
| 2985 | left_path, right_path); | ||
| 2986 | |||
| 2987 | ret = ocfs2_extend_rotate_transaction(handle, subtree_index, | ||
| 2988 | handle->h_buffer_credits, | ||
| 2989 | left_path); | ||
| 2990 | if (ret) { | ||
| 2991 | mlog_errno(ret); | ||
| 2992 | goto out; | ||
| 2993 | } | ||
| 2994 | |||
| 2995 | root_bh = left_path->p_node[subtree_index].bh; | ||
| 2996 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | ||
| 2997 | |||
| 2998 | ret = ocfs2_journal_access(handle, inode, root_bh, | ||
| 2999 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 3000 | if (ret) { | ||
| 3001 | mlog_errno(ret); | ||
| 3002 | goto out; | ||
| 3003 | } | ||
| 3004 | |||
| 3005 | for (i = subtree_index + 1; | ||
| 3006 | i < path_num_items(right_path); i++) { | ||
| 3007 | ret = ocfs2_journal_access(handle, inode, | ||
| 3008 | right_path->p_node[i].bh, | ||
| 3009 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 3010 | if (ret) { | ||
| 3011 | mlog_errno(ret); | ||
| 3012 | goto out; | ||
| 3013 | } | ||
| 3014 | |||
| 3015 | ret = ocfs2_journal_access(handle, inode, | ||
| 3016 | left_path->p_node[i].bh, | ||
| 3017 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
| 3018 | if (ret) { | ||
| 3019 | mlog_errno(ret); | ||
| 3020 | goto out; | ||
| 3021 | } | ||
| 3022 | } | ||
| 3023 | } else { | ||
| 3024 | left_rec = &el->l_recs[index - 1]; | ||
| 3025 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | ||
| 3026 | has_empty_extent = 1; | ||
| 3027 | } | ||
| 2778 | 3028 | ||
| 2779 | ret = ocfs2_journal_access(handle, inode, bh, | 3029 | ret = ocfs2_journal_access(handle, inode, bh, |
| 2780 | OCFS2_JOURNAL_ACCESS_WRITE); | 3030 | OCFS2_JOURNAL_ACCESS_WRITE); |
| @@ -2790,9 +3040,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, | |||
| 2790 | *left_rec = *split_rec; | 3040 | *left_rec = *split_rec; |
| 2791 | 3041 | ||
| 2792 | has_empty_extent = 0; | 3042 | has_empty_extent = 0; |
| 2793 | } else { | 3043 | } else |
| 2794 | le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); | 3044 | le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); |
| 2795 | } | ||
| 2796 | 3045 | ||
| 2797 | le32_add_cpu(&right_rec->e_cpos, split_clusters); | 3046 | le32_add_cpu(&right_rec->e_cpos, split_clusters); |
| 2798 | le64_add_cpu(&right_rec->e_blkno, | 3047 | le64_add_cpu(&right_rec->e_blkno, |
| @@ -2805,13 +3054,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, | |||
| 2805 | if (ret) | 3054 | if (ret) |
| 2806 | mlog_errno(ret); | 3055 | mlog_errno(ret); |
| 2807 | 3056 | ||
| 3057 | if (left_path) { | ||
| 3058 | ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); | ||
| 3059 | if (ret) | ||
| 3060 | mlog_errno(ret); | ||
| 3061 | |||
| 3062 | /* | ||
| 3063 | * In the situation that the right_rec is empty and the extent | ||
| 3064 | * block is empty also, ocfs2_complete_edge_insert can't handle | ||
| 3065 | * it and we need to delete the right extent block. | ||
| 3066 | */ | ||
| 3067 | if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && | ||
| 3068 | le16_to_cpu(el->l_next_free_rec) == 1) { | ||
| 3069 | |||
| 3070 | ret = ocfs2_remove_rightmost_path(inode, handle, | ||
| 3071 | right_path, dealloc); | ||
| 3072 | if (ret) { | ||
| 3073 | mlog_errno(ret); | ||
| 3074 | goto out; | ||
| 3075 | } | ||
| 3076 | |||
| 3077 | /* Now the rightmost extent block has been deleted. | ||
| 3078 | * So we use the new rightmost path. | ||
| 3079 | */ | ||
| 3080 | ocfs2_mv_path(right_path, left_path); | ||
| 3081 | left_path = NULL; | ||
| 3082 | } else | ||
| 3083 | ocfs2_complete_edge_insert(inode, handle, left_path, | ||
| 3084 | right_path, subtree_index); | ||
| 3085 | } | ||
| 2808 | out: | 3086 | out: |
| 3087 | if (left_path) | ||
| 3088 | ocfs2_free_path(left_path); | ||
| 2809 | return ret; | 3089 | return ret; |
| 2810 | } | 3090 | } |
| 2811 | 3091 | ||
| 2812 | static int ocfs2_try_to_merge_extent(struct inode *inode, | 3092 | static int ocfs2_try_to_merge_extent(struct inode *inode, |
| 2813 | handle_t *handle, | 3093 | handle_t *handle, |
| 2814 | struct ocfs2_path *left_path, | 3094 | struct ocfs2_path *path, |
| 2815 | int split_index, | 3095 | int split_index, |
| 2816 | struct ocfs2_extent_rec *split_rec, | 3096 | struct ocfs2_extent_rec *split_rec, |
| 2817 | struct ocfs2_cached_dealloc_ctxt *dealloc, | 3097 | struct ocfs2_cached_dealloc_ctxt *dealloc, |
| @@ -2819,7 +3099,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
| 2819 | 3099 | ||
| 2820 | { | 3100 | { |
| 2821 | int ret = 0; | 3101 | int ret = 0; |
| 2822 | struct ocfs2_extent_list *el = path_leaf_el(left_path); | 3102 | struct ocfs2_extent_list *el = path_leaf_el(path); |
| 2823 | struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; | 3103 | struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; |
| 2824 | 3104 | ||
| 2825 | BUG_ON(ctxt->c_contig_type == CONTIG_NONE); | 3105 | BUG_ON(ctxt->c_contig_type == CONTIG_NONE); |
| @@ -2832,7 +3112,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
| 2832 | * extents - having more than one in a leaf is | 3112 | * extents - having more than one in a leaf is |
| 2833 | * illegal. | 3113 | * illegal. |
| 2834 | */ | 3114 | */ |
| 2835 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, | 3115 | ret = ocfs2_rotate_tree_left(inode, handle, path, |
| 2836 | dealloc); | 3116 | dealloc); |
| 2837 | if (ret) { | 3117 | if (ret) { |
| 2838 | mlog_errno(ret); | 3118 | mlog_errno(ret); |
| @@ -2847,7 +3127,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
| 2847 | * Left-right contig implies this. | 3127 | * Left-right contig implies this. |
| 2848 | */ | 3128 | */ |
| 2849 | BUG_ON(!ctxt->c_split_covers_rec); | 3129 | BUG_ON(!ctxt->c_split_covers_rec); |
| 2850 | BUG_ON(split_index == 0); | ||
| 2851 | 3130 | ||
| 2852 | /* | 3131 | /* |
| 2853 | * Since the leftright insert always covers the entire | 3132 | * Since the leftright insert always covers the entire |
| @@ -2858,9 +3137,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
| 2858 | * Since the adding of an empty extent shifts | 3137 | * Since the adding of an empty extent shifts |
| 2859 | * everything back to the right, there's no need to | 3138 | * everything back to the right, there's no need to |
| 2860 | * update split_index here. | 3139 | * update split_index here. |
| 3140 | * | ||
| 3141 | * When the split_index is zero, we need to merge it to the | ||
| 3142 | * prevoius extent block. It is more efficient and easier | ||
| 3143 | * if we do merge_right first and merge_left later. | ||
| 2861 | */ | 3144 | */ |
| 2862 | ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path), | 3145 | ret = ocfs2_merge_rec_right(inode, path, |
| 2863 | handle, split_rec, el, split_index); | 3146 | handle, split_rec, |
| 3147 | split_index); | ||
| 2864 | if (ret) { | 3148 | if (ret) { |
| 2865 | mlog_errno(ret); | 3149 | mlog_errno(ret); |
| 2866 | goto out; | 3150 | goto out; |
| @@ -2871,32 +3155,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
| 2871 | */ | 3155 | */ |
| 2872 | BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); | 3156 | BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); |
| 2873 | 3157 | ||
| 2874 | /* | 3158 | /* The merge left us with an empty extent, remove it. */ |
| 2875 | * The left merge left us with an empty extent, remove | 3159 | ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); |
| 2876 | * it. | ||
| 2877 | */ | ||
| 2878 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc); | ||
| 2879 | if (ret) { | 3160 | if (ret) { |
| 2880 | mlog_errno(ret); | 3161 | mlog_errno(ret); |
| 2881 | goto out; | 3162 | goto out; |
| 2882 | } | 3163 | } |
| 2883 | split_index--; | 3164 | |
| 2884 | rec = &el->l_recs[split_index]; | 3165 | rec = &el->l_recs[split_index]; |
| 2885 | 3166 | ||
| 2886 | /* | 3167 | /* |
| 2887 | * Note that we don't pass split_rec here on purpose - | 3168 | * Note that we don't pass split_rec here on purpose - |
| 2888 | * we've merged it into the left side. | 3169 | * we've merged it into the rec already. |
| 2889 | */ | 3170 | */ |
| 2890 | ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path), | 3171 | ret = ocfs2_merge_rec_left(inode, path, |
| 2891 | handle, rec, el, split_index); | 3172 | handle, rec, |
| 3173 | dealloc, | ||
| 3174 | split_index); | ||
| 3175 | |||
| 2892 | if (ret) { | 3176 | if (ret) { |
| 2893 | mlog_errno(ret); | 3177 | mlog_errno(ret); |
| 2894 | goto out; | 3178 | goto out; |
| 2895 | } | 3179 | } |
| 2896 | 3180 | ||
| 2897 | BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); | 3181 | ret = ocfs2_rotate_tree_left(inode, handle, path, |
| 2898 | |||
| 2899 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, | ||
| 2900 | dealloc); | 3182 | dealloc); |
| 2901 | /* | 3183 | /* |
| 2902 | * Error from this last rotate is not critical, so | 3184 | * Error from this last rotate is not critical, so |
| @@ -2915,8 +3197,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
| 2915 | */ | 3197 | */ |
| 2916 | if (ctxt->c_contig_type == CONTIG_RIGHT) { | 3198 | if (ctxt->c_contig_type == CONTIG_RIGHT) { |
| 2917 | ret = ocfs2_merge_rec_left(inode, | 3199 | ret = ocfs2_merge_rec_left(inode, |
| 2918 | path_leaf_bh(left_path), | 3200 | path, |
| 2919 | handle, split_rec, el, | 3201 | handle, split_rec, |
| 3202 | dealloc, | ||
| 2920 | split_index); | 3203 | split_index); |
| 2921 | if (ret) { | 3204 | if (ret) { |
| 2922 | mlog_errno(ret); | 3205 | mlog_errno(ret); |
| @@ -2924,8 +3207,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
| 2924 | } | 3207 | } |
| 2925 | } else { | 3208 | } else { |
| 2926 | ret = ocfs2_merge_rec_right(inode, | 3209 | ret = ocfs2_merge_rec_right(inode, |
| 2927 | path_leaf_bh(left_path), | 3210 | path, |
| 2928 | handle, split_rec, el, | 3211 | handle, split_rec, |
| 2929 | split_index); | 3212 | split_index); |
| 2930 | if (ret) { | 3213 | if (ret) { |
| 2931 | mlog_errno(ret); | 3214 | mlog_errno(ret); |
| @@ -2938,7 +3221,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
| 2938 | * The merge may have left an empty extent in | 3221 | * The merge may have left an empty extent in |
| 2939 | * our leaf. Try to rotate it away. | 3222 | * our leaf. Try to rotate it away. |
| 2940 | */ | 3223 | */ |
| 2941 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, | 3224 | ret = ocfs2_rotate_tree_left(inode, handle, path, |
| 2942 | dealloc); | 3225 | dealloc); |
| 2943 | if (ret) | 3226 | if (ret) |
| 2944 | mlog_errno(ret); | 3227 | mlog_errno(ret); |
| @@ -3498,20 +3781,57 @@ out: | |||
| 3498 | } | 3781 | } |
| 3499 | 3782 | ||
| 3500 | static enum ocfs2_contig_type | 3783 | static enum ocfs2_contig_type |
| 3501 | ocfs2_figure_merge_contig_type(struct inode *inode, | 3784 | ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, |
| 3502 | struct ocfs2_extent_list *el, int index, | 3785 | struct ocfs2_extent_list *el, int index, |
| 3503 | struct ocfs2_extent_rec *split_rec) | 3786 | struct ocfs2_extent_rec *split_rec) |
| 3504 | { | 3787 | { |
| 3505 | struct ocfs2_extent_rec *rec; | 3788 | int status; |
| 3506 | enum ocfs2_contig_type ret = CONTIG_NONE; | 3789 | enum ocfs2_contig_type ret = CONTIG_NONE; |
| 3790 | u32 left_cpos, right_cpos; | ||
| 3791 | struct ocfs2_extent_rec *rec = NULL; | ||
| 3792 | struct ocfs2_extent_list *new_el; | ||
| 3793 | struct ocfs2_path *left_path = NULL, *right_path = NULL; | ||
| 3794 | struct buffer_head *bh; | ||
| 3795 | struct ocfs2_extent_block *eb; | ||
| 3796 | |||
| 3797 | if (index > 0) { | ||
| 3798 | rec = &el->l_recs[index - 1]; | ||
| 3799 | } else if (path->p_tree_depth > 0) { | ||
| 3800 | status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, | ||
| 3801 | path, &left_cpos); | ||
| 3802 | if (status) | ||
| 3803 | goto out; | ||
| 3804 | |||
| 3805 | if (left_cpos != 0) { | ||
| 3806 | left_path = ocfs2_new_path(path_root_bh(path), | ||
| 3807 | path_root_el(path)); | ||
| 3808 | if (!left_path) | ||
| 3809 | goto out; | ||
| 3810 | |||
| 3811 | status = ocfs2_find_path(inode, left_path, left_cpos); | ||
| 3812 | if (status) | ||
| 3813 | goto out; | ||
| 3814 | |||
| 3815 | new_el = path_leaf_el(left_path); | ||
| 3816 | |||
| 3817 | if (le16_to_cpu(new_el->l_next_free_rec) != | ||
| 3818 | le16_to_cpu(new_el->l_count)) { | ||
| 3819 | bh = path_leaf_bh(left_path); | ||
| 3820 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
| 3821 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, | ||
| 3822 | eb); | ||
| 3823 | goto out; | ||
| 3824 | } | ||
| 3825 | rec = &new_el->l_recs[ | ||
| 3826 | le16_to_cpu(new_el->l_next_free_rec) - 1]; | ||
| 3827 | } | ||
| 3828 | } | ||
| 3507 | 3829 | ||
| 3508 | /* | 3830 | /* |
| 3509 | * We're careful to check for an empty extent record here - | 3831 | * We're careful to check for an empty extent record here - |
| 3510 | * the merge code will know what to do if it sees one. | 3832 | * the merge code will know what to do if it sees one. |
| 3511 | */ | 3833 | */ |
| 3512 | 3834 | if (rec) { | |
| 3513 | if (index > 0) { | ||
| 3514 | rec = &el->l_recs[index - 1]; | ||
| 3515 | if (index == 1 && ocfs2_is_empty_extent(rec)) { | 3835 | if (index == 1 && ocfs2_is_empty_extent(rec)) { |
| 3516 | if (split_rec->e_cpos == el->l_recs[index].e_cpos) | 3836 | if (split_rec->e_cpos == el->l_recs[index].e_cpos) |
| 3517 | ret = CONTIG_RIGHT; | 3837 | ret = CONTIG_RIGHT; |
| @@ -3520,10 +3840,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode, | |||
| 3520 | } | 3840 | } |
| 3521 | } | 3841 | } |
| 3522 | 3842 | ||
| 3523 | if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) { | 3843 | rec = NULL; |
| 3844 | if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) | ||
| 3845 | rec = &el->l_recs[index + 1]; | ||
| 3846 | else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && | ||
| 3847 | path->p_tree_depth > 0) { | ||
| 3848 | status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, | ||
| 3849 | path, &right_cpos); | ||
| 3850 | if (status) | ||
| 3851 | goto out; | ||
| 3852 | |||
| 3853 | if (right_cpos == 0) | ||
| 3854 | goto out; | ||
| 3855 | |||
| 3856 | right_path = ocfs2_new_path(path_root_bh(path), | ||
| 3857 | path_root_el(path)); | ||
| 3858 | if (!right_path) | ||
| 3859 | goto out; | ||
| 3860 | |||
| 3861 | status = ocfs2_find_path(inode, right_path, right_cpos); | ||
| 3862 | if (status) | ||
| 3863 | goto out; | ||
| 3864 | |||
| 3865 | new_el = path_leaf_el(right_path); | ||
| 3866 | rec = &new_el->l_recs[0]; | ||
| 3867 | if (ocfs2_is_empty_extent(rec)) { | ||
| 3868 | if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { | ||
| 3869 | bh = path_leaf_bh(right_path); | ||
| 3870 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
| 3871 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, | ||
| 3872 | eb); | ||
| 3873 | goto out; | ||
| 3874 | } | ||
| 3875 | rec = &new_el->l_recs[1]; | ||
| 3876 | } | ||
| 3877 | } | ||
| 3878 | |||
| 3879 | if (rec) { | ||
| 3524 | enum ocfs2_contig_type contig_type; | 3880 | enum ocfs2_contig_type contig_type; |
| 3525 | 3881 | ||
| 3526 | rec = &el->l_recs[index + 1]; | ||
| 3527 | contig_type = ocfs2_extent_contig(inode, rec, split_rec); | 3882 | contig_type = ocfs2_extent_contig(inode, rec, split_rec); |
| 3528 | 3883 | ||
| 3529 | if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) | 3884 | if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) |
| @@ -3532,6 +3887,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, | |||
| 3532 | ret = contig_type; | 3887 | ret = contig_type; |
| 3533 | } | 3888 | } |
| 3534 | 3889 | ||
| 3890 | out: | ||
| 3891 | if (left_path) | ||
| 3892 | ocfs2_free_path(left_path); | ||
| 3893 | if (right_path) | ||
| 3894 | ocfs2_free_path(right_path); | ||
| 3895 | |||
| 3535 | return ret; | 3896 | return ret; |
| 3536 | } | 3897 | } |
| 3537 | 3898 | ||
| @@ -3994,7 +4355,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, | |||
| 3994 | goto out; | 4355 | goto out; |
| 3995 | } | 4356 | } |
| 3996 | 4357 | ||
| 3997 | ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, | 4358 | ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, |
| 3998 | split_index, | 4359 | split_index, |
| 3999 | split_rec); | 4360 | split_rec); |
| 4000 | 4361 | ||
| @@ -4788,6 +5149,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) | |||
| 4788 | status = ocfs2_flush_truncate_log(osb); | 5149 | status = ocfs2_flush_truncate_log(osb); |
| 4789 | if (status < 0) | 5150 | if (status < 0) |
| 4790 | mlog_errno(status); | 5151 | mlog_errno(status); |
| 5152 | else | ||
| 5153 | ocfs2_init_inode_steal_slot(osb); | ||
| 4791 | 5154 | ||
| 4792 | mlog_exit(status); | 5155 | mlog_exit(status); |
| 4793 | } | 5156 | } |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 90383ed61005..17964c0505a9 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
| @@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | |||
| 467 | unsigned to) | 467 | unsigned to) |
| 468 | { | 468 | { |
| 469 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 469 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 470 | handle_t *handle = NULL; | 470 | handle_t *handle; |
| 471 | int ret = 0; | 471 | int ret = 0; |
| 472 | 472 | ||
| 473 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 473 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
| 474 | if (!handle) { | 474 | if (IS_ERR(handle)) { |
| 475 | ret = -ENOMEM; | 475 | ret = -ENOMEM; |
| 476 | mlog_errno(ret); | 476 | mlog_errno(ret); |
| 477 | goto out; | 477 | goto out; |
| @@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | |||
| 487 | } | 487 | } |
| 488 | out: | 488 | out: |
| 489 | if (ret) { | 489 | if (ret) { |
| 490 | if (handle) | 490 | if (!IS_ERR(handle)) |
| 491 | ocfs2_commit_trans(osb, handle); | 491 | ocfs2_commit_trans(osb, handle); |
| 492 | handle = ERR_PTR(ret); | 492 | handle = ERR_PTR(ret); |
| 493 | } | 493 | } |
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index cdd162f13650..bc8c5e7d8608 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o | 1 | obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o |
| 2 | 2 | ||
| 3 | ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ | 3 | ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ |
| 4 | quorum.o tcp.o ver.o | 4 | quorum.o tcp.o netdebug.o ver.o |
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c new file mode 100644 index 000000000000..7bf3c0ea7bd9 --- /dev/null +++ b/fs/ocfs2/cluster/netdebug.c | |||
| @@ -0,0 +1,441 @@ | |||
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
| 3 | * | ||
| 4 | * netdebug.c | ||
| 5 | * | ||
| 6 | * debug functionality for o2net | ||
| 7 | * | ||
| 8 | * Copyright (C) 2005, 2008 Oracle. All rights reserved. | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or | ||
| 11 | * modify it under the terms of the GNU General Public | ||
| 12 | * License as published by the Free Software Foundation; either | ||
| 13 | * version 2 of the License, or (at your option) any later version. | ||
| 14 | * | ||
| 15 | * This program is distributed in the hope that it will be useful, | ||
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 18 | * General Public License for more details. | ||
| 19 | * | ||
| 20 | * You should have received a copy of the GNU General Public | ||
| 21 | * License along with this program; if not, write to the | ||
| 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 23 | * Boston, MA 021110-1307, USA. | ||
| 24 | * | ||
| 25 | */ | ||
| 26 | |||
| 27 | #ifdef CONFIG_DEBUG_FS | ||
| 28 | |||
| 29 | #include <linux/module.h> | ||
| 30 | #include <linux/types.h> | ||
| 31 | #include <linux/slab.h> | ||
| 32 | #include <linux/idr.h> | ||
| 33 | #include <linux/kref.h> | ||
| 34 | #include <linux/seq_file.h> | ||
| 35 | #include <linux/debugfs.h> | ||
| 36 | |||
| 37 | #include <linux/uaccess.h> | ||
| 38 | |||
| 39 | #include "tcp.h" | ||
| 40 | #include "nodemanager.h" | ||
| 41 | #define MLOG_MASK_PREFIX ML_TCP | ||
| 42 | #include "masklog.h" | ||
| 43 | |||
| 44 | #include "tcp_internal.h" | ||
| 45 | |||
| 46 | #define O2NET_DEBUG_DIR "o2net" | ||
| 47 | #define SC_DEBUG_NAME "sock_containers" | ||
| 48 | #define NST_DEBUG_NAME "send_tracking" | ||
| 49 | |||
| 50 | static struct dentry *o2net_dentry; | ||
| 51 | static struct dentry *sc_dentry; | ||
| 52 | static struct dentry *nst_dentry; | ||
| 53 | |||
| 54 | static DEFINE_SPINLOCK(o2net_debug_lock); | ||
| 55 | |||
| 56 | static LIST_HEAD(sock_containers); | ||
| 57 | static LIST_HEAD(send_tracking); | ||
| 58 | |||
| 59 | void o2net_debug_add_nst(struct o2net_send_tracking *nst) | ||
| 60 | { | ||
| 61 | spin_lock(&o2net_debug_lock); | ||
| 62 | list_add(&nst->st_net_debug_item, &send_tracking); | ||
| 63 | spin_unlock(&o2net_debug_lock); | ||
| 64 | } | ||
| 65 | |||
| 66 | void o2net_debug_del_nst(struct o2net_send_tracking *nst) | ||
| 67 | { | ||
| 68 | spin_lock(&o2net_debug_lock); | ||
| 69 | if (!list_empty(&nst->st_net_debug_item)) | ||
| 70 | list_del_init(&nst->st_net_debug_item); | ||
| 71 | spin_unlock(&o2net_debug_lock); | ||
| 72 | } | ||
| 73 | |||
| 74 | static struct o2net_send_tracking | ||
| 75 | *next_nst(struct o2net_send_tracking *nst_start) | ||
| 76 | { | ||
| 77 | struct o2net_send_tracking *nst, *ret = NULL; | ||
| 78 | |||
| 79 | assert_spin_locked(&o2net_debug_lock); | ||
| 80 | |||
| 81 | list_for_each_entry(nst, &nst_start->st_net_debug_item, | ||
| 82 | st_net_debug_item) { | ||
| 83 | /* discover the head of the list */ | ||
| 84 | if (&nst->st_net_debug_item == &send_tracking) | ||
| 85 | break; | ||
| 86 | |||
| 87 | /* use st_task to detect real nsts in the list */ | ||
| 88 | if (nst->st_task != NULL) { | ||
| 89 | ret = nst; | ||
| 90 | break; | ||
| 91 | } | ||
| 92 | } | ||
| 93 | |||
| 94 | return ret; | ||
| 95 | } | ||
| 96 | |||
| 97 | static void *nst_seq_start(struct seq_file *seq, loff_t *pos) | ||
| 98 | { | ||
| 99 | struct o2net_send_tracking *nst, *dummy_nst = seq->private; | ||
| 100 | |||
| 101 | spin_lock(&o2net_debug_lock); | ||
| 102 | nst = next_nst(dummy_nst); | ||
| 103 | spin_unlock(&o2net_debug_lock); | ||
| 104 | |||
| 105 | return nst; | ||
| 106 | } | ||
| 107 | |||
| 108 | static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
| 109 | { | ||
| 110 | struct o2net_send_tracking *nst, *dummy_nst = seq->private; | ||
| 111 | |||
| 112 | spin_lock(&o2net_debug_lock); | ||
| 113 | nst = next_nst(dummy_nst); | ||
| 114 | list_del_init(&dummy_nst->st_net_debug_item); | ||
| 115 | if (nst) | ||
| 116 | list_add(&dummy_nst->st_net_debug_item, | ||
| 117 | &nst->st_net_debug_item); | ||
| 118 | spin_unlock(&o2net_debug_lock); | ||
| 119 | |||
| 120 | return nst; /* unused, just needs to be null when done */ | ||
| 121 | } | ||
| 122 | |||
| 123 | static int nst_seq_show(struct seq_file *seq, void *v) | ||
| 124 | { | ||
| 125 | struct o2net_send_tracking *nst, *dummy_nst = seq->private; | ||
| 126 | |||
| 127 | spin_lock(&o2net_debug_lock); | ||
| 128 | nst = next_nst(dummy_nst); | ||
| 129 | |||
| 130 | if (nst != NULL) { | ||
| 131 | /* get_task_comm isn't exported. oh well. */ | ||
| 132 | seq_printf(seq, "%p:\n" | ||
| 133 | " pid: %lu\n" | ||
| 134 | " tgid: %lu\n" | ||
| 135 | " process name: %s\n" | ||
| 136 | " node: %u\n" | ||
| 137 | " sc: %p\n" | ||
| 138 | " message id: %d\n" | ||
| 139 | " message type: %u\n" | ||
| 140 | " message key: 0x%08x\n" | ||
| 141 | " sock acquiry: %lu.%lu\n" | ||
| 142 | " send start: %lu.%lu\n" | ||
| 143 | " wait start: %lu.%lu\n", | ||
| 144 | nst, (unsigned long)nst->st_task->pid, | ||
| 145 | (unsigned long)nst->st_task->tgid, | ||
| 146 | nst->st_task->comm, nst->st_node, | ||
| 147 | nst->st_sc, nst->st_id, nst->st_msg_type, | ||
| 148 | nst->st_msg_key, | ||
| 149 | nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec, | ||
| 150 | nst->st_send_time.tv_sec, nst->st_send_time.tv_usec, | ||
| 151 | nst->st_status_time.tv_sec, | ||
| 152 | nst->st_status_time.tv_usec); | ||
| 153 | } | ||
| 154 | |||
| 155 | spin_unlock(&o2net_debug_lock); | ||
| 156 | |||
| 157 | return 0; | ||
| 158 | } | ||
| 159 | |||
| 160 | static void nst_seq_stop(struct seq_file *seq, void *v) | ||
| 161 | { | ||
| 162 | } | ||
| 163 | |||
| 164 | static struct seq_operations nst_seq_ops = { | ||
| 165 | .start = nst_seq_start, | ||
| 166 | .next = nst_seq_next, | ||
| 167 | .stop = nst_seq_stop, | ||
| 168 | .show = nst_seq_show, | ||
| 169 | }; | ||
| 170 | |||
| 171 | static int nst_fop_open(struct inode *inode, struct file *file) | ||
| 172 | { | ||
| 173 | struct o2net_send_tracking *dummy_nst; | ||
| 174 | struct seq_file *seq; | ||
| 175 | int ret; | ||
| 176 | |||
| 177 | dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); | ||
| 178 | if (dummy_nst == NULL) { | ||
| 179 | ret = -ENOMEM; | ||
| 180 | goto out; | ||
| 181 | } | ||
| 182 | dummy_nst->st_task = NULL; | ||
| 183 | |||
| 184 | ret = seq_open(file, &nst_seq_ops); | ||
| 185 | if (ret) | ||
| 186 | goto out; | ||
| 187 | |||
| 188 | seq = file->private_data; | ||
| 189 | seq->private = dummy_nst; | ||
| 190 | o2net_debug_add_nst(dummy_nst); | ||
| 191 | |||
| 192 | dummy_nst = NULL; | ||
| 193 | |||
| 194 | out: | ||
| 195 | kfree(dummy_nst); | ||
| 196 | return ret; | ||
| 197 | } | ||
| 198 | |||
| 199 | static int nst_fop_release(struct inode *inode, struct file *file) | ||
| 200 | { | ||
| 201 | struct seq_file *seq = file->private_data; | ||
| 202 | struct o2net_send_tracking *dummy_nst = seq->private; | ||
| 203 | |||
| 204 | o2net_debug_del_nst(dummy_nst); | ||
| 205 | return seq_release_private(inode, file); | ||
| 206 | } | ||
| 207 | |||
| 208 | static struct file_operations nst_seq_fops = { | ||
| 209 | .open = nst_fop_open, | ||
| 210 | .read = seq_read, | ||
| 211 | .llseek = seq_lseek, | ||
| 212 | .release = nst_fop_release, | ||
| 213 | }; | ||
| 214 | |||
| 215 | void o2net_debug_add_sc(struct o2net_sock_container *sc) | ||
| 216 | { | ||
| 217 | spin_lock(&o2net_debug_lock); | ||
| 218 | list_add(&sc->sc_net_debug_item, &sock_containers); | ||
| 219 | spin_unlock(&o2net_debug_lock); | ||
| 220 | } | ||
| 221 | |||
| 222 | void o2net_debug_del_sc(struct o2net_sock_container *sc) | ||
| 223 | { | ||
| 224 | spin_lock(&o2net_debug_lock); | ||
| 225 | list_del_init(&sc->sc_net_debug_item); | ||
| 226 | spin_unlock(&o2net_debug_lock); | ||
| 227 | } | ||
| 228 | |||
| 229 | static struct o2net_sock_container | ||
| 230 | *next_sc(struct o2net_sock_container *sc_start) | ||
| 231 | { | ||
| 232 | struct o2net_sock_container *sc, *ret = NULL; | ||
| 233 | |||
| 234 | assert_spin_locked(&o2net_debug_lock); | ||
| 235 | |||
| 236 | list_for_each_entry(sc, &sc_start->sc_net_debug_item, | ||
| 237 | sc_net_debug_item) { | ||
| 238 | /* discover the head of the list miscast as a sc */ | ||
| 239 | if (&sc->sc_net_debug_item == &sock_containers) | ||
| 240 | break; | ||
| 241 | |||
| 242 | /* use sc_page to detect real scs in the list */ | ||
| 243 | if (sc->sc_page != NULL) { | ||
| 244 | ret = sc; | ||
| 245 | break; | ||
| 246 | } | ||
| 247 | } | ||
| 248 | |||
| 249 | return ret; | ||
| 250 | } | ||
| 251 | |||
| 252 | static void *sc_seq_start(struct seq_file *seq, loff_t *pos) | ||
| 253 | { | ||
| 254 | struct o2net_sock_container *sc, *dummy_sc = seq->private; | ||
| 255 | |||
| 256 | spin_lock(&o2net_debug_lock); | ||
| 257 | sc = next_sc(dummy_sc); | ||
| 258 | spin_unlock(&o2net_debug_lock); | ||
| 259 | |||
| 260 | return sc; | ||
| 261 | } | ||
| 262 | |||
| 263 | static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
| 264 | { | ||
| 265 | struct o2net_sock_container *sc, *dummy_sc = seq->private; | ||
| 266 | |||
| 267 | spin_lock(&o2net_debug_lock); | ||
| 268 | sc = next_sc(dummy_sc); | ||
| 269 | list_del_init(&dummy_sc->sc_net_debug_item); | ||
| 270 | if (sc) | ||
| 271 | list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); | ||
| 272 | spin_unlock(&o2net_debug_lock); | ||
| 273 | |||
| 274 | return sc; /* unused, just needs to be null when done */ | ||
| 275 | } | ||
| 276 | |||
| 277 | #define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec | ||
| 278 | |||
| 279 | static int sc_seq_show(struct seq_file *seq, void *v) | ||
| 280 | { | ||
| 281 | struct o2net_sock_container *sc, *dummy_sc = seq->private; | ||
| 282 | |||
| 283 | spin_lock(&o2net_debug_lock); | ||
| 284 | sc = next_sc(dummy_sc); | ||
| 285 | |||
| 286 | if (sc != NULL) { | ||
| 287 | struct inet_sock *inet = NULL; | ||
| 288 | |||
| 289 | __be32 saddr = 0, daddr = 0; | ||
| 290 | __be16 sport = 0, dport = 0; | ||
| 291 | |||
| 292 | if (sc->sc_sock) { | ||
| 293 | inet = inet_sk(sc->sc_sock->sk); | ||
| 294 | /* the stack's structs aren't sparse endian clean */ | ||
| 295 | saddr = (__force __be32)inet->saddr; | ||
| 296 | daddr = (__force __be32)inet->daddr; | ||
| 297 | sport = (__force __be16)inet->sport; | ||
| 298 | dport = (__force __be16)inet->dport; | ||
| 299 | } | ||
| 300 | |||
| 301 | /* XXX sigh, inet-> doesn't have sparse annotation so any | ||
| 302 | * use of it here generates a warning with -Wbitwise */ | ||
| 303 | seq_printf(seq, "%p:\n" | ||
| 304 | " krefs: %d\n" | ||
| 305 | " sock: %u.%u.%u.%u:%u -> " | ||
| 306 | "%u.%u.%u.%u:%u\n" | ||
| 307 | " remote node: %s\n" | ||
| 308 | " page off: %zu\n" | ||
| 309 | " handshake ok: %u\n" | ||
| 310 | " timer: %lu.%lu\n" | ||
| 311 | " data ready: %lu.%lu\n" | ||
| 312 | " advance start: %lu.%lu\n" | ||
| 313 | " advance stop: %lu.%lu\n" | ||
| 314 | " func start: %lu.%lu\n" | ||
| 315 | " func stop: %lu.%lu\n" | ||
| 316 | " func key: %u\n" | ||
| 317 | " func type: %u\n", | ||
| 318 | sc, | ||
| 319 | atomic_read(&sc->sc_kref.refcount), | ||
| 320 | NIPQUAD(saddr), inet ? ntohs(sport) : 0, | ||
| 321 | NIPQUAD(daddr), inet ? ntohs(dport) : 0, | ||
| 322 | sc->sc_node->nd_name, | ||
| 323 | sc->sc_page_off, | ||
| 324 | sc->sc_handshake_ok, | ||
| 325 | TV_SEC_USEC(sc->sc_tv_timer), | ||
| 326 | TV_SEC_USEC(sc->sc_tv_data_ready), | ||
| 327 | TV_SEC_USEC(sc->sc_tv_advance_start), | ||
| 328 | TV_SEC_USEC(sc->sc_tv_advance_stop), | ||
| 329 | TV_SEC_USEC(sc->sc_tv_func_start), | ||
| 330 | TV_SEC_USEC(sc->sc_tv_func_stop), | ||
| 331 | sc->sc_msg_key, | ||
| 332 | sc->sc_msg_type); | ||
| 333 | } | ||
| 334 | |||
| 335 | |||
| 336 | spin_unlock(&o2net_debug_lock); | ||
| 337 | |||
| 338 | return 0; | ||
| 339 | } | ||
| 340 | |||
| 341 | static void sc_seq_stop(struct seq_file *seq, void *v) | ||
| 342 | { | ||
| 343 | } | ||
| 344 | |||
| 345 | static struct seq_operations sc_seq_ops = { | ||
| 346 | .start = sc_seq_start, | ||
| 347 | .next = sc_seq_next, | ||
| 348 | .stop = sc_seq_stop, | ||
| 349 | .show = sc_seq_show, | ||
| 350 | }; | ||
| 351 | |||
| 352 | static int sc_fop_open(struct inode *inode, struct file *file) | ||
| 353 | { | ||
| 354 | struct o2net_sock_container *dummy_sc; | ||
| 355 | struct seq_file *seq; | ||
| 356 | int ret; | ||
| 357 | |||
| 358 | dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); | ||
| 359 | if (dummy_sc == NULL) { | ||
| 360 | ret = -ENOMEM; | ||
| 361 | goto out; | ||
| 362 | } | ||
| 363 | dummy_sc->sc_page = NULL; | ||
| 364 | |||
| 365 | ret = seq_open(file, &sc_seq_ops); | ||
| 366 | if (ret) | ||
| 367 | goto out; | ||
| 368 | |||
| 369 | seq = file->private_data; | ||
| 370 | seq->private = dummy_sc; | ||
| 371 | o2net_debug_add_sc(dummy_sc); | ||
| 372 | |||
| 373 | dummy_sc = NULL; | ||
| 374 | |||
| 375 | out: | ||
| 376 | kfree(dummy_sc); | ||
| 377 | return ret; | ||
| 378 | } | ||
| 379 | |||
| 380 | static int sc_fop_release(struct inode *inode, struct file *file) | ||
| 381 | { | ||
| 382 | struct seq_file *seq = file->private_data; | ||
| 383 | struct o2net_sock_container *dummy_sc = seq->private; | ||
| 384 | |||
| 385 | o2net_debug_del_sc(dummy_sc); | ||
| 386 | return seq_release_private(inode, file); | ||
| 387 | } | ||
| 388 | |||
| 389 | static struct file_operations sc_seq_fops = { | ||
| 390 | .open = sc_fop_open, | ||
| 391 | .read = seq_read, | ||
| 392 | .llseek = seq_lseek, | ||
| 393 | .release = sc_fop_release, | ||
| 394 | }; | ||
| 395 | |||
| 396 | int o2net_debugfs_init(void) | ||
| 397 | { | ||
| 398 | o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); | ||
| 399 | if (!o2net_dentry) { | ||
| 400 | mlog_errno(-ENOMEM); | ||
| 401 | goto bail; | ||
| 402 | } | ||
| 403 | |||
| 404 | nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, | ||
| 405 | o2net_dentry, NULL, | ||
| 406 | &nst_seq_fops); | ||
| 407 | if (!nst_dentry) { | ||
| 408 | mlog_errno(-ENOMEM); | ||
| 409 | goto bail; | ||
| 410 | } | ||
| 411 | |||
| 412 | sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, | ||
| 413 | o2net_dentry, NULL, | ||
| 414 | &sc_seq_fops); | ||
| 415 | if (!sc_dentry) { | ||
| 416 | mlog_errno(-ENOMEM); | ||
| 417 | goto bail; | ||
| 418 | } | ||
| 419 | |||
| 420 | return 0; | ||
| 421 | bail: | ||
| 422 | if (sc_dentry) | ||
| 423 | debugfs_remove(sc_dentry); | ||
| 424 | if (nst_dentry) | ||
| 425 | debugfs_remove(nst_dentry); | ||
| 426 | if (o2net_dentry) | ||
| 427 | debugfs_remove(o2net_dentry); | ||
| 428 | return -ENOMEM; | ||
| 429 | } | ||
| 430 | |||
| 431 | void o2net_debugfs_exit(void) | ||
| 432 | { | ||
| 433 | if (sc_dentry) | ||
| 434 | debugfs_remove(sc_dentry); | ||
| 435 | if (nst_dentry) | ||
| 436 | debugfs_remove(nst_dentry); | ||
| 437 | if (o2net_dentry) | ||
| 438 | debugfs_remove(o2net_dentry); | ||
| 439 | } | ||
| 440 | |||
| 441 | #endif /* CONFIG_DEBUG_FS */ | ||
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 709fba25bf7e..cf9401e8cd0b 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c | |||
| @@ -959,7 +959,10 @@ static int __init init_o2nm(void) | |||
| 959 | cluster_print_version(); | 959 | cluster_print_version(); |
| 960 | 960 | ||
| 961 | o2hb_init(); | 961 | o2hb_init(); |
| 962 | o2net_init(); | 962 | |
| 963 | ret = o2net_init(); | ||
| 964 | if (ret) | ||
| 965 | goto out; | ||
| 963 | 966 | ||
| 964 | ocfs2_table_header = register_sysctl_table(ocfs2_root_table); | 967 | ocfs2_table_header = register_sysctl_table(ocfs2_root_table); |
| 965 | if (!ocfs2_table_header) { | 968 | if (!ocfs2_table_header) { |
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index 0c095ce7723d..98429fd68499 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c | |||
| @@ -57,6 +57,7 @@ static struct kset *o2cb_kset; | |||
| 57 | void o2cb_sys_shutdown(void) | 57 | void o2cb_sys_shutdown(void) |
| 58 | { | 58 | { |
| 59 | mlog_sys_shutdown(); | 59 | mlog_sys_shutdown(); |
| 60 | sysfs_remove_link(NULL, "o2cb"); | ||
| 60 | kset_unregister(o2cb_kset); | 61 | kset_unregister(o2cb_kset); |
| 61 | } | 62 | } |
| 62 | 63 | ||
| @@ -68,6 +69,14 @@ int o2cb_sys_init(void) | |||
| 68 | if (!o2cb_kset) | 69 | if (!o2cb_kset) |
| 69 | return -ENOMEM; | 70 | return -ENOMEM; |
| 70 | 71 | ||
| 72 | /* | ||
| 73 | * Create this symlink for backwards compatibility with old | ||
| 74 | * versions of ocfs2-tools which look for things in /sys/o2cb. | ||
| 75 | */ | ||
| 76 | ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb"); | ||
| 77 | if (ret) | ||
| 78 | goto error; | ||
| 79 | |||
| 71 | ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); | 80 | ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); |
| 72 | if (ret) | 81 | if (ret) |
| 73 | goto error; | 82 | goto error; |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index b8057c51b205..1e44ad14881a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
| @@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data); | |||
| 142 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); | 142 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); |
| 143 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); | 143 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); |
| 144 | 144 | ||
| 145 | /* | 145 | static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, |
| 146 | * FIXME: These should use to_o2nm_cluster_from_node(), but we end up | 146 | u32 msgkey, struct task_struct *task, u8 node) |
| 147 | * losing our parent link to the cluster during shutdown. This can be | 147 | { |
| 148 | * solved by adding a pre-removal callback to configfs, or passing | 148 | #ifdef CONFIG_DEBUG_FS |
| 149 | * around the cluster with the node. -jeffm | 149 | INIT_LIST_HEAD(&nst->st_net_debug_item); |
| 150 | */ | 150 | nst->st_task = task; |
| 151 | static inline int o2net_reconnect_delay(struct o2nm_node *node) | 151 | nst->st_msg_type = msgtype; |
| 152 | nst->st_msg_key = msgkey; | ||
| 153 | nst->st_node = node; | ||
| 154 | #endif | ||
| 155 | } | ||
| 156 | |||
| 157 | static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) | ||
| 158 | { | ||
| 159 | #ifdef CONFIG_DEBUG_FS | ||
| 160 | do_gettimeofday(&nst->st_sock_time); | ||
| 161 | #endif | ||
| 162 | } | ||
| 163 | |||
| 164 | static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) | ||
| 165 | { | ||
| 166 | #ifdef CONFIG_DEBUG_FS | ||
| 167 | do_gettimeofday(&nst->st_send_time); | ||
| 168 | #endif | ||
| 169 | } | ||
| 170 | |||
| 171 | static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) | ||
| 172 | { | ||
| 173 | #ifdef CONFIG_DEBUG_FS | ||
| 174 | do_gettimeofday(&nst->st_status_time); | ||
| 175 | #endif | ||
| 176 | } | ||
| 177 | |||
| 178 | static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, | ||
| 179 | struct o2net_sock_container *sc) | ||
| 180 | { | ||
| 181 | #ifdef CONFIG_DEBUG_FS | ||
| 182 | nst->st_sc = sc; | ||
| 183 | #endif | ||
| 184 | } | ||
| 185 | |||
| 186 | static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) | ||
| 187 | { | ||
| 188 | #ifdef CONFIG_DEBUG_FS | ||
| 189 | nst->st_id = msg_id; | ||
| 190 | #endif | ||
| 191 | } | ||
| 192 | |||
| 193 | static inline int o2net_reconnect_delay(void) | ||
| 152 | { | 194 | { |
| 153 | return o2nm_single_cluster->cl_reconnect_delay_ms; | 195 | return o2nm_single_cluster->cl_reconnect_delay_ms; |
| 154 | } | 196 | } |
| 155 | 197 | ||
| 156 | static inline int o2net_keepalive_delay(struct o2nm_node *node) | 198 | static inline int o2net_keepalive_delay(void) |
| 157 | { | 199 | { |
| 158 | return o2nm_single_cluster->cl_keepalive_delay_ms; | 200 | return o2nm_single_cluster->cl_keepalive_delay_ms; |
| 159 | } | 201 | } |
| 160 | 202 | ||
| 161 | static inline int o2net_idle_timeout(struct o2nm_node *node) | 203 | static inline int o2net_idle_timeout(void) |
| 162 | { | 204 | { |
| 163 | return o2nm_single_cluster->cl_idle_timeout_ms; | 205 | return o2nm_single_cluster->cl_idle_timeout_ms; |
| 164 | } | 206 | } |
| @@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref) | |||
| 296 | o2nm_node_put(sc->sc_node); | 338 | o2nm_node_put(sc->sc_node); |
| 297 | sc->sc_node = NULL; | 339 | sc->sc_node = NULL; |
| 298 | 340 | ||
| 341 | o2net_debug_del_sc(sc); | ||
| 299 | kfree(sc); | 342 | kfree(sc); |
| 300 | } | 343 | } |
| 301 | 344 | ||
| @@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) | |||
| 336 | 379 | ||
| 337 | ret = sc; | 380 | ret = sc; |
| 338 | sc->sc_page = page; | 381 | sc->sc_page = page; |
| 382 | o2net_debug_add_sc(sc); | ||
| 339 | sc = NULL; | 383 | sc = NULL; |
| 340 | page = NULL; | 384 | page = NULL; |
| 341 | 385 | ||
| @@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
| 399 | mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); | 443 | mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); |
| 400 | mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); | 444 | mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); |
| 401 | 445 | ||
| 402 | /* we won't reconnect after our valid conn goes away for | ||
| 403 | * this hb iteration.. here so it shows up in the logs */ | ||
| 404 | if (was_valid && !valid && err == 0) | 446 | if (was_valid && !valid && err == 0) |
| 405 | err = -ENOTCONN; | 447 | err = -ENOTCONN; |
| 406 | 448 | ||
| @@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
| 430 | 472 | ||
| 431 | if (!was_valid && valid) { | 473 | if (!was_valid && valid) { |
| 432 | o2quo_conn_up(o2net_num_from_nn(nn)); | 474 | o2quo_conn_up(o2net_num_from_nn(nn)); |
| 433 | /* this is a bit of a hack. we only try reconnecting | ||
| 434 | * when heartbeating starts until we get a connection. | ||
| 435 | * if that connection then dies we don't try reconnecting. | ||
| 436 | * the only way to start connecting again is to down | ||
| 437 | * heartbeat and bring it back up. */ | ||
| 438 | cancel_delayed_work(&nn->nn_connect_expired); | 475 | cancel_delayed_work(&nn->nn_connect_expired); |
| 439 | printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", | 476 | printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", |
| 440 | o2nm_this_node() > sc->sc_node->nd_num ? | 477 | o2nm_this_node() > sc->sc_node->nd_num ? |
| @@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
| 451 | /* delay if we're withing a RECONNECT_DELAY of the | 488 | /* delay if we're withing a RECONNECT_DELAY of the |
| 452 | * last attempt */ | 489 | * last attempt */ |
| 453 | delay = (nn->nn_last_connect_attempt + | 490 | delay = (nn->nn_last_connect_attempt + |
| 454 | msecs_to_jiffies(o2net_reconnect_delay(NULL))) | 491 | msecs_to_jiffies(o2net_reconnect_delay())) |
| 455 | - jiffies; | 492 | - jiffies; |
| 456 | if (delay > msecs_to_jiffies(o2net_reconnect_delay(NULL))) | 493 | if (delay > msecs_to_jiffies(o2net_reconnect_delay())) |
| 457 | delay = 0; | 494 | delay = 0; |
| 458 | mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); | 495 | mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); |
| 459 | queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); | 496 | queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); |
| 497 | |||
| 498 | /* | ||
| 499 | * Delay the expired work after idle timeout. | ||
| 500 | * | ||
| 501 | * We might have lots of failed connection attempts that run | ||
| 502 | * through here but we only cancel the connect_expired work when | ||
| 503 | * a connection attempt succeeds. So only the first enqueue of | ||
| 504 | * the connect_expired work will do anything. The rest will see | ||
| 505 | * that it's already queued and do nothing. | ||
| 506 | */ | ||
| 507 | delay += msecs_to_jiffies(o2net_idle_timeout()); | ||
| 508 | queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay); | ||
| 460 | } | 509 | } |
| 461 | 510 | ||
| 462 | /* keep track of the nn's sc ref for the caller */ | 511 | /* keep track of the nn's sc ref for the caller */ |
| @@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
| 914 | struct o2net_status_wait nsw = { | 963 | struct o2net_status_wait nsw = { |
| 915 | .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), | 964 | .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), |
| 916 | }; | 965 | }; |
| 966 | struct o2net_send_tracking nst; | ||
| 967 | |||
| 968 | o2net_init_nst(&nst, msg_type, key, current, target_node); | ||
| 917 | 969 | ||
| 918 | if (o2net_wq == NULL) { | 970 | if (o2net_wq == NULL) { |
| 919 | mlog(0, "attempt to tx without o2netd running\n"); | 971 | mlog(0, "attempt to tx without o2netd running\n"); |
| @@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
| 939 | goto out; | 991 | goto out; |
| 940 | } | 992 | } |
| 941 | 993 | ||
| 994 | o2net_debug_add_nst(&nst); | ||
| 995 | |||
| 996 | o2net_set_nst_sock_time(&nst); | ||
| 997 | |||
| 942 | ret = wait_event_interruptible(nn->nn_sc_wq, | 998 | ret = wait_event_interruptible(nn->nn_sc_wq, |
| 943 | o2net_tx_can_proceed(nn, &sc, &error)); | 999 | o2net_tx_can_proceed(nn, &sc, &error)); |
| 944 | if (!ret && error) | 1000 | if (!ret && error) |
| @@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
| 946 | if (ret) | 1002 | if (ret) |
| 947 | goto out; | 1003 | goto out; |
| 948 | 1004 | ||
| 1005 | o2net_set_nst_sock_container(&nst, sc); | ||
| 1006 | |||
| 949 | veclen = caller_veclen + 1; | 1007 | veclen = caller_veclen + 1; |
| 950 | vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); | 1008 | vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); |
| 951 | if (vec == NULL) { | 1009 | if (vec == NULL) { |
| @@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
| 972 | goto out; | 1030 | goto out; |
| 973 | 1031 | ||
| 974 | msg->msg_num = cpu_to_be32(nsw.ns_id); | 1032 | msg->msg_num = cpu_to_be32(nsw.ns_id); |
| 1033 | o2net_set_nst_msg_id(&nst, nsw.ns_id); | ||
| 1034 | |||
| 1035 | o2net_set_nst_send_time(&nst); | ||
| 975 | 1036 | ||
| 976 | /* finally, convert the message header to network byte-order | 1037 | /* finally, convert the message header to network byte-order |
| 977 | * and send */ | 1038 | * and send */ |
| @@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
| 986 | } | 1047 | } |
| 987 | 1048 | ||
| 988 | /* wait on other node's handler */ | 1049 | /* wait on other node's handler */ |
| 1050 | o2net_set_nst_status_time(&nst); | ||
| 989 | wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); | 1051 | wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); |
| 990 | 1052 | ||
| 991 | /* Note that we avoid overwriting the callers status return | 1053 | /* Note that we avoid overwriting the callers status return |
| @@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
| 998 | mlog(0, "woken, returning system status %d, user status %d\n", | 1060 | mlog(0, "woken, returning system status %d, user status %d\n", |
| 999 | ret, nsw.ns_status); | 1061 | ret, nsw.ns_status); |
| 1000 | out: | 1062 | out: |
| 1063 | o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ | ||
| 1001 | if (sc) | 1064 | if (sc) |
| 1002 | sc_put(sc); | 1065 | sc_put(sc); |
| 1003 | if (vec) | 1066 | if (vec) |
| @@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
| 1154 | * but isn't. This can ultimately cause corruption. | 1217 | * but isn't. This can ultimately cause corruption. |
| 1155 | */ | 1218 | */ |
| 1156 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != | 1219 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != |
| 1157 | o2net_idle_timeout(sc->sc_node)) { | 1220 | o2net_idle_timeout()) { |
| 1158 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " | 1221 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " |
| 1159 | "%u ms, but we use %u ms locally. disconnecting\n", | 1222 | "%u ms, but we use %u ms locally. disconnecting\n", |
| 1160 | SC_NODEF_ARGS(sc), | 1223 | SC_NODEF_ARGS(sc), |
| 1161 | be32_to_cpu(hand->o2net_idle_timeout_ms), | 1224 | be32_to_cpu(hand->o2net_idle_timeout_ms), |
| 1162 | o2net_idle_timeout(sc->sc_node)); | 1225 | o2net_idle_timeout()); |
| 1163 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1226 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
| 1164 | return -1; | 1227 | return -1; |
| 1165 | } | 1228 | } |
| 1166 | 1229 | ||
| 1167 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != | 1230 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != |
| 1168 | o2net_keepalive_delay(sc->sc_node)) { | 1231 | o2net_keepalive_delay()) { |
| 1169 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " | 1232 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " |
| 1170 | "%u ms, but we use %u ms locally. disconnecting\n", | 1233 | "%u ms, but we use %u ms locally. disconnecting\n", |
| 1171 | SC_NODEF_ARGS(sc), | 1234 | SC_NODEF_ARGS(sc), |
| 1172 | be32_to_cpu(hand->o2net_keepalive_delay_ms), | 1235 | be32_to_cpu(hand->o2net_keepalive_delay_ms), |
| 1173 | o2net_keepalive_delay(sc->sc_node)); | 1236 | o2net_keepalive_delay()); |
| 1174 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1237 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
| 1175 | return -1; | 1238 | return -1; |
| 1176 | } | 1239 | } |
| @@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
| 1193 | * shut down already */ | 1256 | * shut down already */ |
| 1194 | if (nn->nn_sc == sc) { | 1257 | if (nn->nn_sc == sc) { |
| 1195 | o2net_sc_reset_idle_timer(sc); | 1258 | o2net_sc_reset_idle_timer(sc); |
| 1259 | atomic_set(&nn->nn_timeout, 0); | ||
| 1196 | o2net_set_nn_state(nn, sc, 1, 0); | 1260 | o2net_set_nn_state(nn, sc, 1, 0); |
| 1197 | } | 1261 | } |
| 1198 | spin_unlock(&nn->nn_lock); | 1262 | spin_unlock(&nn->nn_lock); |
| @@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void) | |||
| 1347 | { | 1411 | { |
| 1348 | o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( | 1412 | o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( |
| 1349 | O2HB_MAX_WRITE_TIMEOUT_MS); | 1413 | O2HB_MAX_WRITE_TIMEOUT_MS); |
| 1350 | o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( | 1414 | o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout()); |
| 1351 | o2net_idle_timeout(NULL)); | ||
| 1352 | o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( | 1415 | o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( |
| 1353 | o2net_keepalive_delay(NULL)); | 1416 | o2net_keepalive_delay()); |
| 1354 | o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( | 1417 | o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( |
| 1355 | o2net_reconnect_delay(NULL)); | 1418 | o2net_reconnect_delay()); |
| 1356 | } | 1419 | } |
| 1357 | 1420 | ||
| 1358 | /* ------------------------------------------------------------ */ | 1421 | /* ------------------------------------------------------------ */ |
| @@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work) | |||
| 1391 | static void o2net_idle_timer(unsigned long data) | 1454 | static void o2net_idle_timer(unsigned long data) |
| 1392 | { | 1455 | { |
| 1393 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; | 1456 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; |
| 1457 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | ||
| 1394 | struct timeval now; | 1458 | struct timeval now; |
| 1395 | 1459 | ||
| 1396 | do_gettimeofday(&now); | 1460 | do_gettimeofday(&now); |
| 1397 | 1461 | ||
| 1398 | printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " | 1462 | printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " |
| 1399 | "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), | 1463 | "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), |
| 1400 | o2net_idle_timeout(sc->sc_node) / 1000, | 1464 | o2net_idle_timeout() / 1000, |
| 1401 | o2net_idle_timeout(sc->sc_node) % 1000); | 1465 | o2net_idle_timeout() % 1000); |
| 1402 | mlog(ML_NOTICE, "here are some times that might help debug the " | 1466 | mlog(ML_NOTICE, "here are some times that might help debug the " |
| 1403 | "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " | 1467 | "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " |
| 1404 | "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", | 1468 | "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", |
| @@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data) | |||
| 1413 | sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, | 1477 | sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, |
| 1414 | sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); | 1478 | sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); |
| 1415 | 1479 | ||
| 1480 | /* | ||
| 1481 | * Initialize the nn_timeout so that the next connection attempt | ||
| 1482 | * will continue in o2net_start_connect. | ||
| 1483 | */ | ||
| 1484 | atomic_set(&nn->nn_timeout, 1); | ||
| 1485 | |||
| 1416 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); | 1486 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); |
| 1417 | } | 1487 | } |
| 1418 | 1488 | ||
| @@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) | |||
| 1420 | { | 1490 | { |
| 1421 | o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); | 1491 | o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); |
| 1422 | o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, | 1492 | o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, |
| 1423 | msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node))); | 1493 | msecs_to_jiffies(o2net_keepalive_delay())); |
| 1424 | do_gettimeofday(&sc->sc_tv_timer); | 1494 | do_gettimeofday(&sc->sc_tv_timer); |
| 1425 | mod_timer(&sc->sc_idle_timeout, | 1495 | mod_timer(&sc->sc_idle_timeout, |
| 1426 | jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node))); | 1496 | jiffies + msecs_to_jiffies(o2net_idle_timeout())); |
| 1427 | } | 1497 | } |
| 1428 | 1498 | ||
| 1429 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) | 1499 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) |
| @@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work) | |||
| 1447 | struct socket *sock = NULL; | 1517 | struct socket *sock = NULL; |
| 1448 | struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; | 1518 | struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; |
| 1449 | int ret = 0, stop; | 1519 | int ret = 0, stop; |
| 1520 | unsigned int timeout; | ||
| 1450 | 1521 | ||
| 1451 | /* if we're greater we initiate tx, otherwise we accept */ | 1522 | /* if we're greater we initiate tx, otherwise we accept */ |
| 1452 | if (o2nm_this_node() <= o2net_num_from_nn(nn)) | 1523 | if (o2nm_this_node() <= o2net_num_from_nn(nn)) |
| @@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work) | |||
| 1466 | } | 1537 | } |
| 1467 | 1538 | ||
| 1468 | spin_lock(&nn->nn_lock); | 1539 | spin_lock(&nn->nn_lock); |
| 1469 | /* see if we already have one pending or have given up */ | 1540 | /* |
| 1470 | stop = (nn->nn_sc || nn->nn_persistent_error); | 1541 | * see if we already have one pending or have given up. |
| 1542 | * For nn_timeout, it is set when we close the connection | ||
| 1543 | * because of the idle time out. So it means that we have | ||
| 1544 | * at least connected to that node successfully once, | ||
| 1545 | * now try to connect to it again. | ||
| 1546 | */ | ||
| 1547 | timeout = atomic_read(&nn->nn_timeout); | ||
| 1548 | stop = (nn->nn_sc || | ||
| 1549 | (nn->nn_persistent_error && | ||
| 1550 | (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); | ||
| 1471 | spin_unlock(&nn->nn_lock); | 1551 | spin_unlock(&nn->nn_lock); |
| 1472 | if (stop) | 1552 | if (stop) |
| 1473 | goto out; | 1553 | goto out; |
| @@ -1555,8 +1635,8 @@ static void o2net_connect_expired(struct work_struct *work) | |||
| 1555 | mlog(ML_ERROR, "no connection established with node %u after " | 1635 | mlog(ML_ERROR, "no connection established with node %u after " |
| 1556 | "%u.%u seconds, giving up and returning errors.\n", | 1636 | "%u.%u seconds, giving up and returning errors.\n", |
| 1557 | o2net_num_from_nn(nn), | 1637 | o2net_num_from_nn(nn), |
| 1558 | o2net_idle_timeout(NULL) / 1000, | 1638 | o2net_idle_timeout() / 1000, |
| 1559 | o2net_idle_timeout(NULL) % 1000); | 1639 | o2net_idle_timeout() % 1000); |
| 1560 | 1640 | ||
| 1561 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); | 1641 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); |
| 1562 | } | 1642 | } |
| @@ -1579,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node) | |||
| 1579 | 1659 | ||
| 1580 | /* don't reconnect until it's heartbeating again */ | 1660 | /* don't reconnect until it's heartbeating again */ |
| 1581 | spin_lock(&nn->nn_lock); | 1661 | spin_lock(&nn->nn_lock); |
| 1662 | atomic_set(&nn->nn_timeout, 0); | ||
| 1582 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); | 1663 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); |
| 1583 | spin_unlock(&nn->nn_lock); | 1664 | spin_unlock(&nn->nn_lock); |
| 1584 | 1665 | ||
| @@ -1610,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, | |||
| 1610 | 1691 | ||
| 1611 | /* ensure an immediate connect attempt */ | 1692 | /* ensure an immediate connect attempt */ |
| 1612 | nn->nn_last_connect_attempt = jiffies - | 1693 | nn->nn_last_connect_attempt = jiffies - |
| 1613 | (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); | 1694 | (msecs_to_jiffies(o2net_reconnect_delay()) + 1); |
| 1614 | 1695 | ||
| 1615 | if (node_num != o2nm_this_node()) { | 1696 | if (node_num != o2nm_this_node()) { |
| 1616 | /* heartbeat doesn't work unless a local node number is | ||
| 1617 | * configured and doing so brings up the o2net_wq, so we can | ||
| 1618 | * use it.. */ | ||
| 1619 | queue_delayed_work(o2net_wq, &nn->nn_connect_expired, | ||
| 1620 | msecs_to_jiffies(o2net_idle_timeout(node))); | ||
| 1621 | |||
| 1622 | /* believe it or not, accept and node hearbeating testing | 1697 | /* believe it or not, accept and node hearbeating testing |
| 1623 | * can succeed for this node before we got here.. so | 1698 | * can succeed for this node before we got here.. so |
| 1624 | * only use set_nn_state to clear the persistent error | 1699 | * only use set_nn_state to clear the persistent error |
| 1625 | * if that hasn't already happened */ | 1700 | * if that hasn't already happened */ |
| 1626 | spin_lock(&nn->nn_lock); | 1701 | spin_lock(&nn->nn_lock); |
| 1702 | atomic_set(&nn->nn_timeout, 0); | ||
| 1627 | if (nn->nn_persistent_error) | 1703 | if (nn->nn_persistent_error) |
| 1628 | o2net_set_nn_state(nn, NULL, 0, 0); | 1704 | o2net_set_nn_state(nn, NULL, 0, 0); |
| 1629 | spin_unlock(&nn->nn_lock); | 1705 | spin_unlock(&nn->nn_lock); |
| @@ -1747,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock) | |||
| 1747 | new_sock = NULL; | 1823 | new_sock = NULL; |
| 1748 | 1824 | ||
| 1749 | spin_lock(&nn->nn_lock); | 1825 | spin_lock(&nn->nn_lock); |
| 1826 | atomic_set(&nn->nn_timeout, 0); | ||
| 1750 | o2net_set_nn_state(nn, sc, 0, 0); | 1827 | o2net_set_nn_state(nn, sc, 0, 0); |
| 1751 | spin_unlock(&nn->nn_lock); | 1828 | spin_unlock(&nn->nn_lock); |
| 1752 | 1829 | ||
| @@ -1922,6 +1999,9 @@ int o2net_init(void) | |||
| 1922 | 1999 | ||
| 1923 | o2quo_init(); | 2000 | o2quo_init(); |
| 1924 | 2001 | ||
| 2002 | if (o2net_debugfs_init()) | ||
| 2003 | return -ENOMEM; | ||
| 2004 | |||
| 1925 | o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); | 2005 | o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); |
| 1926 | o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); | 2006 | o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); |
| 1927 | o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); | 2007 | o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); |
| @@ -1941,6 +2021,7 @@ int o2net_init(void) | |||
| 1941 | for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { | 2021 | for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { |
| 1942 | struct o2net_node *nn = o2net_nn_from_num(i); | 2022 | struct o2net_node *nn = o2net_nn_from_num(i); |
| 1943 | 2023 | ||
| 2024 | atomic_set(&nn->nn_timeout, 0); | ||
| 1944 | spin_lock_init(&nn->nn_lock); | 2025 | spin_lock_init(&nn->nn_lock); |
| 1945 | INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); | 2026 | INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); |
| 1946 | INIT_DELAYED_WORK(&nn->nn_connect_expired, | 2027 | INIT_DELAYED_WORK(&nn->nn_connect_expired, |
| @@ -1962,4 +2043,5 @@ void o2net_exit(void) | |||
| 1962 | kfree(o2net_hand); | 2043 | kfree(o2net_hand); |
| 1963 | kfree(o2net_keep_req); | 2044 | kfree(o2net_keep_req); |
| 1964 | kfree(o2net_keep_resp); | 2045 | kfree(o2net_keep_resp); |
| 2046 | o2net_debugfs_exit(); | ||
| 1965 | } | 2047 | } |
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index f36f66aab3dd..a705d5d19036 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h | |||
| @@ -117,4 +117,36 @@ int o2net_num_connected_peers(void); | |||
| 117 | int o2net_init(void); | 117 | int o2net_init(void); |
| 118 | void o2net_exit(void); | 118 | void o2net_exit(void); |
| 119 | 119 | ||
| 120 | struct o2net_send_tracking; | ||
| 121 | struct o2net_sock_container; | ||
| 122 | |||
| 123 | #ifdef CONFIG_DEBUG_FS | ||
| 124 | int o2net_debugfs_init(void); | ||
| 125 | void o2net_debugfs_exit(void); | ||
| 126 | void o2net_debug_add_nst(struct o2net_send_tracking *nst); | ||
| 127 | void o2net_debug_del_nst(struct o2net_send_tracking *nst); | ||
| 128 | void o2net_debug_add_sc(struct o2net_sock_container *sc); | ||
| 129 | void o2net_debug_del_sc(struct o2net_sock_container *sc); | ||
| 130 | #else | ||
| 131 | static int o2net_debugfs_init(void) | ||
| 132 | { | ||
| 133 | return 0; | ||
| 134 | } | ||
| 135 | static void o2net_debugfs_exit(void) | ||
| 136 | { | ||
| 137 | } | ||
| 138 | static void o2net_debug_add_nst(struct o2net_send_tracking *nst) | ||
| 139 | { | ||
| 140 | } | ||
| 141 | static void o2net_debug_del_nst(struct o2net_send_tracking *nst) | ||
| 142 | { | ||
| 143 | } | ||
| 144 | static void o2net_debug_add_sc(struct o2net_sock_container *sc) | ||
| 145 | { | ||
| 146 | } | ||
| 147 | static void o2net_debug_del_sc(struct o2net_sock_container *sc) | ||
| 148 | { | ||
| 149 | } | ||
| 150 | #endif /* CONFIG_DEBUG_FS */ | ||
| 151 | |||
| 120 | #endif /* O2CLUSTER_TCP_H */ | 152 | #endif /* O2CLUSTER_TCP_H */ |
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index d25b9af28500..8d58cfe410b1 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h | |||
| @@ -95,6 +95,8 @@ struct o2net_node { | |||
| 95 | unsigned nn_sc_valid:1; | 95 | unsigned nn_sc_valid:1; |
| 96 | /* if this is set tx just returns it */ | 96 | /* if this is set tx just returns it */ |
| 97 | int nn_persistent_error; | 97 | int nn_persistent_error; |
| 98 | /* It is only set to 1 after the idle time out. */ | ||
| 99 | atomic_t nn_timeout; | ||
| 98 | 100 | ||
| 99 | /* threads waiting for an sc to arrive wait on the wq for generation | 101 | /* threads waiting for an sc to arrive wait on the wq for generation |
| 100 | * to increase. it is increased when a connecting socket succeeds | 102 | * to increase. it is increased when a connecting socket succeeds |
| @@ -164,7 +166,9 @@ struct o2net_sock_container { | |||
| 164 | /* original handlers for the sockets */ | 166 | /* original handlers for the sockets */ |
| 165 | void (*sc_state_change)(struct sock *sk); | 167 | void (*sc_state_change)(struct sock *sk); |
| 166 | void (*sc_data_ready)(struct sock *sk, int bytes); | 168 | void (*sc_data_ready)(struct sock *sk, int bytes); |
| 167 | 169 | #ifdef CONFIG_DEBUG_FS | |
| 170 | struct list_head sc_net_debug_item; | ||
| 171 | #endif | ||
| 168 | struct timeval sc_tv_timer; | 172 | struct timeval sc_tv_timer; |
| 169 | struct timeval sc_tv_data_ready; | 173 | struct timeval sc_tv_data_ready; |
| 170 | struct timeval sc_tv_advance_start; | 174 | struct timeval sc_tv_advance_start; |
| @@ -206,4 +210,24 @@ struct o2net_status_wait { | |||
| 206 | struct list_head ns_node_item; | 210 | struct list_head ns_node_item; |
| 207 | }; | 211 | }; |
| 208 | 212 | ||
| 213 | #ifdef CONFIG_DEBUG_FS | ||
| 214 | /* just for state dumps */ | ||
| 215 | struct o2net_send_tracking { | ||
| 216 | struct list_head st_net_debug_item; | ||
| 217 | struct task_struct *st_task; | ||
| 218 | struct o2net_sock_container *st_sc; | ||
| 219 | u32 st_id; | ||
| 220 | u32 st_msg_type; | ||
| 221 | u32 st_msg_key; | ||
| 222 | u8 st_node; | ||
| 223 | struct timeval st_sock_time; | ||
| 224 | struct timeval st_send_time; | ||
| 225 | struct timeval st_status_time; | ||
| 226 | }; | ||
| 227 | #else | ||
| 228 | struct o2net_send_tracking { | ||
| 229 | u32 dummy; | ||
| 230 | }; | ||
| 231 | #endif /* CONFIG_DEBUG_FS */ | ||
| 232 | |||
| 209 | #endif /* O2CLUSTER_TCP_INTERNAL_H */ | 233 | #endif /* O2CLUSTER_TCP_INTERNAL_H */ |
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index ce3f7c29d270..190361375700 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | EXTRA_CFLAGS += -Ifs/ocfs2 | 1 | EXTRA_CFLAGS += -Ifs/ocfs2 |
| 2 | 2 | ||
| 3 | obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o | 3 | obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o |
| 4 | 4 | ||
| 5 | ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ | 5 | ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ |
| 6 | dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o | 6 | dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o |
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index dc8ea666efdb..d5a86fb81a49 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
| @@ -49,6 +49,41 @@ | |||
| 49 | /* Intended to make it easier for us to switch out hash functions */ | 49 | /* Intended to make it easier for us to switch out hash functions */ |
| 50 | #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) | 50 | #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) |
| 51 | 51 | ||
| 52 | enum dlm_mle_type { | ||
| 53 | DLM_MLE_BLOCK, | ||
| 54 | DLM_MLE_MASTER, | ||
| 55 | DLM_MLE_MIGRATION | ||
| 56 | }; | ||
| 57 | |||
| 58 | struct dlm_lock_name { | ||
| 59 | u8 len; | ||
| 60 | u8 name[DLM_LOCKID_NAME_MAX]; | ||
| 61 | }; | ||
| 62 | |||
| 63 | struct dlm_master_list_entry { | ||
| 64 | struct list_head list; | ||
| 65 | struct list_head hb_events; | ||
| 66 | struct dlm_ctxt *dlm; | ||
| 67 | spinlock_t spinlock; | ||
| 68 | wait_queue_head_t wq; | ||
| 69 | atomic_t woken; | ||
| 70 | struct kref mle_refs; | ||
| 71 | int inuse; | ||
| 72 | unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
| 73 | unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
| 74 | unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
| 75 | unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
| 76 | u8 master; | ||
| 77 | u8 new_master; | ||
| 78 | enum dlm_mle_type type; | ||
| 79 | struct o2hb_callback_func mle_hb_up; | ||
| 80 | struct o2hb_callback_func mle_hb_down; | ||
| 81 | union { | ||
| 82 | struct dlm_lock_resource *res; | ||
| 83 | struct dlm_lock_name name; | ||
| 84 | } u; | ||
| 85 | }; | ||
| 86 | |||
| 52 | enum dlm_ast_type { | 87 | enum dlm_ast_type { |
| 53 | DLM_AST = 0, | 88 | DLM_AST = 0, |
| 54 | DLM_BAST, | 89 | DLM_BAST, |
| @@ -101,6 +136,7 @@ struct dlm_ctxt | |||
| 101 | struct list_head purge_list; | 136 | struct list_head purge_list; |
| 102 | struct list_head pending_asts; | 137 | struct list_head pending_asts; |
| 103 | struct list_head pending_basts; | 138 | struct list_head pending_basts; |
| 139 | struct list_head tracking_list; | ||
| 104 | unsigned int purge_count; | 140 | unsigned int purge_count; |
| 105 | spinlock_t spinlock; | 141 | spinlock_t spinlock; |
| 106 | spinlock_t ast_lock; | 142 | spinlock_t ast_lock; |
| @@ -122,6 +158,9 @@ struct dlm_ctxt | |||
| 122 | atomic_t remote_resources; | 158 | atomic_t remote_resources; |
| 123 | atomic_t unknown_resources; | 159 | atomic_t unknown_resources; |
| 124 | 160 | ||
| 161 | struct dlm_debug_ctxt *dlm_debug_ctxt; | ||
| 162 | struct dentry *dlm_debugfs_subroot; | ||
| 163 | |||
| 125 | /* NOTE: Next three are protected by dlm_domain_lock */ | 164 | /* NOTE: Next three are protected by dlm_domain_lock */ |
| 126 | struct kref dlm_refs; | 165 | struct kref dlm_refs; |
| 127 | enum dlm_ctxt_state dlm_state; | 166 | enum dlm_ctxt_state dlm_state; |
| @@ -270,6 +309,9 @@ struct dlm_lock_resource | |||
| 270 | struct list_head dirty; | 309 | struct list_head dirty; |
| 271 | struct list_head recovering; // dlm_recovery_ctxt.resources list | 310 | struct list_head recovering; // dlm_recovery_ctxt.resources list |
| 272 | 311 | ||
| 312 | /* Added during init and removed during release */ | ||
| 313 | struct list_head tracking; /* dlm->tracking_list */ | ||
| 314 | |||
| 273 | /* unused lock resources have their last_used stamped and are | 315 | /* unused lock resources have their last_used stamped and are |
| 274 | * put on a list for the dlm thread to run. */ | 316 | * put on a list for the dlm thread to run. */ |
| 275 | unsigned long last_used; | 317 | unsigned long last_used; |
| @@ -963,9 +1005,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) | |||
| 963 | DLM_LOCK_RES_MIGRATING)); | 1005 | DLM_LOCK_RES_MIGRATING)); |
| 964 | } | 1006 | } |
| 965 | 1007 | ||
| 1008 | /* create/destroy slab caches */ | ||
| 1009 | int dlm_init_master_caches(void); | ||
| 1010 | void dlm_destroy_master_caches(void); | ||
| 1011 | |||
| 1012 | int dlm_init_lock_cache(void); | ||
| 1013 | void dlm_destroy_lock_cache(void); | ||
| 966 | 1014 | ||
| 967 | int dlm_init_mle_cache(void); | 1015 | int dlm_init_mle_cache(void); |
| 968 | void dlm_destroy_mle_cache(void); | 1016 | void dlm_destroy_mle_cache(void); |
| 1017 | |||
| 969 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); | 1018 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); |
| 970 | int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, | 1019 | int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, |
| 971 | struct dlm_lock_resource *res); | 1020 | struct dlm_lock_resource *res); |
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 64239b37e5d4..5f6d858770a2 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | * | 5 | * |
| 6 | * debug functionality for the dlm | 6 | * debug functionality for the dlm |
| 7 | * | 7 | * |
| 8 | * Copyright (C) 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2004, 2008 Oracle. All rights reserved. |
| 9 | * | 9 | * |
| 10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
| 11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/utsname.h> | 30 | #include <linux/utsname.h> |
| 31 | #include <linux/sysctl.h> | 31 | #include <linux/sysctl.h> |
| 32 | #include <linux/spinlock.h> | 32 | #include <linux/spinlock.h> |
| 33 | #include <linux/debugfs.h> | ||
| 33 | 34 | ||
| 34 | #include "cluster/heartbeat.h" | 35 | #include "cluster/heartbeat.h" |
| 35 | #include "cluster/nodemanager.h" | 36 | #include "cluster/nodemanager.h" |
| @@ -37,17 +38,16 @@ | |||
| 37 | 38 | ||
| 38 | #include "dlmapi.h" | 39 | #include "dlmapi.h" |
| 39 | #include "dlmcommon.h" | 40 | #include "dlmcommon.h" |
| 40 | |||
| 41 | #include "dlmdomain.h" | 41 | #include "dlmdomain.h" |
| 42 | #include "dlmdebug.h" | ||
| 42 | 43 | ||
| 43 | #define MLOG_MASK_PREFIX ML_DLM | 44 | #define MLOG_MASK_PREFIX ML_DLM |
| 44 | #include "cluster/masklog.h" | 45 | #include "cluster/masklog.h" |
| 45 | 46 | ||
| 47 | int stringify_lockname(const char *lockname, int locklen, char *buf, int len); | ||
| 48 | |||
| 46 | void dlm_print_one_lock_resource(struct dlm_lock_resource *res) | 49 | void dlm_print_one_lock_resource(struct dlm_lock_resource *res) |
| 47 | { | 50 | { |
| 48 | mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", | ||
| 49 | res->lockname.len, res->lockname.name, | ||
| 50 | res->owner, res->state); | ||
| 51 | spin_lock(&res->spinlock); | 51 | spin_lock(&res->spinlock); |
| 52 | __dlm_print_one_lock_resource(res); | 52 | __dlm_print_one_lock_resource(res); |
| 53 | spin_unlock(&res->spinlock); | 53 | spin_unlock(&res->spinlock); |
| @@ -58,7 +58,7 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) | |||
| 58 | int bit; | 58 | int bit; |
| 59 | assert_spin_locked(&res->spinlock); | 59 | assert_spin_locked(&res->spinlock); |
| 60 | 60 | ||
| 61 | mlog(ML_NOTICE, " refmap nodes: [ "); | 61 | printk(" refmap nodes: [ "); |
| 62 | bit = 0; | 62 | bit = 0; |
| 63 | while (1) { | 63 | while (1) { |
| 64 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); | 64 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); |
| @@ -70,63 +70,66 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) | |||
| 70 | printk("], inflight=%u\n", res->inflight_locks); | 70 | printk("], inflight=%u\n", res->inflight_locks); |
| 71 | } | 71 | } |
| 72 | 72 | ||
| 73 | static void __dlm_print_lock(struct dlm_lock *lock) | ||
| 74 | { | ||
| 75 | spin_lock(&lock->spinlock); | ||
| 76 | |||
| 77 | printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, " | ||
| 78 | "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), " | ||
| 79 | "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n", | ||
| 80 | lock->ml.type, lock->ml.convert_type, lock->ml.node, | ||
| 81 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
| 82 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
| 83 | atomic_read(&lock->lock_refs.refcount), | ||
| 84 | (list_empty(&lock->ast_list) ? 'y' : 'n'), | ||
| 85 | (lock->ast_pending ? 'y' : 'n'), | ||
| 86 | (list_empty(&lock->bast_list) ? 'y' : 'n'), | ||
| 87 | (lock->bast_pending ? 'y' : 'n'), | ||
| 88 | (lock->convert_pending ? 'y' : 'n'), | ||
| 89 | (lock->lock_pending ? 'y' : 'n'), | ||
| 90 | (lock->cancel_pending ? 'y' : 'n'), | ||
| 91 | (lock->unlock_pending ? 'y' : 'n')); | ||
| 92 | |||
| 93 | spin_unlock(&lock->spinlock); | ||
| 94 | } | ||
| 95 | |||
| 73 | void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) | 96 | void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) |
| 74 | { | 97 | { |
| 75 | struct list_head *iter2; | 98 | struct list_head *iter2; |
| 76 | struct dlm_lock *lock; | 99 | struct dlm_lock *lock; |
| 100 | char buf[DLM_LOCKID_NAME_MAX]; | ||
| 77 | 101 | ||
| 78 | assert_spin_locked(&res->spinlock); | 102 | assert_spin_locked(&res->spinlock); |
| 79 | 103 | ||
| 80 | mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", | 104 | stringify_lockname(res->lockname.name, res->lockname.len, |
| 81 | res->lockname.len, res->lockname.name, | 105 | buf, sizeof(buf) - 1); |
| 82 | res->owner, res->state); | 106 | printk("lockres: %s, owner=%u, state=%u\n", |
| 83 | mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", | 107 | buf, res->owner, res->state); |
| 84 | res->last_used, list_empty(&res->purge) ? "no" : "yes"); | 108 | printk(" last used: %lu, refcnt: %u, on purge list: %s\n", |
| 109 | res->last_used, atomic_read(&res->refs.refcount), | ||
| 110 | list_empty(&res->purge) ? "no" : "yes"); | ||
| 111 | printk(" on dirty list: %s, on reco list: %s, " | ||
| 112 | "migrating pending: %s\n", | ||
| 113 | list_empty(&res->dirty) ? "no" : "yes", | ||
| 114 | list_empty(&res->recovering) ? "no" : "yes", | ||
| 115 | res->migration_pending ? "yes" : "no"); | ||
| 116 | printk(" inflight locks: %d, asts reserved: %d\n", | ||
| 117 | res->inflight_locks, atomic_read(&res->asts_reserved)); | ||
| 85 | dlm_print_lockres_refmap(res); | 118 | dlm_print_lockres_refmap(res); |
| 86 | mlog(ML_NOTICE, " granted queue: \n"); | 119 | printk(" granted queue:\n"); |
| 87 | list_for_each(iter2, &res->granted) { | 120 | list_for_each(iter2, &res->granted) { |
| 88 | lock = list_entry(iter2, struct dlm_lock, list); | 121 | lock = list_entry(iter2, struct dlm_lock, list); |
| 89 | spin_lock(&lock->spinlock); | 122 | __dlm_print_lock(lock); |
| 90 | mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " | ||
| 91 | "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", | ||
| 92 | lock->ml.type, lock->ml.convert_type, lock->ml.node, | ||
| 93 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
| 94 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
| 95 | list_empty(&lock->ast_list) ? 'y' : 'n', | ||
| 96 | lock->ast_pending ? 'y' : 'n', | ||
| 97 | list_empty(&lock->bast_list) ? 'y' : 'n', | ||
| 98 | lock->bast_pending ? 'y' : 'n'); | ||
| 99 | spin_unlock(&lock->spinlock); | ||
| 100 | } | 123 | } |
| 101 | mlog(ML_NOTICE, " converting queue: \n"); | 124 | printk(" converting queue:\n"); |
| 102 | list_for_each(iter2, &res->converting) { | 125 | list_for_each(iter2, &res->converting) { |
| 103 | lock = list_entry(iter2, struct dlm_lock, list); | 126 | lock = list_entry(iter2, struct dlm_lock, list); |
| 104 | spin_lock(&lock->spinlock); | 127 | __dlm_print_lock(lock); |
| 105 | mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " | ||
| 106 | "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", | ||
| 107 | lock->ml.type, lock->ml.convert_type, lock->ml.node, | ||
| 108 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
| 109 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
| 110 | list_empty(&lock->ast_list) ? 'y' : 'n', | ||
| 111 | lock->ast_pending ? 'y' : 'n', | ||
| 112 | list_empty(&lock->bast_list) ? 'y' : 'n', | ||
| 113 | lock->bast_pending ? 'y' : 'n'); | ||
| 114 | spin_unlock(&lock->spinlock); | ||
| 115 | } | 128 | } |
| 116 | mlog(ML_NOTICE, " blocked queue: \n"); | 129 | printk(" blocked queue:\n"); |
| 117 | list_for_each(iter2, &res->blocked) { | 130 | list_for_each(iter2, &res->blocked) { |
| 118 | lock = list_entry(iter2, struct dlm_lock, list); | 131 | lock = list_entry(iter2, struct dlm_lock, list); |
| 119 | spin_lock(&lock->spinlock); | 132 | __dlm_print_lock(lock); |
| 120 | mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " | ||
| 121 | "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", | ||
| 122 | lock->ml.type, lock->ml.convert_type, lock->ml.node, | ||
| 123 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
| 124 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
| 125 | list_empty(&lock->ast_list) ? 'y' : 'n', | ||
| 126 | lock->ast_pending ? 'y' : 'n', | ||
| 127 | list_empty(&lock->bast_list) ? 'y' : 'n', | ||
| 128 | lock->bast_pending ? 'y' : 'n'); | ||
| 129 | spin_unlock(&lock->spinlock); | ||
| 130 | } | 133 | } |
| 131 | } | 134 | } |
| 132 | 135 | ||
| @@ -136,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid) | |||
| 136 | } | 139 | } |
| 137 | EXPORT_SYMBOL_GPL(dlm_print_one_lock); | 140 | EXPORT_SYMBOL_GPL(dlm_print_one_lock); |
| 138 | 141 | ||
| 139 | #if 0 | ||
| 140 | void dlm_dump_lock_resources(struct dlm_ctxt *dlm) | ||
| 141 | { | ||
| 142 | struct dlm_lock_resource *res; | ||
| 143 | struct hlist_node *iter; | ||
| 144 | struct hlist_head *bucket; | ||
| 145 | int i; | ||
| 146 | |||
| 147 | mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n", | ||
| 148 | dlm->name, dlm->node_num, dlm->key); | ||
| 149 | if (!dlm || !dlm->name) { | ||
| 150 | mlog(ML_ERROR, "dlm=%p\n", dlm); | ||
| 151 | return; | ||
| 152 | } | ||
| 153 | |||
| 154 | spin_lock(&dlm->spinlock); | ||
| 155 | for (i=0; i<DLM_HASH_BUCKETS; i++) { | ||
| 156 | bucket = dlm_lockres_hash(dlm, i); | ||
| 157 | hlist_for_each_entry(res, iter, bucket, hash_node) | ||
| 158 | dlm_print_one_lock_resource(res); | ||
| 159 | } | ||
| 160 | spin_unlock(&dlm->spinlock); | ||
| 161 | } | ||
| 162 | #endif /* 0 */ | ||
| 163 | |||
| 164 | static const char *dlm_errnames[] = { | 142 | static const char *dlm_errnames[] = { |
| 165 | [DLM_NORMAL] = "DLM_NORMAL", | 143 | [DLM_NORMAL] = "DLM_NORMAL", |
| 166 | [DLM_GRANTED] = "DLM_GRANTED", | 144 | [DLM_GRANTED] = "DLM_GRANTED", |
| @@ -266,3 +244,792 @@ const char *dlm_errname(enum dlm_status err) | |||
| 266 | return dlm_errnames[err]; | 244 | return dlm_errnames[err]; |
| 267 | } | 245 | } |
| 268 | EXPORT_SYMBOL_GPL(dlm_errname); | 246 | EXPORT_SYMBOL_GPL(dlm_errname); |
| 247 | |||
| 248 | /* NOTE: This function converts a lockname into a string. It uses knowledge | ||
| 249 | * of the format of the lockname that should be outside the purview of the dlm. | ||
| 250 | * We are adding only to make dlm debugging slightly easier. | ||
| 251 | * | ||
| 252 | * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h. | ||
| 253 | */ | ||
| 254 | int stringify_lockname(const char *lockname, int locklen, char *buf, int len) | ||
| 255 | { | ||
| 256 | int out = 0; | ||
| 257 | __be64 inode_blkno_be; | ||
| 258 | |||
| 259 | #define OCFS2_DENTRY_LOCK_INO_START 18 | ||
| 260 | if (*lockname == 'N') { | ||
| 261 | memcpy((__be64 *)&inode_blkno_be, | ||
| 262 | (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START], | ||
| 263 | sizeof(__be64)); | ||
| 264 | out += snprintf(buf + out, len - out, "%.*s%08x", | ||
| 265 | OCFS2_DENTRY_LOCK_INO_START - 1, lockname, | ||
| 266 | (unsigned int)be64_to_cpu(inode_blkno_be)); | ||
| 267 | } else | ||
| 268 | out += snprintf(buf + out, len - out, "%.*s", | ||
| 269 | locklen, lockname); | ||
| 270 | return out; | ||
| 271 | } | ||
| 272 | |||
| 273 | static int stringify_nodemap(unsigned long *nodemap, int maxnodes, | ||
| 274 | char *buf, int len) | ||
| 275 | { | ||
| 276 | int out = 0; | ||
| 277 | int i = -1; | ||
| 278 | |||
| 279 | while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes) | ||
| 280 | out += snprintf(buf + out, len - out, "%d ", i); | ||
| 281 | |||
| 282 | return out; | ||
| 283 | } | ||
| 284 | |||
| 285 | static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) | ||
| 286 | { | ||
| 287 | int out = 0; | ||
| 288 | unsigned int namelen; | ||
| 289 | const char *name; | ||
| 290 | char *mle_type; | ||
| 291 | |||
| 292 | if (mle->type != DLM_MLE_MASTER) { | ||
| 293 | namelen = mle->u.name.len; | ||
| 294 | name = mle->u.name.name; | ||
| 295 | } else { | ||
| 296 | namelen = mle->u.res->lockname.len; | ||
| 297 | name = mle->u.res->lockname.name; | ||
| 298 | } | ||
| 299 | |||
| 300 | if (mle->type == DLM_MLE_BLOCK) | ||
| 301 | mle_type = "BLK"; | ||
| 302 | else if (mle->type == DLM_MLE_MASTER) | ||
| 303 | mle_type = "MAS"; | ||
| 304 | else | ||
| 305 | mle_type = "MIG"; | ||
| 306 | |||
| 307 | out += stringify_lockname(name, namelen, buf + out, len - out); | ||
| 308 | out += snprintf(buf + out, len - out, | ||
| 309 | "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", | ||
| 310 | mle_type, mle->master, mle->new_master, | ||
| 311 | !list_empty(&mle->hb_events), | ||
| 312 | !!mle->inuse, | ||
| 313 | atomic_read(&mle->mle_refs.refcount)); | ||
| 314 | |||
| 315 | out += snprintf(buf + out, len - out, "Maybe="); | ||
| 316 | out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, | ||
| 317 | buf + out, len - out); | ||
| 318 | out += snprintf(buf + out, len - out, "\n"); | ||
| 319 | |||
| 320 | out += snprintf(buf + out, len - out, "Vote="); | ||
| 321 | out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES, | ||
| 322 | buf + out, len - out); | ||
| 323 | out += snprintf(buf + out, len - out, "\n"); | ||
| 324 | |||
| 325 | out += snprintf(buf + out, len - out, "Response="); | ||
| 326 | out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES, | ||
| 327 | buf + out, len - out); | ||
| 328 | out += snprintf(buf + out, len - out, "\n"); | ||
| 329 | |||
| 330 | out += snprintf(buf + out, len - out, "Node="); | ||
| 331 | out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES, | ||
| 332 | buf + out, len - out); | ||
| 333 | out += snprintf(buf + out, len - out, "\n"); | ||
| 334 | |||
| 335 | out += snprintf(buf + out, len - out, "\n"); | ||
| 336 | |||
| 337 | return out; | ||
| 338 | } | ||
| 339 | |||
| 340 | void dlm_print_one_mle(struct dlm_master_list_entry *mle) | ||
| 341 | { | ||
| 342 | char *buf; | ||
| 343 | |||
| 344 | buf = (char *) get_zeroed_page(GFP_NOFS); | ||
| 345 | if (buf) { | ||
| 346 | dump_mle(mle, buf, PAGE_SIZE - 1); | ||
| 347 | free_page((unsigned long)buf); | ||
| 348 | } | ||
| 349 | } | ||
| 350 | |||
| 351 | #ifdef CONFIG_DEBUG_FS | ||
| 352 | |||
| 353 | static struct dentry *dlm_debugfs_root = NULL; | ||
| 354 | |||
| 355 | #define DLM_DEBUGFS_DIR "o2dlm" | ||
| 356 | #define DLM_DEBUGFS_DLM_STATE "dlm_state" | ||
| 357 | #define DLM_DEBUGFS_LOCKING_STATE "locking_state" | ||
| 358 | #define DLM_DEBUGFS_MLE_STATE "mle_state" | ||
| 359 | #define DLM_DEBUGFS_PURGE_LIST "purge_list" | ||
| 360 | |||
| 361 | /* begin - utils funcs */ | ||
| 362 | static void dlm_debug_free(struct kref *kref) | ||
| 363 | { | ||
| 364 | struct dlm_debug_ctxt *dc; | ||
| 365 | |||
| 366 | dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); | ||
| 367 | |||
| 368 | kfree(dc); | ||
| 369 | } | ||
| 370 | |||
| 371 | void dlm_debug_put(struct dlm_debug_ctxt *dc) | ||
| 372 | { | ||
| 373 | if (dc) | ||
| 374 | kref_put(&dc->debug_refcnt, dlm_debug_free); | ||
| 375 | } | ||
| 376 | |||
| 377 | static void dlm_debug_get(struct dlm_debug_ctxt *dc) | ||
| 378 | { | ||
| 379 | kref_get(&dc->debug_refcnt); | ||
| 380 | } | ||
| 381 | |||
| 382 | static struct debug_buffer *debug_buffer_allocate(void) | ||
| 383 | { | ||
| 384 | struct debug_buffer *db = NULL; | ||
| 385 | |||
| 386 | db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL); | ||
| 387 | if (!db) | ||
| 388 | goto bail; | ||
| 389 | |||
| 390 | db->len = PAGE_SIZE; | ||
| 391 | db->buf = kmalloc(db->len, GFP_KERNEL); | ||
| 392 | if (!db->buf) | ||
| 393 | goto bail; | ||
| 394 | |||
| 395 | return db; | ||
| 396 | bail: | ||
| 397 | kfree(db); | ||
| 398 | return NULL; | ||
| 399 | } | ||
| 400 | |||
| 401 | static ssize_t debug_buffer_read(struct file *file, char __user *buf, | ||
| 402 | size_t nbytes, loff_t *ppos) | ||
| 403 | { | ||
| 404 | struct debug_buffer *db = file->private_data; | ||
| 405 | |||
| 406 | return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len); | ||
| 407 | } | ||
| 408 | |||
| 409 | static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence) | ||
| 410 | { | ||
| 411 | struct debug_buffer *db = file->private_data; | ||
| 412 | loff_t new = -1; | ||
| 413 | |||
| 414 | switch (whence) { | ||
| 415 | case 0: | ||
| 416 | new = off; | ||
| 417 | break; | ||
| 418 | case 1: | ||
| 419 | new = file->f_pos + off; | ||
| 420 | break; | ||
| 421 | } | ||
| 422 | |||
| 423 | if (new < 0 || new > db->len) | ||
| 424 | return -EINVAL; | ||
| 425 | |||
| 426 | return (file->f_pos = new); | ||
| 427 | } | ||
| 428 | |||
| 429 | static int debug_buffer_release(struct inode *inode, struct file *file) | ||
| 430 | { | ||
| 431 | struct debug_buffer *db = (struct debug_buffer *)file->private_data; | ||
| 432 | |||
| 433 | if (db) | ||
| 434 | kfree(db->buf); | ||
| 435 | kfree(db); | ||
| 436 | |||
| 437 | return 0; | ||
| 438 | } | ||
| 439 | /* end - util funcs */ | ||
| 440 | |||
| 441 | /* begin - purge list funcs */ | ||
| 442 | static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) | ||
| 443 | { | ||
| 444 | struct dlm_lock_resource *res; | ||
| 445 | int out = 0; | ||
| 446 | unsigned long total = 0; | ||
| 447 | |||
| 448 | out += snprintf(db->buf + out, db->len - out, | ||
| 449 | "Dumping Purgelist for Domain: %s\n", dlm->name); | ||
| 450 | |||
| 451 | spin_lock(&dlm->spinlock); | ||
| 452 | list_for_each_entry(res, &dlm->purge_list, purge) { | ||
| 453 | ++total; | ||
| 454 | if (db->len - out < 100) | ||
| 455 | continue; | ||
| 456 | spin_lock(&res->spinlock); | ||
| 457 | out += stringify_lockname(res->lockname.name, | ||
| 458 | res->lockname.len, | ||
| 459 | db->buf + out, db->len - out); | ||
| 460 | out += snprintf(db->buf + out, db->len - out, "\t%ld\n", | ||
| 461 | (jiffies - res->last_used)/HZ); | ||
| 462 | spin_unlock(&res->spinlock); | ||
| 463 | } | ||
| 464 | spin_unlock(&dlm->spinlock); | ||
| 465 | |||
| 466 | out += snprintf(db->buf + out, db->len - out, | ||
| 467 | "Total on list: %ld\n", total); | ||
| 468 | |||
| 469 | return out; | ||
| 470 | } | ||
| 471 | |||
| 472 | static int debug_purgelist_open(struct inode *inode, struct file *file) | ||
| 473 | { | ||
| 474 | struct dlm_ctxt *dlm = inode->i_private; | ||
| 475 | struct debug_buffer *db; | ||
| 476 | |||
| 477 | db = debug_buffer_allocate(); | ||
| 478 | if (!db) | ||
| 479 | goto bail; | ||
| 480 | |||
| 481 | db->len = debug_purgelist_print(dlm, db); | ||
| 482 | |||
| 483 | file->private_data = db; | ||
| 484 | |||
| 485 | return 0; | ||
| 486 | bail: | ||
| 487 | return -ENOMEM; | ||
| 488 | } | ||
| 489 | |||
| 490 | static struct file_operations debug_purgelist_fops = { | ||
| 491 | .open = debug_purgelist_open, | ||
| 492 | .release = debug_buffer_release, | ||
| 493 | .read = debug_buffer_read, | ||
| 494 | .llseek = debug_buffer_llseek, | ||
| 495 | }; | ||
| 496 | /* end - purge list funcs */ | ||
| 497 | |||
| 498 | /* begin - debug mle funcs */ | ||
| 499 | static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) | ||
| 500 | { | ||
| 501 | struct dlm_master_list_entry *mle; | ||
| 502 | int out = 0; | ||
| 503 | unsigned long total = 0; | ||
| 504 | |||
| 505 | out += snprintf(db->buf + out, db->len - out, | ||
| 506 | "Dumping MLEs for Domain: %s\n", dlm->name); | ||
| 507 | |||
| 508 | spin_lock(&dlm->master_lock); | ||
| 509 | list_for_each_entry(mle, &dlm->master_list, list) { | ||
| 510 | ++total; | ||
| 511 | if (db->len - out < 200) | ||
| 512 | continue; | ||
| 513 | out += dump_mle(mle, db->buf + out, db->len - out); | ||
| 514 | } | ||
| 515 | spin_unlock(&dlm->master_lock); | ||
| 516 | |||
| 517 | out += snprintf(db->buf + out, db->len - out, | ||
| 518 | "Total on list: %ld\n", total); | ||
| 519 | return out; | ||
| 520 | } | ||
| 521 | |||
| 522 | static int debug_mle_open(struct inode *inode, struct file *file) | ||
| 523 | { | ||
| 524 | struct dlm_ctxt *dlm = inode->i_private; | ||
| 525 | struct debug_buffer *db; | ||
| 526 | |||
| 527 | db = debug_buffer_allocate(); | ||
| 528 | if (!db) | ||
| 529 | goto bail; | ||
| 530 | |||
| 531 | db->len = debug_mle_print(dlm, db); | ||
| 532 | |||
| 533 | file->private_data = db; | ||
| 534 | |||
| 535 | return 0; | ||
| 536 | bail: | ||
| 537 | return -ENOMEM; | ||
| 538 | } | ||
| 539 | |||
| 540 | static struct file_operations debug_mle_fops = { | ||
| 541 | .open = debug_mle_open, | ||
| 542 | .release = debug_buffer_release, | ||
| 543 | .read = debug_buffer_read, | ||
| 544 | .llseek = debug_buffer_llseek, | ||
| 545 | }; | ||
| 546 | |||
| 547 | /* end - debug mle funcs */ | ||
| 548 | |||
| 549 | /* begin - debug lockres funcs */ | ||
| 550 | static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) | ||
| 551 | { | ||
| 552 | int out; | ||
| 553 | |||
| 554 | #define DEBUG_LOCK_VERSION 1 | ||
| 555 | spin_lock(&lock->spinlock); | ||
| 556 | out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," | ||
| 557 | "%d,%d,%d,%d\n", | ||
| 558 | DEBUG_LOCK_VERSION, | ||
| 559 | list_type, lock->ml.type, lock->ml.convert_type, | ||
| 560 | lock->ml.node, | ||
| 561 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
| 562 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
| 563 | !list_empty(&lock->ast_list), | ||
| 564 | !list_empty(&lock->bast_list), | ||
| 565 | lock->ast_pending, lock->bast_pending, | ||
| 566 | lock->convert_pending, lock->lock_pending, | ||
| 567 | lock->cancel_pending, lock->unlock_pending, | ||
| 568 | atomic_read(&lock->lock_refs.refcount)); | ||
| 569 | spin_unlock(&lock->spinlock); | ||
| 570 | |||
| 571 | return out; | ||
| 572 | } | ||
| 573 | |||
| 574 | static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) | ||
| 575 | { | ||
| 576 | struct dlm_lock *lock; | ||
| 577 | int i; | ||
| 578 | int out = 0; | ||
| 579 | |||
| 580 | out += snprintf(buf + out, len - out, "NAME:"); | ||
| 581 | out += stringify_lockname(res->lockname.name, res->lockname.len, | ||
| 582 | buf + out, len - out); | ||
| 583 | out += snprintf(buf + out, len - out, "\n"); | ||
| 584 | |||
| 585 | #define DEBUG_LRES_VERSION 1 | ||
| 586 | out += snprintf(buf + out, len - out, | ||
| 587 | "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n", | ||
| 588 | DEBUG_LRES_VERSION, | ||
| 589 | res->owner, res->state, res->last_used, | ||
| 590 | !list_empty(&res->purge), | ||
| 591 | !list_empty(&res->dirty), | ||
| 592 | !list_empty(&res->recovering), | ||
| 593 | res->inflight_locks, res->migration_pending, | ||
| 594 | atomic_read(&res->asts_reserved), | ||
| 595 | atomic_read(&res->refs.refcount)); | ||
| 596 | |||
| 597 | /* refmap */ | ||
| 598 | out += snprintf(buf + out, len - out, "RMAP:"); | ||
| 599 | out += stringify_nodemap(res->refmap, O2NM_MAX_NODES, | ||
| 600 | buf + out, len - out); | ||
| 601 | out += snprintf(buf + out, len - out, "\n"); | ||
| 602 | |||
| 603 | /* lvb */ | ||
| 604 | out += snprintf(buf + out, len - out, "LVBX:"); | ||
| 605 | for (i = 0; i < DLM_LVB_LEN; i++) | ||
| 606 | out += snprintf(buf + out, len - out, | ||
| 607 | "%02x", (unsigned char)res->lvb[i]); | ||
| 608 | out += snprintf(buf + out, len - out, "\n"); | ||
| 609 | |||
| 610 | /* granted */ | ||
| 611 | list_for_each_entry(lock, &res->granted, list) | ||
| 612 | out += dump_lock(lock, 0, buf + out, len - out); | ||
| 613 | |||
| 614 | /* converting */ | ||
| 615 | list_for_each_entry(lock, &res->converting, list) | ||
| 616 | out += dump_lock(lock, 1, buf + out, len - out); | ||
| 617 | |||
| 618 | /* blocked */ | ||
| 619 | list_for_each_entry(lock, &res->blocked, list) | ||
| 620 | out += dump_lock(lock, 2, buf + out, len - out); | ||
| 621 | |||
| 622 | out += snprintf(buf + out, len - out, "\n"); | ||
| 623 | |||
| 624 | return out; | ||
| 625 | } | ||
| 626 | |||
| 627 | static void *lockres_seq_start(struct seq_file *m, loff_t *pos) | ||
| 628 | { | ||
| 629 | struct debug_lockres *dl = m->private; | ||
| 630 | struct dlm_ctxt *dlm = dl->dl_ctxt; | ||
| 631 | struct dlm_lock_resource *res = NULL; | ||
| 632 | |||
| 633 | spin_lock(&dlm->spinlock); | ||
| 634 | |||
| 635 | if (dl->dl_res) { | ||
| 636 | list_for_each_entry(res, &dl->dl_res->tracking, tracking) { | ||
| 637 | if (dl->dl_res) { | ||
| 638 | dlm_lockres_put(dl->dl_res); | ||
| 639 | dl->dl_res = NULL; | ||
| 640 | } | ||
| 641 | if (&res->tracking == &dlm->tracking_list) { | ||
| 642 | mlog(0, "End of list found, %p\n", res); | ||
| 643 | dl = NULL; | ||
| 644 | break; | ||
| 645 | } | ||
| 646 | dlm_lockres_get(res); | ||
| 647 | dl->dl_res = res; | ||
| 648 | break; | ||
| 649 | } | ||
| 650 | } else { | ||
| 651 | if (!list_empty(&dlm->tracking_list)) { | ||
| 652 | list_for_each_entry(res, &dlm->tracking_list, tracking) | ||
| 653 | break; | ||
| 654 | dlm_lockres_get(res); | ||
| 655 | dl->dl_res = res; | ||
| 656 | } else | ||
| 657 | dl = NULL; | ||
| 658 | } | ||
| 659 | |||
| 660 | if (dl) { | ||
| 661 | spin_lock(&dl->dl_res->spinlock); | ||
| 662 | dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1); | ||
| 663 | spin_unlock(&dl->dl_res->spinlock); | ||
| 664 | } | ||
| 665 | |||
| 666 | spin_unlock(&dlm->spinlock); | ||
| 667 | |||
| 668 | return dl; | ||
| 669 | } | ||
| 670 | |||
| 671 | static void lockres_seq_stop(struct seq_file *m, void *v) | ||
| 672 | { | ||
| 673 | } | ||
| 674 | |||
| 675 | static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 676 | { | ||
| 677 | return NULL; | ||
| 678 | } | ||
| 679 | |||
| 680 | static int lockres_seq_show(struct seq_file *s, void *v) | ||
| 681 | { | ||
| 682 | struct debug_lockres *dl = (struct debug_lockres *)v; | ||
| 683 | |||
| 684 | seq_printf(s, "%s", dl->dl_buf); | ||
| 685 | |||
| 686 | return 0; | ||
| 687 | } | ||
| 688 | |||
| 689 | static struct seq_operations debug_lockres_ops = { | ||
| 690 | .start = lockres_seq_start, | ||
| 691 | .stop = lockres_seq_stop, | ||
| 692 | .next = lockres_seq_next, | ||
| 693 | .show = lockres_seq_show, | ||
| 694 | }; | ||
| 695 | |||
| 696 | static int debug_lockres_open(struct inode *inode, struct file *file) | ||
| 697 | { | ||
| 698 | struct dlm_ctxt *dlm = inode->i_private; | ||
| 699 | int ret = -ENOMEM; | ||
| 700 | struct seq_file *seq; | ||
| 701 | struct debug_lockres *dl = NULL; | ||
| 702 | |||
| 703 | dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); | ||
| 704 | if (!dl) { | ||
| 705 | mlog_errno(ret); | ||
| 706 | goto bail; | ||
| 707 | } | ||
| 708 | |||
| 709 | dl->dl_len = PAGE_SIZE; | ||
| 710 | dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); | ||
| 711 | if (!dl->dl_buf) { | ||
| 712 | mlog_errno(ret); | ||
| 713 | goto bail; | ||
| 714 | } | ||
| 715 | |||
| 716 | ret = seq_open(file, &debug_lockres_ops); | ||
| 717 | if (ret) { | ||
| 718 | mlog_errno(ret); | ||
| 719 | goto bail; | ||
| 720 | } | ||
| 721 | |||
| 722 | seq = (struct seq_file *) file->private_data; | ||
| 723 | seq->private = dl; | ||
| 724 | |||
| 725 | dlm_grab(dlm); | ||
| 726 | dl->dl_ctxt = dlm; | ||
| 727 | |||
| 728 | return 0; | ||
| 729 | bail: | ||
| 730 | if (dl) | ||
| 731 | kfree(dl->dl_buf); | ||
| 732 | kfree(dl); | ||
| 733 | return ret; | ||
| 734 | } | ||
| 735 | |||
| 736 | static int debug_lockres_release(struct inode *inode, struct file *file) | ||
| 737 | { | ||
| 738 | struct seq_file *seq = (struct seq_file *)file->private_data; | ||
| 739 | struct debug_lockres *dl = (struct debug_lockres *)seq->private; | ||
| 740 | |||
| 741 | if (dl->dl_res) | ||
| 742 | dlm_lockres_put(dl->dl_res); | ||
| 743 | dlm_put(dl->dl_ctxt); | ||
| 744 | kfree(dl->dl_buf); | ||
| 745 | return seq_release_private(inode, file); | ||
| 746 | } | ||
| 747 | |||
| 748 | static struct file_operations debug_lockres_fops = { | ||
| 749 | .open = debug_lockres_open, | ||
| 750 | .release = debug_lockres_release, | ||
| 751 | .read = seq_read, | ||
| 752 | .llseek = seq_lseek, | ||
| 753 | }; | ||
| 754 | /* end - debug lockres funcs */ | ||
| 755 | |||
| 756 | /* begin - debug state funcs */ | ||
| 757 | static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) | ||
| 758 | { | ||
| 759 | int out = 0; | ||
| 760 | struct dlm_reco_node_data *node; | ||
| 761 | char *state; | ||
| 762 | int lres, rres, ures, tres; | ||
| 763 | |||
| 764 | lres = atomic_read(&dlm->local_resources); | ||
| 765 | rres = atomic_read(&dlm->remote_resources); | ||
| 766 | ures = atomic_read(&dlm->unknown_resources); | ||
| 767 | tres = lres + rres + ures; | ||
| 768 | |||
| 769 | spin_lock(&dlm->spinlock); | ||
| 770 | |||
| 771 | switch (dlm->dlm_state) { | ||
| 772 | case DLM_CTXT_NEW: | ||
| 773 | state = "NEW"; break; | ||
| 774 | case DLM_CTXT_JOINED: | ||
| 775 | state = "JOINED"; break; | ||
| 776 | case DLM_CTXT_IN_SHUTDOWN: | ||
| 777 | state = "SHUTDOWN"; break; | ||
| 778 | case DLM_CTXT_LEAVING: | ||
| 779 | state = "LEAVING"; break; | ||
| 780 | default: | ||
| 781 | state = "UNKNOWN"; break; | ||
| 782 | } | ||
| 783 | |||
| 784 | /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ | ||
| 785 | out += snprintf(db->buf + out, db->len - out, | ||
| 786 | "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); | ||
| 787 | |||
| 788 | /* Thread Pid: xxx Node: xxx State: xxxxx */ | ||
| 789 | out += snprintf(db->buf + out, db->len - out, | ||
| 790 | "Thread Pid: %d Node: %d State: %s\n", | ||
| 791 | dlm->dlm_thread_task->pid, dlm->node_num, state); | ||
| 792 | |||
| 793 | /* Number of Joins: xxx Joining Node: xxx */ | ||
| 794 | out += snprintf(db->buf + out, db->len - out, | ||
| 795 | "Number of Joins: %d Joining Node: %d\n", | ||
| 796 | dlm->num_joins, dlm->joining_node); | ||
| 797 | |||
| 798 | /* Domain Map: xx xx xx */ | ||
| 799 | out += snprintf(db->buf + out, db->len - out, "Domain Map: "); | ||
| 800 | out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, | ||
| 801 | db->buf + out, db->len - out); | ||
| 802 | out += snprintf(db->buf + out, db->len - out, "\n"); | ||
| 803 | |||
| 804 | /* Live Map: xx xx xx */ | ||
| 805 | out += snprintf(db->buf + out, db->len - out, "Live Map: "); | ||
| 806 | out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, | ||
| 807 | db->buf + out, db->len - out); | ||
| 808 | out += snprintf(db->buf + out, db->len - out, "\n"); | ||
| 809 | |||
| 810 | /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */ | ||
| 811 | out += snprintf(db->buf + out, db->len - out, | ||
| 812 | "Mastered Resources Total: %d Locally: %d " | ||
| 813 | "Remotely: %d Unknown: %d\n", | ||
| 814 | tres, lres, rres, ures); | ||
| 815 | |||
| 816 | /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ | ||
| 817 | out += snprintf(db->buf + out, db->len - out, | ||
| 818 | "Lists: Dirty=%s Purge=%s PendingASTs=%s " | ||
| 819 | "PendingBASTs=%s Master=%s\n", | ||
| 820 | (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), | ||
| 821 | (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), | ||
| 822 | (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), | ||
| 823 | (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"), | ||
| 824 | (list_empty(&dlm->master_list) ? "Empty" : "InUse")); | ||
| 825 | |||
| 826 | /* Purge Count: xxx Refs: xxx */ | ||
| 827 | out += snprintf(db->buf + out, db->len - out, | ||
| 828 | "Purge Count: %d Refs: %d\n", dlm->purge_count, | ||
| 829 | atomic_read(&dlm->dlm_refs.refcount)); | ||
| 830 | |||
| 831 | /* Dead Node: xxx */ | ||
| 832 | out += snprintf(db->buf + out, db->len - out, | ||
| 833 | "Dead Node: %d\n", dlm->reco.dead_node); | ||
| 834 | |||
| 835 | /* What about DLM_RECO_STATE_FINALIZE? */ | ||
| 836 | if (dlm->reco.state == DLM_RECO_STATE_ACTIVE) | ||
| 837 | state = "ACTIVE"; | ||
| 838 | else | ||
| 839 | state = "INACTIVE"; | ||
| 840 | |||
| 841 | /* Recovery Pid: xxxx Master: xxx State: xxxx */ | ||
| 842 | out += snprintf(db->buf + out, db->len - out, | ||
| 843 | "Recovery Pid: %d Master: %d State: %s\n", | ||
| 844 | dlm->dlm_reco_thread_task->pid, | ||
| 845 | dlm->reco.new_master, state); | ||
| 846 | |||
| 847 | /* Recovery Map: xx xx */ | ||
| 848 | out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); | ||
| 849 | out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, | ||
| 850 | db->buf + out, db->len - out); | ||
| 851 | out += snprintf(db->buf + out, db->len - out, "\n"); | ||
| 852 | |||
| 853 | /* Recovery Node State: */ | ||
| 854 | out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); | ||
| 855 | list_for_each_entry(node, &dlm->reco.node_data, list) { | ||
| 856 | switch (node->state) { | ||
| 857 | case DLM_RECO_NODE_DATA_INIT: | ||
| 858 | state = "INIT"; | ||
| 859 | break; | ||
| 860 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
| 861 | state = "REQUESTING"; | ||
| 862 | break; | ||
| 863 | case DLM_RECO_NODE_DATA_DEAD: | ||
| 864 | state = "DEAD"; | ||
| 865 | break; | ||
| 866 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
| 867 | state = "RECEIVING"; | ||
| 868 | break; | ||
| 869 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
| 870 | state = "REQUESTED"; | ||
| 871 | break; | ||
| 872 | case DLM_RECO_NODE_DATA_DONE: | ||
| 873 | state = "DONE"; | ||
| 874 | break; | ||
| 875 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
| 876 | state = "FINALIZE-SENT"; | ||
| 877 | break; | ||
| 878 | default: | ||
| 879 | state = "BAD"; | ||
| 880 | break; | ||
| 881 | } | ||
| 882 | out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", | ||
| 883 | node->node_num, state); | ||
| 884 | } | ||
| 885 | |||
| 886 | spin_unlock(&dlm->spinlock); | ||
| 887 | |||
| 888 | return out; | ||
| 889 | } | ||
| 890 | |||
| 891 | static int debug_state_open(struct inode *inode, struct file *file) | ||
| 892 | { | ||
| 893 | struct dlm_ctxt *dlm = inode->i_private; | ||
| 894 | struct debug_buffer *db = NULL; | ||
| 895 | |||
| 896 | db = debug_buffer_allocate(); | ||
| 897 | if (!db) | ||
| 898 | goto bail; | ||
| 899 | |||
| 900 | db->len = debug_state_print(dlm, db); | ||
| 901 | |||
| 902 | file->private_data = db; | ||
| 903 | |||
| 904 | return 0; | ||
| 905 | bail: | ||
| 906 | return -ENOMEM; | ||
| 907 | } | ||
| 908 | |||
| 909 | static struct file_operations debug_state_fops = { | ||
| 910 | .open = debug_state_open, | ||
| 911 | .release = debug_buffer_release, | ||
| 912 | .read = debug_buffer_read, | ||
| 913 | .llseek = debug_buffer_llseek, | ||
| 914 | }; | ||
| 915 | /* end - debug state funcs */ | ||
| 916 | |||
| 917 | /* files in subroot */ | ||
| 918 | int dlm_debug_init(struct dlm_ctxt *dlm) | ||
| 919 | { | ||
| 920 | struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; | ||
| 921 | |||
| 922 | /* for dumping dlm_ctxt */ | ||
| 923 | dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, | ||
| 924 | S_IFREG|S_IRUSR, | ||
| 925 | dlm->dlm_debugfs_subroot, | ||
| 926 | dlm, &debug_state_fops); | ||
| 927 | if (!dc->debug_state_dentry) { | ||
| 928 | mlog_errno(-ENOMEM); | ||
| 929 | goto bail; | ||
| 930 | } | ||
| 931 | |||
| 932 | /* for dumping lockres */ | ||
| 933 | dc->debug_lockres_dentry = | ||
| 934 | debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, | ||
| 935 | S_IFREG|S_IRUSR, | ||
| 936 | dlm->dlm_debugfs_subroot, | ||
| 937 | dlm, &debug_lockres_fops); | ||
| 938 | if (!dc->debug_lockres_dentry) { | ||
| 939 | mlog_errno(-ENOMEM); | ||
| 940 | goto bail; | ||
| 941 | } | ||
| 942 | |||
| 943 | /* for dumping mles */ | ||
| 944 | dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, | ||
| 945 | S_IFREG|S_IRUSR, | ||
| 946 | dlm->dlm_debugfs_subroot, | ||
| 947 | dlm, &debug_mle_fops); | ||
| 948 | if (!dc->debug_mle_dentry) { | ||
| 949 | mlog_errno(-ENOMEM); | ||
| 950 | goto bail; | ||
| 951 | } | ||
| 952 | |||
| 953 | /* for dumping lockres on the purge list */ | ||
| 954 | dc->debug_purgelist_dentry = | ||
| 955 | debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, | ||
| 956 | S_IFREG|S_IRUSR, | ||
| 957 | dlm->dlm_debugfs_subroot, | ||
| 958 | dlm, &debug_purgelist_fops); | ||
| 959 | if (!dc->debug_purgelist_dentry) { | ||
| 960 | mlog_errno(-ENOMEM); | ||
| 961 | goto bail; | ||
| 962 | } | ||
| 963 | |||
| 964 | dlm_debug_get(dc); | ||
| 965 | return 0; | ||
| 966 | |||
| 967 | bail: | ||
| 968 | dlm_debug_shutdown(dlm); | ||
| 969 | return -ENOMEM; | ||
| 970 | } | ||
| 971 | |||
| 972 | void dlm_debug_shutdown(struct dlm_ctxt *dlm) | ||
| 973 | { | ||
| 974 | struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; | ||
| 975 | |||
| 976 | if (dc) { | ||
| 977 | if (dc->debug_purgelist_dentry) | ||
| 978 | debugfs_remove(dc->debug_purgelist_dentry); | ||
| 979 | if (dc->debug_mle_dentry) | ||
| 980 | debugfs_remove(dc->debug_mle_dentry); | ||
| 981 | if (dc->debug_lockres_dentry) | ||
| 982 | debugfs_remove(dc->debug_lockres_dentry); | ||
| 983 | if (dc->debug_state_dentry) | ||
| 984 | debugfs_remove(dc->debug_state_dentry); | ||
| 985 | dlm_debug_put(dc); | ||
| 986 | } | ||
| 987 | } | ||
| 988 | |||
| 989 | /* subroot - domain dir */ | ||
| 990 | int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) | ||
| 991 | { | ||
| 992 | dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, | ||
| 993 | dlm_debugfs_root); | ||
| 994 | if (!dlm->dlm_debugfs_subroot) { | ||
| 995 | mlog_errno(-ENOMEM); | ||
| 996 | goto bail; | ||
| 997 | } | ||
| 998 | |||
| 999 | dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), | ||
| 1000 | GFP_KERNEL); | ||
| 1001 | if (!dlm->dlm_debug_ctxt) { | ||
| 1002 | mlog_errno(-ENOMEM); | ||
| 1003 | goto bail; | ||
| 1004 | } | ||
| 1005 | kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); | ||
| 1006 | |||
| 1007 | return 0; | ||
| 1008 | bail: | ||
| 1009 | dlm_destroy_debugfs_subroot(dlm); | ||
| 1010 | return -ENOMEM; | ||
| 1011 | } | ||
| 1012 | |||
| 1013 | void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) | ||
| 1014 | { | ||
| 1015 | if (dlm->dlm_debugfs_subroot) | ||
| 1016 | debugfs_remove(dlm->dlm_debugfs_subroot); | ||
| 1017 | } | ||
| 1018 | |||
| 1019 | /* debugfs root */ | ||
| 1020 | int dlm_create_debugfs_root(void) | ||
| 1021 | { | ||
| 1022 | dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL); | ||
| 1023 | if (!dlm_debugfs_root) { | ||
| 1024 | mlog_errno(-ENOMEM); | ||
| 1025 | return -ENOMEM; | ||
| 1026 | } | ||
| 1027 | return 0; | ||
| 1028 | } | ||
| 1029 | |||
| 1030 | void dlm_destroy_debugfs_root(void) | ||
| 1031 | { | ||
| 1032 | if (dlm_debugfs_root) | ||
| 1033 | debugfs_remove(dlm_debugfs_root); | ||
| 1034 | } | ||
| 1035 | #endif /* CONFIG_DEBUG_FS */ | ||
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h new file mode 100644 index 000000000000..d34a62a3a625 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.h | |||
| @@ -0,0 +1,86 @@ | |||
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
| 3 | * | ||
| 4 | * dlmdebug.h | ||
| 5 | * | ||
| 6 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU General Public | ||
| 10 | * License as published by the Free Software Foundation; either | ||
| 11 | * version 2 of the License, or (at your option) any later version. | ||
| 12 | * | ||
| 13 | * This program is distributed in the hope that it will be useful, | ||
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 16 | * General Public License for more details. | ||
| 17 | * | ||
| 18 | * You should have received a copy of the GNU General Public | ||
| 19 | * License along with this program; if not, write to the | ||
| 20 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 21 | * Boston, MA 021110-1307, USA. | ||
| 22 | * | ||
| 23 | */ | ||
| 24 | |||
| 25 | #ifndef DLMDEBUG_H | ||
| 26 | #define DLMDEBUG_H | ||
| 27 | |||
| 28 | void dlm_print_one_mle(struct dlm_master_list_entry *mle); | ||
| 29 | |||
| 30 | #ifdef CONFIG_DEBUG_FS | ||
| 31 | |||
| 32 | struct dlm_debug_ctxt { | ||
| 33 | struct kref debug_refcnt; | ||
| 34 | struct dentry *debug_state_dentry; | ||
| 35 | struct dentry *debug_lockres_dentry; | ||
| 36 | struct dentry *debug_mle_dentry; | ||
| 37 | struct dentry *debug_purgelist_dentry; | ||
| 38 | }; | ||
| 39 | |||
| 40 | struct debug_buffer { | ||
| 41 | int len; | ||
| 42 | char *buf; | ||
| 43 | }; | ||
| 44 | |||
| 45 | struct debug_lockres { | ||
| 46 | int dl_len; | ||
| 47 | char *dl_buf; | ||
| 48 | struct dlm_ctxt *dl_ctxt; | ||
| 49 | struct dlm_lock_resource *dl_res; | ||
| 50 | }; | ||
| 51 | |||
| 52 | int dlm_debug_init(struct dlm_ctxt *dlm); | ||
| 53 | void dlm_debug_shutdown(struct dlm_ctxt *dlm); | ||
| 54 | |||
| 55 | int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); | ||
| 56 | void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); | ||
| 57 | |||
| 58 | int dlm_create_debugfs_root(void); | ||
| 59 | void dlm_destroy_debugfs_root(void); | ||
| 60 | |||
| 61 | #else | ||
| 62 | |||
| 63 | static int dlm_debug_init(struct dlm_ctxt *dlm) | ||
| 64 | { | ||
| 65 | return 0; | ||
| 66 | } | ||
| 67 | static void dlm_debug_shutdown(struct dlm_ctxt *dlm) | ||
| 68 | { | ||
| 69 | } | ||
| 70 | static int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) | ||
| 71 | { | ||
| 72 | return 0; | ||
| 73 | } | ||
| 74 | static void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) | ||
| 75 | { | ||
| 76 | } | ||
| 77 | static int dlm_create_debugfs_root(void) | ||
| 78 | { | ||
| 79 | return 0; | ||
| 80 | } | ||
| 81 | static void dlm_destroy_debugfs_root(void) | ||
| 82 | { | ||
| 83 | } | ||
| 84 | |||
| 85 | #endif /* CONFIG_DEBUG_FS */ | ||
| 86 | #endif /* DLMDEBUG_H */ | ||
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 0879d86113e3..63f8125824e8 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <linux/spinlock.h> | 33 | #include <linux/spinlock.h> |
| 34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
| 35 | #include <linux/err.h> | 35 | #include <linux/err.h> |
| 36 | #include <linux/debugfs.h> | ||
| 36 | 37 | ||
| 37 | #include "cluster/heartbeat.h" | 38 | #include "cluster/heartbeat.h" |
| 38 | #include "cluster/nodemanager.h" | 39 | #include "cluster/nodemanager.h" |
| @@ -40,8 +41,8 @@ | |||
| 40 | 41 | ||
| 41 | #include "dlmapi.h" | 42 | #include "dlmapi.h" |
| 42 | #include "dlmcommon.h" | 43 | #include "dlmcommon.h" |
| 43 | |||
| 44 | #include "dlmdomain.h" | 44 | #include "dlmdomain.h" |
| 45 | #include "dlmdebug.h" | ||
| 45 | 46 | ||
| 46 | #include "dlmver.h" | 47 | #include "dlmver.h" |
| 47 | 48 | ||
| @@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain) | |||
| 298 | 299 | ||
| 299 | static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) | 300 | static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) |
| 300 | { | 301 | { |
| 302 | dlm_destroy_debugfs_subroot(dlm); | ||
| 303 | |||
| 301 | if (dlm->lockres_hash) | 304 | if (dlm->lockres_hash) |
| 302 | dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); | 305 | dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); |
| 303 | 306 | ||
| @@ -395,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) | |||
| 395 | static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) | 398 | static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) |
| 396 | { | 399 | { |
| 397 | dlm_unregister_domain_handlers(dlm); | 400 | dlm_unregister_domain_handlers(dlm); |
| 401 | dlm_debug_shutdown(dlm); | ||
| 398 | dlm_complete_thread(dlm); | 402 | dlm_complete_thread(dlm); |
| 399 | dlm_complete_recovery_thread(dlm); | 403 | dlm_complete_recovery_thread(dlm); |
| 400 | dlm_destroy_dlm_worker(dlm); | 404 | dlm_destroy_dlm_worker(dlm); |
| @@ -644,6 +648,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm) | |||
| 644 | void dlm_unregister_domain(struct dlm_ctxt *dlm) | 648 | void dlm_unregister_domain(struct dlm_ctxt *dlm) |
| 645 | { | 649 | { |
| 646 | int leave = 0; | 650 | int leave = 0; |
| 651 | struct dlm_lock_resource *res; | ||
| 647 | 652 | ||
| 648 | spin_lock(&dlm_domain_lock); | 653 | spin_lock(&dlm_domain_lock); |
| 649 | BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); | 654 | BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); |
| @@ -673,6 +678,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) | |||
| 673 | msleep(500); | 678 | msleep(500); |
| 674 | mlog(0, "%s: more migration to do\n", dlm->name); | 679 | mlog(0, "%s: more migration to do\n", dlm->name); |
| 675 | } | 680 | } |
| 681 | |||
| 682 | /* This list should be empty. If not, print remaining lockres */ | ||
| 683 | if (!list_empty(&dlm->tracking_list)) { | ||
| 684 | mlog(ML_ERROR, "Following lockres' are still on the " | ||
| 685 | "tracking list:\n"); | ||
| 686 | list_for_each_entry(res, &dlm->tracking_list, tracking) | ||
| 687 | dlm_print_one_lock_resource(res); | ||
| 688 | } | ||
| 689 | |||
| 676 | dlm_mark_domain_leaving(dlm); | 690 | dlm_mark_domain_leaving(dlm); |
| 677 | dlm_leave_domain(dlm); | 691 | dlm_leave_domain(dlm); |
| 678 | dlm_complete_dlm_shutdown(dlm); | 692 | dlm_complete_dlm_shutdown(dlm); |
| @@ -1405,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) | |||
| 1405 | goto bail; | 1419 | goto bail; |
| 1406 | } | 1420 | } |
| 1407 | 1421 | ||
| 1422 | status = dlm_debug_init(dlm); | ||
| 1423 | if (status < 0) { | ||
| 1424 | mlog_errno(status); | ||
| 1425 | goto bail; | ||
| 1426 | } | ||
| 1427 | |||
| 1408 | status = dlm_launch_thread(dlm); | 1428 | status = dlm_launch_thread(dlm); |
| 1409 | if (status < 0) { | 1429 | if (status < 0) { |
| 1410 | mlog_errno(status); | 1430 | mlog_errno(status); |
| @@ -1472,6 +1492,7 @@ bail: | |||
| 1472 | 1492 | ||
| 1473 | if (status) { | 1493 | if (status) { |
| 1474 | dlm_unregister_domain_handlers(dlm); | 1494 | dlm_unregister_domain_handlers(dlm); |
| 1495 | dlm_debug_shutdown(dlm); | ||
| 1475 | dlm_complete_thread(dlm); | 1496 | dlm_complete_thread(dlm); |
| 1476 | dlm_complete_recovery_thread(dlm); | 1497 | dlm_complete_recovery_thread(dlm); |
| 1477 | dlm_destroy_dlm_worker(dlm); | 1498 | dlm_destroy_dlm_worker(dlm); |
| @@ -1484,6 +1505,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
| 1484 | u32 key) | 1505 | u32 key) |
| 1485 | { | 1506 | { |
| 1486 | int i; | 1507 | int i; |
| 1508 | int ret; | ||
| 1487 | struct dlm_ctxt *dlm = NULL; | 1509 | struct dlm_ctxt *dlm = NULL; |
| 1488 | 1510 | ||
| 1489 | dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); | 1511 | dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); |
| @@ -1516,6 +1538,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
| 1516 | dlm->key = key; | 1538 | dlm->key = key; |
| 1517 | dlm->node_num = o2nm_this_node(); | 1539 | dlm->node_num = o2nm_this_node(); |
| 1518 | 1540 | ||
| 1541 | ret = dlm_create_debugfs_subroot(dlm); | ||
| 1542 | if (ret < 0) { | ||
| 1543 | dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); | ||
| 1544 | kfree(dlm->name); | ||
| 1545 | kfree(dlm); | ||
| 1546 | dlm = NULL; | ||
| 1547 | goto leave; | ||
| 1548 | } | ||
| 1549 | |||
| 1519 | spin_lock_init(&dlm->spinlock); | 1550 | spin_lock_init(&dlm->spinlock); |
| 1520 | spin_lock_init(&dlm->master_lock); | 1551 | spin_lock_init(&dlm->master_lock); |
| 1521 | spin_lock_init(&dlm->ast_lock); | 1552 | spin_lock_init(&dlm->ast_lock); |
| @@ -1526,6 +1557,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
| 1526 | INIT_LIST_HEAD(&dlm->reco.node_data); | 1557 | INIT_LIST_HEAD(&dlm->reco.node_data); |
| 1527 | INIT_LIST_HEAD(&dlm->purge_list); | 1558 | INIT_LIST_HEAD(&dlm->purge_list); |
| 1528 | INIT_LIST_HEAD(&dlm->dlm_domain_handlers); | 1559 | INIT_LIST_HEAD(&dlm->dlm_domain_handlers); |
| 1560 | INIT_LIST_HEAD(&dlm->tracking_list); | ||
| 1529 | dlm->reco.state = 0; | 1561 | dlm->reco.state = 0; |
| 1530 | 1562 | ||
| 1531 | INIT_LIST_HEAD(&dlm->pending_asts); | 1563 | INIT_LIST_HEAD(&dlm->pending_asts); |
| @@ -1816,21 +1848,49 @@ static int __init dlm_init(void) | |||
| 1816 | dlm_print_version(); | 1848 | dlm_print_version(); |
| 1817 | 1849 | ||
| 1818 | status = dlm_init_mle_cache(); | 1850 | status = dlm_init_mle_cache(); |
| 1819 | if (status) | 1851 | if (status) { |
| 1820 | return -1; | 1852 | mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); |
| 1853 | goto error; | ||
| 1854 | } | ||
| 1855 | |||
| 1856 | status = dlm_init_master_caches(); | ||
| 1857 | if (status) { | ||
| 1858 | mlog(ML_ERROR, "Could not create o2dlm_lockres and " | ||
| 1859 | "o2dlm_lockname slabcaches\n"); | ||
| 1860 | goto error; | ||
| 1861 | } | ||
| 1862 | |||
| 1863 | status = dlm_init_lock_cache(); | ||
| 1864 | if (status) { | ||
| 1865 | mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n"); | ||
| 1866 | goto error; | ||
| 1867 | } | ||
| 1821 | 1868 | ||
| 1822 | status = dlm_register_net_handlers(); | 1869 | status = dlm_register_net_handlers(); |
| 1823 | if (status) { | 1870 | if (status) { |
| 1824 | dlm_destroy_mle_cache(); | 1871 | mlog(ML_ERROR, "Unable to register network handlers\n"); |
| 1825 | return -1; | 1872 | goto error; |
| 1826 | } | 1873 | } |
| 1827 | 1874 | ||
| 1875 | status = dlm_create_debugfs_root(); | ||
| 1876 | if (status) | ||
| 1877 | goto error; | ||
| 1878 | |||
| 1828 | return 0; | 1879 | return 0; |
| 1880 | error: | ||
| 1881 | dlm_unregister_net_handlers(); | ||
| 1882 | dlm_destroy_lock_cache(); | ||
| 1883 | dlm_destroy_master_caches(); | ||
| 1884 | dlm_destroy_mle_cache(); | ||
| 1885 | return -1; | ||
| 1829 | } | 1886 | } |
| 1830 | 1887 | ||
| 1831 | static void __exit dlm_exit (void) | 1888 | static void __exit dlm_exit (void) |
| 1832 | { | 1889 | { |
| 1890 | dlm_destroy_debugfs_root(); | ||
| 1833 | dlm_unregister_net_handlers(); | 1891 | dlm_unregister_net_handlers(); |
| 1892 | dlm_destroy_lock_cache(); | ||
| 1893 | dlm_destroy_master_caches(); | ||
| 1834 | dlm_destroy_mle_cache(); | 1894 | dlm_destroy_mle_cache(); |
| 1835 | } | 1895 | } |
| 1836 | 1896 | ||
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 52578d907d9a..83a9f2972ac8 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c | |||
| @@ -53,6 +53,8 @@ | |||
| 53 | #define MLOG_MASK_PREFIX ML_DLM | 53 | #define MLOG_MASK_PREFIX ML_DLM |
| 54 | #include "cluster/masklog.h" | 54 | #include "cluster/masklog.h" |
| 55 | 55 | ||
| 56 | static struct kmem_cache *dlm_lock_cache = NULL; | ||
| 57 | |||
| 56 | static DEFINE_SPINLOCK(dlm_cookie_lock); | 58 | static DEFINE_SPINLOCK(dlm_cookie_lock); |
| 57 | static u64 dlm_next_cookie = 1; | 59 | static u64 dlm_next_cookie = 1; |
| 58 | 60 | ||
| @@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type, | |||
| 64 | static void dlm_lock_release(struct kref *kref); | 66 | static void dlm_lock_release(struct kref *kref); |
| 65 | static void dlm_lock_detach_lockres(struct dlm_lock *lock); | 67 | static void dlm_lock_detach_lockres(struct dlm_lock *lock); |
| 66 | 68 | ||
| 69 | int dlm_init_lock_cache(void) | ||
| 70 | { | ||
| 71 | dlm_lock_cache = kmem_cache_create("o2dlm_lock", | ||
| 72 | sizeof(struct dlm_lock), | ||
| 73 | 0, SLAB_HWCACHE_ALIGN, NULL); | ||
| 74 | if (dlm_lock_cache == NULL) | ||
| 75 | return -ENOMEM; | ||
| 76 | return 0; | ||
| 77 | } | ||
| 78 | |||
| 79 | void dlm_destroy_lock_cache(void) | ||
| 80 | { | ||
| 81 | if (dlm_lock_cache) | ||
| 82 | kmem_cache_destroy(dlm_lock_cache); | ||
| 83 | } | ||
| 84 | |||
| 67 | /* Tell us whether we can grant a new lock request. | 85 | /* Tell us whether we can grant a new lock request. |
| 68 | * locking: | 86 | * locking: |
| 69 | * caller needs: res->spinlock | 87 | * caller needs: res->spinlock |
| @@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref) | |||
| 353 | mlog(0, "freeing kernel-allocated lksb\n"); | 371 | mlog(0, "freeing kernel-allocated lksb\n"); |
| 354 | kfree(lock->lksb); | 372 | kfree(lock->lksb); |
| 355 | } | 373 | } |
| 356 | kfree(lock); | 374 | kmem_cache_free(dlm_lock_cache, lock); |
| 357 | } | 375 | } |
| 358 | 376 | ||
| 359 | /* associate a lock with it's lockres, getting a ref on the lockres */ | 377 | /* associate a lock with it's lockres, getting a ref on the lockres */ |
| @@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, | |||
| 412 | struct dlm_lock *lock; | 430 | struct dlm_lock *lock; |
| 413 | int kernel_allocated = 0; | 431 | int kernel_allocated = 0; |
| 414 | 432 | ||
| 415 | lock = kzalloc(sizeof(*lock), GFP_NOFS); | 433 | lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); |
| 416 | if (!lock) | 434 | if (!lock) |
| 417 | return NULL; | 435 | return NULL; |
| 418 | 436 | ||
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ea6b89577860..efc015c6128a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
| @@ -48,47 +48,11 @@ | |||
| 48 | #include "dlmapi.h" | 48 | #include "dlmapi.h" |
| 49 | #include "dlmcommon.h" | 49 | #include "dlmcommon.h" |
| 50 | #include "dlmdomain.h" | 50 | #include "dlmdomain.h" |
| 51 | #include "dlmdebug.h" | ||
| 51 | 52 | ||
| 52 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) | 53 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) |
| 53 | #include "cluster/masklog.h" | 54 | #include "cluster/masklog.h" |
| 54 | 55 | ||
| 55 | enum dlm_mle_type { | ||
| 56 | DLM_MLE_BLOCK, | ||
| 57 | DLM_MLE_MASTER, | ||
| 58 | DLM_MLE_MIGRATION | ||
| 59 | }; | ||
| 60 | |||
| 61 | struct dlm_lock_name | ||
| 62 | { | ||
| 63 | u8 len; | ||
| 64 | u8 name[DLM_LOCKID_NAME_MAX]; | ||
| 65 | }; | ||
| 66 | |||
| 67 | struct dlm_master_list_entry | ||
| 68 | { | ||
| 69 | struct list_head list; | ||
| 70 | struct list_head hb_events; | ||
| 71 | struct dlm_ctxt *dlm; | ||
| 72 | spinlock_t spinlock; | ||
| 73 | wait_queue_head_t wq; | ||
| 74 | atomic_t woken; | ||
| 75 | struct kref mle_refs; | ||
| 76 | int inuse; | ||
| 77 | unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
| 78 | unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
| 79 | unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
| 80 | unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
| 81 | u8 master; | ||
| 82 | u8 new_master; | ||
| 83 | enum dlm_mle_type type; | ||
| 84 | struct o2hb_callback_func mle_hb_up; | ||
| 85 | struct o2hb_callback_func mle_hb_down; | ||
| 86 | union { | ||
| 87 | struct dlm_lock_resource *res; | ||
| 88 | struct dlm_lock_name name; | ||
| 89 | } u; | ||
| 90 | }; | ||
| 91 | |||
| 92 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, | 56 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, |
| 93 | struct dlm_master_list_entry *mle, | 57 | struct dlm_master_list_entry *mle, |
| 94 | struct o2nm_node *node, | 58 | struct o2nm_node *node, |
| @@ -128,98 +92,10 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, | |||
| 128 | return 1; | 92 | return 1; |
| 129 | } | 93 | } |
| 130 | 94 | ||
| 131 | #define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) | 95 | static struct kmem_cache *dlm_lockres_cache = NULL; |
| 132 | static void _dlm_print_nodemap(unsigned long *map, const char *mapname) | 96 | static struct kmem_cache *dlm_lockname_cache = NULL; |
| 133 | { | ||
| 134 | int i; | ||
| 135 | printk("%s=[ ", mapname); | ||
| 136 | for (i=0; i<O2NM_MAX_NODES; i++) | ||
| 137 | if (test_bit(i, map)) | ||
| 138 | printk("%d ", i); | ||
| 139 | printk("]"); | ||
| 140 | } | ||
| 141 | |||
| 142 | static void dlm_print_one_mle(struct dlm_master_list_entry *mle) | ||
| 143 | { | ||
| 144 | int refs; | ||
| 145 | char *type; | ||
| 146 | char attached; | ||
| 147 | u8 master; | ||
| 148 | unsigned int namelen; | ||
| 149 | const char *name; | ||
| 150 | struct kref *k; | ||
| 151 | unsigned long *maybe = mle->maybe_map, | ||
| 152 | *vote = mle->vote_map, | ||
| 153 | *resp = mle->response_map, | ||
| 154 | *node = mle->node_map; | ||
| 155 | |||
| 156 | k = &mle->mle_refs; | ||
| 157 | if (mle->type == DLM_MLE_BLOCK) | ||
| 158 | type = "BLK"; | ||
| 159 | else if (mle->type == DLM_MLE_MASTER) | ||
| 160 | type = "MAS"; | ||
| 161 | else | ||
| 162 | type = "MIG"; | ||
| 163 | refs = atomic_read(&k->refcount); | ||
| 164 | master = mle->master; | ||
| 165 | attached = (list_empty(&mle->hb_events) ? 'N' : 'Y'); | ||
| 166 | |||
| 167 | if (mle->type != DLM_MLE_MASTER) { | ||
| 168 | namelen = mle->u.name.len; | ||
| 169 | name = mle->u.name.name; | ||
| 170 | } else { | ||
| 171 | namelen = mle->u.res->lockname.len; | ||
| 172 | name = mle->u.res->lockname.name; | ||
| 173 | } | ||
| 174 | |||
| 175 | mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", | ||
| 176 | namelen, name, type, refs, master, mle->new_master, attached, | ||
| 177 | mle->inuse); | ||
| 178 | dlm_print_nodemap(maybe); | ||
| 179 | printk(", "); | ||
| 180 | dlm_print_nodemap(vote); | ||
| 181 | printk(", "); | ||
| 182 | dlm_print_nodemap(resp); | ||
| 183 | printk(", "); | ||
| 184 | dlm_print_nodemap(node); | ||
| 185 | printk(", "); | ||
| 186 | printk("\n"); | ||
| 187 | } | ||
| 188 | |||
| 189 | #if 0 | ||
| 190 | /* Code here is included but defined out as it aids debugging */ | ||
| 191 | |||
| 192 | static void dlm_dump_mles(struct dlm_ctxt *dlm) | ||
| 193 | { | ||
| 194 | struct dlm_master_list_entry *mle; | ||
| 195 | |||
| 196 | mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); | ||
| 197 | spin_lock(&dlm->master_lock); | ||
| 198 | list_for_each_entry(mle, &dlm->master_list, list) | ||
| 199 | dlm_print_one_mle(mle); | ||
| 200 | spin_unlock(&dlm->master_lock); | ||
| 201 | } | ||
| 202 | |||
| 203 | int dlm_dump_all_mles(const char __user *data, unsigned int len) | ||
| 204 | { | ||
| 205 | struct dlm_ctxt *dlm; | ||
| 206 | |||
| 207 | spin_lock(&dlm_domain_lock); | ||
| 208 | list_for_each_entry(dlm, &dlm_domains, list) { | ||
| 209 | mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); | ||
| 210 | dlm_dump_mles(dlm); | ||
| 211 | } | ||
| 212 | spin_unlock(&dlm_domain_lock); | ||
| 213 | return len; | ||
| 214 | } | ||
| 215 | EXPORT_SYMBOL_GPL(dlm_dump_all_mles); | ||
| 216 | |||
| 217 | #endif /* 0 */ | ||
| 218 | |||
| 219 | |||
| 220 | static struct kmem_cache *dlm_mle_cache = NULL; | 97 | static struct kmem_cache *dlm_mle_cache = NULL; |
| 221 | 98 | ||
| 222 | |||
| 223 | static void dlm_mle_release(struct kref *kref); | 99 | static void dlm_mle_release(struct kref *kref); |
| 224 | static void dlm_init_mle(struct dlm_master_list_entry *mle, | 100 | static void dlm_init_mle(struct dlm_master_list_entry *mle, |
| 225 | enum dlm_mle_type type, | 101 | enum dlm_mle_type type, |
| @@ -507,7 +383,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm, | |||
| 507 | 383 | ||
| 508 | int dlm_init_mle_cache(void) | 384 | int dlm_init_mle_cache(void) |
| 509 | { | 385 | { |
| 510 | dlm_mle_cache = kmem_cache_create("dlm_mle_cache", | 386 | dlm_mle_cache = kmem_cache_create("o2dlm_mle", |
| 511 | sizeof(struct dlm_master_list_entry), | 387 | sizeof(struct dlm_master_list_entry), |
| 512 | 0, SLAB_HWCACHE_ALIGN, | 388 | 0, SLAB_HWCACHE_ALIGN, |
| 513 | NULL); | 389 | NULL); |
| @@ -560,6 +436,35 @@ static void dlm_mle_release(struct kref *kref) | |||
| 560 | * LOCK RESOURCE FUNCTIONS | 436 | * LOCK RESOURCE FUNCTIONS |
| 561 | */ | 437 | */ |
| 562 | 438 | ||
| 439 | int dlm_init_master_caches(void) | ||
| 440 | { | ||
| 441 | dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", | ||
| 442 | sizeof(struct dlm_lock_resource), | ||
| 443 | 0, SLAB_HWCACHE_ALIGN, NULL); | ||
| 444 | if (!dlm_lockres_cache) | ||
| 445 | goto bail; | ||
| 446 | |||
| 447 | dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", | ||
| 448 | DLM_LOCKID_NAME_MAX, 0, | ||
| 449 | SLAB_HWCACHE_ALIGN, NULL); | ||
| 450 | if (!dlm_lockname_cache) | ||
| 451 | goto bail; | ||
| 452 | |||
| 453 | return 0; | ||
| 454 | bail: | ||
| 455 | dlm_destroy_master_caches(); | ||
| 456 | return -ENOMEM; | ||
| 457 | } | ||
| 458 | |||
| 459 | void dlm_destroy_master_caches(void) | ||
| 460 | { | ||
| 461 | if (dlm_lockname_cache) | ||
| 462 | kmem_cache_destroy(dlm_lockname_cache); | ||
| 463 | |||
| 464 | if (dlm_lockres_cache) | ||
| 465 | kmem_cache_destroy(dlm_lockres_cache); | ||
| 466 | } | ||
| 467 | |||
| 563 | static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, | 468 | static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, |
| 564 | struct dlm_lock_resource *res, | 469 | struct dlm_lock_resource *res, |
| 565 | u8 owner) | 470 | u8 owner) |
| @@ -610,6 +515,14 @@ static void dlm_lockres_release(struct kref *kref) | |||
| 610 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, | 515 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, |
| 611 | res->lockname.name); | 516 | res->lockname.name); |
| 612 | 517 | ||
| 518 | if (!list_empty(&res->tracking)) | ||
| 519 | list_del_init(&res->tracking); | ||
| 520 | else { | ||
| 521 | mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", | ||
| 522 | res->lockname.len, res->lockname.name); | ||
| 523 | dlm_print_one_lock_resource(res); | ||
| 524 | } | ||
| 525 | |||
| 613 | if (!hlist_unhashed(&res->hash_node) || | 526 | if (!hlist_unhashed(&res->hash_node) || |
| 614 | !list_empty(&res->granted) || | 527 | !list_empty(&res->granted) || |
| 615 | !list_empty(&res->converting) || | 528 | !list_empty(&res->converting) || |
| @@ -642,9 +555,9 @@ static void dlm_lockres_release(struct kref *kref) | |||
| 642 | BUG_ON(!list_empty(&res->recovering)); | 555 | BUG_ON(!list_empty(&res->recovering)); |
| 643 | BUG_ON(!list_empty(&res->purge)); | 556 | BUG_ON(!list_empty(&res->purge)); |
| 644 | 557 | ||
| 645 | kfree(res->lockname.name); | 558 | kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); |
| 646 | 559 | ||
| 647 | kfree(res); | 560 | kmem_cache_free(dlm_lockres_cache, res); |
| 648 | } | 561 | } |
| 649 | 562 | ||
| 650 | void dlm_lockres_put(struct dlm_lock_resource *res) | 563 | void dlm_lockres_put(struct dlm_lock_resource *res) |
| @@ -677,6 +590,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, | |||
| 677 | INIT_LIST_HEAD(&res->dirty); | 590 | INIT_LIST_HEAD(&res->dirty); |
| 678 | INIT_LIST_HEAD(&res->recovering); | 591 | INIT_LIST_HEAD(&res->recovering); |
| 679 | INIT_LIST_HEAD(&res->purge); | 592 | INIT_LIST_HEAD(&res->purge); |
| 593 | INIT_LIST_HEAD(&res->tracking); | ||
| 680 | atomic_set(&res->asts_reserved, 0); | 594 | atomic_set(&res->asts_reserved, 0); |
| 681 | res->migration_pending = 0; | 595 | res->migration_pending = 0; |
| 682 | res->inflight_locks = 0; | 596 | res->inflight_locks = 0; |
| @@ -692,6 +606,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, | |||
| 692 | 606 | ||
| 693 | res->last_used = 0; | 607 | res->last_used = 0; |
| 694 | 608 | ||
| 609 | list_add_tail(&res->tracking, &dlm->tracking_list); | ||
| 610 | |||
| 695 | memset(res->lvb, 0, DLM_LVB_LEN); | 611 | memset(res->lvb, 0, DLM_LVB_LEN); |
| 696 | memset(res->refmap, 0, sizeof(res->refmap)); | 612 | memset(res->refmap, 0, sizeof(res->refmap)); |
| 697 | } | 613 | } |
| @@ -700,20 +616,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | |||
| 700 | const char *name, | 616 | const char *name, |
| 701 | unsigned int namelen) | 617 | unsigned int namelen) |
| 702 | { | 618 | { |
| 703 | struct dlm_lock_resource *res; | 619 | struct dlm_lock_resource *res = NULL; |
| 704 | 620 | ||
| 705 | res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); | 621 | res = (struct dlm_lock_resource *) |
| 622 | kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); | ||
| 706 | if (!res) | 623 | if (!res) |
| 707 | return NULL; | 624 | goto error; |
| 708 | 625 | ||
| 709 | res->lockname.name = kmalloc(namelen, GFP_NOFS); | 626 | res->lockname.name = (char *) |
| 710 | if (!res->lockname.name) { | 627 | kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); |
| 711 | kfree(res); | 628 | if (!res->lockname.name) |
| 712 | return NULL; | 629 | goto error; |
| 713 | } | ||
| 714 | 630 | ||
| 715 | dlm_init_lockres(dlm, res, name, namelen); | 631 | dlm_init_lockres(dlm, res, name, namelen); |
| 716 | return res; | 632 | return res; |
| 633 | |||
| 634 | error: | ||
| 635 | if (res && res->lockname.name) | ||
| 636 | kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); | ||
| 637 | |||
| 638 | if (res) | ||
| 639 | kmem_cache_free(dlm_lockres_cache, res); | ||
| 640 | return NULL; | ||
| 717 | } | 641 | } |
| 718 | 642 | ||
| 719 | void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | 643 | void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1f1873bf41fb..394d25a131a5 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
| @@ -27,18 +27,11 @@ | |||
| 27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
| 28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
| 29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
| 30 | #include <linux/crc32.h> | ||
| 31 | #include <linux/kthread.h> | 30 | #include <linux/kthread.h> |
| 32 | #include <linux/pagemap.h> | 31 | #include <linux/pagemap.h> |
| 33 | #include <linux/debugfs.h> | 32 | #include <linux/debugfs.h> |
| 34 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
| 35 | 34 | ||
| 36 | #include <cluster/heartbeat.h> | ||
| 37 | #include <cluster/nodemanager.h> | ||
| 38 | #include <cluster/tcp.h> | ||
| 39 | |||
| 40 | #include <dlm/dlmapi.h> | ||
| 41 | |||
| 42 | #define MLOG_MASK_PREFIX ML_DLM_GLUE | 35 | #define MLOG_MASK_PREFIX ML_DLM_GLUE |
| 43 | #include <cluster/masklog.h> | 36 | #include <cluster/masklog.h> |
| 44 | 37 | ||
| @@ -53,6 +46,7 @@ | |||
| 53 | #include "heartbeat.h" | 46 | #include "heartbeat.h" |
| 54 | #include "inode.h" | 47 | #include "inode.h" |
| 55 | #include "journal.h" | 48 | #include "journal.h" |
| 49 | #include "stackglue.h" | ||
| 56 | #include "slot_map.h" | 50 | #include "slot_map.h" |
| 57 | #include "super.h" | 51 | #include "super.h" |
| 58 | #include "uptodate.h" | 52 | #include "uptodate.h" |
| @@ -113,7 +107,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level, | |||
| 113 | unsigned int line, | 107 | unsigned int line, |
| 114 | struct ocfs2_lock_res *lockres) | 108 | struct ocfs2_lock_res *lockres) |
| 115 | { | 109 | { |
| 116 | struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | 110 | struct ocfs2_meta_lvb *lvb = |
| 111 | (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); | ||
| 117 | 112 | ||
| 118 | mlog(level, "LVB information for %s (called from %s:%u):\n", | 113 | mlog(level, "LVB information for %s (called from %s:%u):\n", |
| 119 | lockres->l_name, function, line); | 114 | lockres->l_name, function, line); |
| @@ -259,31 +254,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = { | |||
| 259 | .flags = 0, | 254 | .flags = 0, |
| 260 | }; | 255 | }; |
| 261 | 256 | ||
| 262 | /* | ||
| 263 | * This is the filesystem locking protocol version. | ||
| 264 | * | ||
| 265 | * Whenever the filesystem does new things with locks (adds or removes a | ||
| 266 | * lock, orders them differently, does different things underneath a lock), | ||
| 267 | * the version must be changed. The protocol is negotiated when joining | ||
| 268 | * the dlm domain. A node may join the domain if its major version is | ||
| 269 | * identical to all other nodes and its minor version is greater than | ||
| 270 | * or equal to all other nodes. When its minor version is greater than | ||
| 271 | * the other nodes, it will run at the minor version specified by the | ||
| 272 | * other nodes. | ||
| 273 | * | ||
| 274 | * If a locking change is made that will not be compatible with older | ||
| 275 | * versions, the major number must be increased and the minor version set | ||
| 276 | * to zero. If a change merely adds a behavior that can be disabled when | ||
| 277 | * speaking to older versions, the minor version must be increased. If a | ||
| 278 | * change adds a fully backwards compatible change (eg, LVB changes that | ||
| 279 | * are just ignored by older versions), the version does not need to be | ||
| 280 | * updated. | ||
| 281 | */ | ||
| 282 | const struct dlm_protocol_version ocfs2_locking_protocol = { | ||
| 283 | .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, | ||
| 284 | .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, | ||
| 285 | }; | ||
| 286 | |||
| 287 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) | 257 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) |
| 288 | { | 258 | { |
| 289 | return lockres->l_type == OCFS2_LOCK_TYPE_META || | 259 | return lockres->l_type == OCFS2_LOCK_TYPE_META || |
| @@ -316,7 +286,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l | |||
| 316 | static int ocfs2_lock_create(struct ocfs2_super *osb, | 286 | static int ocfs2_lock_create(struct ocfs2_super *osb, |
| 317 | struct ocfs2_lock_res *lockres, | 287 | struct ocfs2_lock_res *lockres, |
| 318 | int level, | 288 | int level, |
| 319 | int dlm_flags); | 289 | u32 dlm_flags); |
| 320 | static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, | 290 | static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, |
| 321 | int wanted); | 291 | int wanted); |
| 322 | static void ocfs2_cluster_unlock(struct ocfs2_super *osb, | 292 | static void ocfs2_cluster_unlock(struct ocfs2_super *osb, |
| @@ -330,10 +300,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, | |||
| 330 | struct ocfs2_lock_res *lockres); | 300 | struct ocfs2_lock_res *lockres); |
| 331 | static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, | 301 | static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, |
| 332 | int convert); | 302 | int convert); |
| 333 | #define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ | 303 | #define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ |
| 334 | mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ | 304 | mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ |
| 335 | "resource %s: %s\n", dlm_errname(_stat), _func, \ | 305 | _err, _func, _lockres->l_name); \ |
| 336 | _lockres->l_name, dlm_errmsg(_stat)); \ | ||
| 337 | } while (0) | 306 | } while (0) |
| 338 | static int ocfs2_downconvert_thread(void *arg); | 307 | static int ocfs2_downconvert_thread(void *arg); |
| 339 | static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, | 308 | static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, |
| @@ -342,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode, | |||
| 342 | struct buffer_head **bh); | 311 | struct buffer_head **bh); |
| 343 | static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); | 312 | static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); |
| 344 | static inline int ocfs2_highest_compat_lock_level(int level); | 313 | static inline int ocfs2_highest_compat_lock_level(int level); |
| 345 | static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, | 314 | static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, |
| 346 | int new_level); | 315 | int new_level); |
| 347 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, | 316 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, |
| 348 | struct ocfs2_lock_res *lockres, | 317 | struct ocfs2_lock_res *lockres, |
| 349 | int new_level, | 318 | int new_level, |
| 350 | int lvb); | 319 | int lvb, |
| 320 | unsigned int generation); | ||
| 351 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, | 321 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, |
| 352 | struct ocfs2_lock_res *lockres); | 322 | struct ocfs2_lock_res *lockres); |
| 353 | static int ocfs2_cancel_convert(struct ocfs2_super *osb, | 323 | static int ocfs2_cancel_convert(struct ocfs2_super *osb, |
| @@ -406,9 +376,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, | |||
| 406 | res->l_ops = ops; | 376 | res->l_ops = ops; |
| 407 | res->l_priv = priv; | 377 | res->l_priv = priv; |
| 408 | 378 | ||
| 409 | res->l_level = LKM_IVMODE; | 379 | res->l_level = DLM_LOCK_IV; |
| 410 | res->l_requested = LKM_IVMODE; | 380 | res->l_requested = DLM_LOCK_IV; |
| 411 | res->l_blocking = LKM_IVMODE; | 381 | res->l_blocking = DLM_LOCK_IV; |
| 412 | res->l_action = OCFS2_AST_INVALID; | 382 | res->l_action = OCFS2_AST_INVALID; |
| 413 | res->l_unlock_action = OCFS2_UNLOCK_INVALID; | 383 | res->l_unlock_action = OCFS2_UNLOCK_INVALID; |
| 414 | 384 | ||
| @@ -604,10 +574,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, | |||
| 604 | BUG_ON(!lockres); | 574 | BUG_ON(!lockres); |
| 605 | 575 | ||
| 606 | switch(level) { | 576 | switch(level) { |
| 607 | case LKM_EXMODE: | 577 | case DLM_LOCK_EX: |
| 608 | lockres->l_ex_holders++; | 578 | lockres->l_ex_holders++; |
| 609 | break; | 579 | break; |
| 610 | case LKM_PRMODE: | 580 | case DLM_LOCK_PR: |
| 611 | lockres->l_ro_holders++; | 581 | lockres->l_ro_holders++; |
| 612 | break; | 582 | break; |
| 613 | default: | 583 | default: |
| @@ -625,11 +595,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, | |||
| 625 | BUG_ON(!lockres); | 595 | BUG_ON(!lockres); |
| 626 | 596 | ||
| 627 | switch(level) { | 597 | switch(level) { |
| 628 | case LKM_EXMODE: | 598 | case DLM_LOCK_EX: |
| 629 | BUG_ON(!lockres->l_ex_holders); | 599 | BUG_ON(!lockres->l_ex_holders); |
| 630 | lockres->l_ex_holders--; | 600 | lockres->l_ex_holders--; |
| 631 | break; | 601 | break; |
| 632 | case LKM_PRMODE: | 602 | case DLM_LOCK_PR: |
| 633 | BUG_ON(!lockres->l_ro_holders); | 603 | BUG_ON(!lockres->l_ro_holders); |
| 634 | lockres->l_ro_holders--; | 604 | lockres->l_ro_holders--; |
| 635 | break; | 605 | break; |
| @@ -644,12 +614,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, | |||
| 644 | * lock types are added. */ | 614 | * lock types are added. */ |
| 645 | static inline int ocfs2_highest_compat_lock_level(int level) | 615 | static inline int ocfs2_highest_compat_lock_level(int level) |
| 646 | { | 616 | { |
| 647 | int new_level = LKM_EXMODE; | 617 | int new_level = DLM_LOCK_EX; |
| 648 | 618 | ||
| 649 | if (level == LKM_EXMODE) | 619 | if (level == DLM_LOCK_EX) |
| 650 | new_level = LKM_NLMODE; | 620 | new_level = DLM_LOCK_NL; |
| 651 | else if (level == LKM_PRMODE) | 621 | else if (level == DLM_LOCK_PR) |
| 652 | new_level = LKM_PRMODE; | 622 | new_level = DLM_LOCK_PR; |
| 653 | return new_level; | 623 | return new_level; |
| 654 | } | 624 | } |
| 655 | 625 | ||
| @@ -688,12 +658,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res | |||
| 688 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | 658 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); |
| 689 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); | 659 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); |
| 690 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | 660 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); |
| 691 | BUG_ON(lockres->l_blocking <= LKM_NLMODE); | 661 | BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); |
| 692 | 662 | ||
| 693 | lockres->l_level = lockres->l_requested; | 663 | lockres->l_level = lockres->l_requested; |
| 694 | if (lockres->l_level <= | 664 | if (lockres->l_level <= |
| 695 | ocfs2_highest_compat_lock_level(lockres->l_blocking)) { | 665 | ocfs2_highest_compat_lock_level(lockres->l_blocking)) { |
| 696 | lockres->l_blocking = LKM_NLMODE; | 666 | lockres->l_blocking = DLM_LOCK_NL; |
| 697 | lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); | 667 | lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); |
| 698 | } | 668 | } |
| 699 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | 669 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); |
| @@ -712,7 +682,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo | |||
| 712 | * information is already up to data. Convert from NL to | 682 | * information is already up to data. Convert from NL to |
| 713 | * *anything* however should mark ourselves as needing an | 683 | * *anything* however should mark ourselves as needing an |
| 714 | * update */ | 684 | * update */ |
| 715 | if (lockres->l_level == LKM_NLMODE && | 685 | if (lockres->l_level == DLM_LOCK_NL && |
| 716 | lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) | 686 | lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) |
| 717 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | 687 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); |
| 718 | 688 | ||
| @@ -729,7 +699,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc | |||
| 729 | BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); | 699 | BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); |
| 730 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); | 700 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); |
| 731 | 701 | ||
| 732 | if (lockres->l_requested > LKM_NLMODE && | 702 | if (lockres->l_requested > DLM_LOCK_NL && |
| 733 | !(lockres->l_flags & OCFS2_LOCK_LOCAL) && | 703 | !(lockres->l_flags & OCFS2_LOCK_LOCAL) && |
| 734 | lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) | 704 | lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) |
| 735 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | 705 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); |
| @@ -767,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, | |||
| 767 | return needs_downconvert; | 737 | return needs_downconvert; |
| 768 | } | 738 | } |
| 769 | 739 | ||
| 740 | /* | ||
| 741 | * OCFS2_LOCK_PENDING and l_pending_gen. | ||
| 742 | * | ||
| 743 | * Why does OCFS2_LOCK_PENDING exist? To close a race between setting | ||
| 744 | * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock() | ||
| 745 | * for more details on the race. | ||
| 746 | * | ||
| 747 | * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces | ||
| 748 | * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock() | ||
| 749 | * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear | ||
| 750 | * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns, | ||
| 751 | * the caller is going to try to clear PENDING again. If nothing else is | ||
| 752 | * happening, __lockres_clear_pending() sees PENDING is unset and does | ||
| 753 | * nothing. | ||
| 754 | * | ||
| 755 | * But what if another path (eg downconvert thread) has just started a | ||
| 756 | * new locking action? The other path has re-set PENDING. Our path | ||
| 757 | * cannot clear PENDING, because that will re-open the original race | ||
| 758 | * window. | ||
| 759 | * | ||
| 760 | * [Example] | ||
| 761 | * | ||
| 762 | * ocfs2_meta_lock() | ||
| 763 | * ocfs2_cluster_lock() | ||
| 764 | * set BUSY | ||
| 765 | * set PENDING | ||
| 766 | * drop l_lock | ||
| 767 | * ocfs2_dlm_lock() | ||
| 768 | * ocfs2_locking_ast() ocfs2_downconvert_thread() | ||
| 769 | * clear PENDING ocfs2_unblock_lock() | ||
| 770 | * take_l_lock | ||
| 771 | * !BUSY | ||
| 772 | * ocfs2_prepare_downconvert() | ||
| 773 | * set BUSY | ||
| 774 | * set PENDING | ||
| 775 | * drop l_lock | ||
| 776 | * take l_lock | ||
| 777 | * clear PENDING | ||
| 778 | * drop l_lock | ||
| 779 | * <window> | ||
| 780 | * ocfs2_dlm_lock() | ||
| 781 | * | ||
| 782 | * So as you can see, we now have a window where l_lock is not held, | ||
| 783 | * PENDING is not set, and ocfs2_dlm_lock() has not been called. | ||
| 784 | * | ||
| 785 | * The core problem is that ocfs2_cluster_lock() has cleared the PENDING | ||
| 786 | * set by ocfs2_prepare_downconvert(). That wasn't nice. | ||
| 787 | * | ||
| 788 | * To solve this we introduce l_pending_gen. A call to | ||
| 789 | * lockres_clear_pending() will only do so when it is passed a generation | ||
| 790 | * number that matches the lockres. lockres_set_pending() will return the | ||
| 791 | * current generation number. When ocfs2_cluster_lock() goes to clear | ||
| 792 | * PENDING, it passes the generation it got from set_pending(). In our | ||
| 793 | * example above, the generation numbers will *not* match. Thus, | ||
| 794 | * ocfs2_cluster_lock() will not clear the PENDING set by | ||
| 795 | * ocfs2_prepare_downconvert(). | ||
| 796 | */ | ||
| 797 | |||
| 798 | /* Unlocked version for ocfs2_locking_ast() */ | ||
| 799 | static void __lockres_clear_pending(struct ocfs2_lock_res *lockres, | ||
| 800 | unsigned int generation, | ||
| 801 | struct ocfs2_super *osb) | ||
| 802 | { | ||
| 803 | assert_spin_locked(&lockres->l_lock); | ||
| 804 | |||
| 805 | /* | ||
| 806 | * The ast and locking functions can race us here. The winner | ||
| 807 | * will clear pending, the loser will not. | ||
| 808 | */ | ||
| 809 | if (!(lockres->l_flags & OCFS2_LOCK_PENDING) || | ||
| 810 | (lockres->l_pending_gen != generation)) | ||
| 811 | return; | ||
| 812 | |||
| 813 | lockres_clear_flags(lockres, OCFS2_LOCK_PENDING); | ||
| 814 | lockres->l_pending_gen++; | ||
| 815 | |||
| 816 | /* | ||
| 817 | * The downconvert thread may have skipped us because we | ||
| 818 | * were PENDING. Wake it up. | ||
| 819 | */ | ||
| 820 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) | ||
| 821 | ocfs2_wake_downconvert_thread(osb); | ||
| 822 | } | ||
| 823 | |||
| 824 | /* Locked version for callers of ocfs2_dlm_lock() */ | ||
| 825 | static void lockres_clear_pending(struct ocfs2_lock_res *lockres, | ||
| 826 | unsigned int generation, | ||
| 827 | struct ocfs2_super *osb) | ||
| 828 | { | ||
| 829 | unsigned long flags; | ||
| 830 | |||
| 831 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
| 832 | __lockres_clear_pending(lockres, generation, osb); | ||
| 833 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
| 834 | } | ||
| 835 | |||
| 836 | static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) | ||
| 837 | { | ||
| 838 | assert_spin_locked(&lockres->l_lock); | ||
| 839 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | ||
| 840 | |||
| 841 | lockres_or_flags(lockres, OCFS2_LOCK_PENDING); | ||
| 842 | |||
| 843 | return lockres->l_pending_gen; | ||
| 844 | } | ||
| 845 | |||
| 846 | |||
| 770 | static void ocfs2_blocking_ast(void *opaque, int level) | 847 | static void ocfs2_blocking_ast(void *opaque, int level) |
| 771 | { | 848 | { |
| 772 | struct ocfs2_lock_res *lockres = opaque; | 849 | struct ocfs2_lock_res *lockres = opaque; |
| @@ -774,7 +851,7 @@ static void ocfs2_blocking_ast(void *opaque, int level) | |||
| 774 | int needs_downconvert; | 851 | int needs_downconvert; |
| 775 | unsigned long flags; | 852 | unsigned long flags; |
| 776 | 853 | ||
| 777 | BUG_ON(level <= LKM_NLMODE); | 854 | BUG_ON(level <= DLM_LOCK_NL); |
| 778 | 855 | ||
| 779 | mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", | 856 | mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", |
| 780 | lockres->l_name, level, lockres->l_level, | 857 | lockres->l_name, level, lockres->l_level, |
| @@ -801,14 +878,22 @@ static void ocfs2_blocking_ast(void *opaque, int level) | |||
| 801 | static void ocfs2_locking_ast(void *opaque) | 878 | static void ocfs2_locking_ast(void *opaque) |
| 802 | { | 879 | { |
| 803 | struct ocfs2_lock_res *lockres = opaque; | 880 | struct ocfs2_lock_res *lockres = opaque; |
| 804 | struct dlm_lockstatus *lksb = &lockres->l_lksb; | 881 | struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); |
| 805 | unsigned long flags; | 882 | unsigned long flags; |
| 883 | int status; | ||
| 806 | 884 | ||
| 807 | spin_lock_irqsave(&lockres->l_lock, flags); | 885 | spin_lock_irqsave(&lockres->l_lock, flags); |
| 808 | 886 | ||
| 809 | if (lksb->status != DLM_NORMAL) { | 887 | status = ocfs2_dlm_lock_status(&lockres->l_lksb); |
| 810 | mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", | 888 | |
| 811 | lockres->l_name, lksb->status); | 889 | if (status == -EAGAIN) { |
| 890 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
| 891 | goto out; | ||
| 892 | } | ||
| 893 | |||
| 894 | if (status) { | ||
| 895 | mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n", | ||
| 896 | lockres->l_name, status); | ||
| 812 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 897 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 813 | return; | 898 | return; |
| 814 | } | 899 | } |
| @@ -831,11 +916,23 @@ static void ocfs2_locking_ast(void *opaque) | |||
| 831 | lockres->l_unlock_action); | 916 | lockres->l_unlock_action); |
| 832 | BUG(); | 917 | BUG(); |
| 833 | } | 918 | } |
| 834 | 919 | out: | |
| 835 | /* set it to something invalid so if we get called again we | 920 | /* set it to something invalid so if we get called again we |
| 836 | * can catch it. */ | 921 | * can catch it. */ |
| 837 | lockres->l_action = OCFS2_AST_INVALID; | 922 | lockres->l_action = OCFS2_AST_INVALID; |
| 838 | 923 | ||
| 924 | /* Did we try to cancel this lock? Clear that state */ | ||
| 925 | if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) | ||
| 926 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; | ||
| 927 | |||
| 928 | /* | ||
| 929 | * We may have beaten the locking functions here. We certainly | ||
| 930 | * know that dlm_lock() has been called :-) | ||
| 931 | * Because we can't have two lock calls in flight at once, we | ||
| 932 | * can use lockres->l_pending_gen. | ||
| 933 | */ | ||
| 934 | __lockres_clear_pending(lockres, lockres->l_pending_gen, osb); | ||
| 935 | |||
| 839 | wake_up(&lockres->l_event); | 936 | wake_up(&lockres->l_event); |
| 840 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 937 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 841 | } | 938 | } |
| @@ -865,15 +962,15 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, | |||
| 865 | static int ocfs2_lock_create(struct ocfs2_super *osb, | 962 | static int ocfs2_lock_create(struct ocfs2_super *osb, |
| 866 | struct ocfs2_lock_res *lockres, | 963 | struct ocfs2_lock_res *lockres, |
| 867 | int level, | 964 | int level, |
| 868 | int dlm_flags) | 965 | u32 dlm_flags) |
| 869 | { | 966 | { |
| 870 | int ret = 0; | 967 | int ret = 0; |
| 871 | enum dlm_status status = DLM_NORMAL; | ||
| 872 | unsigned long flags; | 968 | unsigned long flags; |
| 969 | unsigned int gen; | ||
| 873 | 970 | ||
| 874 | mlog_entry_void(); | 971 | mlog_entry_void(); |
| 875 | 972 | ||
| 876 | mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, | 973 | mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, |
| 877 | dlm_flags); | 974 | dlm_flags); |
| 878 | 975 | ||
| 879 | spin_lock_irqsave(&lockres->l_lock, flags); | 976 | spin_lock_irqsave(&lockres->l_lock, flags); |
| @@ -886,24 +983,23 @@ static int ocfs2_lock_create(struct ocfs2_super *osb, | |||
| 886 | lockres->l_action = OCFS2_AST_ATTACH; | 983 | lockres->l_action = OCFS2_AST_ATTACH; |
| 887 | lockres->l_requested = level; | 984 | lockres->l_requested = level; |
| 888 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | 985 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); |
| 986 | gen = lockres_set_pending(lockres); | ||
| 889 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 987 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 890 | 988 | ||
| 891 | status = dlmlock(osb->dlm, | 989 | ret = ocfs2_dlm_lock(osb->cconn, |
| 892 | level, | 990 | level, |
| 893 | &lockres->l_lksb, | 991 | &lockres->l_lksb, |
| 894 | dlm_flags, | 992 | dlm_flags, |
| 895 | lockres->l_name, | 993 | lockres->l_name, |
| 896 | OCFS2_LOCK_ID_MAX_LEN - 1, | 994 | OCFS2_LOCK_ID_MAX_LEN - 1, |
| 897 | ocfs2_locking_ast, | 995 | lockres); |
| 898 | lockres, | 996 | lockres_clear_pending(lockres, gen, osb); |
| 899 | ocfs2_blocking_ast); | 997 | if (ret) { |
| 900 | if (status != DLM_NORMAL) { | 998 | ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); |
| 901 | ocfs2_log_dlm_error("dlmlock", status, lockres); | ||
| 902 | ret = -EINVAL; | ||
| 903 | ocfs2_recover_from_dlm_error(lockres, 1); | 999 | ocfs2_recover_from_dlm_error(lockres, 1); |
| 904 | } | 1000 | } |
| 905 | 1001 | ||
| 906 | mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); | 1002 | mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); |
| 907 | 1003 | ||
| 908 | bail: | 1004 | bail: |
| 909 | mlog_exit(ret); | 1005 | mlog_exit(ret); |
| @@ -1016,21 +1112,22 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, | |||
| 1016 | static int ocfs2_cluster_lock(struct ocfs2_super *osb, | 1112 | static int ocfs2_cluster_lock(struct ocfs2_super *osb, |
| 1017 | struct ocfs2_lock_res *lockres, | 1113 | struct ocfs2_lock_res *lockres, |
| 1018 | int level, | 1114 | int level, |
| 1019 | int lkm_flags, | 1115 | u32 lkm_flags, |
| 1020 | int arg_flags) | 1116 | int arg_flags) |
| 1021 | { | 1117 | { |
| 1022 | struct ocfs2_mask_waiter mw; | 1118 | struct ocfs2_mask_waiter mw; |
| 1023 | enum dlm_status status; | ||
| 1024 | int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); | 1119 | int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); |
| 1025 | int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ | 1120 | int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ |
| 1026 | unsigned long flags; | 1121 | unsigned long flags; |
| 1122 | unsigned int gen; | ||
| 1123 | int noqueue_attempted = 0; | ||
| 1027 | 1124 | ||
| 1028 | mlog_entry_void(); | 1125 | mlog_entry_void(); |
| 1029 | 1126 | ||
| 1030 | ocfs2_init_mask_waiter(&mw); | 1127 | ocfs2_init_mask_waiter(&mw); |
| 1031 | 1128 | ||
| 1032 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) | 1129 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) |
| 1033 | lkm_flags |= LKM_VALBLK; | 1130 | lkm_flags |= DLM_LKF_VALBLK; |
| 1034 | 1131 | ||
| 1035 | again: | 1132 | again: |
| 1036 | wait = 0; | 1133 | wait = 0; |
| @@ -1068,52 +1165,56 @@ again: | |||
| 1068 | } | 1165 | } |
| 1069 | 1166 | ||
| 1070 | if (level > lockres->l_level) { | 1167 | if (level > lockres->l_level) { |
| 1168 | if (noqueue_attempted > 0) { | ||
| 1169 | ret = -EAGAIN; | ||
| 1170 | goto unlock; | ||
| 1171 | } | ||
| 1172 | if (lkm_flags & DLM_LKF_NOQUEUE) | ||
| 1173 | noqueue_attempted = 1; | ||
| 1174 | |||
| 1071 | if (lockres->l_action != OCFS2_AST_INVALID) | 1175 | if (lockres->l_action != OCFS2_AST_INVALID) |
| 1072 | mlog(ML_ERROR, "lockres %s has action %u pending\n", | 1176 | mlog(ML_ERROR, "lockres %s has action %u pending\n", |
| 1073 | lockres->l_name, lockres->l_action); | 1177 | lockres->l_name, lockres->l_action); |
| 1074 | 1178 | ||
| 1075 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { | 1179 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { |
| 1076 | lockres->l_action = OCFS2_AST_ATTACH; | 1180 | lockres->l_action = OCFS2_AST_ATTACH; |
| 1077 | lkm_flags &= ~LKM_CONVERT; | 1181 | lkm_flags &= ~DLM_LKF_CONVERT; |
| 1078 | } else { | 1182 | } else { |
| 1079 | lockres->l_action = OCFS2_AST_CONVERT; | 1183 | lockres->l_action = OCFS2_AST_CONVERT; |
| 1080 | lkm_flags |= LKM_CONVERT; | 1184 | lkm_flags |= DLM_LKF_CONVERT; |
| 1081 | } | 1185 | } |
| 1082 | 1186 | ||
| 1083 | lockres->l_requested = level; | 1187 | lockres->l_requested = level; |
| 1084 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | 1188 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); |
| 1189 | gen = lockres_set_pending(lockres); | ||
| 1085 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 1190 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 1086 | 1191 | ||
| 1087 | BUG_ON(level == LKM_IVMODE); | 1192 | BUG_ON(level == DLM_LOCK_IV); |
| 1088 | BUG_ON(level == LKM_NLMODE); | 1193 | BUG_ON(level == DLM_LOCK_NL); |
| 1089 | 1194 | ||
| 1090 | mlog(0, "lock %s, convert from %d to level = %d\n", | 1195 | mlog(0, "lock %s, convert from %d to level = %d\n", |
| 1091 | lockres->l_name, lockres->l_level, level); | 1196 | lockres->l_name, lockres->l_level, level); |
| 1092 | 1197 | ||
| 1093 | /* call dlm_lock to upgrade lock now */ | 1198 | /* call dlm_lock to upgrade lock now */ |
| 1094 | status = dlmlock(osb->dlm, | 1199 | ret = ocfs2_dlm_lock(osb->cconn, |
| 1095 | level, | 1200 | level, |
| 1096 | &lockres->l_lksb, | 1201 | &lockres->l_lksb, |
| 1097 | lkm_flags, | 1202 | lkm_flags, |
| 1098 | lockres->l_name, | 1203 | lockres->l_name, |
| 1099 | OCFS2_LOCK_ID_MAX_LEN - 1, | 1204 | OCFS2_LOCK_ID_MAX_LEN - 1, |
| 1100 | ocfs2_locking_ast, | 1205 | lockres); |
| 1101 | lockres, | 1206 | lockres_clear_pending(lockres, gen, osb); |
| 1102 | ocfs2_blocking_ast); | 1207 | if (ret) { |
| 1103 | if (status != DLM_NORMAL) { | 1208 | if (!(lkm_flags & DLM_LKF_NOQUEUE) || |
| 1104 | if ((lkm_flags & LKM_NOQUEUE) && | 1209 | (ret != -EAGAIN)) { |
| 1105 | (status == DLM_NOTQUEUED)) | 1210 | ocfs2_log_dlm_error("ocfs2_dlm_lock", |
| 1106 | ret = -EAGAIN; | 1211 | ret, lockres); |
| 1107 | else { | ||
| 1108 | ocfs2_log_dlm_error("dlmlock", status, | ||
| 1109 | lockres); | ||
| 1110 | ret = -EINVAL; | ||
| 1111 | } | 1212 | } |
| 1112 | ocfs2_recover_from_dlm_error(lockres, 1); | 1213 | ocfs2_recover_from_dlm_error(lockres, 1); |
| 1113 | goto out; | 1214 | goto out; |
| 1114 | } | 1215 | } |
| 1115 | 1216 | ||
| 1116 | mlog(0, "lock %s, successfull return from dlmlock\n", | 1217 | mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", |
| 1117 | lockres->l_name); | 1218 | lockres->l_name); |
| 1118 | 1219 | ||
| 1119 | /* At this point we've gone inside the dlm and need to | 1220 | /* At this point we've gone inside the dlm and need to |
| @@ -1177,9 +1278,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb, | |||
| 1177 | int ex, | 1278 | int ex, |
| 1178 | int local) | 1279 | int local) |
| 1179 | { | 1280 | { |
| 1180 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 1281 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 1181 | unsigned long flags; | 1282 | unsigned long flags; |
| 1182 | int lkm_flags = local ? LKM_LOCAL : 0; | 1283 | u32 lkm_flags = local ? DLM_LKF_LOCAL : 0; |
| 1183 | 1284 | ||
| 1184 | spin_lock_irqsave(&lockres->l_lock, flags); | 1285 | spin_lock_irqsave(&lockres->l_lock, flags); |
| 1185 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); | 1286 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); |
| @@ -1222,7 +1323,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode) | |||
| 1222 | } | 1323 | } |
| 1223 | 1324 | ||
| 1224 | /* | 1325 | /* |
| 1225 | * We don't want to use LKM_LOCAL on a meta data lock as they | 1326 | * We don't want to use DLM_LKF_LOCAL on a meta data lock as they |
| 1226 | * don't use a generation in their lock names. | 1327 | * don't use a generation in their lock names. |
| 1227 | */ | 1328 | */ |
| 1228 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); | 1329 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); |
| @@ -1261,7 +1362,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) | |||
| 1261 | 1362 | ||
| 1262 | lockres = &OCFS2_I(inode)->ip_rw_lockres; | 1363 | lockres = &OCFS2_I(inode)->ip_rw_lockres; |
| 1263 | 1364 | ||
| 1264 | level = write ? LKM_EXMODE : LKM_PRMODE; | 1365 | level = write ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 1265 | 1366 | ||
| 1266 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, | 1367 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, |
| 1267 | 0); | 1368 | 0); |
| @@ -1274,7 +1375,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) | |||
| 1274 | 1375 | ||
| 1275 | void ocfs2_rw_unlock(struct inode *inode, int write) | 1376 | void ocfs2_rw_unlock(struct inode *inode, int write) |
| 1276 | { | 1377 | { |
| 1277 | int level = write ? LKM_EXMODE : LKM_PRMODE; | 1378 | int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 1278 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; | 1379 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; |
| 1279 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1380 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 1280 | 1381 | ||
| @@ -1312,7 +1413,7 @@ int ocfs2_open_lock(struct inode *inode) | |||
| 1312 | lockres = &OCFS2_I(inode)->ip_open_lockres; | 1413 | lockres = &OCFS2_I(inode)->ip_open_lockres; |
| 1313 | 1414 | ||
| 1314 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, | 1415 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, |
| 1315 | LKM_PRMODE, 0, 0); | 1416 | DLM_LOCK_PR, 0, 0); |
| 1316 | if (status < 0) | 1417 | if (status < 0) |
| 1317 | mlog_errno(status); | 1418 | mlog_errno(status); |
| 1318 | 1419 | ||
| @@ -1340,16 +1441,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write) | |||
| 1340 | 1441 | ||
| 1341 | lockres = &OCFS2_I(inode)->ip_open_lockres; | 1442 | lockres = &OCFS2_I(inode)->ip_open_lockres; |
| 1342 | 1443 | ||
| 1343 | level = write ? LKM_EXMODE : LKM_PRMODE; | 1444 | level = write ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 1344 | 1445 | ||
| 1345 | /* | 1446 | /* |
| 1346 | * The file system may already holding a PRMODE/EXMODE open lock. | 1447 | * The file system may already holding a PRMODE/EXMODE open lock. |
| 1347 | * Since we pass LKM_NOQUEUE, the request won't block waiting on | 1448 | * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on |
| 1348 | * other nodes and the -EAGAIN will indicate to the caller that | 1449 | * other nodes and the -EAGAIN will indicate to the caller that |
| 1349 | * this inode is still in use. | 1450 | * this inode is still in use. |
| 1350 | */ | 1451 | */ |
| 1351 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, | 1452 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, |
| 1352 | level, LKM_NOQUEUE, 0); | 1453 | level, DLM_LKF_NOQUEUE, 0); |
| 1353 | 1454 | ||
| 1354 | out: | 1455 | out: |
| 1355 | mlog_exit(status); | 1456 | mlog_exit(status); |
| @@ -1374,10 +1475,10 @@ void ocfs2_open_unlock(struct inode *inode) | |||
| 1374 | 1475 | ||
| 1375 | if(lockres->l_ro_holders) | 1476 | if(lockres->l_ro_holders) |
| 1376 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, | 1477 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, |
| 1377 | LKM_PRMODE); | 1478 | DLM_LOCK_PR); |
| 1378 | if(lockres->l_ex_holders) | 1479 | if(lockres->l_ex_holders) |
| 1379 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, | 1480 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, |
| 1380 | LKM_EXMODE); | 1481 | DLM_LOCK_EX); |
| 1381 | 1482 | ||
| 1382 | out: | 1483 | out: |
| 1383 | mlog_exit_void(); | 1484 | mlog_exit_void(); |
| @@ -1464,7 +1565,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) | |||
| 1464 | ocfs2_init_mask_waiter(&mw); | 1565 | ocfs2_init_mask_waiter(&mw); |
| 1465 | 1566 | ||
| 1466 | if ((lockres->l_flags & OCFS2_LOCK_BUSY) || | 1567 | if ((lockres->l_flags & OCFS2_LOCK_BUSY) || |
| 1467 | (lockres->l_level > LKM_NLMODE)) { | 1568 | (lockres->l_level > DLM_LOCK_NL)) { |
| 1468 | mlog(ML_ERROR, | 1569 | mlog(ML_ERROR, |
| 1469 | "File lock \"%s\" has busy or locked state: flags: 0x%lx, " | 1570 | "File lock \"%s\" has busy or locked state: flags: 0x%lx, " |
| 1470 | "level: %u\n", lockres->l_name, lockres->l_flags, | 1571 | "level: %u\n", lockres->l_name, lockres->l_flags, |
| @@ -1503,14 +1604,12 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) | |||
| 1503 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); | 1604 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); |
| 1504 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 1605 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 1505 | 1606 | ||
| 1506 | ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags, | 1607 | ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, |
| 1507 | lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, | 1608 | lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, |
| 1508 | ocfs2_locking_ast, lockres, ocfs2_blocking_ast); | 1609 | lockres); |
| 1509 | if (ret != DLM_NORMAL) { | 1610 | if (ret) { |
| 1510 | if (trylock && ret == DLM_NOTQUEUED) | 1611 | if (!trylock || (ret != -EAGAIN)) { |
| 1511 | ret = -EAGAIN; | 1612 | ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); |
| 1512 | else { | ||
| 1513 | ocfs2_log_dlm_error("dlmlock", ret, lockres); | ||
| 1514 | ret = -EINVAL; | 1613 | ret = -EINVAL; |
| 1515 | } | 1614 | } |
| 1516 | 1615 | ||
| @@ -1537,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) | |||
| 1537 | * to just bubble sucess back up to the user. | 1636 | * to just bubble sucess back up to the user. |
| 1538 | */ | 1637 | */ |
| 1539 | ret = ocfs2_flock_handle_signal(lockres, level); | 1638 | ret = ocfs2_flock_handle_signal(lockres, level); |
| 1639 | } else if (!ret && (level > lockres->l_level)) { | ||
| 1640 | /* Trylock failed asynchronously */ | ||
| 1641 | BUG_ON(!trylock); | ||
| 1642 | ret = -EAGAIN; | ||
| 1540 | } | 1643 | } |
| 1541 | 1644 | ||
| 1542 | out: | 1645 | out: |
| @@ -1549,6 +1652,7 @@ out: | |||
| 1549 | void ocfs2_file_unlock(struct file *file) | 1652 | void ocfs2_file_unlock(struct file *file) |
| 1550 | { | 1653 | { |
| 1551 | int ret; | 1654 | int ret; |
| 1655 | unsigned int gen; | ||
| 1552 | unsigned long flags; | 1656 | unsigned long flags; |
| 1553 | struct ocfs2_file_private *fp = file->private_data; | 1657 | struct ocfs2_file_private *fp = file->private_data; |
| 1554 | struct ocfs2_lock_res *lockres = &fp->fp_flock; | 1658 | struct ocfs2_lock_res *lockres = &fp->fp_flock; |
| @@ -1572,13 +1676,13 @@ void ocfs2_file_unlock(struct file *file) | |||
| 1572 | * Fake a blocking ast for the downconvert code. | 1676 | * Fake a blocking ast for the downconvert code. |
| 1573 | */ | 1677 | */ |
| 1574 | lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); | 1678 | lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); |
| 1575 | lockres->l_blocking = LKM_EXMODE; | 1679 | lockres->l_blocking = DLM_LOCK_EX; |
| 1576 | 1680 | ||
| 1577 | ocfs2_prepare_downconvert(lockres, LKM_NLMODE); | 1681 | gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); |
| 1578 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); | 1682 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); |
| 1579 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 1683 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 1580 | 1684 | ||
| 1581 | ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0); | 1685 | ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); |
| 1582 | if (ret) { | 1686 | if (ret) { |
| 1583 | mlog_errno(ret); | 1687 | mlog_errno(ret); |
| 1584 | return; | 1688 | return; |
| @@ -1601,11 +1705,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, | |||
| 1601 | * condition. */ | 1705 | * condition. */ |
| 1602 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { | 1706 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { |
| 1603 | switch(lockres->l_blocking) { | 1707 | switch(lockres->l_blocking) { |
| 1604 | case LKM_EXMODE: | 1708 | case DLM_LOCK_EX: |
| 1605 | if (!lockres->l_ex_holders && !lockres->l_ro_holders) | 1709 | if (!lockres->l_ex_holders && !lockres->l_ro_holders) |
| 1606 | kick = 1; | 1710 | kick = 1; |
| 1607 | break; | 1711 | break; |
| 1608 | case LKM_PRMODE: | 1712 | case DLM_LOCK_PR: |
| 1609 | if (!lockres->l_ex_holders) | 1713 | if (!lockres->l_ex_holders) |
| 1610 | kick = 1; | 1714 | kick = 1; |
| 1611 | break; | 1715 | break; |
| @@ -1648,7 +1752,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) | |||
| 1648 | 1752 | ||
| 1649 | mlog_entry_void(); | 1753 | mlog_entry_void(); |
| 1650 | 1754 | ||
| 1651 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | 1755 | lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); |
| 1652 | 1756 | ||
| 1653 | /* | 1757 | /* |
| 1654 | * Invalidate the LVB of a deleted inode - this way other | 1758 | * Invalidate the LVB of a deleted inode - this way other |
| @@ -1700,7 +1804,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | |||
| 1700 | 1804 | ||
| 1701 | mlog_meta_lvb(0, lockres); | 1805 | mlog_meta_lvb(0, lockres); |
| 1702 | 1806 | ||
| 1703 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | 1807 | lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); |
| 1704 | 1808 | ||
| 1705 | /* We're safe here without the lockres lock... */ | 1809 | /* We're safe here without the lockres lock... */ |
| 1706 | spin_lock(&oi->ip_lock); | 1810 | spin_lock(&oi->ip_lock); |
| @@ -1735,7 +1839,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | |||
| 1735 | static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, | 1839 | static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, |
| 1736 | struct ocfs2_lock_res *lockres) | 1840 | struct ocfs2_lock_res *lockres) |
| 1737 | { | 1841 | { |
| 1738 | struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | 1842 | struct ocfs2_meta_lvb *lvb = |
| 1843 | (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); | ||
| 1739 | 1844 | ||
| 1740 | if (lvb->lvb_version == OCFS2_LVB_VERSION | 1845 | if (lvb->lvb_version == OCFS2_LVB_VERSION |
| 1741 | && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) | 1846 | && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) |
| @@ -1923,7 +2028,8 @@ int ocfs2_inode_lock_full(struct inode *inode, | |||
| 1923 | int ex, | 2028 | int ex, |
| 1924 | int arg_flags) | 2029 | int arg_flags) |
| 1925 | { | 2030 | { |
| 1926 | int status, level, dlm_flags, acquired; | 2031 | int status, level, acquired; |
| 2032 | u32 dlm_flags; | ||
| 1927 | struct ocfs2_lock_res *lockres = NULL; | 2033 | struct ocfs2_lock_res *lockres = NULL; |
| 1928 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2034 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 1929 | struct buffer_head *local_bh = NULL; | 2035 | struct buffer_head *local_bh = NULL; |
| @@ -1950,14 +2056,13 @@ int ocfs2_inode_lock_full(struct inode *inode, | |||
| 1950 | goto local; | 2056 | goto local; |
| 1951 | 2057 | ||
| 1952 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) | 2058 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) |
| 1953 | wait_event(osb->recovery_event, | 2059 | ocfs2_wait_for_recovery(osb); |
| 1954 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | ||
| 1955 | 2060 | ||
| 1956 | lockres = &OCFS2_I(inode)->ip_inode_lockres; | 2061 | lockres = &OCFS2_I(inode)->ip_inode_lockres; |
| 1957 | level = ex ? LKM_EXMODE : LKM_PRMODE; | 2062 | level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 1958 | dlm_flags = 0; | 2063 | dlm_flags = 0; |
| 1959 | if (arg_flags & OCFS2_META_LOCK_NOQUEUE) | 2064 | if (arg_flags & OCFS2_META_LOCK_NOQUEUE) |
| 1960 | dlm_flags |= LKM_NOQUEUE; | 2065 | dlm_flags |= DLM_LKF_NOQUEUE; |
| 1961 | 2066 | ||
| 1962 | status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); | 2067 | status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); |
| 1963 | if (status < 0) { | 2068 | if (status < 0) { |
| @@ -1974,8 +2079,7 @@ int ocfs2_inode_lock_full(struct inode *inode, | |||
| 1974 | * committed to owning this lock so we don't allow signals to | 2079 | * committed to owning this lock so we don't allow signals to |
| 1975 | * abort the operation. */ | 2080 | * abort the operation. */ |
| 1976 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) | 2081 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) |
| 1977 | wait_event(osb->recovery_event, | 2082 | ocfs2_wait_for_recovery(osb); |
| 1978 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | ||
| 1979 | 2083 | ||
| 1980 | local: | 2084 | local: |
| 1981 | /* | 2085 | /* |
| @@ -2109,7 +2213,7 @@ int ocfs2_inode_lock_atime(struct inode *inode, | |||
| 2109 | void ocfs2_inode_unlock(struct inode *inode, | 2213 | void ocfs2_inode_unlock(struct inode *inode, |
| 2110 | int ex) | 2214 | int ex) |
| 2111 | { | 2215 | { |
| 2112 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2216 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 2113 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; | 2217 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; |
| 2114 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2218 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 2115 | 2219 | ||
| @@ -2130,10 +2234,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb, | |||
| 2130 | int ex) | 2234 | int ex) |
| 2131 | { | 2235 | { |
| 2132 | int status = 0; | 2236 | int status = 0; |
| 2133 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2237 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 2134 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; | 2238 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; |
| 2135 | struct buffer_head *bh; | ||
| 2136 | struct ocfs2_slot_info *si = osb->slot_info; | ||
| 2137 | 2239 | ||
| 2138 | mlog_entry_void(); | 2240 | mlog_entry_void(); |
| 2139 | 2241 | ||
| @@ -2159,11 +2261,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, | |||
| 2159 | goto bail; | 2261 | goto bail; |
| 2160 | } | 2262 | } |
| 2161 | if (status) { | 2263 | if (status) { |
| 2162 | bh = si->si_bh; | 2264 | status = ocfs2_refresh_slot_info(osb); |
| 2163 | status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, | ||
| 2164 | si->si_inode); | ||
| 2165 | if (status == 0) | ||
| 2166 | ocfs2_update_slot_info(si); | ||
| 2167 | 2265 | ||
| 2168 | ocfs2_complete_lock_res_refresh(lockres, status); | 2266 | ocfs2_complete_lock_res_refresh(lockres, status); |
| 2169 | 2267 | ||
| @@ -2178,7 +2276,7 @@ bail: | |||
| 2178 | void ocfs2_super_unlock(struct ocfs2_super *osb, | 2276 | void ocfs2_super_unlock(struct ocfs2_super *osb, |
| 2179 | int ex) | 2277 | int ex) |
| 2180 | { | 2278 | { |
| 2181 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2279 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 2182 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; | 2280 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; |
| 2183 | 2281 | ||
| 2184 | if (!ocfs2_mount_local(osb)) | 2282 | if (!ocfs2_mount_local(osb)) |
| @@ -2196,7 +2294,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb) | |||
| 2196 | if (ocfs2_mount_local(osb)) | 2294 | if (ocfs2_mount_local(osb)) |
| 2197 | return 0; | 2295 | return 0; |
| 2198 | 2296 | ||
| 2199 | status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); | 2297 | status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0); |
| 2200 | if (status < 0) | 2298 | if (status < 0) |
| 2201 | mlog_errno(status); | 2299 | mlog_errno(status); |
| 2202 | 2300 | ||
| @@ -2208,13 +2306,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb) | |||
| 2208 | struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; | 2306 | struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; |
| 2209 | 2307 | ||
| 2210 | if (!ocfs2_mount_local(osb)) | 2308 | if (!ocfs2_mount_local(osb)) |
| 2211 | ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); | 2309 | ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); |
| 2212 | } | 2310 | } |
| 2213 | 2311 | ||
| 2214 | int ocfs2_dentry_lock(struct dentry *dentry, int ex) | 2312 | int ocfs2_dentry_lock(struct dentry *dentry, int ex) |
| 2215 | { | 2313 | { |
| 2216 | int ret; | 2314 | int ret; |
| 2217 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2315 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 2218 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; | 2316 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; |
| 2219 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); | 2317 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); |
| 2220 | 2318 | ||
| @@ -2235,7 +2333,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex) | |||
| 2235 | 2333 | ||
| 2236 | void ocfs2_dentry_unlock(struct dentry *dentry, int ex) | 2334 | void ocfs2_dentry_unlock(struct dentry *dentry, int ex) |
| 2237 | { | 2335 | { |
| 2238 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2336 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
| 2239 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; | 2337 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; |
| 2240 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); | 2338 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); |
| 2241 | 2339 | ||
| @@ -2400,7 +2498,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) | |||
| 2400 | lockres->l_blocking); | 2498 | lockres->l_blocking); |
| 2401 | 2499 | ||
| 2402 | /* Dump the raw LVB */ | 2500 | /* Dump the raw LVB */ |
| 2403 | lvb = lockres->l_lksb.lvb; | 2501 | lvb = ocfs2_dlm_lvb(&lockres->l_lksb); |
| 2404 | for(i = 0; i < DLM_LVB_LEN; i++) | 2502 | for(i = 0; i < DLM_LVB_LEN; i++) |
| 2405 | seq_printf(m, "0x%x\t", lvb[i]); | 2503 | seq_printf(m, "0x%x\t", lvb[i]); |
| 2406 | 2504 | ||
| @@ -2504,13 +2602,14 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) | |||
| 2504 | int ocfs2_dlm_init(struct ocfs2_super *osb) | 2602 | int ocfs2_dlm_init(struct ocfs2_super *osb) |
| 2505 | { | 2603 | { |
| 2506 | int status = 0; | 2604 | int status = 0; |
| 2507 | u32 dlm_key; | 2605 | struct ocfs2_cluster_connection *conn = NULL; |
| 2508 | struct dlm_ctxt *dlm = NULL; | ||
| 2509 | 2606 | ||
| 2510 | mlog_entry_void(); | 2607 | mlog_entry_void(); |
| 2511 | 2608 | ||
| 2512 | if (ocfs2_mount_local(osb)) | 2609 | if (ocfs2_mount_local(osb)) { |
| 2610 | osb->node_num = 0; | ||
| 2513 | goto local; | 2611 | goto local; |
| 2612 | } | ||
| 2514 | 2613 | ||
| 2515 | status = ocfs2_dlm_init_debug(osb); | 2614 | status = ocfs2_dlm_init_debug(osb); |
| 2516 | if (status < 0) { | 2615 | if (status < 0) { |
| @@ -2527,26 +2626,31 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) | |||
| 2527 | goto bail; | 2626 | goto bail; |
| 2528 | } | 2627 | } |
| 2529 | 2628 | ||
| 2530 | /* used by the dlm code to make message headers unique, each | ||
| 2531 | * node in this domain must agree on this. */ | ||
| 2532 | dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str)); | ||
| 2533 | |||
| 2534 | /* for now, uuid == domain */ | 2629 | /* for now, uuid == domain */ |
| 2535 | dlm = dlm_register_domain(osb->uuid_str, dlm_key, | 2630 | status = ocfs2_cluster_connect(osb->osb_cluster_stack, |
| 2536 | &osb->osb_locking_proto); | 2631 | osb->uuid_str, |
| 2537 | if (IS_ERR(dlm)) { | 2632 | strlen(osb->uuid_str), |
| 2538 | status = PTR_ERR(dlm); | 2633 | ocfs2_do_node_down, osb, |
| 2634 | &conn); | ||
| 2635 | if (status) { | ||
| 2539 | mlog_errno(status); | 2636 | mlog_errno(status); |
| 2540 | goto bail; | 2637 | goto bail; |
| 2541 | } | 2638 | } |
| 2542 | 2639 | ||
| 2543 | dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); | 2640 | status = ocfs2_cluster_this_node(&osb->node_num); |
| 2641 | if (status < 0) { | ||
| 2642 | mlog_errno(status); | ||
| 2643 | mlog(ML_ERROR, | ||
| 2644 | "could not find this host's node number\n"); | ||
| 2645 | ocfs2_cluster_disconnect(conn, 0); | ||
| 2646 | goto bail; | ||
| 2647 | } | ||
| 2544 | 2648 | ||
| 2545 | local: | 2649 | local: |
| 2546 | ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); | 2650 | ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); |
| 2547 | ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); | 2651 | ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); |
| 2548 | 2652 | ||
| 2549 | osb->dlm = dlm; | 2653 | osb->cconn = conn; |
| 2550 | 2654 | ||
| 2551 | status = 0; | 2655 | status = 0; |
| 2552 | bail: | 2656 | bail: |
| @@ -2560,14 +2664,19 @@ bail: | |||
| 2560 | return status; | 2664 | return status; |
| 2561 | } | 2665 | } |
| 2562 | 2666 | ||
| 2563 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb) | 2667 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb, |
| 2668 | int hangup_pending) | ||
| 2564 | { | 2669 | { |
| 2565 | mlog_entry_void(); | 2670 | mlog_entry_void(); |
| 2566 | 2671 | ||
| 2567 | dlm_unregister_eviction_cb(&osb->osb_eviction_cb); | ||
| 2568 | |||
| 2569 | ocfs2_drop_osb_locks(osb); | 2672 | ocfs2_drop_osb_locks(osb); |
| 2570 | 2673 | ||
| 2674 | /* | ||
| 2675 | * Now that we have dropped all locks and ocfs2_dismount_volume() | ||
| 2676 | * has disabled recovery, the DLM won't be talking to us. It's | ||
| 2677 | * safe to tear things down before disconnecting the cluster. | ||
| 2678 | */ | ||
| 2679 | |||
| 2571 | if (osb->dc_task) { | 2680 | if (osb->dc_task) { |
| 2572 | kthread_stop(osb->dc_task); | 2681 | kthread_stop(osb->dc_task); |
| 2573 | osb->dc_task = NULL; | 2682 | osb->dc_task = NULL; |
| @@ -2576,15 +2685,15 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb) | |||
| 2576 | ocfs2_lock_res_free(&osb->osb_super_lockres); | 2685 | ocfs2_lock_res_free(&osb->osb_super_lockres); |
| 2577 | ocfs2_lock_res_free(&osb->osb_rename_lockres); | 2686 | ocfs2_lock_res_free(&osb->osb_rename_lockres); |
| 2578 | 2687 | ||
| 2579 | dlm_unregister_domain(osb->dlm); | 2688 | ocfs2_cluster_disconnect(osb->cconn, hangup_pending); |
| 2580 | osb->dlm = NULL; | 2689 | osb->cconn = NULL; |
| 2581 | 2690 | ||
| 2582 | ocfs2_dlm_shutdown_debug(osb); | 2691 | ocfs2_dlm_shutdown_debug(osb); |
| 2583 | 2692 | ||
| 2584 | mlog_exit_void(); | 2693 | mlog_exit_void(); |
| 2585 | } | 2694 | } |
| 2586 | 2695 | ||
| 2587 | static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) | 2696 | static void ocfs2_unlock_ast(void *opaque, int error) |
| 2588 | { | 2697 | { |
| 2589 | struct ocfs2_lock_res *lockres = opaque; | 2698 | struct ocfs2_lock_res *lockres = opaque; |
| 2590 | unsigned long flags; | 2699 | unsigned long flags; |
| @@ -2595,24 +2704,9 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) | |||
| 2595 | lockres->l_unlock_action); | 2704 | lockres->l_unlock_action); |
| 2596 | 2705 | ||
| 2597 | spin_lock_irqsave(&lockres->l_lock, flags); | 2706 | spin_lock_irqsave(&lockres->l_lock, flags); |
| 2598 | /* We tried to cancel a convert request, but it was already | 2707 | if (error) { |
| 2599 | * granted. All we want to do here is clear our unlock | 2708 | mlog(ML_ERROR, "Dlm passes error %d for lock %s, " |
| 2600 | * state. The wake_up call done at the bottom is redundant | 2709 | "unlock_action %d\n", error, lockres->l_name, |
| 2601 | * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't | ||
| 2602 | * hurt anything anyway */ | ||
| 2603 | if (status == DLM_CANCELGRANT && | ||
| 2604 | lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { | ||
| 2605 | mlog(0, "Got cancelgrant for %s\n", lockres->l_name); | ||
| 2606 | |||
| 2607 | /* We don't clear the busy flag in this case as it | ||
| 2608 | * should have been cleared by the ast which the dlm | ||
| 2609 | * has called. */ | ||
| 2610 | goto complete_unlock; | ||
| 2611 | } | ||
| 2612 | |||
| 2613 | if (status != DLM_NORMAL) { | ||
| 2614 | mlog(ML_ERROR, "Dlm passes status %d for lock %s, " | ||
| 2615 | "unlock_action %d\n", status, lockres->l_name, | ||
| 2616 | lockres->l_unlock_action); | 2710 | lockres->l_unlock_action); |
| 2617 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 2711 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 2618 | return; | 2712 | return; |
| @@ -2624,14 +2718,13 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) | |||
| 2624 | lockres->l_action = OCFS2_AST_INVALID; | 2718 | lockres->l_action = OCFS2_AST_INVALID; |
| 2625 | break; | 2719 | break; |
| 2626 | case OCFS2_UNLOCK_DROP_LOCK: | 2720 | case OCFS2_UNLOCK_DROP_LOCK: |
| 2627 | lockres->l_level = LKM_IVMODE; | 2721 | lockres->l_level = DLM_LOCK_IV; |
| 2628 | break; | 2722 | break; |
| 2629 | default: | 2723 | default: |
| 2630 | BUG(); | 2724 | BUG(); |
| 2631 | } | 2725 | } |
| 2632 | 2726 | ||
| 2633 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | 2727 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); |
| 2634 | complete_unlock: | ||
| 2635 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; | 2728 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; |
| 2636 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 2729 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 2637 | 2730 | ||
| @@ -2643,16 +2736,16 @@ complete_unlock: | |||
| 2643 | static int ocfs2_drop_lock(struct ocfs2_super *osb, | 2736 | static int ocfs2_drop_lock(struct ocfs2_super *osb, |
| 2644 | struct ocfs2_lock_res *lockres) | 2737 | struct ocfs2_lock_res *lockres) |
| 2645 | { | 2738 | { |
| 2646 | enum dlm_status status; | 2739 | int ret; |
| 2647 | unsigned long flags; | 2740 | unsigned long flags; |
| 2648 | int lkm_flags = 0; | 2741 | u32 lkm_flags = 0; |
| 2649 | 2742 | ||
| 2650 | /* We didn't get anywhere near actually using this lockres. */ | 2743 | /* We didn't get anywhere near actually using this lockres. */ |
| 2651 | if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) | 2744 | if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) |
| 2652 | goto out; | 2745 | goto out; |
| 2653 | 2746 | ||
| 2654 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) | 2747 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) |
| 2655 | lkm_flags |= LKM_VALBLK; | 2748 | lkm_flags |= DLM_LKF_VALBLK; |
| 2656 | 2749 | ||
| 2657 | spin_lock_irqsave(&lockres->l_lock, flags); | 2750 | spin_lock_irqsave(&lockres->l_lock, flags); |
| 2658 | 2751 | ||
| @@ -2678,7 +2771,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, | |||
| 2678 | 2771 | ||
| 2679 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { | 2772 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { |
| 2680 | if (lockres->l_flags & OCFS2_LOCK_ATTACHED && | 2773 | if (lockres->l_flags & OCFS2_LOCK_ATTACHED && |
| 2681 | lockres->l_level == LKM_EXMODE && | 2774 | lockres->l_level == DLM_LOCK_EX && |
| 2682 | !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) | 2775 | !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) |
| 2683 | lockres->l_ops->set_lvb(lockres); | 2776 | lockres->l_ops->set_lvb(lockres); |
| 2684 | } | 2777 | } |
| @@ -2707,15 +2800,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, | |||
| 2707 | 2800 | ||
| 2708 | mlog(0, "lock %s\n", lockres->l_name); | 2801 | mlog(0, "lock %s\n", lockres->l_name); |
| 2709 | 2802 | ||
| 2710 | status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags, | 2803 | ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, |
| 2711 | ocfs2_unlock_ast, lockres); | 2804 | lockres); |
| 2712 | if (status != DLM_NORMAL) { | 2805 | if (ret) { |
| 2713 | ocfs2_log_dlm_error("dlmunlock", status, lockres); | 2806 | ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); |
| 2714 | mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); | 2807 | mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); |
| 2715 | dlm_print_one_lock(lockres->l_lksb.lockid); | 2808 | ocfs2_dlm_dump_lksb(&lockres->l_lksb); |
| 2716 | BUG(); | 2809 | BUG(); |
| 2717 | } | 2810 | } |
| 2718 | mlog(0, "lock %s, successfull return from dlmunlock\n", | 2811 | mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", |
| 2719 | lockres->l_name); | 2812 | lockres->l_name); |
| 2720 | 2813 | ||
| 2721 | ocfs2_wait_on_busy_lock(lockres); | 2814 | ocfs2_wait_on_busy_lock(lockres); |
| @@ -2806,15 +2899,15 @@ int ocfs2_drop_inode_locks(struct inode *inode) | |||
| 2806 | return status; | 2899 | return status; |
| 2807 | } | 2900 | } |
| 2808 | 2901 | ||
| 2809 | static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, | 2902 | static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, |
| 2810 | int new_level) | 2903 | int new_level) |
| 2811 | { | 2904 | { |
| 2812 | assert_spin_locked(&lockres->l_lock); | 2905 | assert_spin_locked(&lockres->l_lock); |
| 2813 | 2906 | ||
| 2814 | BUG_ON(lockres->l_blocking <= LKM_NLMODE); | 2907 | BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); |
| 2815 | 2908 | ||
| 2816 | if (lockres->l_level <= new_level) { | 2909 | if (lockres->l_level <= new_level) { |
| 2817 | mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", | 2910 | mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", |
| 2818 | lockres->l_level, new_level); | 2911 | lockres->l_level, new_level); |
| 2819 | BUG(); | 2912 | BUG(); |
| 2820 | } | 2913 | } |
| @@ -2825,33 +2918,33 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, | |||
| 2825 | lockres->l_action = OCFS2_AST_DOWNCONVERT; | 2918 | lockres->l_action = OCFS2_AST_DOWNCONVERT; |
| 2826 | lockres->l_requested = new_level; | 2919 | lockres->l_requested = new_level; |
| 2827 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | 2920 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); |
| 2921 | return lockres_set_pending(lockres); | ||
| 2828 | } | 2922 | } |
| 2829 | 2923 | ||
| 2830 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, | 2924 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, |
| 2831 | struct ocfs2_lock_res *lockres, | 2925 | struct ocfs2_lock_res *lockres, |
| 2832 | int new_level, | 2926 | int new_level, |
| 2833 | int lvb) | 2927 | int lvb, |
| 2928 | unsigned int generation) | ||
| 2834 | { | 2929 | { |
| 2835 | int ret, dlm_flags = LKM_CONVERT; | 2930 | int ret; |
| 2836 | enum dlm_status status; | 2931 | u32 dlm_flags = DLM_LKF_CONVERT; |
| 2837 | 2932 | ||
| 2838 | mlog_entry_void(); | 2933 | mlog_entry_void(); |
| 2839 | 2934 | ||
| 2840 | if (lvb) | 2935 | if (lvb) |
| 2841 | dlm_flags |= LKM_VALBLK; | 2936 | dlm_flags |= DLM_LKF_VALBLK; |
| 2842 | 2937 | ||
| 2843 | status = dlmlock(osb->dlm, | 2938 | ret = ocfs2_dlm_lock(osb->cconn, |
| 2844 | new_level, | 2939 | new_level, |
| 2845 | &lockres->l_lksb, | 2940 | &lockres->l_lksb, |
| 2846 | dlm_flags, | 2941 | dlm_flags, |
| 2847 | lockres->l_name, | 2942 | lockres->l_name, |
| 2848 | OCFS2_LOCK_ID_MAX_LEN - 1, | 2943 | OCFS2_LOCK_ID_MAX_LEN - 1, |
| 2849 | ocfs2_locking_ast, | 2944 | lockres); |
| 2850 | lockres, | 2945 | lockres_clear_pending(lockres, generation, osb); |
| 2851 | ocfs2_blocking_ast); | 2946 | if (ret) { |
| 2852 | if (status != DLM_NORMAL) { | 2947 | ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); |
| 2853 | ocfs2_log_dlm_error("dlmlock", status, lockres); | ||
| 2854 | ret = -EINVAL; | ||
| 2855 | ocfs2_recover_from_dlm_error(lockres, 1); | 2948 | ocfs2_recover_from_dlm_error(lockres, 1); |
| 2856 | goto bail; | 2949 | goto bail; |
| 2857 | } | 2950 | } |
| @@ -2862,7 +2955,7 @@ bail: | |||
| 2862 | return ret; | 2955 | return ret; |
| 2863 | } | 2956 | } |
| 2864 | 2957 | ||
| 2865 | /* returns 1 when the caller should unlock and call dlmunlock */ | 2958 | /* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */ |
| 2866 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, | 2959 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, |
| 2867 | struct ocfs2_lock_res *lockres) | 2960 | struct ocfs2_lock_res *lockres) |
| 2868 | { | 2961 | { |
| @@ -2898,24 +2991,18 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, | |||
| 2898 | struct ocfs2_lock_res *lockres) | 2991 | struct ocfs2_lock_res *lockres) |
| 2899 | { | 2992 | { |
| 2900 | int ret; | 2993 | int ret; |
| 2901 | enum dlm_status status; | ||
| 2902 | 2994 | ||
| 2903 | mlog_entry_void(); | 2995 | mlog_entry_void(); |
| 2904 | mlog(0, "lock %s\n", lockres->l_name); | 2996 | mlog(0, "lock %s\n", lockres->l_name); |
| 2905 | 2997 | ||
| 2906 | ret = 0; | 2998 | ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, |
| 2907 | status = dlmunlock(osb->dlm, | 2999 | DLM_LKF_CANCEL, lockres); |
| 2908 | &lockres->l_lksb, | 3000 | if (ret) { |
| 2909 | LKM_CANCEL, | 3001 | ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); |
| 2910 | ocfs2_unlock_ast, | ||
| 2911 | lockres); | ||
| 2912 | if (status != DLM_NORMAL) { | ||
| 2913 | ocfs2_log_dlm_error("dlmunlock", status, lockres); | ||
| 2914 | ret = -EINVAL; | ||
| 2915 | ocfs2_recover_from_dlm_error(lockres, 0); | 3002 | ocfs2_recover_from_dlm_error(lockres, 0); |
| 2916 | } | 3003 | } |
| 2917 | 3004 | ||
| 2918 | mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); | 3005 | mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); |
| 2919 | 3006 | ||
| 2920 | mlog_exit(ret); | 3007 | mlog_exit(ret); |
| 2921 | return ret; | 3008 | return ret; |
| @@ -2930,6 +3017,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb, | |||
| 2930 | int new_level; | 3017 | int new_level; |
| 2931 | int ret = 0; | 3018 | int ret = 0; |
| 2932 | int set_lvb = 0; | 3019 | int set_lvb = 0; |
| 3020 | unsigned int gen; | ||
| 2933 | 3021 | ||
| 2934 | mlog_entry_void(); | 3022 | mlog_entry_void(); |
| 2935 | 3023 | ||
| @@ -2939,6 +3027,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb, | |||
| 2939 | 3027 | ||
| 2940 | recheck: | 3028 | recheck: |
| 2941 | if (lockres->l_flags & OCFS2_LOCK_BUSY) { | 3029 | if (lockres->l_flags & OCFS2_LOCK_BUSY) { |
| 3030 | /* XXX | ||
| 3031 | * This is a *big* race. The OCFS2_LOCK_PENDING flag | ||
| 3032 | * exists entirely for one reason - another thread has set | ||
| 3033 | * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock(). | ||
| 3034 | * | ||
| 3035 | * If we do ocfs2_cancel_convert() before the other thread | ||
| 3036 | * calls dlm_lock(), our cancel will do nothing. We will | ||
| 3037 | * get no ast, and we will have no way of knowing the | ||
| 3038 | * cancel failed. Meanwhile, the other thread will call | ||
| 3039 | * into dlm_lock() and wait...forever. | ||
| 3040 | * | ||
| 3041 | * Why forever? Because another node has asked for the | ||
| 3042 | * lock first; that's why we're here in unblock_lock(). | ||
| 3043 | * | ||
| 3044 | * The solution is OCFS2_LOCK_PENDING. When PENDING is | ||
| 3045 | * set, we just requeue the unblock. Only when the other | ||
| 3046 | * thread has called dlm_lock() and cleared PENDING will | ||
| 3047 | * we then cancel their request. | ||
| 3048 | * | ||
| 3049 | * All callers of dlm_lock() must set OCFS2_DLM_PENDING | ||
| 3050 | * at the same time they set OCFS2_DLM_BUSY. They must | ||
| 3051 | * clear OCFS2_DLM_PENDING after dlm_lock() returns. | ||
| 3052 | */ | ||
| 3053 | if (lockres->l_flags & OCFS2_LOCK_PENDING) | ||
| 3054 | goto leave_requeue; | ||
| 3055 | |||
| 2942 | ctl->requeue = 1; | 3056 | ctl->requeue = 1; |
| 2943 | ret = ocfs2_prepare_cancel_convert(osb, lockres); | 3057 | ret = ocfs2_prepare_cancel_convert(osb, lockres); |
| 2944 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 3058 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| @@ -2952,13 +3066,13 @@ recheck: | |||
| 2952 | 3066 | ||
| 2953 | /* if we're blocking an exclusive and we have *any* holders, | 3067 | /* if we're blocking an exclusive and we have *any* holders, |
| 2954 | * then requeue. */ | 3068 | * then requeue. */ |
| 2955 | if ((lockres->l_blocking == LKM_EXMODE) | 3069 | if ((lockres->l_blocking == DLM_LOCK_EX) |
| 2956 | && (lockres->l_ex_holders || lockres->l_ro_holders)) | 3070 | && (lockres->l_ex_holders || lockres->l_ro_holders)) |
| 2957 | goto leave_requeue; | 3071 | goto leave_requeue; |
| 2958 | 3072 | ||
| 2959 | /* If it's a PR we're blocking, then only | 3073 | /* If it's a PR we're blocking, then only |
| 2960 | * requeue if we've got any EX holders */ | 3074 | * requeue if we've got any EX holders */ |
| 2961 | if (lockres->l_blocking == LKM_PRMODE && | 3075 | if (lockres->l_blocking == DLM_LOCK_PR && |
| 2962 | lockres->l_ex_holders) | 3076 | lockres->l_ex_holders) |
| 2963 | goto leave_requeue; | 3077 | goto leave_requeue; |
| 2964 | 3078 | ||
| @@ -3005,7 +3119,7 @@ downconvert: | |||
| 3005 | ctl->requeue = 0; | 3119 | ctl->requeue = 0; |
| 3006 | 3120 | ||
| 3007 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { | 3121 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { |
| 3008 | if (lockres->l_level == LKM_EXMODE) | 3122 | if (lockres->l_level == DLM_LOCK_EX) |
| 3009 | set_lvb = 1; | 3123 | set_lvb = 1; |
| 3010 | 3124 | ||
| 3011 | /* | 3125 | /* |
| @@ -3018,9 +3132,11 @@ downconvert: | |||
| 3018 | lockres->l_ops->set_lvb(lockres); | 3132 | lockres->l_ops->set_lvb(lockres); |
| 3019 | } | 3133 | } |
| 3020 | 3134 | ||
| 3021 | ocfs2_prepare_downconvert(lockres, new_level); | 3135 | gen = ocfs2_prepare_downconvert(lockres, new_level); |
| 3022 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 3136 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
| 3023 | ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); | 3137 | ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb, |
| 3138 | gen); | ||
| 3139 | |||
| 3024 | leave: | 3140 | leave: |
| 3025 | mlog_exit(ret); | 3141 | mlog_exit(ret); |
| 3026 | return ret; | 3142 | return ret; |
| @@ -3059,7 +3175,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, | |||
| 3059 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 3175 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
| 3060 | } | 3176 | } |
| 3061 | sync_mapping_buffers(mapping); | 3177 | sync_mapping_buffers(mapping); |
| 3062 | if (blocking == LKM_EXMODE) { | 3178 | if (blocking == DLM_LOCK_EX) { |
| 3063 | truncate_inode_pages(mapping, 0); | 3179 | truncate_inode_pages(mapping, 0); |
| 3064 | } else { | 3180 | } else { |
| 3065 | /* We only need to wait on the I/O if we're not also | 3181 | /* We only need to wait on the I/O if we're not also |
| @@ -3080,8 +3196,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, | |||
| 3080 | struct inode *inode = ocfs2_lock_res_inode(lockres); | 3196 | struct inode *inode = ocfs2_lock_res_inode(lockres); |
| 3081 | int checkpointed = ocfs2_inode_fully_checkpointed(inode); | 3197 | int checkpointed = ocfs2_inode_fully_checkpointed(inode); |
| 3082 | 3198 | ||
| 3083 | BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); | 3199 | BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); |
| 3084 | BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed); | 3200 | BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); |
| 3085 | 3201 | ||
| 3086 | if (checkpointed) | 3202 | if (checkpointed) |
| 3087 | return 1; | 3203 | return 1; |
| @@ -3145,7 +3261,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, | |||
| 3145 | * valid. The downconvert code will retain a PR for this node, | 3261 | * valid. The downconvert code will retain a PR for this node, |
| 3146 | * so there's no further work to do. | 3262 | * so there's no further work to do. |
| 3147 | */ | 3263 | */ |
| 3148 | if (blocking == LKM_PRMODE) | 3264 | if (blocking == DLM_LOCK_PR) |
| 3149 | return UNBLOCK_CONTINUE; | 3265 | return UNBLOCK_CONTINUE; |
| 3150 | 3266 | ||
| 3151 | /* | 3267 | /* |
| @@ -3219,6 +3335,45 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, | |||
| 3219 | return UNBLOCK_CONTINUE_POST; | 3335 | return UNBLOCK_CONTINUE_POST; |
| 3220 | } | 3336 | } |
| 3221 | 3337 | ||
| 3338 | /* | ||
| 3339 | * This is the filesystem locking protocol. It provides the lock handling | ||
| 3340 | * hooks for the underlying DLM. It has a maximum version number. | ||
| 3341 | * The version number allows interoperability with systems running at | ||
| 3342 | * the same major number and an equal or smaller minor number. | ||
| 3343 | * | ||
| 3344 | * Whenever the filesystem does new things with locks (adds or removes a | ||
| 3345 | * lock, orders them differently, does different things underneath a lock), | ||
| 3346 | * the version must be changed. The protocol is negotiated when joining | ||
| 3347 | * the dlm domain. A node may join the domain if its major version is | ||
| 3348 | * identical to all other nodes and its minor version is greater than | ||
| 3349 | * or equal to all other nodes. When its minor version is greater than | ||
| 3350 | * the other nodes, it will run at the minor version specified by the | ||
| 3351 | * other nodes. | ||
| 3352 | * | ||
| 3353 | * If a locking change is made that will not be compatible with older | ||
| 3354 | * versions, the major number must be increased and the minor version set | ||
| 3355 | * to zero. If a change merely adds a behavior that can be disabled when | ||
| 3356 | * speaking to older versions, the minor version must be increased. If a | ||
| 3357 | * change adds a fully backwards compatible change (eg, LVB changes that | ||
| 3358 | * are just ignored by older versions), the version does not need to be | ||
| 3359 | * updated. | ||
| 3360 | */ | ||
| 3361 | static struct ocfs2_locking_protocol lproto = { | ||
| 3362 | .lp_max_version = { | ||
| 3363 | .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, | ||
| 3364 | .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, | ||
| 3365 | }, | ||
| 3366 | .lp_lock_ast = ocfs2_locking_ast, | ||
| 3367 | .lp_blocking_ast = ocfs2_blocking_ast, | ||
| 3368 | .lp_unlock_ast = ocfs2_unlock_ast, | ||
| 3369 | }; | ||
| 3370 | |||
| 3371 | void ocfs2_set_locking_protocol(void) | ||
| 3372 | { | ||
| 3373 | ocfs2_stack_glue_set_locking_protocol(&lproto); | ||
| 3374 | } | ||
| 3375 | |||
| 3376 | |||
| 3222 | static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, | 3377 | static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, |
| 3223 | struct ocfs2_lock_res *lockres) | 3378 | struct ocfs2_lock_res *lockres) |
| 3224 | { | 3379 | { |
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index e3cf902404b4..2bb01f09c1b1 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h | |||
| @@ -58,7 +58,7 @@ struct ocfs2_meta_lvb { | |||
| 58 | #define OCFS2_LOCK_NONBLOCK (0x04) | 58 | #define OCFS2_LOCK_NONBLOCK (0x04) |
| 59 | 59 | ||
| 60 | int ocfs2_dlm_init(struct ocfs2_super *osb); | 60 | int ocfs2_dlm_init(struct ocfs2_super *osb); |
| 61 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb); | 61 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending); |
| 62 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); | 62 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); |
| 63 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | 63 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, |
| 64 | enum ocfs2_lock_type type, | 64 | enum ocfs2_lock_type type, |
| @@ -114,5 +114,6 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb); | |||
| 114 | struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); | 114 | struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); |
| 115 | void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); | 115 | void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); |
| 116 | 116 | ||
| 117 | extern const struct dlm_protocol_version ocfs2_locking_protocol; | 117 | /* To set the locking protocol on module initialization */ |
| 118 | void ocfs2_set_locking_protocol(void); | ||
| 118 | #endif /* DLMGLUE_H */ | 119 | #endif /* DLMGLUE_H */ |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ed5d5232e85d..9154c82d3258 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
| @@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = { | |||
| 2242 | .open = ocfs2_file_open, | 2242 | .open = ocfs2_file_open, |
| 2243 | .aio_read = ocfs2_file_aio_read, | 2243 | .aio_read = ocfs2_file_aio_read, |
| 2244 | .aio_write = ocfs2_file_aio_write, | 2244 | .aio_write = ocfs2_file_aio_write, |
| 2245 | .ioctl = ocfs2_ioctl, | 2245 | .unlocked_ioctl = ocfs2_ioctl, |
| 2246 | #ifdef CONFIG_COMPAT | 2246 | #ifdef CONFIG_COMPAT |
| 2247 | .compat_ioctl = ocfs2_compat_ioctl, | 2247 | .compat_ioctl = ocfs2_compat_ioctl, |
| 2248 | #endif | 2248 | #endif |
| @@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = { | |||
| 2258 | .fsync = ocfs2_sync_file, | 2258 | .fsync = ocfs2_sync_file, |
| 2259 | .release = ocfs2_dir_release, | 2259 | .release = ocfs2_dir_release, |
| 2260 | .open = ocfs2_dir_open, | 2260 | .open = ocfs2_dir_open, |
| 2261 | .ioctl = ocfs2_ioctl, | 2261 | .unlocked_ioctl = ocfs2_ioctl, |
| 2262 | #ifdef CONFIG_COMPAT | 2262 | #ifdef CONFIG_COMPAT |
| 2263 | .compat_ioctl = ocfs2_compat_ioctl, | 2263 | .compat_ioctl = ocfs2_compat_ioctl, |
| 2264 | #endif | 2264 | #endif |
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 0758daf64da0..c6e7213db868 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c | |||
| @@ -28,9 +28,6 @@ | |||
| 28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
| 29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
| 30 | #include <linux/highmem.h> | 30 | #include <linux/highmem.h> |
| 31 | #include <linux/kmod.h> | ||
| 32 | |||
| 33 | #include <dlm/dlmapi.h> | ||
| 34 | 31 | ||
| 35 | #define MLOG_MASK_PREFIX ML_SUPER | 32 | #define MLOG_MASK_PREFIX ML_SUPER |
| 36 | #include <cluster/masklog.h> | 33 | #include <cluster/masklog.h> |
| @@ -48,7 +45,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, | |||
| 48 | int bit); | 45 | int bit); |
| 49 | static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, | 46 | static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, |
| 50 | int bit); | 47 | int bit); |
| 51 | static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map); | ||
| 52 | 48 | ||
| 53 | /* special case -1 for now | 49 | /* special case -1 for now |
| 54 | * TODO: should *really* make sure the calling func never passes -1!! */ | 50 | * TODO: should *really* make sure the calling func never passes -1!! */ |
| @@ -62,23 +58,23 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map) | |||
| 62 | void ocfs2_init_node_maps(struct ocfs2_super *osb) | 58 | void ocfs2_init_node_maps(struct ocfs2_super *osb) |
| 63 | { | 59 | { |
| 64 | spin_lock_init(&osb->node_map_lock); | 60 | spin_lock_init(&osb->node_map_lock); |
| 65 | ocfs2_node_map_init(&osb->recovery_map); | ||
| 66 | ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); | 61 | ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); |
| 67 | } | 62 | } |
| 68 | 63 | ||
| 69 | static void ocfs2_do_node_down(int node_num, | 64 | void ocfs2_do_node_down(int node_num, void *data) |
| 70 | struct ocfs2_super *osb) | ||
| 71 | { | 65 | { |
| 66 | struct ocfs2_super *osb = data; | ||
| 67 | |||
| 72 | BUG_ON(osb->node_num == node_num); | 68 | BUG_ON(osb->node_num == node_num); |
| 73 | 69 | ||
| 74 | mlog(0, "ocfs2: node down event for %d\n", node_num); | 70 | mlog(0, "ocfs2: node down event for %d\n", node_num); |
| 75 | 71 | ||
| 76 | if (!osb->dlm) { | 72 | if (!osb->cconn) { |
| 77 | /* | 73 | /* |
| 78 | * No DLM means we're not even ready to participate yet. | 74 | * No cluster connection means we're not even ready to |
| 79 | * We check the slots after the DLM comes up, so we will | 75 | * participate yet. We check the slots after the cluster |
| 80 | * notice the node death then. We can safely ignore it | 76 | * comes up, so we will notice the node death then. We |
| 81 | * here. | 77 | * can safely ignore it here. |
| 82 | */ | 78 | */ |
| 83 | return; | 79 | return; |
| 84 | } | 80 | } |
| @@ -86,61 +82,6 @@ static void ocfs2_do_node_down(int node_num, | |||
| 86 | ocfs2_recovery_thread(osb, node_num); | 82 | ocfs2_recovery_thread(osb, node_num); |
| 87 | } | 83 | } |
| 88 | 84 | ||
| 89 | /* Called from the dlm when it's about to evict a node. We may also | ||
| 90 | * get a heartbeat callback later. */ | ||
| 91 | static void ocfs2_dlm_eviction_cb(int node_num, | ||
| 92 | void *data) | ||
| 93 | { | ||
| 94 | struct ocfs2_super *osb = (struct ocfs2_super *) data; | ||
| 95 | struct super_block *sb = osb->sb; | ||
| 96 | |||
| 97 | mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n", | ||
| 98 | MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num); | ||
| 99 | |||
| 100 | ocfs2_do_node_down(node_num, osb); | ||
| 101 | } | ||
| 102 | |||
| 103 | void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) | ||
| 104 | { | ||
| 105 | /* Not exactly a heartbeat callback, but leads to essentially | ||
| 106 | * the same path so we set it up here. */ | ||
| 107 | dlm_setup_eviction_cb(&osb->osb_eviction_cb, | ||
| 108 | ocfs2_dlm_eviction_cb, | ||
| 109 | osb); | ||
| 110 | } | ||
| 111 | |||
| 112 | void ocfs2_stop_heartbeat(struct ocfs2_super *osb) | ||
| 113 | { | ||
| 114 | int ret; | ||
| 115 | char *argv[5], *envp[3]; | ||
| 116 | |||
| 117 | if (ocfs2_mount_local(osb)) | ||
| 118 | return; | ||
| 119 | |||
| 120 | if (!osb->uuid_str) { | ||
| 121 | /* This can happen if we don't get far enough in mount... */ | ||
| 122 | mlog(0, "No UUID with which to stop heartbeat!\n\n"); | ||
| 123 | return; | ||
| 124 | } | ||
| 125 | |||
| 126 | argv[0] = (char *)o2nm_get_hb_ctl_path(); | ||
| 127 | argv[1] = "-K"; | ||
| 128 | argv[2] = "-u"; | ||
| 129 | argv[3] = osb->uuid_str; | ||
| 130 | argv[4] = NULL; | ||
| 131 | |||
| 132 | mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); | ||
| 133 | |||
| 134 | /* minimal command environment taken from cpu_run_sbin_hotplug */ | ||
| 135 | envp[0] = "HOME=/"; | ||
| 136 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
| 137 | envp[2] = NULL; | ||
| 138 | |||
| 139 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); | ||
| 140 | if (ret < 0) | ||
| 141 | mlog_errno(ret); | ||
| 142 | } | ||
| 143 | |||
| 144 | static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, | 85 | static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, |
| 145 | int bit) | 86 | int bit) |
| 146 | { | 87 | { |
| @@ -192,112 +133,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb, | |||
| 192 | return ret; | 133 | return ret; |
| 193 | } | 134 | } |
| 194 | 135 | ||
| 195 | static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map) | ||
| 196 | { | ||
| 197 | int bit; | ||
| 198 | bit = find_next_bit(map->map, map->num_nodes, 0); | ||
| 199 | if (bit < map->num_nodes) | ||
| 200 | return 0; | ||
| 201 | return 1; | ||
| 202 | } | ||
| 203 | |||
| 204 | int ocfs2_node_map_is_empty(struct ocfs2_super *osb, | ||
| 205 | struct ocfs2_node_map *map) | ||
| 206 | { | ||
| 207 | int ret; | ||
| 208 | BUG_ON(map->num_nodes == 0); | ||
| 209 | spin_lock(&osb->node_map_lock); | ||
| 210 | ret = __ocfs2_node_map_is_empty(map); | ||
| 211 | spin_unlock(&osb->node_map_lock); | ||
| 212 | return ret; | ||
| 213 | } | ||
| 214 | |||
| 215 | #if 0 | ||
| 216 | |||
| 217 | static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, | ||
| 218 | struct ocfs2_node_map *from) | ||
| 219 | { | ||
| 220 | BUG_ON(from->num_nodes == 0); | ||
| 221 | ocfs2_node_map_init(target); | ||
| 222 | __ocfs2_node_map_set(target, from); | ||
| 223 | } | ||
| 224 | |||
| 225 | /* returns 1 if bit is the only bit set in target, 0 otherwise */ | ||
| 226 | int ocfs2_node_map_is_only(struct ocfs2_super *osb, | ||
| 227 | struct ocfs2_node_map *target, | ||
| 228 | int bit) | ||
| 229 | { | ||
| 230 | struct ocfs2_node_map temp; | ||
| 231 | int ret; | ||
| 232 | |||
| 233 | spin_lock(&osb->node_map_lock); | ||
| 234 | __ocfs2_node_map_dup(&temp, target); | ||
| 235 | __ocfs2_node_map_clear_bit(&temp, bit); | ||
| 236 | ret = __ocfs2_node_map_is_empty(&temp); | ||
| 237 | spin_unlock(&osb->node_map_lock); | ||
| 238 | |||
| 239 | return ret; | ||
| 240 | } | ||
| 241 | |||
| 242 | static void __ocfs2_node_map_set(struct ocfs2_node_map *target, | ||
| 243 | struct ocfs2_node_map *from) | ||
| 244 | { | ||
| 245 | int num_longs, i; | ||
| 246 | |||
| 247 | BUG_ON(target->num_nodes != from->num_nodes); | ||
| 248 | BUG_ON(target->num_nodes == 0); | ||
| 249 | |||
| 250 | num_longs = BITS_TO_LONGS(target->num_nodes); | ||
| 251 | for (i = 0; i < num_longs; i++) | ||
| 252 | target->map[i] = from->map[i]; | ||
| 253 | } | ||
| 254 | |||
| 255 | #endif /* 0 */ | ||
| 256 | |||
| 257 | /* Returns whether the recovery bit was actually set - it may not be | ||
| 258 | * if a node is still marked as needing recovery */ | ||
| 259 | int ocfs2_recovery_map_set(struct ocfs2_super *osb, | ||
| 260 | int num) | ||
| 261 | { | ||
| 262 | int set = 0; | ||
| 263 | |||
| 264 | spin_lock(&osb->node_map_lock); | ||
| 265 | |||
| 266 | if (!test_bit(num, osb->recovery_map.map)) { | ||
| 267 | __ocfs2_node_map_set_bit(&osb->recovery_map, num); | ||
| 268 | set = 1; | ||
| 269 | } | ||
| 270 | |||
| 271 | spin_unlock(&osb->node_map_lock); | ||
| 272 | |||
| 273 | return set; | ||
| 274 | } | ||
| 275 | |||
| 276 | void ocfs2_recovery_map_clear(struct ocfs2_super *osb, | ||
| 277 | int num) | ||
| 278 | { | ||
| 279 | ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); | ||
| 280 | } | ||
| 281 | |||
| 282 | int ocfs2_node_map_iterate(struct ocfs2_super *osb, | ||
| 283 | struct ocfs2_node_map *map, | ||
| 284 | int idx) | ||
| 285 | { | ||
| 286 | int i = idx; | ||
| 287 | |||
| 288 | idx = O2NM_INVALID_NODE_NUM; | ||
| 289 | spin_lock(&osb->node_map_lock); | ||
| 290 | if ((i != O2NM_INVALID_NODE_NUM) && | ||
| 291 | (i >= 0) && | ||
| 292 | (i < map->num_nodes)) { | ||
| 293 | while(i < map->num_nodes) { | ||
| 294 | if (test_bit(i, map->map)) { | ||
| 295 | idx = i; | ||
| 296 | break; | ||
| 297 | } | ||
| 298 | i++; | ||
| 299 | } | ||
| 300 | } | ||
| 301 | spin_unlock(&osb->node_map_lock); | ||
| 302 | return idx; | ||
| 303 | } | ||
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h index eac63aed7611..74b9c5dda28d 100644 --- a/fs/ocfs2/heartbeat.h +++ b/fs/ocfs2/heartbeat.h | |||
| @@ -28,13 +28,10 @@ | |||
| 28 | 28 | ||
| 29 | void ocfs2_init_node_maps(struct ocfs2_super *osb); | 29 | void ocfs2_init_node_maps(struct ocfs2_super *osb); |
| 30 | 30 | ||
| 31 | void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); | 31 | void ocfs2_do_node_down(int node_num, void *data); |
| 32 | void ocfs2_stop_heartbeat(struct ocfs2_super *osb); | ||
| 33 | 32 | ||
| 34 | /* node map functions - used to keep track of mounted and in-recovery | 33 | /* node map functions - used to keep track of mounted and in-recovery |
| 35 | * nodes. */ | 34 | * nodes. */ |
| 36 | int ocfs2_node_map_is_empty(struct ocfs2_super *osb, | ||
| 37 | struct ocfs2_node_map *map); | ||
| 38 | void ocfs2_node_map_set_bit(struct ocfs2_super *osb, | 35 | void ocfs2_node_map_set_bit(struct ocfs2_super *osb, |
| 39 | struct ocfs2_node_map *map, | 36 | struct ocfs2_node_map *map, |
| 40 | int bit); | 37 | int bit); |
| @@ -44,17 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, | |||
| 44 | int ocfs2_node_map_test_bit(struct ocfs2_super *osb, | 41 | int ocfs2_node_map_test_bit(struct ocfs2_super *osb, |
| 45 | struct ocfs2_node_map *map, | 42 | struct ocfs2_node_map *map, |
| 46 | int bit); | 43 | int bit); |
| 47 | int ocfs2_node_map_iterate(struct ocfs2_super *osb, | ||
| 48 | struct ocfs2_node_map *map, | ||
| 49 | int idx); | ||
| 50 | static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb, | ||
| 51 | struct ocfs2_node_map *map) | ||
| 52 | { | ||
| 53 | return ocfs2_node_map_iterate(osb, map, 0); | ||
| 54 | } | ||
| 55 | int ocfs2_recovery_map_set(struct ocfs2_super *osb, | ||
| 56 | int num); | ||
| 57 | void ocfs2_recovery_map_clear(struct ocfs2_super *osb, | ||
| 58 | int num); | ||
| 59 | 44 | ||
| 60 | #endif /* OCFS2_HEARTBEAT_H */ | 45 | #endif /* OCFS2_HEARTBEAT_H */ |
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 5177fba5162b..b413166dd163 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | 7 | ||
| 8 | #include <linux/fs.h> | 8 | #include <linux/fs.h> |
| 9 | #include <linux/mount.h> | 9 | #include <linux/mount.h> |
| 10 | #include <linux/smp_lock.h> | ||
| 10 | 11 | ||
| 11 | #define MLOG_MASK_PREFIX ML_INODE | 12 | #define MLOG_MASK_PREFIX ML_INODE |
| 12 | #include <cluster/masklog.h> | 13 | #include <cluster/masklog.h> |
| @@ -112,9 +113,9 @@ bail: | |||
| 112 | return status; | 113 | return status; |
| 113 | } | 114 | } |
| 114 | 115 | ||
| 115 | int ocfs2_ioctl(struct inode * inode, struct file * filp, | 116 | long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
| 116 | unsigned int cmd, unsigned long arg) | ||
| 117 | { | 117 | { |
| 118 | struct inode *inode = filp->f_path.dentry->d_inode; | ||
| 118 | unsigned int flags; | 119 | unsigned int flags; |
| 119 | int new_clusters; | 120 | int new_clusters; |
| 120 | int status; | 121 | int status; |
| @@ -168,9 +169,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, | |||
| 168 | #ifdef CONFIG_COMPAT | 169 | #ifdef CONFIG_COMPAT |
| 169 | long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) | 170 | long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) |
| 170 | { | 171 | { |
| 171 | struct inode *inode = file->f_path.dentry->d_inode; | ||
| 172 | int ret; | ||
| 173 | |||
| 174 | switch (cmd) { | 172 | switch (cmd) { |
| 175 | case OCFS2_IOC32_GETFLAGS: | 173 | case OCFS2_IOC32_GETFLAGS: |
| 176 | cmd = OCFS2_IOC_GETFLAGS; | 174 | cmd = OCFS2_IOC_GETFLAGS; |
| @@ -190,9 +188,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
| 190 | return -ENOIOCTLCMD; | 188 | return -ENOIOCTLCMD; |
| 191 | } | 189 | } |
| 192 | 190 | ||
| 193 | lock_kernel(); | 191 | return ocfs2_ioctl(file, cmd, arg); |
| 194 | ret = ocfs2_ioctl(inode, file, cmd, arg); | ||
| 195 | unlock_kernel(); | ||
| 196 | return ret; | ||
| 197 | } | 192 | } |
| 198 | #endif | 193 | #endif |
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 4d6c4f430d0d..cf9a5ee30fef 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h | |||
| @@ -10,8 +10,7 @@ | |||
| 10 | #ifndef OCFS2_IOCTL_H | 10 | #ifndef OCFS2_IOCTL_H |
| 11 | #define OCFS2_IOCTL_H | 11 | #define OCFS2_IOCTL_H |
| 12 | 12 | ||
| 13 | int ocfs2_ioctl(struct inode * inode, struct file * filp, | 13 | long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); |
| 14 | unsigned int cmd, unsigned long arg); | ||
| 15 | long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); | 14 | long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); |
| 16 | 15 | ||
| 17 | #endif /* OCFS2_IOCTL_H */ | 16 | #endif /* OCFS2_IOCTL_H */ |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f31c7e8c19c3..9698338adc39 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
| @@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
| 64 | int slot); | 64 | int slot); |
| 65 | static int ocfs2_commit_thread(void *arg); | 65 | static int ocfs2_commit_thread(void *arg); |
| 66 | 66 | ||
| 67 | |||
| 68 | /* | ||
| 69 | * The recovery_list is a simple linked list of node numbers to recover. | ||
| 70 | * It is protected by the recovery_lock. | ||
| 71 | */ | ||
| 72 | |||
| 73 | struct ocfs2_recovery_map { | ||
| 74 | unsigned int rm_used; | ||
| 75 | unsigned int *rm_entries; | ||
| 76 | }; | ||
| 77 | |||
| 78 | int ocfs2_recovery_init(struct ocfs2_super *osb) | ||
| 79 | { | ||
| 80 | struct ocfs2_recovery_map *rm; | ||
| 81 | |||
| 82 | mutex_init(&osb->recovery_lock); | ||
| 83 | osb->disable_recovery = 0; | ||
| 84 | osb->recovery_thread_task = NULL; | ||
| 85 | init_waitqueue_head(&osb->recovery_event); | ||
| 86 | |||
| 87 | rm = kzalloc(sizeof(struct ocfs2_recovery_map) + | ||
| 88 | osb->max_slots * sizeof(unsigned int), | ||
| 89 | GFP_KERNEL); | ||
| 90 | if (!rm) { | ||
| 91 | mlog_errno(-ENOMEM); | ||
| 92 | return -ENOMEM; | ||
| 93 | } | ||
| 94 | |||
| 95 | rm->rm_entries = (unsigned int *)((char *)rm + | ||
| 96 | sizeof(struct ocfs2_recovery_map)); | ||
| 97 | osb->recovery_map = rm; | ||
| 98 | |||
| 99 | return 0; | ||
| 100 | } | ||
| 101 | |||
| 102 | /* we can't grab the goofy sem lock from inside wait_event, so we use | ||
| 103 | * memory barriers to make sure that we'll see the null task before | ||
| 104 | * being woken up */ | ||
| 105 | static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) | ||
| 106 | { | ||
| 107 | mb(); | ||
| 108 | return osb->recovery_thread_task != NULL; | ||
| 109 | } | ||
| 110 | |||
| 111 | void ocfs2_recovery_exit(struct ocfs2_super *osb) | ||
| 112 | { | ||
| 113 | struct ocfs2_recovery_map *rm; | ||
| 114 | |||
| 115 | /* disable any new recovery threads and wait for any currently | ||
| 116 | * running ones to exit. Do this before setting the vol_state. */ | ||
| 117 | mutex_lock(&osb->recovery_lock); | ||
| 118 | osb->disable_recovery = 1; | ||
| 119 | mutex_unlock(&osb->recovery_lock); | ||
| 120 | wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); | ||
| 121 | |||
| 122 | /* At this point, we know that no more recovery threads can be | ||
| 123 | * launched, so wait for any recovery completion work to | ||
| 124 | * complete. */ | ||
| 125 | flush_workqueue(ocfs2_wq); | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Now that recovery is shut down, and the osb is about to be | ||
| 129 | * freed, the osb_lock is not taken here. | ||
| 130 | */ | ||
| 131 | rm = osb->recovery_map; | ||
| 132 | /* XXX: Should we bug if there are dirty entries? */ | ||
| 133 | |||
| 134 | kfree(rm); | ||
| 135 | } | ||
| 136 | |||
| 137 | static int __ocfs2_recovery_map_test(struct ocfs2_super *osb, | ||
| 138 | unsigned int node_num) | ||
| 139 | { | ||
| 140 | int i; | ||
| 141 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
| 142 | |||
| 143 | assert_spin_locked(&osb->osb_lock); | ||
| 144 | |||
| 145 | for (i = 0; i < rm->rm_used; i++) { | ||
| 146 | if (rm->rm_entries[i] == node_num) | ||
| 147 | return 1; | ||
| 148 | } | ||
| 149 | |||
| 150 | return 0; | ||
| 151 | } | ||
| 152 | |||
| 153 | /* Behaves like test-and-set. Returns the previous value */ | ||
| 154 | static int ocfs2_recovery_map_set(struct ocfs2_super *osb, | ||
| 155 | unsigned int node_num) | ||
| 156 | { | ||
| 157 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
| 158 | |||
| 159 | spin_lock(&osb->osb_lock); | ||
| 160 | if (__ocfs2_recovery_map_test(osb, node_num)) { | ||
| 161 | spin_unlock(&osb->osb_lock); | ||
| 162 | return 1; | ||
| 163 | } | ||
| 164 | |||
| 165 | /* XXX: Can this be exploited? Not from o2dlm... */ | ||
| 166 | BUG_ON(rm->rm_used >= osb->max_slots); | ||
| 167 | |||
| 168 | rm->rm_entries[rm->rm_used] = node_num; | ||
| 169 | rm->rm_used++; | ||
| 170 | spin_unlock(&osb->osb_lock); | ||
| 171 | |||
| 172 | return 0; | ||
| 173 | } | ||
| 174 | |||
| 175 | static void ocfs2_recovery_map_clear(struct ocfs2_super *osb, | ||
| 176 | unsigned int node_num) | ||
| 177 | { | ||
| 178 | int i; | ||
| 179 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
| 180 | |||
| 181 | spin_lock(&osb->osb_lock); | ||
| 182 | |||
| 183 | for (i = 0; i < rm->rm_used; i++) { | ||
| 184 | if (rm->rm_entries[i] == node_num) | ||
| 185 | break; | ||
| 186 | } | ||
| 187 | |||
| 188 | if (i < rm->rm_used) { | ||
| 189 | /* XXX: be careful with the pointer math */ | ||
| 190 | memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]), | ||
| 191 | (rm->rm_used - i - 1) * sizeof(unsigned int)); | ||
| 192 | rm->rm_used--; | ||
| 193 | } | ||
| 194 | |||
| 195 | spin_unlock(&osb->osb_lock); | ||
| 196 | } | ||
| 197 | |||
| 67 | static int ocfs2_commit_cache(struct ocfs2_super *osb) | 198 | static int ocfs2_commit_cache(struct ocfs2_super *osb) |
| 68 | { | 199 | { |
| 69 | int status = 0; | 200 | int status = 0; |
| @@ -586,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local) | |||
| 586 | 717 | ||
| 587 | mlog_entry_void(); | 718 | mlog_entry_void(); |
| 588 | 719 | ||
| 589 | if (!journal) | 720 | BUG_ON(!journal); |
| 590 | BUG(); | ||
| 591 | 721 | ||
| 592 | osb = journal->j_osb; | 722 | osb = journal->j_osb; |
| 593 | 723 | ||
| @@ -650,6 +780,23 @@ bail: | |||
| 650 | return status; | 780 | return status; |
| 651 | } | 781 | } |
| 652 | 782 | ||
| 783 | static int ocfs2_recovery_completed(struct ocfs2_super *osb) | ||
| 784 | { | ||
| 785 | int empty; | ||
| 786 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
| 787 | |||
| 788 | spin_lock(&osb->osb_lock); | ||
| 789 | empty = (rm->rm_used == 0); | ||
| 790 | spin_unlock(&osb->osb_lock); | ||
| 791 | |||
| 792 | return empty; | ||
| 793 | } | ||
| 794 | |||
| 795 | void ocfs2_wait_for_recovery(struct ocfs2_super *osb) | ||
| 796 | { | ||
| 797 | wait_event(osb->recovery_event, ocfs2_recovery_completed(osb)); | ||
| 798 | } | ||
| 799 | |||
| 653 | /* | 800 | /* |
| 654 | * JBD Might read a cached version of another nodes journal file. We | 801 | * JBD Might read a cached version of another nodes journal file. We |
| 655 | * don't want this as this file changes often and we get no | 802 | * don't want this as this file changes often and we get no |
| @@ -848,6 +995,7 @@ static int __ocfs2_recovery_thread(void *arg) | |||
| 848 | { | 995 | { |
| 849 | int status, node_num; | 996 | int status, node_num; |
| 850 | struct ocfs2_super *osb = arg; | 997 | struct ocfs2_super *osb = arg; |
| 998 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
| 851 | 999 | ||
| 852 | mlog_entry_void(); | 1000 | mlog_entry_void(); |
| 853 | 1001 | ||
| @@ -863,26 +1011,29 @@ restart: | |||
| 863 | goto bail; | 1011 | goto bail; |
| 864 | } | 1012 | } |
| 865 | 1013 | ||
| 866 | while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { | 1014 | spin_lock(&osb->osb_lock); |
| 867 | node_num = ocfs2_node_map_first_set_bit(osb, | 1015 | while (rm->rm_used) { |
| 868 | &osb->recovery_map); | 1016 | /* It's always safe to remove entry zero, as we won't |
| 869 | if (node_num == O2NM_INVALID_NODE_NUM) { | 1017 | * clear it until ocfs2_recover_node() has succeeded. */ |
| 870 | mlog(0, "Out of nodes to recover.\n"); | 1018 | node_num = rm->rm_entries[0]; |
| 871 | break; | 1019 | spin_unlock(&osb->osb_lock); |
| 872 | } | ||
| 873 | 1020 | ||
| 874 | status = ocfs2_recover_node(osb, node_num); | 1021 | status = ocfs2_recover_node(osb, node_num); |
| 875 | if (status < 0) { | 1022 | if (!status) { |
| 1023 | ocfs2_recovery_map_clear(osb, node_num); | ||
| 1024 | } else { | ||
| 876 | mlog(ML_ERROR, | 1025 | mlog(ML_ERROR, |
| 877 | "Error %d recovering node %d on device (%u,%u)!\n", | 1026 | "Error %d recovering node %d on device (%u,%u)!\n", |
| 878 | status, node_num, | 1027 | status, node_num, |
| 879 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | 1028 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); |
| 880 | mlog(ML_ERROR, "Volume requires unmount.\n"); | 1029 | mlog(ML_ERROR, "Volume requires unmount.\n"); |
| 881 | continue; | ||
| 882 | } | 1030 | } |
| 883 | 1031 | ||
| 884 | ocfs2_recovery_map_clear(osb, node_num); | 1032 | spin_lock(&osb->osb_lock); |
| 885 | } | 1033 | } |
| 1034 | spin_unlock(&osb->osb_lock); | ||
| 1035 | mlog(0, "All nodes recovered\n"); | ||
| 1036 | |||
| 886 | ocfs2_super_unlock(osb, 1); | 1037 | ocfs2_super_unlock(osb, 1); |
| 887 | 1038 | ||
| 888 | /* We always run recovery on our own orphan dir - the dead | 1039 | /* We always run recovery on our own orphan dir - the dead |
| @@ -893,8 +1044,7 @@ restart: | |||
| 893 | 1044 | ||
| 894 | bail: | 1045 | bail: |
| 895 | mutex_lock(&osb->recovery_lock); | 1046 | mutex_lock(&osb->recovery_lock); |
| 896 | if (!status && | 1047 | if (!status && !ocfs2_recovery_completed(osb)) { |
| 897 | !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { | ||
| 898 | mutex_unlock(&osb->recovery_lock); | 1048 | mutex_unlock(&osb->recovery_lock); |
| 899 | goto restart; | 1049 | goto restart; |
| 900 | } | 1050 | } |
| @@ -924,8 +1074,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) | |||
| 924 | 1074 | ||
| 925 | /* People waiting on recovery will wait on | 1075 | /* People waiting on recovery will wait on |
| 926 | * the recovery map to empty. */ | 1076 | * the recovery map to empty. */ |
| 927 | if (!ocfs2_recovery_map_set(osb, node_num)) | 1077 | if (ocfs2_recovery_map_set(osb, node_num)) |
| 928 | mlog(0, "node %d already be in recovery.\n", node_num); | 1078 | mlog(0, "node %d already in recovery map.\n", node_num); |
| 929 | 1079 | ||
| 930 | mlog(0, "starting recovery thread...\n"); | 1080 | mlog(0, "starting recovery thread...\n"); |
| 931 | 1081 | ||
| @@ -1079,7 +1229,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, | |||
| 1079 | { | 1229 | { |
| 1080 | int status = 0; | 1230 | int status = 0; |
| 1081 | int slot_num; | 1231 | int slot_num; |
| 1082 | struct ocfs2_slot_info *si = osb->slot_info; | ||
| 1083 | struct ocfs2_dinode *la_copy = NULL; | 1232 | struct ocfs2_dinode *la_copy = NULL; |
| 1084 | struct ocfs2_dinode *tl_copy = NULL; | 1233 | struct ocfs2_dinode *tl_copy = NULL; |
| 1085 | 1234 | ||
| @@ -1092,8 +1241,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, | |||
| 1092 | * case we should've called ocfs2_journal_load instead. */ | 1241 | * case we should've called ocfs2_journal_load instead. */ |
| 1093 | BUG_ON(osb->node_num == node_num); | 1242 | BUG_ON(osb->node_num == node_num); |
| 1094 | 1243 | ||
| 1095 | slot_num = ocfs2_node_num_to_slot(si, node_num); | 1244 | slot_num = ocfs2_node_num_to_slot(osb, node_num); |
| 1096 | if (slot_num == OCFS2_INVALID_SLOT) { | 1245 | if (slot_num == -ENOENT) { |
| 1097 | status = 0; | 1246 | status = 0; |
| 1098 | mlog(0, "no slot for this node, so no recovery required.\n"); | 1247 | mlog(0, "no slot for this node, so no recovery required.\n"); |
| 1099 | goto done; | 1248 | goto done; |
| @@ -1123,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, | |||
| 1123 | 1272 | ||
| 1124 | /* Likewise, this would be a strange but ultimately not so | 1273 | /* Likewise, this would be a strange but ultimately not so |
| 1125 | * harmful place to get an error... */ | 1274 | * harmful place to get an error... */ |
| 1126 | ocfs2_clear_slot(si, slot_num); | 1275 | status = ocfs2_clear_slot(osb, slot_num); |
| 1127 | status = ocfs2_update_disk_slots(osb, si); | ||
| 1128 | if (status < 0) | 1276 | if (status < 0) |
| 1129 | mlog_errno(status); | 1277 | mlog_errno(status); |
| 1130 | 1278 | ||
| @@ -1184,23 +1332,24 @@ bail: | |||
| 1184 | * slot info struct has been updated from disk. */ | 1332 | * slot info struct has been updated from disk. */ |
| 1185 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) | 1333 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) |
| 1186 | { | 1334 | { |
| 1187 | int status, i, node_num; | 1335 | unsigned int node_num; |
| 1188 | struct ocfs2_slot_info *si = osb->slot_info; | 1336 | int status, i; |
| 1189 | 1337 | ||
| 1190 | /* This is called with the super block cluster lock, so we | 1338 | /* This is called with the super block cluster lock, so we |
| 1191 | * know that the slot map can't change underneath us. */ | 1339 | * know that the slot map can't change underneath us. */ |
| 1192 | 1340 | ||
| 1193 | spin_lock(&si->si_lock); | 1341 | spin_lock(&osb->osb_lock); |
| 1194 | for(i = 0; i < si->si_num_slots; i++) { | 1342 | for (i = 0; i < osb->max_slots; i++) { |
| 1195 | if (i == osb->slot_num) | 1343 | if (i == osb->slot_num) |
| 1196 | continue; | 1344 | continue; |
| 1197 | if (ocfs2_is_empty_slot(si, i)) | 1345 | |
| 1346 | status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); | ||
| 1347 | if (status == -ENOENT) | ||
| 1198 | continue; | 1348 | continue; |
| 1199 | 1349 | ||
| 1200 | node_num = si->si_global_node_nums[i]; | 1350 | if (__ocfs2_recovery_map_test(osb, node_num)) |
| 1201 | if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) | ||
| 1202 | continue; | 1351 | continue; |
| 1203 | spin_unlock(&si->si_lock); | 1352 | spin_unlock(&osb->osb_lock); |
| 1204 | 1353 | ||
| 1205 | /* Ok, we have a slot occupied by another node which | 1354 | /* Ok, we have a slot occupied by another node which |
| 1206 | * is not in the recovery map. We trylock his journal | 1355 | * is not in the recovery map. We trylock his journal |
| @@ -1216,9 +1365,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) | |||
| 1216 | goto bail; | 1365 | goto bail; |
| 1217 | } | 1366 | } |
| 1218 | 1367 | ||
| 1219 | spin_lock(&si->si_lock); | 1368 | spin_lock(&osb->osb_lock); |
| 1220 | } | 1369 | } |
| 1221 | spin_unlock(&si->si_lock); | 1370 | spin_unlock(&osb->osb_lock); |
| 1222 | 1371 | ||
| 1223 | status = 0; | 1372 | status = 0; |
| 1224 | bail: | 1373 | bail: |
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 220f3e818e78..db82be2532ed 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
| @@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, | |||
| 134 | 134 | ||
| 135 | /* Exported only for the journal struct init code in super.c. Do not call. */ | 135 | /* Exported only for the journal struct init code in super.c. Do not call. */ |
| 136 | void ocfs2_complete_recovery(struct work_struct *work); | 136 | void ocfs2_complete_recovery(struct work_struct *work); |
| 137 | void ocfs2_wait_for_recovery(struct ocfs2_super *osb); | ||
| 138 | |||
| 139 | int ocfs2_recovery_init(struct ocfs2_super *osb); | ||
| 140 | void ocfs2_recovery_exit(struct ocfs2_super *osb); | ||
| 137 | 141 | ||
| 138 | /* | 142 | /* |
| 139 | * Journal Control: | 143 | * Journal Control: |
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index ab83fd562429..ce0dc147602a 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c | |||
| @@ -447,6 +447,8 @@ out_mutex: | |||
| 447 | iput(main_bm_inode); | 447 | iput(main_bm_inode); |
| 448 | 448 | ||
| 449 | out: | 449 | out: |
| 450 | if (!status) | ||
| 451 | ocfs2_init_inode_steal_slot(osb); | ||
| 450 | mlog_exit(status); | 452 | mlog_exit(status); |
| 451 | return status; | 453 | return status; |
| 452 | } | 454 | } |
| @@ -523,6 +525,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, | |||
| 523 | } | 525 | } |
| 524 | 526 | ||
| 525 | ac->ac_inode = local_alloc_inode; | 527 | ac->ac_inode = local_alloc_inode; |
| 528 | /* We should never use localalloc from another slot */ | ||
| 529 | ac->ac_alloc_slot = osb->slot_num; | ||
| 526 | ac->ac_which = OCFS2_AC_USE_LOCAL; | 530 | ac->ac_which = OCFS2_AC_USE_LOCAL; |
| 527 | get_bh(osb->local_alloc_bh); | 531 | get_bh(osb->local_alloc_bh); |
| 528 | ac->ac_bh = osb->local_alloc_bh; | 532 | ac->ac_bh = osb->local_alloc_bh; |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ae9ad9587516..d5d808fe0140 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
| @@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
| 424 | fe->i_fs_generation = cpu_to_le32(osb->fs_generation); | 424 | fe->i_fs_generation = cpu_to_le32(osb->fs_generation); |
| 425 | fe->i_blkno = cpu_to_le64(fe_blkno); | 425 | fe->i_blkno = cpu_to_le64(fe_blkno); |
| 426 | fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); | 426 | fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); |
| 427 | fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); | 427 | fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); |
| 428 | fe->i_uid = cpu_to_le32(current->fsuid); | 428 | fe->i_uid = cpu_to_le32(current->fsuid); |
| 429 | if (dir->i_mode & S_ISGID) { | 429 | if (dir->i_mode & S_ISGID) { |
| 430 | fe->i_gid = cpu_to_le32(dir->i_gid); | 430 | fe->i_gid = cpu_to_le32(dir->i_gid); |
| @@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir, | |||
| 997 | * | 997 | * |
| 998 | * And that's why, just like the VFS, we need a file system | 998 | * And that's why, just like the VFS, we need a file system |
| 999 | * rename lock. */ | 999 | * rename lock. */ |
| 1000 | if (old_dentry != new_dentry) { | 1000 | if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) { |
| 1001 | status = ocfs2_rename_lock(osb); | 1001 | status = ocfs2_rename_lock(osb); |
| 1002 | if (status < 0) { | 1002 | if (status < 0) { |
| 1003 | mlog_errno(status); | 1003 | mlog_errno(status); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 6546cef212e3..31692379c170 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
| @@ -36,11 +36,8 @@ | |||
| 36 | #include <linux/mutex.h> | 36 | #include <linux/mutex.h> |
| 37 | #include <linux/jbd.h> | 37 | #include <linux/jbd.h> |
| 38 | 38 | ||
| 39 | #include "cluster/nodemanager.h" | 39 | /* For union ocfs2_dlm_lksb */ |
| 40 | #include "cluster/heartbeat.h" | 40 | #include "stackglue.h" |
| 41 | #include "cluster/tcp.h" | ||
| 42 | |||
| 43 | #include "dlm/dlmapi.h" | ||
| 44 | 41 | ||
| 45 | #include "ocfs2_fs.h" | 42 | #include "ocfs2_fs.h" |
| 46 | #include "ocfs2_lockid.h" | 43 | #include "ocfs2_lockid.h" |
| @@ -101,6 +98,9 @@ enum ocfs2_unlock_action { | |||
| 101 | * dropped. */ | 98 | * dropped. */ |
| 102 | #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ | 99 | #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ |
| 103 | #define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ | 100 | #define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ |
| 101 | #define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a | ||
| 102 | call to dlm_lock. Only | ||
| 103 | exists with BUSY set. */ | ||
| 104 | 104 | ||
| 105 | struct ocfs2_lock_res_ops; | 105 | struct ocfs2_lock_res_ops; |
| 106 | 106 | ||
| @@ -120,13 +120,14 @@ struct ocfs2_lock_res { | |||
| 120 | int l_level; | 120 | int l_level; |
| 121 | unsigned int l_ro_holders; | 121 | unsigned int l_ro_holders; |
| 122 | unsigned int l_ex_holders; | 122 | unsigned int l_ex_holders; |
| 123 | struct dlm_lockstatus l_lksb; | 123 | union ocfs2_dlm_lksb l_lksb; |
| 124 | 124 | ||
| 125 | /* used from AST/BAST funcs. */ | 125 | /* used from AST/BAST funcs. */ |
| 126 | enum ocfs2_ast_action l_action; | 126 | enum ocfs2_ast_action l_action; |
| 127 | enum ocfs2_unlock_action l_unlock_action; | 127 | enum ocfs2_unlock_action l_unlock_action; |
| 128 | int l_requested; | 128 | int l_requested; |
| 129 | int l_blocking; | 129 | int l_blocking; |
| 130 | unsigned int l_pending_gen; | ||
| 130 | 131 | ||
| 131 | wait_queue_head_t l_event; | 132 | wait_queue_head_t l_event; |
| 132 | 133 | ||
| @@ -179,6 +180,8 @@ enum ocfs2_mount_options | |||
| 179 | #define OCFS2_DEFAULT_ATIME_QUANTUM 60 | 180 | #define OCFS2_DEFAULT_ATIME_QUANTUM 60 |
| 180 | 181 | ||
| 181 | struct ocfs2_journal; | 182 | struct ocfs2_journal; |
| 183 | struct ocfs2_slot_info; | ||
| 184 | struct ocfs2_recovery_map; | ||
| 182 | struct ocfs2_super | 185 | struct ocfs2_super |
| 183 | { | 186 | { |
| 184 | struct task_struct *commit_task; | 187 | struct task_struct *commit_task; |
| @@ -190,7 +193,6 @@ struct ocfs2_super | |||
| 190 | struct ocfs2_slot_info *slot_info; | 193 | struct ocfs2_slot_info *slot_info; |
| 191 | 194 | ||
| 192 | spinlock_t node_map_lock; | 195 | spinlock_t node_map_lock; |
| 193 | struct ocfs2_node_map recovery_map; | ||
| 194 | 196 | ||
| 195 | u64 root_blkno; | 197 | u64 root_blkno; |
| 196 | u64 system_dir_blkno; | 198 | u64 system_dir_blkno; |
| @@ -206,25 +208,29 @@ struct ocfs2_super | |||
| 206 | u32 s_feature_incompat; | 208 | u32 s_feature_incompat; |
| 207 | u32 s_feature_ro_compat; | 209 | u32 s_feature_ro_compat; |
| 208 | 210 | ||
| 209 | /* Protects s_next_generaion, osb_flags. Could protect more on | 211 | /* Protects s_next_generation, osb_flags and s_inode_steal_slot. |
| 210 | * osb as it's very short lived. */ | 212 | * Could protect more on osb as it's very short lived. |
| 213 | */ | ||
| 211 | spinlock_t osb_lock; | 214 | spinlock_t osb_lock; |
| 212 | u32 s_next_generation; | 215 | u32 s_next_generation; |
| 213 | unsigned long osb_flags; | 216 | unsigned long osb_flags; |
| 217 | s16 s_inode_steal_slot; | ||
| 218 | atomic_t s_num_inodes_stolen; | ||
| 214 | 219 | ||
| 215 | unsigned long s_mount_opt; | 220 | unsigned long s_mount_opt; |
| 216 | unsigned int s_atime_quantum; | 221 | unsigned int s_atime_quantum; |
| 217 | 222 | ||
| 218 | u16 max_slots; | 223 | unsigned int max_slots; |
| 219 | s16 node_num; | 224 | unsigned int node_num; |
| 220 | s16 slot_num; | 225 | int slot_num; |
| 221 | s16 preferred_slot; | 226 | int preferred_slot; |
| 222 | int s_sectsize_bits; | 227 | int s_sectsize_bits; |
| 223 | int s_clustersize; | 228 | int s_clustersize; |
| 224 | int s_clustersize_bits; | 229 | int s_clustersize_bits; |
| 225 | 230 | ||
| 226 | atomic_t vol_state; | 231 | atomic_t vol_state; |
| 227 | struct mutex recovery_lock; | 232 | struct mutex recovery_lock; |
| 233 | struct ocfs2_recovery_map *recovery_map; | ||
| 228 | struct task_struct *recovery_thread_task; | 234 | struct task_struct *recovery_thread_task; |
| 229 | int disable_recovery; | 235 | int disable_recovery; |
| 230 | wait_queue_head_t checkpoint_event; | 236 | wait_queue_head_t checkpoint_event; |
| @@ -245,12 +251,11 @@ struct ocfs2_super | |||
| 245 | struct ocfs2_alloc_stats alloc_stats; | 251 | struct ocfs2_alloc_stats alloc_stats; |
| 246 | char dev_str[20]; /* "major,minor" of the device */ | 252 | char dev_str[20]; /* "major,minor" of the device */ |
| 247 | 253 | ||
| 248 | struct dlm_ctxt *dlm; | 254 | char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; |
| 255 | struct ocfs2_cluster_connection *cconn; | ||
| 249 | struct ocfs2_lock_res osb_super_lockres; | 256 | struct ocfs2_lock_res osb_super_lockres; |
| 250 | struct ocfs2_lock_res osb_rename_lockres; | 257 | struct ocfs2_lock_res osb_rename_lockres; |
| 251 | struct dlm_eviction_cb osb_eviction_cb; | ||
| 252 | struct ocfs2_dlm_debug *osb_dlm_debug; | 258 | struct ocfs2_dlm_debug *osb_dlm_debug; |
| 253 | struct dlm_protocol_version osb_locking_proto; | ||
| 254 | 259 | ||
| 255 | struct dentry *osb_debug_root; | 260 | struct dentry *osb_debug_root; |
| 256 | 261 | ||
| @@ -367,11 +372,24 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) | |||
| 367 | return ret; | 372 | return ret; |
| 368 | } | 373 | } |
| 369 | 374 | ||
| 375 | static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) | ||
| 376 | { | ||
| 377 | return (osb->s_feature_incompat & | ||
| 378 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); | ||
| 379 | } | ||
| 380 | |||
| 370 | static inline int ocfs2_mount_local(struct ocfs2_super *osb) | 381 | static inline int ocfs2_mount_local(struct ocfs2_super *osb) |
| 371 | { | 382 | { |
| 372 | return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); | 383 | return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); |
| 373 | } | 384 | } |
| 374 | 385 | ||
| 386 | static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) | ||
| 387 | { | ||
| 388 | return (osb->s_feature_incompat & | ||
| 389 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); | ||
| 390 | } | ||
| 391 | |||
| 392 | |||
| 375 | #define OCFS2_IS_VALID_DINODE(ptr) \ | 393 | #define OCFS2_IS_VALID_DINODE(ptr) \ |
| 376 | (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) | 394 | (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) |
| 377 | 395 | ||
| @@ -522,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) | |||
| 522 | return pages_per_cluster; | 540 | return pages_per_cluster; |
| 523 | } | 541 | } |
| 524 | 542 | ||
| 543 | static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) | ||
| 544 | { | ||
| 545 | spin_lock(&osb->osb_lock); | ||
| 546 | osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; | ||
| 547 | spin_unlock(&osb->osb_lock); | ||
| 548 | atomic_set(&osb->s_num_inodes_stolen, 0); | ||
| 549 | } | ||
| 550 | |||
| 551 | static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb, | ||
| 552 | s16 slot) | ||
| 553 | { | ||
| 554 | spin_lock(&osb->osb_lock); | ||
| 555 | osb->s_inode_steal_slot = slot; | ||
| 556 | spin_unlock(&osb->osb_lock); | ||
| 557 | } | ||
| 558 | |||
| 559 | static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) | ||
| 560 | { | ||
| 561 | s16 slot; | ||
| 562 | |||
| 563 | spin_lock(&osb->osb_lock); | ||
| 564 | slot = osb->s_inode_steal_slot; | ||
| 565 | spin_unlock(&osb->osb_lock); | ||
| 566 | |||
| 567 | return slot; | ||
| 568 | } | ||
| 569 | |||
| 525 | #define ocfs2_set_bit ext2_set_bit | 570 | #define ocfs2_set_bit ext2_set_bit |
| 526 | #define ocfs2_clear_bit ext2_clear_bit | 571 | #define ocfs2_clear_bit ext2_clear_bit |
| 527 | #define ocfs2_test_bit ext2_test_bit | 572 | #define ocfs2_test_bit ext2_test_bit |
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 3633edd3982f..52c426665154 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h | |||
| @@ -88,7 +88,9 @@ | |||
| 88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB | 88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB |
| 89 | #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ | 89 | #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ |
| 90 | | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ | 90 | | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ |
| 91 | | OCFS2_FEATURE_INCOMPAT_INLINE_DATA) | 91 | | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ |
| 92 | | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | ||
| 93 | | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) | ||
| 92 | #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN | 94 | #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN |
| 93 | 95 | ||
| 94 | /* | 96 | /* |
| @@ -125,6 +127,21 @@ | |||
| 125 | /* Support for data packed into inode blocks */ | 127 | /* Support for data packed into inode blocks */ |
| 126 | #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 | 128 | #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 |
| 127 | 129 | ||
| 130 | /* Support for the extended slot map */ | ||
| 131 | #define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 | ||
| 132 | |||
| 133 | |||
| 134 | /* | ||
| 135 | * Support for alternate, userspace cluster stacks. If set, the superblock | ||
| 136 | * field s_cluster_info contains a tag for the alternate stack in use as | ||
| 137 | * well as the name of the cluster being joined. | ||
| 138 | * mount.ocfs2 must pass in a matching stack name. | ||
| 139 | * | ||
| 140 | * If not set, the classic stack will be used. This is compatbile with | ||
| 141 | * all older versions. | ||
| 142 | */ | ||
| 143 | #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 | ||
| 144 | |||
| 128 | /* | 145 | /* |
| 129 | * backup superblock flag is used to indicate that this volume | 146 | * backup superblock flag is used to indicate that this volume |
| 130 | * has backup superblocks. | 147 | * has backup superblocks. |
| @@ -267,6 +284,10 @@ struct ocfs2_new_group_input { | |||
| 267 | #define OCFS2_VOL_UUID_LEN 16 | 284 | #define OCFS2_VOL_UUID_LEN 16 |
| 268 | #define OCFS2_MAX_VOL_LABEL_LEN 64 | 285 | #define OCFS2_MAX_VOL_LABEL_LEN 64 |
| 269 | 286 | ||
| 287 | /* The alternate, userspace stack fields */ | ||
| 288 | #define OCFS2_STACK_LABEL_LEN 4 | ||
| 289 | #define OCFS2_CLUSTER_NAME_LEN 16 | ||
| 290 | |||
| 270 | /* Journal limits (in bytes) */ | 291 | /* Journal limits (in bytes) */ |
| 271 | #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) | 292 | #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) |
| 272 | 293 | ||
| @@ -475,6 +496,47 @@ struct ocfs2_extent_block | |||
| 475 | }; | 496 | }; |
| 476 | 497 | ||
| 477 | /* | 498 | /* |
| 499 | * On disk slot map for OCFS2. This defines the contents of the "slot_map" | ||
| 500 | * system file. A slot is valid if it contains a node number >= 0. The | ||
| 501 | * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. | ||
| 502 | */ | ||
| 503 | struct ocfs2_slot_map { | ||
| 504 | /*00*/ __le16 sm_slots[0]; | ||
| 505 | /* | ||
| 506 | * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, | ||
| 507 | * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. | ||
| 508 | */ | ||
| 509 | }; | ||
| 510 | |||
| 511 | struct ocfs2_extended_slot { | ||
| 512 | /*00*/ __u8 es_valid; | ||
| 513 | __u8 es_reserved1[3]; | ||
| 514 | __le32 es_node_num; | ||
| 515 | /*10*/ | ||
| 516 | }; | ||
| 517 | |||
| 518 | /* | ||
| 519 | * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP | ||
| 520 | * is set. It separates out the valid marker from the node number, and | ||
| 521 | * has room to grow. Unlike the old slot map, this format is defined by | ||
| 522 | * i_size. | ||
| 523 | */ | ||
| 524 | struct ocfs2_slot_map_extended { | ||
| 525 | /*00*/ struct ocfs2_extended_slot se_slots[0]; | ||
| 526 | /* | ||
| 527 | * Actual size is i_size of the slot_map system file. It should | ||
| 528 | * match s_max_slots * sizeof(struct ocfs2_extended_slot) | ||
| 529 | */ | ||
| 530 | }; | ||
| 531 | |||
| 532 | struct ocfs2_cluster_info { | ||
| 533 | /*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; | ||
| 534 | __le32 ci_reserved; | ||
| 535 | /*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; | ||
| 536 | /*18*/ | ||
| 537 | }; | ||
| 538 | |||
| 539 | /* | ||
| 478 | * On disk superblock for OCFS2 | 540 | * On disk superblock for OCFS2 |
| 479 | * Note that it is contained inside an ocfs2_dinode, so all offsets | 541 | * Note that it is contained inside an ocfs2_dinode, so all offsets |
| 480 | * are relative to the start of ocfs2_dinode.id2. | 542 | * are relative to the start of ocfs2_dinode.id2. |
| @@ -506,7 +568,20 @@ struct ocfs2_super_block { | |||
| 506 | * group header */ | 568 | * group header */ |
| 507 | /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ | 569 | /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ |
| 508 | /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ | 570 | /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ |
| 509 | /*A0*/ | 571 | /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace |
| 572 | stack. Only valid | ||
| 573 | with INCOMPAT flag. */ | ||
| 574 | /*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ | ||
| 575 | /*140*/ | ||
| 576 | |||
| 577 | /* | ||
| 578 | * NOTE: As stated above, all offsets are relative to | ||
| 579 | * ocfs2_dinode.id2, which is at 0xC0 in the inode. | ||
| 580 | * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within | ||
| 581 | * our smallest blocksize, which is 512 bytes. To ensure this, | ||
| 582 | * we reserve the space in s_reserved2. Anything past s_reserved2 | ||
| 583 | * will not be available on the smallest blocksize. | ||
| 584 | */ | ||
| 510 | }; | 585 | }; |
| 511 | 586 | ||
| 512 | /* | 587 | /* |
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 86f3e3799c2b..82c200f7a8f1 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h | |||
| @@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = { | |||
| 100 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) | 100 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) |
| 101 | { | 101 | { |
| 102 | #ifdef __KERNEL__ | 102 | #ifdef __KERNEL__ |
| 103 | mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); | 103 | BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); |
| 104 | #endif | 104 | #endif |
| 105 | return ocfs2_lock_type_strings[type]; | 105 | return ocfs2_lock_type_strings[type]; |
| 106 | } | 106 | } |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 3a50ce555e64..bb5ff8939bf1 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
| @@ -42,81 +42,244 @@ | |||
| 42 | 42 | ||
| 43 | #include "buffer_head_io.h" | 43 | #include "buffer_head_io.h" |
| 44 | 44 | ||
| 45 | static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | 45 | |
| 46 | s16 global); | 46 | struct ocfs2_slot { |
| 47 | static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, | 47 | int sl_valid; |
| 48 | s16 slot_num, | 48 | unsigned int sl_node_num; |
| 49 | s16 node_num); | 49 | }; |
| 50 | 50 | ||
| 51 | /* post the slot information on disk into our slot_info struct. */ | 51 | struct ocfs2_slot_info { |
| 52 | void ocfs2_update_slot_info(struct ocfs2_slot_info *si) | 52 | int si_extended; |
| 53 | int si_slots_per_block; | ||
| 54 | struct inode *si_inode; | ||
| 55 | unsigned int si_blocks; | ||
| 56 | struct buffer_head **si_bh; | ||
| 57 | unsigned int si_num_slots; | ||
| 58 | struct ocfs2_slot *si_slots; | ||
| 59 | }; | ||
| 60 | |||
| 61 | |||
| 62 | static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
| 63 | unsigned int node_num); | ||
| 64 | |||
| 65 | static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si, | ||
| 66 | int slot_num) | ||
| 67 | { | ||
| 68 | BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); | ||
| 69 | si->si_slots[slot_num].sl_valid = 0; | ||
| 70 | } | ||
| 71 | |||
| 72 | static void ocfs2_set_slot(struct ocfs2_slot_info *si, | ||
| 73 | int slot_num, unsigned int node_num) | ||
| 74 | { | ||
| 75 | BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); | ||
| 76 | |||
| 77 | si->si_slots[slot_num].sl_valid = 1; | ||
| 78 | si->si_slots[slot_num].sl_node_num = node_num; | ||
| 79 | } | ||
| 80 | |||
| 81 | /* This version is for the extended slot map */ | ||
| 82 | static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si) | ||
| 83 | { | ||
| 84 | int b, i, slotno; | ||
| 85 | struct ocfs2_slot_map_extended *se; | ||
| 86 | |||
| 87 | slotno = 0; | ||
| 88 | for (b = 0; b < si->si_blocks; b++) { | ||
| 89 | se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data; | ||
| 90 | for (i = 0; | ||
| 91 | (i < si->si_slots_per_block) && | ||
| 92 | (slotno < si->si_num_slots); | ||
| 93 | i++, slotno++) { | ||
| 94 | if (se->se_slots[i].es_valid) | ||
| 95 | ocfs2_set_slot(si, slotno, | ||
| 96 | le32_to_cpu(se->se_slots[i].es_node_num)); | ||
| 97 | else | ||
| 98 | ocfs2_invalidate_slot(si, slotno); | ||
| 99 | } | ||
| 100 | } | ||
| 101 | } | ||
| 102 | |||
| 103 | /* | ||
| 104 | * Post the slot information on disk into our slot_info struct. | ||
| 105 | * Must be protected by osb_lock. | ||
| 106 | */ | ||
| 107 | static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si) | ||
| 53 | { | 108 | { |
| 54 | int i; | 109 | int i; |
| 55 | __le16 *disk_info; | 110 | struct ocfs2_slot_map *sm; |
| 56 | 111 | ||
| 57 | /* we don't read the slot block here as ocfs2_super_lock | 112 | sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; |
| 58 | * should've made sure we have the most recent copy. */ | ||
| 59 | spin_lock(&si->si_lock); | ||
| 60 | disk_info = (__le16 *) si->si_bh->b_data; | ||
| 61 | 113 | ||
| 62 | for (i = 0; i < si->si_size; i++) | 114 | for (i = 0; i < si->si_num_slots; i++) { |
| 63 | si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); | 115 | if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT) |
| 116 | ocfs2_invalidate_slot(si, i); | ||
| 117 | else | ||
| 118 | ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i])); | ||
| 119 | } | ||
| 120 | } | ||
| 64 | 121 | ||
| 65 | spin_unlock(&si->si_lock); | 122 | static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) |
| 123 | { | ||
| 124 | /* | ||
| 125 | * The slot data will have been refreshed when ocfs2_super_lock | ||
| 126 | * was taken. | ||
| 127 | */ | ||
| 128 | if (si->si_extended) | ||
| 129 | ocfs2_update_slot_info_extended(si); | ||
| 130 | else | ||
| 131 | ocfs2_update_slot_info_old(si); | ||
| 132 | } | ||
| 133 | |||
| 134 | int ocfs2_refresh_slot_info(struct ocfs2_super *osb) | ||
| 135 | { | ||
| 136 | int ret; | ||
| 137 | struct ocfs2_slot_info *si = osb->slot_info; | ||
| 138 | |||
| 139 | if (si == NULL) | ||
| 140 | return 0; | ||
| 141 | |||
| 142 | BUG_ON(si->si_blocks == 0); | ||
| 143 | BUG_ON(si->si_bh == NULL); | ||
| 144 | |||
| 145 | mlog(0, "Refreshing slot map, reading %u block(s)\n", | ||
| 146 | si->si_blocks); | ||
| 147 | |||
| 148 | /* | ||
| 149 | * We pass -1 as blocknr because we expect all of si->si_bh to | ||
| 150 | * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If | ||
| 151 | * this is not true, the read of -1 (UINT64_MAX) will fail. | ||
| 152 | */ | ||
| 153 | ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, | ||
| 154 | si->si_inode); | ||
| 155 | if (ret == 0) { | ||
| 156 | spin_lock(&osb->osb_lock); | ||
| 157 | ocfs2_update_slot_info(si); | ||
| 158 | spin_unlock(&osb->osb_lock); | ||
| 159 | } | ||
| 160 | |||
| 161 | return ret; | ||
| 66 | } | 162 | } |
| 67 | 163 | ||
| 68 | /* post the our slot info stuff into it's destination bh and write it | 164 | /* post the our slot info stuff into it's destination bh and write it |
| 69 | * out. */ | 165 | * out. */ |
| 70 | int ocfs2_update_disk_slots(struct ocfs2_super *osb, | 166 | static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si, |
| 71 | struct ocfs2_slot_info *si) | 167 | int slot_num, |
| 168 | struct buffer_head **bh) | ||
| 72 | { | 169 | { |
| 73 | int status, i; | 170 | int blkind = slot_num / si->si_slots_per_block; |
| 74 | __le16 *disk_info = (__le16 *) si->si_bh->b_data; | 171 | int slotno = slot_num % si->si_slots_per_block; |
| 172 | struct ocfs2_slot_map_extended *se; | ||
| 173 | |||
| 174 | BUG_ON(blkind >= si->si_blocks); | ||
| 175 | |||
| 176 | se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data; | ||
| 177 | se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid; | ||
| 178 | if (si->si_slots[slot_num].sl_valid) | ||
| 179 | se->se_slots[slotno].es_node_num = | ||
| 180 | cpu_to_le32(si->si_slots[slot_num].sl_node_num); | ||
| 181 | *bh = si->si_bh[blkind]; | ||
| 182 | } | ||
| 75 | 183 | ||
| 76 | spin_lock(&si->si_lock); | 184 | static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si, |
| 77 | for (i = 0; i < si->si_size; i++) | 185 | int slot_num, |
| 78 | disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); | 186 | struct buffer_head **bh) |
| 79 | spin_unlock(&si->si_lock); | 187 | { |
| 188 | int i; | ||
| 189 | struct ocfs2_slot_map *sm; | ||
| 190 | |||
| 191 | sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; | ||
| 192 | for (i = 0; i < si->si_num_slots; i++) { | ||
| 193 | if (si->si_slots[i].sl_valid) | ||
| 194 | sm->sm_slots[i] = | ||
| 195 | cpu_to_le16(si->si_slots[i].sl_node_num); | ||
| 196 | else | ||
| 197 | sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT); | ||
| 198 | } | ||
| 199 | *bh = si->si_bh[0]; | ||
| 200 | } | ||
| 201 | |||
| 202 | static int ocfs2_update_disk_slot(struct ocfs2_super *osb, | ||
| 203 | struct ocfs2_slot_info *si, | ||
| 204 | int slot_num) | ||
| 205 | { | ||
| 206 | int status; | ||
| 207 | struct buffer_head *bh; | ||
| 208 | |||
| 209 | spin_lock(&osb->osb_lock); | ||
| 210 | if (si->si_extended) | ||
| 211 | ocfs2_update_disk_slot_extended(si, slot_num, &bh); | ||
| 212 | else | ||
| 213 | ocfs2_update_disk_slot_old(si, slot_num, &bh); | ||
| 214 | spin_unlock(&osb->osb_lock); | ||
| 80 | 215 | ||
| 81 | status = ocfs2_write_block(osb, si->si_bh, si->si_inode); | 216 | status = ocfs2_write_block(osb, bh, si->si_inode); |
| 82 | if (status < 0) | 217 | if (status < 0) |
| 83 | mlog_errno(status); | 218 | mlog_errno(status); |
| 84 | 219 | ||
| 85 | return status; | 220 | return status; |
| 86 | } | 221 | } |
| 87 | 222 | ||
| 88 | /* try to find global node in the slot info. Returns | 223 | /* |
| 89 | * OCFS2_INVALID_SLOT if nothing is found. */ | 224 | * Calculate how many bytes are needed by the slot map. Returns |
| 90 | static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | 225 | * an error if the slot map file is too small. |
| 91 | s16 global) | 226 | */ |
| 227 | static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, | ||
| 228 | struct inode *inode, | ||
| 229 | unsigned long long *bytes) | ||
| 92 | { | 230 | { |
| 93 | int i; | 231 | unsigned long long bytes_needed; |
| 94 | s16 ret = OCFS2_INVALID_SLOT; | 232 | |
| 233 | if (ocfs2_uses_extended_slot_map(osb)) { | ||
| 234 | bytes_needed = osb->max_slots * | ||
| 235 | sizeof(struct ocfs2_extended_slot); | ||
| 236 | } else { | ||
| 237 | bytes_needed = osb->max_slots * sizeof(__le16); | ||
| 238 | } | ||
| 239 | if (bytes_needed > i_size_read(inode)) { | ||
| 240 | mlog(ML_ERROR, | ||
| 241 | "Slot map file is too small! (size %llu, needed %llu)\n", | ||
| 242 | i_size_read(inode), bytes_needed); | ||
| 243 | return -ENOSPC; | ||
| 244 | } | ||
| 245 | |||
| 246 | *bytes = bytes_needed; | ||
| 247 | return 0; | ||
| 248 | } | ||
| 249 | |||
| 250 | /* try to find global node in the slot info. Returns -ENOENT | ||
| 251 | * if nothing is found. */ | ||
| 252 | static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
| 253 | unsigned int node_num) | ||
| 254 | { | ||
| 255 | int i, ret = -ENOENT; | ||
| 95 | 256 | ||
| 96 | for(i = 0; i < si->si_num_slots; i++) { | 257 | for(i = 0; i < si->si_num_slots; i++) { |
| 97 | if (global == si->si_global_node_nums[i]) { | 258 | if (si->si_slots[i].sl_valid && |
| 98 | ret = (s16) i; | 259 | (node_num == si->si_slots[i].sl_node_num)) { |
| 260 | ret = i; | ||
| 99 | break; | 261 | break; |
| 100 | } | 262 | } |
| 101 | } | 263 | } |
| 264 | |||
| 102 | return ret; | 265 | return ret; |
| 103 | } | 266 | } |
| 104 | 267 | ||
| 105 | static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) | 268 | static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, |
| 269 | int preferred) | ||
| 106 | { | 270 | { |
| 107 | int i; | 271 | int i, ret = -ENOSPC; |
| 108 | s16 ret = OCFS2_INVALID_SLOT; | ||
| 109 | 272 | ||
| 110 | if (preferred >= 0 && preferred < si->si_num_slots) { | 273 | if ((preferred >= 0) && (preferred < si->si_num_slots)) { |
| 111 | if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) { | 274 | if (!si->si_slots[preferred].sl_valid) { |
| 112 | ret = preferred; | 275 | ret = preferred; |
| 113 | goto out; | 276 | goto out; |
| 114 | } | 277 | } |
| 115 | } | 278 | } |
| 116 | 279 | ||
| 117 | for(i = 0; i < si->si_num_slots; i++) { | 280 | for(i = 0; i < si->si_num_slots; i++) { |
| 118 | if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { | 281 | if (!si->si_slots[i].sl_valid) { |
| 119 | ret = (s16) i; | 282 | ret = i; |
| 120 | break; | 283 | break; |
| 121 | } | 284 | } |
| 122 | } | 285 | } |
| @@ -124,58 +287,155 @@ out: | |||
| 124 | return ret; | 287 | return ret; |
| 125 | } | 288 | } |
| 126 | 289 | ||
| 127 | s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | 290 | int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num) |
| 128 | s16 global) | ||
| 129 | { | 291 | { |
| 130 | s16 ret; | 292 | int slot; |
| 293 | struct ocfs2_slot_info *si = osb->slot_info; | ||
| 131 | 294 | ||
| 132 | spin_lock(&si->si_lock); | 295 | spin_lock(&osb->osb_lock); |
| 133 | ret = __ocfs2_node_num_to_slot(si, global); | 296 | slot = __ocfs2_node_num_to_slot(si, node_num); |
| 134 | spin_unlock(&si->si_lock); | 297 | spin_unlock(&osb->osb_lock); |
| 135 | return ret; | 298 | |
| 299 | return slot; | ||
| 300 | } | ||
| 301 | |||
| 302 | int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, | ||
| 303 | unsigned int *node_num) | ||
| 304 | { | ||
| 305 | struct ocfs2_slot_info *si = osb->slot_info; | ||
| 306 | |||
| 307 | assert_spin_locked(&osb->osb_lock); | ||
| 308 | |||
| 309 | BUG_ON(slot_num < 0); | ||
| 310 | BUG_ON(slot_num > osb->max_slots); | ||
| 311 | |||
| 312 | if (!si->si_slots[slot_num].sl_valid) | ||
| 313 | return -ENOENT; | ||
| 314 | |||
| 315 | *node_num = si->si_slots[slot_num].sl_node_num; | ||
| 316 | return 0; | ||
| 136 | } | 317 | } |
| 137 | 318 | ||
| 138 | static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, | 319 | static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) |
| 139 | s16 slot_num, | ||
| 140 | s16 node_num) | ||
| 141 | { | 320 | { |
| 142 | BUG_ON(slot_num == OCFS2_INVALID_SLOT); | 321 | unsigned int i; |
| 143 | BUG_ON(slot_num >= si->si_num_slots); | 322 | |
| 144 | BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && | 323 | if (si == NULL) |
| 145 | (node_num >= O2NM_MAX_NODES)); | 324 | return; |
| 325 | |||
| 326 | if (si->si_inode) | ||
| 327 | iput(si->si_inode); | ||
| 328 | if (si->si_bh) { | ||
| 329 | for (i = 0; i < si->si_blocks; i++) { | ||
| 330 | if (si->si_bh[i]) { | ||
| 331 | brelse(si->si_bh[i]); | ||
| 332 | si->si_bh[i] = NULL; | ||
| 333 | } | ||
| 334 | } | ||
| 335 | kfree(si->si_bh); | ||
| 336 | } | ||
| 146 | 337 | ||
| 147 | si->si_global_node_nums[slot_num] = node_num; | 338 | kfree(si); |
| 148 | } | 339 | } |
| 149 | 340 | ||
| 150 | void ocfs2_clear_slot(struct ocfs2_slot_info *si, | 341 | int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num) |
| 151 | s16 slot_num) | ||
| 152 | { | 342 | { |
| 153 | spin_lock(&si->si_lock); | 343 | struct ocfs2_slot_info *si = osb->slot_info; |
| 154 | __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); | 344 | |
| 155 | spin_unlock(&si->si_lock); | 345 | if (si == NULL) |
| 346 | return 0; | ||
| 347 | |||
| 348 | spin_lock(&osb->osb_lock); | ||
| 349 | ocfs2_invalidate_slot(si, slot_num); | ||
| 350 | spin_unlock(&osb->osb_lock); | ||
| 351 | |||
| 352 | return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num); | ||
| 156 | } | 353 | } |
| 157 | 354 | ||
| 158 | int ocfs2_init_slot_info(struct ocfs2_super *osb) | 355 | static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, |
| 356 | struct ocfs2_slot_info *si) | ||
| 159 | { | 357 | { |
| 160 | int status, i; | 358 | int status = 0; |
| 161 | u64 blkno; | 359 | u64 blkno; |
| 360 | unsigned long long blocks, bytes; | ||
| 361 | unsigned int i; | ||
| 362 | struct buffer_head *bh; | ||
| 363 | |||
| 364 | status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes); | ||
| 365 | if (status) | ||
| 366 | goto bail; | ||
| 367 | |||
| 368 | blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes); | ||
| 369 | BUG_ON(blocks > UINT_MAX); | ||
| 370 | si->si_blocks = blocks; | ||
| 371 | if (!si->si_blocks) | ||
| 372 | goto bail; | ||
| 373 | |||
| 374 | if (si->si_extended) | ||
| 375 | si->si_slots_per_block = | ||
| 376 | (osb->sb->s_blocksize / | ||
| 377 | sizeof(struct ocfs2_extended_slot)); | ||
| 378 | else | ||
| 379 | si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16); | ||
| 380 | |||
| 381 | /* The size checks above should ensure this */ | ||
| 382 | BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); | ||
| 383 | |||
| 384 | mlog(0, "Slot map needs %u buffers for %llu bytes\n", | ||
| 385 | si->si_blocks, bytes); | ||
| 386 | |||
| 387 | si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, | ||
| 388 | GFP_KERNEL); | ||
| 389 | if (!si->si_bh) { | ||
| 390 | status = -ENOMEM; | ||
| 391 | mlog_errno(status); | ||
| 392 | goto bail; | ||
| 393 | } | ||
| 394 | |||
| 395 | for (i = 0; i < si->si_blocks; i++) { | ||
| 396 | status = ocfs2_extent_map_get_blocks(si->si_inode, i, | ||
| 397 | &blkno, NULL, NULL); | ||
| 398 | if (status < 0) { | ||
| 399 | mlog_errno(status); | ||
| 400 | goto bail; | ||
| 401 | } | ||
| 402 | |||
| 403 | mlog(0, "Reading slot map block %u at %llu\n", i, | ||
| 404 | (unsigned long long)blkno); | ||
| 405 | |||
| 406 | bh = NULL; /* Acquire a fresh bh */ | ||
| 407 | status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); | ||
| 408 | if (status < 0) { | ||
| 409 | mlog_errno(status); | ||
| 410 | goto bail; | ||
| 411 | } | ||
| 412 | |||
| 413 | si->si_bh[i] = bh; | ||
| 414 | } | ||
| 415 | |||
| 416 | bail: | ||
| 417 | return status; | ||
| 418 | } | ||
| 419 | |||
| 420 | int ocfs2_init_slot_info(struct ocfs2_super *osb) | ||
| 421 | { | ||
| 422 | int status; | ||
| 162 | struct inode *inode = NULL; | 423 | struct inode *inode = NULL; |
| 163 | struct buffer_head *bh = NULL; | ||
| 164 | struct ocfs2_slot_info *si; | 424 | struct ocfs2_slot_info *si; |
| 165 | 425 | ||
| 166 | si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); | 426 | si = kzalloc(sizeof(struct ocfs2_slot_info) + |
| 427 | (sizeof(struct ocfs2_slot) * osb->max_slots), | ||
| 428 | GFP_KERNEL); | ||
| 167 | if (!si) { | 429 | if (!si) { |
| 168 | status = -ENOMEM; | 430 | status = -ENOMEM; |
| 169 | mlog_errno(status); | 431 | mlog_errno(status); |
| 170 | goto bail; | 432 | goto bail; |
| 171 | } | 433 | } |
| 172 | 434 | ||
| 173 | spin_lock_init(&si->si_lock); | 435 | si->si_extended = ocfs2_uses_extended_slot_map(osb); |
| 174 | si->si_num_slots = osb->max_slots; | 436 | si->si_num_slots = osb->max_slots; |
| 175 | si->si_size = OCFS2_MAX_SLOTS; | 437 | si->si_slots = (struct ocfs2_slot *)((char *)si + |
| 176 | 438 | sizeof(struct ocfs2_slot_info)); | |
| 177 | for(i = 0; i < si->si_num_slots; i++) | ||
| 178 | si->si_global_node_nums[i] = OCFS2_INVALID_SLOT; | ||
| 179 | 439 | ||
| 180 | inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, | 440 | inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, |
| 181 | OCFS2_INVALID_SLOT); | 441 | OCFS2_INVALID_SLOT); |
| @@ -185,61 +445,53 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) | |||
| 185 | goto bail; | 445 | goto bail; |
| 186 | } | 446 | } |
| 187 | 447 | ||
| 188 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); | 448 | si->si_inode = inode; |
| 189 | if (status < 0) { | 449 | status = ocfs2_map_slot_buffers(osb, si); |
| 190 | mlog_errno(status); | ||
| 191 | goto bail; | ||
| 192 | } | ||
| 193 | |||
| 194 | status = ocfs2_read_block(osb, blkno, &bh, 0, inode); | ||
| 195 | if (status < 0) { | 450 | if (status < 0) { |
| 196 | mlog_errno(status); | 451 | mlog_errno(status); |
| 197 | goto bail; | 452 | goto bail; |
| 198 | } | 453 | } |
| 199 | 454 | ||
| 200 | si->si_inode = inode; | 455 | osb->slot_info = (struct ocfs2_slot_info *)si; |
| 201 | si->si_bh = bh; | ||
| 202 | osb->slot_info = si; | ||
| 203 | bail: | 456 | bail: |
| 204 | if (status < 0 && si) | 457 | if (status < 0 && si) |
| 205 | ocfs2_free_slot_info(si); | 458 | __ocfs2_free_slot_info(si); |
| 206 | 459 | ||
| 207 | return status; | 460 | return status; |
| 208 | } | 461 | } |
| 209 | 462 | ||
| 210 | void ocfs2_free_slot_info(struct ocfs2_slot_info *si) | 463 | void ocfs2_free_slot_info(struct ocfs2_super *osb) |
| 211 | { | 464 | { |
| 212 | if (si->si_inode) | 465 | struct ocfs2_slot_info *si = osb->slot_info; |
| 213 | iput(si->si_inode); | 466 | |
| 214 | if (si->si_bh) | 467 | osb->slot_info = NULL; |
| 215 | brelse(si->si_bh); | 468 | __ocfs2_free_slot_info(si); |
| 216 | kfree(si); | ||
| 217 | } | 469 | } |
| 218 | 470 | ||
| 219 | int ocfs2_find_slot(struct ocfs2_super *osb) | 471 | int ocfs2_find_slot(struct ocfs2_super *osb) |
| 220 | { | 472 | { |
| 221 | int status; | 473 | int status; |
| 222 | s16 slot; | 474 | int slot; |
| 223 | struct ocfs2_slot_info *si; | 475 | struct ocfs2_slot_info *si; |
| 224 | 476 | ||
| 225 | mlog_entry_void(); | 477 | mlog_entry_void(); |
| 226 | 478 | ||
| 227 | si = osb->slot_info; | 479 | si = osb->slot_info; |
| 228 | 480 | ||
| 481 | spin_lock(&osb->osb_lock); | ||
| 229 | ocfs2_update_slot_info(si); | 482 | ocfs2_update_slot_info(si); |
| 230 | 483 | ||
| 231 | spin_lock(&si->si_lock); | ||
| 232 | /* search for ourselves first and take the slot if it already | 484 | /* search for ourselves first and take the slot if it already |
| 233 | * exists. Perhaps we need to mark this in a variable for our | 485 | * exists. Perhaps we need to mark this in a variable for our |
| 234 | * own journal recovery? Possibly not, though we certainly | 486 | * own journal recovery? Possibly not, though we certainly |
| 235 | * need to warn to the user */ | 487 | * need to warn to the user */ |
| 236 | slot = __ocfs2_node_num_to_slot(si, osb->node_num); | 488 | slot = __ocfs2_node_num_to_slot(si, osb->node_num); |
| 237 | if (slot == OCFS2_INVALID_SLOT) { | 489 | if (slot < 0) { |
| 238 | /* if no slot yet, then just take 1st available | 490 | /* if no slot yet, then just take 1st available |
| 239 | * one. */ | 491 | * one. */ |
| 240 | slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); | 492 | slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); |
| 241 | if (slot == OCFS2_INVALID_SLOT) { | 493 | if (slot < 0) { |
| 242 | spin_unlock(&si->si_lock); | 494 | spin_unlock(&osb->osb_lock); |
| 243 | mlog(ML_ERROR, "no free slots available!\n"); | 495 | mlog(ML_ERROR, "no free slots available!\n"); |
| 244 | status = -EINVAL; | 496 | status = -EINVAL; |
| 245 | goto bail; | 497 | goto bail; |
| @@ -248,13 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb) | |||
| 248 | mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", | 500 | mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", |
| 249 | slot); | 501 | slot); |
| 250 | 502 | ||
| 251 | __ocfs2_fill_slot(si, slot, osb->node_num); | 503 | ocfs2_set_slot(si, slot, osb->node_num); |
| 252 | osb->slot_num = slot; | 504 | osb->slot_num = slot; |
| 253 | spin_unlock(&si->si_lock); | 505 | spin_unlock(&osb->osb_lock); |
| 254 | 506 | ||
| 255 | mlog(0, "taking node slot %d\n", osb->slot_num); | 507 | mlog(0, "taking node slot %d\n", osb->slot_num); |
| 256 | 508 | ||
| 257 | status = ocfs2_update_disk_slots(osb, si); | 509 | status = ocfs2_update_disk_slot(osb, si, osb->slot_num); |
| 258 | if (status < 0) | 510 | if (status < 0) |
| 259 | mlog_errno(status); | 511 | mlog_errno(status); |
| 260 | 512 | ||
| @@ -265,27 +517,27 @@ bail: | |||
| 265 | 517 | ||
| 266 | void ocfs2_put_slot(struct ocfs2_super *osb) | 518 | void ocfs2_put_slot(struct ocfs2_super *osb) |
| 267 | { | 519 | { |
| 268 | int status; | 520 | int status, slot_num; |
| 269 | struct ocfs2_slot_info *si = osb->slot_info; | 521 | struct ocfs2_slot_info *si = osb->slot_info; |
| 270 | 522 | ||
| 271 | if (!si) | 523 | if (!si) |
| 272 | return; | 524 | return; |
| 273 | 525 | ||
| 526 | spin_lock(&osb->osb_lock); | ||
| 274 | ocfs2_update_slot_info(si); | 527 | ocfs2_update_slot_info(si); |
| 275 | 528 | ||
| 276 | spin_lock(&si->si_lock); | 529 | slot_num = osb->slot_num; |
| 277 | __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); | 530 | ocfs2_invalidate_slot(si, osb->slot_num); |
| 278 | osb->slot_num = OCFS2_INVALID_SLOT; | 531 | osb->slot_num = OCFS2_INVALID_SLOT; |
| 279 | spin_unlock(&si->si_lock); | 532 | spin_unlock(&osb->osb_lock); |
| 280 | 533 | ||
| 281 | status = ocfs2_update_disk_slots(osb, si); | 534 | status = ocfs2_update_disk_slot(osb, si, slot_num); |
| 282 | if (status < 0) { | 535 | if (status < 0) { |
| 283 | mlog_errno(status); | 536 | mlog_errno(status); |
| 284 | goto bail; | 537 | goto bail; |
| 285 | } | 538 | } |
| 286 | 539 | ||
| 287 | bail: | 540 | bail: |
| 288 | osb->slot_info = NULL; | 541 | ocfs2_free_slot_info(osb); |
| 289 | ocfs2_free_slot_info(si); | ||
| 290 | } | 542 | } |
| 291 | 543 | ||
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index 1025872aaade..601c95fd7003 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h | |||
| @@ -27,38 +27,18 @@ | |||
| 27 | #ifndef SLOTMAP_H | 27 | #ifndef SLOTMAP_H |
| 28 | #define SLOTMAP_H | 28 | #define SLOTMAP_H |
| 29 | 29 | ||
| 30 | struct ocfs2_slot_info { | ||
| 31 | spinlock_t si_lock; | ||
| 32 | |||
| 33 | struct inode *si_inode; | ||
| 34 | struct buffer_head *si_bh; | ||
| 35 | unsigned int si_num_slots; | ||
| 36 | unsigned int si_size; | ||
| 37 | s16 si_global_node_nums[OCFS2_MAX_SLOTS]; | ||
| 38 | }; | ||
| 39 | |||
| 40 | int ocfs2_init_slot_info(struct ocfs2_super *osb); | 30 | int ocfs2_init_slot_info(struct ocfs2_super *osb); |
| 41 | void ocfs2_free_slot_info(struct ocfs2_slot_info *si); | 31 | void ocfs2_free_slot_info(struct ocfs2_super *osb); |
| 42 | 32 | ||
| 43 | int ocfs2_find_slot(struct ocfs2_super *osb); | 33 | int ocfs2_find_slot(struct ocfs2_super *osb); |
| 44 | void ocfs2_put_slot(struct ocfs2_super *osb); | 34 | void ocfs2_put_slot(struct ocfs2_super *osb); |
| 45 | 35 | ||
| 46 | void ocfs2_update_slot_info(struct ocfs2_slot_info *si); | 36 | int ocfs2_refresh_slot_info(struct ocfs2_super *osb); |
| 47 | int ocfs2_update_disk_slots(struct ocfs2_super *osb, | ||
| 48 | struct ocfs2_slot_info *si); | ||
| 49 | |||
| 50 | s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
| 51 | s16 global); | ||
| 52 | void ocfs2_clear_slot(struct ocfs2_slot_info *si, | ||
| 53 | s16 slot_num); | ||
| 54 | 37 | ||
| 55 | static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, | 38 | int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num); |
| 56 | int slot_num) | 39 | int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, |
| 57 | { | 40 | unsigned int *node_num); |
| 58 | BUG_ON(slot_num == OCFS2_INVALID_SLOT); | ||
| 59 | assert_spin_locked(&si->si_lock); | ||
| 60 | 41 | ||
| 61 | return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; | 42 | int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num); |
| 62 | } | ||
| 63 | 43 | ||
| 64 | #endif | 44 | #endif |
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c new file mode 100644 index 000000000000..ac1d74c63bf5 --- /dev/null +++ b/fs/ocfs2/stack_o2cb.c | |||
| @@ -0,0 +1,420 @@ | |||
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
| 3 | * | ||
| 4 | * stack_o2cb.c | ||
| 5 | * | ||
| 6 | * Code which interfaces ocfs2 with the o2cb stack. | ||
| 7 | * | ||
| 8 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or | ||
| 11 | * modify it under the terms of the GNU General Public | ||
| 12 | * License as published by the Free Software Foundation, version 2. | ||
| 13 | * | ||
| 14 | * This program is distributed in the hope that it will be useful, | ||
| 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 17 | * General Public License for more details. | ||
| 18 | */ | ||
| 19 | |||
| 20 | #include <linux/crc32.h> | ||
| 21 | #include <linux/module.h> | ||
| 22 | |||
| 23 | /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ | ||
| 24 | #include <linux/fs.h> | ||
| 25 | |||
| 26 | #include "cluster/masklog.h" | ||
| 27 | #include "cluster/nodemanager.h" | ||
| 28 | #include "cluster/heartbeat.h" | ||
| 29 | |||
| 30 | #include "stackglue.h" | ||
| 31 | |||
| 32 | struct o2dlm_private { | ||
| 33 | struct dlm_eviction_cb op_eviction_cb; | ||
| 34 | }; | ||
| 35 | |||
| 36 | static struct ocfs2_stack_plugin o2cb_stack; | ||
| 37 | |||
| 38 | /* These should be identical */ | ||
| 39 | #if (DLM_LOCK_IV != LKM_IVMODE) | ||
| 40 | # error Lock modes do not match | ||
| 41 | #endif | ||
| 42 | #if (DLM_LOCK_NL != LKM_NLMODE) | ||
| 43 | # error Lock modes do not match | ||
| 44 | #endif | ||
| 45 | #if (DLM_LOCK_CR != LKM_CRMODE) | ||
| 46 | # error Lock modes do not match | ||
| 47 | #endif | ||
| 48 | #if (DLM_LOCK_CW != LKM_CWMODE) | ||
| 49 | # error Lock modes do not match | ||
| 50 | #endif | ||
| 51 | #if (DLM_LOCK_PR != LKM_PRMODE) | ||
| 52 | # error Lock modes do not match | ||
| 53 | #endif | ||
| 54 | #if (DLM_LOCK_PW != LKM_PWMODE) | ||
| 55 | # error Lock modes do not match | ||
| 56 | #endif | ||
| 57 | #if (DLM_LOCK_EX != LKM_EXMODE) | ||
| 58 | # error Lock modes do not match | ||
| 59 | #endif | ||
| 60 | static inline int mode_to_o2dlm(int mode) | ||
| 61 | { | ||
| 62 | BUG_ON(mode > LKM_MAXMODE); | ||
| 63 | |||
| 64 | return mode; | ||
| 65 | } | ||
| 66 | |||
| 67 | #define map_flag(_generic, _o2dlm) \ | ||
| 68 | if (flags & (_generic)) { \ | ||
| 69 | flags &= ~(_generic); \ | ||
| 70 | o2dlm_flags |= (_o2dlm); \ | ||
| 71 | } | ||
| 72 | static int flags_to_o2dlm(u32 flags) | ||
| 73 | { | ||
| 74 | int o2dlm_flags = 0; | ||
| 75 | |||
| 76 | map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE); | ||
| 77 | map_flag(DLM_LKF_CANCEL, LKM_CANCEL); | ||
| 78 | map_flag(DLM_LKF_CONVERT, LKM_CONVERT); | ||
| 79 | map_flag(DLM_LKF_VALBLK, LKM_VALBLK); | ||
| 80 | map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK); | ||
| 81 | map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN); | ||
| 82 | map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE); | ||
| 83 | map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT); | ||
| 84 | map_flag(DLM_LKF_LOCAL, LKM_LOCAL); | ||
| 85 | |||
| 86 | /* map_flag() should have cleared every flag passed in */ | ||
| 87 | BUG_ON(flags != 0); | ||
| 88 | |||
| 89 | return o2dlm_flags; | ||
| 90 | } | ||
| 91 | #undef map_flag | ||
| 92 | |||
/*
 * Map an o2dlm status to standard errno values.
 *
 * o2dlm only uses a handful of these, and returns even fewer to the
 * caller. Still, we try to assign sane values to each error.
 *
 * The following value pairs have special meanings to dlmglue, thus
 * the right hand side needs to stay unique - never duplicate the
 * mapping elsewhere in the table!
 *
 * DLM_NORMAL:		0
 * DLM_NOTQUEUED:	-EAGAIN
 * DLM_CANCELGRANT:	-EBUSY
 * DLM_CANCEL:		-DLM_ECANCEL
 */
/* Keep in sync with dlmapi.h */
/* Indexed by enum dlm_status; consumed only by dlm_status_to_errno(). */
static int status_map[] = {
	[DLM_NORMAL]			= 0,		/* Success */
	[DLM_GRANTED]			= -EINVAL,
	[DLM_DENIED]			= -EACCES,
	[DLM_DENIED_NOLOCKS]		= -EACCES,
	[DLM_WORKING]			= -EACCES,
	[DLM_BLOCKED]			= -EINVAL,
	[DLM_BLOCKED_ORPHAN]		= -EINVAL,
	[DLM_DENIED_GRACE_PERIOD]	= -EACCES,
	[DLM_SYSERR]			= -ENOMEM,	/* It is what it is */
	[DLM_NOSUPPORT]			= -EPROTO,
	[DLM_CANCELGRANT]		= -EBUSY,	/* Cancel after grant */
	[DLM_IVLOCKID]			= -EINVAL,
	[DLM_SYNC]			= -EINVAL,
	[DLM_BADTYPE]			= -EINVAL,
	[DLM_BADRESOURCE]		= -EINVAL,
	[DLM_MAXHANDLES]		= -ENOMEM,
	[DLM_NOCLINFO]			= -EINVAL,
	[DLM_NOLOCKMGR]			= -EINVAL,
	[DLM_NOPURGED]			= -EINVAL,
	[DLM_BADARGS]			= -EINVAL,
	[DLM_VOID]			= -EINVAL,
	[DLM_NOTQUEUED]			= -EAGAIN,	/* Trylock failed */
	[DLM_IVBUFLEN]			= -EINVAL,
	[DLM_CVTUNGRANT]		= -EPERM,
	[DLM_BADPARAM]			= -EINVAL,
	[DLM_VALNOTVALID]		= -EINVAL,
	[DLM_REJECTED]			= -EPERM,
	[DLM_ABORT]			= -EINVAL,
	[DLM_CANCEL]			= -DLM_ECANCEL,	/* Successful cancel */
	[DLM_IVRESHANDLE]		= -EINVAL,
	[DLM_DEADLOCK]			= -EDEADLK,
	[DLM_DENIED_NOASTS]		= -EINVAL,
	[DLM_FORWARD]			= -EINVAL,
	[DLM_TIMEOUT]			= -ETIMEDOUT,
	[DLM_IVGROUPID]			= -EINVAL,
	[DLM_VERS_CONFLICT]		= -EOPNOTSUPP,
	[DLM_BAD_DEVICE_PATH]		= -ENOENT,
	[DLM_NO_DEVICE_PERMISSION]	= -EPERM,
	[DLM_NO_CONTROL_DEVICE]		= -ENOENT,
	[DLM_RECOVERING]		= -ENOTCONN,
	[DLM_MIGRATING]			= -ERESTART,
	[DLM_MAXSTATS]			= -EINVAL,
};
| 153 | |||
| 154 | static int dlm_status_to_errno(enum dlm_status status) | ||
| 155 | { | ||
| 156 | BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); | ||
| 157 | |||
| 158 | return status_map[status]; | ||
| 159 | } | ||
| 160 | |||
| 161 | static void o2dlm_lock_ast_wrapper(void *astarg) | ||
| 162 | { | ||
| 163 | BUG_ON(o2cb_stack.sp_proto == NULL); | ||
| 164 | |||
| 165 | o2cb_stack.sp_proto->lp_lock_ast(astarg); | ||
| 166 | } | ||
| 167 | |||
| 168 | static void o2dlm_blocking_ast_wrapper(void *astarg, int level) | ||
| 169 | { | ||
| 170 | BUG_ON(o2cb_stack.sp_proto == NULL); | ||
| 171 | |||
| 172 | o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); | ||
| 173 | } | ||
| 174 | |||
| 175 | static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) | ||
| 176 | { | ||
| 177 | int error = dlm_status_to_errno(status); | ||
| 178 | |||
| 179 | BUG_ON(o2cb_stack.sp_proto == NULL); | ||
| 180 | |||
| 181 | /* | ||
| 182 | * In o2dlm, you can get both the lock_ast() for the lock being | ||
| 183 | * granted and the unlock_ast() for the CANCEL failing. A | ||
| 184 | * successful cancel sends DLM_NORMAL here. If the | ||
| 185 | * lock grant happened before the cancel arrived, you get | ||
| 186 | * DLM_CANCELGRANT. | ||
| 187 | * | ||
| 188 | * There's no need for the double-ast. If we see DLM_CANCELGRANT, | ||
| 189 | * we just ignore it. We expect the lock_ast() to handle the | ||
| 190 | * granted lock. | ||
| 191 | */ | ||
| 192 | if (status == DLM_CANCELGRANT) | ||
| 193 | return; | ||
| 194 | |||
| 195 | o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); | ||
| 196 | } | ||
| 197 | |||
| 198 | static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, | ||
| 199 | int mode, | ||
| 200 | union ocfs2_dlm_lksb *lksb, | ||
| 201 | u32 flags, | ||
| 202 | void *name, | ||
| 203 | unsigned int namelen, | ||
| 204 | void *astarg) | ||
| 205 | { | ||
| 206 | enum dlm_status status; | ||
| 207 | int o2dlm_mode = mode_to_o2dlm(mode); | ||
| 208 | int o2dlm_flags = flags_to_o2dlm(flags); | ||
| 209 | int ret; | ||
| 210 | |||
| 211 | status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, | ||
| 212 | o2dlm_flags, name, namelen, | ||
| 213 | o2dlm_lock_ast_wrapper, astarg, | ||
| 214 | o2dlm_blocking_ast_wrapper); | ||
| 215 | ret = dlm_status_to_errno(status); | ||
| 216 | return ret; | ||
| 217 | } | ||
| 218 | |||
| 219 | static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, | ||
| 220 | union ocfs2_dlm_lksb *lksb, | ||
| 221 | u32 flags, | ||
| 222 | void *astarg) | ||
| 223 | { | ||
| 224 | enum dlm_status status; | ||
| 225 | int o2dlm_flags = flags_to_o2dlm(flags); | ||
| 226 | int ret; | ||
| 227 | |||
| 228 | status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, | ||
| 229 | o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); | ||
| 230 | ret = dlm_status_to_errno(status); | ||
| 231 | return ret; | ||
| 232 | } | ||
| 233 | |||
| 234 | static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) | ||
| 235 | { | ||
| 236 | return dlm_status_to_errno(lksb->lksb_o2dlm.status); | ||
| 237 | } | ||
| 238 | |||
| 239 | static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) | ||
| 240 | { | ||
| 241 | return (void *)(lksb->lksb_o2dlm.lvb); | ||
| 242 | } | ||
| 243 | |||
/* Ask o2dlm to dump the state of the lock identified by this lksb. */
static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
{
	dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
}
| 248 | |||
/*
 * Called from the dlm when it's about to evict a node. This is how the
 * classic stack signals node death.
 */
static void o2dlm_eviction_cb(int node_num, void *data)
{
	/* data is the connection passed to dlm_setup_eviction_cb() in
	 * o2cb_cluster_connect() */
	struct ocfs2_cluster_connection *conn = data;

	mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
	     node_num, conn->cc_namelen, conn->cc_name);

	/* Hand the dead node to the filesystem's recovery handler */
	conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
}
| 262 | |||
| 263 | static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) | ||
| 264 | { | ||
| 265 | int rc = 0; | ||
| 266 | u32 dlm_key; | ||
| 267 | struct dlm_ctxt *dlm; | ||
| 268 | struct o2dlm_private *priv; | ||
| 269 | struct dlm_protocol_version dlm_version; | ||
| 270 | |||
| 271 | BUG_ON(conn == NULL); | ||
| 272 | BUG_ON(o2cb_stack.sp_proto == NULL); | ||
| 273 | |||
| 274 | /* for now we only have one cluster/node, make sure we see it | ||
| 275 | * in the heartbeat universe */ | ||
| 276 | if (!o2hb_check_local_node_heartbeating()) { | ||
| 277 | rc = -EINVAL; | ||
| 278 | goto out; | ||
| 279 | } | ||
| 280 | |||
| 281 | priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL); | ||
| 282 | if (!priv) { | ||
| 283 | rc = -ENOMEM; | ||
| 284 | goto out_free; | ||
| 285 | } | ||
| 286 | |||
| 287 | /* This just fills the structure in. It is safe to pass conn. */ | ||
| 288 | dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb, | ||
| 289 | conn); | ||
| 290 | |||
| 291 | conn->cc_private = priv; | ||
| 292 | |||
| 293 | /* used by the dlm code to make message headers unique, each | ||
| 294 | * node in this domain must agree on this. */ | ||
| 295 | dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); | ||
| 296 | dlm_version.pv_major = conn->cc_version.pv_major; | ||
| 297 | dlm_version.pv_minor = conn->cc_version.pv_minor; | ||
| 298 | |||
| 299 | dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version); | ||
| 300 | if (IS_ERR(dlm)) { | ||
| 301 | rc = PTR_ERR(dlm); | ||
| 302 | mlog_errno(rc); | ||
| 303 | goto out_free; | ||
| 304 | } | ||
| 305 | |||
| 306 | conn->cc_version.pv_major = dlm_version.pv_major; | ||
| 307 | conn->cc_version.pv_minor = dlm_version.pv_minor; | ||
| 308 | conn->cc_lockspace = dlm; | ||
| 309 | |||
| 310 | dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); | ||
| 311 | |||
| 312 | out_free: | ||
| 313 | if (rc && conn->cc_private) | ||
| 314 | kfree(conn->cc_private); | ||
| 315 | |||
| 316 | out: | ||
| 317 | return rc; | ||
| 318 | } | ||
| 319 | |||
| 320 | static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn, | ||
| 321 | int hangup_pending) | ||
| 322 | { | ||
| 323 | struct dlm_ctxt *dlm = conn->cc_lockspace; | ||
| 324 | struct o2dlm_private *priv = conn->cc_private; | ||
| 325 | |||
| 326 | dlm_unregister_eviction_cb(&priv->op_eviction_cb); | ||
| 327 | conn->cc_private = NULL; | ||
| 328 | kfree(priv); | ||
| 329 | |||
| 330 | dlm_unregister_domain(dlm); | ||
| 331 | conn->cc_lockspace = NULL; | ||
| 332 | |||
| 333 | return 0; | ||
| 334 | } | ||
| 335 | |||
| 336 | static void o2hb_stop(const char *group) | ||
| 337 | { | ||
| 338 | int ret; | ||
| 339 | char *argv[5], *envp[3]; | ||
| 340 | |||
| 341 | argv[0] = (char *)o2nm_get_hb_ctl_path(); | ||
| 342 | argv[1] = "-K"; | ||
| 343 | argv[2] = "-u"; | ||
| 344 | argv[3] = (char *)group; | ||
| 345 | argv[4] = NULL; | ||
| 346 | |||
| 347 | mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); | ||
| 348 | |||
| 349 | /* minimal command environment taken from cpu_run_sbin_hotplug */ | ||
| 350 | envp[0] = "HOME=/"; | ||
| 351 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
| 352 | envp[2] = NULL; | ||
| 353 | |||
| 354 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); | ||
| 355 | if (ret < 0) | ||
| 356 | mlog_errno(ret); | ||
| 357 | } | ||
| 358 | |||
/*
 * Hangup is a hack for tools compatibility.  Older ocfs2-tools software
 * expects the filesystem to call "ocfs2_hb_ctl" during unmount.  This
 * happens regardless of whether the DLM got started, so we can't do it
 * in ocfs2_cluster_disconnect().  We bring the o2hb_stop() function into
 * the glue and provide a "hangup" API for super.c to call.
 *
 * Other stacks will eventually provide a NULL ->hangup() pointer.
 */
static void o2cb_cluster_hangup(const char *group, int grouplen)
{
	/* grouplen is unused; o2hb_stop() takes the NUL-terminated name */
	o2hb_stop(group);
}
| 372 | |||
| 373 | static int o2cb_cluster_this_node(unsigned int *node) | ||
| 374 | { | ||
| 375 | int node_num; | ||
| 376 | |||
| 377 | node_num = o2nm_this_node(); | ||
| 378 | if (node_num == O2NM_INVALID_NODE_NUM) | ||
| 379 | return -ENOENT; | ||
| 380 | |||
| 381 | if (node_num >= O2NM_MAX_NODES) | ||
| 382 | return -EOVERFLOW; | ||
| 383 | |||
| 384 | *node = node_num; | ||
| 385 | return 0; | ||
| 386 | } | ||
| 387 | |||
/*
 * Operations exported to the ocfs2 stack glue.  The glue dispatches
 * through this table once the "o2cb" plugin is selected.
 */
struct ocfs2_stack_operations o2cb_stack_ops = {
	.connect	= o2cb_cluster_connect,
	.disconnect	= o2cb_cluster_disconnect,
	.hangup		= o2cb_cluster_hangup,
	.this_node	= o2cb_cluster_this_node,
	.dlm_lock	= o2cb_dlm_lock,
	.dlm_unlock	= o2cb_dlm_unlock,
	.lock_status	= o2cb_dlm_lock_status,
	.lock_lvb	= o2cb_dlm_lvb,
	.dump_lksb	= o2cb_dump_lksb,
};
| 399 | |||
/* The plugin descriptor registered with the stack glue as "o2cb". */
static struct ocfs2_stack_plugin o2cb_stack = {
	.sp_name	= "o2cb",
	.sp_ops		= &o2cb_stack_ops,
	.sp_owner	= THIS_MODULE,
};
| 405 | |||
/* Register the o2cb plugin with the ocfs2 stack glue on module load. */
static int __init o2cb_stack_init(void)
{
	return ocfs2_stack_glue_register(&o2cb_stack);
}

/* Unregister the plugin on module unload. */
static void __exit o2cb_stack_exit(void)
{
	ocfs2_stack_glue_unregister(&o2cb_stack);
}

MODULE_AUTHOR("Oracle");
MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
MODULE_LICENSE("GPL");
module_init(o2cb_stack_init);
module_exit(o2cb_stack_exit);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c new file mode 100644 index 000000000000..7428663f9cbb --- /dev/null +++ b/fs/ocfs2/stack_user.c | |||
| @@ -0,0 +1,883 @@ | |||
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
| 3 | * | ||
| 4 | * stack_user.c | ||
| 5 | * | ||
| 6 | * Code which interfaces ocfs2 with fs/dlm and a userspace stack. | ||
| 7 | * | ||
| 8 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or | ||
| 11 | * modify it under the terms of the GNU General Public | ||
| 12 | * License as published by the Free Software Foundation, version 2. | ||
| 13 | * | ||
| 14 | * This program is distributed in the hope that it will be useful, | ||
| 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 17 | * General Public License for more details. | ||
| 18 | */ | ||
| 19 | |||
| 20 | #include <linux/module.h> | ||
| 21 | #include <linux/fs.h> | ||
| 22 | #include <linux/miscdevice.h> | ||
| 23 | #include <linux/mutex.h> | ||
| 24 | #include <linux/reboot.h> | ||
| 25 | #include <asm/uaccess.h> | ||
| 26 | |||
| 27 | #include "ocfs2.h" /* For struct ocfs2_lock_res */ | ||
| 28 | #include "stackglue.h" | ||
| 29 | |||
| 30 | |||
| 31 | /* | ||
| 32 | * The control protocol starts with a handshake. Until the handshake | ||
| 33 | * is complete, the control device will fail all write(2)s. | ||
| 34 | * | ||
| 35 | * The handshake is simple. First, the client reads until EOF. Each line | ||
| 36 | * of output is a supported protocol tag. All protocol tags are a single | ||
| 37 | * character followed by a two hex digit version number. Currently the | ||
 * only thing supported is T01, for "Text-based version 0x01".  Next, the
| 39 | * client writes the version they would like to use, including the newline. | ||
| 40 | * Thus, the protocol tag is 'T01\n'. If the version tag written is | ||
| 41 | * unknown, -EINVAL is returned. Once the negotiation is complete, the | ||
| 42 | * client can start sending messages. | ||
| 43 | * | ||
| 44 | * The T01 protocol has three messages. First is the "SETN" message. | ||
| 45 | * It has the following syntax: | ||
| 46 | * | ||
| 47 | * SETN<space><8-char-hex-nodenum><newline> | ||
| 48 | * | ||
| 49 | * This is 14 characters. | ||
| 50 | * | ||
| 51 | * The "SETN" message must be the first message following the protocol. | ||
| 52 | * It tells ocfs2_control the local node number. | ||
| 53 | * | ||
| 54 | * Next comes the "SETV" message. It has the following syntax: | ||
| 55 | * | ||
| 56 | * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> | ||
| 57 | * | ||
| 58 | * This is 11 characters. | ||
| 59 | * | ||
| 60 | * The "SETV" message sets the filesystem locking protocol version as | ||
| 61 | * negotiated by the client. The client negotiates based on the maximum | ||
| 62 | * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major | ||
| 63 | * number from the "SETV" message must match | ||
| 64 | * user_stack.sp_proto->lp_max_version.pv_major, and the minor number | ||
| 65 | * must be less than or equal to ...->lp_max_version.pv_minor. | ||
| 66 | * | ||
| 67 | * Once this information has been set, mounts will be allowed. From this | ||
| 68 | * point on, the "DOWN" message can be sent for node down notification. | ||
| 69 | * It has the following syntax: | ||
| 70 | * | ||
| 71 | * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> | ||
| 72 | * | ||
| 73 | * eg: | ||
| 74 | * | ||
| 75 | * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n | ||
| 76 | * | ||
| 77 | * This is 47 characters. | ||
| 78 | */ | ||
| 79 | |||
/*
 * Whether or not the client has done the handshake.
 * For now, we have just one protocol version.
 */
#define OCFS2_CONTROL_PROTO			"T01\n"
#define OCFS2_CONTROL_PROTO_LEN			4

/* Handshake states */
#define OCFS2_CONTROL_HANDSHAKE_INVALID		(0)
#define OCFS2_CONTROL_HANDSHAKE_READ		(1)
#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL	(2)
#define OCFS2_CONTROL_HANDSHAKE_VALID		(3)

/* Messages.  The *_TOTAL_LEN values count the entire fixed-size
 * message, including separators and the trailing newline - they must
 * match the corresponding struct ocfs2_control_message_* layouts. */
#define OCFS2_CONTROL_MESSAGE_OP_LEN		4
#define OCFS2_CONTROL_MESSAGE_SETNODE_OP	"SETN"
#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN	14
#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP	"SETV"
#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN	11
#define OCFS2_CONTROL_MESSAGE_DOWN_OP		"DOWN"
#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN	47
#define OCFS2_TEXT_UUID_LEN			32
#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN	2
#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN	8
| 104 | |||
/*
 * ocfs2_live_connection is refcounted because the filesystem and
 * miscdevice sides can detach in different order.  Let's just be safe.
 */
struct ocfs2_live_connection {
	struct list_head		oc_list;  /* on ocfs2_live_connection_list */
	struct ocfs2_cluster_connection	*oc_conn; /* the fs-side connection */
};

/* Per-open-file state for the ocfs2_control device. */
struct ocfs2_control_private {
	struct list_head	op_list;
	int			op_state;      /* OCFS2_CONTROL_HANDSHAKE_* */
	int			op_this_node;  /* from SETN; negative until set */
	struct ocfs2_protocol_version op_proto;	/* from SETV */
};

/* SETN<space><8-char-hex-nodenum><newline> */
struct ocfs2_control_message_setn {
	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
	char	space;
	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
	char	newline;
};

/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */
struct ocfs2_control_message_setv {
	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
	char	space1;
	char	major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
	char	space2;
	char	minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
	char	newline;
};

/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
struct ocfs2_control_message_down {
	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
	char	space1;
	char	uuid[OCFS2_TEXT_UUID_LEN];
	char	space2;
	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
	char	newline;
};

/* All messages share the 4-byte tag, used to dispatch on receipt. */
union ocfs2_control_message {
	char					tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
	struct ocfs2_control_message_setn	u_setn;
	struct ocfs2_control_message_setv	u_setv;
	struct ocfs2_control_message_down	u_down;
};
| 155 | |||
/* Tentative declaration; presumably initialized later in this file. */
static struct ocfs2_stack_plugin user_stack;

/* Count of control connections that completed the full handshake
 * (bumped in ocfs2_control_install_private()). */
static atomic_t ocfs2_control_opened;
/* Local node number; -1 until a daemon sends SETN. */
static int ocfs2_control_this_node = -1;
/* Locking protocol version negotiated via SETV. */
static struct ocfs2_protocol_version running_proto;

static LIST_HEAD(ocfs2_live_connection_list);
static LIST_HEAD(ocfs2_control_private_list);
/* Protects the lists and globals above. */
static DEFINE_MUTEX(ocfs2_control_lock);
| 165 | |||
/* Record the handshake state in the per-file private data. */
static inline void ocfs2_control_set_handshake_state(struct file *file,
						     int state)
{
	struct ocfs2_control_private *p = file->private_data;
	p->op_state = state;
}

/* Return this file's handshake state (OCFS2_CONTROL_HANDSHAKE_*). */
static inline int ocfs2_control_get_handshake_state(struct file *file)
{
	struct ocfs2_control_private *p = file->private_data;
	return p->op_state;
}
| 178 | |||
| 179 | static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) | ||
| 180 | { | ||
| 181 | size_t len = strlen(name); | ||
| 182 | struct ocfs2_live_connection *c; | ||
| 183 | |||
| 184 | BUG_ON(!mutex_is_locked(&ocfs2_control_lock)); | ||
| 185 | |||
| 186 | list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) { | ||
| 187 | if ((c->oc_conn->cc_namelen == len) && | ||
| 188 | !strncmp(c->oc_conn->cc_name, name, len)) | ||
| 189 | return c; | ||
| 190 | } | ||
| 191 | |||
| 192 | return c; | ||
| 193 | } | ||
| 194 | |||
| 195 | /* | ||
| 196 | * ocfs2_live_connection structures are created underneath the ocfs2 | ||
| 197 | * mount path. Since the VFS prevents multiple calls to | ||
| 198 | * fill_super(), we can't get dupes here. | ||
| 199 | */ | ||
| 200 | static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, | ||
| 201 | struct ocfs2_live_connection **c_ret) | ||
| 202 | { | ||
| 203 | int rc = 0; | ||
| 204 | struct ocfs2_live_connection *c; | ||
| 205 | |||
| 206 | c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); | ||
| 207 | if (!c) | ||
| 208 | return -ENOMEM; | ||
| 209 | |||
| 210 | mutex_lock(&ocfs2_control_lock); | ||
| 211 | c->oc_conn = conn; | ||
| 212 | |||
| 213 | if (atomic_read(&ocfs2_control_opened)) | ||
| 214 | list_add(&c->oc_list, &ocfs2_live_connection_list); | ||
| 215 | else { | ||
| 216 | printk(KERN_ERR | ||
| 217 | "ocfs2: Userspace control daemon is not present\n"); | ||
| 218 | rc = -ESRCH; | ||
| 219 | } | ||
| 220 | |||
| 221 | mutex_unlock(&ocfs2_control_lock); | ||
| 222 | |||
| 223 | if (!rc) | ||
| 224 | *c_ret = c; | ||
| 225 | else | ||
| 226 | kfree(c); | ||
| 227 | |||
| 228 | return rc; | ||
| 229 | } | ||
| 230 | |||
| 231 | /* | ||
| 232 | * This function disconnects the cluster connection from ocfs2_control. | ||
| 233 | * Afterwards, userspace can't affect the cluster connection. | ||
| 234 | */ | ||
| 235 | static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c) | ||
| 236 | { | ||
| 237 | mutex_lock(&ocfs2_control_lock); | ||
| 238 | list_del_init(&c->oc_list); | ||
| 239 | c->oc_conn = NULL; | ||
| 240 | mutex_unlock(&ocfs2_control_lock); | ||
| 241 | |||
| 242 | kfree(c); | ||
| 243 | } | ||
| 244 | |||
| 245 | static int ocfs2_control_cfu(void *target, size_t target_len, | ||
| 246 | const char __user *buf, size_t count) | ||
| 247 | { | ||
| 248 | /* The T01 expects write(2) calls to have exactly one command */ | ||
| 249 | if ((count != target_len) || | ||
| 250 | (count > sizeof(union ocfs2_control_message))) | ||
| 251 | return -EINVAL; | ||
| 252 | |||
| 253 | if (copy_from_user(target, buf, target_len)) | ||
| 254 | return -EFAULT; | ||
| 255 | |||
| 256 | return 0; | ||
| 257 | } | ||
| 258 | |||
| 259 | static ssize_t ocfs2_control_validate_protocol(struct file *file, | ||
| 260 | const char __user *buf, | ||
| 261 | size_t count) | ||
| 262 | { | ||
| 263 | ssize_t ret; | ||
| 264 | char kbuf[OCFS2_CONTROL_PROTO_LEN]; | ||
| 265 | |||
| 266 | ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN, | ||
| 267 | buf, count); | ||
| 268 | if (ret) | ||
| 269 | return ret; | ||
| 270 | |||
| 271 | if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN)) | ||
| 272 | return -EINVAL; | ||
| 273 | |||
| 274 | ocfs2_control_set_handshake_state(file, | ||
| 275 | OCFS2_CONTROL_HANDSHAKE_PROTOCOL); | ||
| 276 | |||
| 277 | return count; | ||
| 278 | } | ||
| 279 | |||
| 280 | static void ocfs2_control_send_down(const char *uuid, | ||
| 281 | int nodenum) | ||
| 282 | { | ||
| 283 | struct ocfs2_live_connection *c; | ||
| 284 | |||
| 285 | mutex_lock(&ocfs2_control_lock); | ||
| 286 | |||
| 287 | c = ocfs2_connection_find(uuid); | ||
| 288 | if (c) { | ||
| 289 | BUG_ON(c->oc_conn == NULL); | ||
| 290 | c->oc_conn->cc_recovery_handler(nodenum, | ||
| 291 | c->oc_conn->cc_recovery_data); | ||
| 292 | } | ||
| 293 | |||
| 294 | mutex_unlock(&ocfs2_control_lock); | ||
| 295 | } | ||
| 296 | |||
/*
 * Called whenever configuration elements are sent to /dev/ocfs2_control.
 * If all configuration elements are present, try to set the global
 * values.  If there is a problem, return an error.  Skip any missing
 * elements, and only bump ocfs2_control_opened when we have all elements
 * and are successful.
 */
static int ocfs2_control_install_private(struct file *file)
{
	int rc = 0;
	int set_p = 1;	/* assume complete until a missing element clears it */
	struct ocfs2_control_private *p = file->private_data;

	BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);

	mutex_lock(&ocfs2_control_lock);

	/* Node number: still missing means "wait for SETN"; a mismatch
	 * with an already-published global node number is an error. */
	if (p->op_this_node < 0) {
		set_p = 0;
	} else if ((ocfs2_control_this_node >= 0) &&
		   (ocfs2_control_this_node != p->op_this_node)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	/* Protocol version: pv_major of 0 means SETV hasn't arrived yet
	 * (SETV rejects major < 1).  Once live connections exist, every
	 * daemon must agree with the running version. */
	if (!p->op_proto.pv_major) {
		set_p = 0;
	} else if (!list_empty(&ocfs2_live_connection_list) &&
		   ((running_proto.pv_major != p->op_proto.pv_major) ||
		    (running_proto.pv_minor != p->op_proto.pv_minor))) {
		rc = -EINVAL;
		goto out_unlock;
	}

	/* Both elements present and consistent - publish them */
	if (set_p) {
		ocfs2_control_this_node = p->op_this_node;
		running_proto.pv_major = p->op_proto.pv_major;
		running_proto.pv_minor = p->op_proto.pv_minor;
	}

out_unlock:
	mutex_unlock(&ocfs2_control_lock);

	if (!rc && set_p) {
		/* We set the global values successfully */
		atomic_inc(&ocfs2_control_opened);
		ocfs2_control_set_handshake_state(file,
					OCFS2_CONTROL_HANDSHAKE_VALID);
	}

	return rc;
}
| 349 | |||
| 350 | static int ocfs2_control_get_this_node(void) | ||
| 351 | { | ||
| 352 | int rc; | ||
| 353 | |||
| 354 | mutex_lock(&ocfs2_control_lock); | ||
| 355 | if (ocfs2_control_this_node < 0) | ||
| 356 | rc = -EINVAL; | ||
| 357 | else | ||
| 358 | rc = ocfs2_control_this_node; | ||
| 359 | mutex_unlock(&ocfs2_control_lock); | ||
| 360 | |||
| 361 | return rc; | ||
| 362 | } | ||
| 363 | |||
/*
 * Handle a "SETN" message: parse the hex node number and record it in
 * this file's private data, then try to install the global values.
 * Only valid while the handshake is in the PROTOCOL state.
 */
static int ocfs2_control_do_setnode_msg(struct file *file,
					struct ocfs2_control_message_setn *msg)
{
	long nodenum;
	char *ptr = NULL;
	struct ocfs2_control_private *p = file->private_data;

	if (ocfs2_control_get_handshake_state(file) !=
	    OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
		return -EINVAL;

	if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
		    OCFS2_CONTROL_MESSAGE_OP_LEN))
		return -EINVAL;

	if ((msg->space != ' ') || (msg->newline != '\n'))
		return -EINVAL;
	/* NUL the separators in place; the newline slot becomes the
	 * terminator of nodestr for simple_strtol() */
	msg->space = msg->newline = '\0';

	nodenum = simple_strtol(msg->nodestr, &ptr, 16);
	if (!ptr || *ptr)
		return -EINVAL;

	/* Reject strtol overflow sentinels and values outside int range */
	if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
	    (nodenum > INT_MAX) || (nodenum < 0))
		return -ERANGE;
	p->op_this_node = nodenum;

	return ocfs2_control_install_private(file);
}
| 394 | |||
/*
 * Handle a "SETV" message: parse the hex major/minor locking protocol
 * version, validate it against the filesystem's maximum, and record it
 * in this file's private data before trying to install the globals.
 * Only valid while the handshake is in the PROTOCOL state.
 */
static int ocfs2_control_do_setversion_msg(struct file *file,
					   struct ocfs2_control_message_setv *msg)
{
	long major, minor;
	char *ptr = NULL;
	struct ocfs2_control_private *p = file->private_data;
	struct ocfs2_protocol_version *max =
		&user_stack.sp_proto->lp_max_version;

	if (ocfs2_control_get_handshake_state(file) !=
	    OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
		return -EINVAL;

	if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
		    OCFS2_CONTROL_MESSAGE_OP_LEN))
		return -EINVAL;

	if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
	    (msg->newline != '\n'))
		return -EINVAL;
	/* NUL the separators so major and minor are terminated in place */
	msg->space1 = msg->space2 = msg->newline = '\0';

	major = simple_strtol(msg->major, &ptr, 16);
	if (!ptr || *ptr)
		return -EINVAL;
	minor = simple_strtol(msg->minor, &ptr, 16);
	if (!ptr || *ptr)
		return -EINVAL;

	/*
	 * The major must be between 1 and 255, inclusive.  The minor
	 * must be between 0 and 255, inclusive.  The version passed in
	 * must be within the maximum version supported by the filesystem.
	 */
	if ((major == LONG_MIN) || (major == LONG_MAX) ||
	    (major > (u8)-1) || (major < 1))
		return -ERANGE;
	if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
	    (minor > (u8)-1) || (minor < 0))
		return -ERANGE;
	if ((major != max->pv_major) ||
	    (minor > max->pv_minor))
		return -EINVAL;

	p->op_proto.pv_major = major;
	p->op_proto.pv_minor = minor;

	return ocfs2_control_install_private(file);
}
| 444 | |||
/*
 * Handle a "DOWN" message: userspace reports that a node has died.
 * Parses the uuid and hex node number and forwards them to the
 * matching live connection.  Requires a completed handshake (VALID).
 */
static int ocfs2_control_do_down_msg(struct file *file,
				     struct ocfs2_control_message_down *msg)
{
	long nodenum;
	char *p = NULL;

	if (ocfs2_control_get_handshake_state(file) !=
	    OCFS2_CONTROL_HANDSHAKE_VALID)
		return -EINVAL;

	if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
		    OCFS2_CONTROL_MESSAGE_OP_LEN))
		return -EINVAL;

	if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
	    (msg->newline != '\n'))
		return -EINVAL;
	/* NUL the separators in place: space2 terminates uuid, newline
	 * terminates nodestr */
	msg->space1 = msg->space2 = msg->newline = '\0';

	nodenum = simple_strtol(msg->nodestr, &p, 16);
	if (!p || *p)
		return -EINVAL;

	/* Reject strtol overflow sentinels and values outside int range */
	if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
	    (nodenum > INT_MAX) || (nodenum < 0))
		return -ERANGE;

	ocfs2_control_send_down(msg->uuid, nodenum);

	return 0;
}
| 476 | |||
| 477 | static ssize_t ocfs2_control_message(struct file *file, | ||
| 478 | const char __user *buf, | ||
| 479 | size_t count) | ||
| 480 | { | ||
| 481 | ssize_t ret; | ||
| 482 | union ocfs2_control_message msg; | ||
| 483 | |||
| 484 | /* Try to catch padding issues */ | ||
| 485 | WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) != | ||
| 486 | (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1))); | ||
| 487 | |||
| 488 | memset(&msg, 0, sizeof(union ocfs2_control_message)); | ||
| 489 | ret = ocfs2_control_cfu(&msg, count, buf, count); | ||
| 490 | if (ret) | ||
| 491 | goto out; | ||
| 492 | |||
| 493 | if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) && | ||
| 494 | !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, | ||
| 495 | OCFS2_CONTROL_MESSAGE_OP_LEN)) | ||
| 496 | ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn); | ||
| 497 | else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) && | ||
| 498 | !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, | ||
| 499 | OCFS2_CONTROL_MESSAGE_OP_LEN)) | ||
| 500 | ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv); | ||
| 501 | else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) && | ||
| 502 | !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, | ||
| 503 | OCFS2_CONTROL_MESSAGE_OP_LEN)) | ||
| 504 | ret = ocfs2_control_do_down_msg(file, &msg.u_down); | ||
| 505 | else | ||
| 506 | ret = -EINVAL; | ||
| 507 | |||
| 508 | out: | ||
| 509 | return ret ? ret : count; | ||
| 510 | } | ||
| 511 | |||
| 512 | static ssize_t ocfs2_control_write(struct file *file, | ||
| 513 | const char __user *buf, | ||
| 514 | size_t count, | ||
| 515 | loff_t *ppos) | ||
| 516 | { | ||
| 517 | ssize_t ret; | ||
| 518 | |||
| 519 | switch (ocfs2_control_get_handshake_state(file)) { | ||
| 520 | case OCFS2_CONTROL_HANDSHAKE_INVALID: | ||
| 521 | ret = -EINVAL; | ||
| 522 | break; | ||
| 523 | |||
| 524 | case OCFS2_CONTROL_HANDSHAKE_READ: | ||
| 525 | ret = ocfs2_control_validate_protocol(file, buf, | ||
| 526 | count); | ||
| 527 | break; | ||
| 528 | |||
| 529 | case OCFS2_CONTROL_HANDSHAKE_PROTOCOL: | ||
| 530 | case OCFS2_CONTROL_HANDSHAKE_VALID: | ||
| 531 | ret = ocfs2_control_message(file, buf, count); | ||
| 532 | break; | ||
| 533 | |||
| 534 | default: | ||
| 535 | BUG(); | ||
| 536 | ret = -EIO; | ||
| 537 | break; | ||
| 538 | } | ||
| 539 | |||
| 540 | return ret; | ||
| 541 | } | ||
| 542 | |||
| 543 | /* | ||
| 544 | * This is a naive version. If we ever have a new protocol, we'll expand | ||
| 545 | * it. Probably using seq_file. | ||
| 546 | */ | ||
| 547 | static ssize_t ocfs2_control_read(struct file *file, | ||
| 548 | char __user *buf, | ||
| 549 | size_t count, | ||
| 550 | loff_t *ppos) | ||
| 551 | { | ||
| 552 | char *proto_string = OCFS2_CONTROL_PROTO; | ||
| 553 | size_t to_write = 0; | ||
| 554 | |||
| 555 | if (*ppos >= OCFS2_CONTROL_PROTO_LEN) | ||
| 556 | return 0; | ||
| 557 | |||
| 558 | to_write = OCFS2_CONTROL_PROTO_LEN - *ppos; | ||
| 559 | if (to_write > count) | ||
| 560 | to_write = count; | ||
| 561 | if (copy_to_user(buf, proto_string + *ppos, to_write)) | ||
| 562 | return -EFAULT; | ||
| 563 | |||
| 564 | *ppos += to_write; | ||
| 565 | |||
| 566 | /* Have we read the whole protocol list? */ | ||
| 567 | if (*ppos >= OCFS2_CONTROL_PROTO_LEN) | ||
| 568 | ocfs2_control_set_handshake_state(file, | ||
| 569 | OCFS2_CONTROL_HANDSHAKE_READ); | ||
| 570 | |||
| 571 | return to_write; | ||
| 572 | } | ||
| 573 | |||
| 574 | static int ocfs2_control_release(struct inode *inode, struct file *file) | ||
| 575 | { | ||
| 576 | struct ocfs2_control_private *p = file->private_data; | ||
| 577 | |||
| 578 | mutex_lock(&ocfs2_control_lock); | ||
| 579 | |||
| 580 | if (ocfs2_control_get_handshake_state(file) != | ||
| 581 | OCFS2_CONTROL_HANDSHAKE_VALID) | ||
| 582 | goto out; | ||
| 583 | |||
| 584 | if (atomic_dec_and_test(&ocfs2_control_opened)) { | ||
| 585 | if (!list_empty(&ocfs2_live_connection_list)) { | ||
| 586 | /* XXX: Do bad things! */ | ||
| 587 | printk(KERN_ERR | ||
| 588 | "ocfs2: Unexpected release of ocfs2_control!\n" | ||
| 589 | " Loss of cluster connection requires " | ||
| 590 | "an emergency restart!\n"); | ||
| 591 | emergency_restart(); | ||
| 592 | } | ||
| 593 | /* | ||
| 594 | * Last valid close clears the node number and resets | ||
| 595 | * the locking protocol version | ||
| 596 | */ | ||
| 597 | ocfs2_control_this_node = -1; | ||
| 598 | running_proto.pv_major = 0; | ||
| 599 | running_proto.pv_major = 0; | ||
| 600 | } | ||
| 601 | |||
| 602 | out: | ||
| 603 | list_del_init(&p->op_list); | ||
| 604 | file->private_data = NULL; | ||
| 605 | |||
| 606 | mutex_unlock(&ocfs2_control_lock); | ||
| 607 | |||
| 608 | kfree(p); | ||
| 609 | |||
| 610 | return 0; | ||
| 611 | } | ||
| 612 | |||
| 613 | static int ocfs2_control_open(struct inode *inode, struct file *file) | ||
| 614 | { | ||
| 615 | struct ocfs2_control_private *p; | ||
| 616 | |||
| 617 | p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL); | ||
| 618 | if (!p) | ||
| 619 | return -ENOMEM; | ||
| 620 | p->op_this_node = -1; | ||
| 621 | |||
| 622 | mutex_lock(&ocfs2_control_lock); | ||
| 623 | file->private_data = p; | ||
| 624 | list_add(&p->op_list, &ocfs2_control_private_list); | ||
| 625 | mutex_unlock(&ocfs2_control_lock); | ||
| 626 | |||
| 627 | return 0; | ||
| 628 | } | ||
| 629 | |||
/* File operations for the ocfs2_control misc character device. */
static const struct file_operations ocfs2_control_fops = {
	.open    = ocfs2_control_open,
	.release = ocfs2_control_release,
	.read    = ocfs2_control_read,
	.write   = ocfs2_control_write,
	.owner   = THIS_MODULE,
};
| 637 | |||
/* The /dev/ocfs2_control device the userspace cluster daemon talks to. */
struct miscdevice ocfs2_control_device = {
	.minor = MISC_DYNAMIC_MINOR,
	.name  = "ocfs2_control",
	.fops  = &ocfs2_control_fops,
};
| 643 | |||
| 644 | static int ocfs2_control_init(void) | ||
| 645 | { | ||
| 646 | int rc; | ||
| 647 | |||
| 648 | atomic_set(&ocfs2_control_opened, 0); | ||
| 649 | |||
| 650 | rc = misc_register(&ocfs2_control_device); | ||
| 651 | if (rc) | ||
| 652 | printk(KERN_ERR | ||
| 653 | "ocfs2: Unable to register ocfs2_control device " | ||
| 654 | "(errno %d)\n", | ||
| 655 | -rc); | ||
| 656 | |||
| 657 | return rc; | ||
| 658 | } | ||
| 659 | |||
| 660 | static void ocfs2_control_exit(void) | ||
| 661 | { | ||
| 662 | int rc; | ||
| 663 | |||
| 664 | rc = misc_deregister(&ocfs2_control_device); | ||
| 665 | if (rc) | ||
| 666 | printk(KERN_ERR | ||
| 667 | "ocfs2: Unable to deregister ocfs2_control device " | ||
| 668 | "(errno %d)\n", | ||
| 669 | -rc); | ||
| 670 | } | ||
| 671 | |||
| 672 | static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg) | ||
| 673 | { | ||
| 674 | struct ocfs2_lock_res *res = astarg; | ||
| 675 | return &res->l_lksb.lksb_fsdlm; | ||
| 676 | } | ||
| 677 | |||
| 678 | static void fsdlm_lock_ast_wrapper(void *astarg) | ||
| 679 | { | ||
| 680 | struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); | ||
| 681 | int status = lksb->sb_status; | ||
| 682 | |||
| 683 | BUG_ON(user_stack.sp_proto == NULL); | ||
| 684 | |||
| 685 | /* | ||
| 686 | * For now we're punting on the issue of other non-standard errors | ||
| 687 | * where we can't tell if the unlock_ast or lock_ast should be called. | ||
| 688 | * The main "other error" that's possible is EINVAL which means the | ||
| 689 | * function was called with invalid args, which shouldn't be possible | ||
| 690 | * since the caller here is under our control. Other non-standard | ||
| 691 | * errors probably fall into the same category, or otherwise are fatal | ||
| 692 | * which means we can't carry on anyway. | ||
| 693 | */ | ||
| 694 | |||
| 695 | if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) | ||
| 696 | user_stack.sp_proto->lp_unlock_ast(astarg, 0); | ||
| 697 | else | ||
| 698 | user_stack.sp_proto->lp_lock_ast(astarg); | ||
| 699 | } | ||
| 700 | |||
| 701 | static void fsdlm_blocking_ast_wrapper(void *astarg, int level) | ||
| 702 | { | ||
| 703 | BUG_ON(user_stack.sp_proto == NULL); | ||
| 704 | |||
| 705 | user_stack.sp_proto->lp_blocking_ast(astarg, level); | ||
| 706 | } | ||
| 707 | |||
| 708 | static int user_dlm_lock(struct ocfs2_cluster_connection *conn, | ||
| 709 | int mode, | ||
| 710 | union ocfs2_dlm_lksb *lksb, | ||
| 711 | u32 flags, | ||
| 712 | void *name, | ||
| 713 | unsigned int namelen, | ||
| 714 | void *astarg) | ||
| 715 | { | ||
| 716 | int ret; | ||
| 717 | |||
| 718 | if (!lksb->lksb_fsdlm.sb_lvbptr) | ||
| 719 | lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + | ||
| 720 | sizeof(struct dlm_lksb); | ||
| 721 | |||
| 722 | ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, | ||
| 723 | flags|DLM_LKF_NODLCKWT, name, namelen, 0, | ||
| 724 | fsdlm_lock_ast_wrapper, astarg, | ||
| 725 | fsdlm_blocking_ast_wrapper); | ||
| 726 | return ret; | ||
| 727 | } | ||
| 728 | |||
| 729 | static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, | ||
| 730 | union ocfs2_dlm_lksb *lksb, | ||
| 731 | u32 flags, | ||
| 732 | void *astarg) | ||
| 733 | { | ||
| 734 | int ret; | ||
| 735 | |||
| 736 | ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, | ||
| 737 | flags, &lksb->lksb_fsdlm, astarg); | ||
| 738 | return ret; | ||
| 739 | } | ||
| 740 | |||
/* Return the fs/dlm status of the last operation on this lksb. */
static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
{
	return lksb->lksb_fsdlm.sb_status;
}
| 745 | |||
| 746 | static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) | ||
| 747 | { | ||
| 748 | return (void *)(lksb->lksb_fsdlm.sb_lvbptr); | ||
| 749 | } | ||
| 750 | |||
/* Intentionally a no-op: the user stack has nothing extra to dump. */
static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
{
}
| 754 | |||
| 755 | /* | ||
| 756 | * Compare a requested locking protocol version against the current one. | ||
| 757 | * | ||
| 758 | * If the major numbers are different, they are incompatible. | ||
| 759 | * If the current minor is greater than the request, they are incompatible. | ||
| 760 | * If the current minor is less than or equal to the request, they are | ||
| 761 | * compatible, and the requester should run at the current minor version. | ||
| 762 | */ | ||
| 763 | static int fs_protocol_compare(struct ocfs2_protocol_version *existing, | ||
| 764 | struct ocfs2_protocol_version *request) | ||
| 765 | { | ||
| 766 | if (existing->pv_major != request->pv_major) | ||
| 767 | return 1; | ||
| 768 | |||
| 769 | if (existing->pv_minor > request->pv_minor) | ||
| 770 | return 1; | ||
| 771 | |||
| 772 | if (existing->pv_minor < request->pv_minor) | ||
| 773 | request->pv_minor = existing->pv_minor; | ||
| 774 | |||
| 775 | return 0; | ||
| 776 | } | ||
| 777 | |||
/*
 * Connect a filesystem mount to the userspace cluster stack: register the
 * connection with the control device, validate the locking protocol the
 * daemon negotiated, then join the fs/dlm lockspace for this mount.
 * On any failure after registration, the live connection is dropped again.
 */
static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
{
	dlm_lockspace_t *fsdlm;
	struct ocfs2_live_connection *control;
	int rc = 0;

	BUG_ON(conn == NULL);

	/* Registers the connection so the control daemon can see it. */
	rc = ocfs2_live_connection_new(conn, &control);
	if (rc)
		goto out;

	/*
	 * running_proto must have been set before we allowed any mounts
	 * to proceed.
	 */
	if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
		printk(KERN_ERR
		       "Unable to mount with fs locking protocol version "
		       "%u.%u because the userspace control daemon has "
		       "negotiated %u.%u\n",
		       conn->cc_version.pv_major, conn->cc_version.pv_minor,
		       running_proto.pv_major, running_proto.pv_minor);
		rc = -EPROTO;
		ocfs2_live_connection_drop(control);
		goto out;
	}

	/* Join the DLM lockspace named by cc_name. */
	rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
			       &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
	if (rc) {
		ocfs2_live_connection_drop(control);
		goto out;
	}

	conn->cc_private = control;
	conn->cc_lockspace = fsdlm;
out:
	return rc;
}
| 818 | |||
| 819 | static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn, | ||
| 820 | int hangup_pending) | ||
| 821 | { | ||
| 822 | dlm_release_lockspace(conn->cc_lockspace, 2); | ||
| 823 | conn->cc_lockspace = NULL; | ||
| 824 | ocfs2_live_connection_drop(conn->cc_private); | ||
| 825 | conn->cc_private = NULL; | ||
| 826 | return 0; | ||
| 827 | } | ||
| 828 | |||
/* Fetch our node number from the control device; negative rc on error. */
static int user_cluster_this_node(unsigned int *this_node)
{
	int node = ocfs2_control_get_this_node();

	if (node < 0)
		return node;

	*this_node = node;
	return 0;
}
| 840 | |||
/* Operations vector the stack glue calls into for the "user" stack. */
static struct ocfs2_stack_operations user_stack_ops = {
	.connect	= user_cluster_connect,
	.disconnect	= user_cluster_disconnect,
	.this_node	= user_cluster_this_node,
	.dlm_lock	= user_dlm_lock,
	.dlm_unlock	= user_dlm_unlock,
	.lock_status	= user_dlm_lock_status,
	.lock_lvb	= user_dlm_lvb,
	.dump_lksb	= user_dlm_dump_lksb,
};
| 851 | |||
/* Plugin descriptor registered with the ocfs2 stack glue. */
static struct ocfs2_stack_plugin user_stack = {
	.sp_name	= "user",
	.sp_ops		= &user_stack_ops,
	.sp_owner	= THIS_MODULE,
};
| 857 | |||
| 858 | |||
| 859 | static int __init user_stack_init(void) | ||
| 860 | { | ||
| 861 | int rc; | ||
| 862 | |||
| 863 | rc = ocfs2_control_init(); | ||
| 864 | if (!rc) { | ||
| 865 | rc = ocfs2_stack_glue_register(&user_stack); | ||
| 866 | if (rc) | ||
| 867 | ocfs2_control_exit(); | ||
| 868 | } | ||
| 869 | |||
| 870 | return rc; | ||
| 871 | } | ||
| 872 | |||
| 873 | static void __exit user_stack_exit(void) | ||
| 874 | { | ||
| 875 | ocfs2_stack_glue_unregister(&user_stack); | ||
| 876 | ocfs2_control_exit(); | ||
| 877 | } | ||
| 878 | |||
/* Module metadata and entry points for the "user" cluster stack plugin. */
MODULE_AUTHOR("Oracle");
MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
MODULE_LICENSE("GPL");
module_init(user_stack_init);
module_exit(user_stack_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c new file mode 100644 index 000000000000..119f60cea9cc --- /dev/null +++ b/fs/ocfs2/stackglue.c | |||
| @@ -0,0 +1,568 @@ | |||
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
| 3 | * | ||
| 4 | * stackglue.c | ||
| 5 | * | ||
| 6 | * Code which implements an OCFS2 specific interface to underlying | ||
| 7 | * cluster stacks. | ||
| 8 | * | ||
| 9 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
| 10 | * | ||
| 11 | * This program is free software; you can redistribute it and/or | ||
| 12 | * modify it under the terms of the GNU General Public | ||
| 13 | * License as published by the Free Software Foundation, version 2. | ||
| 14 | * | ||
| 15 | * This program is distributed in the hope that it will be useful, | ||
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 18 | * General Public License for more details. | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include <linux/list.h> | ||
| 22 | #include <linux/spinlock.h> | ||
| 23 | #include <linux/module.h> | ||
| 24 | #include <linux/slab.h> | ||
| 25 | #include <linux/kmod.h> | ||
| 26 | #include <linux/fs.h> | ||
| 27 | #include <linux/kobject.h> | ||
| 28 | #include <linux/sysfs.h> | ||
| 29 | |||
| 30 | #include "ocfs2_fs.h" | ||
| 31 | |||
| 32 | #include "stackglue.h" | ||
| 33 | |||
#define OCFS2_STACK_PLUGIN_O2CB "o2cb"
#define OCFS2_STACK_PLUGIN_USER "user"

/* Locking protocol registered by the filesystem; handed to all plugins. */
static struct ocfs2_locking_protocol *lproto;
/* Protects the plugin list, active_stack, lproto, and cluster_stack_name. */
static DEFINE_SPINLOCK(ocfs2_stack_lock);
/* All currently registered cluster stack plugins. */
static LIST_HEAD(ocfs2_stack_list);
/* Stack selected via sysfs; initialized to "o2cb" at module init. */
static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];

/*
 * The stack currently in use.  If not null, active_stack->sp_count > 0,
 * the module is pinned, and the locking protocol cannot be changed.
 */
static struct ocfs2_stack_plugin *active_stack;
| 47 | |||
| 48 | static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) | ||
| 49 | { | ||
| 50 | struct ocfs2_stack_plugin *p; | ||
| 51 | |||
| 52 | assert_spin_locked(&ocfs2_stack_lock); | ||
| 53 | |||
| 54 | list_for_each_entry(p, &ocfs2_stack_list, sp_list) { | ||
| 55 | if (!strcmp(p->sp_name, name)) | ||
| 56 | return p; | ||
| 57 | } | ||
| 58 | |||
| 59 | return NULL; | ||
| 60 | } | ||
| 61 | |||
| 62 | static int ocfs2_stack_driver_request(const char *stack_name, | ||
| 63 | const char *plugin_name) | ||
| 64 | { | ||
| 65 | int rc; | ||
| 66 | struct ocfs2_stack_plugin *p; | ||
| 67 | |||
| 68 | spin_lock(&ocfs2_stack_lock); | ||
| 69 | |||
| 70 | /* | ||
| 71 | * If the stack passed by the filesystem isn't the selected one, | ||
| 72 | * we can't continue. | ||
| 73 | */ | ||
| 74 | if (strcmp(stack_name, cluster_stack_name)) { | ||
| 75 | rc = -EBUSY; | ||
| 76 | goto out; | ||
| 77 | } | ||
| 78 | |||
| 79 | if (active_stack) { | ||
| 80 | /* | ||
| 81 | * If the active stack isn't the one we want, it cannot | ||
| 82 | * be selected right now. | ||
| 83 | */ | ||
| 84 | if (!strcmp(active_stack->sp_name, plugin_name)) | ||
| 85 | rc = 0; | ||
| 86 | else | ||
| 87 | rc = -EBUSY; | ||
| 88 | goto out; | ||
| 89 | } | ||
| 90 | |||
| 91 | p = ocfs2_stack_lookup(plugin_name); | ||
| 92 | if (!p || !try_module_get(p->sp_owner)) { | ||
| 93 | rc = -ENOENT; | ||
| 94 | goto out; | ||
| 95 | } | ||
| 96 | |||
| 97 | /* Ok, the stack is pinned */ | ||
| 98 | p->sp_count++; | ||
| 99 | active_stack = p; | ||
| 100 | |||
| 101 | rc = 0; | ||
| 102 | |||
| 103 | out: | ||
| 104 | spin_unlock(&ocfs2_stack_lock); | ||
| 105 | return rc; | ||
| 106 | } | ||
| 107 | |||
| 108 | /* | ||
| 109 | * This function looks up the appropriate stack and makes it active. If | ||
| 110 | * there is no stack, it tries to load it. It will fail if the stack still | ||
| 111 | * cannot be found. It will also fail if a different stack is in use. | ||
| 112 | */ | ||
| 113 | static int ocfs2_stack_driver_get(const char *stack_name) | ||
| 114 | { | ||
| 115 | int rc; | ||
| 116 | char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; | ||
| 117 | |||
| 118 | /* | ||
| 119 | * Classic stack does not pass in a stack name. This is | ||
| 120 | * compatible with older tools as well. | ||
| 121 | */ | ||
| 122 | if (!stack_name || !*stack_name) | ||
| 123 | stack_name = OCFS2_STACK_PLUGIN_O2CB; | ||
| 124 | |||
| 125 | if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { | ||
| 126 | printk(KERN_ERR | ||
| 127 | "ocfs2 passed an invalid cluster stack label: \"%s\"\n", | ||
| 128 | stack_name); | ||
| 129 | return -EINVAL; | ||
| 130 | } | ||
| 131 | |||
| 132 | /* Anything that isn't the classic stack is a user stack */ | ||
| 133 | if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) | ||
| 134 | plugin_name = OCFS2_STACK_PLUGIN_USER; | ||
| 135 | |||
| 136 | rc = ocfs2_stack_driver_request(stack_name, plugin_name); | ||
| 137 | if (rc == -ENOENT) { | ||
| 138 | request_module("ocfs2_stack_%s", plugin_name); | ||
| 139 | rc = ocfs2_stack_driver_request(stack_name, plugin_name); | ||
| 140 | } | ||
| 141 | |||
| 142 | if (rc == -ENOENT) { | ||
| 143 | printk(KERN_ERR | ||
| 144 | "ocfs2: Cluster stack driver \"%s\" cannot be found\n", | ||
| 145 | plugin_name); | ||
| 146 | } else if (rc == -EBUSY) { | ||
| 147 | printk(KERN_ERR | ||
| 148 | "ocfs2: A different cluster stack is in use\n"); | ||
| 149 | } | ||
| 150 | |||
| 151 | return rc; | ||
| 152 | } | ||
| 153 | |||
| 154 | static void ocfs2_stack_driver_put(void) | ||
| 155 | { | ||
| 156 | spin_lock(&ocfs2_stack_lock); | ||
| 157 | BUG_ON(active_stack == NULL); | ||
| 158 | BUG_ON(active_stack->sp_count == 0); | ||
| 159 | |||
| 160 | active_stack->sp_count--; | ||
| 161 | if (!active_stack->sp_count) { | ||
| 162 | module_put(active_stack->sp_owner); | ||
| 163 | active_stack = NULL; | ||
| 164 | } | ||
| 165 | spin_unlock(&ocfs2_stack_lock); | ||
| 166 | } | ||
| 167 | |||
| 168 | int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) | ||
| 169 | { | ||
| 170 | int rc; | ||
| 171 | |||
| 172 | spin_lock(&ocfs2_stack_lock); | ||
| 173 | if (!ocfs2_stack_lookup(plugin->sp_name)) { | ||
| 174 | plugin->sp_count = 0; | ||
| 175 | plugin->sp_proto = lproto; | ||
| 176 | list_add(&plugin->sp_list, &ocfs2_stack_list); | ||
| 177 | printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", | ||
| 178 | plugin->sp_name); | ||
| 179 | rc = 0; | ||
| 180 | } else { | ||
| 181 | printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", | ||
| 182 | plugin->sp_name); | ||
| 183 | rc = -EEXIST; | ||
| 184 | } | ||
| 185 | spin_unlock(&ocfs2_stack_lock); | ||
| 186 | |||
| 187 | return rc; | ||
| 188 | } | ||
| 189 | EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); | ||
| 190 | |||
| 191 | void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) | ||
| 192 | { | ||
| 193 | struct ocfs2_stack_plugin *p; | ||
| 194 | |||
| 195 | spin_lock(&ocfs2_stack_lock); | ||
| 196 | p = ocfs2_stack_lookup(plugin->sp_name); | ||
| 197 | if (p) { | ||
| 198 | BUG_ON(p != plugin); | ||
| 199 | BUG_ON(plugin == active_stack); | ||
| 200 | BUG_ON(plugin->sp_count != 0); | ||
| 201 | list_del_init(&plugin->sp_list); | ||
| 202 | printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", | ||
| 203 | plugin->sp_name); | ||
| 204 | } else { | ||
| 205 | printk(KERN_ERR "Stack \"%s\" is not registered\n", | ||
| 206 | plugin->sp_name); | ||
| 207 | } | ||
| 208 | spin_unlock(&ocfs2_stack_lock); | ||
| 209 | } | ||
| 210 | EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); | ||
| 211 | |||
| 212 | void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) | ||
| 213 | { | ||
| 214 | struct ocfs2_stack_plugin *p; | ||
| 215 | |||
| 216 | BUG_ON(proto == NULL); | ||
| 217 | |||
| 218 | spin_lock(&ocfs2_stack_lock); | ||
| 219 | BUG_ON(active_stack != NULL); | ||
| 220 | |||
| 221 | lproto = proto; | ||
| 222 | list_for_each_entry(p, &ocfs2_stack_list, sp_list) { | ||
| 223 | p->sp_proto = lproto; | ||
| 224 | } | ||
| 225 | |||
| 226 | spin_unlock(&ocfs2_stack_lock); | ||
| 227 | } | ||
| 228 | EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); | ||
| 229 | |||
| 230 | |||
/*
 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
 * underlying stack plugins need to pilfer the lksb off of the lock_res.
 * If some other structure needs to be passed as an astarg, the plugins
 * will need to be given a different avenue to the lksb.
 */
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
		   int mode,
		   union ocfs2_dlm_lksb *lksb,
		   u32 flags,
		   void *name,
		   unsigned int namelen,
		   struct ocfs2_lock_res *astarg)
{
	/* A locking protocol must be registered before taking any locks. */
	BUG_ON(lproto == NULL);

	return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
					      name, namelen, astarg);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
| 252 | |||
/* Release a lock through the active stack; see ocfs2_dlm_lock() above. */
int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
		     union ocfs2_dlm_lksb *lksb,
		     u32 flags,
		     struct ocfs2_lock_res *astarg)
{
	/* A locking protocol must be registered before unlocking. */
	BUG_ON(lproto == NULL);

	return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
| 263 | |||
/* Return the stack-specific status of the last operation on this lksb. */
int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
{
	return active_stack->sp_ops->lock_status(lksb);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
| 269 | |||
/*
 * Why don't we cast to ocfs2_meta_lvb?  The "clean" answer is that we
 * don't cast at the glue level.  The real answer is that the header
 * ordering is nigh impossible.
 */
void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
{
	return active_stack->sp_ops->lock_lvb(lksb);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
| 280 | |||
/* Ask the active stack to log whatever it knows about this lksb. */
void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
{
	active_stack->sp_ops->dump_lksb(lksb);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
| 286 | |||
/*
 * Create a cluster connection for the group (mount) named by 'group',
 * pinning the configured stack driver and asking it to connect.  On
 * success *conn points at the new connection; on failure everything
 * allocated or pinned here is released again.
 */
int ocfs2_cluster_connect(const char *stack_name,
			  const char *group,
			  int grouplen,
			  void (*recovery_handler)(int node_num,
						   void *recovery_data),
			  void *recovery_data,
			  struct ocfs2_cluster_connection **conn)
{
	int rc = 0;
	struct ocfs2_cluster_connection *new_conn;

	BUG_ON(group == NULL);
	BUG_ON(conn == NULL);
	BUG_ON(recovery_handler == NULL);

	/* cc_name can hold at most GROUP_NAME_MAX characters. */
	if (grouplen > GROUP_NAME_MAX) {
		rc = -EINVAL;
		goto out;
	}

	new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
			   GFP_KERNEL);
	if (!new_conn) {
		rc = -ENOMEM;
		goto out;
	}

	memcpy(new_conn->cc_name, group, grouplen);
	new_conn->cc_namelen = grouplen;
	new_conn->cc_recovery_handler = recovery_handler;
	new_conn->cc_recovery_data = recovery_data;

	/* Start the new connection at our maximum compatibility level */
	new_conn->cc_version = lproto->lp_max_version;

	/* This will pin the stack driver if successful */
	rc = ocfs2_stack_driver_get(stack_name);
	if (rc)
		goto out_free;

	rc = active_stack->sp_ops->connect(new_conn);
	if (rc) {
		/* Undo the pin taken by ocfs2_stack_driver_get(). */
		ocfs2_stack_driver_put();
		goto out_free;
	}

	*conn = new_conn;

out_free:
	if (rc)
		kfree(new_conn);

out:
	return rc;
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
| 343 | |||
| 344 | /* If hangup_pending is 0, the stack driver will be dropped */ | ||
| 345 | int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, | ||
| 346 | int hangup_pending) | ||
| 347 | { | ||
| 348 | int ret; | ||
| 349 | |||
| 350 | BUG_ON(conn == NULL); | ||
| 351 | |||
| 352 | ret = active_stack->sp_ops->disconnect(conn, hangup_pending); | ||
| 353 | |||
| 354 | /* XXX Should we free it anyway? */ | ||
| 355 | if (!ret) { | ||
| 356 | kfree(conn); | ||
| 357 | if (!hangup_pending) | ||
| 358 | ocfs2_stack_driver_put(); | ||
| 359 | } | ||
| 360 | |||
| 361 | return ret; | ||
| 362 | } | ||
| 363 | EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); | ||
| 364 | |||
| 365 | void ocfs2_cluster_hangup(const char *group, int grouplen) | ||
| 366 | { | ||
| 367 | BUG_ON(group == NULL); | ||
| 368 | BUG_ON(group[grouplen] != '\0'); | ||
| 369 | |||
| 370 | if (active_stack->sp_ops->hangup) | ||
| 371 | active_stack->sp_ops->hangup(group, grouplen); | ||
| 372 | |||
| 373 | /* cluster_disconnect() was called with hangup_pending==1 */ | ||
| 374 | ocfs2_stack_driver_put(); | ||
| 375 | } | ||
| 376 | EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); | ||
| 377 | |||
/* Ask the active stack for our node number; fills *node on success. */
int ocfs2_cluster_this_node(unsigned int *node)
{
	return active_stack->sp_ops->this_node(node);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
| 383 | |||
| 384 | |||
| 385 | /* | ||
| 386 | * Sysfs bits | ||
| 387 | */ | ||
| 388 | |||
| 389 | static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, | ||
| 390 | struct kobj_attribute *attr, | ||
| 391 | char *buf) | ||
| 392 | { | ||
| 393 | ssize_t ret = 0; | ||
| 394 | |||
| 395 | spin_lock(&ocfs2_stack_lock); | ||
| 396 | if (lproto) | ||
| 397 | ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", | ||
| 398 | lproto->lp_max_version.pv_major, | ||
| 399 | lproto->lp_max_version.pv_minor); | ||
| 400 | spin_unlock(&ocfs2_stack_lock); | ||
| 401 | |||
| 402 | return ret; | ||
| 403 | } | ||
| 404 | |||
| 405 | static struct kobj_attribute ocfs2_attr_max_locking_protocol = | ||
| 406 | __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, | ||
| 407 | ocfs2_max_locking_protocol_show, NULL); | ||
| 408 | |||
| 409 | static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, | ||
| 410 | struct kobj_attribute *attr, | ||
| 411 | char *buf) | ||
| 412 | { | ||
| 413 | ssize_t ret = 0, total = 0, remain = PAGE_SIZE; | ||
| 414 | struct ocfs2_stack_plugin *p; | ||
| 415 | |||
| 416 | spin_lock(&ocfs2_stack_lock); | ||
| 417 | list_for_each_entry(p, &ocfs2_stack_list, sp_list) { | ||
| 418 | ret = snprintf(buf, remain, "%s\n", | ||
| 419 | p->sp_name); | ||
| 420 | if (ret < 0) { | ||
| 421 | total = ret; | ||
| 422 | break; | ||
| 423 | } | ||
| 424 | if (ret == remain) { | ||
| 425 | /* snprintf() didn't fit */ | ||
| 426 | total = -E2BIG; | ||
| 427 | break; | ||
| 428 | } | ||
| 429 | total += ret; | ||
| 430 | remain -= ret; | ||
| 431 | } | ||
| 432 | spin_unlock(&ocfs2_stack_lock); | ||
| 433 | |||
| 434 | return total; | ||
| 435 | } | ||
| 436 | |||
| 437 | static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = | ||
| 438 | __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, | ||
| 439 | ocfs2_loaded_cluster_plugins_show, NULL); | ||
| 440 | |||
| 441 | static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, | ||
| 442 | struct kobj_attribute *attr, | ||
| 443 | char *buf) | ||
| 444 | { | ||
| 445 | ssize_t ret = 0; | ||
| 446 | |||
| 447 | spin_lock(&ocfs2_stack_lock); | ||
| 448 | if (active_stack) { | ||
| 449 | ret = snprintf(buf, PAGE_SIZE, "%s\n", | ||
| 450 | active_stack->sp_name); | ||
| 451 | if (ret == PAGE_SIZE) | ||
| 452 | ret = -E2BIG; | ||
| 453 | } | ||
| 454 | spin_unlock(&ocfs2_stack_lock); | ||
| 455 | |||
| 456 | return ret; | ||
| 457 | } | ||
| 458 | |||
| 459 | static struct kobj_attribute ocfs2_attr_active_cluster_plugin = | ||
| 460 | __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, | ||
| 461 | ocfs2_active_cluster_plugin_show, NULL); | ||
| 462 | |||
| 463 | static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, | ||
| 464 | struct kobj_attribute *attr, | ||
| 465 | char *buf) | ||
| 466 | { | ||
| 467 | ssize_t ret; | ||
| 468 | spin_lock(&ocfs2_stack_lock); | ||
| 469 | ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); | ||
| 470 | spin_unlock(&ocfs2_stack_lock); | ||
| 471 | |||
| 472 | return ret; | ||
| 473 | } | ||
| 474 | |||
/*
 * sysfs: select the cluster stack by writing its exact 4-character label.
 * Once a stack is active the name may only be re-written unchanged.
 */
static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	size_t len = count;
	ssize_t ret;

	if (len == 0)
		return len;

	/* Tolerate a trailing newline (e.g. from "echo"). */
	if (buf[len - 1] == '\n')
		len--;

	/* Labels are exactly OCFS2_STACK_LABEL_LEN chars, no embedded NULs. */
	if ((len != OCFS2_STACK_LABEL_LEN) ||
	    (strnlen(buf, len) != len))
		return -EINVAL;

	spin_lock(&ocfs2_stack_lock);
	if (active_stack) {
		/* A pinned stack cannot be changed, only confirmed. */
		if (!strncmp(buf, cluster_stack_name, len))
			ret = count;
		else
			ret = -EBUSY;
	} else {
		/*
		 * NOTE(review): no NUL is written here.  This is safe only
		 * because len always equals OCFS2_STACK_LABEL_LEN, so the
		 * terminator placed at init time is never overwritten.
		 */
		memcpy(cluster_stack_name, buf, len);
		ret = count;
	}
	spin_unlock(&ocfs2_stack_lock);

	return ret;
}


static struct kobj_attribute ocfs2_attr_cluster_stack =
	__ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
	       ocfs2_cluster_stack_show,
	       ocfs2_cluster_stack_store);
| 512 | |||
/* All attributes published by the stack glue under the ocfs2 kset. */
static struct attribute *ocfs2_attrs[] = {
	&ocfs2_attr_max_locking_protocol.attr,
	&ocfs2_attr_loaded_cluster_plugins.attr,
	&ocfs2_attr_active_cluster_plugin.attr,
	&ocfs2_attr_cluster_stack.attr,
	NULL,
};

static struct attribute_group ocfs2_attr_group = {
	.attrs = ocfs2_attrs,
};
| 524 | |||
/* Parent kset for the glue's sysfs files, created under fs_kobj. */
static struct kset *ocfs2_kset;

static void ocfs2_sysfs_exit(void)
{
	/* Unregistering the kset removes the attribute files with it. */
	kset_unregister(ocfs2_kset);
}
| 531 | |||
| 532 | static int ocfs2_sysfs_init(void) | ||
| 533 | { | ||
| 534 | int ret; | ||
| 535 | |||
| 536 | ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); | ||
| 537 | if (!ocfs2_kset) | ||
| 538 | return -ENOMEM; | ||
| 539 | |||
| 540 | ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); | ||
| 541 | if (ret) | ||
| 542 | goto error; | ||
| 543 | |||
| 544 | return 0; | ||
| 545 | |||
| 546 | error: | ||
| 547 | kset_unregister(ocfs2_kset); | ||
| 548 | return ret; | ||
| 549 | } | ||
| 550 | |||
| 551 | static int __init ocfs2_stack_glue_init(void) | ||
| 552 | { | ||
| 553 | strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); | ||
| 554 | |||
| 555 | return ocfs2_sysfs_init(); | ||
| 556 | } | ||
| 557 | |||
| 558 | static void __exit ocfs2_stack_glue_exit(void) | ||
| 559 | { | ||
| 560 | lproto = NULL; | ||
| 561 | ocfs2_sysfs_exit(); | ||
| 562 | } | ||
| 563 | |||
| 564 | MODULE_AUTHOR("Oracle"); | ||
| 565 | MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); | ||
| 566 | MODULE_LICENSE("GPL"); | ||
| 567 | module_init(ocfs2_stack_glue_init); | ||
| 568 | module_exit(ocfs2_stack_glue_exit); | ||
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h new file mode 100644 index 000000000000..005e4f170e0f --- /dev/null +++ b/fs/ocfs2/stackglue.h | |||
| @@ -0,0 +1,261 @@ | |||
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
| 3 | * | ||
| 4 | * stackglue.h | ||
| 5 | * | ||
| 6 | * Glue to the underlying cluster stack. | ||
| 7 | * | ||
| 8 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or | ||
| 11 | * modify it under the terms of the GNU General Public | ||
| 12 | * License as published by the Free Software Foundation, version 2. | ||
| 13 | * | ||
| 14 | * This program is distributed in the hope that it will be useful, | ||
| 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 17 | * General Public License for more details. | ||
| 18 | */ | ||
| 19 | |||
| 20 | |||
| 21 | #ifndef STACKGLUE_H | ||
| 22 | #define STACKGLUE_H | ||
| 23 | |||
| 24 | #include <linux/types.h> | ||
| 25 | #include <linux/list.h> | ||
| 26 | #include <linux/dlmconstants.h> | ||
| 27 | |||
| 28 | #include "dlm/dlmapi.h" | ||
| 29 | #include <linux/dlm.h> | ||
| 30 | |||
| 31 | /* | ||
| 32 | * dlmconstants.h does not have a LOCAL flag. We hope to remove it | ||
| 33 | * some day, but right now we need it. Let's fake it. This value is larger | ||
| 34 | * than any flag in dlmconstants.h. | ||
| 35 | */ | ||
| 36 | #define DLM_LKF_LOCAL 0x00100000 | ||
| 37 | |||
| 38 | /* | ||
| 39 | * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably | ||
| 40 | * wants to be in a public header. | ||
| 41 | */ | ||
| 42 | #define GROUP_NAME_MAX 64 | ||
| 43 | |||
| 44 | |||
| 45 | /* | ||
| 46 | * ocfs2_protocol_version changes when ocfs2 does something different in | ||
| 47 | * its inter-node behavior. See dlmglue.c for more information. | ||
| 48 | */ | ||
| 49 | struct ocfs2_protocol_version { | ||
| 50 | u8 pv_major; | ||
| 51 | u8 pv_minor; | ||
| 52 | }; | ||
| 53 | |||
| 54 | /* | ||
| 55 | * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf. | ||
| 56 | */ | ||
| 57 | struct ocfs2_locking_protocol { | ||
| 58 | struct ocfs2_protocol_version lp_max_version; | ||
| 59 | void (*lp_lock_ast)(void *astarg); | ||
| 60 | void (*lp_blocking_ast)(void *astarg, int level); | ||
| 61 | void (*lp_unlock_ast)(void *astarg, int error); | ||
| 62 | }; | ||
| 63 | |||
| 64 | |||
| 65 | /* | ||
| 66 | * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only | ||
| 67 | * has a pointer to separately allocated lvb space. This struct exists only to | ||
| 68 | * include in the lksb union to make space for a combined dlm_lksb and lvb. | ||
| 69 | */ | ||
| 70 | struct fsdlm_lksb_plus_lvb { | ||
| 71 | struct dlm_lksb lksb; | ||
| 72 | char lvb[DLM_LVB_LEN]; | ||
| 73 | }; | ||
| 74 | |||
| 75 | /* | ||
| 76 | * A union of all lock status structures. We define it here so that the | ||
| 77 | * size of the union is known. Lock status structures are embedded in | ||
| 78 | * ocfs2 inodes. | ||
| 79 | */ | ||
| 80 | union ocfs2_dlm_lksb { | ||
| 81 | struct dlm_lockstatus lksb_o2dlm; | ||
| 82 | struct dlm_lksb lksb_fsdlm; | ||
| 83 | struct fsdlm_lksb_plus_lvb padding; | ||
| 84 | }; | ||
| 85 | |||
| 86 | /* | ||
| 87 | * A cluster connection. Mostly opaque to ocfs2, the connection holds | ||
| 88 | * state for the underlying stack. ocfs2 does use cc_version to determine | ||
| 89 | * locking compatibility. | ||
| 90 | */ | ||
| 91 | struct ocfs2_cluster_connection { | ||
| 92 | char cc_name[GROUP_NAME_MAX]; | ||
| 93 | int cc_namelen; | ||
| 94 | struct ocfs2_protocol_version cc_version; | ||
| 95 | void (*cc_recovery_handler)(int node_num, void *recovery_data); | ||
| 96 | void *cc_recovery_data; | ||
| 97 | void *cc_lockspace; | ||
| 98 | void *cc_private; | ||
| 99 | }; | ||
| 100 | |||
| 101 | /* | ||
| 102 | * Each cluster stack implements the stack operations structure. Not used | ||
| 103 | * in the ocfs2 code, the stackglue code translates generic cluster calls | ||
| 104 | * into stack operations. | ||
| 105 | */ | ||
| 106 | struct ocfs2_stack_operations { | ||
| 107 | /* | ||
| 108 | * The fs code calls ocfs2_cluster_connect() to attach a new | ||
| 109 | * filesystem to the cluster stack. The ->connect() op is passed | ||
| 110 | * an ocfs2_cluster_connection with the name and recovery field | ||
| 111 | * filled in. | ||
| 112 | * | ||
| 113 | * The stack must set up any notification mechanisms and create | ||
| 114 | * the filesystem lockspace in the DLM. The lockspace should be | ||
| 115 | * stored on cc_lockspace. Any other information can be stored on | ||
| 116 | * cc_private. | ||
| 117 | * | ||
| 118 | * ->connect() must not return until it is guaranteed that | ||
| 119 | * | ||
| 120 | * - Node down notifications for the filesystem will be received | ||
| 121 | * and passed to conn->cc_recovery_handler(). | ||
| 122 | * - Locking requests for the filesystem will be processed. | ||
| 123 | */ | ||
| 124 | int (*connect)(struct ocfs2_cluster_connection *conn); | ||
| 125 | |||
| 126 | /* | ||
| 127 | * The fs code calls ocfs2_cluster_disconnect() when a filesystem | ||
| 128 | * no longer needs cluster services. All DLM locks have been | ||
| 129 | * dropped, and recovery notification is being ignored by the | ||
| 130 | * fs code. The stack must disengage from the DLM and discontinue | ||
| 131 | * recovery notification. | ||
| 132 | * | ||
| 133 | * Once ->disconnect() has returned, the connection structure will | ||
| 134 | * be freed. Thus, a stack must not return from ->disconnect() | ||
| 135 | * until it will no longer reference the conn pointer. | ||
| 136 | * | ||
| 137 | * If hangup_pending is zero, ocfs2_cluster_disconnect() will also | ||
| 138 | * be dropping the reference on the module. | ||
| 139 | */ | ||
| 140 | int (*disconnect)(struct ocfs2_cluster_connection *conn, | ||
| 141 | int hangup_pending); | ||
| 142 | |||
| 143 | /* | ||
| 144 | * ocfs2_cluster_hangup() exists for compatibility with older | ||
| 145 | * ocfs2 tools. Only the classic stack really needs it. As such | ||
| 146 | * ->hangup() is not required of all stacks. See the comment by | ||
| 147 | * ocfs2_cluster_hangup() for more details. | ||
| 148 | * | ||
| 149 | * Note that ocfs2_cluster_hangup() can only be called if | ||
| 150 | * hangup_pending was passed to ocfs2_cluster_disconnect(). | ||
| 151 | */ | ||
| 152 | void (*hangup)(const char *group, int grouplen); | ||
| 153 | |||
| 154 | /* | ||
| 155 | * ->this_node() returns the cluster's unique identifier for the | ||
| 156 | * local node. | ||
| 157 | */ | ||
| 158 | int (*this_node)(unsigned int *node); | ||
| 159 | |||
| 160 | /* | ||
| 161 | * Call the underlying dlm lock function. The ->dlm_lock() | ||
| 162 | * callback should convert the flags and mode as appropriate. | ||
| 163 | * | ||
| 164 | * ast and bast functions are not part of the call because the | ||
| 165 | * stack will likely want to wrap ast and bast calls before passing | ||
| 166 | * them to stack->sp_proto. | ||
| 167 | */ | ||
| 168 | int (*dlm_lock)(struct ocfs2_cluster_connection *conn, | ||
| 169 | int mode, | ||
| 170 | union ocfs2_dlm_lksb *lksb, | ||
| 171 | u32 flags, | ||
| 172 | void *name, | ||
| 173 | unsigned int namelen, | ||
| 174 | void *astarg); | ||
| 175 | |||
| 176 | /* | ||
| 177 | * Call the underlying dlm unlock function. The ->dlm_unlock() | ||
| 178 | * function should convert the flags as appropriate. | ||
| 179 | * | ||
| 180 | * The unlock ast is not passed, as the stack will want to wrap | ||
| 181 | * it before calling stack->sp_proto->lp_unlock_ast(). | ||
| 182 | */ | ||
| 183 | int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, | ||
| 184 | union ocfs2_dlm_lksb *lksb, | ||
| 185 | u32 flags, | ||
| 186 | void *astarg); | ||
| 187 | |||
| 188 | /* | ||
| 189 | * Return the status of the current lock status block. The fs | ||
| 190 | * code should never dereference the union. The ->lock_status() | ||
| 191 | * callback pulls out the stack-specific lksb, converts the status | ||
| 192 | * to a proper errno, and returns it. | ||
| 193 | */ | ||
| 194 | int (*lock_status)(union ocfs2_dlm_lksb *lksb); | ||
| 195 | |||
| 196 | /* | ||
| 197 | * Pull the lvb pointer off of the stack-specific lksb. | ||
| 198 | */ | ||
| 199 | void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); | ||
| 200 | |||
| 201 | /* | ||
| 202 | * This is an optional debugging hook. If provided, the | ||
| 203 | * stack can dump debugging information about this lock. | ||
| 204 | */ | ||
| 205 | void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); | ||
| 206 | }; | ||
| 207 | |||
| 208 | /* | ||
| 209 | * Each stack plugin must describe itself by registering a | ||
| 210 | * ocfs2_stack_plugin structure. This is only seen by stackglue and the | ||
| 211 | * stack driver. | ||
| 212 | */ | ||
| 213 | struct ocfs2_stack_plugin { | ||
| 214 | char *sp_name; | ||
| 215 | struct ocfs2_stack_operations *sp_ops; | ||
| 216 | struct module *sp_owner; | ||
| 217 | |||
| 218 | /* These are managed by the stackglue code. */ | ||
| 219 | struct list_head sp_list; | ||
| 220 | unsigned int sp_count; | ||
| 221 | struct ocfs2_locking_protocol *sp_proto; | ||
| 222 | }; | ||
| 223 | |||
| 224 | |||
| 225 | /* Used by the filesystem */ | ||
| 226 | int ocfs2_cluster_connect(const char *stack_name, | ||
| 227 | const char *group, | ||
| 228 | int grouplen, | ||
| 229 | void (*recovery_handler)(int node_num, | ||
| 230 | void *recovery_data), | ||
| 231 | void *recovery_data, | ||
| 232 | struct ocfs2_cluster_connection **conn); | ||
| 233 | int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, | ||
| 234 | int hangup_pending); | ||
| 235 | void ocfs2_cluster_hangup(const char *group, int grouplen); | ||
| 236 | int ocfs2_cluster_this_node(unsigned int *node); | ||
| 237 | |||
| 238 | struct ocfs2_lock_res; | ||
| 239 | int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, | ||
| 240 | int mode, | ||
| 241 | union ocfs2_dlm_lksb *lksb, | ||
| 242 | u32 flags, | ||
| 243 | void *name, | ||
| 244 | unsigned int namelen, | ||
| 245 | struct ocfs2_lock_res *astarg); | ||
| 246 | int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, | ||
| 247 | union ocfs2_dlm_lksb *lksb, | ||
| 248 | u32 flags, | ||
| 249 | struct ocfs2_lock_res *astarg); | ||
| 250 | |||
| 251 | int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); | ||
| 252 | void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); | ||
| 253 | void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); | ||
| 254 | |||
| 255 | void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); | ||
| 256 | |||
| 257 | |||
| 258 | /* Used by stack plugins */ | ||
| 259 | int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); | ||
| 260 | void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); | ||
| 261 | #endif /* STACKGLUE_H */ | ||
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 72c198a004df..d2d278fb9819 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
| @@ -46,6 +46,11 @@ | |||
| 46 | 46 | ||
| 47 | #include "buffer_head_io.h" | 47 | #include "buffer_head_io.h" |
| 48 | 48 | ||
| 49 | #define NOT_ALLOC_NEW_GROUP 0 | ||
| 50 | #define ALLOC_NEW_GROUP 1 | ||
| 51 | |||
| 52 | #define OCFS2_MAX_INODES_TO_STEAL 1024 | ||
| 53 | |||
| 49 | static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); | 54 | static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); |
| 50 | static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); | 55 | static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); |
| 51 | static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); | 56 | static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); |
| @@ -106,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, | |||
| 106 | u64 *bg_blkno, | 111 | u64 *bg_blkno, |
| 107 | u16 *bg_bit_off); | 112 | u16 *bg_bit_off); |
| 108 | 113 | ||
| 109 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) | 114 | static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) |
| 110 | { | 115 | { |
| 111 | struct inode *inode = ac->ac_inode; | 116 | struct inode *inode = ac->ac_inode; |
| 112 | 117 | ||
| @@ -117,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) | |||
| 117 | mutex_unlock(&inode->i_mutex); | 122 | mutex_unlock(&inode->i_mutex); |
| 118 | 123 | ||
| 119 | iput(inode); | 124 | iput(inode); |
| 125 | ac->ac_inode = NULL; | ||
| 120 | } | 126 | } |
| 121 | if (ac->ac_bh) | 127 | if (ac->ac_bh) { |
| 122 | brelse(ac->ac_bh); | 128 | brelse(ac->ac_bh); |
| 129 | ac->ac_bh = NULL; | ||
| 130 | } | ||
| 131 | } | ||
| 132 | |||
| 133 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) | ||
| 134 | { | ||
| 135 | ocfs2_free_ac_resource(ac); | ||
| 123 | kfree(ac); | 136 | kfree(ac); |
| 124 | } | 137 | } |
| 125 | 138 | ||
| @@ -391,7 +404,8 @@ bail: | |||
| 391 | static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | 404 | static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, |
| 392 | struct ocfs2_alloc_context *ac, | 405 | struct ocfs2_alloc_context *ac, |
| 393 | int type, | 406 | int type, |
| 394 | u32 slot) | 407 | u32 slot, |
| 408 | int alloc_new_group) | ||
| 395 | { | 409 | { |
| 396 | int status; | 410 | int status; |
| 397 | u32 bits_wanted = ac->ac_bits_wanted; | 411 | u32 bits_wanted = ac->ac_bits_wanted; |
| @@ -420,6 +434,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | |||
| 420 | } | 434 | } |
| 421 | 435 | ||
| 422 | ac->ac_inode = alloc_inode; | 436 | ac->ac_inode = alloc_inode; |
| 437 | ac->ac_alloc_slot = slot; | ||
| 423 | 438 | ||
| 424 | fe = (struct ocfs2_dinode *) bh->b_data; | 439 | fe = (struct ocfs2_dinode *) bh->b_data; |
| 425 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 440 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
| @@ -446,6 +461,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | |||
| 446 | goto bail; | 461 | goto bail; |
| 447 | } | 462 | } |
| 448 | 463 | ||
| 464 | if (alloc_new_group != ALLOC_NEW_GROUP) { | ||
| 465 | mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " | ||
| 466 | "and we don't alloc a new group for it.\n", | ||
| 467 | slot, bits_wanted, free_bits); | ||
| 468 | status = -ENOSPC; | ||
| 469 | goto bail; | ||
| 470 | } | ||
| 471 | |||
| 449 | status = ocfs2_block_group_alloc(osb, alloc_inode, bh); | 472 | status = ocfs2_block_group_alloc(osb, alloc_inode, bh); |
| 450 | if (status < 0) { | 473 | if (status < 0) { |
| 451 | if (status != -ENOSPC) | 474 | if (status != -ENOSPC) |
| @@ -490,7 +513,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, | |||
| 490 | (*ac)->ac_group_search = ocfs2_block_group_search; | 513 | (*ac)->ac_group_search = ocfs2_block_group_search; |
| 491 | 514 | ||
| 492 | status = ocfs2_reserve_suballoc_bits(osb, (*ac), | 515 | status = ocfs2_reserve_suballoc_bits(osb, (*ac), |
| 493 | EXTENT_ALLOC_SYSTEM_INODE, slot); | 516 | EXTENT_ALLOC_SYSTEM_INODE, |
| 517 | slot, ALLOC_NEW_GROUP); | ||
| 494 | if (status < 0) { | 518 | if (status < 0) { |
| 495 | if (status != -ENOSPC) | 519 | if (status != -ENOSPC) |
| 496 | mlog_errno(status); | 520 | mlog_errno(status); |
| @@ -508,10 +532,42 @@ bail: | |||
| 508 | return status; | 532 | return status; |
| 509 | } | 533 | } |
| 510 | 534 | ||
| 535 | static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, | ||
| 536 | struct ocfs2_alloc_context *ac) | ||
| 537 | { | ||
| 538 | int i, status = -ENOSPC; | ||
| 539 | s16 slot = ocfs2_get_inode_steal_slot(osb); | ||
| 540 | |||
| 541 | /* Start to steal inodes from the first slot after ours. */ | ||
| 542 | if (slot == OCFS2_INVALID_SLOT) | ||
| 543 | slot = osb->slot_num + 1; | ||
| 544 | |||
| 545 | for (i = 0; i < osb->max_slots; i++, slot++) { | ||
| 546 | if (slot == osb->max_slots) | ||
| 547 | slot = 0; | ||
| 548 | |||
| 549 | if (slot == osb->slot_num) | ||
| 550 | continue; | ||
| 551 | |||
| 552 | status = ocfs2_reserve_suballoc_bits(osb, ac, | ||
| 553 | INODE_ALLOC_SYSTEM_INODE, | ||
| 554 | slot, NOT_ALLOC_NEW_GROUP); | ||
| 555 | if (status >= 0) { | ||
| 556 | ocfs2_set_inode_steal_slot(osb, slot); | ||
| 557 | break; | ||
| 558 | } | ||
| 559 | |||
| 560 | ocfs2_free_ac_resource(ac); | ||
| 561 | } | ||
| 562 | |||
| 563 | return status; | ||
| 564 | } | ||
| 565 | |||
| 511 | int ocfs2_reserve_new_inode(struct ocfs2_super *osb, | 566 | int ocfs2_reserve_new_inode(struct ocfs2_super *osb, |
| 512 | struct ocfs2_alloc_context **ac) | 567 | struct ocfs2_alloc_context **ac) |
| 513 | { | 568 | { |
| 514 | int status; | 569 | int status; |
| 570 | s16 slot = ocfs2_get_inode_steal_slot(osb); | ||
| 515 | 571 | ||
| 516 | *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | 572 | *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); |
| 517 | if (!(*ac)) { | 573 | if (!(*ac)) { |
| @@ -525,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, | |||
| 525 | 581 | ||
| 526 | (*ac)->ac_group_search = ocfs2_block_group_search; | 582 | (*ac)->ac_group_search = ocfs2_block_group_search; |
| 527 | 583 | ||
| 584 | /* | ||
| 585 | * slot is set when we successfully steal inode from other nodes. | ||
| 586 | * It is reset in 3 places: | ||
| 587 | * 1. when we flush the truncate log | ||
| 588 | * 2. when we complete local alloc recovery. | ||
| 589 | * 3. when we successfully allocate from our own slot. | ||
| 590 | * After it is set, we will go on stealing inodes until we find the | ||
| 591 | * need to check our slots to see whether there is some space for us. | ||
| 592 | */ | ||
| 593 | if (slot != OCFS2_INVALID_SLOT && | ||
| 594 | atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) | ||
| 595 | goto inode_steal; | ||
| 596 | |||
| 597 | atomic_set(&osb->s_num_inodes_stolen, 0); | ||
| 528 | status = ocfs2_reserve_suballoc_bits(osb, *ac, | 598 | status = ocfs2_reserve_suballoc_bits(osb, *ac, |
| 529 | INODE_ALLOC_SYSTEM_INODE, | 599 | INODE_ALLOC_SYSTEM_INODE, |
| 530 | osb->slot_num); | 600 | osb->slot_num, ALLOC_NEW_GROUP); |
| 601 | if (status >= 0) { | ||
| 602 | status = 0; | ||
| 603 | |||
| 604 | /* | ||
| 605 | * Some inodes must be freed by us, so try to allocate | ||
| 606 | * from our own next time. | ||
| 607 | */ | ||
| 608 | if (slot != OCFS2_INVALID_SLOT) | ||
| 609 | ocfs2_init_inode_steal_slot(osb); | ||
| 610 | goto bail; | ||
| 611 | } else if (status < 0 && status != -ENOSPC) { | ||
| 612 | mlog_errno(status); | ||
| 613 | goto bail; | ||
| 614 | } | ||
| 615 | |||
| 616 | ocfs2_free_ac_resource(*ac); | ||
| 617 | |||
| 618 | inode_steal: | ||
| 619 | status = ocfs2_steal_inode_from_other_nodes(osb, *ac); | ||
| 620 | atomic_inc(&osb->s_num_inodes_stolen); | ||
| 531 | if (status < 0) { | 621 | if (status < 0) { |
| 532 | if (status != -ENOSPC) | 622 | if (status != -ENOSPC) |
| 533 | mlog_errno(status); | 623 | mlog_errno(status); |
| @@ -557,7 +647,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, | |||
| 557 | 647 | ||
| 558 | status = ocfs2_reserve_suballoc_bits(osb, ac, | 648 | status = ocfs2_reserve_suballoc_bits(osb, ac, |
| 559 | GLOBAL_BITMAP_SYSTEM_INODE, | 649 | GLOBAL_BITMAP_SYSTEM_INODE, |
| 560 | OCFS2_INVALID_SLOT); | 650 | OCFS2_INVALID_SLOT, |
| 651 | ALLOC_NEW_GROUP); | ||
| 561 | if (status < 0 && status != -ENOSPC) { | 652 | if (status < 0 && status != -ENOSPC) { |
| 562 | mlog_errno(status); | 653 | mlog_errno(status); |
| 563 | goto bail; | 654 | goto bail; |
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 8799033bb459..544c600662bd 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h | |||
| @@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *, | |||
| 36 | struct ocfs2_alloc_context { | 36 | struct ocfs2_alloc_context { |
| 37 | struct inode *ac_inode; /* which bitmap are we allocating from? */ | 37 | struct inode *ac_inode; /* which bitmap are we allocating from? */ |
| 38 | struct buffer_head *ac_bh; /* file entry bh */ | 38 | struct buffer_head *ac_bh; /* file entry bh */ |
| 39 | u32 ac_alloc_slot; /* which slot are we allocating from? */ | ||
| 39 | u32 ac_bits_wanted; | 40 | u32 ac_bits_wanted; |
| 40 | u32 ac_bits_given; | 41 | u32 ac_bits_given; |
| 41 | #define OCFS2_AC_USE_LOCAL 1 | 42 | #define OCFS2_AC_USE_LOCAL 1 |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index bec75aff3d9f..df63ba20ae90 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
| @@ -40,8 +40,7 @@ | |||
| 40 | #include <linux/crc32.h> | 40 | #include <linux/crc32.h> |
| 41 | #include <linux/debugfs.h> | 41 | #include <linux/debugfs.h> |
| 42 | #include <linux/mount.h> | 42 | #include <linux/mount.h> |
| 43 | 43 | #include <linux/seq_file.h> | |
| 44 | #include <cluster/nodemanager.h> | ||
| 45 | 44 | ||
| 46 | #define MLOG_MASK_PREFIX ML_SUPER | 45 | #define MLOG_MASK_PREFIX ML_SUPER |
| 47 | #include <cluster/masklog.h> | 46 | #include <cluster/masklog.h> |
| @@ -88,6 +87,7 @@ struct mount_options | |||
| 88 | unsigned int atime_quantum; | 87 | unsigned int atime_quantum; |
| 89 | signed short slot; | 88 | signed short slot; |
| 90 | unsigned int localalloc_opt; | 89 | unsigned int localalloc_opt; |
| 90 | char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; | ||
| 91 | }; | 91 | }; |
| 92 | 92 | ||
| 93 | static int ocfs2_parse_options(struct super_block *sb, char *options, | 93 | static int ocfs2_parse_options(struct super_block *sb, char *options, |
| @@ -109,7 +109,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait); | |||
| 109 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); | 109 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); |
| 110 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); | 110 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); |
| 111 | static void ocfs2_release_system_inodes(struct ocfs2_super *osb); | 111 | static void ocfs2_release_system_inodes(struct ocfs2_super *osb); |
| 112 | static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); | ||
| 113 | static int ocfs2_check_volume(struct ocfs2_super *osb); | 112 | static int ocfs2_check_volume(struct ocfs2_super *osb); |
| 114 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, | 113 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, |
| 115 | struct buffer_head *bh, | 114 | struct buffer_head *bh, |
| @@ -154,6 +153,7 @@ enum { | |||
| 154 | Opt_commit, | 153 | Opt_commit, |
| 155 | Opt_localalloc, | 154 | Opt_localalloc, |
| 156 | Opt_localflocks, | 155 | Opt_localflocks, |
| 156 | Opt_stack, | ||
| 157 | Opt_err, | 157 | Opt_err, |
| 158 | }; | 158 | }; |
| 159 | 159 | ||
| @@ -172,6 +172,7 @@ static match_table_t tokens = { | |||
| 172 | {Opt_commit, "commit=%u"}, | 172 | {Opt_commit, "commit=%u"}, |
| 173 | {Opt_localalloc, "localalloc=%d"}, | 173 | {Opt_localalloc, "localalloc=%d"}, |
| 174 | {Opt_localflocks, "localflocks"}, | 174 | {Opt_localflocks, "localflocks"}, |
| 175 | {Opt_stack, "cluster_stack=%s"}, | ||
| 175 | {Opt_err, NULL} | 176 | {Opt_err, NULL} |
| 176 | }; | 177 | }; |
| 177 | 178 | ||
| @@ -551,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) | |||
| 551 | } | 552 | } |
| 552 | } | 553 | } |
| 553 | 554 | ||
| 555 | if (ocfs2_userspace_stack(osb)) { | ||
| 556 | if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { | ||
| 557 | mlog(ML_ERROR, "Userspace stack expected, but " | ||
| 558 | "o2cb heartbeat arguments passed to mount\n"); | ||
| 559 | return -EINVAL; | ||
| 560 | } | ||
| 561 | } | ||
| 562 | |||
| 554 | if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { | 563 | if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { |
| 555 | if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) { | 564 | if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && |
| 565 | !ocfs2_userspace_stack(osb)) { | ||
| 556 | mlog(ML_ERROR, "Heartbeat has to be started to mount " | 566 | mlog(ML_ERROR, "Heartbeat has to be started to mount " |
| 557 | "a read-write clustered device.\n"); | 567 | "a read-write clustered device.\n"); |
| 558 | return -EINVAL; | 568 | return -EINVAL; |
| @@ -562,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) | |||
| 562 | return 0; | 572 | return 0; |
| 563 | } | 573 | } |
| 564 | 574 | ||
| 575 | /* | ||
| 576 | * If we're using a userspace stack, mount should have passed | ||
| 577 | * a name that matches the disk. If not, mount should not | ||
| 578 | * have passed a stack. | ||
| 579 | */ | ||
| 580 | static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, | ||
| 581 | struct mount_options *mopt) | ||
| 582 | { | ||
| 583 | if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { | ||
| 584 | mlog(ML_ERROR, | ||
| 585 | "cluster stack passed to mount, but this filesystem " | ||
| 586 | "does not support it\n"); | ||
| 587 | return -EINVAL; | ||
| 588 | } | ||
| 589 | |||
| 590 | if (ocfs2_userspace_stack(osb) && | ||
| 591 | strncmp(osb->osb_cluster_stack, mopt->cluster_stack, | ||
| 592 | OCFS2_STACK_LABEL_LEN)) { | ||
| 593 | mlog(ML_ERROR, | ||
| 594 | "cluster stack passed to mount (\"%s\") does not " | ||
| 595 | "match the filesystem (\"%s\")\n", | ||
| 596 | mopt->cluster_stack, | ||
| 597 | osb->osb_cluster_stack); | ||
| 598 | return -EINVAL; | ||
| 599 | } | ||
| 600 | |||
| 601 | return 0; | ||
| 602 | } | ||
| 603 | |||
| 565 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | 604 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) |
| 566 | { | 605 | { |
| 567 | struct dentry *root; | 606 | struct dentry *root; |
| @@ -579,15 +618,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
| 579 | goto read_super_error; | 618 | goto read_super_error; |
| 580 | } | 619 | } |
| 581 | 620 | ||
| 582 | /* for now we only have one cluster/node, make sure we see it | ||
| 583 | * in the heartbeat universe */ | ||
| 584 | if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) { | ||
| 585 | if (!o2hb_check_local_node_heartbeating()) { | ||
| 586 | status = -EINVAL; | ||
| 587 | goto read_super_error; | ||
| 588 | } | ||
| 589 | } | ||
| 590 | |||
| 591 | /* probe for superblock */ | 621 | /* probe for superblock */ |
| 592 | status = ocfs2_sb_probe(sb, &bh, §or_size); | 622 | status = ocfs2_sb_probe(sb, &bh, §or_size); |
| 593 | if (status < 0) { | 623 | if (status < 0) { |
| @@ -609,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
| 609 | osb->osb_commit_interval = parsed_options.commit_interval; | 639 | osb->osb_commit_interval = parsed_options.commit_interval; |
| 610 | osb->local_alloc_size = parsed_options.localalloc_opt; | 640 | osb->local_alloc_size = parsed_options.localalloc_opt; |
| 611 | 641 | ||
| 642 | status = ocfs2_verify_userspace_stack(osb, &parsed_options); | ||
| 643 | if (status) | ||
| 644 | goto read_super_error; | ||
| 645 | |||
| 612 | sb->s_magic = OCFS2_SUPER_MAGIC; | 646 | sb->s_magic = OCFS2_SUPER_MAGIC; |
| 613 | 647 | ||
| 614 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, | 648 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, |
| @@ -694,7 +728,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
| 694 | if (ocfs2_mount_local(osb)) | 728 | if (ocfs2_mount_local(osb)) |
| 695 | snprintf(nodestr, sizeof(nodestr), "local"); | 729 | snprintf(nodestr, sizeof(nodestr), "local"); |
| 696 | else | 730 | else |
| 697 | snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); | 731 | snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); |
| 698 | 732 | ||
| 699 | printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " | 733 | printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " |
| 700 | "with %s data mode.\n", | 734 | "with %s data mode.\n", |
| @@ -763,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
| 763 | mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; | 797 | mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; |
| 764 | mopt->slot = OCFS2_INVALID_SLOT; | 798 | mopt->slot = OCFS2_INVALID_SLOT; |
| 765 | mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; | 799 | mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; |
| 800 | mopt->cluster_stack[0] = '\0'; | ||
| 766 | 801 | ||
| 767 | if (!options) { | 802 | if (!options) { |
| 768 | status = 1; | 803 | status = 1; |
| @@ -864,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
| 864 | if (!is_remount) | 899 | if (!is_remount) |
| 865 | mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; | 900 | mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; |
| 866 | break; | 901 | break; |
| 902 | case Opt_stack: | ||
| 903 | /* Check both that the option we were passed | ||
| 904 | * is of the right length and that it is a proper | ||
| 905 | * string of the right length. | ||
| 906 | */ | ||
| 907 | if (((args[0].to - args[0].from) != | ||
| 908 | OCFS2_STACK_LABEL_LEN) || | ||
| 909 | (strnlen(args[0].from, | ||
| 910 | OCFS2_STACK_LABEL_LEN) != | ||
| 911 | OCFS2_STACK_LABEL_LEN)) { | ||
| 912 | mlog(ML_ERROR, | ||
| 913 | "Invalid cluster_stack option\n"); | ||
| 914 | status = 0; | ||
| 915 | goto bail; | ||
| 916 | } | ||
| 917 | memcpy(mopt->cluster_stack, args[0].from, | ||
| 918 | OCFS2_STACK_LABEL_LEN); | ||
| 919 | mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; | ||
| 920 | break; | ||
| 867 | default: | 921 | default: |
| 868 | mlog(ML_ERROR, | 922 | mlog(ML_ERROR, |
| 869 | "Unrecognized mount option \"%s\" " | 923 | "Unrecognized mount option \"%s\" " |
| @@ -922,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
| 922 | if (opts & OCFS2_MOUNT_LOCALFLOCKS) | 976 | if (opts & OCFS2_MOUNT_LOCALFLOCKS) |
| 923 | seq_printf(s, ",localflocks,"); | 977 | seq_printf(s, ",localflocks,"); |
| 924 | 978 | ||
| 979 | if (osb->osb_cluster_stack[0]) | ||
| 980 | seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, | ||
| 981 | osb->osb_cluster_stack); | ||
| 982 | |||
| 925 | return 0; | 983 | return 0; |
| 926 | } | 984 | } |
| 927 | 985 | ||
| @@ -957,6 +1015,8 @@ static int __init ocfs2_init(void) | |||
| 957 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | 1015 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); |
| 958 | } | 1016 | } |
| 959 | 1017 | ||
| 1018 | ocfs2_set_locking_protocol(); | ||
| 1019 | |||
| 960 | leave: | 1020 | leave: |
| 961 | if (status < 0) { | 1021 | if (status < 0) { |
| 962 | ocfs2_free_mem_caches(); | 1022 | ocfs2_free_mem_caches(); |
| @@ -1132,31 +1192,6 @@ static int ocfs2_get_sector(struct super_block *sb, | |||
| 1132 | return 0; | 1192 | return 0; |
| 1133 | } | 1193 | } |
| 1134 | 1194 | ||
| 1135 | /* ocfs2 1.0 only allows one cluster and node identity per kernel image. */ | ||
| 1136 | static int ocfs2_fill_local_node_info(struct ocfs2_super *osb) | ||
| 1137 | { | ||
| 1138 | int status; | ||
| 1139 | |||
| 1140 | /* XXX hold a ref on the node while mounte? easy enough, if | ||
| 1141 | * desirable. */ | ||
| 1142 | if (ocfs2_mount_local(osb)) | ||
| 1143 | osb->node_num = 0; | ||
| 1144 | else | ||
| 1145 | osb->node_num = o2nm_this_node(); | ||
| 1146 | |||
| 1147 | if (osb->node_num == O2NM_MAX_NODES) { | ||
| 1148 | mlog(ML_ERROR, "could not find this host's node number\n"); | ||
| 1149 | status = -ENOENT; | ||
| 1150 | goto bail; | ||
| 1151 | } | ||
| 1152 | |||
| 1153 | mlog(0, "I am node %d\n", osb->node_num); | ||
| 1154 | |||
| 1155 | status = 0; | ||
| 1156 | bail: | ||
| 1157 | return status; | ||
| 1158 | } | ||
| 1159 | |||
| 1160 | static int ocfs2_mount_volume(struct super_block *sb) | 1195 | static int ocfs2_mount_volume(struct super_block *sb) |
| 1161 | { | 1196 | { |
| 1162 | int status = 0; | 1197 | int status = 0; |
| @@ -1168,12 +1203,6 @@ static int ocfs2_mount_volume(struct super_block *sb) | |||
| 1168 | if (ocfs2_is_hard_readonly(osb)) | 1203 | if (ocfs2_is_hard_readonly(osb)) |
| 1169 | goto leave; | 1204 | goto leave; |
| 1170 | 1205 | ||
| 1171 | status = ocfs2_fill_local_node_info(osb); | ||
| 1172 | if (status < 0) { | ||
| 1173 | mlog_errno(status); | ||
| 1174 | goto leave; | ||
| 1175 | } | ||
| 1176 | |||
| 1177 | status = ocfs2_dlm_init(osb); | 1206 | status = ocfs2_dlm_init(osb); |
| 1178 | if (status < 0) { | 1207 | if (status < 0) { |
| 1179 | mlog_errno(status); | 1208 | mlog_errno(status); |
| @@ -1224,18 +1253,9 @@ leave: | |||
| 1224 | return status; | 1253 | return status; |
| 1225 | } | 1254 | } |
| 1226 | 1255 | ||
| 1227 | /* we can't grab the goofy sem lock from inside wait_event, so we use | ||
| 1228 | * memory barriers to make sure that we'll see the null task before | ||
| 1229 | * being woken up */ | ||
| 1230 | static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) | ||
| 1231 | { | ||
| 1232 | mb(); | ||
| 1233 | return osb->recovery_thread_task != NULL; | ||
| 1234 | } | ||
| 1235 | |||
| 1236 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | 1256 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) |
| 1237 | { | 1257 | { |
| 1238 | int tmp; | 1258 | int tmp, hangup_needed = 0; |
| 1239 | struct ocfs2_super *osb = NULL; | 1259 | struct ocfs2_super *osb = NULL; |
| 1240 | char nodestr[8]; | 1260 | char nodestr[8]; |
| 1241 | 1261 | ||
| @@ -1249,25 +1269,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
| 1249 | 1269 | ||
| 1250 | ocfs2_truncate_log_shutdown(osb); | 1270 | ocfs2_truncate_log_shutdown(osb); |
| 1251 | 1271 | ||
| 1252 | /* disable any new recovery threads and wait for any currently | 1272 | /* This will disable recovery and flush any recovery work. */ |
| 1253 | * running ones to exit. Do this before setting the vol_state. */ | 1273 | ocfs2_recovery_exit(osb); |
| 1254 | mutex_lock(&osb->recovery_lock); | ||
| 1255 | osb->disable_recovery = 1; | ||
| 1256 | mutex_unlock(&osb->recovery_lock); | ||
| 1257 | wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); | ||
| 1258 | |||
| 1259 | /* At this point, we know that no more recovery threads can be | ||
| 1260 | * launched, so wait for any recovery completion work to | ||
| 1261 | * complete. */ | ||
| 1262 | flush_workqueue(ocfs2_wq); | ||
| 1263 | 1274 | ||
| 1264 | ocfs2_journal_shutdown(osb); | 1275 | ocfs2_journal_shutdown(osb); |
| 1265 | 1276 | ||
| 1266 | ocfs2_sync_blockdev(sb); | 1277 | ocfs2_sync_blockdev(sb); |
| 1267 | 1278 | ||
| 1268 | /* No dlm means we've failed during mount, so skip all the | 1279 | /* No cluster connection means we've failed during mount, so skip |
| 1269 | * steps which depended on that to complete. */ | 1280 | * all the steps which depended on that to complete. */ |
| 1270 | if (osb->dlm) { | 1281 | if (osb->cconn) { |
| 1271 | tmp = ocfs2_super_lock(osb, 1); | 1282 | tmp = ocfs2_super_lock(osb, 1); |
| 1272 | if (tmp < 0) { | 1283 | if (tmp < 0) { |
| 1273 | mlog_errno(tmp); | 1284 | mlog_errno(tmp); |
| @@ -1278,25 +1289,34 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
| 1278 | if (osb->slot_num != OCFS2_INVALID_SLOT) | 1289 | if (osb->slot_num != OCFS2_INVALID_SLOT) |
| 1279 | ocfs2_put_slot(osb); | 1290 | ocfs2_put_slot(osb); |
| 1280 | 1291 | ||
| 1281 | if (osb->dlm) | 1292 | if (osb->cconn) |
| 1282 | ocfs2_super_unlock(osb, 1); | 1293 | ocfs2_super_unlock(osb, 1); |
| 1283 | 1294 | ||
| 1284 | ocfs2_release_system_inodes(osb); | 1295 | ocfs2_release_system_inodes(osb); |
| 1285 | 1296 | ||
| 1286 | if (osb->dlm) | 1297 | /* |
| 1287 | ocfs2_dlm_shutdown(osb); | 1298 | * If we're dismounting due to mount error, mount.ocfs2 will clean |
| 1299 | * up heartbeat. If we're a local mount, there is no heartbeat. | ||
| 1300 | * If we failed before we got a uuid_str yet, we can't stop | ||
| 1301 | * heartbeat. Otherwise, do it. | ||
| 1302 | */ | ||
| 1303 | if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) | ||
| 1304 | hangup_needed = 1; | ||
| 1305 | |||
| 1306 | if (osb->cconn) | ||
| 1307 | ocfs2_dlm_shutdown(osb, hangup_needed); | ||
| 1288 | 1308 | ||
| 1289 | debugfs_remove(osb->osb_debug_root); | 1309 | debugfs_remove(osb->osb_debug_root); |
| 1290 | 1310 | ||
| 1291 | if (!mnt_err) | 1311 | if (hangup_needed) |
| 1292 | ocfs2_stop_heartbeat(osb); | 1312 | ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); |
| 1293 | 1313 | ||
| 1294 | atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); | 1314 | atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); |
| 1295 | 1315 | ||
| 1296 | if (ocfs2_mount_local(osb)) | 1316 | if (ocfs2_mount_local(osb)) |
| 1297 | snprintf(nodestr, sizeof(nodestr), "local"); | 1317 | snprintf(nodestr, sizeof(nodestr), "local"); |
| 1298 | else | 1318 | else |
| 1299 | snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); | 1319 | snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); |
| 1300 | 1320 | ||
| 1301 | printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", | 1321 | printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", |
| 1302 | osb->dev_str, nodestr); | 1322 | osb->dev_str, nodestr); |
| @@ -1355,7 +1375,6 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
| 1355 | sb->s_fs_info = osb; | 1375 | sb->s_fs_info = osb; |
| 1356 | sb->s_op = &ocfs2_sops; | 1376 | sb->s_op = &ocfs2_sops; |
| 1357 | sb->s_export_op = &ocfs2_export_ops; | 1377 | sb->s_export_op = &ocfs2_export_ops; |
| 1358 | osb->osb_locking_proto = ocfs2_locking_protocol; | ||
| 1359 | sb->s_time_gran = 1; | 1378 | sb->s_time_gran = 1; |
| 1360 | sb->s_flags |= MS_NOATIME; | 1379 | sb->s_flags |= MS_NOATIME; |
| 1361 | /* this is needed to support O_LARGEFILE */ | 1380 | /* this is needed to support O_LARGEFILE */ |
| @@ -1368,7 +1387,6 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
| 1368 | osb->s_sectsize_bits = blksize_bits(sector_size); | 1387 | osb->s_sectsize_bits = blksize_bits(sector_size); |
| 1369 | BUG_ON(!osb->s_sectsize_bits); | 1388 | BUG_ON(!osb->s_sectsize_bits); |
| 1370 | 1389 | ||
| 1371 | init_waitqueue_head(&osb->recovery_event); | ||
| 1372 | spin_lock_init(&osb->dc_task_lock); | 1390 | spin_lock_init(&osb->dc_task_lock); |
| 1373 | init_waitqueue_head(&osb->dc_event); | 1391 | init_waitqueue_head(&osb->dc_event); |
| 1374 | osb->dc_work_sequence = 0; | 1392 | osb->dc_work_sequence = 0; |
| @@ -1376,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
| 1376 | INIT_LIST_HEAD(&osb->blocked_lock_list); | 1394 | INIT_LIST_HEAD(&osb->blocked_lock_list); |
| 1377 | osb->blocked_lock_count = 0; | 1395 | osb->blocked_lock_count = 0; |
| 1378 | spin_lock_init(&osb->osb_lock); | 1396 | spin_lock_init(&osb->osb_lock); |
| 1397 | ocfs2_init_inode_steal_slot(osb); | ||
| 1379 | 1398 | ||
| 1380 | atomic_set(&osb->alloc_stats.moves, 0); | 1399 | atomic_set(&osb->alloc_stats.moves, 0); |
| 1381 | atomic_set(&osb->alloc_stats.local_data, 0); | 1400 | atomic_set(&osb->alloc_stats.local_data, 0); |
| @@ -1388,24 +1407,23 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
| 1388 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", | 1407 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", |
| 1389 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | 1408 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); |
| 1390 | 1409 | ||
| 1391 | mutex_init(&osb->recovery_lock); | 1410 | status = ocfs2_recovery_init(osb); |
| 1392 | 1411 | if (status) { | |
| 1393 | osb->disable_recovery = 0; | 1412 | mlog(ML_ERROR, "Unable to initialize recovery state\n"); |
| 1394 | osb->recovery_thread_task = NULL; | 1413 | mlog_errno(status); |
| 1414 | goto bail; | ||
| 1415 | } | ||
| 1395 | 1416 | ||
| 1396 | init_waitqueue_head(&osb->checkpoint_event); | 1417 | init_waitqueue_head(&osb->checkpoint_event); |
| 1397 | atomic_set(&osb->needs_checkpoint, 0); | 1418 | atomic_set(&osb->needs_checkpoint, 0); |
| 1398 | 1419 | ||
| 1399 | osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; | 1420 | osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; |
| 1400 | 1421 | ||
| 1401 | osb->node_num = O2NM_INVALID_NODE_NUM; | ||
| 1402 | osb->slot_num = OCFS2_INVALID_SLOT; | 1422 | osb->slot_num = OCFS2_INVALID_SLOT; |
| 1403 | 1423 | ||
| 1404 | osb->local_alloc_state = OCFS2_LA_UNUSED; | 1424 | osb->local_alloc_state = OCFS2_LA_UNUSED; |
| 1405 | osb->local_alloc_bh = NULL; | 1425 | osb->local_alloc_bh = NULL; |
| 1406 | 1426 | ||
| 1407 | ocfs2_setup_hb_callbacks(osb); | ||
| 1408 | |||
| 1409 | init_waitqueue_head(&osb->osb_mount_event); | 1427 | init_waitqueue_head(&osb->osb_mount_event); |
| 1410 | 1428 | ||
| 1411 | osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); | 1429 | osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); |
| @@ -1455,6 +1473,25 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
| 1455 | goto bail; | 1473 | goto bail; |
| 1456 | } | 1474 | } |
| 1457 | 1475 | ||
| 1476 | if (ocfs2_userspace_stack(osb)) { | ||
| 1477 | memcpy(osb->osb_cluster_stack, | ||
| 1478 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, | ||
| 1479 | OCFS2_STACK_LABEL_LEN); | ||
| 1480 | osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; | ||
| 1481 | if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { | ||
| 1482 | mlog(ML_ERROR, | ||
| 1483 | "couldn't mount because of an invalid " | ||
| 1484 | "cluster stack label (%s) \n", | ||
| 1485 | osb->osb_cluster_stack); | ||
| 1486 | status = -EINVAL; | ||
| 1487 | goto bail; | ||
| 1488 | } | ||
| 1489 | } else { | ||
| 1490 | /* The empty string is identical with classic tools that | ||
| 1491 | * don't know about s_cluster_info. */ | ||
| 1492 | osb->osb_cluster_stack[0] = '\0'; | ||
| 1493 | } | ||
| 1494 | |||
| 1458 | get_random_bytes(&osb->s_next_generation, sizeof(u32)); | 1495 | get_random_bytes(&osb->s_next_generation, sizeof(u32)); |
| 1459 | 1496 | ||
| 1460 | /* FIXME | 1497 | /* FIXME |
| @@ -1724,8 +1761,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) | |||
| 1724 | 1761 | ||
| 1725 | /* This function assumes that the caller has the main osb resource */ | 1762 | /* This function assumes that the caller has the main osb resource */ |
| 1726 | 1763 | ||
| 1727 | if (osb->slot_info) | 1764 | ocfs2_free_slot_info(osb); |
| 1728 | ocfs2_free_slot_info(osb->slot_info); | ||
| 1729 | 1765 | ||
| 1730 | kfree(osb->osb_orphan_wipes); | 1766 | kfree(osb->osb_orphan_wipes); |
| 1731 | /* FIXME | 1767 | /* FIXME |
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4caa5f774fb7..13cd7835d0df 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c | |||
| @@ -44,7 +44,9 @@ int seq_open_net(struct inode *ino, struct file *f, | |||
| 44 | put_net(net); | 44 | put_net(net); |
| 45 | return -ENOMEM; | 45 | return -ENOMEM; |
| 46 | } | 46 | } |
| 47 | #ifdef CONFIG_NET_NS | ||
| 47 | p->net = net; | 48 | p->net = net; |
| 49 | #endif | ||
| 48 | return 0; | 50 | return 0; |
| 49 | } | 51 | } |
| 50 | EXPORT_SYMBOL_GPL(seq_open_net); | 52 | EXPORT_SYMBOL_GPL(seq_open_net); |
| @@ -52,12 +54,10 @@ EXPORT_SYMBOL_GPL(seq_open_net); | |||
| 52 | int seq_release_net(struct inode *ino, struct file *f) | 54 | int seq_release_net(struct inode *ino, struct file *f) |
| 53 | { | 55 | { |
| 54 | struct seq_file *seq; | 56 | struct seq_file *seq; |
| 55 | struct seq_net_private *p; | ||
| 56 | 57 | ||
| 57 | seq = f->private_data; | 58 | seq = f->private_data; |
| 58 | p = seq->private; | ||
| 59 | 59 | ||
| 60 | put_net(p->net); | 60 | put_net(seq_file_net(seq)); |
| 61 | seq_release_private(ino, f); | 61 | seq_release_private(ino, f); |
| 62 | return 0; | 62 | return 0; |
| 63 | } | 63 | } |
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 5f66c4466151..817f5966edca 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c | |||
| @@ -87,7 +87,14 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char | |||
| 87 | 87 | ||
| 88 | void sysfs_remove_link(struct kobject * kobj, const char * name) | 88 | void sysfs_remove_link(struct kobject * kobj, const char * name) |
| 89 | { | 89 | { |
| 90 | sysfs_hash_and_remove(kobj->sd, name); | 90 | struct sysfs_dirent *parent_sd = NULL; |
| 91 | |||
| 92 | if (!kobj) | ||
| 93 | parent_sd = &sysfs_root; | ||
| 94 | else | ||
| 95 | parent_sd = kobj->sd; | ||
| 96 | |||
| 97 | sysfs_hash_and_remove(parent_sd, name); | ||
| 91 | } | 98 | } |
| 92 | 99 | ||
| 93 | static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, | 100 | static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, |
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 35115bca036e..524021ff5436 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig | |||
| @@ -35,18 +35,6 @@ config XFS_QUOTA | |||
| 35 | with or without the generic quota support enabled (CONFIG_QUOTA) - | 35 | with or without the generic quota support enabled (CONFIG_QUOTA) - |
| 36 | they are completely independent subsystems. | 36 | they are completely independent subsystems. |
| 37 | 37 | ||
| 38 | config XFS_SECURITY | ||
| 39 | bool "XFS Security Label support" | ||
| 40 | depends on XFS_FS | ||
| 41 | help | ||
| 42 | Security labels support alternative access control models | ||
| 43 | implemented by security modules like SELinux. This option | ||
| 44 | enables an extended attribute namespace for inode security | ||
| 45 | labels in the XFS filesystem. | ||
| 46 | |||
| 47 | If you are not using a security module that requires using | ||
| 48 | extended attributes for inode security labels, say N. | ||
| 49 | |||
| 50 | config XFS_POSIX_ACL | 38 | config XFS_POSIX_ACL |
| 51 | bool "XFS POSIX ACL support" | 39 | bool "XFS POSIX ACL support" |
| 52 | depends on XFS_FS | 40 | depends on XFS_FS |
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index e040f1ce1b6a..9b1bb17a0501 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c | |||
| @@ -37,7 +37,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) | |||
| 37 | #ifdef DEBUG | 37 | #ifdef DEBUG |
| 38 | if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) { | 38 | if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) { |
| 39 | printk(KERN_WARNING "Large %s attempt, size=%ld\n", | 39 | printk(KERN_WARNING "Large %s attempt, size=%ld\n", |
| 40 | __FUNCTION__, (long)size); | 40 | __func__, (long)size); |
| 41 | dump_stack(); | 41 | dump_stack(); |
| 42 | } | 42 | } |
| 43 | #endif | 43 | #endif |
| @@ -52,7 +52,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) | |||
| 52 | if (!(++retries % 100)) | 52 | if (!(++retries % 100)) |
| 53 | printk(KERN_ERR "XFS: possible memory allocation " | 53 | printk(KERN_ERR "XFS: possible memory allocation " |
| 54 | "deadlock in %s (mode:0x%x)\n", | 54 | "deadlock in %s (mode:0x%x)\n", |
| 55 | __FUNCTION__, lflags); | 55 | __func__, lflags); |
| 56 | congestion_wait(WRITE, HZ/50); | 56 | congestion_wait(WRITE, HZ/50); |
| 57 | } while (1); | 57 | } while (1); |
| 58 | } | 58 | } |
| @@ -129,7 +129,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) | |||
| 129 | if (!(++retries % 100)) | 129 | if (!(++retries % 100)) |
| 130 | printk(KERN_ERR "XFS: possible memory allocation " | 130 | printk(KERN_ERR "XFS: possible memory allocation " |
| 131 | "deadlock in %s (mode:0x%x)\n", | 131 | "deadlock in %s (mode:0x%x)\n", |
| 132 | __FUNCTION__, lflags); | 132 | __func__, lflags); |
| 133 | congestion_wait(WRITE, HZ/50); | 133 | congestion_wait(WRITE, HZ/50); |
| 134 | } while (1); | 134 | } while (1); |
| 135 | } | 135 | } |
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index e0519529c26c..a55c3b26d840 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
| @@ -243,8 +243,12 @@ xfs_end_bio_unwritten( | |||
| 243 | size_t size = ioend->io_size; | 243 | size_t size = ioend->io_size; |
| 244 | 244 | ||
| 245 | if (likely(!ioend->io_error)) { | 245 | if (likely(!ioend->io_error)) { |
| 246 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) | 246 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
| 247 | xfs_iomap_write_unwritten(ip, offset, size); | 247 | int error; |
| 248 | error = xfs_iomap_write_unwritten(ip, offset, size); | ||
| 249 | if (error) | ||
| 250 | ioend->io_error = error; | ||
| 251 | } | ||
| 248 | xfs_setfilesize(ioend); | 252 | xfs_setfilesize(ioend); |
| 249 | } | 253 | } |
| 250 | xfs_destroy_ioend(ioend); | 254 | xfs_destroy_ioend(ioend); |
| @@ -1532,9 +1536,9 @@ xfs_vm_bmap( | |||
| 1532 | struct xfs_inode *ip = XFS_I(inode); | 1536 | struct xfs_inode *ip = XFS_I(inode); |
| 1533 | 1537 | ||
| 1534 | xfs_itrace_entry(XFS_I(inode)); | 1538 | xfs_itrace_entry(XFS_I(inode)); |
| 1535 | xfs_rwlock(ip, VRWLOCK_READ); | 1539 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
| 1536 | xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); | 1540 | xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); |
| 1537 | xfs_rwunlock(ip, VRWLOCK_READ); | 1541 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
| 1538 | return generic_block_bmap(mapping, block, xfs_get_blocks); | 1542 | return generic_block_bmap(mapping, block, xfs_get_blocks); |
| 1539 | } | 1543 | } |
| 1540 | 1544 | ||
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e347bfd47c91..52f6846101d5 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c | |||
| @@ -400,7 +400,7 @@ _xfs_buf_lookup_pages( | |||
| 400 | printk(KERN_ERR | 400 | printk(KERN_ERR |
| 401 | "XFS: possible memory allocation " | 401 | "XFS: possible memory allocation " |
| 402 | "deadlock in %s (mode:0x%x)\n", | 402 | "deadlock in %s (mode:0x%x)\n", |
| 403 | __FUNCTION__, gfp_mask); | 403 | __func__, gfp_mask); |
| 404 | 404 | ||
| 405 | XFS_STATS_INC(xb_page_retries); | 405 | XFS_STATS_INC(xb_page_retries); |
| 406 | xfsbufd_wakeup(0, gfp_mask); | 406 | xfsbufd_wakeup(0, gfp_mask); |
| @@ -598,7 +598,7 @@ xfs_buf_get_flags( | |||
| 598 | error = _xfs_buf_map_pages(bp, flags); | 598 | error = _xfs_buf_map_pages(bp, flags); |
| 599 | if (unlikely(error)) { | 599 | if (unlikely(error)) { |
| 600 | printk(KERN_WARNING "%s: failed to map pages\n", | 600 | printk(KERN_WARNING "%s: failed to map pages\n", |
| 601 | __FUNCTION__); | 601 | __func__); |
| 602 | goto no_buffer; | 602 | goto no_buffer; |
| 603 | } | 603 | } |
| 604 | } | 604 | } |
| @@ -778,7 +778,7 @@ xfs_buf_get_noaddr( | |||
| 778 | error = _xfs_buf_map_pages(bp, XBF_MAPPED); | 778 | error = _xfs_buf_map_pages(bp, XBF_MAPPED); |
| 779 | if (unlikely(error)) { | 779 | if (unlikely(error)) { |
| 780 | printk(KERN_WARNING "%s: failed to map pages\n", | 780 | printk(KERN_WARNING "%s: failed to map pages\n", |
| 781 | __FUNCTION__); | 781 | __func__); |
| 782 | goto fail_free_mem; | 782 | goto fail_free_mem; |
| 783 | } | 783 | } |
| 784 | 784 | ||
| @@ -1060,7 +1060,7 @@ xfs_buf_iostart( | |||
| 1060 | bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); | 1060 | bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); |
| 1061 | bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); | 1061 | bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); |
| 1062 | xfs_buf_delwri_queue(bp, 1); | 1062 | xfs_buf_delwri_queue(bp, 1); |
| 1063 | return status; | 1063 | return 0; |
| 1064 | } | 1064 | } |
| 1065 | 1065 | ||
| 1066 | bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ | 1066 | bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ |
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a3d207de48b8..841d7883528d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h | |||
| @@ -387,11 +387,15 @@ static inline int XFS_bwrite(xfs_buf_t *bp) | |||
| 387 | return error; | 387 | return error; |
| 388 | } | 388 | } |
| 389 | 389 | ||
| 390 | static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) | 390 | /* |
| 391 | * No error can be returned from xfs_buf_iostart for delwri | ||
| 392 | * buffers as they are queued and no I/O is issued. | ||
| 393 | */ | ||
| 394 | static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp) | ||
| 391 | { | 395 | { |
| 392 | bp->b_strat = xfs_bdstrat_cb; | 396 | bp->b_strat = xfs_bdstrat_cb; |
| 393 | bp->b_fspriv3 = mp; | 397 | bp->b_fspriv3 = mp; |
| 394 | return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); | 398 | (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); |
| 395 | } | 399 | } |
| 396 | 400 | ||
| 397 | #define XFS_bdstrat(bp) xfs_buf_iorequest(bp) | 401 | #define XFS_bdstrat(bp) xfs_buf_iorequest(bp) |
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h index e7f3da61c6c3..652721ce0ea5 100644 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ b/fs/xfs/linux-2.6/xfs_cred.h | |||
| @@ -30,7 +30,7 @@ typedef struct cred { | |||
| 30 | extern struct cred *sys_cred; | 30 | extern struct cred *sys_cred; |
| 31 | 31 | ||
| 32 | /* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ | 32 | /* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ |
| 33 | static __inline int capable_cred(cred_t *cr, int cid) | 33 | static inline int capable_cred(cred_t *cr, int cid) |
| 34 | { | 34 | { |
| 35 | return (cr == sys_cred) ? 1 : capable(cid); | 35 | return (cr == sys_cred) ? 1 : capable(cid); |
| 36 | } | 36 | } |
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index ca4f66c4de16..265f0168ab76 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include "xfs_trans.h" | 22 | #include "xfs_trans.h" |
| 23 | #include "xfs_sb.h" | 23 | #include "xfs_sb.h" |
| 24 | #include "xfs_ag.h" | 24 | #include "xfs_ag.h" |
| 25 | #include "xfs_dir2.h" | ||
| 25 | #include "xfs_dmapi.h" | 26 | #include "xfs_dmapi.h" |
| 26 | #include "xfs_mount.h" | 27 | #include "xfs_mount.h" |
| 27 | #include "xfs_export.h" | 28 | #include "xfs_export.h" |
| @@ -30,8 +31,6 @@ | |||
| 30 | #include "xfs_inode.h" | 31 | #include "xfs_inode.h" |
| 31 | #include "xfs_vfsops.h" | 32 | #include "xfs_vfsops.h" |
| 32 | 33 | ||
| 33 | static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, }; | ||
| 34 | |||
| 35 | /* | 34 | /* |
| 36 | * Note that we only accept fileids which are long enough rather than allow | 35 | * Note that we only accept fileids which are long enough rather than allow |
| 37 | * the parent generation number to default to zero. XFS considers zero a | 36 | * the parent generation number to default to zero. XFS considers zero a |
| @@ -66,7 +65,7 @@ xfs_fs_encode_fh( | |||
| 66 | int len; | 65 | int len; |
| 67 | 66 | ||
| 68 | /* Directories don't need their parent encoded, they have ".." */ | 67 | /* Directories don't need their parent encoded, they have ".." */ |
| 69 | if (S_ISDIR(inode->i_mode)) | 68 | if (S_ISDIR(inode->i_mode) || !connectable) |
| 70 | fileid_type = FILEID_INO32_GEN; | 69 | fileid_type = FILEID_INO32_GEN; |
| 71 | else | 70 | else |
| 72 | fileid_type = FILEID_INO32_GEN_PARENT; | 71 | fileid_type = FILEID_INO32_GEN_PARENT; |
| @@ -213,17 +212,16 @@ xfs_fs_get_parent( | |||
| 213 | struct dentry *child) | 212 | struct dentry *child) |
| 214 | { | 213 | { |
| 215 | int error; | 214 | int error; |
| 216 | bhv_vnode_t *cvp; | 215 | struct xfs_inode *cip; |
| 217 | struct dentry *parent; | 216 | struct dentry *parent; |
| 218 | 217 | ||
| 219 | cvp = NULL; | 218 | error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip); |
| 220 | error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cvp); | ||
| 221 | if (unlikely(error)) | 219 | if (unlikely(error)) |
| 222 | return ERR_PTR(-error); | 220 | return ERR_PTR(-error); |
| 223 | 221 | ||
| 224 | parent = d_alloc_anon(vn_to_inode(cvp)); | 222 | parent = d_alloc_anon(cip->i_vnode); |
| 225 | if (unlikely(!parent)) { | 223 | if (unlikely(!parent)) { |
| 226 | VN_RELE(cvp); | 224 | iput(cip->i_vnode); |
| 227 | return ERR_PTR(-ENOMEM); | 225 | return ERR_PTR(-ENOMEM); |
| 228 | } | 226 | } |
| 229 | return parent; | 227 | return parent; |
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index edab1ffbb163..05905246434d 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c | |||
| @@ -469,16 +469,11 @@ xfs_file_open_exec( | |||
| 469 | struct inode *inode) | 469 | struct inode *inode) |
| 470 | { | 470 | { |
| 471 | struct xfs_mount *mp = XFS_M(inode->i_sb); | 471 | struct xfs_mount *mp = XFS_M(inode->i_sb); |
| 472 | struct xfs_inode *ip = XFS_I(inode); | ||
| 472 | 473 | ||
| 473 | if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI)) { | 474 | if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI) && |
| 474 | if (DM_EVENT_ENABLED(XFS_I(inode), DM_EVENT_READ)) { | 475 | DM_EVENT_ENABLED(ip, DM_EVENT_READ)) |
| 475 | bhv_vnode_t *vp = vn_from_inode(inode); | 476 | return -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); |
| 476 | |||
| 477 | return -XFS_SEND_DATA(mp, DM_EVENT_READ, | ||
| 478 | vp, 0, 0, 0, NULL); | ||
| 479 | } | ||
| 480 | } | ||
| 481 | |||
| 482 | return 0; | 477 | return 0; |
| 483 | } | 478 | } |
| 484 | #endif /* HAVE_FOP_OPEN_EXEC */ | 479 | #endif /* HAVE_FOP_OPEN_EXEC */ |
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index ac6d34cc355d..1eefe61f0e10 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c | |||
| @@ -17,18 +17,7 @@ | |||
| 17 | */ | 17 | */ |
| 18 | #include "xfs.h" | 18 | #include "xfs.h" |
| 19 | #include "xfs_vnodeops.h" | 19 | #include "xfs_vnodeops.h" |
| 20 | |||
| 21 | /* | ||
| 22 | * The following six includes are needed so that we can include | ||
| 23 | * xfs_inode.h. What a mess.. | ||
| 24 | */ | ||
| 25 | #include "xfs_bmap_btree.h" | 20 | #include "xfs_bmap_btree.h" |
| 26 | #include "xfs_inum.h" | ||
| 27 | #include "xfs_dir2.h" | ||
| 28 | #include "xfs_dir2_sf.h" | ||
| 29 | #include "xfs_attr_sf.h" | ||
| 30 | #include "xfs_dinode.h" | ||
| 31 | |||
| 32 | #include "xfs_inode.h" | 21 | #include "xfs_inode.h" |
| 33 | 22 | ||
| 34 | int fs_noerr(void) { return 0; } | 23 | int fs_noerr(void) { return 0; } |
| @@ -42,11 +31,10 @@ xfs_tosspages( | |||
| 42 | xfs_off_t last, | 31 | xfs_off_t last, |
| 43 | int fiopt) | 32 | int fiopt) |
| 44 | { | 33 | { |
| 45 | bhv_vnode_t *vp = XFS_ITOV(ip); | 34 | struct address_space *mapping = ip->i_vnode->i_mapping; |
| 46 | struct inode *inode = vn_to_inode(vp); | ||
| 47 | 35 | ||
| 48 | if (VN_CACHED(vp)) | 36 | if (mapping->nrpages) |
| 49 | truncate_inode_pages(inode->i_mapping, first); | 37 | truncate_inode_pages(mapping, first); |
| 50 | } | 38 | } |
| 51 | 39 | ||
| 52 | int | 40 | int |
| @@ -56,15 +44,14 @@ xfs_flushinval_pages( | |||
| 56 | xfs_off_t last, | 44 | xfs_off_t last, |
| 57 | int fiopt) | 45 | int fiopt) |
| 58 | { | 46 | { |
| 59 | bhv_vnode_t *vp = XFS_ITOV(ip); | 47 | struct address_space *mapping = ip->i_vnode->i_mapping; |
| 60 | struct inode *inode = vn_to_inode(vp); | ||
| 61 | int ret = 0; | 48 | int ret = 0; |
| 62 | 49 | ||
| 63 | if (VN_CACHED(vp)) { | 50 | if (mapping->nrpages) { |
| 64 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | 51 | xfs_iflags_clear(ip, XFS_ITRUNCATED); |
| 65 | ret = filemap_write_and_wait(inode->i_mapping); | 52 | ret = filemap_write_and_wait(mapping); |
| 66 | if (!ret) | 53 | if (!ret) |
| 67 | truncate_inode_pages(inode->i_mapping, first); | 54 | truncate_inode_pages(mapping, first); |
| 68 | } | 55 | } |
| 69 | return ret; | 56 | return ret; |
| 70 | } | 57 | } |
| @@ -77,17 +64,16 @@ xfs_flush_pages( | |||
| 77 | uint64_t flags, | 64 | uint64_t flags, |
| 78 | int fiopt) | 65 | int fiopt) |
| 79 | { | 66 | { |
| 80 | bhv_vnode_t *vp = XFS_ITOV(ip); | 67 | struct address_space *mapping = ip->i_vnode->i_mapping; |
| 81 | struct inode *inode = vn_to_inode(vp); | ||
| 82 | int ret = 0; | 68 | int ret = 0; |
| 83 | int ret2; | 69 | int ret2; |
| 84 | 70 | ||
| 85 | if (VN_DIRTY(vp)) { | 71 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { |
| 86 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | 72 | xfs_iflags_clear(ip, XFS_ITRUNCATED); |
| 87 | ret = filemap_fdatawrite(inode->i_mapping); | 73 | ret = filemap_fdatawrite(mapping); |
| 88 | if (flags & XFS_B_ASYNC) | 74 | if (flags & XFS_B_ASYNC) |
| 89 | return ret; | 75 | return ret; |
| 90 | ret2 = filemap_fdatawait(inode->i_mapping); | 76 | ret2 = filemap_fdatawait(mapping); |
| 91 | if (!ret) | 77 | if (!ret) |
| 92 | ret = ret2; | 78 | ret = ret2; |
| 93 | } | 79 | } |
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index f34bd010eb51..bf7759793856 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c | |||
| @@ -651,314 +651,6 @@ xfs_attrmulti_by_handle( | |||
| 651 | return -error; | 651 | return -error; |
| 652 | } | 652 | } |
| 653 | 653 | ||
| 654 | /* prototypes for a few of the stack-hungry cases that have | ||
| 655 | * their own functions. Functions are defined after their use | ||
| 656 | * so gcc doesn't get fancy and inline them with -03 */ | ||
| 657 | |||
| 658 | STATIC int | ||
| 659 | xfs_ioc_space( | ||
| 660 | struct xfs_inode *ip, | ||
| 661 | struct inode *inode, | ||
| 662 | struct file *filp, | ||
| 663 | int flags, | ||
| 664 | unsigned int cmd, | ||
| 665 | void __user *arg); | ||
| 666 | |||
| 667 | STATIC int | ||
| 668 | xfs_ioc_bulkstat( | ||
| 669 | xfs_mount_t *mp, | ||
| 670 | unsigned int cmd, | ||
| 671 | void __user *arg); | ||
| 672 | |||
| 673 | STATIC int | ||
| 674 | xfs_ioc_fsgeometry_v1( | ||
| 675 | xfs_mount_t *mp, | ||
| 676 | void __user *arg); | ||
| 677 | |||
| 678 | STATIC int | ||
| 679 | xfs_ioc_fsgeometry( | ||
| 680 | xfs_mount_t *mp, | ||
| 681 | void __user *arg); | ||
| 682 | |||
| 683 | STATIC int | ||
| 684 | xfs_ioc_xattr( | ||
| 685 | xfs_inode_t *ip, | ||
| 686 | struct file *filp, | ||
| 687 | unsigned int cmd, | ||
| 688 | void __user *arg); | ||
| 689 | |||
| 690 | STATIC int | ||
| 691 | xfs_ioc_fsgetxattr( | ||
| 692 | xfs_inode_t *ip, | ||
| 693 | int attr, | ||
| 694 | void __user *arg); | ||
| 695 | |||
| 696 | STATIC int | ||
| 697 | xfs_ioc_getbmap( | ||
| 698 | struct xfs_inode *ip, | ||
| 699 | int flags, | ||
| 700 | unsigned int cmd, | ||
| 701 | void __user *arg); | ||
| 702 | |||
| 703 | STATIC int | ||
| 704 | xfs_ioc_getbmapx( | ||
| 705 | struct xfs_inode *ip, | ||
| 706 | void __user *arg); | ||
| 707 | |||
| 708 | int | ||
| 709 | xfs_ioctl( | ||
| 710 | xfs_inode_t *ip, | ||
| 711 | struct file *filp, | ||
| 712 | int ioflags, | ||
| 713 | unsigned int cmd, | ||
| 714 | void __user *arg) | ||
| 715 | { | ||
| 716 | struct inode *inode = filp->f_path.dentry->d_inode; | ||
| 717 | xfs_mount_t *mp = ip->i_mount; | ||
| 718 | int error; | ||
| 719 | |||
| 720 | xfs_itrace_entry(XFS_I(inode)); | ||
| 721 | switch (cmd) { | ||
| 722 | |||
| 723 | case XFS_IOC_ALLOCSP: | ||
| 724 | case XFS_IOC_FREESP: | ||
| 725 | case XFS_IOC_RESVSP: | ||
| 726 | case XFS_IOC_UNRESVSP: | ||
| 727 | case XFS_IOC_ALLOCSP64: | ||
| 728 | case XFS_IOC_FREESP64: | ||
| 729 | case XFS_IOC_RESVSP64: | ||
| 730 | case XFS_IOC_UNRESVSP64: | ||
| 731 | /* | ||
| 732 | * Only allow the sys admin to reserve space unless | ||
| 733 | * unwritten extents are enabled. | ||
| 734 | */ | ||
| 735 | if (!xfs_sb_version_hasextflgbit(&mp->m_sb) && | ||
| 736 | !capable(CAP_SYS_ADMIN)) | ||
| 737 | return -EPERM; | ||
| 738 | |||
| 739 | return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg); | ||
| 740 | |||
| 741 | case XFS_IOC_DIOINFO: { | ||
| 742 | struct dioattr da; | ||
| 743 | xfs_buftarg_t *target = | ||
| 744 | XFS_IS_REALTIME_INODE(ip) ? | ||
| 745 | mp->m_rtdev_targp : mp->m_ddev_targp; | ||
| 746 | |||
| 747 | da.d_mem = da.d_miniosz = 1 << target->bt_sshift; | ||
| 748 | da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); | ||
| 749 | |||
| 750 | if (copy_to_user(arg, &da, sizeof(da))) | ||
| 751 | return -XFS_ERROR(EFAULT); | ||
| 752 | return 0; | ||
| 753 | } | ||
| 754 | |||
| 755 | case XFS_IOC_FSBULKSTAT_SINGLE: | ||
| 756 | case XFS_IOC_FSBULKSTAT: | ||
| 757 | case XFS_IOC_FSINUMBERS: | ||
| 758 | return xfs_ioc_bulkstat(mp, cmd, arg); | ||
| 759 | |||
| 760 | case XFS_IOC_FSGEOMETRY_V1: | ||
| 761 | return xfs_ioc_fsgeometry_v1(mp, arg); | ||
| 762 | |||
| 763 | case XFS_IOC_FSGEOMETRY: | ||
| 764 | return xfs_ioc_fsgeometry(mp, arg); | ||
| 765 | |||
| 766 | case XFS_IOC_GETVERSION: | ||
| 767 | return put_user(inode->i_generation, (int __user *)arg); | ||
| 768 | |||
| 769 | case XFS_IOC_FSGETXATTR: | ||
| 770 | return xfs_ioc_fsgetxattr(ip, 0, arg); | ||
| 771 | case XFS_IOC_FSGETXATTRA: | ||
| 772 | return xfs_ioc_fsgetxattr(ip, 1, arg); | ||
| 773 | case XFS_IOC_GETXFLAGS: | ||
| 774 | case XFS_IOC_SETXFLAGS: | ||
| 775 | case XFS_IOC_FSSETXATTR: | ||
| 776 | return xfs_ioc_xattr(ip, filp, cmd, arg); | ||
| 777 | |||
| 778 | case XFS_IOC_FSSETDM: { | ||
| 779 | struct fsdmidata dmi; | ||
| 780 | |||
| 781 | if (copy_from_user(&dmi, arg, sizeof(dmi))) | ||
| 782 | return -XFS_ERROR(EFAULT); | ||
| 783 | |||
| 784 | error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, | ||
| 785 | dmi.fsd_dmstate); | ||
| 786 | return -error; | ||
| 787 | } | ||
| 788 | |||
| 789 | case XFS_IOC_GETBMAP: | ||
| 790 | case XFS_IOC_GETBMAPA: | ||
| 791 | return xfs_ioc_getbmap(ip, ioflags, cmd, arg); | ||
| 792 | |||
| 793 | case XFS_IOC_GETBMAPX: | ||
| 794 | return xfs_ioc_getbmapx(ip, arg); | ||
| 795 | |||
| 796 | case XFS_IOC_FD_TO_HANDLE: | ||
| 797 | case XFS_IOC_PATH_TO_HANDLE: | ||
| 798 | case XFS_IOC_PATH_TO_FSHANDLE: | ||
| 799 | return xfs_find_handle(cmd, arg); | ||
| 800 | |||
| 801 | case XFS_IOC_OPEN_BY_HANDLE: | ||
| 802 | return xfs_open_by_handle(mp, arg, filp, inode); | ||
| 803 | |||
| 804 | case XFS_IOC_FSSETDM_BY_HANDLE: | ||
| 805 | return xfs_fssetdm_by_handle(mp, arg, inode); | ||
| 806 | |||
| 807 | case XFS_IOC_READLINK_BY_HANDLE: | ||
| 808 | return xfs_readlink_by_handle(mp, arg, inode); | ||
| 809 | |||
| 810 | case XFS_IOC_ATTRLIST_BY_HANDLE: | ||
| 811 | return xfs_attrlist_by_handle(mp, arg, inode); | ||
| 812 | |||
| 813 | case XFS_IOC_ATTRMULTI_BY_HANDLE: | ||
| 814 | return xfs_attrmulti_by_handle(mp, arg, inode); | ||
| 815 | |||
| 816 | case XFS_IOC_SWAPEXT: { | ||
| 817 | error = xfs_swapext((struct xfs_swapext __user *)arg); | ||
| 818 | return -error; | ||
| 819 | } | ||
| 820 | |||
| 821 | case XFS_IOC_FSCOUNTS: { | ||
| 822 | xfs_fsop_counts_t out; | ||
| 823 | |||
| 824 | error = xfs_fs_counts(mp, &out); | ||
| 825 | if (error) | ||
| 826 | return -error; | ||
| 827 | |||
| 828 | if (copy_to_user(arg, &out, sizeof(out))) | ||
| 829 | return -XFS_ERROR(EFAULT); | ||
| 830 | return 0; | ||
| 831 | } | ||
| 832 | |||
| 833 | case XFS_IOC_SET_RESBLKS: { | ||
| 834 | xfs_fsop_resblks_t inout; | ||
| 835 | __uint64_t in; | ||
| 836 | |||
| 837 | if (!capable(CAP_SYS_ADMIN)) | ||
| 838 | return -EPERM; | ||
| 839 | |||
| 840 | if (copy_from_user(&inout, arg, sizeof(inout))) | ||
| 841 | return -XFS_ERROR(EFAULT); | ||
| 842 | |||
| 843 | /* input parameter is passed in resblks field of structure */ | ||
| 844 | in = inout.resblks; | ||
| 845 | error = xfs_reserve_blocks(mp, &in, &inout); | ||
| 846 | if (error) | ||
| 847 | return -error; | ||
| 848 | |||
| 849 | if (copy_to_user(arg, &inout, sizeof(inout))) | ||
| 850 | return -XFS_ERROR(EFAULT); | ||
| 851 | return 0; | ||
| 852 | } | ||
| 853 | |||
| 854 | case XFS_IOC_GET_RESBLKS: { | ||
| 855 | xfs_fsop_resblks_t out; | ||
| 856 | |||
| 857 | if (!capable(CAP_SYS_ADMIN)) | ||
| 858 | return -EPERM; | ||
| 859 | |||
| 860 | error = xfs_reserve_blocks(mp, NULL, &out); | ||
| 861 | if (error) | ||
| 862 | return -error; | ||
| 863 | |||
| 864 | if (copy_to_user(arg, &out, sizeof(out))) | ||
| 865 | return -XFS_ERROR(EFAULT); | ||
| 866 | |||
| 867 | return 0; | ||
| 868 | } | ||
| 869 | |||
| 870 | case XFS_IOC_FSGROWFSDATA: { | ||
| 871 | xfs_growfs_data_t in; | ||
| 872 | |||
| 873 | if (!capable(CAP_SYS_ADMIN)) | ||
| 874 | return -EPERM; | ||
| 875 | |||
| 876 | if (copy_from_user(&in, arg, sizeof(in))) | ||
| 877 | return -XFS_ERROR(EFAULT); | ||
| 878 | |||
| 879 | error = xfs_growfs_data(mp, &in); | ||
| 880 | return -error; | ||
| 881 | } | ||
| 882 | |||
| 883 | case XFS_IOC_FSGROWFSLOG: { | ||
| 884 | xfs_growfs_log_t in; | ||
| 885 | |||
| 886 | if (!capable(CAP_SYS_ADMIN)) | ||
| 887 | return -EPERM; | ||
| 888 | |||
| 889 | if (copy_from_user(&in, arg, sizeof(in))) | ||
| 890 | return -XFS_ERROR(EFAULT); | ||
| 891 | |||
| 892 | error = xfs_growfs_log(mp, &in); | ||
| 893 | return -error; | ||
| 894 | } | ||
| 895 | |||
| 896 | case XFS_IOC_FSGROWFSRT: { | ||
| 897 | xfs_growfs_rt_t in; | ||
| 898 | |||
| 899 | if (!capable(CAP_SYS_ADMIN)) | ||
| 900 | return -EPERM; | ||
| 901 | |||
| 902 | if (copy_from_user(&in, arg, sizeof(in))) | ||
| 903 | return -XFS_ERROR(EFAULT); | ||
| 904 | |||
| 905 | error = xfs_growfs_rt(mp, &in); | ||
| 906 | return -error; | ||
| 907 | } | ||
| 908 | |||
| 909 | case XFS_IOC_FREEZE: | ||
| 910 | if (!capable(CAP_SYS_ADMIN)) | ||
| 911 | return -EPERM; | ||
| 912 | |||
| 913 | if (inode->i_sb->s_frozen == SB_UNFROZEN) | ||
| 914 | freeze_bdev(inode->i_sb->s_bdev); | ||
| 915 | return 0; | ||
| 916 | |||
| 917 | case XFS_IOC_THAW: | ||
| 918 | if (!capable(CAP_SYS_ADMIN)) | ||
| 919 | return -EPERM; | ||
| 920 | if (inode->i_sb->s_frozen != SB_UNFROZEN) | ||
| 921 | thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); | ||
| 922 | return 0; | ||
| 923 | |||
| 924 | case XFS_IOC_GOINGDOWN: { | ||
| 925 | __uint32_t in; | ||
| 926 | |||
| 927 | if (!capable(CAP_SYS_ADMIN)) | ||
| 928 | return -EPERM; | ||
| 929 | |||
| 930 | if (get_user(in, (__uint32_t __user *)arg)) | ||
| 931 | return -XFS_ERROR(EFAULT); | ||
| 932 | |||
| 933 | error = xfs_fs_goingdown(mp, in); | ||
| 934 | return -error; | ||
| 935 | } | ||
| 936 | |||
| 937 | case XFS_IOC_ERROR_INJECTION: { | ||
| 938 | xfs_error_injection_t in; | ||
| 939 | |||
| 940 | if (!capable(CAP_SYS_ADMIN)) | ||
| 941 | return -EPERM; | ||
| 942 | |||
| 943 | if (copy_from_user(&in, arg, sizeof(in))) | ||
| 944 | return -XFS_ERROR(EFAULT); | ||
| 945 | |||
| 946 | error = xfs_errortag_add(in.errtag, mp); | ||
| 947 | return -error; | ||
| 948 | } | ||
| 949 | |||
| 950 | case XFS_IOC_ERROR_CLEARALL: | ||
| 951 | if (!capable(CAP_SYS_ADMIN)) | ||
| 952 | return -EPERM; | ||
| 953 | |||
| 954 | error = xfs_errortag_clearall(mp, 1); | ||
| 955 | return -error; | ||
| 956 | |||
| 957 | default: | ||
| 958 | return -ENOTTY; | ||
| 959 | } | ||
| 960 | } | ||
| 961 | |||
| 962 | STATIC int | 654 | STATIC int |
| 963 | xfs_ioc_space( | 655 | xfs_ioc_space( |
| 964 | struct xfs_inode *ip, | 656 | struct xfs_inode *ip, |
| @@ -1179,85 +871,85 @@ xfs_ioc_fsgetxattr( | |||
| 1179 | } | 871 | } |
| 1180 | 872 | ||
| 1181 | STATIC int | 873 | STATIC int |
| 1182 | xfs_ioc_xattr( | 874 | xfs_ioc_fssetxattr( |
| 1183 | xfs_inode_t *ip, | 875 | xfs_inode_t *ip, |
| 1184 | struct file *filp, | 876 | struct file *filp, |
| 1185 | unsigned int cmd, | ||
| 1186 | void __user *arg) | 877 | void __user *arg) |
| 1187 | { | 878 | { |
| 1188 | struct fsxattr fa; | 879 | struct fsxattr fa; |
| 1189 | struct bhv_vattr *vattr; | 880 | struct bhv_vattr *vattr; |
| 1190 | int error = 0; | 881 | int error; |
| 1191 | int attr_flags; | 882 | int attr_flags; |
| 1192 | unsigned int flags; | 883 | |
| 884 | if (copy_from_user(&fa, arg, sizeof(fa))) | ||
| 885 | return -EFAULT; | ||
| 1193 | 886 | ||
| 1194 | vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); | 887 | vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); |
| 1195 | if (unlikely(!vattr)) | 888 | if (unlikely(!vattr)) |
| 1196 | return -ENOMEM; | 889 | return -ENOMEM; |
| 1197 | 890 | ||
| 1198 | switch (cmd) { | 891 | attr_flags = 0; |
| 1199 | case XFS_IOC_FSSETXATTR: { | 892 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) |
| 1200 | if (copy_from_user(&fa, arg, sizeof(fa))) { | 893 | attr_flags |= ATTR_NONBLOCK; |
| 1201 | error = -EFAULT; | ||
| 1202 | break; | ||
| 1203 | } | ||
| 1204 | 894 | ||
| 1205 | attr_flags = 0; | 895 | vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; |
| 1206 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) | 896 | vattr->va_xflags = fa.fsx_xflags; |
| 1207 | attr_flags |= ATTR_NONBLOCK; | 897 | vattr->va_extsize = fa.fsx_extsize; |
| 898 | vattr->va_projid = fa.fsx_projid; | ||
| 1208 | 899 | ||
| 1209 | vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; | 900 | error = -xfs_setattr(ip, vattr, attr_flags, NULL); |
| 1210 | vattr->va_xflags = fa.fsx_xflags; | 901 | if (!error) |
| 1211 | vattr->va_extsize = fa.fsx_extsize; | 902 | vn_revalidate(XFS_ITOV(ip)); /* update flags */ |
| 1212 | vattr->va_projid = fa.fsx_projid; | 903 | kfree(vattr); |
| 904 | return 0; | ||
| 905 | } | ||
| 1213 | 906 | ||
| 1214 | error = xfs_setattr(ip, vattr, attr_flags, NULL); | 907 | STATIC int |
| 1215 | if (likely(!error)) | 908 | xfs_ioc_getxflags( |
| 1216 | vn_revalidate(XFS_ITOV(ip)); /* update flags */ | 909 | xfs_inode_t *ip, |
| 1217 | error = -error; | 910 | void __user *arg) |
| 1218 | break; | 911 | { |
| 1219 | } | 912 | unsigned int flags; |
| 1220 | 913 | ||
| 1221 | case XFS_IOC_GETXFLAGS: { | 914 | flags = xfs_di2lxflags(ip->i_d.di_flags); |
| 1222 | flags = xfs_di2lxflags(ip->i_d.di_flags); | 915 | if (copy_to_user(arg, &flags, sizeof(flags))) |
| 1223 | if (copy_to_user(arg, &flags, sizeof(flags))) | 916 | return -EFAULT; |
| 1224 | error = -EFAULT; | 917 | return 0; |
| 1225 | break; | 918 | } |
| 1226 | } | ||
| 1227 | 919 | ||
| 1228 | case XFS_IOC_SETXFLAGS: { | 920 | STATIC int |
| 1229 | if (copy_from_user(&flags, arg, sizeof(flags))) { | 921 | xfs_ioc_setxflags( |
| 1230 | error = -EFAULT; | 922 | xfs_inode_t *ip, |
| 1231 | break; | 923 | struct file *filp, |
| 1232 | } | 924 | void __user *arg) |
| 925 | { | ||
| 926 | struct bhv_vattr *vattr; | ||
| 927 | unsigned int flags; | ||
| 928 | int attr_flags; | ||
| 929 | int error; | ||
| 1233 | 930 | ||
| 1234 | if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ | 931 | if (copy_from_user(&flags, arg, sizeof(flags))) |
| 1235 | FS_NOATIME_FL | FS_NODUMP_FL | \ | 932 | return -EFAULT; |
| 1236 | FS_SYNC_FL)) { | ||
| 1237 | error = -EOPNOTSUPP; | ||
| 1238 | break; | ||
| 1239 | } | ||
| 1240 | 933 | ||
| 1241 | attr_flags = 0; | 934 | if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ |
| 1242 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) | 935 | FS_NOATIME_FL | FS_NODUMP_FL | \ |
| 1243 | attr_flags |= ATTR_NONBLOCK; | 936 | FS_SYNC_FL)) |
| 937 | return -EOPNOTSUPP; | ||
| 1244 | 938 | ||
| 1245 | vattr->va_mask = XFS_AT_XFLAGS; | 939 | vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); |
| 1246 | vattr->va_xflags = xfs_merge_ioc_xflags(flags, | 940 | if (unlikely(!vattr)) |
| 1247 | xfs_ip2xflags(ip)); | 941 | return -ENOMEM; |
| 1248 | 942 | ||
| 1249 | error = xfs_setattr(ip, vattr, attr_flags, NULL); | 943 | attr_flags = 0; |
| 1250 | if (likely(!error)) | 944 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) |
| 1251 | vn_revalidate(XFS_ITOV(ip)); /* update flags */ | 945 | attr_flags |= ATTR_NONBLOCK; |
| 1252 | error = -error; | ||
| 1253 | break; | ||
| 1254 | } | ||
| 1255 | 946 | ||
| 1256 | default: | 947 | vattr->va_mask = XFS_AT_XFLAGS; |
| 1257 | error = -ENOTTY; | 948 | vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); |
| 1258 | break; | ||
| 1259 | } | ||
| 1260 | 949 | ||
| 950 | error = -xfs_setattr(ip, vattr, attr_flags, NULL); | ||
| 951 | if (likely(!error)) | ||
| 952 | vn_revalidate(XFS_ITOV(ip)); /* update flags */ | ||
| 1261 | kfree(vattr); | 953 | kfree(vattr); |
| 1262 | return error; | 954 | return error; |
| 1263 | } | 955 | } |
| @@ -1332,3 +1024,259 @@ xfs_ioc_getbmapx( | |||
| 1332 | 1024 | ||
| 1333 | return 0; | 1025 | return 0; |
| 1334 | } | 1026 | } |
| 1027 | |||
| 1028 | int | ||
| 1029 | xfs_ioctl( | ||
| 1030 | xfs_inode_t *ip, | ||
| 1031 | struct file *filp, | ||
| 1032 | int ioflags, | ||
| 1033 | unsigned int cmd, | ||
| 1034 | void __user *arg) | ||
| 1035 | { | ||
| 1036 | struct inode *inode = filp->f_path.dentry->d_inode; | ||
| 1037 | xfs_mount_t *mp = ip->i_mount; | ||
| 1038 | int error; | ||
| 1039 | |||
| 1040 | xfs_itrace_entry(XFS_I(inode)); | ||
| 1041 | switch (cmd) { | ||
| 1042 | |||
| 1043 | case XFS_IOC_ALLOCSP: | ||
| 1044 | case XFS_IOC_FREESP: | ||
| 1045 | case XFS_IOC_RESVSP: | ||
| 1046 | case XFS_IOC_UNRESVSP: | ||
| 1047 | case XFS_IOC_ALLOCSP64: | ||
| 1048 | case XFS_IOC_FREESP64: | ||
| 1049 | case XFS_IOC_RESVSP64: | ||
| 1050 | case XFS_IOC_UNRESVSP64: | ||
| 1051 | /* | ||
| 1052 | * Only allow the sys admin to reserve space unless | ||
| 1053 | * unwritten extents are enabled. | ||
| 1054 | */ | ||
| 1055 | if (!xfs_sb_version_hasextflgbit(&mp->m_sb) && | ||
| 1056 | !capable(CAP_SYS_ADMIN)) | ||
| 1057 | return -EPERM; | ||
| 1058 | |||
| 1059 | return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg); | ||
| 1060 | |||
| 1061 | case XFS_IOC_DIOINFO: { | ||
| 1062 | struct dioattr da; | ||
| 1063 | xfs_buftarg_t *target = | ||
| 1064 | XFS_IS_REALTIME_INODE(ip) ? | ||
| 1065 | mp->m_rtdev_targp : mp->m_ddev_targp; | ||
| 1066 | |||
| 1067 | da.d_mem = da.d_miniosz = 1 << target->bt_sshift; | ||
| 1068 | da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); | ||
| 1069 | |||
| 1070 | if (copy_to_user(arg, &da, sizeof(da))) | ||
| 1071 | return -XFS_ERROR(EFAULT); | ||
| 1072 | return 0; | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | case XFS_IOC_FSBULKSTAT_SINGLE: | ||
| 1076 | case XFS_IOC_FSBULKSTAT: | ||
| 1077 | case XFS_IOC_FSINUMBERS: | ||
| 1078 | return xfs_ioc_bulkstat(mp, cmd, arg); | ||
| 1079 | |||
| 1080 | case XFS_IOC_FSGEOMETRY_V1: | ||
| 1081 | return xfs_ioc_fsgeometry_v1(mp, arg); | ||
| 1082 | |||
| 1083 | case XFS_IOC_FSGEOMETRY: | ||
| 1084 | return xfs_ioc_fsgeometry(mp, arg); | ||
| 1085 | |||
| 1086 | case XFS_IOC_GETVERSION: | ||
| 1087 | return put_user(inode->i_generation, (int __user *)arg); | ||
| 1088 | |||
| 1089 | case XFS_IOC_FSGETXATTR: | ||
| 1090 | return xfs_ioc_fsgetxattr(ip, 0, arg); | ||
| 1091 | case XFS_IOC_FSGETXATTRA: | ||
| 1092 | return xfs_ioc_fsgetxattr(ip, 1, arg); | ||
| 1093 | case XFS_IOC_FSSETXATTR: | ||
| 1094 | return xfs_ioc_fssetxattr(ip, filp, arg); | ||
| 1095 | case XFS_IOC_GETXFLAGS: | ||
| 1096 | return xfs_ioc_getxflags(ip, arg); | ||
| 1097 | case XFS_IOC_SETXFLAGS: | ||
| 1098 | return xfs_ioc_setxflags(ip, filp, arg); | ||
| 1099 | |||
| 1100 | case XFS_IOC_FSSETDM: { | ||
| 1101 | struct fsdmidata dmi; | ||
| 1102 | |||
| 1103 | if (copy_from_user(&dmi, arg, sizeof(dmi))) | ||
| 1104 | return -XFS_ERROR(EFAULT); | ||
| 1105 | |||
| 1106 | error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, | ||
| 1107 | dmi.fsd_dmstate); | ||
| 1108 | return -error; | ||
| 1109 | } | ||
| 1110 | |||
| 1111 | case XFS_IOC_GETBMAP: | ||
| 1112 | case XFS_IOC_GETBMAPA: | ||
| 1113 | return xfs_ioc_getbmap(ip, ioflags, cmd, arg); | ||
| 1114 | |||
| 1115 | case XFS_IOC_GETBMAPX: | ||
| 1116 | return xfs_ioc_getbmapx(ip, arg); | ||
| 1117 | |||
| 1118 | case XFS_IOC_FD_TO_HANDLE: | ||
| 1119 | case XFS_IOC_PATH_TO_HANDLE: | ||
| 1120 | case XFS_IOC_PATH_TO_FSHANDLE: | ||
| 1121 | return xfs_find_handle(cmd, arg); | ||
| 1122 | |||
| 1123 | case XFS_IOC_OPEN_BY_HANDLE: | ||
| 1124 | return xfs_open_by_handle(mp, arg, filp, inode); | ||
| 1125 | |||
| 1126 | case XFS_IOC_FSSETDM_BY_HANDLE: | ||
| 1127 | return xfs_fssetdm_by_handle(mp, arg, inode); | ||
| 1128 | |||
| 1129 | case XFS_IOC_READLINK_BY_HANDLE: | ||
| 1130 | return xfs_readlink_by_handle(mp, arg, inode); | ||
| 1131 | |||
| 1132 | case XFS_IOC_ATTRLIST_BY_HANDLE: | ||
| 1133 | return xfs_attrlist_by_handle(mp, arg, inode); | ||
| 1134 | |||
| 1135 | case XFS_IOC_ATTRMULTI_BY_HANDLE: | ||
| 1136 | return xfs_attrmulti_by_handle(mp, arg, inode); | ||
| 1137 | |||
| 1138 | case XFS_IOC_SWAPEXT: { | ||
| 1139 | error = xfs_swapext((struct xfs_swapext __user *)arg); | ||
| 1140 | return -error; | ||
| 1141 | } | ||
| 1142 | |||
| 1143 | case XFS_IOC_FSCOUNTS: { | ||
| 1144 | xfs_fsop_counts_t out; | ||
| 1145 | |||
| 1146 | error = xfs_fs_counts(mp, &out); | ||
| 1147 | if (error) | ||
| 1148 | return -error; | ||
| 1149 | |||
| 1150 | if (copy_to_user(arg, &out, sizeof(out))) | ||
| 1151 | return -XFS_ERROR(EFAULT); | ||
| 1152 | return 0; | ||
| 1153 | } | ||
| 1154 | |||
| 1155 | case XFS_IOC_SET_RESBLKS: { | ||
| 1156 | xfs_fsop_resblks_t inout; | ||
| 1157 | __uint64_t in; | ||
| 1158 | |||
| 1159 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1160 | return -EPERM; | ||
| 1161 | |||
| 1162 | if (copy_from_user(&inout, arg, sizeof(inout))) | ||
| 1163 | return -XFS_ERROR(EFAULT); | ||
| 1164 | |||
| 1165 | /* input parameter is passed in resblks field of structure */ | ||
| 1166 | in = inout.resblks; | ||
| 1167 | error = xfs_reserve_blocks(mp, &in, &inout); | ||
| 1168 | if (error) | ||
| 1169 | return -error; | ||
| 1170 | |||
| 1171 | if (copy_to_user(arg, &inout, sizeof(inout))) | ||
| 1172 | return -XFS_ERROR(EFAULT); | ||
| 1173 | return 0; | ||
| 1174 | } | ||
| 1175 | |||
| 1176 | case XFS_IOC_GET_RESBLKS: { | ||
| 1177 | xfs_fsop_resblks_t out; | ||
| 1178 | |||
| 1179 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1180 | return -EPERM; | ||
| 1181 | |||
| 1182 | error = xfs_reserve_blocks(mp, NULL, &out); | ||
| 1183 | if (error) | ||
| 1184 | return -error; | ||
| 1185 | |||
| 1186 | if (copy_to_user(arg, &out, sizeof(out))) | ||
| 1187 | return -XFS_ERROR(EFAULT); | ||
| 1188 | |||
| 1189 | return 0; | ||
| 1190 | } | ||
| 1191 | |||
| 1192 | case XFS_IOC_FSGROWFSDATA: { | ||
| 1193 | xfs_growfs_data_t in; | ||
| 1194 | |||
| 1195 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1196 | return -EPERM; | ||
| 1197 | |||
| 1198 | if (copy_from_user(&in, arg, sizeof(in))) | ||
| 1199 | return -XFS_ERROR(EFAULT); | ||
| 1200 | |||
| 1201 | error = xfs_growfs_data(mp, &in); | ||
| 1202 | return -error; | ||
| 1203 | } | ||
| 1204 | |||
| 1205 | case XFS_IOC_FSGROWFSLOG: { | ||
| 1206 | xfs_growfs_log_t in; | ||
| 1207 | |||
| 1208 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1209 | return -EPERM; | ||
| 1210 | |||
| 1211 | if (copy_from_user(&in, arg, sizeof(in))) | ||
| 1212 | return -XFS_ERROR(EFAULT); | ||
| 1213 | |||
| 1214 | error = xfs_growfs_log(mp, &in); | ||
| 1215 | return -error; | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | case XFS_IOC_FSGROWFSRT: { | ||
| 1219 | xfs_growfs_rt_t in; | ||
| 1220 | |||
| 1221 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1222 | return -EPERM; | ||
| 1223 | |||
| 1224 | if (copy_from_user(&in, arg, sizeof(in))) | ||
| 1225 | return -XFS_ERROR(EFAULT); | ||
| 1226 | |||
| 1227 | error = xfs_growfs_rt(mp, &in); | ||
| 1228 | return -error; | ||
| 1229 | } | ||
| 1230 | |||
| 1231 | case XFS_IOC_FREEZE: | ||
| 1232 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1233 | return -EPERM; | ||
| 1234 | |||
| 1235 | if (inode->i_sb->s_frozen == SB_UNFROZEN) | ||
| 1236 | freeze_bdev(inode->i_sb->s_bdev); | ||
| 1237 | return 0; | ||
| 1238 | |||
| 1239 | case XFS_IOC_THAW: | ||
| 1240 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1241 | return -EPERM; | ||
| 1242 | if (inode->i_sb->s_frozen != SB_UNFROZEN) | ||
| 1243 | thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); | ||
| 1244 | return 0; | ||
| 1245 | |||
| 1246 | case XFS_IOC_GOINGDOWN: { | ||
| 1247 | __uint32_t in; | ||
| 1248 | |||
| 1249 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1250 | return -EPERM; | ||
| 1251 | |||
| 1252 | if (get_user(in, (__uint32_t __user *)arg)) | ||
| 1253 | return -XFS_ERROR(EFAULT); | ||
| 1254 | |||
| 1255 | error = xfs_fs_goingdown(mp, in); | ||
| 1256 | return -error; | ||
| 1257 | } | ||
| 1258 | |||
| 1259 | case XFS_IOC_ERROR_INJECTION: { | ||
| 1260 | xfs_error_injection_t in; | ||
| 1261 | |||
| 1262 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1263 | return -EPERM; | ||
| 1264 | |||
| 1265 | if (copy_from_user(&in, arg, sizeof(in))) | ||
| 1266 | return -XFS_ERROR(EFAULT); | ||
| 1267 | |||
| 1268 | error = xfs_errortag_add(in.errtag, mp); | ||
| 1269 | return -error; | ||
| 1270 | } | ||
| 1271 | |||
| 1272 | case XFS_IOC_ERROR_CLEARALL: | ||
| 1273 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1274 | return -EPERM; | ||
| 1275 | |||
| 1276 | error = xfs_errortag_clearall(mp, 1); | ||
| 1277 | return -error; | ||
| 1278 | |||
| 1279 | default: | ||
| 1280 | return -ENOTTY; | ||
| 1281 | } | ||
| 1282 | } | ||
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index cc4abd3daa49..0c958cf77758 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c | |||
| @@ -62,12 +62,11 @@ void | |||
| 62 | xfs_synchronize_atime( | 62 | xfs_synchronize_atime( |
| 63 | xfs_inode_t *ip) | 63 | xfs_inode_t *ip) |
| 64 | { | 64 | { |
| 65 | bhv_vnode_t *vp; | 65 | struct inode *inode = ip->i_vnode; |
| 66 | 66 | ||
| 67 | vp = XFS_ITOV_NULL(ip); | 67 | if (inode) { |
| 68 | if (vp) { | 68 | ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; |
| 69 | ip->i_d.di_atime.t_sec = (__int32_t)vp->i_atime.tv_sec; | 69 | ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; |
| 70 | ip->i_d.di_atime.t_nsec = (__int32_t)vp->i_atime.tv_nsec; | ||
| 71 | } | 70 | } |
| 72 | } | 71 | } |
| 73 | 72 | ||
| @@ -80,11 +79,10 @@ void | |||
| 80 | xfs_mark_inode_dirty_sync( | 79 | xfs_mark_inode_dirty_sync( |
| 81 | xfs_inode_t *ip) | 80 | xfs_inode_t *ip) |
| 82 | { | 81 | { |
| 83 | bhv_vnode_t *vp; | 82 | struct inode *inode = ip->i_vnode; |
| 84 | 83 | ||
| 85 | vp = XFS_ITOV_NULL(ip); | 84 | if (inode) |
| 86 | if (vp) | 85 | mark_inode_dirty_sync(inode); |
| 87 | mark_inode_dirty_sync(vn_to_inode(vp)); | ||
| 88 | } | 86 | } |
| 89 | 87 | ||
| 90 | /* | 88 | /* |
| @@ -215,66 +213,62 @@ xfs_validate_fields( | |||
| 215 | */ | 213 | */ |
| 216 | STATIC int | 214 | STATIC int |
| 217 | xfs_init_security( | 215 | xfs_init_security( |
| 218 | bhv_vnode_t *vp, | 216 | struct inode *inode, |
| 219 | struct inode *dir) | 217 | struct inode *dir) |
| 220 | { | 218 | { |
| 221 | struct inode *ip = vn_to_inode(vp); | 219 | struct xfs_inode *ip = XFS_I(inode); |
| 222 | size_t length; | 220 | size_t length; |
| 223 | void *value; | 221 | void *value; |
| 224 | char *name; | 222 | char *name; |
| 225 | int error; | 223 | int error; |
| 226 | 224 | ||
| 227 | error = security_inode_init_security(ip, dir, &name, &value, &length); | 225 | error = security_inode_init_security(inode, dir, &name, |
| 226 | &value, &length); | ||
| 228 | if (error) { | 227 | if (error) { |
| 229 | if (error == -EOPNOTSUPP) | 228 | if (error == -EOPNOTSUPP) |
| 230 | return 0; | 229 | return 0; |
| 231 | return -error; | 230 | return -error; |
| 232 | } | 231 | } |
| 233 | 232 | ||
| 234 | error = xfs_attr_set(XFS_I(ip), name, value, | 233 | error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); |
| 235 | length, ATTR_SECURE); | ||
| 236 | if (!error) | 234 | if (!error) |
| 237 | xfs_iflags_set(XFS_I(ip), XFS_IMODIFIED); | 235 | xfs_iflags_set(ip, XFS_IMODIFIED); |
| 238 | 236 | ||
| 239 | kfree(name); | 237 | kfree(name); |
| 240 | kfree(value); | 238 | kfree(value); |
| 241 | return error; | 239 | return error; |
| 242 | } | 240 | } |
| 243 | 241 | ||
| 244 | /* | 242 | static void |
| 245 | * Determine whether a process has a valid fs_struct (kernel daemons | 243 | xfs_dentry_to_name( |
| 246 | * like knfsd don't have an fs_struct). | 244 | struct xfs_name *namep, |
| 247 | * | 245 | struct dentry *dentry) |
| 248 | * XXX(hch): nfsd is broken, better fix it instead. | ||
| 249 | */ | ||
| 250 | STATIC_INLINE int | ||
| 251 | xfs_has_fs_struct(struct task_struct *task) | ||
| 252 | { | 246 | { |
| 253 | return (task->fs != init_task.fs); | 247 | namep->name = dentry->d_name.name; |
| 248 | namep->len = dentry->d_name.len; | ||
| 254 | } | 249 | } |
| 255 | 250 | ||
| 256 | STATIC void | 251 | STATIC void |
| 257 | xfs_cleanup_inode( | 252 | xfs_cleanup_inode( |
| 258 | struct inode *dir, | 253 | struct inode *dir, |
| 259 | bhv_vnode_t *vp, | 254 | struct inode *inode, |
| 260 | struct dentry *dentry, | 255 | struct dentry *dentry, |
| 261 | int mode) | 256 | int mode) |
| 262 | { | 257 | { |
| 263 | struct dentry teardown = {}; | 258 | struct xfs_name teardown; |
| 264 | 259 | ||
| 265 | /* Oh, the horror. | 260 | /* Oh, the horror. |
| 266 | * If we can't add the ACL or we fail in | 261 | * If we can't add the ACL or we fail in |
| 267 | * xfs_init_security we must back out. | 262 | * xfs_init_security we must back out. |
| 268 | * ENOSPC can hit here, among other things. | 263 | * ENOSPC can hit here, among other things. |
| 269 | */ | 264 | */ |
| 270 | teardown.d_inode = vn_to_inode(vp); | 265 | xfs_dentry_to_name(&teardown, dentry); |
| 271 | teardown.d_name = dentry->d_name; | ||
| 272 | 266 | ||
| 273 | if (S_ISDIR(mode)) | 267 | if (S_ISDIR(mode)) |
| 274 | xfs_rmdir(XFS_I(dir), &teardown); | 268 | xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode)); |
| 275 | else | 269 | else |
| 276 | xfs_remove(XFS_I(dir), &teardown); | 270 | xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); |
| 277 | VN_RELE(vp); | 271 | iput(inode); |
| 278 | } | 272 | } |
| 279 | 273 | ||
| 280 | STATIC int | 274 | STATIC int |
| @@ -284,9 +278,10 @@ xfs_vn_mknod( | |||
| 284 | int mode, | 278 | int mode, |
| 285 | dev_t rdev) | 279 | dev_t rdev) |
| 286 | { | 280 | { |
| 287 | struct inode *ip; | 281 | struct inode *inode; |
| 288 | bhv_vnode_t *vp = NULL, *dvp = vn_from_inode(dir); | 282 | struct xfs_inode *ip = NULL; |
| 289 | xfs_acl_t *default_acl = NULL; | 283 | xfs_acl_t *default_acl = NULL; |
| 284 | struct xfs_name name; | ||
| 290 | attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; | 285 | attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; |
| 291 | int error; | 286 | int error; |
| 292 | 287 | ||
| @@ -297,59 +292,67 @@ xfs_vn_mknod( | |||
| 297 | if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) | 292 | if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) |
| 298 | return -EINVAL; | 293 | return -EINVAL; |
| 299 | 294 | ||
| 300 | if (unlikely(test_default_acl && test_default_acl(dvp))) { | 295 | if (test_default_acl && test_default_acl(dir)) { |
| 301 | if (!_ACL_ALLOC(default_acl)) { | 296 | if (!_ACL_ALLOC(default_acl)) { |
| 302 | return -ENOMEM; | 297 | return -ENOMEM; |
| 303 | } | 298 | } |
| 304 | if (!_ACL_GET_DEFAULT(dvp, default_acl)) { | 299 | if (!_ACL_GET_DEFAULT(dir, default_acl)) { |
| 305 | _ACL_FREE(default_acl); | 300 | _ACL_FREE(default_acl); |
| 306 | default_acl = NULL; | 301 | default_acl = NULL; |
| 307 | } | 302 | } |
| 308 | } | 303 | } |
| 309 | 304 | ||
| 310 | if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current)) | 305 | xfs_dentry_to_name(&name, dentry); |
| 306 | |||
| 307 | if (IS_POSIXACL(dir) && !default_acl) | ||
| 311 | mode &= ~current->fs->umask; | 308 | mode &= ~current->fs->umask; |
| 312 | 309 | ||
| 313 | switch (mode & S_IFMT) { | 310 | switch (mode & S_IFMT) { |
| 314 | case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: | 311 | case S_IFCHR: |
| 312 | case S_IFBLK: | ||
| 313 | case S_IFIFO: | ||
| 314 | case S_IFSOCK: | ||
| 315 | rdev = sysv_encode_dev(rdev); | 315 | rdev = sysv_encode_dev(rdev); |
| 316 | case S_IFREG: | 316 | case S_IFREG: |
| 317 | error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL); | 317 | error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); |
| 318 | break; | 318 | break; |
| 319 | case S_IFDIR: | 319 | case S_IFDIR: |
| 320 | error = xfs_mkdir(XFS_I(dir), dentry, mode, &vp, NULL); | 320 | error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL); |
| 321 | break; | 321 | break; |
| 322 | default: | 322 | default: |
| 323 | error = EINVAL; | 323 | error = EINVAL; |
| 324 | break; | 324 | break; |
| 325 | } | 325 | } |
| 326 | 326 | ||
| 327 | if (unlikely(!error)) { | 327 | if (unlikely(error)) |
| 328 | error = xfs_init_security(vp, dir); | 328 | goto out_free_acl; |
| 329 | if (error) | ||
| 330 | xfs_cleanup_inode(dir, vp, dentry, mode); | ||
| 331 | } | ||
| 332 | 329 | ||
| 333 | if (unlikely(default_acl)) { | 330 | inode = ip->i_vnode; |
| 334 | if (!error) { | 331 | |
| 335 | error = _ACL_INHERIT(vp, mode, default_acl); | 332 | error = xfs_init_security(inode, dir); |
| 336 | if (!error) | 333 | if (unlikely(error)) |
| 337 | xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED); | 334 | goto out_cleanup_inode; |
| 338 | else | 335 | |
| 339 | xfs_cleanup_inode(dir, vp, dentry, mode); | 336 | if (default_acl) { |
| 340 | } | 337 | error = _ACL_INHERIT(inode, mode, default_acl); |
| 338 | if (unlikely(error)) | ||
| 339 | goto out_cleanup_inode; | ||
| 340 | xfs_iflags_set(ip, XFS_IMODIFIED); | ||
| 341 | _ACL_FREE(default_acl); | 341 | _ACL_FREE(default_acl); |
| 342 | } | 342 | } |
| 343 | 343 | ||
| 344 | if (likely(!error)) { | ||
| 345 | ASSERT(vp); | ||
| 346 | ip = vn_to_inode(vp); | ||
| 347 | 344 | ||
| 348 | if (S_ISDIR(mode)) | 345 | if (S_ISDIR(mode)) |
| 349 | xfs_validate_fields(ip); | 346 | xfs_validate_fields(inode); |
| 350 | d_instantiate(dentry, ip); | 347 | d_instantiate(dentry, inode); |
| 351 | xfs_validate_fields(dir); | 348 | xfs_validate_fields(dir); |
| 352 | } | 349 | return -error; |
| 350 | |||
| 351 | out_cleanup_inode: | ||
| 352 | xfs_cleanup_inode(dir, inode, dentry, mode); | ||
| 353 | out_free_acl: | ||
| 354 | if (default_acl) | ||
| 355 | _ACL_FREE(default_acl); | ||
| 353 | return -error; | 356 | return -error; |
| 354 | } | 357 | } |
| 355 | 358 | ||
| @@ -378,13 +381,15 @@ xfs_vn_lookup( | |||
| 378 | struct dentry *dentry, | 381 | struct dentry *dentry, |
| 379 | struct nameidata *nd) | 382 | struct nameidata *nd) |
| 380 | { | 383 | { |
| 381 | bhv_vnode_t *cvp; | 384 | struct xfs_inode *cip; |
| 385 | struct xfs_name name; | ||
| 382 | int error; | 386 | int error; |
| 383 | 387 | ||
| 384 | if (dentry->d_name.len >= MAXNAMELEN) | 388 | if (dentry->d_name.len >= MAXNAMELEN) |
| 385 | return ERR_PTR(-ENAMETOOLONG); | 389 | return ERR_PTR(-ENAMETOOLONG); |
| 386 | 390 | ||
| 387 | error = xfs_lookup(XFS_I(dir), dentry, &cvp); | 391 | xfs_dentry_to_name(&name, dentry); |
| 392 | error = xfs_lookup(XFS_I(dir), &name, &cip); | ||
| 388 | if (unlikely(error)) { | 393 | if (unlikely(error)) { |
| 389 | if (unlikely(error != ENOENT)) | 394 | if (unlikely(error != ENOENT)) |
| 390 | return ERR_PTR(-error); | 395 | return ERR_PTR(-error); |
| @@ -392,7 +397,7 @@ xfs_vn_lookup( | |||
| 392 | return NULL; | 397 | return NULL; |
| 393 | } | 398 | } |
| 394 | 399 | ||
| 395 | return d_splice_alias(vn_to_inode(cvp), dentry); | 400 | return d_splice_alias(cip->i_vnode, dentry); |
| 396 | } | 401 | } |
| 397 | 402 | ||
| 398 | STATIC int | 403 | STATIC int |
| @@ -401,23 +406,24 @@ xfs_vn_link( | |||
| 401 | struct inode *dir, | 406 | struct inode *dir, |
| 402 | struct dentry *dentry) | 407 | struct dentry *dentry) |
| 403 | { | 408 | { |
| 404 | struct inode *ip; /* inode of guy being linked to */ | 409 | struct inode *inode; /* inode of guy being linked to */ |
| 405 | bhv_vnode_t *vp; /* vp of name being linked */ | 410 | struct xfs_name name; |
| 406 | int error; | 411 | int error; |
| 407 | 412 | ||
| 408 | ip = old_dentry->d_inode; /* inode being linked to */ | 413 | inode = old_dentry->d_inode; |
| 409 | vp = vn_from_inode(ip); | 414 | xfs_dentry_to_name(&name, dentry); |
| 410 | 415 | ||
| 411 | VN_HOLD(vp); | 416 | igrab(inode); |
| 412 | error = xfs_link(XFS_I(dir), vp, dentry); | 417 | error = xfs_link(XFS_I(dir), XFS_I(inode), &name); |
| 413 | if (unlikely(error)) { | 418 | if (unlikely(error)) { |
| 414 | VN_RELE(vp); | 419 | iput(inode); |
| 415 | } else { | 420 | return -error; |
| 416 | xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); | ||
| 417 | xfs_validate_fields(ip); | ||
| 418 | d_instantiate(dentry, ip); | ||
| 419 | } | 421 | } |
| 420 | return -error; | 422 | |
| 423 | xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); | ||
| 424 | xfs_validate_fields(inode); | ||
| 425 | d_instantiate(dentry, inode); | ||
| 426 | return 0; | ||
| 421 | } | 427 | } |
| 422 | 428 | ||
| 423 | STATIC int | 429 | STATIC int |
| @@ -426,11 +432,13 @@ xfs_vn_unlink( | |||
| 426 | struct dentry *dentry) | 432 | struct dentry *dentry) |
| 427 | { | 433 | { |
| 428 | struct inode *inode; | 434 | struct inode *inode; |
| 435 | struct xfs_name name; | ||
| 429 | int error; | 436 | int error; |
| 430 | 437 | ||
| 431 | inode = dentry->d_inode; | 438 | inode = dentry->d_inode; |
| 439 | xfs_dentry_to_name(&name, dentry); | ||
| 432 | 440 | ||
| 433 | error = xfs_remove(XFS_I(dir), dentry); | 441 | error = xfs_remove(XFS_I(dir), &name, XFS_I(inode)); |
| 434 | if (likely(!error)) { | 442 | if (likely(!error)) { |
| 435 | xfs_validate_fields(dir); /* size needs update */ | 443 | xfs_validate_fields(dir); /* size needs update */ |
| 436 | xfs_validate_fields(inode); | 444 | xfs_validate_fields(inode); |
| @@ -444,29 +452,34 @@ xfs_vn_symlink( | |||
| 444 | struct dentry *dentry, | 452 | struct dentry *dentry, |
| 445 | const char *symname) | 453 | const char *symname) |
| 446 | { | 454 | { |
| 447 | struct inode *ip; | 455 | struct inode *inode; |
| 448 | bhv_vnode_t *cvp; /* used to lookup symlink to put in dentry */ | 456 | struct xfs_inode *cip = NULL; |
| 457 | struct xfs_name name; | ||
| 449 | int error; | 458 | int error; |
| 450 | mode_t mode; | 459 | mode_t mode; |
| 451 | 460 | ||
| 452 | cvp = NULL; | ||
| 453 | |||
| 454 | mode = S_IFLNK | | 461 | mode = S_IFLNK | |
| 455 | (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); | 462 | (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); |
| 463 | xfs_dentry_to_name(&name, dentry); | ||
| 456 | 464 | ||
| 457 | error = xfs_symlink(XFS_I(dir), dentry, (char *)symname, mode, | 465 | error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); |
| 458 | &cvp, NULL); | 466 | if (unlikely(error)) |
| 459 | if (likely(!error && cvp)) { | 467 | goto out; |
| 460 | error = xfs_init_security(cvp, dir); | 468 | |
| 461 | if (likely(!error)) { | 469 | inode = cip->i_vnode; |
| 462 | ip = vn_to_inode(cvp); | 470 | |
| 463 | d_instantiate(dentry, ip); | 471 | error = xfs_init_security(inode, dir); |
| 464 | xfs_validate_fields(dir); | 472 | if (unlikely(error)) |
| 465 | xfs_validate_fields(ip); | 473 | goto out_cleanup_inode; |
| 466 | } else { | 474 | |
| 467 | xfs_cleanup_inode(dir, cvp, dentry, 0); | 475 | d_instantiate(dentry, inode); |
| 468 | } | 476 | xfs_validate_fields(dir); |
| 469 | } | 477 | xfs_validate_fields(inode); |
| 478 | return 0; | ||
| 479 | |||
| 480 | out_cleanup_inode: | ||
| 481 | xfs_cleanup_inode(dir, inode, dentry, 0); | ||
| 482 | out: | ||
| 470 | return -error; | 483 | return -error; |
| 471 | } | 484 | } |
| 472 | 485 | ||
| @@ -476,9 +489,12 @@ xfs_vn_rmdir( | |||
| 476 | struct dentry *dentry) | 489 | struct dentry *dentry) |
| 477 | { | 490 | { |
| 478 | struct inode *inode = dentry->d_inode; | 491 | struct inode *inode = dentry->d_inode; |
| 492 | struct xfs_name name; | ||
| 479 | int error; | 493 | int error; |
| 480 | 494 | ||
| 481 | error = xfs_rmdir(XFS_I(dir), dentry); | 495 | xfs_dentry_to_name(&name, dentry); |
| 496 | |||
| 497 | error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode)); | ||
| 482 | if (likely(!error)) { | 498 | if (likely(!error)) { |
| 483 | xfs_validate_fields(inode); | 499 | xfs_validate_fields(inode); |
| 484 | xfs_validate_fields(dir); | 500 | xfs_validate_fields(dir); |
| @@ -494,12 +510,15 @@ xfs_vn_rename( | |||
| 494 | struct dentry *ndentry) | 510 | struct dentry *ndentry) |
| 495 | { | 511 | { |
| 496 | struct inode *new_inode = ndentry->d_inode; | 512 | struct inode *new_inode = ndentry->d_inode; |
| 497 | bhv_vnode_t *tvp; /* target directory */ | 513 | struct xfs_name oname; |
| 514 | struct xfs_name nname; | ||
| 498 | int error; | 515 | int error; |
| 499 | 516 | ||
| 500 | tvp = vn_from_inode(ndir); | 517 | xfs_dentry_to_name(&oname, odentry); |
| 518 | xfs_dentry_to_name(&nname, ndentry); | ||
| 501 | 519 | ||
| 502 | error = xfs_rename(XFS_I(odir), odentry, tvp, ndentry); | 520 | error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), |
| 521 | XFS_I(ndir), &nname); | ||
| 503 | if (likely(!error)) { | 522 | if (likely(!error)) { |
| 504 | if (new_inode) | 523 | if (new_inode) |
| 505 | xfs_validate_fields(new_inode); | 524 | xfs_validate_fields(new_inode); |
| @@ -700,11 +719,19 @@ xfs_vn_setattr( | |||
| 700 | return -error; | 719 | return -error; |
| 701 | } | 720 | } |
| 702 | 721 | ||
| 722 | /* | ||
| 723 | * block_truncate_page can return an error, but we can't propagate it | ||
| 724 | * at all here. Leave a complaint + stack trace in the syslog because | ||
| 725 | * this could be bad. If it is bad, we need to propagate the error further. | ||
| 726 | */ | ||
| 703 | STATIC void | 727 | STATIC void |
| 704 | xfs_vn_truncate( | 728 | xfs_vn_truncate( |
| 705 | struct inode *inode) | 729 | struct inode *inode) |
| 706 | { | 730 | { |
| 707 | block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks); | 731 | int error; |
| 732 | error = block_truncate_page(inode->i_mapping, inode->i_size, | ||
| 733 | xfs_get_blocks); | ||
| 734 | WARN_ON(error); | ||
| 708 | } | 735 | } |
| 709 | 736 | ||
| 710 | STATIC int | 737 | STATIC int |
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 3ca39c4e5d2a..e5143323e71f 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h | |||
| @@ -99,7 +99,6 @@ | |||
| 99 | /* | 99 | /* |
| 100 | * Feature macros (disable/enable) | 100 | * Feature macros (disable/enable) |
| 101 | */ | 101 | */ |
| 102 | #undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ | ||
| 103 | #define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ | 102 | #define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ |
| 104 | #ifdef CONFIG_SMP | 103 | #ifdef CONFIG_SMP |
| 105 | #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ | 104 | #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ |
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 166353388490..21c0dbc74093 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c | |||
| @@ -176,7 +176,6 @@ xfs_read( | |||
| 176 | { | 176 | { |
| 177 | struct file *file = iocb->ki_filp; | 177 | struct file *file = iocb->ki_filp; |
| 178 | struct inode *inode = file->f_mapping->host; | 178 | struct inode *inode = file->f_mapping->host; |
| 179 | bhv_vnode_t *vp = XFS_ITOV(ip); | ||
| 180 | xfs_mount_t *mp = ip->i_mount; | 179 | xfs_mount_t *mp = ip->i_mount; |
| 181 | size_t size = 0; | 180 | size_t size = 0; |
| 182 | ssize_t ret = 0; | 181 | ssize_t ret = 0; |
| @@ -228,11 +227,11 @@ xfs_read( | |||
| 228 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | 227 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
| 229 | 228 | ||
| 230 | if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { | 229 | if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { |
| 231 | bhv_vrwlock_t locktype = VRWLOCK_READ; | ||
| 232 | int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); | 230 | int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); |
| 231 | int iolock = XFS_IOLOCK_SHARED; | ||
| 233 | 232 | ||
| 234 | ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *offset, size, | 233 | ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, |
| 235 | dmflags, &locktype); | 234 | dmflags, &iolock); |
| 236 | if (ret) { | 235 | if (ret) { |
| 237 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 236 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
| 238 | if (unlikely(ioflags & IO_ISDIRECT)) | 237 | if (unlikely(ioflags & IO_ISDIRECT)) |
| @@ -242,7 +241,7 @@ xfs_read( | |||
| 242 | } | 241 | } |
| 243 | 242 | ||
| 244 | if (unlikely(ioflags & IO_ISDIRECT)) { | 243 | if (unlikely(ioflags & IO_ISDIRECT)) { |
| 245 | if (VN_CACHED(vp)) | 244 | if (inode->i_mapping->nrpages) |
| 246 | ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), | 245 | ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), |
| 247 | -1, FI_REMAPF_LOCKED); | 246 | -1, FI_REMAPF_LOCKED); |
| 248 | mutex_unlock(&inode->i_mutex); | 247 | mutex_unlock(&inode->i_mutex); |
| @@ -276,7 +275,6 @@ xfs_splice_read( | |||
| 276 | int flags, | 275 | int flags, |
| 277 | int ioflags) | 276 | int ioflags) |
| 278 | { | 277 | { |
| 279 | bhv_vnode_t *vp = XFS_ITOV(ip); | ||
| 280 | xfs_mount_t *mp = ip->i_mount; | 278 | xfs_mount_t *mp = ip->i_mount; |
| 281 | ssize_t ret; | 279 | ssize_t ret; |
| 282 | 280 | ||
| @@ -287,11 +285,11 @@ xfs_splice_read( | |||
| 287 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | 285 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
| 288 | 286 | ||
| 289 | if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { | 287 | if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { |
| 290 | bhv_vrwlock_t locktype = VRWLOCK_READ; | 288 | int iolock = XFS_IOLOCK_SHARED; |
| 291 | int error; | 289 | int error; |
| 292 | 290 | ||
| 293 | error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *ppos, count, | 291 | error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, |
| 294 | FILP_DELAY_FLAG(infilp), &locktype); | 292 | FILP_DELAY_FLAG(infilp), &iolock); |
| 295 | if (error) { | 293 | if (error) { |
| 296 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 294 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
| 297 | return -error; | 295 | return -error; |
| @@ -317,7 +315,6 @@ xfs_splice_write( | |||
| 317 | int flags, | 315 | int flags, |
| 318 | int ioflags) | 316 | int ioflags) |
| 319 | { | 317 | { |
| 320 | bhv_vnode_t *vp = XFS_ITOV(ip); | ||
| 321 | xfs_mount_t *mp = ip->i_mount; | 318 | xfs_mount_t *mp = ip->i_mount; |
| 322 | ssize_t ret; | 319 | ssize_t ret; |
| 323 | struct inode *inode = outfilp->f_mapping->host; | 320 | struct inode *inode = outfilp->f_mapping->host; |
| @@ -330,11 +327,11 @@ xfs_splice_write( | |||
| 330 | xfs_ilock(ip, XFS_IOLOCK_EXCL); | 327 | xfs_ilock(ip, XFS_IOLOCK_EXCL); |
| 331 | 328 | ||
| 332 | if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { | 329 | if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { |
| 333 | bhv_vrwlock_t locktype = VRWLOCK_WRITE; | 330 | int iolock = XFS_IOLOCK_EXCL; |
| 334 | int error; | 331 | int error; |
| 335 | 332 | ||
| 336 | error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, *ppos, count, | 333 | error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, |
| 337 | FILP_DELAY_FLAG(outfilp), &locktype); | 334 | FILP_DELAY_FLAG(outfilp), &iolock); |
| 338 | if (error) { | 335 | if (error) { |
| 339 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | 336 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
| 340 | return -error; | 337 | return -error; |
| @@ -573,14 +570,12 @@ xfs_write( | |||
| 573 | struct file *file = iocb->ki_filp; | 570 | struct file *file = iocb->ki_filp; |
| 574 | struct address_space *mapping = file->f_mapping; | 571 | struct address_space *mapping = file->f_mapping; |
| 575 | struct inode *inode = mapping->host; | 572 | struct inode *inode = mapping->host; |
| 576 | bhv_vnode_t *vp = XFS_ITOV(xip); | ||
| 577 | unsigned long segs = nsegs; | 573 | unsigned long segs = nsegs; |
| 578 | xfs_mount_t *mp; | 574 | xfs_mount_t *mp; |
| 579 | ssize_t ret = 0, error = 0; | 575 | ssize_t ret = 0, error = 0; |
| 580 | xfs_fsize_t isize, new_size; | 576 | xfs_fsize_t isize, new_size; |
| 581 | int iolock; | 577 | int iolock; |
| 582 | int eventsent = 0; | 578 | int eventsent = 0; |
| 583 | bhv_vrwlock_t locktype; | ||
| 584 | size_t ocount = 0, count; | 579 | size_t ocount = 0, count; |
| 585 | loff_t pos; | 580 | loff_t pos; |
| 586 | int need_i_mutex; | 581 | int need_i_mutex; |
| @@ -607,11 +602,9 @@ xfs_write( | |||
| 607 | relock: | 602 | relock: |
| 608 | if (ioflags & IO_ISDIRECT) { | 603 | if (ioflags & IO_ISDIRECT) { |
| 609 | iolock = XFS_IOLOCK_SHARED; | 604 | iolock = XFS_IOLOCK_SHARED; |
| 610 | locktype = VRWLOCK_WRITE_DIRECT; | ||
| 611 | need_i_mutex = 0; | 605 | need_i_mutex = 0; |
| 612 | } else { | 606 | } else { |
| 613 | iolock = XFS_IOLOCK_EXCL; | 607 | iolock = XFS_IOLOCK_EXCL; |
| 614 | locktype = VRWLOCK_WRITE; | ||
| 615 | need_i_mutex = 1; | 608 | need_i_mutex = 1; |
| 616 | mutex_lock(&inode->i_mutex); | 609 | mutex_lock(&inode->i_mutex); |
| 617 | } | 610 | } |
| @@ -634,9 +627,8 @@ start: | |||
| 634 | dmflags |= DM_FLAGS_IMUX; | 627 | dmflags |= DM_FLAGS_IMUX; |
| 635 | 628 | ||
| 636 | xfs_iunlock(xip, XFS_ILOCK_EXCL); | 629 | xfs_iunlock(xip, XFS_ILOCK_EXCL); |
| 637 | error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, | 630 | error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, |
| 638 | pos, count, | 631 | pos, count, dmflags, &iolock); |
| 639 | dmflags, &locktype); | ||
| 640 | if (error) { | 632 | if (error) { |
| 641 | goto out_unlock_internal; | 633 | goto out_unlock_internal; |
| 642 | } | 634 | } |
| @@ -664,10 +656,9 @@ start: | |||
| 664 | return XFS_ERROR(-EINVAL); | 656 | return XFS_ERROR(-EINVAL); |
| 665 | } | 657 | } |
| 666 | 658 | ||
| 667 | if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) { | 659 | if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { |
| 668 | xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); | 660 | xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); |
| 669 | iolock = XFS_IOLOCK_EXCL; | 661 | iolock = XFS_IOLOCK_EXCL; |
| 670 | locktype = VRWLOCK_WRITE; | ||
| 671 | need_i_mutex = 1; | 662 | need_i_mutex = 1; |
| 672 | mutex_lock(&inode->i_mutex); | 663 | mutex_lock(&inode->i_mutex); |
| 673 | xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); | 664 | xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); |
| @@ -727,7 +718,7 @@ retry: | |||
| 727 | current->backing_dev_info = mapping->backing_dev_info; | 718 | current->backing_dev_info = mapping->backing_dev_info; |
| 728 | 719 | ||
| 729 | if ((ioflags & IO_ISDIRECT)) { | 720 | if ((ioflags & IO_ISDIRECT)) { |
| 730 | if (VN_CACHED(vp)) { | 721 | if (mapping->nrpages) { |
| 731 | WARN_ON(need_i_mutex == 0); | 722 | WARN_ON(need_i_mutex == 0); |
| 732 | xfs_inval_cached_trace(xip, pos, -1, | 723 | xfs_inval_cached_trace(xip, pos, -1, |
| 733 | (pos & PAGE_CACHE_MASK), -1); | 724 | (pos & PAGE_CACHE_MASK), -1); |
| @@ -744,7 +735,6 @@ retry: | |||
| 744 | mutex_unlock(&inode->i_mutex); | 735 | mutex_unlock(&inode->i_mutex); |
| 745 | 736 | ||
| 746 | iolock = XFS_IOLOCK_SHARED; | 737 | iolock = XFS_IOLOCK_SHARED; |
| 747 | locktype = VRWLOCK_WRITE_DIRECT; | ||
| 748 | need_i_mutex = 0; | 738 | need_i_mutex = 0; |
| 749 | } | 739 | } |
| 750 | 740 | ||
| @@ -781,15 +771,15 @@ retry: | |||
| 781 | 771 | ||
| 782 | if (ret == -ENOSPC && | 772 | if (ret == -ENOSPC && |
| 783 | DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { | 773 | DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { |
| 784 | xfs_rwunlock(xip, locktype); | 774 | xfs_iunlock(xip, iolock); |
| 785 | if (need_i_mutex) | 775 | if (need_i_mutex) |
| 786 | mutex_unlock(&inode->i_mutex); | 776 | mutex_unlock(&inode->i_mutex); |
| 787 | error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, | 777 | error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, |
| 788 | DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, | 778 | DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, |
| 789 | 0, 0, 0); /* Delay flag intentionally unused */ | 779 | 0, 0, 0); /* Delay flag intentionally unused */ |
| 790 | if (need_i_mutex) | 780 | if (need_i_mutex) |
| 791 | mutex_lock(&inode->i_mutex); | 781 | mutex_lock(&inode->i_mutex); |
| 792 | xfs_rwlock(xip, locktype); | 782 | xfs_ilock(xip, iolock); |
| 793 | if (error) | 783 | if (error) |
| 794 | goto out_unlock_internal; | 784 | goto out_unlock_internal; |
| 795 | pos = xip->i_size; | 785 | pos = xip->i_size; |
| @@ -817,7 +807,8 @@ retry: | |||
| 817 | /* Handle various SYNC-type writes */ | 807 | /* Handle various SYNC-type writes */ |
| 818 | if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { | 808 | if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { |
| 819 | int error2; | 809 | int error2; |
| 820 | xfs_rwunlock(xip, locktype); | 810 | |
| 811 | xfs_iunlock(xip, iolock); | ||
| 821 | if (need_i_mutex) | 812 | if (need_i_mutex) |
| 822 | mutex_unlock(&inode->i_mutex); | 813 | mutex_unlock(&inode->i_mutex); |
| 823 | error2 = sync_page_range(inode, mapping, pos, ret); | 814 | error2 = sync_page_range(inode, mapping, pos, ret); |
| @@ -825,7 +816,7 @@ retry: | |||
| 825 | error = error2; | 816 | error = error2; |
| 826 | if (need_i_mutex) | 817 | if (need_i_mutex) |
| 827 | mutex_lock(&inode->i_mutex); | 818 | mutex_lock(&inode->i_mutex); |
| 828 | xfs_rwlock(xip, locktype); | 819 | xfs_ilock(xip, iolock); |
| 829 | error2 = xfs_write_sync_logforce(mp, xip); | 820 | error2 = xfs_write_sync_logforce(mp, xip); |
| 830 | if (!error) | 821 | if (!error) |
| 831 | error = error2; | 822 | error = error2; |
| @@ -846,7 +837,7 @@ retry: | |||
| 846 | xip->i_d.di_size = xip->i_size; | 837 | xip->i_d.di_size = xip->i_size; |
| 847 | xfs_iunlock(xip, XFS_ILOCK_EXCL); | 838 | xfs_iunlock(xip, XFS_ILOCK_EXCL); |
| 848 | } | 839 | } |
| 849 | xfs_rwunlock(xip, locktype); | 840 | xfs_iunlock(xip, iolock); |
| 850 | out_unlock_mutex: | 841 | out_unlock_mutex: |
| 851 | if (need_i_mutex) | 842 | if (need_i_mutex) |
| 852 | mutex_unlock(&inode->i_mutex); | 843 | mutex_unlock(&inode->i_mutex); |
| @@ -884,28 +875,23 @@ xfs_bdstrat_cb(struct xfs_buf *bp) | |||
| 884 | } | 875 | } |
| 885 | 876 | ||
| 886 | /* | 877 | /* |
| 887 | * Wrapper around bdstrat so that we can stop data | 878 | * Wrapper around bdstrat so that we can stop data from going to disk in case |
| 888 | * from going to disk in case we are shutting down the filesystem. | 879 | * we are shutting down the filesystem. Typically user data goes thru this |
| 889 | * Typically user data goes thru this path; one of the exceptions | 880 | * path; one of the exceptions is the superblock. |
| 890 | * is the superblock. | ||
| 891 | */ | 881 | */ |
| 892 | int | 882 | void |
| 893 | xfsbdstrat( | 883 | xfsbdstrat( |
| 894 | struct xfs_mount *mp, | 884 | struct xfs_mount *mp, |
| 895 | struct xfs_buf *bp) | 885 | struct xfs_buf *bp) |
| 896 | { | 886 | { |
| 897 | ASSERT(mp); | 887 | ASSERT(mp); |
| 898 | if (!XFS_FORCED_SHUTDOWN(mp)) { | 888 | if (!XFS_FORCED_SHUTDOWN(mp)) { |
| 899 | /* Grio redirection would go here | ||
| 900 | * if (XFS_BUF_IS_GRIO(bp)) { | ||
| 901 | */ | ||
| 902 | |||
| 903 | xfs_buf_iorequest(bp); | 889 | xfs_buf_iorequest(bp); |
| 904 | return 0; | 890 | return; |
| 905 | } | 891 | } |
| 906 | 892 | ||
| 907 | xfs_buftrace("XFSBDSTRAT IOERROR", bp); | 893 | xfs_buftrace("XFSBDSTRAT IOERROR", bp); |
| 908 | return (xfs_bioerror_relse(bp)); | 894 | xfs_bioerror_relse(bp); |
| 909 | } | 895 | } |
| 910 | 896 | ||
| 911 | /* | 897 | /* |
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index e200253139cf..e1d498b4ba7a 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h | |||
| @@ -68,7 +68,8 @@ extern void xfs_inval_cached_trace(struct xfs_inode *, | |||
| 68 | #define xfs_inval_cached_trace(ip, offset, len, first, last) | 68 | #define xfs_inval_cached_trace(ip, offset, len, first, last) |
| 69 | #endif | 69 | #endif |
| 70 | 70 | ||
| 71 | extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); | 71 | /* errors from xfsbdstrat() must be extracted from the buffer */ |
| 72 | extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); | ||
| 72 | extern int xfs_bdstrat_cb(struct xfs_buf *); | 73 | extern int xfs_bdstrat_cb(struct xfs_buf *); |
| 73 | extern int xfs_dev_is_read_only(struct xfs_mount *, char *); | 74 | extern int xfs_dev_is_read_only(struct xfs_mount *, char *); |
| 74 | 75 | ||
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h index 8ba7a2fa6c1d..afd0b0d5fdb2 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/linux-2.6/xfs_stats.h | |||
| @@ -144,8 +144,8 @@ extern void xfs_cleanup_procfs(void); | |||
| 144 | # define XFS_STATS_DEC(count) | 144 | # define XFS_STATS_DEC(count) |
| 145 | # define XFS_STATS_ADD(count, inc) | 145 | # define XFS_STATS_ADD(count, inc) |
| 146 | 146 | ||
| 147 | static __inline void xfs_init_procfs(void) { }; | 147 | static inline void xfs_init_procfs(void) { }; |
| 148 | static __inline void xfs_cleanup_procfs(void) { }; | 148 | static inline void xfs_cleanup_procfs(void) { }; |
| 149 | 149 | ||
| 150 | #endif /* !CONFIG_PROC_FS */ | 150 | #endif /* !CONFIG_PROC_FS */ |
| 151 | 151 | ||
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 8831d9518790..865eb708aa95 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
| @@ -896,7 +896,8 @@ xfs_fs_write_inode( | |||
| 896 | struct inode *inode, | 896 | struct inode *inode, |
| 897 | int sync) | 897 | int sync) |
| 898 | { | 898 | { |
| 899 | int error = 0, flags = FLUSH_INODE; | 899 | int error = 0; |
| 900 | int flags = 0; | ||
| 900 | 901 | ||
| 901 | xfs_itrace_entry(XFS_I(inode)); | 902 | xfs_itrace_entry(XFS_I(inode)); |
| 902 | if (sync) { | 903 | if (sync) { |
| @@ -934,7 +935,7 @@ xfs_fs_clear_inode( | |||
| 934 | xfs_inactive(ip); | 935 | xfs_inactive(ip); |
| 935 | xfs_iflags_clear(ip, XFS_IMODIFIED); | 936 | xfs_iflags_clear(ip, XFS_IMODIFIED); |
| 936 | if (xfs_reclaim(ip)) | 937 | if (xfs_reclaim(ip)) |
| 937 | panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, inode); | 938 | panic("%s: cannot reclaim 0x%p\n", __func__, inode); |
| 938 | } | 939 | } |
| 939 | 940 | ||
| 940 | ASSERT(XFS_I(inode) == NULL); | 941 | ASSERT(XFS_I(inode) == NULL); |
| @@ -1027,8 +1028,7 @@ xfs_sync_worker( | |||
| 1027 | int error; | 1028 | int error; |
| 1028 | 1029 | ||
| 1029 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) | 1030 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) |
| 1030 | error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR | | 1031 | error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR); |
| 1031 | SYNC_REFCACHE | SYNC_SUPER); | ||
| 1032 | mp->m_sync_seq++; | 1032 | mp->m_sync_seq++; |
| 1033 | wake_up(&mp->m_wait_single_sync_task); | 1033 | wake_up(&mp->m_wait_single_sync_task); |
| 1034 | } | 1034 | } |
| @@ -1306,7 +1306,7 @@ xfs_fs_fill_super( | |||
| 1306 | void *data, | 1306 | void *data, |
| 1307 | int silent) | 1307 | int silent) |
| 1308 | { | 1308 | { |
| 1309 | struct inode *rootvp; | 1309 | struct inode *root; |
| 1310 | struct xfs_mount *mp = NULL; | 1310 | struct xfs_mount *mp = NULL; |
| 1311 | struct xfs_mount_args *args = xfs_args_allocate(sb, silent); | 1311 | struct xfs_mount_args *args = xfs_args_allocate(sb, silent); |
| 1312 | int error; | 1312 | int error; |
| @@ -1344,19 +1344,18 @@ xfs_fs_fill_super( | |||
| 1344 | sb->s_time_gran = 1; | 1344 | sb->s_time_gran = 1; |
| 1345 | set_posix_acl_flag(sb); | 1345 | set_posix_acl_flag(sb); |
| 1346 | 1346 | ||
| 1347 | rootvp = igrab(mp->m_rootip->i_vnode); | 1347 | root = igrab(mp->m_rootip->i_vnode); |
| 1348 | if (!rootvp) { | 1348 | if (!root) { |
| 1349 | error = ENOENT; | 1349 | error = ENOENT; |
| 1350 | goto fail_unmount; | 1350 | goto fail_unmount; |
| 1351 | } | 1351 | } |
| 1352 | 1352 | if (is_bad_inode(root)) { | |
| 1353 | sb->s_root = d_alloc_root(vn_to_inode(rootvp)); | 1353 | error = EINVAL; |
| 1354 | if (!sb->s_root) { | ||
| 1355 | error = ENOMEM; | ||
| 1356 | goto fail_vnrele; | 1354 | goto fail_vnrele; |
| 1357 | } | 1355 | } |
| 1358 | if (is_bad_inode(sb->s_root->d_inode)) { | 1356 | sb->s_root = d_alloc_root(root); |
| 1359 | error = EINVAL; | 1357 | if (!sb->s_root) { |
| 1358 | error = ENOMEM; | ||
| 1360 | goto fail_vnrele; | 1359 | goto fail_vnrele; |
| 1361 | } | 1360 | } |
| 1362 | 1361 | ||
| @@ -1378,7 +1377,7 @@ fail_vnrele: | |||
| 1378 | dput(sb->s_root); | 1377 | dput(sb->s_root); |
| 1379 | sb->s_root = NULL; | 1378 | sb->s_root = NULL; |
| 1380 | } else { | 1379 | } else { |
| 1381 | VN_RELE(rootvp); | 1380 | iput(root); |
| 1382 | } | 1381 | } |
| 1383 | 1382 | ||
| 1384 | fail_unmount: | 1383 | fail_unmount: |
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 3efcf45b14ab..3efb7c6d3303 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h | |||
| @@ -50,13 +50,7 @@ extern void xfs_qm_exit(void); | |||
| 50 | # define set_posix_acl_flag(sb) do { } while (0) | 50 | # define set_posix_acl_flag(sb) do { } while (0) |
| 51 | #endif | 51 | #endif |
| 52 | 52 | ||
| 53 | #ifdef CONFIG_XFS_SECURITY | 53 | #define XFS_SECURITY_STRING "security attributes, " |
| 54 | # define XFS_SECURITY_STRING "security attributes, " | ||
| 55 | # define ENOSECURITY 0 | ||
| 56 | #else | ||
| 57 | # define XFS_SECURITY_STRING | ||
| 58 | # define ENOSECURITY EOPNOTSUPP | ||
| 59 | #endif | ||
| 60 | 54 | ||
| 61 | #ifdef CONFIG_XFS_RT | 55 | #ifdef CONFIG_XFS_RT |
| 62 | # define XFS_REALTIME_STRING "realtime, " | 56 | # define XFS_REALTIME_STRING "realtime, " |
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h index 4da03a4e3520..7e60c7776b1c 100644 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ b/fs/xfs/linux-2.6/xfs_vfs.h | |||
| @@ -49,7 +49,6 @@ typedef struct bhv_vfs_sync_work { | |||
| 49 | #define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ | 49 | #define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ |
| 50 | #define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ | 50 | #define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ |
| 51 | #define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ | 51 | #define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ |
| 52 | #define SYNC_SUPER 0x0200 /* flush superblock to disk */ | ||
| 53 | 52 | ||
| 54 | /* | 53 | /* |
| 55 | * When remounting a filesystem read-only or freezing the filesystem, | 54 | * When remounting a filesystem read-only or freezing the filesystem, |
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index b5ea418693b1..8b4d63ce8694 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h | |||
| @@ -23,8 +23,6 @@ struct bhv_vattr; | |||
| 23 | struct xfs_iomap; | 23 | struct xfs_iomap; |
| 24 | struct attrlist_cursor_kern; | 24 | struct attrlist_cursor_kern; |
| 25 | 25 | ||
| 26 | typedef struct dentry bhv_vname_t; | ||
| 27 | typedef __u64 bhv_vnumber_t; | ||
| 28 | typedef struct inode bhv_vnode_t; | 26 | typedef struct inode bhv_vnode_t; |
| 29 | 27 | ||
| 30 | #define VN_ISLNK(vp) S_ISLNK((vp)->i_mode) | 28 | #define VN_ISLNK(vp) S_ISLNK((vp)->i_mode) |
| @@ -46,18 +44,6 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode) | |||
| 46 | } | 44 | } |
| 47 | 45 | ||
| 48 | /* | 46 | /* |
| 49 | * Values for the vop_rwlock/rwunlock flags parameter. | ||
| 50 | */ | ||
| 51 | typedef enum bhv_vrwlock { | ||
| 52 | VRWLOCK_NONE, | ||
| 53 | VRWLOCK_READ, | ||
| 54 | VRWLOCK_WRITE, | ||
| 55 | VRWLOCK_WRITE_DIRECT, | ||
| 56 | VRWLOCK_TRY_READ, | ||
| 57 | VRWLOCK_TRY_WRITE | ||
| 58 | } bhv_vrwlock_t; | ||
| 59 | |||
| 60 | /* | ||
| 61 | * Return values for xfs_inactive. A return value of | 47 | * Return values for xfs_inactive. A return value of |
| 62 | * VN_INACTIVE_NOCACHE implies that the file system behavior | 48 | * VN_INACTIVE_NOCACHE implies that the file system behavior |
| 63 | * has disassociated its state and bhv_desc_t from the vnode. | 49 | * has disassociated its state and bhv_desc_t from the vnode. |
| @@ -73,12 +59,9 @@ typedef enum bhv_vrwlock { | |||
| 73 | #define IO_INVIS 0x00020 /* don't update inode timestamps */ | 59 | #define IO_INVIS 0x00020 /* don't update inode timestamps */ |
| 74 | 60 | ||
| 75 | /* | 61 | /* |
| 76 | * Flags for vop_iflush call | 62 | * Flags for xfs_inode_flush |
| 77 | */ | 63 | */ |
| 78 | #define FLUSH_SYNC 1 /* wait for flush to complete */ | 64 | #define FLUSH_SYNC 1 /* wait for flush to complete */ |
| 79 | #define FLUSH_INODE 2 /* flush the inode itself */ | ||
| 80 | #define FLUSH_LOG 4 /* force the last log entry for | ||
| 81 | * this inode out to disk */ | ||
| 82 | 65 | ||
| 83 | /* | 66 | /* |
| 84 | * Flush/Invalidate options for vop_toss/flush/flushinval_pages. | 67 | * Flush/Invalidate options for vop_toss/flush/flushinval_pages. |
| @@ -226,13 +209,6 @@ static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp) | |||
| 226 | } | 209 | } |
| 227 | 210 | ||
| 228 | /* | 211 | /* |
| 229 | * Vname handling macros. | ||
| 230 | */ | ||
| 231 | #define VNAME(dentry) ((char *) (dentry)->d_name.name) | ||
| 232 | #define VNAMELEN(dentry) ((dentry)->d_name.len) | ||
| 233 | #define VNAME_TO_VNODE(dentry) (vn_from_inode((dentry)->d_inode)) | ||
| 234 | |||
| 235 | /* | ||
| 236 | * Dealing with bad inodes | 212 | * Dealing with bad inodes |
| 237 | */ | 213 | */ |
| 238 | static inline int VN_BAD(bhv_vnode_t *vp) | 214 | static inline int VN_BAD(bhv_vnode_t *vp) |
| @@ -303,9 +279,9 @@ extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *); | |||
| 303 | extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *); | 279 | extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *); |
| 304 | extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *); | 280 | extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *); |
| 305 | #define xfs_itrace_entry(ip) \ | 281 | #define xfs_itrace_entry(ip) \ |
| 306 | _xfs_itrace_entry(ip, __FUNCTION__, (inst_t *)__return_address) | 282 | _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address) |
| 307 | #define xfs_itrace_exit(ip) \ | 283 | #define xfs_itrace_exit(ip) \ |
| 308 | _xfs_itrace_exit(ip, __FUNCTION__, (inst_t *)__return_address) | 284 | _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address) |
| 309 | #define xfs_itrace_exit_tag(ip, tag) \ | 285 | #define xfs_itrace_exit_tag(ip, tag) \ |
| 310 | _xfs_itrace_exit(ip, tag, (inst_t *)__return_address) | 286 | _xfs_itrace_exit(ip, tag, (inst_t *)__return_address) |
| 311 | #define xfs_itrace_ref(ip) \ | 287 | #define xfs_itrace_ref(ip) \ |
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 665babcca6a6..631ebb31b295 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c | |||
| @@ -1291,7 +1291,7 @@ xfs_qm_dqflush( | |||
| 1291 | if (flags & XFS_QMOPT_DELWRI) { | 1291 | if (flags & XFS_QMOPT_DELWRI) { |
| 1292 | xfs_bdwrite(mp, bp); | 1292 | xfs_bdwrite(mp, bp); |
| 1293 | } else if (flags & XFS_QMOPT_ASYNC) { | 1293 | } else if (flags & XFS_QMOPT_ASYNC) { |
| 1294 | xfs_bawrite(mp, bp); | 1294 | error = xfs_bawrite(mp, bp); |
| 1295 | } else { | 1295 | } else { |
| 1296 | error = xfs_bwrite(mp, bp); | 1296 | error = xfs_bwrite(mp, bp); |
| 1297 | } | 1297 | } |
| @@ -1439,9 +1439,7 @@ xfs_qm_dqpurge( | |||
| 1439 | uint flags) | 1439 | uint flags) |
| 1440 | { | 1440 | { |
| 1441 | xfs_dqhash_t *thishash; | 1441 | xfs_dqhash_t *thishash; |
| 1442 | xfs_mount_t *mp; | 1442 | xfs_mount_t *mp = dqp->q_mount; |
| 1443 | |||
| 1444 | mp = dqp->q_mount; | ||
| 1445 | 1443 | ||
| 1446 | ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); | 1444 | ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); |
| 1447 | ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); | 1445 | ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); |
| @@ -1485,6 +1483,7 @@ xfs_qm_dqpurge( | |||
| 1485 | * we're unmounting, we do care, so we flush it and wait. | 1483 | * we're unmounting, we do care, so we flush it and wait. |
| 1486 | */ | 1484 | */ |
| 1487 | if (XFS_DQ_IS_DIRTY(dqp)) { | 1485 | if (XFS_DQ_IS_DIRTY(dqp)) { |
| 1486 | int error; | ||
| 1488 | xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); | 1487 | xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); |
| 1489 | /* dqflush unlocks dqflock */ | 1488 | /* dqflush unlocks dqflock */ |
| 1490 | /* | 1489 | /* |
| @@ -1495,7 +1494,10 @@ xfs_qm_dqpurge( | |||
| 1495 | * We don't care about getting disk errors here. We need | 1494 | * We don't care about getting disk errors here. We need |
| 1496 | * to purge this dquot anyway, so we go ahead regardless. | 1495 | * to purge this dquot anyway, so we go ahead regardless. |
| 1497 | */ | 1496 | */ |
| 1498 | (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); | 1497 | error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); |
| 1498 | if (error) | ||
| 1499 | xfs_fs_cmn_err(CE_WARN, mp, | ||
| 1500 | "xfs_qm_dqpurge: dquot %p flush failed", dqp); | ||
| 1499 | xfs_dqflock(dqp); | 1501 | xfs_dqflock(dqp); |
| 1500 | } | 1502 | } |
| 1501 | ASSERT(dqp->q_pincount == 0); | 1503 | ASSERT(dqp->q_pincount == 0); |
| @@ -1580,12 +1582,18 @@ xfs_qm_dqflock_pushbuf_wait( | |||
| 1580 | XFS_INCORE_TRYLOCK); | 1582 | XFS_INCORE_TRYLOCK); |
| 1581 | if (bp != NULL) { | 1583 | if (bp != NULL) { |
| 1582 | if (XFS_BUF_ISDELAYWRITE(bp)) { | 1584 | if (XFS_BUF_ISDELAYWRITE(bp)) { |
| 1585 | int error; | ||
| 1583 | if (XFS_BUF_ISPINNED(bp)) { | 1586 | if (XFS_BUF_ISPINNED(bp)) { |
| 1584 | xfs_log_force(dqp->q_mount, | 1587 | xfs_log_force(dqp->q_mount, |
| 1585 | (xfs_lsn_t)0, | 1588 | (xfs_lsn_t)0, |
| 1586 | XFS_LOG_FORCE); | 1589 | XFS_LOG_FORCE); |
| 1587 | } | 1590 | } |
| 1588 | xfs_bawrite(dqp->q_mount, bp); | 1591 | error = xfs_bawrite(dqp->q_mount, bp); |
| 1592 | if (error) | ||
| 1593 | xfs_fs_cmn_err(CE_WARN, dqp->q_mount, | ||
| 1594 | "xfs_qm_dqflock_pushbuf_wait: " | ||
| 1595 | "pushbuf error %d on dqp %p, bp %p", | ||
| 1596 | error, dqp, bp); | ||
| 1589 | } else { | 1597 | } else { |
| 1590 | xfs_buf_relse(bp); | 1598 | xfs_buf_relse(bp); |
| 1591 | } | 1599 | } |
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 1800e8d1f646..36e05ca78412 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c | |||
| @@ -146,6 +146,7 @@ xfs_qm_dquot_logitem_push( | |||
| 146 | xfs_dq_logitem_t *logitem) | 146 | xfs_dq_logitem_t *logitem) |
| 147 | { | 147 | { |
| 148 | xfs_dquot_t *dqp; | 148 | xfs_dquot_t *dqp; |
| 149 | int error; | ||
| 149 | 150 | ||
| 150 | dqp = logitem->qli_dquot; | 151 | dqp = logitem->qli_dquot; |
| 151 | 152 | ||
| @@ -161,7 +162,11 @@ xfs_qm_dquot_logitem_push( | |||
| 161 | * lock without sleeping, then there must not have been | 162 | * lock without sleeping, then there must not have been |
| 162 | * anyone in the process of flushing the dquot. | 163 | * anyone in the process of flushing the dquot. |
| 163 | */ | 164 | */ |
| 164 | xfs_qm_dqflush(dqp, XFS_B_DELWRI); | 165 | error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); |
| 166 | if (error) | ||
| 167 | xfs_fs_cmn_err(CE_WARN, dqp->q_mount, | ||
| 168 | "xfs_qm_dquot_logitem_push: push error %d on dqp %p", | ||
| 169 | error, dqp); | ||
| 165 | xfs_dqunlock(dqp); | 170 | xfs_dqunlock(dqp); |
| 166 | } | 171 | } |
| 167 | 172 | ||
| @@ -262,11 +267,16 @@ xfs_qm_dquot_logitem_pushbuf( | |||
| 262 | XFS_LOG_FORCE); | 267 | XFS_LOG_FORCE); |
| 263 | } | 268 | } |
| 264 | if (dopush) { | 269 | if (dopush) { |
| 270 | int error; | ||
| 265 | #ifdef XFSRACEDEBUG | 271 | #ifdef XFSRACEDEBUG |
| 266 | delay_for_intr(); | 272 | delay_for_intr(); |
| 267 | delay(300); | 273 | delay(300); |
| 268 | #endif | 274 | #endif |
| 269 | xfs_bawrite(mp, bp); | 275 | error = xfs_bawrite(mp, bp); |
| 276 | if (error) | ||
| 277 | xfs_fs_cmn_err(CE_WARN, mp, | ||
| 278 | "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p", | ||
| 279 | error, qip, bp); | ||
| 270 | } else { | 280 | } else { |
| 271 | xfs_buf_relse(bp); | 281 | xfs_buf_relse(bp); |
| 272 | } | 282 | } |
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 8e9c5ae6504d..40ea56409561 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c | |||
| @@ -304,8 +304,11 @@ xfs_qm_unmount_quotadestroy( | |||
| 304 | * necessary data structures like quotainfo. This is also responsible for | 304 | * necessary data structures like quotainfo. This is also responsible for |
| 305 | * running a quotacheck as necessary. We are guaranteed that the superblock | 305 | * running a quotacheck as necessary. We are guaranteed that the superblock |
| 306 | * is consistently read in at this point. | 306 | * is consistently read in at this point. |
| 307 | * | ||
| 308 | * If we fail here, the mount will continue with quota turned off. We don't | ||
| 309 | * need to inidicate success or failure at all. | ||
| 307 | */ | 310 | */ |
| 308 | int | 311 | void |
| 309 | xfs_qm_mount_quotas( | 312 | xfs_qm_mount_quotas( |
| 310 | xfs_mount_t *mp, | 313 | xfs_mount_t *mp, |
| 311 | int mfsi_flags) | 314 | int mfsi_flags) |
| @@ -313,7 +316,6 @@ xfs_qm_mount_quotas( | |||
| 313 | int error = 0; | 316 | int error = 0; |
| 314 | uint sbf; | 317 | uint sbf; |
| 315 | 318 | ||
| 316 | |||
| 317 | /* | 319 | /* |
| 318 | * If quotas on realtime volumes is not supported, we disable | 320 | * If quotas on realtime volumes is not supported, we disable |
| 319 | * quotas immediately. | 321 | * quotas immediately. |
| @@ -332,7 +334,8 @@ xfs_qm_mount_quotas( | |||
| 332 | * Allocate the quotainfo structure inside the mount struct, and | 334 | * Allocate the quotainfo structure inside the mount struct, and |
| 333 | * create quotainode(s), and change/rev superblock if necessary. | 335 | * create quotainode(s), and change/rev superblock if necessary. |
| 334 | */ | 336 | */ |
| 335 | if ((error = xfs_qm_init_quotainfo(mp))) { | 337 | error = xfs_qm_init_quotainfo(mp); |
| 338 | if (error) { | ||
| 336 | /* | 339 | /* |
| 337 | * We must turn off quotas. | 340 | * We must turn off quotas. |
| 338 | */ | 341 | */ |
| @@ -344,12 +347,11 @@ xfs_qm_mount_quotas( | |||
| 344 | * If any of the quotas are not consistent, do a quotacheck. | 347 | * If any of the quotas are not consistent, do a quotacheck. |
| 345 | */ | 348 | */ |
| 346 | if (XFS_QM_NEED_QUOTACHECK(mp) && | 349 | if (XFS_QM_NEED_QUOTACHECK(mp) && |
| 347 | !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { | 350 | !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { |
| 348 | if ((error = xfs_qm_quotacheck(mp))) { | 351 | error = xfs_qm_quotacheck(mp); |
| 349 | /* Quotacheck has failed and quotas have | 352 | if (error) { |
| 350 | * been disabled. | 353 | /* Quotacheck failed and disabled quotas. */ |
| 351 | */ | 354 | return; |
| 352 | return XFS_ERROR(error); | ||
| 353 | } | 355 | } |
| 354 | } | 356 | } |
| 355 | /* | 357 | /* |
| @@ -357,12 +359,10 @@ xfs_qm_mount_quotas( | |||
| 357 | * quotachecked status, since we won't be doing accounting for | 359 | * quotachecked status, since we won't be doing accounting for |
| 358 | * that type anymore. | 360 | * that type anymore. |
| 359 | */ | 361 | */ |
| 360 | if (!XFS_IS_UQUOTA_ON(mp)) { | 362 | if (!XFS_IS_UQUOTA_ON(mp)) |
| 361 | mp->m_qflags &= ~XFS_UQUOTA_CHKD; | 363 | mp->m_qflags &= ~XFS_UQUOTA_CHKD; |
| 362 | } | 364 | if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) |
| 363 | if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) { | ||
| 364 | mp->m_qflags &= ~XFS_OQUOTA_CHKD; | 365 | mp->m_qflags &= ~XFS_OQUOTA_CHKD; |
| 365 | } | ||
| 366 | 366 | ||
| 367 | write_changes: | 367 | write_changes: |
| 368 | /* | 368 | /* |
| @@ -392,7 +392,7 @@ xfs_qm_mount_quotas( | |||
| 392 | xfs_fs_cmn_err(CE_WARN, mp, | 392 | xfs_fs_cmn_err(CE_WARN, mp, |
| 393 | "Failed to initialize disk quotas."); | 393 | "Failed to initialize disk quotas."); |
| 394 | } | 394 | } |
| 395 | return XFS_ERROR(error); | 395 | return; |
| 396 | } | 396 | } |
| 397 | 397 | ||
| 398 | /* | 398 | /* |
| @@ -1438,7 +1438,7 @@ xfs_qm_qino_alloc( | |||
| 1438 | } | 1438 | } |
| 1439 | 1439 | ||
| 1440 | 1440 | ||
| 1441 | STATIC int | 1441 | STATIC void |
| 1442 | xfs_qm_reset_dqcounts( | 1442 | xfs_qm_reset_dqcounts( |
| 1443 | xfs_mount_t *mp, | 1443 | xfs_mount_t *mp, |
| 1444 | xfs_buf_t *bp, | 1444 | xfs_buf_t *bp, |
| @@ -1478,8 +1478,6 @@ xfs_qm_reset_dqcounts( | |||
| 1478 | ddq->d_rtbwarns = 0; | 1478 | ddq->d_rtbwarns = 0; |
| 1479 | ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); | 1479 | ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); |
| 1480 | } | 1480 | } |
| 1481 | |||
| 1482 | return 0; | ||
| 1483 | } | 1481 | } |
| 1484 | 1482 | ||
| 1485 | STATIC int | 1483 | STATIC int |
| @@ -1520,7 +1518,7 @@ xfs_qm_dqiter_bufs( | |||
| 1520 | if (error) | 1518 | if (error) |
| 1521 | break; | 1519 | break; |
| 1522 | 1520 | ||
| 1523 | (void) xfs_qm_reset_dqcounts(mp, bp, firstid, type); | 1521 | xfs_qm_reset_dqcounts(mp, bp, firstid, type); |
| 1524 | xfs_bdwrite(mp, bp); | 1522 | xfs_bdwrite(mp, bp); |
| 1525 | /* | 1523 | /* |
| 1526 | * goto the next block. | 1524 | * goto the next block. |
| @@ -1810,7 +1808,7 @@ xfs_qm_dqusage_adjust( | |||
| 1810 | * Now release the inode. This will send it to 'inactive', and | 1808 | * Now release the inode. This will send it to 'inactive', and |
| 1811 | * possibly even free blocks. | 1809 | * possibly even free blocks. |
| 1812 | */ | 1810 | */ |
| 1813 | VN_RELE(XFS_ITOV(ip)); | 1811 | IRELE(ip); |
| 1814 | 1812 | ||
| 1815 | /* | 1813 | /* |
| 1816 | * Goto next inode. | 1814 | * Goto next inode. |
| @@ -1880,6 +1878,14 @@ xfs_qm_quotacheck( | |||
| 1880 | } while (! done); | 1878 | } while (! done); |
| 1881 | 1879 | ||
| 1882 | /* | 1880 | /* |
| 1881 | * We've made all the changes that we need to make incore. | ||
| 1882 | * Flush them down to disk buffers if everything was updated | ||
| 1883 | * successfully. | ||
| 1884 | */ | ||
| 1885 | if (!error) | ||
| 1886 | error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); | ||
| 1887 | |||
| 1888 | /* | ||
| 1883 | * We can get this error if we couldn't do a dquot allocation inside | 1889 | * We can get this error if we couldn't do a dquot allocation inside |
| 1884 | * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the | 1890 | * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the |
| 1885 | * dirty dquots that might be cached, we just want to get rid of them | 1891 | * dirty dquots that might be cached, we just want to get rid of them |
| @@ -1890,11 +1896,6 @@ xfs_qm_quotacheck( | |||
| 1890 | xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); | 1896 | xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); |
| 1891 | goto error_return; | 1897 | goto error_return; |
| 1892 | } | 1898 | } |
| 1893 | /* | ||
| 1894 | * We've made all the changes that we need to make incore. | ||
| 1895 | * Now flush_them down to disk buffers. | ||
| 1896 | */ | ||
| 1897 | xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); | ||
| 1898 | 1899 | ||
| 1899 | /* | 1900 | /* |
| 1900 | * We didn't log anything, because if we crashed, we'll have to | 1901 | * We didn't log anything, because if we crashed, we'll have to |
| @@ -1926,7 +1927,10 @@ xfs_qm_quotacheck( | |||
| 1926 | ASSERT(mp->m_quotainfo != NULL); | 1927 | ASSERT(mp->m_quotainfo != NULL); |
| 1927 | ASSERT(xfs_Gqm != NULL); | 1928 | ASSERT(xfs_Gqm != NULL); |
| 1928 | xfs_qm_destroy_quotainfo(mp); | 1929 | xfs_qm_destroy_quotainfo(mp); |
| 1929 | (void)xfs_mount_reset_sbqflags(mp); | 1930 | if (xfs_mount_reset_sbqflags(mp)) { |
| 1931 | cmn_err(CE_WARN, "XFS quotacheck %s: " | ||
| 1932 | "Failed to reset quota flags.", mp->m_fsname); | ||
| 1933 | } | ||
| 1930 | } else { | 1934 | } else { |
| 1931 | cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); | 1935 | cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); |
| 1932 | } | 1936 | } |
| @@ -1968,7 +1972,7 @@ xfs_qm_init_quotainos( | |||
| 1968 | if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, | 1972 | if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, |
| 1969 | 0, 0, &gip, 0))) { | 1973 | 0, 0, &gip, 0))) { |
| 1970 | if (uip) | 1974 | if (uip) |
| 1971 | VN_RELE(XFS_ITOV(uip)); | 1975 | IRELE(uip); |
| 1972 | return XFS_ERROR(error); | 1976 | return XFS_ERROR(error); |
| 1973 | } | 1977 | } |
| 1974 | } | 1978 | } |
| @@ -1999,7 +2003,7 @@ xfs_qm_init_quotainos( | |||
| 1999 | sbflags | XFS_SB_GQUOTINO, flags); | 2003 | sbflags | XFS_SB_GQUOTINO, flags); |
| 2000 | if (error) { | 2004 | if (error) { |
| 2001 | if (uip) | 2005 | if (uip) |
| 2002 | VN_RELE(XFS_ITOV(uip)); | 2006 | IRELE(uip); |
| 2003 | 2007 | ||
| 2004 | return XFS_ERROR(error); | 2008 | return XFS_ERROR(error); |
| 2005 | } | 2009 | } |
| @@ -2093,12 +2097,17 @@ xfs_qm_shake_freelist( | |||
| 2093 | * dirty dquots. | 2097 | * dirty dquots. |
| 2094 | */ | 2098 | */ |
| 2095 | if (XFS_DQ_IS_DIRTY(dqp)) { | 2099 | if (XFS_DQ_IS_DIRTY(dqp)) { |
| 2100 | int error; | ||
| 2096 | xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); | 2101 | xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); |
| 2097 | /* | 2102 | /* |
| 2098 | * We flush it delayed write, so don't bother | 2103 | * We flush it delayed write, so don't bother |
| 2099 | * releasing the mplock. | 2104 | * releasing the mplock. |
| 2100 | */ | 2105 | */ |
| 2101 | (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); | 2106 | error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); |
| 2107 | if (error) { | ||
| 2108 | xfs_fs_cmn_err(CE_WARN, dqp->q_mount, | ||
| 2109 | "xfs_qm_dqflush_all: dquot %p flush failed", dqp); | ||
| 2110 | } | ||
| 2102 | xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ | 2111 | xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ |
| 2103 | dqp = dqp->dq_flnext; | 2112 | dqp = dqp->dq_flnext; |
| 2104 | continue; | 2113 | continue; |
| @@ -2265,12 +2274,17 @@ xfs_qm_dqreclaim_one(void) | |||
| 2265 | * dirty dquots. | 2274 | * dirty dquots. |
| 2266 | */ | 2275 | */ |
| 2267 | if (XFS_DQ_IS_DIRTY(dqp)) { | 2276 | if (XFS_DQ_IS_DIRTY(dqp)) { |
| 2277 | int error; | ||
| 2268 | xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); | 2278 | xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); |
| 2269 | /* | 2279 | /* |
| 2270 | * We flush it delayed write, so don't bother | 2280 | * We flush it delayed write, so don't bother |
| 2271 | * releasing the freelist lock. | 2281 | * releasing the freelist lock. |
| 2272 | */ | 2282 | */ |
| 2273 | (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); | 2283 | error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); |
| 2284 | if (error) { | ||
| 2285 | xfs_fs_cmn_err(CE_WARN, dqp->q_mount, | ||
| 2286 | "xfs_qm_dqreclaim: dquot %p flush failed", dqp); | ||
| 2287 | } | ||
| 2274 | xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ | 2288 | xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ |
| 2275 | continue; | 2289 | continue; |
| 2276 | } | 2290 | } |
| @@ -2378,9 +2392,9 @@ xfs_qm_write_sb_changes( | |||
| 2378 | } | 2392 | } |
| 2379 | 2393 | ||
| 2380 | xfs_mod_sb(tp, flags); | 2394 | xfs_mod_sb(tp, flags); |
| 2381 | (void) xfs_trans_commit(tp, 0); | 2395 | error = xfs_trans_commit(tp, 0); |
| 2382 | 2396 | ||
| 2383 | return 0; | 2397 | return error; |
| 2384 | } | 2398 | } |
| 2385 | 2399 | ||
| 2386 | 2400 | ||
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index baf537c1c177..cd2300e374af 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h | |||
| @@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct { | |||
| 165 | #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) | 165 | #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) |
| 166 | 166 | ||
| 167 | extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); | 167 | extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); |
| 168 | extern int xfs_qm_mount_quotas(xfs_mount_t *, int); | 168 | extern void xfs_qm_mount_quotas(xfs_mount_t *, int); |
| 169 | extern int xfs_qm_quotacheck(xfs_mount_t *); | 169 | extern int xfs_qm_quotacheck(xfs_mount_t *); |
| 170 | extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); | 170 | extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); |
| 171 | extern int xfs_qm_unmount_quotas(xfs_mount_t *); | 171 | extern int xfs_qm_unmount_quotas(xfs_mount_t *); |
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h index a50ffabcf554..5b964fc0dc09 100644 --- a/fs/xfs/quota/xfs_qm_stats.h +++ b/fs/xfs/quota/xfs_qm_stats.h | |||
| @@ -45,8 +45,8 @@ extern void xfs_qm_cleanup_procfs(void); | |||
| 45 | 45 | ||
| 46 | # define XQM_STATS_INC(count) do { } while (0) | 46 | # define XQM_STATS_INC(count) do { } while (0) |
| 47 | 47 | ||
| 48 | static __inline void xfs_qm_init_procfs(void) { }; | 48 | static inline void xfs_qm_init_procfs(void) { }; |
| 49 | static __inline void xfs_qm_cleanup_procfs(void) { }; | 49 | static inline void xfs_qm_cleanup_procfs(void) { }; |
| 50 | 50 | ||
| 51 | #endif | 51 | #endif |
| 52 | 52 | ||
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index d2b8be7e75f9..8342823dbdc3 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c | |||
| @@ -279,9 +279,12 @@ xfs_qm_scall_quotaoff( | |||
| 279 | 279 | ||
| 280 | /* | 280 | /* |
| 281 | * Write the LI_QUOTAOFF log record, and do SB changes atomically, | 281 | * Write the LI_QUOTAOFF log record, and do SB changes atomically, |
| 282 | * and synchronously. | 282 | * and synchronously. If we fail to write, we should abort the |
| 283 | * operation as it cannot be recovered safely if we crash. | ||
| 283 | */ | 284 | */ |
| 284 | xfs_qm_log_quotaoff(mp, &qoffstart, flags); | 285 | error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); |
| 286 | if (error) | ||
| 287 | goto out_error; | ||
| 285 | 288 | ||
| 286 | /* | 289 | /* |
| 287 | * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct | 290 | * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct |
| @@ -337,7 +340,12 @@ xfs_qm_scall_quotaoff( | |||
| 337 | * So, we have QUOTAOFF start and end logitems; the start | 340 | * So, we have QUOTAOFF start and end logitems; the start |
| 338 | * logitem won't get overwritten until the end logitem appears... | 341 | * logitem won't get overwritten until the end logitem appears... |
| 339 | */ | 342 | */ |
| 340 | xfs_qm_log_quotaoff_end(mp, qoffstart, flags); | 343 | error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); |
| 344 | if (error) { | ||
| 345 | /* We're screwed now. Shutdown is the only option. */ | ||
| 346 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | ||
| 347 | goto out_error; | ||
| 348 | } | ||
| 341 | 349 | ||
| 342 | /* | 350 | /* |
| 343 | * If quotas is completely disabled, close shop. | 351 | * If quotas is completely disabled, close shop. |
| @@ -361,6 +369,7 @@ xfs_qm_scall_quotaoff( | |||
| 361 | XFS_PURGE_INODE(XFS_QI_GQIP(mp)); | 369 | XFS_PURGE_INODE(XFS_QI_GQIP(mp)); |
| 362 | XFS_QI_GQIP(mp) = NULL; | 370 | XFS_QI_GQIP(mp) = NULL; |
| 363 | } | 371 | } |
| 372 | out_error: | ||
| 364 | mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); | 373 | mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); |
| 365 | 374 | ||
| 366 | return (error); | 375 | return (error); |
| @@ -371,12 +380,11 @@ xfs_qm_scall_trunc_qfiles( | |||
| 371 | xfs_mount_t *mp, | 380 | xfs_mount_t *mp, |
| 372 | uint flags) | 381 | uint flags) |
| 373 | { | 382 | { |
| 374 | int error; | 383 | int error = 0, error2 = 0; |
| 375 | xfs_inode_t *qip; | 384 | xfs_inode_t *qip; |
| 376 | 385 | ||
| 377 | if (!capable(CAP_SYS_ADMIN)) | 386 | if (!capable(CAP_SYS_ADMIN)) |
| 378 | return XFS_ERROR(EPERM); | 387 | return XFS_ERROR(EPERM); |
| 379 | error = 0; | ||
| 380 | if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { | 388 | if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { |
| 381 | qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); | 389 | qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); |
| 382 | return XFS_ERROR(EINVAL); | 390 | return XFS_ERROR(EINVAL); |
| @@ -384,22 +392,22 @@ xfs_qm_scall_trunc_qfiles( | |||
| 384 | 392 | ||
| 385 | if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { | 393 | if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { |
| 386 | error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); | 394 | error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); |
| 387 | if (! error) { | 395 | if (!error) { |
| 388 | (void) xfs_truncate_file(mp, qip); | 396 | error = xfs_truncate_file(mp, qip); |
| 389 | VN_RELE(XFS_ITOV(qip)); | 397 | IRELE(qip); |
| 390 | } | 398 | } |
| 391 | } | 399 | } |
| 392 | 400 | ||
| 393 | if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && | 401 | if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && |
| 394 | mp->m_sb.sb_gquotino != NULLFSINO) { | 402 | mp->m_sb.sb_gquotino != NULLFSINO) { |
| 395 | error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); | 403 | error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); |
| 396 | if (! error) { | 404 | if (!error2) { |
| 397 | (void) xfs_truncate_file(mp, qip); | 405 | error2 = xfs_truncate_file(mp, qip); |
| 398 | VN_RELE(XFS_ITOV(qip)); | 406 | IRELE(qip); |
| 399 | } | 407 | } |
| 400 | } | 408 | } |
| 401 | 409 | ||
| 402 | return (error); | 410 | return error ? error : error2; |
| 403 | } | 411 | } |
| 404 | 412 | ||
| 405 | 413 | ||
| @@ -552,13 +560,13 @@ xfs_qm_scall_getqstat( | |||
| 552 | out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; | 560 | out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; |
| 553 | out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; | 561 | out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; |
| 554 | if (tempuqip) | 562 | if (tempuqip) |
| 555 | VN_RELE(XFS_ITOV(uip)); | 563 | IRELE(uip); |
| 556 | } | 564 | } |
| 557 | if (gip) { | 565 | if (gip) { |
| 558 | out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; | 566 | out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; |
| 559 | out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; | 567 | out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; |
| 560 | if (tempgqip) | 568 | if (tempgqip) |
| 561 | VN_RELE(XFS_ITOV(gip)); | 569 | IRELE(gip); |
| 562 | } | 570 | } |
| 563 | if (mp->m_quotainfo) { | 571 | if (mp->m_quotainfo) { |
| 564 | out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); | 572 | out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); |
| @@ -726,12 +734,12 @@ xfs_qm_scall_setqlim( | |||
| 726 | xfs_trans_log_dquot(tp, dqp); | 734 | xfs_trans_log_dquot(tp, dqp); |
| 727 | 735 | ||
| 728 | xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT"); | 736 | xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT"); |
| 729 | xfs_trans_commit(tp, 0); | 737 | error = xfs_trans_commit(tp, 0); |
| 730 | xfs_qm_dqprint(dqp); | 738 | xfs_qm_dqprint(dqp); |
| 731 | xfs_qm_dqrele(dqp); | 739 | xfs_qm_dqrele(dqp); |
| 732 | mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); | 740 | mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); |
| 733 | 741 | ||
| 734 | return (0); | 742 | return error; |
| 735 | } | 743 | } |
| 736 | 744 | ||
| 737 | STATIC int | 745 | STATIC int |
| @@ -1095,7 +1103,7 @@ again: | |||
| 1095 | * inactive code in hell. | 1103 | * inactive code in hell. |
| 1096 | */ | 1104 | */ |
| 1097 | if (vnode_refd) | 1105 | if (vnode_refd) |
| 1098 | VN_RELE(vp); | 1106 | IRELE(ip); |
| 1099 | XFS_MOUNT_ILOCK(mp); | 1107 | XFS_MOUNT_ILOCK(mp); |
| 1100 | /* | 1108 | /* |
| 1101 | * If an inode was inserted or removed, we gotta | 1109 | * If an inode was inserted or removed, we gotta |
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c index 129067cfcb86..0b75d302508f 100644 --- a/fs/xfs/support/ktrace.c +++ b/fs/xfs/support/ktrace.c | |||
| @@ -24,7 +24,7 @@ static int ktrace_zentries; | |||
| 24 | void __init | 24 | void __init |
| 25 | ktrace_init(int zentries) | 25 | ktrace_init(int zentries) |
| 26 | { | 26 | { |
| 27 | ktrace_zentries = zentries; | 27 | ktrace_zentries = roundup_pow_of_two(zentries); |
| 28 | 28 | ||
| 29 | ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t), | 29 | ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t), |
| 30 | "ktrace_hdr"); | 30 | "ktrace_hdr"); |
| @@ -47,13 +47,16 @@ ktrace_uninit(void) | |||
| 47 | * ktrace_alloc() | 47 | * ktrace_alloc() |
| 48 | * | 48 | * |
| 49 | * Allocate a ktrace header and enough buffering for the given | 49 | * Allocate a ktrace header and enough buffering for the given |
| 50 | * number of entries. | 50 | * number of entries. Round the number of entries up to a |
| 51 | * power of 2 so we can do fast masking to get the index from | ||
| 52 | * the atomic index counter. | ||
| 51 | */ | 53 | */ |
| 52 | ktrace_t * | 54 | ktrace_t * |
| 53 | ktrace_alloc(int nentries, unsigned int __nocast sleep) | 55 | ktrace_alloc(int nentries, unsigned int __nocast sleep) |
| 54 | { | 56 | { |
| 55 | ktrace_t *ktp; | 57 | ktrace_t *ktp; |
| 56 | ktrace_entry_t *ktep; | 58 | ktrace_entry_t *ktep; |
| 59 | int entries; | ||
| 57 | 60 | ||
| 58 | ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep); | 61 | ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep); |
| 59 | 62 | ||
| @@ -70,11 +73,12 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) | |||
| 70 | /* | 73 | /* |
| 71 | * Special treatment for buffers with the ktrace_zentries entries | 74 | * Special treatment for buffers with the ktrace_zentries entries |
| 72 | */ | 75 | */ |
| 73 | if (nentries == ktrace_zentries) { | 76 | entries = roundup_pow_of_two(nentries); |
| 77 | if (entries == ktrace_zentries) { | ||
| 74 | ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone, | 78 | ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone, |
| 75 | sleep); | 79 | sleep); |
| 76 | } else { | 80 | } else { |
| 77 | ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)), | 81 | ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)), |
| 78 | sleep | KM_LARGE); | 82 | sleep | KM_LARGE); |
| 79 | } | 83 | } |
| 80 | 84 | ||
| @@ -91,8 +95,10 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) | |||
| 91 | } | 95 | } |
| 92 | 96 | ||
| 93 | ktp->kt_entries = ktep; | 97 | ktp->kt_entries = ktep; |
| 94 | ktp->kt_nentries = nentries; | 98 | ktp->kt_nentries = entries; |
| 95 | ktp->kt_index = 0; | 99 | ASSERT(is_power_of_2(entries)); |
| 100 | ktp->kt_index_mask = entries - 1; | ||
| 101 | atomic_set(&ktp->kt_index, 0); | ||
| 96 | ktp->kt_rollover = 0; | 102 | ktp->kt_rollover = 0; |
| 97 | return ktp; | 103 | return ktp; |
| 98 | } | 104 | } |
| @@ -151,8 +157,6 @@ ktrace_enter( | |||
| 151 | void *val14, | 157 | void *val14, |
| 152 | void *val15) | 158 | void *val15) |
| 153 | { | 159 | { |
| 154 | static DEFINE_SPINLOCK(wrap_lock); | ||
| 155 | unsigned long flags; | ||
| 156 | int index; | 160 | int index; |
| 157 | ktrace_entry_t *ktep; | 161 | ktrace_entry_t *ktep; |
| 158 | 162 | ||
| @@ -161,12 +165,8 @@ ktrace_enter( | |||
| 161 | /* | 165 | /* |
| 162 | * Grab an entry by pushing the index up to the next one. | 166 | * Grab an entry by pushing the index up to the next one. |
| 163 | */ | 167 | */ |
| 164 | spin_lock_irqsave(&wrap_lock, flags); | 168 | index = atomic_add_return(1, &ktp->kt_index); |
| 165 | index = ktp->kt_index; | 169 | index = (index - 1) & ktp->kt_index_mask; |
| 166 | if (++ktp->kt_index == ktp->kt_nentries) | ||
| 167 | ktp->kt_index = 0; | ||
| 168 | spin_unlock_irqrestore(&wrap_lock, flags); | ||
| 169 | |||
| 170 | if (!ktp->kt_rollover && index == ktp->kt_nentries - 1) | 170 | if (!ktp->kt_rollover && index == ktp->kt_nentries - 1) |
| 171 | ktp->kt_rollover = 1; | 171 | ktp->kt_rollover = 1; |
| 172 | 172 | ||
| @@ -199,11 +199,12 @@ int | |||
| 199 | ktrace_nentries( | 199 | ktrace_nentries( |
| 200 | ktrace_t *ktp) | 200 | ktrace_t *ktp) |
| 201 | { | 201 | { |
| 202 | if (ktp == NULL) { | 202 | int index; |
| 203 | if (ktp == NULL) | ||
| 203 | return 0; | 204 | return 0; |
| 204 | } | ||
| 205 | 205 | ||
| 206 | return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index); | 206 | index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask; |
| 207 | return (ktp->kt_rollover ? ktp->kt_nentries : index); | ||
| 207 | } | 208 | } |
| 208 | 209 | ||
| 209 | /* | 210 | /* |
| @@ -228,7 +229,7 @@ ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp) | |||
| 228 | int nentries; | 229 | int nentries; |
| 229 | 230 | ||
| 230 | if (ktp->kt_rollover) | 231 | if (ktp->kt_rollover) |
| 231 | index = ktp->kt_index; | 232 | index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask; |
| 232 | else | 233 | else |
| 233 | index = 0; | 234 | index = 0; |
| 234 | 235 | ||
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h index 56e72b40a859..741d6947ca60 100644 --- a/fs/xfs/support/ktrace.h +++ b/fs/xfs/support/ktrace.h | |||
| @@ -30,7 +30,8 @@ typedef struct ktrace_entry { | |||
| 30 | */ | 30 | */ |
| 31 | typedef struct ktrace { | 31 | typedef struct ktrace { |
| 32 | int kt_nentries; /* number of entries in trace buf */ | 32 | int kt_nentries; /* number of entries in trace buf */ |
| 33 | int kt_index; /* current index in entries */ | 33 | atomic_t kt_index; /* current index in entries */ |
| 34 | unsigned int kt_index_mask; | ||
| 34 | int kt_rollover; | 35 | int kt_rollover; |
| 35 | ktrace_entry_t *kt_entries; /* buffer of entries */ | 36 | ktrace_entry_t *kt_entries; /* buffer of entries */ |
| 36 | } ktrace_t; | 37 | } ktrace_t; |
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 540e4c989825..765aaf65e2d3 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h | |||
| @@ -22,7 +22,7 @@ | |||
| 22 | #define STATIC | 22 | #define STATIC |
| 23 | #define DEBUG 1 | 23 | #define DEBUG 1 |
| 24 | #define XFS_BUF_LOCK_TRACKING 1 | 24 | #define XFS_BUF_LOCK_TRACKING 1 |
| 25 | /* #define QUOTADEBUG 1 */ | 25 | #define QUOTADEBUG 1 |
| 26 | #endif | 26 | #endif |
| 27 | 27 | ||
| 28 | #ifdef CONFIG_XFS_TRACE | 28 | #ifdef CONFIG_XFS_TRACE |
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 7272fe39a92d..8e130b9720ae 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c | |||
| @@ -307,12 +307,13 @@ xfs_acl_vset( | |||
| 307 | 307 | ||
| 308 | VN_HOLD(vp); | 308 | VN_HOLD(vp); |
| 309 | error = xfs_acl_allow_set(vp, kind); | 309 | error = xfs_acl_allow_set(vp, kind); |
| 310 | if (error) | ||
| 311 | goto out; | ||
| 312 | 310 | ||
| 313 | /* Incoming ACL exists, set file mode based on its value */ | 311 | /* Incoming ACL exists, set file mode based on its value */ |
| 314 | if (kind == _ACL_TYPE_ACCESS) | 312 | if (!error && kind == _ACL_TYPE_ACCESS) |
| 315 | xfs_acl_setmode(vp, xfs_acl, &basicperms); | 313 | error = xfs_acl_setmode(vp, xfs_acl, &basicperms); |
| 314 | |||
| 315 | if (error) | ||
| 316 | goto out; | ||
| 316 | 317 | ||
| 317 | /* | 318 | /* |
| 318 | * If we have more than std unix permissions, set up the actual attr. | 319 | * If we have more than std unix permissions, set up the actual attr. |
| @@ -323,7 +324,7 @@ xfs_acl_vset( | |||
| 323 | if (!basicperms) { | 324 | if (!basicperms) { |
| 324 | xfs_acl_set_attr(vp, xfs_acl, kind, &error); | 325 | xfs_acl_set_attr(vp, xfs_acl, kind, &error); |
| 325 | } else { | 326 | } else { |
| 326 | xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); | 327 | error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); |
| 327 | } | 328 | } |
| 328 | 329 | ||
| 329 | out: | 330 | out: |
| @@ -707,7 +708,9 @@ xfs_acl_inherit( | |||
| 707 | 708 | ||
| 708 | memcpy(cacl, pdaclp, sizeof(xfs_acl_t)); | 709 | memcpy(cacl, pdaclp, sizeof(xfs_acl_t)); |
| 709 | xfs_acl_filter_mode(mode, cacl); | 710 | xfs_acl_filter_mode(mode, cacl); |
| 710 | xfs_acl_setmode(vp, cacl, &basicperms); | 711 | error = xfs_acl_setmode(vp, cacl, &basicperms); |
| 712 | if (error) | ||
| 713 | goto out_error; | ||
| 711 | 714 | ||
| 712 | /* | 715 | /* |
| 713 | * Set the Default and Access ACL on the file. The mode is already | 716 | * Set the Default and Access ACL on the file. The mode is already |
| @@ -720,6 +723,7 @@ xfs_acl_inherit( | |||
| 720 | xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error); | 723 | xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error); |
| 721 | if (!error && !basicperms) | 724 | if (!error && !basicperms) |
| 722 | xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error); | 725 | xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error); |
| 726 | out_error: | ||
| 723 | _ACL_FREE(cacl); | 727 | _ACL_FREE(cacl); |
| 724 | return error; | 728 | return error; |
| 725 | } | 729 | } |
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index bdbfbbee4959..1956f83489f1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c | |||
| @@ -45,7 +45,7 @@ | |||
| 45 | #define XFSA_FIXUP_BNO_OK 1 | 45 | #define XFSA_FIXUP_BNO_OK 1 |
| 46 | #define XFSA_FIXUP_CNT_OK 2 | 46 | #define XFSA_FIXUP_CNT_OK 2 |
| 47 | 47 | ||
| 48 | STATIC int | 48 | STATIC void |
| 49 | xfs_alloc_search_busy(xfs_trans_t *tp, | 49 | xfs_alloc_search_busy(xfs_trans_t *tp, |
| 50 | xfs_agnumber_t agno, | 50 | xfs_agnumber_t agno, |
| 51 | xfs_agblock_t bno, | 51 | xfs_agblock_t bno, |
| @@ -55,24 +55,24 @@ xfs_alloc_search_busy(xfs_trans_t *tp, | |||
| 55 | ktrace_t *xfs_alloc_trace_buf; | 55 | ktrace_t *xfs_alloc_trace_buf; |
| 56 | 56 | ||
| 57 | #define TRACE_ALLOC(s,a) \ | 57 | #define TRACE_ALLOC(s,a) \ |
| 58 | xfs_alloc_trace_alloc(__FUNCTION__, s, a, __LINE__) | 58 | xfs_alloc_trace_alloc(__func__, s, a, __LINE__) |
| 59 | #define TRACE_FREE(s,a,b,x,f) \ | 59 | #define TRACE_FREE(s,a,b,x,f) \ |
| 60 | xfs_alloc_trace_free(__FUNCTION__, s, mp, a, b, x, f, __LINE__) | 60 | xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__) |
| 61 | #define TRACE_MODAGF(s,a,f) \ | 61 | #define TRACE_MODAGF(s,a,f) \ |
| 62 | xfs_alloc_trace_modagf(__FUNCTION__, s, mp, a, f, __LINE__) | 62 | xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__) |
| 63 | #define TRACE_BUSY(__FUNCTION__,s,ag,agb,l,sl,tp) \ | 63 | #define TRACE_BUSY(__func__,s,ag,agb,l,sl,tp) \ |
| 64 | xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) | 64 | xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) |
| 65 | #define TRACE_UNBUSY(__FUNCTION__,s,ag,sl,tp) \ | 65 | #define TRACE_UNBUSY(__func__,s,ag,sl,tp) \ |
| 66 | xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) | 66 | xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) |
| 67 | #define TRACE_BUSYSEARCH(__FUNCTION__,s,ag,agb,l,sl,tp) \ | 67 | #define TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp) \ |
| 68 | xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) | 68 | xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) |
| 69 | #else | 69 | #else |
| 70 | #define TRACE_ALLOC(s,a) | 70 | #define TRACE_ALLOC(s,a) |
| 71 | #define TRACE_FREE(s,a,b,x,f) | 71 | #define TRACE_FREE(s,a,b,x,f) |
| 72 | #define TRACE_MODAGF(s,a,f) | 72 | #define TRACE_MODAGF(s,a,f) |
| 73 | #define TRACE_BUSY(s,a,ag,agb,l,sl,tp) | 73 | #define TRACE_BUSY(s,a,ag,agb,l,sl,tp) |
| 74 | #define TRACE_UNBUSY(fname,s,ag,sl,tp) | 74 | #define TRACE_UNBUSY(fname,s,ag,sl,tp) |
| 75 | #define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) | 75 | #define TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp) |
| 76 | #endif /* XFS_ALLOC_TRACE */ | 76 | #endif /* XFS_ALLOC_TRACE */ |
| 77 | 77 | ||
| 78 | /* | 78 | /* |
| @@ -93,7 +93,7 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, | |||
| 93 | * Compute aligned version of the found extent. | 93 | * Compute aligned version of the found extent. |
| 94 | * Takes alignment and min length into account. | 94 | * Takes alignment and min length into account. |
| 95 | */ | 95 | */ |
| 96 | STATIC int /* success (>= minlen) */ | 96 | STATIC void |
| 97 | xfs_alloc_compute_aligned( | 97 | xfs_alloc_compute_aligned( |
| 98 | xfs_agblock_t foundbno, /* starting block in found extent */ | 98 | xfs_agblock_t foundbno, /* starting block in found extent */ |
| 99 | xfs_extlen_t foundlen, /* length in found extent */ | 99 | xfs_extlen_t foundlen, /* length in found extent */ |
| @@ -116,7 +116,6 @@ xfs_alloc_compute_aligned( | |||
| 116 | } | 116 | } |
| 117 | *resbno = bno; | 117 | *resbno = bno; |
| 118 | *reslen = len; | 118 | *reslen = len; |
| 119 | return len >= minlen; | ||
| 120 | } | 119 | } |
| 121 | 120 | ||
| 122 | /* | 121 | /* |
| @@ -837,9 +836,9 @@ xfs_alloc_ag_vextent_near( | |||
| 837 | if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) | 836 | if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) |
| 838 | goto error0; | 837 | goto error0; |
| 839 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 838 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
| 840 | if (!xfs_alloc_compute_aligned(ltbno, ltlen, | 839 | xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, |
| 841 | args->alignment, args->minlen, | 840 | args->minlen, <bnoa, <lena); |
| 842 | <bnoa, <lena)) | 841 | if (ltlena < args->minlen) |
| 843 | continue; | 842 | continue; |
| 844 | args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); | 843 | args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); |
| 845 | xfs_alloc_fix_len(args); | 844 | xfs_alloc_fix_len(args); |
| @@ -958,9 +957,9 @@ xfs_alloc_ag_vextent_near( | |||
| 958 | if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) | 957 | if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) |
| 959 | goto error0; | 958 | goto error0; |
| 960 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 959 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
| 961 | if (xfs_alloc_compute_aligned(ltbno, ltlen, | 960 | xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, |
| 962 | args->alignment, args->minlen, | 961 | args->minlen, <bnoa, <lena); |
| 963 | <bnoa, <lena)) | 962 | if (ltlena >= args->minlen) |
| 964 | break; | 963 | break; |
| 965 | if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) | 964 | if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) |
| 966 | goto error0; | 965 | goto error0; |
| @@ -974,9 +973,9 @@ xfs_alloc_ag_vextent_near( | |||
| 974 | if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) | 973 | if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) |
| 975 | goto error0; | 974 | goto error0; |
| 976 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 975 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
| 977 | if (xfs_alloc_compute_aligned(gtbno, gtlen, | 976 | xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, |
| 978 | args->alignment, args->minlen, | 977 | args->minlen, >bnoa, >lena); |
| 979 | >bnoa, >lena)) | 978 | if (gtlena >= args->minlen) |
| 980 | break; | 979 | break; |
| 981 | if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) | 980 | if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) |
| 982 | goto error0; | 981 | goto error0; |
| @@ -2562,9 +2561,10 @@ xfs_alloc_clear_busy(xfs_trans_t *tp, | |||
| 2562 | 2561 | ||
| 2563 | 2562 | ||
| 2564 | /* | 2563 | /* |
| 2565 | * returns non-zero if any of (agno,bno):len is in a busy list | 2564 | * If we find the extent in the busy list, force the log out to get the |
| 2565 | * extent out of the busy list so the caller can use it straight away. | ||
| 2566 | */ | 2566 | */ |
| 2567 | STATIC int | 2567 | STATIC void |
| 2568 | xfs_alloc_search_busy(xfs_trans_t *tp, | 2568 | xfs_alloc_search_busy(xfs_trans_t *tp, |
| 2569 | xfs_agnumber_t agno, | 2569 | xfs_agnumber_t agno, |
| 2570 | xfs_agblock_t bno, | 2570 | xfs_agblock_t bno, |
| @@ -2572,7 +2572,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp, | |||
| 2572 | { | 2572 | { |
| 2573 | xfs_mount_t *mp; | 2573 | xfs_mount_t *mp; |
| 2574 | xfs_perag_busy_t *bsy; | 2574 | xfs_perag_busy_t *bsy; |
| 2575 | int n; | ||
| 2576 | xfs_agblock_t uend, bend; | 2575 | xfs_agblock_t uend, bend; |
| 2577 | xfs_lsn_t lsn; | 2576 | xfs_lsn_t lsn; |
| 2578 | int cnt; | 2577 | int cnt; |
| @@ -2585,21 +2584,18 @@ xfs_alloc_search_busy(xfs_trans_t *tp, | |||
| 2585 | uend = bno + len - 1; | 2584 | uend = bno + len - 1; |
| 2586 | 2585 | ||
| 2587 | /* search pagb_list for this slot, skipping open slots */ | 2586 | /* search pagb_list for this slot, skipping open slots */ |
| 2588 | for (bsy = mp->m_perag[agno].pagb_list, n = 0; | 2587 | for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) { |
| 2589 | cnt; bsy++, n++) { | ||
| 2590 | 2588 | ||
| 2591 | /* | 2589 | /* |
| 2592 | * (start1,length1) within (start2, length2) | 2590 | * (start1,length1) within (start2, length2) |
| 2593 | */ | 2591 | */ |
| 2594 | if (bsy->busy_tp != NULL) { | 2592 | if (bsy->busy_tp != NULL) { |
| 2595 | bend = bsy->busy_start + bsy->busy_length - 1; | 2593 | bend = bsy->busy_start + bsy->busy_length - 1; |
| 2596 | if ((bno > bend) || | 2594 | if ((bno > bend) || (uend < bsy->busy_start)) { |
| 2597 | (uend < bsy->busy_start)) { | ||
| 2598 | cnt--; | 2595 | cnt--; |
| 2599 | } else { | 2596 | } else { |
| 2600 | TRACE_BUSYSEARCH("xfs_alloc_search_busy", | 2597 | TRACE_BUSYSEARCH("xfs_alloc_search_busy", |
| 2601 | "found1", agno, bno, len, n, | 2598 | "found1", agno, bno, len, tp); |
| 2602 | tp); | ||
| 2603 | break; | 2599 | break; |
| 2604 | } | 2600 | } |
| 2605 | } | 2601 | } |
| @@ -2610,15 +2606,12 @@ xfs_alloc_search_busy(xfs_trans_t *tp, | |||
| 2610 | * transaction that freed the block | 2606 | * transaction that freed the block |
| 2611 | */ | 2607 | */ |
| 2612 | if (cnt) { | 2608 | if (cnt) { |
| 2613 | TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp); | 2609 | TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp); |
| 2614 | lsn = bsy->busy_tp->t_commit_lsn; | 2610 | lsn = bsy->busy_tp->t_commit_lsn; |
| 2615 | spin_unlock(&mp->m_perag[agno].pagb_lock); | 2611 | spin_unlock(&mp->m_perag[agno].pagb_lock); |
| 2616 | xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); | 2612 | xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); |
| 2617 | } else { | 2613 | } else { |
| 2618 | TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp); | 2614 | TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp); |
| 2619 | n = -1; | ||
| 2620 | spin_unlock(&mp->m_perag[agno].pagb_lock); | 2615 | spin_unlock(&mp->m_perag[agno].pagb_lock); |
| 2621 | } | 2616 | } |
| 2622 | |||
| 2623 | return n; | ||
| 2624 | } | 2617 | } |
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index e58f321fdae9..36d781ee5fcc 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c | |||
| @@ -2647,14 +2647,6 @@ attr_trusted_capable( | |||
| 2647 | } | 2647 | } |
| 2648 | 2648 | ||
| 2649 | STATIC int | 2649 | STATIC int |
| 2650 | attr_secure_capable( | ||
| 2651 | bhv_vnode_t *vp, | ||
| 2652 | cred_t *cred) | ||
| 2653 | { | ||
| 2654 | return -ENOSECURITY; | ||
| 2655 | } | ||
| 2656 | |||
| 2657 | STATIC int | ||
| 2658 | attr_system_set( | 2650 | attr_system_set( |
| 2659 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) | 2651 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) |
| 2660 | { | 2652 | { |
| @@ -2724,7 +2716,7 @@ struct attrnames attr_secure = { | |||
| 2724 | .attr_get = attr_generic_get, | 2716 | .attr_get = attr_generic_get, |
| 2725 | .attr_set = attr_generic_set, | 2717 | .attr_set = attr_generic_set, |
| 2726 | .attr_remove = attr_generic_remove, | 2718 | .attr_remove = attr_generic_remove, |
| 2727 | .attr_capable = attr_secure_capable, | 2719 | .attr_capable = (attrcapable_t)fs_noerr, |
| 2728 | }; | 2720 | }; |
| 2729 | 2721 | ||
| 2730 | struct attrnames attr_user = { | 2722 | struct attrnames attr_user = { |
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 96ba6aa4ed8c..303d41e4217b 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c | |||
| @@ -166,7 +166,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) | |||
| 166 | 166 | ||
| 167 | if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { | 167 | if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { |
| 168 | if (bytes <= XFS_IFORK_ASIZE(dp)) | 168 | if (bytes <= XFS_IFORK_ASIZE(dp)) |
| 169 | return mp->m_attroffset >> 3; | 169 | return dp->i_d.di_forkoff; |
| 170 | return 0; | 170 | return 0; |
| 171 | } | 171 | } |
| 172 | 172 | ||
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 2def273855a2..eb198c01c35d 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
| @@ -323,13 +323,13 @@ xfs_bmap_trace_pre_update( | |||
| 323 | int whichfork); /* data or attr fork */ | 323 | int whichfork); /* data or attr fork */ |
| 324 | 324 | ||
| 325 | #define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \ | 325 | #define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \ |
| 326 | xfs_bmap_trace_delete(__FUNCTION__,d,ip,i,c,w) | 326 | xfs_bmap_trace_delete(__func__,d,ip,i,c,w) |
| 327 | #define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \ | 327 | #define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \ |
| 328 | xfs_bmap_trace_insert(__FUNCTION__,d,ip,i,c,r1,r2,w) | 328 | xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w) |
| 329 | #define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \ | 329 | #define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \ |
| 330 | xfs_bmap_trace_post_update(__FUNCTION__,d,ip,i,w) | 330 | xfs_bmap_trace_post_update(__func__,d,ip,i,w) |
| 331 | #define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \ | 331 | #define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \ |
| 332 | xfs_bmap_trace_pre_update(__FUNCTION__,d,ip,i,w) | 332 | xfs_bmap_trace_pre_update(__func__,d,ip,i,w) |
| 333 | #else | 333 | #else |
| 334 | #define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) | 334 | #define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) |
| 335 | #define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) | 335 | #define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) |
| @@ -2402,7 +2402,7 @@ xfs_bmap_extsize_align( | |||
| 2402 | 2402 | ||
| 2403 | #define XFS_ALLOC_GAP_UNITS 4 | 2403 | #define XFS_ALLOC_GAP_UNITS 4 |
| 2404 | 2404 | ||
| 2405 | STATIC int | 2405 | STATIC void |
| 2406 | xfs_bmap_adjacent( | 2406 | xfs_bmap_adjacent( |
| 2407 | xfs_bmalloca_t *ap) /* bmap alloc argument struct */ | 2407 | xfs_bmalloca_t *ap) /* bmap alloc argument struct */ |
| 2408 | { | 2408 | { |
| @@ -2548,7 +2548,6 @@ xfs_bmap_adjacent( | |||
| 2548 | ap->rval = gotbno; | 2548 | ap->rval = gotbno; |
| 2549 | } | 2549 | } |
| 2550 | #undef ISVALID | 2550 | #undef ISVALID |
| 2551 | return 0; | ||
| 2552 | } | 2551 | } |
| 2553 | 2552 | ||
| 2554 | STATIC int | 2553 | STATIC int |
| @@ -4154,16 +4153,21 @@ xfs_bmap_compute_maxlevels( | |||
| 4154 | * number of leaf entries, is controlled by the type of di_nextents | 4153 | * number of leaf entries, is controlled by the type of di_nextents |
| 4155 | * (a signed 32-bit number, xfs_extnum_t), or by di_anextents | 4154 | * (a signed 32-bit number, xfs_extnum_t), or by di_anextents |
| 4156 | * (a signed 16-bit number, xfs_aextnum_t). | 4155 | * (a signed 16-bit number, xfs_aextnum_t). |
| 4156 | * | ||
| 4157 | * Note that we can no longer assume that if we are in ATTR1 that | ||
| 4158 | * the fork offset of all the inodes will be (m_attroffset >> 3) | ||
| 4159 | * because we could have mounted with ATTR2 and then mounted back | ||
| 4160 | * with ATTR1, keeping the di_forkoff's fixed but probably at | ||
| 4161 | * various positions. Therefore, for both ATTR1 and ATTR2 | ||
| 4162 | * we have to assume the worst case scenario of a minimum size | ||
| 4163 | * available. | ||
| 4157 | */ | 4164 | */ |
| 4158 | if (whichfork == XFS_DATA_FORK) { | 4165 | if (whichfork == XFS_DATA_FORK) { |
| 4159 | maxleafents = MAXEXTNUM; | 4166 | maxleafents = MAXEXTNUM; |
| 4160 | sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? | 4167 | sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); |
| 4161 | XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset; | ||
| 4162 | } else { | 4168 | } else { |
| 4163 | maxleafents = MAXAEXTNUM; | 4169 | maxleafents = MAXAEXTNUM; |
| 4164 | sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? | 4170 | sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); |
| 4165 | XFS_BMDR_SPACE_CALC(MINABTPTRS) : | ||
| 4166 | mp->m_sb.sb_inodesize - mp->m_attroffset; | ||
| 4167 | } | 4171 | } |
| 4168 | maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); | 4172 | maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); |
| 4169 | minleafrecs = mp->m_bmap_dmnr[0]; | 4173 | minleafrecs = mp->m_bmap_dmnr[0]; |
| @@ -5772,7 +5776,6 @@ xfs_getbmap( | |||
| 5772 | int error; /* return value */ | 5776 | int error; /* return value */ |
| 5773 | __int64_t fixlen; /* length for -1 case */ | 5777 | __int64_t fixlen; /* length for -1 case */ |
| 5774 | int i; /* extent number */ | 5778 | int i; /* extent number */ |
| 5775 | bhv_vnode_t *vp; /* corresponding vnode */ | ||
| 5776 | int lock; /* lock state */ | 5779 | int lock; /* lock state */ |
| 5777 | xfs_bmbt_irec_t *map; /* buffer for user's data */ | 5780 | xfs_bmbt_irec_t *map; /* buffer for user's data */ |
| 5778 | xfs_mount_t *mp; /* file system mount point */ | 5781 | xfs_mount_t *mp; /* file system mount point */ |
| @@ -5789,7 +5792,6 @@ xfs_getbmap( | |||
| 5789 | int bmapi_flags; /* flags for xfs_bmapi */ | 5792 | int bmapi_flags; /* flags for xfs_bmapi */ |
| 5790 | __int32_t oflags; /* getbmapx bmv_oflags field */ | 5793 | __int32_t oflags; /* getbmapx bmv_oflags field */ |
| 5791 | 5794 | ||
| 5792 | vp = XFS_ITOV(ip); | ||
| 5793 | mp = ip->i_mount; | 5795 | mp = ip->i_mount; |
| 5794 | 5796 | ||
| 5795 | whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; | 5797 | whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; |
| @@ -5811,7 +5813,7 @@ xfs_getbmap( | |||
| 5811 | if ((interface & BMV_IF_NO_DMAPI_READ) == 0 && | 5813 | if ((interface & BMV_IF_NO_DMAPI_READ) == 0 && |
| 5812 | DM_EVENT_ENABLED(ip, DM_EVENT_READ) && | 5814 | DM_EVENT_ENABLED(ip, DM_EVENT_READ) && |
| 5813 | whichfork == XFS_DATA_FORK) { | 5815 | whichfork == XFS_DATA_FORK) { |
| 5814 | error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL); | 5816 | error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); |
| 5815 | if (error) | 5817 | if (error) |
| 5816 | return XFS_ERROR(error); | 5818 | return XFS_ERROR(error); |
| 5817 | } | 5819 | } |
| @@ -5869,6 +5871,10 @@ xfs_getbmap( | |||
| 5869 | /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ | 5871 | /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ |
| 5870 | error = xfs_flush_pages(ip, (xfs_off_t)0, | 5872 | error = xfs_flush_pages(ip, (xfs_off_t)0, |
| 5871 | -1, 0, FI_REMAPF); | 5873 | -1, 0, FI_REMAPF); |
| 5874 | if (error) { | ||
| 5875 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | ||
| 5876 | return error; | ||
| 5877 | } | ||
| 5872 | } | 5878 | } |
| 5873 | 5879 | ||
| 5874 | ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); | 5880 | ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); |
| @@ -6162,10 +6168,10 @@ xfs_check_block( | |||
| 6162 | } | 6168 | } |
| 6163 | if (*thispa == *pp) { | 6169 | if (*thispa == *pp) { |
| 6164 | cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", | 6170 | cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", |
| 6165 | __FUNCTION__, j, i, | 6171 | __func__, j, i, |
| 6166 | (unsigned long long)be64_to_cpu(*thispa)); | 6172 | (unsigned long long)be64_to_cpu(*thispa)); |
| 6167 | panic("%s: ptrs are equal in node\n", | 6173 | panic("%s: ptrs are equal in node\n", |
| 6168 | __FUNCTION__); | 6174 | __func__); |
| 6169 | } | 6175 | } |
| 6170 | } | 6176 | } |
| 6171 | } | 6177 | } |
| @@ -6192,7 +6198,7 @@ xfs_bmap_check_leaf_extents( | |||
| 6192 | xfs_mount_t *mp; /* file system mount structure */ | 6198 | xfs_mount_t *mp; /* file system mount structure */ |
| 6193 | __be64 *pp; /* pointer to block address */ | 6199 | __be64 *pp; /* pointer to block address */ |
| 6194 | xfs_bmbt_rec_t *ep; /* pointer to current extent */ | 6200 | xfs_bmbt_rec_t *ep; /* pointer to current extent */ |
| 6195 | xfs_bmbt_rec_t *lastp; /* pointer to previous extent */ | 6201 | xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ |
| 6196 | xfs_bmbt_rec_t *nextp; /* pointer to next extent */ | 6202 | xfs_bmbt_rec_t *nextp; /* pointer to next extent */ |
| 6197 | int bp_release = 0; | 6203 | int bp_release = 0; |
| 6198 | 6204 | ||
| @@ -6262,7 +6268,6 @@ xfs_bmap_check_leaf_extents( | |||
| 6262 | /* | 6268 | /* |
| 6263 | * Loop over all leaf nodes checking that all extents are in the right order. | 6269 | * Loop over all leaf nodes checking that all extents are in the right order. |
| 6264 | */ | 6270 | */ |
| 6265 | lastp = NULL; | ||
| 6266 | for (;;) { | 6271 | for (;;) { |
| 6267 | xfs_fsblock_t nextbno; | 6272 | xfs_fsblock_t nextbno; |
| 6268 | xfs_extnum_t num_recs; | 6273 | xfs_extnum_t num_recs; |
| @@ -6283,18 +6288,16 @@ xfs_bmap_check_leaf_extents( | |||
| 6283 | */ | 6288 | */ |
| 6284 | 6289 | ||
| 6285 | ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); | 6290 | ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); |
| 6291 | if (i) { | ||
| 6292 | xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep); | ||
| 6293 | } | ||
| 6286 | for (j = 1; j < num_recs; j++) { | 6294 | for (j = 1; j < num_recs; j++) { |
| 6287 | nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); | 6295 | nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); |
| 6288 | if (lastp) { | 6296 | xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp); |
| 6289 | xfs_btree_check_rec(XFS_BTNUM_BMAP, | ||
| 6290 | (void *)lastp, (void *)ep); | ||
| 6291 | } | ||
| 6292 | xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, | ||
| 6293 | (void *)(nextp)); | ||
| 6294 | lastp = ep; | ||
| 6295 | ep = nextp; | 6297 | ep = nextp; |
| 6296 | } | 6298 | } |
| 6297 | 6299 | ||
| 6300 | last = *ep; | ||
| 6298 | i += num_recs; | 6301 | i += num_recs; |
| 6299 | if (bp_release) { | 6302 | if (bp_release) { |
| 6300 | bp_release = 0; | 6303 | bp_release = 0; |
| @@ -6325,13 +6328,13 @@ xfs_bmap_check_leaf_extents( | |||
| 6325 | return; | 6328 | return; |
| 6326 | 6329 | ||
| 6327 | error0: | 6330 | error0: |
| 6328 | cmn_err(CE_WARN, "%s: at error0", __FUNCTION__); | 6331 | cmn_err(CE_WARN, "%s: at error0", __func__); |
| 6329 | if (bp_release) | 6332 | if (bp_release) |
| 6330 | xfs_trans_brelse(NULL, bp); | 6333 | xfs_trans_brelse(NULL, bp); |
| 6331 | error_norelse: | 6334 | error_norelse: |
| 6332 | cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", | 6335 | cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", |
| 6333 | __FUNCTION__, i); | 6336 | __func__, i); |
| 6334 | panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__); | 6337 | panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); |
| 6335 | return; | 6338 | return; |
| 6336 | } | 6339 | } |
| 6337 | #endif | 6340 | #endif |
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 87224b7d7984..6ff70cda451c 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h | |||
| @@ -151,7 +151,7 @@ xfs_bmap_trace_exlist( | |||
| 151 | xfs_extnum_t cnt, /* count of entries in list */ | 151 | xfs_extnum_t cnt, /* count of entries in list */ |
| 152 | int whichfork); /* data or attr fork */ | 152 | int whichfork); /* data or attr fork */ |
| 153 | #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ | 153 | #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ |
| 154 | xfs_bmap_trace_exlist(__FUNCTION__,ip,c,w) | 154 | xfs_bmap_trace_exlist(__func__,ip,c,w) |
| 155 | #else | 155 | #else |
| 156 | #define XFS_BMAP_TRACE_EXLIST(ip,c,w) | 156 | #define XFS_BMAP_TRACE_EXLIST(ip,c,w) |
| 157 | #endif | 157 | #endif |
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index bd18987326a3..4f0e849d973e 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c | |||
| @@ -275,21 +275,21 @@ xfs_bmbt_trace_cursor( | |||
| 275 | } | 275 | } |
| 276 | 276 | ||
| 277 | #define XFS_BMBT_TRACE_ARGBI(c,b,i) \ | 277 | #define XFS_BMBT_TRACE_ARGBI(c,b,i) \ |
| 278 | xfs_bmbt_trace_argbi(__FUNCTION__, c, b, i, __LINE__) | 278 | xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__) |
| 279 | #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ | 279 | #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ |
| 280 | xfs_bmbt_trace_argbii(__FUNCTION__, c, b, i, j, __LINE__) | 280 | xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__) |
| 281 | #define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ | 281 | #define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ |
| 282 | xfs_bmbt_trace_argfffi(__FUNCTION__, c, o, b, i, j, __LINE__) | 282 | xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__) |
| 283 | #define XFS_BMBT_TRACE_ARGI(c,i) \ | 283 | #define XFS_BMBT_TRACE_ARGI(c,i) \ |
| 284 | xfs_bmbt_trace_argi(__FUNCTION__, c, i, __LINE__) | 284 | xfs_bmbt_trace_argi(__func__, c, i, __LINE__) |
| 285 | #define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \ | 285 | #define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \ |
| 286 | xfs_bmbt_trace_argifk(__FUNCTION__, c, i, f, s, __LINE__) | 286 | xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__) |
| 287 | #define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ | 287 | #define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ |
| 288 | xfs_bmbt_trace_argifr(__FUNCTION__, c, i, f, r, __LINE__) | 288 | xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__) |
| 289 | #define XFS_BMBT_TRACE_ARGIK(c,i,k) \ | 289 | #define XFS_BMBT_TRACE_ARGIK(c,i,k) \ |
| 290 | xfs_bmbt_trace_argik(__FUNCTION__, c, i, k, __LINE__) | 290 | xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__) |
| 291 | #define XFS_BMBT_TRACE_CURSOR(c,s) \ | 291 | #define XFS_BMBT_TRACE_CURSOR(c,s) \ |
| 292 | xfs_bmbt_trace_cursor(__FUNCTION__, c, s, __LINE__) | 292 | xfs_bmbt_trace_cursor(__func__, c, s, __LINE__) |
| 293 | #else | 293 | #else |
| 294 | #define XFS_BMBT_TRACE_ARGBI(c,b,i) | 294 | #define XFS_BMBT_TRACE_ARGBI(c,b,i) |
| 295 | #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) | 295 | #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) |
| @@ -2027,6 +2027,24 @@ xfs_bmbt_increment( | |||
| 2027 | 2027 | ||
| 2028 | /* | 2028 | /* |
| 2029 | * Insert the current record at the point referenced by cur. | 2029 | * Insert the current record at the point referenced by cur. |
| 2030 | * | ||
| 2031 | * A multi-level split of the tree on insert will invalidate the original | ||
| 2032 | * cursor. It appears, however, that some callers assume that the cursor is | ||
| 2033 | * always valid. Hence if we do a multi-level split we need to revalidate the | ||
| 2034 | * cursor. | ||
| 2035 | * | ||
| 2036 | * When a split occurs, we will see a new cursor returned. Use that as a | ||
| 2037 | * trigger to determine if we need to revalidate the original cursor. If we get | ||
| 2038 | * a split, then use the original irec to lookup up the path of the record we | ||
| 2039 | * just inserted. | ||
| 2040 | * | ||
| 2041 | * Note that the fact that the btree root is in the inode means that we can | ||
| 2042 | * have the level of the tree change without a "split" occurring at the root | ||
| 2043 | * level. What happens is that the root is migrated to an allocated block and | ||
| 2044 | * the inode root is pointed to it. This means a single split can change the | ||
| 2045 | * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence | ||
| 2046 | * the level change should be accounted as a split so as to correctly trigger a | ||
| 2047 | * revalidation of the old cursor. | ||
| 2030 | */ | 2048 | */ |
| 2031 | int /* error */ | 2049 | int /* error */ |
| 2032 | xfs_bmbt_insert( | 2050 | xfs_bmbt_insert( |
| @@ -2039,11 +2057,14 @@ xfs_bmbt_insert( | |||
| 2039 | xfs_fsblock_t nbno; | 2057 | xfs_fsblock_t nbno; |
| 2040 | xfs_btree_cur_t *ncur; | 2058 | xfs_btree_cur_t *ncur; |
| 2041 | xfs_bmbt_rec_t nrec; | 2059 | xfs_bmbt_rec_t nrec; |
| 2060 | xfs_bmbt_irec_t oirec; /* original irec */ | ||
| 2042 | xfs_btree_cur_t *pcur; | 2061 | xfs_btree_cur_t *pcur; |
| 2062 | int splits = 0; | ||
| 2043 | 2063 | ||
| 2044 | XFS_BMBT_TRACE_CURSOR(cur, ENTRY); | 2064 | XFS_BMBT_TRACE_CURSOR(cur, ENTRY); |
| 2045 | level = 0; | 2065 | level = 0; |
| 2046 | nbno = NULLFSBLOCK; | 2066 | nbno = NULLFSBLOCK; |
| 2067 | oirec = cur->bc_rec.b; | ||
| 2047 | xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); | 2068 | xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); |
| 2048 | ncur = NULL; | 2069 | ncur = NULL; |
| 2049 | pcur = cur; | 2070 | pcur = cur; |
| @@ -2052,11 +2073,13 @@ xfs_bmbt_insert( | |||
| 2052 | &i))) { | 2073 | &i))) { |
| 2053 | if (pcur != cur) | 2074 | if (pcur != cur) |
| 2054 | xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); | 2075 | xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); |
| 2055 | XFS_BMBT_TRACE_CURSOR(cur, ERROR); | 2076 | goto error0; |
| 2056 | return error; | ||
| 2057 | } | 2077 | } |
| 2058 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 2078 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
| 2059 | if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { | 2079 | if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { |
| 2080 | /* allocating a new root is effectively a split */ | ||
| 2081 | if (cur->bc_nlevels != pcur->bc_nlevels) | ||
| 2082 | splits++; | ||
| 2060 | cur->bc_nlevels = pcur->bc_nlevels; | 2083 | cur->bc_nlevels = pcur->bc_nlevels; |
| 2061 | cur->bc_private.b.allocated += | 2084 | cur->bc_private.b.allocated += |
| 2062 | pcur->bc_private.b.allocated; | 2085 | pcur->bc_private.b.allocated; |
| @@ -2070,10 +2093,21 @@ xfs_bmbt_insert( | |||
| 2070 | xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); | 2093 | xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); |
| 2071 | } | 2094 | } |
| 2072 | if (ncur) { | 2095 | if (ncur) { |
| 2096 | splits++; | ||
| 2073 | pcur = ncur; | 2097 | pcur = ncur; |
| 2074 | ncur = NULL; | 2098 | ncur = NULL; |
| 2075 | } | 2099 | } |
| 2076 | } while (nbno != NULLFSBLOCK); | 2100 | } while (nbno != NULLFSBLOCK); |
| 2101 | |||
| 2102 | if (splits > 1) { | ||
| 2103 | /* revalidate the old cursor as we had a multi-level split */ | ||
| 2104 | error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff, | ||
| 2105 | oirec.br_startblock, oirec.br_blockcount, &i); | ||
| 2106 | if (error) | ||
| 2107 | goto error0; | ||
| 2108 | ASSERT(i == 1); | ||
| 2109 | } | ||
| 2110 | |||
| 2077 | XFS_BMBT_TRACE_CURSOR(cur, EXIT); | 2111 | XFS_BMBT_TRACE_CURSOR(cur, EXIT); |
| 2078 | *stat = i; | 2112 | *stat = i; |
| 2079 | return 0; | 2113 | return 0; |
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 63debd147eb5..53a71c62025d 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c | |||
| @@ -645,7 +645,12 @@ xfs_buf_item_push( | |||
| 645 | bp = bip->bli_buf; | 645 | bp = bip->bli_buf; |
| 646 | 646 | ||
| 647 | if (XFS_BUF_ISDELAYWRITE(bp)) { | 647 | if (XFS_BUF_ISDELAYWRITE(bp)) { |
| 648 | xfs_bawrite(bip->bli_item.li_mountp, bp); | 648 | int error; |
| 649 | error = xfs_bawrite(bip->bli_item.li_mountp, bp); | ||
| 650 | if (error) | ||
| 651 | xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, | ||
| 652 | "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", | ||
| 653 | error, bip, bp); | ||
| 649 | } else { | 654 | } else { |
| 650 | xfs_buf_relse(bp); | 655 | xfs_buf_relse(bp); |
| 651 | } | 656 | } |
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index e92e73f0e6af..7cb26529766b 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c | |||
| @@ -44,6 +44,7 @@ | |||
| 44 | #include "xfs_error.h" | 44 | #include "xfs_error.h" |
| 45 | #include "xfs_vnodeops.h" | 45 | #include "xfs_vnodeops.h" |
| 46 | 46 | ||
| 47 | struct xfs_name xfs_name_dotdot = {"..", 2}; | ||
| 47 | 48 | ||
| 48 | void | 49 | void |
| 49 | xfs_dir_mount( | 50 | xfs_dir_mount( |
| @@ -146,8 +147,7 @@ int | |||
| 146 | xfs_dir_createname( | 147 | xfs_dir_createname( |
| 147 | xfs_trans_t *tp, | 148 | xfs_trans_t *tp, |
| 148 | xfs_inode_t *dp, | 149 | xfs_inode_t *dp, |
| 149 | char *name, | 150 | struct xfs_name *name, |
| 150 | int namelen, | ||
| 151 | xfs_ino_t inum, /* new entry inode number */ | 151 | xfs_ino_t inum, /* new entry inode number */ |
| 152 | xfs_fsblock_t *first, /* bmap's firstblock */ | 152 | xfs_fsblock_t *first, /* bmap's firstblock */ |
| 153 | xfs_bmap_free_t *flist, /* bmap's freeblock list */ | 153 | xfs_bmap_free_t *flist, /* bmap's freeblock list */ |
| @@ -162,9 +162,9 @@ xfs_dir_createname( | |||
| 162 | return rval; | 162 | return rval; |
| 163 | XFS_STATS_INC(xs_dir_create); | 163 | XFS_STATS_INC(xs_dir_create); |
| 164 | 164 | ||
| 165 | args.name = name; | 165 | args.name = name->name; |
| 166 | args.namelen = namelen; | 166 | args.namelen = name->len; |
| 167 | args.hashval = xfs_da_hashname(name, namelen); | 167 | args.hashval = xfs_da_hashname(name->name, name->len); |
| 168 | args.inumber = inum; | 168 | args.inumber = inum; |
| 169 | args.dp = dp; | 169 | args.dp = dp; |
| 170 | args.firstblock = first; | 170 | args.firstblock = first; |
| @@ -197,8 +197,7 @@ int | |||
| 197 | xfs_dir_lookup( | 197 | xfs_dir_lookup( |
| 198 | xfs_trans_t *tp, | 198 | xfs_trans_t *tp, |
| 199 | xfs_inode_t *dp, | 199 | xfs_inode_t *dp, |
| 200 | char *name, | 200 | struct xfs_name *name, |
| 201 | int namelen, | ||
| 202 | xfs_ino_t *inum) /* out: inode number */ | 201 | xfs_ino_t *inum) /* out: inode number */ |
| 203 | { | 202 | { |
| 204 | xfs_da_args_t args; | 203 | xfs_da_args_t args; |
| @@ -207,18 +206,14 @@ xfs_dir_lookup( | |||
| 207 | 206 | ||
| 208 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); | 207 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); |
| 209 | XFS_STATS_INC(xs_dir_lookup); | 208 | XFS_STATS_INC(xs_dir_lookup); |
| 209 | memset(&args, 0, sizeof(xfs_da_args_t)); | ||
| 210 | 210 | ||
| 211 | args.name = name; | 211 | args.name = name->name; |
| 212 | args.namelen = namelen; | 212 | args.namelen = name->len; |
| 213 | args.hashval = xfs_da_hashname(name, namelen); | 213 | args.hashval = xfs_da_hashname(name->name, name->len); |
| 214 | args.inumber = 0; | ||
| 215 | args.dp = dp; | 214 | args.dp = dp; |
| 216 | args.firstblock = NULL; | ||
| 217 | args.flist = NULL; | ||
| 218 | args.total = 0; | ||
| 219 | args.whichfork = XFS_DATA_FORK; | 215 | args.whichfork = XFS_DATA_FORK; |
| 220 | args.trans = tp; | 216 | args.trans = tp; |
| 221 | args.justcheck = args.addname = 0; | ||
| 222 | args.oknoent = 1; | 217 | args.oknoent = 1; |
| 223 | 218 | ||
| 224 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) | 219 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) |
| @@ -247,8 +242,7 @@ int | |||
| 247 | xfs_dir_removename( | 242 | xfs_dir_removename( |
| 248 | xfs_trans_t *tp, | 243 | xfs_trans_t *tp, |
| 249 | xfs_inode_t *dp, | 244 | xfs_inode_t *dp, |
| 250 | char *name, | 245 | struct xfs_name *name, |
| 251 | int namelen, | ||
| 252 | xfs_ino_t ino, | 246 | xfs_ino_t ino, |
| 253 | xfs_fsblock_t *first, /* bmap's firstblock */ | 247 | xfs_fsblock_t *first, /* bmap's firstblock */ |
| 254 | xfs_bmap_free_t *flist, /* bmap's freeblock list */ | 248 | xfs_bmap_free_t *flist, /* bmap's freeblock list */ |
| @@ -261,9 +255,9 @@ xfs_dir_removename( | |||
| 261 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); | 255 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); |
| 262 | XFS_STATS_INC(xs_dir_remove); | 256 | XFS_STATS_INC(xs_dir_remove); |
| 263 | 257 | ||
| 264 | args.name = name; | 258 | args.name = name->name; |
| 265 | args.namelen = namelen; | 259 | args.namelen = name->len; |
| 266 | args.hashval = xfs_da_hashname(name, namelen); | 260 | args.hashval = xfs_da_hashname(name->name, name->len); |
| 267 | args.inumber = ino; | 261 | args.inumber = ino; |
| 268 | args.dp = dp; | 262 | args.dp = dp; |
| 269 | args.firstblock = first; | 263 | args.firstblock = first; |
| @@ -329,8 +323,7 @@ int | |||
| 329 | xfs_dir_replace( | 323 | xfs_dir_replace( |
| 330 | xfs_trans_t *tp, | 324 | xfs_trans_t *tp, |
| 331 | xfs_inode_t *dp, | 325 | xfs_inode_t *dp, |
| 332 | char *name, /* name of entry to replace */ | 326 | struct xfs_name *name, /* name of entry to replace */ |
| 333 | int namelen, | ||
| 334 | xfs_ino_t inum, /* new inode number */ | 327 | xfs_ino_t inum, /* new inode number */ |
| 335 | xfs_fsblock_t *first, /* bmap's firstblock */ | 328 | xfs_fsblock_t *first, /* bmap's firstblock */ |
| 336 | xfs_bmap_free_t *flist, /* bmap's freeblock list */ | 329 | xfs_bmap_free_t *flist, /* bmap's freeblock list */ |
| @@ -345,9 +338,9 @@ xfs_dir_replace( | |||
| 345 | if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) | 338 | if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) |
| 346 | return rval; | 339 | return rval; |
| 347 | 340 | ||
| 348 | args.name = name; | 341 | args.name = name->name; |
| 349 | args.namelen = namelen; | 342 | args.namelen = name->len; |
| 350 | args.hashval = xfs_da_hashname(name, namelen); | 343 | args.hashval = xfs_da_hashname(name->name, name->len); |
| 351 | args.inumber = inum; | 344 | args.inumber = inum; |
| 352 | args.dp = dp; | 345 | args.dp = dp; |
| 353 | args.firstblock = first; | 346 | args.firstblock = first; |
| @@ -374,28 +367,29 @@ xfs_dir_replace( | |||
| 374 | 367 | ||
| 375 | /* | 368 | /* |
| 376 | * See if this entry can be added to the directory without allocating space. | 369 | * See if this entry can be added to the directory without allocating space. |
| 370 | * First checks that the caller couldn't reserve enough space (resblks = 0). | ||
| 377 | */ | 371 | */ |
| 378 | int | 372 | int |
| 379 | xfs_dir_canenter( | 373 | xfs_dir_canenter( |
| 380 | xfs_trans_t *tp, | 374 | xfs_trans_t *tp, |
| 381 | xfs_inode_t *dp, | 375 | xfs_inode_t *dp, |
| 382 | char *name, /* name of entry to add */ | 376 | struct xfs_name *name, /* name of entry to add */ |
| 383 | int namelen) | 377 | uint resblks) |
| 384 | { | 378 | { |
| 385 | xfs_da_args_t args; | 379 | xfs_da_args_t args; |
| 386 | int rval; | 380 | int rval; |
| 387 | int v; /* type-checking value */ | 381 | int v; /* type-checking value */ |
| 388 | 382 | ||
| 383 | if (resblks) | ||
| 384 | return 0; | ||
| 385 | |||
| 389 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); | 386 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); |
| 387 | memset(&args, 0, sizeof(xfs_da_args_t)); | ||
| 390 | 388 | ||
| 391 | args.name = name; | 389 | args.name = name->name; |
| 392 | args.namelen = namelen; | 390 | args.namelen = name->len; |
| 393 | args.hashval = xfs_da_hashname(name, namelen); | 391 | args.hashval = xfs_da_hashname(name->name, name->len); |
| 394 | args.inumber = 0; | ||
| 395 | args.dp = dp; | 392 | args.dp = dp; |
| 396 | args.firstblock = NULL; | ||
| 397 | args.flist = NULL; | ||
| 398 | args.total = 0; | ||
| 399 | args.whichfork = XFS_DATA_FORK; | 393 | args.whichfork = XFS_DATA_FORK; |
| 400 | args.trans = tp; | 394 | args.trans = tp; |
| 401 | args.justcheck = args.addname = args.oknoent = 1; | 395 | args.justcheck = args.addname = args.oknoent = 1; |
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index b265197e74cf..6392f939029f 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h | |||
| @@ -59,6 +59,8 @@ typedef __uint32_t xfs_dir2_db_t; | |||
| 59 | */ | 59 | */ |
| 60 | typedef xfs_off_t xfs_dir2_off_t; | 60 | typedef xfs_off_t xfs_dir2_off_t; |
| 61 | 61 | ||
| 62 | extern struct xfs_name xfs_name_dotdot; | ||
| 63 | |||
| 62 | /* | 64 | /* |
| 63 | * Generic directory interface routines | 65 | * Generic directory interface routines |
| 64 | */ | 66 | */ |
| @@ -68,21 +70,21 @@ extern int xfs_dir_isempty(struct xfs_inode *dp); | |||
| 68 | extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, | 70 | extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, |
| 69 | struct xfs_inode *pdp); | 71 | struct xfs_inode *pdp); |
| 70 | extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, | 72 | extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, |
| 71 | char *name, int namelen, xfs_ino_t inum, | 73 | struct xfs_name *name, xfs_ino_t inum, |
| 72 | xfs_fsblock_t *first, | 74 | xfs_fsblock_t *first, |
| 73 | struct xfs_bmap_free *flist, xfs_extlen_t tot); | 75 | struct xfs_bmap_free *flist, xfs_extlen_t tot); |
| 74 | extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, | 76 | extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, |
| 75 | char *name, int namelen, xfs_ino_t *inum); | 77 | struct xfs_name *name, xfs_ino_t *inum); |
| 76 | extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, | 78 | extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, |
| 77 | char *name, int namelen, xfs_ino_t ino, | 79 | struct xfs_name *name, xfs_ino_t ino, |
| 78 | xfs_fsblock_t *first, | 80 | xfs_fsblock_t *first, |
| 79 | struct xfs_bmap_free *flist, xfs_extlen_t tot); | 81 | struct xfs_bmap_free *flist, xfs_extlen_t tot); |
| 80 | extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, | 82 | extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, |
| 81 | char *name, int namelen, xfs_ino_t inum, | 83 | struct xfs_name *name, xfs_ino_t inum, |
| 82 | xfs_fsblock_t *first, | 84 | xfs_fsblock_t *first, |
| 83 | struct xfs_bmap_free *flist, xfs_extlen_t tot); | 85 | struct xfs_bmap_free *flist, xfs_extlen_t tot); |
| 84 | extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, | 86 | extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, |
| 85 | char *name, int namelen); | 87 | struct xfs_name *name, uint resblks); |
| 86 | extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); | 88 | extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); |
| 87 | 89 | ||
| 88 | /* | 90 | /* |
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index eb03eab5ca52..3f3785b10804 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c | |||
| @@ -73,7 +73,7 @@ xfs_filestreams_trace( | |||
| 73 | #define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) | 73 | #define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) |
| 74 | #define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) | 74 | #define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) |
| 75 | #define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ | 75 | #define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ |
| 76 | xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \ | 76 | xfs_filestreams_trace(mp, t, __func__, __LINE__, \ |
| 77 | (__psunsigned_t)a0, (__psunsigned_t)a1, \ | 77 | (__psunsigned_t)a0, (__psunsigned_t)a1, \ |
| 78 | (__psunsigned_t)a2, (__psunsigned_t)a3, \ | 78 | (__psunsigned_t)a2, (__psunsigned_t)a3, \ |
| 79 | (__psunsigned_t)a4, (__psunsigned_t)a5) | 79 | (__psunsigned_t)a4, (__psunsigned_t)a5) |
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5a146cb22980..a64dfbd565a5 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c | |||
| @@ -107,6 +107,16 @@ xfs_ialloc_log_di( | |||
| 107 | /* | 107 | /* |
| 108 | * Allocation group level functions. | 108 | * Allocation group level functions. |
| 109 | */ | 109 | */ |
| 110 | static inline int | ||
| 111 | xfs_ialloc_cluster_alignment( | ||
| 112 | xfs_alloc_arg_t *args) | ||
| 113 | { | ||
| 114 | if (xfs_sb_version_hasalign(&args->mp->m_sb) && | ||
| 115 | args->mp->m_sb.sb_inoalignmt >= | ||
| 116 | XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) | ||
| 117 | return args->mp->m_sb.sb_inoalignmt; | ||
| 118 | return 1; | ||
| 119 | } | ||
| 110 | 120 | ||
| 111 | /* | 121 | /* |
| 112 | * Allocate new inodes in the allocation group specified by agbp. | 122 | * Allocate new inodes in the allocation group specified by agbp. |
| @@ -167,10 +177,24 @@ xfs_ialloc_ag_alloc( | |||
| 167 | args.mod = args.total = args.wasdel = args.isfl = | 177 | args.mod = args.total = args.wasdel = args.isfl = |
| 168 | args.userdata = args.minalignslop = 0; | 178 | args.userdata = args.minalignslop = 0; |
| 169 | args.prod = 1; | 179 | args.prod = 1; |
| 170 | args.alignment = 1; | 180 | |
| 171 | /* | 181 | /* |
| 172 | * Allow space for the inode btree to split. | 182 | * We need to take into account alignment here to ensure that |
| 183 | * we don't modify the free list if we fail to have an exact | ||
| 184 | * block. If we don't have an exact match, and every oher | ||
| 185 | * attempt allocation attempt fails, we'll end up cancelling | ||
| 186 | * a dirty transaction and shutting down. | ||
| 187 | * | ||
| 188 | * For an exact allocation, alignment must be 1, | ||
| 189 | * however we need to take cluster alignment into account when | ||
| 190 | * fixing up the freelist. Use the minalignslop field to | ||
| 191 | * indicate that extra blocks might be required for alignment, | ||
| 192 | * but not to use them in the actual exact allocation. | ||
| 173 | */ | 193 | */ |
| 194 | args.alignment = 1; | ||
| 195 | args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; | ||
| 196 | |||
| 197 | /* Allow space for the inode btree to split. */ | ||
| 174 | args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; | 198 | args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; |
| 175 | if ((error = xfs_alloc_vextent(&args))) | 199 | if ((error = xfs_alloc_vextent(&args))) |
| 176 | return error; | 200 | return error; |
| @@ -191,13 +215,8 @@ xfs_ialloc_ag_alloc( | |||
| 191 | ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); | 215 | ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); |
| 192 | args.alignment = args.mp->m_dalign; | 216 | args.alignment = args.mp->m_dalign; |
| 193 | isaligned = 1; | 217 | isaligned = 1; |
| 194 | } else if (xfs_sb_version_hasalign(&args.mp->m_sb) && | 218 | } else |
| 195 | args.mp->m_sb.sb_inoalignmt >= | 219 | args.alignment = xfs_ialloc_cluster_alignment(&args); |
| 196 | XFS_B_TO_FSBT(args.mp, | ||
| 197 | XFS_INODE_CLUSTER_SIZE(args.mp))) | ||
| 198 | args.alignment = args.mp->m_sb.sb_inoalignmt; | ||
| 199 | else | ||
| 200 | args.alignment = 1; | ||
| 201 | /* | 220 | /* |
| 202 | * Need to figure out where to allocate the inode blocks. | 221 | * Need to figure out where to allocate the inode blocks. |
| 203 | * Ideally they should be spaced out through the a.g. | 222 | * Ideally they should be spaced out through the a.g. |
| @@ -230,12 +249,7 @@ xfs_ialloc_ag_alloc( | |||
| 230 | args.agbno = be32_to_cpu(agi->agi_root); | 249 | args.agbno = be32_to_cpu(agi->agi_root); |
| 231 | args.fsbno = XFS_AGB_TO_FSB(args.mp, | 250 | args.fsbno = XFS_AGB_TO_FSB(args.mp, |
| 232 | be32_to_cpu(agi->agi_seqno), args.agbno); | 251 | be32_to_cpu(agi->agi_seqno), args.agbno); |
| 233 | if (xfs_sb_version_hasalign(&args.mp->m_sb) && | 252 | args.alignment = xfs_ialloc_cluster_alignment(&args); |
| 234 | args.mp->m_sb.sb_inoalignmt >= | ||
| 235 | XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) | ||
| 236 | args.alignment = args.mp->m_sb.sb_inoalignmt; | ||
| 237 | else | ||
| 238 | args.alignment = 1; | ||
| 239 | if ((error = xfs_alloc_vextent(&args))) | 253 | if ((error = xfs_alloc_vextent(&args))) |
| 240 | return error; | 254 | return error; |
| 241 | } | 255 | } |
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 8e09b71f4104..e657c5128460 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c | |||
| @@ -78,7 +78,6 @@ xfs_iget_core( | |||
| 78 | xfs_inode_t *ip; | 78 | xfs_inode_t *ip; |
| 79 | xfs_inode_t *iq; | 79 | xfs_inode_t *iq; |
| 80 | int error; | 80 | int error; |
| 81 | xfs_icluster_t *icl, *new_icl = NULL; | ||
| 82 | unsigned long first_index, mask; | 81 | unsigned long first_index, mask; |
| 83 | xfs_perag_t *pag; | 82 | xfs_perag_t *pag; |
| 84 | xfs_agino_t agino; | 83 | xfs_agino_t agino; |
| @@ -229,11 +228,9 @@ finish_inode: | |||
| 229 | } | 228 | } |
| 230 | 229 | ||
| 231 | /* | 230 | /* |
| 232 | * This is a bit messy - we preallocate everything we _might_ | 231 | * Preload the radix tree so we can insert safely under the |
| 233 | * need before we pick up the ici lock. That way we don't have to | 232 | * write spinlock. |
| 234 | * juggle locks and go all the way back to the start. | ||
| 235 | */ | 233 | */ |
| 236 | new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP); | ||
| 237 | if (radix_tree_preload(GFP_KERNEL)) { | 234 | if (radix_tree_preload(GFP_KERNEL)) { |
| 238 | xfs_idestroy(ip); | 235 | xfs_idestroy(ip); |
| 239 | delay(1); | 236 | delay(1); |
| @@ -242,17 +239,6 @@ finish_inode: | |||
| 242 | mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); | 239 | mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); |
| 243 | first_index = agino & mask; | 240 | first_index = agino & mask; |
| 244 | write_lock(&pag->pag_ici_lock); | 241 | write_lock(&pag->pag_ici_lock); |
| 245 | |||
| 246 | /* | ||
| 247 | * Find the cluster if it exists | ||
| 248 | */ | ||
| 249 | icl = NULL; | ||
| 250 | if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq, | ||
| 251 | first_index, 1)) { | ||
| 252 | if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index) | ||
| 253 | icl = iq->i_cluster; | ||
| 254 | } | ||
| 255 | |||
| 256 | /* | 242 | /* |
| 257 | * insert the new inode | 243 | * insert the new inode |
| 258 | */ | 244 | */ |
| @@ -267,30 +253,13 @@ finish_inode: | |||
| 267 | } | 253 | } |
| 268 | 254 | ||
| 269 | /* | 255 | /* |
| 270 | * These values _must_ be set before releasing ihlock! | 256 | * These values _must_ be set before releasing the radix tree lock! |
| 271 | */ | 257 | */ |
| 272 | ip->i_udquot = ip->i_gdquot = NULL; | 258 | ip->i_udquot = ip->i_gdquot = NULL; |
| 273 | xfs_iflags_set(ip, XFS_INEW); | 259 | xfs_iflags_set(ip, XFS_INEW); |
| 274 | 260 | ||
| 275 | ASSERT(ip->i_cluster == NULL); | ||
| 276 | |||
| 277 | if (!icl) { | ||
| 278 | spin_lock_init(&new_icl->icl_lock); | ||
| 279 | INIT_HLIST_HEAD(&new_icl->icl_inodes); | ||
| 280 | icl = new_icl; | ||
| 281 | new_icl = NULL; | ||
| 282 | } else { | ||
| 283 | ASSERT(!hlist_empty(&icl->icl_inodes)); | ||
| 284 | } | ||
| 285 | spin_lock(&icl->icl_lock); | ||
| 286 | hlist_add_head(&ip->i_cnode, &icl->icl_inodes); | ||
| 287 | ip->i_cluster = icl; | ||
| 288 | spin_unlock(&icl->icl_lock); | ||
| 289 | |||
| 290 | write_unlock(&pag->pag_ici_lock); | 261 | write_unlock(&pag->pag_ici_lock); |
| 291 | radix_tree_preload_end(); | 262 | radix_tree_preload_end(); |
| 292 | if (new_icl) | ||
| 293 | kmem_zone_free(xfs_icluster_zone, new_icl); | ||
| 294 | 263 | ||
| 295 | /* | 264 | /* |
| 296 | * Link ip to its mount and thread it on the mount's inode list. | 265 | * Link ip to its mount and thread it on the mount's inode list. |
| @@ -529,18 +498,6 @@ xfs_iextract( | |||
| 529 | xfs_put_perag(mp, pag); | 498 | xfs_put_perag(mp, pag); |
| 530 | 499 | ||
| 531 | /* | 500 | /* |
| 532 | * Remove from cluster list | ||
| 533 | */ | ||
| 534 | mp = ip->i_mount; | ||
| 535 | spin_lock(&ip->i_cluster->icl_lock); | ||
| 536 | hlist_del(&ip->i_cnode); | ||
| 537 | spin_unlock(&ip->i_cluster->icl_lock); | ||
| 538 | |||
| 539 | /* was last inode in cluster? */ | ||
| 540 | if (hlist_empty(&ip->i_cluster->icl_inodes)) | ||
| 541 | kmem_zone_free(xfs_icluster_zone, ip->i_cluster); | ||
| 542 | |||
| 543 | /* | ||
| 544 | * Remove from mount's inode list. | 501 | * Remove from mount's inode list. |
| 545 | */ | 502 | */ |
| 546 | XFS_MOUNT_ILOCK(mp); | 503 | XFS_MOUNT_ILOCK(mp); |
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f43a6e01d68f..ca12acb90394 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
| @@ -55,7 +55,6 @@ | |||
| 55 | 55 | ||
| 56 | kmem_zone_t *xfs_ifork_zone; | 56 | kmem_zone_t *xfs_ifork_zone; |
| 57 | kmem_zone_t *xfs_inode_zone; | 57 | kmem_zone_t *xfs_inode_zone; |
| 58 | kmem_zone_t *xfs_icluster_zone; | ||
| 59 | 58 | ||
| 60 | /* | 59 | /* |
| 61 | * Used in xfs_itruncate(). This is the maximum number of extents | 60 | * Used in xfs_itruncate(). This is the maximum number of extents |
| @@ -126,6 +125,90 @@ xfs_inobp_check( | |||
| 126 | #endif | 125 | #endif |
| 127 | 126 | ||
| 128 | /* | 127 | /* |
| 128 | * Find the buffer associated with the given inode map | ||
| 129 | * We do basic validation checks on the buffer once it has been | ||
| 130 | * retrieved from disk. | ||
| 131 | */ | ||
| 132 | STATIC int | ||
| 133 | xfs_imap_to_bp( | ||
| 134 | xfs_mount_t *mp, | ||
| 135 | xfs_trans_t *tp, | ||
| 136 | xfs_imap_t *imap, | ||
| 137 | xfs_buf_t **bpp, | ||
| 138 | uint buf_flags, | ||
| 139 | uint imap_flags) | ||
| 140 | { | ||
| 141 | int error; | ||
| 142 | int i; | ||
| 143 | int ni; | ||
| 144 | xfs_buf_t *bp; | ||
| 145 | |||
| 146 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, | ||
| 147 | (int)imap->im_len, buf_flags, &bp); | ||
| 148 | if (error) { | ||
| 149 | if (error != EAGAIN) { | ||
| 150 | cmn_err(CE_WARN, | ||
| 151 | "xfs_imap_to_bp: xfs_trans_read_buf()returned " | ||
| 152 | "an error %d on %s. Returning error.", | ||
| 153 | error, mp->m_fsname); | ||
| 154 | } else { | ||
| 155 | ASSERT(buf_flags & XFS_BUF_TRYLOCK); | ||
| 156 | } | ||
| 157 | return error; | ||
| 158 | } | ||
| 159 | |||
| 160 | /* | ||
| 161 | * Validate the magic number and version of every inode in the buffer | ||
| 162 | * (if DEBUG kernel) or the first inode in the buffer, otherwise. | ||
| 163 | */ | ||
| 164 | #ifdef DEBUG | ||
| 165 | ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; | ||
| 166 | #else /* usual case */ | ||
| 167 | ni = 1; | ||
| 168 | #endif | ||
| 169 | |||
| 170 | for (i = 0; i < ni; i++) { | ||
| 171 | int di_ok; | ||
| 172 | xfs_dinode_t *dip; | ||
| 173 | |||
| 174 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, | ||
| 175 | (i << mp->m_sb.sb_inodelog)); | ||
| 176 | di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && | ||
| 177 | XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); | ||
| 178 | if (unlikely(XFS_TEST_ERROR(!di_ok, mp, | ||
| 179 | XFS_ERRTAG_ITOBP_INOTOBP, | ||
| 180 | XFS_RANDOM_ITOBP_INOTOBP))) { | ||
| 181 | if (imap_flags & XFS_IMAP_BULKSTAT) { | ||
| 182 | xfs_trans_brelse(tp, bp); | ||
| 183 | return XFS_ERROR(EINVAL); | ||
| 184 | } | ||
| 185 | XFS_CORRUPTION_ERROR("xfs_imap_to_bp", | ||
| 186 | XFS_ERRLEVEL_HIGH, mp, dip); | ||
| 187 | #ifdef DEBUG | ||
| 188 | cmn_err(CE_PANIC, | ||
| 189 | "Device %s - bad inode magic/vsn " | ||
| 190 | "daddr %lld #%d (magic=%x)", | ||
| 191 | XFS_BUFTARG_NAME(mp->m_ddev_targp), | ||
| 192 | (unsigned long long)imap->im_blkno, i, | ||
| 193 | be16_to_cpu(dip->di_core.di_magic)); | ||
| 194 | #endif | ||
| 195 | xfs_trans_brelse(tp, bp); | ||
| 196 | return XFS_ERROR(EFSCORRUPTED); | ||
| 197 | } | ||
| 198 | } | ||
| 199 | |||
| 200 | xfs_inobp_check(mp, bp); | ||
| 201 | |||
| 202 | /* | ||
| 203 | * Mark the buffer as an inode buffer now that it looks good | ||
| 204 | */ | ||
| 205 | XFS_BUF_SET_VTYPE(bp, B_FS_INO); | ||
| 206 | |||
| 207 | *bpp = bp; | ||
| 208 | return 0; | ||
| 209 | } | ||
| 210 | |||
| 211 | /* | ||
| 129 | * This routine is called to map an inode number within a file | 212 | * This routine is called to map an inode number within a file |
| 130 | * system to the buffer containing the on-disk version of the | 213 | * system to the buffer containing the on-disk version of the |
| 131 | * inode. It returns a pointer to the buffer containing the | 214 | * inode. It returns a pointer to the buffer containing the |
| @@ -147,72 +230,19 @@ xfs_inotobp( | |||
| 147 | xfs_buf_t **bpp, | 230 | xfs_buf_t **bpp, |
| 148 | int *offset) | 231 | int *offset) |
| 149 | { | 232 | { |
| 150 | int di_ok; | ||
| 151 | xfs_imap_t imap; | 233 | xfs_imap_t imap; |
| 152 | xfs_buf_t *bp; | 234 | xfs_buf_t *bp; |
| 153 | int error; | 235 | int error; |
| 154 | xfs_dinode_t *dip; | ||
| 155 | 236 | ||
| 156 | /* | ||
| 157 | * Call the space management code to find the location of the | ||
| 158 | * inode on disk. | ||
| 159 | */ | ||
| 160 | imap.im_blkno = 0; | 237 | imap.im_blkno = 0; |
| 161 | error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); | 238 | error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); |
| 162 | if (error != 0) { | 239 | if (error) |
| 163 | cmn_err(CE_WARN, | ||
| 164 | "xfs_inotobp: xfs_imap() returned an " | ||
| 165 | "error %d on %s. Returning error.", error, mp->m_fsname); | ||
| 166 | return error; | 240 | return error; |
| 167 | } | ||
| 168 | 241 | ||
| 169 | /* | 242 | error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0); |
| 170 | * If the inode number maps to a block outside the bounds of the | 243 | if (error) |
| 171 | * file system then return NULL rather than calling read_buf | ||
| 172 | * and panicing when we get an error from the driver. | ||
| 173 | */ | ||
| 174 | if ((imap.im_blkno + imap.im_len) > | ||
| 175 | XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { | ||
| 176 | cmn_err(CE_WARN, | ||
| 177 | "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds " | ||
| 178 | "of the file system %s. Returning EINVAL.", | ||
| 179 | (unsigned long long)imap.im_blkno, | ||
| 180 | imap.im_len, mp->m_fsname); | ||
| 181 | return XFS_ERROR(EINVAL); | ||
| 182 | } | ||
| 183 | |||
| 184 | /* | ||
| 185 | * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will | ||
| 186 | * default to just a read_buf() call. | ||
| 187 | */ | ||
| 188 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, | ||
| 189 | (int)imap.im_len, XFS_BUF_LOCK, &bp); | ||
| 190 | |||
| 191 | if (error) { | ||
| 192 | cmn_err(CE_WARN, | ||
| 193 | "xfs_inotobp: xfs_trans_read_buf() returned an " | ||
| 194 | "error %d on %s. Returning error.", error, mp->m_fsname); | ||
| 195 | return error; | 244 | return error; |
| 196 | } | ||
| 197 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0); | ||
| 198 | di_ok = | ||
| 199 | be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && | ||
| 200 | XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); | ||
| 201 | if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, | ||
| 202 | XFS_RANDOM_ITOBP_INOTOBP))) { | ||
| 203 | XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip); | ||
| 204 | xfs_trans_brelse(tp, bp); | ||
| 205 | cmn_err(CE_WARN, | ||
| 206 | "xfs_inotobp: XFS_TEST_ERROR() returned an " | ||
| 207 | "error on %s. Returning EFSCORRUPTED.", mp->m_fsname); | ||
| 208 | return XFS_ERROR(EFSCORRUPTED); | ||
| 209 | } | ||
| 210 | 245 | ||
| 211 | xfs_inobp_check(mp, bp); | ||
| 212 | |||
| 213 | /* | ||
| 214 | * Set *dipp to point to the on-disk inode in the buffer. | ||
| 215 | */ | ||
| 216 | *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); | 246 | *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); |
| 217 | *bpp = bp; | 247 | *bpp = bp; |
| 218 | *offset = imap.im_boffset; | 248 | *offset = imap.im_boffset; |
| @@ -248,46 +278,21 @@ xfs_itobp( | |||
| 248 | xfs_dinode_t **dipp, | 278 | xfs_dinode_t **dipp, |
| 249 | xfs_buf_t **bpp, | 279 | xfs_buf_t **bpp, |
| 250 | xfs_daddr_t bno, | 280 | xfs_daddr_t bno, |
| 251 | uint imap_flags) | 281 | uint imap_flags, |
| 282 | uint buf_flags) | ||
| 252 | { | 283 | { |
| 253 | xfs_imap_t imap; | 284 | xfs_imap_t imap; |
| 254 | xfs_buf_t *bp; | 285 | xfs_buf_t *bp; |
| 255 | int error; | 286 | int error; |
| 256 | int i; | ||
| 257 | int ni; | ||
| 258 | 287 | ||
| 259 | if (ip->i_blkno == (xfs_daddr_t)0) { | 288 | if (ip->i_blkno == (xfs_daddr_t)0) { |
| 260 | /* | ||
| 261 | * Call the space management code to find the location of the | ||
| 262 | * inode on disk. | ||
| 263 | */ | ||
| 264 | imap.im_blkno = bno; | 289 | imap.im_blkno = bno; |
| 265 | if ((error = xfs_imap(mp, tp, ip->i_ino, &imap, | 290 | error = xfs_imap(mp, tp, ip->i_ino, &imap, |
| 266 | XFS_IMAP_LOOKUP | imap_flags))) | 291 | XFS_IMAP_LOOKUP | imap_flags); |
| 292 | if (error) | ||
| 267 | return error; | 293 | return error; |
| 268 | 294 | ||
| 269 | /* | 295 | /* |
| 270 | * If the inode number maps to a block outside the bounds | ||
| 271 | * of the file system then return NULL rather than calling | ||
| 272 | * read_buf and panicing when we get an error from the | ||
| 273 | * driver. | ||
| 274 | */ | ||
| 275 | if ((imap.im_blkno + imap.im_len) > | ||
| 276 | XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { | ||
| 277 | #ifdef DEBUG | ||
| 278 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " | ||
| 279 | "(imap.im_blkno (0x%llx) " | ||
| 280 | "+ imap.im_len (0x%llx)) > " | ||
| 281 | " XFS_FSB_TO_BB(mp, " | ||
| 282 | "mp->m_sb.sb_dblocks) (0x%llx)", | ||
| 283 | (unsigned long long) imap.im_blkno, | ||
| 284 | (unsigned long long) imap.im_len, | ||
| 285 | XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); | ||
| 286 | #endif /* DEBUG */ | ||
| 287 | return XFS_ERROR(EINVAL); | ||
| 288 | } | ||
| 289 | |||
| 290 | /* | ||
| 291 | * Fill in the fields in the inode that will be used to | 296 | * Fill in the fields in the inode that will be used to |
| 292 | * map the inode to its buffer from now on. | 297 | * map the inode to its buffer from now on. |
| 293 | */ | 298 | */ |
| @@ -305,76 +310,17 @@ xfs_itobp( | |||
| 305 | } | 310 | } |
| 306 | ASSERT(bno == 0 || bno == imap.im_blkno); | 311 | ASSERT(bno == 0 || bno == imap.im_blkno); |
| 307 | 312 | ||
| 308 | /* | 313 | error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags); |
| 309 | * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will | 314 | if (error) |
| 310 | * default to just a read_buf() call. | ||
| 311 | */ | ||
| 312 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, | ||
| 313 | (int)imap.im_len, XFS_BUF_LOCK, &bp); | ||
| 314 | if (error) { | ||
| 315 | #ifdef DEBUG | ||
| 316 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " | ||
| 317 | "xfs_trans_read_buf() returned error %d, " | ||
| 318 | "imap.im_blkno 0x%llx, imap.im_len 0x%llx", | ||
| 319 | error, (unsigned long long) imap.im_blkno, | ||
| 320 | (unsigned long long) imap.im_len); | ||
| 321 | #endif /* DEBUG */ | ||
| 322 | return error; | 315 | return error; |
| 323 | } | ||
| 324 | |||
| 325 | /* | ||
| 326 | * Validate the magic number and version of every inode in the buffer | ||
| 327 | * (if DEBUG kernel) or the first inode in the buffer, otherwise. | ||
| 328 | * No validation is done here in userspace (xfs_repair). | ||
| 329 | */ | ||
| 330 | #if !defined(__KERNEL__) | ||
| 331 | ni = 0; | ||
| 332 | #elif defined(DEBUG) | ||
| 333 | ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; | ||
| 334 | #else /* usual case */ | ||
| 335 | ni = 1; | ||
| 336 | #endif | ||
| 337 | |||
| 338 | for (i = 0; i < ni; i++) { | ||
| 339 | int di_ok; | ||
| 340 | xfs_dinode_t *dip; | ||
| 341 | 316 | ||
| 342 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, | 317 | if (!bp) { |
| 343 | (i << mp->m_sb.sb_inodelog)); | 318 | ASSERT(buf_flags & XFS_BUF_TRYLOCK); |
| 344 | di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && | 319 | ASSERT(tp == NULL); |
| 345 | XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); | 320 | *bpp = NULL; |
| 346 | if (unlikely(XFS_TEST_ERROR(!di_ok, mp, | 321 | return EAGAIN; |
| 347 | XFS_ERRTAG_ITOBP_INOTOBP, | ||
| 348 | XFS_RANDOM_ITOBP_INOTOBP))) { | ||
| 349 | if (imap_flags & XFS_IMAP_BULKSTAT) { | ||
| 350 | xfs_trans_brelse(tp, bp); | ||
| 351 | return XFS_ERROR(EINVAL); | ||
| 352 | } | ||
| 353 | #ifdef DEBUG | ||
| 354 | cmn_err(CE_ALERT, | ||
| 355 | "Device %s - bad inode magic/vsn " | ||
| 356 | "daddr %lld #%d (magic=%x)", | ||
| 357 | XFS_BUFTARG_NAME(mp->m_ddev_targp), | ||
| 358 | (unsigned long long)imap.im_blkno, i, | ||
| 359 | be16_to_cpu(dip->di_core.di_magic)); | ||
| 360 | #endif | ||
| 361 | XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH, | ||
| 362 | mp, dip); | ||
| 363 | xfs_trans_brelse(tp, bp); | ||
| 364 | return XFS_ERROR(EFSCORRUPTED); | ||
| 365 | } | ||
| 366 | } | 322 | } |
| 367 | 323 | ||
| 368 | xfs_inobp_check(mp, bp); | ||
| 369 | |||
| 370 | /* | ||
| 371 | * Mark the buffer as an inode buffer now that it looks good | ||
| 372 | */ | ||
| 373 | XFS_BUF_SET_VTYPE(bp, B_FS_INO); | ||
| 374 | |||
| 375 | /* | ||
| 376 | * Set *dipp to point to the on-disk inode in the buffer. | ||
| 377 | */ | ||
| 378 | *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); | 324 | *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); |
| 379 | *bpp = bp; | 325 | *bpp = bp; |
| 380 | return 0; | 326 | return 0; |
| @@ -878,7 +824,7 @@ xfs_iread( | |||
| 878 | * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will | 824 | * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will |
| 879 | * know that this is a new incore inode. | 825 | * know that this is a new incore inode. |
| 880 | */ | 826 | */ |
| 881 | error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags); | 827 | error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK); |
| 882 | if (error) { | 828 | if (error) { |
| 883 | kmem_zone_free(xfs_inode_zone, ip); | 829 | kmem_zone_free(xfs_inode_zone, ip); |
| 884 | return error; | 830 | return error; |
| @@ -1518,51 +1464,50 @@ xfs_itruncate_start( | |||
| 1518 | } | 1464 | } |
| 1519 | 1465 | ||
| 1520 | /* | 1466 | /* |
| 1521 | * Shrink the file to the given new_size. The new | 1467 | * Shrink the file to the given new_size. The new size must be smaller than |
| 1522 | * size must be smaller than the current size. | 1468 | * the current size. This will free up the underlying blocks in the removed |
| 1523 | * This will free up the underlying blocks | 1469 | * range after a call to xfs_itruncate_start() or xfs_atruncate_start(). |
| 1524 | * in the removed range after a call to xfs_itruncate_start() | ||
| 1525 | * or xfs_atruncate_start(). | ||
| 1526 | * | 1470 | * |
| 1527 | * The transaction passed to this routine must have made | 1471 | * The transaction passed to this routine must have made a permanent log |
| 1528 | * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES. | 1472 | * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the |
| 1529 | * This routine may commit the given transaction and | 1473 | * given transaction and start new ones, so make sure everything involved in |
| 1530 | * start new ones, so make sure everything involved in | 1474 | * the transaction is tidy before calling here. Some transaction will be |
| 1531 | * the transaction is tidy before calling here. | 1475 | * returned to the caller to be committed. The incoming transaction must |
| 1532 | * Some transaction will be returned to the caller to be | 1476 | * already include the inode, and both inode locks must be held exclusively. |
| 1533 | * committed. The incoming transaction must already include | 1477 | * The inode must also be "held" within the transaction. On return the inode |
| 1534 | * the inode, and both inode locks must be held exclusively. | 1478 | * will be "held" within the returned transaction. This routine does NOT |
| 1535 | * The inode must also be "held" within the transaction. On | 1479 | * require any disk space to be reserved for it within the transaction. |
| 1536 | * return the inode will be "held" within the returned transaction. | ||
| 1537 | * This routine does NOT require any disk space to be reserved | ||
| 1538 | * for it within the transaction. | ||
| 1539 | * | 1480 | * |
| 1540 | * The fork parameter must be either xfs_attr_fork or xfs_data_fork, | 1481 | * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it |
| 1541 | * and it indicates the fork which is to be truncated. For the | 1482 | * indicates the fork which is to be truncated. For the attribute fork we only |
| 1542 | * attribute fork we only support truncation to size 0. | 1483 | * support truncation to size 0. |
| 1543 | * | 1484 | * |
| 1544 | * We use the sync parameter to indicate whether or not the first | 1485 | * We use the sync parameter to indicate whether or not the first transaction |
| 1545 | * transaction we perform might have to be synchronous. For the attr fork, | 1486 | * we perform might have to be synchronous. For the attr fork, it needs to be |
| 1546 | * it needs to be so if the unlink of the inode is not yet known to be | 1487 | * so if the unlink of the inode is not yet known to be permanent in the log. |
| 1547 | * permanent in the log. This keeps us from freeing and reusing the | 1488 | * This keeps us from freeing and reusing the blocks of the attribute fork |
| 1548 | * blocks of the attribute fork before the unlink of the inode becomes | 1489 | * before the unlink of the inode becomes permanent. |
| 1549 | * permanent. | ||
| 1550 | * | 1490 | * |
| 1551 | * For the data fork, we normally have to run synchronously if we're | 1491 | * For the data fork, we normally have to run synchronously if we're being |
| 1552 | * being called out of the inactive path or we're being called | 1492 | * called out of the inactive path or we're being called out of the create path |
| 1553 | * out of the create path where we're truncating an existing file. | 1493 | * where we're truncating an existing file. Either way, the truncate needs to |
| 1554 | * Either way, the truncate needs to be sync so blocks don't reappear | 1494 | * be sync so blocks don't reappear in the file with altered data in case of a |
| 1555 | * in the file with altered data in case of a crash. wsync filesystems | 1495 | * crash. wsync filesystems can run the first case async because anything that |
| 1556 | * can run the first case async because anything that shrinks the inode | 1496 | * shrinks the inode has to run sync so by the time we're called here from |
| 1557 | * has to run sync so by the time we're called here from inactive, the | 1497 | * inactive, the inode size is permanently set to 0. |
| 1558 | * inode size is permanently set to 0. | ||
| 1559 | * | 1498 | * |
| 1560 | * Calls from the truncate path always need to be sync unless we're | 1499 | * Calls from the truncate path always need to be sync unless we're in a wsync |
| 1561 | * in a wsync filesystem and the file has already been unlinked. | 1500 | * filesystem and the file has already been unlinked. |
| 1562 | * | 1501 | * |
| 1563 | * The caller is responsible for correctly setting the sync parameter. | 1502 | * The caller is responsible for correctly setting the sync parameter. It gets |
| 1564 | * It gets too hard for us to guess here which path we're being called | 1503 | * too hard for us to guess here which path we're being called out of just |
| 1565 | * out of just based on inode state. | 1504 | * based on inode state. |
| 1505 | * | ||
| 1506 | * If we get an error, we must return with the inode locked and linked into the | ||
| 1507 | * current transaction. This keeps things simple for the higher level code, | ||
| 1508 | * because it always knows that the inode is locked and held in the transaction | ||
| 1509 | * that returns to it whether errors occur or not. We don't mark the inode | ||
| 1510 | * dirty on error so that transactions can be easily aborted if possible. | ||
| 1566 | */ | 1511 | */ |
| 1567 | int | 1512 | int |
| 1568 | xfs_itruncate_finish( | 1513 | xfs_itruncate_finish( |
| @@ -1741,65 +1686,51 @@ xfs_itruncate_finish( | |||
| 1741 | */ | 1686 | */ |
| 1742 | error = xfs_bmap_finish(tp, &free_list, &committed); | 1687 | error = xfs_bmap_finish(tp, &free_list, &committed); |
| 1743 | ntp = *tp; | 1688 | ntp = *tp; |
| 1689 | if (committed) { | ||
| 1690 | /* link the inode into the next xact in the chain */ | ||
| 1691 | xfs_trans_ijoin(ntp, ip, | ||
| 1692 | XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); | ||
| 1693 | xfs_trans_ihold(ntp, ip); | ||
| 1694 | } | ||
| 1695 | |||
| 1744 | if (error) { | 1696 | if (error) { |
| 1745 | /* | 1697 | /* |
| 1746 | * If the bmap finish call encounters an error, | 1698 | * If the bmap finish call encounters an error, return |
| 1747 | * return to the caller where the transaction | 1699 | * to the caller where the transaction can be properly |
| 1748 | * can be properly aborted. We just need to | 1700 | * aborted. We just need to make sure we're not |
| 1749 | * make sure we're not holding any resources | 1701 | * holding any resources that we were not when we came |
| 1750 | * that we were not when we came in. | 1702 | * in. |
| 1751 | * | 1703 | * |
| 1752 | * Aborting from this point might lose some | 1704 | * Aborting from this point might lose some blocks in |
| 1753 | * blocks in the file system, but oh well. | 1705 | * the file system, but oh well. |
| 1754 | */ | 1706 | */ |
| 1755 | xfs_bmap_cancel(&free_list); | 1707 | xfs_bmap_cancel(&free_list); |
| 1756 | if (committed) { | ||
| 1757 | /* | ||
| 1758 | * If the passed in transaction committed | ||
| 1759 | * in xfs_bmap_finish(), then we want to | ||
| 1760 | * add the inode to this one before returning. | ||
| 1761 | * This keeps things simple for the higher | ||
| 1762 | * level code, because it always knows that | ||
| 1763 | * the inode is locked and held in the | ||
| 1764 | * transaction that returns to it whether | ||
| 1765 | * errors occur or not. We don't mark the | ||
| 1766 | * inode dirty so that this transaction can | ||
| 1767 | * be easily aborted if possible. | ||
| 1768 | */ | ||
| 1769 | xfs_trans_ijoin(ntp, ip, | ||
| 1770 | XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); | ||
| 1771 | xfs_trans_ihold(ntp, ip); | ||
| 1772 | } | ||
| 1773 | return error; | 1708 | return error; |
| 1774 | } | 1709 | } |
| 1775 | 1710 | ||
| 1776 | if (committed) { | 1711 | if (committed) { |
| 1777 | /* | 1712 | /* |
| 1778 | * The first xact was committed, | 1713 | * Mark the inode dirty so it will be logged and |
| 1779 | * so add the inode to the new one. | 1714 | * moved forward in the log as part of every commit. |
| 1780 | * Mark it dirty so it will be logged | ||
| 1781 | * and moved forward in the log as | ||
| 1782 | * part of every commit. | ||
| 1783 | */ | 1715 | */ |
| 1784 | xfs_trans_ijoin(ntp, ip, | ||
| 1785 | XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); | ||
| 1786 | xfs_trans_ihold(ntp, ip); | ||
| 1787 | xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); | 1716 | xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); |
| 1788 | } | 1717 | } |
| 1718 | |||
| 1789 | ntp = xfs_trans_dup(ntp); | 1719 | ntp = xfs_trans_dup(ntp); |
| 1790 | (void) xfs_trans_commit(*tp, 0); | 1720 | error = xfs_trans_commit(*tp, 0); |
| 1791 | *tp = ntp; | 1721 | *tp = ntp; |
| 1792 | error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, | 1722 | |
| 1793 | XFS_TRANS_PERM_LOG_RES, | 1723 | /* link the inode into the next transaction in the chain */ |
| 1794 | XFS_ITRUNCATE_LOG_COUNT); | ||
| 1795 | /* | ||
| 1796 | * Add the inode being truncated to the next chained | ||
| 1797 | * transaction. | ||
| 1798 | */ | ||
| 1799 | xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); | 1724 | xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); |
| 1800 | xfs_trans_ihold(ntp, ip); | 1725 | xfs_trans_ihold(ntp, ip); |
| 1726 | |||
| 1727 | if (!error) | ||
| 1728 | error = xfs_trans_reserve(ntp, 0, | ||
| 1729 | XFS_ITRUNCATE_LOG_RES(mp), 0, | ||
| 1730 | XFS_TRANS_PERM_LOG_RES, | ||
| 1731 | XFS_ITRUNCATE_LOG_COUNT); | ||
| 1801 | if (error) | 1732 | if (error) |
| 1802 | return (error); | 1733 | return error; |
| 1803 | } | 1734 | } |
| 1804 | /* | 1735 | /* |
| 1805 | * Only update the size in the case of the data fork, but | 1736 | * Only update the size in the case of the data fork, but |
| @@ -1967,7 +1898,7 @@ xfs_iunlink( | |||
| 1967 | * Here we put the head pointer into our next pointer, | 1898 | * Here we put the head pointer into our next pointer, |
| 1968 | * and then we fall through to point the head at us. | 1899 | * and then we fall through to point the head at us. |
| 1969 | */ | 1900 | */ |
| 1970 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); | 1901 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); |
| 1971 | if (error) | 1902 | if (error) |
| 1972 | return error; | 1903 | return error; |
| 1973 | 1904 | ||
| @@ -2075,7 +2006,7 @@ xfs_iunlink_remove( | |||
| 2075 | * of dealing with the buffer when there is no need to | 2006 | * of dealing with the buffer when there is no need to |
| 2076 | * change it. | 2007 | * change it. |
| 2077 | */ | 2008 | */ |
| 2078 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); | 2009 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); |
| 2079 | if (error) { | 2010 | if (error) { |
| 2080 | cmn_err(CE_WARN, | 2011 | cmn_err(CE_WARN, |
| 2081 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", | 2012 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", |
| @@ -2137,7 +2068,7 @@ xfs_iunlink_remove( | |||
| 2137 | * Now last_ibp points to the buffer previous to us on | 2068 | * Now last_ibp points to the buffer previous to us on |
| 2138 | * the unlinked list. Pull us from the list. | 2069 | * the unlinked list. Pull us from the list. |
| 2139 | */ | 2070 | */ |
| 2140 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); | 2071 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); |
| 2141 | if (error) { | 2072 | if (error) { |
| 2142 | cmn_err(CE_WARN, | 2073 | cmn_err(CE_WARN, |
| 2143 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", | 2074 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", |
| @@ -2172,13 +2103,6 @@ xfs_iunlink_remove( | |||
| 2172 | return 0; | 2103 | return 0; |
| 2173 | } | 2104 | } |
| 2174 | 2105 | ||
| 2175 | STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip) | ||
| 2176 | { | ||
| 2177 | return (((ip->i_itemp == NULL) || | ||
| 2178 | !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && | ||
| 2179 | (ip->i_update_core == 0)); | ||
| 2180 | } | ||
| 2181 | |||
| 2182 | STATIC void | 2106 | STATIC void |
| 2183 | xfs_ifree_cluster( | 2107 | xfs_ifree_cluster( |
| 2184 | xfs_inode_t *free_ip, | 2108 | xfs_inode_t *free_ip, |
| @@ -2400,7 +2324,7 @@ xfs_ifree( | |||
| 2400 | 2324 | ||
| 2401 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 2325 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
| 2402 | 2326 | ||
| 2403 | error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0); | 2327 | error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); |
| 2404 | if (error) | 2328 | if (error) |
| 2405 | return error; | 2329 | return error; |
| 2406 | 2330 | ||
| @@ -2678,14 +2602,31 @@ xfs_imap( | |||
| 2678 | fsbno = imap->im_blkno ? | 2602 | fsbno = imap->im_blkno ? |
| 2679 | XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; | 2603 | XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; |
| 2680 | error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); | 2604 | error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); |
| 2681 | if (error != 0) { | 2605 | if (error) |
| 2682 | return error; | 2606 | return error; |
| 2683 | } | 2607 | |
| 2684 | imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); | 2608 | imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); |
| 2685 | imap->im_len = XFS_FSB_TO_BB(mp, len); | 2609 | imap->im_len = XFS_FSB_TO_BB(mp, len); |
| 2686 | imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); | 2610 | imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); |
| 2687 | imap->im_ioffset = (ushort)off; | 2611 | imap->im_ioffset = (ushort)off; |
| 2688 | imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); | 2612 | imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); |
| 2613 | |||
| 2614 | /* | ||
| 2615 | * If the inode number maps to a block outside the bounds | ||
| 2616 | * of the file system then return NULL rather than calling | ||
| 2617 | * read_buf and panicing when we get an error from the | ||
| 2618 | * driver. | ||
| 2619 | */ | ||
| 2620 | if ((imap->im_blkno + imap->im_len) > | ||
| 2621 | XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { | ||
| 2622 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " | ||
| 2623 | "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " | ||
| 2624 | " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", | ||
| 2625 | (unsigned long long) imap->im_blkno, | ||
| 2626 | (unsigned long long) imap->im_len, | ||
| 2627 | XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); | ||
| 2628 | return EINVAL; | ||
| 2629 | } | ||
| 2689 | return 0; | 2630 | return 0; |
| 2690 | } | 2631 | } |
| 2691 | 2632 | ||
| @@ -2826,38 +2767,41 @@ xfs_iunpin( | |||
| 2826 | } | 2767 | } |
| 2827 | 2768 | ||
| 2828 | /* | 2769 | /* |
| 2829 | * This is called to wait for the given inode to be unpinned. | 2770 | * This is called to unpin an inode. It can be directed to wait or to return |
| 2830 | * It will sleep until this happens. The caller must have the | 2771 | * immediately without waiting for the inode to be unpinned. The caller must |
| 2831 | * inode locked in at least shared mode so that the buffer cannot | 2772 | * have the inode locked in at least shared mode so that the buffer cannot be |
| 2832 | * be subsequently pinned once someone is waiting for it to be | 2773 | * subsequently pinned once someone is waiting for it to be unpinned. |
| 2833 | * unpinned. | ||
| 2834 | */ | 2774 | */ |
| 2835 | STATIC void | 2775 | STATIC void |
| 2836 | xfs_iunpin_wait( | 2776 | __xfs_iunpin_wait( |
| 2837 | xfs_inode_t *ip) | 2777 | xfs_inode_t *ip, |
| 2778 | int wait) | ||
| 2838 | { | 2779 | { |
| 2839 | xfs_inode_log_item_t *iip; | 2780 | xfs_inode_log_item_t *iip = ip->i_itemp; |
| 2840 | xfs_lsn_t lsn; | ||
| 2841 | 2781 | ||
| 2842 | ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); | 2782 | ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); |
| 2843 | 2783 | if (atomic_read(&ip->i_pincount) == 0) | |
| 2844 | if (atomic_read(&ip->i_pincount) == 0) { | ||
| 2845 | return; | 2784 | return; |
| 2846 | } | ||
| 2847 | 2785 | ||
| 2848 | iip = ip->i_itemp; | 2786 | /* Give the log a push to start the unpinning I/O */ |
| 2849 | if (iip && iip->ili_last_lsn) { | 2787 | xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? |
| 2850 | lsn = iip->ili_last_lsn; | 2788 | iip->ili_last_lsn : 0, XFS_LOG_FORCE); |
| 2851 | } else { | 2789 | if (wait) |
| 2852 | lsn = (xfs_lsn_t)0; | 2790 | wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); |
| 2853 | } | 2791 | } |
| 2854 | 2792 | ||
| 2855 | /* | 2793 | static inline void |
| 2856 | * Give the log a push so we don't wait here too long. | 2794 | xfs_iunpin_wait( |
| 2857 | */ | 2795 | xfs_inode_t *ip) |
| 2858 | xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); | 2796 | { |
| 2797 | __xfs_iunpin_wait(ip, 1); | ||
| 2798 | } | ||
| 2859 | 2799 | ||
| 2860 | wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); | 2800 | static inline void |
| 2801 | xfs_iunpin_nowait( | ||
| 2802 | xfs_inode_t *ip) | ||
| 2803 | { | ||
| 2804 | __xfs_iunpin_wait(ip, 0); | ||
| 2861 | } | 2805 | } |
| 2862 | 2806 | ||
| 2863 | 2807 | ||
| @@ -2932,7 +2876,7 @@ xfs_iextents_copy( | |||
| 2932 | * format indicates the current state of the fork. | 2876 | * format indicates the current state of the fork. |
| 2933 | */ | 2877 | */ |
| 2934 | /*ARGSUSED*/ | 2878 | /*ARGSUSED*/ |
| 2935 | STATIC int | 2879 | STATIC void |
| 2936 | xfs_iflush_fork( | 2880 | xfs_iflush_fork( |
| 2937 | xfs_inode_t *ip, | 2881 | xfs_inode_t *ip, |
| 2938 | xfs_dinode_t *dip, | 2882 | xfs_dinode_t *dip, |
| @@ -2953,16 +2897,16 @@ xfs_iflush_fork( | |||
| 2953 | static const short extflag[2] = | 2897 | static const short extflag[2] = |
| 2954 | { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; | 2898 | { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; |
| 2955 | 2899 | ||
| 2956 | if (iip == NULL) | 2900 | if (!iip) |
| 2957 | return 0; | 2901 | return; |
| 2958 | ifp = XFS_IFORK_PTR(ip, whichfork); | 2902 | ifp = XFS_IFORK_PTR(ip, whichfork); |
| 2959 | /* | 2903 | /* |
| 2960 | * This can happen if we gave up in iformat in an error path, | 2904 | * This can happen if we gave up in iformat in an error path, |
| 2961 | * for the attribute fork. | 2905 | * for the attribute fork. |
| 2962 | */ | 2906 | */ |
| 2963 | if (ifp == NULL) { | 2907 | if (!ifp) { |
| 2964 | ASSERT(whichfork == XFS_ATTR_FORK); | 2908 | ASSERT(whichfork == XFS_ATTR_FORK); |
| 2965 | return 0; | 2909 | return; |
| 2966 | } | 2910 | } |
| 2967 | cp = XFS_DFORK_PTR(dip, whichfork); | 2911 | cp = XFS_DFORK_PTR(dip, whichfork); |
| 2968 | mp = ip->i_mount; | 2912 | mp = ip->i_mount; |
| @@ -3023,8 +2967,145 @@ xfs_iflush_fork( | |||
| 3023 | ASSERT(0); | 2967 | ASSERT(0); |
| 3024 | break; | 2968 | break; |
| 3025 | } | 2969 | } |
| 2970 | } | ||
| 2971 | |||
| 2972 | STATIC int | ||
| 2973 | xfs_iflush_cluster( | ||
| 2974 | xfs_inode_t *ip, | ||
| 2975 | xfs_buf_t *bp) | ||
| 2976 | { | ||
| 2977 | xfs_mount_t *mp = ip->i_mount; | ||
| 2978 | xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); | ||
| 2979 | unsigned long first_index, mask; | ||
| 2980 | int ilist_size; | ||
| 2981 | xfs_inode_t **ilist; | ||
| 2982 | xfs_inode_t *iq; | ||
| 2983 | int nr_found; | ||
| 2984 | int clcount = 0; | ||
| 2985 | int bufwasdelwri; | ||
| 2986 | int i; | ||
| 2987 | |||
| 2988 | ASSERT(pag->pagi_inodeok); | ||
| 2989 | ASSERT(pag->pag_ici_init); | ||
| 2990 | |||
| 2991 | ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *); | ||
| 2992 | ilist = kmem_alloc(ilist_size, KM_MAYFAIL); | ||
| 2993 | if (!ilist) | ||
| 2994 | return 0; | ||
| 2995 | |||
| 2996 | mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); | ||
| 2997 | first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; | ||
| 2998 | read_lock(&pag->pag_ici_lock); | ||
| 2999 | /* really need a gang lookup range call here */ | ||
| 3000 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, | ||
| 3001 | first_index, | ||
| 3002 | XFS_INODE_CLUSTER_SIZE(mp)); | ||
| 3003 | if (nr_found == 0) | ||
| 3004 | goto out_free; | ||
| 3005 | |||
| 3006 | for (i = 0; i < nr_found; i++) { | ||
| 3007 | iq = ilist[i]; | ||
| 3008 | if (iq == ip) | ||
| 3009 | continue; | ||
| 3010 | /* if the inode lies outside this cluster, we're done. */ | ||
| 3011 | if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) | ||
| 3012 | break; | ||
| 3013 | /* | ||
| 3014 | * Do an un-protected check to see if the inode is dirty and | ||
| 3015 | * is a candidate for flushing. These checks will be repeated | ||
| 3016 | * later after the appropriate locks are acquired. | ||
| 3017 | */ | ||
| 3018 | if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) | ||
| 3019 | continue; | ||
| 3020 | |||
| 3021 | /* | ||
| 3022 | * Try to get locks. If any are unavailable or it is pinned, | ||
| 3023 | * then this inode cannot be flushed and is skipped. | ||
| 3024 | */ | ||
| 3025 | |||
| 3026 | if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) | ||
| 3027 | continue; | ||
| 3028 | if (!xfs_iflock_nowait(iq)) { | ||
| 3029 | xfs_iunlock(iq, XFS_ILOCK_SHARED); | ||
| 3030 | continue; | ||
| 3031 | } | ||
| 3032 | if (xfs_ipincount(iq)) { | ||
| 3033 | xfs_ifunlock(iq); | ||
| 3034 | xfs_iunlock(iq, XFS_ILOCK_SHARED); | ||
| 3035 | continue; | ||
| 3036 | } | ||
| 3037 | |||
| 3038 | /* | ||
| 3039 | * arriving here means that this inode can be flushed. First | ||
| 3040 | * re-check that it's dirty before flushing. | ||
| 3041 | */ | ||
| 3042 | if (!xfs_inode_clean(iq)) { | ||
| 3043 | int error; | ||
| 3044 | error = xfs_iflush_int(iq, bp); | ||
| 3045 | if (error) { | ||
| 3046 | xfs_iunlock(iq, XFS_ILOCK_SHARED); | ||
| 3047 | goto cluster_corrupt_out; | ||
| 3048 | } | ||
| 3049 | clcount++; | ||
| 3050 | } else { | ||
| 3051 | xfs_ifunlock(iq); | ||
| 3052 | } | ||
| 3053 | xfs_iunlock(iq, XFS_ILOCK_SHARED); | ||
| 3054 | } | ||
| 3055 | |||
| 3056 | if (clcount) { | ||
| 3057 | XFS_STATS_INC(xs_icluster_flushcnt); | ||
| 3058 | XFS_STATS_ADD(xs_icluster_flushinode, clcount); | ||
| 3059 | } | ||
| 3026 | 3060 | ||
| 3061 | out_free: | ||
| 3062 | read_unlock(&pag->pag_ici_lock); | ||
| 3063 | kmem_free(ilist, ilist_size); | ||
| 3027 | return 0; | 3064 | return 0; |
| 3065 | |||
| 3066 | |||
| 3067 | cluster_corrupt_out: | ||
| 3068 | /* | ||
| 3069 | * Corruption detected in the clustering loop. Invalidate the | ||
| 3070 | * inode buffer and shut down the filesystem. | ||
| 3071 | */ | ||
| 3072 | read_unlock(&pag->pag_ici_lock); | ||
| 3073 | /* | ||
| 3074 | * Clean up the buffer. If it was B_DELWRI, just release it -- | ||
| 3075 | * brelse can handle it with no problems. If not, shut down the | ||
| 3076 | * filesystem before releasing the buffer. | ||
| 3077 | */ | ||
| 3078 | bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); | ||
| 3079 | if (bufwasdelwri) | ||
| 3080 | xfs_buf_relse(bp); | ||
| 3081 | |||
| 3082 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | ||
| 3083 | |||
| 3084 | if (!bufwasdelwri) { | ||
| 3085 | /* | ||
| 3086 | * Just like incore_relse: if we have b_iodone functions, | ||
| 3087 | * mark the buffer as an error and call them. Otherwise | ||
| 3088 | * mark it as stale and brelse. | ||
| 3089 | */ | ||
| 3090 | if (XFS_BUF_IODONE_FUNC(bp)) { | ||
| 3091 | XFS_BUF_CLR_BDSTRAT_FUNC(bp); | ||
| 3092 | XFS_BUF_UNDONE(bp); | ||
| 3093 | XFS_BUF_STALE(bp); | ||
| 3094 | XFS_BUF_SHUT(bp); | ||
| 3095 | XFS_BUF_ERROR(bp,EIO); | ||
| 3096 | xfs_biodone(bp); | ||
| 3097 | } else { | ||
| 3098 | XFS_BUF_STALE(bp); | ||
| 3099 | xfs_buf_relse(bp); | ||
| 3100 | } | ||
| 3101 | } | ||
| 3102 | |||
| 3103 | /* | ||
| 3104 | * Unlocks the flush lock | ||
| 3105 | */ | ||
| 3106 | xfs_iflush_abort(iq); | ||
| 3107 | kmem_free(ilist, ilist_size); | ||
| 3108 | return XFS_ERROR(EFSCORRUPTED); | ||
| 3028 | } | 3109 | } |
| 3029 | 3110 | ||
| 3030 | /* | 3111 | /* |
| @@ -3046,11 +3127,7 @@ xfs_iflush( | |||
| 3046 | xfs_dinode_t *dip; | 3127 | xfs_dinode_t *dip; |
| 3047 | xfs_mount_t *mp; | 3128 | xfs_mount_t *mp; |
| 3048 | int error; | 3129 | int error; |
| 3049 | /* REFERENCED */ | 3130 | int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); |
| 3050 | xfs_inode_t *iq; | ||
| 3051 | int clcount; /* count of inodes clustered */ | ||
| 3052 | int bufwasdelwri; | ||
| 3053 | struct hlist_node *entry; | ||
| 3054 | enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; | 3131 | enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; |
| 3055 | 3132 | ||
| 3056 | XFS_STATS_INC(xs_iflush_count); | 3133 | XFS_STATS_INC(xs_iflush_count); |
| @@ -3067,8 +3144,7 @@ xfs_iflush( | |||
| 3067 | * If the inode isn't dirty, then just release the inode | 3144 | * If the inode isn't dirty, then just release the inode |
| 3068 | * flush lock and do nothing. | 3145 | * flush lock and do nothing. |
| 3069 | */ | 3146 | */ |
| 3070 | if ((ip->i_update_core == 0) && | 3147 | if (xfs_inode_clean(ip)) { |
| 3071 | ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { | ||
| 3072 | ASSERT((iip != NULL) ? | 3148 | ASSERT((iip != NULL) ? |
| 3073 | !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); | 3149 | !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); |
| 3074 | xfs_ifunlock(ip); | 3150 | xfs_ifunlock(ip); |
| @@ -3076,11 +3152,21 @@ xfs_iflush( | |||
| 3076 | } | 3152 | } |
| 3077 | 3153 | ||
| 3078 | /* | 3154 | /* |
| 3079 | * We can't flush the inode until it is unpinned, so | 3155 | * We can't flush the inode until it is unpinned, so wait for it if we |
| 3080 | * wait for it. We know noone new can pin it, because | 3156 | * are allowed to block. We know noone new can pin it, because we are |
| 3081 | * we are holding the inode lock shared and you need | 3157 | * holding the inode lock shared and you need to hold it exclusively to |
| 3082 | * to hold it exclusively to pin the inode. | 3158 | * pin the inode. |
| 3159 | * | ||
| 3160 | * If we are not allowed to block, force the log out asynchronously so | ||
| 3161 | * that when we come back the inode will be unpinned. If other inodes | ||
| 3162 | * in the same cluster are dirty, they will probably write the inode | ||
| 3163 | * out for us if they occur after the log force completes. | ||
| 3083 | */ | 3164 | */ |
| 3165 | if (noblock && xfs_ipincount(ip)) { | ||
| 3166 | xfs_iunpin_nowait(ip); | ||
| 3167 | xfs_ifunlock(ip); | ||
| 3168 | return EAGAIN; | ||
| 3169 | } | ||
| 3084 | xfs_iunpin_wait(ip); | 3170 | xfs_iunpin_wait(ip); |
| 3085 | 3171 | ||
| 3086 | /* | 3172 | /* |
| @@ -3097,15 +3183,6 @@ xfs_iflush( | |||
| 3097 | } | 3183 | } |
| 3098 | 3184 | ||
| 3099 | /* | 3185 | /* |
| 3100 | * Get the buffer containing the on-disk inode. | ||
| 3101 | */ | ||
| 3102 | error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0); | ||
| 3103 | if (error) { | ||
| 3104 | xfs_ifunlock(ip); | ||
| 3105 | return error; | ||
| 3106 | } | ||
| 3107 | |||
| 3108 | /* | ||
| 3109 | * Decide how buffer will be flushed out. This is done before | 3186 | * Decide how buffer will be flushed out. This is done before |
| 3110 | * the call to xfs_iflush_int because this field is zeroed by it. | 3187 | * the call to xfs_iflush_int because this field is zeroed by it. |
| 3111 | */ | 3188 | */ |
| @@ -3121,6 +3198,7 @@ xfs_iflush( | |||
| 3121 | case XFS_IFLUSH_DELWRI_ELSE_SYNC: | 3198 | case XFS_IFLUSH_DELWRI_ELSE_SYNC: |
| 3122 | flags = 0; | 3199 | flags = 0; |
| 3123 | break; | 3200 | break; |
| 3201 | case XFS_IFLUSH_ASYNC_NOBLOCK: | ||
| 3124 | case XFS_IFLUSH_ASYNC: | 3202 | case XFS_IFLUSH_ASYNC: |
| 3125 | case XFS_IFLUSH_DELWRI_ELSE_ASYNC: | 3203 | case XFS_IFLUSH_DELWRI_ELSE_ASYNC: |
| 3126 | flags = INT_ASYNC; | 3204 | flags = INT_ASYNC; |
| @@ -3140,6 +3218,7 @@ xfs_iflush( | |||
| 3140 | case XFS_IFLUSH_DELWRI: | 3218 | case XFS_IFLUSH_DELWRI: |
| 3141 | flags = INT_DELWRI; | 3219 | flags = INT_DELWRI; |
| 3142 | break; | 3220 | break; |
| 3221 | case XFS_IFLUSH_ASYNC_NOBLOCK: | ||
| 3143 | case XFS_IFLUSH_ASYNC: | 3222 | case XFS_IFLUSH_ASYNC: |
| 3144 | flags = INT_ASYNC; | 3223 | flags = INT_ASYNC; |
| 3145 | break; | 3224 | break; |
| @@ -3154,94 +3233,41 @@ xfs_iflush( | |||
| 3154 | } | 3233 | } |
| 3155 | 3234 | ||
| 3156 | /* | 3235 | /* |
| 3157 | * First flush out the inode that xfs_iflush was called with. | 3236 | * Get the buffer containing the on-disk inode. |
| 3158 | */ | 3237 | */ |
| 3159 | error = xfs_iflush_int(ip, bp); | 3238 | error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, |
| 3160 | if (error) { | 3239 | noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); |
| 3161 | goto corrupt_out; | 3240 | if (error || !bp) { |
| 3241 | xfs_ifunlock(ip); | ||
| 3242 | return error; | ||
| 3162 | } | 3243 | } |
| 3163 | 3244 | ||
| 3164 | /* | 3245 | /* |
| 3165 | * inode clustering: | 3246 | * First flush out the inode that xfs_iflush was called with. |
| 3166 | * see if other inodes can be gathered into this write | ||
| 3167 | */ | 3247 | */ |
| 3168 | spin_lock(&ip->i_cluster->icl_lock); | 3248 | error = xfs_iflush_int(ip, bp); |
| 3169 | ip->i_cluster->icl_buf = bp; | 3249 | if (error) |
| 3170 | 3250 | goto corrupt_out; | |
| 3171 | clcount = 0; | ||
| 3172 | hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) { | ||
| 3173 | if (iq == ip) | ||
| 3174 | continue; | ||
| 3175 | |||
| 3176 | /* | ||
| 3177 | * Do an un-protected check to see if the inode is dirty and | ||
| 3178 | * is a candidate for flushing. These checks will be repeated | ||
| 3179 | * later after the appropriate locks are acquired. | ||
| 3180 | */ | ||
| 3181 | iip = iq->i_itemp; | ||
| 3182 | if ((iq->i_update_core == 0) && | ||
| 3183 | ((iip == NULL) || | ||
| 3184 | !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && | ||
| 3185 | xfs_ipincount(iq) == 0) { | ||
| 3186 | continue; | ||
| 3187 | } | ||
| 3188 | |||
| 3189 | /* | ||
| 3190 | * Try to get locks. If any are unavailable, | ||
| 3191 | * then this inode cannot be flushed and is skipped. | ||
| 3192 | */ | ||
| 3193 | |||
| 3194 | /* get inode locks (just i_lock) */ | ||
| 3195 | if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) { | ||
| 3196 | /* get inode flush lock */ | ||
| 3197 | if (xfs_iflock_nowait(iq)) { | ||
| 3198 | /* check if pinned */ | ||
| 3199 | if (xfs_ipincount(iq) == 0) { | ||
| 3200 | /* arriving here means that | ||
| 3201 | * this inode can be flushed. | ||
| 3202 | * first re-check that it's | ||
| 3203 | * dirty | ||
| 3204 | */ | ||
| 3205 | iip = iq->i_itemp; | ||
| 3206 | if ((iq->i_update_core != 0)|| | ||
| 3207 | ((iip != NULL) && | ||
| 3208 | (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { | ||
| 3209 | clcount++; | ||
| 3210 | error = xfs_iflush_int(iq, bp); | ||
| 3211 | if (error) { | ||
| 3212 | xfs_iunlock(iq, | ||
| 3213 | XFS_ILOCK_SHARED); | ||
| 3214 | goto cluster_corrupt_out; | ||
| 3215 | } | ||
| 3216 | } else { | ||
| 3217 | xfs_ifunlock(iq); | ||
| 3218 | } | ||
| 3219 | } else { | ||
| 3220 | xfs_ifunlock(iq); | ||
| 3221 | } | ||
| 3222 | } | ||
| 3223 | xfs_iunlock(iq, XFS_ILOCK_SHARED); | ||
| 3224 | } | ||
| 3225 | } | ||
| 3226 | spin_unlock(&ip->i_cluster->icl_lock); | ||
| 3227 | |||
| 3228 | if (clcount) { | ||
| 3229 | XFS_STATS_INC(xs_icluster_flushcnt); | ||
| 3230 | XFS_STATS_ADD(xs_icluster_flushinode, clcount); | ||
| 3231 | } | ||
| 3232 | 3251 | ||
| 3233 | /* | 3252 | /* |
| 3234 | * If the buffer is pinned then push on the log so we won't | 3253 | * If the buffer is pinned then push on the log now so we won't |
| 3235 | * get stuck waiting in the write for too long. | 3254 | * get stuck waiting in the write for too long. |
| 3236 | */ | 3255 | */ |
| 3237 | if (XFS_BUF_ISPINNED(bp)){ | 3256 | if (XFS_BUF_ISPINNED(bp)) |
| 3238 | xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); | 3257 | xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); |
| 3239 | } | 3258 | |
| 3259 | /* | ||
| 3260 | * inode clustering: | ||
| 3261 | * see if other inodes can be gathered into this write | ||
| 3262 | */ | ||
| 3263 | error = xfs_iflush_cluster(ip, bp); | ||
| 3264 | if (error) | ||
| 3265 | goto cluster_corrupt_out; | ||
| 3240 | 3266 | ||
| 3241 | if (flags & INT_DELWRI) { | 3267 | if (flags & INT_DELWRI) { |
| 3242 | xfs_bdwrite(mp, bp); | 3268 | xfs_bdwrite(mp, bp); |
| 3243 | } else if (flags & INT_ASYNC) { | 3269 | } else if (flags & INT_ASYNC) { |
| 3244 | xfs_bawrite(mp, bp); | 3270 | error = xfs_bawrite(mp, bp); |
| 3245 | } else { | 3271 | } else { |
| 3246 | error = xfs_bwrite(mp, bp); | 3272 | error = xfs_bwrite(mp, bp); |
| 3247 | } | 3273 | } |
| @@ -3250,52 +3276,11 @@ xfs_iflush( | |||
| 3250 | corrupt_out: | 3276 | corrupt_out: |
| 3251 | xfs_buf_relse(bp); | 3277 | xfs_buf_relse(bp); |
| 3252 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | 3278 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); |
| 3253 | xfs_iflush_abort(ip); | ||
| 3254 | /* | ||
| 3255 | * Unlocks the flush lock | ||
| 3256 | */ | ||
| 3257 | return XFS_ERROR(EFSCORRUPTED); | ||
| 3258 | |||
| 3259 | cluster_corrupt_out: | 3279 | cluster_corrupt_out: |
| 3260 | /* Corruption detected in the clustering loop. Invalidate the | ||
| 3261 | * inode buffer and shut down the filesystem. | ||
| 3262 | */ | ||
| 3263 | spin_unlock(&ip->i_cluster->icl_lock); | ||
| 3264 | |||
| 3265 | /* | ||
| 3266 | * Clean up the buffer. If it was B_DELWRI, just release it -- | ||
| 3267 | * brelse can handle it with no problems. If not, shut down the | ||
| 3268 | * filesystem before releasing the buffer. | ||
| 3269 | */ | ||
| 3270 | if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) { | ||
| 3271 | xfs_buf_relse(bp); | ||
| 3272 | } | ||
| 3273 | |||
| 3274 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | ||
| 3275 | |||
| 3276 | if(!bufwasdelwri) { | ||
| 3277 | /* | ||
| 3278 | * Just like incore_relse: if we have b_iodone functions, | ||
| 3279 | * mark the buffer as an error and call them. Otherwise | ||
| 3280 | * mark it as stale and brelse. | ||
| 3281 | */ | ||
| 3282 | if (XFS_BUF_IODONE_FUNC(bp)) { | ||
| 3283 | XFS_BUF_CLR_BDSTRAT_FUNC(bp); | ||
| 3284 | XFS_BUF_UNDONE(bp); | ||
| 3285 | XFS_BUF_STALE(bp); | ||
| 3286 | XFS_BUF_SHUT(bp); | ||
| 3287 | XFS_BUF_ERROR(bp,EIO); | ||
| 3288 | xfs_biodone(bp); | ||
| 3289 | } else { | ||
| 3290 | XFS_BUF_STALE(bp); | ||
| 3291 | xfs_buf_relse(bp); | ||
| 3292 | } | ||
| 3293 | } | ||
| 3294 | |||
| 3295 | xfs_iflush_abort(iq); | ||
| 3296 | /* | 3280 | /* |
| 3297 | * Unlocks the flush lock | 3281 | * Unlocks the flush lock |
| 3298 | */ | 3282 | */ |
| 3283 | xfs_iflush_abort(ip); | ||
| 3299 | return XFS_ERROR(EFSCORRUPTED); | 3284 | return XFS_ERROR(EFSCORRUPTED); |
| 3300 | } | 3285 | } |
| 3301 | 3286 | ||
| @@ -3325,8 +3310,7 @@ xfs_iflush_int( | |||
| 3325 | * If the inode isn't dirty, then just release the inode | 3310 | * If the inode isn't dirty, then just release the inode |
| 3326 | * flush lock and do nothing. | 3311 | * flush lock and do nothing. |
| 3327 | */ | 3312 | */ |
| 3328 | if ((ip->i_update_core == 0) && | 3313 | if (xfs_inode_clean(ip)) { |
| 3329 | ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { | ||
| 3330 | xfs_ifunlock(ip); | 3314 | xfs_ifunlock(ip); |
| 3331 | return 0; | 3315 | return 0; |
| 3332 | } | 3316 | } |
| @@ -3459,16 +3443,9 @@ xfs_iflush_int( | |||
| 3459 | } | 3443 | } |
| 3460 | } | 3444 | } |
| 3461 | 3445 | ||
| 3462 | if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) { | 3446 | xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); |
| 3463 | goto corrupt_out; | 3447 | if (XFS_IFORK_Q(ip)) |
| 3464 | } | 3448 | xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); |
| 3465 | |||
| 3466 | if (XFS_IFORK_Q(ip)) { | ||
| 3467 | /* | ||
| 3468 | * The only error from xfs_iflush_fork is on the data fork. | ||
| 3469 | */ | ||
| 3470 | (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); | ||
| 3471 | } | ||
| 3472 | xfs_inobp_check(mp, bp); | 3449 | xfs_inobp_check(mp, bp); |
| 3473 | 3450 | ||
| 3474 | /* | 3451 | /* |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index bfcd72cbaeea..93c37697a72c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
| @@ -133,19 +133,6 @@ typedef struct dm_attrs_s { | |||
| 133 | } dm_attrs_t; | 133 | } dm_attrs_t; |
| 134 | 134 | ||
| 135 | /* | 135 | /* |
| 136 | * This is the xfs inode cluster structure. This structure is used by | ||
| 137 | * xfs_iflush to find inodes that share a cluster and can be flushed to disk at | ||
| 138 | * the same time. | ||
| 139 | */ | ||
| 140 | typedef struct xfs_icluster { | ||
| 141 | struct hlist_head icl_inodes; /* list of inodes on cluster */ | ||
| 142 | xfs_daddr_t icl_blkno; /* starting block number of | ||
| 143 | * the cluster */ | ||
| 144 | struct xfs_buf *icl_buf; /* the inode buffer */ | ||
| 145 | spinlock_t icl_lock; /* inode list lock */ | ||
| 146 | } xfs_icluster_t; | ||
| 147 | |||
| 148 | /* | ||
| 149 | * This is the xfs in-core inode structure. | 136 | * This is the xfs in-core inode structure. |
| 150 | * Most of the on-disk inode is embedded in the i_d field. | 137 | * Most of the on-disk inode is embedded in the i_d field. |
| 151 | * | 138 | * |
| @@ -240,10 +227,6 @@ typedef struct xfs_inode { | |||
| 240 | atomic_t i_pincount; /* inode pin count */ | 227 | atomic_t i_pincount; /* inode pin count */ |
| 241 | wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ | 228 | wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ |
| 242 | spinlock_t i_flags_lock; /* inode i_flags lock */ | 229 | spinlock_t i_flags_lock; /* inode i_flags lock */ |
| 243 | #ifdef HAVE_REFCACHE | ||
| 244 | struct xfs_inode **i_refcache; /* ptr to entry in ref cache */ | ||
| 245 | struct xfs_inode *i_release; /* inode to unref */ | ||
| 246 | #endif | ||
| 247 | /* Miscellaneous state. */ | 230 | /* Miscellaneous state. */ |
| 248 | unsigned short i_flags; /* see defined flags below */ | 231 | unsigned short i_flags; /* see defined flags below */ |
| 249 | unsigned char i_update_core; /* timestamps/size is dirty */ | 232 | unsigned char i_update_core; /* timestamps/size is dirty */ |
| @@ -252,8 +235,6 @@ typedef struct xfs_inode { | |||
| 252 | unsigned int i_delayed_blks; /* count of delay alloc blks */ | 235 | unsigned int i_delayed_blks; /* count of delay alloc blks */ |
| 253 | 236 | ||
| 254 | xfs_icdinode_t i_d; /* most of ondisk inode */ | 237 | xfs_icdinode_t i_d; /* most of ondisk inode */ |
| 255 | xfs_icluster_t *i_cluster; /* cluster list header */ | ||
| 256 | struct hlist_node i_cnode; /* cluster link node */ | ||
| 257 | 238 | ||
| 258 | xfs_fsize_t i_size; /* in-memory size */ | 239 | xfs_fsize_t i_size; /* in-memory size */ |
| 259 | xfs_fsize_t i_new_size; /* size when write completes */ | 240 | xfs_fsize_t i_new_size; /* size when write completes */ |
| @@ -461,6 +442,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) | |||
| 461 | #define XFS_IFLUSH_SYNC 3 | 442 | #define XFS_IFLUSH_SYNC 3 |
| 462 | #define XFS_IFLUSH_ASYNC 4 | 443 | #define XFS_IFLUSH_ASYNC 4 |
| 463 | #define XFS_IFLUSH_DELWRI 5 | 444 | #define XFS_IFLUSH_DELWRI 5 |
| 445 | #define XFS_IFLUSH_ASYNC_NOBLOCK 6 | ||
| 464 | 446 | ||
| 465 | /* | 447 | /* |
| 466 | * Flags for xfs_itruncate_start(). | 448 | * Flags for xfs_itruncate_start(). |
| @@ -515,7 +497,7 @@ int xfs_finish_reclaim_all(struct xfs_mount *, int); | |||
| 515 | */ | 497 | */ |
| 516 | int xfs_itobp(struct xfs_mount *, struct xfs_trans *, | 498 | int xfs_itobp(struct xfs_mount *, struct xfs_trans *, |
| 517 | xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **, | 499 | xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **, |
| 518 | xfs_daddr_t, uint); | 500 | xfs_daddr_t, uint, uint); |
| 519 | int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, | 501 | int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, |
| 520 | xfs_inode_t **, xfs_daddr_t, uint); | 502 | xfs_inode_t **, xfs_daddr_t, uint); |
| 521 | int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); | 503 | int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); |
| @@ -597,7 +579,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); | |||
| 597 | #define xfs_inobp_check(mp, bp) | 579 | #define xfs_inobp_check(mp, bp) |
| 598 | #endif /* DEBUG */ | 580 | #endif /* DEBUG */ |
| 599 | 581 | ||
| 600 | extern struct kmem_zone *xfs_icluster_zone; | ||
| 601 | extern struct kmem_zone *xfs_ifork_zone; | 582 | extern struct kmem_zone *xfs_ifork_zone; |
| 602 | extern struct kmem_zone *xfs_inode_zone; | 583 | extern struct kmem_zone *xfs_inode_zone; |
| 603 | extern struct kmem_zone *xfs_ili_zone; | 584 | extern struct kmem_zone *xfs_ili_zone; |
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 2c775b4ae9e6..93b5db453ea2 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
| @@ -40,6 +40,7 @@ | |||
| 40 | #include "xfs_btree.h" | 40 | #include "xfs_btree.h" |
| 41 | #include "xfs_ialloc.h" | 41 | #include "xfs_ialloc.h" |
| 42 | #include "xfs_rw.h" | 42 | #include "xfs_rw.h" |
| 43 | #include "xfs_error.h" | ||
| 43 | 44 | ||
| 44 | 45 | ||
| 45 | kmem_zone_t *xfs_ili_zone; /* inode log item zone */ | 46 | kmem_zone_t *xfs_ili_zone; /* inode log item zone */ |
| @@ -813,7 +814,12 @@ xfs_inode_item_pushbuf( | |||
| 813 | XFS_LOG_FORCE); | 814 | XFS_LOG_FORCE); |
| 814 | } | 815 | } |
| 815 | if (dopush) { | 816 | if (dopush) { |
| 816 | xfs_bawrite(mp, bp); | 817 | int error; |
| 818 | error = xfs_bawrite(mp, bp); | ||
| 819 | if (error) | ||
| 820 | xfs_fs_cmn_err(CE_WARN, mp, | ||
| 821 | "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p", | ||
| 822 | error, iip, bp); | ||
| 817 | } else { | 823 | } else { |
| 818 | xfs_buf_relse(bp); | 824 | xfs_buf_relse(bp); |
| 819 | } | 825 | } |
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index bfe92ea17952..40513077ab36 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h | |||
| @@ -168,6 +168,14 @@ static inline int xfs_ilog_fext(int w) | |||
| 168 | return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); | 168 | return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); |
| 169 | } | 169 | } |
| 170 | 170 | ||
| 171 | static inline int xfs_inode_clean(xfs_inode_t *ip) | ||
| 172 | { | ||
| 173 | return (!ip->i_itemp || | ||
| 174 | !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && | ||
| 175 | !ip->i_update_core; | ||
| 176 | } | ||
| 177 | |||
| 178 | |||
| 171 | #ifdef __KERNEL__ | 179 | #ifdef __KERNEL__ |
| 172 | 180 | ||
| 173 | extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); | 181 | extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); |
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fde37f87d52f..fb3cf1191419 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
| @@ -802,8 +802,11 @@ xfs_iomap_write_allocate( | |||
| 802 | */ | 802 | */ |
| 803 | nimaps = 1; | 803 | nimaps = 1; |
| 804 | end_fsb = XFS_B_TO_FSB(mp, ip->i_size); | 804 | end_fsb = XFS_B_TO_FSB(mp, ip->i_size); |
| 805 | xfs_bmap_last_offset(NULL, ip, &last_block, | 805 | error = xfs_bmap_last_offset(NULL, ip, &last_block, |
| 806 | XFS_DATA_FORK); | 806 | XFS_DATA_FORK); |
| 807 | if (error) | ||
| 808 | goto trans_cancel; | ||
| 809 | |||
| 807 | last_block = XFS_FILEOFF_MAX(last_block, end_fsb); | 810 | last_block = XFS_FILEOFF_MAX(last_block, end_fsb); |
| 808 | if ((map_start_fsb + count_fsb) > last_block) { | 811 | if ((map_start_fsb + count_fsb) > last_block) { |
| 809 | count_fsb = last_block - map_start_fsb; | 812 | count_fsb = last_block - map_start_fsb; |
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f615e04364f4..eb85bdedad0c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c | |||
| @@ -129,7 +129,7 @@ xfs_bulkstat_one_iget( | |||
| 129 | return error; | 129 | return error; |
| 130 | } | 130 | } |
| 131 | 131 | ||
| 132 | STATIC int | 132 | STATIC void |
| 133 | xfs_bulkstat_one_dinode( | 133 | xfs_bulkstat_one_dinode( |
| 134 | xfs_mount_t *mp, /* mount point for filesystem */ | 134 | xfs_mount_t *mp, /* mount point for filesystem */ |
| 135 | xfs_ino_t ino, /* inode number to get data for */ | 135 | xfs_ino_t ino, /* inode number to get data for */ |
| @@ -198,8 +198,6 @@ xfs_bulkstat_one_dinode( | |||
| 198 | buf->bs_blocks = be64_to_cpu(dic->di_nblocks); | 198 | buf->bs_blocks = be64_to_cpu(dic->di_nblocks); |
| 199 | break; | 199 | break; |
| 200 | } | 200 | } |
| 201 | |||
| 202 | return 0; | ||
| 203 | } | 201 | } |
| 204 | 202 | ||
| 205 | STATIC int | 203 | STATIC int |
| @@ -614,7 +612,8 @@ xfs_bulkstat( | |||
| 614 | xfs_buf_relse(bp); | 612 | xfs_buf_relse(bp); |
| 615 | error = xfs_itobp(mp, NULL, ip, | 613 | error = xfs_itobp(mp, NULL, ip, |
| 616 | &dip, &bp, bno, | 614 | &dip, &bp, bno, |
| 617 | XFS_IMAP_BULKSTAT); | 615 | XFS_IMAP_BULKSTAT, |
| 616 | XFS_BUF_LOCK); | ||
| 618 | if (!error) | 617 | if (!error) |
| 619 | clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; | 618 | clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; |
| 620 | kmem_zone_free(xfs_inode_zone, ip); | 619 | kmem_zone_free(xfs_inode_zone, ip); |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 31f2b04f2c97..afaee301b0ee 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | #include "xfs_inode.h" | 41 | #include "xfs_inode.h" |
| 42 | #include "xfs_rw.h" | 42 | #include "xfs_rw.h" |
| 43 | 43 | ||
| 44 | kmem_zone_t *xfs_log_ticket_zone; | ||
| 44 | 45 | ||
| 45 | #define xlog_write_adv_cnt(ptr, len, off, bytes) \ | 46 | #define xlog_write_adv_cnt(ptr, len, off, bytes) \ |
| 46 | { (ptr) += (bytes); \ | 47 | { (ptr) += (bytes); \ |
| @@ -73,8 +74,6 @@ STATIC int xlog_state_get_iclog_space(xlog_t *log, | |||
| 73 | xlog_ticket_t *ticket, | 74 | xlog_ticket_t *ticket, |
| 74 | int *continued_write, | 75 | int *continued_write, |
| 75 | int *logoffsetp); | 76 | int *logoffsetp); |
| 76 | STATIC void xlog_state_put_ticket(xlog_t *log, | ||
| 77 | xlog_ticket_t *tic); | ||
| 78 | STATIC int xlog_state_release_iclog(xlog_t *log, | 77 | STATIC int xlog_state_release_iclog(xlog_t *log, |
| 79 | xlog_in_core_t *iclog); | 78 | xlog_in_core_t *iclog); |
| 80 | STATIC void xlog_state_switch_iclogs(xlog_t *log, | 79 | STATIC void xlog_state_switch_iclogs(xlog_t *log, |
| @@ -101,7 +100,6 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, | |||
| 101 | 100 | ||
| 102 | 101 | ||
| 103 | /* local ticket functions */ | 102 | /* local ticket functions */ |
| 104 | STATIC void xlog_state_ticket_alloc(xlog_t *log); | ||
| 105 | STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, | 103 | STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, |
| 106 | int unit_bytes, | 104 | int unit_bytes, |
| 107 | int count, | 105 | int count, |
| @@ -330,7 +328,7 @@ xfs_log_done(xfs_mount_t *mp, | |||
| 330 | */ | 328 | */ |
| 331 | xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); | 329 | xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); |
| 332 | xlog_ungrant_log_space(log, ticket); | 330 | xlog_ungrant_log_space(log, ticket); |
| 333 | xlog_state_put_ticket(log, ticket); | 331 | xlog_ticket_put(log, ticket); |
| 334 | } else { | 332 | } else { |
| 335 | xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); | 333 | xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); |
| 336 | xlog_regrant_reserve_log_space(log, ticket); | 334 | xlog_regrant_reserve_log_space(log, ticket); |
| @@ -384,7 +382,27 @@ _xfs_log_force( | |||
| 384 | return xlog_state_sync_all(log, flags, log_flushed); | 382 | return xlog_state_sync_all(log, flags, log_flushed); |
| 385 | else | 383 | else |
| 386 | return xlog_state_sync(log, lsn, flags, log_flushed); | 384 | return xlog_state_sync(log, lsn, flags, log_flushed); |
| 387 | } /* xfs_log_force */ | 385 | } /* _xfs_log_force */ |
| 386 | |||
| 387 | /* | ||
| 388 | * Wrapper for _xfs_log_force(), to be used when caller doesn't care | ||
| 389 | * about errors or whether the log was flushed or not. This is the normal | ||
| 390 | * interface to use when trying to unpin items or move the log forward. | ||
| 391 | */ | ||
| 392 | void | ||
| 393 | xfs_log_force( | ||
| 394 | xfs_mount_t *mp, | ||
| 395 | xfs_lsn_t lsn, | ||
| 396 | uint flags) | ||
| 397 | { | ||
| 398 | int error; | ||
| 399 | error = _xfs_log_force(mp, lsn, flags, NULL); | ||
| 400 | if (error) { | ||
| 401 | xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " | ||
| 402 | "error %d returned.", error); | ||
| 403 | } | ||
| 404 | } | ||
| 405 | |||
| 388 | 406 | ||
| 389 | /* | 407 | /* |
| 390 | * Attaches a new iclog I/O completion callback routine during | 408 | * Attaches a new iclog I/O completion callback routine during |
| @@ -397,12 +415,10 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ | |||
| 397 | void *iclog_hndl, /* iclog to hang callback off */ | 415 | void *iclog_hndl, /* iclog to hang callback off */ |
| 398 | xfs_log_callback_t *cb) | 416 | xfs_log_callback_t *cb) |
| 399 | { | 417 | { |
| 400 | xlog_t *log = mp->m_log; | ||
| 401 | xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; | 418 | xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; |
| 402 | int abortflg; | 419 | int abortflg; |
| 403 | 420 | ||
| 404 | cb->cb_next = NULL; | 421 | spin_lock(&iclog->ic_callback_lock); |
| 405 | spin_lock(&log->l_icloglock); | ||
| 406 | abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); | 422 | abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); |
| 407 | if (!abortflg) { | 423 | if (!abortflg) { |
| 408 | ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || | 424 | ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || |
| @@ -411,7 +427,7 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ | |||
| 411 | *(iclog->ic_callback_tail) = cb; | 427 | *(iclog->ic_callback_tail) = cb; |
| 412 | iclog->ic_callback_tail = &(cb->cb_next); | 428 | iclog->ic_callback_tail = &(cb->cb_next); |
| 413 | } | 429 | } |
| 414 | spin_unlock(&log->l_icloglock); | 430 | spin_unlock(&iclog->ic_callback_lock); |
| 415 | return abortflg; | 431 | return abortflg; |
| 416 | } /* xfs_log_notify */ | 432 | } /* xfs_log_notify */ |
| 417 | 433 | ||
| @@ -471,6 +487,8 @@ xfs_log_reserve(xfs_mount_t *mp, | |||
| 471 | /* may sleep if need to allocate more tickets */ | 487 | /* may sleep if need to allocate more tickets */ |
| 472 | internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, | 488 | internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, |
| 473 | client, flags); | 489 | client, flags); |
| 490 | if (!internal_ticket) | ||
| 491 | return XFS_ERROR(ENOMEM); | ||
| 474 | internal_ticket->t_trans_type = t_type; | 492 | internal_ticket->t_trans_type = t_type; |
| 475 | *ticket = internal_ticket; | 493 | *ticket = internal_ticket; |
| 476 | xlog_trace_loggrant(log, internal_ticket, | 494 | xlog_trace_loggrant(log, internal_ticket, |
| @@ -636,7 +654,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
| 636 | if (mp->m_flags & XFS_MOUNT_RDONLY) | 654 | if (mp->m_flags & XFS_MOUNT_RDONLY) |
| 637 | return 0; | 655 | return 0; |
| 638 | 656 | ||
| 639 | xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC); | 657 | error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); |
| 658 | ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); | ||
| 640 | 659 | ||
| 641 | #ifdef DEBUG | 660 | #ifdef DEBUG |
| 642 | first_iclog = iclog = log->l_iclog; | 661 | first_iclog = iclog = log->l_iclog; |
| @@ -675,10 +694,10 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
| 675 | 694 | ||
| 676 | spin_lock(&log->l_icloglock); | 695 | spin_lock(&log->l_icloglock); |
| 677 | iclog = log->l_iclog; | 696 | iclog = log->l_iclog; |
| 678 | iclog->ic_refcnt++; | 697 | atomic_inc(&iclog->ic_refcnt); |
| 679 | spin_unlock(&log->l_icloglock); | 698 | spin_unlock(&log->l_icloglock); |
| 680 | xlog_state_want_sync(log, iclog); | 699 | xlog_state_want_sync(log, iclog); |
| 681 | (void) xlog_state_release_iclog(log, iclog); | 700 | error = xlog_state_release_iclog(log, iclog); |
| 682 | 701 | ||
| 683 | spin_lock(&log->l_icloglock); | 702 | spin_lock(&log->l_icloglock); |
| 684 | if (!(iclog->ic_state == XLOG_STATE_ACTIVE || | 703 | if (!(iclog->ic_state == XLOG_STATE_ACTIVE || |
| @@ -695,7 +714,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
| 695 | if (tic) { | 714 | if (tic) { |
| 696 | xlog_trace_loggrant(log, tic, "unmount rec"); | 715 | xlog_trace_loggrant(log, tic, "unmount rec"); |
| 697 | xlog_ungrant_log_space(log, tic); | 716 | xlog_ungrant_log_space(log, tic); |
| 698 | xlog_state_put_ticket(log, tic); | 717 | xlog_ticket_put(log, tic); |
| 699 | } | 718 | } |
| 700 | } else { | 719 | } else { |
| 701 | /* | 720 | /* |
| @@ -713,11 +732,11 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
| 713 | */ | 732 | */ |
| 714 | spin_lock(&log->l_icloglock); | 733 | spin_lock(&log->l_icloglock); |
| 715 | iclog = log->l_iclog; | 734 | iclog = log->l_iclog; |
| 716 | iclog->ic_refcnt++; | 735 | atomic_inc(&iclog->ic_refcnt); |
| 717 | spin_unlock(&log->l_icloglock); | 736 | spin_unlock(&log->l_icloglock); |
| 718 | 737 | ||
| 719 | xlog_state_want_sync(log, iclog); | 738 | xlog_state_want_sync(log, iclog); |
| 720 | (void) xlog_state_release_iclog(log, iclog); | 739 | error = xlog_state_release_iclog(log, iclog); |
| 721 | 740 | ||
| 722 | spin_lock(&log->l_icloglock); | 741 | spin_lock(&log->l_icloglock); |
| 723 | 742 | ||
| @@ -732,7 +751,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
| 732 | } | 751 | } |
| 733 | } | 752 | } |
| 734 | 753 | ||
| 735 | return 0; | 754 | return error; |
| 736 | } /* xfs_log_unmount_write */ | 755 | } /* xfs_log_unmount_write */ |
| 737 | 756 | ||
| 738 | /* | 757 | /* |
| @@ -1210,7 +1229,6 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
| 1210 | spin_lock_init(&log->l_icloglock); | 1229 | spin_lock_init(&log->l_icloglock); |
| 1211 | spin_lock_init(&log->l_grant_lock); | 1230 | spin_lock_init(&log->l_grant_lock); |
| 1212 | initnsema(&log->l_flushsema, 0, "ic-flush"); | 1231 | initnsema(&log->l_flushsema, 0, "ic-flush"); |
| 1213 | xlog_state_ticket_alloc(log); /* wait until after icloglock inited */ | ||
| 1214 | 1232 | ||
| 1215 | /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ | 1233 | /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ |
| 1216 | ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); | 1234 | ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); |
| @@ -1240,9 +1258,9 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
| 1240 | XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); | 1258 | XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); |
| 1241 | iclog->ic_bp = bp; | 1259 | iclog->ic_bp = bp; |
| 1242 | iclog->hic_data = bp->b_addr; | 1260 | iclog->hic_data = bp->b_addr; |
| 1243 | 1261 | #ifdef DEBUG | |
| 1244 | log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); | 1262 | log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); |
| 1245 | 1263 | #endif | |
| 1246 | head = &iclog->ic_header; | 1264 | head = &iclog->ic_header; |
| 1247 | memset(head, 0, sizeof(xlog_rec_header_t)); | 1265 | memset(head, 0, sizeof(xlog_rec_header_t)); |
| 1248 | head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); | 1266 | head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); |
| @@ -1253,10 +1271,11 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
| 1253 | head->h_fmt = cpu_to_be32(XLOG_FMT); | 1271 | head->h_fmt = cpu_to_be32(XLOG_FMT); |
| 1254 | memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); | 1272 | memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); |
| 1255 | 1273 | ||
| 1256 | |||
| 1257 | iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; | 1274 | iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; |
| 1258 | iclog->ic_state = XLOG_STATE_ACTIVE; | 1275 | iclog->ic_state = XLOG_STATE_ACTIVE; |
| 1259 | iclog->ic_log = log; | 1276 | iclog->ic_log = log; |
| 1277 | atomic_set(&iclog->ic_refcnt, 0); | ||
| 1278 | spin_lock_init(&iclog->ic_callback_lock); | ||
| 1260 | iclog->ic_callback_tail = &(iclog->ic_callback); | 1279 | iclog->ic_callback_tail = &(iclog->ic_callback); |
| 1261 | iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; | 1280 | iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; |
| 1262 | 1281 | ||
| @@ -1405,7 +1424,7 @@ xlog_sync(xlog_t *log, | |||
| 1405 | int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); | 1424 | int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); |
| 1406 | 1425 | ||
| 1407 | XFS_STATS_INC(xs_log_writes); | 1426 | XFS_STATS_INC(xs_log_writes); |
| 1408 | ASSERT(iclog->ic_refcnt == 0); | 1427 | ASSERT(atomic_read(&iclog->ic_refcnt) == 0); |
| 1409 | 1428 | ||
| 1410 | /* Add for LR header */ | 1429 | /* Add for LR header */ |
| 1411 | count_init = log->l_iclog_hsize + iclog->ic_offset; | 1430 | count_init = log->l_iclog_hsize + iclog->ic_offset; |
| @@ -1538,7 +1557,6 @@ STATIC void | |||
| 1538 | xlog_dealloc_log(xlog_t *log) | 1557 | xlog_dealloc_log(xlog_t *log) |
| 1539 | { | 1558 | { |
| 1540 | xlog_in_core_t *iclog, *next_iclog; | 1559 | xlog_in_core_t *iclog, *next_iclog; |
| 1541 | xlog_ticket_t *tic, *next_tic; | ||
| 1542 | int i; | 1560 | int i; |
| 1543 | 1561 | ||
| 1544 | iclog = log->l_iclog; | 1562 | iclog = log->l_iclog; |
| @@ -1559,22 +1577,6 @@ xlog_dealloc_log(xlog_t *log) | |||
| 1559 | spinlock_destroy(&log->l_icloglock); | 1577 | spinlock_destroy(&log->l_icloglock); |
| 1560 | spinlock_destroy(&log->l_grant_lock); | 1578 | spinlock_destroy(&log->l_grant_lock); |
| 1561 | 1579 | ||
| 1562 | /* XXXsup take a look at this again. */ | ||
| 1563 | if ((log->l_ticket_cnt != log->l_ticket_tcnt) && | ||
| 1564 | !XLOG_FORCED_SHUTDOWN(log)) { | ||
| 1565 | xfs_fs_cmn_err(CE_WARN, log->l_mp, | ||
| 1566 | "xlog_dealloc_log: (cnt: %d, total: %d)", | ||
| 1567 | log->l_ticket_cnt, log->l_ticket_tcnt); | ||
| 1568 | /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */ | ||
| 1569 | |||
| 1570 | } else { | ||
| 1571 | tic = log->l_unmount_free; | ||
| 1572 | while (tic) { | ||
| 1573 | next_tic = tic->t_next; | ||
| 1574 | kmem_free(tic, PAGE_SIZE); | ||
| 1575 | tic = next_tic; | ||
| 1576 | } | ||
| 1577 | } | ||
| 1578 | xfs_buf_free(log->l_xbuf); | 1580 | xfs_buf_free(log->l_xbuf); |
| 1579 | #ifdef XFS_LOG_TRACE | 1581 | #ifdef XFS_LOG_TRACE |
| 1580 | if (log->l_trace != NULL) { | 1582 | if (log->l_trace != NULL) { |
| @@ -1987,7 +1989,7 @@ xlog_state_clean_log(xlog_t *log) | |||
| 1987 | if (iclog->ic_state == XLOG_STATE_DIRTY) { | 1989 | if (iclog->ic_state == XLOG_STATE_DIRTY) { |
| 1988 | iclog->ic_state = XLOG_STATE_ACTIVE; | 1990 | iclog->ic_state = XLOG_STATE_ACTIVE; |
| 1989 | iclog->ic_offset = 0; | 1991 | iclog->ic_offset = 0; |
| 1990 | iclog->ic_callback = NULL; /* don't need to free */ | 1992 | ASSERT(iclog->ic_callback == NULL); |
| 1991 | /* | 1993 | /* |
| 1992 | * If the number of ops in this iclog indicate it just | 1994 | * If the number of ops in this iclog indicate it just |
| 1993 | * contains the dummy transaction, we can | 1995 | * contains the dummy transaction, we can |
| @@ -2190,37 +2192,40 @@ xlog_state_do_callback( | |||
| 2190 | be64_to_cpu(iclog->ic_header.h_lsn); | 2192 | be64_to_cpu(iclog->ic_header.h_lsn); |
| 2191 | spin_unlock(&log->l_grant_lock); | 2193 | spin_unlock(&log->l_grant_lock); |
| 2192 | 2194 | ||
| 2193 | /* | ||
| 2194 | * Keep processing entries in the callback list | ||
| 2195 | * until we come around and it is empty. We | ||
| 2196 | * need to atomically see that the list is | ||
| 2197 | * empty and change the state to DIRTY so that | ||
| 2198 | * we don't miss any more callbacks being added. | ||
| 2199 | */ | ||
| 2200 | spin_lock(&log->l_icloglock); | ||
| 2201 | } else { | 2195 | } else { |
| 2196 | spin_unlock(&log->l_icloglock); | ||
| 2202 | ioerrors++; | 2197 | ioerrors++; |
| 2203 | } | 2198 | } |
| 2204 | cb = iclog->ic_callback; | ||
| 2205 | 2199 | ||
| 2200 | /* | ||
| 2201 | * Keep processing entries in the callback list until | ||
| 2202 | * we come around and it is empty. We need to | ||
| 2203 | * atomically see that the list is empty and change the | ||
| 2204 | * state to DIRTY so that we don't miss any more | ||
| 2205 | * callbacks being added. | ||
| 2206 | */ | ||
| 2207 | spin_lock(&iclog->ic_callback_lock); | ||
| 2208 | cb = iclog->ic_callback; | ||
| 2206 | while (cb) { | 2209 | while (cb) { |
| 2207 | iclog->ic_callback_tail = &(iclog->ic_callback); | 2210 | iclog->ic_callback_tail = &(iclog->ic_callback); |
| 2208 | iclog->ic_callback = NULL; | 2211 | iclog->ic_callback = NULL; |
| 2209 | spin_unlock(&log->l_icloglock); | 2212 | spin_unlock(&iclog->ic_callback_lock); |
| 2210 | 2213 | ||
| 2211 | /* perform callbacks in the order given */ | 2214 | /* perform callbacks in the order given */ |
| 2212 | for (; cb; cb = cb_next) { | 2215 | for (; cb; cb = cb_next) { |
| 2213 | cb_next = cb->cb_next; | 2216 | cb_next = cb->cb_next; |
| 2214 | cb->cb_func(cb->cb_arg, aborted); | 2217 | cb->cb_func(cb->cb_arg, aborted); |
| 2215 | } | 2218 | } |
| 2216 | spin_lock(&log->l_icloglock); | 2219 | spin_lock(&iclog->ic_callback_lock); |
| 2217 | cb = iclog->ic_callback; | 2220 | cb = iclog->ic_callback; |
| 2218 | } | 2221 | } |
| 2219 | 2222 | ||
| 2220 | loopdidcallbacks++; | 2223 | loopdidcallbacks++; |
| 2221 | funcdidcallbacks++; | 2224 | funcdidcallbacks++; |
| 2222 | 2225 | ||
| 2226 | spin_lock(&log->l_icloglock); | ||
| 2223 | ASSERT(iclog->ic_callback == NULL); | 2227 | ASSERT(iclog->ic_callback == NULL); |
| 2228 | spin_unlock(&iclog->ic_callback_lock); | ||
| 2224 | if (!(iclog->ic_state & XLOG_STATE_IOERROR)) | 2229 | if (!(iclog->ic_state & XLOG_STATE_IOERROR)) |
| 2225 | iclog->ic_state = XLOG_STATE_DIRTY; | 2230 | iclog->ic_state = XLOG_STATE_DIRTY; |
| 2226 | 2231 | ||
| @@ -2241,7 +2246,7 @@ xlog_state_do_callback( | |||
| 2241 | repeats = 0; | 2246 | repeats = 0; |
| 2242 | xfs_fs_cmn_err(CE_WARN, log->l_mp, | 2247 | xfs_fs_cmn_err(CE_WARN, log->l_mp, |
| 2243 | "%s: possible infinite loop (%d iterations)", | 2248 | "%s: possible infinite loop (%d iterations)", |
| 2244 | __FUNCTION__, flushcnt); | 2249 | __func__, flushcnt); |
| 2245 | } | 2250 | } |
| 2246 | } while (!ioerrors && loopdidcallbacks); | 2251 | } while (!ioerrors && loopdidcallbacks); |
| 2247 | 2252 | ||
| @@ -2309,7 +2314,7 @@ xlog_state_done_syncing( | |||
| 2309 | 2314 | ||
| 2310 | ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || | 2315 | ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || |
| 2311 | iclog->ic_state == XLOG_STATE_IOERROR); | 2316 | iclog->ic_state == XLOG_STATE_IOERROR); |
| 2312 | ASSERT(iclog->ic_refcnt == 0); | 2317 | ASSERT(atomic_read(&iclog->ic_refcnt) == 0); |
| 2313 | ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); | 2318 | ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); |
| 2314 | 2319 | ||
| 2315 | 2320 | ||
| @@ -2391,7 +2396,7 @@ restart: | |||
| 2391 | ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); | 2396 | ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); |
| 2392 | head = &iclog->ic_header; | 2397 | head = &iclog->ic_header; |
| 2393 | 2398 | ||
| 2394 | iclog->ic_refcnt++; /* prevents sync */ | 2399 | atomic_inc(&iclog->ic_refcnt); /* prevents sync */ |
| 2395 | log_offset = iclog->ic_offset; | 2400 | log_offset = iclog->ic_offset; |
| 2396 | 2401 | ||
| 2397 | /* On the 1st write to an iclog, figure out lsn. This works | 2402 | /* On the 1st write to an iclog, figure out lsn. This works |
| @@ -2423,12 +2428,12 @@ restart: | |||
| 2423 | xlog_state_switch_iclogs(log, iclog, iclog->ic_size); | 2428 | xlog_state_switch_iclogs(log, iclog, iclog->ic_size); |
| 2424 | 2429 | ||
| 2425 | /* If I'm the only one writing to this iclog, sync it to disk */ | 2430 | /* If I'm the only one writing to this iclog, sync it to disk */ |
| 2426 | if (iclog->ic_refcnt == 1) { | 2431 | if (atomic_read(&iclog->ic_refcnt) == 1) { |
| 2427 | spin_unlock(&log->l_icloglock); | 2432 | spin_unlock(&log->l_icloglock); |
| 2428 | if ((error = xlog_state_release_iclog(log, iclog))) | 2433 | if ((error = xlog_state_release_iclog(log, iclog))) |
| 2429 | return error; | 2434 | return error; |
| 2430 | } else { | 2435 | } else { |
| 2431 | iclog->ic_refcnt--; | 2436 | atomic_dec(&iclog->ic_refcnt); |
| 2432 | spin_unlock(&log->l_icloglock); | 2437 | spin_unlock(&log->l_icloglock); |
| 2433 | } | 2438 | } |
| 2434 | goto restart; | 2439 | goto restart; |
| @@ -2792,18 +2797,6 @@ xlog_ungrant_log_space(xlog_t *log, | |||
| 2792 | 2797 | ||
| 2793 | 2798 | ||
| 2794 | /* | 2799 | /* |
| 2795 | * Atomically put back used ticket. | ||
| 2796 | */ | ||
| 2797 | STATIC void | ||
| 2798 | xlog_state_put_ticket(xlog_t *log, | ||
| 2799 | xlog_ticket_t *tic) | ||
| 2800 | { | ||
| 2801 | spin_lock(&log->l_icloglock); | ||
| 2802 | xlog_ticket_put(log, tic); | ||
| 2803 | spin_unlock(&log->l_icloglock); | ||
| 2804 | } /* xlog_state_put_ticket */ | ||
| 2805 | |||
| 2806 | /* | ||
| 2807 | * Flush iclog to disk if this is the last reference to the given iclog and | 2800 | * Flush iclog to disk if this is the last reference to the given iclog and |
| 2808 | * the WANT_SYNC bit is set. | 2801 | * the WANT_SYNC bit is set. |
| 2809 | * | 2802 | * |
| @@ -2813,33 +2806,35 @@ xlog_state_put_ticket(xlog_t *log, | |||
| 2813 | * | 2806 | * |
| 2814 | */ | 2807 | */ |
| 2815 | STATIC int | 2808 | STATIC int |
| 2816 | xlog_state_release_iclog(xlog_t *log, | 2809 | xlog_state_release_iclog( |
| 2817 | xlog_in_core_t *iclog) | 2810 | xlog_t *log, |
| 2811 | xlog_in_core_t *iclog) | ||
| 2818 | { | 2812 | { |
| 2819 | int sync = 0; /* do we sync? */ | 2813 | int sync = 0; /* do we sync? */ |
| 2820 | 2814 | ||
| 2821 | xlog_assign_tail_lsn(log->l_mp); | 2815 | if (iclog->ic_state & XLOG_STATE_IOERROR) |
| 2816 | return XFS_ERROR(EIO); | ||
| 2822 | 2817 | ||
| 2823 | spin_lock(&log->l_icloglock); | 2818 | ASSERT(atomic_read(&iclog->ic_refcnt) > 0); |
| 2819 | if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) | ||
| 2820 | return 0; | ||
| 2824 | 2821 | ||
| 2825 | if (iclog->ic_state & XLOG_STATE_IOERROR) { | 2822 | if (iclog->ic_state & XLOG_STATE_IOERROR) { |
| 2826 | spin_unlock(&log->l_icloglock); | 2823 | spin_unlock(&log->l_icloglock); |
| 2827 | return XFS_ERROR(EIO); | 2824 | return XFS_ERROR(EIO); |
| 2828 | } | 2825 | } |
| 2829 | |||
| 2830 | ASSERT(iclog->ic_refcnt > 0); | ||
| 2831 | ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || | 2826 | ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || |
| 2832 | iclog->ic_state == XLOG_STATE_WANT_SYNC); | 2827 | iclog->ic_state == XLOG_STATE_WANT_SYNC); |
| 2833 | 2828 | ||
| 2834 | if (--iclog->ic_refcnt == 0 && | 2829 | if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { |
| 2835 | iclog->ic_state == XLOG_STATE_WANT_SYNC) { | 2830 | /* update tail before writing to iclog */ |
| 2831 | xlog_assign_tail_lsn(log->l_mp); | ||
| 2836 | sync++; | 2832 | sync++; |
| 2837 | iclog->ic_state = XLOG_STATE_SYNCING; | 2833 | iclog->ic_state = XLOG_STATE_SYNCING; |
| 2838 | iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); | 2834 | iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); |
| 2839 | xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); | 2835 | xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); |
| 2840 | /* cycle incremented when incrementing curr_block */ | 2836 | /* cycle incremented when incrementing curr_block */ |
| 2841 | } | 2837 | } |
| 2842 | |||
| 2843 | spin_unlock(&log->l_icloglock); | 2838 | spin_unlock(&log->l_icloglock); |
| 2844 | 2839 | ||
| 2845 | /* | 2840 | /* |
| @@ -2849,11 +2844,9 @@ xlog_state_release_iclog(xlog_t *log, | |||
| 2849 | * this iclog has consistent data, so we ignore IOERROR | 2844 | * this iclog has consistent data, so we ignore IOERROR |
| 2850 | * flags after this point. | 2845 | * flags after this point. |
| 2851 | */ | 2846 | */ |
| 2852 | if (sync) { | 2847 | if (sync) |
| 2853 | return xlog_sync(log, iclog); | 2848 | return xlog_sync(log, iclog); |
| 2854 | } | ||
| 2855 | return 0; | 2849 | return 0; |
| 2856 | |||
| 2857 | } /* xlog_state_release_iclog */ | 2850 | } /* xlog_state_release_iclog */ |
| 2858 | 2851 | ||
| 2859 | 2852 | ||
| @@ -2953,7 +2946,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) | |||
| 2953 | * previous iclog and go to sleep. | 2946 | * previous iclog and go to sleep. |
| 2954 | */ | 2947 | */ |
| 2955 | if (iclog->ic_state == XLOG_STATE_DIRTY || | 2948 | if (iclog->ic_state == XLOG_STATE_DIRTY || |
| 2956 | (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) { | 2949 | (atomic_read(&iclog->ic_refcnt) == 0 |
| 2950 | && iclog->ic_offset == 0)) { | ||
| 2957 | iclog = iclog->ic_prev; | 2951 | iclog = iclog->ic_prev; |
| 2958 | if (iclog->ic_state == XLOG_STATE_ACTIVE || | 2952 | if (iclog->ic_state == XLOG_STATE_ACTIVE || |
| 2959 | iclog->ic_state == XLOG_STATE_DIRTY) | 2953 | iclog->ic_state == XLOG_STATE_DIRTY) |
| @@ -2961,14 +2955,14 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) | |||
| 2961 | else | 2955 | else |
| 2962 | goto maybe_sleep; | 2956 | goto maybe_sleep; |
| 2963 | } else { | 2957 | } else { |
| 2964 | if (iclog->ic_refcnt == 0) { | 2958 | if (atomic_read(&iclog->ic_refcnt) == 0) { |
| 2965 | /* We are the only one with access to this | 2959 | /* We are the only one with access to this |
| 2966 | * iclog. Flush it out now. There should | 2960 | * iclog. Flush it out now. There should |
| 2967 | * be a roundoff of zero to show that someone | 2961 | * be a roundoff of zero to show that someone |
| 2968 | * has already taken care of the roundoff from | 2962 | * has already taken care of the roundoff from |
| 2969 | * the previous sync. | 2963 | * the previous sync. |
| 2970 | */ | 2964 | */ |
| 2971 | iclog->ic_refcnt++; | 2965 | atomic_inc(&iclog->ic_refcnt); |
| 2972 | lsn = be64_to_cpu(iclog->ic_header.h_lsn); | 2966 | lsn = be64_to_cpu(iclog->ic_header.h_lsn); |
| 2973 | xlog_state_switch_iclogs(log, iclog, 0); | 2967 | xlog_state_switch_iclogs(log, iclog, 0); |
| 2974 | spin_unlock(&log->l_icloglock); | 2968 | spin_unlock(&log->l_icloglock); |
| @@ -3100,7 +3094,7 @@ try_again: | |||
| 3100 | already_slept = 1; | 3094 | already_slept = 1; |
| 3101 | goto try_again; | 3095 | goto try_again; |
| 3102 | } else { | 3096 | } else { |
| 3103 | iclog->ic_refcnt++; | 3097 | atomic_inc(&iclog->ic_refcnt); |
| 3104 | xlog_state_switch_iclogs(log, iclog, 0); | 3098 | xlog_state_switch_iclogs(log, iclog, 0); |
| 3105 | spin_unlock(&log->l_icloglock); | 3099 | spin_unlock(&log->l_icloglock); |
| 3106 | if (xlog_state_release_iclog(log, iclog)) | 3100 | if (xlog_state_release_iclog(log, iclog)) |
| @@ -3172,92 +3166,19 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) | |||
| 3172 | */ | 3166 | */ |
| 3173 | 3167 | ||
| 3174 | /* | 3168 | /* |
| 3175 | * Algorithm doesn't take into account page size. ;-( | 3169 | * Free a used ticket. |
| 3176 | */ | ||
| 3177 | STATIC void | ||
| 3178 | xlog_state_ticket_alloc(xlog_t *log) | ||
| 3179 | { | ||
| 3180 | xlog_ticket_t *t_list; | ||
| 3181 | xlog_ticket_t *next; | ||
| 3182 | xfs_caddr_t buf; | ||
| 3183 | uint i = (PAGE_SIZE / sizeof(xlog_ticket_t)) - 2; | ||
| 3184 | |||
| 3185 | /* | ||
| 3186 | * The kmem_zalloc may sleep, so we shouldn't be holding the | ||
| 3187 | * global lock. XXXmiken: may want to use zone allocator. | ||
| 3188 | */ | ||
| 3189 | buf = (xfs_caddr_t) kmem_zalloc(PAGE_SIZE, KM_SLEEP); | ||
| 3190 | |||
| 3191 | spin_lock(&log->l_icloglock); | ||
| 3192 | |||
| 3193 | /* Attach 1st ticket to Q, so we can keep track of allocated memory */ | ||
| 3194 | t_list = (xlog_ticket_t *)buf; | ||
| 3195 | t_list->t_next = log->l_unmount_free; | ||
| 3196 | log->l_unmount_free = t_list++; | ||
| 3197 | log->l_ticket_cnt++; | ||
| 3198 | log->l_ticket_tcnt++; | ||
| 3199 | |||
| 3200 | /* Next ticket becomes first ticket attached to ticket free list */ | ||
| 3201 | if (log->l_freelist != NULL) { | ||
| 3202 | ASSERT(log->l_tail != NULL); | ||
| 3203 | log->l_tail->t_next = t_list; | ||
| 3204 | } else { | ||
| 3205 | log->l_freelist = t_list; | ||
| 3206 | } | ||
| 3207 | log->l_ticket_cnt++; | ||
| 3208 | log->l_ticket_tcnt++; | ||
| 3209 | |||
| 3210 | /* Cycle through rest of alloc'ed memory, building up free Q */ | ||
| 3211 | for ( ; i > 0; i--) { | ||
| 3212 | next = t_list + 1; | ||
| 3213 | t_list->t_next = next; | ||
| 3214 | t_list = next; | ||
| 3215 | log->l_ticket_cnt++; | ||
| 3216 | log->l_ticket_tcnt++; | ||
| 3217 | } | ||
| 3218 | t_list->t_next = NULL; | ||
| 3219 | log->l_tail = t_list; | ||
| 3220 | spin_unlock(&log->l_icloglock); | ||
| 3221 | } /* xlog_state_ticket_alloc */ | ||
| 3222 | |||
| 3223 | |||
| 3224 | /* | ||
| 3225 | * Put ticket into free list | ||
| 3226 | * | ||
| 3227 | * Assumption: log lock is held around this call. | ||
| 3228 | */ | 3170 | */ |
| 3229 | STATIC void | 3171 | STATIC void |
| 3230 | xlog_ticket_put(xlog_t *log, | 3172 | xlog_ticket_put(xlog_t *log, |
| 3231 | xlog_ticket_t *ticket) | 3173 | xlog_ticket_t *ticket) |
| 3232 | { | 3174 | { |
| 3233 | sv_destroy(&ticket->t_sema); | 3175 | sv_destroy(&ticket->t_sema); |
| 3234 | 3176 | kmem_zone_free(xfs_log_ticket_zone, ticket); | |
| 3235 | /* | ||
| 3236 | * Don't think caching will make that much difference. It's | ||
| 3237 | * more important to make debug easier. | ||
| 3238 | */ | ||
| 3239 | #if 0 | ||
| 3240 | /* real code will want to use LIFO for caching */ | ||
| 3241 | ticket->t_next = log->l_freelist; | ||
| 3242 | log->l_freelist = ticket; | ||
| 3243 | /* no need to clear fields */ | ||
| 3244 | #else | ||
| 3245 | /* When we debug, it is easier if tickets are cycled */ | ||
| 3246 | ticket->t_next = NULL; | ||
| 3247 | if (log->l_tail) { | ||
| 3248 | log->l_tail->t_next = ticket; | ||
| 3249 | } else { | ||
| 3250 | ASSERT(log->l_freelist == NULL); | ||
| 3251 | log->l_freelist = ticket; | ||
| 3252 | } | ||
| 3253 | log->l_tail = ticket; | ||
| 3254 | #endif /* DEBUG */ | ||
| 3255 | log->l_ticket_cnt++; | ||
| 3256 | } /* xlog_ticket_put */ | 3177 | } /* xlog_ticket_put */ |
| 3257 | 3178 | ||
| 3258 | 3179 | ||
| 3259 | /* | 3180 | /* |
| 3260 | * Grab ticket off freelist or allocation some more | 3181 | * Allocate and initialise a new log ticket. |
| 3261 | */ | 3182 | */ |
| 3262 | STATIC xlog_ticket_t * | 3183 | STATIC xlog_ticket_t * |
| 3263 | xlog_ticket_get(xlog_t *log, | 3184 | xlog_ticket_get(xlog_t *log, |
| @@ -3269,21 +3190,9 @@ xlog_ticket_get(xlog_t *log, | |||
| 3269 | xlog_ticket_t *tic; | 3190 | xlog_ticket_t *tic; |
| 3270 | uint num_headers; | 3191 | uint num_headers; |
| 3271 | 3192 | ||
| 3272 | alloc: | 3193 | tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); |
| 3273 | if (log->l_freelist == NULL) | 3194 | if (!tic) |
| 3274 | xlog_state_ticket_alloc(log); /* potentially sleep */ | 3195 | return NULL; |
| 3275 | |||
| 3276 | spin_lock(&log->l_icloglock); | ||
| 3277 | if (log->l_freelist == NULL) { | ||
| 3278 | spin_unlock(&log->l_icloglock); | ||
| 3279 | goto alloc; | ||
| 3280 | } | ||
| 3281 | tic = log->l_freelist; | ||
| 3282 | log->l_freelist = tic->t_next; | ||
| 3283 | if (log->l_freelist == NULL) | ||
| 3284 | log->l_tail = NULL; | ||
| 3285 | log->l_ticket_cnt--; | ||
| 3286 | spin_unlock(&log->l_icloglock); | ||
| 3287 | 3196 | ||
| 3288 | /* | 3197 | /* |
| 3289 | * Permanent reservations have up to 'cnt'-1 active log operations | 3198 | * Permanent reservations have up to 'cnt'-1 active log operations |
| @@ -3611,8 +3520,8 @@ xfs_log_force_umount( | |||
| 3611 | * before we mark the filesystem SHUTDOWN and wake | 3520 | * before we mark the filesystem SHUTDOWN and wake |
| 3612 | * everybody up to tell the bad news. | 3521 | * everybody up to tell the bad news. |
| 3613 | */ | 3522 | */ |
| 3614 | spin_lock(&log->l_grant_lock); | ||
| 3615 | spin_lock(&log->l_icloglock); | 3523 | spin_lock(&log->l_icloglock); |
| 3524 | spin_lock(&log->l_grant_lock); | ||
| 3616 | mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; | 3525 | mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; |
| 3617 | XFS_BUF_DONE(mp->m_sb_bp); | 3526 | XFS_BUF_DONE(mp->m_sb_bp); |
| 3618 | /* | 3527 | /* |
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 4cdac048df5e..d1d678ecb63e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h | |||
| @@ -142,8 +142,9 @@ int _xfs_log_force(struct xfs_mount *mp, | |||
| 142 | xfs_lsn_t lsn, | 142 | xfs_lsn_t lsn, |
| 143 | uint flags, | 143 | uint flags, |
| 144 | int *log_forced); | 144 | int *log_forced); |
| 145 | #define xfs_log_force(mp, lsn, flags) \ | 145 | void xfs_log_force(struct xfs_mount *mp, |
| 146 | _xfs_log_force(mp, lsn, flags, NULL); | 146 | xfs_lsn_t lsn, |
| 147 | uint flags); | ||
| 147 | int xfs_log_mount(struct xfs_mount *mp, | 148 | int xfs_log_mount(struct xfs_mount *mp, |
| 148 | struct xfs_buftarg *log_target, | 149 | struct xfs_buftarg *log_target, |
| 149 | xfs_daddr_t start_block, | 150 | xfs_daddr_t start_block, |
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c6244cc733c0..8952a392b5f3 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h | |||
| @@ -242,7 +242,7 @@ typedef struct xlog_res { | |||
| 242 | 242 | ||
| 243 | typedef struct xlog_ticket { | 243 | typedef struct xlog_ticket { |
| 244 | sv_t t_sema; /* sleep on this semaphore : 20 */ | 244 | sv_t t_sema; /* sleep on this semaphore : 20 */ |
| 245 | struct xlog_ticket *t_next; /* :4|8 */ | 245 | struct xlog_ticket *t_next; /* :4|8 */ |
| 246 | struct xlog_ticket *t_prev; /* :4|8 */ | 246 | struct xlog_ticket *t_prev; /* :4|8 */ |
| 247 | xlog_tid_t t_tid; /* transaction identifier : 4 */ | 247 | xlog_tid_t t_tid; /* transaction identifier : 4 */ |
| 248 | int t_curr_res; /* current reservation in bytes : 4 */ | 248 | int t_curr_res; /* current reservation in bytes : 4 */ |
| @@ -324,6 +324,19 @@ typedef struct xlog_rec_ext_header { | |||
| 324 | * - ic_offset is the current number of bytes written to in this iclog. | 324 | * - ic_offset is the current number of bytes written to in this iclog. |
| 325 | * - ic_refcnt is bumped when someone is writing to the log. | 325 | * - ic_refcnt is bumped when someone is writing to the log. |
| 326 | * - ic_state is the state of the iclog. | 326 | * - ic_state is the state of the iclog. |
| 327 | * | ||
| 328 | * Because of cacheline contention on large machines, we need to separate | ||
| 329 | * various resources onto different cachelines. To start with, make the | ||
| 330 | * structure cacheline aligned. The following fields can be contended on | ||
| 331 | * by independent processes: | ||
| 332 | * | ||
| 333 | * - ic_callback_* | ||
| 334 | * - ic_refcnt | ||
| 335 | * - fields protected by the global l_icloglock | ||
| 336 | * | ||
| 337 | * so we need to ensure that these fields are located in separate cachelines. | ||
| 338 | * We'll put all the read-only and l_icloglock fields in the first cacheline, | ||
| 339 | * and move everything else out to subsequent cachelines. | ||
| 327 | */ | 340 | */ |
| 328 | typedef struct xlog_iclog_fields { | 341 | typedef struct xlog_iclog_fields { |
| 329 | sv_t ic_forcesema; | 342 | sv_t ic_forcesema; |
| @@ -332,17 +345,22 @@ typedef struct xlog_iclog_fields { | |||
| 332 | struct xlog_in_core *ic_prev; | 345 | struct xlog_in_core *ic_prev; |
| 333 | struct xfs_buf *ic_bp; | 346 | struct xfs_buf *ic_bp; |
| 334 | struct log *ic_log; | 347 | struct log *ic_log; |
| 335 | xfs_log_callback_t *ic_callback; | ||
| 336 | xfs_log_callback_t **ic_callback_tail; | ||
| 337 | #ifdef XFS_LOG_TRACE | ||
| 338 | struct ktrace *ic_trace; | ||
| 339 | #endif | ||
| 340 | int ic_size; | 348 | int ic_size; |
| 341 | int ic_offset; | 349 | int ic_offset; |
| 342 | int ic_refcnt; | ||
| 343 | int ic_bwritecnt; | 350 | int ic_bwritecnt; |
| 344 | ushort_t ic_state; | 351 | ushort_t ic_state; |
| 345 | char *ic_datap; /* pointer to iclog data */ | 352 | char *ic_datap; /* pointer to iclog data */ |
| 353 | #ifdef XFS_LOG_TRACE | ||
| 354 | struct ktrace *ic_trace; | ||
| 355 | #endif | ||
| 356 | |||
| 357 | /* Callback structures need their own cacheline */ | ||
| 358 | spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; | ||
| 359 | xfs_log_callback_t *ic_callback; | ||
| 360 | xfs_log_callback_t **ic_callback_tail; | ||
| 361 | |||
| 362 | /* reference counts need their own cacheline */ | ||
| 363 | atomic_t ic_refcnt ____cacheline_aligned_in_smp; | ||
| 346 | } xlog_iclog_fields_t; | 364 | } xlog_iclog_fields_t; |
| 347 | 365 | ||
| 348 | typedef union xlog_in_core2 { | 366 | typedef union xlog_in_core2 { |
| @@ -366,6 +384,7 @@ typedef struct xlog_in_core { | |||
| 366 | #define ic_bp hic_fields.ic_bp | 384 | #define ic_bp hic_fields.ic_bp |
| 367 | #define ic_log hic_fields.ic_log | 385 | #define ic_log hic_fields.ic_log |
| 368 | #define ic_callback hic_fields.ic_callback | 386 | #define ic_callback hic_fields.ic_callback |
| 387 | #define ic_callback_lock hic_fields.ic_callback_lock | ||
| 369 | #define ic_callback_tail hic_fields.ic_callback_tail | 388 | #define ic_callback_tail hic_fields.ic_callback_tail |
| 370 | #define ic_trace hic_fields.ic_trace | 389 | #define ic_trace hic_fields.ic_trace |
| 371 | #define ic_size hic_fields.ic_size | 390 | #define ic_size hic_fields.ic_size |
| @@ -383,43 +402,46 @@ typedef struct xlog_in_core { | |||
| 383 | * that round off problems won't occur when releasing partial reservations. | 402 | * that round off problems won't occur when releasing partial reservations. |
| 384 | */ | 403 | */ |
| 385 | typedef struct log { | 404 | typedef struct log { |
| 405 | /* The following fields don't need locking */ | ||
| 406 | struct xfs_mount *l_mp; /* mount point */ | ||
| 407 | struct xfs_buf *l_xbuf; /* extra buffer for log | ||
| 408 | * wrapping */ | ||
| 409 | struct xfs_buftarg *l_targ; /* buftarg of log */ | ||
| 410 | uint l_flags; | ||
| 411 | uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ | ||
| 412 | struct xfs_buf_cancel **l_buf_cancel_table; | ||
| 413 | int l_iclog_hsize; /* size of iclog header */ | ||
| 414 | int l_iclog_heads; /* # of iclog header sectors */ | ||
| 415 | uint l_sectbb_log; /* log2 of sector size in BBs */ | ||
| 416 | uint l_sectbb_mask; /* sector size (in BBs) | ||
| 417 | * alignment mask */ | ||
| 418 | int l_iclog_size; /* size of log in bytes */ | ||
| 419 | int l_iclog_size_log; /* log power size of log */ | ||
| 420 | int l_iclog_bufs; /* number of iclog buffers */ | ||
| 421 | xfs_daddr_t l_logBBstart; /* start block of log */ | ||
| 422 | int l_logsize; /* size of log in bytes */ | ||
| 423 | int l_logBBsize; /* size of log in BB chunks */ | ||
| 424 | |||
| 386 | /* The following block of fields are changed while holding icloglock */ | 425 | /* The following block of fields are changed while holding icloglock */ |
| 387 | sema_t l_flushsema; /* iclog flushing semaphore */ | 426 | sema_t l_flushsema ____cacheline_aligned_in_smp; |
| 427 | /* iclog flushing semaphore */ | ||
| 388 | int l_flushcnt; /* # of procs waiting on this | 428 | int l_flushcnt; /* # of procs waiting on this |
| 389 | * sema */ | 429 | * sema */ |
| 390 | int l_ticket_cnt; /* free ticket count */ | ||
| 391 | int l_ticket_tcnt; /* total ticket count */ | ||
| 392 | int l_covered_state;/* state of "covering disk | 430 | int l_covered_state;/* state of "covering disk |
| 393 | * log entries" */ | 431 | * log entries" */ |
| 394 | xlog_ticket_t *l_freelist; /* free list of tickets */ | ||
| 395 | xlog_ticket_t *l_unmount_free;/* kmem_free these addresses */ | ||
| 396 | xlog_ticket_t *l_tail; /* free list of tickets */ | ||
| 397 | xlog_in_core_t *l_iclog; /* head log queue */ | 432 | xlog_in_core_t *l_iclog; /* head log queue */ |
| 398 | spinlock_t l_icloglock; /* grab to change iclog state */ | 433 | spinlock_t l_icloglock; /* grab to change iclog state */ |
| 399 | xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed | 434 | xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed |
| 400 | * buffers */ | 435 | * buffers */ |
| 401 | xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ | 436 | xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ |
| 402 | struct xfs_mount *l_mp; /* mount point */ | ||
| 403 | struct xfs_buf *l_xbuf; /* extra buffer for log | ||
| 404 | * wrapping */ | ||
| 405 | struct xfs_buftarg *l_targ; /* buftarg of log */ | ||
| 406 | xfs_daddr_t l_logBBstart; /* start block of log */ | ||
| 407 | int l_logsize; /* size of log in bytes */ | ||
| 408 | int l_logBBsize; /* size of log in BB chunks */ | ||
| 409 | int l_curr_cycle; /* Cycle number of log writes */ | 437 | int l_curr_cycle; /* Cycle number of log writes */ |
| 410 | int l_prev_cycle; /* Cycle number before last | 438 | int l_prev_cycle; /* Cycle number before last |
| 411 | * block increment */ | 439 | * block increment */ |
| 412 | int l_curr_block; /* current logical log block */ | 440 | int l_curr_block; /* current logical log block */ |
| 413 | int l_prev_block; /* previous logical log block */ | 441 | int l_prev_block; /* previous logical log block */ |
| 414 | int l_iclog_size; /* size of log in bytes */ | ||
| 415 | int l_iclog_size_log; /* log power size of log */ | ||
| 416 | int l_iclog_bufs; /* number of iclog buffers */ | ||
| 417 | |||
| 418 | /* The following field are used for debugging; need to hold icloglock */ | ||
| 419 | char *l_iclog_bak[XLOG_MAX_ICLOGS]; | ||
| 420 | 442 | ||
| 421 | /* The following block of fields are changed while holding grant_lock */ | 443 | /* The following block of fields are changed while holding grant_lock */ |
| 422 | spinlock_t l_grant_lock; | 444 | spinlock_t l_grant_lock ____cacheline_aligned_in_smp; |
| 423 | xlog_ticket_t *l_reserve_headq; | 445 | xlog_ticket_t *l_reserve_headq; |
| 424 | xlog_ticket_t *l_write_headq; | 446 | xlog_ticket_t *l_write_headq; |
| 425 | int l_grant_reserve_cycle; | 447 | int l_grant_reserve_cycle; |
| @@ -427,19 +449,16 @@ typedef struct log { | |||
| 427 | int l_grant_write_cycle; | 449 | int l_grant_write_cycle; |
| 428 | int l_grant_write_bytes; | 450 | int l_grant_write_bytes; |
| 429 | 451 | ||
| 430 | /* The following fields don't need locking */ | ||
| 431 | #ifdef XFS_LOG_TRACE | 452 | #ifdef XFS_LOG_TRACE |
| 432 | struct ktrace *l_trace; | 453 | struct ktrace *l_trace; |
| 433 | struct ktrace *l_grant_trace; | 454 | struct ktrace *l_grant_trace; |
| 434 | #endif | 455 | #endif |
| 435 | uint l_flags; | 456 | |
| 436 | uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ | 457 | /* The following field are used for debugging; need to hold icloglock */ |
| 437 | struct xfs_buf_cancel **l_buf_cancel_table; | 458 | #ifdef DEBUG |
| 438 | int l_iclog_hsize; /* size of iclog header */ | 459 | char *l_iclog_bak[XLOG_MAX_ICLOGS]; |
| 439 | int l_iclog_heads; /* # of iclog header sectors */ | 460 | #endif |
| 440 | uint l_sectbb_log; /* log2 of sector size in BBs */ | 461 | |
| 441 | uint l_sectbb_mask; /* sector size (in BBs) | ||
| 442 | * alignment mask */ | ||
| 443 | } xlog_t; | 462 | } xlog_t; |
| 444 | 463 | ||
| 445 | #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) | 464 | #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) |
| @@ -459,6 +478,8 @@ extern struct xfs_buf *xlog_get_bp(xlog_t *, int); | |||
| 459 | extern void xlog_put_bp(struct xfs_buf *); | 478 | extern void xlog_put_bp(struct xfs_buf *); |
| 460 | extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); | 479 | extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); |
| 461 | 480 | ||
| 481 | extern kmem_zone_t *xfs_log_ticket_zone; | ||
| 482 | |||
| 462 | /* iclog tracing */ | 483 | /* iclog tracing */ |
| 463 | #define XLOG_TRACE_GRAB_FLUSH 1 | 484 | #define XLOG_TRACE_GRAB_FLUSH 1 |
| 464 | #define XLOG_TRACE_REL_FLUSH 2 | 485 | #define XLOG_TRACE_REL_FLUSH 2 |
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b2b70eba282c..e65ab4af0955 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c | |||
| @@ -46,6 +46,7 @@ | |||
| 46 | #include "xfs_trans_priv.h" | 46 | #include "xfs_trans_priv.h" |
| 47 | #include "xfs_quota.h" | 47 | #include "xfs_quota.h" |
| 48 | #include "xfs_rw.h" | 48 | #include "xfs_rw.h" |
| 49 | #include "xfs_utils.h" | ||
| 49 | 50 | ||
| 50 | STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); | 51 | STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); |
| 51 | STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); | 52 | STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); |
| @@ -120,7 +121,8 @@ xlog_bread( | |||
| 120 | XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); | 121 | XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); |
| 121 | 122 | ||
| 122 | xfsbdstrat(log->l_mp, bp); | 123 | xfsbdstrat(log->l_mp, bp); |
| 123 | if ((error = xfs_iowait(bp))) | 124 | error = xfs_iowait(bp); |
| 125 | if (error) | ||
| 124 | xfs_ioerror_alert("xlog_bread", log->l_mp, | 126 | xfs_ioerror_alert("xlog_bread", log->l_mp, |
| 125 | bp, XFS_BUF_ADDR(bp)); | 127 | bp, XFS_BUF_ADDR(bp)); |
| 126 | return error; | 128 | return error; |
| @@ -191,7 +193,7 @@ xlog_header_check_dump( | |||
| 191 | { | 193 | { |
| 192 | int b; | 194 | int b; |
| 193 | 195 | ||
| 194 | cmn_err(CE_DEBUG, "%s: SB : uuid = ", __FUNCTION__); | 196 | cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); |
| 195 | for (b = 0; b < 16; b++) | 197 | for (b = 0; b < 16; b++) |
| 196 | cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); | 198 | cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); |
| 197 | cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); | 199 | cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); |
| @@ -1160,10 +1162,14 @@ xlog_write_log_records( | |||
| 1160 | if (j == 0 && (start_block + endcount > ealign)) { | 1162 | if (j == 0 && (start_block + endcount > ealign)) { |
| 1161 | offset = XFS_BUF_PTR(bp); | 1163 | offset = XFS_BUF_PTR(bp); |
| 1162 | balign = BBTOB(ealign - start_block); | 1164 | balign = BBTOB(ealign - start_block); |
| 1163 | XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb)); | 1165 | error = XFS_BUF_SET_PTR(bp, offset + balign, |
| 1164 | if ((error = xlog_bread(log, ealign, sectbb, bp))) | 1166 | BBTOB(sectbb)); |
| 1167 | if (!error) | ||
| 1168 | error = xlog_bread(log, ealign, sectbb, bp); | ||
| 1169 | if (!error) | ||
| 1170 | error = XFS_BUF_SET_PTR(bp, offset, bufblks); | ||
| 1171 | if (error) | ||
| 1165 | break; | 1172 | break; |
| 1166 | XFS_BUF_SET_PTR(bp, offset, bufblks); | ||
| 1167 | } | 1173 | } |
| 1168 | 1174 | ||
| 1169 | offset = xlog_align(log, start_block, endcount, bp); | 1175 | offset = xlog_align(log, start_block, endcount, bp); |
| @@ -2280,7 +2286,9 @@ xlog_recover_do_inode_trans( | |||
| 2280 | * invalidate the buffer when we write it out below. | 2286 | * invalidate the buffer when we write it out below. |
| 2281 | */ | 2287 | */ |
| 2282 | imap.im_blkno = 0; | 2288 | imap.im_blkno = 0; |
| 2283 | xfs_imap(log->l_mp, NULL, ino, &imap, 0); | 2289 | error = xfs_imap(log->l_mp, NULL, ino, &imap, 0); |
| 2290 | if (error) | ||
| 2291 | goto error; | ||
| 2284 | } | 2292 | } |
| 2285 | 2293 | ||
| 2286 | /* | 2294 | /* |
| @@ -2964,7 +2972,7 @@ xlog_recover_process_data( | |||
| 2964 | * Process an extent free intent item that was recovered from | 2972 | * Process an extent free intent item that was recovered from |
| 2965 | * the log. We need to free the extents that it describes. | 2973 | * the log. We need to free the extents that it describes. |
| 2966 | */ | 2974 | */ |
| 2967 | STATIC void | 2975 | STATIC int |
| 2968 | xlog_recover_process_efi( | 2976 | xlog_recover_process_efi( |
| 2969 | xfs_mount_t *mp, | 2977 | xfs_mount_t *mp, |
| 2970 | xfs_efi_log_item_t *efip) | 2978 | xfs_efi_log_item_t *efip) |
| @@ -2972,6 +2980,7 @@ xlog_recover_process_efi( | |||
| 2972 | xfs_efd_log_item_t *efdp; | 2980 | xfs_efd_log_item_t *efdp; |
| 2973 | xfs_trans_t *tp; | 2981 | xfs_trans_t *tp; |
| 2974 | int i; | 2982 | int i; |
| 2983 | int error = 0; | ||
| 2975 | xfs_extent_t *extp; | 2984 | xfs_extent_t *extp; |
| 2976 | xfs_fsblock_t startblock_fsb; | 2985 | xfs_fsblock_t startblock_fsb; |
| 2977 | 2986 | ||
| @@ -2995,23 +3004,32 @@ xlog_recover_process_efi( | |||
| 2995 | * free the memory associated with it. | 3004 | * free the memory associated with it. |
| 2996 | */ | 3005 | */ |
| 2997 | xfs_efi_release(efip, efip->efi_format.efi_nextents); | 3006 | xfs_efi_release(efip, efip->efi_format.efi_nextents); |
| 2998 | return; | 3007 | return XFS_ERROR(EIO); |
| 2999 | } | 3008 | } |
| 3000 | } | 3009 | } |
| 3001 | 3010 | ||
| 3002 | tp = xfs_trans_alloc(mp, 0); | 3011 | tp = xfs_trans_alloc(mp, 0); |
| 3003 | xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); | 3012 | error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); |
| 3013 | if (error) | ||
| 3014 | goto abort_error; | ||
| 3004 | efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); | 3015 | efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); |
| 3005 | 3016 | ||
| 3006 | for (i = 0; i < efip->efi_format.efi_nextents; i++) { | 3017 | for (i = 0; i < efip->efi_format.efi_nextents; i++) { |
| 3007 | extp = &(efip->efi_format.efi_extents[i]); | 3018 | extp = &(efip->efi_format.efi_extents[i]); |
| 3008 | xfs_free_extent(tp, extp->ext_start, extp->ext_len); | 3019 | error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); |
| 3020 | if (error) | ||
| 3021 | goto abort_error; | ||
| 3009 | xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, | 3022 | xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, |
| 3010 | extp->ext_len); | 3023 | extp->ext_len); |
| 3011 | } | 3024 | } |
| 3012 | 3025 | ||
| 3013 | efip->efi_flags |= XFS_EFI_RECOVERED; | 3026 | efip->efi_flags |= XFS_EFI_RECOVERED; |
| 3014 | xfs_trans_commit(tp, 0); | 3027 | error = xfs_trans_commit(tp, 0); |
| 3028 | return error; | ||
| 3029 | |||
| 3030 | abort_error: | ||
| 3031 | xfs_trans_cancel(tp, XFS_TRANS_ABORT); | ||
| 3032 | return error; | ||
| 3015 | } | 3033 | } |
| 3016 | 3034 | ||
| 3017 | /* | 3035 | /* |
| @@ -3059,7 +3077,7 @@ xlog_recover_check_ail( | |||
| 3059 | * everything already in the AIL, we stop processing as soon as | 3077 | * everything already in the AIL, we stop processing as soon as |
| 3060 | * we see something other than an EFI in the AIL. | 3078 | * we see something other than an EFI in the AIL. |
| 3061 | */ | 3079 | */ |
| 3062 | STATIC void | 3080 | STATIC int |
| 3063 | xlog_recover_process_efis( | 3081 | xlog_recover_process_efis( |
| 3064 | xlog_t *log) | 3082 | xlog_t *log) |
| 3065 | { | 3083 | { |
| @@ -3067,6 +3085,7 @@ xlog_recover_process_efis( | |||
| 3067 | xfs_efi_log_item_t *efip; | 3085 | xfs_efi_log_item_t *efip; |
| 3068 | int gen; | 3086 | int gen; |
| 3069 | xfs_mount_t *mp; | 3087 | xfs_mount_t *mp; |
| 3088 | int error = 0; | ||
| 3070 | 3089 | ||
| 3071 | mp = log->l_mp; | 3090 | mp = log->l_mp; |
| 3072 | spin_lock(&mp->m_ail_lock); | 3091 | spin_lock(&mp->m_ail_lock); |
| @@ -3091,11 +3110,14 @@ xlog_recover_process_efis( | |||
| 3091 | } | 3110 | } |
| 3092 | 3111 | ||
| 3093 | spin_unlock(&mp->m_ail_lock); | 3112 | spin_unlock(&mp->m_ail_lock); |
| 3094 | xlog_recover_process_efi(mp, efip); | 3113 | error = xlog_recover_process_efi(mp, efip); |
| 3114 | if (error) | ||
| 3115 | return error; | ||
| 3095 | spin_lock(&mp->m_ail_lock); | 3116 | spin_lock(&mp->m_ail_lock); |
| 3096 | lip = xfs_trans_next_ail(mp, lip, &gen, NULL); | 3117 | lip = xfs_trans_next_ail(mp, lip, &gen, NULL); |
| 3097 | } | 3118 | } |
| 3098 | spin_unlock(&mp->m_ail_lock); | 3119 | spin_unlock(&mp->m_ail_lock); |
| 3120 | return error; | ||
| 3099 | } | 3121 | } |
| 3100 | 3122 | ||
| 3101 | /* | 3123 | /* |
| @@ -3115,21 +3137,18 @@ xlog_recover_clear_agi_bucket( | |||
| 3115 | int error; | 3137 | int error; |
| 3116 | 3138 | ||
| 3117 | tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); | 3139 | tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); |
| 3118 | xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); | 3140 | error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); |
| 3119 | 3141 | if (!error) | |
| 3120 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, | 3142 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, |
| 3121 | XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), | 3143 | XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), |
| 3122 | XFS_FSS_TO_BB(mp, 1), 0, &agibp); | 3144 | XFS_FSS_TO_BB(mp, 1), 0, &agibp); |
| 3123 | if (error) { | 3145 | if (error) |
| 3124 | xfs_trans_cancel(tp, XFS_TRANS_ABORT); | 3146 | goto out_abort; |
| 3125 | return; | ||
| 3126 | } | ||
| 3127 | 3147 | ||
| 3148 | error = EINVAL; | ||
| 3128 | agi = XFS_BUF_TO_AGI(agibp); | 3149 | agi = XFS_BUF_TO_AGI(agibp); |
| 3129 | if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) { | 3150 | if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) |
| 3130 | xfs_trans_cancel(tp, XFS_TRANS_ABORT); | 3151 | goto out_abort; |
| 3131 | return; | ||
| 3132 | } | ||
| 3133 | 3152 | ||
| 3134 | agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); | 3153 | agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); |
| 3135 | offset = offsetof(xfs_agi_t, agi_unlinked) + | 3154 | offset = offsetof(xfs_agi_t, agi_unlinked) + |
| @@ -3137,7 +3156,17 @@ xlog_recover_clear_agi_bucket( | |||
| 3137 | xfs_trans_log_buf(tp, agibp, offset, | 3156 | xfs_trans_log_buf(tp, agibp, offset, |
| 3138 | (offset + sizeof(xfs_agino_t) - 1)); | 3157 | (offset + sizeof(xfs_agino_t) - 1)); |
| 3139 | 3158 | ||
| 3140 | (void) xfs_trans_commit(tp, 0); | 3159 | error = xfs_trans_commit(tp, 0); |
| 3160 | if (error) | ||
| 3161 | goto out_error; | ||
| 3162 | return; | ||
| 3163 | |||
| 3164 | out_abort: | ||
| 3165 | xfs_trans_cancel(tp, XFS_TRANS_ABORT); | ||
| 3166 | out_error: | ||
| 3167 | xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " | ||
| 3168 | "failed to clear agi %d. Continuing.", agno); | ||
| 3169 | return; | ||
| 3141 | } | 3170 | } |
| 3142 | 3171 | ||
| 3143 | /* | 3172 | /* |
| @@ -3214,7 +3243,8 @@ xlog_recover_process_iunlinks( | |||
| 3214 | * next inode in the bucket. | 3243 | * next inode in the bucket. |
| 3215 | */ | 3244 | */ |
| 3216 | error = xfs_itobp(mp, NULL, ip, &dip, | 3245 | error = xfs_itobp(mp, NULL, ip, &dip, |
| 3217 | &ibp, 0, 0); | 3246 | &ibp, 0, 0, |
| 3247 | XFS_BUF_LOCK); | ||
| 3218 | ASSERT(error || (dip != NULL)); | 3248 | ASSERT(error || (dip != NULL)); |
| 3219 | } | 3249 | } |
| 3220 | 3250 | ||
| @@ -3247,7 +3277,7 @@ xlog_recover_process_iunlinks( | |||
| 3247 | if (ip->i_d.di_mode == 0) | 3277 | if (ip->i_d.di_mode == 0) |
| 3248 | xfs_iput_new(ip, 0); | 3278 | xfs_iput_new(ip, 0); |
| 3249 | else | 3279 | else |
| 3250 | VN_RELE(XFS_ITOV(ip)); | 3280 | IRELE(ip); |
| 3251 | } else { | 3281 | } else { |
| 3252 | /* | 3282 | /* |
| 3253 | * We can't read in the inode | 3283 | * We can't read in the inode |
| @@ -3445,7 +3475,7 @@ xlog_valid_rec_header( | |||
| 3445 | (!rhead->h_version || | 3475 | (!rhead->h_version || |
| 3446 | (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { | 3476 | (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { |
| 3447 | xlog_warn("XFS: %s: unrecognised log version (%d).", | 3477 | xlog_warn("XFS: %s: unrecognised log version (%d).", |
| 3448 | __FUNCTION__, be32_to_cpu(rhead->h_version)); | 3478 | __func__, be32_to_cpu(rhead->h_version)); |
| 3449 | return XFS_ERROR(EIO); | 3479 | return XFS_ERROR(EIO); |
| 3450 | } | 3480 | } |
| 3451 | 3481 | ||
| @@ -3604,15 +3634,19 @@ xlog_do_recovery_pass( | |||
| 3604 | * _first_, then the log start (LR header end) | 3634 | * _first_, then the log start (LR header end) |
| 3605 | * - order is important. | 3635 | * - order is important. |
| 3606 | */ | 3636 | */ |
| 3637 | wrapped_hblks = hblks - split_hblks; | ||
| 3607 | bufaddr = XFS_BUF_PTR(hbp); | 3638 | bufaddr = XFS_BUF_PTR(hbp); |
| 3608 | XFS_BUF_SET_PTR(hbp, | 3639 | error = XFS_BUF_SET_PTR(hbp, |
| 3609 | bufaddr + BBTOB(split_hblks), | 3640 | bufaddr + BBTOB(split_hblks), |
| 3610 | BBTOB(hblks - split_hblks)); | 3641 | BBTOB(hblks - split_hblks)); |
| 3611 | wrapped_hblks = hblks - split_hblks; | 3642 | if (!error) |
| 3612 | error = xlog_bread(log, 0, wrapped_hblks, hbp); | 3643 | error = xlog_bread(log, 0, |
| 3644 | wrapped_hblks, hbp); | ||
| 3645 | if (!error) | ||
| 3646 | error = XFS_BUF_SET_PTR(hbp, bufaddr, | ||
| 3647 | BBTOB(hblks)); | ||
| 3613 | if (error) | 3648 | if (error) |
| 3614 | goto bread_err2; | 3649 | goto bread_err2; |
| 3615 | XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks)); | ||
| 3616 | if (!offset) | 3650 | if (!offset) |
| 3617 | offset = xlog_align(log, 0, | 3651 | offset = xlog_align(log, 0, |
| 3618 | wrapped_hblks, hbp); | 3652 | wrapped_hblks, hbp); |
| @@ -3664,13 +3698,18 @@ xlog_do_recovery_pass( | |||
| 3664 | * - order is important. | 3698 | * - order is important. |
| 3665 | */ | 3699 | */ |
| 3666 | bufaddr = XFS_BUF_PTR(dbp); | 3700 | bufaddr = XFS_BUF_PTR(dbp); |
| 3667 | XFS_BUF_SET_PTR(dbp, | 3701 | error = XFS_BUF_SET_PTR(dbp, |
| 3668 | bufaddr + BBTOB(split_bblks), | 3702 | bufaddr + BBTOB(split_bblks), |
| 3669 | BBTOB(bblks - split_bblks)); | 3703 | BBTOB(bblks - split_bblks)); |
| 3670 | if ((error = xlog_bread(log, wrapped_hblks, | 3704 | if (!error) |
| 3671 | bblks - split_bblks, dbp))) | 3705 | error = xlog_bread(log, wrapped_hblks, |
| 3706 | bblks - split_bblks, | ||
| 3707 | dbp); | ||
| 3708 | if (!error) | ||
| 3709 | error = XFS_BUF_SET_PTR(dbp, bufaddr, | ||
| 3710 | h_size); | ||
| 3711 | if (error) | ||
| 3672 | goto bread_err2; | 3712 | goto bread_err2; |
| 3673 | XFS_BUF_SET_PTR(dbp, bufaddr, h_size); | ||
| 3674 | if (!offset) | 3713 | if (!offset) |
| 3675 | offset = xlog_align(log, wrapped_hblks, | 3714 | offset = xlog_align(log, wrapped_hblks, |
| 3676 | bblks - split_bblks, dbp); | 3715 | bblks - split_bblks, dbp); |
| @@ -3826,7 +3865,8 @@ xlog_do_recover( | |||
| 3826 | XFS_BUF_READ(bp); | 3865 | XFS_BUF_READ(bp); |
| 3827 | XFS_BUF_UNASYNC(bp); | 3866 | XFS_BUF_UNASYNC(bp); |
| 3828 | xfsbdstrat(log->l_mp, bp); | 3867 | xfsbdstrat(log->l_mp, bp); |
| 3829 | if ((error = xfs_iowait(bp))) { | 3868 | error = xfs_iowait(bp); |
| 3869 | if (error) { | ||
| 3830 | xfs_ioerror_alert("xlog_do_recover", | 3870 | xfs_ioerror_alert("xlog_do_recover", |
| 3831 | log->l_mp, bp, XFS_BUF_ADDR(bp)); | 3871 | log->l_mp, bp, XFS_BUF_ADDR(bp)); |
| 3832 | ASSERT(0); | 3872 | ASSERT(0); |
| @@ -3917,7 +3957,14 @@ xlog_recover_finish( | |||
| 3917 | * rather than accepting new requests. | 3957 | * rather than accepting new requests. |
| 3918 | */ | 3958 | */ |
| 3919 | if (log->l_flags & XLOG_RECOVERY_NEEDED) { | 3959 | if (log->l_flags & XLOG_RECOVERY_NEEDED) { |
| 3920 | xlog_recover_process_efis(log); | 3960 | int error; |
| 3961 | error = xlog_recover_process_efis(log); | ||
| 3962 | if (error) { | ||
| 3963 | cmn_err(CE_ALERT, | ||
| 3964 | "Failed to recover EFIs on filesystem: %s", | ||
| 3965 | log->l_mp->m_fsname); | ||
| 3966 | return error; | ||
| 3967 | } | ||
| 3921 | /* | 3968 | /* |
| 3922 | * Sync the log to get all the EFIs out of the AIL. | 3969 | * Sync the log to get all the EFIs out of the AIL. |
| 3923 | * This isn't absolutely necessary, but it helps in | 3970 | * This isn't absolutely necessary, but it helps in |
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 8ed164eb9544..2fec452afbcc 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c | |||
| @@ -43,8 +43,9 @@ | |||
| 43 | #include "xfs_rw.h" | 43 | #include "xfs_rw.h" |
| 44 | #include "xfs_quota.h" | 44 | #include "xfs_quota.h" |
| 45 | #include "xfs_fsops.h" | 45 | #include "xfs_fsops.h" |
| 46 | #include "xfs_utils.h" | ||
| 46 | 47 | ||
| 47 | STATIC void xfs_mount_log_sb(xfs_mount_t *, __int64_t); | 48 | STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t); |
| 48 | STATIC int xfs_uuid_mount(xfs_mount_t *); | 49 | STATIC int xfs_uuid_mount(xfs_mount_t *); |
| 49 | STATIC void xfs_uuid_unmount(xfs_mount_t *mp); | 50 | STATIC void xfs_uuid_unmount(xfs_mount_t *mp); |
| 50 | STATIC void xfs_unmountfs_wait(xfs_mount_t *); | 51 | STATIC void xfs_unmountfs_wait(xfs_mount_t *); |
| @@ -57,7 +58,7 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, | |||
| 57 | STATIC void xfs_icsb_sync_counters(xfs_mount_t *); | 58 | STATIC void xfs_icsb_sync_counters(xfs_mount_t *); |
| 58 | STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, | 59 | STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, |
| 59 | int64_t, int); | 60 | int64_t, int); |
| 60 | STATIC int xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); | 61 | STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); |
| 61 | 62 | ||
| 62 | #else | 63 | #else |
| 63 | 64 | ||
| @@ -956,7 +957,6 @@ xfs_mountfs( | |||
| 956 | { | 957 | { |
| 957 | xfs_sb_t *sbp = &(mp->m_sb); | 958 | xfs_sb_t *sbp = &(mp->m_sb); |
| 958 | xfs_inode_t *rip; | 959 | xfs_inode_t *rip; |
| 959 | bhv_vnode_t *rvp = NULL; | ||
| 960 | __uint64_t resblks; | 960 | __uint64_t resblks; |
| 961 | __int64_t update_flags = 0LL; | 961 | __int64_t update_flags = 0LL; |
| 962 | uint quotamount, quotaflags; | 962 | uint quotamount, quotaflags; |
| @@ -964,11 +964,6 @@ xfs_mountfs( | |||
| 964 | int uuid_mounted = 0; | 964 | int uuid_mounted = 0; |
| 965 | int error = 0; | 965 | int error = 0; |
| 966 | 966 | ||
| 967 | if (mp->m_sb_bp == NULL) { | ||
| 968 | error = xfs_readsb(mp, mfsi_flags); | ||
| 969 | if (error) | ||
| 970 | return error; | ||
| 971 | } | ||
| 972 | xfs_mount_common(mp, sbp); | 967 | xfs_mount_common(mp, sbp); |
| 973 | 968 | ||
| 974 | /* | 969 | /* |
| @@ -1163,7 +1158,6 @@ xfs_mountfs( | |||
| 1163 | } | 1158 | } |
| 1164 | 1159 | ||
| 1165 | ASSERT(rip != NULL); | 1160 | ASSERT(rip != NULL); |
| 1166 | rvp = XFS_ITOV(rip); | ||
| 1167 | 1161 | ||
| 1168 | if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { | 1162 | if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { |
| 1169 | cmn_err(CE_WARN, "XFS: corrupted root inode"); | 1163 | cmn_err(CE_WARN, "XFS: corrupted root inode"); |
| @@ -1195,8 +1189,13 @@ xfs_mountfs( | |||
| 1195 | /* | 1189 | /* |
| 1196 | * If fs is not mounted readonly, then update the superblock changes. | 1190 | * If fs is not mounted readonly, then update the superblock changes. |
| 1197 | */ | 1191 | */ |
| 1198 | if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) | 1192 | if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { |
| 1199 | xfs_mount_log_sb(mp, update_flags); | 1193 | error = xfs_mount_log_sb(mp, update_flags); |
| 1194 | if (error) { | ||
| 1195 | cmn_err(CE_WARN, "XFS: failed to write sb changes"); | ||
| 1196 | goto error4; | ||
| 1197 | } | ||
| 1198 | } | ||
| 1200 | 1199 | ||
| 1201 | /* | 1200 | /* |
| 1202 | * Initialise the XFS quota management subsystem for this mount | 1201 | * Initialise the XFS quota management subsystem for this mount |
| @@ -1233,12 +1232,15 @@ xfs_mountfs( | |||
| 1233 | * | 1232 | * |
| 1234 | * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. | 1233 | * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. |
| 1235 | * This may drive us straight to ENOSPC on mount, but that implies | 1234 | * This may drive us straight to ENOSPC on mount, but that implies |
| 1236 | * we were already there on the last unmount. | 1235 | * we were already there on the last unmount. Warn if this occurs. |
| 1237 | */ | 1236 | */ |
| 1238 | resblks = mp->m_sb.sb_dblocks; | 1237 | resblks = mp->m_sb.sb_dblocks; |
| 1239 | do_div(resblks, 20); | 1238 | do_div(resblks, 20); |
| 1240 | resblks = min_t(__uint64_t, resblks, 1024); | 1239 | resblks = min_t(__uint64_t, resblks, 1024); |
| 1241 | xfs_reserve_blocks(mp, &resblks, NULL); | 1240 | error = xfs_reserve_blocks(mp, &resblks, NULL); |
| 1241 | if (error) | ||
| 1242 | cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. " | ||
| 1243 | "Continuing without a reserve pool."); | ||
| 1242 | 1244 | ||
| 1243 | return 0; | 1245 | return 0; |
| 1244 | 1246 | ||
| @@ -1246,7 +1248,7 @@ xfs_mountfs( | |||
| 1246 | /* | 1248 | /* |
| 1247 | * Free up the root inode. | 1249 | * Free up the root inode. |
| 1248 | */ | 1250 | */ |
| 1249 | VN_RELE(rvp); | 1251 | IRELE(rip); |
| 1250 | error3: | 1252 | error3: |
| 1251 | xfs_log_unmount_dealloc(mp); | 1253 | xfs_log_unmount_dealloc(mp); |
| 1252 | error2: | 1254 | error2: |
| @@ -1274,6 +1276,7 @@ int | |||
| 1274 | xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) | 1276 | xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) |
| 1275 | { | 1277 | { |
| 1276 | __uint64_t resblks; | 1278 | __uint64_t resblks; |
| 1279 | int error = 0; | ||
| 1277 | 1280 | ||
| 1278 | /* | 1281 | /* |
| 1279 | * We can potentially deadlock here if we have an inode cluster | 1282 | * We can potentially deadlock here if we have an inode cluster |
| @@ -1317,9 +1320,15 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) | |||
| 1317 | * value does not matter.... | 1320 | * value does not matter.... |
| 1318 | */ | 1321 | */ |
| 1319 | resblks = 0; | 1322 | resblks = 0; |
| 1320 | xfs_reserve_blocks(mp, &resblks, NULL); | 1323 | error = xfs_reserve_blocks(mp, &resblks, NULL); |
| 1324 | if (error) | ||
| 1325 | cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " | ||
| 1326 | "Freespace may not be correct on next mount."); | ||
| 1321 | 1327 | ||
| 1322 | xfs_log_sbcount(mp, 1); | 1328 | error = xfs_log_sbcount(mp, 1); |
| 1329 | if (error) | ||
| 1330 | cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " | ||
| 1331 | "Freespace may not be correct on next mount."); | ||
| 1323 | xfs_unmountfs_writesb(mp); | 1332 | xfs_unmountfs_writesb(mp); |
| 1324 | xfs_unmountfs_wait(mp); /* wait for async bufs */ | 1333 | xfs_unmountfs_wait(mp); /* wait for async bufs */ |
| 1325 | xfs_log_unmount(mp); /* Done! No more fs ops. */ | 1334 | xfs_log_unmount(mp); /* Done! No more fs ops. */ |
| @@ -1411,9 +1420,8 @@ xfs_log_sbcount( | |||
| 1411 | xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); | 1420 | xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); |
| 1412 | if (sync) | 1421 | if (sync) |
| 1413 | xfs_trans_set_sync(tp); | 1422 | xfs_trans_set_sync(tp); |
| 1414 | xfs_trans_commit(tp, 0); | 1423 | error = xfs_trans_commit(tp, 0); |
| 1415 | 1424 | return error; | |
| 1416 | return 0; | ||
| 1417 | } | 1425 | } |
| 1418 | 1426 | ||
| 1419 | STATIC void | 1427 | STATIC void |
| @@ -1462,7 +1470,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) | |||
| 1462 | XFS_BUF_UNASYNC(sbp); | 1470 | XFS_BUF_UNASYNC(sbp); |
| 1463 | ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); | 1471 | ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); |
| 1464 | xfsbdstrat(mp, sbp); | 1472 | xfsbdstrat(mp, sbp); |
| 1465 | /* Nevermind errors we might get here. */ | ||
| 1466 | error = xfs_iowait(sbp); | 1473 | error = xfs_iowait(sbp); |
| 1467 | if (error) | 1474 | if (error) |
| 1468 | xfs_ioerror_alert("xfs_unmountfs_writesb", | 1475 | xfs_ioerror_alert("xfs_unmountfs_writesb", |
| @@ -1911,24 +1918,27 @@ xfs_uuid_unmount( | |||
| 1911 | * be altered by the mount options, as well as any potential sb_features2 | 1918 | * be altered by the mount options, as well as any potential sb_features2 |
| 1912 | * fixup. Only the first superblock is updated. | 1919 | * fixup. Only the first superblock is updated. |
| 1913 | */ | 1920 | */ |
| 1914 | STATIC void | 1921 | STATIC int |
| 1915 | xfs_mount_log_sb( | 1922 | xfs_mount_log_sb( |
| 1916 | xfs_mount_t *mp, | 1923 | xfs_mount_t *mp, |
| 1917 | __int64_t fields) | 1924 | __int64_t fields) |
| 1918 | { | 1925 | { |
| 1919 | xfs_trans_t *tp; | 1926 | xfs_trans_t *tp; |
| 1927 | int error; | ||
| 1920 | 1928 | ||
| 1921 | ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | | 1929 | ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | |
| 1922 | XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2)); | 1930 | XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2)); |
| 1923 | 1931 | ||
| 1924 | tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); | 1932 | tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); |
| 1925 | if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, | 1933 | error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, |
| 1926 | XFS_DEFAULT_LOG_COUNT)) { | 1934 | XFS_DEFAULT_LOG_COUNT); |
| 1935 | if (error) { | ||
| 1927 | xfs_trans_cancel(tp, 0); | 1936 | xfs_trans_cancel(tp, 0); |
| 1928 | return; | 1937 | return error; |
| 1929 | } | 1938 | } |
| 1930 | xfs_mod_sb(tp, fields); | 1939 | xfs_mod_sb(tp, fields); |
| 1931 | xfs_trans_commit(tp, 0); | 1940 | error = xfs_trans_commit(tp, 0); |
| 1941 | return error; | ||
| 1932 | } | 1942 | } |
| 1933 | 1943 | ||
| 1934 | 1944 | ||
| @@ -2189,7 +2199,7 @@ xfs_icsb_counter_disabled( | |||
| 2189 | return test_bit(field, &mp->m_icsb_counters); | 2199 | return test_bit(field, &mp->m_icsb_counters); |
| 2190 | } | 2200 | } |
| 2191 | 2201 | ||
| 2192 | STATIC int | 2202 | STATIC void |
| 2193 | xfs_icsb_disable_counter( | 2203 | xfs_icsb_disable_counter( |
| 2194 | xfs_mount_t *mp, | 2204 | xfs_mount_t *mp, |
| 2195 | xfs_sb_field_t field) | 2205 | xfs_sb_field_t field) |
| @@ -2207,7 +2217,7 @@ xfs_icsb_disable_counter( | |||
| 2207 | * the m_icsb_mutex. | 2217 | * the m_icsb_mutex. |
| 2208 | */ | 2218 | */ |
| 2209 | if (xfs_icsb_counter_disabled(mp, field)) | 2219 | if (xfs_icsb_counter_disabled(mp, field)) |
| 2210 | return 0; | 2220 | return; |
| 2211 | 2221 | ||
| 2212 | xfs_icsb_lock_all_counters(mp); | 2222 | xfs_icsb_lock_all_counters(mp); |
| 2213 | if (!test_and_set_bit(field, &mp->m_icsb_counters)) { | 2223 | if (!test_and_set_bit(field, &mp->m_icsb_counters)) { |
| @@ -2230,8 +2240,6 @@ xfs_icsb_disable_counter( | |||
| 2230 | } | 2240 | } |
| 2231 | 2241 | ||
| 2232 | xfs_icsb_unlock_all_counters(mp); | 2242 | xfs_icsb_unlock_all_counters(mp); |
| 2233 | |||
| 2234 | return 0; | ||
| 2235 | } | 2243 | } |
| 2236 | 2244 | ||
| 2237 | STATIC void | 2245 | STATIC void |
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1d8a4728d847..1ed575110ff0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h | |||
| @@ -66,17 +66,17 @@ struct xfs_mru_cache; | |||
| 66 | * Prototypes and functions for the Data Migration subsystem. | 66 | * Prototypes and functions for the Data Migration subsystem. |
| 67 | */ | 67 | */ |
| 68 | 68 | ||
| 69 | typedef int (*xfs_send_data_t)(int, bhv_vnode_t *, | 69 | typedef int (*xfs_send_data_t)(int, struct xfs_inode *, |
| 70 | xfs_off_t, size_t, int, bhv_vrwlock_t *); | 70 | xfs_off_t, size_t, int, int *); |
| 71 | typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint); | 71 | typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint); |
| 72 | typedef int (*xfs_send_destroy_t)(bhv_vnode_t *, dm_right_t); | 72 | typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t); |
| 73 | typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, | 73 | typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, |
| 74 | bhv_vnode_t *, | 74 | struct xfs_inode *, dm_right_t, |
| 75 | dm_right_t, bhv_vnode_t *, dm_right_t, | 75 | struct xfs_inode *, dm_right_t, |
| 76 | char *, char *, mode_t, int, int); | 76 | const char *, const char *, mode_t, int, int); |
| 77 | typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, | 77 | typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, |
| 78 | char *, char *); | 78 | char *, char *); |
| 79 | typedef void (*xfs_send_unmount_t)(struct xfs_mount *, bhv_vnode_t *, | 79 | typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, |
| 80 | dm_right_t, mode_t, int, int); | 80 | dm_right_t, mode_t, int, int); |
| 81 | 81 | ||
| 82 | typedef struct xfs_dmops { | 82 | typedef struct xfs_dmops { |
| @@ -88,20 +88,20 @@ typedef struct xfs_dmops { | |||
| 88 | xfs_send_unmount_t xfs_send_unmount; | 88 | xfs_send_unmount_t xfs_send_unmount; |
| 89 | } xfs_dmops_t; | 89 | } xfs_dmops_t; |
| 90 | 90 | ||
| 91 | #define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \ | 91 | #define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \ |
| 92 | (*(mp)->m_dm_ops->xfs_send_data)(ev,vp,off,len,fl,lock) | 92 | (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock) |
| 93 | #define XFS_SEND_MMAP(mp, vma,fl) \ | 93 | #define XFS_SEND_MMAP(mp, vma,fl) \ |
| 94 | (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl) | 94 | (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl) |
| 95 | #define XFS_SEND_DESTROY(mp, vp,right) \ | 95 | #define XFS_SEND_DESTROY(mp, ip,right) \ |
| 96 | (*(mp)->m_dm_ops->xfs_send_destroy)(vp,right) | 96 | (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right) |
| 97 | #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ | 97 | #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ |
| 98 | (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) | 98 | (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) |
| 99 | #define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ | 99 | #define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ |
| 100 | (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) | 100 | (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) |
| 101 | #define XFS_SEND_MOUNT(mp,right,path,name) \ | 101 | #define XFS_SEND_MOUNT(mp,right,path,name) \ |
| 102 | (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name) | 102 | (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name) |
| 103 | #define XFS_SEND_UNMOUNT(mp, vp,right,mode,rval,fl) \ | 103 | #define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \ |
| 104 | (*(mp)->m_dm_ops->xfs_send_unmount)(mp,vp,right,mode,rval,fl) | 104 | (*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl) |
| 105 | 105 | ||
| 106 | 106 | ||
| 107 | /* | 107 | /* |
| @@ -220,7 +220,7 @@ extern void xfs_icsb_sync_counters_flags(struct xfs_mount *, int); | |||
| 220 | #endif | 220 | #endif |
| 221 | 221 | ||
| 222 | typedef struct xfs_ail { | 222 | typedef struct xfs_ail { |
| 223 | xfs_ail_entry_t xa_ail; | 223 | struct list_head xa_ail; |
| 224 | uint xa_gen; | 224 | uint xa_gen; |
| 225 | struct task_struct *xa_task; | 225 | struct task_struct *xa_task; |
| 226 | xfs_lsn_t xa_target; | 226 | xfs_lsn_t xa_target; |
| @@ -401,7 +401,7 @@ typedef struct xfs_mount { | |||
| 401 | 401 | ||
| 402 | /* | 402 | /* |
| 403 | * Allow large block sizes to be reported to userspace programs if the | 403 | * Allow large block sizes to be reported to userspace programs if the |
| 404 | * "largeio" mount option is used. | 404 | * "largeio" mount option is used. |
| 405 | * | 405 | * |
| 406 | * If compatibility mode is specified, simply return the basic unit of caching | 406 | * If compatibility mode is specified, simply return the basic unit of caching |
| 407 | * so that we don't get inefficient read/modify/write I/O from user apps. | 407 | * so that we don't get inefficient read/modify/write I/O from user apps. |
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 7eb157a59f9e..ee371890d85d 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c | |||
| @@ -36,7 +36,6 @@ | |||
| 36 | #include "xfs_bmap.h" | 36 | #include "xfs_bmap.h" |
| 37 | #include "xfs_error.h" | 37 | #include "xfs_error.h" |
| 38 | #include "xfs_quota.h" | 38 | #include "xfs_quota.h" |
| 39 | #include "xfs_refcache.h" | ||
| 40 | #include "xfs_utils.h" | 39 | #include "xfs_utils.h" |
| 41 | #include "xfs_trans_space.h" | 40 | #include "xfs_trans_space.h" |
| 42 | #include "xfs_vnodeops.h" | 41 | #include "xfs_vnodeops.h" |
| @@ -84,25 +83,23 @@ int xfs_rename_skip, xfs_rename_nskip; | |||
| 84 | */ | 83 | */ |
| 85 | STATIC int | 84 | STATIC int |
| 86 | xfs_lock_for_rename( | 85 | xfs_lock_for_rename( |
| 87 | xfs_inode_t *dp1, /* old (source) directory inode */ | 86 | xfs_inode_t *dp1, /* in: old (source) directory inode */ |
| 88 | xfs_inode_t *dp2, /* new (target) directory inode */ | 87 | xfs_inode_t *dp2, /* in: new (target) directory inode */ |
| 89 | bhv_vname_t *vname1,/* old entry name */ | 88 | xfs_inode_t *ip1, /* in: inode of old entry */ |
| 90 | bhv_vname_t *vname2,/* new entry name */ | 89 | struct xfs_name *name2, /* in: new entry name */ |
| 91 | xfs_inode_t **ipp1, /* inode of old entry */ | 90 | xfs_inode_t **ipp2, /* out: inode of new entry, if it |
| 92 | xfs_inode_t **ipp2, /* inode of new entry, if it | ||
| 93 | already exists, NULL otherwise. */ | 91 | already exists, NULL otherwise. */ |
| 94 | xfs_inode_t **i_tab,/* array of inode returned, sorted */ | 92 | xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ |
| 95 | int *num_inodes) /* number of inodes in array */ | 93 | int *num_inodes) /* out: number of inodes in array */ |
| 96 | { | 94 | { |
| 97 | xfs_inode_t *ip1, *ip2, *temp; | 95 | xfs_inode_t *ip2 = NULL; |
| 96 | xfs_inode_t *temp; | ||
| 98 | xfs_ino_t inum1, inum2; | 97 | xfs_ino_t inum1, inum2; |
| 99 | int error; | 98 | int error; |
| 100 | int i, j; | 99 | int i, j; |
| 101 | uint lock_mode; | 100 | uint lock_mode; |
| 102 | int diff_dirs = (dp1 != dp2); | 101 | int diff_dirs = (dp1 != dp2); |
| 103 | 102 | ||
| 104 | ip2 = NULL; | ||
| 105 | |||
| 106 | /* | 103 | /* |
| 107 | * First, find out the current inums of the entries so that we | 104 | * First, find out the current inums of the entries so that we |
| 108 | * can determine the initial locking order. We'll have to | 105 | * can determine the initial locking order. We'll have to |
| @@ -110,27 +107,20 @@ xfs_lock_for_rename( | |||
| 110 | * to see if we still have the right inodes, directories, etc. | 107 | * to see if we still have the right inodes, directories, etc. |
| 111 | */ | 108 | */ |
| 112 | lock_mode = xfs_ilock_map_shared(dp1); | 109 | lock_mode = xfs_ilock_map_shared(dp1); |
| 113 | error = xfs_get_dir_entry(vname1, &ip1); | 110 | IHOLD(ip1); |
| 114 | if (error) { | 111 | xfs_itrace_ref(ip1); |
| 115 | xfs_iunlock_map_shared(dp1, lock_mode); | ||
| 116 | return error; | ||
| 117 | } | ||
| 118 | 112 | ||
| 119 | inum1 = ip1->i_ino; | 113 | inum1 = ip1->i_ino; |
| 120 | 114 | ||
| 121 | ASSERT(ip1); | ||
| 122 | xfs_itrace_ref(ip1); | ||
| 123 | |||
| 124 | /* | 115 | /* |
| 125 | * Unlock dp1 and lock dp2 if they are different. | 116 | * Unlock dp1 and lock dp2 if they are different. |
| 126 | */ | 117 | */ |
| 127 | |||
| 128 | if (diff_dirs) { | 118 | if (diff_dirs) { |
| 129 | xfs_iunlock_map_shared(dp1, lock_mode); | 119 | xfs_iunlock_map_shared(dp1, lock_mode); |
| 130 | lock_mode = xfs_ilock_map_shared(dp2); | 120 | lock_mode = xfs_ilock_map_shared(dp2); |
| 131 | } | 121 | } |
| 132 | 122 | ||
| 133 | error = xfs_dir_lookup_int(dp2, lock_mode, vname2, &inum2, &ip2); | 123 | error = xfs_dir_lookup_int(dp2, lock_mode, name2, &inum2, &ip2); |
| 134 | if (error == ENOENT) { /* target does not need to exist. */ | 124 | if (error == ENOENT) { /* target does not need to exist. */ |
| 135 | inum2 = 0; | 125 | inum2 = 0; |
| 136 | } else if (error) { | 126 | } else if (error) { |
| @@ -162,6 +152,7 @@ xfs_lock_for_rename( | |||
| 162 | *num_inodes = 4; | 152 | *num_inodes = 4; |
| 163 | i_tab[3] = ip2; | 153 | i_tab[3] = ip2; |
| 164 | } | 154 | } |
| 155 | *ipp2 = i_tab[3]; | ||
| 165 | 156 | ||
| 166 | /* | 157 | /* |
| 167 | * Sort the elements via bubble sort. (Remember, there are at | 158 | * Sort the elements via bubble sort. (Remember, there are at |
| @@ -199,21 +190,6 @@ xfs_lock_for_rename( | |||
| 199 | xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED); | 190 | xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED); |
| 200 | } | 191 | } |
| 201 | 192 | ||
| 202 | /* | ||
| 203 | * Set the return value. Null out any unused entries in i_tab. | ||
| 204 | */ | ||
| 205 | *ipp1 = *ipp2 = NULL; | ||
| 206 | for (i=0; i < *num_inodes; i++) { | ||
| 207 | if (i_tab[i]->i_ino == inum1) { | ||
| 208 | *ipp1 = i_tab[i]; | ||
| 209 | } | ||
| 210 | if (i_tab[i]->i_ino == inum2) { | ||
| 211 | *ipp2 = i_tab[i]; | ||
| 212 | } | ||
| 213 | } | ||
| 214 | for (;i < 4; i++) { | ||
| 215 | i_tab[i] = NULL; | ||
| 216 | } | ||
| 217 | return 0; | 193 | return 0; |
| 218 | } | 194 | } |
| 219 | 195 | ||
| @@ -223,13 +199,13 @@ xfs_lock_for_rename( | |||
| 223 | int | 199 | int |
| 224 | xfs_rename( | 200 | xfs_rename( |
| 225 | xfs_inode_t *src_dp, | 201 | xfs_inode_t *src_dp, |
| 226 | bhv_vname_t *src_vname, | 202 | struct xfs_name *src_name, |
| 227 | bhv_vnode_t *target_dir_vp, | 203 | xfs_inode_t *src_ip, |
| 228 | bhv_vname_t *target_vname) | 204 | xfs_inode_t *target_dp, |
| 205 | struct xfs_name *target_name) | ||
| 229 | { | 206 | { |
| 230 | bhv_vnode_t *src_dir_vp = XFS_ITOV(src_dp); | ||
| 231 | xfs_trans_t *tp; | 207 | xfs_trans_t *tp; |
| 232 | xfs_inode_t *target_dp, *src_ip, *target_ip; | 208 | xfs_inode_t *target_ip; |
| 233 | xfs_mount_t *mp = src_dp->i_mount; | 209 | xfs_mount_t *mp = src_dp->i_mount; |
| 234 | int new_parent; /* moving to a new dir */ | 210 | int new_parent; /* moving to a new dir */ |
| 235 | int src_is_directory; /* src_name is a directory */ | 211 | int src_is_directory; /* src_name is a directory */ |
| @@ -243,29 +219,16 @@ xfs_rename( | |||
| 243 | int spaceres; | 219 | int spaceres; |
| 244 | int target_link_zero = 0; | 220 | int target_link_zero = 0; |
| 245 | int num_inodes; | 221 | int num_inodes; |
| 246 | char *src_name = VNAME(src_vname); | ||
| 247 | char *target_name = VNAME(target_vname); | ||
| 248 | int src_namelen = VNAMELEN(src_vname); | ||
| 249 | int target_namelen = VNAMELEN(target_vname); | ||
| 250 | 222 | ||
| 251 | xfs_itrace_entry(src_dp); | 223 | xfs_itrace_entry(src_dp); |
| 252 | xfs_itrace_entry(xfs_vtoi(target_dir_vp)); | 224 | xfs_itrace_entry(target_dp); |
| 253 | |||
| 254 | /* | ||
| 255 | * Find the XFS behavior descriptor for the target directory | ||
| 256 | * vnode since it was not handed to us. | ||
| 257 | */ | ||
| 258 | target_dp = xfs_vtoi(target_dir_vp); | ||
| 259 | if (target_dp == NULL) { | ||
| 260 | return XFS_ERROR(EXDEV); | ||
| 261 | } | ||
| 262 | 225 | ||
| 263 | if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) || | 226 | if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) || |
| 264 | DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) { | 227 | DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) { |
| 265 | error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME, | 228 | error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME, |
| 266 | src_dir_vp, DM_RIGHT_NULL, | 229 | src_dp, DM_RIGHT_NULL, |
| 267 | target_dir_vp, DM_RIGHT_NULL, | 230 | target_dp, DM_RIGHT_NULL, |
| 268 | src_name, target_name, | 231 | src_name->name, target_name->name, |
| 269 | 0, 0, 0); | 232 | 0, 0, 0); |
| 270 | if (error) { | 233 | if (error) { |
| 271 | return error; | 234 | return error; |
| @@ -282,10 +245,8 @@ xfs_rename( | |||
| 282 | * does not exist in the source directory. | 245 | * does not exist in the source directory. |
| 283 | */ | 246 | */ |
| 284 | tp = NULL; | 247 | tp = NULL; |
| 285 | error = xfs_lock_for_rename(src_dp, target_dp, src_vname, | 248 | error = xfs_lock_for_rename(src_dp, target_dp, src_ip, target_name, |
| 286 | target_vname, &src_ip, &target_ip, inodes, | 249 | &target_ip, inodes, &num_inodes); |
| 287 | &num_inodes); | ||
| 288 | |||
| 289 | if (error) { | 250 | if (error) { |
| 290 | /* | 251 | /* |
| 291 | * We have nothing locked, no inode references, and | 252 | * We have nothing locked, no inode references, and |
| @@ -331,7 +292,7 @@ xfs_rename( | |||
| 331 | XFS_BMAP_INIT(&free_list, &first_block); | 292 | XFS_BMAP_INIT(&free_list, &first_block); |
| 332 | tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); | 293 | tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); |
| 333 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; | 294 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; |
| 334 | spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen); | 295 | spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); |
| 335 | error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, | 296 | error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, |
| 336 | XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); | 297 | XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); |
| 337 | if (error == ENOSPC) { | 298 | if (error == ENOSPC) { |
| @@ -365,10 +326,10 @@ xfs_rename( | |||
| 365 | * them when they unlock the inodes. Also, we need to be careful | 326 | * them when they unlock the inodes. Also, we need to be careful |
| 366 | * not to add an inode to the transaction more than once. | 327 | * not to add an inode to the transaction more than once. |
| 367 | */ | 328 | */ |
| 368 | VN_HOLD(src_dir_vp); | 329 | IHOLD(src_dp); |
| 369 | xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); | 330 | xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); |
| 370 | if (new_parent) { | 331 | if (new_parent) { |
| 371 | VN_HOLD(target_dir_vp); | 332 | IHOLD(target_dp); |
| 372 | xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); | 333 | xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); |
| 373 | } | 334 | } |
| 374 | if ((src_ip != src_dp) && (src_ip != target_dp)) { | 335 | if ((src_ip != src_dp) && (src_ip != target_dp)) { |
| @@ -389,9 +350,8 @@ xfs_rename( | |||
| 389 | * If there's no space reservation, check the entry will | 350 | * If there's no space reservation, check the entry will |
| 390 | * fit before actually inserting it. | 351 | * fit before actually inserting it. |
| 391 | */ | 352 | */ |
| 392 | if (spaceres == 0 && | 353 | error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); |
| 393 | (error = xfs_dir_canenter(tp, target_dp, target_name, | 354 | if (error) |
| 394 | target_namelen))) | ||
| 395 | goto error_return; | 355 | goto error_return; |
| 396 | /* | 356 | /* |
| 397 | * If target does not exist and the rename crosses | 357 | * If target does not exist and the rename crosses |
| @@ -399,8 +359,8 @@ xfs_rename( | |||
| 399 | * to account for the ".." reference from the new entry. | 359 | * to account for the ".." reference from the new entry. |
| 400 | */ | 360 | */ |
| 401 | error = xfs_dir_createname(tp, target_dp, target_name, | 361 | error = xfs_dir_createname(tp, target_dp, target_name, |
| 402 | target_namelen, src_ip->i_ino, | 362 | src_ip->i_ino, &first_block, |
| 403 | &first_block, &free_list, spaceres); | 363 | &free_list, spaceres); |
| 404 | if (error == ENOSPC) | 364 | if (error == ENOSPC) |
| 405 | goto error_return; | 365 | goto error_return; |
| 406 | if (error) | 366 | if (error) |
| @@ -439,7 +399,7 @@ xfs_rename( | |||
| 439 | * name at the destination directory, remove it first. | 399 | * name at the destination directory, remove it first. |
| 440 | */ | 400 | */ |
| 441 | error = xfs_dir_replace(tp, target_dp, target_name, | 401 | error = xfs_dir_replace(tp, target_dp, target_name, |
| 442 | target_namelen, src_ip->i_ino, | 402 | src_ip->i_ino, |
| 443 | &first_block, &free_list, spaceres); | 403 | &first_block, &free_list, spaceres); |
| 444 | if (error) | 404 | if (error) |
| 445 | goto abort_return; | 405 | goto abort_return; |
| @@ -476,7 +436,8 @@ xfs_rename( | |||
| 476 | * Rewrite the ".." entry to point to the new | 436 | * Rewrite the ".." entry to point to the new |
| 477 | * directory. | 437 | * directory. |
| 478 | */ | 438 | */ |
| 479 | error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino, | 439 | error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, |
| 440 | target_dp->i_ino, | ||
| 480 | &first_block, &free_list, spaceres); | 441 | &first_block, &free_list, spaceres); |
| 481 | ASSERT(error != EEXIST); | 442 | ASSERT(error != EEXIST); |
| 482 | if (error) | 443 | if (error) |
| @@ -512,8 +473,8 @@ xfs_rename( | |||
| 512 | goto abort_return; | 473 | goto abort_return; |
| 513 | } | 474 | } |
| 514 | 475 | ||
| 515 | error = xfs_dir_removename(tp, src_dp, src_name, src_namelen, | 476 | error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, |
| 516 | src_ip->i_ino, &first_block, &free_list, spaceres); | 477 | &first_block, &free_list, spaceres); |
| 517 | if (error) | 478 | if (error) |
| 518 | goto abort_return; | 479 | goto abort_return; |
| 519 | xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 480 | xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
| @@ -580,10 +541,8 @@ xfs_rename( | |||
| 580 | * the vnode references. | 541 | * the vnode references. |
| 581 | */ | 542 | */ |
| 582 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 543 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); |
| 583 | if (target_ip != NULL) { | 544 | if (target_ip != NULL) |
| 584 | xfs_refcache_purge_ip(target_ip); | ||
| 585 | IRELE(target_ip); | 545 | IRELE(target_ip); |
| 586 | } | ||
| 587 | /* | 546 | /* |
| 588 | * Let interposed file systems know about removed links. | 547 | * Let interposed file systems know about removed links. |
| 589 | */ | 548 | */ |
| @@ -598,9 +557,9 @@ std_return: | |||
| 598 | if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) || | 557 | if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) || |
| 599 | DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) { | 558 | DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) { |
| 600 | (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME, | 559 | (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME, |
| 601 | src_dir_vp, DM_RIGHT_NULL, | 560 | src_dp, DM_RIGHT_NULL, |
| 602 | target_dir_vp, DM_RIGHT_NULL, | 561 | target_dp, DM_RIGHT_NULL, |
| 603 | src_name, target_name, | 562 | src_name->name, target_name->name, |
| 604 | 0, error, 0); | 563 | 0, error, 0); |
| 605 | } | 564 | } |
| 606 | return error; | 565 | return error; |
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 47082c01872d..a0dc6e5bc5b9 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c | |||
| @@ -44,6 +44,7 @@ | |||
| 44 | #include "xfs_rw.h" | 44 | #include "xfs_rw.h" |
| 45 | #include "xfs_inode_item.h" | 45 | #include "xfs_inode_item.h" |
| 46 | #include "xfs_trans_space.h" | 46 | #include "xfs_trans_space.h" |
| 47 | #include "xfs_utils.h" | ||
| 47 | 48 | ||
| 48 | 49 | ||
| 49 | /* | 50 | /* |
| @@ -123,14 +124,14 @@ xfs_growfs_rt_alloc( | |||
| 123 | XFS_GROWRTALLOC_LOG_RES(mp), 0, | 124 | XFS_GROWRTALLOC_LOG_RES(mp), 0, |
| 124 | XFS_TRANS_PERM_LOG_RES, | 125 | XFS_TRANS_PERM_LOG_RES, |
| 125 | XFS_DEFAULT_PERM_LOG_COUNT))) | 126 | XFS_DEFAULT_PERM_LOG_COUNT))) |
| 126 | goto error_exit; | 127 | goto error_cancel; |
| 127 | cancelflags = XFS_TRANS_RELEASE_LOG_RES; | 128 | cancelflags = XFS_TRANS_RELEASE_LOG_RES; |
| 128 | /* | 129 | /* |
| 129 | * Lock the inode. | 130 | * Lock the inode. |
| 130 | */ | 131 | */ |
| 131 | if ((error = xfs_trans_iget(mp, tp, ino, 0, | 132 | if ((error = xfs_trans_iget(mp, tp, ino, 0, |
| 132 | XFS_ILOCK_EXCL, &ip))) | 133 | XFS_ILOCK_EXCL, &ip))) |
| 133 | goto error_exit; | 134 | goto error_cancel; |
| 134 | XFS_BMAP_INIT(&flist, &firstblock); | 135 | XFS_BMAP_INIT(&flist, &firstblock); |
| 135 | /* | 136 | /* |
| 136 | * Allocate blocks to the bitmap file. | 137 | * Allocate blocks to the bitmap file. |
| @@ -143,14 +144,16 @@ xfs_growfs_rt_alloc( | |||
| 143 | if (!error && nmap < 1) | 144 | if (!error && nmap < 1) |
| 144 | error = XFS_ERROR(ENOSPC); | 145 | error = XFS_ERROR(ENOSPC); |
| 145 | if (error) | 146 | if (error) |
| 146 | goto error_exit; | 147 | goto error_cancel; |
| 147 | /* | 148 | /* |
| 148 | * Free any blocks freed up in the transaction, then commit. | 149 | * Free any blocks freed up in the transaction, then commit. |
| 149 | */ | 150 | */ |
| 150 | error = xfs_bmap_finish(&tp, &flist, &committed); | 151 | error = xfs_bmap_finish(&tp, &flist, &committed); |
| 151 | if (error) | 152 | if (error) |
| 152 | goto error_exit; | 153 | goto error_cancel; |
| 153 | xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 154 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); |
| 155 | if (error) | ||
| 156 | goto error; | ||
| 154 | /* | 157 | /* |
| 155 | * Now we need to clear the allocated blocks. | 158 | * Now we need to clear the allocated blocks. |
| 156 | * Do this one block per transaction, to keep it simple. | 159 | * Do this one block per transaction, to keep it simple. |
| @@ -165,13 +168,13 @@ xfs_growfs_rt_alloc( | |||
| 165 | */ | 168 | */ |
| 166 | if ((error = xfs_trans_reserve(tp, 0, | 169 | if ((error = xfs_trans_reserve(tp, 0, |
| 167 | XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) | 170 | XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) |
| 168 | goto error_exit; | 171 | goto error_cancel; |
| 169 | /* | 172 | /* |
| 170 | * Lock the bitmap inode. | 173 | * Lock the bitmap inode. |
| 171 | */ | 174 | */ |
| 172 | if ((error = xfs_trans_iget(mp, tp, ino, 0, | 175 | if ((error = xfs_trans_iget(mp, tp, ino, 0, |
| 173 | XFS_ILOCK_EXCL, &ip))) | 176 | XFS_ILOCK_EXCL, &ip))) |
| 174 | goto error_exit; | 177 | goto error_cancel; |
| 175 | /* | 178 | /* |
| 176 | * Get a buffer for the block. | 179 | * Get a buffer for the block. |
| 177 | */ | 180 | */ |
| @@ -180,14 +183,16 @@ xfs_growfs_rt_alloc( | |||
| 180 | mp->m_bsize, 0); | 183 | mp->m_bsize, 0); |
| 181 | if (bp == NULL) { | 184 | if (bp == NULL) { |
| 182 | error = XFS_ERROR(EIO); | 185 | error = XFS_ERROR(EIO); |
| 183 | goto error_exit; | 186 | goto error_cancel; |
| 184 | } | 187 | } |
| 185 | memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); | 188 | memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); |
| 186 | xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); | 189 | xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); |
| 187 | /* | 190 | /* |
| 188 | * Commit the transaction. | 191 | * Commit the transaction. |
| 189 | */ | 192 | */ |
| 190 | xfs_trans_commit(tp, 0); | 193 | error = xfs_trans_commit(tp, 0); |
| 194 | if (error) | ||
| 195 | goto error; | ||
| 191 | } | 196 | } |
| 192 | /* | 197 | /* |
| 193 | * Go on to the next extent, if any. | 198 | * Go on to the next extent, if any. |
| @@ -195,8 +200,9 @@ xfs_growfs_rt_alloc( | |||
| 195 | oblocks = map.br_startoff + map.br_blockcount; | 200 | oblocks = map.br_startoff + map.br_blockcount; |
| 196 | } | 201 | } |
| 197 | return 0; | 202 | return 0; |
| 198 | error_exit: | 203 | error_cancel: |
| 199 | xfs_trans_cancel(tp, cancelflags); | 204 | xfs_trans_cancel(tp, cancelflags); |
| 205 | error: | ||
| 200 | return error; | 206 | return error; |
| 201 | } | 207 | } |
| 202 | 208 | ||
| @@ -1875,6 +1881,7 @@ xfs_growfs_rt( | |||
| 1875 | xfs_trans_t *tp; /* transaction pointer */ | 1881 | xfs_trans_t *tp; /* transaction pointer */ |
| 1876 | 1882 | ||
| 1877 | sbp = &mp->m_sb; | 1883 | sbp = &mp->m_sb; |
| 1884 | cancelflags = 0; | ||
| 1878 | /* | 1885 | /* |
| 1879 | * Initial error checking. | 1886 | * Initial error checking. |
| 1880 | */ | 1887 | */ |
| @@ -2041,13 +2048,15 @@ xfs_growfs_rt( | |||
| 2041 | */ | 2048 | */ |
| 2042 | mp->m_rsumlevels = nrsumlevels; | 2049 | mp->m_rsumlevels = nrsumlevels; |
| 2043 | mp->m_rsumsize = nrsumsize; | 2050 | mp->m_rsumsize = nrsumsize; |
| 2044 | /* | 2051 | |
| 2045 | * Commit the transaction. | 2052 | error = xfs_trans_commit(tp, 0); |
| 2046 | */ | 2053 | if (error) { |
| 2047 | xfs_trans_commit(tp, 0); | 2054 | tp = NULL; |
| 2055 | break; | ||
| 2056 | } | ||
| 2048 | } | 2057 | } |
| 2049 | 2058 | ||
| 2050 | if (error) | 2059 | if (error && tp) |
| 2051 | xfs_trans_cancel(tp, cancelflags); | 2060 | xfs_trans_cancel(tp, cancelflags); |
| 2052 | 2061 | ||
| 2053 | /* | 2062 | /* |
| @@ -2278,7 +2287,7 @@ xfs_rtmount_inodes( | |||
| 2278 | ASSERT(sbp->sb_rsumino != NULLFSINO); | 2287 | ASSERT(sbp->sb_rsumino != NULLFSINO); |
| 2279 | error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); | 2288 | error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); |
| 2280 | if (error) { | 2289 | if (error) { |
| 2281 | VN_RELE(XFS_ITOV(mp->m_rbmip)); | 2290 | IRELE(mp->m_rbmip); |
| 2282 | return error; | 2291 | return error; |
| 2283 | } | 2292 | } |
| 2284 | ASSERT(mp->m_rsumip != NULL); | 2293 | ASSERT(mp->m_rsumip != NULL); |
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index cd3ece6cc918..b0f31c09a76d 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c | |||
| @@ -126,11 +126,11 @@ xfs_write_sync_logforce( | |||
| 126 | * when we return. | 126 | * when we return. |
| 127 | */ | 127 | */ |
| 128 | if (iip && iip->ili_last_lsn) { | 128 | if (iip && iip->ili_last_lsn) { |
| 129 | xfs_log_force(mp, iip->ili_last_lsn, | 129 | error = _xfs_log_force(mp, iip->ili_last_lsn, |
| 130 | XFS_LOG_FORCE | XFS_LOG_SYNC); | 130 | XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); |
| 131 | } else if (xfs_ipincount(ip) > 0) { | 131 | } else if (xfs_ipincount(ip) > 0) { |
| 132 | xfs_log_force(mp, (xfs_lsn_t)0, | 132 | error = _xfs_log_force(mp, (xfs_lsn_t)0, |
| 133 | XFS_LOG_FORCE | XFS_LOG_SYNC); | 133 | XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); |
| 134 | } | 134 | } |
| 135 | 135 | ||
| 136 | } else { | 136 | } else { |
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 7f40628d85c7..0804207c7391 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
| @@ -113,13 +113,8 @@ struct xfs_mount; | |||
| 113 | struct xfs_trans; | 113 | struct xfs_trans; |
| 114 | struct xfs_dquot_acct; | 114 | struct xfs_dquot_acct; |
| 115 | 115 | ||
| 116 | typedef struct xfs_ail_entry { | ||
| 117 | struct xfs_log_item *ail_forw; /* AIL forw pointer */ | ||
| 118 | struct xfs_log_item *ail_back; /* AIL back pointer */ | ||
| 119 | } xfs_ail_entry_t; | ||
| 120 | |||
| 121 | typedef struct xfs_log_item { | 116 | typedef struct xfs_log_item { |
| 122 | xfs_ail_entry_t li_ail; /* AIL pointers */ | 117 | struct list_head li_ail; /* AIL pointers */ |
| 123 | xfs_lsn_t li_lsn; /* last on-disk lsn */ | 118 | xfs_lsn_t li_lsn; /* last on-disk lsn */ |
| 124 | struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ | 119 | struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ |
| 125 | struct xfs_mount *li_mountp; /* ptr to fs mount */ | 120 | struct xfs_mount *li_mountp; /* ptr to fs mount */ |
| @@ -341,7 +336,6 @@ typedef struct xfs_trans { | |||
| 341 | unsigned int t_rtx_res; /* # of rt extents resvd */ | 336 | unsigned int t_rtx_res; /* # of rt extents resvd */ |
| 342 | unsigned int t_rtx_res_used; /* # of resvd rt extents used */ | 337 | unsigned int t_rtx_res_used; /* # of resvd rt extents used */ |
| 343 | xfs_log_ticket_t t_ticket; /* log mgr ticket */ | 338 | xfs_log_ticket_t t_ticket; /* log mgr ticket */ |
| 344 | sema_t t_sema; /* sema for commit completion */ | ||
| 345 | xfs_lsn_t t_lsn; /* log seq num of start of | 339 | xfs_lsn_t t_lsn; /* log seq num of start of |
| 346 | * transaction. */ | 340 | * transaction. */ |
| 347 | xfs_lsn_t t_commit_lsn; /* log seq num of end of | 341 | xfs_lsn_t t_commit_lsn; /* log seq num of end of |
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 76d470d8a1e6..1f77c00af566 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c | |||
| @@ -28,13 +28,13 @@ | |||
| 28 | #include "xfs_trans_priv.h" | 28 | #include "xfs_trans_priv.h" |
| 29 | #include "xfs_error.h" | 29 | #include "xfs_error.h" |
| 30 | 30 | ||
| 31 | STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *); | 31 | STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *); |
| 32 | STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *); | 32 | STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *); |
| 33 | STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *); | 33 | STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *); |
| 34 | STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *); | 34 | STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *); |
| 35 | 35 | ||
| 36 | #ifdef DEBUG | 36 | #ifdef DEBUG |
| 37 | STATIC void xfs_ail_check(xfs_ail_entry_t *, xfs_log_item_t *); | 37 | STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *); |
| 38 | #else | 38 | #else |
| 39 | #define xfs_ail_check(a,l) | 39 | #define xfs_ail_check(a,l) |
| 40 | #endif /* DEBUG */ | 40 | #endif /* DEBUG */ |
| @@ -57,7 +57,7 @@ xfs_trans_tail_ail( | |||
| 57 | xfs_log_item_t *lip; | 57 | xfs_log_item_t *lip; |
| 58 | 58 | ||
| 59 | spin_lock(&mp->m_ail_lock); | 59 | spin_lock(&mp->m_ail_lock); |
| 60 | lip = xfs_ail_min(&(mp->m_ail.xa_ail)); | 60 | lip = xfs_ail_min(&mp->m_ail); |
| 61 | if (lip == NULL) { | 61 | if (lip == NULL) { |
| 62 | lsn = (xfs_lsn_t)0; | 62 | lsn = (xfs_lsn_t)0; |
| 63 | } else { | 63 | } else { |
| @@ -91,7 +91,7 @@ xfs_trans_push_ail( | |||
| 91 | { | 91 | { |
| 92 | xfs_log_item_t *lip; | 92 | xfs_log_item_t *lip; |
| 93 | 93 | ||
| 94 | lip = xfs_ail_min(&mp->m_ail.xa_ail); | 94 | lip = xfs_ail_min(&mp->m_ail); |
| 95 | if (lip && !XFS_FORCED_SHUTDOWN(mp)) { | 95 | if (lip && !XFS_FORCED_SHUTDOWN(mp)) { |
| 96 | if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) | 96 | if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) |
| 97 | xfsaild_wakeup(mp, threshold_lsn); | 97 | xfsaild_wakeup(mp, threshold_lsn); |
| @@ -111,15 +111,17 @@ xfs_trans_first_push_ail( | |||
| 111 | { | 111 | { |
| 112 | xfs_log_item_t *lip; | 112 | xfs_log_item_t *lip; |
| 113 | 113 | ||
| 114 | lip = xfs_ail_min(&(mp->m_ail.xa_ail)); | 114 | lip = xfs_ail_min(&mp->m_ail); |
| 115 | *gen = (int)mp->m_ail.xa_gen; | 115 | *gen = (int)mp->m_ail.xa_gen; |
| 116 | if (lsn == 0) | 116 | if (lsn == 0) |
| 117 | return lip; | 117 | return lip; |
| 118 | 118 | ||
| 119 | while (lip && (XFS_LSN_CMP(lip->li_lsn, lsn) < 0)) | 119 | list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) { |
| 120 | lip = lip->li_ail.ail_forw; | 120 | if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) |
| 121 | return lip; | ||
| 122 | } | ||
| 121 | 123 | ||
| 122 | return lip; | 124 | return NULL; |
| 123 | } | 125 | } |
| 124 | 126 | ||
| 125 | /* | 127 | /* |
| @@ -329,7 +331,7 @@ xfs_trans_unlocked_item( | |||
| 329 | * the call to xfs_log_move_tail() doesn't do anything if there's | 331 | * the call to xfs_log_move_tail() doesn't do anything if there's |
| 330 | * not enough free space to wake people up so we're safe calling it. | 332 | * not enough free space to wake people up so we're safe calling it. |
| 331 | */ | 333 | */ |
| 332 | min_lip = xfs_ail_min(&mp->m_ail.xa_ail); | 334 | min_lip = xfs_ail_min(&mp->m_ail); |
| 333 | 335 | ||
| 334 | if (min_lip == lip) | 336 | if (min_lip == lip) |
| 335 | xfs_log_move_tail(mp, 1); | 337 | xfs_log_move_tail(mp, 1); |
| @@ -357,15 +359,13 @@ xfs_trans_update_ail( | |||
| 357 | xfs_log_item_t *lip, | 359 | xfs_log_item_t *lip, |
| 358 | xfs_lsn_t lsn) __releases(mp->m_ail_lock) | 360 | xfs_lsn_t lsn) __releases(mp->m_ail_lock) |
| 359 | { | 361 | { |
| 360 | xfs_ail_entry_t *ailp; | ||
| 361 | xfs_log_item_t *dlip=NULL; | 362 | xfs_log_item_t *dlip=NULL; |
| 362 | xfs_log_item_t *mlip; /* ptr to minimum lip */ | 363 | xfs_log_item_t *mlip; /* ptr to minimum lip */ |
| 363 | 364 | ||
| 364 | ailp = &(mp->m_ail.xa_ail); | 365 | mlip = xfs_ail_min(&mp->m_ail); |
| 365 | mlip = xfs_ail_min(ailp); | ||
| 366 | 366 | ||
| 367 | if (lip->li_flags & XFS_LI_IN_AIL) { | 367 | if (lip->li_flags & XFS_LI_IN_AIL) { |
| 368 | dlip = xfs_ail_delete(ailp, lip); | 368 | dlip = xfs_ail_delete(&mp->m_ail, lip); |
| 369 | ASSERT(dlip == lip); | 369 | ASSERT(dlip == lip); |
| 370 | } else { | 370 | } else { |
| 371 | lip->li_flags |= XFS_LI_IN_AIL; | 371 | lip->li_flags |= XFS_LI_IN_AIL; |
| @@ -373,11 +373,11 @@ xfs_trans_update_ail( | |||
| 373 | 373 | ||
| 374 | lip->li_lsn = lsn; | 374 | lip->li_lsn = lsn; |
| 375 | 375 | ||
| 376 | xfs_ail_insert(ailp, lip); | 376 | xfs_ail_insert(&mp->m_ail, lip); |
| 377 | mp->m_ail.xa_gen++; | 377 | mp->m_ail.xa_gen++; |
| 378 | 378 | ||
| 379 | if (mlip == dlip) { | 379 | if (mlip == dlip) { |
| 380 | mlip = xfs_ail_min(&(mp->m_ail.xa_ail)); | 380 | mlip = xfs_ail_min(&mp->m_ail); |
| 381 | spin_unlock(&mp->m_ail_lock); | 381 | spin_unlock(&mp->m_ail_lock); |
| 382 | xfs_log_move_tail(mp, mlip->li_lsn); | 382 | xfs_log_move_tail(mp, mlip->li_lsn); |
| 383 | } else { | 383 | } else { |
| @@ -407,14 +407,12 @@ xfs_trans_delete_ail( | |||
| 407 | xfs_mount_t *mp, | 407 | xfs_mount_t *mp, |
| 408 | xfs_log_item_t *lip) __releases(mp->m_ail_lock) | 408 | xfs_log_item_t *lip) __releases(mp->m_ail_lock) |
| 409 | { | 409 | { |
| 410 | xfs_ail_entry_t *ailp; | ||
| 411 | xfs_log_item_t *dlip; | 410 | xfs_log_item_t *dlip; |
| 412 | xfs_log_item_t *mlip; | 411 | xfs_log_item_t *mlip; |
| 413 | 412 | ||
| 414 | if (lip->li_flags & XFS_LI_IN_AIL) { | 413 | if (lip->li_flags & XFS_LI_IN_AIL) { |
| 415 | ailp = &(mp->m_ail.xa_ail); | 414 | mlip = xfs_ail_min(&mp->m_ail); |
| 416 | mlip = xfs_ail_min(ailp); | 415 | dlip = xfs_ail_delete(&mp->m_ail, lip); |
| 417 | dlip = xfs_ail_delete(ailp, lip); | ||
| 418 | ASSERT(dlip == lip); | 416 | ASSERT(dlip == lip); |
| 419 | 417 | ||
| 420 | 418 | ||
| @@ -423,7 +421,7 @@ xfs_trans_delete_ail( | |||
| 423 | mp->m_ail.xa_gen++; | 421 | mp->m_ail.xa_gen++; |
| 424 | 422 | ||
| 425 | if (mlip == dlip) { | 423 | if (mlip == dlip) { |
| 426 | mlip = xfs_ail_min(&(mp->m_ail.xa_ail)); | 424 | mlip = xfs_ail_min(&mp->m_ail); |
| 427 | spin_unlock(&mp->m_ail_lock); | 425 | spin_unlock(&mp->m_ail_lock); |
| 428 | xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); | 426 | xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); |
| 429 | } else { | 427 | } else { |
| @@ -440,7 +438,7 @@ xfs_trans_delete_ail( | |||
| 440 | else { | 438 | else { |
| 441 | xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, | 439 | xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, |
| 442 | "%s: attempting to delete a log item that is not in the AIL", | 440 | "%s: attempting to delete a log item that is not in the AIL", |
| 443 | __FUNCTION__); | 441 | __func__); |
| 444 | spin_unlock(&mp->m_ail_lock); | 442 | spin_unlock(&mp->m_ail_lock); |
| 445 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | 443 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); |
| 446 | } | 444 | } |
| @@ -461,7 +459,7 @@ xfs_trans_first_ail( | |||
| 461 | { | 459 | { |
| 462 | xfs_log_item_t *lip; | 460 | xfs_log_item_t *lip; |
| 463 | 461 | ||
| 464 | lip = xfs_ail_min(&(mp->m_ail.xa_ail)); | 462 | lip = xfs_ail_min(&mp->m_ail); |
| 465 | *gen = (int)mp->m_ail.xa_gen; | 463 | *gen = (int)mp->m_ail.xa_gen; |
| 466 | 464 | ||
| 467 | return lip; | 465 | return lip; |
| @@ -485,9 +483,9 @@ xfs_trans_next_ail( | |||
| 485 | 483 | ||
| 486 | ASSERT(mp && lip && gen); | 484 | ASSERT(mp && lip && gen); |
| 487 | if (mp->m_ail.xa_gen == *gen) { | 485 | if (mp->m_ail.xa_gen == *gen) { |
| 488 | nlip = xfs_ail_next(&(mp->m_ail.xa_ail), lip); | 486 | nlip = xfs_ail_next(&mp->m_ail, lip); |
| 489 | } else { | 487 | } else { |
| 490 | nlip = xfs_ail_min(&(mp->m_ail).xa_ail); | 488 | nlip = xfs_ail_min(&mp->m_ail); |
| 491 | *gen = (int)mp->m_ail.xa_gen; | 489 | *gen = (int)mp->m_ail.xa_gen; |
| 492 | if (restarts != NULL) { | 490 | if (restarts != NULL) { |
| 493 | XFS_STATS_INC(xs_push_ail_restarts); | 491 | XFS_STATS_INC(xs_push_ail_restarts); |
| @@ -517,8 +515,7 @@ int | |||
| 517 | xfs_trans_ail_init( | 515 | xfs_trans_ail_init( |
| 518 | xfs_mount_t *mp) | 516 | xfs_mount_t *mp) |
| 519 | { | 517 | { |
| 520 | mp->m_ail.xa_ail.ail_forw = (xfs_log_item_t*)&mp->m_ail.xa_ail; | 518 | INIT_LIST_HEAD(&mp->m_ail.xa_ail); |
| 521 | mp->m_ail.xa_ail.ail_back = (xfs_log_item_t*)&mp->m_ail.xa_ail; | ||
| 522 | return xfsaild_start(mp); | 519 | return xfsaild_start(mp); |
| 523 | } | 520 | } |
| 524 | 521 | ||
| @@ -537,7 +534,7 @@ xfs_trans_ail_destroy( | |||
| 537 | */ | 534 | */ |
| 538 | STATIC void | 535 | STATIC void |
| 539 | xfs_ail_insert( | 536 | xfs_ail_insert( |
| 540 | xfs_ail_entry_t *base, | 537 | xfs_ail_t *ailp, |
| 541 | xfs_log_item_t *lip) | 538 | xfs_log_item_t *lip) |
| 542 | /* ARGSUSED */ | 539 | /* ARGSUSED */ |
| 543 | { | 540 | { |
| @@ -546,27 +543,22 @@ xfs_ail_insert( | |||
| 546 | /* | 543 | /* |
| 547 | * If the list is empty, just insert the item. | 544 | * If the list is empty, just insert the item. |
| 548 | */ | 545 | */ |
| 549 | if (base->ail_back == (xfs_log_item_t*)base) { | 546 | if (list_empty(&ailp->xa_ail)) { |
| 550 | base->ail_forw = lip; | 547 | list_add(&lip->li_ail, &ailp->xa_ail); |
| 551 | base->ail_back = lip; | ||
| 552 | lip->li_ail.ail_forw = (xfs_log_item_t*)base; | ||
| 553 | lip->li_ail.ail_back = (xfs_log_item_t*)base; | ||
| 554 | return; | 548 | return; |
| 555 | } | 549 | } |
| 556 | 550 | ||
| 557 | next_lip = base->ail_back; | 551 | list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { |
| 558 | while ((next_lip != (xfs_log_item_t*)base) && | 552 | if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) |
| 559 | (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) { | 553 | break; |
| 560 | next_lip = next_lip->li_ail.ail_back; | ||
| 561 | } | 554 | } |
| 562 | ASSERT((next_lip == (xfs_log_item_t*)base) || | 555 | |
| 556 | ASSERT((&next_lip->li_ail == &ailp->xa_ail) || | ||
| 563 | (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); | 557 | (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); |
| 564 | lip->li_ail.ail_forw = next_lip->li_ail.ail_forw; | ||
| 565 | lip->li_ail.ail_back = next_lip; | ||
| 566 | next_lip->li_ail.ail_forw = lip; | ||
| 567 | lip->li_ail.ail_forw->li_ail.ail_back = lip; | ||
| 568 | 558 | ||
| 569 | xfs_ail_check(base, lip); | 559 | list_add(&lip->li_ail, &next_lip->li_ail); |
| 560 | |||
| 561 | xfs_ail_check(ailp, lip); | ||
| 570 | return; | 562 | return; |
| 571 | } | 563 | } |
| 572 | 564 | ||
| @@ -576,15 +568,13 @@ xfs_ail_insert( | |||
| 576 | /*ARGSUSED*/ | 568 | /*ARGSUSED*/ |
| 577 | STATIC xfs_log_item_t * | 569 | STATIC xfs_log_item_t * |
| 578 | xfs_ail_delete( | 570 | xfs_ail_delete( |
| 579 | xfs_ail_entry_t *base, | 571 | xfs_ail_t *ailp, |
| 580 | xfs_log_item_t *lip) | 572 | xfs_log_item_t *lip) |
| 581 | /* ARGSUSED */ | 573 | /* ARGSUSED */ |
| 582 | { | 574 | { |
| 583 | xfs_ail_check(base, lip); | 575 | xfs_ail_check(ailp, lip); |
| 584 | lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back; | 576 | |
| 585 | lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw; | 577 | list_del(&lip->li_ail); |
| 586 | lip->li_ail.ail_forw = NULL; | ||
| 587 | lip->li_ail.ail_back = NULL; | ||
| 588 | 578 | ||
| 589 | return lip; | 579 | return lip; |
| 590 | } | 580 | } |
| @@ -595,14 +585,13 @@ xfs_ail_delete( | |||
| 595 | */ | 585 | */ |
| 596 | STATIC xfs_log_item_t * | 586 | STATIC xfs_log_item_t * |
| 597 | xfs_ail_min( | 587 | xfs_ail_min( |
| 598 | xfs_ail_entry_t *base) | 588 | xfs_ail_t *ailp) |
| 599 | /* ARGSUSED */ | 589 | /* ARGSUSED */ |
| 600 | { | 590 | { |
| 601 | register xfs_log_item_t *forw = base->ail_forw; | 591 | if (list_empty(&ailp->xa_ail)) |
| 602 | if (forw == (xfs_log_item_t*)base) { | ||
| 603 | return NULL; | 592 | return NULL; |
| 604 | } | 593 | |
| 605 | return forw; | 594 | return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); |
| 606 | } | 595 | } |
| 607 | 596 | ||
| 608 | /* | 597 | /* |
| @@ -612,15 +601,14 @@ xfs_ail_min( | |||
| 612 | */ | 601 | */ |
| 613 | STATIC xfs_log_item_t * | 602 | STATIC xfs_log_item_t * |
| 614 | xfs_ail_next( | 603 | xfs_ail_next( |
| 615 | xfs_ail_entry_t *base, | 604 | xfs_ail_t *ailp, |
| 616 | xfs_log_item_t *lip) | 605 | xfs_log_item_t *lip) |
| 617 | /* ARGSUSED */ | 606 | /* ARGSUSED */ |
| 618 | { | 607 | { |
| 619 | if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) { | 608 | if (lip->li_ail.next == &ailp->xa_ail) |
| 620 | return NULL; | 609 | return NULL; |
| 621 | } | ||
| 622 | return lip->li_ail.ail_forw; | ||
| 623 | 610 | ||
| 611 | return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); | ||
| 624 | } | 612 | } |
| 625 | 613 | ||
| 626 | #ifdef DEBUG | 614 | #ifdef DEBUG |
| @@ -629,57 +617,40 @@ xfs_ail_next( | |||
| 629 | */ | 617 | */ |
| 630 | STATIC void | 618 | STATIC void |
| 631 | xfs_ail_check( | 619 | xfs_ail_check( |
| 632 | xfs_ail_entry_t *base, | 620 | xfs_ail_t *ailp, |
| 633 | xfs_log_item_t *lip) | 621 | xfs_log_item_t *lip) |
| 634 | { | 622 | { |
| 635 | xfs_log_item_t *prev_lip; | 623 | xfs_log_item_t *prev_lip; |
| 636 | 624 | ||
| 637 | prev_lip = base->ail_forw; | 625 | if (list_empty(&ailp->xa_ail)) |
| 638 | if (prev_lip == (xfs_log_item_t*)base) { | ||
| 639 | /* | ||
| 640 | * Make sure the pointers are correct when the list | ||
| 641 | * is empty. | ||
| 642 | */ | ||
| 643 | ASSERT(base->ail_back == (xfs_log_item_t*)base); | ||
| 644 | return; | 626 | return; |
| 645 | } | ||
| 646 | 627 | ||
| 647 | /* | 628 | /* |
| 648 | * Check the next and previous entries are valid. | 629 | * Check the next and previous entries are valid. |
| 649 | */ | 630 | */ |
| 650 | ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); | 631 | ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); |
| 651 | prev_lip = lip->li_ail.ail_back; | 632 | prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); |
| 652 | if (prev_lip != (xfs_log_item_t*)base) { | 633 | if (&prev_lip->li_ail != &ailp->xa_ail) |
| 653 | ASSERT(prev_lip->li_ail.ail_forw == lip); | ||
| 654 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); | 634 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); |
| 655 | } | 635 | |
| 656 | prev_lip = lip->li_ail.ail_forw; | 636 | prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); |
| 657 | if (prev_lip != (xfs_log_item_t*)base) { | 637 | if (&prev_lip->li_ail != &ailp->xa_ail) |
| 658 | ASSERT(prev_lip->li_ail.ail_back == lip); | ||
| 659 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); | 638 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); |
| 660 | } | ||
| 661 | 639 | ||
| 662 | 640 | ||
| 663 | #ifdef XFS_TRANS_DEBUG | 641 | #ifdef XFS_TRANS_DEBUG |
| 664 | /* | 642 | /* |
| 665 | * Walk the list checking forward and backward pointers, | 643 | * Walk the list checking lsn ordering, and that every entry has the |
| 666 | * lsn ordering, and that every entry has the XFS_LI_IN_AIL | 644 | * XFS_LI_IN_AIL flag set. This is really expensive, so only do it |
| 667 | * flag set. This is really expensive, so only do it when | 645 | * when specifically debugging the transaction subsystem. |
| 668 | * specifically debugging the transaction subsystem. | ||
| 669 | */ | 646 | */ |
| 670 | prev_lip = (xfs_log_item_t*)base; | 647 | prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); |
| 671 | while (lip != (xfs_log_item_t*)base) { | 648 | list_for_each_entry(lip, &ailp->xa_ail, li_ail) { |
| 672 | if (prev_lip != (xfs_log_item_t*)base) { | 649 | if (&prev_lip->li_ail != &ailp->xa_ail) |
| 673 | ASSERT(prev_lip->li_ail.ail_forw == lip); | ||
| 674 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); | 650 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); |
| 675 | } | ||
| 676 | ASSERT(lip->li_ail.ail_back == prev_lip); | ||
| 677 | ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); | 651 | ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); |
| 678 | prev_lip = lip; | 652 | prev_lip = lip; |
| 679 | lip = lip->li_ail.ail_forw; | ||
| 680 | } | 653 | } |
| 681 | ASSERT(lip == (xfs_log_item_t*)base); | ||
| 682 | ASSERT(base->ail_back == prev_lip); | ||
| 683 | #endif /* XFS_TRANS_DEBUG */ | 654 | #endif /* XFS_TRANS_DEBUG */ |
| 684 | } | 655 | } |
| 685 | #endif /* DEBUG */ | 656 | #endif /* DEBUG */ |
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 60b6b898022b..cb0c5839154b 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c | |||
| @@ -304,7 +304,8 @@ xfs_trans_read_buf( | |||
| 304 | if (tp == NULL) { | 304 | if (tp == NULL) { |
| 305 | bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); | 305 | bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); |
| 306 | if (!bp) | 306 | if (!bp) |
| 307 | return XFS_ERROR(ENOMEM); | 307 | return (flags & XFS_BUF_TRYLOCK) ? |
| 308 | EAGAIN : XFS_ERROR(ENOMEM); | ||
| 308 | 309 | ||
| 309 | if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { | 310 | if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { |
| 310 | xfs_ioerror_alert("xfs_trans_read_buf", mp, | 311 | xfs_ioerror_alert("xfs_trans_read_buf", mp, |
| @@ -353,17 +354,15 @@ xfs_trans_read_buf( | |||
| 353 | ASSERT(!XFS_BUF_ISASYNC(bp)); | 354 | ASSERT(!XFS_BUF_ISASYNC(bp)); |
| 354 | XFS_BUF_READ(bp); | 355 | XFS_BUF_READ(bp); |
| 355 | xfsbdstrat(tp->t_mountp, bp); | 356 | xfsbdstrat(tp->t_mountp, bp); |
| 356 | xfs_iowait(bp); | 357 | error = xfs_iowait(bp); |
| 357 | if (XFS_BUF_GETERROR(bp) != 0) { | 358 | if (error) { |
| 358 | xfs_ioerror_alert("xfs_trans_read_buf", mp, | 359 | xfs_ioerror_alert("xfs_trans_read_buf", mp, |
| 359 | bp, blkno); | 360 | bp, blkno); |
| 360 | error = XFS_BUF_GETERROR(bp); | ||
| 361 | xfs_buf_relse(bp); | 361 | xfs_buf_relse(bp); |
| 362 | /* | 362 | /* |
| 363 | * We can gracefully recover from most | 363 | * We can gracefully recover from most read |
| 364 | * read errors. Ones we can't are those | 364 | * errors. Ones we can't are those that happen |
| 365 | * that happen after the transaction's | 365 | * after the transaction's already dirty. |
| 366 | * already dirty. | ||
| 367 | */ | 366 | */ |
| 368 | if (tp->t_flags & XFS_TRANS_DIRTY) | 367 | if (tp->t_flags & XFS_TRANS_DIRTY) |
| 369 | xfs_force_shutdown(tp->t_mountp, | 368 | xfs_force_shutdown(tp->t_mountp, |
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 5c89be475464..0f5191644ab2 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h | |||
| @@ -160,4 +160,9 @@ typedef enum { | |||
| 160 | XFS_BTNUM_MAX | 160 | XFS_BTNUM_MAX |
| 161 | } xfs_btnum_t; | 161 | } xfs_btnum_t; |
| 162 | 162 | ||
| 163 | struct xfs_name { | ||
| 164 | const char *name; | ||
| 165 | int len; | ||
| 166 | }; | ||
| 167 | |||
| 163 | #endif /* __XFS_TYPES_H__ */ | 168 | #endif /* __XFS_TYPES_H__ */ |
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 18a85e746680..2b8dc7e40772 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c | |||
| @@ -40,34 +40,12 @@ | |||
| 40 | #include "xfs_itable.h" | 40 | #include "xfs_itable.h" |
| 41 | #include "xfs_utils.h" | 41 | #include "xfs_utils.h" |
| 42 | 42 | ||
| 43 | /* | ||
| 44 | * xfs_get_dir_entry is used to get a reference to an inode given | ||
| 45 | * its parent directory inode and the name of the file. It does | ||
| 46 | * not lock the child inode, and it unlocks the directory before | ||
| 47 | * returning. The directory's generation number is returned for | ||
| 48 | * use by a later call to xfs_lock_dir_and_entry. | ||
| 49 | */ | ||
| 50 | int | ||
| 51 | xfs_get_dir_entry( | ||
| 52 | bhv_vname_t *dentry, | ||
| 53 | xfs_inode_t **ipp) | ||
| 54 | { | ||
| 55 | bhv_vnode_t *vp; | ||
| 56 | |||
| 57 | vp = VNAME_TO_VNODE(dentry); | ||
| 58 | |||
| 59 | *ipp = xfs_vtoi(vp); | ||
| 60 | if (!*ipp) | ||
| 61 | return XFS_ERROR(ENOENT); | ||
| 62 | VN_HOLD(vp); | ||
| 63 | return 0; | ||
| 64 | } | ||
| 65 | 43 | ||
| 66 | int | 44 | int |
| 67 | xfs_dir_lookup_int( | 45 | xfs_dir_lookup_int( |
| 68 | xfs_inode_t *dp, | 46 | xfs_inode_t *dp, |
| 69 | uint lock_mode, | 47 | uint lock_mode, |
| 70 | bhv_vname_t *dentry, | 48 | struct xfs_name *name, |
| 71 | xfs_ino_t *inum, | 49 | xfs_ino_t *inum, |
| 72 | xfs_inode_t **ipp) | 50 | xfs_inode_t **ipp) |
| 73 | { | 51 | { |
| @@ -75,7 +53,7 @@ xfs_dir_lookup_int( | |||
| 75 | 53 | ||
| 76 | xfs_itrace_entry(dp); | 54 | xfs_itrace_entry(dp); |
| 77 | 55 | ||
| 78 | error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum); | 56 | error = xfs_dir_lookup(NULL, dp, name, inum); |
| 79 | if (!error) { | 57 | if (!error) { |
| 80 | /* | 58 | /* |
| 81 | * Unlock the directory. We do this because we can't | 59 | * Unlock the directory. We do this because we can't |
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index f857fcccb723..175b126d2cab 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h | |||
| @@ -21,15 +21,14 @@ | |||
| 21 | #define IRELE(ip) VN_RELE(XFS_ITOV(ip)) | 21 | #define IRELE(ip) VN_RELE(XFS_ITOV(ip)) |
| 22 | #define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) | 22 | #define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) |
| 23 | 23 | ||
| 24 | extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **); | 24 | extern int xfs_dir_lookup_int(xfs_inode_t *, uint, struct xfs_name *, |
| 25 | extern int xfs_dir_lookup_int (xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *, | 25 | xfs_ino_t *, xfs_inode_t **); |
| 26 | xfs_inode_t **); | 26 | extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); |
| 27 | extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *); | 27 | extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, |
| 28 | extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, | ||
| 29 | xfs_dev_t, cred_t *, prid_t, int, | 28 | xfs_dev_t, cred_t *, prid_t, int, |
| 30 | xfs_inode_t **, int *); | 29 | xfs_inode_t **, int *); |
| 31 | extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *); | 30 | extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); |
| 32 | extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *); | 31 | extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); |
| 33 | extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *); | 32 | extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); |
| 34 | 33 | ||
| 35 | #endif /* __XFS_UTILS_H__ */ | 34 | #endif /* __XFS_UTILS_H__ */ |
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 7094caff13cf..fc48158fe479 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c | |||
| @@ -43,7 +43,6 @@ | |||
| 43 | #include "xfs_error.h" | 43 | #include "xfs_error.h" |
| 44 | #include "xfs_bmap.h" | 44 | #include "xfs_bmap.h" |
| 45 | #include "xfs_rw.h" | 45 | #include "xfs_rw.h" |
| 46 | #include "xfs_refcache.h" | ||
| 47 | #include "xfs_buf_item.h" | 46 | #include "xfs_buf_item.h" |
| 48 | #include "xfs_log_priv.h" | 47 | #include "xfs_log_priv.h" |
| 49 | #include "xfs_dir2_trace.h" | 48 | #include "xfs_dir2_trace.h" |
| @@ -56,6 +55,7 @@ | |||
| 56 | #include "xfs_fsops.h" | 55 | #include "xfs_fsops.h" |
| 57 | #include "xfs_vnodeops.h" | 56 | #include "xfs_vnodeops.h" |
| 58 | #include "xfs_vfsops.h" | 57 | #include "xfs_vfsops.h" |
| 58 | #include "xfs_utils.h" | ||
| 59 | 59 | ||
| 60 | 60 | ||
| 61 | int __init | 61 | int __init |
| @@ -69,15 +69,17 @@ xfs_init(void) | |||
| 69 | /* | 69 | /* |
| 70 | * Initialize all of the zone allocators we use. | 70 | * Initialize all of the zone allocators we use. |
| 71 | */ | 71 | */ |
| 72 | xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), | ||
| 73 | "xfs_log_ticket"); | ||
| 72 | xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), | 74 | xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), |
| 73 | "xfs_bmap_free_item"); | 75 | "xfs_bmap_free_item"); |
| 74 | xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), | 76 | xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), |
| 75 | "xfs_btree_cur"); | 77 | "xfs_btree_cur"); |
| 76 | xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); | 78 | xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), |
| 77 | xfs_da_state_zone = | 79 | "xfs_da_state"); |
| 78 | kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state"); | ||
| 79 | xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); | 80 | xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); |
| 80 | xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); | 81 | xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); |
| 82 | xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); | ||
| 81 | xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); | 83 | xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); |
| 82 | xfs_mru_cache_init(); | 84 | xfs_mru_cache_init(); |
| 83 | xfs_filestream_init(); | 85 | xfs_filestream_init(); |
| @@ -113,9 +115,6 @@ xfs_init(void) | |||
| 113 | xfs_ili_zone = | 115 | xfs_ili_zone = |
| 114 | kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", | 116 | kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", |
| 115 | KM_ZONE_SPREAD, NULL); | 117 | KM_ZONE_SPREAD, NULL); |
| 116 | xfs_icluster_zone = | ||
| 117 | kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster", | ||
| 118 | KM_ZONE_SPREAD, NULL); | ||
| 119 | 118 | ||
| 120 | /* | 119 | /* |
| 121 | * Allocate global trace buffers. | 120 | * Allocate global trace buffers. |
| @@ -153,11 +152,9 @@ xfs_cleanup(void) | |||
| 153 | extern kmem_zone_t *xfs_inode_zone; | 152 | extern kmem_zone_t *xfs_inode_zone; |
| 154 | extern kmem_zone_t *xfs_efd_zone; | 153 | extern kmem_zone_t *xfs_efd_zone; |
| 155 | extern kmem_zone_t *xfs_efi_zone; | 154 | extern kmem_zone_t *xfs_efi_zone; |
| 156 | extern kmem_zone_t *xfs_icluster_zone; | ||
| 157 | 155 | ||
| 158 | xfs_cleanup_procfs(); | 156 | xfs_cleanup_procfs(); |
| 159 | xfs_sysctl_unregister(); | 157 | xfs_sysctl_unregister(); |
| 160 | xfs_refcache_destroy(); | ||
| 161 | xfs_filestream_uninit(); | 158 | xfs_filestream_uninit(); |
| 162 | xfs_mru_cache_uninit(); | 159 | xfs_mru_cache_uninit(); |
| 163 | xfs_acl_zone_destroy(xfs_acl_zone); | 160 | xfs_acl_zone_destroy(xfs_acl_zone); |
| @@ -189,7 +186,6 @@ xfs_cleanup(void) | |||
| 189 | kmem_zone_destroy(xfs_efi_zone); | 186 | kmem_zone_destroy(xfs_efi_zone); |
| 190 | kmem_zone_destroy(xfs_ifork_zone); | 187 | kmem_zone_destroy(xfs_ifork_zone); |
| 191 | kmem_zone_destroy(xfs_ili_zone); | 188 | kmem_zone_destroy(xfs_ili_zone); |
| 192 | kmem_zone_destroy(xfs_icluster_zone); | ||
| 193 | } | 189 | } |
| 194 | 190 | ||
| 195 | /* | 191 | /* |
| @@ -573,7 +569,7 @@ xfs_unmount( | |||
| 573 | #ifdef HAVE_DMAPI | 569 | #ifdef HAVE_DMAPI |
| 574 | if (mp->m_flags & XFS_MOUNT_DMAPI) { | 570 | if (mp->m_flags & XFS_MOUNT_DMAPI) { |
| 575 | error = XFS_SEND_PREUNMOUNT(mp, | 571 | error = XFS_SEND_PREUNMOUNT(mp, |
| 576 | rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL, | 572 | rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL, |
| 577 | NULL, NULL, 0, 0, | 573 | NULL, NULL, 0, 0, |
| 578 | (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))? | 574 | (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))? |
| 579 | 0:DM_FLAGS_UNWANTED); | 575 | 0:DM_FLAGS_UNWANTED); |
| @@ -584,11 +580,6 @@ xfs_unmount( | |||
| 584 | 0 : DM_FLAGS_UNWANTED; | 580 | 0 : DM_FLAGS_UNWANTED; |
| 585 | } | 581 | } |
| 586 | #endif | 582 | #endif |
| 587 | /* | ||
| 588 | * First blow any referenced inode from this file system | ||
| 589 | * out of the reference cache, and delete the timer. | ||
| 590 | */ | ||
| 591 | xfs_refcache_purge_mp(mp); | ||
| 592 | 583 | ||
| 593 | /* | 584 | /* |
| 594 | * Blow away any referenced inode in the filestreams cache. | 585 | * Blow away any referenced inode in the filestreams cache. |
| @@ -607,7 +598,7 @@ xfs_unmount( | |||
| 607 | /* | 598 | /* |
| 608 | * Drop the reference count | 599 | * Drop the reference count |
| 609 | */ | 600 | */ |
| 610 | VN_RELE(rvp); | 601 | IRELE(rip); |
| 611 | 602 | ||
| 612 | /* | 603 | /* |
| 613 | * If we're forcing a shutdown, typically because of a media error, | 604 | * If we're forcing a shutdown, typically because of a media error, |
| @@ -629,7 +620,7 @@ out: | |||
| 629 | /* Note: mp structure must still exist for | 620 | /* Note: mp structure must still exist for |
| 630 | * XFS_SEND_UNMOUNT() call. | 621 | * XFS_SEND_UNMOUNT() call. |
| 631 | */ | 622 | */ |
| 632 | XFS_SEND_UNMOUNT(mp, error == 0 ? rvp : NULL, | 623 | XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL, |
| 633 | DM_RIGHT_NULL, 0, error, unmount_event_flags); | 624 | DM_RIGHT_NULL, 0, error, unmount_event_flags); |
| 634 | } | 625 | } |
| 635 | if (xfs_unmountfs_needed) { | 626 | if (xfs_unmountfs_needed) { |
| @@ -646,13 +637,12 @@ out: | |||
| 646 | return XFS_ERROR(error); | 637 | return XFS_ERROR(error); |
| 647 | } | 638 | } |
| 648 | 639 | ||
| 649 | STATIC int | 640 | STATIC void |
| 650 | xfs_quiesce_fs( | 641 | xfs_quiesce_fs( |
| 651 | xfs_mount_t *mp) | 642 | xfs_mount_t *mp) |
| 652 | { | 643 | { |
| 653 | int count = 0, pincount; | 644 | int count = 0, pincount; |
| 654 | 645 | ||
| 655 | xfs_refcache_purge_mp(mp); | ||
| 656 | xfs_flush_buftarg(mp->m_ddev_targp, 0); | 646 | xfs_flush_buftarg(mp->m_ddev_targp, 0); |
| 657 | xfs_finish_reclaim_all(mp, 0); | 647 | xfs_finish_reclaim_all(mp, 0); |
| 658 | 648 | ||
| @@ -671,8 +661,6 @@ xfs_quiesce_fs( | |||
| 671 | count++; | 661 | count++; |
| 672 | } | 662 | } |
| 673 | } while (count < 2); | 663 | } while (count < 2); |
| 674 | |||
| 675 | return 0; | ||
| 676 | } | 664 | } |
| 677 | 665 | ||
| 678 | /* | 666 | /* |
| @@ -684,6 +672,8 @@ void | |||
| 684 | xfs_attr_quiesce( | 672 | xfs_attr_quiesce( |
| 685 | xfs_mount_t *mp) | 673 | xfs_mount_t *mp) |
| 686 | { | 674 | { |
| 675 | int error = 0; | ||
| 676 | |||
| 687 | /* wait for all modifications to complete */ | 677 | /* wait for all modifications to complete */ |
| 688 | while (atomic_read(&mp->m_active_trans) > 0) | 678 | while (atomic_read(&mp->m_active_trans) > 0) |
| 689 | delay(100); | 679 | delay(100); |
| @@ -694,7 +684,11 @@ xfs_attr_quiesce( | |||
| 694 | ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); | 684 | ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); |
| 695 | 685 | ||
| 696 | /* Push the superblock and write an unmount record */ | 686 | /* Push the superblock and write an unmount record */ |
| 697 | xfs_log_sbcount(mp, 1); | 687 | error = xfs_log_sbcount(mp, 1); |
| 688 | if (error) | ||
| 689 | xfs_fs_cmn_err(CE_WARN, mp, | ||
| 690 | "xfs_attr_quiesce: failed to log sb changes. " | ||
| 691 | "Frozen image may not be consistent."); | ||
| 698 | xfs_log_unmount_write(mp); | 692 | xfs_log_unmount_write(mp); |
| 699 | xfs_unmountfs_writesb(mp); | 693 | xfs_unmountfs_writesb(mp); |
| 700 | } | 694 | } |
| @@ -790,8 +784,8 @@ xfs_unmount_flush( | |||
| 790 | goto fscorrupt_out2; | 784 | goto fscorrupt_out2; |
| 791 | 785 | ||
| 792 | if (rbmip) { | 786 | if (rbmip) { |
| 793 | VN_RELE(XFS_ITOV(rbmip)); | 787 | IRELE(rbmip); |
| 794 | VN_RELE(XFS_ITOV(rsumip)); | 788 | IRELE(rsumip); |
| 795 | } | 789 | } |
| 796 | 790 | ||
| 797 | xfs_iunlock(rip, XFS_ILOCK_EXCL); | 791 | xfs_iunlock(rip, XFS_ILOCK_EXCL); |
| @@ -1169,10 +1163,10 @@ xfs_sync_inodes( | |||
| 1169 | * above, then wait until after we've unlocked | 1163 | * above, then wait until after we've unlocked |
| 1170 | * the inode to release the reference. This is | 1164 | * the inode to release the reference. This is |
| 1171 | * because we can be already holding the inode | 1165 | * because we can be already holding the inode |
| 1172 | * lock when VN_RELE() calls xfs_inactive(). | 1166 | * lock when IRELE() calls xfs_inactive(). |
| 1173 | * | 1167 | * |
| 1174 | * Make sure to drop the mount lock before calling | 1168 | * Make sure to drop the mount lock before calling |
| 1175 | * VN_RELE() so that we don't trip over ourselves if | 1169 | * IRELE() so that we don't trip over ourselves if |
| 1176 | * we have to go for the mount lock again in the | 1170 | * we have to go for the mount lock again in the |
| 1177 | * inactive code. | 1171 | * inactive code. |
| 1178 | */ | 1172 | */ |
| @@ -1180,7 +1174,7 @@ xfs_sync_inodes( | |||
| 1180 | IPOINTER_INSERT(ip, mp); | 1174 | IPOINTER_INSERT(ip, mp); |
| 1181 | } | 1175 | } |
| 1182 | 1176 | ||
| 1183 | VN_RELE(vp); | 1177 | IRELE(ip); |
| 1184 | 1178 | ||
| 1185 | vnode_refed = B_FALSE; | 1179 | vnode_refed = B_FALSE; |
| 1186 | } | 1180 | } |
| @@ -1323,30 +1317,8 @@ xfs_syncsub( | |||
| 1323 | } | 1317 | } |
| 1324 | 1318 | ||
| 1325 | /* | 1319 | /* |
| 1326 | * If this is the periodic sync, then kick some entries out of | ||
| 1327 | * the reference cache. This ensures that idle entries are | ||
| 1328 | * eventually kicked out of the cache. | ||
| 1329 | */ | ||
| 1330 | if (flags & SYNC_REFCACHE) { | ||
| 1331 | if (flags & SYNC_WAIT) | ||
| 1332 | xfs_refcache_purge_mp(mp); | ||
| 1333 | else | ||
| 1334 | xfs_refcache_purge_some(mp); | ||
| 1335 | } | ||
| 1336 | |||
| 1337 | /* | ||
| 1338 | * If asked, update the disk superblock with incore counter values if we | ||
| 1339 | * are using non-persistent counters so that they don't get too far out | ||
| 1340 | * of sync if we crash or get a forced shutdown. We don't want to force | ||
| 1341 | * this to disk, just get a transaction into the iclogs.... | ||
| 1342 | */ | ||
| 1343 | if (flags & SYNC_SUPER) | ||
| 1344 | xfs_log_sbcount(mp, 0); | ||
| 1345 | |||
| 1346 | /* | ||
| 1347 | * Now check to see if the log needs a "dummy" transaction. | 1320 | * Now check to see if the log needs a "dummy" transaction. |
| 1348 | */ | 1321 | */ |
| 1349 | |||
| 1350 | if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) { | 1322 | if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) { |
| 1351 | xfs_trans_t *tp; | 1323 | xfs_trans_t *tp; |
| 1352 | xfs_inode_t *ip; | 1324 | xfs_inode_t *ip; |
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 64c5953feca4..6650601c64f7 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
| @@ -48,7 +48,6 @@ | |||
| 48 | #include "xfs_quota.h" | 48 | #include "xfs_quota.h" |
| 49 | #include "xfs_utils.h" | 49 | #include "xfs_utils.h" |
| 50 | #include "xfs_rtalloc.h" | 50 | #include "xfs_rtalloc.h" |
| 51 | #include "xfs_refcache.h" | ||
| 52 | #include "xfs_trans_space.h" | 51 | #include "xfs_trans_space.h" |
| 53 | #include "xfs_log_priv.h" | 52 | #include "xfs_log_priv.h" |
| 54 | #include "xfs_filestream.h" | 53 | #include "xfs_filestream.h" |
| @@ -327,7 +326,7 @@ xfs_setattr( | |||
| 327 | if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && | 326 | if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && |
| 328 | !(flags & ATTR_DMI)) { | 327 | !(flags & ATTR_DMI)) { |
| 329 | int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; | 328 | int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; |
| 330 | code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp, | 329 | code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip, |
| 331 | vap->va_size, 0, dmflags, NULL); | 330 | vap->va_size, 0, dmflags, NULL); |
| 332 | if (code) { | 331 | if (code) { |
| 333 | lock_flags = 0; | 332 | lock_flags = 0; |
| @@ -634,6 +633,15 @@ xfs_setattr( | |||
| 634 | * Truncate file. Must have write permission and not be a directory. | 633 | * Truncate file. Must have write permission and not be a directory. |
| 635 | */ | 634 | */ |
| 636 | if (mask & XFS_AT_SIZE) { | 635 | if (mask & XFS_AT_SIZE) { |
| 636 | /* | ||
| 637 | * Only change the c/mtime if we are changing the size | ||
| 638 | * or we are explicitly asked to change it. This handles | ||
| 639 | * the semantic difference between truncate() and ftruncate() | ||
| 640 | * as implemented in the VFS. | ||
| 641 | */ | ||
| 642 | if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME)) | ||
| 643 | timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; | ||
| 644 | |||
| 637 | if (vap->va_size > ip->i_size) { | 645 | if (vap->va_size > ip->i_size) { |
| 638 | xfs_igrow_finish(tp, ip, vap->va_size, | 646 | xfs_igrow_finish(tp, ip, vap->va_size, |
| 639 | !(flags & ATTR_DMI)); | 647 | !(flags & ATTR_DMI)); |
| @@ -662,10 +670,6 @@ xfs_setattr( | |||
| 662 | */ | 670 | */ |
| 663 | xfs_iflags_set(ip, XFS_ITRUNCATED); | 671 | xfs_iflags_set(ip, XFS_ITRUNCATED); |
| 664 | } | 672 | } |
| 665 | /* | ||
| 666 | * Have to do this even if the file's size doesn't change. | ||
| 667 | */ | ||
| 668 | timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; | ||
| 669 | } | 673 | } |
| 670 | 674 | ||
| 671 | /* | 675 | /* |
| @@ -877,7 +881,7 @@ xfs_setattr( | |||
| 877 | 881 | ||
| 878 | if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && | 882 | if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && |
| 879 | !(flags & ATTR_DMI)) { | 883 | !(flags & ATTR_DMI)) { |
| 880 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL, | 884 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, |
| 881 | NULL, DM_RIGHT_NULL, NULL, NULL, | 885 | NULL, DM_RIGHT_NULL, NULL, NULL, |
| 882 | 0, 0, AT_DELAY_FLAG(flags)); | 886 | 0, 0, AT_DELAY_FLAG(flags)); |
| 883 | } | 887 | } |
| @@ -1443,28 +1447,22 @@ xfs_inactive_attrs( | |||
| 1443 | tp = *tpp; | 1447 | tp = *tpp; |
| 1444 | mp = ip->i_mount; | 1448 | mp = ip->i_mount; |
| 1445 | ASSERT(ip->i_d.di_forkoff != 0); | 1449 | ASSERT(ip->i_d.di_forkoff != 0); |
| 1446 | xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 1450 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); |
| 1447 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 1451 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
| 1452 | if (error) | ||
| 1453 | goto error_unlock; | ||
| 1448 | 1454 | ||
| 1449 | error = xfs_attr_inactive(ip); | 1455 | error = xfs_attr_inactive(ip); |
| 1450 | if (error) { | 1456 | if (error) |
| 1451 | *tpp = NULL; | 1457 | goto error_unlock; |
| 1452 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | ||
| 1453 | return error; /* goto out */ | ||
| 1454 | } | ||
| 1455 | 1458 | ||
| 1456 | tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); | 1459 | tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); |
| 1457 | error = xfs_trans_reserve(tp, 0, | 1460 | error = xfs_trans_reserve(tp, 0, |
| 1458 | XFS_IFREE_LOG_RES(mp), | 1461 | XFS_IFREE_LOG_RES(mp), |
| 1459 | 0, XFS_TRANS_PERM_LOG_RES, | 1462 | 0, XFS_TRANS_PERM_LOG_RES, |
| 1460 | XFS_INACTIVE_LOG_COUNT); | 1463 | XFS_INACTIVE_LOG_COUNT); |
| 1461 | if (error) { | 1464 | if (error) |
| 1462 | ASSERT(XFS_FORCED_SHUTDOWN(mp)); | 1465 | goto error_cancel; |
| 1463 | xfs_trans_cancel(tp, 0); | ||
| 1464 | *tpp = NULL; | ||
| 1465 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | ||
| 1466 | return error; | ||
| 1467 | } | ||
| 1468 | 1466 | ||
| 1469 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 1467 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
| 1470 | xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); | 1468 | xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); |
| @@ -1475,6 +1473,14 @@ xfs_inactive_attrs( | |||
| 1475 | 1473 | ||
| 1476 | *tpp = tp; | 1474 | *tpp = tp; |
| 1477 | return 0; | 1475 | return 0; |
| 1476 | |||
| 1477 | error_cancel: | ||
| 1478 | ASSERT(XFS_FORCED_SHUTDOWN(mp)); | ||
| 1479 | xfs_trans_cancel(tp, 0); | ||
| 1480 | error_unlock: | ||
| 1481 | *tpp = NULL; | ||
| 1482 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | ||
| 1483 | return error; | ||
| 1478 | } | 1484 | } |
| 1479 | 1485 | ||
| 1480 | int | 1486 | int |
| @@ -1520,12 +1526,6 @@ xfs_release( | |||
| 1520 | xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); | 1526 | xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); |
| 1521 | } | 1527 | } |
| 1522 | 1528 | ||
| 1523 | #ifdef HAVE_REFCACHE | ||
| 1524 | /* If we are in the NFS reference cache then don't do this now */ | ||
| 1525 | if (ip->i_refcache) | ||
| 1526 | return 0; | ||
| 1527 | #endif | ||
| 1528 | |||
| 1529 | if (ip->i_d.di_nlink != 0) { | 1529 | if (ip->i_d.di_nlink != 0) { |
| 1530 | if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && | 1530 | if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && |
| 1531 | ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || | 1531 | ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || |
| @@ -1588,9 +1588,8 @@ xfs_inactive( | |||
| 1588 | 1588 | ||
| 1589 | mp = ip->i_mount; | 1589 | mp = ip->i_mount; |
| 1590 | 1590 | ||
| 1591 | if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) { | 1591 | if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) |
| 1592 | (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL); | 1592 | XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL); |
| 1593 | } | ||
| 1594 | 1593 | ||
| 1595 | error = 0; | 1594 | error = 0; |
| 1596 | 1595 | ||
| @@ -1744,11 +1743,18 @@ xfs_inactive( | |||
| 1744 | XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1); | 1743 | XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1); |
| 1745 | 1744 | ||
| 1746 | /* | 1745 | /* |
| 1747 | * Just ignore errors at this point. There is | 1746 | * Just ignore errors at this point. There is nothing we can |
| 1748 | * nothing we can do except to try to keep going. | 1747 | * do except to try to keep going. Make sure it's not a silent |
| 1748 | * error. | ||
| 1749 | */ | 1749 | */ |
| 1750 | (void) xfs_bmap_finish(&tp, &free_list, &committed); | 1750 | error = xfs_bmap_finish(&tp, &free_list, &committed); |
| 1751 | (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 1751 | if (error) |
| 1752 | xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " | ||
| 1753 | "xfs_bmap_finish() returned error %d", error); | ||
| 1754 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | ||
| 1755 | if (error) | ||
| 1756 | xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " | ||
| 1757 | "xfs_trans_commit() returned error %d", error); | ||
| 1752 | } | 1758 | } |
| 1753 | /* | 1759 | /* |
| 1754 | * Release the dquots held by inode, if any. | 1760 | * Release the dquots held by inode, if any. |
| @@ -1765,8 +1771,8 @@ xfs_inactive( | |||
| 1765 | int | 1771 | int |
| 1766 | xfs_lookup( | 1772 | xfs_lookup( |
| 1767 | xfs_inode_t *dp, | 1773 | xfs_inode_t *dp, |
| 1768 | bhv_vname_t *dentry, | 1774 | struct xfs_name *name, |
| 1769 | bhv_vnode_t **vpp) | 1775 | xfs_inode_t **ipp) |
| 1770 | { | 1776 | { |
| 1771 | xfs_inode_t *ip; | 1777 | xfs_inode_t *ip; |
| 1772 | xfs_ino_t e_inum; | 1778 | xfs_ino_t e_inum; |
| @@ -1779,9 +1785,9 @@ xfs_lookup( | |||
| 1779 | return XFS_ERROR(EIO); | 1785 | return XFS_ERROR(EIO); |
| 1780 | 1786 | ||
| 1781 | lock_mode = xfs_ilock_map_shared(dp); | 1787 | lock_mode = xfs_ilock_map_shared(dp); |
| 1782 | error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip); | 1788 | error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip); |
| 1783 | if (!error) { | 1789 | if (!error) { |
| 1784 | *vpp = XFS_ITOV(ip); | 1790 | *ipp = ip; |
| 1785 | xfs_itrace_ref(ip); | 1791 | xfs_itrace_ref(ip); |
| 1786 | } | 1792 | } |
| 1787 | xfs_iunlock_map_shared(dp, lock_mode); | 1793 | xfs_iunlock_map_shared(dp, lock_mode); |
| @@ -1791,19 +1797,16 @@ xfs_lookup( | |||
| 1791 | int | 1797 | int |
| 1792 | xfs_create( | 1798 | xfs_create( |
| 1793 | xfs_inode_t *dp, | 1799 | xfs_inode_t *dp, |
| 1794 | bhv_vname_t *dentry, | 1800 | struct xfs_name *name, |
| 1795 | mode_t mode, | 1801 | mode_t mode, |
| 1796 | xfs_dev_t rdev, | 1802 | xfs_dev_t rdev, |
| 1797 | bhv_vnode_t **vpp, | 1803 | xfs_inode_t **ipp, |
| 1798 | cred_t *credp) | 1804 | cred_t *credp) |
| 1799 | { | 1805 | { |
| 1800 | char *name = VNAME(dentry); | 1806 | xfs_mount_t *mp = dp->i_mount; |
| 1801 | xfs_mount_t *mp = dp->i_mount; | ||
| 1802 | bhv_vnode_t *dir_vp = XFS_ITOV(dp); | ||
| 1803 | xfs_inode_t *ip; | 1807 | xfs_inode_t *ip; |
| 1804 | bhv_vnode_t *vp = NULL; | ||
| 1805 | xfs_trans_t *tp; | 1808 | xfs_trans_t *tp; |
| 1806 | int error; | 1809 | int error; |
| 1807 | xfs_bmap_free_t free_list; | 1810 | xfs_bmap_free_t free_list; |
| 1808 | xfs_fsblock_t first_block; | 1811 | xfs_fsblock_t first_block; |
| 1809 | boolean_t unlock_dp_on_error = B_FALSE; | 1812 | boolean_t unlock_dp_on_error = B_FALSE; |
| @@ -1813,17 +1816,14 @@ xfs_create( | |||
| 1813 | xfs_prid_t prid; | 1816 | xfs_prid_t prid; |
| 1814 | struct xfs_dquot *udqp, *gdqp; | 1817 | struct xfs_dquot *udqp, *gdqp; |
| 1815 | uint resblks; | 1818 | uint resblks; |
| 1816 | int namelen; | ||
| 1817 | 1819 | ||
| 1818 | ASSERT(!*vpp); | 1820 | ASSERT(!*ipp); |
| 1819 | xfs_itrace_entry(dp); | 1821 | xfs_itrace_entry(dp); |
| 1820 | 1822 | ||
| 1821 | namelen = VNAMELEN(dentry); | ||
| 1822 | |||
| 1823 | if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { | 1823 | if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { |
| 1824 | error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, | 1824 | error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, |
| 1825 | dir_vp, DM_RIGHT_NULL, NULL, | 1825 | dp, DM_RIGHT_NULL, NULL, |
| 1826 | DM_RIGHT_NULL, name, NULL, | 1826 | DM_RIGHT_NULL, name->name, NULL, |
| 1827 | mode, 0, 0); | 1827 | mode, 0, 0); |
| 1828 | 1828 | ||
| 1829 | if (error) | 1829 | if (error) |
| @@ -1855,7 +1855,7 @@ xfs_create( | |||
| 1855 | 1855 | ||
| 1856 | tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); | 1856 | tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); |
| 1857 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; | 1857 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; |
| 1858 | resblks = XFS_CREATE_SPACE_RES(mp, namelen); | 1858 | resblks = XFS_CREATE_SPACE_RES(mp, name->len); |
| 1859 | /* | 1859 | /* |
| 1860 | * Initially assume that the file does not exist and | 1860 | * Initially assume that the file does not exist and |
| 1861 | * reserve the resources for that case. If that is not | 1861 | * reserve the resources for that case. If that is not |
| @@ -1888,7 +1888,8 @@ xfs_create( | |||
| 1888 | if (error) | 1888 | if (error) |
| 1889 | goto error_return; | 1889 | goto error_return; |
| 1890 | 1890 | ||
| 1891 | if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen))) | 1891 | error = xfs_dir_canenter(tp, dp, name, resblks); |
| 1892 | if (error) | ||
| 1892 | goto error_return; | 1893 | goto error_return; |
| 1893 | error = xfs_dir_ialloc(&tp, dp, mode, 1, | 1894 | error = xfs_dir_ialloc(&tp, dp, mode, 1, |
| 1894 | rdev, credp, prid, resblks > 0, | 1895 | rdev, credp, prid, resblks > 0, |
| @@ -1914,11 +1915,11 @@ xfs_create( | |||
| 1914 | * the transaction cancel unlocking dp so don't do it explicitly in the | 1915 | * the transaction cancel unlocking dp so don't do it explicitly in the |
| 1915 | * error path. | 1916 | * error path. |
| 1916 | */ | 1917 | */ |
| 1917 | VN_HOLD(dir_vp); | 1918 | IHOLD(dp); |
| 1918 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); | 1919 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); |
| 1919 | unlock_dp_on_error = B_FALSE; | 1920 | unlock_dp_on_error = B_FALSE; |
| 1920 | 1921 | ||
| 1921 | error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino, | 1922 | error = xfs_dir_createname(tp, dp, name, ip->i_ino, |
| 1922 | &first_block, &free_list, resblks ? | 1923 | &first_block, &free_list, resblks ? |
| 1923 | resblks - XFS_IALLOC_SPACE_RES(mp) : 0); | 1924 | resblks - XFS_IALLOC_SPACE_RES(mp) : 0); |
| 1924 | if (error) { | 1925 | if (error) { |
| @@ -1952,7 +1953,6 @@ xfs_create( | |||
| 1952 | * vnode to the caller, we bump the vnode ref count now. | 1953 | * vnode to the caller, we bump the vnode ref count now. |
| 1953 | */ | 1954 | */ |
| 1954 | IHOLD(ip); | 1955 | IHOLD(ip); |
| 1955 | vp = XFS_ITOV(ip); | ||
| 1956 | 1956 | ||
| 1957 | error = xfs_bmap_finish(&tp, &free_list, &committed); | 1957 | error = xfs_bmap_finish(&tp, &free_list, &committed); |
| 1958 | if (error) { | 1958 | if (error) { |
| @@ -1970,17 +1970,17 @@ xfs_create( | |||
| 1970 | XFS_QM_DQRELE(mp, udqp); | 1970 | XFS_QM_DQRELE(mp, udqp); |
| 1971 | XFS_QM_DQRELE(mp, gdqp); | 1971 | XFS_QM_DQRELE(mp, gdqp); |
| 1972 | 1972 | ||
| 1973 | *vpp = vp; | 1973 | *ipp = ip; |
| 1974 | 1974 | ||
| 1975 | /* Fallthrough to std_return with error = 0 */ | 1975 | /* Fallthrough to std_return with error = 0 */ |
| 1976 | 1976 | ||
| 1977 | std_return: | 1977 | std_return: |
| 1978 | if ((*vpp || (error != 0 && dm_event_sent != 0)) && | 1978 | if ((*ipp || (error != 0 && dm_event_sent != 0)) && |
| 1979 | DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { | 1979 | DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { |
| 1980 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, | 1980 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, |
| 1981 | dir_vp, DM_RIGHT_NULL, | 1981 | dp, DM_RIGHT_NULL, |
| 1982 | *vpp ? vp:NULL, | 1982 | *ipp ? ip : NULL, |
| 1983 | DM_RIGHT_NULL, name, NULL, | 1983 | DM_RIGHT_NULL, name->name, NULL, |
| 1984 | mode, error, 0); | 1984 | mode, error, 0); |
| 1985 | } | 1985 | } |
| 1986 | return error; | 1986 | return error; |
| @@ -2272,46 +2272,32 @@ int remove_which_error_return = 0; | |||
| 2272 | int | 2272 | int |
| 2273 | xfs_remove( | 2273 | xfs_remove( |
| 2274 | xfs_inode_t *dp, | 2274 | xfs_inode_t *dp, |
| 2275 | bhv_vname_t *dentry) | 2275 | struct xfs_name *name, |
| 2276 | xfs_inode_t *ip) | ||
| 2276 | { | 2277 | { |
| 2277 | bhv_vnode_t *dir_vp = XFS_ITOV(dp); | ||
| 2278 | char *name = VNAME(dentry); | ||
| 2279 | xfs_mount_t *mp = dp->i_mount; | 2278 | xfs_mount_t *mp = dp->i_mount; |
| 2280 | xfs_inode_t *ip; | ||
| 2281 | xfs_trans_t *tp = NULL; | 2279 | xfs_trans_t *tp = NULL; |
| 2282 | int error = 0; | 2280 | int error = 0; |
| 2283 | xfs_bmap_free_t free_list; | 2281 | xfs_bmap_free_t free_list; |
| 2284 | xfs_fsblock_t first_block; | 2282 | xfs_fsblock_t first_block; |
| 2285 | int cancel_flags; | 2283 | int cancel_flags; |
| 2286 | int committed; | 2284 | int committed; |
| 2287 | int dm_di_mode = 0; | ||
| 2288 | int link_zero; | 2285 | int link_zero; |
| 2289 | uint resblks; | 2286 | uint resblks; |
| 2290 | int namelen; | ||
| 2291 | 2287 | ||
| 2292 | xfs_itrace_entry(dp); | 2288 | xfs_itrace_entry(dp); |
| 2293 | 2289 | ||
| 2294 | if (XFS_FORCED_SHUTDOWN(mp)) | 2290 | if (XFS_FORCED_SHUTDOWN(mp)) |
| 2295 | return XFS_ERROR(EIO); | 2291 | return XFS_ERROR(EIO); |
| 2296 | 2292 | ||
| 2297 | namelen = VNAMELEN(dentry); | ||
| 2298 | |||
| 2299 | if (!xfs_get_dir_entry(dentry, &ip)) { | ||
| 2300 | dm_di_mode = ip->i_d.di_mode; | ||
| 2301 | IRELE(ip); | ||
| 2302 | } | ||
| 2303 | |||
| 2304 | if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { | 2293 | if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { |
| 2305 | error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, | 2294 | error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL, |
| 2306 | DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, | 2295 | NULL, DM_RIGHT_NULL, name->name, NULL, |
| 2307 | name, NULL, dm_di_mode, 0, 0); | 2296 | ip->i_d.di_mode, 0, 0); |
| 2308 | if (error) | 2297 | if (error) |
| 2309 | return error; | 2298 | return error; |
| 2310 | } | 2299 | } |
| 2311 | 2300 | ||
| 2312 | /* From this point on, return through std_return */ | ||
| 2313 | ip = NULL; | ||
| 2314 | |||
| 2315 | /* | 2301 | /* |
| 2316 | * We need to get a reference to ip before we get our log | 2302 | * We need to get a reference to ip before we get our log |
| 2317 | * reservation. The reason for this is that we cannot call | 2303 | * reservation. The reason for this is that we cannot call |
| @@ -2324,13 +2310,7 @@ xfs_remove( | |||
| 2324 | * when we call xfs_iget. Instead we get an unlocked reference | 2310 | * when we call xfs_iget. Instead we get an unlocked reference |
| 2325 | * to the inode before getting our log reservation. | 2311 | * to the inode before getting our log reservation. |
| 2326 | */ | 2312 | */ |
| 2327 | error = xfs_get_dir_entry(dentry, &ip); | 2313 | IHOLD(ip); |
| 2328 | if (error) { | ||
| 2329 | REMOVE_DEBUG_TRACE(__LINE__); | ||
| 2330 | goto std_return; | ||
| 2331 | } | ||
| 2332 | |||
| 2333 | dm_di_mode = ip->i_d.di_mode; | ||
| 2334 | 2314 | ||
| 2335 | xfs_itrace_entry(ip); | 2315 | xfs_itrace_entry(ip); |
| 2336 | xfs_itrace_ref(ip); | 2316 | xfs_itrace_ref(ip); |
| @@ -2398,7 +2378,7 @@ xfs_remove( | |||
| 2398 | * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. | 2378 | * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. |
| 2399 | */ | 2379 | */ |
| 2400 | XFS_BMAP_INIT(&free_list, &first_block); | 2380 | XFS_BMAP_INIT(&free_list, &first_block); |
| 2401 | error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino, | 2381 | error = xfs_dir_removename(tp, dp, name, ip->i_ino, |
| 2402 | &first_block, &free_list, 0); | 2382 | &first_block, &free_list, 0); |
| 2403 | if (error) { | 2383 | if (error) { |
| 2404 | ASSERT(error != ENOENT); | 2384 | ASSERT(error != ENOENT); |
| @@ -2449,14 +2429,6 @@ xfs_remove( | |||
| 2449 | } | 2429 | } |
| 2450 | 2430 | ||
| 2451 | /* | 2431 | /* |
| 2452 | * Before we drop our extra reference to the inode, purge it | ||
| 2453 | * from the refcache if it is there. By waiting until afterwards | ||
| 2454 | * to do the IRELE, we ensure that we won't go inactive in the | ||
| 2455 | * xfs_refcache_purge_ip routine (although that would be OK). | ||
| 2456 | */ | ||
| 2457 | xfs_refcache_purge_ip(ip); | ||
| 2458 | |||
| 2459 | /* | ||
| 2460 | * If we are using filestreams, kill the stream association. | 2432 | * If we are using filestreams, kill the stream association. |
| 2461 | * If the file is still open it may get a new one but that | 2433 | * If the file is still open it may get a new one but that |
| 2462 | * will get killed on last close in xfs_close() so we don't | 2434 | * will get killed on last close in xfs_close() so we don't |
| @@ -2472,9 +2444,9 @@ xfs_remove( | |||
| 2472 | std_return: | 2444 | std_return: |
| 2473 | if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { | 2445 | if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { |
| 2474 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, | 2446 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, |
| 2475 | dir_vp, DM_RIGHT_NULL, | 2447 | dp, DM_RIGHT_NULL, |
| 2476 | NULL, DM_RIGHT_NULL, | 2448 | NULL, DM_RIGHT_NULL, |
| 2477 | name, NULL, dm_di_mode, error, 0); | 2449 | name->name, NULL, ip->i_d.di_mode, error, 0); |
| 2478 | } | 2450 | } |
| 2479 | return error; | 2451 | return error; |
| 2480 | 2452 | ||
| @@ -2495,14 +2467,6 @@ xfs_remove( | |||
| 2495 | cancel_flags |= XFS_TRANS_ABORT; | 2467 | cancel_flags |= XFS_TRANS_ABORT; |
| 2496 | xfs_trans_cancel(tp, cancel_flags); | 2468 | xfs_trans_cancel(tp, cancel_flags); |
| 2497 | 2469 | ||
| 2498 | /* | ||
| 2499 | * Before we drop our extra reference to the inode, purge it | ||
| 2500 | * from the refcache if it is there. By waiting until afterwards | ||
| 2501 | * to do the IRELE, we ensure that we won't go inactive in the | ||
| 2502 | * xfs_refcache_purge_ip routine (although that would be OK). | ||
| 2503 | */ | ||
| 2504 | xfs_refcache_purge_ip(ip); | ||
| 2505 | |||
| 2506 | IRELE(ip); | 2470 | IRELE(ip); |
| 2507 | 2471 | ||
| 2508 | goto std_return; | 2472 | goto std_return; |
| @@ -2511,12 +2475,10 @@ xfs_remove( | |||
| 2511 | int | 2475 | int |
| 2512 | xfs_link( | 2476 | xfs_link( |
| 2513 | xfs_inode_t *tdp, | 2477 | xfs_inode_t *tdp, |
| 2514 | bhv_vnode_t *src_vp, | 2478 | xfs_inode_t *sip, |
| 2515 | bhv_vname_t *dentry) | 2479 | struct xfs_name *target_name) |
| 2516 | { | 2480 | { |
| 2517 | bhv_vnode_t *target_dir_vp = XFS_ITOV(tdp); | ||
| 2518 | xfs_mount_t *mp = tdp->i_mount; | 2481 | xfs_mount_t *mp = tdp->i_mount; |
| 2519 | xfs_inode_t *sip = xfs_vtoi(src_vp); | ||
| 2520 | xfs_trans_t *tp; | 2482 | xfs_trans_t *tp; |
| 2521 | xfs_inode_t *ips[2]; | 2483 | xfs_inode_t *ips[2]; |
| 2522 | int error; | 2484 | int error; |
| @@ -2525,23 +2487,20 @@ xfs_link( | |||
| 2525 | int cancel_flags; | 2487 | int cancel_flags; |
| 2526 | int committed; | 2488 | int committed; |
| 2527 | int resblks; | 2489 | int resblks; |
| 2528 | char *target_name = VNAME(dentry); | ||
| 2529 | int target_namelen; | ||
| 2530 | 2490 | ||
| 2531 | xfs_itrace_entry(tdp); | 2491 | xfs_itrace_entry(tdp); |
| 2532 | xfs_itrace_entry(xfs_vtoi(src_vp)); | 2492 | xfs_itrace_entry(sip); |
| 2533 | 2493 | ||
| 2534 | target_namelen = VNAMELEN(dentry); | 2494 | ASSERT(!S_ISDIR(sip->i_d.di_mode)); |
| 2535 | ASSERT(!VN_ISDIR(src_vp)); | ||
| 2536 | 2495 | ||
| 2537 | if (XFS_FORCED_SHUTDOWN(mp)) | 2496 | if (XFS_FORCED_SHUTDOWN(mp)) |
| 2538 | return XFS_ERROR(EIO); | 2497 | return XFS_ERROR(EIO); |
| 2539 | 2498 | ||
| 2540 | if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) { | 2499 | if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) { |
| 2541 | error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, | 2500 | error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, |
| 2542 | target_dir_vp, DM_RIGHT_NULL, | 2501 | tdp, DM_RIGHT_NULL, |
| 2543 | src_vp, DM_RIGHT_NULL, | 2502 | sip, DM_RIGHT_NULL, |
| 2544 | target_name, NULL, 0, 0, 0); | 2503 | target_name->name, NULL, 0, 0, 0); |
| 2545 | if (error) | 2504 | if (error) |
| 2546 | return error; | 2505 | return error; |
| 2547 | } | 2506 | } |
| @@ -2556,7 +2515,7 @@ xfs_link( | |||
| 2556 | 2515 | ||
| 2557 | tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); | 2516 | tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); |
| 2558 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; | 2517 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; |
| 2559 | resblks = XFS_LINK_SPACE_RES(mp, target_namelen); | 2518 | resblks = XFS_LINK_SPACE_RES(mp, target_name->len); |
| 2560 | error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, | 2519 | error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, |
| 2561 | XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); | 2520 | XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); |
| 2562 | if (error == ENOSPC) { | 2521 | if (error == ENOSPC) { |
| @@ -2584,8 +2543,8 @@ xfs_link( | |||
| 2584 | * xfs_trans_cancel will both unlock the inodes and | 2543 | * xfs_trans_cancel will both unlock the inodes and |
| 2585 | * decrement the associated ref counts. | 2544 | * decrement the associated ref counts. |
| 2586 | */ | 2545 | */ |
| 2587 | VN_HOLD(src_vp); | 2546 | IHOLD(sip); |
| 2588 | VN_HOLD(target_dir_vp); | 2547 | IHOLD(tdp); |
| 2589 | xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); | 2548 | xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); |
| 2590 | xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); | 2549 | xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); |
| 2591 | 2550 | ||
| @@ -2608,15 +2567,14 @@ xfs_link( | |||
| 2608 | goto error_return; | 2567 | goto error_return; |
| 2609 | } | 2568 | } |
| 2610 | 2569 | ||
| 2611 | if (resblks == 0 && | 2570 | error = xfs_dir_canenter(tp, tdp, target_name, resblks); |
| 2612 | (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen))) | 2571 | if (error) |
| 2613 | goto error_return; | 2572 | goto error_return; |
| 2614 | 2573 | ||
| 2615 | XFS_BMAP_INIT(&free_list, &first_block); | 2574 | XFS_BMAP_INIT(&free_list, &first_block); |
| 2616 | 2575 | ||
| 2617 | error = xfs_dir_createname(tp, tdp, target_name, target_namelen, | 2576 | error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, |
| 2618 | sip->i_ino, &first_block, &free_list, | 2577 | &first_block, &free_list, resblks); |
| 2619 | resblks); | ||
| 2620 | if (error) | 2578 | if (error) |
| 2621 | goto abort_return; | 2579 | goto abort_return; |
| 2622 | xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 2580 | xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
| @@ -2650,9 +2608,9 @@ xfs_link( | |||
| 2650 | std_return: | 2608 | std_return: |
| 2651 | if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) { | 2609 | if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) { |
| 2652 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, | 2610 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, |
| 2653 | target_dir_vp, DM_RIGHT_NULL, | 2611 | tdp, DM_RIGHT_NULL, |
| 2654 | src_vp, DM_RIGHT_NULL, | 2612 | sip, DM_RIGHT_NULL, |
| 2655 | target_name, NULL, 0, error, 0); | 2613 | target_name->name, NULL, 0, error, 0); |
| 2656 | } | 2614 | } |
| 2657 | return error; | 2615 | return error; |
| 2658 | 2616 | ||
| @@ -2669,17 +2627,13 @@ std_return: | |||
| 2669 | int | 2627 | int |
| 2670 | xfs_mkdir( | 2628 | xfs_mkdir( |
| 2671 | xfs_inode_t *dp, | 2629 | xfs_inode_t *dp, |
| 2672 | bhv_vname_t *dentry, | 2630 | struct xfs_name *dir_name, |
| 2673 | mode_t mode, | 2631 | mode_t mode, |
| 2674 | bhv_vnode_t **vpp, | 2632 | xfs_inode_t **ipp, |
| 2675 | cred_t *credp) | 2633 | cred_t *credp) |
| 2676 | { | 2634 | { |
| 2677 | bhv_vnode_t *dir_vp = XFS_ITOV(dp); | ||
| 2678 | char *dir_name = VNAME(dentry); | ||
| 2679 | int dir_namelen = VNAMELEN(dentry); | ||
| 2680 | xfs_mount_t *mp = dp->i_mount; | 2635 | xfs_mount_t *mp = dp->i_mount; |
| 2681 | xfs_inode_t *cdp; /* inode of created dir */ | 2636 | xfs_inode_t *cdp; /* inode of created dir */ |
| 2682 | bhv_vnode_t *cvp; /* vnode of created dir */ | ||
| 2683 | xfs_trans_t *tp; | 2637 | xfs_trans_t *tp; |
| 2684 | int cancel_flags; | 2638 | int cancel_flags; |
| 2685 | int error; | 2639 | int error; |
| @@ -2700,8 +2654,8 @@ xfs_mkdir( | |||
| 2700 | 2654 | ||
| 2701 | if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { | 2655 | if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { |
| 2702 | error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, | 2656 | error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, |
| 2703 | dir_vp, DM_RIGHT_NULL, NULL, | 2657 | dp, DM_RIGHT_NULL, NULL, |
| 2704 | DM_RIGHT_NULL, dir_name, NULL, | 2658 | DM_RIGHT_NULL, dir_name->name, NULL, |
| 2705 | mode, 0, 0); | 2659 | mode, 0, 0); |
| 2706 | if (error) | 2660 | if (error) |
| 2707 | return error; | 2661 | return error; |
| @@ -2730,7 +2684,7 @@ xfs_mkdir( | |||
| 2730 | 2684 | ||
| 2731 | tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); | 2685 | tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); |
| 2732 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; | 2686 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; |
| 2733 | resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen); | 2687 | resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len); |
| 2734 | error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, | 2688 | error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, |
| 2735 | XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); | 2689 | XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); |
| 2736 | if (error == ENOSPC) { | 2690 | if (error == ENOSPC) { |
| @@ -2762,8 +2716,8 @@ xfs_mkdir( | |||
| 2762 | if (error) | 2716 | if (error) |
| 2763 | goto error_return; | 2717 | goto error_return; |
| 2764 | 2718 | ||
| 2765 | if (resblks == 0 && | 2719 | error = xfs_dir_canenter(tp, dp, dir_name, resblks); |
| 2766 | (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen))) | 2720 | if (error) |
| 2767 | goto error_return; | 2721 | goto error_return; |
| 2768 | /* | 2722 | /* |
| 2769 | * create the directory inode. | 2723 | * create the directory inode. |
| @@ -2786,15 +2740,15 @@ xfs_mkdir( | |||
| 2786 | * from here on will result in the transaction cancel | 2740 | * from here on will result in the transaction cancel |
| 2787 | * unlocking dp so don't do it explicitly in the error path. | 2741 | * unlocking dp so don't do it explicitly in the error path. |
| 2788 | */ | 2742 | */ |
| 2789 | VN_HOLD(dir_vp); | 2743 | IHOLD(dp); |
| 2790 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); | 2744 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); |
| 2791 | unlock_dp_on_error = B_FALSE; | 2745 | unlock_dp_on_error = B_FALSE; |
| 2792 | 2746 | ||
| 2793 | XFS_BMAP_INIT(&free_list, &first_block); | 2747 | XFS_BMAP_INIT(&free_list, &first_block); |
| 2794 | 2748 | ||
| 2795 | error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino, | 2749 | error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino, |
| 2796 | &first_block, &free_list, resblks ? | 2750 | &first_block, &free_list, resblks ? |
| 2797 | resblks - XFS_IALLOC_SPACE_RES(mp) : 0); | 2751 | resblks - XFS_IALLOC_SPACE_RES(mp) : 0); |
| 2798 | if (error) { | 2752 | if (error) { |
| 2799 | ASSERT(error != ENOSPC); | 2753 | ASSERT(error != ENOSPC); |
| 2800 | goto error1; | 2754 | goto error1; |
| @@ -2817,11 +2771,9 @@ xfs_mkdir( | |||
| 2817 | if (error) | 2771 | if (error) |
| 2818 | goto error2; | 2772 | goto error2; |
| 2819 | 2773 | ||
| 2820 | cvp = XFS_ITOV(cdp); | ||
| 2821 | |||
| 2822 | created = B_TRUE; | 2774 | created = B_TRUE; |
| 2823 | 2775 | ||
| 2824 | *vpp = cvp; | 2776 | *ipp = cdp; |
| 2825 | IHOLD(cdp); | 2777 | IHOLD(cdp); |
| 2826 | 2778 | ||
| 2827 | /* | 2779 | /* |
| @@ -2858,10 +2810,10 @@ std_return: | |||
| 2858 | if ((created || (error != 0 && dm_event_sent != 0)) && | 2810 | if ((created || (error != 0 && dm_event_sent != 0)) && |
| 2859 | DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { | 2811 | DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { |
| 2860 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, | 2812 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, |
| 2861 | dir_vp, DM_RIGHT_NULL, | 2813 | dp, DM_RIGHT_NULL, |
| 2862 | created ? XFS_ITOV(cdp):NULL, | 2814 | created ? cdp : NULL, |
| 2863 | DM_RIGHT_NULL, | 2815 | DM_RIGHT_NULL, |
| 2864 | dir_name, NULL, | 2816 | dir_name->name, NULL, |
| 2865 | mode, error, 0); | 2817 | mode, error, 0); |
| 2866 | } | 2818 | } |
| 2867 | return error; | 2819 | return error; |
| @@ -2885,20 +2837,17 @@ std_return: | |||
| 2885 | int | 2837 | int |
| 2886 | xfs_rmdir( | 2838 | xfs_rmdir( |
| 2887 | xfs_inode_t *dp, | 2839 | xfs_inode_t *dp, |
| 2888 | bhv_vname_t *dentry) | 2840 | struct xfs_name *name, |
| 2841 | xfs_inode_t *cdp) | ||
| 2889 | { | 2842 | { |
| 2890 | bhv_vnode_t *dir_vp = XFS_ITOV(dp); | 2843 | bhv_vnode_t *dir_vp = XFS_ITOV(dp); |
| 2891 | char *name = VNAME(dentry); | ||
| 2892 | int namelen = VNAMELEN(dentry); | ||
| 2893 | xfs_mount_t *mp = dp->i_mount; | 2844 | xfs_mount_t *mp = dp->i_mount; |
| 2894 | xfs_inode_t *cdp; /* child directory */ | ||
| 2895 | xfs_trans_t *tp; | 2845 | xfs_trans_t *tp; |
| 2896 | int error; | 2846 | int error; |
| 2897 | xfs_bmap_free_t free_list; | 2847 | xfs_bmap_free_t free_list; |
| 2898 | xfs_fsblock_t first_block; | 2848 | xfs_fsblock_t first_block; |
| 2899 | int cancel_flags; | 2849 | int cancel_flags; |
| 2900 | int committed; | 2850 | int committed; |
| 2901 | int dm_di_mode = S_IFDIR; | ||
| 2902 | int last_cdp_link; | 2851 | int last_cdp_link; |
| 2903 | uint resblks; | 2852 | uint resblks; |
| 2904 | 2853 | ||
| @@ -2907,24 +2856,15 @@ xfs_rmdir( | |||
| 2907 | if (XFS_FORCED_SHUTDOWN(mp)) | 2856 | if (XFS_FORCED_SHUTDOWN(mp)) |
| 2908 | return XFS_ERROR(EIO); | 2857 | return XFS_ERROR(EIO); |
| 2909 | 2858 | ||
| 2910 | if (!xfs_get_dir_entry(dentry, &cdp)) { | ||
| 2911 | dm_di_mode = cdp->i_d.di_mode; | ||
| 2912 | IRELE(cdp); | ||
| 2913 | } | ||
| 2914 | |||
| 2915 | if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { | 2859 | if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { |
| 2916 | error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, | 2860 | error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, |
| 2917 | dir_vp, DM_RIGHT_NULL, | 2861 | dp, DM_RIGHT_NULL, |
| 2918 | NULL, DM_RIGHT_NULL, | 2862 | NULL, DM_RIGHT_NULL, name->name, |
| 2919 | name, NULL, dm_di_mode, 0, 0); | 2863 | NULL, cdp->i_d.di_mode, 0, 0); |
| 2920 | if (error) | 2864 | if (error) |
| 2921 | return XFS_ERROR(error); | 2865 | return XFS_ERROR(error); |
| 2922 | } | 2866 | } |
| 2923 | 2867 | ||
| 2924 | /* Return through std_return after this point. */ | ||
| 2925 | |||
| 2926 | cdp = NULL; | ||
| 2927 | |||
| 2928 | /* | 2868 | /* |
| 2929 | * We need to get a reference to cdp before we get our log | 2869 | * We need to get a reference to cdp before we get our log |
| 2930 | * reservation. The reason for this is that we cannot call | 2870 | * reservation. The reason for this is that we cannot call |
| @@ -2937,13 +2877,7 @@ xfs_rmdir( | |||
| 2937 | * when we call xfs_iget. Instead we get an unlocked reference | 2877 | * when we call xfs_iget. Instead we get an unlocked reference |
| 2938 | * to the inode before getting our log reservation. | 2878 | * to the inode before getting our log reservation. |
| 2939 | */ | 2879 | */ |
| 2940 | error = xfs_get_dir_entry(dentry, &cdp); | 2880 | IHOLD(cdp); |
| 2941 | if (error) { | ||
| 2942 | REMOVE_DEBUG_TRACE(__LINE__); | ||
| 2943 | goto std_return; | ||
| 2944 | } | ||
| 2945 | mp = dp->i_mount; | ||
| 2946 | dm_di_mode = cdp->i_d.di_mode; | ||
| 2947 | 2881 | ||
| 2948 | /* | 2882 | /* |
| 2949 | * Get the dquots for the inodes. | 2883 | * Get the dquots for the inodes. |
| @@ -3020,7 +2954,7 @@ xfs_rmdir( | |||
| 3020 | goto error_return; | 2954 | goto error_return; |
| 3021 | } | 2955 | } |
| 3022 | 2956 | ||
| 3023 | error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino, | 2957 | error = xfs_dir_removename(tp, dp, name, cdp->i_ino, |
| 3024 | &first_block, &free_list, resblks); | 2958 | &first_block, &free_list, resblks); |
| 3025 | if (error) | 2959 | if (error) |
| 3026 | goto error1; | 2960 | goto error1; |
| @@ -3098,9 +3032,9 @@ xfs_rmdir( | |||
| 3098 | std_return: | 3032 | std_return: |
| 3099 | if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { | 3033 | if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { |
| 3100 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, | 3034 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, |
| 3101 | dir_vp, DM_RIGHT_NULL, | 3035 | dp, DM_RIGHT_NULL, |
| 3102 | NULL, DM_RIGHT_NULL, | 3036 | NULL, DM_RIGHT_NULL, |
| 3103 | name, NULL, dm_di_mode, | 3037 | name->name, NULL, cdp->i_d.di_mode, |
| 3104 | error, 0); | 3038 | error, 0); |
| 3105 | } | 3039 | } |
| 3106 | return error; | 3040 | return error; |
| @@ -3118,13 +3052,12 @@ xfs_rmdir( | |||
| 3118 | int | 3052 | int |
| 3119 | xfs_symlink( | 3053 | xfs_symlink( |
| 3120 | xfs_inode_t *dp, | 3054 | xfs_inode_t *dp, |
| 3121 | bhv_vname_t *dentry, | 3055 | struct xfs_name *link_name, |
| 3122 | char *target_path, | 3056 | const char *target_path, |
| 3123 | mode_t mode, | 3057 | mode_t mode, |
| 3124 | bhv_vnode_t **vpp, | 3058 | xfs_inode_t **ipp, |
| 3125 | cred_t *credp) | 3059 | cred_t *credp) |
| 3126 | { | 3060 | { |
| 3127 | bhv_vnode_t *dir_vp = XFS_ITOV(dp); | ||
| 3128 | xfs_mount_t *mp = dp->i_mount; | 3061 | xfs_mount_t *mp = dp->i_mount; |
| 3129 | xfs_trans_t *tp; | 3062 | xfs_trans_t *tp; |
| 3130 | xfs_inode_t *ip; | 3063 | xfs_inode_t *ip; |
| @@ -3140,17 +3073,15 @@ xfs_symlink( | |||
| 3140 | int nmaps; | 3073 | int nmaps; |
| 3141 | xfs_bmbt_irec_t mval[SYMLINK_MAPS]; | 3074 | xfs_bmbt_irec_t mval[SYMLINK_MAPS]; |
| 3142 | xfs_daddr_t d; | 3075 | xfs_daddr_t d; |
| 3143 | char *cur_chunk; | 3076 | const char *cur_chunk; |
| 3144 | int byte_cnt; | 3077 | int byte_cnt; |
| 3145 | int n; | 3078 | int n; |
| 3146 | xfs_buf_t *bp; | 3079 | xfs_buf_t *bp; |
| 3147 | xfs_prid_t prid; | 3080 | xfs_prid_t prid; |
| 3148 | struct xfs_dquot *udqp, *gdqp; | 3081 | struct xfs_dquot *udqp, *gdqp; |
| 3149 | uint resblks; | 3082 | uint resblks; |
| 3150 | char *link_name = VNAME(dentry); | ||
| 3151 | int link_namelen; | ||
| 3152 | 3083 | ||
| 3153 | *vpp = NULL; | 3084 | *ipp = NULL; |
| 3154 | error = 0; | 3085 | error = 0; |
| 3155 | ip = NULL; | 3086 | ip = NULL; |
| 3156 | tp = NULL; | 3087 | tp = NULL; |
| @@ -3160,44 +3091,17 @@ xfs_symlink( | |||
| 3160 | if (XFS_FORCED_SHUTDOWN(mp)) | 3091 | if (XFS_FORCED_SHUTDOWN(mp)) |
| 3161 | return XFS_ERROR(EIO); | 3092 | return XFS_ERROR(EIO); |
| 3162 | 3093 | ||
| 3163 | link_namelen = VNAMELEN(dentry); | ||
| 3164 | |||
| 3165 | /* | 3094 | /* |
| 3166 | * Check component lengths of the target path name. | 3095 | * Check component lengths of the target path name. |
| 3167 | */ | 3096 | */ |
| 3168 | pathlen = strlen(target_path); | 3097 | pathlen = strlen(target_path); |
| 3169 | if (pathlen >= MAXPATHLEN) /* total string too long */ | 3098 | if (pathlen >= MAXPATHLEN) /* total string too long */ |
| 3170 | return XFS_ERROR(ENAMETOOLONG); | 3099 | return XFS_ERROR(ENAMETOOLONG); |
| 3171 | if (pathlen >= MAXNAMELEN) { /* is any component too long? */ | ||
| 3172 | int len, total; | ||
| 3173 | char *path; | ||
| 3174 | |||
| 3175 | for (total = 0, path = target_path; total < pathlen;) { | ||
| 3176 | /* | ||
| 3177 | * Skip any slashes. | ||
| 3178 | */ | ||
| 3179 | while(*path == '/') { | ||
| 3180 | total++; | ||
| 3181 | path++; | ||
| 3182 | } | ||
| 3183 | |||
| 3184 | /* | ||
| 3185 | * Count up to the next slash or end of path. | ||
| 3186 | * Error out if the component is bigger than MAXNAMELEN. | ||
| 3187 | */ | ||
| 3188 | for(len = 0; *path != '/' && total < pathlen;total++, path++) { | ||
| 3189 | if (++len >= MAXNAMELEN) { | ||
| 3190 | error = ENAMETOOLONG; | ||
| 3191 | return error; | ||
| 3192 | } | ||
| 3193 | } | ||
| 3194 | } | ||
| 3195 | } | ||
| 3196 | 3100 | ||
| 3197 | if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { | 3101 | if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { |
| 3198 | error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp, | 3102 | error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, |
| 3199 | DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, | 3103 | DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, |
| 3200 | link_name, target_path, 0, 0, 0); | 3104 | link_name->name, target_path, 0, 0, 0); |
| 3201 | if (error) | 3105 | if (error) |
| 3202 | return error; | 3106 | return error; |
| 3203 | } | 3107 | } |
| @@ -3229,7 +3133,7 @@ xfs_symlink( | |||
| 3229 | fs_blocks = 0; | 3133 | fs_blocks = 0; |
| 3230 | else | 3134 | else |
| 3231 | fs_blocks = XFS_B_TO_FSB(mp, pathlen); | 3135 | fs_blocks = XFS_B_TO_FSB(mp, pathlen); |
| 3232 | resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks); | 3136 | resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); |
| 3233 | error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, | 3137 | error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, |
| 3234 | XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); | 3138 | XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); |
| 3235 | if (error == ENOSPC && fs_blocks == 0) { | 3139 | if (error == ENOSPC && fs_blocks == 0) { |
| @@ -3263,8 +3167,8 @@ xfs_symlink( | |||
| 3263 | /* | 3167 | /* |
| 3264 | * Check for ability to enter directory entry, if no space reserved. | 3168 | * Check for ability to enter directory entry, if no space reserved. |
| 3265 | */ | 3169 | */ |
| 3266 | if (resblks == 0 && | 3170 | error = xfs_dir_canenter(tp, dp, link_name, resblks); |
| 3267 | (error = xfs_dir_canenter(tp, dp, link_name, link_namelen))) | 3171 | if (error) |
| 3268 | goto error_return; | 3172 | goto error_return; |
| 3269 | /* | 3173 | /* |
| 3270 | * Initialize the bmap freelist prior to calling either | 3174 | * Initialize the bmap freelist prior to calling either |
| @@ -3289,7 +3193,7 @@ xfs_symlink( | |||
| 3289 | * transaction cancel unlocking dp so don't do it explicitly in the | 3193 | * transaction cancel unlocking dp so don't do it explicitly in the |
| 3290 | * error path. | 3194 | * error path. |
| 3291 | */ | 3195 | */ |
| 3292 | VN_HOLD(dir_vp); | 3196 | IHOLD(dp); |
| 3293 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); | 3197 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); |
| 3294 | unlock_dp_on_error = B_FALSE; | 3198 | unlock_dp_on_error = B_FALSE; |
| 3295 | 3199 | ||
| @@ -3356,8 +3260,8 @@ xfs_symlink( | |||
| 3356 | /* | 3260 | /* |
| 3357 | * Create the directory entry for the symlink. | 3261 | * Create the directory entry for the symlink. |
| 3358 | */ | 3262 | */ |
| 3359 | error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino, | 3263 | error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, |
| 3360 | &first_block, &free_list, resblks); | 3264 | &first_block, &free_list, resblks); |
| 3361 | if (error) | 3265 | if (error) |
| 3362 | goto error1; | 3266 | goto error1; |
| 3363 | xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 3267 | xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
| @@ -3399,19 +3303,14 @@ xfs_symlink( | |||
| 3399 | std_return: | 3303 | std_return: |
| 3400 | if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) { | 3304 | if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) { |
| 3401 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, | 3305 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, |
| 3402 | dir_vp, DM_RIGHT_NULL, | 3306 | dp, DM_RIGHT_NULL, |
| 3403 | error ? NULL : XFS_ITOV(ip), | 3307 | error ? NULL : ip, |
| 3404 | DM_RIGHT_NULL, link_name, target_path, | 3308 | DM_RIGHT_NULL, link_name->name, |
| 3405 | 0, error, 0); | 3309 | target_path, 0, error, 0); |
| 3406 | } | 3310 | } |
| 3407 | 3311 | ||
| 3408 | if (!error) { | 3312 | if (!error) |
| 3409 | bhv_vnode_t *vp; | 3313 | *ipp = ip; |
| 3410 | |||
| 3411 | ASSERT(ip); | ||
| 3412 | vp = XFS_ITOV(ip); | ||
| 3413 | *vpp = vp; | ||
| 3414 | } | ||
| 3415 | return error; | 3314 | return error; |
| 3416 | 3315 | ||
| 3417 | error2: | 3316 | error2: |
| @@ -3431,60 +3330,11 @@ std_return: | |||
| 3431 | } | 3330 | } |
| 3432 | 3331 | ||
| 3433 | int | 3332 | int |
| 3434 | xfs_rwlock( | ||
| 3435 | xfs_inode_t *ip, | ||
| 3436 | bhv_vrwlock_t locktype) | ||
| 3437 | { | ||
| 3438 | if (S_ISDIR(ip->i_d.di_mode)) | ||
| 3439 | return 1; | ||
| 3440 | if (locktype == VRWLOCK_WRITE) { | ||
| 3441 | xfs_ilock(ip, XFS_IOLOCK_EXCL); | ||
| 3442 | } else if (locktype == VRWLOCK_TRY_READ) { | ||
| 3443 | return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED); | ||
| 3444 | } else if (locktype == VRWLOCK_TRY_WRITE) { | ||
| 3445 | return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL); | ||
| 3446 | } else { | ||
| 3447 | ASSERT((locktype == VRWLOCK_READ) || | ||
| 3448 | (locktype == VRWLOCK_WRITE_DIRECT)); | ||
| 3449 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | ||
| 3450 | } | ||
| 3451 | |||
| 3452 | return 1; | ||
| 3453 | } | ||
| 3454 | |||
| 3455 | |||
| 3456 | void | ||
| 3457 | xfs_rwunlock( | ||
| 3458 | xfs_inode_t *ip, | ||
| 3459 | bhv_vrwlock_t locktype) | ||
| 3460 | { | ||
| 3461 | if (S_ISDIR(ip->i_d.di_mode)) | ||
| 3462 | return; | ||
| 3463 | if (locktype == VRWLOCK_WRITE) { | ||
| 3464 | /* | ||
| 3465 | * In the write case, we may have added a new entry to | ||
| 3466 | * the reference cache. This might store a pointer to | ||
| 3467 | * an inode to be released in this inode. If it is there, | ||
| 3468 | * clear the pointer and release the inode after unlocking | ||
| 3469 | * this one. | ||
| 3470 | */ | ||
| 3471 | xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL); | ||
| 3472 | } else { | ||
| 3473 | ASSERT((locktype == VRWLOCK_READ) || | ||
| 3474 | (locktype == VRWLOCK_WRITE_DIRECT)); | ||
| 3475 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | ||
| 3476 | } | ||
| 3477 | return; | ||
| 3478 | } | ||
| 3479 | |||
| 3480 | |||
| 3481 | int | ||
| 3482 | xfs_inode_flush( | 3333 | xfs_inode_flush( |
| 3483 | xfs_inode_t *ip, | 3334 | xfs_inode_t *ip, |
| 3484 | int flags) | 3335 | int flags) |
| 3485 | { | 3336 | { |
| 3486 | xfs_mount_t *mp = ip->i_mount; | 3337 | xfs_mount_t *mp = ip->i_mount; |
| 3487 | xfs_inode_log_item_t *iip = ip->i_itemp; | ||
| 3488 | int error = 0; | 3338 | int error = 0; |
| 3489 | 3339 | ||
| 3490 | if (XFS_FORCED_SHUTDOWN(mp)) | 3340 | if (XFS_FORCED_SHUTDOWN(mp)) |
| @@ -3494,33 +3344,9 @@ xfs_inode_flush( | |||
| 3494 | * Bypass inodes which have already been cleaned by | 3344 | * Bypass inodes which have already been cleaned by |
| 3495 | * the inode flush clustering code inside xfs_iflush | 3345 | * the inode flush clustering code inside xfs_iflush |
| 3496 | */ | 3346 | */ |
| 3497 | if ((ip->i_update_core == 0) && | 3347 | if (xfs_inode_clean(ip)) |
| 3498 | ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) | ||
| 3499 | return 0; | 3348 | return 0; |
| 3500 | 3349 | ||
| 3501 | if (flags & FLUSH_LOG) { | ||
| 3502 | if (iip && iip->ili_last_lsn) { | ||
| 3503 | xlog_t *log = mp->m_log; | ||
| 3504 | xfs_lsn_t sync_lsn; | ||
| 3505 | int log_flags = XFS_LOG_FORCE; | ||
| 3506 | |||
| 3507 | spin_lock(&log->l_grant_lock); | ||
| 3508 | sync_lsn = log->l_last_sync_lsn; | ||
| 3509 | spin_unlock(&log->l_grant_lock); | ||
| 3510 | |||
| 3511 | if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) { | ||
| 3512 | if (flags & FLUSH_SYNC) | ||
| 3513 | log_flags |= XFS_LOG_SYNC; | ||
| 3514 | error = xfs_log_force(mp, iip->ili_last_lsn, log_flags); | ||
| 3515 | if (error) | ||
| 3516 | return error; | ||
| 3517 | } | ||
| 3518 | |||
| 3519 | if (ip->i_update_core == 0) | ||
| 3520 | return 0; | ||
| 3521 | } | ||
| 3522 | } | ||
| 3523 | |||
| 3524 | /* | 3350 | /* |
| 3525 | * We make this non-blocking if the inode is contended, | 3351 | * We make this non-blocking if the inode is contended, |
| 3526 | * return EAGAIN to indicate to the caller that they | 3352 | * return EAGAIN to indicate to the caller that they |
| @@ -3528,30 +3354,22 @@ xfs_inode_flush( | |||
| 3528 | * blocking on inodes inside another operation right | 3354 | * blocking on inodes inside another operation right |
| 3529 | * now, they get caught later by xfs_sync. | 3355 | * now, they get caught later by xfs_sync. |
| 3530 | */ | 3356 | */ |
| 3531 | if (flags & FLUSH_INODE) { | 3357 | if (flags & FLUSH_SYNC) { |
| 3532 | int flush_flags; | 3358 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
| 3533 | 3359 | xfs_iflock(ip); | |
| 3534 | if (flags & FLUSH_SYNC) { | 3360 | } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { |
| 3535 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 3361 | if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { |
| 3536 | xfs_iflock(ip); | 3362 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
| 3537 | } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { | ||
| 3538 | if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { | ||
| 3539 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | ||
| 3540 | return EAGAIN; | ||
| 3541 | } | ||
| 3542 | } else { | ||
| 3543 | return EAGAIN; | 3363 | return EAGAIN; |
| 3544 | } | 3364 | } |
| 3545 | 3365 | } else { | |
| 3546 | if (flags & FLUSH_SYNC) | 3366 | return EAGAIN; |
| 3547 | flush_flags = XFS_IFLUSH_SYNC; | ||
| 3548 | else | ||
| 3549 | flush_flags = XFS_IFLUSH_ASYNC; | ||
| 3550 | |||
| 3551 | error = xfs_iflush(ip, flush_flags); | ||
| 3552 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | ||
| 3553 | } | 3367 | } |
| 3554 | 3368 | ||
| 3369 | error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC | ||
| 3370 | : XFS_IFLUSH_ASYNC_NOBLOCK); | ||
| 3371 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | ||
| 3372 | |||
| 3555 | return error; | 3373 | return error; |
| 3556 | } | 3374 | } |
| 3557 | 3375 | ||
| @@ -3694,12 +3512,12 @@ xfs_finish_reclaim( | |||
| 3694 | * We get the flush lock regardless, though, just to make sure | 3512 | * We get the flush lock regardless, though, just to make sure |
| 3695 | * we don't free it while it is being flushed. | 3513 | * we don't free it while it is being flushed. |
| 3696 | */ | 3514 | */ |
| 3697 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 3515 | if (!locked) { |
| 3698 | if (!locked) { | 3516 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
| 3699 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 3517 | xfs_iflock(ip); |
| 3700 | xfs_iflock(ip); | 3518 | } |
| 3701 | } | ||
| 3702 | 3519 | ||
| 3520 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
| 3703 | if (ip->i_update_core || | 3521 | if (ip->i_update_core || |
| 3704 | ((ip->i_itemp != NULL) && | 3522 | ((ip->i_itemp != NULL) && |
| 3705 | (ip->i_itemp->ili_format.ilf_fields != 0))) { | 3523 | (ip->i_itemp->ili_format.ilf_fields != 0))) { |
| @@ -3719,17 +3537,11 @@ xfs_finish_reclaim( | |||
| 3719 | ASSERT(ip->i_update_core == 0); | 3537 | ASSERT(ip->i_update_core == 0); |
| 3720 | ASSERT(ip->i_itemp == NULL || | 3538 | ASSERT(ip->i_itemp == NULL || |
| 3721 | ip->i_itemp->ili_format.ilf_fields == 0); | 3539 | ip->i_itemp->ili_format.ilf_fields == 0); |
| 3722 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
| 3723 | } else if (locked) { | ||
| 3724 | /* | ||
| 3725 | * We are not interested in doing an iflush if we're | ||
| 3726 | * in the process of shutting down the filesystem forcibly. | ||
| 3727 | * So, just reclaim the inode. | ||
| 3728 | */ | ||
| 3729 | xfs_ifunlock(ip); | ||
| 3730 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
| 3731 | } | 3540 | } |
| 3732 | 3541 | ||
| 3542 | xfs_ifunlock(ip); | ||
| 3543 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
| 3544 | |||
| 3733 | reclaim: | 3545 | reclaim: |
| 3734 | xfs_ireclaim(ip); | 3546 | xfs_ireclaim(ip); |
| 3735 | return 0; | 3547 | return 0; |
| @@ -3845,9 +3657,8 @@ xfs_alloc_file_space( | |||
| 3845 | end_dmi_offset = offset+len; | 3657 | end_dmi_offset = offset+len; |
| 3846 | if (end_dmi_offset > ip->i_size) | 3658 | if (end_dmi_offset > ip->i_size) |
| 3847 | end_dmi_offset = ip->i_size; | 3659 | end_dmi_offset = ip->i_size; |
| 3848 | error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), | 3660 | error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset, |
| 3849 | offset, end_dmi_offset - offset, | 3661 | end_dmi_offset - offset, 0, NULL); |
| 3850 | 0, NULL); | ||
| 3851 | if (error) | 3662 | if (error) |
| 3852 | return error; | 3663 | return error; |
| 3853 | } | 3664 | } |
| @@ -3956,8 +3767,8 @@ dmapi_enospc_check: | |||
| 3956 | if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 && | 3767 | if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 && |
| 3957 | DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { | 3768 | DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { |
| 3958 | error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, | 3769 | error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, |
| 3959 | XFS_ITOV(ip), DM_RIGHT_NULL, | 3770 | ip, DM_RIGHT_NULL, |
| 3960 | XFS_ITOV(ip), DM_RIGHT_NULL, | 3771 | ip, DM_RIGHT_NULL, |
| 3961 | NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ | 3772 | NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ |
| 3962 | if (error == 0) | 3773 | if (error == 0) |
| 3963 | goto retry; /* Maybe DMAPI app. has made space */ | 3774 | goto retry; /* Maybe DMAPI app. has made space */ |
| @@ -4021,7 +3832,8 @@ xfs_zero_remaining_bytes( | |||
| 4021 | XFS_BUF_READ(bp); | 3832 | XFS_BUF_READ(bp); |
| 4022 | XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); | 3833 | XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); |
| 4023 | xfsbdstrat(mp, bp); | 3834 | xfsbdstrat(mp, bp); |
| 4024 | if ((error = xfs_iowait(bp))) { | 3835 | error = xfs_iowait(bp); |
| 3836 | if (error) { | ||
| 4025 | xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", | 3837 | xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", |
| 4026 | mp, bp, XFS_BUF_ADDR(bp)); | 3838 | mp, bp, XFS_BUF_ADDR(bp)); |
| 4027 | break; | 3839 | break; |
| @@ -4033,7 +3845,8 @@ xfs_zero_remaining_bytes( | |||
| 4033 | XFS_BUF_UNREAD(bp); | 3845 | XFS_BUF_UNREAD(bp); |
| 4034 | XFS_BUF_WRITE(bp); | 3846 | XFS_BUF_WRITE(bp); |
| 4035 | xfsbdstrat(mp, bp); | 3847 | xfsbdstrat(mp, bp); |
| 4036 | if ((error = xfs_iowait(bp))) { | 3848 | error = xfs_iowait(bp); |
| 3849 | if (error) { | ||
| 4037 | xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", | 3850 | xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", |
| 4038 | mp, bp, XFS_BUF_ADDR(bp)); | 3851 | mp, bp, XFS_BUF_ADDR(bp)); |
| 4039 | break; | 3852 | break; |
| @@ -4102,7 +3915,7 @@ xfs_free_file_space( | |||
| 4102 | DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { | 3915 | DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { |
| 4103 | if (end_dmi_offset > ip->i_size) | 3916 | if (end_dmi_offset > ip->i_size) |
| 4104 | end_dmi_offset = ip->i_size; | 3917 | end_dmi_offset = ip->i_size; |
| 4105 | error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, | 3918 | error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, |
| 4106 | offset, end_dmi_offset - offset, | 3919 | offset, end_dmi_offset - offset, |
| 4107 | AT_DELAY_FLAG(attr_flags), NULL); | 3920 | AT_DELAY_FLAG(attr_flags), NULL); |
| 4108 | if (error) | 3921 | if (error) |
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 4e3970f0e5e3..24c53923dc2c 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h | |||
| @@ -23,31 +23,32 @@ int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start, | |||
| 23 | xfs_off_t stop); | 23 | xfs_off_t stop); |
| 24 | int xfs_release(struct xfs_inode *ip); | 24 | int xfs_release(struct xfs_inode *ip); |
| 25 | int xfs_inactive(struct xfs_inode *ip); | 25 | int xfs_inactive(struct xfs_inode *ip); |
| 26 | int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry, | 26 | int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, |
| 27 | bhv_vnode_t **vpp); | 27 | struct xfs_inode **ipp); |
| 28 | int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode, | 28 | int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, |
| 29 | xfs_dev_t rdev, bhv_vnode_t **vpp, struct cred *credp); | 29 | xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp); |
| 30 | int xfs_remove(struct xfs_inode *dp, bhv_vname_t *dentry); | 30 | int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, |
| 31 | int xfs_link(struct xfs_inode *tdp, bhv_vnode_t *src_vp, | 31 | struct xfs_inode *ip); |
| 32 | bhv_vname_t *dentry); | 32 | int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, |
| 33 | int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry, | 33 | struct xfs_name *target_name); |
| 34 | mode_t mode, bhv_vnode_t **vpp, struct cred *credp); | 34 | int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, |
| 35 | int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry); | 35 | mode_t mode, struct xfs_inode **ipp, struct cred *credp); |
| 36 | int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name, | ||
| 37 | struct xfs_inode *cdp); | ||
| 36 | int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, | 38 | int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, |
| 37 | xfs_off_t *offset, filldir_t filldir); | 39 | xfs_off_t *offset, filldir_t filldir); |
| 38 | int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry, | 40 | int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, |
| 39 | char *target_path, mode_t mode, bhv_vnode_t **vpp, | 41 | const char *target_path, mode_t mode, struct xfs_inode **ipp, |
| 40 | struct cred *credp); | 42 | struct cred *credp); |
| 41 | int xfs_rwlock(struct xfs_inode *ip, bhv_vrwlock_t locktype); | ||
| 42 | void xfs_rwunlock(struct xfs_inode *ip, bhv_vrwlock_t locktype); | ||
| 43 | int xfs_inode_flush(struct xfs_inode *ip, int flags); | 43 | int xfs_inode_flush(struct xfs_inode *ip, int flags); |
| 44 | int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); | 44 | int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); |
| 45 | int xfs_reclaim(struct xfs_inode *ip); | 45 | int xfs_reclaim(struct xfs_inode *ip); |
| 46 | int xfs_change_file_space(struct xfs_inode *ip, int cmd, | 46 | int xfs_change_file_space(struct xfs_inode *ip, int cmd, |
| 47 | xfs_flock64_t *bf, xfs_off_t offset, | 47 | xfs_flock64_t *bf, xfs_off_t offset, |
| 48 | struct cred *credp, int attr_flags); | 48 | struct cred *credp, int attr_flags); |
| 49 | int xfs_rename(struct xfs_inode *src_dp, bhv_vname_t *src_vname, | 49 | int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, |
| 50 | bhv_vnode_t *target_dir_vp, bhv_vname_t *target_vname); | 50 | struct xfs_inode *src_ip, struct xfs_inode *target_dp, |
| 51 | struct xfs_name *target_name); | ||
| 51 | int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, | 52 | int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, |
| 52 | int *valuelenp, int flags, cred_t *cred); | 53 | int *valuelenp, int flags, cred_t *cred); |
| 53 | int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, | 54 | int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, |
