Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

author: David Woodhouse <dwmw2@infradead.org> 2008-04-22 07:34:25 -0400
committer: David Woodhouse <dwmw2@infradead.org> 2008-04-22 07:34:25 -0400
commit: f838bad1b3be8ca0c785ee0e0c570dfda74cf377 (patch)
tree: 5a842a8056a708cfad55a20fa8ab733dd94b0903 /fs
parent: dd919660aacdf4adfcd279556aa03e595f7f0fc2 (diff)
parent: 807501475fce0ebe68baedf87f202c3e4ee0d12c (diff)
327 files changed, 12512 insertions, 7151 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index dfebdbe7440e..3031e3233dd6 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -26,7 +26,6 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
-#include <asm/semaphore.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
diff --git a/fs/Kconfig b/fs/Kconfig
index d7312825592b..028ae38ecc52 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -444,6 +444,32 @@ config OCFS2_FS
          For more information on OCFS2, see the file
          <file:Documentation/filesystems/ocfs2.txt>.
+config OCFS2_FS_O2CB
+        tristate "O2CB Kernelspace Clustering"
+        depends on OCFS2_FS
+        default y
+        help
+          OCFS2 includes a simple kernelspace clustering package, the OCFS2
+          Cluster Base.  It only requires a very small userspace component
+          to configure it. This comes with the standard ocfs2-tools package.
+          O2CB is limited to maintaining a cluster for OCFS2 file systems.
+          It cannot manage any other cluster applications.
+          It is always safe to say Y here, as the clustering method is
+          run-time selectable.
+config OCFS2_FS_USERSPACE_CLUSTER
+        tristate "OCFS2 Userspace Clustering"
+        depends on OCFS2_FS && DLM
+        default y
+        help
+          This option will allow OCFS2 to use userspace clustering services
+          in conjunction with the DLM in fs/dlm.  If you are using a
+          userspace cluster manager, say Y here.
+          It is safe to say Y, as the clustering method is run-time
+          selectable.
 config OCFS2_DEBUG_MASKLOG
        bool "OCFS2 logging support"
        depends on OCFS2_FS
@@ -1744,10 +1770,10 @@ config ROOT_NFS
          If you want your Linux box to mount its whole root file system (the
          one containing the directory /) from some other computer over the
          net via NFS (presumably because your box doesn't have a hard disk),
-          say Y. Read <file:Documentation/nfsroot.txt> for details. It is
+          say Y. Read <file:Documentation/filesystems/nfsroot.txt> for
-          likely that in this case, you also want to say Y to "Kernel level IP
+          details. It is likely that in this case, you also want to say Y to
-          autoconfiguration" so that your box can discover its network address
+          "Kernel level IP autoconfiguration" so that your box can discover
-          at boot time.
+          its network address at boot time.
          Most people say N here.
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index b5c3b6114add..853845abcca6 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -62,7 +62,7 @@ config BINFMT_SHARED_FLAT
 config BINFMT_AOUT
        tristate "Kernel support for a.out and ECOFF binaries"
        depends on ARCH_SUPPORTS_AOUT && \
-                (X86_32 || ALPHA || ARM || M68K || SPARC32)
+                (X86_32 || ALPHA || ARM || M68K)
        ---help---
          A.out (Assembler.OUTput) is a set of formats for libraries and
          executables used in the earliest versions of UNIX.  Linux used
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 970d38f30565..584bb0f9c36a 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -127,14 +127,21 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
        _enter("%s,%s", name, vllist);
+        down_write(&afs_cells_sem);
+        read_lock(&afs_cells_lock);
+        list_for_each_entry(cell, &afs_cells, link) {
+                if (strcasecmp(cell->name, name) == 0)
+                        goto duplicate_name;
+        }
+        read_unlock(&afs_cells_lock);
        cell = afs_cell_alloc(name, vllist);
        if (IS_ERR(cell)) {
                _leave(" = %ld", PTR_ERR(cell));
+                up_write(&afs_cells_sem);
                return cell;
        }
-        down_write(&afs_cells_sem);
        /* add a proc directory for this cell */
        ret = afs_proc_cell_setup(cell);
        if (ret < 0)
@@ -167,6 +174,11 @@ error:
        kfree(cell);
        _leave(" = %d", ret);
        return ERR_PTR(ret);
+duplicate_name:
+        read_unlock(&afs_cells_lock);
+        up_write(&afs_cells_sem);
+        return ERR_PTR(-EEXIST);
 }
 /*
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5ca3625cd39e..eec41c76de72 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -573,7 +573,6 @@ extern const struct file_operations afs_mntpt_file_operations;
 extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
 extern void afs_mntpt_kill_timer(void);
-extern void afs_umount_begin(struct vfsmount *, int);
 /*
 * proc.c
@@ -750,7 +749,7 @@ extern int afs_fsync(struct file *, struct dentry *, int);
 extern unsigned afs_debug;
 #define dbgprintk(FMT,...) \
-        printk("[%x%-6.6s] "FMT"\n", smp_processor_id(), current->comm ,##__VA_ARGS__)
+        printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
 /* make sure we maintain the format strings, even when debugging is disabled */
 static inline __attribute__((format(printf,1,2)))
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 0f60f6b35769..2d3e5d4fb9f7 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -22,7 +22,7 @@ MODULE_LICENSE("GPL");
 unsigned afs_debug;
 module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(afs_debug, "AFS debugging mask");
+MODULE_PARM_DESC(debug, "AFS debugging mask");
 static char *rootcell;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index a3510b8ba3e7..2f5503902c37 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -283,11 +283,3 @@ void afs_mntpt_kill_timer(void)
        cancel_delayed_work(&afs_mntpt_expiry_timer);
        flush_scheduled_work();
 }
-/*
- * begin unmount by attempting to remove all automounted mountpoints we added
- */
-void afs_umount_begin(struct vfsmount *vfsmnt, int flags)
-{
-        shrink_submounts(vfsmnt, &afs_vfsmounts);
-}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 36bbce45f44b..4b572b801d8d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -50,7 +50,6 @@ static const struct super_operations afs_super_ops = {
        .write_inode    = afs_write_inode,
        .destroy_inode  = afs_destroy_inode,
        .clear_inode    = afs_clear_inode,
-        .umount_begin   = afs_umount_begin,
        .put_super      = afs_put_super,
        .show_options   = generic_show_options,
 };
diff --git a/fs/aio.c b/fs/aio.c
index b74c567383bc..228368610dfa 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -936,14 +936,6 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
                return 1;
        }
-        /*
-         * Check if the user asked us to deliver the result through an
-         * eventfd. The eventfd_signal() function is safe to be called
-         * from IRQ context.
-         */
-        if (!IS_ERR(iocb->ki_eventfd))
-                eventfd_signal(iocb->ki_eventfd, 1);
        info = &ctx->ring_info;
        /* add a completion event to the ring buffer.
@@ -992,10 +984,27 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
        kunmap_atomic(ring, KM_IRQ1);
        pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+        /*
+         * Check if the user asked us to deliver the result through an
+         * eventfd. The eventfd_signal() function is safe to be called
+         * from IRQ context.
+         */
+        if (!IS_ERR(iocb->ki_eventfd))
+                eventfd_signal(iocb->ki_eventfd, 1);
 put_rq:
        /* everything turned out well, dispose of the aiocb. */
        ret = __aio_put_req(ctx, iocb);
+        /*
+         * We have to order our ring_info tail store above and test
+         * of the wait list below outside the wait lock.  This is
+         * like in wake_up_bit() where clearing a bit has to be
+         * ordered with the unlocked test.
+         */
+        smp_mb();
        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);
@@ -1782,6 +1791,7 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id,
                put_ioctx(ioctx);
        }
+        asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
        return ret;
 }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 23321889d9b0..f42be069e085 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -81,13 +81,10 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
        if (IS_ERR(anon_inode_inode))
                return -ENODEV;
-        file = get_empty_filp();
-        if (!file)
-                return -ENFILE;
        error = get_unused_fd();
        if (error < 0)
-                goto err_put_filp;
+                return error;
        fd = error;
        /*
@@ -114,14 +111,15 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
        dentry->d_flags &= ~DCACHE_UNHASHED;
        d_instantiate(dentry, anon_inode_inode);
-        file->f_path.mnt = mntget(anon_inode_mnt);
+        error = -ENFILE;
-        file->f_path.dentry = dentry;
+        file = alloc_file(anon_inode_mnt, dentry,
+                          FMODE_READ | FMODE_WRITE, fops);
+        if (!file)
+                goto err_dput;
        file->f_mapping = anon_inode_inode->i_mapping;
        file->f_pos = 0;
        file->f_flags = O_RDWR;
-        file->f_op = fops;
-        file->f_mode = FMODE_READ | FMODE_WRITE;
        file->f_version = 0;
        file->private_data = priv;
@@ -132,10 +130,10 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
        *pfile = file;
        return 0;
+err_dput:
+        dput(dentry);
 err_put_unused_fd:
        put_unused_fd(fd);
-err_put_filp:
-        put_filp(file);
        return error;
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 41a958a7585e..5e1a4fb5cacb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1424,6 +1424,18 @@ struct elf_note_info {
        int thread_notes;
 };
+/*
+ * When a regset has a writeback hook, we call it on each thread before
+ * dumping user memory.  On register window machines, this makes sure the
+ * user memory backing the register data is up to date before we read it.
+ */
+static void do_thread_regset_writeback(struct task_struct *task,
+                                       const struct user_regset *regset)
+{
+        if (regset->writeback)
+                regset->writeback(task, regset, 1);
+}
 static int fill_thread_core_info(struct elf_thread_core_info *t,
                                 const struct user_regset_view *view,
                                 long signr, size_t *total)
@@ -1445,6 +1457,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
                  sizeof(t->prstatus), &t->prstatus);
        *total += notesize(&t->notes[0]);
+        do_thread_regset_writeback(t->task, &view->regsets[0]);
        /*
         * Each other regset might generate a note too.  For each regset
         * that has no core_note_type or is inactive, we leave t->notes[i]
@@ -1452,6 +1466,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
         */
        for (i = 1; i < view->n; ++i) {
                const struct user_regset *regset = &view->regsets[i];
+                do_thread_regset_writeback(t->task, regset);
                if (regset->core_note_type &&
                    (!regset->active || regset->active(t->task, regset))) {
                        int ret;
diff --git a/fs/bio.c b/fs/bio.c
index 242e409dab4b..6e0b6f66df03 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -444,22 +444,27 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 struct bio_map_data {
        struct bio_vec *iovecs;
-        void __user *userptr;
+        int nr_sgvecs;
+        struct sg_iovec *sgvecs;
 };
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
+static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
+                             struct sg_iovec *iov, int iov_count)
 {
        memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
+        memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
+        bmd->nr_sgvecs = iov_count;
        bio->bi_private = bmd;
 }
 static void bio_free_map_data(struct bio_map_data *bmd)
 {
        kfree(bmd->iovecs);
+        kfree(bmd->sgvecs);
        kfree(bmd);
 }
-static struct bio_map_data *bio_alloc_map_data(int nr_segs)
+static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
 {
        struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);
@@ -467,13 +472,71 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
                return NULL;
        bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
-        if (bmd->iovecs)
+        if (!bmd->iovecs) {
+                kfree(bmd);
+                return NULL;
+        }
+        bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL);
+        if (bmd->sgvecs)
                return bmd;
+        kfree(bmd->iovecs);
        kfree(bmd);
        return NULL;
 }
+static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
+                          int uncopy)
+{
+        int ret = 0, i;
+        struct bio_vec *bvec;
+        int iov_idx = 0;
+        unsigned int iov_off = 0;
+        int read = bio_data_dir(bio) == READ;
+        __bio_for_each_segment(bvec, bio, i, 0) {
+                char *bv_addr = page_address(bvec->bv_page);
+                unsigned int bv_len = bvec->bv_len;
+                while (bv_len && iov_idx < iov_count) {
+                        unsigned int bytes;
+                        char *iov_addr;
+                        bytes = min_t(unsigned int,
+                                      iov[iov_idx].iov_len - iov_off, bv_len);
+                        iov_addr = iov[iov_idx].iov_base + iov_off;
+                        if (!ret) {
+                                if (!read && !uncopy)
+                                        ret = copy_from_user(bv_addr, iov_addr,
+                                                             bytes);
+                                if (read && uncopy)
+                                        ret = copy_to_user(iov_addr, bv_addr,
+                                                           bytes);
+                                if (ret)
+                                        ret = -EFAULT;
+                        }
+                        bv_len -= bytes;
+                        bv_addr += bytes;
+                        iov_addr += bytes;
+                        iov_off += bytes;
+                        if (iov[iov_idx].iov_len == iov_off) {
+                                iov_idx++;
+                                iov_off = 0;
+                        }
+                }
+                if (uncopy)
+                        __free_page(bvec->bv_page);
+        }
+        return ret;
+}
 /**
 *      bio_uncopy_user -       finish previously mapped bio
 *      @bio: bio being terminated
@@ -484,55 +547,56 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
 int bio_uncopy_user(struct bio *bio)
 {
        struct bio_map_data *bmd = bio->bi_private;
-        const int read = bio_data_dir(bio) == READ;
+        int ret;
-        struct bio_vec *bvec;
-        int i, ret = 0;
-        __bio_for_each_segment(bvec, bio, i, 0) {
+        ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
-                char *addr = page_address(bvec->bv_page);
-                unsigned int len = bmd->iovecs[i].bv_len;
-                if (read && !ret && copy_to_user(bmd->userptr, addr, len))
-                        ret = -EFAULT;
-                __free_page(bvec->bv_page);
-                bmd->userptr += len;
-        }
        bio_free_map_data(bmd);
        bio_put(bio);
        return ret;
 }
 /**
- *      bio_copy_user   -       copy user data to bio
+ *      bio_copy_user_iov       -       copy user data to bio
 *      @q: destination block queue
- *      @uaddr: start of user address
+ *      @iov:   the iovec.
- *      @len: length in bytes
+ *      @iov_count: number of elements in the iovec
 *      @write_to_vm: bool indicating writing to pages or not
 *
 *      Prepares and returns a bio for indirect user io, bouncing data
 *      to/from kernel pages as necessary. Must be paired with
 *      call bio_uncopy_user() on io completion.
 */
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
+struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
-                          unsigned int len, int write_to_vm)
+                              int iov_count, int write_to_vm)
 {
-        unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-        unsigned long start = uaddr >> PAGE_SHIFT;
        struct bio_map_data *bmd;
        struct bio_vec *bvec;
        struct page *page;
        struct bio *bio;
        int i, ret;
+        int nr_pages = 0;
+        unsigned int len = 0;
-        bmd = bio_alloc_map_data(end - start);
+        for (i = 0; i < iov_count; i++) {
+                unsigned long uaddr;
+                unsigned long end;
+                unsigned long start;
+                uaddr = (unsigned long)iov[i].iov_base;
+                end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+                start = uaddr >> PAGE_SHIFT;
+                nr_pages += end - start;
+                len += iov[i].iov_len;
+        }
+        bmd = bio_alloc_map_data(nr_pages, iov_count);
        if (!bmd)
                return ERR_PTR(-ENOMEM);
-        bmd->userptr = (void __user *) uaddr;
        ret = -ENOMEM;
-        bio = bio_alloc(GFP_KERNEL, end - start);
+        bio = bio_alloc(GFP_KERNEL, nr_pages);
        if (!bio)
                goto out_bmd;
@@ -564,22 +628,12 @@ struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
         * success
         */
        if (!write_to_vm) {
-                char __user *p = (char __user *) uaddr;
+                ret = __bio_copy_iov(bio, iov, iov_count, 0);
+                if (ret)
-                /*
+                        goto cleanup;
-                 * for a write, copy in data to kernel pages
-                 */
-                ret = -EFAULT;
-                bio_for_each_segment(bvec, bio, i) {
-                        char *addr = page_address(bvec->bv_page);
-                        if (copy_from_user(addr, p, bvec->bv_len))
-                                goto cleanup;
-                        p += bvec->bv_len;
-                }
        }
-        bio_set_map_data(bmd, bio);
+        bio_set_map_data(bmd, bio, iov, iov_count);
        return bio;
 cleanup:
        bio_for_each_segment(bvec, bio, i)
@@ -591,6 +645,28 @@ out_bmd:
        return ERR_PTR(ret);
 }
+/**
+ *      bio_copy_user   -       copy user data to bio
+ *      @q: destination block queue
+ *      @uaddr: start of user address
+ *      @len: length in bytes
+ *      @write_to_vm: bool indicating writing to pages or not
+ *
+ *      Prepares and returns a bio for indirect user io, bouncing data
+ *      to/from kernel pages as necessary. Must be paired with
+ *      call bio_uncopy_user() on io completion.
+ */
+struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
+                          unsigned int len, int write_to_vm)
+{
+        struct sg_iovec iov;
+        iov.iov_base = (void __user *)uaddr;
+        iov.iov_len = len;
+        return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+}
 static struct bio *__bio_map_user_iov(struct request_queue *q,
                                      struct block_device *bdev,
                                      struct sg_iovec *iov, int iov_count,
@@ -903,7 +979,7 @@ void bio_set_pages_dirty(struct bio *bio)
        }
 }
-void bio_release_pages(struct bio *bio)
+static void bio_release_pages(struct bio *bio)
 {
        struct bio_vec *bvec = bio->bi_io_vec;
        int i;
@@ -1194,6 +1270,8 @@ EXPORT_SYMBOL(bio_hw_segments);
 EXPORT_SYMBOL(bio_add_page);
 EXPORT_SYMBOL(bio_add_pc_page);
 EXPORT_SYMBOL(bio_get_nr_vecs);
+EXPORT_SYMBOL(bio_map_user);
+EXPORT_SYMBOL(bio_unmap_user);
 EXPORT_SYMBOL(bio_map_kern);
 EXPORT_SYMBOL(bio_pair_release);
 EXPORT_SYMBOL(bio_split);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 67fe72ce6ac7..7d822fae7765 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -31,6 +31,8 @@ struct bdev_inode {
        struct inode vfs_inode;
 };
+static const struct address_space_operations def_blk_aops;
 static inline struct bdev_inode *BDEV_I(struct inode *inode)
 {
        return container_of(inode, struct bdev_inode, vfs_inode);
@@ -171,203 +173,6 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
                                iov, offset, nr_segs, blkdev_get_blocks, NULL);
 }
-#if 0
-static void blk_end_aio(struct bio *bio, int error)
-{
-        struct kiocb *iocb = bio->bi_private;
-        atomic_t *bio_count = &iocb->ki_bio_count;
-        if (bio_data_dir(bio) == READ)
-                bio_check_pages_dirty(bio);
-        else {
-                bio_release_pages(bio);
-                bio_put(bio);
-        }
-        /* iocb->ki_nbytes stores error code from LLDD */
-        if (error)
-                iocb->ki_nbytes = -EIO;
-        if (atomic_dec_and_test(bio_count)) {
-                if ((long)iocb->ki_nbytes < 0)
-                        aio_complete(iocb, iocb->ki_nbytes, 0);
-                else
-                        aio_complete(iocb, iocb->ki_left, 0);
-        }
-        return 0;
-}
-#define VEC_SIZE        16
-struct pvec {
-        unsigned short nr;
-        unsigned short idx;
-        struct page *page[VEC_SIZE];
-};
-#define PAGES_SPANNED(addr, len)        \
-        (DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE);
-/*
- * get page pointer for user addr, we internally cache struct page array for
- * (addr, count) range in pvec to avoid frequent call to get_user_pages.  If
- * internal page list is exhausted, a batch count of up to VEC_SIZE is used
- * to get next set of page struct.
- */
-static struct page *blk_get_page(unsigned long addr, size_t count, int rw,
-                                 struct pvec *pvec)
-{
-        int ret, nr_pages;
-        if (pvec->idx == pvec->nr) {
-                nr_pages = PAGES_SPANNED(addr, count);
-                nr_pages = min(nr_pages, VEC_SIZE);
-                down_read(&current->mm->mmap_sem);
-                ret = get_user_pages(current, current->mm, addr, nr_pages,
-                                     rw == READ, 0, pvec->page, NULL);
-                up_read(&current->mm->mmap_sem);
-                if (ret < 0)
-                        return ERR_PTR(ret);
-                pvec->nr = ret;
-                pvec->idx = 0;
-        }
-        return pvec->page[pvec->idx++];
-}
-/* return a page back to pvec array */
-static void blk_unget_page(struct page *page, struct pvec *pvec)
-{
-        pvec->page[--pvec->idx] = page;
-}
-static ssize_t
-blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-                 loff_t pos, unsigned long nr_segs)
-{
-        struct inode *inode = iocb->ki_filp->f_mapping->host;
-        unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
-        unsigned blocksize_mask = (1 << blkbits) - 1;
-        unsigned long seg = 0;  /* iov segment iterator */
-        unsigned long nvec;     /* number of bio vec needed */
-        unsigned long cur_off;  /* offset into current page */
-        unsigned long cur_len;  /* I/O len of current page, up to PAGE_SIZE */
-        unsigned long addr;     /* user iovec address */
-        size_t count;           /* user iovec len */
-        size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */
-        loff_t size;            /* size of block device */
-        struct bio *bio;
-        atomic_t *bio_count = &iocb->ki_bio_count;
-        struct page *page;
-        struct pvec pvec;
-        pvec.nr = 0;
-        pvec.idx = 0;
-        if (pos & blocksize_mask)
-                return -EINVAL;
-        size = i_size_read(inode);
-        if (pos + nbytes > size) {
-                nbytes = size - pos;
-                iocb->ki_left = nbytes;
-        }
-        /*
-         * check first non-zero iov alignment, the remaining
-         * iov alignment is checked inside bio loop below.
-         */
-        do {
-                addr = (unsigned long) iov[seg].iov_base;
-                count = min(iov[seg].iov_len, nbytes);
-                if (addr & blocksize_mask || count & blocksize_mask)
-                        return -EINVAL;
-        } while (!count && ++seg < nr_segs);
-        atomic_set(bio_count, 1);
-        while (nbytes) {
-                /* roughly estimate number of bio vec needed */
-                nvec = (nbytes + PAGE_SIZE - 1) / PAGE_SIZE;
-                nvec = max(nvec, nr_segs - seg);
-                nvec = min(nvec, (unsigned long) BIO_MAX_PAGES);
-                /* bio_alloc should not fail with GFP_KERNEL flag */
-                bio = bio_alloc(GFP_KERNEL, nvec);
-                bio->bi_bdev = I_BDEV(inode);
-                bio->bi_end_io = blk_end_aio;
-                bio->bi_private = iocb;
-                bio->bi_sector = pos >> blkbits;
-same_bio:
-                cur_off = addr & ~PAGE_MASK;
-                cur_len = PAGE_SIZE - cur_off;
-                if (count < cur_len)
-                        cur_len = count;
-                page = blk_get_page(addr, count, rw, &pvec);
-                if (unlikely(IS_ERR(page)))
-                        goto backout;
-                if (bio_add_page(bio, page, cur_len, cur_off)) {
-                        pos += cur_len;
-                        addr += cur_len;
-                        count -= cur_len;
-                        nbytes -= cur_len;
-                        if (count)
-                                goto same_bio;
-                        while (++seg < nr_segs) {
-                                addr = (unsigned long) iov[seg].iov_base;
-                                count = iov[seg].iov_len;
-                                if (!count)
-                                        continue;
-                                if (unlikely(addr & blocksize_mask ||
-                                             count & blocksize_mask)) {
-                                        page = ERR_PTR(-EINVAL);
-                                        goto backout;
-                                }
-                                count = min(count, nbytes);
-                                goto same_bio;
-                        }
-                } else {
-                        blk_unget_page(page, &pvec);
-                }
-                /* bio is ready, submit it */
-                if (rw == READ)
-                        bio_set_pages_dirty(bio);
-                atomic_inc(bio_count);
-                submit_bio(rw, bio);
-        }
-completion:
-        iocb->ki_left -= nbytes;
-        nbytes = iocb->ki_left;
-        iocb->ki_pos += nbytes;
-        blk_run_address_space(inode->i_mapping);
-        if (atomic_dec_and_test(bio_count))
-                aio_complete(iocb, nbytes, 0);
-        return -EIOCBQUEUED;
-backout:
-        /*
-         * back out nbytes count constructed so far for this bio,
-         * we will throw away current bio.
-         */
-        nbytes += bio->bi_size;
-        bio_release_pages(bio);
-        bio_put(bio);
-        /*
-         * if no bio was submmitted, return the error code.
-         * otherwise, proceed with pending I/O completion.
-         */
-        if (atomic_read(bio_count) == 1)
-                return PTR_ERR(page);
-        goto completion;
-}
-#endif
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
 {
        return block_write_full_page(page, blkdev_get_block, wbc);
@@ -1334,7 +1139,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
 }
-const struct address_space_operations def_blk_aops = {
+static const struct address_space_operations def_blk_aops = {
        .readpage       = blkdev_readpage,
        .writepage      = blkdev_writepage,
        .sync_page      = block_sync_page,
diff --git a/fs/buffer.c b/fs/buffer.c
index 3ebccf4aa7e3..39ff14403d13 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -627,8 +627,7 @@ repeat:
 }
 /**
- * sync_mapping_buffers - write out and wait upon a mapping's "associated"
+ * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
- *                        buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
@@ -836,7 +835,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
                smp_mb();
                if (buffer_dirty(bh)) {
                        list_add(&bh->b_assoc_buffers,
-                                 &bh->b_assoc_map->private_list);
+                                 &mapping->private_list);
                        bh->b_assoc_map = mapping;
                }
                spin_unlock(lock);
@@ -1182,7 +1181,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 void mark_buffer_dirty(struct buffer_head *bh)
 {
        WARN_ON_ONCE(!buffer_uptodate(bh));
-        if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
+        /*
+         * Very *carefully* optimize the it-is-already-dirty case.
+         *
+         * Don't let the final "is it dirty" escape to before we
+         * perhaps modified the buffer.
+         */
+        if (buffer_dirty(bh)) {
+                smp_mb();
+                if (buffer_dirty(bh))
+                        return;
+        }
+        if (!test_set_buffer_dirty(bh))
                __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
 }
@@ -2565,14 +2577,13 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
        struct inode *inode = page->mapping->host;
        struct buffer_head *head = fsdata;
        struct buffer_head *bh;
+        BUG_ON(fsdata != NULL && page_has_buffers(page));
-        if (!PageMappedToDisk(page)) {
+        if (unlikely(copied < len) && !page_has_buffers(page))
-                if (unlikely(copied < len) && !page_has_buffers(page))
+                attach_nobh_buffers(page, head);
-                        attach_nobh_buffers(page, head);
+        if (page_has_buffers(page))
-                if (page_has_buffers(page))
+                return generic_write_end(file, mapping, pos, len,
-                        return generic_write_end(file, mapping, pos, len,
+                                        copied, page, fsdata);
-                                                copied, page, fsdata);
-        }
        SetPageUptodate(page);
        set_page_dirty(page);
@@ -3214,7 +3225,7 @@ static int buffer_cpu_notify(struct notifier_block *self,
 }
 /**
- * bh_uptodate_or_lock: Test whether the buffer is uptodate
+ * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
@@ -3233,7 +3244,7 @@ int bh_uptodate_or_lock(struct buffer_head *bh)
 EXPORT_SYMBOL(bh_uptodate_or_lock);
 /**
- * bh_submit_read: Submit a locked buffer for reading
+ * bh_submit_read - Submit a locked buffer for reading
 * @bh: struct buffer_head
 *
 * Returns zero on success and -EIO on error.
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index edd248367b36..dbd91461853c 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -6,7 +6,9 @@ and sync so that events like out of disk space get reported properly on
 cached files. Fix setxattr failure to certain Samba versions. Fix mount
 of second share to disconnected server session (autoreconnect on this).
 Add ability to modify cifs acls for handling chmod (when mounted with
-cifsacl flag).
+cifsacl flag). Fix prefixpath path separator so we can handle mounts
+with prefixpaths longer than one directory (one path component) when
+mounted to Windows servers.
 Version 1.51
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index c623e2f9c5db..50306229b0f9 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -461,7 +461,7 @@ A partial list of the supported mount options follows:
 cifsacl        Report mode bits (e.g. on stat) based on the Windows ACL for
                the file. (EXPERIMENTAL)
 servern        Specify the server 's netbios name (RFC1001 name) to use
-                when attempting to setup a session to the server.  This is
+                when attempting to set up a session to the server.
                This is needed for mounting to some older servers (such
                as OS/2 or Windows 98 and Windows ME) since they do not
                support a default server name.  A server name can be up
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 73c4c419663c..0228ed06069e 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -98,8 +98,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
                        if (mid_entry->resp_buf) {
                                cifs_dump_detail(mid_entry->resp_buf);
                                cifs_dump_mem("existing buf: ",
-                                        mid_entry->resp_buf,
+                                        mid_entry->resp_buf, 62);
-                                        62 /* fixme */);
                        }
                }
        }
@@ -439,7 +438,7 @@ cifs_stats_read(char *buf, char **beginBuffer, off_t offset,
        return length;
 }
-#endif
+#endif /* STATS */
 static struct proc_dir_entry *proc_fs_cifs;
 read_proc_t cifs_txanchor_read;
@@ -482,7 +481,7 @@ cifs_proc_init(void)
                                cifs_stats_read, NULL);
        if (pde)
                pde->write_proc = cifs_stats_write;
-#endif
+#endif /* STATS */
        pde = create_proc_read_entry("cifsFYI", 0, proc_fs_cifs,
                                cifsFYI_read, NULL);
        if (pde)
@@ -918,4 +917,12 @@ security_flags_write(struct file *file, const char __user *buffer,
        /* BB should we turn on MAY flags for other MUST options? */
        return count;
 }
-#endif
+#else
+inline void cifs_proc_init(void)
+{
+}
+inline void cifs_proc_clean(void)
+{
+}
+#endif /* PROC_FS */
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index c26cd0d2c6d5..5eb3b83bbfa7 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -25,8 +25,11 @@
 void cifs_dump_mem(char *label, void *data, int length);
 #ifdef CONFIG_CIFS_DEBUG2
+#define DBG2 2
 void cifs_dump_detail(struct smb_hdr *);
 void cifs_dump_mids(struct TCP_Server_Info *);
+#else
+#define DBG2 0
 #endif
 extern int traceSMB;            /* flag which enables the function below */
 void dump_smb(struct smb_hdr *, int);
@@ -64,10 +67,10 @@ extern int cifsERROR;
 *      ---------
 */
 #else           /* _CIFS_DEBUG */
-#define cERROR(button,prspec)
+#define cERROR(button, prspec)
-#define cEVENT(format,arg...)
+#define cEVENT(format, arg...)
 #define cFYI(button, prspec)
-#define cifserror(format,arg...)
+#define cifserror(format, arg...)
 #endif          /* _CIFS_DEBUG */
 #endif                          /* _H_CIFS_DEBUG */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 6ad447529961..56c924033b78 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -33,7 +33,6 @@ void dfs_shrink_umount_helper(struct vfsmount *vfsmnt)
 {
        mark_mounts_for_expiry(&cifs_dfs_automount_list);
        mark_mounts_for_expiry(&cifs_dfs_automount_list);
-        shrink_submounts(vfsmnt, &cifs_dfs_automount_list);
 }
 /**
@@ -74,7 +73,7 @@ static char *cifs_get_share_name(const char *node_name)
        pSep = memchr(UNC+2, '\\', len-2);
        if (!pSep) {
                cERROR(1, ("%s: no server name end in node name: %s",
-                        __FUNCTION__, node_name));
+                        __func__, node_name));
                kfree(UNC);
                return NULL;
        }
@@ -84,7 +83,7 @@ static char *cifs_get_share_name(const char *node_name)
        pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC));
        if (!pSep) {
                cERROR(1, ("%s:2 cant find share name in node name: %s",
-                        __FUNCTION__, node_name));
+                        __func__, node_name));
                kfree(UNC);
                return NULL;
        }
@@ -127,7 +126,7 @@ static char *compose_mount_options(const char *sb_mountdata,
        rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
        if (rc != 0) {
                cERROR(1, ("%s: Failed to resolve server part of %s to IP",
-                          __FUNCTION__, *devname));
+                          __func__, *devname));
                mountdata = ERR_PTR(rc);
                goto compose_mount_options_out;
        }
@@ -181,8 +180,8 @@ static char *compose_mount_options(const char *sb_mountdata,
                }
        }
-        /*cFYI(1,("%s: parent mountdata: %s", __FUNCTION__,sb_mountdata));*/
+        /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
-        /*cFYI(1, ("%s: submount mountdata: %s", __FUNCTION__, mountdata ));*/
+        /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
 compose_mount_options_out:
        kfree(srvIP);
@@ -286,7 +285,7 @@ static void dump_referral(const struct dfs_info3_param *ref)
        cFYI(1, ("DFS: node path: %s", ref->node_name));
        cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type));
        cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
-                                ref->PathConsumed));
+                                ref->path_consumed));
 }
@@ -302,7 +301,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
        int rc = 0;
        struct vfsmount *mnt = ERR_PTR(-ENOENT);
-        cFYI(1, ("in %s", __FUNCTION__));
+        cFYI(1, ("in %s", __func__));
        BUG_ON(IS_ROOT(dentry));
        xid = GetXid();
@@ -336,7 +335,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
                        len = strlen(referrals[i].node_name);
                        if (len < 2) {
                                cERROR(1, ("%s: Net Address path too short: %s",
-                                        __FUNCTION__, referrals[i].node_name));
+                                        __func__, referrals[i].node_name));
                                rc = -EINVAL;
                                goto out_err;
                        }
@@ -344,7 +343,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
                                                nd->path.dentry,
                                                referrals[i].node_name);
                        cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
-                                         __FUNCTION__,
+                                         __func__,
                                        referrals[i].node_name, mnt));
                        /* complete mount procedure if we accured submount */
@@ -365,7 +364,7 @@ out:
        FreeXid(xid);
        free_dfs_info_array(referrals, num_referrals);
        kfree(full_path);
-        cFYI(1, ("leaving %s" , __FUNCTION__));
+        cFYI(1, ("leaving %s" , __func__));
        return ERR_PTR(rc);
 out_err:
        path_put(&nd->path);
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index d543accc10dd..6653e29637a7 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 #ifdef CONFIG_CIFS_DEBUG2
        if (cifsFYI && !IS_ERR(spnego_key)) {
                struct cifs_spnego_msg *msg = spnego_key->payload.data;
-                cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024,
+                cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U,
                                msg->secblob_len + msg->sesskey_len));
        }
 #endif /* CONFIG_CIFS_DEBUG2 */
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index b5903b89250d..7d75272a6b3f 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -32,7 +32,7 @@
 *
 */
 int
-cifs_strfromUCS_le(char *to, const __le16 * from,
+cifs_strfromUCS_le(char *to, const __le16 *from,
                   int len, const struct nls_table *codepage)
 {
        int i;
@@ -61,7 +61,7 @@ cifs_strfromUCS_le(char *to, const __le16 * from,
 *
 */
 int
-cifs_strtoUCS(__le16 * to, const char *from, int len,
+cifs_strtoUCS(__le16 *to, const char *from, int len,
              const struct nls_table *codepage)
 {
        int charlen;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 614c11fcdcb6..14eb9a2395d3 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -254,7 +254,8 @@ UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2)
        const wchar_t *anchor2 = ucs2;
        while (*ucs1) {
-                if (*ucs1 == *ucs2) {   /* Partial match found */
+                if (*ucs1 == *ucs2) {
+                        /* Partial match found */
                        ucs1++;
                        ucs2++;
                } else {
@@ -279,7 +280,8 @@ UniToupper(register wchar_t uc)
 {
        register const struct UniCaseRange *rp;
-        if (uc < sizeof (CifsUniUpperTable)) {  /* Latin characters */
+        if (uc < sizeof(CifsUniUpperTable)) {
+                /* Latin characters */
                return uc + CifsUniUpperTable[uc];      /* Use base tables */
        } else {
                rp = CifsUniUpperRange; /* Use range tables */
@@ -320,7 +322,8 @@ UniTolower(wchar_t uc)
 {
        register struct UniCaseRange *rp;
-        if (uc < sizeof (UniLowerTable)) {      /* Latin characters */
+        if (uc < sizeof(UniLowerTable)) {
+                /* Latin characters */
                return uc + UniLowerTable[uc];  /* Use base tables */
        } else {
                rp = UniLowerRange;     /* Use range tables */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index a7035bd18e4e..1cb5b0a9f2ac 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/cifsacl.c
 *
- *   Copyright (C) International Business Machines  Corp., 2007
+ *   Copyright (C) International Business Machines  Corp., 2007,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   Contains the routines for mapping CIFS/NTFS ACLs
@@ -46,8 +46,7 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
 static const struct cifs_sid sid_everyone = {
        1, 1, {0, 0, 0, 0, 0, 1}, {0} };
 /* group users */
-static const struct cifs_sid sid_user =
+static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
-                {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
 int match_sid(struct cifs_sid *ctsid)
@@ -195,9 +194,9 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
        /* For deny ACEs we change the mask so that subsequent allow access
           control entries do not turn on the bits we are denying */
        if (type == ACCESS_DENIED) {
-                if (flags & GENERIC_ALL) {
+                if (flags & GENERIC_ALL)
                        *pbits_to_set &= ~S_IRWXUGO;
-                }
                if ((flags & GENERIC_WRITE) ||
                        ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
                        *pbits_to_set &= ~S_IWUGO;
@@ -216,9 +215,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
        if (flags & GENERIC_ALL) {
                *pmode |= (S_IRWXUGO & (*pbits_to_set));
-#ifdef CONFIG_CIFS_DEBUG2
+                cFYI(DBG2, ("all perms"));
-                cFYI(1, ("all perms"));
-#endif
                return;
        }
        if ((flags & GENERIC_WRITE) ||
@@ -231,9 +228,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
                        ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
                *pmode |= (S_IXUGO & (*pbits_to_set));
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("access flags 0x%x mode now 0x%x", flags, *pmode));
-        cFYI(1, ("access flags 0x%x mode now 0x%x", flags, *pmode));
-#endif
        return;
 }
@@ -262,13 +257,11 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
        if (mode & S_IXUGO)
                *pace_flags |= SET_FILE_EXEC_RIGHTS;
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags));
-        cFYI(1, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags));
-#endif
        return;
 }
-static __le16 fill_ace_for_sid(struct cifs_ace *pntace,
+static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
                        const struct cifs_sid *psid, __u64 nmode, umode_t bits)
 {
        int i;
@@ -358,11 +351,9 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
                return;
        }
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("DACL revision %d size %d num aces %d",
-        cFYI(1, ("DACL revision %d size %d num aces %d",
                le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
                le32_to_cpu(pdacl->num_aces)));
-#endif
        /* reset rwx permissions for user/group/other.
           Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -381,10 +372,6 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
                ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
                                GFP_KERNEL);
-/*              cifscred->cecount = pdacl->num_aces;
-                cifscred->aces = kmalloc(num_aces *
-                        sizeof(struct cifs_ace *), GFP_KERNEL);*/
                for (i = 0; i < num_aces; ++i) {
                        ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
 #ifdef CONFIG_CIFS_DEBUG2
@@ -424,7 +411,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
                        struct cifs_sid *pgrpsid, __u64 nmode)
 {
-        __le16 size = 0;
+        u16 size = 0;
        struct cifs_acl *pnndacl;
        pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl));
@@ -437,7 +424,7 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
                                         &sid_everyone, nmode, S_IRWXO);
        pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl));
-        pndacl->num_aces = 3;
+        pndacl->num_aces = cpu_to_le32(3);
        return (0);
 }
@@ -495,13 +482,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
                                le32_to_cpu(pntsd->gsidoffset));
        dacloffset = le32_to_cpu(pntsd->dacloffset);
        dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x "
-        cFYI(1, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x "
                 "sacloffset 0x%x dacloffset 0x%x",
                 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
                 le32_to_cpu(pntsd->gsidoffset),
                 le32_to_cpu(pntsd->sacloffset), dacloffset));
-#endif
 /*      cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
        rc = parse_sid(owner_sid_ptr, end_of_acl);
        if (rc)
@@ -571,9 +556,9 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 /* Retrieve an ACL from the server */
 static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
-                                       const char *path)
+                                       const char *path, const __u16 *pfid)
 {
-        struct cifsFileInfo *open_file;
+        struct cifsFileInfo *open_file = NULL;
        int unlock_file = FALSE;
        int xid;
        int rc = -EIO;
@@ -588,7 +573,11 @@ static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
                return NULL;
        xid = GetXid();
-        open_file = find_readable_file(CIFS_I(inode));
+        if (pfid == NULL)
+                open_file = find_readable_file(CIFS_I(inode));
+        else
+                fid = *pfid;
        sb = inode->i_sb;
        if (sb == NULL) {
                FreeXid(xid);
@@ -599,7 +588,7 @@ static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
        if (open_file) {
                unlock_file = TRUE;
                fid = open_file->netfid;
-        } else {
+        } else if (pfid == NULL) {
                int oplock = FALSE;
                /* open file */
                rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
@@ -615,10 +604,11 @@ static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
        rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
        cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
-        if (unlock_file == TRUE)
+        if (unlock_file == TRUE) /* find_readable_file increments ref count */
                atomic_dec(&open_file->wrtPending);
-        else
+        else if (pfid == NULL) /* if opened above we have to close the handle */
                CIFSSMBClose(xid, cifs_sb->tcon, fid);
+        /* else handle was passed in by caller */
        FreeXid(xid);
        return pntsd;
@@ -636,9 +626,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
        struct super_block *sb;
        struct cifs_sb_info *cifs_sb;
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
-        cFYI(1, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
-#endif
        if (!inode)
                return (rc);
@@ -669,9 +657,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
        }
        rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
-        cFYI(1, ("SetCIFSACL rc = %d", rc));
-#endif
        if (unlock_file == TRUE)
                atomic_dec(&open_file->wrtPending);
        else
@@ -683,16 +669,14 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 }
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-void acl_to_uid_mode(struct inode *inode, const char *path)
+void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid)
 {
        struct cifs_ntsd *pntsd = NULL;
        u32 acllen = 0;
        int rc = 0;
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("converting ACL to mode for %s", path));
-        cFYI(1, ("converting ACL to mode for %s", path));
+        pntsd = get_cifs_acl(&acllen, inode, path, pfid);
-#endif
-        pntsd = get_cifs_acl(&acllen, inode, path);
        /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
        if (pntsd)
@@ -712,12 +696,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
        struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
        struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("set ACL from mode for %s", path));
-        cFYI(1, ("set ACL from mode for %s", path));
-#endif
        /* Get the security descriptor */
-        pntsd = get_cifs_acl(&acllen, inode, path);
+        pntsd = get_cifs_acl(&acllen, inode, path, NULL);
        /* Add three ACEs for owner, group, everyone getting rid of
           other ACEs as chmod disables ACEs and set the security descriptor */
@@ -736,16 +718,12 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
                rc = build_sec_desc(pntsd, pnntsd, acllen, inode, nmode);
-#ifdef CONFIG_CIFS_DEBUG2
+                cFYI(DBG2, ("build_sec_desc rc: %d", rc));
-                cFYI(1, ("build_sec_desc rc: %d", rc));
-#endif
                if (!rc) {
                        /* Set the security descriptor */
                        rc = set_cifs_acl(pnntsd, acllen, inode, path);
-#ifdef CONFIG_CIFS_DEBUG2
+                        cFYI(DBG2, ("set_cifs_acl rc: %d", rc));
-                        cFYI(1, ("set_cifs_acl rc: %d", rc));
-#endif
                }
                kfree(pnntsd);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index fcc434227691..a04b17e5a9d0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -204,9 +204,8 @@ cifs_put_super(struct super_block *sb)
                return;
        }
        rc = cifs_umount(sb, cifs_sb);
-        if (rc) {
+        if (rc)
                cERROR(1, ("cifs_umount failed with return code %d", rc));
-        }
 #ifdef CONFIG_CIFS_DFS_UPCALL
        if (cifs_sb->mountdata) {
                kfree(cifs_sb->mountdata);
@@ -461,7 +460,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
 static struct quotactl_ops cifs_quotactl_ops = {
        .set_xquota     = cifs_xquota_set,
-        .get_xquota     = cifs_xquota_set,
+        .get_xquota     = cifs_xquota_get,
        .set_xstate     = cifs_xstate_set,
        .get_xstate     = cifs_xstate_get,
 };
@@ -472,9 +471,7 @@ static void cifs_umount_begin(struct vfsmount *vfsmnt, int flags)
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo *tcon;
-#ifdef CONFIG_CIFS_DFS_UPCALL
        dfs_shrink_umount_helper(vfsmnt);
-#endif /* CONFIG CIFS_DFS_UPCALL */
        if (!(flags & MNT_FORCE))
                return;
@@ -992,9 +989,7 @@ static int __init
 init_cifs(void)
 {
        int rc = 0;
-#ifdef CONFIG_PROC_FS
        cifs_proc_init();
-#endif
 /*      INIT_LIST_HEAD(&GlobalServerList);*/    /* BB not implemented yet */
        INIT_LIST_HEAD(&GlobalSMBSessionList);
        INIT_LIST_HEAD(&GlobalTreeConnectionList);
@@ -1095,19 +1090,15 @@ init_cifs(void)
 out_destroy_inodecache:
        cifs_destroy_inodecache();
 out_clean_proc:
-#ifdef CONFIG_PROC_FS
        cifs_proc_clean();
-#endif
        return rc;
 }
 static void __exit
 exit_cifs(void)
 {
-        cFYI(0, ("exit_cifs"));
+        cFYI(DBG2, ("exit_cifs"));
-#ifdef CONFIG_PROC_FS
        cifs_proc_clean();
-#endif
 #ifdef CONFIG_CIFS_DFS_UPCALL
        unregister_key_type(&key_type_dns_resolver);
 #endif
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5d32d8ddc82e..69a2e1942542 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -454,7 +454,7 @@ struct dir_notify_req {
 struct dfs_info3_param {
        int flags; /* DFSREF_REFERRAL_SERVER, DFSREF_STORAGE_SERVER*/
-        int PathConsumed;
+        int path_consumed;
        int server_type;
        int ref_flag;
        char *path_name;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 2f09f565a3d9..7e5e0e78cd72 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,8 +39,8 @@ extern int smb_send(struct socket *, struct smb_hdr *,
                        unsigned int /* length */ , struct sockaddr *);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
-#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__FUNCTION__, xid,current->fsuid));
+#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current->fsuid));
-#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__FUNCTION__,curr_xid,(int)rc));}
+#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
 extern char *build_path_from_dentry(struct dentry *);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
@@ -53,11 +53,11 @@ extern int SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
 extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
                        struct kvec *, int /* nvec to send */,
                        int * /* type of buf returned */ , const int flags);
-extern int SendReceiveBlockingLock(const unsigned int /* xid */ ,
+extern int SendReceiveBlockingLock(const unsigned int xid,
-                                        struct cifsTconInfo *,
+                        struct cifsTconInfo *ptcon,
-                                struct smb_hdr * /* input */ ,
+                        struct smb_hdr *in_buf ,
-                                struct smb_hdr * /* out */ ,
+                        struct smb_hdr *out_buf,
-                                int * /* bytes returned */);
+                        int *bytes_returned);
 extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
 extern int is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
@@ -84,7 +84,7 @@ extern __u16 GetNextMid(struct TCP_Server_Info *server);
 extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
                                                 struct cifsTconInfo *);
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
-extern struct timespec cifs_NTtimeToUnix(u64 /* utc nanoseconds since 1601 */ );
+extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
@@ -92,11 +92,12 @@ extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
 extern int cifs_get_inode_info(struct inode **pinode,
                        const unsigned char *search_path,
                        FILE_ALL_INFO * pfile_info,
-                        struct super_block *sb, int xid);
+                        struct super_block *sb, int xid, const __u16 *pfid);
 extern int cifs_get_inode_info_unix(struct inode **pinode,
                        const unsigned char *search_path,
                        struct super_block *sb, int xid);
-extern void acl_to_uid_mode(struct inode *inode, const char *search_path);
+extern void acl_to_uid_mode(struct inode *inode, const char *path,
+                            const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
@@ -104,7 +105,11 @@ extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
 #ifdef CONFIG_CIFS_DFS_UPCALL
 extern void dfs_shrink_umount_helper(struct vfsmount *vfsmnt);
-#endif
+#else
+static inline void dfs_shrink_umount_helper(struct vfsmount *vfsmnt)
+{
+}
+#endif /* DFS_UPCALL */
 void cifs_proc_init(void);
 void cifs_proc_clean(void);
@@ -175,11 +180,11 @@ extern int CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
                        struct kstatfs *FSData);
 extern int CIFSSMBSetTimes(const int xid, struct cifsTconInfo *tcon,
-                        const char *fileName, const FILE_BASIC_INFO * data,
+                        const char *fileName, const FILE_BASIC_INFO *data,
                        const struct nls_table *nls_codepage,
                        int remap_special_chars);
 extern int CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
-                        const FILE_BASIC_INFO * data, __u16 fid);
+                        const FILE_BASIC_INFO *data, __u16 fid);
 #if 0
 extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
                        char *fileName, __u16 dos_attributes,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9409524e4bf8..30bbe448e260 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/cifssmb.c
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   Contains the routines for constructing the SMB PDUs themselves
@@ -102,10 +102,12 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
           to this tcon */
 }
-/* If the return code is zero, this function must fill in request_buf pointer */
+/* Allocate and return pointer to an SMB request buffer, and set basic
+   SMB information in the SMB header.  If the return code is zero, this
+   function must have filled in request_buf pointer */
 static int
 small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
-         void **request_buf /* returned */)
+                void **request_buf)
 {
        int rc = 0;
@@ -363,7 +365,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
                *response_buf = *request_buf;
        header_assemble((struct smb_hdr *) *request_buf, smb_command, tcon,
-                        wct /*wct */ );
+                        wct);
        if (tcon != NULL)
                cifs_stats_inc(&tcon->num_smbs_sent);
@@ -523,7 +525,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                        if (remain >= (MIN_TZ_ADJ / 2))
                                result += MIN_TZ_ADJ;
                        if (val < 0)
-                                result = - result;
+                                result = -result;
                        server->timeAdj = result;
                } else {
                        server->timeAdj = (int)tmp;
@@ -600,7 +602,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
        server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
                        (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
        server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
-        cFYI(0, ("Max buf = %d", ses->server->maxBuf));
+        cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
        GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
        server->capabilities = le32_to_cpu(pSMBr->Capabilities);
        server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
@@ -868,9 +870,8 @@ PsxDelete:
        pSMB->ByteCount = cpu_to_le16(byte_count);
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Posix delete returned %d", rc));
-        }
        cifs_buf_release(pSMB);
        cifs_stats_inc(&tcon->num_deletes);
@@ -916,9 +917,8 @@ DelFileRetry:
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_deletes);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Error in RMFile = %d", rc));
-        }
        cifs_buf_release(pSMB);
        if (rc == -EAGAIN)
@@ -961,9 +961,8 @@ RmDirRetry:
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_rmdirs);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Error in RMDir = %d", rc));
-        }
        cifs_buf_release(pSMB);
        if (rc == -EAGAIN)
@@ -1005,9 +1004,8 @@ MkDirRetry:
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_mkdirs);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Error in Mkdir = %d", rc));
-        }
        cifs_buf_release(pSMB);
        if (rc == -EAGAIN)
@@ -1017,7 +1015,7 @@ MkDirRetry:
 int
 CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
-                __u64 mode, __u16 * netfid, FILE_UNIX_BASIC_INFO *pRetData,
+                __u64 mode, __u16 *netfid, FILE_UNIX_BASIC_INFO *pRetData,
                __u32 *pOplock, const char *name,
                const struct nls_table *nls_codepage, int remap)
 {
@@ -1027,8 +1025,8 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
        int rc = 0;
        int bytes_returned = 0;
        __u16 params, param_offset, offset, byte_count, count;
-        OPEN_PSX_REQ * pdata;
+        OPEN_PSX_REQ *pdata;
-        OPEN_PSX_RSP * psx_rsp;
+        OPEN_PSX_RSP *psx_rsp;
        cFYI(1, ("In POSIX Create"));
 PsxCreat:
@@ -1110,9 +1108,7 @@ PsxCreat:
        /* check to make sure response data is there */
        if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
                pRetData->Type = cpu_to_le32(-1); /* unknown */
-#ifdef CONFIG_CIFS_DEBUG2
+                cFYI(DBG2, ("unknown type"));
-                cFYI(1, ("unknown type"));
-#endif
        } else {
                if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
                                        + sizeof(FILE_UNIX_BASIC_INFO)) {
@@ -1169,8 +1165,8 @@ static __u16 convert_disposition(int disposition)
 int
 SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
            const char *fileName, const int openDisposition,
-            const int access_flags, const int create_options, __u16 * netfid,
+            const int access_flags, const int create_options, __u16 *netfid,
-            int *pOplock, FILE_ALL_INFO * pfile_info,
+            int *pOplock, FILE_ALL_INFO *pfile_info,
            const struct nls_table *nls_codepage, int remap)
 {
        int rc = -EACCES;
@@ -1221,8 +1217,8 @@ OldOpenRetry:
        if (create_options & CREATE_OPTION_SPECIAL)
                pSMB->FileAttributes = cpu_to_le16(ATTR_SYSTEM);
-        else
+        else /* BB FIXME BB */
-                pSMB->FileAttributes = cpu_to_le16(0/*ATTR_NORMAL*/); /* BB FIXME */
+                pSMB->FileAttributes = cpu_to_le16(0/*ATTR_NORMAL*/);
        /* if ((omode & S_IWUGO) == 0)
                pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY);*/
@@ -1284,8 +1280,8 @@ OldOpenRetry:
 int
 CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
            const char *fileName, const int openDisposition,
-            const int access_flags, const int create_options, __u16 * netfid,
+            const int access_flags, const int create_options, __u16 *netfid,
-            int *pOplock, FILE_ALL_INFO * pfile_info,
+            int *pOplock, FILE_ALL_INFO *pfile_info,
            const struct nls_table *nls_codepage, int remap)
 {
        int rc = -EACCES;
@@ -1556,9 +1552,9 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
        } /* else setting file size with write of zero bytes */
        if (wct == 14)
                byte_count = bytes_sent + 1; /* pad */
-        else /* wct == 12 */ {
+        else /* wct == 12 */
                byte_count = bytes_sent + 5; /* bigger pad, smaller smb hdr */
-        }
        pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF);
        pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16);
        pSMB->hdr.smb_buf_length += byte_count;
@@ -1663,7 +1659,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
                rc = -EIO;
                *nbytes = 0;
        } else {
-                WRITE_RSP * pSMBr = (WRITE_RSP *)iov[0].iov_base;
+                WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base;
                *nbytes = le16_to_cpu(pSMBr->CountHigh);
                *nbytes = (*nbytes) << 16;
                *nbytes += le16_to_cpu(pSMBr->Count);
@@ -1744,9 +1740,8 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
                /* SMB buffer freed by function above */
        }
        cifs_stats_inc(&tcon->num_locks);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Send error in Lock = %d", rc));
-        }
        /* Note: On -EAGAIN error only caller can retry on handle based calls
        since file handle passed in no longer valid */
@@ -1791,7 +1786,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
        count = sizeof(struct cifs_posix_lock);
        pSMB->MaxParameterCount = cpu_to_le16(2);
-        pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB PDU from sess */
+        pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */
        pSMB->SetupCount = 1;
        pSMB->Reserved3 = 0;
        if (get_flag)
@@ -1972,9 +1967,8 @@ renameRetry:
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_renames);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Send error in rename = %d", rc));
-        }
        cifs_buf_release(pSMB);
@@ -2016,7 +2010,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
        data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
        rename_info = (struct set_file_rename *) data_offset;
        pSMB->MaxParameterCount = cpu_to_le16(2);
-        pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB PDU from sess */
+        pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB from sess */
        pSMB->SetupCount = 1;
        pSMB->Reserved3 = 0;
        pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
@@ -2052,9 +2046,8 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
        rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&pTcon->num_t2renames);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Send error in Rename (by file handle) = %d", rc));
-        }
        cifs_buf_release(pSMB);
@@ -2211,9 +2204,8 @@ createSymLinkRetry:
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_symlinks);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc));
-        }
        if (pSMB)
                cifs_buf_release(pSMB);
@@ -2299,9 +2291,8 @@ createHardLinkRetry:
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_hardlinks);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Send error in SetPathInfo (hard link) = %d", rc));
-        }
        cifs_buf_release(pSMB);
        if (rc == -EAGAIN)
@@ -2370,9 +2361,9 @@ winCreateHardLinkRetry:
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        cifs_stats_inc(&tcon->num_hardlinks);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Send error in hard link (NT rename) = %d", rc));
-        }
        cifs_buf_release(pSMB);
        if (rc == -EAGAIN)
                goto winCreateHardLinkRetry;
@@ -2968,9 +2959,8 @@ setAclRetry:
        pSMB->ByteCount = cpu_to_le16(byte_count);
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Set POSIX ACL returned %d", rc));
-        }
 setACLerrorExit:
        cifs_buf_release(pSMB);
@@ -2982,7 +2972,7 @@ setACLerrorExit:
 /* BB fix tabs in this function FIXME BB */
 int
 CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
-               const int netfid, __u64 * pExtAttrBits, __u64 *pMask)
+               const int netfid, __u64 *pExtAttrBits, __u64 *pMask)
 {
        int rc = 0;
        struct smb_t2_qfi_req *pSMB = NULL;
@@ -3000,7 +2990,7 @@ GetExtAttrRetry:
        if (rc)
                return rc;
-        params = 2 /* level */ +2 /* fid */;
+        params = 2 /* level */ + 2 /* fid */;
        pSMB->t2.TotalDataCount = 0;
        pSMB->t2.MaxParameterCount = cpu_to_le16(4);
        /* BB find exact max data count below from sess structure BB */
@@ -3071,7 +3061,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 {
        int rc = 0;
        int buf_type = 0;
-        QUERY_SEC_DESC_REQ * pSMB;
+        QUERY_SEC_DESC_REQ *pSMB;
        struct kvec iov[1];
        cFYI(1, ("GetCifsACL"));
@@ -3101,7 +3091,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
        if (rc) {
                cFYI(1, ("Send error in QuerySecDesc = %d", rc));
        } else {                /* decode response */
-                __le32 * parm;
+                __le32 *parm;
                __u32 parm_len;
                __u32 acl_len;
                struct smb_com_ntransact_rsp *pSMBr;
@@ -3230,8 +3220,8 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
                        FILE_ALL_INFO *pFinfo,
                        const struct nls_table *nls_codepage, int remap)
 {
-        QUERY_INFORMATION_REQ * pSMB;
+        QUERY_INFORMATION_REQ *pSMB;
-        QUERY_INFORMATION_RSP * pSMBr;
+        QUERY_INFORMATION_RSP *pSMBr;
        int rc = 0;
        int bytes_returned;
        int name_len;
@@ -3263,9 +3253,11 @@ QInfRetry:
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
        if (rc) {
                cFYI(1, ("Send error in QueryInfo = %d", rc));
-        } else if (pFinfo) {            /* decode response */
+        } else if (pFinfo) {
                struct timespec ts;
                __u32 time = le32_to_cpu(pSMBr->last_write_time);
+                /* decode response */
                /* BB FIXME - add time zone adjustment BB */
                memset(pFinfo, 0, sizeof(FILE_ALL_INFO));
                ts.tv_nsec = 0;
@@ -3296,7 +3288,7 @@ QInfRetry:
 int
 CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
                 const unsigned char *searchName,
-                 FILE_ALL_INFO * pFindData,
+                 FILE_ALL_INFO *pFindData,
                 int legacy /* old style infolevel */,
                 const struct nls_table *nls_codepage, int remap)
 {
@@ -3371,10 +3363,12 @@ QPathInfoRetry:
                else if (pFindData) {
                        int size;
                        __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-                        if (legacy) /* we do not read the last field, EAsize,
-                                       fortunately since it varies by subdialect
+                        /* On legacy responses we do not read the last field,
-                                       and on Set vs. Get, is two bytes or 4
+                        EAsize, fortunately since it varies by subdialect and
-                                       bytes depending but we don't care here */
+                        also note it differs on Set vs. Get, ie two bytes or 4
+                        bytes depending but we don't care here */
+                        if (legacy)
                                size = sizeof(FILE_INFO_STANDARD);
                        else
                                size = sizeof(FILE_ALL_INFO);
@@ -3476,85 +3470,6 @@ UnixQPathInfoRetry:
        return rc;
 }
-#if 0  /* function unused at present */
-int CIFSFindSingle(const int xid, struct cifsTconInfo *tcon,
-               const char *searchName, FILE_ALL_INFO * findData,
-               const struct nls_table *nls_codepage)
-{
-/* level 257 SMB_ */
-        TRANSACTION2_FFIRST_REQ *pSMB = NULL;
-        TRANSACTION2_FFIRST_RSP *pSMBr = NULL;
-        int rc = 0;
-        int bytes_returned;
-        int name_len;
-        __u16 params, byte_count;
-        cFYI(1, ("In FindUnique"));
-findUniqueRetry:
-        rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
-                      (void **) &pSMBr);
-        if (rc)
-                return rc;
-        if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-                name_len =
-                    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-                                     PATH_MAX, nls_codepage);
-                name_len++;     /* trailing null */
-                name_len *= 2;
-        } else {        /* BB improve the check for buffer overruns BB */
-                name_len = strnlen(searchName, PATH_MAX);
-                name_len++;     /* trailing null */
-                strncpy(pSMB->FileName, searchName, name_len);
-        }
-        params = 12 + name_len /* includes null */ ;
-        pSMB->TotalDataCount = 0;       /* no EAs */
-        pSMB->MaxParameterCount = cpu_to_le16(2);
-        pSMB->MaxDataCount = cpu_to_le16(4000); /* BB find exact max SMB PDU from sess structure BB */
-        pSMB->MaxSetupCount = 0;
-        pSMB->Reserved = 0;
-        pSMB->Flags = 0;
-        pSMB->Timeout = 0;
-        pSMB->Reserved2 = 0;
-        pSMB->ParameterOffset = cpu_to_le16(
-         offsetof(struct smb_com_transaction2_ffirst_req, InformationLevel)-4);
-        pSMB->DataCount = 0;
-        pSMB->DataOffset = 0;
-        pSMB->SetupCount = 1;   /* one byte, no need to le convert */
-        pSMB->Reserved3 = 0;
-        pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_FIRST);
-        byte_count = params + 1 /* pad */ ;
-        pSMB->TotalParameterCount = cpu_to_le16(params);
-        pSMB->ParameterCount = pSMB->TotalParameterCount;
-        pSMB->SearchAttributes =
-            cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM |
-                        ATTR_DIRECTORY);
-        pSMB->SearchCount = cpu_to_le16(16);    /* BB increase */
-        pSMB->SearchFlags = cpu_to_le16(1);
-        pSMB->InformationLevel = cpu_to_le16(SMB_FIND_FILE_DIRECTORY_INFO);
-        pSMB->SearchStorageType = 0;    /* BB what should we set this to? BB */
-        pSMB->hdr.smb_buf_length += byte_count;
-        pSMB->ByteCount = cpu_to_le16(byte_count);
-        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-        if (rc) {
-                cFYI(1, ("Send error in FindFileDirInfo = %d", rc));
-        } else {                /* decode response */
-                cifs_stats_inc(&tcon->num_ffirst);
-                /* BB fill in */
-        }
-        cifs_buf_release(pSMB);
-        if (rc == -EAGAIN)
-                goto findUniqueRetry;
-        return rc;
-}
-#endif /* end unused (temporarily) function */
 /* xid, tcon, searchName and codepage are input parms, rest are returned */
 int
 CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
@@ -3566,7 +3481,7 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
 /* level 257 SMB_ */
        TRANSACTION2_FFIRST_REQ *pSMB = NULL;
        TRANSACTION2_FFIRST_RSP *pSMBr = NULL;
-        T2_FFIRST_RSP_PARMS * parms;
+        T2_FFIRST_RSP_PARMS *parms;
        int rc = 0;
        int bytes_returned = 0;
        int name_len;
@@ -3697,7 +3612,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 {
        TRANSACTION2_FNEXT_REQ *pSMB = NULL;
        TRANSACTION2_FNEXT_RSP *pSMBr = NULL;
-        T2_FNEXT_RSP_PARMS * parms;
+        T2_FNEXT_RSP_PARMS *parms;
        char *response_data;
        int rc = 0;
        int bytes_returned, name_len;
@@ -3836,9 +3751,9 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
        pSMB->FileID = searchHandle;
        pSMB->ByteCount = 0;
        rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
-        if (rc) {
+        if (rc)
                cERROR(1, ("Send error in FindClose = %d", rc));
-        }
        cifs_stats_inc(&tcon->num_fclose);
        /* Since session is dead, search handle closed on server already */
@@ -3851,7 +3766,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 int
 CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
                      const unsigned char *searchName,
-                      __u64 * inode_number,
+                      __u64 *inode_number,
                      const struct nls_table *nls_codepage, int remap)
 {
        int rc = 0;
@@ -4560,9 +4475,8 @@ SETFSUnixRetry:
                cERROR(1, ("Send error in SETFSUnixInfo = %d", rc));
        } else {                /* decode response */
                rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-                if (rc) {
+                if (rc)
                        rc = -EIO;      /* bad smb */
-                }
        }
        cifs_buf_release(pSMB);
@@ -4744,9 +4658,8 @@ SetEOFRetry:
        pSMB->ByteCount = cpu_to_le16(byte_count);
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-        if (rc) {
+        if (rc)
                cFYI(1, ("SetPathInfo (file size) returned %d", rc));
-        }
        cifs_buf_release(pSMB);
@@ -4897,9 +4810,8 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
        pSMB->ByteCount = cpu_to_le16(byte_count);
        memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
        rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
-        }
        /* Note: On -EAGAIN error only caller can retry on handle based calls
                since file handle passed in no longer valid */
@@ -4975,9 +4887,8 @@ SetTimesRetry:
        pSMB->ByteCount = cpu_to_le16(byte_count);
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-        if (rc) {
+        if (rc)
                cFYI(1, ("SetPathInfo (times) returned %d", rc));
-        }
        cifs_buf_release(pSMB);
@@ -5027,9 +4938,8 @@ SetAttrLgcyRetry:
        pSMB->ByteCount = cpu_to_le16(name_len + 1);
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-        if (rc) {
+        if (rc)
                cFYI(1, ("Error in LegacySetAttr = %d", rc));
-        }
        cifs_buf_release(pSMB);
@@ -5138,9 +5048,8 @@ setPermsRetry:
        pSMB->ByteCount = cpu_to_le16(byte_count);
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-        if (rc) {
+        if (rc)
                cFYI(1, ("SetPathInfo (perms) returned %d", rc));
-        }
        if (pSMB)
                cifs_buf_release(pSMB);
@@ -5615,9 +5524,8 @@ SetEARetry:
        pSMB->ByteCount = cpu_to_le16(byte_count);
        rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
                         (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-        if (rc) {
+        if (rc)
                cFYI(1, ("SetPathInfo (EA) returned %d", rc));
-        }
        cifs_buf_release(pSMB);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 65d0ba72e78f..8dbfa97cd18c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1722,8 +1722,15 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
                           originally at mount time */
                        if ((saved_cap & CIFS_UNIX_POSIX_ACL_CAP) == 0)
                                cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
-                        if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0)
+                        if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
+                                if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
+                                        cERROR(1, ("POSIXPATH support change"));
                                cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
+                        } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
+                                cERROR(1, ("possible reconnect error"));
+                                cERROR(1,
+                                        ("server disabled POSIX path support"));
+                        }
                }
                cap &= CIFS_UNIX_CAP_MASK;
@@ -1753,9 +1760,8 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
                if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
                        if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
                                CIFS_SB(sb)->rsize = 127 * 1024;
-#ifdef CONFIG_CIFS_DEBUG2
+                                cFYI(DBG2,
-                                cFYI(1, ("larger reads not supported by srv"));
+                                        ("larger reads not supported by srv"));
-#endif
                        }
                }
@@ -1792,6 +1798,26 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
        }
 }
+/*
+ * Rewrite the directory separators in @path, in place, so that they match
+ * @delim.  When @delim is '/', every backslash in the string is replaced;
+ * for any other @delim, every forward slash is replaced.  The string length
+ * never changes.  A NULL @path is accepted and treated as a no-op.
+ */
+static void
+convert_delimiter(char *path, char delim)
+{
+        int i;
+        char old_delim;
+        /* callers may hand us a NULL path; nothing to convert then */
+        if (path == NULL)
+                return;
+        /* the separator to be replaced is the opposite of the requested one */
+        if (delim == '/') 
+                old_delim = '\\';
+        else
+                old_delim = '/';
+        /* walk up to the terminating NUL, swapping separators as we go */
+        for (i = 0; path[i] != '\0'; i++) {
+                if (path[i] == old_delim)
+                        path[i] = delim;
+        }
+}
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
           char *mount_data, const char *devname)
@@ -2057,7 +2083,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                cifs_sb->prepath = volume_info.prepath;
                if (cifs_sb->prepath) {
                        cifs_sb->prepathlen = strlen(cifs_sb->prepath);
-                        cifs_sb->prepath[0] = CIFS_DIR_SEP(cifs_sb);
+                        /* we can not convert the / to \ in the path
+                        separators in the prefixpath yet because we do not
+                        know (until reset_cifs_unix_caps is called later)
+                        whether POSIX PATH CAP is available. We normalize
+                        the / to \ after reset_cifs_unix_caps is called */
                        volume_info.prepath = NULL;
                } else
                        cifs_sb->prepathlen = 0;
@@ -2225,11 +2255,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                else
                        tcon->unix_ext = 0; /* server does not support them */
+                /* convert forward to back slashes in prepath here if needed */
+                if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
+                        convert_delimiter(cifs_sb->prepath,
+                                          CIFS_DIR_SEP(cifs_sb));
                if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
                        cifs_sb->rsize = 1024 * 127;
-#ifdef CONFIG_CIFS_DEBUG2
+                        cFYI(DBG2,
-                        cFYI(1, ("no very large read support, rsize now 127K"));
+                                ("no very large read support, rsize now 127K"));
-#endif
                }
                if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
                        cifs_sb->wsize = min(cifs_sb->wsize,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 699ec1198409..0f5c62ba4038 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -3,7 +3,7 @@
 *
 *   vfs operations that deal with dentries
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   This library is free software; you can redistribute it and/or modify
@@ -111,16 +111,6 @@ cifs_bp_rename_retry:
        return full_path;
 }
-/* char * build_wildcard_path_from_dentry(struct dentry *direntry)
-{
-        if(full_path == NULL)
-                return full_path;
-        full_path[namelen] = '\\';
-        full_path[namelen+1] = '*';
-        full_path[namelen+2] = 0;
-BB remove above eight lines BB */
 /* Inode operations in similar order to how they appear in Linux file fs.h */
 int
@@ -171,9 +161,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                        disposition = FILE_OVERWRITE_IF;
                else if ((oflags & O_CREAT) == O_CREAT)
                        disposition = FILE_OPEN_IF;
-                else {
+                else
                        cFYI(1, ("Create flag not set in create function"));
-                }
        }
        /* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -240,7 +229,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                                                 inode->i_sb, xid);
                else {
                        rc = cifs_get_inode_info(&newinode, full_path,
-                                                 buf, inode->i_sb, xid);
+                                                 buf, inode->i_sb, xid,
+                                                 &fileHandle);
                        if (newinode) {
                                newinode->i_mode = mode;
                                if ((oplock & CIFS_CREATE_ACTION) &&
@@ -367,7 +357,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
                        int oplock = 0;
                        u16 fileHandle;
-                        FILE_ALL_INFO * buf;
+                        FILE_ALL_INFO *buf;
                        cFYI(1, ("sfu compat create special file"));
@@ -494,7 +484,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
                                              parent_dir_inode->i_sb, xid);
        else
                rc = cifs_get_inode_info(&newInode, full_path, NULL,
-                                         parent_dir_inode->i_sb, xid);
+                                         parent_dir_inode->i_sb, xid, NULL);
        if ((rc == 0) && (newInode != NULL)) {
                if (pTcon->nocase)
@@ -534,9 +524,8 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
        int isValid = 1;
        if (direntry->d_inode) {
-                if (cifs_revalidate(direntry)) {
+                if (cifs_revalidate(direntry))
                        return 0;
-                }
        } else {
                cFYI(1, ("neg dentry 0x%p name = %s",
                         direntry, direntry->d_name.name));
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index ef7f43824347..7cc86c418182 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -77,14 +77,14 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
        /* search for server name delimiter */
        len = strlen(unc);
        if (len < 3) {
-                cFYI(1, ("%s: unc is too short: %s", __FUNCTION__, unc));
+                cFYI(1, ("%s: unc is too short: %s", __func__, unc));
                return -EINVAL;
        }
        len -= 2;
        name = memchr(unc+2, '\\', len);
        if (!name) {
                cFYI(1, ("%s: probably server name is whole unc: %s",
-                                        __FUNCTION__, unc));
+                                        __func__, unc));
        } else {
                len = (name - unc) - 2/* leading // */;
        }
@@ -104,7 +104,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
                if (*ip_addr) {
                        memcpy(*ip_addr, rkey->payload.data, len);
                        (*ip_addr)[len] = '\0';
-                        cFYI(1, ("%s: resolved: %s to %s", __FUNCTION__,
+                        cFYI(1, ("%s: resolved: %s to %s", __func__,
                                        rkey->description,
                                        *ip_addr
                                ));
@@ -114,7 +114,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
                }
                key_put(rkey);
        } else {
-                cERROR(1, ("%s: unable to resolve: %s", __FUNCTION__, name));
+                cERROR(1, ("%s: unable to resolve: %s", __func__, name));
        }
        kfree(name);
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 073fdc3db419..966e9288930b 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS
 *                            Handles host name to IP address resolution
- * 
+ *
 *   Copyright (c) International Business Machines  Corp., 2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
index 995474c90885..7d1d5aa4c430 100644
--- a/fs/cifs/fcntl.c
+++ b/fs/cifs/fcntl.c
@@ -35,9 +35,8 @@ static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
        /* No way on Linux VFS to ask to monitor xattr
        changes (and no stream support either */
-        if (fcntl_notify_flags & DN_ACCESS) {
+        if (fcntl_notify_flags & DN_ACCESS)
                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
-        }
        if (fcntl_notify_flags & DN_MODIFY) {
                /* What does this mean on directories? */
                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
@@ -47,9 +46,8 @@ static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
                        FILE_NOTIFY_CHANGE_LAST_WRITE;
        }
-        if (fcntl_notify_flags & DN_DELETE) {
+        if (fcntl_notify_flags & DN_DELETE)
                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
-        }
        if (fcntl_notify_flags & DN_RENAME) {
                /* BB review this - checking various server behaviors */
                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5f7c374ae89c..40b690073fc1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -145,7 +145,7 @@ client_can_cache:
                        full_path, inode->i_sb, xid);
        else
                rc = cifs_get_inode_info(&file->f_path.dentry->d_inode,
-                        full_path, buf, inode->i_sb, xid);
+                        full_path, buf, inode->i_sb, xid, NULL);
        if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
                pCifsInode->clientCanCacheAll = TRUE;
@@ -353,9 +353,9 @@ static int cifs_reopen_file(struct file *file, int can_flush)
        int disposition = FILE_OPEN;
        __u16 netfid;
-        if (file->private_data) {
+        if (file->private_data)
                pCifsFile = (struct cifsFileInfo *)file->private_data;
-        } else
+        else
                return -EBADF;
        xid = GetXid();
@@ -440,7 +440,7 @@ reopen_error_exit:
                                else
                                        rc = cifs_get_inode_info(&inode,
                                                full_path, NULL, inode->i_sb,
-                                                xid);
+                                                xid, NULL);
                        } /* else we are writing out data to server already
                             and could deadlock if we tried to flush data, and
                             since we do not know if we have data that would
@@ -499,9 +499,8 @@ int cifs_close(struct inode *inode, struct file *file)
                                        the struct would be in each open file,
                                        but this should give enough time to
                                        clear the socket */
-#ifdef CONFIG_CIFS_DEBUG2
+                                        cFYI(DBG2,
-                                        cFYI(1, ("close delay, write pending"));
+                                                ("close delay, write pending"));
-#endif /* DEBUG2 */
                                        msleep(timeout);
                                        timeout *= 4;
                                }
@@ -1423,9 +1422,8 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
        xid = GetXid();
 /* BB add check for wbc flags */
        page_cache_get(page);
-        if (!PageUptodate(page)) {
+        if (!PageUptodate(page))
                cFYI(1, ("ppw - page not up to date"));
-        }
        /*
         * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1460,9 +1458,9 @@ static int cifs_commit_write(struct file *file, struct page *page,
        cFYI(1, ("commit write for page %p up to position %lld for %d",
                 page, position, to));
        spin_lock(&inode->i_lock);
-        if (position > inode->i_size) {
+        if (position > inode->i_size)
                i_size_write(inode, position);
-        }
        spin_unlock(&inode->i_lock);
        if (!PageUptodate(page)) {
                position =  ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset;
@@ -1596,9 +1594,9 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
        }
        open_file = (struct cifsFileInfo *)file->private_data;
-        if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
+        if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                cFYI(1, ("attempting read on write only file instance"));
-        }
        for (total_read = 0, current_offset = read_data;
             read_size > total_read;
             total_read += bytes_read, current_offset += bytes_read) {
@@ -1625,9 +1623,8 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
                                                smb_read_data +
                                                4 /* RFC1001 length field */ +
                                                le16_to_cpu(pSMBr->DataOffset),
-                                                bytes_read)) {
+                                                bytes_read))
                                        rc = -EFAULT;
-                                }
                                if (buf_type == CIFS_SMALL_BUFFER)
                                        cifs_small_buf_release(smb_read_data);
@@ -1814,9 +1811,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
        pTcon = cifs_sb->tcon;
        pagevec_init(&lru_pvec, 0);
-#ifdef CONFIG_CIFS_DEBUG2
+                cFYI(DBG2, ("rpages: num pages %d", num_pages));
-                cFYI(1, ("rpages: num pages %d", num_pages));
-#endif
        for (i = 0; i < num_pages; ) {
                unsigned contig_pages;
                struct page *tmp_page;
@@ -1849,10 +1844,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
                /* Read size needs to be in multiples of one page */
                read_size = min_t(const unsigned int, read_size,
                                  cifs_sb->rsize & PAGE_CACHE_MASK);
-#ifdef CONFIG_CIFS_DEBUG2
+                cFYI(DBG2, ("rpages: read size 0x%x  contiguous pages %d",
-                cFYI(1, ("rpages: read size 0x%x  contiguous pages %d",
                                read_size, contig_pages));
-#endif
                rc = -EAGAIN;
                while (rc == -EAGAIN) {
                        if ((open_file->invalidHandle) &&
@@ -2026,7 +2019,7 @@ int is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
                struct cifs_sb_info *cifs_sb;
                cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb);
-                if ( cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO ) {
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
                        /* since no page cache to corrupt on directio
                        we can change size safely */
                        return 1;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b1a4a65eaa08..bc673c8c1e6b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -29,6 +29,162 @@
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+/*
+ * Install the inode, file and address-space operation tables that match
+ * the file type already stored in inode->i_mode.
+ *
+ * inode:           inode to set up; i_mode must be filled in before the
+ *                  call, and i_rdev as well for special files, since both
+ *                  are read here.
+ * is_dfs_referral: when CONFIG_CIFS_DFS_UPCALL is defined and this is true,
+ *                  a directory gets the DFS referral inode operations
+ *                  instead of the normal directory operations.  Ignored
+ *                  when CONFIG_CIFS_DFS_UPCALL is not defined.
+ *
+ * Regular files: i_fop is chosen from the CIFS_MOUNT_DIRECT_IO and
+ * CIFS_MOUNT_NO_BRL mount flags, and a_ops from the server's maxBuf.
+ * Symlinks only get i_op.  Any other type is handed to
+ * init_special_inode().
+ */
+static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
+{
+        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFREG:
+                inode->i_op = &cifs_file_inode_ops;
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+                                inode->i_fop = &cifs_file_direct_nobrl_ops;
+                        else
+                                inode->i_fop = &cifs_file_direct_ops;
+                } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+                        inode->i_fop = &cifs_file_nobrl_ops;
+                else { /* not direct, send byte range locks */
+                        inode->i_fop = &cifs_file_ops;
+                }
+                /*
+                 * Check if the server can support readpages: a server
+                 * buffer smaller than one page plus the CIFS header gets
+                 * the "smallbuf" address-space operations.
+                 */
+                if (cifs_sb->tcon->ses->server->maxBuf <
+                                PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
+                        inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+                else
+                        inode->i_data.a_ops = &cifs_addr_ops;
+                break;
+        case S_IFDIR:
+                /*
+                 * The preprocessor blocks below share the closing brace
+                 * of the "plain directory" branch.  Note that the DFS
+                 * referral branch sets i_op only and leaves i_fop as is.
+                 */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+                if (is_dfs_referral) {
+                        inode->i_op = &cifs_dfs_referral_inode_operations;
+                } else {
+#else /* NO DFS support, treat as a directory */
+                {
+#endif
+                        inode->i_op = &cifs_dir_inode_ops;
+                        inode->i_fop = &cifs_dir_ops;
+                }
+                break;
+        case S_IFLNK:
+                inode->i_op = &cifs_symlink_inode_ops;
+                break;
+        default:
+                /* char/block devices, FIFOs, sockets and anything else */
+                init_special_inode(inode, inode->i_mode, inode->i_rdev);
+                break;
+        }
+}
+/*
+ * Copy the attributes in a FILE_UNIX_BASIC_INFO reply into an inode.
+ *
+ * inode:         inode to update.
+ * info:          server reply; its fields are converted with
+ *                le64_to_cpu()/le32_to_cpu() before use.
+ * force_uid_gid: when non-zero, always take the owner and group from
+ *                info.  When zero, the CIFS_MOUNT_OVERR_UID and
+ *                CIFS_MOUNT_OVERR_GID mount flags make the mount's
+ *                mnt_uid/mnt_gid win instead.
+ *
+ * Sets the three timestamps, i_mode (permission bits plus the type derived
+ * from info->Type), i_rdev for device nodes, i_uid, i_gid and i_nlink.
+ * i_size and i_blocks are written under inode->i_lock, and only when
+ * is_size_safe_to_change() allows it.
+ */
+static void cifs_unix_info_to_inode(struct inode *inode,
+                FILE_UNIX_BASIC_INFO *info, int force_uid_gid)
+{
+        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+        struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
+        __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes);
+        __u64 end_of_file = le64_to_cpu(info->EndOfFile);
+        inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime));
+        inode->i_mtime =
+                cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime));
+        inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange));
+        inode->i_mode = le64_to_cpu(info->Permissions);
+        /*
+         * The file type is set from info->Type below, so clear any type
+         * bits that came in with the permissions to avoid strange results.
+         */
+        inode->i_mode &= ~S_IFMT;
+        switch (le32_to_cpu(info->Type)) {
+        case UNIX_FILE:
+                inode->i_mode |= S_IFREG;
+                break;
+        case UNIX_SYMLINK:
+                inode->i_mode |= S_IFLNK;
+                break;
+        case UNIX_DIR:
+                inode->i_mode |= S_IFDIR;
+                break;
+        case UNIX_CHARDEV:
+                inode->i_mode |= S_IFCHR;
+                inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+                                      le64_to_cpu(info->DevMinor) & MINORMASK);
+                break;
+        case UNIX_BLOCKDEV:
+                inode->i_mode |= S_IFBLK;
+                inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor),
+                                      le64_to_cpu(info->DevMinor) & MINORMASK);
+                break;
+        case UNIX_FIFO:
+                inode->i_mode |= S_IFIFO;
+                break;
+        case UNIX_SOCKET:
+                inode->i_mode |= S_IFSOCK;
+                break;
+        default:
+                /* safest to call it a file if we do not know */
+                inode->i_mode |= S_IFREG;
+                cFYI(1, ("unknown type %d", le32_to_cpu(info->Type)));
+                break;
+        }
+        /* mount-wide uid override applies unless the caller forces info's */
+        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) &&
+            !force_uid_gid)
+                inode->i_uid = cifs_sb->mnt_uid;
+        else
+                inode->i_uid = le64_to_cpu(info->Uid);
+        /* same rule for the group */
+        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) &&
+            !force_uid_gid)
+                inode->i_gid = cifs_sb->mnt_gid;
+        else
+                inode->i_gid = le64_to_cpu(info->Gid);
+        inode->i_nlink = le64_to_cpu(info->Nlinks);
+        spin_lock(&inode->i_lock);
+        if (is_size_safe_to_change(cifsInfo, end_of_file)) {
+                /*
+                 * The size is only updated when the check above passes:
+                 * we cannot safely change the file size if the client
+                 * is writing to it, due to potential races.
+                 */
+                i_size_write(inode, end_of_file);
+                /*
+                 * i_blocks is not related to (i_size / i_blksize);
+                 * it counts 512 byte (2**9) units, so round the
+                 * allocation size up to the next multiple of 512.
+                 */
+                inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
+        }
+        spin_unlock(&inode->i_lock);
+}
+/*
+ * Return the path to use when querying the server about search_path.
+ *
+ * If the share is not flagged SMB_SHARE_IS_IN_DFS, search_path itself is
+ * returned.  Otherwise a new kmalloc()ed string is returned that holds
+ * pTcon->treeName immediately followed by search_path.
+ *
+ * Ownership: the caller must kfree() the result if, and only if, it
+ * differs from search_path.
+ *
+ * NOTE(review): an allocation failure also returns search_path, so the
+ * caller cannot tell it apart from the non-DFS case and will query with
+ * the share-relative path.
+ * NOTE(review): search_path is a const char * returned through a
+ * const unsigned char * return type; the pointer signedness differs.
+ */
+static const unsigned char *cifs_get_search_path(struct cifsTconInfo *pTcon,
+                                        const char *search_path)
+{
+        int tree_len;
+        int path_len;
+        char *tmp_path;
+        if (!(pTcon->Flags & SMB_SHARE_IS_IN_DFS))
+                return search_path;
+        /* use full path name for working with DFS */
+        /* lengths are capped, so over-long inputs are truncated */
+        tree_len = strnlen(pTcon->treeName, MAX_TREE_SIZE + 1);
+        path_len = strnlen(search_path, MAX_PATHCONF);
+        tmp_path = kmalloc(tree_len+path_len+1, GFP_KERNEL);
+        if (tmp_path == NULL)
+                return search_path;
+        /*
+         * strncpy() with the measured lengths copies the characters only,
+         * without a terminator, so the NUL is stored explicitly below.
+         */
+        strncpy(tmp_path, pTcon->treeName, tree_len);
+        strncpy(tmp_path+tree_len, search_path, path_len);
+        tmp_path[tree_len+path_len] = 0;
+        return tmp_path;
+}
 int cifs_get_inode_info_unix(struct inode **pinode,
        const unsigned char *search_path, struct super_block *sb, int xid)
 {
@@ -37,52 +193,43 @@ int cifs_get_inode_info_unix(struct inode **pinode,
        struct cifsTconInfo *pTcon;
        struct inode *inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        char *tmp_path;
+        const unsigned char *full_path;
+        bool is_dfs_referral = false;
        pTcon = cifs_sb->tcon;
        cFYI(1, ("Getting info on %s", search_path));
+        full_path = cifs_get_search_path(pTcon, search_path);
+try_again_CIFSSMBUnixQPathInfo:
        /* could have done a find first instead but this returns more info */
-        rc = CIFSSMBUnixQPathInfo(xid, pTcon, search_path, &findData,
+        rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &findData,
                                  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
 /*      dump_mem("\nUnixQPathInfo return data", &findData,
                 sizeof(findData)); */
        if (rc) {
-                if (rc == -EREMOTE) {
+                if (rc == -EREMOTE && !is_dfs_referral) {
-                        tmp_path =
+                        is_dfs_referral = true;
-                            kmalloc(strnlen(pTcon->treeName,
+                        if (full_path != search_path) {
-                                            MAX_TREE_SIZE + 1) +
+                                kfree(full_path);
-                                    strnlen(search_path, MAX_PATHCONF) + 1,
+                                full_path = search_path;
-                                    GFP_KERNEL);
+                        }
-                        if (tmp_path == NULL)
+                        goto try_again_CIFSSMBUnixQPathInfo;
-                                return -ENOMEM;
-                        /* have to skip first of the double backslash of
-                           UNC name */
-                        strncpy(tmp_path, pTcon->treeName, MAX_TREE_SIZE);
-                        strncat(tmp_path, search_path, MAX_PATHCONF);
-                        rc = connect_to_dfs_path(xid, pTcon->ses,
-                                                 /* treename + */ tmp_path,
-                                                 cifs_sb->local_nls,
-                                                 cifs_sb->mnt_cifs_flags &
-                                                    CIFS_MOUNT_MAP_SPECIAL_CHR);
-                        kfree(tmp_path);
-                        /* BB fix up inode etc. */
-                } else if (rc) {
-                        return rc;
                }
+                goto cgiiu_exit;
        } else {
                struct cifsInodeInfo *cifsInfo;
-                __u32 type = le32_to_cpu(findData.Type);
                __u64 num_of_bytes = le64_to_cpu(findData.NumOfBytes);
                __u64 end_of_file = le64_to_cpu(findData.EndOfFile);
                /* get new inode */
                if (*pinode == NULL) {
                        *pinode = new_inode(sb);
-                        if (*pinode == NULL)
+                        if (*pinode == NULL) {
-                                return -ENOMEM;
+                                rc = -ENOMEM;
+                                goto cgiiu_exit;
+                        }
                        /* Is an i_ino of zero legal? */
                        /* Are there sanity checks we can use to ensure that
                           the server is really filling in that field? */
@@ -105,113 +252,20 @@ int cifs_get_inode_info_unix(struct inode **pinode,
                /* this is ok to set on every inode revalidate */
                atomic_set(&cifsInfo->inUse, 1);
-                inode->i_atime =
+                cifs_unix_info_to_inode(inode, &findData, 0);
-                    cifs_NTtimeToUnix(le64_to_cpu(findData.LastAccessTime));
-                inode->i_mtime =
-                    cifs_NTtimeToUnix(le64_to_cpu
-                                (findData.LastModificationTime));
-                inode->i_ctime =
-                    cifs_NTtimeToUnix(le64_to_cpu(findData.LastStatusChange));
-                inode->i_mode = le64_to_cpu(findData.Permissions);
-                /* since we set the inode type below we need to mask off
-                   to avoid strange results if bits set above */
-                inode->i_mode &= ~S_IFMT;
-                if (type == UNIX_FILE) {
-                        inode->i_mode |= S_IFREG;
-                } else if (type == UNIX_SYMLINK) {
-                        inode->i_mode |= S_IFLNK;
-                } else if (type == UNIX_DIR) {
-                        inode->i_mode |= S_IFDIR;
-                } else if (type == UNIX_CHARDEV) {
-                        inode->i_mode |= S_IFCHR;
-                        inode->i_rdev = MKDEV(le64_to_cpu(findData.DevMajor),
-                                le64_to_cpu(findData.DevMinor) & MINORMASK);
-                } else if (type == UNIX_BLOCKDEV) {
-                        inode->i_mode |= S_IFBLK;
-                        inode->i_rdev = MKDEV(le64_to_cpu(findData.DevMajor),
-                                le64_to_cpu(findData.DevMinor) & MINORMASK);
-                } else if (type == UNIX_FIFO) {
-                        inode->i_mode |= S_IFIFO;
-                } else if (type == UNIX_SOCKET) {
-                        inode->i_mode |= S_IFSOCK;
-                } else {
-                        /* safest to call it a file if we do not know */
-                        inode->i_mode |= S_IFREG;
-                        cFYI(1, ("unknown type %d", type));
-                }
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
-                        inode->i_uid = cifs_sb->mnt_uid;
-                else
-                        inode->i_uid = le64_to_cpu(findData.Uid);
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
-                        inode->i_gid = cifs_sb->mnt_gid;
-                else
-                        inode->i_gid = le64_to_cpu(findData.Gid);
-                inode->i_nlink = le64_to_cpu(findData.Nlinks);
-                spin_lock(&inode->i_lock);
-                if (is_size_safe_to_change(cifsInfo, end_of_file)) {
-                /* can not safely change the file size here if the
-                   client is writing to it due to potential races */
-                        i_size_write(inode, end_of_file);
-                /* blksize needs to be multiple of two. So safer to default to
-                blksize and blkbits set in superblock so 2**blkbits and blksize
-                will match rather than setting to:
-                (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
-                /* This seems incredibly stupid but it turns out that i_blocks
-                   is not related to (i_size / i_blksize), instead 512 byte size
-                   is required for calculating num blocks */
-                /* 512 bytes (2**9) is the fake blocksize that must be used */
-                /* for this calculation */
-                        inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
-                }
-                spin_unlock(&inode->i_lock);
                if (num_of_bytes < end_of_file)
                        cFYI(1, ("allocation size less than end of file"));
                cFYI(1, ("Size %ld and blocks %llu",
                        (unsigned long) inode->i_size,
                        (unsigned long long)inode->i_blocks));
-                if (S_ISREG(inode->i_mode)) {
-                        cFYI(1, ("File inode"));
+                cifs_set_ops(inode, is_dfs_referral);
-                        inode->i_op = &cifs_file_inode_ops;
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-                                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                                        inode->i_fop =
-                                                &cifs_file_direct_nobrl_ops;
-                                else
-                                        inode->i_fop = &cifs_file_direct_ops;
-                        } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                                inode->i_fop = &cifs_file_nobrl_ops;
-                        else /* not direct, send byte range locks */
-                                inode->i_fop = &cifs_file_ops;
-                        /* check if server can support readpages */
-                        if (pTcon->ses->server->maxBuf <
-                            PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
-                                inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
-                        else
-                                inode->i_data.a_ops = &cifs_addr_ops;
-                } else if (S_ISDIR(inode->i_mode)) {
-                        cFYI(1, ("Directory inode"));
-                        inode->i_op = &cifs_dir_inode_ops;
-                        inode->i_fop = &cifs_dir_ops;
-                } else if (S_ISLNK(inode->i_mode)) {
-                        cFYI(1, ("Symbolic Link inode"));
-                        inode->i_op = &cifs_symlink_inode_ops;
-                /* tmp_inode->i_fop = */ /* do not need to set to anything */
-                } else {
-                        cFYI(1, ("Init special inode"));
-                        init_special_inode(inode, inode->i_mode,
-                                           inode->i_rdev);
-                }
        }
+cgiiu_exit:
+        if (full_path != search_path)
+                kfree(full_path);
        return rc;
 }
@@ -320,15 +374,16 @@ static int get_sfu_mode(struct inode *inode,
 int cifs_get_inode_info(struct inode **pinode,
        const unsigned char *search_path, FILE_ALL_INFO *pfindData,
-        struct super_block *sb, int xid)
+        struct super_block *sb, int xid, const __u16 *pfid)
 {
        int rc = 0;
        struct cifsTconInfo *pTcon;
        struct inode *inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-        char *tmp_path;
+        const unsigned char *full_path = NULL;
        char *buf = NULL;
        int adjustTZ = FALSE;
+        bool is_dfs_referral = false;
        pTcon = cifs_sb->tcon;
        cFYI(1, ("Getting info on %s", search_path));
@@ -346,8 +401,12 @@ int cifs_get_inode_info(struct inode **pinode,
                if (buf == NULL)
                        return -ENOMEM;
                pfindData = (FILE_ALL_INFO *)buf;
+                full_path = cifs_get_search_path(pTcon, search_path);
+try_again_CIFSSMBQPathInfo:
                /* could do find first instead but this returns more info */
-                rc = CIFSSMBQPathInfo(xid, pTcon, search_path, pfindData,
+                rc = CIFSSMBQPathInfo(xid, pTcon, full_path, pfindData,
                              0 /* not legacy */,
                              cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
                                CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -355,7 +414,7 @@ int cifs_get_inode_info(struct inode **pinode,
                when server claims no NT SMB support and the above call
                failed at least once - set flag in tcon or mount */
                if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
-                        rc = SMBQueryInformation(xid, pTcon, search_path,
+                        rc = SMBQueryInformation(xid, pTcon, full_path,
                                        pfindData, cifs_sb->local_nls,
                                        cifs_sb->mnt_cifs_flags &
                                          CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -364,31 +423,15 @@ int cifs_get_inode_info(struct inode **pinode,
        }
        /* dump_mem("\nQPathInfo return data",&findData, sizeof(findData)); */
        if (rc) {
-                if (rc == -EREMOTE) {
+                if (rc == -EREMOTE && !is_dfs_referral) {
-                        tmp_path =
+                        is_dfs_referral = true;
-                            kmalloc(strnlen
+                        if (full_path != search_path) {
-                                    (pTcon->treeName,
+                                kfree(full_path);
-                                     MAX_TREE_SIZE + 1) +
+                                full_path = search_path;
-                                    strnlen(search_path, MAX_PATHCONF) + 1,
-                                    GFP_KERNEL);
-                        if (tmp_path == NULL) {
-                                kfree(buf);
-                                return -ENOMEM;
                        }
+                        goto try_again_CIFSSMBQPathInfo;
-                        strncpy(tmp_path, pTcon->treeName, MAX_TREE_SIZE);
-                        strncat(tmp_path, search_path, MAX_PATHCONF);
-                        rc = connect_to_dfs_path(xid, pTcon->ses,
-                                                 /* treename + */ tmp_path,
-                                                 cifs_sb->local_nls,
-                                                 cifs_sb->mnt_cifs_flags &
-                                                   CIFS_MOUNT_MAP_SPECIAL_CHR);
-                        kfree(tmp_path);
-                        /* BB fix up inode etc. */
-                } else if (rc) {
-                        kfree(buf);
-                        return rc;
                }
+                goto cgii_exit;
        } else {
                struct cifsInodeInfo *cifsInfo;
                __u32 attr = le32_to_cpu(pfindData->Attributes);
@@ -397,8 +440,8 @@ int cifs_get_inode_info(struct inode **pinode,
                if (*pinode == NULL) {
                        *pinode = new_inode(sb);
                        if (*pinode == NULL) {
-                                kfree(buf);
+                                rc = -ENOMEM;
-                                return -ENOMEM;
+                                goto cgii_exit;
                        }
                        /* Is an i_ino of zero legal? Can we use that to check
                           if the server supports returning inode numbers?  Are
@@ -490,9 +533,9 @@ int cifs_get_inode_info(struct inode **pinode,
                        if (decode_sfu_inode(inode,
                                         le64_to_cpu(pfindData->EndOfFile),
                                         search_path,
-                                         cifs_sb, xid)) {
+                                         cifs_sb, xid))
                                cFYI(1, ("Unrecognized sfu inode type"));
-                        }
                        cFYI(1, ("sfu mode 0%o", inode->i_mode));
                } else {
                        inode->i_mode |= S_IFREG;
@@ -532,7 +575,7 @@ int cifs_get_inode_info(struct inode **pinode,
                /* fill in 0777 bits from ACL */
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
                        cFYI(1, ("Getting mode bits from ACL"));
-                        acl_to_uid_mode(inode, search_path);
+                        acl_to_uid_mode(inode, search_path, pfid);
                }
 #endif
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
@@ -546,37 +589,11 @@ int cifs_get_inode_info(struct inode **pinode,
                        atomic_set(&cifsInfo->inUse, 1);
                }
-                if (S_ISREG(inode->i_mode)) {
+                cifs_set_ops(inode, is_dfs_referral);
-                        cFYI(1, ("File inode"));
-                        inode->i_op = &cifs_file_inode_ops;
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-                                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                                        inode->i_fop =
-                                                &cifs_file_direct_nobrl_ops;
-                                else
-                                        inode->i_fop = &cifs_file_direct_ops;
-                        } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                                inode->i_fop = &cifs_file_nobrl_ops;
-                        else /* not direct, send byte range locks */
-                                inode->i_fop = &cifs_file_ops;
-                        if (pTcon->ses->server->maxBuf <
-                             PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
-                                inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
-                        else
-                                inode->i_data.a_ops = &cifs_addr_ops;
-                } else if (S_ISDIR(inode->i_mode)) {
-                        cFYI(1, ("Directory inode"));
-                        inode->i_op = &cifs_dir_inode_ops;
-                        inode->i_fop = &cifs_dir_ops;
-                } else if (S_ISLNK(inode->i_mode)) {
-                        cFYI(1, ("Symbolic Link inode"));
-                        inode->i_op = &cifs_symlink_inode_ops;
-                } else {
-                        init_special_inode(inode, inode->i_mode,
-                                           inode->i_rdev);
-                }
        }
+cgii_exit:
+        if (full_path != search_path)
+                kfree(full_path);
        kfree(buf);
        return rc;
 }
@@ -605,7 +622,8 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
        if (cifs_sb->tcon->unix_ext)
                rc = cifs_get_inode_info_unix(&inode, "", inode->i_sb, xid);
        else
-                rc = cifs_get_inode_info(&inode, "", NULL, inode->i_sb, xid);
+                rc = cifs_get_inode_info(&inode, "", NULL, inode->i_sb, xid,
+                                         NULL);
        if (rc && cifs_sb->tcon->ipc) {
                cFYI(1, ("ipc connection - fake read inode"));
                inode->i_mode |= S_IFDIR;
@@ -792,17 +810,12 @@ psx_del_no_retry:
 }
 static void posix_fill_in_inode(struct inode *tmp_inode,
-        FILE_UNIX_BASIC_INFO *pData, int *pobject_type, int isNewInode)
+        FILE_UNIX_BASIC_INFO *pData, int isNewInode)
 {
+        struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
        loff_t local_size;
        struct timespec local_mtime;
-        struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
-        struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
-        __u32 type = le32_to_cpu(pData->Type);
-        __u64 num_of_bytes = le64_to_cpu(pData->NumOfBytes);
-        __u64 end_of_file = le64_to_cpu(pData->EndOfFile);
        cifsInfo->time = jiffies;
        atomic_inc(&cifsInfo->inUse);
@@ -810,115 +823,27 @@ static void posix_fill_in_inode(struct inode *tmp_inode,
        local_mtime = tmp_inode->i_mtime;
        local_size  = tmp_inode->i_size;
-        tmp_inode->i_atime =
+        cifs_unix_info_to_inode(tmp_inode, pData, 1);
-            cifs_NTtimeToUnix(le64_to_cpu(pData->LastAccessTime));
+        cifs_set_ops(tmp_inode, false);
-        tmp_inode->i_mtime =
-            cifs_NTtimeToUnix(le64_to_cpu(pData->LastModificationTime));
-        tmp_inode->i_ctime =
-            cifs_NTtimeToUnix(le64_to_cpu(pData->LastStatusChange));
-        tmp_inode->i_mode = le64_to_cpu(pData->Permissions);
-        /* since we set the inode type below we need to mask off type
-           to avoid strange results if bits above were corrupt */
-        tmp_inode->i_mode &= ~S_IFMT;
-        if (type == UNIX_FILE) {
-                *pobject_type = DT_REG;
-                tmp_inode->i_mode |= S_IFREG;
-        } else if (type == UNIX_SYMLINK) {
-                *pobject_type = DT_LNK;
-                tmp_inode->i_mode |= S_IFLNK;
-        } else if (type == UNIX_DIR) {
-                *pobject_type = DT_DIR;
-                tmp_inode->i_mode |= S_IFDIR;
-        } else if (type == UNIX_CHARDEV) {
-                *pobject_type = DT_CHR;
-                tmp_inode->i_mode |= S_IFCHR;
-                tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
-                                le64_to_cpu(pData->DevMinor) & MINORMASK);
-        } else if (type == UNIX_BLOCKDEV) {
-                *pobject_type = DT_BLK;
-                tmp_inode->i_mode |= S_IFBLK;
-                tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
-                                le64_to_cpu(pData->DevMinor) & MINORMASK);
-        } else if (type == UNIX_FIFO) {
-                *pobject_type = DT_FIFO;
-                tmp_inode->i_mode |= S_IFIFO;
-        } else if (type == UNIX_SOCKET) {
-                *pobject_type = DT_SOCK;
-                tmp_inode->i_mode |= S_IFSOCK;
-        } else {
-                /* safest to just call it a file */
-                *pobject_type = DT_REG;
-                tmp_inode->i_mode |= S_IFREG;
-                cFYI(1, ("unknown inode type %d", type));
-        }
-#ifdef CONFIG_CIFS_DEBUG2
-        cFYI(1, ("object type: %d", type));
-#endif
-        tmp_inode->i_uid = le64_to_cpu(pData->Uid);
-        tmp_inode->i_gid = le64_to_cpu(pData->Gid);
-        tmp_inode->i_nlink = le64_to_cpu(pData->Nlinks);
-        spin_lock(&tmp_inode->i_lock);
-        if (is_size_safe_to_change(cifsInfo, end_of_file)) {
-                /* can not safely change the file size here if the
-                client is writing to it due to potential races */
-                i_size_write(tmp_inode, end_of_file);
-        /* 512 bytes (2**9) is the fake blocksize that must be used */
-        /* for this calculation, not the real blocksize */
-                tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
-        }
-        spin_unlock(&tmp_inode->i_lock);
-        if (S_ISREG(tmp_inode->i_mode)) {
-                cFYI(1, ("File inode"));
-                tmp_inode->i_op = &cifs_file_inode_ops;
-                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
-                                tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
-                        else
-                                tmp_inode->i_fop = &cifs_file_direct_ops;
-                } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+        if (!S_ISREG(tmp_inode->i_mode))
-                        tmp_inode->i_fop = &cifs_file_nobrl_ops;
+                return;
-                else
-                        tmp_inode->i_fop = &cifs_file_ops;
-                if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+        /*
-                   (cifs_sb->tcon->ses->server->maxBuf <
+         * No sense invalidating pages for new inode
-                        PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
+         * since we have not started caching
-                        tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+         * readahead file data yet.
-                else
+         */
-                        tmp_inode->i_data.a_ops = &cifs_addr_ops;
+        if (isNewInode)
+                return;
-                if (isNewInode)
-                        return; /* No sense invalidating pages for new inode
-                                   since we we have not started caching
-                                   readahead file data yet */
-                if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
+        if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
-                        (local_size == tmp_inode->i_size)) {
+                (local_size == tmp_inode->i_size)) {
-                        cFYI(1, ("inode exists but unchanged"));
+                cFYI(1, ("inode exists but unchanged"));
-                } else {
-                        /* file may have changed on server */
-                        cFYI(1, ("invalidate inode, readdir detected change"));
-                        invalidate_remote_inode(tmp_inode);
-                }
-        } else if (S_ISDIR(tmp_inode->i_mode)) {
-                cFYI(1, ("Directory inode"));
-                tmp_inode->i_op = &cifs_dir_inode_ops;
-                tmp_inode->i_fop = &cifs_dir_ops;
-        } else if (S_ISLNK(tmp_inode->i_mode)) {
-                cFYI(1, ("Symbolic Link inode"));
-                tmp_inode->i_op = &cifs_symlink_inode_ops;
-/* tmp_inode->i_fop = *//* do not need to set to anything */
        } else {
-                cFYI(1, ("Special inode"));
+                /* file may have changed on server */
-                init_special_inode(tmp_inode, tmp_inode->i_mode,
+                cFYI(1, ("invalidate inode, readdir detected change"));
-                                   tmp_inode->i_rdev);
+                invalidate_remote_inode(tmp_inode);
        }
 }
@@ -968,7 +893,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
                        cFYI(1, ("posix mkdir returned 0x%x", rc));
                        d_drop(direntry);
                } else {
-                        int obj_type;
                        if (pInfo->Type == cpu_to_le32(-1)) {
                                /* no return info, go query for it */
                                kfree(pInfo);
@@ -1004,7 +928,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
                        /* we already checked in POSIXCreate whether
                           frame was long enough */
                        posix_fill_in_inode(direntry->d_inode,
-                                        pInfo, &obj_type, 1 /* NewInode */);
+                                        pInfo, 1 /* NewInode */);
 #ifdef CONFIG_CIFS_DEBUG2
                        cFYI(1, ("instantiated dentry %p %s to inode %p",
                                direntry, direntry->d_name.name, newinode));
@@ -1032,7 +956,7 @@ mkdir_get_info:
                                                      inode->i_sb, xid);
                else
                        rc = cifs_get_inode_info(&newinode, full_path, NULL,
-                                                 inode->i_sb, xid);
+                                                 inode->i_sb, xid, NULL);
                if (pTcon->nocase)
                        direntry->d_op = &cifs_ci_dentry_ops;
@@ -1214,9 +1138,8 @@ int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
                } /* if we can not get memory just leave rc as EEXIST */
        }
-        if (rc) {
+        if (rc)
                cFYI(1, ("rename rc %d", rc));
-        }
        if ((rc == -EIO) || (rc == -EEXIST)) {
                int oplock = FALSE;
@@ -1315,7 +1238,7 @@ int cifs_revalidate(struct dentry *direntry)
                }
        } else {
                rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
-                                         direntry->d_sb, xid);
+                                         direntry->d_sb, xid, NULL);
                if (rc) {
                        cFYI(1, ("error on getting revalidate info %d", rc));
 /*                      if (rc != -ENOENT)
@@ -1504,11 +1427,10 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
        }
        cifsInode = CIFS_I(direntry->d_inode);
-        /* BB check if we need to refresh inode from server now ? BB */
+        if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) {
-        if (attrs->ia_valid & ATTR_SIZE) {
                /*
-                   Flush data before changing file size on server. If the
+                   Flush data before changing file size or changing the last
+                   write time of the file on the server. If the
                   flush returns error, store it to report later and continue.
                   BB: This should be smarter. Why bother flushing pages that
                   will be truncated anyway? Also, should we error out here if
@@ -1519,7 +1441,9 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
                        CIFS_I(direntry->d_inode)->write_behind_rc = rc;
                        rc = 0;
                }
+        }
+        if (attrs->ia_valid & ATTR_SIZE) {
                /* To avoid spurious oplock breaks from server, in the case of
                   inodes that we already have open, avoid doing path based
                   setting of file size if we can do it by handle.
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index d24fe6880a04..5c792df13d62 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -30,7 +30,7 @@
 #define CIFS_IOC_CHECKUMOUNT _IO(0xCF, 2)
-int cifs_ioctl (struct inode *inode, struct file *filep,
+int cifs_ioctl(struct inode *inode, struct file *filep,
                unsigned int command, unsigned long arg)
 {
        int rc = -ENOTTY; /* strange error - but the precedent */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 1d6fb01b8e6d..d4e7ec93285f 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -205,7 +205,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
                                                      inode->i_sb, xid);
                else
                        rc = cifs_get_inode_info(&newinode, full_path, NULL,
-                                                 inode->i_sb, xid);
+                                                 inode->i_sb, xid, NULL);
                if (rc != 0) {
                        cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d",
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
index a2415c1a14db..a725c2609d67 100644
--- a/fs/cifs/md4.c
+++ b/fs/cifs/md4.c
@@ -56,7 +56,7 @@ lshift(__u32 x, int s)
 /* this applies md4 to 64 byte chunks */
 static void
-mdfour64(__u32 * M, __u32 * A, __u32 *B, __u32 * C, __u32 *D)
+mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
 {
        int j;
        __u32 AA, BB, CC, DD;
@@ -137,7 +137,7 @@ mdfour64(__u32 * M, __u32 * A, __u32 *B, __u32 * C, __u32 *D)
 }
 static void
-copy64(__u32 * M, unsigned char *in)
+copy64(__u32 *M, unsigned char *in)
 {
        int i;
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index f13f96d42fcf..462bbfefd4b6 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -161,7 +161,7 @@ MD5Final(unsigned char digest[16], struct MD5Context *ctx)
 /* This is the central step in the MD5 algorithm. */
 #define MD5STEP(f, w, x, y, z, data, s) \
-        ( w += f(x, y, z) + data,  w = w<<s | w>>(32-s),  w += x )
+        (w += f(x, y, z) + data,  w = w<<s | w>>(32-s),  w += x)
 /*
 * The core of the MD5 algorithm, this alters an existing MD5 hash to
@@ -302,9 +302,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
        int i;
        /* if key is longer than 64 bytes truncate it */
-        if (key_len > 64) {
+        if (key_len > 64)
                key_len = 64;
-        }
        /* start out by storing key in pads */
        memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
@@ -359,9 +358,9 @@ hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
 {
        struct HMACMD5Context ctx;
        hmac_md5_init_limK_to_64(key, 16, &ctx);
-        if (data_len != 0) {
+        if (data_len != 0)
                hmac_md5_update(data, data_len, &ctx);
-        }
        hmac_md5_final(digest, &ctx);
 }
 #endif
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 15546c2354c5..2a42d9fedbb2 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/misc.c
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   This library is free software; you can redistribute it and/or modify
@@ -320,9 +320,9 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
                if (treeCon->ses) {
                        if (treeCon->ses->capabilities & CAP_UNICODE)
                                buffer->Flags2 |= SMBFLG2_UNICODE;
-                        if (treeCon->ses->capabilities & CAP_STATUS32) {
+                        if (treeCon->ses->capabilities & CAP_STATUS32)
                                buffer->Flags2 |= SMBFLG2_ERR_STATUS;
-                        }
                        /* Uid is not converted */
                        buffer->Uid = treeCon->ses->Suid;
                        buffer->Mid = GetNextMid(treeCon->ses->server);
@@ -610,7 +610,8 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
        buffer = (unsigned char *) smb_buf;
        for (i = 0, j = 0; i < smb_buf_length; i++, j++) {
-                if (i % 8 == 0) {       /* have reached the beginning of line */
+                if (i % 8 == 0) {
+                        /* have reached the beginning of line */
                        printk(KERN_DEBUG "| ");
                        j = 0;
                }
@@ -621,7 +622,8 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
                else
                        debug_line[1 + (2 * j)] = '_';
-                if (i % 8 == 7) { /* reached end of line, time to print ascii */
+                if (i % 8 == 7) {
+                        /* reached end of line, time to print ascii */
                        debug_line[16] = 0;
                        printk(" | %s\n", debug_line);
                }
@@ -631,7 +633,7 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
                debug_line[2 * j] = ' ';
                debug_line[1 + (2 * j)] = ' ';
        }
-        printk( " | %s\n", debug_line);
+        printk(" | %s\n", debug_line);
        return;
 }
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 646e1f06941b..3b5a5ce882b6 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/netmisc.c
 *
- *   Copyright (c) International Business Machines  Corp., 2002
+ *   Copyright (c) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   Error mapping routines from Samba libsmb/errormap.c
@@ -150,9 +150,7 @@ static int canonicalize_unc(char *cp)
                if (cp[i] == '\\')
                        break;
                if (cp[i] == '/') {
-#ifdef CONFIG_CIFS_DEBUG2
+                        cFYI(DBG2, ("change slash to \\ in malformed UNC"));
-                        cFYI(1, ("change slash to backslash in malformed UNC"));
-#endif
                        cp[i] = '\\';
                        return 1;
                }
@@ -178,9 +176,7 @@ cifs_inet_pton(int address_family, char *cp, void *dst)
        } else if (address_family == AF_INET6) {
                ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
        }
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("address conversion returned %d for %s", ret, cp));
-        cFYI(1, ("address conversion returned %d for %s", ret, cp));
-#endif
        if (ret > 0)
                ret = 1;
        return ret;
@@ -253,7 +249,8 @@ static const struct {
        ERRDOS, 87, NT_STATUS_INVALID_PARAMETER_MIX}, {
        ERRHRD, ERRgeneral, NT_STATUS_INVALID_QUOTA_LOWER}, {
        ERRHRD, ERRgeneral, NT_STATUS_DISK_CORRUPT_ERROR}, {
-        ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_INVALID}, {   /* mapping changed since shell does lookup on * and expects file not found */
+         /* mapping changed: shell does lookup on * and expects FileNotFound */
+        ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_INVALID}, {
        ERRDOS, ERRbadfile, NT_STATUS_OBJECT_NAME_NOT_FOUND}, {
        ERRDOS, ERRalreadyexists, NT_STATUS_OBJECT_NAME_COLLISION}, {
        ERRHRD, ERRgeneral, NT_STATUS_HANDLE_NOT_WAITABLE}, {
@@ -820,7 +817,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
        /* old style errors */
        /* DOS class smb error codes - map DOS */
-        if (smberrclass == ERRDOS) {  /* 1 byte field no need to byte reverse */
+        if (smberrclass == ERRDOS) {
+                /* 1 byte field no need to byte reverse */
                for (i = 0;
                     i <
                     sizeof(mapping_table_ERRDOS) /
@@ -834,7 +832,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
                        }
                        /* else try next error mapping one to see if match */
                }
-        } else if (smberrclass == ERRSRV) {   /* server class of error codes */
+        } else if (smberrclass == ERRSRV) {
+                /* server class of error codes */
                for (i = 0;
                     i <
                     sizeof(mapping_table_ERRSRV) /
@@ -922,8 +921,8 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
 {
        struct timespec ts;
        int sec, min, days, month, year;
-        SMB_TIME * st = (SMB_TIME *)&time;
+        SMB_TIME *st = (SMB_TIME *)&time;
-        SMB_DATE * sd = (SMB_DATE *)&date;
+        SMB_DATE *sd = (SMB_DATE *)&date;
        cFYI(1, ("date %d time %d", date, time));
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 0f22def4bdff..32b445edc882 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -3,7 +3,7 @@
 *
 *   Directory search handling
 *
- *   Copyright (C) International Business Machines  Corp., 2004, 2007
+ *   Copyright (C) International Business Machines  Corp., 2004, 2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *
 *   This library is free software; you can redistribute it and/or modify
@@ -42,17 +42,18 @@ static void dump_cifs_file_struct(struct file *file, char *label)
                        cFYI(1, ("empty cifs private file data"));
                        return;
                }
-                if (cf->invalidHandle) {
+                if (cf->invalidHandle)
                        cFYI(1, ("invalid handle"));
-                }
+                if (cf->srch_inf.endOfSearch)
-                if (cf->srch_inf.endOfSearch) {
                        cFYI(1, ("end of search"));
-                }
+                if (cf->srch_inf.emptyDir)
-                if (cf->srch_inf.emptyDir) {
                        cFYI(1, ("empty dir"));
-                }
        }
 }
+#else
+static inline void dump_cifs_file_struct(struct file *file, char *label)
+{
+}
 #endif /* DEBUG2 */
 /* Returns one if new inode created (which therefore needs to be hashed) */
@@ -150,7 +151,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
                      cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
        } else { /* legacy, OS2 and DOS style */
 /*              struct timespec ts;*/
-                FIND_FILE_STANDARD_INFO * pfindData =
+                FIND_FILE_STANDARD_INFO *pfindData =
                        (FIND_FILE_STANDARD_INFO *)buf;
                tmp_inode->i_mtime = cnvrtDosUnixTm(
@@ -198,9 +199,8 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
        if (attr & ATTR_DIRECTORY) {
                *pobject_type = DT_DIR;
                /* override default perms since we do not lock dirs */
-                if (atomic_read(&cifsInfo->inUse) == 0) {
+                if (atomic_read(&cifsInfo->inUse) == 0)
                        tmp_inode->i_mode = cifs_sb->mnt_dir_mode;
-                }
                tmp_inode->i_mode |= S_IFDIR;
        } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) &&
                   (attr & ATTR_SYSTEM)) {
@@ -231,9 +231,8 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
        } /* could add code here - to validate if device or weird share type? */
        /* can not fill in nlink here as in qpathinfo version and Unx search */
-        if (atomic_read(&cifsInfo->inUse) == 0) {
+        if (atomic_read(&cifsInfo->inUse) == 0)
                atomic_set(&cifsInfo->inUse, 1);
-        }
        spin_lock(&tmp_inode->i_lock);
        if (is_size_safe_to_change(cifsInfo, end_of_file)) {
@@ -461,9 +460,8 @@ static int initiate_cifs_search(const int xid, struct file *file)
        full_path = build_path_from_dentry(file->f_path.dentry);
-        if (full_path == NULL) {
+        if (full_path == NULL)
                return -ENOMEM;
-        }
        cFYI(1, ("Full path: %s start at: %lld", full_path, file->f_pos));
@@ -471,9 +469,9 @@ ffirst_retry:
        /* test for Unix extensions */
        /* but now check for them on the share/mount not on the SMB session */
 /*      if (pTcon->ses->capabilities & CAP_UNIX) { */
-        if (pTcon->unix_ext) {
+        if (pTcon->unix_ext)
                cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX;
-        } else if ((pTcon->ses->capabilities &
+        else if ((pTcon->ses->capabilities &
                        (CAP_NT_SMBS | CAP_NT_FIND)) == 0) {
                cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD;
        } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
@@ -514,10 +512,10 @@ static int cifs_unicode_bytelen(char *str)
 static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 {
        char *new_entry;
-        FILE_DIRECTORY_INFO * pDirInfo = (FILE_DIRECTORY_INFO *)old_entry;
+        FILE_DIRECTORY_INFO *pDirInfo = (FILE_DIRECTORY_INFO *)old_entry;
        if (level == SMB_FIND_FILE_INFO_STANDARD) {
-                FIND_FILE_STANDARD_INFO * pfData;
+                FIND_FILE_STANDARD_INFO *pfData;
                pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo;
                new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) +
@@ -553,7 +551,7 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
        int len = 0;
        if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
-                FILE_UNIX_INFO * pFindData = (FILE_UNIX_INFO *)current_entry;
+                FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                if (cfile->srch_inf.unicode) {
                        len = cifs_unicode_bytelen(filename);
@@ -562,30 +560,30 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
                        len = strnlen(filename, 5);
                }
        } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
-                FILE_DIRECTORY_INFO * pFindData =
+                FILE_DIRECTORY_INFO *pFindData =
                        (FILE_DIRECTORY_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
        } else if (cfile->srch_inf.info_level ==
                        SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
-                FILE_FULL_DIRECTORY_INFO * pFindData =
+                FILE_FULL_DIRECTORY_INFO *pFindData =
                        (FILE_FULL_DIRECTORY_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
        } else if (cfile->srch_inf.info_level ==
                        SMB_FIND_FILE_ID_FULL_DIR_INFO) {
-                SEARCH_ID_FULL_DIR_INFO * pFindData =
+                SEARCH_ID_FULL_DIR_INFO *pFindData =
                        (SEARCH_ID_FULL_DIR_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
        } else if (cfile->srch_inf.info_level ==
                        SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
-                FILE_BOTH_DIRECTORY_INFO * pFindData =
+                FILE_BOTH_DIRECTORY_INFO *pFindData =
                        (FILE_BOTH_DIRECTORY_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
        } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
-                FIND_FILE_STANDARD_INFO * pFindData =
+                FIND_FILE_STANDARD_INFO *pFindData =
                        (FIND_FILE_STANDARD_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                len = pFindData->FileNameLength;
@@ -666,9 +664,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
        . and .. for the root of a drive and for those we need
        to start two entries earlier */
-#ifdef CONFIG_CIFS_DEBUG2
        dump_cifs_file_struct(file, "In fce ");
-#endif
        if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) &&
             is_dir_changed(file)) ||
           (index_to_find < first_entry_in_buffer)) {
@@ -718,7 +714,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
                pos_in_buf = index_to_find - first_entry_in_buffer;
                cFYI(1, ("found entry - pos_in_buf %d", pos_in_buf));
-                for (i=0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
+                for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
                        /* go entry by entry figuring out which is first */
                        current_entry = nxt_dir_entry(current_entry, end_of_smb,
                                                cifsFile->srch_inf.info_level);
@@ -793,7 +789,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
                filename = &pFindData->FileName[0];
                len = le32_to_cpu(pFindData->FileNameLength);
        } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
-                FIND_FILE_STANDARD_INFO * pFindData =
+                FIND_FILE_STANDARD_INFO *pFindData =
                        (FIND_FILE_STANDARD_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                /* one byte length, no name conversion */
@@ -928,7 +924,7 @@ static int cifs_save_resume_key(const char *current_entry,
        level = cifsFile->srch_inf.info_level;
        if (level == SMB_FIND_FILE_UNIX) {
-                FILE_UNIX_INFO * pFindData = (FILE_UNIX_INFO *)current_entry;
+                FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
                filename = &pFindData->FileName[0];
                if (cifsFile->srch_inf.unicode) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d2153abcba6d..ed150efbe27c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -417,10 +417,6 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
                calc_lanman_hash(ses, lnm_session_key);
                ses->flags |= CIFS_SES_LANMAN;
-/* #ifdef CONFIG_CIFS_DEBUG2
-                cifs_dump_mem("cryptkey: ",ses->server->cryptKey,
-                        CIFS_SESS_KEY_SIZE);
-#endif */
                memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE);
                bcc_ptr += CIFS_SESS_KEY_SIZE;
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index cfa6d21fb4e8..04943c976f98 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -114,42 +114,42 @@ static uchar sbox[8][4][16] = {
        {{14, 4, 13, 1, 2, 15, 11, 8, 3, 10, 6, 12, 5, 9, 0, 7},
         {0, 15, 7, 4, 14, 2, 13, 1, 10, 6, 12, 11, 9, 5, 3, 8},
         {4, 1, 14, 8, 13, 6, 2, 11, 15, 12, 9, 7, 3, 10, 5, 0},
-         {15, 12, 8, 2, 4, 9, 1, 7, 5, 11, 3, 14, 10, 0, 6, 13}},
+         {15, 12, 8, 2, 4, 9, 1, 7, 5, 11, 3, 14, 10, 0, 6, 13} },
        {{15, 1, 8, 14, 6, 11, 3, 4, 9, 7, 2, 13, 12, 0, 5, 10},
         {3, 13, 4, 7, 15, 2, 8, 14, 12, 0, 1, 10, 6, 9, 11, 5},
         {0, 14, 7, 11, 10, 4, 13, 1, 5, 8, 12, 6, 9, 3, 2, 15},
-         {13, 8, 10, 1, 3, 15, 4, 2, 11, 6, 7, 12, 0, 5, 14, 9}},
+         {13, 8, 10, 1, 3, 15, 4, 2, 11, 6, 7, 12, 0, 5, 14, 9} },
        {{10, 0, 9, 14, 6, 3, 15, 5, 1, 13, 12, 7, 11, 4, 2, 8},
         {13, 7, 0, 9, 3, 4, 6, 10, 2, 8, 5, 14, 12, 11, 15, 1},
         {13, 6, 4, 9, 8, 15, 3, 0, 11, 1, 2, 12, 5, 10, 14, 7},
-         {1, 10, 13, 0, 6, 9, 8, 7, 4, 15, 14, 3, 11, 5, 2, 12}},
+         {1, 10, 13, 0, 6, 9, 8, 7, 4, 15, 14, 3, 11, 5, 2, 12} },
        {{7, 13, 14, 3, 0, 6, 9, 10, 1, 2, 8, 5, 11, 12, 4, 15},
         {13, 8, 11, 5, 6, 15, 0, 3, 4, 7, 2, 12, 1, 10, 14, 9},
         {10, 6, 9, 0, 12, 11, 7, 13, 15, 1, 3, 14, 5, 2, 8, 4},
-         {3, 15, 0, 6, 10, 1, 13, 8, 9, 4, 5, 11, 12, 7, 2, 14}},
+         {3, 15, 0, 6, 10, 1, 13, 8, 9, 4, 5, 11, 12, 7, 2, 14} },
        {{2, 12, 4, 1, 7, 10, 11, 6, 8, 5, 3, 15, 13, 0, 14, 9},
         {14, 11, 2, 12, 4, 7, 13, 1, 5, 0, 15, 10, 3, 9, 8, 6},
         {4, 2, 1, 11, 10, 13, 7, 8, 15, 9, 12, 5, 6, 3, 0, 14},
-         {11, 8, 12, 7, 1, 14, 2, 13, 6, 15, 0, 9, 10, 4, 5, 3}},
+         {11, 8, 12, 7, 1, 14, 2, 13, 6, 15, 0, 9, 10, 4, 5, 3} },
        {{12, 1, 10, 15, 9, 2, 6, 8, 0, 13, 3, 4, 14, 7, 5, 11},
         {10, 15, 4, 2, 7, 12, 9, 5, 6, 1, 13, 14, 0, 11, 3, 8},
         {9, 14, 15, 5, 2, 8, 12, 3, 7, 0, 4, 10, 1, 13, 11, 6},
-         {4, 3, 2, 12, 9, 5, 15, 10, 11, 14, 1, 7, 6, 0, 8, 13}},
+         {4, 3, 2, 12, 9, 5, 15, 10, 11, 14, 1, 7, 6, 0, 8, 13} },
        {{4, 11, 2, 14, 15, 0, 8, 13, 3, 12, 9, 7, 5, 10, 6, 1},
         {13, 0, 11, 7, 4, 9, 1, 10, 14, 3, 5, 12, 2, 15, 8, 6},
         {1, 4, 11, 13, 12, 3, 7, 14, 10, 15, 6, 8, 0, 5, 9, 2},
-         {6, 11, 13, 8, 1, 4, 10, 7, 9, 5, 0, 15, 14, 2, 3, 12}},
+         {6, 11, 13, 8, 1, 4, 10, 7, 9, 5, 0, 15, 14, 2, 3, 12} },
        {{13, 2, 8, 4, 6, 15, 11, 1, 10, 9, 3, 14, 5, 0, 12, 7},
         {1, 15, 13, 8, 10, 3, 7, 4, 12, 5, 6, 11, 0, 14, 9, 2},
         {7, 11, 4, 1, 9, 12, 14, 2, 0, 6, 10, 13, 15, 3, 5, 8},
-         {2, 1, 14, 7, 4, 10, 8, 13, 15, 12, 9, 0, 3, 5, 6, 11}}
+         {2, 1, 14, 7, 4, 10, 8, 13, 15, 12, 9, 0, 3, 5, 6, 11} }
 };
 static void
@@ -313,9 +313,8 @@ str_to_key(unsigned char *str, unsigned char *key)
        key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
        key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
        key[7] = str[6] & 0x7F;
-        for (i = 0; i < 8; i++) {
+        for (i = 0; i < 8; i++)
                key[i] = (key[i] << 1);
-        }
 }
 static void
@@ -344,9 +343,8 @@ smbhash(unsigned char *out, unsigned char *in, unsigned char *key, int forw)
        dohash(outb, inb, keyb, forw);
-        for (i = 0; i < 8; i++) {
+        for (i = 0; i < 8; i++)
                out[i] = 0;
-        }
        for (i = 0; i < 64; i++) {
                if (outb[i])
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 50b623ad9320..3612d6c0a0bb 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -1,7 +1,7 @@
 /*
 *   fs/cifs/transport.c
 *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
 *   Author(s): Steve French (sfrench@us.ibm.com)
 *   Jeremy Allison (jra@samba.org) 2006.
 *
@@ -358,9 +358,9 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
        } else if (ses->status != CifsGood) {
                /* check if SMB session is bad because we are setting it up */
                if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
-                        (in_buf->Command != SMB_COM_NEGOTIATE)) {
+                        (in_buf->Command != SMB_COM_NEGOTIATE))
                        return -EAGAIN;
-                } /* else ok - we are setting up session */
+                /* else ok - we are setting up session */
        }
        *ppmidQ = AllocMidQEntry(in_buf, ses);
        if (*ppmidQ == NULL)
@@ -437,9 +437,8 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
        iov[0].iov_len = in_buf->smb_buf_length + 4;
        flags |= CIFS_NO_RESP;
        rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
-#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(DBG2, ("SendRcvNoRsp flags %d rc %d", flags, rc));
-        cFYI(1, ("SendRcvNoR flags %d rc %d", flags, rc));
-#endif
        return rc;
 }
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 54e8ef96cb79..8cd6a445b017 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -139,9 +139,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
        } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
                        goto set_ea_exit;
-                if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
+                if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
                        cFYI(1, ("attempt to set cifs inode metadata"));
-                }
                ea_name += 5; /* skip past user. prefix */
                rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
                        (__u16)value_size, cifs_sb->local_nls,
@@ -262,7 +262,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
                                cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-                else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+                else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
                        __u16 fid;
                        int oplock = FALSE;
                        struct cifs_ntsd *pacl = NULL;
@@ -303,11 +303,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
        } else if (strncmp(ea_name,
                  CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
                cFYI(1, ("Security xattr namespace not supported yet"));
-        } else {
+        } else
                cFYI(1,
                    ("illegal xattr request %s (only user namespace supported)",
                        ea_name));
-        }
        /* We could add an additional check for streams ie
            if proc/fs/cifs/streamstoxattr is set then
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 350680fd7da7..0c3b618c15b3 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -23,7 +23,6 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/mutex.h>
-#include <asm/semaphore.h>
 #include <asm/uaccess.h>
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d26e2826ba5b..e9602d85c11d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -29,10 +29,6 @@
 #define DEBUGFS_MAGIC   0x64626720
-/* declared over in file.c */
-extern struct file_operations debugfs_file_operations;
-extern struct inode_operations debugfs_link_operations;
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d30ea8b433a2..7a8824f475f2 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -37,7 +37,6 @@
 #include <linux/jhash.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
-#include <asm/semaphore.h>
 #include <asm/uaccess.h>
 #include <linux/dlm.h>
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 035e6f9990b0..67522c268c14 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -215,6 +215,8 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
        ls->ls_recover_nodeid = nodeid;
        if (nodeid == dlm_our_nodeid()) {
+                ls->ls_recover_buf->rc_header.h_length =
+                        dlm_config.ci_buffer_size;
                dlm_copy_master_names(ls, last_name, last_len,
                                      ls->ls_recover_buf->rc_buf,
                                      max_size, nodeid);
diff --git a/fs/dquot.c b/fs/dquot.c
index 9c7feb62eed1..41b9dbd68b0e 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1522,8 +1522,8 @@ int vfs_quota_off(struct super_block *sb, int type)
                                truncate_inode_pages(&toputinode[cnt]->i_data, 0);
                                mutex_unlock(&toputinode[cnt]->i_mutex);
                                mark_inode_dirty(toputinode[cnt]);
-                                iput(toputinode[cnt]);
                        }
+                        iput(toputinode[cnt]);
                        mutex_unlock(&dqopt->dqonoff_mutex);
                }
        if (sb->s_bdev)
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 841a032050a7..5e596583946c 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -80,8 +80,8 @@ static void ecryptfs_d_release(struct dentry *dentry)
 {
        if (ecryptfs_dentry_to_private(dentry)) {
                if (ecryptfs_dentry_to_lower(dentry)) {
-                        mntput(ecryptfs_dentry_to_lower_mnt(dentry));
                        dput(ecryptfs_dentry_to_lower(dentry));
+                        mntput(ecryptfs_dentry_to_lower_mnt(dentry));
                }
                kmem_cache_free(ecryptfs_dentry_info_cache,
                                ecryptfs_dentry_to_private(dentry));
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index dc74b186145d..6df1debdccce 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -263,52 +263,102 @@ out:
        return 0;
 }
-/* This function must zero any hole we create */
+/**
+ * ecryptfs_prepare_write
+ * @file: The eCryptfs file
+ * @page: The eCryptfs page
+ * @from: The start byte from which we will write
+ * @to: The end byte to which we will write
+ *
+ * This function must zero any hole we create
+ *
+ * Returns zero on success; non-zero otherwise
+ */
 static int ecryptfs_prepare_write(struct file *file, struct page *page,
                                  unsigned from, unsigned to)
 {
-        int rc = 0;
        loff_t prev_page_end_size;
+        int rc = 0;
        if (!PageUptodate(page)) {
-                rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
+                struct ecryptfs_crypt_stat *crypt_stat =
-                                                      PAGE_CACHE_SIZE,
+                        &ecryptfs_inode_to_private(
-                                                      page->mapping->host);
+                                file->f_path.dentry->d_inode)->crypt_stat;
-                if (rc) {
-                        printk(KERN_ERR "%s: Error attemping to read lower "
+                if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
-                               "page segment; rc = [%d]\n", __FUNCTION__, rc);
+                    || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
-                        ClearPageUptodate(page);
+                        rc = ecryptfs_read_lower_page_segment(
-                        goto out;
+                                page, page->index, 0, PAGE_CACHE_SIZE,
-                } else
+                                page->mapping->host);
+                        if (rc) {
+                                printk(KERN_ERR "%s: Error attemping to read "
+                                       "lower page segment; rc = [%d]\n",
+                                       __FUNCTION__, rc);
+                                ClearPageUptodate(page);
+                                goto out;
+                        } else
+                                SetPageUptodate(page);
+                } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
+                        if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
+                                rc = ecryptfs_copy_up_encrypted_with_header(
+                                        page, crypt_stat);
+                                if (rc) {
+                                        printk(KERN_ERR "%s: Error attempting "
+                                               "to copy the encrypted content "
+                                               "from the lower file whilst "
+                                               "inserting the metadata from "
+                                               "the xattr into the header; rc "
+                                               "= [%d]\n", __FUNCTION__, rc);
+                                        ClearPageUptodate(page);
+                                        goto out;
+                                }
+                                SetPageUptodate(page);
+                        } else {
+                                rc = ecryptfs_read_lower_page_segment(
+                                        page, page->index, 0, PAGE_CACHE_SIZE,
+                                        page->mapping->host);
+                                if (rc) {
+                                        printk(KERN_ERR "%s: Error reading "
+                                               "page; rc = [%d]\n",
+                                               __FUNCTION__, rc);
+                                        ClearPageUptodate(page);
+                                        goto out;
+                                }
+                                SetPageUptodate(page);
+                        }
+                } else {
+                        rc = ecryptfs_decrypt_page(page);
+                        if (rc) {
+                                printk(KERN_ERR "%s: Error decrypting page "
+                                       "at index [%ld]; rc = [%d]\n",
+                                       __FUNCTION__, page->index, rc);
+                                ClearPageUptodate(page);
+                                goto out;
+                        }
                        SetPageUptodate(page);
+                }
        }
        prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT);
+        /* If creating a page or more of holes, zero them out via truncate.
-        /*
+         * Note, this will increase i_size. */
-         * If creating a page or more of holes, zero them out via truncate.
-         * Note, this will increase i_size.
-         */
        if (page->index != 0) {
                if (prev_page_end_size > i_size_read(page->mapping->host)) {
                        rc = ecryptfs_truncate(file->f_path.dentry,
                                               prev_page_end_size);
                        if (rc) {
-                                printk(KERN_ERR "Error on attempt to "
+                                printk(KERN_ERR "%s: Error on attempt to "
                                       "truncate to (higher) offset [%lld];"
-                                       " rc = [%d]\n", prev_page_end_size, rc);
+                                       " rc = [%d]\n", __FUNCTION__,
+                                       prev_page_end_size, rc);
                                goto out;
                        }
                }
        }
-        /*
+        /* Writing to a new page, and creating a small hole from start
-         * Writing to a new page, and creating a small hole from start of page?
+         * of page?  Zero it out. */
-         * Zero it out.
+        if ((i_size_read(page->mapping->host) == prev_page_end_size)
-         */
+            && (from != 0))
-        if ((i_size_read(page->mapping->host) == prev_page_end_size) &&
-            (from != 0)) {
                zero_user(page, 0, PAGE_CACHE_SIZE);
-        }
 out:
        return rc;
 }
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index dfb5cb400217..49308a29798a 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -5,8 +5,8 @@
 */
 #include <linux/buffer_head.h>
-#include <linux/efs_fs.h>
 #include <linux/smp_lock.h>
+#include "efs.h"
 static int efs_readdir(struct file *, void *, filldir_t);
diff --git a/fs/efs/efs.h b/fs/efs/efs.h
new file mode 100644
index 000000000000..d8305b582ab0
--- /dev/null
+++ b/fs/efs/efs.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 1999 Al Smith
+ *
+ * Portions derived from work (c) 1995,1996 Christian Vogelgsang.
+ * Portions derived from IRIX header files (c) 1988 Silicon Graphics
+ */
+#ifndef _EFS_EFS_H_
+#define _EFS_EFS_H_
+#include <linux/fs.h>
+#include <asm/uaccess.h>
+#define EFS_VERSION "1.0a"
+static const char cprt[] = "EFS: "EFS_VERSION" - (c) 1999 Al Smith <Al.Smith@aeschi.ch.eu.org>";
+/* 1 block is 512 bytes */
+#define EFS_BLOCKSIZE_BITS      9
+#define EFS_BLOCKSIZE           (1 << EFS_BLOCKSIZE_BITS)
+typedef int32_t         efs_block_t;
+typedef uint32_t        efs_ino_t;
+#define EFS_DIRECTEXTENTS       12
+/*
+ * layout of an extent, in memory and on disk. 8 bytes exactly.
+ */
+typedef union extent_u {
+        unsigned char raw[8];
+        struct extent_s {
+                unsigned int    ex_magic:8;     /* magic # (zero) */
+                unsigned int    ex_bn:24;       /* basic block */
+                unsigned int    ex_length:8;    /* numblocks in this extent */
+                unsigned int    ex_offset:24;   /* logical offset into file */
+        } cooked;
+} efs_extent;
+typedef struct edevs {
+        __be16          odev;
+        __be32          ndev;
+} efs_devs;
+/*
+ * extent based filesystem inode as it appears on disk.  The efs inode
+ * is exactly 128 bytes long.
+ */
+struct  efs_dinode {
+        __be16          di_mode;        /* mode and type of file */
+        __be16          di_nlink;       /* number of links to file */
+        __be16          di_uid;         /* owner's user id */
+        __be16          di_gid;         /* owner's group id */
+        __be32          di_size;        /* number of bytes in file */
+        __be32          di_atime;       /* time last accessed */
+        __be32          di_mtime;       /* time last modified */
+        __be32          di_ctime;       /* time created */
+        __be32          di_gen;         /* generation number */
+        __be16          di_numextents;  /* # of extents */
+        u_char          di_version;     /* version of inode */
+        u_char          di_spare;       /* spare - used by AFS */
+        union di_addr {
+                efs_extent      di_extents[EFS_DIRECTEXTENTS];
+                efs_devs        di_dev; /* device for IFCHR/IFBLK */
+        } di_u;
+};
+/* efs inode storage in memory */
+struct efs_inode_info {
+        int             numextents;
+        int             lastextent;
+        efs_extent      extents[EFS_DIRECTEXTENTS];
+        struct inode    vfs_inode;
+};
+#include <linux/efs_fs_sb.h>
+#define EFS_DIRBSIZE_BITS       EFS_BLOCKSIZE_BITS
+#define EFS_DIRBSIZE            (1 << EFS_DIRBSIZE_BITS)
+struct efs_dentry {
+        __be32          inode;
+        unsigned char   namelen;
+        char            name[3];
+};
+#define EFS_DENTSIZE    (sizeof(struct efs_dentry) - 3 + 1)
+#define EFS_MAXNAMELEN  ((1 << (sizeof(char) * 8)) - 1)
+#define EFS_DIRBLK_HEADERSIZE   4
+#define EFS_DIRBLK_MAGIC        0xbeef  /* moo */
+struct efs_dir {
+        __be16  magic;
+        unsigned char   firstused;
+        unsigned char   slots;
+        unsigned char   space[EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE];
+};
+#define EFS_MAXENTS \
+        ((EFS_DIRBSIZE - EFS_DIRBLK_HEADERSIZE) / \
+         (EFS_DENTSIZE + sizeof(char)))
+#define EFS_SLOTAT(dir, slot) EFS_REALOFF((dir)->space[slot])
+#define EFS_REALOFF(offset) ((offset << 1))
+static inline struct efs_inode_info *INODE_INFO(struct inode *inode)
+{
+        return container_of(inode, struct efs_inode_info, vfs_inode);
+}
+static inline struct efs_sb_info *SUPER_INFO(struct super_block *sb)
+{
+        return sb->s_fs_info;
+}
+struct statfs;
+struct fid;
+extern const struct inode_operations efs_dir_inode_operations;
+extern const struct file_operations efs_dir_operations;
+extern const struct address_space_operations efs_symlink_aops;
+extern struct inode *efs_iget(struct super_block *, unsigned long);
+extern efs_block_t efs_map_block(struct inode *, efs_block_t);
+extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern struct dentry *efs_lookup(struct inode *, struct dentry *, struct nameidata *);
+extern struct dentry *efs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+                int fh_len, int fh_type);
+extern struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid,
+                int fh_len, int fh_type);
+extern struct dentry *efs_get_parent(struct dentry *);
+extern int efs_bmap(struct inode *, int);
+#endif /* _EFS_EFS_H_ */
diff --git a/fs/efs/file.c b/fs/efs/file.c
index 5db20129681e..1ccb364ffa63 100644
--- a/fs/efs/file.c
+++ b/fs/efs/file.c
@@ -7,7 +7,7 @@
 */
 #include <linux/buffer_head.h>
-#include <linux/efs_fs.h>
+#include "efs.h"
 int efs_get_block(struct inode *inode, sector_t iblock,
                  struct buffer_head *bh_result, int create)
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 627c3026946d..a8e7797b9477 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -7,11 +7,11 @@
 *              and from work (c) 1998 Mike Shaver.
 */
-#include <linux/efs_fs.h>
-#include <linux/efs_fs_sb.h>
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include "efs.h"
+#include <linux/efs_fs_sb.h>
 static int efs_readpage(struct file *file, struct page *page)
 {
@@ -140,7 +140,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
        brelse(bh);
   
 #ifdef DEBUG
-        printk(KERN_DEBUG "EFS: read_inode(): inode %lu, extents %d, mode %o\n",
+        printk(KERN_DEBUG "EFS: efs_iget(): inode %lu, extents %d, mode %o\n",
                inode->i_ino, in->numextents, inode->i_mode);
 #endif
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index e26704742d41..3a404e7fad53 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -8,9 +8,9 @@
 #include <linux/buffer_head.h>
 #include <linux/string.h>
-#include <linux/efs_fs.h>
 #include <linux/smp_lock.h>
 #include <linux/exportfs.h>
+#include "efs.h"
 static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) {
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 14082405cdd1..d733531b55e2 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -8,14 +8,15 @@
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/efs_fs.h>
-#include <linux/efs_vh.h>
-#include <linux/efs_fs_sb.h>
 #include <linux/exportfs.h>
 #include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
+#include "efs.h"
+#include <linux/efs_vh.h>
+#include <linux/efs_fs_sb.h>
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 1d30d2ff440f..41911ec83aaf 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -7,10 +7,10 @@
 */
 #include <linux/string.h>
-#include <linux/efs_fs.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
+#include "efs.h"
 static int efs_symlink_readpage(struct file *file, struct page *page)
 {
diff --git a/fs/exec.c b/fs/exec.c
index a44b142fb460..54a0a557b678 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -173,8 +173,15 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                return NULL;
        if (write) {
-                struct rlimit *rlim = current->signal->rlim;
                unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
+                struct rlimit *rlim;
+                /*
+                 * We've historically supported up to 32 pages (ARG_MAX)
+                 * of argument strings even with small stacks
+                 */
+                if (size <= ARG_MAX)
+                        return page;
                /*
                 * Limit to 1/4-th the stack size for the argv+env strings.
@@ -183,6 +190,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                 *  - the program will have a reasonable amount of stack left
                 *    to work from.
                 */
+                rlim = current->signal->rlim;
                if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
                        put_page(page);
                        return NULL;
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5deb8b74e649..08f647d8188d 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -253,7 +253,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
 * it has too few free inodes left (min_inodes) or 
 * it has too few free blocks left (min_blocks) or 
 * it's already running too large debt (max_debt). 
- * Parent's group is prefered, if it doesn't satisfy these 
+ * Parent's group is preferred, if it doesn't satisfy these 
 * conditions we search cyclically through the rest. If none 
 * of the groups look good we just look for a group with more 
 * free inodes than average (starting at parent's group). 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c62006805427..b8a2990bab83 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -239,7 +239,7 @@ no_block:
 *      @inode: owner
 *      @ind: descriptor of indirect block.
 *
- *      This function returns the prefered place for block allocation.
+ *      This function returns the preferred place for block allocation.
 *      It is used when heuristic for sequential allocation fails.
 *      Rules are:
 *        + if there is a block to the left of our position - allocate near it.
@@ -283,7 +283,7 @@ static unsigned long ext2_find_near(struct inode *inode, Indirect *ind)
 }
 /**
- *      ext2_find_goal - find a prefered place for allocation.
+ *      ext2_find_goal - find a preferred place for allocation.
 *      @inode: owner
 *      @block:  block we want
 *      @partial: pointer to the last triple within a chain
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index b8ea11fee5c6..de876fa793e1 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -12,6 +12,7 @@
 #include <linux/time.h>
 #include <linux/sched.h>
 #include <linux/compat.h>
+#include <linux/mount.h>
 #include <linux/smp_lock.h>
 #include <asm/current.h>
 #include <asm/uaccess.h>
@@ -23,6 +24,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        struct ext2_inode_info *ei = EXT2_I(inode);
        unsigned int flags;
        unsigned short rsv_window_size;
+        int ret;
        ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg);
@@ -34,14 +36,19 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        case EXT2_IOC_SETFLAGS: {
                unsigned int oldflags;
-                if (IS_RDONLY(inode))
+                ret = mnt_want_write(filp->f_path.mnt);
-                        return -EROFS;
+                if (ret)
+                        return ret;
-                if (!is_owner_or_cap(inode))
+                if (!is_owner_or_cap(inode)) {
-                        return -EACCES;
+                        ret = -EACCES;
+                        goto setflags_out;
+                }
-                if (get_user(flags, (int __user *) arg))
+                if (get_user(flags, (int __user *) arg)) {
-                        return -EFAULT;
+                        ret = -EFAULT;
+                        goto setflags_out;
+                }
                if (!S_ISDIR(inode->i_mode))
                        flags &= ~EXT2_DIRSYNC_FL;
@@ -50,7 +57,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                /* Is it quota file? Do not allow user to mess with it */
                if (IS_NOQUOTA(inode)) {
                        mutex_unlock(&inode->i_mutex);
-                        return -EPERM;
+                        ret = -EPERM;
+                        goto setflags_out;
                }
                oldflags = ei->i_flags;
@@ -63,7 +71,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
                        if (!capable(CAP_LINUX_IMMUTABLE)) {
                                mutex_unlock(&inode->i_mutex);
-                                return -EPERM;
+                                ret = -EPERM;
+                                goto setflags_out;
                        }
                }
@@ -75,20 +84,26 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                ext2_set_inode_flags(inode);
                inode->i_ctime = CURRENT_TIME_SEC;
                mark_inode_dirty(inode);
-                return 0;
+setflags_out:
+                mnt_drop_write(filp->f_path.mnt);
+                return ret;
        }
        case EXT2_IOC_GETVERSION:
                return put_user(inode->i_generation, (int __user *) arg);
        case EXT2_IOC_SETVERSION:
                if (!is_owner_or_cap(inode))
                        return -EPERM;
-                if (IS_RDONLY(inode))
+                ret = mnt_want_write(filp->f_path.mnt);
-                        return -EROFS;
+                if (ret)
-                if (get_user(inode->i_generation, (int __user *) arg))
+                        return ret;
-                        return -EFAULT; 
+                if (get_user(inode->i_generation, (int __user *) arg)) {
-                inode->i_ctime = CURRENT_TIME_SEC;
+                        ret = -EFAULT;
-                mark_inode_dirty(inode);
+                } else {
-                return 0;
+                        inode->i_ctime = CURRENT_TIME_SEC;
+                        mark_inode_dirty(inode);
+                }
+                mnt_drop_write(filp->f_path.mnt);
+                return ret;
        case EXT2_IOC_GETRSVSZ:
                if (test_opt(inode->i_sb, RESERVATION)
                        && S_ISREG(inode->i_mode)
@@ -102,15 +117,16 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
                        return -ENOTTY;
-                if (IS_RDONLY(inode))
+                if (!is_owner_or_cap(inode))
-                        return -EROFS;
-                if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
                        return -EACCES;
                if (get_user(rsv_window_size, (int __user *)arg))
                        return -EFAULT;
+                ret = mnt_want_write(filp->f_path.mnt);
+                if (ret)
+                        return ret;
                if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS)
                        rsv_window_size = EXT2_MAX_RESERVE_BLOCKS;
@@ -131,6 +147,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        rsv->rsv_goal_size = rsv_window_size;
                }
                mutex_unlock(&ei->truncate_mutex);
+                mnt_drop_write(filp->f_path.mnt);
                return 0;
        }
        default:
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 3e8683dbb13f..a99d46f3b26e 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -835,7 +835,7 @@ ext2_xattr_cache_insert(struct buffer_head *bh)
        struct mb_cache_entry *ce;
        int error;
-        ce = mb_cache_entry_alloc(ext2_xattr_cache);
+        ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
        if (!ce)
                return -ENOMEM;
        error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index d34e9967430a..a754d1848173 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -37,7 +37,7 @@ ext3_acl_from_disk(const void *value, size_t size)
                return ERR_PTR(-EINVAL);
        if (count == 0)
                return NULL;
-        acl = posix_acl_alloc(count, GFP_KERNEL);
+        acl = posix_acl_alloc(count, GFP_NOFS);
        if (!acl)
                return ERR_PTR(-ENOMEM);
        for (n=0; n < count; n++) {
@@ -91,7 +91,7 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
        *size = ext3_acl_size(acl->a_count);
        ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
-                        sizeof(ext3_acl_entry), GFP_KERNEL);
+                        sizeof(ext3_acl_entry), GFP_NOFS);
        if (!ext_acl)
                return ERR_PTR(-ENOMEM);
        ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
@@ -187,7 +187,7 @@ ext3_get_acl(struct inode *inode, int type)
        }
        retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
        if (retval > 0) {
-                value = kmalloc(retval, GFP_KERNEL);
+                value = kmalloc(retval, GFP_NOFS);
                if (!value)
                        return ERR_PTR(-ENOMEM);
                retval = ext3_xattr_get(inode, name_index, "", value, retval);
@@ -335,7 +335,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
                        if (error)
                                goto cleanup;
                }
-                clone = posix_acl_clone(acl, GFP_KERNEL);
+                clone = posix_acl_clone(acl, GFP_NOFS);
                error = -ENOMEM;
                if (!clone)
                        goto cleanup;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4f4020c54683..96dd5573e49b 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -239,7 +239,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
 * it has too few free inodes left (min_inodes) or
 * it has too few free blocks left (min_blocks) or
 * it's already running too large debt (max_debt).
- * Parent's group is prefered, if it doesn't satisfy these
+ * Parent's group is preferred, if it doesn't satisfy these
 * conditions we search cyclically through the rest. If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index eb95670a27eb..c683609b0e3a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -392,7 +392,7 @@ no_block:
 *      @inode: owner
 *      @ind: descriptor of indirect block.
 *
- *      This function returns the prefered place for block allocation.
+ *      This function returns the preferred place for block allocation.
 *      It is used when heuristic for sequential allocation fails.
 *      Rules are:
 *        + if there is a block to the left of our position - allocate near it.
@@ -436,12 +436,12 @@ static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
 }
 /**
- *      ext3_find_goal - find a prefered place for allocation.
+ *      ext3_find_goal - find a preferred place for allocation.
 *      @inode: owner
 *      @block:  block we want
 *      @partial: pointer to the last triple within a chain
 *
- *      Normally this function find the prefered place for block allocation,
+ *      Normally this function find the preferred place for block allocation,
 *      returns it.
 */
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 023a070f55f1..0d0c70151642 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -12,6 +12,7 @@
 #include <linux/capability.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
+#include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/compat.h>
 #include <linux/smp_lock.h>
@@ -38,14 +39,19 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                unsigned int oldflags;
                unsigned int jflag;
-                if (IS_RDONLY(inode))
+                err = mnt_want_write(filp->f_path.mnt);
-                        return -EROFS;
+                if (err)
+                        return err;
-                if (!is_owner_or_cap(inode))
+                if (!is_owner_or_cap(inode)) {
-                        return -EACCES;
+                        err = -EACCES;
+                        goto flags_out;
+                }
-                if (get_user(flags, (int __user *) arg))
+                if (get_user(flags, (int __user *) arg)) {
-                        return -EFAULT;
+                        err = -EFAULT;
+                        goto flags_out;
+                }
                if (!S_ISDIR(inode->i_mode))
                        flags &= ~EXT3_DIRSYNC_FL;
@@ -54,7 +60,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                /* Is it quota file? Do not allow user to mess with it */
                if (IS_NOQUOTA(inode)) {
                        mutex_unlock(&inode->i_mutex);
-                        return -EPERM;
+                        err = -EPERM;
+                        goto flags_out;
                }
                oldflags = ei->i_flags;
@@ -70,7 +77,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
                        if (!capable(CAP_LINUX_IMMUTABLE)) {
                                mutex_unlock(&inode->i_mutex);
-                                return -EPERM;
+                                err = -EPERM;
+                                goto flags_out;
                        }
                }
@@ -81,7 +89,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
                        if (!capable(CAP_SYS_RESOURCE)) {
                                mutex_unlock(&inode->i_mutex);
-                                return -EPERM;
+                                err = -EPERM;
+                                goto flags_out;
                        }
                }
@@ -89,7 +98,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                handle = ext3_journal_start(inode, 1);
                if (IS_ERR(handle)) {
                        mutex_unlock(&inode->i_mutex);
-                        return PTR_ERR(handle);
+                        err = PTR_ERR(handle);
+                        goto flags_out;
                }
                if (IS_SYNC(inode))
                        handle->h_sync = 1;
@@ -115,6 +125,8 @@ flags_err:
                if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
                        err = ext3_change_inode_journal_flag(inode, jflag);
                mutex_unlock(&inode->i_mutex);
+flags_out:
+                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
        case EXT3_IOC_GETVERSION:
@@ -129,14 +141,18 @@ flags_err:
                if (!is_owner_or_cap(inode))
                        return -EPERM;
-                if (IS_RDONLY(inode))
+                err = mnt_want_write(filp->f_path.mnt);
-                        return -EROFS;
+                if (err)
-                if (get_user(generation, (int __user *) arg))
+                        return err;
-                        return -EFAULT;
+                if (get_user(generation, (int __user *) arg)) {
+                        err = -EFAULT;
+                        goto setversion_out;
+                }
                handle = ext3_journal_start(inode, 1);
-                if (IS_ERR(handle))
+                if (IS_ERR(handle)) {
-                        return PTR_ERR(handle);
+                        err = PTR_ERR(handle);
+                        goto setversion_out;
+                }
                err = ext3_reserve_inode_write(handle, inode, &iloc);
                if (err == 0) {
                        inode->i_ctime = CURRENT_TIME_SEC;
@@ -144,6 +160,8 @@ flags_err:
                        err = ext3_mark_iloc_dirty(handle, inode, &iloc);
                }
                ext3_journal_stop(handle);
+setversion_out:
+                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
 #ifdef CONFIG_JBD_DEBUG
@@ -179,18 +197,24 @@ flags_err:
                }
                return -ENOTTY;
        case EXT3_IOC_SETRSVSZ: {
+                int err;
                if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
                        return -ENOTTY;
-                if (IS_RDONLY(inode))
+                err = mnt_want_write(filp->f_path.mnt);
-                        return -EROFS;
+                if (err)
+                        return err;
-                if (!is_owner_or_cap(inode))
+                if (!is_owner_or_cap(inode)) {
-                        return -EACCES;
+                        err = -EACCES;
+                        goto setrsvsz_out;
+                }
-                if (get_user(rsv_window_size, (int __user *)arg))
+                if (get_user(rsv_window_size, (int __user *)arg)) {
-                        return -EFAULT;
+                        err = -EFAULT;
+                        goto setrsvsz_out;
+                }
                if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
                        rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
@@ -208,7 +232,9 @@ flags_err:
                        rsv->rsv_goal_size = rsv_window_size;
                }
                mutex_unlock(&ei->truncate_mutex);
-                return 0;
+setrsvsz_out:
+                mnt_drop_write(filp->f_path.mnt);
+                return err;
        }
        case EXT3_IOC_GROUP_EXTEND: {
                ext3_fsblk_t n_blocks_count;
@@ -218,17 +244,20 @@ flags_err:
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
-                if (IS_RDONLY(inode))
+                err = mnt_want_write(filp->f_path.mnt);
-                        return -EROFS;
+                if (err)
+                        return err;
-                if (get_user(n_blocks_count, (__u32 __user *)arg))
-                        return -EFAULT;
+                if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+                        err = -EFAULT;
+                        goto group_extend_out;
+                }
                err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
                journal_lock_updates(EXT3_SB(sb)->s_journal);
                journal_flush(EXT3_SB(sb)->s_journal);
                journal_unlock_updates(EXT3_SB(sb)->s_journal);
+group_extend_out:
+                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
        case EXT3_IOC_GROUP_ADD: {
@@ -239,18 +268,22 @@ flags_err:
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
-                if (IS_RDONLY(inode))
+                err = mnt_want_write(filp->f_path.mnt);
-                        return -EROFS;
+                if (err)
+                        return err;
                if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
-                                sizeof(input)))
+                                sizeof(input))) {
-                        return -EFAULT;
+                        err = -EFAULT;
+                        goto group_add_out;
+                }
                err = ext3_group_add(sb, &input);
                journal_lock_updates(EXT3_SB(sb)->s_journal);
                journal_flush(EXT3_SB(sb)->s_journal);
                journal_unlock_updates(EXT3_SB(sb)->s_journal);
+group_add_out:
+                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 9397d779c43d..0e97b6e07cb0 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -485,7 +485,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
                goto exit_dindj;
        n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
-                        GFP_KERNEL);
+                        GFP_NOFS);
        if (!n_group_desc) {
                err = -ENOMEM;
                ext3_warning (sb, __FUNCTION__,
@@ -568,7 +568,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
        int res, i;
        int err;
-        primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
+        primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
        if (!primary)
                return -ENOMEM;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 18769cc32377..ad5360664082 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -806,8 +806,8 @@ static match_table_t tokens = {
        {Opt_quota, "quota"},
        {Opt_usrquota, "usrquota"},
        {Opt_barrier, "barrier=%u"},
-        {Opt_err, NULL},
        {Opt_resize, "resize"},
+        {Opt_err, NULL},
 };
 static ext3_fsblk_t get_sb_block(void **data)
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index fb89c299bece..42856541e9a5 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -728,7 +728,7 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode,
                                ce = NULL;
                        }
                        ea_bdebug(bs->bh, "cloning");
-                        s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
+                        s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
                        error = -ENOMEM;
                        if (s->base == NULL)
                                goto cleanup;
@@ -740,7 +740,7 @@ ext3_xattr_block_set(handle_t *handle, struct inode *inode,
                }
        } else {
                /* Allocate a buffer where we construct the new block. */
-                s->base = kzalloc(sb->s_blocksize, GFP_KERNEL);
+                s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
                /* assert(header == s->base) */
                error = -ENOMEM;
                if (s->base == NULL)
@@ -1126,7 +1126,7 @@ ext3_xattr_cache_insert(struct buffer_head *bh)
        struct mb_cache_entry *ce;
        int error;
-        ce = mb_cache_entry_alloc(ext3_xattr_cache);
+        ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS);
        if (!ce) {
                ea_bdebug(bh, "out of memory");
                return;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 33888bb58144..2c23bade9aa6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -46,7 +46,7 @@ const struct file_operations ext4_dir_operations = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
 #endif
-        .fsync          = ext4_sync_file,       /* BKL held */
+        .fsync          = ext4_sync_file,
        .release        = ext4_release_dir,
 };
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc7081f1fbe8..9ae6e67090cd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -148,6 +148,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_fsblk_t bg_start;
+        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
        int depth;
@@ -169,8 +170,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        /* OK. use inode's group */
        bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
                le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
-        colour = (current->pid % 16) *
+        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+        else
+                colour = (current->pid % 16) * ((last_block - bg_start) / 16);
        return bg_start + colour + block;
 }
@@ -349,7 +355,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 #define ext4_ext_show_leaf(inode,path)
 #endif
-static void ext4_ext_drop_refs(struct ext4_ext_path *path)
+void ext4_ext_drop_refs(struct ext4_ext_path *path)
 {
        int depth = path->p_depth;
        int i;
@@ -2168,6 +2174,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        newblock = iblock - ee_block + ext_pblock(ex);
        ex2 = ex;
+        err = ext4_ext_get_access(handle, inode, path + depth);
+        if (err)
+                goto out;
        /* ex1: ee_block to iblock - 1 : uninitialized */
        if (iblock > ee_block) {
                ex1 = ex;
@@ -2200,16 +2210,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                newdepth = ext_depth(inode);
                if (newdepth != depth) {
                        depth = newdepth;
-                        path = ext4_ext_find_extent(inode, iblock, NULL);
+                        ext4_ext_drop_refs(path);
+                        path = ext4_ext_find_extent(inode, iblock, path);
                        if (IS_ERR(path)) {
                                err = PTR_ERR(path);
-                                path = NULL;
                                goto out;
                        }
                        eh = path[depth].p_hdr;
                        ex = path[depth].p_ext;
                        if (ex2 != &newex)
                                ex2 = ex;
+                        err = ext4_ext_get_access(handle, inode, path + depth);
+                        if (err)
+                                goto out;
                }
                allocated = max_blocks;
        }
@@ -2230,9 +2244,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        ex2->ee_len = cpu_to_le16(allocated);
        if (ex2 != ex)
                goto insert;
-        err = ext4_ext_get_access(handle, inode, path + depth);
-        if (err)
-                goto out;
        /*
         * New (initialized) extent starts from the first block
         * in the current extent. i.e., ex2 == ex
@@ -2276,9 +2287,22 @@ out:
 }
 /*
+ * Block allocation/map/preallocation routine for extents based files
+ *
+ *
 * Need to be called with
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ *
+ * return > 0, number of blocks already mapped/allocated
+ *          if create == 0 and these are pre-allocated blocks
+ *              buffer head is unmapped
+ *          otherwise blocks are mapped
+ *
+ * return = 0, if plain look up failed (blocks have not been allocated)
+ *          buffer head is unmapped
+ *
+ * return < 0, error case.
 */
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ext4_lblk_t iblock,
@@ -2623,7 +2647,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
         * modify 1 super block, 1 block bitmap and 1 group descriptor.
         */
        credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
-        down_write((&EXT4_I(inode)->i_data_sem));
+        mutex_lock(&inode->i_mutex);
 retry:
        while (ret >= 0 && ret < max_blocks) {
                block = block + ret;
@@ -2634,16 +2658,17 @@ retry:
                        break;
                }
-                ret = ext4_ext_get_blocks(handle, inode, block,
+                ret = ext4_get_blocks_wrap(handle, inode, block,
                                          max_blocks, &map_bh,
                                          EXT4_CREATE_UNINITIALIZED_EXT, 0);
-                WARN_ON(ret <= 0);
                if (ret <= 0) {
-                        ext4_error(inode->i_sb, "ext4_fallocate",
+#ifdef EXT4FS_DEBUG
-                                    "ext4_ext_get_blocks returned error: "
+                        WARN_ON(ret <= 0);
-                                    "inode#%lu, block=%u, max_blocks=%lu",
+                        printk(KERN_ERR "%s: ext4_ext_get_blocks "
+                                    "returned error inode#%lu, block=%u, "
+                                    "max_blocks=%lu", __func__,
                                    inode->i_ino, block, max_blocks);
-                        ret = -EIO;
+#endif
                        ext4_mark_inode_dirty(handle, inode);
                        ret2 = ext4_journal_stop(handle);
                        break;
@@ -2680,7 +2705,6 @@ retry:
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
-        up_write((&EXT4_I(inode)->i_data_sem));
        /*
         * Time to update the file size.
         * Update only when preallocation was requested beyond the file size.
@@ -2692,21 +2716,18 @@ retry:
                         * if no error, we assume preallocation succeeded
                         * completely
                         */
-                        mutex_lock(&inode->i_mutex);
                        i_size_write(inode, offset + len);
                        EXT4_I(inode)->i_disksize = i_size_read(inode);
-                        mutex_unlock(&inode->i_mutex);
                } else if (ret < 0 && nblocks) {
                        /* Handle partial allocation scenario */
                        loff_t newsize;
-                        mutex_lock(&inode->i_mutex);
                        newsize  = (nblocks << blkbits) + i_size_read(inode);
                        i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits));
                        EXT4_I(inode)->i_disksize = i_size_read(inode);
-                        mutex_unlock(&inode->i_mutex);
                }
        }
+        mutex_unlock(&inode->i_mutex);
        return ret > 0 ? ret2 : ret;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index da18a74b966a..486e46a3918d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -305,7 +305,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 * it has too few free inodes left (min_inodes) or
 * it has too few free blocks left (min_blocks) or
 * it's already running too large debt (max_debt).
- * Parent's group is prefered, if it doesn't satisfy these
+ * Parent's group is preferred, if it doesn't satisfy these
 * conditions we search cyclically through the rest. If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
@@ -702,7 +702,12 @@ got:
        ei->i_dir_start_lookup = 0;
        ei->i_disksize = 0;
-        ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL;
+        /*
+         * Don't inherit extent flag from directory. We set extent flag on
+         * newly created directory and file only if -o extent mount option is
+         * specified
+         */
+        ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
        if (S_ISLNK(mode))
                ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
        /* dirsync only applies to directories */
@@ -745,12 +750,15 @@ got:
                goto fail_free_drop;
        }
        if (test_opt(sb, EXTENTS)) {
-                EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+                /* set extent flag only for directory and file */
-                ext4_ext_tree_init(handle, inode);
+                if (S_ISDIR(mode) || S_ISREG(mode)) {
-                err = ext4_update_incompat_feature(handle, sb,
+                        EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
-                                                EXT4_FEATURE_INCOMPAT_EXTENTS);
+                        ext4_ext_tree_init(handle, inode);
-                if (err)
+                        err = ext4_update_incompat_feature(handle, sb,
-                        goto fail;
+                                        EXT4_FEATURE_INCOMPAT_EXTENTS);
+                        if (err)
+                                goto fail;
+                }
        }
        ext4_debug("allocating inode %lu\n", inode->i_ino);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7dd9b50d5ebc..8fab233cb05f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -382,7 +382,7 @@ no_block:
 *      @inode: owner
 *      @ind: descriptor of indirect block.
 *
- *      This function returns the prefered place for block allocation.
+ *      This function returns the preferred place for block allocation.
 *      It is used when heuristic for sequential allocation fails.
 *      Rules are:
 *        + if there is a block to the left of our position - allocate near it.
@@ -403,6 +403,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
        __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
        __le32 *p;
        ext4_fsblk_t bg_start;
+        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
        /* Try to find previous block */
@@ -420,18 +421,23 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
         * into the same cylinder group then.
         */
        bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
-        colour = (current->pid % 16) *
+        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+        else
+                colour = (current->pid % 16) * ((last_block - bg_start) / 16);
        return bg_start + colour;
 }
 /**
- *      ext4_find_goal - find a prefered place for allocation.
+ *      ext4_find_goal - find a preferred place for allocation.
 *      @inode: owner
 *      @block:  block we want
 *      @partial: pointer to the last triple within a chain
 *
- *      Normally this function find the prefered place for block allocation,
+ *      Normally this function finds the preferred place for block allocation,
 *      returns it.
 */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
@@ -768,7 +774,6 @@ err_out:
 *
 * `handle' can be NULL if create == 0.
 *
- * The BKL may not be held on entry here.  Be sure to take it early.
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
@@ -903,11 +908,38 @@ out:
 */
 #define DIO_CREDITS 25
+/*
+ *
+ *
+ * ext4 get_block() wrapper function
+ * It will do a look up first, and return if the blocks are already mapped.
+ * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
+ * stores the allocated blocks in the result buffer head and marks it
+ * mapped.
+ *
+ * If file type is extents based, it will call ext4_ext_get_blocks(),
+ * otherwise it calls ext4_get_blocks_handle() to handle indirect mapping
+ * based files.
+ *
+ * On success, it returns the number of blocks mapped or allocated.
+ * If create == 0 and the blocks are pre-allocated and uninitialized,
+ * the result buffer head is unmapped. If create == 1, it will make sure
+ * the buffer head is mapped.
+ *
+ * It returns 0 if plain look up failed (blocks have not been allocated); in
+ * that case, the buffer head is unmapped.
+ *
+ * It returns the error in case of allocation failure.
+ */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                        unsigned long max_blocks, struct buffer_head *bh,
                        int create, int extend_disksize)
 {
        int retval;
+        clear_buffer_mapped(bh);
        /*
         * Try to see if we can get  the block without requesting
         * for new file system block.
@@ -921,12 +953,26 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                                inode, block, max_blocks, bh, 0, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));
-        if (!create || (retval > 0))
+        /* If it is only a block(s) look up */
+        if (!create)
+                return retval;
+        /*
+         * Return if the blocks have already been allocated.
+         *
+         * Note that if blocks have been preallocated,
+         * ext4_ext_get_blocks() called with create = 0 returns
+         * with the buffer head unmapped.
+         */
+        if (retval > 0 && buffer_mapped(bh))
                return retval;
        /*
-         * We need to allocate new blocks which will result
+         * Allocating new blocks and/or writing to an uninitialized extent
-         * in i_data update
+         * will possibly result in updating i_data, so we take
+         * the write lock of i_data_sem, and call get_blocks()
+         * with create == 1 flag.
         */
        down_write((&EXT4_I(inode)->i_data_sem));
        /*
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2ed7c37f897e..25b13ede8086 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/time.h>
 #include <linux/compat.h>
 #include <linux/smp_lock.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
@@ -38,24 +39,25 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                unsigned int oldflags;
                unsigned int jflag;
-                if (IS_RDONLY(inode))
-                        return -EROFS;
                if (!is_owner_or_cap(inode))
                        return -EACCES;
                if (get_user(flags, (int __user *) arg))
                        return -EFAULT;
+                err = mnt_want_write(filp->f_path.mnt);
+                if (err)
+                        return err;
                if (!S_ISDIR(inode->i_mode))
                        flags &= ~EXT4_DIRSYNC_FL;
+                err = -EPERM;
                mutex_lock(&inode->i_mutex);
                /* Is it quota file? Do not allow user to mess with it */
-                if (IS_NOQUOTA(inode)) {
+                if (IS_NOQUOTA(inode))
-                        mutex_unlock(&inode->i_mutex);
+                        goto flags_out;
-                        return -EPERM;
-                }
                oldflags = ei->i_flags;
                /* The JOURNAL_DATA flag is modifiable only by root */
@@ -68,10 +70,8 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                 * This test looks nicer. Thanks to Pauline Middelink
                 */
                if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
-                        if (!capable(CAP_LINUX_IMMUTABLE)) {
+                        if (!capable(CAP_LINUX_IMMUTABLE))
-                                mutex_unlock(&inode->i_mutex);
+                                goto flags_out;
-                                return -EPERM;
-                        }
                }
                /*
@@ -79,17 +79,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                 * the relevant capability.
                 */
                if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
-                        if (!capable(CAP_SYS_RESOURCE)) {
+                        if (!capable(CAP_SYS_RESOURCE))
-                                mutex_unlock(&inode->i_mutex);
+                                goto flags_out;
-                                return -EPERM;
-                        }
                }
                handle = ext4_journal_start(inode, 1);
                if (IS_ERR(handle)) {
-                        mutex_unlock(&inode->i_mutex);
+                        err = PTR_ERR(handle);
-                        return PTR_ERR(handle);
+                        goto flags_out;
                }
                if (IS_SYNC(inode))
                        handle->h_sync = 1;
@@ -107,14 +104,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
                ext4_journal_stop(handle);
-                if (err) {
+                if (err)
-                        mutex_unlock(&inode->i_mutex);
+                        goto flags_out;
-                        return err;
-                }
                if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
                        err = ext4_change_inode_journal_flag(inode, jflag);
+flags_out:
                mutex_unlock(&inode->i_mutex);
+                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
        case EXT4_IOC_GETVERSION:
@@ -129,14 +126,20 @@ flags_err:
                if (!is_owner_or_cap(inode))
                        return -EPERM;
-                if (IS_RDONLY(inode))
-                        return -EROFS;
+                err = mnt_want_write(filp->f_path.mnt);
-                if (get_user(generation, (int __user *) arg))
+                if (err)
-                        return -EFAULT;
+                        return err;
+                if (get_user(generation, (int __user *) arg)) {
+                        err = -EFAULT;
+                        goto setversion_out;
+                }
                handle = ext4_journal_start(inode, 1);
-                if (IS_ERR(handle))
+                if (IS_ERR(handle)) {
-                        return PTR_ERR(handle);
+                        err = PTR_ERR(handle);
+                        goto setversion_out;
+                }
                err = ext4_reserve_inode_write(handle, inode, &iloc);
                if (err == 0) {
                        inode->i_ctime = ext4_current_time(inode);
@@ -144,6 +147,8 @@ flags_err:
                        err = ext4_mark_iloc_dirty(handle, inode, &iloc);
                }
                ext4_journal_stop(handle);
+setversion_out:
+                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
 #ifdef CONFIG_JBD2_DEBUG
@@ -179,19 +184,21 @@ flags_err:
                }
                return -ENOTTY;
        case EXT4_IOC_SETRSVSZ: {
+                int err;
                if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
                        return -ENOTTY;
-                if (IS_RDONLY(inode))
-                        return -EROFS;
                if (!is_owner_or_cap(inode))
                        return -EACCES;
                if (get_user(rsv_window_size, (int __user *)arg))
                        return -EFAULT;
+                err = mnt_want_write(filp->f_path.mnt);
+                if (err)
+                        return err;
                if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
                        rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
@@ -208,6 +215,7 @@ flags_err:
                        rsv->rsv_goal_size = rsv_window_size;
                }
                up_write(&ei->i_data_sem);
+                mnt_drop_write(filp->f_path.mnt);
                return 0;
        }
        case EXT4_IOC_GROUP_EXTEND: {
@@ -218,16 +226,18 @@ flags_err:
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
-                if (IS_RDONLY(inode))
-                        return -EROFS;
                if (get_user(n_blocks_count, (__u32 __user *)arg))
                        return -EFAULT;
+                err = mnt_want_write(filp->f_path.mnt);
+                if (err)
+                        return err;
                err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
                jbd2_journal_flush(EXT4_SB(sb)->s_journal);
                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
@@ -239,17 +249,19 @@ flags_err:
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
-                if (IS_RDONLY(inode))
-                        return -EROFS;
                if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
                                sizeof(input)))
                        return -EFAULT;
+                err = mnt_want_write(filp->f_path.mnt);
+                if (err)
+                        return err;
                err = ext4_group_add(sb, &input);
                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
                jbd2_journal_flush(EXT4_SB(sb)->s_journal);
                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+                mnt_drop_write(filp->f_path.mnt);
                return err;
        }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dd0fcfcb35ce..ef97f19c2f9d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -627,21 +627,19 @@ static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
        return block;
 }
+static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
+{
 #if BITS_PER_LONG == 64
-#define mb_correct_addr_and_bit(bit, addr)              \
+        *bit += ((unsigned long) addr & 7UL) << 3;
-{                                                       \
+        addr = (void *) ((unsigned long) addr & ~7UL);
-        bit += ((unsigned long) addr & 7UL) << 3;       \
-        addr = (void *) ((unsigned long) addr & ~7UL);  \
-}
 #elif BITS_PER_LONG == 32
-#define mb_correct_addr_and_bit(bit, addr)              \
+        *bit += ((unsigned long) addr & 3UL) << 3;
-{                                                       \
+        addr = (void *) ((unsigned long) addr & ~3UL);
-        bit += ((unsigned long) addr & 3UL) << 3;       \
-        addr = (void *) ((unsigned long) addr & ~3UL);  \
-}
 #else
 #error "how many bits you are?!"
 #endif
+        return addr;
+}
 static inline int mb_test_bit(int bit, void *addr)
 {
@@ -649,34 +647,54 @@ static inline int mb_test_bit(int bit, void *addr)
         * ext4_test_bit on architecture like powerpc
         * needs unsigned long aligned address
         */
-        mb_correct_addr_and_bit(bit, addr);
+        addr = mb_correct_addr_and_bit(&bit, addr);
        return ext4_test_bit(bit, addr);
 }
 static inline void mb_set_bit(int bit, void *addr)
 {
-        mb_correct_addr_and_bit(bit, addr);
+        addr = mb_correct_addr_and_bit(&bit, addr);
        ext4_set_bit(bit, addr);
 }
 static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
 {
-        mb_correct_addr_and_bit(bit, addr);
+        addr = mb_correct_addr_and_bit(&bit, addr);
        ext4_set_bit_atomic(lock, bit, addr);
 }
 static inline void mb_clear_bit(int bit, void *addr)
 {
-        mb_correct_addr_and_bit(bit, addr);
+        addr = mb_correct_addr_and_bit(&bit, addr);
        ext4_clear_bit(bit, addr);
 }
 static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
 {
-        mb_correct_addr_and_bit(bit, addr);
+        addr = mb_correct_addr_and_bit(&bit, addr);
        ext4_clear_bit_atomic(lock, bit, addr);
 }
+static inline int mb_find_next_zero_bit(void *addr, int max, int start)
+{
+        int fix = 0;
+        addr = mb_correct_addr_and_bit(&fix, addr);
+        max += fix;
+        start += fix;
+        return ext4_find_next_zero_bit(addr, max, start) - fix;
+}
+static inline int mb_find_next_bit(void *addr, int max, int start)
+{
+        int fix = 0;
+        addr = mb_correct_addr_and_bit(&fix, addr);
+        max += fix;
+        start += fix;
+        return ext4_find_next_bit(addr, max, start) - fix;
+}
 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 {
        char *bb;
@@ -906,7 +924,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
        unsigned short chunk;
        unsigned short border;
-        BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
+        BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
        border = 2 << sb->s_blocksize_bits;
@@ -946,12 +964,12 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
        /* initialize buddy from bitmap which is aggregation
         * of on-disk bitmap and preallocations */
-        i = ext4_find_next_zero_bit(bitmap, max, 0);
+        i = mb_find_next_zero_bit(bitmap, max, 0);
        grp->bb_first_free = i;
        while (i < max) {
                fragments++;
                first = i;
-                i = ext4_find_next_bit(bitmap, max, i);
+                i = mb_find_next_bit(bitmap, max, i);
                len = i - first;
                free += len;
                if (len > 1)
@@ -959,7 +977,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
                else
                        grp->bb_counters[0]++;
                if (i < max)
-                        i = ext4_find_next_zero_bit(bitmap, max, i);
+                        i = mb_find_next_zero_bit(bitmap, max, i);
        }
        grp->bb_fragments = fragments;
@@ -967,6 +985,10 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
                ext4_error(sb, __FUNCTION__,
                        "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
                        group, free, grp->bb_free);
+                /*
+                 * If we intend to continue, we consider the group descriptor
+                 * corrupt and update bb_free using the bitmap value
+                 */
                grp->bb_free = free;
        }
@@ -1778,7 +1800,7 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
                buddy = mb_find_buddy(e4b, i, &max);
                BUG_ON(buddy == NULL);
-                k = ext4_find_next_zero_bit(buddy, max, 0);
+                k = mb_find_next_zero_bit(buddy, max, 0);
                BUG_ON(k >= max);
                ac->ac_found++;
@@ -1818,11 +1840,11 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
        i = e4b->bd_info->bb_first_free;
        while (free && ac->ac_status == AC_STATUS_CONTINUE) {
-                i = ext4_find_next_zero_bit(bitmap,
+                i = mb_find_next_zero_bit(bitmap,
                                                EXT4_BLOCKS_PER_GROUP(sb), i);
                if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
                        /*
-                         * IF we corrupt the bitmap  we won't find any
+                         * If we have a corrupt bitmap, we won't find any
                         * free blocks even though group info says we
                         * we have free blocks
                         */
@@ -1838,6 +1860,12 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                        ext4_error(sb, __FUNCTION__, "%d free blocks as per "
                                        "group info. But got %d blocks\n",
                                        free, ex.fe_len);
+                        /*
+                         * The number of free blocks differs. This most likely
+                         * indicates that the bitmap is corrupt. So exit
+                         * without claiming the space.
+                         */
+                        break;
                }
                ext4_mb_measure_extent(ac, &ex, e4b);
@@ -3740,10 +3768,10 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
        }
        while (bit < end) {
-                bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit);
+                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
                if (bit >= end)
                        break;
-                next = ext4_find_next_bit(bitmap_bh->b_data, end, bit);
+                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
                if (next > end)
                        next = end;
                start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
@@ -3771,6 +3799,10 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
                        (unsigned long) pa->pa_len);
                ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n",
                                                free, pa->pa_free);
+                /*
+                 * pa is already deleted so we use the value obtained
+                 * from the bitmap and continue.
+                 */
        }
        atomic_add(free, &sbi->s_mb_discarded);
        if (ac)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8c6c685b9d22..5c1e27de7755 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -43,6 +43,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
        if (IS_ERR(path)) {
                retval = PTR_ERR(path);
+                path = NULL;
                goto err_out;
        }
@@ -74,6 +75,10 @@ static int finish_range(handle_t *handle, struct inode *inode,
        }
        retval = ext4_ext_insert_extent(handle, inode, path, &newext);
 err_out:
+        if (path) {
+                ext4_ext_drop_refs(path);
+                kfree(path);
+        }
        lb->first_pblock = 0;
        return retval;
 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a9347fb43bcc..28aa2ed4297e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1804,12 +1804,8 @@ retry:
        inode->i_fop = &ext4_dir_operations;
        inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
        dir_block = ext4_bread (handle, inode, 0, 1, &err);
-        if (!dir_block) {
+        if (!dir_block)
-                ext4_dec_count(handle, inode); /* is this nlink == 0? */
+                goto out_clear_inode;
-                ext4_mark_inode_dirty(handle, inode);
-                iput (inode);
-                goto out_stop;
-        }
        BUFFER_TRACE(dir_block, "get_write_access");
        ext4_journal_get_write_access(handle, dir_block);
        de = (struct ext4_dir_entry_2 *) dir_block->b_data;
@@ -1832,7 +1828,8 @@ retry:
        ext4_mark_inode_dirty(handle, inode);
        err = ext4_add_entry (handle, dentry, inode);
        if (err) {
-                inode->i_nlink = 0;
+out_clear_inode:
+                clear_nlink(inode);
                ext4_mark_inode_dirty(handle, inode);
                iput (inode);
                goto out_stop;
@@ -2164,7 +2161,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
        dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
        ext4_update_dx_flag(dir);
        ext4_mark_inode_dirty(handle, dir);
-        ext4_dec_count(handle, inode);
+        drop_nlink(inode);
        if (!inode->i_nlink)
                ext4_orphan_add(handle, inode);
        inode->i_ctime = ext4_current_time(inode);
@@ -2214,7 +2211,7 @@ retry:
                err = __page_symlink(inode, symname, l,
                                mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
                if (err) {
-                        ext4_dec_count(handle, inode);
+                        clear_nlink(inode);
                        ext4_mark_inode_dirty(handle, inode);
                        iput (inode);
                        goto out_stop;
@@ -2223,7 +2220,6 @@ retry:
                inode->i_op = &ext4_fast_symlink_inode_operations;
                memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
                inode->i_size = l-1;
-                EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
        }
        EXT4_I(inode)->i_disksize = inode->i_size;
        err = ext4_add_nondir(handle, dentry, inode);
@@ -2407,7 +2403,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
                ext4_dec_count(handle, old_dir);
                if (new_inode) {
                        /* checked empty_dir above, can't have another parent,
-                         * ext3_dec_count() won't work for many-linked dirs */
+                         * ext4_dec_count() won't work for many-linked dirs */
                        new_inode->i_nlink = 0;
                } else {
                        ext4_inc_count(handle, new_dir);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9477a2bd6ff2..e29efa0f9d62 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1037,6 +1037,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                ext4_warning(sb, __FUNCTION__,
                             "multiple resizers run on filesystem!");
                unlock_super(sb);
+                ext4_journal_stop(handle);
                err = -EBUSY;
                goto exit_put;
        }
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index d7962139c010..e9054c1c7d93 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1386,7 +1386,7 @@ ext4_xattr_cache_insert(struct buffer_head *bh)
        struct mb_cache_entry *ce;
        int error;
-        ce = mb_cache_entry_alloc(ext4_xattr_cache);
+        ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS);
        if (!ce) {
                ea_bdebug(bh, "out of memory");
                return;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index c614175876e0..2a3bed967041 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
 #include <linux/capability.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
@@ -46,10 +47,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
                mutex_lock(&inode->i_mutex);
-                if (IS_RDONLY(inode)) {
+                err = mnt_want_write(filp->f_path.mnt);
-                        err = -EROFS;
+                if (err)
-                        goto up;
+                        goto up_no_drop_write;
-                }
                /*
                 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -105,7 +105,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
                MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
                mark_inode_dirty(inode);
-        up:
+up:
+                mnt_drop_write(filp->f_path.mnt);
+up_no_drop_write:
                mutex_unlock(&inode->i_mutex);
                return err;
        }
diff --git a/fs/file_table.c b/fs/file_table.c
index 6d27befe2d48..7a0a9b872251 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -42,6 +42,7 @@ static inline void file_free_rcu(struct rcu_head *head)
 static inline void file_free(struct file *f)
 {
        percpu_counter_dec(&nr_files);
+        file_check_state(f);
        call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
@@ -83,6 +84,12 @@ int proc_nr_files(ctl_table *table, int write, struct file *filp,
 /* Find an unused file structure and return a pointer to it.
 * Returns NULL, if there are no more free file structures or
 * we run out of memory.
+ *
+ * Be very careful using this.  You are responsible for
+ * getting write access to any mount that you might assign
+ * to this filp, if it is opened for write.  If this is not
+ * done, you will imbalance the mount's writer count
+ * and get a warning at __fput() time.
 */
 struct file *get_empty_filp(void)
 {
@@ -193,6 +200,18 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
        file->f_mapping = dentry->d_inode->i_mapping;
        file->f_mode = mode;
        file->f_op = fop;
+        /*
+         * These mounts don't really matter in practice
+         * for r/o bind mounts.  They aren't userspace-
+         * visible.  We do this for consistency, and so
+         * that we can do debugging checks at __fput()
+         */
+        if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
+                file_take_write(file);
+                error = mnt_want_write(mnt);
+                WARN_ON(error);
+        }
        return error;
 }
 EXPORT_SYMBOL(init_file);
@@ -205,6 +224,31 @@ void fput(struct file *file)
 EXPORT_SYMBOL(fput);
+/**
+ * drop_file_write_access - give up ability to write to a file
+ * @file: the file to which we will stop writing
+ *
+ * This is a central place which will give up the ability
+ * to write to @file, along with access to write through
+ * its vfsmount.
+ */
+void drop_file_write_access(struct file *file)
+{
+        struct vfsmount *mnt = file->f_path.mnt;
+        struct dentry *dentry = file->f_path.dentry;
+        struct inode *inode = dentry->d_inode;
+        put_write_access(inode);
+        if (special_file(inode->i_mode))
+                return;
+        if (file_check_writeable(file) != 0)
+                return;
+        mnt_drop_write(mnt);
+        file_release_write(file);
+}
+EXPORT_SYMBOL_GPL(drop_file_write_access);
 /* __fput is called from task context when aio completion releases the last
 * last use of a struct file *.  Do not use otherwise.
 */
@@ -230,10 +274,10 @@ void __fput(struct file *file)
        if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
                cdev_put(inode->i_cdev);
        fops_put(file->f_op);
-        if (file->f_mode & FMODE_WRITE)
-                put_write_access(inode);
        put_pid(file->f_owner.pid);
        file_kill(file);
+        if (file->f_mode & FMODE_WRITE)
+                drop_file_write_access(file);
        file->f_path.dentry = NULL;
        file->f_path.mnt = NULL;
        file_free(file);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c0076077d338..06557679ca41 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -751,7 +751,7 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 EXPORT_SYMBOL(generic_osync_inode);
 /**
- * writeback_acquire: attempt to get exclusive writeback access to a device
+ * writeback_acquire - attempt to get exclusive writeback access to a device
 * @bdi: the device's backing_dev_info structure
 *
 * It is a waste of resources to have more than one pdflush thread blocked on
@@ -768,7 +768,7 @@ int writeback_acquire(struct backing_dev_info *bdi)
 }
 /**
- * writeback_in_progress: determine whether there is writeback in progress
+ * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback in progress against a backing device.
@@ -779,7 +779,7 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 }
 /**
- * writeback_release: relinquish exclusive writeback access against a device.
+ * writeback_release - relinquish exclusive writeback access against a device.
 * @bdi: the device's backing_dev_info structure
 */
 void writeback_release(struct backing_dev_info *bdi)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 7fb514b6d852..c4807b3fc8a3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -906,7 +906,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
        }
        if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
-                int err = generic_permission(inode, mask, NULL);
+                err = generic_permission(inode, mask, NULL);
                /* If permission is denied, try to refresh file
                   attributes.  This is also needed, because the root
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index de8e64c03f73..7f7947e3dfbb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
        tristate "GFS2 file system support"
-        depends on EXPERIMENTAL
+        depends on EXPERIMENTAL && (64BIT || (LSF && LBD))
        select FS_POSIX_ACL
        select CRC32
        help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 8fff11058cee..e2350df02a07 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
-        glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
+        glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
        mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
        ops_fstype.o ops_inode.o ops_super.o quota.o \
        recovery.o rgrp.o super.o sys.o trans.o util.o
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 1047a8c7226a..3e9bd46f27e3 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -116,7 +116,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
                goto out;
        er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
-        er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
+        er.er_data = kmalloc(er.er_data_len, GFP_NOFS);
        error = -ENOMEM;
        if (!er.er_data)
                goto out;
@@ -222,7 +222,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
                return error;
        }
-        clone = posix_acl_clone(acl, GFP_KERNEL);
+        clone = posix_acl_clone(acl, GFP_NOFS);
        error = -ENOMEM;
        if (!clone)
                goto out;
@@ -272,7 +272,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
        if (!acl)
                return gfs2_setattr_simple(ip, attr);
-        clone = posix_acl_clone(acl, GFP_KERNEL);
+        clone = posix_acl_clone(acl, GFP_NOFS);
        error = -ENOMEM;
        if (!clone)
                goto out;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e9456ebd3bb6..c19184f2e70e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -33,6 +33,7 @@
 * keep it small.
 */
 struct metapath {
+        struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
        __u16 mp_list[GFS2_MAX_META_HEIGHT];
 };
@@ -135,9 +136,10 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
                /* Get a free block, fill it with the stuffed data,
                   and write it out to disk */
+                unsigned int n = 1;
+                block = gfs2_alloc_block(ip, &n);
                if (isdir) {
-                        block = gfs2_alloc_meta(ip);
+                        gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
                        error = gfs2_dir_get_new_buffer(ip, block, &bh);
                        if (error)
                                goto out_brelse;
@@ -145,8 +147,6 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
                                              dibh, sizeof(struct gfs2_dinode));
                        brelse(bh);
                } else {
-                        block = gfs2_alloc_data(ip);
                        error = gfs2_unstuffer_page(ip, dibh, block, page);
                        if (error)
                                goto out_brelse;
@@ -161,12 +161,11 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
        if (ip->i_di.di_size) {
                *(__be64 *)(di + 1) = cpu_to_be64(block);
-                ip->i_di.di_blocks++;
+                gfs2_add_inode_blocks(&ip->i_inode, 1);
-                gfs2_set_inode_blocks(&ip->i_inode);
+                di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
-                di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
        }
-        ip->i_di.di_height = 1;
+        ip->i_height = 1;
        di->di_height = cpu_to_be16(1);
 out_brelse:
@@ -176,114 +175,13 @@ out:
        return error;
 }
-/**
- * calc_tree_height - Calculate the height of a metadata tree
- * @ip: The GFS2 inode
- * @size: The proposed size of the file
- *
- * Work out how tall a metadata tree needs to be in order to accommodate a
- * file of a particular size. If size is less than the current size of
- * the inode, then the current size of the inode is used instead of the
- * supplied one.
- *
- * Returns: the height the tree should be
- */
-static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
-{
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        u64 *arr;
-        unsigned int max, height;
-        if (ip->i_di.di_size > size)
-                size = ip->i_di.di_size;
-        if (gfs2_is_dir(ip)) {
-                arr = sdp->sd_jheightsize;
-                max = sdp->sd_max_jheight;
-        } else {
-                arr = sdp->sd_heightsize;
-                max = sdp->sd_max_height;
-        }
-        for (height = 0; height < max; height++)
-                if (arr[height] >= size)
-                        break;
-        return height;
-}
-/**
- * build_height - Build a metadata tree of the requested height
- * @ip: The GFS2 inode
- * @height: The height to build to
- *
- *
- * Returns: errno
- */
-static int build_height(struct inode *inode, unsigned height)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        unsigned new_height = height - ip->i_di.di_height;
-        struct buffer_head *dibh;
-        struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
-        struct gfs2_dinode *di;
-        int error;
-        __be64 *bp;
-        u64 bn;
-        unsigned n;
-        if (height <= ip->i_di.di_height)
-                return 0;
-        error = gfs2_meta_inode_buffer(ip, &dibh);
-        if (error)
-                return error;
-        for(n = 0; n < new_height; n++) {
-                bn = gfs2_alloc_meta(ip);
-                blocks[n] = gfs2_meta_new(ip->i_gl, bn);
-                gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
-        }
-        n = 0;
-        bn = blocks[0]->b_blocknr;
-        if (new_height > 1) {
-                for(; n < new_height-1; n++) {
-                        gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
-                                          GFS2_FORMAT_IN);
-                        gfs2_buffer_clear_tail(blocks[n],
-                                               sizeof(struct gfs2_meta_header));
-                        bp = (__be64 *)(blocks[n]->b_data +
-                                     sizeof(struct gfs2_meta_header));
-                        *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
-                        brelse(blocks[n]);
-                        blocks[n] = NULL;
-                }
-        }
-        gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
-        gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
-                              dibh, sizeof(struct gfs2_dinode));
-        brelse(blocks[n]);
-        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-        di = (struct gfs2_dinode *)dibh->b_data;
-        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
-        *(__be64 *)(di + 1) = cpu_to_be64(bn);
-        ip->i_di.di_height += new_height;
-        ip->i_di.di_blocks += new_height;
-        gfs2_set_inode_blocks(&ip->i_inode);
-        di->di_height = cpu_to_be16(ip->i_di.di_height);
-        di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
-        brelse(dibh);
-        return error;
-}
 /**
 * find_metapath - Find path through the metadata tree
- * @ip: The inode pointer
+ * @sdp: The superblock
 * @mp: The metapath to return the result in
 * @block: The disk block to look up
+ * @height: The pre-calculated height of the metadata tree
 *
 *   This routine returns a struct metapath structure that defines a path
 *   through the metadata of inode "ip" to get to block "block".
@@ -338,21 +236,29 @@ static int build_height(struct inode *inode, unsigned height)
 *
 */
-static void find_metapath(struct gfs2_inode *ip, u64 block,
+static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
-                          struct metapath *mp)
+                          struct metapath *mp, unsigned int height)
 {
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        u64 b = block;
        unsigned int i;
-        for (i = ip->i_di.di_height; i--;)
+        for (i = height; i--;)
-                mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
+                mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
+}
+static inline unsigned int zero_metapath_length(const struct metapath *mp,
+                                                unsigned height)
+{
+        unsigned int i;
+        for (i = 0; i < height - 1; i++) {
+                if (mp->mp_list[i] != 0)
+                        return i;
+        }
+        return height;
 }
 /**
 * metapointer - Return pointer to start of metadata in a buffer
- * @bh: The buffer
 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 *
@@ -361,93 +267,302 @@ static void find_metapath(struct gfs2_inode *ip, u64 block,
 * metadata tree.
 */
-static inline __be64 *metapointer(struct buffer_head *bh, int *boundary,
+static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
-                               unsigned int height, const struct metapath *mp)
 {
+        struct buffer_head *bh = mp->mp_bh[height];
        unsigned int head_size = (height > 0) ?
                sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
-        __be64 *ptr;
+        return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
-        *boundary = 0;
-        ptr = ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
-        if (ptr + 1 == (__be64 *)(bh->b_data + bh->b_size))
-                *boundary = 1;
-        return ptr;
 }
 /**
- * lookup_block - Get the next metadata block in metadata tree
+ * lookup_metapath - Walk the metadata tree to a specific point
- * @ip: The GFS2 inode
+ * @ip: The inode
- * @bh: Buffer containing the pointers to metadata blocks
- * @height: The height of the tree (0 = dinode)
 * @mp: The metapath
- * @create: Non-zero if we may create a new meatdata block
- * @new: Used to indicate if we did create a new metadata block
- * @block: the returned disk block number
 *
- * Given a metatree, complete to a particular height, checks to see if the next
+ * Assumes that the inode's buffer has already been looked up and
- * height of the tree exists. If not the next height of the tree is created.
+ * hooked onto mp->mp_bh[0] and that the metapath has been initialised
- * The block number of the next height of the metadata tree is returned.
+ * by find_metapath().
+ *
+ * If this function encounters part of the tree which has not been
+ * allocated, it returns the current height of the tree at the point
+ * at which it found the unallocated block. Blocks which are found are
+ * added to the mp->mp_bh[] list.
 *
+ * Returns: error or height of metadata tree
 */
-static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
+static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
-                        unsigned int height, struct metapath *mp, int create,
-                        int *new, u64 *block)
 {
-        int boundary;
+        unsigned int end_of_metadata = ip->i_height - 1;
-        __be64 *ptr = metapointer(bh, &boundary, height, mp);
+        unsigned int x;
+        __be64 *ptr;
+        u64 dblock;
+        int ret;
-        if (*ptr) {
+        for (x = 0; x < end_of_metadata; x++) {
-                *block = be64_to_cpu(*ptr);
+                ptr = metapointer(x, mp);
-                return boundary;
+                dblock = be64_to_cpu(*ptr);
-        }
+                if (!dblock)
+                        return x + 1;
-        *block = 0;
+                ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]);
+                if (ret)
+                        return ret;
+        }
-        if (!create)
+        return ip->i_height;
-                return 0;
+}
-        if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
+static inline void release_metapath(struct metapath *mp)
-                *block = gfs2_alloc_data(ip);
+{
-        else
+        int i;
-                *block = gfs2_alloc_meta(ip);
-        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+        for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
+                if (mp->mp_bh[i] == NULL)
+                        break;
+                brelse(mp->mp_bh[i]);
+        }
+}
-        *ptr = cpu_to_be64(*block);
+/**
-        ip->i_di.di_blocks++;
+ * gfs2_extent_length - Returns length of an extent of blocks
-        gfs2_set_inode_blocks(&ip->i_inode);
+ * @start: Start of the buffer
+ * @len: Length of the buffer in bytes
+ * @ptr: Current position in the buffer
+ * @limit: Max extent length to return (0 = unlimited)
+ * @eob: Set to 1 if we hit "end of block"
+ *
+ * If the first block is zero (unallocated) it will return the number of
+ * unallocated blocks in the extent, otherwise it will return the number
+ * of contiguous blocks in the extent.
+ *
+ * Returns: The length of the extent (minimum of one block)
+ */
-        *new = 1;
+static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
-        return 0;
+{
+        const __be64 *end = (start + len);
+        const __be64 *first = ptr;
+        u64 d = be64_to_cpu(*ptr);
+        *eob = 0;
+        do {
+                ptr++;
+                if (ptr >= end)
+                        break;
+                if (limit && --limit == 0)
+                        break;
+                if (d)
+                        d++;
+        } while(be64_to_cpu(*ptr) == d);
+        if (ptr >= end)
+                *eob = 1;
+        return (ptr - first);
 }
-static inline void bmap_lock(struct inode *inode, int create)
+static inline void bmap_lock(struct gfs2_inode *ip, int create)
 {
-        struct gfs2_inode *ip = GFS2_I(inode);
        if (create)
                down_write(&ip->i_rw_mutex);
        else
                down_read(&ip->i_rw_mutex);
 }
-static inline void bmap_unlock(struct inode *inode, int create)
+static inline void bmap_unlock(struct gfs2_inode *ip, int create)
 {
-        struct gfs2_inode *ip = GFS2_I(inode);
        if (create)
                up_write(&ip->i_rw_mutex);
        else
                up_read(&ip->i_rw_mutex);
 }
+static inline __be64 *gfs2_indirect_init(struct metapath *mp,
+                                         struct gfs2_glock *gl, unsigned int i,
+                                         unsigned offset, u64 bn)
+{
+        __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
+                       ((i > 1) ? sizeof(struct gfs2_meta_header) :
+                                 sizeof(struct gfs2_dinode)));
+        BUG_ON(i < 1);
+        BUG_ON(mp->mp_bh[i] != NULL);
+        mp->mp_bh[i] = gfs2_meta_new(gl, bn);
+        gfs2_trans_add_bh(gl, mp->mp_bh[i], 1);
+        gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+        gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
+        ptr += offset;
+        *ptr = cpu_to_be64(bn);
+        return ptr;
+}
+enum alloc_state {
+        ALLOC_DATA = 0,
+        ALLOC_GROW_DEPTH = 1,
+        ALLOC_GROW_HEIGHT = 2,
+        /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
+};
+/**
+ * gfs2_bmap_alloc - Build a metadata tree of the requested height
+ * @inode: The GFS2 inode
+ * @lblock: The logical starting block of the extent
+ * @bh_map: This is used to return the mapping details
+ * @mp: The metapath
+ * @sheight: The starting height (i.e. what's already mapped)
+ * @height: The height to build to
+ * @maxlen: The max number of data blocks to alloc
+ *
+ * In this routine we may have to alloc:
+ *   i) Indirect blocks to grow the metadata tree height
+ *  ii) Indirect blocks to fill in lower part of the metadata tree
+ * iii) Data blocks
+ *
+ * The function is in two parts. The first part works out the total
+ * number of blocks which we need. The second part does the actual
+ * allocation asking for an extent at a time (if enough contiguous free
+ * blocks are available, there will only be one request per bmap call)
+ * and uses the state machine to initialise the blocks in order.
+ *
+ * Returns: errno on error
+ */
+static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
+                           struct buffer_head *bh_map, struct metapath *mp,
+                           const unsigned int sheight,
+                           const unsigned int height,
+                           const unsigned int maxlen)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct buffer_head *dibh = mp->mp_bh[0];
+        u64 bn, dblock = 0;
+        unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0;
+        unsigned dblks = 0;
+        unsigned ptrs_per_blk;
+        const unsigned end_of_metadata = height - 1;
+        int eob = 0;
+        enum alloc_state state;
+        __be64 *ptr;
+        __be64 zero_bn = 0;
+        BUG_ON(sheight < 1);
+        BUG_ON(dibh == NULL);
+        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+        if (height == sheight) {
+                struct buffer_head *bh;
+                /* Bottom indirect block exists, find unalloced extent size */
+                ptr = metapointer(end_of_metadata, mp);
+                bh = mp->mp_bh[end_of_metadata];
+                dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
+                                           &eob);
+                BUG_ON(dblks < 1);
+                state = ALLOC_DATA;
+        } else {
+                /* Need to allocate indirect blocks */
+                ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
+                dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
+                if (height == ip->i_height) {
+                        /* Writing into existing tree, extend tree down */
+                        iblks = height - sheight;
+                        state = ALLOC_GROW_DEPTH;
+                } else {
+                        /* Building up tree height */
+                        state = ALLOC_GROW_HEIGHT;
+                        iblks = height - ip->i_height;
+                        zmpl = zero_metapath_length(mp, height);
+                        iblks -= zmpl;
+                        iblks += height;
+                }
+        }
+        /* start of the second part of the function (state machine) */
+        blks = dblks + iblks;
+        i = sheight;
+        do {
+                n = blks - alloced;
+                bn = gfs2_alloc_block(ip, &n);
+                alloced += n;
+                if (state != ALLOC_DATA || gfs2_is_jdata(ip))
+                        gfs2_trans_add_unrevoke(sdp, bn, n);
+                switch (state) {
+                /* Growing height of tree */
+                case ALLOC_GROW_HEIGHT:
+                        if (i == 1) {
+                                ptr = (__be64 *)(dibh->b_data +
+                                                 sizeof(struct gfs2_dinode));
+                                zero_bn = *ptr;
+                        }
+                        for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
+                                gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
+                        if (i - 1 == height - ip->i_height) {
+                                i--;
+                                gfs2_buffer_copy_tail(mp->mp_bh[i],
+                                                sizeof(struct gfs2_meta_header),
+                                                dibh, sizeof(struct gfs2_dinode));
+                                gfs2_buffer_clear_tail(dibh,
+                                                sizeof(struct gfs2_dinode) +
+                                                sizeof(__be64));
+                                ptr = (__be64 *)(mp->mp_bh[i]->b_data +
+                                        sizeof(struct gfs2_meta_header));
+                                *ptr = zero_bn;
+                                state = ALLOC_GROW_DEPTH;
+                                for(i = zmpl; i < height; i++) {
+                                        if (mp->mp_bh[i] == NULL)
+                                                break;
+                                        brelse(mp->mp_bh[i]);
+                                        mp->mp_bh[i] = NULL;
+                                }
+                                i = zmpl;
+                        }
+                        if (n == 0)
+                                break;
+                /* Branching from existing tree */
+                case ALLOC_GROW_DEPTH:
+                        if (i > 1 && i < height)
+                                gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1);
+                        for (; i < height && n > 0; i++, n--)
+                                gfs2_indirect_init(mp, ip->i_gl, i,
+                                                   mp->mp_list[i-1], bn++);
+                        if (i == height)
+                                state = ALLOC_DATA;
+                        if (n == 0)
+                                break;
+                /* Tree complete, adding data blocks */
+                case ALLOC_DATA:
+                        BUG_ON(n > dblks);
+                        BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
+                        gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1);
+                        dblks = n;
+                        ptr = metapointer(end_of_metadata, mp);
+                        dblock = bn;
+                        while (n-- > 0)
+                                *ptr++ = cpu_to_be64(bn++);
+                        break;
+                }
+        } while (state != ALLOC_DATA);
+        ip->i_height = height;
+        gfs2_add_inode_blocks(&ip->i_inode, alloced);
+        gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
+        map_bh(bh_map, inode->i_sb, dblock);
+        bh_map->b_size = dblks << inode->i_blkbits;
+        set_buffer_new(bh_map);
+        return 0;
+}
 /**
 * gfs2_block_map - Map a block from an inode to a disk block
 * @inode: The inode
 * @lblock: The logical block number
 * @bh_map: The bh to be mapped
+ * @create: True if it's ok to alloc blocks to satisfy the request
 *
- * Find the block number on the current device which corresponds to an
+ * Sets buffer_mapped() if successful, sets buffer_boundary() if a
- * inode's block. If the block had to be created, "new" will be set.
+ * read of metadata will be required before the next block can be
+ * mapped. Sets buffer_new() if new blocks were allocated.
 *
 * Returns: errno
 */
@@ -457,97 +572,78 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
-        struct buffer_head *bh;
+        unsigned int bsize = sdp->sd_sb.sb_bsize;
-        unsigned int bsize;
+        const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
-        unsigned int height;
+        const u64 *arr = sdp->sd_heightsize;
-        unsigned int end_of_metadata;
+        __be64 *ptr;
-        unsigned int x;
-        int error = 0;
-        int new = 0;
-        u64 dblock = 0;
-        int boundary;
-        unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
-        struct metapath mp;
        u64 size;
-        struct buffer_head *dibh = NULL;
+        struct metapath mp;
+        int ret;
+        int eob;
+        unsigned int len;
+        struct buffer_head *bh;
+        u8 height;
        BUG_ON(maxlen == 0);
-        if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
+        memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
-                return 0;
+        bmap_lock(ip, create);
-        bmap_lock(inode, create);
        clear_buffer_mapped(bh_map);
        clear_buffer_new(bh_map);
        clear_buffer_boundary(bh_map);
-        bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
+        if (gfs2_is_dir(ip)) {
-        size = (lblock + 1) * bsize;
+                bsize = sdp->sd_jbsize;
+                arr = sdp->sd_jheightsize;
-        if (size > ip->i_di.di_size) {
-                height = calc_tree_height(ip, size);
-                if (ip->i_di.di_height < height) {
-                        if (!create)
-                                goto out_ok;
-        
-                        error = build_height(inode, height);
-                        if (error)
-                                goto out_fail;
-                }
        }
-        find_metapath(ip, lblock, &mp);
+        ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
-        end_of_metadata = ip->i_di.di_height - 1;
+        if (ret)
-        error = gfs2_meta_inode_buffer(ip, &bh);
+                goto out;
-        if (error)
-                goto out_fail;
-        dibh = bh;
-        get_bh(dibh);
-        for (x = 0; x < end_of_metadata; x++) {
+        height = ip->i_height;
-                lookup_block(ip, bh, x, &mp, create, &new, &dblock);
+        size = (lblock + 1) * bsize;
-                brelse(bh);
+        while (size > arr[height])
-                if (!dblock)
+                height++;
-                        goto out_ok;
+        find_metapath(sdp, lblock, &mp, height);
+        ret = 1;
+        if (height > ip->i_height || gfs2_is_stuffed(ip))
+                goto do_alloc;
+        ret = lookup_metapath(ip, &mp);
+        if (ret < 0)
+                goto out;
+        if (ret != ip->i_height)
+                goto do_alloc;
+        ptr = metapointer(ip->i_height - 1, &mp);
+        if (*ptr == 0)
+                goto do_alloc;
+        map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
+        bh = mp.mp_bh[ip->i_height - 1];
+        len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
+        bh_map->b_size = (len << inode->i_blkbits);
+        if (eob)
+                set_buffer_boundary(bh_map);
+        ret = 0;
+out:
+        release_metapath(&mp);
+        bmap_unlock(ip, create);
+        return ret;
-                error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
+do_alloc:
-                if (error)
+        /* All allocations are done here, firstly check create flag */
-                        goto out_fail;
+        if (!create) {
+                BUG_ON(gfs2_is_stuffed(ip));
+                ret = 0;
+                goto out;
        }
-        boundary = lookup_block(ip, bh, end_of_metadata, &mp, create, &new, &dblock);
+        /* At this point ret is the tree depth of already allocated blocks */
-        if (dblock) {
+        ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
-                map_bh(bh_map, inode->i_sb, dblock);
+        goto out;
-                if (boundary)
-                        set_buffer_boundary(bh_map);
-                if (new) {
-                        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-                        gfs2_dinode_out(ip, dibh->b_data);
-                        set_buffer_new(bh_map);
-                        goto out_brelse;
-                }
-                while(--maxlen && !buffer_boundary(bh_map)) {
-                        u64 eblock;
-                        mp.mp_list[end_of_metadata]++;
-                        boundary = lookup_block(ip, bh, end_of_metadata, &mp, 0, &new, &eblock);
-                        if (eblock != ++dblock)
-                                break;
-                        bh_map->b_size += (1 << inode->i_blkbits);
-                        if (boundary)
-                                set_buffer_boundary(bh_map);
-                }
-        }
-out_brelse:
-        brelse(bh);
-out_ok:
-        error = 0;
-out_fail:
-        if (dibh)
-                brelse(dibh);
-        bmap_unlock(inode, create);
-        return error;
 }
+/*
+ * Deprecated: do not use in new code
+ */
 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
 {
        struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
@@ -558,7 +654,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
        BUG_ON(!dblock);
        BUG_ON(!new);
-        bh.b_size = 1 << (inode->i_blkbits + 5);
+        bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5));
        ret = gfs2_block_map(inode, lblock, &bh, create);
        *extlen = bh.b_size >> inode->i_blkbits;
        *dblock = bh.b_blocknr;
@@ -621,7 +717,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
        if (error)
                goto out;
-        if (height < ip->i_di.di_height - 1)
+        if (height < ip->i_height - 1)
                for (; top < bottom; top++, first = 0) {
                        if (!*top)
                                continue;
@@ -679,7 +775,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
                sm->sm_first = 0;
        }
-        metadata = (height != ip->i_di.di_height - 1);
+        metadata = (height != ip->i_height - 1);
        if (metadata)
                revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
@@ -713,7 +809,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        else
                goto out; /* Nothing to do */
-        gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+        gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
        for (x = 0; x < rlist.rl_rgrps; x++) {
                struct gfs2_rgrpd *rgd;
@@ -760,10 +856,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
                }
                *p = 0;
-                if (!ip->i_di.di_blocks)
+                gfs2_add_inode_blocks(&ip->i_inode, -1);
-                        gfs2_consist_inode(ip);
-                ip->i_di.di_blocks--;
-                gfs2_set_inode_blocks(&ip->i_inode);
        }
        if (bstart) {
                if (metadata)
@@ -804,19 +897,16 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_alloc *al;
        struct buffer_head *dibh;
-        unsigned int h;
        int error;
        al = gfs2_alloc_get(ip);
+        if (!al)
+                return -ENOMEM;
-        error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        error = gfs2_quota_lock_check(ip);
        if (error)
                goto out;
-        error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-        if (error)
-                goto out_gunlock_q;
        al->al_requested = sdp->sd_max_height + RES_DATA;
        error = gfs2_inplace_reserve(ip);
@@ -829,34 +919,25 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
        if (error)
                goto out_ipres;
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto out_end_trans;
        if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
                if (gfs2_is_stuffed(ip)) {
                        error = gfs2_unstuff_dinode(ip, NULL);
                        if (error)
-                                goto out_end_trans;
+                                goto out_brelse;
-                }
-                h = calc_tree_height(ip, size);
-                if (ip->i_di.di_height < h) {
-                        down_write(&ip->i_rw_mutex);
-                        error = build_height(&ip->i_inode, h);
-                        up_write(&ip->i_rw_mutex);
-                        if (error)
-                                goto out_end_trans;
                }
        }
        ip->i_di.di_size = size;
        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-        error = gfs2_meta_inode_buffer(ip, &dibh);
-        if (error)
-                goto out_end_trans;
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
-        brelse(dibh);
+out_brelse:
+        brelse(dibh);
 out_end_trans:
        gfs2_trans_end(sdp);
 out_ipres:
@@ -986,7 +1067,8 @@ out:
 static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
 {
-        unsigned int height = ip->i_di.di_height;
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        unsigned int height = ip->i_height;
        u64 lblock;
        struct metapath mp;
        int error;
@@ -994,10 +1076,11 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
        if (!size)
                lblock = 0;
        else
-                lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
+                lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
-        find_metapath(ip, lblock, &mp);
+        find_metapath(sdp, lblock, &mp, ip->i_height);
-        gfs2_alloc_get(ip);
+        if (!gfs2_alloc_get(ip))
+                return -ENOMEM;
        error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
        if (error)
@@ -1037,10 +1120,8 @@ static int trunc_end(struct gfs2_inode *ip)
                goto out;
        if (!ip->i_di.di_size) {
-                ip->i_di.di_height = 0;
+                ip->i_height = 0;
-                ip->i_di.di_goal_meta =
+                ip->i_goal = ip->i_no_addr;
-                        ip->i_di.di_goal_data =
-                        ip->i_no_addr;
                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
        }
        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
@@ -1197,10 +1278,9 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
                              unsigned int len, int *alloc_required)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        u64 lblock, lblock_stop, dblock;
+        struct buffer_head bh;
-        u32 extlen;
+        unsigned int shift;
-        int new = 0;
+        u64 lblock, lblock_stop, size;
-        int error = 0;
        *alloc_required = 0;
@@ -1214,6 +1294,8 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
                return 0;
        }
+        *alloc_required = 1;
+        shift = sdp->sd_sb.sb_bsize_shift;
        if (gfs2_is_dir(ip)) {
                unsigned int bsize = sdp->sd_jbsize;
                lblock = offset;
@@ -1221,27 +1303,25 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
                lblock_stop = offset + len + bsize - 1;
                do_div(lblock_stop, bsize);
        } else {
-                unsigned int shift = sdp->sd_sb.sb_bsize_shift;
                u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
                lblock = offset >> shift;
                lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
-                if (lblock_stop > end_of_file) {
+                if (lblock_stop > end_of_file)
-                        *alloc_required = 1;
                        return 0;
-                }
        }
-        for (; lblock < lblock_stop; lblock += extlen) {
+        size = (lblock_stop - lblock) << shift;
-                error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
+        do {
-                if (error)
+                bh.b_state = 0;
-                        return error;
+                bh.b_size = size;
+                gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
-                if (!dblock) {
+                if (!buffer_mapped(&bh))
-                        *alloc_required = 1;
                        return 0;
-                }
+                size -= bh.b_size;
-        }
+                lblock += (bh.b_size >> ip->i_inode.i_blkbits);
+        } while(size > 0);
+        *alloc_required = 0;
        return 0;
 }
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c34709512b19..eed040d8ba3a 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -159,6 +159,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
        unsigned int o;
        int copied = 0;
        int error = 0;
+        int new = 0;
        if (!size)
                return 0;
@@ -183,7 +184,6 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
        while (copied < size) {
                unsigned int amount;
                struct buffer_head *bh;
-                int new = 0;
                amount = size - copied;
                if (amount > sdp->sd_sb.sb_bsize - o)
@@ -757,7 +757,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
        if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
                struct gfs2_leaf *leaf;
-                unsigned hsize = 1 << ip->i_di.di_depth;
+                unsigned hsize = 1 << ip->i_depth;
                unsigned index;
                u64 ln;
                if (hsize * sizeof(u64) != ip->i_di.di_size) {
@@ -765,7 +765,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
                        return ERR_PTR(-EIO);
                }
-                index = name->hash >> (32 - ip->i_di.di_depth);
+                index = name->hash >> (32 - ip->i_depth);
                error = get_first_leaf(ip, index, &bh);
                if (error)
                        return ERR_PTR(error);
@@ -803,14 +803,15 @@ got_dent:
 static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
-        u64 bn = gfs2_alloc_meta(ip);
+        unsigned int n = 1;
+        u64 bn = gfs2_alloc_block(ip, &n);
        struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
        struct gfs2_leaf *leaf;
        struct gfs2_dirent *dent;
        struct qstr name = { .name = "", .len = 0, .hash = 0 };
        if (!bh)
                return NULL;
+        gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
        gfs2_trans_add_bh(ip->i_gl, bh, 1);
        gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
        leaf = (struct gfs2_leaf *)bh->b_data;
@@ -905,12 +906,11 @@ static int dir_make_exhash(struct inode *inode)
                *lp = cpu_to_be64(bn);
        dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
-        dip->i_di.di_blocks++;
+        gfs2_add_inode_blocks(&dip->i_inode, 1);
-        gfs2_set_inode_blocks(&dip->i_inode);
        dip->i_di.di_flags |= GFS2_DIF_EXHASH;
        for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
-        dip->i_di.di_depth = y;
+        dip->i_depth = y;
        gfs2_dinode_out(dip, dibh->b_data);
@@ -941,7 +941,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
        int x, moved = 0;
        int error;
-        index = name->hash >> (32 - dip->i_di.di_depth);
+        index = name->hash >> (32 - dip->i_depth);
        error = get_leaf_nr(dip, index, &leaf_no);
        if (error)
                return error;
@@ -952,7 +952,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
                return error;
        oleaf = (struct gfs2_leaf *)obh->b_data;
-        if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
+        if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) {
                brelse(obh);
                return 1; /* can't split */
        }
@@ -967,10 +967,10 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
        bn = nbh->b_blocknr;
        /*  Compute the start and len of leaf pointers in the hash table.  */
-        len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
+        len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
        half_len = len >> 1;
        if (!half_len) {
-                printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
+                printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
                gfs2_consist_inode(dip);
                error = -EIO;
                goto fail_brelse;
@@ -997,7 +997,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
        kfree(lp);
        /*  Compute the divider  */
-        divider = (start + half_len) << (32 - dip->i_di.di_depth);
+        divider = (start + half_len) << (32 - dip->i_depth);
        /*  Copy the entries  */
        dirent_first(dip, obh, &dent);
@@ -1021,13 +1021,13 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
                        new->de_inum = dent->de_inum; /* No endian worries */
                        new->de_type = dent->de_type; /* No endian worries */
-                        nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
+                        be16_add_cpu(&nleaf->lf_entries, 1);
                        dirent_del(dip, obh, prev, dent);
                        if (!oleaf->lf_entries)
                                gfs2_consist_inode(dip);
-                        oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
+                        be16_add_cpu(&oleaf->lf_entries, -1);
                        if (!prev)
                                prev = dent;
@@ -1044,8 +1044,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
        error = gfs2_meta_inode_buffer(dip, &dibh);
        if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
                gfs2_trans_add_bh(dip->i_gl, dibh, 1);
-                dip->i_di.di_blocks++;
+                gfs2_add_inode_blocks(&dip->i_inode, 1);
-                gfs2_set_inode_blocks(&dip->i_inode);
                gfs2_dinode_out(dip, dibh->b_data);
                brelse(dibh);
        }
@@ -1082,7 +1081,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
        int x;
        int error = 0;
-        hsize = 1 << dip->i_di.di_depth;
+        hsize = 1 << dip->i_depth;
        if (hsize * sizeof(u64) != dip->i_di.di_size) {
                gfs2_consist_inode(dip);
                return -EIO;
@@ -1090,7 +1089,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
        /*  Allocate both the "from" and "to" buffers in one big chunk  */
-        buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
+        buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
        for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
                error = gfs2_dir_read_data(dip, (char *)buf,
@@ -1125,7 +1124,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
        error = gfs2_meta_inode_buffer(dip, &dibh);
        if (!gfs2_assert_withdraw(sdp, !error)) {
-                dip->i_di.di_depth++;
+                dip->i_depth++;
                gfs2_dinode_out(dip, dibh->b_data);
                brelse(dibh);
        }
@@ -1370,16 +1369,16 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
        int error = 0;
        unsigned depth = 0;
-        hsize = 1 << dip->i_di.di_depth;
+        hsize = 1 << dip->i_depth;
        if (hsize * sizeof(u64) != dip->i_di.di_size) {
                gfs2_consist_inode(dip);
                return -EIO;
        }
        hash = gfs2_dir_offset2hash(*offset);
-        index = hash >> (32 - dip->i_di.di_depth);
+        index = hash >> (32 - dip->i_depth);
-        lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+        lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
        if (!lp)
                return -ENOMEM;
@@ -1405,7 +1404,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
                if (error)
                        break;
-                len = 1 << (dip->i_di.di_depth - depth);
+                len = 1 << (dip->i_depth - depth);
                index = (index & ~(len - 1)) + len;
        }
@@ -1444,7 +1443,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
        error = -ENOMEM;
        /* 96 is max number of dirents which can be stuffed into an inode */
-        darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL);
+        darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
        if (darr) {
                g.pdent = darr;
                g.offset = 0;
@@ -1549,7 +1548,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
        u32 index;
        u64 bn;
-        index = name->hash >> (32 - ip->i_di.di_depth);
+        index = name->hash >> (32 - ip->i_depth);
        error = get_first_leaf(ip, index, &obh);
        if (error)
                return error;
@@ -1579,8 +1578,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
        if (error)
                return error;
        gfs2_trans_add_bh(ip->i_gl, bh, 1);
-        ip->i_di.di_blocks++;
+        gfs2_add_inode_blocks(&ip->i_inode, 1);
-        gfs2_set_inode_blocks(&ip->i_inode);
        gfs2_dinode_out(ip, bh->b_data);
        brelse(bh);
        return 0;
@@ -1616,7 +1614,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
                        dent->de_type = cpu_to_be16(type);
                        if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
                                leaf = (struct gfs2_leaf *)bh->b_data;
-                                leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
+                                be16_add_cpu(&leaf->lf_entries, 1);
                        }
                        brelse(bh);
                        error = gfs2_meta_inode_buffer(ip, &bh);
@@ -1641,7 +1639,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
                        continue;
                if (error < 0)
                        break;
-                if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
+                if (ip->i_depth < GFS2_DIR_MAX_DEPTH) {
                        error = dir_double_exhash(ip);
                        if (error)
                                break;
@@ -1785,13 +1783,13 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
        u64 leaf_no;
        int error = 0;
-        hsize = 1 << dip->i_di.di_depth;
+        hsize = 1 << dip->i_depth;
        if (hsize * sizeof(u64) != dip->i_di.di_size) {
                gfs2_consist_inode(dip);
                return -EIO;
        }
-        lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+        lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
        if (!lp)
                return -ENOMEM;
@@ -1817,7 +1815,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
                        if (error)
                                goto out;
                        leaf = (struct gfs2_leaf *)bh->b_data;
-                        len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
+                        len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
                        brelse(bh);
                        error = lc(dip, index, len, leaf_no, data);
@@ -1866,15 +1864,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
        memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
-        ht = kzalloc(size, GFP_KERNEL);
+        ht = kzalloc(size, GFP_NOFS);
        if (!ht)
                return -ENOMEM;
-        gfs2_alloc_get(dip);
+        if (!gfs2_alloc_get(dip)) {
+                error = -ENOMEM;
+                goto out;
+        }
        error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
        if (error)
-                goto out;
+                goto out_put;
        error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
        if (error)
@@ -1894,7 +1895,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
                l_blocks++;
        }
-        gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+        gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
        for (x = 0; x < rlist.rl_rgrps; x++) {
                struct gfs2_rgrpd *rgd;
@@ -1921,11 +1922,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
                brelse(bh);
                gfs2_free_meta(dip, blk, 1);
+                gfs2_add_inode_blocks(&dip->i_inode, -1);
-                if (!dip->i_di.di_blocks)
-                        gfs2_consist_inode(dip);
-                dip->i_di.di_blocks--;
-                gfs2_set_inode_blocks(&dip->i_inode);
        }
        error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
@@ -1952,8 +1949,9 @@ out_rlist:
        gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
 out_qs:
        gfs2_quota_unhold(dip);
-out:
+out_put:
        gfs2_alloc_put(dip);
+out:
        kfree(ht);
        return error;
 }
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index bee99704ea10..e3f76f451b0a 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -277,10 +277,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
                }
                *dataptrs = 0;
-                if (!ip->i_di.di_blocks)
+                gfs2_add_inode_blocks(&ip->i_inode, -1);
-                        gfs2_consist_inode(ip);
-                ip->i_di.di_blocks--;
-                gfs2_set_inode_blocks(&ip->i_inode);
        }
        if (bstart)
                gfs2_free_meta(ip, bstart, blen);
@@ -321,6 +318,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
        int error;
        al = gfs2_alloc_get(ip);
+        if (!al)
+                return -ENOMEM;
        error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
        if (error)
@@ -449,7 +448,7 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
        unsigned int x;
        int error = 0;
-        bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+        bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
        if (!bh)
                return -ENOMEM;
@@ -582,10 +581,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_ea_header *ea;
+        unsigned int n = 1;
        u64 block;
-        block = gfs2_alloc_meta(ip);
+        block = gfs2_alloc_block(ip, &n);
+        gfs2_trans_add_unrevoke(sdp, block, 1);
        *bhp = gfs2_meta_new(ip->i_gl, block);
        gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
        gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
@@ -597,8 +597,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
        ea->ea_flags = GFS2_EAFLAG_LAST;
        ea->ea_num_ptrs = 0;
-        ip->i_di.di_blocks++;
+        gfs2_add_inode_blocks(&ip->i_inode, 1);
-        gfs2_set_inode_blocks(&ip->i_inode);
        return 0;
 }
@@ -642,15 +641,15 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
                        struct buffer_head *bh;
                        u64 block;
                        int mh_size = sizeof(struct gfs2_meta_header);
+                        unsigned int n = 1;
-                        block = gfs2_alloc_meta(ip);
+                        block = gfs2_alloc_block(ip, &n);
+                        gfs2_trans_add_unrevoke(sdp, block, 1);
                        bh = gfs2_meta_new(ip->i_gl, block);
                        gfs2_trans_add_bh(ip->i_gl, bh, 1);
                        gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
-                        ip->i_di.di_blocks++;
+                        gfs2_add_inode_blocks(&ip->i_inode, 1);
-                        gfs2_set_inode_blocks(&ip->i_inode);
                        copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
                                                           data_len;
@@ -684,15 +683,13 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
        int error;
        al = gfs2_alloc_get(ip);
+        if (!al)
+                return -ENOMEM;
-        error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        error = gfs2_quota_lock_check(ip);
        if (error)
                goto out;
-        error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-        if (error)
-                goto out_gunlock_q;
        al->al_requested = blks;
        error = gfs2_inplace_reserve(ip);
@@ -966,9 +963,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
                gfs2_trans_add_bh(ip->i_gl, indbh, 1);
        } else {
                u64 blk;
+                unsigned int n = 1;
-                blk = gfs2_alloc_meta(ip);
+                blk = gfs2_alloc_block(ip, &n);
+                gfs2_trans_add_unrevoke(sdp, blk, 1);
                indbh = gfs2_meta_new(ip->i_gl, blk);
                gfs2_trans_add_bh(ip->i_gl, indbh, 1);
                gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
@@ -978,8 +975,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
                *eablk = cpu_to_be64(ip->i_di.di_eattr);
                ip->i_di.di_eattr = blk;
                ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
-                ip->i_di.di_blocks++;
+                gfs2_add_inode_blocks(&ip->i_inode, 1);
-                gfs2_set_inode_blocks(&ip->i_inode);
                eablk++;
        }
@@ -1210,7 +1206,7 @@ static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
        unsigned int x;
        int error;
-        bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+        bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
        if (!bh)
                return -ENOMEM;
@@ -1347,7 +1343,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
        else
                goto out;
-        gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+        gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
        for (x = 0; x < rlist.rl_rgrps; x++) {
                struct gfs2_rgrpd *rgd;
@@ -1387,10 +1383,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
                }
                *eablk = 0;
-                if (!ip->i_di.di_blocks)
+                gfs2_add_inode_blocks(&ip->i_inode, -1);
-                        gfs2_consist_inode(ip);
-                ip->i_di.di_blocks--;
-                gfs2_set_inode_blocks(&ip->i_inode);
        }
        if (bstart)
                gfs2_free_meta(ip, bstart, blen);
@@ -1442,10 +1435,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
        gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
        ip->i_di.di_eattr = 0;
-        if (!ip->i_di.di_blocks)
+        gfs2_add_inode_blocks(&ip->i_inode, -1);
-                gfs2_consist_inode(ip);
-        ip->i_di.di_blocks--;
-        gfs2_set_inode_blocks(&ip->i_inode);
        error = gfs2_meta_inode_buffer(ip, &dibh);
        if (!error) {
@@ -1474,6 +1464,8 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
        int error;
        al = gfs2_alloc_get(ip);
+        if (!al)
+                return -ENOMEM;
        error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
        if (error)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7175a4d06435..d636b3e80f5d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,6 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "lm.h"
 #include "lops.h"
 #include "meta_io.h"
 #include "quota.h"
@@ -183,7 +182,8 @@ static void glock_free(struct gfs2_glock *gl)
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct inode *aspace = gl->gl_aspace;
-        gfs2_lm_put_lock(sdp, gl->gl_lock);
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
        if (aspace)
                gfs2_aspace_put(aspace);
@@ -197,7 +197,7 @@ static void glock_free(struct gfs2_glock *gl)
 *
 */
-void gfs2_glock_hold(struct gfs2_glock *gl)
+static void gfs2_glock_hold(struct gfs2_glock *gl)
 {
        atomic_inc(&gl->gl_ref);
 }
@@ -293,6 +293,16 @@ static void glock_work_func(struct work_struct *work)
        gfs2_glock_put(gl);
 }
+static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                     void **lockp) /* forwards to ls_ops->lm_get_lock unless shut down */
+{
+        int error = -EIO; /* returned unchanged when SDF_SHUTDOWN is set */
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
+                                sdp->sd_lockstruct.ls_lockspace, name, lockp);
+        return error; /* -EIO, or whatever lm_get_lock returned */
+}
 /**
 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
 * @sdp: The GFS2 superblock
@@ -338,8 +348,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        gl->gl_ip = 0;
        gl->gl_ops = glops;
        gl->gl_req_gh = NULL;
-        gl->gl_req_bh = NULL;
-        gl->gl_vn = 0;
        gl->gl_stamp = jiffies;
        gl->gl_tchange = jiffies;
        gl->gl_object = NULL;
@@ -595,11 +603,12 @@ static void run_queue(struct gfs2_glock *gl)
                        blocked = rq_mutex(gh);
                } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
                        blocked = rq_demote(gl);
-                        if (gl->gl_waiters2 && !blocked) {
+                        if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
+                                     !blocked) {
                                set_bit(GLF_DEMOTE, &gl->gl_flags);
                                gl->gl_demote_state = LM_ST_UNLOCKED;
                        }
-                        gl->gl_waiters2 = 0;
+                        clear_bit(GLF_WAITERS2, &gl->gl_flags);
                } else if (!list_empty(&gl->gl_waiters3)) {
                        gh = list_entry(gl->gl_waiters3.next,
                                        struct gfs2_holder, gh_list);
@@ -710,7 +719,7 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
        } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
                        gl->gl_demote_state != state) {
                if (test_bit(GLF_DEMOTE_IN_PROGRESS,  &gl->gl_flags)) 
-                        gl->gl_waiters2 = 1;
+                        set_bit(GLF_WAITERS2, &gl->gl_flags);
                else 
                        gl->gl_demote_state = LM_ST_UNLOCKED;
        }
@@ -743,6 +752,43 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 }
 /**
+ * drop_bh - Called after a lock module unlock completes
+ * @gl: the glock
+ * @ret: the return status
+ *
+ * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
+ * Doesn't drop the reference on the glock the top half took out
+ *
+ */
+static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct gfs2_holder *gh = gl->gl_req_gh; /* only dereferenced on the GLF_CONV_DEADLK path below */
+        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
+        gfs2_assert_warn(sdp, !ret); /* a non-zero unlock status is unexpected here */
+        state_change(gl, LM_ST_UNLOCKED);
+        if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) { /* flag is set by xmote_bh on LM_OUT_CONV_DEADLK */
+                spin_lock(&gl->gl_spin);
+                gh->gh_error = 0; /* NOTE(review): no NULL check; assumes gl_req_gh is set whenever GLF_CONV_DEADLK is - confirm */
+                spin_unlock(&gl->gl_spin);
+                gfs2_glock_xmote_th(gl, gl->gl_req_gh); /* re-issue the request for the same holder */
+                gfs2_glock_put(gl); /* NOTE(review): header comment says the top half's ref is not dropped here - confirm */
+                return;
+        }
+        spin_lock(&gl->gl_spin); /* gl_spin held across the wake and the GLF_LOCK clear */
+        gfs2_demote_wake(gl);
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+        spin_unlock(&gl->gl_spin);
+        gfs2_glock_put(gl);
+}
+/**
 * xmote_bh - Called after the lock module is done acquiring a lock
 * @gl: The glock in question
 * @ret: the int returned from the lock module
@@ -754,25 +800,19 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
        struct gfs2_sbd *sdp = gl->gl_sbd;
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_holder *gh = gl->gl_req_gh;
-        int prev_state = gl->gl_state;
        int op_done = 1;
+        if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
+                drop_bh(gl, ret);
+                return;
+        }
        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
        gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
        state_change(gl, ret & LM_OUT_ST_MASK);
-        if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
-                if (glops->go_inval)
-                        glops->go_inval(gl, DIO_METADATA);
-        } else if (gl->gl_state == LM_ST_DEFERRED) {
-                /* We might not want to do this here.
-                   Look at moving to the inode glops. */
-                if (glops->go_inval)
-                        glops->go_inval(gl, 0);
-        }
        /*  Deal with each possible exit condition  */
        if (!gh) {
@@ -782,7 +822,6 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
                } else {
                        spin_lock(&gl->gl_spin);
                        if (gl->gl_state != gl->gl_demote_state) {
-                                gl->gl_req_bh = NULL;
                                spin_unlock(&gl->gl_spin);
                                gfs2_glock_drop_th(gl);
                                gfs2_glock_put(gl);
@@ -793,6 +832,14 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
                }
        } else {
                spin_lock(&gl->gl_spin);
+                if (ret & LM_OUT_CONV_DEADLK) {
+                        gh->gh_error = 0;
+                        set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
+                        spin_unlock(&gl->gl_spin);
+                        gfs2_glock_drop_th(gl);
+                        gfs2_glock_put(gl);
+                        return;
+                }
                list_del_init(&gh->gh_list);
                gh->gh_error = -EIO;
                if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 
@@ -824,7 +871,6 @@ out:
        if (op_done) {
                spin_lock(&gl->gl_spin);
                gl->gl_req_gh = NULL;
-                gl->gl_req_bh = NULL;
                clear_bit(GLF_LOCK, &gl->gl_flags);
                spin_unlock(&gl->gl_spin);
        }
@@ -835,6 +881,17 @@ out:
                gfs2_holder_wake(gh);
 }
+static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+                                 unsigned int cur_state, unsigned int req_state,
+                                 unsigned int flags) /* forwards to ls_ops->lm_lock unless shut down */
+{
+        int ret = 0; /* returned when SDF_SHUTDOWN is set; NOTE(review): int here, unsigned int return type */
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+                                                         req_state, flags);
+        return ret; /* 0, or whatever lm_lock returned */
+}
 /**
 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
 * @gl: The glock in question
@@ -856,6 +913,8 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
        if (glops->go_xmote_th)
                glops->go_xmote_th(gl);
+        if (state == LM_ST_DEFERRED && glops->go_inval)
+                glops->go_inval(gl, DIO_METADATA);
        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -863,7 +922,6 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
        gfs2_assert_warn(sdp, state != gl->gl_state);
        gfs2_glock_hold(gl);
-        gl->gl_req_bh = xmote_bh;
        lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
@@ -876,49 +934,13 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
                xmote_bh(gl, lck_ret);
 }
-/**
+static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
- * drop_bh - Called after a lock module unlock completes
+                                   unsigned int cur_state)
- * @gl: the glock
- * @ret: the return status
- *
- * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
- * Doesn't drop the reference on the glock the top half took out
- *
- */
-static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 {
-        struct gfs2_sbd *sdp = gl->gl_sbd;
+        int ret = 0;
-        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-        struct gfs2_holder *gh = gl->gl_req_gh;
+                ret =  sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
+        return ret;
-        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
-        gfs2_assert_warn(sdp, !ret);
-        state_change(gl, LM_ST_UNLOCKED);
-        if (glops->go_inval)
-                glops->go_inval(gl, DIO_METADATA);
-        if (gh) {
-                spin_lock(&gl->gl_spin);
-                list_del_init(&gh->gh_list);
-                gh->gh_error = 0;
-                spin_unlock(&gl->gl_spin);
-        }
-        spin_lock(&gl->gl_spin);
-        gfs2_demote_wake(gl);
-        gl->gl_req_gh = NULL;
-        gl->gl_req_bh = NULL;
-        clear_bit(GLF_LOCK, &gl->gl_flags);
-        spin_unlock(&gl->gl_spin);
-        gfs2_glock_put(gl);
-        if (gh)
-                gfs2_holder_wake(gh);
 }
 /**
@@ -935,13 +957,14 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
        if (glops->go_xmote_th)
                glops->go_xmote_th(gl);
+        if (glops->go_inval)
+                glops->go_inval(gl, DIO_METADATA);
        gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
        gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
        gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
        gfs2_glock_hold(gl);
-        gl->gl_req_bh = drop_bh;
        ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
@@ -964,16 +987,17 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
 static void do_cancels(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
        spin_lock(&gl->gl_spin);
        while (gl->gl_req_gh != gh &&
               !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
               !list_empty(&gh->gh_list)) {
-                if (gl->gl_req_bh && !(gl->gl_req_gh &&
+                if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
-                                     (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
                        spin_unlock(&gl->gl_spin);
-                        gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
+                        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                                sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
                        msleep(100);
                        spin_lock(&gl->gl_spin);
                } else {
@@ -1041,7 +1065,6 @@ static int glock_wait_internal(struct gfs2_holder *gh)
                spin_lock(&gl->gl_spin);
                gl->gl_req_gh = NULL;
-                gl->gl_req_bh = NULL;
                clear_bit(GLF_LOCK, &gl->gl_flags);
                run_queue(gl);
                spin_unlock(&gl->gl_spin);
@@ -1428,6 +1451,14 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
                gfs2_glock_dq_uninit(&ghs[x]);
 }
+static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
+{       /* forwards to ls_ops->lm_hold_lvb unless SDF_SHUTDOWN is set */
+        int error = -EIO; /* returned unchanged when SDF_SHUTDOWN is set */
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
+        return error; /* -EIO, or whatever lm_hold_lvb returned */
+}
 /**
 * gfs2_lvb_hold - attach a LVB from a glock
 * @gl: The glock in question
@@ -1463,12 +1494,15 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
 void gfs2_lvb_unhold(struct gfs2_glock *gl)
 {
+        struct gfs2_sbd *sdp = gl->gl_sbd;
        gfs2_glock_hold(gl);
        gfs2_glmutex_lock(gl);
        gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
        if (atomic_dec_and_test(&gl->gl_lvb_count)) {
-                gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
+                if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                        sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
                gl->gl_lvb = NULL;
                gfs2_glock_put(gl);
        }
@@ -1534,8 +1568,7 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
                gl = gfs2_glock_find(sdp, &async->lc_name);
                if (gfs2_assert_warn(sdp, gl))
                        return;
-                if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
+                xmote_bh(gl, async->lc_ret);
-                        gl->gl_req_bh(gl, async->lc_ret);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
                up_read(&gfs2_umount_flush_sem);
@@ -1594,10 +1627,10 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
                gfs2_glock_hold(gl);
                list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
                atomic_inc(&sdp->sd_reclaim_count);
-        }
+                spin_unlock(&sdp->sd_reclaim_lock);
-        spin_unlock(&sdp->sd_reclaim_lock);
+                wake_up(&sdp->sd_reclaim_wq);
+        } else
-        wake_up(&sdp->sd_reclaim_wq);
+                spin_unlock(&sdp->sd_reclaim_lock);
 }
 /**
@@ -1897,7 +1930,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
                print_dbg(gi, "  gl_owner = -1\n");
        print_dbg(gi, "  gl_ip = %lu\n", gl->gl_ip);
        print_dbg(gi, "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
-        print_dbg(gi, "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
        print_dbg(gi, "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
        print_dbg(gi, "  object = %s\n", (gl->gl_object) ? "yes" : "no");
        print_dbg(gi, "  reclaim = %s\n",
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2f9c6d136b37..cdad3e6f8150 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -32,24 +32,23 @@
 #define GLR_TRYFAILED           13
 #define GLR_CANCELED            14
-static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
+static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
        struct gfs2_holder *gh;
-        int locked = 0;
        struct pid *pid;
        /* Look in glock's list of holders for one with current task as owner */
        spin_lock(&gl->gl_spin);
        pid = task_pid(current);
        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-                if (gh->gh_owner_pid == pid) {
+                if (gh->gh_owner_pid == pid)
-                        locked = 1;
+                        goto out;
-                        break;
-                }
        }
+        gh = NULL;
+out:
        spin_unlock(&gl->gl_spin);
-        return locked;
+        return gh;
 }
 static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
@@ -79,7 +78,6 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 int gfs2_glock_get(struct gfs2_sbd *sdp,
                   u64 number, const struct gfs2_glock_operations *glops,
                   int create, struct gfs2_glock **glp);
-void gfs2_glock_hold(struct gfs2_glock *gl);
 int gfs2_glock_put(struct gfs2_glock *gl);
 void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
                      struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c663b7a0f410..d31badadef8f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -126,7 +126,13 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
                return;
        gfs2_meta_inval(gl);
-        gl->gl_vn++;
+        if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex))
+                gl->gl_sbd->sd_rindex_uptodate = 0;
+        else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) {
+                struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
+                rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+        }
 }
 /**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 525dcae352d6..9c2c0b90b22a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -44,7 +44,6 @@ struct gfs2_log_header_host {
 struct gfs2_log_operations {
        void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
-        void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
        void (*lo_before_commit) (struct gfs2_sbd *sdp);
        void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
        void (*lo_before_scan) (struct gfs2_jdesc *jd,
@@ -70,7 +69,6 @@ struct gfs2_bitmap {
 };
 struct gfs2_rgrp_host {
-        u32 rg_flags;
        u32 rg_free;
        u32 rg_dinodes;
        u64 rg_igeneration;
@@ -87,17 +85,17 @@ struct gfs2_rgrpd {
        u32 rd_data;                    /* num of data blocks in rgrp */
        u32 rd_bitbytes;                /* number of bytes in data bitmaps */
        struct gfs2_rgrp_host rd_rg;
-        u64 rd_rg_vn;
        struct gfs2_bitmap *rd_bits;
        unsigned int rd_bh_count;
        struct mutex rd_mutex;
        u32 rd_free_clone;
        struct gfs2_log_element rd_le;
-        u32 rd_last_alloc_data;
+        u32 rd_last_alloc;
-        u32 rd_last_alloc_meta;
        struct gfs2_sbd *rd_sbd;
-        unsigned long rd_flags;
+        unsigned char rd_flags;
-#define GFS2_RDF_CHECK        0x0001          /* Need to check for unlinked inodes */
+#define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
+#define GFS2_RDF_NOALLOC      0x02      /* rg prohibits allocation */
+#define GFS2_RDF_UPTODATE     0x04      /* rg is up to date */
 };
 enum gfs2_state_bits {
@@ -168,6 +166,8 @@ enum {
        GLF_DIRTY               = 5,
        GLF_DEMOTE_IN_PROGRESS  = 6,
        GLF_LFLUSH              = 7,
+        GLF_WAITERS2            = 8,
+        GLF_CONV_DEADLK         = 9,
 };
 struct gfs2_glock {
@@ -187,18 +187,15 @@ struct gfs2_glock {
        struct list_head gl_holders;
        struct list_head gl_waiters1;   /* HIF_MUTEX */
        struct list_head gl_waiters3;   /* HIF_PROMOTE */
-        int gl_waiters2;                /* GIF_DEMOTE */
        const struct gfs2_glock_operations *gl_ops;
        struct gfs2_holder *gl_req_gh;
-        gfs2_glop_bh_t gl_req_bh;
        void *gl_lock;
        char *gl_lvb;
        atomic_t gl_lvb_count;
-        u64 gl_vn;
        unsigned long gl_stamp;
        unsigned long gl_tchange;
        void *gl_object;
@@ -213,6 +210,8 @@ struct gfs2_glock {
        struct delayed_work gl_work;
 };
+#define GFS2_MIN_LVB_SIZE 32    /* Min size of LVB that gfs2 supports */
 struct gfs2_alloc {
        /* Quota stuff */
@@ -241,14 +240,9 @@ enum {
 struct gfs2_dinode_host {
        u64 di_size;            /* number of bytes in file */
-        u64 di_blocks;          /* number of blocks in file */
-        u64 di_goal_meta;       /* rgrp to alloc from next */
-        u64 di_goal_data;       /* data block goal */
        u64 di_generation;      /* generation number for NFS */
        u32 di_flags;           /* GFS2_DIF_... */
-        u16 di_height;          /* height of metadata */
        /* These only apply to directories  */
-        u16 di_depth;           /* Number of bits in the table */
        u32 di_entries;         /* The number of entries in the directory */
        u64 di_eattr;           /* extended attribute block number */
 };
@@ -265,9 +259,10 @@ struct gfs2_inode {
        struct gfs2_holder i_iopen_gh;
        struct gfs2_holder i_gh; /* for prepare/commit_write only */
        struct gfs2_alloc *i_alloc;
-        u64 i_last_rg_alloc;
+        u64 i_goal;     /* goal block for allocations */
        struct rw_semaphore i_rw_mutex;
+        u8 i_height;
+        u8 i_depth;
 };
 /*
@@ -490,9 +485,9 @@ struct gfs2_sbd {
        u32 sd_qc_per_block;
        u32 sd_max_dirres;      /* Max blocks needed to add a directory entry */
        u32 sd_max_height;      /* Max height of a file's metadata tree */
-        u64 sd_heightsize[GFS2_MAX_META_HEIGHT];
+        u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
        u32 sd_max_jheight; /* Max height of journaled file's meta tree */
-        u64 sd_jheightsize[GFS2_MAX_META_HEIGHT];
+        u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
        struct gfs2_args sd_args;       /* Mount arguments */
        struct gfs2_tune sd_tune;       /* Filesystem tuning structure */
@@ -533,7 +528,7 @@ struct gfs2_sbd {
        /* Resource group stuff */
-        u64 sd_rindex_vn;
+        int sd_rindex_uptodate;
        spinlock_t sd_rindex_spin;
        struct mutex sd_rindex_mutex;
        struct list_head sd_rindex_list;
@@ -637,9 +632,6 @@ struct gfs2_sbd {
        /* Counters */
-        atomic_t sd_glock_count;
-        atomic_t sd_glock_held_count;
-        atomic_t sd_inode_count;
        atomic_t sd_reclaimed;
        char sd_fsname[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 37725ade3c51..3a9ef526c308 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -149,7 +149,8 @@ void gfs2_set_iop(struct inode *inode)
        } else if (S_ISLNK(mode)) {
                inode->i_op = &gfs2_symlink_iops;
        } else {
-                inode->i_op = &gfs2_dev_iops;
+                inode->i_op = &gfs2_file_iops;
+                init_special_inode(inode, inode->i_mode, inode->i_rdev);
        }
        unlock_new_inode(inode);
@@ -248,12 +249,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
        struct gfs2_dinode_host *di = &ip->i_di;
        const struct gfs2_dinode *str = buf;
+        u16 height, depth;
-        if (ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)) {
+        if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
-                if (gfs2_consist_inode(ip))
+                goto corrupt;
-                        gfs2_dinode_print(ip);
-                return -EIO;
-        }
        ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
        ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
        ip->i_inode.i_rdev = 0;
@@ -275,8 +274,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
        ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
        di->di_size = be64_to_cpu(str->di_size);
        i_size_write(&ip->i_inode, di->di_size);
-        di->di_blocks = be64_to_cpu(str->di_blocks);
+        gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
-        gfs2_set_inode_blocks(&ip->i_inode);
        ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
        ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
        ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
@@ -284,15 +282,20 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
        ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
        ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
-        di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
+        ip->i_goal = be64_to_cpu(str->di_goal_meta);
-        di->di_goal_data = be64_to_cpu(str->di_goal_data);
        di->di_generation = be64_to_cpu(str->di_generation);
        di->di_flags = be32_to_cpu(str->di_flags);
        gfs2_set_inode_flags(&ip->i_inode);
-        di->di_height = be16_to_cpu(str->di_height);
+        height = be16_to_cpu(str->di_height);
+        if (unlikely(height > GFS2_MAX_META_HEIGHT))
-        di->di_depth = be16_to_cpu(str->di_depth);
+                goto corrupt;
+        ip->i_height = (u8)height;
+        depth = be16_to_cpu(str->di_depth);
+        if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
+                goto corrupt;
+        ip->i_depth = (u8)depth;
        di->di_entries = be32_to_cpu(str->di_entries);
        di->di_eattr = be64_to_cpu(str->di_eattr);
@@ -300,6 +303,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
                gfs2_set_aops(&ip->i_inode);
        return 0;
+corrupt:
+        if (gfs2_consist_inode(ip))
+                gfs2_dinode_print(ip);
+        return -EIO;
 }
 /**
@@ -337,13 +344,15 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
        struct gfs2_rgrpd *rgd;
        int error;
-        if (ip->i_di.di_blocks != 1) {
+        if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
                if (gfs2_consist_inode(ip))
                        gfs2_dinode_print(ip);
                return -EIO;
        }
        al = gfs2_alloc_get(ip);
+        if (!al)
+                return -ENOMEM;
        error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
        if (error)
@@ -487,7 +496,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
                return dir;
        }
-        if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) {
+        if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) {
                error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
                if (error)
                        return ERR_PTR(error);
@@ -818,7 +827,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
        int error;
        munge_mode_uid_gid(dip, &mode, &uid, &gid);
-        gfs2_alloc_get(dip);
+        if (!gfs2_alloc_get(dip))
+                return -ENOMEM;
        error = gfs2_quota_lock(dip, uid, gid);
        if (error)
@@ -853,6 +863,8 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
        int error;
        al = gfs2_alloc_get(dip);
+        if (!al)
+                return -ENOMEM;
        error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
        if (error)
@@ -1219,7 +1231,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
        x = ip->i_di.di_size + 1;
        if (x > *len) {
-                *buf = kmalloc(x, GFP_KERNEL);
+                *buf = kmalloc(x, GFP_NOFS);
                if (!*buf) {
                        error = -ENOMEM;
                        goto out_brelse;
@@ -1391,21 +1403,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
        str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
        str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
        str->di_size = cpu_to_be64(di->di_size);
-        str->di_blocks = cpu_to_be64(di->di_blocks);
+        str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
        str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
        str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
        str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
-        str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
+        str->di_goal_meta = cpu_to_be64(ip->i_goal);
-        str->di_goal_data = cpu_to_be64(di->di_goal_data);
+        str->di_goal_data = cpu_to_be64(ip->i_goal);
        str->di_generation = cpu_to_be64(di->di_generation);
        str->di_flags = cpu_to_be32(di->di_flags);
-        str->di_height = cpu_to_be16(di->di_height);
+        str->di_height = cpu_to_be16(ip->i_height);
        str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
                                             !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
                                             GFS2_FORMAT_DE : 0);
-        str->di_depth = cpu_to_be16(di->di_depth);
+        str->di_depth = cpu_to_be16(ip->i_depth);
        str->di_entries = cpu_to_be32(di->di_entries);
        str->di_eattr = cpu_to_be64(di->di_eattr);
@@ -1423,15 +1435,13 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
        printk(KERN_INFO "  no_addr = %llu\n",
               (unsigned long long)ip->i_no_addr);
        printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
-        printk(KERN_INFO "  di_blocks = %llu\n",
+        printk(KERN_INFO "  blocks = %llu\n",
-               (unsigned long long)di->di_blocks);
+               (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
-        printk(KERN_INFO "  di_goal_meta = %llu\n",
+        printk(KERN_INFO "  i_goal = %llu\n",
-               (unsigned long long)di->di_goal_meta);
+               (unsigned long long)ip->i_goal);
-        printk(KERN_INFO "  di_goal_data = %llu\n",
-               (unsigned long long)di->di_goal_data);
        printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
-        printk(KERN_INFO "  di_height = %u\n", di->di_height);
+        printk(KERN_INFO "  i_height = %u\n", ip->i_height);
-        printk(KERN_INFO "  di_depth = %u\n", di->di_depth);
+        printk(KERN_INFO "  i_depth = %u\n", ip->i_depth);
        printk(KERN_INFO "  di_entries = %u\n", di->di_entries);
        printk(KERN_INFO "  di_eattr = %llu\n",
               (unsigned long long)di->di_eattr);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d44650662615..580da454b38f 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,9 +10,11 @@
 #ifndef __INODE_DOT_H__
 #define __INODE_DOT_H__
+#include "util.h"
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
 {
-        return !ip->i_di.di_height;
+        return !ip->i_height;
 }
 static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
@@ -37,13 +39,25 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip)
        return S_ISDIR(ip->i_inode.i_mode);
 }
-static inline void gfs2_set_inode_blocks(struct inode *inode)
+static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
+{
+        inode->i_blocks = blocks <<
+                (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+}
+static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
 {
-        struct gfs2_inode *ip = GFS2_I(inode);
+        return inode->i_blocks >>
-        inode->i_blocks = ip->i_di.di_blocks <<
                (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
 }
+static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
+{
+        gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change));
+        change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK);
+        inode->i_blocks += change;
+}
 static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr,
                                  u64 no_formal_ino)
 {
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
deleted file mode 100644
index cfcc39b86a53..000000000000
--- a/fs/gfs2/lm.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-#include "gfs2.h"
-#include "incore.h"
-#include "glock.h"
-#include "lm.h"
-#include "super.h"
-#include "util.h"
-/**
- * gfs2_lm_mount - mount a locking protocol
- * @sdp: the filesystem
- * @args: mount arguements
- * @silent: if 1, don't complain if the FS isn't a GFS2 fs
- *
- * Returns: errno
- */
-int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
-{
-        char *proto = sdp->sd_proto_name;
-        char *table = sdp->sd_table_name;
-        int flags = 0;
-        int error;
-        if (sdp->sd_args.ar_spectator)
-                flags |= LM_MFLAG_SPECTATOR;
-        fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
-        error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
-                                     gfs2_glock_cb, sdp,
-                                     GFS2_MIN_LVB_SIZE, flags,
-                                     &sdp->sd_lockstruct, &sdp->sd_kobj);
-        if (error) {
-                fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
-                        proto, table, sdp->sd_args.ar_hostdata);
-                goto out;
-        }
-        if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
-            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
-            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
-                                  GFS2_MIN_LVB_SIZE)) {
-                gfs2_unmount_lockproto(&sdp->sd_lockstruct);
-                goto out;
-        }
-        if (sdp->sd_args.ar_spectator)
-                snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
-        else
-                snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
-                         sdp->sd_lockstruct.ls_jid);
-        fs_info(sdp, "Joined cluster. Now mounting FS...\n");
-        if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
-            !sdp->sd_args.ar_ignore_local_fs) {
-                sdp->sd_args.ar_localflocks = 1;
-                sdp->sd_args.ar_localcaching = 1;
-        }
-out:
-        return error;
-}
-void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
-{
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
-                                        sdp->sd_lockstruct.ls_lockspace);
-}
-void gfs2_lm_unmount(struct gfs2_sbd *sdp)
-{
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                gfs2_unmount_lockproto(&sdp->sd_lockstruct);
-}
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
-{
-        va_list args;
-        if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-                return 0;
-        va_start(args, fmt);
-        vprintk(fmt, args);
-        va_end(args);
-        fs_err(sdp, "about to withdraw this file system\n");
-        BUG_ON(sdp->sd_args.ar_debug);
-        fs_err(sdp, "telling LM to withdraw\n");
-        gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
-        fs_err(sdp, "withdrawn\n");
-        dump_stack();
-        return -1;
-}
-int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-                     void **lockp)
-{
-        int error = -EIO;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
-                                sdp->sd_lockstruct.ls_lockspace, name, lockp);
-        return error;
-}
-void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
-{
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
-}
-unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-                          unsigned int cur_state, unsigned int req_state,
-                          unsigned int flags)
-{
-        int ret = 0;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
-                                                         req_state, flags);
-        return ret;
-}
-unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
-                            unsigned int cur_state)
-{
-        int ret = 0;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                ret =  sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
-        return ret;
-}
-void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
-{
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
-}
-int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
-{
-        int error = -EIO;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
-        return error;
-}
-void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
-{
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
-}
-int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
-                      struct file *file, struct file_lock *fl)
-{
-        int error = -EIO;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
-                                sdp->sd_lockstruct.ls_lockspace, name, file, fl);
-        return error;
-}
-int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-                  struct file *file, int cmd, struct file_lock *fl)
-{
-        int error = -EIO;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                error = sdp->sd_lockstruct.ls_ops->lm_plock(
-                                sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
-        return error;
-}
-int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-                    struct file *file, struct file_lock *fl)
-{
-        int error = -EIO;
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                error = sdp->sd_lockstruct.ls_ops->lm_punlock(
-                                sdp->sd_lockstruct.ls_lockspace, name, file, fl);
-        return error;
-}
-void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
-                           unsigned int message)
-{
-        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                sdp->sd_lockstruct.ls_ops->lm_recovery_done(
-                        sdp->sd_lockstruct.ls_lockspace, jid, message);
-}
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
deleted file mode 100644
index 21cdc30ee08c..000000000000
--- a/fs/gfs2/lm.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#ifndef __LM_DOT_H__
-#define __LM_DOT_H__
-struct gfs2_sbd;
-#define GFS2_MIN_LVB_SIZE 32
-int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
-void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
-void gfs2_lm_unmount(struct gfs2_sbd *sdp);
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
-                                __attribute__ ((format(printf, 2, 3)));
-int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-                     void **lockp);
-void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
-unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-                         unsigned int cur_state, unsigned int req_state,
-                         unsigned int flags);
-unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
-                           unsigned int cur_state);
-void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
-int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
-void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
-int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
-                      struct file *file, struct file_lock *fl);
-int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-                  struct file *file, int cmd, struct file_lock *fl);
-int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-                    struct file *file, struct file_lock *fl);
-void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
-                           unsigned int message);
-#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index 542a797ac89a..cf7ea8abec87 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -137,7 +137,8 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
                /* Conversion deadlock avoidance by DLM */
-                if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
+                if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
+                    !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
                    !(lkf & DLM_LKF_NOQUEUE) &&
                    cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
                        lkf |= DLM_LKF_CONVDEADLK;
@@ -164,7 +165,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
 {
        struct gdlm_lock *lp;
-        lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
+        lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS);
        if (!lp)
                return -ENOMEM;
@@ -382,7 +383,7 @@ static int gdlm_add_lvb(struct gdlm_lock *lp)
 {
        char *lvb;
-        lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
+        lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
        if (!lvb)
                return -ENOMEM;
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 9e8265d28377..58fcf8c5bf39 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -183,5 +183,10 @@ int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
                struct file_lock *);
 int gdlm_punlock(void *, struct lm_lockname *, struct file *,
                struct file_lock *);
+/* mount.c */
+extern const struct lm_lockops gdlm_ops;
 #endif
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
index a0e7eda643ed..36a225850bd8 100644
--- a/fs/gfs2/locking/dlm/main.c
+++ b/fs/gfs2/locking/dlm/main.c
@@ -11,8 +11,6 @@
 #include "lock_dlm.h"
-extern struct lm_lockops gdlm_ops;
 static int __init init_lock_dlm(void)
 {
        int error;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a87b09839761..8479da47049c 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -12,8 +12,6 @@
 #include "lock_dlm.h"
-extern struct lm_lockops gdlm_ops;
 static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
 {
        return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index 521694fc19d6..e53db6fd28ab 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -135,7 +135,15 @@ static void process_complete(struct gdlm_lock *lp)
                         lp->lksb.sb_status, lp->lockname.ln_type,
                         (unsigned long long)lp->lockname.ln_number,
                         lp->flags);
-                return;
+                if (lp->lksb.sb_status == -EDEADLOCK &&
+                    lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
+                        lp->req = lp->cur;
+                        acb.lc_ret |= LM_OUT_CONV_DEADLK;
+                        if (lp->cur == DLM_LOCK_IV)
+                                lp->lksb.sb_lkid = 0;
+                        goto out;
+                } else
+                        return;
        }
        /*
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
index d3b8ce6fbbe3..284a5ece8d94 100644
--- a/fs/gfs2/locking/nolock/main.c
+++ b/fs/gfs2/locking/nolock/main.c
@@ -140,7 +140,7 @@ static int nolock_hold_lvb(void *lock, char **lvbp)
        struct nolock_lockspace *nl = lock;
        int error = 0;
-        *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
+        *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
        if (!*lvbp)
                error = -ENOMEM;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 161ab6f2058e..548264b1836d 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -769,8 +769,8 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
        sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
        gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
        reserved = calc_reserved(sdp);
+        gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
        unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
-        gfs2_assert_withdraw(sdp, unused >= 0);
        atomic_add(unused, &sdp->sd_log_blks_free);
        gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
                             sdp->sd_jdesc->jd_blocks);
@@ -779,6 +779,21 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
        gfs2_log_unlock(sdp);
 }
+static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+        struct list_head *head = &tr->tr_list_buf;
+        struct gfs2_bufdata *bd;
+        gfs2_log_lock(sdp);
+        while (!list_empty(head)) {
+                bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
+                list_del_init(&bd->bd_list_tr);
+                tr->tr_num_buf--;
+        }
+        gfs2_log_unlock(sdp);
+        gfs2_assert_warn(sdp, !tr->tr_num_buf);
+}
 /**
 * gfs2_log_commit - Commit a transaction to the log
 * @sdp: the filesystem
@@ -790,7 +805,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
        log_refund(sdp, tr);
-        lops_incore_commit(sdp, tr);
+        buf_lo_incore_commit(sdp, tr);
        sdp->sd_vfs->s_dirt = 1;
        up_read(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index fae59d69d01a..4390f6f4047d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -152,21 +152,6 @@ out:
        unlock_buffer(bd->bd_bh);
 }
-static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
-{
-        struct list_head *head = &tr->tr_list_buf;
-        struct gfs2_bufdata *bd;
-        gfs2_log_lock(sdp);
-        while (!list_empty(head)) {
-                bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
-                list_del_init(&bd->bd_list_tr);
-                tr->tr_num_buf--;
-        }
-        gfs2_log_unlock(sdp);
-        gfs2_assert_warn(sdp, !tr->tr_num_buf);
-}
 static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 {
        struct buffer_head *bh;
@@ -419,8 +404,10 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
                        blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
                        error = gfs2_revoke_add(sdp, blkno, start);
-                        if (error < 0)
+                        if (error < 0) {
+                                brelse(bh);
                                return error;
+                        }
                        else if (error)
                                sdp->sd_found_revokes++;
@@ -737,7 +724,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 const struct gfs2_log_operations gfs2_buf_lops = {
        .lo_add = buf_lo_add,
-        .lo_incore_commit = buf_lo_incore_commit,
        .lo_before_commit = buf_lo_before_commit,
        .lo_after_commit = buf_lo_after_commit,
        .lo_before_scan = buf_lo_before_scan,
@@ -763,7 +749,6 @@ const struct gfs2_log_operations gfs2_rg_lops = {
 const struct gfs2_log_operations gfs2_databuf_lops = {
        .lo_add = databuf_lo_add,
-        .lo_incore_commit = buf_lo_incore_commit,
        .lo_before_commit = databuf_lo_before_commit,
        .lo_after_commit = databuf_lo_after_commit,
        .lo_scan_elements = databuf_lo_scan_elements,
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 41a00df75587..3c0b2737658a 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -57,15 +57,6 @@ static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
                le->le_ops->lo_add(sdp, le);
 }
-static inline void lops_incore_commit(struct gfs2_sbd *sdp,
-                                      struct gfs2_trans *tr)
-{
-        int x;
-        for (x = 0; gfs2_log_ops[x]; x++)
-                if (gfs2_log_ops[x]->lo_incore_commit)
-                        gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
-}
 static inline void lops_before_commit(struct gfs2_sbd *sdp)
 {
        int x;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 9c7765c12d62..053e2ebbbd50 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -89,6 +89,12 @@ static int __init init_gfs2_fs(void)
        if (!gfs2_bufdata_cachep)
                goto fail;
+        gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd",
+                                              sizeof(struct gfs2_rgrpd),
+                                              0, 0, NULL);
+        if (!gfs2_rgrpd_cachep)
+                goto fail;
        error = register_filesystem(&gfs2_fs_type);
        if (error)
                goto fail;
@@ -108,6 +114,9 @@ fail_unregister:
 fail:
        gfs2_glock_exit();
+        if (gfs2_rgrpd_cachep)
+                kmem_cache_destroy(gfs2_rgrpd_cachep);
        if (gfs2_bufdata_cachep)
                kmem_cache_destroy(gfs2_bufdata_cachep);
@@ -133,6 +142,7 @@ static void __exit exit_gfs2_fs(void)
        unregister_filesystem(&gfs2_fs_type);
        unregister_filesystem(&gfs2meta_fs_type);
+        kmem_cache_destroy(gfs2_rgrpd_cachep);
        kmem_cache_destroy(gfs2_bufdata_cachep);
        kmem_cache_destroy(gfs2_inode_cachep);
        kmem_cache_destroy(gfs2_glock_cachep);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index ac772b6d9dbb..90a04a6e3789 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -21,7 +21,6 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
 #include <linux/backing-dev.h>
-#include <linux/pagevec.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -104,11 +103,9 @@ static int gfs2_writepage_common(struct page *page,
        loff_t i_size = i_size_read(inode);
        pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
        unsigned offset;
-        int ret = -EIO;
        if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
                goto out;
-        ret = 0;
        if (current->journal_info)
                goto redirty;
        /* Is the page fully outside i_size? (truncate in progress) */
@@ -280,7 +277,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
        int i;
        int ret;
-        ret = gfs2_trans_begin(sdp, nrblocks, 0);
+        ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
        if (ret < 0)
                return ret;
@@ -510,23 +507,26 @@ static int __gfs2_readpage(void *file, struct page *page)
 static int gfs2_readpage(struct file *file, struct page *page)
 {
        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
-        struct gfs2_holder gh;
+        struct gfs2_holder *gh;
        int error;
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
+        gh = gfs2_glock_is_locked_by_me(ip->i_gl);
-        error = gfs2_glock_nq_atime(&gh);
+        if (!gh) {
-        if (unlikely(error)) {
+                gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
+                if (!gh)
+                        return -ENOBUFS;
+                gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
                unlock_page(page);
-                goto out;
+                error = gfs2_glock_nq_atime(gh);
+                if (likely(error != 0))
+                        goto out;
+                return AOP_TRUNCATED_PAGE;
        }
        error = __gfs2_readpage(file, page);
-        gfs2_glock_dq(&gh);
+        gfs2_glock_dq(gh);
 out:
-        gfs2_holder_uninit(&gh);
+        gfs2_holder_uninit(gh);
-        if (error == GLR_TRYFAILED) {
+        kfree(gh);
-                yield();
-                return AOP_TRUNCATED_PAGE;
-        }
        return error;
 }
@@ -648,15 +648,15 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        if (alloc_required) {
                al = gfs2_alloc_get(ip);
+                if (!al) {
+                        error = -ENOMEM;
+                        goto out_unlock;
+                }
-                error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+                error = gfs2_quota_lock_check(ip);
                if (error)
                        goto out_alloc_put;
-                error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-                if (error)
-                        goto out_qunlock;
                al->al_requested = data_blocks + ind_blocks;
                error = gfs2_inplace_reserve(ip);
                if (error)
@@ -828,7 +828,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
        unsigned int to = from + len;
        int ret;
-        BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0);
+        BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
        ret = gfs2_meta_inode_buffer(ip, &dibh);
        if (unlikely(ret)) {
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 793e334d098e..4a5e676b4420 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -43,7 +43,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
        struct gfs2_holder d_gh;
        struct gfs2_inode *ip = NULL;
        int error;
-        int had_lock=0;
+        int had_lock = 0;
        if (inode) {
                if (is_bad_inode(inode))
@@ -54,7 +54,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
        if (sdp->sd_args.ar_localcaching)
                goto valid;
-        had_lock = gfs2_glock_is_locked_by_me(dip->i_gl);
+        had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
        if (!had_lock) {
                error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
                if (error)
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 334c7f85351b..990d9f4bc463 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -204,8 +204,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
        inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
                                        inum->no_addr,
                                        0, 0);
-        if (!inode)
-                goto fail;
        if (IS_ERR(inode)) {
                error = PTR_ERR(inode);
                goto fail;
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index f4842f2548cd..e1b7d525a066 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -30,7 +30,6 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "lm.h"
 #include "log.h"
 #include "meta_io.h"
 #include "quota.h"
@@ -39,6 +38,7 @@
 #include "util.h"
 #include "eaops.h"
 #include "ops_address.h"
+#include "ops_inode.h"
 /**
 * gfs2_llseek - seek to a location in a file
@@ -369,12 +369,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        if (al == NULL)
                goto out_unlock;
-        ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        ret = gfs2_quota_lock_check(ip);
        if (ret)
                goto out_alloc_put;
-        ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-        if (ret)
-                goto out_quota_unlock;
        al->al_requested = data_blocks + ind_blocks;
        ret = gfs2_inplace_reserve(ip);
        if (ret)
@@ -596,6 +593,36 @@ static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
        return generic_setlease(file, arg, fl);
 }
+static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                      struct file *file, struct file_lock *fl)
+{
+        int error = -EIO;
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
+                                sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+        return error;
+}
+static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                  struct file *file, int cmd, struct file_lock *fl)
+{
+        int error = -EIO;
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                error = sdp->sd_lockstruct.ls_ops->lm_plock(
+                                sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
+        return error;
+}
+static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+                    struct file *file, struct file_lock *fl)
+{
+        int error = -EIO;
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                error = sdp->sd_lockstruct.ls_ops->lm_punlock(
+                                sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+        return error;
+}
 /**
 * gfs2_lock - acquire/release a posix lock on a file
 * @file: the file pointer
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4bee6aa845e4..ef9c6c4f80f6 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -26,7 +26,6 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "lm.h"
 #include "mount.h"
 #include "ops_fstype.h"
 #include "ops_dentry.h"
@@ -363,6 +362,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
        return rc;
 }
+static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
+{
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
+                                        sdp->sd_lockstruct.ls_lockspace);
+}
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
        struct gfs2_holder ji_gh;
@@ -542,7 +548,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
        }
        ip = GFS2_I(sdp->sd_rindex);
        set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
-        sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
+        sdp->sd_rindex_uptodate = 0;
        /* Read in the quota inode */
        sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
@@ -705,6 +711,69 @@ fail:
 }
 /**
+ * gfs2_lm_mount - mount a locking protocol
+ * @sdp: the filesystem
+ * @args: mount arguments
+ * @silent: if 1, don't complain if the FS isn't a GFS2 fs
+ *
+ * Returns: errno
+ */
+static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
+{
+        char *proto = sdp->sd_proto_name;
+        char *table = sdp->sd_table_name;
+        int flags = LM_MFLAG_CONV_NODROP;
+        int error;
+        if (sdp->sd_args.ar_spectator)
+                flags |= LM_MFLAG_SPECTATOR;
+        fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
+        error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
+                                     gfs2_glock_cb, sdp,
+                                     GFS2_MIN_LVB_SIZE, flags,
+                                     &sdp->sd_lockstruct, &sdp->sd_kobj);
+        if (error) {
+                fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
+                        proto, table, sdp->sd_args.ar_hostdata);
+                goto out;
+        }
+        if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
+            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+            gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
+                                  GFS2_MIN_LVB_SIZE)) {
+                gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+                goto out;
+        }
+        if (sdp->sd_args.ar_spectator)
+                snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
+        else
+                snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
+                         sdp->sd_lockstruct.ls_jid);
+        fs_info(sdp, "Joined cluster. Now mounting FS...\n");
+        if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
+            !sdp->sd_args.ar_ignore_local_fs) {
+                sdp->sd_args.ar_localflocks = 1;
+                sdp->sd_args.ar_localcaching = 1;
+        }
+out:
+        return error;
+}
+void gfs2_lm_unmount(struct gfs2_sbd *sdp)
+{
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+}
+/**
 * fill_super - Read in superblock
 * @sb: The VFS superblock
 * @data: Mount options
@@ -874,7 +943,6 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
 {
        struct kstat stat;
        struct nameidata nd;
-        struct file_system_type *fstype;
        struct super_block *sb = NULL, *s;
        int error;
@@ -886,8 +954,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
        }
        error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
-        fstype = get_fs_type("gfs2");
+        list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
-        list_for_each_entry(s, &fstype->fs_supers, s_instances) {
                if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
                    (S_ISDIR(stat.mode) &&
                     s == nd.path.dentry->d_inode->i_sb)) {
@@ -931,7 +998,6 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
                error = PTR_ERR(new);
                goto error;
        }
-        module_put(fs_type->owner);
        new->s_flags = flags;
        strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
        sb_set_blocksize(new, sb->s_blocksize);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e87412902bed..2686ad4c0029 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -200,15 +200,15 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
        if (alloc_required) {
                struct gfs2_alloc *al = gfs2_alloc_get(dip);
+                if (!al) {
+                        error = -ENOMEM;
+                        goto out_gunlock;
+                }
-                error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+                error = gfs2_quota_lock_check(dip);
                if (error)
                        goto out_alloc;
-                error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
-                if (error)
-                        goto out_gunlock_q;
                al->al_requested = sdp->sd_max_dirres;
                error = gfs2_inplace_reserve(dip);
@@ -716,15 +716,15 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
        if (alloc_required) {
                struct gfs2_alloc *al = gfs2_alloc_get(ndip);
+                if (!al) {
+                        error = -ENOMEM;
+                        goto out_gunlock;
+                }
-                error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+                error = gfs2_quota_lock_check(ndip);
                if (error)
                        goto out_alloc;
-                error = gfs2_quota_check(ndip, ndip->i_inode.i_uid, ndip->i_inode.i_gid);
-                if (error)
-                        goto out_gunlock_q;
                al->al_requested = sdp->sd_max_dirres;
                error = gfs2_inplace_reserve(ndip);
@@ -898,7 +898,7 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
        int error;
        int unlock = 0;
-        if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) {
+        if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
                if (error)
                        return error;
@@ -953,7 +953,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
        if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
                ogid = ngid = NO_QUOTA_CHANGE;
-        gfs2_alloc_get(ip);
+        if (!gfs2_alloc_get(ip))
+                return -ENOMEM;
        error = gfs2_quota_lock(ip, nuid, ngid);
        if (error)
@@ -981,8 +982,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
        brelse(dibh);
        if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
-                gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
+                u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
-                gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
+                gfs2_quota_change(ip, -blocks, ouid, ogid);
+                gfs2_quota_change(ip, blocks, nuid, ngid);
        }
 out_end_trans:
@@ -1064,7 +1066,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
        int error;
        int unlock = 0;
-        if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) {
+        if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
                if (error)
                        return error;
@@ -1148,16 +1150,6 @@ const struct inode_operations gfs2_file_iops = {
        .removexattr = gfs2_removexattr,
 };
-const struct inode_operations gfs2_dev_iops = {
-        .permission = gfs2_permission,
-        .setattr = gfs2_setattr,
-        .getattr = gfs2_getattr,
-        .setxattr = gfs2_setxattr,
-        .getxattr = gfs2_getxattr,
-        .listxattr = gfs2_listxattr,
-        .removexattr = gfs2_removexattr,
-};
 const struct inode_operations gfs2_dir_iops = {
        .create = gfs2_create,
        .lookup = gfs2_lookup,
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index fd8cee231e1d..14b4b797622a 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -15,7 +15,6 @@
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
-extern const struct inode_operations gfs2_dev_iops;
 extern const struct file_operations gfs2_file_fops;
 extern const struct file_operations gfs2_dir_fops;
 extern const struct file_operations gfs2_file_fops_nolock;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 5e524217944a..2278c68b7e35 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -25,7 +25,6 @@
 #include "incore.h"
 #include "glock.h"
 #include "inode.h"
-#include "lm.h"
 #include "log.h"
 #include "mount.h"
 #include "ops_super.h"
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a08dabd6ce90..56aaf915c59a 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -94,7 +94,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
        struct gfs2_quota_data *qd;
        int error;
-        qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
+        qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS);
        if (!qd)
                return -ENOMEM;
@@ -616,16 +616,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
        s64 value;
        int err = -EIO;
-        if (gfs2_is_stuffed(ip)) {
+        if (gfs2_is_stuffed(ip))
-                struct gfs2_alloc *al = NULL;
-                al = gfs2_alloc_get(ip);
-                /* just request 1 blk */
-                al->al_requested = 1;
-                gfs2_inplace_reserve(ip);
                gfs2_unstuff_dinode(ip, NULL);
-                gfs2_inplace_release(ip);
+        
-                gfs2_alloc_put(ip);
-        }
        page = grab_cache_page(mapping, index);
        if (!page)
                return -ENOMEM;
@@ -690,14 +683,14 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
        unsigned int qx, x;
        struct gfs2_quota_data *qd;
        loff_t offset;
-        unsigned int nalloc = 0;
+        unsigned int nalloc = 0, blocks;
        struct gfs2_alloc *al = NULL;
        int error;
        gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
                              &data_blocks, &ind_blocks);
-        ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
+        ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS);
        if (!ghs)
                return -ENOMEM;
@@ -727,30 +720,33 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
                        nalloc++;
        }
-        if (nalloc) {
+        al = gfs2_alloc_get(ip);
-                al = gfs2_alloc_get(ip);
+        if (!al) {
+                error = -ENOMEM;
+                goto out_gunlock;
+        }
+        /* 
+         * 1 blk for unstuffing inode if stuffed. We add this extra
+         * block to the reservation unconditionally. If the inode
+         * doesn't need unstuffing, the block will be released to the 
+         * rgrp since it won't be allocated during the transaction
+         */
+        al->al_requested = 1;
+        /* +1 in the end for block requested above for unstuffing */
+        blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1;
-                al->al_requested = nalloc * (data_blocks + ind_blocks);
+        if (nalloc)
+                al->al_requested += nalloc * (data_blocks + ind_blocks);                
+        error = gfs2_inplace_reserve(ip);
+        if (error)
+                goto out_alloc;
-                error = gfs2_inplace_reserve(ip);
+        if (nalloc)
-                if (error)
+                blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
-                        goto out_alloc;
+        error = gfs2_trans_begin(sdp, blocks, 0);
-                error = gfs2_trans_begin(sdp,
+        if (error)
-                                         al->al_rgd->rd_length +
+                goto out_ipres;
-                                         num_qd * data_blocks +
-                                         nalloc * ind_blocks +
-                                         RES_DINODE + num_qd +
-                                         RES_STATFS, 0);
-                if (error)
-                        goto out_ipres;
-        } else {
-                error = gfs2_trans_begin(sdp,
-                                         num_qd * data_blocks +
-                                         RES_DINODE + num_qd, 0);
-                if (error)
-                        goto out_gunlock;
-        }
        for (x = 0; x < num_qd; x++) {
                qd = qda[x];
@@ -769,11 +765,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 out_end_trans:
        gfs2_trans_end(sdp);
 out_ipres:
-        if (nalloc)
+        gfs2_inplace_release(ip);
-                gfs2_inplace_release(ip);
 out_alloc:
-        if (nalloc)
+        gfs2_alloc_put(ip);
-                gfs2_alloc_put(ip);
 out_gunlock:
        gfs2_glock_dq_uninit(&i_gh);
 out:
@@ -1124,12 +1118,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
        error = -ENOMEM;
        sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
-                                       sizeof(unsigned char *), GFP_KERNEL);
+                                       sizeof(unsigned char *), GFP_NOFS);
        if (!sdp->sd_quota_bitmap)
                return error;
        for (x = 0; x < sdp->sd_quota_chunks; x++) {
-                sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
+                sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
                if (!sdp->sd_quota_bitmap[x])
                        goto fail;
        }
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index a8be1417051f..3b7f4b0e5dfe 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -32,4 +32,21 @@ int gfs2_quota_init(struct gfs2_sbd *sdp);
 void gfs2_quota_scan(struct gfs2_sbd *sdp);
 void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        int ret;
+        if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+                return 0;
+        ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+        if (ret)
+                return ret;
+        if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+                return 0;
+        ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
+        if (ret)
+                gfs2_quota_unlock(ip);
+        return ret;
+}
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 6fb07d67ca8a..2888e4b4b1c5 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -20,7 +20,6 @@
 #include "bmap.h"
 #include "glock.h"
 #include "glops.h"
-#include "lm.h"
 #include "lops.h"
 #include "meta_io.h"
 #include "recovery.h"
@@ -69,7 +68,7 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
                return 0;
        }
-        rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
+        rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS);
        if (!rr)
                return -ENOMEM;
@@ -150,7 +149,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
                          struct gfs2_log_header_host *head)
 {
        struct buffer_head *bh;
-        struct gfs2_log_header_host lh;
+        struct gfs2_log_header_host uninitialized_var(lh);
        const u32 nothing = 0;
        u32 hash;
        int error;
@@ -425,6 +424,16 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
        return error;
 }
+static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+                                  unsigned int message)
+{
+        if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+                sdp->sd_lockstruct.ls_ops->lm_recovery_done(
+                        sdp->sd_lockstruct.ls_lockspace, jid, message);
+}
 /**
 * gfs2_recover_journal - recovery a given journal
 * @jd: the struct gfs2_jdesc describing the journal
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3552110b2e5f..7e8f0b1d6c6e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/prefetch.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -33,6 +34,16 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
+#if BITS_PER_LONG == 32
+#define LBITMASK   (0x55555555UL)
+#define LBITSKIP55 (0x55555555UL)
+#define LBITSKIP00 (0x00000000UL)
+#else
+#define LBITMASK   (0x5555555555555555UL)
+#define LBITSKIP55 (0x5555555555555555UL)
+#define LBITSKIP00 (0x0000000000000000UL)
+#endif
 /*
 * These routines are used by the resource group routines (rgrp.c)
 * to keep track of block allocation.  Each block is represented by two
@@ -53,7 +64,8 @@ static const char valid_change[16] = {
 };
 static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
-                        unsigned char old_state, unsigned char new_state);
+                        unsigned char old_state, unsigned char new_state,
+                        unsigned int *n);
 /**
 * gfs2_setbit - Set a bit in the bitmaps
@@ -64,26 +76,32 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 *
 */
-static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
-                        unsigned int buflen, u32 block,
+                               unsigned char *buf2, unsigned int offset,
-                        unsigned char new_state)
+                               unsigned int buflen, u32 block,
+                               unsigned char new_state)
 {
-        unsigned char *byte, *end, cur_state;
+        unsigned char *byte1, *byte2, *end, cur_state;
-        unsigned int bit;
+        const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
-        byte = buffer + (block / GFS2_NBBY);
+        byte1 = buf1 + offset + (block / GFS2_NBBY);
-        bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+        end = buf1 + offset + buflen;
-        end = buffer + buflen;
-        gfs2_assert(rgd->rd_sbd, byte < end);
+        BUG_ON(byte1 >= end);
-        cur_state = (*byte >> bit) & GFS2_BIT_MASK;
+        cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
-        if (valid_change[new_state * 4 + cur_state]) {
+        if (unlikely(!valid_change[new_state * 4 + cur_state])) {
-                *byte ^= cur_state << bit;
-                *byte |= new_state << bit;
-        } else
                gfs2_consist_rgrpd(rgd);
+                return;
+        }
+        *byte1 ^= (cur_state ^ new_state) << bit;
+        if (buf2) {
+                byte2 = buf2 + offset + (block / GFS2_NBBY);
+                cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
+                *byte2 ^= (cur_state ^ new_state) << bit;
+        }
 }
 /**
@@ -94,10 +112,12 @@ static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
 *
 */
-static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
-                                  unsigned int buflen, u32 block)
+                                         const unsigned char *buffer,
+                                         unsigned int buflen, u32 block)
 {
-        unsigned char *byte, *end, cur_state;
+        const unsigned char *byte, *end;
+        unsigned char cur_state;
        unsigned int bit;
        byte = buffer + (block / GFS2_NBBY);
@@ -126,47 +146,66 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
 * Return: the block number (bitmap buffer scope) that was found
 */
-static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
+static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
-                       unsigned char old_state)
+                       u8 old_state)
 {
-        unsigned char *byte;
+        const u8 *byte, *start, *end;
-        u32 blk = goal;
+        int bit, startbit;
-        unsigned int bit, bitlong;
+        u32 g1, g2, misaligned;
-        unsigned long *plong, plong55;
+        unsigned long *plong;
+        unsigned long lskipval;
-        byte = buffer + (goal / GFS2_NBBY);
-        plong = (unsigned long *)(buffer + (goal / GFS2_NBBY));
+        lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
-        bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+        g1 = (goal / GFS2_NBBY);
-        bitlong = bit;
+        start = buffer + g1;
-#if BITS_PER_LONG == 32
+        byte = start;
-        plong55 = 0x55555555;
+        end = buffer + buflen;
-#else
+        g2 = ALIGN(g1, sizeof(unsigned long));
-        plong55 = 0x5555555555555555;
+        plong = (unsigned long *)(buffer + g2);
-#endif
+        startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-        while (byte < buffer + buflen) {
+        misaligned = g2 - g1;
+        if (!misaligned)
-                if (bitlong == 0 && old_state == 0 && *plong == plong55) {
+                goto ulong_aligned;
-                        plong++;
+/* parse the bitmap a byte at a time */
-                        byte += sizeof(unsigned long);
+misaligned:
-                        blk += sizeof(unsigned long) * GFS2_NBBY;
+        while (byte < end) {
-                        continue;
+                if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
+                        return goal +
+                                (((byte - start) * GFS2_NBBY) +
+                                 ((bit - startbit) >> 1));
                }
-                if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
-                        return blk;
                bit += GFS2_BIT_SIZE;
-                if (bit >= 8) {
+                if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
                        bit = 0;
                        byte++;
+                        misaligned--;
+                        if (!misaligned) {
+                                plong = (unsigned long *)byte;
+                                goto ulong_aligned;
+                        }
                }
-                bitlong += GFS2_BIT_SIZE;
-                if (bitlong >= sizeof(unsigned long) * 8) {
-                        bitlong = 0;
-                        plong++;
-                }
-                blk++;
        }
+        return BFITNOENT;
+/* parse the bitmap a unsigned long at a time */
+ulong_aligned:
+        /* Stop at "end - 1" or else prefetch can go past the end and segfault.
+           We could "if" it but we'd lose some of the performance gained.
+           This way will only slow down searching the very last 4/8 bytes
+           depending on architecture.  I've experimented with several ways
+           of writing this section such as using an else before the goto
+           but this one seems to be the fastest. */
+        while ((unsigned char *)plong < end - 1) {
+                prefetch(plong + 1);
+                if (((*plong) & LBITMASK) != lskipval)
+                        break;
+                plong++;
+        }
+        if ((unsigned char *)plong < end) {
+                byte = (const u8 *)plong;
+                misaligned += sizeof(unsigned long) - 1;
+                goto misaligned;
+        }
        return BFITNOENT;
 }
@@ -179,14 +218,14 @@ static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
 * Returns: The number of bits
 */
-static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
-                              unsigned int buflen, unsigned char state)
+                         unsigned int buflen, u8 state)
 {
-        unsigned char *byte = buffer;
+        const u8 *byte = buffer;
-        unsigned char *end = buffer + buflen;
+        const u8 *end = buffer + buflen;
-        unsigned char state1 = state << 2;
+        const u8 state1 = state << 2;
-        unsigned char state2 = state << 4;
+        const u8 state2 = state << 4;
-        unsigned char state3 = state << 6;
+        const u8 state3 = state << 6;
        u32 count = 0;
        for (; byte < end; byte++) {
@@ -353,7 +392,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
                }
                kfree(rgd->rd_bits);
-                kfree(rgd);
+                kmem_cache_free(gfs2_rgrpd_cachep, rgd);
        }
 }
@@ -516,7 +555,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
                return error;
        }
-        rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
+        rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
        error = -ENOMEM;
        if (!rgd)
                return error;
@@ -539,7 +578,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
                return error;
        rgd->rd_gl->gl_object = rgd;
-        rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
+        rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
        rgd->rd_flags |= GFS2_RDF_CHECK;
        return error;
 }
@@ -575,7 +614,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
                }
        }
-        sdp->sd_rindex_vn = ip->i_gl->gl_vn;
+        sdp->sd_rindex_uptodate = 1;
        return 0;
 }
@@ -609,7 +648,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
                }
        }
-        sdp->sd_rindex_vn = ip->i_gl->gl_vn;
+        sdp->sd_rindex_uptodate = 1;
        return 0;
 }
@@ -642,9 +681,9 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
                return error;
        /* Read new copy from disk if we don't have the latest */
-        if (sdp->sd_rindex_vn != gl->gl_vn) {
+        if (!sdp->sd_rindex_uptodate) {
                mutex_lock(&sdp->sd_rindex_mutex);
-                if (sdp->sd_rindex_vn != gl->gl_vn) {
+                if (!sdp->sd_rindex_uptodate) {
                        error = gfs2_ri_update(ip);
                        if (error)
                                gfs2_glock_dq_uninit(ri_gh);
@@ -655,21 +694,31 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
        return error;
 }
-static void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf)
+static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 {
        const struct gfs2_rgrp *str = buf;
+        struct gfs2_rgrp_host *rg = &rgd->rd_rg;
+        u32 rg_flags;
-        rg->rg_flags = be32_to_cpu(str->rg_flags);
+        rg_flags = be32_to_cpu(str->rg_flags);
+        if (rg_flags & GFS2_RGF_NOALLOC)
+                rgd->rd_flags |= GFS2_RDF_NOALLOC;
+        else
+                rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
        rg->rg_free = be32_to_cpu(str->rg_free);
        rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
        rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
 }
-static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf)
+static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
        struct gfs2_rgrp *str = buf;
+        struct gfs2_rgrp_host *rg = &rgd->rd_rg;
+        u32 rg_flags = 0;
-        str->rg_flags = cpu_to_be32(rg->rg_flags);
+        if (rgd->rd_flags & GFS2_RDF_NOALLOC)
+                rg_flags |= GFS2_RGF_NOALLOC;
+        str->rg_flags = cpu_to_be32(rg_flags);
        str->rg_free = cpu_to_be32(rg->rg_free);
        str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
        str->__pad = cpu_to_be32(0);
@@ -726,9 +775,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
                }
        }
-        if (rgd->rd_rg_vn != gl->gl_vn) {
+        if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
-                gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
+                gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
-                rgd->rd_rg_vn = gl->gl_vn;
+                rgd->rd_flags |= GFS2_RDF_UPTODATE;
        }
        spin_lock(&sdp->sd_rindex_spin);
@@ -840,7 +889,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
        struct gfs2_sbd *sdp = rgd->rd_sbd;
        int ret = 0;
-        if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC)
+        if (rgd->rd_flags & GFS2_RDF_NOALLOC)
                return 0;
        spin_lock(&sdp->sd_rindex_spin);
@@ -866,13 +915,15 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
        u32 goal = 0, block;
        u64 no_addr;
        struct gfs2_sbd *sdp = rgd->rd_sbd;
+        unsigned int n;
        for(;;) {
                if (goal >= rgd->rd_data)
                        break;
                down_write(&sdp->sd_log_flush_lock);
+                n = 1;
                block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
-                                     GFS2_BLKST_UNLINKED);
+                                     GFS2_BLKST_UNLINKED, &n);
                up_write(&sdp->sd_log_flush_lock);
                if (block == BFITNOENT)
                        break;
@@ -904,24 +955,20 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
                                            u64 rglast)
 {
-        struct gfs2_rgrpd *rgd = NULL;
+        struct gfs2_rgrpd *rgd;
        spin_lock(&sdp->sd_rindex_spin);
-        if (list_empty(&sdp->sd_rindex_recent_list))
+        if (rglast) {
-                goto out;
+                list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
+                        if (rgrp_contains_block(rgd, rglast))
-        if (!rglast)
+                                goto out;
-                goto first;
+                }
-        list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
-                if (rgd->rd_addr == rglast)
-                        goto out;
        }
+        rgd = NULL;
-first:
+        if (!list_empty(&sdp->sd_rindex_recent_list))
-        rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
+                rgd = list_entry(sdp->sd_rindex_recent_list.next,
-                         rd_recent);
+                                 struct gfs2_rgrpd, rd_recent);
 out:
        spin_unlock(&sdp->sd_rindex_spin);
        return rgd;
@@ -1067,7 +1114,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
        /* Try recently successful rgrps */
-        rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
+        rgd = recent_rgrp_first(sdp, ip->i_goal);
        while (rgd) {
                rg_locked = 0;
@@ -1151,8 +1198,6 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
        }
 out:
-        ip->i_last_rg_alloc = rgd->rd_addr;
        if (begin) {
                recent_rgrp_add(rgd);
                rgd = gfs2_rgrpd_get_next(rgd);
@@ -1275,6 +1320,7 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
 * @goal: the goal block within the RG (start here to search for avail block)
 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
 * @new_state: GFS2_BLKST_XXX the after-allocation block state
+ * @n: The extent length
 *
 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
 * Add the found bitmap buffer to the transaction.
@@ -1290,13 +1336,17 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
 */
 static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
-                        unsigned char old_state, unsigned char new_state)
+                        unsigned char old_state, unsigned char new_state,
+                        unsigned int *n)
 {
        struct gfs2_bitmap *bi = NULL;
-        u32 length = rgd->rd_length;
+        const u32 length = rgd->rd_length;
        u32 blk = 0;
        unsigned int buf, x;
+        const unsigned int elen = *n;
+        const u8 *buffer;
+        *n = 0;
        /* Find bitmap block that contains bits for goal block */
        for (buf = 0; buf < length; buf++) {
                bi = rgd->rd_bits + buf;
@@ -1317,12 +1367,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
        for (x = 0; x <= length; x++) {
                /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
                   bitmaps, so we must search the originals for that. */
+                buffer = bi->bi_bh->b_data + bi->bi_offset;
                if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
-                        blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset,
+                        buffer = bi->bi_clone + bi->bi_offset;
-                                          bi->bi_len, goal, old_state);
-                else
+                blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state);
-                        blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset,
-                                          bi->bi_len, goal, old_state);
                if (blk != BFITNOENT)
                        break;
@@ -1333,12 +1382,23 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
        }
        if (blk != BFITNOENT && old_state != new_state) {
+                *n = 1;
                gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
-                gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+                gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
                            bi->bi_len, blk, new_state);
-                if (bi->bi_clone)
+                goal = blk;
-                        gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
+                while (*n < elen) {
-                                    bi->bi_len, blk, new_state);
+                        goal++;
+                        if (goal >= (bi->bi_len * GFS2_NBBY))
+                                break;
+                        if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
+                            GFS2_BLKST_FREE)
+                                break;
+                        gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
+                                    bi->bi_offset, bi->bi_len, goal,
+                                    new_state);
+                        (*n)++;
+                }
        }
        return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk;
@@ -1393,7 +1453,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
                               bi->bi_len);
                }
                gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
-                gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+                gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset,
                            bi->bi_len, buf_blk, new_state);
        }
@@ -1401,13 +1461,13 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 }
 /**
- * gfs2_alloc_data - Allocate a data block
+ * gfs2_alloc_block - Allocate a block
- * @ip: the inode to allocate the data block for
+ * @ip: the inode to allocate the block for
 *
 * Returns: the allocated block
 */
-u64 gfs2_alloc_data(struct gfs2_inode *ip)
+u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_alloc *al = ip->i_alloc;
@@ -1415,77 +1475,31 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
        u32 goal, blk;
        u64 block;
-        if (rgrp_contains_block(rgd, ip->i_di.di_goal_data))
+        if (rgrp_contains_block(rgd, ip->i_goal))
-                goal = ip->i_di.di_goal_data - rgd->rd_data0;
+                goal = ip->i_goal - rgd->rd_data0;
        else
-                goal = rgd->rd_last_alloc_data;
+                goal = rgd->rd_last_alloc;
-        blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+        blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
        BUG_ON(blk == BFITNOENT);
-        rgd->rd_last_alloc_data = blk;
+        rgd->rd_last_alloc = blk;
        block = rgd->rd_data0 + blk;
-        ip->i_di.di_goal_data = block;
+        ip->i_goal = block;
-        gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+        gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n);
-        rgd->rd_rg.rg_free--;
+        rgd->rd_rg.rg_free -= *n;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
-        al->al_alloced++;
+        al->al_alloced += *n;
-        gfs2_statfs_change(sdp, 0, -1, 0);
+        gfs2_statfs_change(sdp, 0, -*n, 0);
-        gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
+        gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
        spin_lock(&sdp->sd_rindex_spin);
-        rgd->rd_free_clone--;
+        rgd->rd_free_clone -= *n;
-        spin_unlock(&sdp->sd_rindex_spin);
-        return block;
-}
-/**
- * gfs2_alloc_meta - Allocate a metadata block
- * @ip: the inode to allocate the metadata block for
- *
- * Returns: the allocated block
- */
-u64 gfs2_alloc_meta(struct gfs2_inode *ip)
-{
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        struct gfs2_alloc *al = ip->i_alloc;
-        struct gfs2_rgrpd *rgd = al->al_rgd;
-        u32 goal, blk;
-        u64 block;
-        if (rgrp_contains_block(rgd, ip->i_di.di_goal_meta))
-                goal = ip->i_di.di_goal_meta - rgd->rd_data0;
-        else
-                goal = rgd->rd_last_alloc_meta;
-        blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
-        BUG_ON(blk == BFITNOENT);
-        rgd->rd_last_alloc_meta = blk;
-        block = rgd->rd_data0 + blk;
-        ip->i_di.di_goal_meta = block;
-        gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
-        rgd->rd_rg.rg_free--;
-        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
-        al->al_alloced++;
-        gfs2_statfs_change(sdp, 0, -1, 0);
-        gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
-        gfs2_trans_add_unrevoke(sdp, block);
-        spin_lock(&sdp->sd_rindex_spin);
-        rgd->rd_free_clone--;
        spin_unlock(&sdp->sd_rindex_spin);
        return block;
@@ -1505,12 +1519,13 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
        struct gfs2_rgrpd *rgd = al->al_rgd;
        u32 blk;
        u64 block;
+        unsigned int n = 1;
-        blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
+        blk = rgblk_search(rgd, rgd->rd_last_alloc,
-                           GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
+                           GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
        BUG_ON(blk == BFITNOENT);
-        rgd->rd_last_alloc_meta = blk;
+        rgd->rd_last_alloc = blk;
        block = rgd->rd_data0 + blk;
@@ -1519,12 +1534,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
        rgd->rd_rg.rg_dinodes++;
        *generation = rgd->rd_rg.rg_igeneration++;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
        al->al_alloced++;
        gfs2_statfs_change(sdp, 0, -1, +1);
-        gfs2_trans_add_unrevoke(sdp, block);
+        gfs2_trans_add_unrevoke(sdp, block, 1);
        spin_lock(&sdp->sd_rindex_spin);
        rgd->rd_free_clone--;
@@ -1553,7 +1568,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
        rgd->rd_rg.rg_free += blen;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
        gfs2_trans_add_rg(rgd);
@@ -1581,7 +1596,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
        rgd->rd_rg.rg_free += blen;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
        gfs2_trans_add_rg(rgd);
@@ -1601,7 +1616,7 @@ void gfs2_unlink_di(struct inode *inode)
        if (!rgd)
                return;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
        gfs2_trans_add_rg(rgd);
 }
@@ -1621,7 +1636,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
        rgd->rd_rg.rg_free++;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
-        gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
        gfs2_statfs_change(sdp, 0, +1, -1);
        gfs2_trans_add_rg(rgd);
@@ -1699,8 +1714,7 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
 *
 */
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
-                      int flags)
 {
        unsigned int x;
@@ -1708,7 +1722,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
                                GFP_NOFS | __GFP_NOFAIL);
        for (x = 0; x < rlist->rl_rgrps; x++)
                gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
-                                state, flags,
+                                state, 0,
                                &rlist->rl_ghs[x]);
 }
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 149bb161f4b6..3181c7e624bf 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -1,6 +1,6 @@
 /*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -46,8 +46,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip);
 unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
-u64 gfs2_alloc_data(struct gfs2_inode *ip);
+u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n);
-u64 gfs2_alloc_meta(struct gfs2_inode *ip);
 u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
 void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -64,8 +63,7 @@ struct gfs2_rgrp_list {
 void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
                    u64 block);
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
-                      int flags);
 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
 u64 gfs2_ri_total(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ef0562c3bc71..7aeacbc65f35 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -210,7 +210,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
        struct page *page;
        struct bio *bio;
-        page = alloc_page(GFP_KERNEL);
+        page = alloc_page(GFP_NOFS);
        if (unlikely(!page))
                return -ENOBUFS;
@@ -218,7 +218,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
        ClearPageDirty(page);
        lock_page(page);
-        bio = bio_alloc(GFP_KERNEL, 1);
+        bio = bio_alloc(GFP_NOFS, 1);
        if (unlikely(!bio)) {
                __free_page(page);
                return -ENOBUFS;
@@ -316,6 +316,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
                sdp->sd_heightsize[x] = space;
        }
        sdp->sd_max_height = x;
+        sdp->sd_heightsize[x] = ~0;
        gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
        sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
@@ -334,6 +335,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
                sdp->sd_jheightsize[x] = space;
        }
        sdp->sd_max_jheight = x;
+        sdp->sd_jheightsize[x] = ~0;
        gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
        return 0;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 60a870e430be..44361ecc44f7 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -17,6 +17,7 @@ void gfs2_tune_init(struct gfs2_tune *gt);
 int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
 int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
 int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
+void gfs2_lm_unmount(struct gfs2_sbd *sdp);
 static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index eaa3b7b2f99e..9ab9fc85ecd0 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -20,7 +20,6 @@
 #include "gfs2.h"
 #include "incore.h"
-#include "lm.h"
 #include "sys.h"
 #include "super.h"
 #include "glock.h"
@@ -328,15 +327,9 @@ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
 }                                                                           \
 static struct counters_attr counters_attr_##name = __ATTR_RO(name)
-COUNTERS_ATTR(glock_count,      "%u\n");
-COUNTERS_ATTR(glock_held_count, "%u\n");
-COUNTERS_ATTR(inode_count,      "%u\n");
 COUNTERS_ATTR(reclaimed,        "%u\n");
 static struct attribute *counters_attrs[] = {
-        &counters_attr_glock_count.attr,
-        &counters_attr_glock_held_count.attr,
-        &counters_attr_inode_count.attr,
        &counters_attr_reclaimed.attr,
        NULL,
 };
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 73e5d92a657c..f677b8a83f0c 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -146,30 +146,25 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
        lops_add(sdp, &bd->bd_le);
 }
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
 {
-        struct gfs2_bufdata *bd;
+        struct gfs2_bufdata *bd, *tmp;
-        int found = 0;
+        struct gfs2_trans *tr = current->journal_info;
+        unsigned int n = len;
        gfs2_log_lock(sdp);
+        list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) {
-        list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) {
+                if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) {
-                if (bd->bd_blkno == blkno) {
                        list_del_init(&bd->bd_le.le_list);
                        gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
                        sdp->sd_log_num_revoke--;
-                        found = 1;
+                        kmem_cache_free(gfs2_bufdata_cachep, bd);
-                        break;
+                        tr->tr_num_revoke_rm++;
+                        if (--n == 0)
+                                break;
                }
        }
        gfs2_log_unlock(sdp);
-        if (found) {
-                struct gfs2_trans *tr = current->journal_info;
-                kmem_cache_free(gfs2_bufdata_cachep, bd);
-                tr->tr_num_revoke_rm++;
-        }
 }
 void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index e826f0dab80a..edf9d4bd908e 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -32,7 +32,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp);
 void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
 void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
 #endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 424a0774eda8..d31e355c61fb 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -19,12 +19,12 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "glock.h"
-#include "lm.h"
 #include "util.h"
 struct kmem_cache *gfs2_glock_cachep __read_mostly;
 struct kmem_cache *gfs2_inode_cachep __read_mostly;
 struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
+struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
 void gfs2_assert_i(struct gfs2_sbd *sdp)
 {
@@ -32,6 +32,28 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
               sdp->sd_fsname);
 }
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+{
+        va_list args;
+        if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                return 0;
+        va_start(args, fmt);
+        vprintk(fmt, args);
+        va_end(args);
+        fs_err(sdp, "about to withdraw this file system\n");
+        BUG_ON(sdp->sd_args.ar_debug);
+        fs_err(sdp, "telling LM to withdraw\n");
+        gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
+        fs_err(sdp, "withdrawn\n");
+        dump_stack();
+        return -1;
+}
 /**
 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
 * Returns: -1 if this call withdrew the machine,
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 28938a46cf47..509c5d60bd80 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -147,6 +147,7 @@ gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__);
 extern struct kmem_cache *gfs2_glock_cachep;
 extern struct kmem_cache *gfs2_inode_cachep;
 extern struct kmem_cache *gfs2_bufdata_cachep;
+extern struct kmem_cache *gfs2_rgrpd_cachep;
 static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
                                           unsigned int *p)
@@ -163,6 +164,7 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
 void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
                      unsigned int bit, int new_value);
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...);
 #endif /* __UTIL_DOT_H__ */
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 878bf25dbc6a..92fb358ce824 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -229,7 +229,7 @@ skip:
 static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 {
        struct hfs_btree *tree;
-        struct hfs_bnode *node, *new_node;
+        struct hfs_bnode *node, *new_node, *next_node;
        struct hfs_bnode_desc node_desc;
        int num_recs, new_rec_off, new_off, old_rec_off;
        int data_start, data_end, size;
@@ -248,6 +248,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
        new_node->type = node->type;
        new_node->height = node->height;
+        if (node->next)
+                next_node = hfs_bnode_find(tree, node->next);
+        else
+                next_node = NULL;
+        if (IS_ERR(next_node)) {
+                hfs_bnode_put(node);
+                hfs_bnode_put(new_node);
+                return next_node;
+        }
        size = tree->node_size / 2 - node->num_recs * 2 - 14;
        old_rec_off = tree->node_size - 4;
        num_recs = 1;
@@ -261,6 +272,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
                /* panic? */
                hfs_bnode_put(node);
                hfs_bnode_put(new_node);
+                if (next_node)
+                        hfs_bnode_put(next_node);
                return ERR_PTR(-ENOSPC);
        }
@@ -315,8 +328,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
        hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
        /* update next bnode header */
-        if (new_node->next) {
+        if (next_node) {
-                struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
                next_node->prev = new_node->this;
                hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
                node_desc.prev = cpu_to_be32(next_node->prev);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 29683645fa0a..5f4023678251 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -340,16 +340,23 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
        if (inode->i_nlink > 0)
                drop_nlink(inode);
-        hfsplus_delete_inode(inode);
+        if (inode->i_ino == cnid)
-        if (inode->i_ino != cnid && !inode->i_nlink) {
+                clear_nlink(inode);
-                if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+        if (!inode->i_nlink) {
-                        res = hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
+                if (inode->i_ino != cnid) {
-                        if (!res)
+                        HFSPLUS_SB(sb).file_count--;
-                                hfsplus_delete_inode(inode);
+                        if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+                                res = hfsplus_delete_cat(inode->i_ino,
+                                                         HFSPLUS_SB(sb).hidden_dir,
+                                                         NULL);
+                                if (!res)
+                                        hfsplus_delete_inode(inode);
+                        } else
+                                inode->i_flags |= S_DEAD;
                } else
-                        inode->i_flags |= S_DEAD;
+                        hfsplus_delete_inode(inode);
        } else
-                clear_nlink(inode);
+                HFSPLUS_SB(sb).file_count--;
        inode->i_ctime = CURRENT_TIME_SEC;
        mark_inode_dirty(inode);
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index b60c0affbec5..f457d2ca51ab 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -14,6 +14,7 @@
 #include <linux/capability.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/xattr.h>
 #include <asm/uaccess.h>
@@ -35,25 +36,32 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                        flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
                return put_user(flags, (int __user *)arg);
        case HFSPLUS_IOC_EXT2_SETFLAGS: {
-                if (IS_RDONLY(inode))
+                int err = 0;
-                        return -EROFS;
+                err = mnt_want_write(filp->f_path.mnt);
+                if (err)
-                if (!is_owner_or_cap(inode))
+                        return err;
-                        return -EACCES;
+                if (!is_owner_or_cap(inode)) {
-                if (get_user(flags, (int __user *)arg))
+                        err = -EACCES;
-                        return -EFAULT;
+                        goto setflags_out;
+                }
+                if (get_user(flags, (int __user *)arg)) {
+                        err = -EFAULT;
+                        goto setflags_out;
+                }
                if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
                    HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
-                        if (!capable(CAP_LINUX_IMMUTABLE))
+                        if (!capable(CAP_LINUX_IMMUTABLE)) {
-                                return -EPERM;
+                                err = -EPERM;
+                                goto setflags_out;
+                        }
                }
                /* don't silently ignore unsupported ext2 flags */
-                if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL))
+                if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
-                        return -EOPNOTSUPP;
+                        err = -EOPNOTSUPP;
+                        goto setflags_out;
+                }
                if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
                        inode->i_flags |= S_IMMUTABLE;
                        HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -75,7 +83,9 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                inode->i_ctime = CURRENT_TIME_SEC;
                mark_inode_dirty(inode);
-                return 0;
+setflags_out:
+                mnt_drop_write(filp->f_path.mnt);
+                return err;
        }
        default:
                return -ENOTTY;
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index a1e1f0f61aa5..8601d8ef3b55 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -1,23 +1,24 @@
 /*
- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
 * Licensed under the GPL
 */
-#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/dcache.h>
 #include <linux/file.h>
-#include <linux/module.h>
+#include <linux/fs.h>
 #include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/list.h>
 #include <linux/kernel.h>
-#include <linux/ctype.h>
+#include <linux/list.h>
-#include <linux/dcache.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
 #include <linux/statfs.h>
+#include <linux/types.h>
 #include <asm/uaccess.h>
-#include <asm/fcntl.h>
 #include "os.h"
-static int init_inode(struct inode *inode, struct dentry *dentry);
+static struct inode *get_inode(struct super_block *, struct dentry *);
 struct hppfs_data {
        struct list_head list;
@@ -51,14 +52,14 @@ static int is_pid(struct dentry *dentry)
        int i;
        sb = dentry->d_sb;
-        if((sb->s_op != &hppfs_sbops) || (dentry->d_parent != sb->s_root))
+        if ((sb->s_op != &hppfs_sbops) || (dentry->d_parent != sb->s_root))
-                return(0);
+                return 0;
-        for(i = 0; i < dentry->d_name.len; i++){
+        for (i = 0; i < dentry->d_name.len; i++) {
-                if(!isdigit(dentry->d_name.name[i]))
+                if (!isdigit(dentry->d_name.name[i]))
-                        return(0);
+                        return 0;
        }
-        return(1);
+        return 1;
 }
 static char *dentry_name(struct dentry *dentry, int extra)
@@ -70,8 +71,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
        len = 0;
        parent = dentry;
-        while(parent->d_parent != parent){
+        while (parent->d_parent != parent) {
-                if(is_pid(parent))
+                if (is_pid(parent))
                        len += strlen("pid") + 1;
                else len += parent->d_name.len + 1;
                parent = parent->d_parent;
@@ -80,12 +81,13 @@ static char *dentry_name(struct dentry *dentry, int extra)
        root = "proc";
        len += strlen(root);
        name = kmalloc(len + extra + 1, GFP_KERNEL);
-        if(name == NULL) return(NULL);
+        if (name == NULL)
+                return NULL;
        name[len] = '\0';
        parent = dentry;
-        while(parent->d_parent != parent){
+        while (parent->d_parent != parent) {
-                if(is_pid(parent)){
+                if (is_pid(parent)) {
                        seg_name = "pid";
                        seg_len = strlen("pid");
                }
@@ -100,27 +102,25 @@ static char *dentry_name(struct dentry *dentry, int extra)
                parent = parent->d_parent;
        }
        strncpy(name, root, strlen(root));
-        return(name);
+        return name;
 }
-struct dentry_operations hppfs_dentry_ops = {
-};
 static int file_removed(struct dentry *dentry, const char *file)
 {
        char *host_file;
        int extra, fd;
        extra = 0;
-        if(file != NULL) extra += strlen(file) + 1;
+        if (file != NULL)
+                extra += strlen(file) + 1;
        host_file = dentry_name(dentry, extra + strlen("/remove"));
-        if(host_file == NULL){
+        if (host_file == NULL) {
-                printk("file_removed : allocation failed\n");
+                printk(KERN_ERR "file_removed : allocation failed\n");
-                return(-ENOMEM);
+                return -ENOMEM;
        }
-        if(file != NULL){
+        if (file != NULL) {
                strcat(host_file, "/");
                strcat(host_file, file);
        }
@@ -128,45 +128,11 @@ static int file_removed(struct dentry *dentry, const char *file)
        fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
        kfree(host_file);
-        if(fd > 0){
+        if (fd > 0) {
                os_close_file(fd);
-                return(1);
+                return 1;
-        }
-        return(0);
-}
-static void hppfs_read_inode(struct inode *ino)
-{
-        struct inode *proc_ino;
-        if(HPPFS_I(ino)->proc_dentry == NULL)
-                return;
-        proc_ino = HPPFS_I(ino)->proc_dentry->d_inode;
-        ino->i_uid = proc_ino->i_uid;
-        ino->i_gid = proc_ino->i_gid;
-        ino->i_atime = proc_ino->i_atime;
-        ino->i_mtime = proc_ino->i_mtime;
-        ino->i_ctime = proc_ino->i_ctime;
-        ino->i_ino = proc_ino->i_ino;
-        ino->i_mode = proc_ino->i_mode;
-        ino->i_nlink = proc_ino->i_nlink;
-        ino->i_size = proc_ino->i_size;
-        ino->i_blocks = proc_ino->i_blocks;
-}
-static struct inode *hppfs_iget(struct super_block *sb)
-{
-        struct inode *inode;
-        inode = iget_locked(sb, 0);
-        if (!inode)
-                return ERR_PTR(-ENOMEM);
-        if (inode->i_state & I_NEW) {
-                hppfs_read_inode(inode);
-                unlock_new_inode(inode);
        }
-        return inode;
+        return 0;
 }
 static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
@@ -177,55 +143,45 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
        int err, deleted;
        deleted = file_removed(dentry, NULL);
-        if(deleted < 0)
+        if (deleted < 0)
-                return(ERR_PTR(deleted));
+                return ERR_PTR(deleted);
-        else if(deleted)
+        else if (deleted)
-                return(ERR_PTR(-ENOENT));
+                return ERR_PTR(-ENOENT);
        err = -ENOMEM;
        parent = HPPFS_I(ino)->proc_dentry;
        mutex_lock(&parent->d_inode->i_mutex);
        proc_dentry = d_lookup(parent, &dentry->d_name);
-        if(proc_dentry == NULL){
+        if (proc_dentry == NULL) {
                proc_dentry = d_alloc(parent, &dentry->d_name);
-                if(proc_dentry == NULL){
+                if (proc_dentry == NULL) {
                        mutex_unlock(&parent->d_inode->i_mutex);
                        goto out;
                }
                new = (*parent->d_inode->i_op->lookup)(parent->d_inode,
                                                       proc_dentry, NULL);
-                if(new){
+                if (new) {
                        dput(proc_dentry);
                        proc_dentry = new;
                }
        }
        mutex_unlock(&parent->d_inode->i_mutex);
-        if(IS_ERR(proc_dentry))
+        if (IS_ERR(proc_dentry))
-                return(proc_dentry);
+                return proc_dentry;
-        inode = hppfs_iget(ino->i_sb);
+        err = -ENOMEM;
-        if (IS_ERR(inode)) {
+        inode = get_inode(ino->i_sb, proc_dentry);
-                err = PTR_ERR(inode);
+        if (!inode)
                goto out_dput;
-        }
-        err = init_inode(inode, proc_dentry);
-        if(err)
-                goto out_put;
-        hppfs_read_inode(inode);
        d_add(dentry, inode);
-        dentry->d_op = &hppfs_dentry_ops;
+        return NULL;
-        return(NULL);
- out_put:
-        iput(inode);
 out_dput:
        dput(proc_dentry);
 out:
-        return(ERR_PTR(err));
+        return ERR_PTR(err);
 }
 static const struct inode_operations hppfs_file_iops = {
@@ -239,15 +195,16 @@ static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count,
        read = file->f_path.dentry->d_inode->i_fop->read;
-        if(!is_user)
+        if (!is_user)
                set_fs(KERNEL_DS);
        n = (*read)(file, buf, count, &file->f_pos);
-        if(!is_user)
+        if (!is_user)
                set_fs(USER_DS);
-        if(ppos) *ppos = file->f_pos;
+        if (ppos)
+                *ppos = file->f_pos;
        return n;
 }
@@ -259,24 +216,23 @@ static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count)
        n = -ENOMEM;
        new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
-        if(new_buf == NULL){
+        if (new_buf == NULL) {
-                printk("hppfs_read_file : kmalloc failed\n");
+                printk(KERN_ERR "hppfs_read_file : kmalloc failed\n");
                goto out;
        }
        n = 0;
-        while(count > 0){
+        while (count > 0) {
                cur = min_t(ssize_t, count, PAGE_SIZE);
                err = os_read_file(fd, new_buf, cur);
-                if(err < 0){
+                if (err < 0) {
-                        printk("hppfs_read : read failed, errno = %d\n",
+                        printk(KERN_ERR "hppfs_read : read failed, "
-                               err);
+                               "errno = %d\n", err);
                        n = err;
                        goto out_free;
-                }
+                } else if (err == 0)
-                else if(err == 0)
                        break;
-                if(copy_to_user(buf, new_buf, err)){
+                if (copy_to_user(buf, new_buf, err)) {
                        n = -EFAULT;
                        goto out_free;
                }
@@ -297,35 +253,36 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
        loff_t off;
        int err;
-        if(hppfs->contents != NULL){
+        if (hppfs->contents != NULL) {
-                if(*ppos >= hppfs->len) return(0);
+                if (*ppos >= hppfs->len)
+                        return 0;
                data = hppfs->contents;
                off = *ppos;
-                while(off >= sizeof(data->contents)){
+                while (off >= sizeof(data->contents)) {
                        data = list_entry(data->list.next, struct hppfs_data,
                                          list);
                        off -= sizeof(data->contents);
                }
-                if(off + count > hppfs->len)
+                if (off + count > hppfs->len)
                        count = hppfs->len - off;
                copy_to_user(buf, &data->contents[off], count);
                *ppos += count;
-        }
+        } else if (hppfs->host_fd != -1) {
-        else if(hppfs->host_fd != -1){
                err = os_seek_file(hppfs->host_fd, *ppos);
-                if(err){
+                if (err) {
-                        printk("hppfs_read : seek failed, errno = %d\n", err);
+                        printk(KERN_ERR "hppfs_read : seek failed, "
-                        return(err);
+                               "errno = %d\n", err);
+                        return err;
                }
                count = hppfs_read_file(hppfs->host_fd, buf, count);
-                if(count > 0)
+                if (count > 0)
                        *ppos += count;
        }
        else count = read_proc(hppfs->proc_file, buf, count, ppos, 1);
-        return(count);
+        return count;
 }
 static ssize_t hppfs_write(struct file *file, const char __user *buf, size_t len,
@@ -342,7 +299,7 @@ static ssize_t hppfs_write(struct file *file, const char __user *buf, size_t len
        err = (*write)(proc_file, buf, len, &proc_file->f_pos);
        file->f_pos = proc_file->f_pos;
-        return(err);
+        return err;
 }
 static int open_host_sock(char *host_file, int *filter_out)
@@ -354,13 +311,13 @@ static int open_host_sock(char *host_file, int *filter_out)
        strcpy(end, "/rw");
        *filter_out = 1;
        fd = os_connect_socket(host_file);
-        if(fd > 0)
+        if (fd > 0)
-                return(fd);
+                return fd;
        strcpy(end, "/r");
        *filter_out = 0;
        fd = os_connect_socket(host_file);
-        return(fd);
+        return fd;
 }
 static void free_contents(struct hppfs_data *head)
@@ -368,9 +325,10 @@ static void free_contents(struct hppfs_data *head)
        struct hppfs_data *data;
        struct list_head *ele, *next;
-        if(head == NULL) return;
+        if (head == NULL)
+                return;
-        list_for_each_safe(ele, next, &head->list){
+        list_for_each_safe(ele, next, &head->list) {
                data = list_entry(ele, struct hppfs_data, list);
                kfree(data);
        }
@@ -387,8 +345,8 @@ static struct hppfs_data *hppfs_get_data(int fd, int filter,
        err = -ENOMEM;
        data = kmalloc(sizeof(*data), GFP_KERNEL);
-        if(data == NULL){
+        if (data == NULL) {
-                printk("hppfs_get_data : head allocation failed\n");
+                printk(KERN_ERR "hppfs_get_data : head allocation failed\n");
                goto failed;
        }
@@ -397,36 +355,36 @@ static struct hppfs_data *hppfs_get_data(int fd, int filter,
        head = data;
        *size_out = 0;
-        if(filter){
+        if (filter) {
-                while((n = read_proc(proc_file, data->contents,
+                while ((n = read_proc(proc_file, data->contents,
                                     sizeof(data->contents), NULL, 0)) > 0)
                        os_write_file(fd, data->contents, n);
                err = os_shutdown_socket(fd, 0, 1);
-                if(err){
+                if (err) {
-                        printk("hppfs_get_data : failed to shut down "
+                        printk(KERN_ERR "hppfs_get_data : failed to shut down "
                               "socket\n");
                        goto failed_free;
                }
        }
-        while(1){
+        while (1) {
                n = os_read_file(fd, data->contents, sizeof(data->contents));
-                if(n < 0){
+                if (n < 0) {
                        err = n;
-                        printk("hppfs_get_data : read failed, errno = %d\n",
+                        printk(KERN_ERR "hppfs_get_data : read failed, "
-                               err);
+                               "errno = %d\n", err);
                        goto failed_free;
-                }
+                } else if (n == 0)
-                else if(n == 0)
                        break;
                *size_out += n;
-                if(n < sizeof(data->contents))
+                if (n < sizeof(data->contents))
                        break;
                new = kmalloc(sizeof(*data), GFP_KERNEL);
-                if(new == 0){
+                if (new == 0) {
-                        printk("hppfs_get_data : data allocation failed\n");
+                        printk(KERN_ERR "hppfs_get_data : data allocation "
+                               "failed\n");
                        err = -ENOMEM;
                        goto failed_free;
                }
@@ -435,12 +393,12 @@ static struct hppfs_data *hppfs_get_data(int fd, int filter,
                list_add(&new->list, &data->list);
                data = new;
        }
-        return(head);
+        return head;
 failed_free:
        free_contents(head);
 failed:
-        return(ERR_PTR(err));
+        return ERR_PTR(err);
 }
 static struct hppfs_private *hppfs_data(void)
@@ -448,77 +406,79 @@ static struct hppfs_private *hppfs_data(void)
        struct hppfs_private *data;
        data = kmalloc(sizeof(*data), GFP_KERNEL);
-        if(data == NULL)
+        if (data == NULL)
-                return(data);
+                return data;
        *data = ((struct hppfs_private ) { .host_fd             = -1,
                                           .len                 = -1,
                                           .contents            = NULL } );
-        return(data);
+        return data;
 }
 static int file_mode(int fmode)
 {
-        if(fmode == (FMODE_READ | FMODE_WRITE))
+        if (fmode == (FMODE_READ | FMODE_WRITE))
-                return(O_RDWR);
+                return O_RDWR;
-        if(fmode == FMODE_READ)
+        if (fmode == FMODE_READ)
-                return(O_RDONLY);
+                return O_RDONLY;
-        if(fmode == FMODE_WRITE)
+        if (fmode == FMODE_WRITE)
-                return(O_WRONLY);
+                return O_WRONLY;
-        return(0);
+        return 0;
 }
 static int hppfs_open(struct inode *inode, struct file *file)
 {
        struct hppfs_private *data;
        struct dentry *proc_dentry;
+        struct vfsmount *proc_mnt;
        char *host_file;
        int err, fd, type, filter;
        err = -ENOMEM;
        data = hppfs_data();
-        if(data == NULL)
+        if (data == NULL)
                goto out;
        host_file = dentry_name(file->f_path.dentry, strlen("/rw"));
-        if(host_file == NULL)
+        if (host_file == NULL)
                goto out_free2;
        proc_dentry = HPPFS_I(inode)->proc_dentry;
+        proc_mnt = inode->i_sb->s_fs_info;
        /* XXX This isn't closed anywhere */
-        data->proc_file = dentry_open(dget(proc_dentry), NULL,
+        data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt),
                                      file_mode(file->f_mode));
        err = PTR_ERR(data->proc_file);
-        if(IS_ERR(data->proc_file))
+        if (IS_ERR(data->proc_file))
                goto out_free1;
        type = os_file_type(host_file);
-        if(type == OS_TYPE_FILE){
+        if (type == OS_TYPE_FILE) {
                fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
-                if(fd >= 0)
+                if (fd >= 0)
                        data->host_fd = fd;
-                else printk("hppfs_open : failed to open '%s', errno = %d\n",
+                else
-                            host_file, -fd);
+                        printk(KERN_ERR "hppfs_open : failed to open '%s', "
+                               "errno = %d\n", host_file, -fd);
                data->contents = NULL;
-        }
+        } else if (type == OS_TYPE_DIR) {
-        else if(type == OS_TYPE_DIR){
                fd = open_host_sock(host_file, &filter);
-                if(fd > 0){
+                if (fd > 0) {
                        data->contents = hppfs_get_data(fd, filter,
                                                        data->proc_file,
                                                        file, &data->len);
-                        if(!IS_ERR(data->contents))
+                        if (!IS_ERR(data->contents))
                                data->host_fd = fd;
-                }
+                } else
-                else printk("hppfs_open : failed to open a socket in "
+                        printk(KERN_ERR "hppfs_open : failed to open a socket "
-                            "'%s', errno = %d\n", host_file, -fd);
+                               "in '%s', errno = %d\n", host_file, -fd);
        }
        kfree(host_file);
        file->private_data = data;
-        return(0);
+        return 0;
 out_free1:
        kfree(host_file);
@@ -526,34 +486,36 @@ static int hppfs_open(struct inode *inode, struct file *file)
        free_contents(data->contents);
        kfree(data);
 out:
-        return(err);
+        return err;
 }
 static int hppfs_dir_open(struct inode *inode, struct file *file)
 {
        struct hppfs_private *data;
        struct dentry *proc_dentry;
+        struct vfsmount *proc_mnt;
        int err;
        err = -ENOMEM;
        data = hppfs_data();
-        if(data == NULL)
+        if (data == NULL)
                goto out;
        proc_dentry = HPPFS_I(inode)->proc_dentry;
-        data->proc_file = dentry_open(dget(proc_dentry), NULL,
+        proc_mnt = inode->i_sb->s_fs_info;
+        data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt),
                                      file_mode(file->f_mode));
        err = PTR_ERR(data->proc_file);
-        if(IS_ERR(data->proc_file))
+        if (IS_ERR(data->proc_file))
                goto out_free;
        file->private_data = data;
-        return(0);
+        return 0;
 out_free:
        kfree(data);
 out:
-        return(err);
+        return err;
 }
 static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
@@ -564,13 +526,13 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
        loff_t ret;
        llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek;
-        if(llseek != NULL){
+        if (llseek != NULL) {
                ret = (*llseek)(proc_file, off, where);
-                if(ret < 0)
+                if (ret < 0)
-                        return(ret);
+                        return ret;
        }
-        return(default_llseek(file, off, where));
+        return default_llseek(file, off, where);
 }
 static const struct file_operations hppfs_file_fops = {
@@ -592,11 +554,11 @@ static int hppfs_filldir(void *d, const char *name, int size,
 {
        struct hppfs_dirent *dirent = d;
-        if(file_removed(dirent->dentry, name))
+        if (file_removed(dirent->dentry, name))
-                return(0);
+                return 0;
-        return((*dirent->filldir)(dirent->vfs_dirent, name, size, offset,
+        return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset,
-                                  inode, type));
+                                  inode, type);
 }
 static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
@@ -607,7 +569,8 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
        struct hppfs_dirent dirent = ((struct hppfs_dirent)
                                      { .vfs_dirent     = ent,
                                        .filldir        = filldir,
-                                        .dentry         = file->f_path.dentry } );
+                                        .dentry         = file->f_path.dentry
+                                      });
        int err;
        readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir;
@@ -616,12 +579,12 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
        err = (*readdir)(proc_file, &dirent, hppfs_filldir);
        file->f_pos = proc_file->f_pos;
-        return(err);
+        return err;
 }
 static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-        return(0);
+        return 0;
 }
 static const struct file_operations hppfs_dir_fops = {
@@ -639,7 +602,7 @@ static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
        sf->f_files = 0;
        sf->f_ffree = 0;
        sf->f_type = HPPFS_SUPER_MAGIC;
-        return(0);
+        return 0;
 }
 static struct inode *hppfs_alloc_inode(struct super_block *sb)
@@ -647,12 +610,12 @@ static struct inode *hppfs_alloc_inode(struct super_block *sb)
        struct hppfs_inode_info *hi;
        hi = kmalloc(sizeof(*hi), GFP_KERNEL);
-        if(hi == NULL)
+        if (!hi)
-                return(NULL);
+                return NULL;
-        *hi = ((struct hppfs_inode_info) { .proc_dentry = NULL });
+        hi->proc_dentry = NULL;
        inode_init_once(&hi->vfs_inode);
-        return(&hi->vfs_inode);
+        return &hi->vfs_inode;
 }
 void hppfs_delete_inode(struct inode *ino)
@@ -665,21 +628,31 @@ static void hppfs_destroy_inode(struct inode *inode)
        kfree(HPPFS_I(inode));
 }
+static void hppfs_put_super(struct super_block *sb)
+{
+        mntput(sb->s_fs_info);
+}
 static const struct super_operations hppfs_sbops = {
        .alloc_inode    = hppfs_alloc_inode,
        .destroy_inode  = hppfs_destroy_inode,
        .delete_inode   = hppfs_delete_inode,
        .statfs         = hppfs_statfs,
+        .put_super      = hppfs_put_super,
 };
-static int hppfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
+                          int buflen)
 {
        struct file *proc_file;
        struct dentry *proc_dentry;
+        struct vfsmount *proc_mnt;
        int ret;
        proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
-        proc_file = dentry_open(dget(proc_dentry), NULL, O_RDONLY);
+        proc_mnt = dentry->d_sb->s_fs_info;
+        proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), O_RDONLY);
        if (IS_ERR(proc_file))
                return PTR_ERR(proc_file);
@@ -694,10 +667,13 @@ static void* hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
        struct file *proc_file;
        struct dentry *proc_dentry;
+        struct vfsmount *proc_mnt;
        void *ret;
        proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
-        proc_file = dentry_open(dget(proc_dentry), NULL, O_RDONLY);
+        proc_mnt = dentry->d_sb->s_fs_info;
+        proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), O_RDONLY);
        if (IS_ERR(proc_file))
                return proc_file;
@@ -717,70 +693,72 @@ static const struct inode_operations hppfs_link_iops = {
        .follow_link    = hppfs_follow_link,
 };
-static int init_inode(struct inode *inode, struct dentry *dentry)
+static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
 {
-        if(S_ISDIR(dentry->d_inode->i_mode)){
+        struct inode *proc_ino = dentry->d_inode;
+        struct inode *inode = new_inode(sb);
+        if (!inode)
+                return ERR_PTR(-ENOMEM);
+        if (S_ISDIR(dentry->d_inode->i_mode)) {
                inode->i_op = &hppfs_dir_iops;
                inode->i_fop = &hppfs_dir_fops;
-        }
+        } else if (S_ISLNK(dentry->d_inode->i_mode)) {
-        else if(S_ISLNK(dentry->d_inode->i_mode)){
                inode->i_op = &hppfs_link_iops;
                inode->i_fop = &hppfs_file_fops;
-        }
+        } else {
-        else {
                inode->i_op = &hppfs_file_iops;
                inode->i_fop = &hppfs_file_fops;
        }
        HPPFS_I(inode)->proc_dentry = dentry;
-        return(0);
+        inode->i_uid = proc_ino->i_uid;
+        inode->i_gid = proc_ino->i_gid;
+        inode->i_atime = proc_ino->i_atime;
+        inode->i_mtime = proc_ino->i_mtime;
+        inode->i_ctime = proc_ino->i_ctime;
+        inode->i_ino = proc_ino->i_ino;
+        inode->i_mode = proc_ino->i_mode;
+        inode->i_nlink = proc_ino->i_nlink;
+        inode->i_size = proc_ino->i_size;
+        inode->i_blocks = proc_ino->i_blocks;
+        return 0;
 }
 static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 {
        struct inode *root_inode;
-        struct file_system_type *procfs;
+        struct vfsmount *proc_mnt;
-        struct super_block *proc_sb;
+        int err = -ENOENT;
-        int err;
-        err = -ENOENT;
+        proc_mnt = do_kern_mount("proc", 0, "proc", NULL);
-        procfs = get_fs_type("proc");
+        if (IS_ERR(proc_mnt))
-        if(procfs == NULL)
                goto out;
-        if(list_empty(&procfs->fs_supers))
-                goto out;
-        proc_sb = list_entry(procfs->fs_supers.next, struct super_block,
-                             s_instances);
        sb->s_blocksize = 1024;
        sb->s_blocksize_bits = 10;
        sb->s_magic = HPPFS_SUPER_MAGIC;
        sb->s_op = &hppfs_sbops;
+        sb->s_fs_info = proc_mnt;
-        root_inode = hppfs_iget(sb);
-        if (IS_ERR(root_inode)) {
-                err = PTR_ERR(root_inode);
-                goto out;
-        }
-        err = init_inode(root_inode, proc_sb->s_root);
-        if(err)
-                goto out_put;
        err = -ENOMEM;
-        sb->s_root = d_alloc_root(root_inode);
+        root_inode = get_inode(sb, proc_mnt->mnt_sb->s_root);
-        if(sb->s_root == NULL)
+        if (!root_inode)
-                goto out_put;
+                goto out_mntput;
-        hppfs_read_inode(root_inode);
+        sb->s_root = d_alloc_root(root_inode);
+        if (!sb->s_root)
+                goto out_iput;
-        return(0);
+        return 0;
- out_put:
+ out_iput:
        iput(root_inode);
+ out_mntput:
+        mntput(proc_mnt);
 out:
        return(err);
 }
@@ -802,7 +780,7 @@ static struct file_system_type hppfs_type = {
 static int __init init_hppfs(void)
 {
-        return(register_filesystem(&hppfs_type));
+        return register_filesystem(&hppfs_type);
 }
 static void __exit exit_hppfs(void)
@@ -813,14 +791,3 @@ static void __exit exit_hppfs(void)
 module_init(init_hppfs)
 module_exit(exit_hppfs)
 MODULE_LICENSE("GPL");
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index eee9487ae47f..6846785fe904 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -954,7 +954,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
                        FMODE_WRITE | FMODE_READ,
                        &hugetlbfs_file_operations);
        if (!file)
-                goto out_inode;
+                goto out_dentry; /* inode is already attached */
        return file;
diff --git a/fs/inode.c b/fs/inode.c
index 53245ffcf93d..27ee1af50d02 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1199,42 +1199,37 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
        struct inode *inode = dentry->d_inode;
        struct timespec now;
-        if (inode->i_flags & S_NOATIME)
+        if (mnt_want_write(mnt))
                return;
+        if (inode->i_flags & S_NOATIME)
+                goto out;
        if (IS_NOATIME(inode))
-                return;
+                goto out;
        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
-                return;
+                goto out;
-        /*
+        if (mnt->mnt_flags & MNT_NOATIME)
-         * We may have a NULL vfsmount when coming from NFSD
+                goto out;
-         */
+        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
-        if (mnt) {
+                goto out;
-                if (mnt->mnt_flags & MNT_NOATIME)
+        if (mnt->mnt_flags & MNT_RELATIME) {
-                        return;
+                /*
-                if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
+                 * With relative atime, only update atime if the previous
-                        return;
+                 * atime is earlier than either the ctime or mtime.
+                 */
-                if (mnt->mnt_flags & MNT_RELATIME) {
+                if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 &&
-                        /*
+                    timespec_compare(&inode->i_ctime, &inode->i_atime) < 0)
-                         * With relative atime, only update atime if the
+                        goto out;
-                         * previous atime is earlier than either the ctime or
-                         * mtime.
-                         */
-                        if (timespec_compare(&inode->i_mtime,
-                                                &inode->i_atime) < 0 &&
-                            timespec_compare(&inode->i_ctime,
-                                                &inode->i_atime) < 0)
-                                return;
-                }
        }
        now = current_fs_time(inode->i_sb);
        if (timespec_equal(&inode->i_atime, &now))
-                return;
+                goto out;
        inode->i_atime = now;
        mark_inode_dirty_sync(inode);
+out:
+        mnt_drop_write(mnt);
 }
 EXPORT_SYMBOL(touch_atime);
@@ -1255,10 +1250,13 @@ void file_update_time(struct file *file)
        struct inode *inode = file->f_path.dentry->d_inode;
        struct timespec now;
        int sync_it = 0;
+        int err;
        if (IS_NOCMTIME(inode))
                return;
-        if (IS_RDONLY(inode))
+        err = mnt_want_write(file->f_path.mnt);
+        if (err)
                return;
        now = current_fs_time(inode->i_sb);
@@ -1279,6 +1277,7 @@ void file_update_time(struct file *file)
        if (sync_it)
                mark_inode_dirty_sync(inode);
+        mnt_drop_write(file->f_path.mnt);
 }
 EXPORT_SYMBOL(file_update_time);
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 37dbd6404787..defb932eee9a 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -72,6 +72,17 @@ static int zisofs_readpage(struct file *file, struct page *page)
        offset = index & ~zisofs_block_page_mask;
        blockindex = offset >> zisofs_block_page_shift;
        maxpage = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        /*
+         * If this page is wholly outside i_size we just return zero;
+         * do_generic_file_read() will handle this for us
+         */
+        if (page->index >= maxpage) {
+                SetPageUptodate(page);
+                unlock_page(page);
+                return 0;
+        }
        maxpage = min(zisofs_block_pages, maxpage-offset);
        for ( i = 0 ; i < maxpage ; i++, offset++ ) {
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 3943a8905eb2..0e081d5f32e8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -697,13 +697,14 @@ fail:
 */
 /**
- *  journal_t * journal_init_dev() - creates an initialises a journal structure
+ *  journal_t * journal_init_dev() - creates and initialises a journal structure
 *  @bdev: Block device on which to create the journal
 *  @fs_dev: Device which hold journalled filesystem for this journal.
 *  @start: Block nr Start of journal.
 *  @len:  Length of the journal in blocks.
 *  @blocksize: blocksize of journalling device
- *  @returns: a newly created journal_t *
+ *
+ *  Returns: a newly created journal_t *
 *
 *  journal_init_dev creates a journal which maps a fixed contiguous
 *  range of blocks on an arbitrary block device.
@@ -1619,14 +1620,14 @@ static int journal_init_journal_head_cache(void)
 {
        int retval;
-        J_ASSERT(journal_head_cache == 0);
+        J_ASSERT(journal_head_cache == NULL);
        journal_head_cache = kmem_cache_create("journal_head",
                                sizeof(struct journal_head),
                                0,              /* offset */
                                SLAB_TEMPORARY, /* flags */
                                NULL);          /* ctor */
        retval = 0;
-        if (journal_head_cache == 0) {
+        if (!journal_head_cache) {
                retval = -ENOMEM;
                printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
        }
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 2b8edf4d6eaa..43bc5e5ed064 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -478,7 +478,7 @@ static int do_one_pass(journal_t *journal,
                                        memcpy(nbh->b_data, obh->b_data,
                                                        journal->j_blocksize);
                                        if (flags & JFS_FLAG_ESCAPE) {
-                                                *((__be32 *)bh->b_data) =
+                                                *((__be32 *)nbh->b_data) =
                                                cpu_to_be32(JFS_MAGIC_NUMBER);
                                        }
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad2eacf570c6..d5f8eee7c88c 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -173,13 +173,13 @@ int __init journal_init_revoke_caches(void)
                                           0,
                                           SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
                                           NULL);
-        if (revoke_record_cache == 0)
+        if (!revoke_record_cache)
                return -ENOMEM;
        revoke_table_cache = kmem_cache_create("revoke_table",
                                           sizeof(struct jbd_revoke_table_s),
                                           0, SLAB_TEMPORARY, NULL);
-        if (revoke_table_cache == 0) {
+        if (!revoke_table_cache) {
                kmem_cache_destroy(revoke_record_cache);
                revoke_record_cache = NULL;
                return -ENOMEM;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 038ed7436199..2c9e8f5d13aa 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -369,7 +369,7 @@ out:
 /**
- * int journal_restart() - restart a handle .
+ * int journal_restart() - restart a handle.
 * @handle:  handle to restart
 * @nblocks: nr credits requested
 *
@@ -844,8 +844,7 @@ out:
 }
 /**
- * int journal_get_undo_access() -  Notify intent to modify metadata with
+ * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
- *     non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
 * @credits: store the number of taken credits here (if not NULL)
@@ -921,12 +920,14 @@ out:
 }
 /**
- * int journal_dirty_data() -  mark a buffer as containing dirty data which
+ * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
- *                             needs to be flushed before we can commit the
- *                             current transaction.
 * @handle: transaction
 * @bh: bufferhead to mark
 *
+ * Description:
+ * Mark a buffer as containing dirty data which needs to be flushed before
+ * we can commit the current transaction.
+ *
 * The buffer is placed on the transaction's data list and is marked as
 * belonging to the transaction.
 *
@@ -1098,11 +1099,11 @@ no_journal:
 }
 /**
- * int journal_dirty_metadata() -  mark a buffer as containing dirty metadata
+ * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
 * @handle: transaction to add buffer to.
 * @bh: buffer to mark
 *
- * mark dirty metadata which needs to be journaled as part of the current
+ * Mark dirty metadata which needs to be journaled as part of the current
 * transaction.
 *
 * The buffer is placed on the transaction's metadata list and is marked
@@ -1425,7 +1426,8 @@ int journal_stop(handle_t *handle)
        return err;
 }
-/**int journal_force_commit() - force any uncommitted transactions
+/**
+ * int journal_force_commit() - force any uncommitted transactions
 * @journal: journal to force
 *
 * For synchronous operations: force any uncommitted transactions
@@ -1902,13 +1904,12 @@ zap_buffer_unlocked:
 }
 /**
- * void journal_invalidatepage()
+ * void journal_invalidatepage() - invalidate a journal page
- * @journal: journal to use for flush...
+ * @journal: journal to use for flush
 * @page:    page to flush
 * @offset:  length of page to invalidate.
 *
 * Reap page buffers containing data after offset in page.
- *
 */
 void journal_invalidatepage(journal_t *journal,
                      struct page *page,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 96ba846992e9..954cff001df6 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -219,7 +219,7 @@ static int jbd2_journal_start_thread(journal_t *journal)
        if (IS_ERR(t))
                return PTR_ERR(t);
-        wait_event(journal->j_wait_done_commit, journal->j_task != 0);
+        wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
        return 0;
 }
@@ -231,7 +231,7 @@ static void journal_kill_thread(journal_t *journal)
        while (journal->j_task) {
                wake_up(&journal->j_wait_commit);
                spin_unlock(&journal->j_state_lock);
-                wait_event(journal->j_wait_done_commit, journal->j_task == 0);
+                wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
                spin_lock(&journal->j_state_lock);
        }
        spin_unlock(&journal->j_state_lock);
@@ -1969,14 +1969,14 @@ static int journal_init_jbd2_journal_head_cache(void)
 {
        int retval;
-        J_ASSERT(jbd2_journal_head_cache == 0);
+        J_ASSERT(jbd2_journal_head_cache == NULL);
        jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
                                sizeof(struct journal_head),
                                0,              /* offset */
                                SLAB_TEMPORARY, /* flags */
                                NULL);          /* ctor */
        retval = 0;
-        if (jbd2_journal_head_cache == 0) {
+        if (!jbd2_journal_head_cache) {
                retval = -ENOMEM;
                printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
        }
@@ -2002,14 +2002,14 @@ static struct journal_head *journal_alloc_journal_head(void)
        atomic_inc(&nr_journal_heads);
 #endif
        ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
-        if (ret == 0) {
+        if (!ret) {
                jbd_debug(1, "out of memory for journal_head\n");
                if (time_after(jiffies, last_warning + 5*HZ)) {
                        printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
                               __FUNCTION__);
                        last_warning = jiffies;
                }
-                while (ret == 0) {
+                while (!ret) {
                        yield();
                        ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
                }
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 146411387ada..5d0405a9e7ca 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -535,7 +535,7 @@ static int do_one_pass(journal_t *journal,
                                        memcpy(nbh->b_data, obh->b_data,
                                                        journal->j_blocksize);
                                        if (flags & JBD2_FLAG_ESCAPE) {
-                                                *((__be32 *)bh->b_data) =
+                                                *((__be32 *)nbh->b_data) =
                                                cpu_to_be32(JBD2_MAGIC_NUMBER);
                                        }
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index df36f42e19e1..2e1453a5e998 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -174,13 +174,13 @@ int __init jbd2_journal_init_revoke_caches(void)
                                           0,
                                           SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
                                           NULL);
-        if (jbd2_revoke_record_cache == 0)
+        if (!jbd2_revoke_record_cache)
                return -ENOMEM;
        jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
                                           sizeof(struct jbd2_revoke_table_s),
                                           0, SLAB_TEMPORARY, NULL);
-        if (jbd2_revoke_table_cache == 0) {
+        if (!jbd2_revoke_table_cache) {
                kmem_cache_destroy(jbd2_revoke_record_cache);
                jbd2_revoke_record_cache = NULL;
                return -ENOMEM;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index f9c5dd6f4b64..dcc2734e0b5d 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -129,7 +129,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
        struct inode *inode = mapping->host;
        struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-        uint32_t pageofs = pos & (PAGE_CACHE_SIZE - 1);
+        uint32_t pageofs = index << PAGE_CACHE_SHIFT;
        int ret = 0;
        pg = __grab_cache_page(mapping, index);
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 0b78fdc9773b..a841f4973a74 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -15,7 +15,7 @@
 #include <linux/version.h>
 #include <linux/rbtree.h>
 #include <linux/posix_acl.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 struct jffs2_inode_info {
        /* We need an internal mutex similar to inode->i_mutex.
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 3a2197f3c812..18fca2b9e531 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -16,7 +16,7 @@
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/list.h>
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index a1f8e375ad21..afe222bf300f 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/ctype.h>
 #include <linux/capability.h>
+#include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/sched.h>
 #include <asm/current.h>
@@ -65,23 +66,30 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                return put_user(flags, (int __user *) arg);
        case JFS_IOC_SETFLAGS: {
                unsigned int oldflags;
+                int err;
-                if (IS_RDONLY(inode))
+                err = mnt_want_write(filp->f_path.mnt);
-                        return -EROFS;
+                if (err)
+                        return err;
-                if (!is_owner_or_cap(inode))
+                if (!is_owner_or_cap(inode)) {
-                        return -EACCES;
+                        err = -EACCES;
+                        goto setflags_out;
-                if (get_user(flags, (int __user *) arg))
+                }
-                        return -EFAULT;
+                if (get_user(flags, (int __user *) arg)) {
+                        err = -EFAULT;
+                        goto setflags_out;
+                }
                flags = jfs_map_ext2(flags, 1);
                if (!S_ISDIR(inode->i_mode))
                        flags &= ~JFS_DIRSYNC_FL;
                /* Is it quota file? Do not allow user to mess with it */
-                if (IS_NOQUOTA(inode))
+                if (IS_NOQUOTA(inode)) {
-                        return -EPERM;
+                        err = -EPERM;
+                        goto setflags_out;
+                }
                /* Lock against other parallel changes of flags */
                mutex_lock(&inode->i_mutex);
@@ -98,7 +106,8 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
                        if (!capable(CAP_LINUX_IMMUTABLE)) {
                                mutex_unlock(&inode->i_mutex);
-                                return -EPERM;
+                                err = -EPERM;
+                                goto setflags_out;
                        }
                }
@@ -110,7 +119,9 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                mutex_unlock(&inode->i_mutex);
                inode->i_ctime = CURRENT_TIME_SEC;
                mark_inode_dirty(inode);
-                return 0;
+setflags_out:
+                mnt_drop_write(filp->f_path.mnt);
+                return err;
        }
        default:
                return -ENOTTY;
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index e1985066b1c6..2bc7d8aa5740 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2172,7 +2172,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
        }
        /* update the free count for this dmap */
-        dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+        le32_add_cpu(&dp->nfree, -nblocks);
        BMAP_LOCK(bmp);
@@ -2316,7 +2316,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
        /* update the free count for this dmap.
         */
-        dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
+        le32_add_cpu(&dp->nfree, nblocks);
        BMAP_LOCK(bmp);
@@ -3226,7 +3226,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
        }
        /* update the free count for this dmap */
-        dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
+        le32_add_cpu(&dp->nfree, -nblocks);
        /* reconstruct summary tree */
        dbInitDmapTree(dp);
@@ -3660,9 +3660,8 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
                        goto initTree;
                }
        } else {
-                dp->nblocks =
+                le32_add_cpu(&dp->nblocks, nblocks);
-                    cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks);
+                le32_add_cpu(&dp->nfree, nblocks);
-                dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
        }
        /* word number containing start block number */
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 11e6d471b364..1a6eb41569bc 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -61,7 +61,7 @@
 * determine the maximum free string for four (lower level) nodes
 * of the tree.
 */
-static __inline signed char TREEMAX(signed char *cp)
+static inline signed char TREEMAX(signed char *cp)
 {
        signed char tmp1, tmp2;
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 9bf29f771737..734ec916beaf 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1019,8 +1019,7 @@ int diFree(struct inode *ip)
                /* update the free inode counts at the iag, ag and
                 * map level.
                 */
-                iagp->nfreeinos =
+                le32_add_cpu(&iagp->nfreeinos, 1);
-                    cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
                imap->im_agctl[agno].numfree += 1;
                atomic_inc(&imap->im_numfree);
@@ -1219,9 +1218,8 @@ int diFree(struct inode *ip)
        /* update the number of free inodes and number of free extents
         * for the iag.
         */
-        iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) -
+        le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1));
-                                      (INOSPEREXT - 1));
+        le32_add_cpu(&iagp->nfreeexts, 1);
-        iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1);
        /* update the number of free inodes and backed inodes
         * at the ag and inode map level.
@@ -2124,7 +2122,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
        /* update the free inode count at the iag, ag, inode
         * map levels.
         */
-        iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1);
+        le32_add_cpu(&iagp->nfreeinos, -1);
        imap->im_agctl[agno].numfree -= 1;
        atomic_dec(&imap->im_numfree);
@@ -2378,9 +2376,8 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
        /* update the free inode and free extent counts for the
         * iag.
         */
-        iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) +
+        le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1));
-                                      (INOSPEREXT - 1));
+        le32_add_cpu(&iagp->nfreeexts, -1);
-        iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1);
        /* update the free and backed inode counts for the ag.
         */
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index a000aaa75136..5a61ebf2cbcc 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -905,8 +905,7 @@ int xtInsert(tid_t tid,		/* transaction id */
        XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
        /* advance next available entry index */
-        p->header.nextindex =
+        le16_add_cpu(&p->header.nextindex, 1);
-            cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
        /* Don't log it if there are no links to the file */
        if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -997,8 +996,7 @@ xtSplitUp(tid_t tid,
                            split->addr);
                /* advance next available entry index */
-                sp->header.nextindex =
+                le16_add_cpu(&sp->header.nextindex, 1);
-                    cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1);
                /* Don't log it if there are no links to the file */
                if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1167,9 +1165,7 @@ xtSplitUp(tid_t tid,
                                    JFS_SBI(ip->i_sb)->nbperpage, rcbn);
                        /* advance next available entry index. */
-                        sp->header.nextindex =
+                        le16_add_cpu(&sp->header.nextindex, 1);
-                            cpu_to_le16(le16_to_cpu(sp->header.nextindex) +
-                                        1);
                        /* Don't log it if there are no links to the file */
                        if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1738,8 +1734,7 @@ int xtExtend(tid_t tid,		/* transaction id */
                XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
                /* advance next available entry index */
-                p->header.nextindex =
+                le16_add_cpu(&p->header.nextindex, 1);
-                    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
        }
        /* get back old entry */
@@ -1905,8 +1900,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
                XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
                /* advance next available entry index */
-                p->header.nextindex =
+                le16_add_cpu(&p->header.nextindex, 1);
-                    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
        }
        /* get back old XAD */
@@ -2567,8 +2561,7 @@ int xtAppend(tid_t tid,		/* transaction id */
        XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
        /* advance next available entry index */
-        p->header.nextindex =
+        le16_add_cpu(&p->header.nextindex, 1);
-            cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
        xtlck->lwm.offset =
            (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
@@ -2631,8 +2624,7 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
         * delete the entry from the leaf page
         */
        nextindex = le16_to_cpu(p->header.nextindex);
-        p->header.nextindex =
+        le16_add_cpu(&p->header.nextindex, -1);
-            cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1);
        /*
         * if the leaf page bocome empty, free the page
@@ -2795,9 +2787,7 @@ xtDeleteUp(tid_t tid, struct inode *ip,
                                        (nextindex - index -
                                         1) << L2XTSLOTSIZE);
-                        p->header.nextindex =
+                        le16_add_cpu(&p->header.nextindex, -1);
-                            cpu_to_le16(le16_to_cpu(p->header.nextindex) -
-                                        1);
                        jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
                                 (ulong) parent->bn, index);
                }
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 08226464e563..1ed8bd4de941 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -153,7 +153,7 @@ lockd(struct svc_rqst *rqstp)
         */
        while ((nlmsvc_users || !signalled()) && nlmsvc_pid == current->pid) {
                long timeout = MAX_SCHEDULE_TIMEOUT;
-                char buf[RPC_MAX_ADDRBUFLEN];
+                RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
                if (signalled()) {
                        flush_signals(current);
diff --git a/fs/locks.c b/fs/locks.c
index f36f0e61558d..592faadbcec1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,7 +127,6 @@
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
-#include <asm/semaphore.h>
 #include <asm/uaccess.h>
 #define IS_POSIX(fl)    (fl->fl_flags & FL_POSIX)
@@ -1275,13 +1274,13 @@ out:
 EXPORT_SYMBOL(__break_lease);
 /**
- *      lease_get_mtime
+ *      lease_get_mtime - get the last modified time of an inode
 *      @inode: the inode
 *      @time:  pointer to a timespec which will contain the last modified time
 *
 * This is to force NFS clients to flush their caches for files with
 * exclusive leases.  The justification is that if someone has an
- * exclusive lease, then they could be modifiying it.
+ * exclusive lease, then they could be modifying it.
 */
 void lease_get_mtime(struct inode *inode, struct timespec *time)
 {
@@ -1801,17 +1800,21 @@ again:
        if (error)
                goto out;
-        for (;;) {
+        if (filp->f_op && filp->f_op->lock != NULL)
-                error = vfs_lock_file(filp, cmd, file_lock, NULL);
+                error = filp->f_op->lock(filp, cmd, file_lock);
-                if (error != -EAGAIN || cmd == F_SETLK)
+        else {
-                        break;
+                for (;;) {
-                error = wait_event_interruptible(file_lock->fl_wait,
+                        error = posix_lock_file(filp, file_lock, NULL);
-                                !file_lock->fl_next);
+                        if (error != -EAGAIN || cmd == F_SETLK)
-                if (!error)
+                                break;
-                        continue;
+                        error = wait_event_interruptible(file_lock->fl_wait,
+                                        !file_lock->fl_next);
+                        if (!error)
+                                continue;
-                locks_delete_block(file_lock);
+                        locks_delete_block(file_lock);
-                break;
+                        break;
+                }
        }
        /*
@@ -1925,17 +1928,21 @@ again:
        if (error)
                goto out;
-        for (;;) {
+        if (filp->f_op && filp->f_op->lock != NULL)
-                error = vfs_lock_file(filp, cmd, file_lock, NULL);
+                error = filp->f_op->lock(filp, cmd, file_lock);
-                if (error != -EAGAIN || cmd == F_SETLK64)
+        else {
-                        break;
+                for (;;) {
-                error = wait_event_interruptible(file_lock->fl_wait,
+                        error = posix_lock_file(filp, file_lock, NULL);
-                                !file_lock->fl_next);
+                        if (error != -EAGAIN || cmd == F_SETLK64)
-                if (!error)
+                                break;
-                        continue;
+                        error = wait_event_interruptible(file_lock->fl_wait,
+                                        !file_lock->fl_next);
+                        if (!error)
+                                continue;
-                locks_delete_block(file_lock);
+                        locks_delete_block(file_lock);
-                break;
+                        break;
+                }
        }
        /*
diff --git a/fs/mbcache.c b/fs/mbcache.c
index eb31b73e7d69..ec88ff3d04a9 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -399,11 +399,11 @@ mb_cache_destroy(struct mb_cache *cache)
 * if no more memory was available.
 */
 struct mb_cache_entry *
-mb_cache_entry_alloc(struct mb_cache *cache)
+mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
 {
        struct mb_cache_entry *ce;
-        ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
+        ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
        if (ce) {
                atomic_inc(&cache->c_entry_count);
                INIT_LIST_HEAD(&ce->e_lru_list);
diff --git a/fs/mpage.c b/fs/mpage.c
index 5df564366f36..235e4d3873a8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -325,16 +325,12 @@ confused:
 }
 /**
- * mpage_readpages - populate an address space with some pages, and
+ * mpage_readpages - populate an address space with some pages & start reads against them
- *                       start reads against them.
- *
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
- *
 *   The page at @pages->prev has the lowest file offset, and reads should be
 *   issued in @pages->prev to @pages->next order.
- *
 * @nr_pages: The number of pages at *@pages
 * @get_block: The filesystem's block mapper function.
 *
@@ -360,6 +356,7 @@ confused:
 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
 * submitted in the following order:
 *      12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
+ *
 * because the indirect block has to be read to get the mappings of blocks
 * 13,14,15,16.  Obviously, this impacts performance.
 *
@@ -656,9 +653,7 @@ out:
 }
 /**
- * mpage_writepages - walk the list of dirty pages of the given
+ * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
- * address space and writepage() all of them.
- * 
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @get_block: the filesystem's block mapper function.
diff --git a/fs/namei.c b/fs/namei.c
index 941c8e8228c0..e179f71bfcb0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -106,7 +106,7 @@
 * any extra contention...
 */
-static int link_path_walk(const char *name, struct nameidata *nd);
+static int __link_path_walk(const char *name, struct nameidata *nd);
 /* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
@@ -563,6 +563,37 @@ walk_init_root(const char *name, struct nameidata *nd)
        return 1;
 }
+/*
+ * Wrapper to retry pathname resolution whenever the underlying
+ * file system returns an ESTALE.
+ *
+ * Retry the whole path once, forcing real lookup requests
+ * instead of relying on the dcache.
+ */
+static __always_inline int link_path_walk(const char *name, struct nameidata *nd)
+{
+        struct path save = nd->path;
+        int result;
+        /* make sure the stuff we saved doesn't go away */
+        dget(save.dentry);
+        mntget(save.mnt);
+        result = __link_path_walk(name, nd);
+        if (result == -ESTALE) {
+                /* nd->path had been dropped */
+                nd->path = save;
+                dget(nd->path.dentry);
+                mntget(nd->path.mnt);
+                nd->flags |= LOOKUP_REVAL;
+                result = __link_path_walk(name, nd);
+        }
+        path_put(&save);
+        return result;
+}
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
        int res = 0;
@@ -1020,36 +1051,6 @@ return_err:
        return err;
 }
-/*
- * Wrapper to retry pathname resolution whenever the underlying
- * file system returns an ESTALE.
- *
- * Retry the whole path once, forcing real lookup requests
- * instead of relying on the dcache.
- */
-static int link_path_walk(const char *name, struct nameidata *nd)
-{
-        struct nameidata save = *nd;
-        int result;
-        /* make sure the stuff we saved doesn't go away */
-        dget(save.path.dentry);
-        mntget(save.path.mnt);
-        result = __link_path_walk(name, nd);
-        if (result == -ESTALE) {
-                *nd = save;
-                dget(nd->path.dentry);
-                mntget(nd->path.mnt);
-                nd->flags |= LOOKUP_REVAL;
-                result = __link_path_walk(name, nd);
-        }
-        path_put(&save.path);
-        return result;
-}
 static int path_walk(const char *name, struct nameidata *nd)
 {
        current->total_link_count = 0;
@@ -1364,13 +1365,13 @@ static int __lookup_one_len(const char *name, struct qstr *this,
 }
 /**
- * lookup_one_len:  filesystem helper to lookup single pathname component
+ * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:       pathname component to lookup
 * @base:       base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
- * Note that this routine is purely a helper for filesystem useage and should
+ * Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.  Also note that by using this function to
+ * not be called by generic code.  Also note that by using this function the
 * nameidata argument is passed to the filesystem methods and a filesystem
 * using this helper needs to be prepared for that.
 */
@@ -1622,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
                        return -EACCES;
                flag &= ~O_TRUNC;
-        } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
+        }
-                return -EROFS;
        error = vfs_permission(nd, acc_mode);
        if (error)
@@ -1676,7 +1676,12 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
        return 0;
 }
-static int open_namei_create(struct nameidata *nd, struct path *path,
+/*
+ * Be careful about ever adding any more callers of this
+ * function.  Its flags must be in the namei format, not
+ * what gets passed to sys_open().
+ */
+static int __open_namei_create(struct nameidata *nd, struct path *path,
                                int flag, int mode)
 {
        int error;
@@ -1695,26 +1700,56 @@ static int open_namei_create(struct nameidata *nd, struct path *path,
 }
 /*
- *      open_namei()
+ * Note that while the flag value (low two bits) for sys_open means:
+ *      00 - read-only
+ *      01 - write-only
+ *      10 - read-write
+ *      11 - special
+ * it is changed into
+ *      00 - no permissions needed
+ *      01 - read-permission
+ *      10 - write-permission
+ *      11 - read-write
+ * for the internal routines (ie open_namei()/follow_link() etc)
+ * This is more logical, and also allows the 00 "no perm needed"
+ * to be used for symlinks (where the permissions are checked
+ * later).
 *
- * namei for open - this is in fact almost the whole open-routine.
+*/
- *
+static inline int open_to_namei_flags(int flag)
- * Note that the low bits of "flag" aren't the same as in the open
+{
- * system call - they are 00 - no permissions needed
+        if ((flag+1) & O_ACCMODE)
- *                        01 - read permission needed
+                flag++;
- *                        10 - write permission needed
+        return flag;
- *                        11 - read/write permissions needed
+}
- * which is a lot more logical, and also allows the "no perm" needed
- * for symlinks (where the permissions are checked later).
+static int open_will_write_to_fs(int flag, struct inode *inode)
- * SMP-safe
+{
+        /*
+         * We'll never write to the fs underlying
+         * a device file.
+         */
+        if (special_file(inode->i_mode))
+                return 0;
+        return (flag & O_TRUNC);
+}
+/*
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
 */
-int open_namei(int dfd, const char *pathname, int flag,
+struct file *do_filp_open(int dfd, const char *pathname,
-                int mode, struct nameidata *nd)
+                int open_flag, int mode)
 {
+        struct file *filp;
+        struct nameidata nd;
        int acc_mode, error;
        struct path path;
        struct dentry *dir;
        int count = 0;
+        int will_write;
+        int flag = open_to_namei_flags(open_flag);
        acc_mode = ACC_MODE(flag);
@@ -1732,18 +1767,19 @@ int open_namei(int dfd, const char *pathname, int flag,
         */
        if (!(flag & O_CREAT)) {
                error = path_lookup_open(dfd, pathname, lookup_flags(flag),
-                                         nd, flag);
+                                         &nd, flag);
                if (error)
-                        return error;
+                        return ERR_PTR(error);
                goto ok;
        }
        /*
         * Create - we need to know the parent.
         */
-        error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
+        error = path_lookup_create(dfd, pathname, LOOKUP_PARENT,
+                                   &nd, flag, mode);
        if (error)
-                return error;
+                return ERR_PTR(error);
        /*
         * We have the parent and last component. First of all, check
@@ -1751,14 +1787,14 @@ int open_namei(int dfd, const char *pathname, int flag,
         * will not do.
         */
        error = -EISDIR;
-        if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
+        if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
                goto exit;
-        dir = nd->path.dentry;
+        dir = nd.path.dentry;
-        nd->flags &= ~LOOKUP_PARENT;
+        nd.flags &= ~LOOKUP_PARENT;
        mutex_lock(&dir->d_inode->i_mutex);
-        path.dentry = lookup_hash(nd);
+        path.dentry = lookup_hash(&nd);
-        path.mnt = nd->path.mnt;
+        path.mnt = nd.path.mnt;
 do_last:
        error = PTR_ERR(path.dentry);
@@ -1767,18 +1803,31 @@ do_last:
                goto exit;
        }
-        if (IS_ERR(nd->intent.open.file)) {
+        if (IS_ERR(nd.intent.open.file)) {
-                mutex_unlock(&dir->d_inode->i_mutex);
+                error = PTR_ERR(nd.intent.open.file);
-                error = PTR_ERR(nd->intent.open.file);
+                goto exit_mutex_unlock;
-                goto exit_dput;
        }
        /* Negative dentry, just create the file */
        if (!path.dentry->d_inode) {
-                error = open_namei_create(nd, &path, flag, mode);
+                /*
+                 * This write is needed to ensure that a
+                 * rw->ro transition does not occur between
+                 * the time when the file is created and when
+                 * a permanent write count is taken through
+                 * the 'struct file' in nameidata_to_filp().
+                 */
+                error = mnt_want_write(nd.path.mnt);
                if (error)
+                        goto exit_mutex_unlock;
+                error = __open_namei_create(&nd, &path, flag, mode);
+                if (error) {
+                        mnt_drop_write(nd.path.mnt);
                        goto exit;
-                return 0;
+                }
+                filp = nameidata_to_filp(&nd, open_flag);
+                mnt_drop_write(nd.path.mnt);
+                return filp;
        }
        /*
@@ -1803,23 +1852,52 @@ do_last:
        if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
                goto do_link;
-        path_to_nameidata(&path, nd);
+        path_to_nameidata(&path, &nd);
        error = -EISDIR;
        if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
                goto exit;
 ok:
-        error = may_open(nd, acc_mode, flag);
+        /*
-        if (error)
+         * Consider:
+         * 1. may_open() truncates a file
+         * 2. a rw->ro mount transition occurs
+         * 3. nameidata_to_filp() fails due to
+         *    the ro mount.
+         * That would be inconsistent, and should
+         * be avoided. Taking this mnt write here
+         * ensures that (2) can not occur.
+         */
+        will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
+        if (will_write) {
+                error = mnt_want_write(nd.path.mnt);
+                if (error)
+                        goto exit;
+        }
+        error = may_open(&nd, acc_mode, flag);
+        if (error) {
+                if (will_write)
+                        mnt_drop_write(nd.path.mnt);
                goto exit;
-        return 0;
+        }
+        filp = nameidata_to_filp(&nd, open_flag);
+        /*
+         * It is now safe to drop the mnt write
+         * because the filp has had a write taken
+         * on its behalf.
+         */
+        if (will_write)
+                mnt_drop_write(nd.path.mnt);
+        return filp;
+exit_mutex_unlock:
+        mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
-        path_put_conditional(&path, nd);
+        path_put_conditional(&path, &nd);
 exit:
-        if (!IS_ERR(nd->intent.open.file))
+        if (!IS_ERR(nd.intent.open.file))
-                release_open_intent(nd);
+                release_open_intent(&nd);
-        path_put(&nd->path);
+        path_put(&nd.path);
-        return error;
+        return ERR_PTR(error);
 do_link:
        error = -ELOOP;
@@ -1835,43 +1913,60 @@ do_link:
         * stored in nd->last.name and we will have to putname() it when we
         * are done. Procfs-like symlinks just set LAST_BIND.
         */
-        nd->flags |= LOOKUP_PARENT;
+        nd.flags |= LOOKUP_PARENT;
-        error = security_inode_follow_link(path.dentry, nd);
+        error = security_inode_follow_link(path.dentry, &nd);
        if (error)
                goto exit_dput;
-        error = __do_follow_link(&path, nd);
+        error = __do_follow_link(&path, &nd);
        if (error) {
                /* Does someone understand code flow here? Or it is only
                 * me so stupid? Anathema to whoever designed this non-sense
                 * with "intent.open".
                 */
-                release_open_intent(nd);
+                release_open_intent(&nd);
-                return error;
+                return ERR_PTR(error);
        }
-        nd->flags &= ~LOOKUP_PARENT;
+        nd.flags &= ~LOOKUP_PARENT;
-        if (nd->last_type == LAST_BIND)
+        if (nd.last_type == LAST_BIND)
                goto ok;
        error = -EISDIR;
-        if (nd->last_type != LAST_NORM)
+        if (nd.last_type != LAST_NORM)
                goto exit;
-        if (nd->last.name[nd->last.len]) {
+        if (nd.last.name[nd.last.len]) {
-                __putname(nd->last.name);
+                __putname(nd.last.name);
                goto exit;
        }
        error = -ELOOP;
        if (count++==32) {
-                __putname(nd->last.name);
+                __putname(nd.last.name);
                goto exit;
        }
-        dir = nd->path.dentry;
+        dir = nd.path.dentry;
        mutex_lock(&dir->d_inode->i_mutex);
-        path.dentry = lookup_hash(nd);
+        path.dentry = lookup_hash(&nd);
-        path.mnt = nd->path.mnt;
+        path.mnt = nd.path.mnt;
-        __putname(nd->last.name);
+        __putname(nd.last.name);
        goto do_last;
 }
 /**
+ * filp_open - open file and return file pointer
+ *
+ * @filename:   path to open
+ * @flags:      open flags as per the open(2) second argument
+ * @mode:       mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to.  But in general you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+        return do_filp_open(AT_FDCWD, filename, flags, mode);
+}
+EXPORT_SYMBOL(filp_open);
+/**
 * lookup_create - lookup a dentry, creating it if it doesn't exist
 * @nd: nameidata info
 * @is_dir: directory flag
@@ -1944,6 +2039,23 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
        return error;
 }
+static int may_mknod(mode_t mode)
+{
+        switch (mode & S_IFMT) {
+        case S_IFREG:
+        case S_IFCHR:
+        case S_IFBLK:
+        case S_IFIFO:
+        case S_IFSOCK:
+        case 0: /* zero mode translates to S_IFREG */
+                return 0;
+        case S_IFDIR:
+                return -EPERM;
+        default:
+                return -EINVAL;
+        }
+}
 asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
                                unsigned dev)
 {
@@ -1962,12 +2074,19 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
        if (error)
                goto out;
        dentry = lookup_create(&nd, 0);
-        error = PTR_ERR(dentry);
+        if (IS_ERR(dentry)) {
+                error = PTR_ERR(dentry);
+                goto out_unlock;
+        }
        if (!IS_POSIXACL(nd.path.dentry->d_inode))
                mode &= ~current->fs->umask;
-        if (!IS_ERR(dentry)) {
+        error = may_mknod(mode);
-                switch (mode & S_IFMT) {
+        if (error)
+                goto out_dput;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto out_dput;
+        switch (mode & S_IFMT) {
                case 0: case S_IFREG:
                        error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
                        break;
@@ -1978,14 +2097,11 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
                case S_IFIFO: case S_IFSOCK:
                        error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
                        break;
-                case S_IFDIR:
-                        error = -EPERM;
-                        break;
-                default:
-                        error = -EINVAL;
-                }
-                dput(dentry);
        }
+        mnt_drop_write(nd.path.mnt);
+out_dput:
+        dput(dentry);
+out_unlock:
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
        path_put(&nd.path);
 out:
@@ -2043,7 +2159,12 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
        if (!IS_POSIXACL(nd.path.dentry->d_inode))
                mode &= ~current->fs->umask;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto out_dput;
        error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+        mnt_drop_write(nd.path.mnt);
+out_dput:
        dput(dentry);
 out_unlock:
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2150,7 +2271,12 @@ static long do_rmdir(int dfd, const char __user *pathname)
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit2;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto exit3;
        error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+        mnt_drop_write(nd.path.mnt);
+exit3:
        dput(dentry);
 exit2:
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2231,7 +2357,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                inode = dentry->d_inode;
                if (inode)
                        atomic_inc(&inode->i_count);
+                error = mnt_want_write(nd.path.mnt);
+                if (error)
+                        goto exit2;
                error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+                mnt_drop_write(nd.path.mnt);
        exit2:
                dput(dentry);
        }
@@ -2312,7 +2442,12 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
        if (IS_ERR(dentry))
                goto out_unlock;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto out_dput;
        error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO);
+        mnt_drop_write(nd.path.mnt);
+out_dput:
        dput(dentry);
 out_unlock:
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2407,7 +2542,12 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto out_unlock;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto out_dput;
        error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry);
+        mnt_drop_write(nd.path.mnt);
+out_dput:
        dput(new_dentry);
 out_unlock:
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2633,8 +2773,12 @@ static int do_rename(int olddfd, const char *oldname,
        if (new_dentry == trap)
                goto exit5;
+        error = mnt_want_write(oldnd.path.mnt);
+        if (error)
+                goto exit5;
        error = vfs_rename(old_dir->d_inode, old_dentry,
                                   new_dir->d_inode, new_dentry);
+        mnt_drop_write(oldnd.path.mnt);
 exit5:
        dput(new_dentry);
 exit4:
diff --git a/fs/namespace.c b/fs/namespace.c
index 7953c96a2071..678f7ce060f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
        return tmp & (HASH_SIZE - 1);
 }
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
        struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
                INIT_LIST_HEAD(&mnt->mnt_share);
                INIT_LIST_HEAD(&mnt->mnt_slave_list);
                INIT_LIST_HEAD(&mnt->mnt_slave);
+                atomic_set(&mnt->__mnt_writers, 0);
                if (name) {
                        int size = strlen(name) + 1;
                        char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +84,263 @@ struct vfsmount *alloc_vfsmnt(const char *name)
        return mnt;
 }
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly outside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*.  This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+        if (mnt->mnt_flags & MNT_READONLY)
+                return 1;
+        if (mnt->mnt_sb->s_flags & MS_RDONLY)
+                return 1;
+        return 0;
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+struct mnt_writer {
+        /*
+         * If holding multiple instances of this lock, they
+         * must be ordered by cpu number.
+         */
+        spinlock_t lock;
+        struct lock_class_key lock_class; /* compiles out with !lockdep */
+        unsigned long count;
+        struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static int __init init_mnt_writers(void)
+{
+        int cpu;
+        for_each_possible_cpu(cpu) {
+                struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+                spin_lock_init(&writer->lock);
+                lockdep_set_class(&writer->lock, &writer->lock_class);
+                writer->count = 0;
+        }
+        return 0;
+}
+fs_initcall(init_mnt_writers);
+static void unlock_mnt_writers(void)
+{
+        int cpu;
+        struct mnt_writer *cpu_writer;
+        for_each_possible_cpu(cpu) {
+                cpu_writer = &per_cpu(mnt_writers, cpu);
+                spin_unlock(&cpu_writer->lock);
+        }
+}
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+        if (!cpu_writer->mnt)
+                return;
+        /*
+         * This is in case anyone ever leaves an invalid,
+         * old ->mnt and a count of 0.
+         */
+        if (!cpu_writer->count)
+                return;
+        atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+        cpu_writer->count = 0;
+}
+ /*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+                                          struct vfsmount *mnt)
+{
+        if (cpu_writer->mnt == mnt)
+                return;
+        __clear_mnt_count(cpu_writer);
+        cpu_writer->mnt = mnt;
+}
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/**
+ * mnt_want_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is
+ * about to be performed to it, and makes sure that
+ * writes are allowed before returning success.  When
+ * the write operation is finished, mnt_drop_write()
+ * must be called.  This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *mnt)
+{
+        int ret = 0;
+        struct mnt_writer *cpu_writer;
+        cpu_writer = &get_cpu_var(mnt_writers);
+        spin_lock(&cpu_writer->lock);
+        if (__mnt_is_readonly(mnt)) {
+                ret = -EROFS;
+                goto out;
+        }
+        use_cpu_writer_for_mount(cpu_writer, mnt);
+        cpu_writer->count++;
+out:
+        spin_unlock(&cpu_writer->lock);
+        put_cpu_var(mnt_writers);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+static void lock_mnt_writers(void)
+{
+        int cpu;
+        struct mnt_writer *cpu_writer;
+        for_each_possible_cpu(cpu) {
+                cpu_writer = &per_cpu(mnt_writers, cpu);
+                spin_lock(&cpu_writer->lock);
+                __clear_mnt_count(cpu_writer);
+                cpu_writer->mnt = NULL;
+        }
+}
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+        if (atomic_read(&mnt->__mnt_writers) >=
+            MNT_WRITER_UNDERFLOW_LIMIT)
+                return;
+        /*
+         * It isn't necessary to hold all of the locks
+         * at the same time, but doing it this way makes
+         * us share a lot more code.
+         */
+        lock_mnt_writers();
+        /*
+         * vfsmount_lock is for mnt_flags.
+         */
+        spin_lock(&vfsmount_lock);
+        /*
+         * If coalescing the per-cpu writer counts did not
+         * get us back to a positive writer count, we have
+         * a bug.
+         */
+        if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+            !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+                printk(KERN_DEBUG "leak detected on mount(%p) writers "
+                                "count: %d\n",
+                        mnt, atomic_read(&mnt->__mnt_writers));
+                WARN_ON(1);
+                /* use the flag to keep the dmesg spam down */
+                mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+        }
+        spin_unlock(&vfsmount_lock);
+        unlock_mnt_writers();
+}
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done
+ * performing writes to it.  Must be matched with
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+        int must_check_underflow = 0;
+        struct mnt_writer *cpu_writer;
+        cpu_writer = &get_cpu_var(mnt_writers);
+        spin_lock(&cpu_writer->lock);
+        use_cpu_writer_for_mount(cpu_writer, mnt);
+        if (cpu_writer->count > 0) {
+                cpu_writer->count--;
+        } else {
+                must_check_underflow = 1;
+                atomic_dec(&mnt->__mnt_writers);
+        }
+        spin_unlock(&cpu_writer->lock);
+        /*
+         * Logically, we could call this each time,
+         * but the __mnt_writers cacheline tends to
+         * be cold, and makes this expensive.
+         */
+        if (must_check_underflow)
+                handle_write_count_underflow(mnt);
+        /*
+         * This could be done right after the spinlock
+         * is taken because the spinlock keeps us on
+         * the cpu, and disables preemption.  However,
+         * putting it here bounds the amount that
+         * __mnt_writers can underflow.  Without it,
+         * we could theoretically wrap __mnt_writers.
+         */
+        put_cpu_var(mnt_writers);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+static int mnt_make_readonly(struct vfsmount *mnt)
+{
+        int ret = 0;
+        lock_mnt_writers();
+        /*
+         * With all the locks held, this value is stable
+         */
+        if (atomic_read(&mnt->__mnt_writers) > 0) {
+                ret = -EBUSY;
+                goto out;
+        }
+        /*
+         * nobody can do a successful mnt_want_write() with all
+         * of the counts in MNT_DENIED_WRITE and the locks held.
+         */
+        spin_lock(&vfsmount_lock);
+        if (!ret)
+                mnt->mnt_flags |= MNT_READONLY;
+        spin_unlock(&vfsmount_lock);
+out:
+        unlock_mnt_writers();
+        return ret;
+}
+static void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+        spin_lock(&vfsmount_lock);
+        mnt->mnt_flags &= ~MNT_READONLY;
+        spin_unlock(&vfsmount_lock);
+}
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
        mnt->mnt_sb = sb;
@@ -155,15 +416,15 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
        }
 }
-static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
+static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 {
-        old_nd->path.dentry = mnt->mnt_mountpoint;
+        old_path->dentry = mnt->mnt_mountpoint;
-        old_nd->path.mnt = mnt->mnt_parent;
+        old_path->mnt = mnt->mnt_parent;
        mnt->mnt_parent = mnt;
        mnt->mnt_mountpoint = mnt->mnt_root;
        list_del_init(&mnt->mnt_child);
        list_del_init(&mnt->mnt_hash);
-        old_nd->path.dentry->d_mounted--;
+        old_path->dentry->d_mounted--;
 }
 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
@@ -174,12 +435,12 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
        dentry->d_mounted++;
 }
-static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
+static void attach_mnt(struct vfsmount *mnt, struct path *path)
 {
-        mnt_set_mountpoint(nd->path.mnt, nd->path.dentry, mnt);
+        mnt_set_mountpoint(path->mnt, path->dentry, mnt);
        list_add_tail(&mnt->mnt_hash, mount_hashtable +
-                        hash(nd->path.mnt, nd->path.dentry));
+                        hash(path->mnt, path->dentry));
-        list_add_tail(&mnt->mnt_child, &nd->path.mnt->mnt_mounts);
+        list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
 }
 /*
@@ -262,10 +523,8 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
                /* stick the duplicate mount on the same expiry list
                 * as the original if that was on one */
                if (flag & CL_EXPIRE) {
-                        spin_lock(&vfsmount_lock);
                        if (!list_empty(&old->mnt_expire))
                                list_add(&mnt->mnt_expire, &old->mnt_expire);
-                        spin_unlock(&vfsmount_lock);
                }
        }
        return mnt;
@@ -273,7 +532,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 static inline void __mntput(struct vfsmount *mnt)
 {
+        int cpu;
        struct super_block *sb = mnt->mnt_sb;
+        /*
+         * We don't have to hold all of the locks at the
+         * same time here because we know that we're the
+         * last reference to mnt and that no new writers
+         * can come in.
+         */
+        for_each_possible_cpu(cpu) {
+                struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+                if (cpu_writer->mnt != mnt)
+                        continue;
+                spin_lock(&cpu_writer->lock);
+                atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+                cpu_writer->count = 0;
+                /*
+                 * Might as well do this so that no one
+                 * ever sees the pointer and expects
+                 * it to be valid.
+                 */
+                cpu_writer->mnt = NULL;
+                spin_unlock(&cpu_writer->lock);
+        }
+        /*
+         * This probably indicates that somebody messed
+         * up a mnt_want/drop_write() pair.  If this
+         * happens, the filesystem was probably unable
+         * to make r/w->r/o transitions.
+         */
+        WARN_ON(atomic_read(&mnt->__mnt_writers));
        dput(mnt->mnt_root);
        free_vfsmnt(mnt);
        deactivate_super(sb);
@@ -419,7 +707,7 @@ static int show_vfsmnt(struct seq_file *m, void *v)
                seq_putc(m, '.');
                mangle(m, mnt->mnt_sb->s_subtype);
        }
-        seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
+        seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
        for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
                if (mnt->mnt_sb->s_flags & fs_infop->flag)
                        seq_puts(m, fs_infop->str);
@@ -548,6 +836,7 @@ void release_mounts(struct list_head *head)
                        m = mnt->mnt_parent;
                        mnt->mnt_mountpoint = mnt->mnt_root;
                        mnt->mnt_parent = mnt;
+                        m->mnt_ghosts--;
                        spin_unlock(&vfsmount_lock);
                        dput(dentry);
                        mntput(m);
@@ -572,12 +861,16 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
                __touch_mnt_namespace(p->mnt_ns);
                p->mnt_ns = NULL;
                list_del_init(&p->mnt_child);
-                if (p->mnt_parent != p)
+                if (p->mnt_parent != p) {
+                        p->mnt_parent->mnt_ghosts++;
                        p->mnt_mountpoint->d_mounted--;
+                }
                change_mnt_propagation(p, MS_PRIVATE);
        }
 }
+static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
 static int do_umount(struct vfsmount *mnt, int flags)
 {
        struct super_block *sb = mnt->mnt_sb;
@@ -650,6 +943,9 @@ static int do_umount(struct vfsmount *mnt, int flags)
        spin_lock(&vfsmount_lock);
        event++;
+        if (!(flags & MNT_DETACH))
+                shrink_submounts(mnt, &umount_list);
        retval = -EBUSY;
        if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
                if (!list_empty(&mnt->mnt_list))
@@ -744,7 +1040,7 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
                                        int flag)
 {
        struct vfsmount *res, *p, *q, *r, *s;
-        struct nameidata nd;
+        struct path path;
        if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
                return NULL;
@@ -769,14 +1065,14 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
                                q = q->mnt_parent;
                        }
                        p = s;
-                        nd.path.mnt = q;
+                        path.mnt = q;
-                        nd.path.dentry = p->mnt_mountpoint;
+                        path.dentry = p->mnt_mountpoint;
                        q = clone_mnt(p, p->mnt_root, flag);
                        if (!q)
                                goto Enomem;
                        spin_lock(&vfsmount_lock);
                        list_add_tail(&q->mnt_list, &res->mnt_list);
-                        attach_mnt(q, &nd);
+                        attach_mnt(q, &path);
                        spin_unlock(&vfsmount_lock);
                }
        }
@@ -876,11 +1172,11 @@ void drop_collected_mounts(struct vfsmount *mnt)
 * in allocations.
 */
 static int attach_recursive_mnt(struct vfsmount *source_mnt,
-                        struct nameidata *nd, struct nameidata *parent_nd)
+                        struct path *path, struct path *parent_path)
 {
        LIST_HEAD(tree_list);
-        struct vfsmount *dest_mnt = nd->path.mnt;
+        struct vfsmount *dest_mnt = path->mnt;
-        struct dentry *dest_dentry = nd->path.dentry;
+        struct dentry *dest_dentry = path->dentry;
        struct vfsmount *child, *p;
        if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
@@ -892,9 +1188,9 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
        }
        spin_lock(&vfsmount_lock);
-        if (parent_nd) {
+        if (parent_path) {
-                detach_mnt(source_mnt, parent_nd);
+                detach_mnt(source_mnt, parent_path);
-                attach_mnt(source_mnt, nd);
+                attach_mnt(source_mnt, path);
                touch_mnt_namespace(current->nsproxy->mnt_ns);
        } else {
                mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
@@ -930,7 +1226,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
        err = -ENOENT;
        if (IS_ROOT(nd->path.dentry) || !d_unhashed(nd->path.dentry))
-                err = attach_recursive_mnt(mnt, nd, NULL);
+                err = attach_recursive_mnt(mnt, &nd->path, NULL);
 out_unlock:
        mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
        if (!err)
@@ -1013,6 +1309,23 @@ out:
        return err;
 }
+/*
+ * Bring the read-only state of @mnt in line with the MS_RDONLY bit of
+ * @ms_flags.
+ *
+ * Returns 0 when the mount is already in the requested state or when it
+ * is switched to writable; when a switch to read-only is requested, the
+ * return value is whatever mnt_make_readonly() returns.
+ */
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+        int error = 0;
+        int readonly_request = 0;
+        if (ms_flags & MS_RDONLY)
+                readonly_request = 1;
+        /*
+         * Nothing to do if the mount is already in the requested state.
+         * NOTE(review): this equality test assumes __mnt_is_readonly()
+         * returns exactly 0 or 1 - confirm against its definition.
+         */
+        if (readonly_request == __mnt_is_readonly(mnt))
+                return 0;
+        if (readonly_request)
+                error = mnt_make_readonly(mnt);
+        else
+                __mnt_unmake_readonly(mnt);
+        return error;
+}
 /*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
@@ -1035,7 +1348,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
                return -EINVAL;
        down_write(&sb->s_umount);
-        err = do_remount_sb(sb, flags, data, 0);
+        if (flags & MS_BIND)
+                err = change_mount_flags(nd->path.mnt, flags);
+        else
+                err = do_remount_sb(sb, flags, data, 0);
        if (!err)
                nd->path.mnt->mnt_flags = mnt_flags;
        up_write(&sb->s_umount);
@@ -1059,7 +1375,8 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt)
 */
 static noinline int do_move_mount(struct nameidata *nd, char *old_name)
 {
-        struct nameidata old_nd, parent_nd;
+        struct nameidata old_nd;
+        struct path parent_path;
        struct vfsmount *p;
        int err = 0;
        if (!capable(CAP_SYS_ADMIN))
@@ -1114,21 +1431,19 @@ static noinline int do_move_mount(struct nameidata *nd, char *old_name)
                if (p == old_nd.path.mnt)
                        goto out1;
-        err = attach_recursive_mnt(old_nd.path.mnt, nd, &parent_nd);
+        err = attach_recursive_mnt(old_nd.path.mnt, &nd->path, &parent_path);
        if (err)
                goto out1;
-        spin_lock(&vfsmount_lock);
        /* if the mount is moved, it should no longer be expired
         * automatically */
        list_del_init(&old_nd.path.mnt->mnt_expire);
-        spin_unlock(&vfsmount_lock);
 out1:
        mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
 out:
        up_write(&namespace_sem);
        if (!err)
-                path_put(&parent_nd.path);
+                path_put(&parent_path);
        path_put(&old_nd.path);
        return err;
 }
@@ -1189,12 +1504,9 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
        if ((err = graft_tree(newmnt, nd)))
                goto unlock;
-        if (fslist) {
+        if (fslist) /* add to the specified expiration list */
-                /* add to the specified expiration list */
-                spin_lock(&vfsmount_lock);
                list_add_tail(&newmnt->mnt_expire, fslist);
-                spin_unlock(&vfsmount_lock);
-        }
        up_write(&namespace_sem);
        return 0;
@@ -1206,75 +1518,6 @@ unlock:
 EXPORT_SYMBOL_GPL(do_add_mount);
-static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
-                                struct list_head *umounts)
-{
-        spin_lock(&vfsmount_lock);
-        /*
-         * Check if mount is still attached, if not, let whoever holds it deal
-         * with the sucker
-         */
-        if (mnt->mnt_parent == mnt) {
-                spin_unlock(&vfsmount_lock);
-                return;
-        }
-        /*
-         * Check that it is still dead: the count should now be 2 - as
-         * contributed by the vfsmount parent and the mntget above
-         */
-        if (!propagate_mount_busy(mnt, 2)) {
-                /* delete from the namespace */
-                touch_mnt_namespace(mnt->mnt_ns);
-                list_del_init(&mnt->mnt_list);
-                mnt->mnt_ns = NULL;
-                umount_tree(mnt, 1, umounts);
-                spin_unlock(&vfsmount_lock);
-        } else {
-                /*
-                 * Someone brought it back to life whilst we didn't have any
-                 * locks held so return it to the expiration list
-                 */
-                list_add_tail(&mnt->mnt_expire, mounts);
-                spin_unlock(&vfsmount_lock);
-        }
-}
-/*
- * go through the vfsmounts we've just consigned to the graveyard to
- * - check that they're still dead
- * - delete the vfsmount from the appropriate namespace under lock
- * - dispose of the corpse
- */
-static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
-{
-        struct mnt_namespace *ns;
-        struct vfsmount *mnt;
-        while (!list_empty(graveyard)) {
-                LIST_HEAD(umounts);
-                mnt = list_first_entry(graveyard, struct vfsmount, mnt_expire);
-                list_del_init(&mnt->mnt_expire);
-                /* don't do anything if the namespace is dead - all the
-                 * vfsmounts from it are going away anyway */
-                ns = mnt->mnt_ns;
-                if (!ns || !ns->root)
-                        continue;
-                get_mnt_ns(ns);
-                spin_unlock(&vfsmount_lock);
-                down_write(&namespace_sem);
-                expire_mount(mnt, mounts, &umounts);
-                up_write(&namespace_sem);
-                release_mounts(&umounts);
-                mntput(mnt);
-                put_mnt_ns(ns);
-                spin_lock(&vfsmount_lock);
-        }
-}
 /*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
@@ -1284,10 +1527,12 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 {
        struct vfsmount *mnt, *next;
        LIST_HEAD(graveyard);
+        LIST_HEAD(umounts);
        if (list_empty(mounts))
                return;
+        down_write(&namespace_sem);
        spin_lock(&vfsmount_lock);
        /* extract from the expiration list every vfsmount that matches the
@@ -1298,16 +1543,19 @@ void mark_mounts_for_expiry(struct list_head *mounts)
         */
        list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
                if (!xchg(&mnt->mnt_expiry_mark, 1) ||
-                    atomic_read(&mnt->mnt_count) != 1)
+                        propagate_mount_busy(mnt, 1))
                        continue;
-                mntget(mnt);
                list_move(&mnt->mnt_expire, &graveyard);
        }
+        while (!list_empty(&graveyard)) {
-        expire_mount_list(&graveyard, mounts);
+                mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire);
+                touch_mnt_namespace(mnt->mnt_ns);
+                umount_tree(mnt, 1, &umounts);
+        }
        spin_unlock(&vfsmount_lock);
+        up_write(&namespace_sem);
+        release_mounts(&umounts);
 }
 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
@@ -1343,7 +1591,6 @@ resume:
                }
                if (!propagate_mount_busy(mnt, 1)) {
-                        mntget(mnt);
                        list_move_tail(&mnt->mnt_expire, graveyard);
                        found++;
                }
@@ -1363,22 +1610,22 @@ resume:
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 */
-void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
+static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 {
        LIST_HEAD(graveyard);
-        int found;
+        struct vfsmount *m;
-        spin_lock(&vfsmount_lock);
-        /* extract submounts of 'mountpoint' from the expiration list */
+        /*
+         * Extract submounts of 'mnt' from the expiration list and unmount
+         * them, collecting the results on 'umounts'.  This function no
+         * longer takes or drops vfsmount_lock itself; do_umount() calls it
+         * with that lock already held.
+         */
-        while ((found = select_submounts(mountpoint, &graveyard)) != 0)
+        while (select_submounts(mnt, &graveyard)) {
-                expire_mount_list(&graveyard, mounts);
+                while (!list_empty(&graveyard)) {
+                        m = list_first_entry(&graveyard, struct vfsmount,
-        spin_unlock(&vfsmount_lock);
+                                                mnt_expire);
+                        /*
+                         * Act on the selected submount 'm', not on the
+                         * parent 'mnt'.  Presumably umount_tree() unlinks
+                         * 'm' from the graveyard (the same loop shape is
+                         * used in mark_mounts_for_expiry()), which is what
+                         * lets this loop terminate.
+                         */
+                        touch_mnt_namespace(m->mnt_ns);
+                        umount_tree(m, 1, umounts);
+                }
+        }
 }
-EXPORT_SYMBOL_GPL(shrink_submounts);
 /*
 * Some copy_from_user() implementations do not return the exact number of
 * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
@@ -1488,6 +1735,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
                mnt_flags |= MNT_NODIRATIME;
        if (flags & MS_RELATIME)
                mnt_flags |= MNT_RELATIME;
+        if (flags & MS_RDONLY)
+                mnt_flags |= MNT_READONLY;
        flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
                   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
@@ -1683,7 +1932,7 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
                path_put(&old_pwd);
 }
-static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
+static void chroot_fs_refs(struct path *old_root, struct path *new_root)
 {
        struct task_struct *g, *p;
        struct fs_struct *fs;
@@ -1695,12 +1944,12 @@ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
                if (fs) {
                        atomic_inc(&fs->count);
                        task_unlock(p);
-                        if (fs->root.dentry == old_nd->path.dentry
+                        if (fs->root.dentry == old_root->dentry
-                            && fs->root.mnt == old_nd->path.mnt)
+                            && fs->root.mnt == old_root->mnt)
-                                set_fs_root(fs, &new_nd->path);
+                                set_fs_root(fs, new_root);
-                        if (fs->pwd.dentry == old_nd->path.dentry
+                        if (fs->pwd.dentry == old_root->dentry
-                            && fs->pwd.mnt == old_nd->path.mnt)
+                            && fs->pwd.mnt == old_root->mnt)
-                                set_fs_pwd(fs, &new_nd->path);
+                                set_fs_pwd(fs, new_root);
                        put_fs_struct(fs);
                } else
                        task_unlock(p);
@@ -1737,7 +1986,8 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
                               const char __user * put_old)
 {
        struct vfsmount *tmp;
-        struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd;
+        struct nameidata new_nd, old_nd, user_nd;
+        struct path parent_path, root_parent;
        int error;
        if (!capable(CAP_SYS_ADMIN))
@@ -1811,19 +2061,19 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
                        goto out3;
        } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry))
                goto out3;
-        detach_mnt(new_nd.path.mnt, &parent_nd);
+        detach_mnt(new_nd.path.mnt, &parent_path);
        detach_mnt(user_nd.path.mnt, &root_parent);
        /* mount old root on put_old */
-        attach_mnt(user_nd.path.mnt, &old_nd);
+        attach_mnt(user_nd.path.mnt, &old_nd.path);
        /* mount new_root on / */
        attach_mnt(new_nd.path.mnt, &root_parent);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        spin_unlock(&vfsmount_lock);
-        chroot_fs_refs(&user_nd, &new_nd);
+        chroot_fs_refs(&user_nd.path, &new_nd.path);
        security_sb_post_pivotroot(&user_nd, &new_nd);
        error = 0;
-        path_put(&root_parent.path);
+        path_put(&root_parent);
-        path_put(&parent_nd.path);
+        path_put(&parent_path);
 out2:
        mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex);
        up_write(&namespace_sem);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c67b4bdcf719..ad8f167e54bc 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -14,6 +14,7 @@
 #include <linux/ioctl.h>
 #include <linux/time.h>
 #include <linux/mm.h>
+#include <linux/mount.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
@@ -261,7 +262,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
-int ncp_ioctl(struct inode *inode, struct file *filp,
+static int __ncp_ioctl(struct inode *inode, struct file *filp,
              unsigned int cmd, unsigned long arg)
 {
        struct ncp_server *server = NCP_SERVER(inode);
@@ -822,6 +823,57 @@ outrel:
        return -EINVAL;
 }
+/*
+ * Classify an NCP ioctl command: returns 1 if ncp_ioctl() should obtain
+ * write access to the mount around the call, 0 if it should not.
+ * Commands not listed here are treated as needing write access.
+ */
+static int ncp_ioctl_need_write(unsigned int cmd)
+{
+        switch (cmd) {
+        case NCP_IOC_GET_FS_INFO:
+        case NCP_IOC_GET_FS_INFO_V2:
+        case NCP_IOC_NCPREQUEST:
+        case NCP_IOC_SETDENTRYTTL:
+        case NCP_IOC_SIGN_INIT:
+        case NCP_IOC_LOCKUNLOCK:
+        case NCP_IOC_SET_SIGN_WANTED:
+                return 1;
+        case NCP_IOC_GETOBJECTNAME:
+        case NCP_IOC_SETOBJECTNAME:
+        case NCP_IOC_GETPRIVATEDATA:
+        case NCP_IOC_SETPRIVATEDATA:
+        case NCP_IOC_SETCHARSETS:
+        case NCP_IOC_GETCHARSETS:
+        case NCP_IOC_CONN_LOGGED_IN:
+        case NCP_IOC_GETDENTRYTTL:
+        case NCP_IOC_GETMOUNTUID2:
+        case NCP_IOC_SIGN_WANTED:
+        case NCP_IOC_GETROOT:
+        case NCP_IOC_SETROOT:
+                return 0;
+        default:
+                /* unknown IOCTL command, assume write */
+                return 1;
+        }
+}
+/*
+ * Entry point for NCP ioctls: a wrapper around __ncp_ioctl().
+ *
+ * For commands that ncp_ioctl_need_write() reports as needing write
+ * access, mnt_want_write() is called on the file's mount before the
+ * ioctl runs and mnt_drop_write() after it returns.
+ *
+ * Returns -EACCES if mnt_want_write() fails, otherwise the return value
+ * of __ncp_ioctl().
+ */
+int ncp_ioctl(struct inode *inode, struct file *filp,
+              unsigned int cmd, unsigned long arg)
+{
+        int ret;
+        if (ncp_ioctl_need_write(cmd)) {
+                /*
+                 * inside the ioctl(), any failures which
+                 * are because of file_permission() are
+                 * -EACCES, so it seems consistent to keep
+                 * that here.
+                 */
+                if (mnt_want_write(filp->f_path.mnt))
+                        return -EACCES;
+        }
+        ret = __ncp_ioctl(inode, filp, cmd, arg);
+        /* pair the mnt_want_write() above with its drop */
+        if (ncp_ioctl_need_write(cmd))
+                mnt_drop_write(filp->f_path.mnt);
+        return ret;
+}
 #ifdef CONFIG_COMPAT
 long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index ecc06c619494..66648dd92d97 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -93,6 +93,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
                svc_process(rqstp);
        }
+        flush_signals(current);
        svc_exit_thread(rqstp);
        nfs_callback_info.pid = 0;
        complete(&nfs_callback_info.stopped);
@@ -171,7 +172,7 @@ void nfs_callback_down(void)
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
        struct nfs_client *clp;
-        char buf[RPC_MAX_ADDRBUFLEN];
+        RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
        /* Don't talk to strangers */
        clp = nfs_find_client(svc_addr(rqstp), 4);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index c63eb720b68b..13619d24f023 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -254,7 +254,7 @@ static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap,
        if (!(bitmap[0] & FATTR4_WORD0_CHANGE))
                return 0;
        p = xdr_reserve_space(xdr, 8);
-        if (unlikely(p == 0))
+        if (unlikely(!p))
                return htonl(NFS4ERR_RESOURCE);
        p = xdr_encode_hyper(p, change);
        return 0;
@@ -267,7 +267,7 @@ static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, u
        if (!(bitmap[0] & FATTR4_WORD0_SIZE))
                return 0;
        p = xdr_reserve_space(xdr, 8);
-        if (unlikely(p == 0))
+        if (unlikely(!p))
                return htonl(NFS4ERR_RESOURCE);
        p = xdr_encode_hyper(p, size);
        return 0;
@@ -278,7 +278,7 @@ static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *ti
        __be32 *p;
        p = xdr_reserve_space(xdr, 12);
-        if (unlikely(p == 0))
+        if (unlikely(!p))
                return htonl(NFS4ERR_RESOURCE);
        p = xdr_encode_hyper(p, time->tv_sec);
        *p = htonl(time->tv_nsec);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index b9eadd18ba70..00a5e4405e16 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -49,7 +49,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
        struct file_lock *fl;
        int status;
-        for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
+        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
                        continue;
                if (nfs_file_open_context(fl->fl_file) != ctx)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ae04892a5e5d..d9e30ac2798d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -710,6 +710,8 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
 {
        struct nfs_server *server = NFS_SERVER(inode);
+        if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags))
+                return 0;
        if (nd != NULL) {
                /* VFS wants an on-the-wire revalidation */
                if (nd->flags & LOOKUP_REVAL)
@@ -965,7 +967,8 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd)
        if (nd->flags & LOOKUP_DIRECTORY)
                return 0;
        /* Are we trying to write to a read only partition? */
-        if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
+        if (__mnt_is_readonly(nd->path.mnt) &&
+            (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
                return 0;
        return 1;
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ef57a5ae5904..5d2e9d9a4e28 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -64,7 +64,11 @@ const struct file_operations nfs_file_operations = {
        .write          = do_sync_write,
        .aio_read       = nfs_file_read,
        .aio_write      = nfs_file_write,
+#ifdef CONFIG_MMU
        .mmap           = nfs_file_mmap,
+#else
+        .mmap           = generic_file_mmap,
+#endif
        .open           = nfs_file_open,
        .flush          = nfs_file_flush,
        .release        = nfs_file_release,
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 8ae5dba2d4e5..86147b0ab2cf 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -309,7 +309,7 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
        mutex_lock(&idmap->idmap_im_lock);
        he = idmap_lookup_id(h, id);
-        if (he != 0) {
+        if (he) {
                memcpy(name, he->ih_name, he->ih_namelen);
                ret = he->ih_namelen;
                goto out;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 966a8850aa30..6f88d7c77ac9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -299,6 +299,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                                else
                                        inode->i_op = &nfs_mountpoint_inode_operations;
                                inode->i_fop = NULL;
+                                set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags);
                        }
                } else if (S_ISLNK(inode->i_mode))
                        inode->i_op = &nfs_symlink_inode_operations;
@@ -505,6 +506,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
                ctx->cred = get_rpccred(cred);
                ctx->state = NULL;
                ctx->lockowner = current->files;
+                ctx->flags = 0;
                ctx->error = 0;
                ctx->dir_cookie = 0;
                atomic_set(&ctx->count, 1);
@@ -1003,8 +1005,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        server = NFS_SERVER(inode);
        /* Update the fsid? */
-        if (S_ISDIR(inode->i_mode)
+        if (S_ISDIR(inode->i_mode) &&
-                        && !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+                        !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
+                        !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
                server->fsid = fattr->fsid;
        /*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 0f5619611b8d..931992763e68 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -3,6 +3,7 @@
 */
 #include <linux/mount.h>
+#include <linux/security.h>
 struct nfs_string;
@@ -57,6 +58,8 @@ struct nfs_parsed_mount_data {
                char                    *export_path;
                int                     protocol;
        } nfs_server;
+        struct security_mnt_opts lsm_opts;
 };
 /* client.c */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6233eb5e98c1..b962397004c1 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -785,7 +785,7 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s
        struct file_lock *fl;
        int status = 0;
-        for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
+        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
                        continue;
                if (nfs_file_open_context(fl->fl_file)->state != state)
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 3d7d9631e125..5a70be589bbe 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -533,7 +533,10 @@ readpage_async_filler(void *data, struct page *page)
        if (len < PAGE_CACHE_SIZE)
                zero_user_segment(page, len, PAGE_CACHE_SIZE);
-        nfs_pageio_add_request(desc->pgio, new);
+        if (!nfs_pageio_add_request(desc->pgio, new)) {
+                error = desc->pgio->pg_error;
+                goto out_unlock;
+        }
        return 0;
 out_error:
        error = PTR_ERR(new);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 1fb381843650..f9219024f31a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -589,8 +589,6 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
        struct nfs_server *server = NFS_SB(vfsmnt->mnt_sb);
        struct rpc_clnt *rpc;
-        shrink_submounts(vfsmnt, &nfs_automount_list);
        if (!(flags & MNT_FORCE))
                return;
        /* -EIO all pending I/O */
@@ -632,7 +630,7 @@ static int nfs_verify_server_address(struct sockaddr *addr)
        switch (addr->sa_family) {
        case AF_INET: {
                struct sockaddr_in *sa = (struct sockaddr_in *)addr;
-                return sa->sin_addr.s_addr != INADDR_ANY;
+                return sa->sin_addr.s_addr != htonl(INADDR_ANY);
        }
        case AF_INET6: {
                struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr;
@@ -684,8 +682,9 @@ static void nfs_parse_server_address(char *value,
 static int nfs_parse_mount_options(char *raw,
                                   struct nfs_parsed_mount_data *mnt)
 {
-        char *p, *string;
+        char *p, *string, *secdata;
        unsigned short port = 0;
+        int rc;
        if (!raw) {
                dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -693,6 +692,20 @@ static int nfs_parse_mount_options(char *raw,
        }
        dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw);
+        secdata = alloc_secdata();
+        if (!secdata)
+                goto out_nomem;
+        rc = security_sb_copy_data(raw, secdata);
+        if (rc)
+                goto out_security_failure;
+        rc = security_sb_parse_opts_str(secdata, &mnt->lsm_opts);
+        if (rc)
+                goto out_security_failure;
+        free_secdata(secdata);
        while ((p = strsep(&raw, ",")) != NULL) {
                substring_t args[MAX_OPT_ARGS];
                int option, token;
@@ -1042,7 +1055,10 @@ static int nfs_parse_mount_options(char *raw,
 out_nomem:
        printk(KERN_INFO "NFS: not enough memory to parse option\n");
        return 0;
+out_security_failure:
+        free_secdata(secdata);
+        printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
+        return 0;
 out_unrec_vers:
        printk(KERN_INFO "NFS: unrecognized NFS version number\n");
        return 0;
@@ -1214,6 +1230,33 @@ static int nfs_validate_mount_data(void *options,
                args->namlen            = data->namlen;
                args->bsize             = data->bsize;
                args->auth_flavors[0]   = data->pseudoflavor;
+                /*
+                 * The legacy version 6 binary mount data from userspace has a
+                 * field used only to transport selinux information into the
+                 * the kernel.  To continue to support that functionality we
+                 * have a touch of selinux knowledge here in the NFS code. The
+                 * userspace code converted context=blah to just blah so we are
+                 * converting back to the full string selinux understands.
+                 */
+                if (data->context[0]){
+#ifdef CONFIG_SECURITY_SELINUX
+                        int rc;
+                        char *opts_str = kmalloc(sizeof(data->context) + 8, GFP_KERNEL);
+                        if (!opts_str)
+                                return -ENOMEM;
+                        strcpy(opts_str, "context=");
+                        data->context[NFS_MAX_CONTEXT_LEN] = '\0';
+                        strcat(opts_str, &data->context[0]);
+                        rc = security_sb_parse_opts_str(opts_str, &args->lsm_opts);
+                        kfree(opts_str);
+                        if (rc)
+                                return rc;
+#else
+                        return -EINVAL;
+#endif
+                }
                break;
        default: {
                unsigned int len;
@@ -1476,6 +1519,8 @@ static int nfs_get_sb(struct file_system_type *fs_type,
        };
        int error;
+        security_init_mnt_opts(&data.lsm_opts);
        /* Validate the mount data */
        error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name);
        if (error < 0)
@@ -1515,6 +1560,10 @@ static int nfs_get_sb(struct file_system_type *fs_type,
                goto error_splat_super;
        }
+        error = security_sb_set_mnt_opts(s, &data.lsm_opts);
+        if (error)
+                goto error_splat_root;
        s->s_flags |= MS_ACTIVE;
        mnt->mnt_sb = s;
        mnt->mnt_root = mntroot;
@@ -1523,12 +1572,15 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 out:
        kfree(data.nfs_server.hostname);
        kfree(data.mount_server.hostname);
+        security_free_mnt_opts(&data.lsm_opts);
        return error;
 out_err_nosb:
        nfs_free_server(server);
        goto out;
+error_splat_root:
+        dput(mntroot);
 error_splat_super:
        up_write(&s->s_umount);
        deactivate_super(s);
@@ -1608,6 +1660,9 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
        mnt->mnt_sb = s;
        mnt->mnt_root = mntroot;
+        /* clone any lsm security options from the parent to the new sb */
+        security_sb_clone_mnt_opts(data->sb, s);
        dprintk("<-- nfs_xdev_get_sb() = 0\n");
        return 0;
@@ -1850,6 +1905,8 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
        };
        int error;
+        security_init_mnt_opts(&data.lsm_opts);
        /* Validate the mount data */
        error = nfs4_validate_mount_data(raw_data, &data, dev_name);
        if (error < 0)
@@ -1898,6 +1955,7 @@ out:
        kfree(data.client_address);
        kfree(data.nfs_server.export_path);
        kfree(data.nfs_server.hostname);
+        security_free_mnt_opts(&data.lsm_opts);
        return error;
 out_free:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f55c437124a2..bed63416a55b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -39,6 +39,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context*,
                                            unsigned int, unsigned int);
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
                                  struct inode *inode, int ioflags);
+static void nfs_redirty_request(struct nfs_page *req);
 static const struct rpc_call_ops nfs_write_partial_ops;
 static const struct rpc_call_ops nfs_write_full_ops;
 static const struct rpc_call_ops nfs_commit_ops;
@@ -288,7 +289,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                BUG();
        }
        spin_unlock(&inode->i_lock);
-        nfs_pageio_add_request(pgio, req);
+        if (!nfs_pageio_add_request(pgio, req)) {
+                nfs_redirty_request(req);
+                nfs_end_page_writeback(page);
+                nfs_clear_page_tag_locked(req);
+                return pgio->pg_error;
+        }
        return 0;
 }
@@ -734,7 +740,7 @@ int nfs_updatepage(struct file *file, struct page *page,
         */
        if (nfs_write_pageuptodate(page, inode) &&
                        inode->i_flock == NULL &&
-                        !(file->f_mode & O_SYNC)) {
+                        !(file->f_flags & O_SYNC)) {
                count = max(count + offset, nfs_page_length(page));
                offset = 0;
        }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c593db047d8b..c309c881bd4e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -658,14 +658,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        return status;
                }
        }
+        status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt);
+        if (status)
+                return status;
        status = nfs_ok;
        if (setattr->sa_acl != NULL)
                status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
                                            setattr->sa_acl);
        if (status)
-                return status;
+                goto out;
        status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
                                0, (time_t)0);
+out:
+        mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt);
        return status;
 }
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1ff90625860f..145b3c877a27 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -46,6 +46,7 @@
 #include <linux/scatterlist.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
+#include <linux/mount.h>
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
@@ -154,7 +155,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
                dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
                goto out_put;
        }
+        status = mnt_want_write(rec_dir.path.mnt);
+        if (status)
+                goto out_put;
        status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU);
+        mnt_drop_write(rec_dir.path.mnt);
 out_put:
        dput(dentry);
 out_unlock:
@@ -313,12 +318,17 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
        if (!rec_dir_init || !clp->cl_firststate)
                return;
+        status = mnt_want_write(rec_dir.path.mnt);
+        if (status)
+                goto out;
        clp->cl_firststate = 0;
        nfs4_save_user(&uid, &gid);
        status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
        nfs4_reset_user(uid, gid);
        if (status == 0)
                nfsd4_sync_rec_dir();
+        mnt_drop_write(rec_dir.path.mnt);
+out:
        if (status)
                printk("NFSD: Failed to remove expired client state directory"
                                " %.*s\n", HEXDIR_LEN, clp->cl_recdir);
@@ -347,13 +357,17 @@ nfsd4_recdir_purge_old(void) {
        if (!rec_dir_init)
                return;
+        status = mnt_want_write(rec_dir.path.mnt);
+        if (status)
+                goto out;
        status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old);
        if (status == 0)
                nfsd4_sync_rec_dir();
+        mnt_drop_write(rec_dir.path.mnt);
+out:
        if (status)
                printk("nfsd4: failed to purge old clients from recovery"
                        " directory %s\n", rec_dir.path.dentry->d_name.name);
-        return;
 }
 static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bcb97d8e8b8b..81a75f3081f4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/cache.h>
+#include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/workqueue.h>
 #include <linux/smp_lock.h>
@@ -1239,7 +1240,7 @@ static inline void
 nfs4_file_downgrade(struct file *filp, unsigned int share_access)
 {
        if (share_access & NFS4_SHARE_ACCESS_WRITE) {
-                put_write_access(filp->f_path.dentry->d_inode);
+                drop_file_write_access(filp);
                filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
        }
 }
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 0130b345234d..3e6b3f41ee1f 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -101,7 +101,7 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
 {
        /* Check if the request originated from a secure port. */
        if (!rqstp->rq_secure && EX_SECURE(exp)) {
-                char buf[RPC_MAX_ADDRBUFLEN];
+                RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
                dprintk(KERN_WARNING
                       "nfsd: request from insecure port %s!\n",
                       svc_print_addr(rqstp, buf, sizeof(buf)));
@@ -232,6 +232,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
                fhp->fh_dentry = dentry;
                fhp->fh_export = exp;
                nfsd_nr_verified++;
+                cache_get(&exp->h);
        } else {
                /*
                 * just rechecking permissions
@@ -241,6 +242,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
                dprintk("nfsd: fh_verify - just checking\n");
                dentry = fhp->fh_dentry;
                exp = fhp->fh_export;
+                cache_get(&exp->h);
                /*
                 * Set user creds for this exportpoint; necessary even
                 * in the "just checking" case because this may be a
@@ -252,8 +254,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
                if (error)
                        goto out;
        }
-        cache_get(&exp->h);
        error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type);
        if (error)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 46f59d5365a0..304bf5f643c9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1255,23 +1255,35 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        err = 0;
        switch (type) {
        case S_IFREG:
+                host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+                if (host_err)
+                        goto out_nfserr;
                host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
                break;
        case S_IFDIR:
+                host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+                if (host_err)
+                        goto out_nfserr;
                host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
                break;
        case S_IFCHR:
        case S_IFBLK:
        case S_IFIFO:
        case S_IFSOCK:
+                host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+                if (host_err)
+                        goto out_nfserr;
                host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
                break;
        default:
                printk("nfsd: bad file type %o in nfsd_create\n", type);
                host_err = -EINVAL;
+                goto out_nfserr;
        }
-        if (host_err < 0)
+        if (host_err < 0) {
+                mnt_drop_write(fhp->fh_export->ex_path.mnt);
                goto out_nfserr;
+        }
        if (EX_ISSYNC(fhp->fh_export)) {
                err = nfserrno(nfsd_sync_dir(dentry));
@@ -1282,6 +1294,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        err2 = nfsd_create_setattr(rqstp, resfhp, iap);
        if (err2)
                err = err2;
+        mnt_drop_write(fhp->fh_export->ex_path.mnt);
        /*
         * Update the file handle to get the new inode info.
         */
@@ -1359,6 +1372,9 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
                v_atime = verifier[1]&0x7fffffff;
        }
        
+        host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+        if (host_err)
+                goto out_nfserr;
        if (dchild->d_inode) {
                err = 0;
@@ -1390,12 +1406,15 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
                case NFS3_CREATE_GUARDED:
                        err = nfserr_exist;
                }
+                mnt_drop_write(fhp->fh_export->ex_path.mnt);
                goto out;
        }
        host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
-        if (host_err < 0)
+        if (host_err < 0) {
+                mnt_drop_write(fhp->fh_export->ex_path.mnt);
                goto out_nfserr;
+        }
        if (created)
                *created = 1;
@@ -1420,6 +1439,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (err2)
                err = err2;
+        mnt_drop_write(fhp->fh_export->ex_path.mnt);
        /*
         * Update the filehandle to get the new inode info.
         */
@@ -1522,6 +1542,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (iap && (iap->ia_valid & ATTR_MODE))
                mode = iap->ia_mode & S_IALLUGO;
+        host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+        if (host_err)
+                goto out_nfserr;
        if (unlikely(path[plen] != 0)) {
                char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
                if (path_alloced == NULL)
@@ -1542,6 +1566,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
        err = nfserrno(host_err);
        fh_unlock(fhp);
+        mnt_drop_write(fhp->fh_export->ex_path.mnt);
        cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
        dput(dnew);
        if (err==0) err = cerr;
@@ -1592,6 +1618,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
        dold = tfhp->fh_dentry;
        dest = dold->d_inode;
+        host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
+        if (host_err) {
+                err = nfserrno(host_err);
+                goto out_dput;
+        }
        host_err = vfs_link(dold, dirp, dnew);
        if (!host_err) {
                if (EX_ISSYNC(ffhp->fh_export)) {
@@ -1605,7 +1636,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
                else
                        err = nfserrno(host_err);
        }
+        mnt_drop_write(tfhp->fh_export->ex_path.mnt);
+out_dput:
        dput(dnew);
 out_unlock:
        fh_unlock(ffhp);
@@ -1678,13 +1710,20 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        if (ndentry == trap)
                goto out_dput_new;
-#ifdef MSNFS
+        if (svc_msnfs(ffhp) &&
-        if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
                ((atomic_read(&odentry->d_count) > 1)
                 || (atomic_read(&ndentry->d_count) > 1))) {
                        host_err = -EPERM;
-        } else
+                        goto out_dput_new;
-#endif
+        }
+        host_err = -EXDEV;
+        if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
+                goto out_dput_new;
+        host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt);
+        if (host_err)
+                goto out_dput_new;
        host_err = vfs_rename(fdir, odentry, tdir, ndentry);
        if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
                host_err = nfsd_sync_dir(tdentry);
@@ -1692,6 +1731,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
                        host_err = nfsd_sync_dir(fdentry);
        }
+        mnt_drop_write(ffhp->fh_export->ex_path.mnt);
 out_dput_new:
        dput(ndentry);
 out_dput_old:
@@ -1750,6 +1791,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        if (!type)
                type = rdentry->d_inode->i_mode & S_IFMT;
+        host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+        if (host_err)
+                goto out_nfserr;
        if (type != S_IFDIR) { /* It's UNLINK */
 #ifdef MSNFS
                if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
@@ -1765,10 +1810,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        dput(rdentry);
        if (host_err)
-                goto out_nfserr;
+                goto out_drop;
        if (EX_ISSYNC(fhp->fh_export))
                host_err = nfsd_sync_dir(dentry);
+out_drop:
+        mnt_drop_write(fhp->fh_export->ex_path.mnt);
 out_nfserr:
        err = nfserrno(host_err);
 out:
@@ -1865,7 +1912,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
                inode->i_mode,
                IS_IMMUTABLE(inode)?    " immut" : "",
                IS_APPEND(inode)?       " append" : "",
-                IS_RDONLY(inode)?       " ro" : "");
+                __mnt_is_readonly(exp->ex_path.mnt)?    " ro" : "");
        dprintk("      owner %d/%d user %d/%d\n",
                inode->i_uid, inode->i_gid, current->fsuid, current->fsgid);
 #endif
@@ -1876,7 +1923,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
         */
        if (!(acc & MAY_LOCAL_ACCESS))
                if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
-                        if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode))
+                        if (exp_rdonly(rqstp, exp) ||
+                            __mnt_is_readonly(exp->ex_path.mnt))
                                return nfserr_rofs;
                        if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
                                return nfserr_perm;
@@ -2039,6 +2087,9 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
        } else
                size = 0;
+        error = mnt_want_write(fhp->fh_export->ex_path.mnt);
+        if (error)
+                goto getout;
        if (size)
                error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
        else {
@@ -2050,6 +2101,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
                                error = 0;
                }
        }
+        mnt_drop_write(fhp->fh_export->ex_path.mnt);
 getout:
        kfree(value);
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 4d4ce48bb42c..f6956de56fdb 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -2,7 +2,12 @@ EXTRA_CFLAGS += -Ifs/ocfs2
 EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
-obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+obj-$(CONFIG_OCFS2_FS) +=       \
+        ocfs2.o                 \
+        ocfs2_stackglue.o
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
+obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 ocfs2-objs := \
        alloc.o                 \
@@ -31,5 +36,10 @@ ocfs2-objs := \
        uptodate.o              \
        ver.o
+ocfs2_stackglue-objs := stackglue.o
+ocfs2_stack_o2cb-objs := stack_o2cb.o
+ocfs2_stack_user-objs := stack_user.o
+# cluster/ is always needed when OCFS2_FS for masklog support
 obj-$(CONFIG_OCFS2_FS) += cluster/
-obj-$(CONFIG_OCFS2_FS) += dlm/
+obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 447206eb5c2e..41f84c92094f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
        BUG_ON(!next_free);
        /* The tree code before us didn't allow enough room in the leaf. */
-        if (el->l_next_free_rec == el->l_count && !has_empty)
+        BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
-                BUG();
        /*
         * The easiest way to approach this is to just remove the
@@ -1450,6 +1449,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
 *   - When our insert into the right path leaf is at the leftmost edge
 *     and requires an update of the path immediately to it's left. This
 *     can occur at the end of some types of rotation and appending inserts.
+ *   - When we've adjusted the last extent record in the left path leaf and the
+ *     1st extent record in the right path leaf during cross extent block merge.
 */
 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
                                       struct ocfs2_path *left_path,
@@ -2712,24 +2713,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
        }
 }
+/*
+ * Build the path to the leaf that follows left_path's leaf.
+ *
+ * The starting cpos of that leaf is obtained from
+ * ocfs2_find_cpos_for_right_leaf().  A new path sharing left_path's root
+ * buffer head and root extent list is then allocated with
+ * ocfs2_new_path() and filled in by ocfs2_find_path().
+ *
+ * Returns 0 and stores the new path in *ret_right_path on success.  On
+ * failure the non-zero error is returned (-ENOMEM when the path cannot be
+ * allocated), the partially built path is released and *ret_right_path
+ * is left NULL.
+ *
+ * BUG()s when left_path has tree depth 0, when its leaf is not full, or
+ * when the looked-up right_cpos is 0 (left_path is the rightmost leaf).
+ */
+static int ocfs2_get_right_path(struct inode *inode,
+                                struct ocfs2_path *left_path,
+                                struct ocfs2_path **ret_right_path)
+{
+        int ret;
+        u32 right_cpos;
+        struct ocfs2_path *right_path = NULL;
+        struct ocfs2_extent_list *left_el;
+        /* Make sure the caller never sees a stale pointer on error. */
+        *ret_right_path = NULL;
+        /* This function shouldn't be called for non-trees. */
+        BUG_ON(left_path->p_tree_depth == 0);
+        left_el = path_leaf_el(left_path);
+        /*
+         * The left leaf must be full.  Both fields are compared raw (no
+         * le16_to_cpu), which is sufficient for an equality test.
+         */
+        BUG_ON(left_el->l_next_free_rec != left_el->l_count);
+        ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+                                             &right_cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /* This function shouldn't be called for the rightmost leaf. */
+        BUG_ON(right_cpos == 0);
+        right_path = ocfs2_new_path(path_root_bh(left_path),
+                                    path_root_el(left_path));
+        if (!right_path) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_find_path(inode, right_path, right_cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        *ret_right_path = right_path;
+out:
+        /* On failure nothing is handed back, so drop the path here. */
+        if (ret)
+                ocfs2_free_path(right_path);
+        return ret;
+}
 /*
 * Remove split_rec clusters from the record at index and merge them
- * onto the beginning of the record at index + 1.
+ * onto the beginning of the record "next" to it.
+ * For index < l_count - 1, the next means the extent rec at index + 1.
+ * For index == l_count - 1, the "next" means the 1st extent rec of the
+ * next extent block.
 */
-static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
+static int ocfs2_merge_rec_right(struct inode *inode,
-                                handle_t *handle,
+                                 struct ocfs2_path *left_path,
-                                struct ocfs2_extent_rec *split_rec,
+                                 handle_t *handle,
-                                struct ocfs2_extent_list *el, int index)
+                                 struct ocfs2_extent_rec *split_rec,
+                                 int index)
 {
-        int ret;
+        int ret, next_free, i;
         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
         struct ocfs2_extent_rec *left_rec;
         struct ocfs2_extent_rec *right_rec;
+        struct ocfs2_extent_list *right_el;
+        struct ocfs2_path *right_path = NULL;
+        int subtree_index = 0;
+        struct ocfs2_extent_list *el = path_leaf_el(left_path);
+        struct buffer_head *bh = path_leaf_bh(left_path);
+        struct buffer_head *root_bh = NULL;
         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
         left_rec = &el->l_recs[index];
-        right_rec = &el->l_recs[index + 1];
+        /*
+         * l_next_free_rec is stored little-endian: convert it to CPU order
+         * first and only then subtract, otherwise the "last record" test is
+         * wrong on big-endian hosts.
+         */
+        if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
+            le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
+                /* we meet with a cross extent block merge. */
+                ret = ocfs2_get_right_path(inode, left_path, &right_path);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                right_el = path_leaf_el(right_path);
+                next_free = le16_to_cpu(right_el->l_next_free_rec);
+                BUG_ON(next_free <= 0);
+                right_rec = &right_el->l_recs[0];
+                if (ocfs2_is_empty_extent(right_rec)) {
+                        /* next_free is already in CPU byte order. */
+                        BUG_ON(next_free <= 1);
+                        right_rec = &right_el->l_recs[1];
+                }
+                BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+                       le16_to_cpu(left_rec->e_leaf_clusters) !=
+                       le32_to_cpu(right_rec->e_cpos));
+                subtree_index = ocfs2_find_subtree_root(inode,
+                                                        left_path, right_path);
+                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+                                                      handle->h_buffer_credits,
+                                                      right_path);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                root_bh = left_path->p_node[subtree_index].bh;
+                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+                ret = ocfs2_journal_access(handle, inode, root_bh,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                for (i = subtree_index + 1;
+                     i < path_num_items(right_path); i++) {
+                        ret = ocfs2_journal_access(handle, inode,
+                                                   right_path->p_node[i].bh,
+                                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        ret = ocfs2_journal_access(handle, inode,
+                                                   left_path->p_node[i].bh,
+                                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                }
+        } else {
+                BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
+                right_rec = &el->l_recs[index + 1];
+        }
         ret = ocfs2_journal_access(handle, inode, bh,
                                    OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2751,30 +2875,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
         if (ret)
                 mlog_errno(ret);
+        if (right_path) {
+                ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+                if (ret)
+                        mlog_errno(ret);
+                ocfs2_complete_edge_insert(inode, handle, left_path,
+                                           right_path, subtree_index);
+        }
+out:
+        if (right_path)
+                ocfs2_free_path(right_path);
+        return ret;
+}
+/*
+ * Build the path to the leaf that precedes right_path's leaf.
+ *
+ * The cpos used for the lookup comes from ocfs2_find_cpos_for_left_leaf().
+ * A new path sharing right_path's root buffer head and root extent list is
+ * then allocated with ocfs2_new_path() and filled in by ocfs2_find_path().
+ *
+ * Returns 0 and stores the new path in *ret_left_path on success.  On
+ * failure the non-zero error is returned (-ENOMEM when the path cannot be
+ * allocated), the partially built path is released and *ret_left_path is
+ * left NULL.
+ *
+ * BUG()s when right_path has tree depth 0 or when the looked-up left_cpos
+ * is 0 (right_path is the leftmost leaf).
+ */
+static int ocfs2_get_left_path(struct inode *inode,
+                               struct ocfs2_path *right_path,
+                               struct ocfs2_path **ret_left_path)
+{
+        int ret;
+        u32 left_cpos;
+        struct ocfs2_path *left_path = NULL;
+        /* Make sure the caller never sees a stale pointer on error. */
+        *ret_left_path = NULL;
+        /* This function shouldn't be called for non-trees. */
+        BUG_ON(right_path->p_tree_depth == 0);
+        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+                                            right_path, &left_cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        /* This function shouldn't be called for the leftmost leaf. */
+        BUG_ON(left_cpos == 0);
+        left_path = ocfs2_new_path(path_root_bh(right_path),
+                                   path_root_el(right_path));
+        if (!left_path) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_find_path(inode, left_path, left_cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        *ret_left_path = left_path;
 out:
+        /* On failure nothing is handed back, so drop the path here. */
+        if (ret)
+                ocfs2_free_path(left_path);
         return ret;
 }
 /*
 * Remove split_rec clusters from the record at index and merge them
- * onto the tail of the record at index - 1.
+ * onto the tail of the record "before" it.
+ * For index > 0, the "before" means the extent rec at index - 1.
+ *
+ * For index == 0, the "before" means the last record of the previous
+ * extent block. And there is also a situation that we may need to
+ * remove the rightmost leaf extent block in the right_path and change
+ * the right path to indicate the new rightmost path.
 */
-static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
+static int ocfs2_merge_rec_left(struct inode *inode,
+                                struct ocfs2_path *right_path,
                                handle_t *handle,
                                struct ocfs2_extent_rec *split_rec,
-                                struct ocfs2_extent_list *el, int index)
+                                struct ocfs2_cached_dealloc_ctxt *dealloc,
+                                int index)
 {
-        int ret, has_empty_extent = 0;
+        int ret, i, subtree_index = 0, has_empty_extent = 0;
        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
        struct ocfs2_extent_rec *left_rec;
        struct ocfs2_extent_rec *right_rec;
+        struct ocfs2_extent_list *el = path_leaf_el(right_path);
+        struct buffer_head *bh = path_leaf_bh(right_path);
+        struct buffer_head *root_bh = NULL;
+        struct ocfs2_path *left_path = NULL;
+        struct ocfs2_extent_list *left_el;
-        BUG_ON(index <= 0);
+        BUG_ON(index < 0);
-        left_rec = &el->l_recs[index - 1];
        right_rec = &el->l_recs[index];
-        if (ocfs2_is_empty_extent(&el->l_recs[0]))
+        if (index == 0) {
-                has_empty_extent = 1;
+                /* we meet with a cross extent block merge. */
+                ret = ocfs2_get_left_path(inode, right_path, &left_path);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                left_el = path_leaf_el(left_path);
+                BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
+                       le16_to_cpu(left_el->l_count));
+                left_rec = &left_el->l_recs[
+                                le16_to_cpu(left_el->l_next_free_rec) - 1];
+                BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+                       le16_to_cpu(left_rec->e_leaf_clusters) !=
+                       le32_to_cpu(split_rec->e_cpos));
+                subtree_index = ocfs2_find_subtree_root(inode,
+                                                        left_path, right_path);
+                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+                                                      handle->h_buffer_credits,
+                                                      left_path);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                root_bh = left_path->p_node[subtree_index].bh;
+                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+                ret = ocfs2_journal_access(handle, inode, root_bh,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                for (i = subtree_index + 1;
+                     i < path_num_items(right_path); i++) {
+                        ret = ocfs2_journal_access(handle, inode,
+                                                   right_path->p_node[i].bh,
+                                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        ret = ocfs2_journal_access(handle, inode,
+                                                   left_path->p_node[i].bh,
+                                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                }
+        } else {
+                left_rec = &el->l_recs[index - 1];
+                if (ocfs2_is_empty_extent(&el->l_recs[0]))
+                        has_empty_extent = 1;
+        }
        ret = ocfs2_journal_access(handle, inode, bh,
                                   OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2790,9 +3040,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
                *left_rec = *split_rec;
                has_empty_extent = 0;
-        } else {
+        } else
                le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
-        }
        le32_add_cpu(&right_rec->e_cpos, split_clusters);
        le64_add_cpu(&right_rec->e_blkno,
@@ -2805,13 +3054,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
        if (ret)
                mlog_errno(ret);
+        if (left_path) {
+                ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+                if (ret)
+                        mlog_errno(ret);
+                /*
+                 * In the situation that the right_rec is empty and the extent
+                 * block is empty also,  ocfs2_complete_edge_insert can't handle
+                 * it and we need to delete the right extent block.
+                 */
+                if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
+                    le16_to_cpu(el->l_next_free_rec) == 1) {
+                        ret = ocfs2_remove_rightmost_path(inode, handle,
+                                                          right_path, dealloc);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        /* Now the rightmost extent block has been deleted.
+                         * So we use the new rightmost path.
+                         */
+                        ocfs2_mv_path(right_path, left_path);
+                        left_path = NULL;
+                } else
+                        ocfs2_complete_edge_insert(inode, handle, left_path,
+                                                   right_path, subtree_index);
+        }
 out:
+        if (left_path)
+                ocfs2_free_path(left_path);
        return ret;
 }
 static int ocfs2_try_to_merge_extent(struct inode *inode,
                                     handle_t *handle,
-                                     struct ocfs2_path *left_path,
+                                     struct ocfs2_path *path,
                                     int split_index,
                                     struct ocfs2_extent_rec *split_rec,
                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
@@ -2819,7 +3099,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 {
        int ret = 0;
-        struct ocfs2_extent_list *el = path_leaf_el(left_path);
+        struct ocfs2_extent_list *el = path_leaf_el(path);
        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
        BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
@@ -2832,7 +3112,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 * extents - having more than one in a leaf is
                 * illegal.
                 */
-                ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+                ret = ocfs2_rotate_tree_left(inode, handle, path,
                                             dealloc);
                if (ret) {
                        mlog_errno(ret);
@@ -2847,7 +3127,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 * Left-right contig implies this.
                 */
                BUG_ON(!ctxt->c_split_covers_rec);
-                BUG_ON(split_index == 0);
                /*
                 * Since the leftright insert always covers the entire
@@ -2858,9 +3137,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 * Since the adding of an empty extent shifts
                 * everything back to the right, there's no need to
                 * update split_index here.
+                 *
+                 * When the split_index is zero, we need to merge it to the
+                 * previous extent block. It is more efficient and easier
+                 * if we do merge_right first and merge_left later.
                 */
-                ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
+                ret = ocfs2_merge_rec_right(inode, path,
-                                           handle, split_rec, el, split_index);
+                                            handle, split_rec,
+                                            split_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2871,32 +3155,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 */
                BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
-                /*
+                /* The merge left us with an empty extent, remove it. */
-                 * The left merge left us with an empty extent, remove
+                ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
-                 * it.
-                 */
-                ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                split_index--;
                rec = &el->l_recs[split_index];
                /*
                 * Note that we don't pass split_rec here on purpose -
-                 * we've merged it into the left side.
+                 * we've merged it into the rec already.
                 */
-                ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
+                ret = ocfs2_merge_rec_left(inode, path,
-                                            handle, rec, el, split_index);
+                                           handle, rec,
+                                           dealloc,
+                                           split_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+                ret = ocfs2_rotate_tree_left(inode, handle, path,
-                ret = ocfs2_rotate_tree_left(inode, handle, left_path,
                                             dealloc);
                /*
                 * Error from this last rotate is not critical, so
@@ -2915,8 +3197,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 */
                if (ctxt->c_contig_type == CONTIG_RIGHT) {
                        ret = ocfs2_merge_rec_left(inode,
-                                                   path_leaf_bh(left_path),
+                                                   path,
-                                                   handle, split_rec, el,
+                                                   handle, split_rec,
+                                                   dealloc,
                                                   split_index);
                        if (ret) {
                                mlog_errno(ret);
@@ -2924,8 +3207,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                        }
                } else {
                        ret = ocfs2_merge_rec_right(inode,
-                                                    path_leaf_bh(left_path),
+                                                    path,
-                                                    handle, split_rec, el,
+                                                    handle, split_rec,
                                                    split_index);
                        if (ret) {
                                mlog_errno(ret);
@@ -2938,7 +3221,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                         * The merge may have left an empty extent in
                         * our leaf. Try to rotate it away.
                         */
-                        ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+                        ret = ocfs2_rotate_tree_left(inode, handle, path,
                                                     dealloc);
                        if (ret)
                                mlog_errno(ret);
@@ -3498,20 +3781,57 @@ out:
 }
 static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct inode *inode,
+ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                               struct ocfs2_extent_list *el, int index,
                               struct ocfs2_extent_rec *split_rec)
 {
-        struct ocfs2_extent_rec *rec;
+        int status;
        enum ocfs2_contig_type ret = CONTIG_NONE;
+        u32 left_cpos, right_cpos;
+        struct ocfs2_extent_rec *rec = NULL;
+        struct ocfs2_extent_list *new_el;
+        struct ocfs2_path *left_path = NULL, *right_path = NULL;
+        struct buffer_head *bh;
+        struct ocfs2_extent_block *eb;
+        if (index > 0) {
+                rec = &el->l_recs[index - 1];
+        } else if (path->p_tree_depth > 0) {
+                status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+                                                       path, &left_cpos);
+                if (status)
+                        goto out;
+                if (left_cpos != 0) {
+                        left_path = ocfs2_new_path(path_root_bh(path),
+                                                   path_root_el(path));
+                        if (!left_path)
+                                goto out;
+                        status = ocfs2_find_path(inode, left_path, left_cpos);
+                        if (status)
+                                goto out;
+                        new_el = path_leaf_el(left_path);
+                        if (le16_to_cpu(new_el->l_next_free_rec) !=
+                            le16_to_cpu(new_el->l_count)) {
+                                bh = path_leaf_bh(left_path);
+                                eb = (struct ocfs2_extent_block *)bh->b_data;
+                                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+                                                                 eb);
+                                goto out;
+                        }
+                        rec = &new_el->l_recs[
+                                le16_to_cpu(new_el->l_next_free_rec) - 1];
+                }
+        }
        /*
         * We're careful to check for an empty extent record here -
         * the merge code will know what to do if it sees one.
         */
+        if (rec) {
-        if (index > 0) {
-                rec = &el->l_recs[index - 1];
                if (index == 1 && ocfs2_is_empty_extent(rec)) {
                        if (split_rec->e_cpos == el->l_recs[index].e_cpos)
                                ret = CONTIG_RIGHT;
@@ -3520,10 +3840,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
                }
        }
-        if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
+        rec = NULL;
+        if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
+                rec = &el->l_recs[index + 1];
+        else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
+                 path->p_tree_depth > 0) {
+                status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
+                                                        path, &right_cpos);
+                if (status)
+                        goto out;
+                if (right_cpos == 0)
+                        goto out;
+                right_path = ocfs2_new_path(path_root_bh(path),
+                                            path_root_el(path));
+                if (!right_path)
+                        goto out;
+                status = ocfs2_find_path(inode, right_path, right_cpos);
+                if (status)
+                        goto out;
+                new_el = path_leaf_el(right_path);
+                rec = &new_el->l_recs[0];
+                if (ocfs2_is_empty_extent(rec)) {
+                        if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
+                                bh = path_leaf_bh(right_path);
+                                eb = (struct ocfs2_extent_block *)bh->b_data;
+                                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+                                                                 eb);
+                                goto out;
+                        }
+                        rec = &new_el->l_recs[1];
+                }
+        }
+        if (rec) {
                enum ocfs2_contig_type contig_type;
-                rec = &el->l_recs[index + 1];
                contig_type = ocfs2_extent_contig(inode, rec, split_rec);
                if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
@@ -3532,6 +3887,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
                        ret = contig_type;
        }
+out:
+        if (left_path)
+                ocfs2_free_path(left_path);
+        if (right_path)
+                ocfs2_free_path(right_path);
        return ret;
 }
@@ -3994,7 +4355,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
                goto out;
        }
-        ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
+        ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
                                                            split_index,
                                                            split_rec);
@@ -4788,6 +5149,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
        status = ocfs2_flush_truncate_log(osb);
        if (status < 0)
                mlog_errno(status);
+        else
+                ocfs2_init_inode_steal_slot(osb);
        mlog_exit(status);
 }
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 82243127eebf..17964c0505a9 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -257,7 +257,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        BUG_ON(!PageLocked(page));
-        BUG_ON(!OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+        BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
        ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
                               OCFS2_BH_CACHED, inode);
@@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                         unsigned to)
 {
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        handle_t *handle = NULL;
+        handle_t *handle;
        int ret = 0;
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-        if (!handle) {
+        if (IS_ERR(handle)) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
@@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
        }
 out:
        if (ret) {
-                if (handle)
+                if (!IS_ERR(handle))
                        ocfs2_commit_trans(osb, handle);
                handle = ERR_PTR(ret);
        }
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index cdd162f13650..bc8c5e7d8608 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
 ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
-        quorum.o tcp.o ver.o
+        quorum.o tcp.o netdebug.o ver.o
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
new file mode 100644
index 000000000000..7bf3c0ea7bd9
--- /dev/null
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -0,0 +1,441 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * netdebug.c
+ *
+ * debug functionality for o2net
+ *
+ * Copyright (C) 2005, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+#ifdef CONFIG_DEBUG_FS
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+#include "tcp_internal.h"
+#define O2NET_DEBUG_DIR         "o2net"
+#define SC_DEBUG_NAME           "sock_containers"
+#define NST_DEBUG_NAME          "send_tracking"
+static struct dentry *o2net_dentry;
+static struct dentry *sc_dentry;
+static struct dentry *nst_dentry;
+static DEFINE_SPINLOCK(o2net_debug_lock);
+static LIST_HEAD(sock_containers);
+static LIST_HEAD(send_tracking);
+/*
+ * Link @nst onto the global send_tracking list.  The list is protected by
+ * o2net_debug_lock, which is taken and released here.
+ */
+void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+        spin_lock(&o2net_debug_lock);
+        list_add(&nst->st_net_debug_item, &send_tracking);
+        spin_unlock(&o2net_debug_lock);
+}
+/*
+ * Unlink @nst from the send tracking list under o2net_debug_lock.
+ * Nothing is done when the list node is already empty (self-linked), and
+ * list_del_init() leaves it in that state afterwards.
+ */
+void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+        spin_lock(&o2net_debug_lock);
+        if (!list_empty(&nst->st_net_debug_item))
+                list_del_init(&nst->st_net_debug_item);
+        spin_unlock(&o2net_debug_lock);
+}
+/*
+ * Walk the send tracking list forward, starting after @nst_start, and
+ * return the first entry with a non-NULL st_task.  Entries with a NULL
+ * st_task are skipped; nst_fop_open() marks its per-open cursor entries
+ * that way.  Returns NULL once the walk reaches the send_tracking list
+ * head.
+ *
+ * The caller must hold o2net_debug_lock.
+ */
+static struct o2net_send_tracking
+                        *next_nst(struct o2net_send_tracking *nst_start)
+{
+        struct o2net_send_tracking *nst, *ret = NULL;
+        assert_spin_locked(&o2net_debug_lock);
+        list_for_each_entry(nst, &nst_start->st_net_debug_item,
+                            st_net_debug_item) {
+                /* discover the head of the list */
+                if (&nst->st_net_debug_item == &send_tracking)
+                        break;
+                /* use st_task to detect real nsts in the list */
+                if (nst->st_task != NULL) {
+                        ret = nst;
+                        break;
+                }
+        }
+        return ret;
+}
+/*
+ * seq_file ->start: return the first real send tracking entry that follows
+ * this open file's cursor (seq->private), or NULL if there is none.
+ * @pos is not consulted; the cursor's place in the list is the position.
+ */
+static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
+{
+        struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+        spin_lock(&o2net_debug_lock);
+        nst = next_nst(dummy_nst);
+        spin_unlock(&o2net_debug_lock);
+        return nst;
+}
+/*
+ * seq_file ->next: advance this open file's cursor past the next real
+ * entry.  The cursor is unlinked and, if a real entry was found,
+ * re-inserted directly after it.  Returns that entry, or NULL to end the
+ * iteration (in which case the cursor stays unlinked).  @v and @pos are
+ * not used.
+ */
+static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+        struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+        spin_lock(&o2net_debug_lock);
+        nst = next_nst(dummy_nst);
+        list_del_init(&dummy_nst->st_net_debug_item);
+        if (nst)
+                list_add(&dummy_nst->st_net_debug_item,
+                         &nst->st_net_debug_item);
+        spin_unlock(&o2net_debug_lock);
+        return nst; /* unused, just needs to be null when done */
+}
+/*
+ * seq_file ->show: print one send tracking record.
+ *
+ * The record printed is looked up again from the cursor under
+ * o2net_debug_lock rather than taken from @v, and nothing is printed if no
+ * real entry follows the cursor any more.  Always returns 0.
+ *
+ * NOTE(review): the three timestamps are formatted as "%lu.%lu", so the
+ * microsecond part is printed without zero padding (5 usec prints as ".5").
+ */
+static int nst_seq_show(struct seq_file *seq, void *v)
+{
+        struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+        spin_lock(&o2net_debug_lock);
+        nst = next_nst(dummy_nst);
+        if (nst != NULL) {
+                /* get_task_comm isn't exported.  oh well. */
+                seq_printf(seq, "%p:\n"
+                           "  pid:          %lu\n"
+                           "  tgid:         %lu\n"
+                           "  process name: %s\n"
+                           "  node:         %u\n"
+                           "  sc:           %p\n"
+                           "  message id:   %d\n"
+                           "  message type: %u\n"
+                           "  message key:  0x%08x\n"
+                           "  sock acquiry: %lu.%lu\n"
+                           "  send start:   %lu.%lu\n"
+                           "  wait start:   %lu.%lu\n",
+                           nst, (unsigned long)nst->st_task->pid,
+                           (unsigned long)nst->st_task->tgid,
+                           nst->st_task->comm, nst->st_node,
+                           nst->st_sc, nst->st_id, nst->st_msg_type,
+                           nst->st_msg_key,
+                           nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
+                           nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
+                           nst->st_status_time.tv_sec,
+                           nst->st_status_time.tv_usec);
+        }
+        spin_unlock(&o2net_debug_lock);
+        return 0;
+}
+/*
+ * seq_file ->stop: nothing to undo, since start/next/show each take and
+ * drop o2net_debug_lock themselves.
+ */
+static void nst_seq_stop(struct seq_file *seq, void *v)
+{
+}
+/* seq_file iterator callbacks for the send tracking list. */
+static struct seq_operations nst_seq_ops = {
+        .start = nst_seq_start,
+        .next = nst_seq_next,
+        .stop = nst_seq_stop,
+        .show = nst_seq_show,
+};
+/*
+ * open() handler for the send tracking file.
+ *
+ * Allocates a cursor entry for this open file, marks it as a cursor by
+ * setting st_task to NULL, opens the seq_file, stores the cursor in
+ * seq->private and links it onto the send tracking list.
+ *
+ * Returns 0 on success, -ENOMEM if the cursor cannot be allocated, or the
+ * error from seq_open().
+ */
+static int nst_fop_open(struct inode *inode, struct file *file)
+{
+        struct o2net_send_tracking *dummy_nst;
+        struct seq_file *seq;
+        int ret;
+        dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
+        if (dummy_nst == NULL) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        dummy_nst->st_task = NULL;
+        ret = seq_open(file, &nst_seq_ops);
+        if (ret)
+                goto out;
+        seq = file->private_data;
+        seq->private = dummy_nst;
+        o2net_debug_add_nst(dummy_nst);
+        /* ownership has moved to seq->private; make the kfree below a no-op */
+        dummy_nst = NULL;
+out:
+        kfree(dummy_nst);
+        return ret;
+}
+/*
+ * release() handler for the send tracking file: unlink this file's cursor
+ * from the list, then hand off to seq_release_private(), which frees
+ * seq->private (the cursor) together with the seq_file.
+ */
+static int nst_fop_release(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq = file->private_data;
+        struct o2net_send_tracking *dummy_nst = seq->private;
+        o2net_debug_del_nst(dummy_nst);
+        return seq_release_private(inode, file);
+}
+/* File operations for the debugfs file named by NST_DEBUG_NAME. */
+static struct file_operations nst_seq_fops = {
+        .open = nst_fop_open,
+        .read = seq_read,
+        .llseek = seq_lseek,
+        .release = nst_fop_release,
+};
+/*
+ * Link @sc onto the global sock_containers list, under o2net_debug_lock.
+ */
+void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+        spin_lock(&o2net_debug_lock);
+        list_add(&sc->sc_net_debug_item, &sock_containers);
+        spin_unlock(&o2net_debug_lock);
+}
+/*
+ * Unlink @sc from the sock container list under o2net_debug_lock.  Unlike
+ * o2net_debug_del_nst() there is no list_empty() check before the
+ * list_del_init().
+ */
+void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+        spin_lock(&o2net_debug_lock);
+        list_del_init(&sc->sc_net_debug_item);
+        spin_unlock(&o2net_debug_lock);
+}
+/*
+ * Walk the sock container list forward, starting after @sc_start, and
+ * return the first entry with a non-NULL sc_page.  Entries with a NULL
+ * sc_page are skipped; sc_fop_open() marks its per-open cursor entries
+ * that way.  Returns NULL once the walk reaches the sock_containers list
+ * head.
+ *
+ * The caller must hold o2net_debug_lock.
+ */
+static struct o2net_sock_container
+                        *next_sc(struct o2net_sock_container *sc_start)
+{
+        struct o2net_sock_container *sc, *ret = NULL;
+        assert_spin_locked(&o2net_debug_lock);
+        list_for_each_entry(sc, &sc_start->sc_net_debug_item,
+                            sc_net_debug_item) {
+                /* discover the head of the list miscast as a sc */
+                if (&sc->sc_net_debug_item == &sock_containers)
+                        break;
+                /* use sc_page to detect real scs in the list */
+                if (sc->sc_page != NULL) {
+                        ret = sc;
+                        break;
+                }
+        }
+        return ret;
+}
+/*
+ * seq_file ->start: return the first real sock container that follows this
+ * open file's cursor (seq->private), or NULL if there is none.  @pos is
+ * not consulted; the cursor's place in the list is the position.
+ */
+static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+        struct o2net_sock_container *sc, *dummy_sc = seq->private;
+        spin_lock(&o2net_debug_lock);
+        sc = next_sc(dummy_sc);
+        spin_unlock(&o2net_debug_lock);
+        return sc;
+}
+/*
+ * seq_file ->next: advance this open file's cursor past the next real sock
+ * container.  The cursor is unlinked and, if a real entry was found,
+ * re-inserted directly after it.  Returns that entry, or NULL to end the
+ * iteration (in which case the cursor stays unlinked).  @v and @pos are
+ * not used.
+ */
+static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+        struct o2net_sock_container *sc, *dummy_sc = seq->private;
+        spin_lock(&o2net_debug_lock);
+        sc = next_sc(dummy_sc);
+        list_del_init(&dummy_sc->sc_net_debug_item);
+        if (sc)
+                list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
+        spin_unlock(&o2net_debug_lock);
+        return sc; /* unused, just needs to be null when done */
+}
+/*
+ * Expand an object with tv_sec/tv_usec members into those two members,
+ * seconds first, for use in a printf-style argument list.  The argument is
+ * parenthesized so that expressions such as *ptr expand correctly.
+ */
+#define TV_SEC_USEC(TV) (TV).tv_sec, (TV).tv_usec
+/*
+ * seq_file ->show: print one sock container record.
+ *
+ * The record printed is looked up again from the cursor under
+ * o2net_debug_lock rather than taken from @v, and nothing is printed if no
+ * real entry follows the cursor any more.  When the container has no
+ * socket (sc_sock is NULL) the addresses and ports are printed as zero.
+ * Always returns 0.
+ *
+ * NOTE(review): sc->sc_node is dereferenced unconditionally.  The tcp.c
+ * part of this patch has sc_kref_release() set sc_node to NULL before it
+ * calls o2net_debug_del_sc(); confirm that a reader cannot observe an sc
+ * in that window.
+ */
+static int sc_seq_show(struct seq_file *seq, void *v)
+{
+        struct o2net_sock_container *sc, *dummy_sc = seq->private;
+        spin_lock(&o2net_debug_lock);
+        sc = next_sc(dummy_sc);
+        if (sc != NULL) {
+                struct inet_sock *inet = NULL;
+                __be32 saddr = 0, daddr = 0;
+                __be16 sport = 0, dport = 0;
+                if (sc->sc_sock) {
+                        inet = inet_sk(sc->sc_sock->sk);
+                        /* the stack's structs aren't sparse endian clean */
+                        saddr = (__force __be32)inet->saddr;
+                        daddr = (__force __be32)inet->daddr;
+                        sport = (__force __be16)inet->sport;
+                        dport = (__force __be16)inet->dport;
+                }
+                /* XXX sigh, inet-> doesn't have sparse annotation so any
+                 * use of it here generates a warning with -Wbitwise */
+                seq_printf(seq, "%p:\n"
+                           "  krefs:           %d\n"
+                           "  sock:            %u.%u.%u.%u:%u -> "
+                                              "%u.%u.%u.%u:%u\n"
+                           "  remote node:     %s\n"
+                           "  page off:        %zu\n"
+                           "  handshake ok:    %u\n"
+                           "  timer:           %lu.%lu\n"
+                           "  data ready:      %lu.%lu\n"
+                           "  advance start:   %lu.%lu\n"
+                           "  advance stop:    %lu.%lu\n"
+                           "  func start:      %lu.%lu\n"
+                           "  func stop:       %lu.%lu\n"
+                           "  func key:        %u\n"
+                           "  func type:       %u\n",
+                           sc,
+                           atomic_read(&sc->sc_kref.refcount),
+                           NIPQUAD(saddr), inet ? ntohs(sport) : 0,
+                           NIPQUAD(daddr), inet ? ntohs(dport) : 0,
+                           sc->sc_node->nd_name,
+                           sc->sc_page_off,
+                           sc->sc_handshake_ok,
+                           TV_SEC_USEC(sc->sc_tv_timer),
+                           TV_SEC_USEC(sc->sc_tv_data_ready),
+                           TV_SEC_USEC(sc->sc_tv_advance_start),
+                           TV_SEC_USEC(sc->sc_tv_advance_stop),
+                           TV_SEC_USEC(sc->sc_tv_func_start),
+                           TV_SEC_USEC(sc->sc_tv_func_stop),
+                           sc->sc_msg_key,
+                           sc->sc_msg_type);
+        }
+        spin_unlock(&o2net_debug_lock);
+        return 0;
+}
+/*
+ * seq_file ->stop: nothing to undo, since start/next/show each take and
+ * drop o2net_debug_lock themselves.
+ */
+static void sc_seq_stop(struct seq_file *seq, void *v)
+{
+}
+/* seq_file iterator callbacks for the sock container list. */
+static struct seq_operations sc_seq_ops = {
+        .start = sc_seq_start,
+        .next = sc_seq_next,
+        .stop = sc_seq_stop,
+        .show = sc_seq_show,
+};
+/*
+ * open() handler for the sock container file.
+ *
+ * Allocates a cursor entry for this open file, marks it as a cursor by
+ * setting sc_page to NULL, opens the seq_file, stores the cursor in
+ * seq->private and links it onto the sock container list.
+ *
+ * Returns 0 on success, -ENOMEM if the cursor cannot be allocated, or the
+ * error from seq_open().
+ */
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+        struct o2net_sock_container *dummy_sc;
+        struct seq_file *seq;
+        int ret;
+        dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
+        if (dummy_sc == NULL) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        dummy_sc->sc_page = NULL;
+        ret = seq_open(file, &sc_seq_ops);
+        if (ret)
+                goto out;
+        seq = file->private_data;
+        seq->private = dummy_sc;
+        o2net_debug_add_sc(dummy_sc);
+        /* ownership has moved to seq->private; make the kfree below a no-op */
+        dummy_sc = NULL;
+out:
+        kfree(dummy_sc);
+        return ret;
+}
+/*
+ * release() handler for the sock container file: unlink this file's cursor
+ * from the list, then hand off to seq_release_private(), which frees
+ * seq->private (the cursor) together with the seq_file.
+ */
+static int sc_fop_release(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq = file->private_data;
+        struct o2net_sock_container *dummy_sc = seq->private;
+        o2net_debug_del_sc(dummy_sc);
+        return seq_release_private(inode, file);
+}
+/* File operations for the debugfs file named by SC_DEBUG_NAME. */
+static struct file_operations sc_seq_fops = {
+        .open = sc_fop_open,
+        .read = seq_read,
+        .llseek = seq_lseek,
+        .release = sc_fop_release,
+};
+/*
+ * Create the O2NET_DEBUG_DIR debugfs directory and, inside it, the
+ * owner-readable files NST_DEBUG_NAME and SC_DEBUG_NAME.
+ *
+ * Returns 0 on success or -ENOMEM if any entry could not be created.  On
+ * failure every entry created so far is removed and its cached dentry
+ * pointer is reset to NULL, so that a later o2net_debugfs_exit() or a
+ * repeated call to this function never removes a stale dentry twice.
+ */
+int o2net_debugfs_init(void)
+{
+        o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+        if (!o2net_dentry) {
+                mlog_errno(-ENOMEM);
+                goto bail;
+        }
+        nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
+                                         o2net_dentry, NULL,
+                                         &nst_seq_fops);
+        if (!nst_dentry) {
+                mlog_errno(-ENOMEM);
+                goto bail;
+        }
+        sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
+                                        o2net_dentry, NULL,
+                                        &sc_seq_fops);
+        if (!sc_dentry) {
+                mlog_errno(-ENOMEM);
+                goto bail;
+        }
+        return 0;
+bail:
+        /* tear down in reverse order of creation: files, then directory */
+        if (sc_dentry) {
+                debugfs_remove(sc_dentry);
+                sc_dentry = NULL;
+        }
+        if (nst_dentry) {
+                debugfs_remove(nst_dentry);
+                nst_dentry = NULL;
+        }
+        if (o2net_dentry) {
+                debugfs_remove(o2net_dentry);
+                o2net_dentry = NULL;
+        }
+        return -ENOMEM;
+}
+/*
+ * Remove whichever o2net debugfs entries exist: the two files first, then
+ * the directory that contains them.  Entries whose dentry pointer is NULL
+ * are skipped.
+ */
+void o2net_debugfs_exit(void)
+{
+        if (sc_dentry)
+                debugfs_remove(sc_dentry);
+        if (nst_dentry)
+                debugfs_remove(nst_dentry);
+        if (o2net_dentry)
+                debugfs_remove(o2net_dentry);
+}
+#endif  /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 709fba25bf7e..cf9401e8cd0b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -959,7 +959,10 @@ static int __init init_o2nm(void)
        cluster_print_version();
        o2hb_init();
-        o2net_init();
+        ret = o2net_init();
+        if (ret)
+                goto out;
        ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
        if (!ocfs2_table_header) {
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 0c095ce7723d..98429fd68499 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,6 +57,7 @@ static struct kset *o2cb_kset;
 void o2cb_sys_shutdown(void)
 {
        mlog_sys_shutdown();
+        sysfs_remove_link(NULL, "o2cb");
        kset_unregister(o2cb_kset);
 }
@@ -68,6 +69,14 @@ int o2cb_sys_init(void)
        if (!o2cb_kset)
                return -ENOMEM;
+        /*
+         * Create this symlink for backwards compatibility with old
+         * versions of ocfs2-tools which look for things in /sys/o2cb.
+         */
+        ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
+        if (ret)
+                goto error;
        ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
        if (ret)
                goto error;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index ee50c9610e7f..1e44ad14881a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
-/*
+static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
- * FIXME: These should use to_o2nm_cluster_from_node(), but we end up
+                           u32 msgkey, struct task_struct *task, u8 node)
- * losing our parent link to the cluster during shutdown. This can be
+{
- * solved by adding a pre-removal callback to configfs, or passing
+#ifdef CONFIG_DEBUG_FS
- * around the cluster with the node. -jeffm
+        INIT_LIST_HEAD(&nst->st_net_debug_item);
- */
+        nst->st_task = task;
-static inline int o2net_reconnect_delay(struct o2nm_node *node)
+        nst->st_msg_type = msgtype;
+        nst->st_msg_key = msgkey;
+        nst->st_node = node;
+#endif
+}
+/*
+ * Stamp @nst->st_sock_time with the current time of day.  Does nothing
+ * when CONFIG_DEBUG_FS is not set.
+ */
+static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+#ifdef CONFIG_DEBUG_FS
+        do_gettimeofday(&nst->st_sock_time);
+#endif
+}
+/*
+ * Stamp @nst->st_send_time with the current time of day.  Does nothing
+ * when CONFIG_DEBUG_FS is not set.
+ */
+static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+#ifdef CONFIG_DEBUG_FS
+        do_gettimeofday(&nst->st_send_time);
+#endif
+}
+/*
+ * Stamp @nst->st_status_time with the current time of day.  Does nothing
+ * when CONFIG_DEBUG_FS is not set.
+ */
+static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+#ifdef CONFIG_DEBUG_FS
+        do_gettimeofday(&nst->st_status_time);
+#endif
+}
+/*
+ * Record @sc in @nst->st_sc.  Does nothing when CONFIG_DEBUG_FS is not
+ * set.
+ */
+static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+                                         struct o2net_sock_container *sc)
+{
+#ifdef CONFIG_DEBUG_FS
+        nst->st_sc = sc;
+#endif
+}
+/*
+ * Record @msg_id in @nst->st_id.  Does nothing when CONFIG_DEBUG_FS is not
+ * set.
+ */
+static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+{
+#ifdef CONFIG_DEBUG_FS
+        nst->st_id = msg_id;
+#endif
+}
+static inline int o2net_reconnect_delay(void)
 {
        return o2nm_single_cluster->cl_reconnect_delay_ms;
 }
-static inline int o2net_keepalive_delay(struct o2nm_node *node)
+static inline int o2net_keepalive_delay(void)
 {
        return o2nm_single_cluster->cl_keepalive_delay_ms;
 }
-static inline int o2net_idle_timeout(struct o2nm_node *node)
+static inline int o2net_idle_timeout(void)
 {
        return o2nm_single_cluster->cl_idle_timeout_ms;
 }
@@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref)
        o2nm_node_put(sc->sc_node);
        sc->sc_node = NULL;
+        o2net_debug_del_sc(sc);
        kfree(sc);
 }
@@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
        ret = sc;
        sc->sc_page = page;
+        o2net_debug_add_sc(sc);
        sc = NULL;
        page = NULL;
@@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
        mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
        mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
-        /* we won't reconnect after our valid conn goes away for
-         * this hb iteration.. here so it shows up in the logs */
        if (was_valid && !valid && err == 0)
                err = -ENOTCONN;
@@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
        if (!was_valid && valid) {
                o2quo_conn_up(o2net_num_from_nn(nn));
-                /* this is a bit of a hack.  we only try reconnecting
-                 * when heartbeating starts until we get a connection.
-                 * if that connection then dies we don't try reconnecting.
-                 * the only way to start connecting again is to down
-                 * heartbeat and bring it back up. */
                cancel_delayed_work(&nn->nn_connect_expired);
                printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
                       o2nm_this_node() > sc->sc_node->nd_num ?
@@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn,
                /* delay if we're withing a RECONNECT_DELAY of the
                 * last attempt */
                delay = (nn->nn_last_connect_attempt +
-                         msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node)))
+                         msecs_to_jiffies(o2net_reconnect_delay()))
                        - jiffies;
-                if (delay > msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node)))
+                if (delay > msecs_to_jiffies(o2net_reconnect_delay()))
                        delay = 0;
                mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
                queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+                /*
+                 * Delay the expired work after idle timeout.
+                 *
+                 * We might have lots of failed connection attempts that run
+                 * through here but we only cancel the connect_expired work when
+                 * a connection attempt succeeds.  So only the first enqueue of
+                 * the connect_expired work will do anything.  The rest will see
+                 * that it's already queued and do nothing.
+                 */
+                delay += msecs_to_jiffies(o2net_idle_timeout());
+                queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
        }
        /* keep track of the nn's sc ref for the caller */
@@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        struct o2net_status_wait nsw = {
                .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
        };
+        struct o2net_send_tracking nst;
+        o2net_init_nst(&nst, msg_type, key, current, target_node);
        if (o2net_wq == NULL) {
                mlog(0, "attempt to tx without o2netd running\n");
@@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
                goto out;
        }
+        o2net_debug_add_nst(&nst);
+        o2net_set_nst_sock_time(&nst);
        ret = wait_event_interruptible(nn->nn_sc_wq,
                                       o2net_tx_can_proceed(nn, &sc, &error));
        if (!ret && error)
@@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        if (ret)
                goto out;
+        o2net_set_nst_sock_container(&nst, sc);
        veclen = caller_veclen + 1;
        vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
        if (vec == NULL) {
@@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
                goto out;
        msg->msg_num = cpu_to_be32(nsw.ns_id);
+        o2net_set_nst_msg_id(&nst, nsw.ns_id);
+        o2net_set_nst_send_time(&nst);
        /* finally, convert the message header to network byte-order
         * and send */
@@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        }
        /* wait on other node's handler */
+        o2net_set_nst_status_time(&nst);
        wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
        /* Note that we avoid overwriting the callers status return
@@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        mlog(0, "woken, returning system status %d, user status %d\n",
             ret, nsw.ns_status);
 out:
+        o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
        if (sc)
                sc_put(sc);
        if (vec)
@@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
         * but isn't. This can ultimately cause corruption.
         */
        if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
-                                o2net_idle_timeout(sc->sc_node)) {
+                                o2net_idle_timeout()) {
                mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
                     "%u ms, but we use %u ms locally.  disconnecting\n",
                     SC_NODEF_ARGS(sc),
                     be32_to_cpu(hand->o2net_idle_timeout_ms),
-                     o2net_idle_timeout(sc->sc_node));
+                     o2net_idle_timeout());
                o2net_ensure_shutdown(nn, sc, -ENOTCONN);
                return -1;
        }
        if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
-                        o2net_keepalive_delay(sc->sc_node)) {
+                        o2net_keepalive_delay()) {
                mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
                     "%u ms, but we use %u ms locally.  disconnecting\n",
                     SC_NODEF_ARGS(sc),
                     be32_to_cpu(hand->o2net_keepalive_delay_ms),
-                     o2net_keepalive_delay(sc->sc_node));
+                     o2net_keepalive_delay());
                o2net_ensure_shutdown(nn, sc, -ENOTCONN);
                return -1;
        }
@@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
         * shut down already */
        if (nn->nn_sc == sc) {
                o2net_sc_reset_idle_timer(sc);
+                atomic_set(&nn->nn_timeout, 0);
                o2net_set_nn_state(nn, sc, 1, 0);
        }
        spin_unlock(&nn->nn_lock);
@@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void)
 {
        o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
                O2HB_MAX_WRITE_TIMEOUT_MS);
-        o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(
+        o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout());
-                o2net_idle_timeout(NULL));
        o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
-                o2net_keepalive_delay(NULL));
+                o2net_keepalive_delay());
        o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
-                o2net_reconnect_delay(NULL));
+                o2net_reconnect_delay());
 }
 /* ------------------------------------------------------------ */
@@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
 static void o2net_idle_timer(unsigned long data)
 {
        struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+        struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
        struct timeval now;
        do_gettimeofday(&now);
        printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
             "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
-                     o2net_idle_timeout(sc->sc_node) / 1000,
+                     o2net_idle_timeout() / 1000,
-                     o2net_idle_timeout(sc->sc_node) % 1000);
+                     o2net_idle_timeout() % 1000);
        mlog(ML_NOTICE, "here are some times that might help debug the "
             "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
             "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
@@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data)
             sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
             sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
+        /*
+         * Initialize the nn_timeout so that the next connection attempt
+         * will continue in o2net_start_connect.
+         */
+        atomic_set(&nn->nn_timeout, 1);
        o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
@@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
 {
        o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
        o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
-                      msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node)));
+                      msecs_to_jiffies(o2net_keepalive_delay()));
        do_gettimeofday(&sc->sc_tv_timer);
        mod_timer(&sc->sc_idle_timeout,
-               jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node)));
+               jiffies + msecs_to_jiffies(o2net_idle_timeout()));
 }
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
@@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work)
        struct socket *sock = NULL;
        struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
        int ret = 0, stop;
+        unsigned int timeout;
        /* if we're greater we initiate tx, otherwise we accept */
        if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work)
        }
        spin_lock(&nn->nn_lock);
-        /* see if we already have one pending or have given up */
+        /*
-        stop = (nn->nn_sc || nn->nn_persistent_error);
+         * see if we already have one pending or have given up.
+         * For nn_timeout, it is set when we close the connection
+         * because of the idle time out. So it means that we have
+         * at least connected to that node successfully once,
+         * now try to connect to it again.
+         */
+        timeout = atomic_read(&nn->nn_timeout);
+        stop = (nn->nn_sc ||
+                (nn->nn_persistent_error &&
+                (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
        spin_unlock(&nn->nn_lock);
        if (stop)
                goto out;
@@ -1552,12 +1632,11 @@ static void o2net_connect_expired(struct work_struct *work)
        spin_lock(&nn->nn_lock);
        if (!nn->nn_sc_valid) {
-                struct o2nm_node *node = nn->nn_sc->sc_node;
                mlog(ML_ERROR, "no connection established with node %u after "
                     "%u.%u seconds, giving up and returning errors.\n",
                     o2net_num_from_nn(nn),
-                     o2net_idle_timeout(node) / 1000,
+                     o2net_idle_timeout() / 1000,
-                     o2net_idle_timeout(node) % 1000);
+                     o2net_idle_timeout() % 1000);
                o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
        }
@@ -1580,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
        /* don't reconnect until it's heartbeating again */
        spin_lock(&nn->nn_lock);
+        atomic_set(&nn->nn_timeout, 0);
        o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
        spin_unlock(&nn->nn_lock);
@@ -1611,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
        /* ensure an immediate connect attempt */
        nn->nn_last_connect_attempt = jiffies -
-                (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
+                (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
        if (node_num != o2nm_this_node()) {
-                /* heartbeat doesn't work unless a local node number is
-                 * configured and doing so brings up the o2net_wq, so we can
-                 * use it.. */
-                queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
-                                   msecs_to_jiffies(o2net_idle_timeout(node)));
                /* believe it or not, accept and node hearbeating testing
                 * can succeed for this node before we got here.. so
                 * only use set_nn_state to clear the persistent error
                 * if that hasn't already happened */
                spin_lock(&nn->nn_lock);
+                atomic_set(&nn->nn_timeout, 0);
                if (nn->nn_persistent_error)
                        o2net_set_nn_state(nn, NULL, 0, 0);
                spin_unlock(&nn->nn_lock);
@@ -1748,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock)
        new_sock = NULL;
        spin_lock(&nn->nn_lock);
+        atomic_set(&nn->nn_timeout, 0);
        o2net_set_nn_state(nn, sc, 0, 0);
        spin_unlock(&nn->nn_lock);
@@ -1923,6 +1999,9 @@ int o2net_init(void)
        o2quo_init();
+        if (o2net_debugfs_init())
+                return -ENOMEM;
        o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
        o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
        o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1942,6 +2021,7 @@ int o2net_init(void)
        for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
                struct o2net_node *nn = o2net_nn_from_num(i);
+                atomic_set(&nn->nn_timeout, 0);
                spin_lock_init(&nn->nn_lock);
                INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
                INIT_DELAYED_WORK(&nn->nn_connect_expired,
@@ -1963,4 +2043,5 @@ void o2net_exit(void)
        kfree(o2net_hand);
        kfree(o2net_keep_req);
        kfree(o2net_keep_resp);
+        o2net_debugfs_exit();
 }
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index f36f66aab3dd..a705d5d19036 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -117,4 +117,36 @@ int o2net_num_connected_peers(void);
 int o2net_init(void);
 void o2net_exit(void);
+struct o2net_send_tracking;
+struct o2net_sock_container;
+#ifdef CONFIG_DEBUG_FS
+int o2net_debugfs_init(void);
+void o2net_debugfs_exit(void);
+void o2net_debug_add_nst(struct o2net_send_tracking *nst);
+void o2net_debug_del_nst(struct o2net_send_tracking *nst);
+void o2net_debug_add_sc(struct o2net_sock_container *sc);
+void o2net_debug_del_sc(struct o2net_sock_container *sc);
+#else
+static int o2net_debugfs_init(void)
+{
+        return 0;
+}
+static void o2net_debugfs_exit(void)
+{
+}
+static void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+}
+static void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+}
+static void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+}
+static void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+}
+#endif  /* CONFIG_DEBUG_FS */
 #endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index d25b9af28500..8d58cfe410b1 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -95,6 +95,8 @@ struct o2net_node {
        unsigned                        nn_sc_valid:1;
        /* if this is set tx just returns it */
        int                             nn_persistent_error;
+        /* It is only set to 1 after the idle time out. */
+        atomic_t                        nn_timeout;
        /* threads waiting for an sc to arrive wait on the wq for generation
         * to increase.  it is increased when a connecting socket succeeds
@@ -164,7 +166,9 @@ struct o2net_sock_container {
        /* original handlers for the sockets */
        void                    (*sc_state_change)(struct sock *sk);
        void                    (*sc_data_ready)(struct sock *sk, int bytes);
+#ifdef CONFIG_DEBUG_FS
+        struct list_head        sc_net_debug_item;
+#endif
        struct timeval          sc_tv_timer;
        struct timeval          sc_tv_data_ready;
        struct timeval          sc_tv_advance_start;
@@ -206,4 +210,24 @@ struct o2net_status_wait {
        struct list_head        ns_node_item;
 };
+#ifdef CONFIG_DEBUG_FS
+/* just for state dumps */
+struct o2net_send_tracking {
+        struct list_head                st_net_debug_item;
+        struct task_struct              *st_task;
+        struct o2net_sock_container     *st_sc;
+        u32                             st_id;
+        u32                             st_msg_type;
+        u32                             st_msg_key;
+        u8                              st_node;
+        struct timeval                  st_sock_time;
+        struct timeval                  st_send_time;
+        struct timeval                  st_status_time;
+};
+#else
+struct o2net_send_tracking {
+        u32     dummy;
+};
+#endif  /* CONFIG_DEBUG_FS */
 #endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index e280833ceb9a..8a1875848080 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -390,9 +390,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
                                goto bail;
                        }
                        if (pde)
-                                pde->rec_len =
+                                le16_add_cpu(&pde->rec_len,
-                                        cpu_to_le16(le16_to_cpu(pde->rec_len) +
+                                                le16_to_cpu(de->rec_len));
-                                                    le16_to_cpu(de->rec_len));
                        else
                                de->inode = 0;
                        dir->i_version++;
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index ce3f7c29d270..190361375700 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,6 @@
 EXTRA_CFLAGS += -Ifs/ocfs2
-obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
        dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 9843ee17ea27..d5a86fb81a49 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -49,6 +49,41 @@
 /* Intended to make it easier for us to switch out hash functions */
 #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
+enum dlm_mle_type {
+        DLM_MLE_BLOCK,
+        DLM_MLE_MASTER,
+        DLM_MLE_MIGRATION
+};
+struct dlm_lock_name {
+        u8 len;
+        u8 name[DLM_LOCKID_NAME_MAX];
+};
+struct dlm_master_list_entry {
+        struct list_head list;
+        struct list_head hb_events;
+        struct dlm_ctxt *dlm;
+        spinlock_t spinlock;
+        wait_queue_head_t wq;
+        atomic_t woken;
+        struct kref mle_refs;
+        int inuse;
+        unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+        unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+        unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+        unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+        u8 master;
+        u8 new_master;
+        enum dlm_mle_type type;
+        struct o2hb_callback_func mle_hb_up;
+        struct o2hb_callback_func mle_hb_down;
+        union {
+                struct dlm_lock_resource *res;
+                struct dlm_lock_name name;
+        } u;
+};
 enum dlm_ast_type {
        DLM_AST = 0,
        DLM_BAST,
@@ -101,6 +136,7 @@ struct dlm_ctxt
        struct list_head purge_list;
        struct list_head pending_asts;
        struct list_head pending_basts;
+        struct list_head tracking_list;
        unsigned int purge_count;
        spinlock_t spinlock;
        spinlock_t ast_lock;
@@ -122,6 +158,9 @@ struct dlm_ctxt
        atomic_t remote_resources;
        atomic_t unknown_resources;
+        struct dlm_debug_ctxt *dlm_debug_ctxt;
+        struct dentry *dlm_debugfs_subroot;
        /* NOTE: Next three are protected by dlm_domain_lock */
        struct kref dlm_refs;
        enum dlm_ctxt_state dlm_state;
@@ -176,6 +215,7 @@ struct dlm_mig_lockres_priv
 {
        struct dlm_lock_resource *lockres;
        u8 real_master;
+        u8 extra_ref;
 };
 struct dlm_assert_master_priv
@@ -269,6 +309,9 @@ struct dlm_lock_resource
        struct list_head dirty;
        struct list_head recovering; // dlm_recovery_ctxt.resources list
+        /* Added during init and removed during release */
+        struct list_head tracking;      /* dlm->tracking_list */
        /* unused lock resources have their last_used stamped and are
         * put on a list for the dlm thread to run. */
        unsigned long    last_used;
@@ -602,17 +645,19 @@ enum dlm_query_join_response_code {
        JOIN_PROTOCOL_MISMATCH,
 };
+struct dlm_query_join_packet {
+        u8 code;        /* Response code.  dlm_minor and fs_minor
+                           are only valid if this is JOIN_OK */
+        u8 dlm_minor;   /* The minor version of the protocol the
+                           dlm is speaking. */
+        u8 fs_minor;    /* The minor version of the protocol the
+                           filesystem is speaking. */
+        u8 reserved;
+};
 union dlm_query_join_response {
        u32 intval;
-        struct {
+        struct dlm_query_join_packet packet;
-                u8 code;        /* Response code.  dlm_minor and fs_minor
-                                   are only valid if this is JOIN_OK */
-                u8 dlm_minor;   /* The minor version of the protocol the
-                                   dlm is speaking. */
-                u8 fs_minor;    /* The minor version of the protocol the
-                                   filesystem is speaking. */
-                u8 reserved;
-        } packet;
 };
 struct dlm_lock_request
@@ -960,9 +1005,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
                                          DLM_LOCK_RES_MIGRATING));
 }
+/* create/destroy slab caches */
+int dlm_init_master_caches(void);
+void dlm_destroy_master_caches(void);
+int dlm_init_lock_cache(void);
+void dlm_destroy_lock_cache(void);
 int dlm_init_mle_cache(void);
 void dlm_destroy_mle_cache(void);
 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
 int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
                         struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index ecb4d997221e..75997b4deaf3 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -487,7 +487,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
                               "cookie=%u:%llu\n",
                     dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)),
                     dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie)));
-                __dlm_print_one_lock_resource(res);
+                dlm_print_one_lock_resource(res);
                goto leave;
        }
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 64239b37e5d4..5f6d858770a2 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -5,7 +5,7 @@
 *
 * debug functionality for the dlm
 *
- * Copyright (C) 2004 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
@@ -30,6 +30,7 @@
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
 #include <linux/spinlock.h>
+#include <linux/debugfs.h>
 #include "cluster/heartbeat.h"
 #include "cluster/nodemanager.h"
@@ -37,17 +38,16 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
 #include "dlmdomain.h"
+#include "dlmdebug.h"
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
+int stringify_lockname(const char *lockname, int locklen, char *buf, int len);
 void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 {
-        mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
-               res->lockname.len, res->lockname.name,
-               res->owner, res->state);
        spin_lock(&res->spinlock);
        __dlm_print_one_lock_resource(res);
        spin_unlock(&res->spinlock);
@@ -58,7 +58,7 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
        int bit;
        assert_spin_locked(&res->spinlock);
-        mlog(ML_NOTICE, "  refmap nodes: [ ");
+        printk("  refmap nodes: [ ");
        bit = 0;
        while (1) {
                bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
@@ -70,63 +70,66 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
        printk("], inflight=%u\n", res->inflight_locks);
 }
+static void __dlm_print_lock(struct dlm_lock *lock)
+{
+        spin_lock(&lock->spinlock);
+        printk("    type=%d, conv=%d, node=%u, cookie=%u:%llu, "
+               "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
+               "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
+               lock->ml.type, lock->ml.convert_type, lock->ml.node,
+               dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+               dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+               atomic_read(&lock->lock_refs.refcount),
+               (list_empty(&lock->ast_list) ? 'y' : 'n'),
+               (lock->ast_pending ? 'y' : 'n'),
+               (list_empty(&lock->bast_list) ? 'y' : 'n'),
+               (lock->bast_pending ? 'y' : 'n'),
+               (lock->convert_pending ? 'y' : 'n'),
+               (lock->lock_pending ? 'y' : 'n'),
+               (lock->cancel_pending ? 'y' : 'n'),
+               (lock->unlock_pending ? 'y' : 'n'));
+        spin_unlock(&lock->spinlock);
+}
 void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 {
        struct list_head *iter2;
        struct dlm_lock *lock;
+        char buf[DLM_LOCKID_NAME_MAX];
        assert_spin_locked(&res->spinlock);
-        mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
+        stringify_lockname(res->lockname.name, res->lockname.len,
-               res->lockname.len, res->lockname.name,
+                           buf, sizeof(buf) - 1);
-               res->owner, res->state);
+        printk("lockres: %s, owner=%u, state=%u\n",
-        mlog(ML_NOTICE, "  last used: %lu, on purge list: %s\n",
+               buf, res->owner, res->state);
-             res->last_used, list_empty(&res->purge) ? "no" : "yes");
+        printk("  last used: %lu, refcnt: %u, on purge list: %s\n",
+               res->last_used, atomic_read(&res->refs.refcount),
+               list_empty(&res->purge) ? "no" : "yes");
+        printk("  on dirty list: %s, on reco list: %s, "
+               "migrating pending: %s\n",
+               list_empty(&res->dirty) ? "no" : "yes",
+               list_empty(&res->recovering) ? "no" : "yes",
+               res->migration_pending ? "yes" : "no");
+        printk("  inflight locks: %d, asts reserved: %d\n",
+               res->inflight_locks, atomic_read(&res->asts_reserved));
        dlm_print_lockres_refmap(res);
-        mlog(ML_NOTICE, "  granted queue: \n");
+        printk("  granted queue:\n");
        list_for_each(iter2, &res->granted) {
                lock = list_entry(iter2, struct dlm_lock, list);
-                spin_lock(&lock->spinlock);
+                __dlm_print_lock(lock);
-                mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-                       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-                       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-                       list_empty(&lock->ast_list) ? 'y' : 'n',
-                       lock->ast_pending ? 'y' : 'n',
-                       list_empty(&lock->bast_list) ? 'y' : 'n',
-                       lock->bast_pending ? 'y' : 'n');
-                spin_unlock(&lock->spinlock);
        }
-        mlog(ML_NOTICE, "  converting queue: \n");
+        printk("  converting queue:\n");
        list_for_each(iter2, &res->converting) {
                lock = list_entry(iter2, struct dlm_lock, list);
-                spin_lock(&lock->spinlock);
+                __dlm_print_lock(lock);
-                mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-                       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-                       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-                       list_empty(&lock->ast_list) ? 'y' : 'n',
-                       lock->ast_pending ? 'y' : 'n',
-                       list_empty(&lock->bast_list) ? 'y' : 'n',
-                       lock->bast_pending ? 'y' : 'n');
-                spin_unlock(&lock->spinlock);
        }
-        mlog(ML_NOTICE, "  blocked queue: \n");
+        printk("  blocked queue:\n");
        list_for_each(iter2, &res->blocked) {
                lock = list_entry(iter2, struct dlm_lock, list);
-                spin_lock(&lock->spinlock);
+                __dlm_print_lock(lock);
-                mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-                       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-                       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-                     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-                       list_empty(&lock->ast_list) ? 'y' : 'n',
-                       lock->ast_pending ? 'y' : 'n',
-                       list_empty(&lock->bast_list) ? 'y' : 'n',
-                       lock->bast_pending ? 'y' : 'n');
-                spin_unlock(&lock->spinlock);
        }
 }
@@ -136,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
 }
 EXPORT_SYMBOL_GPL(dlm_print_one_lock);
-#if 0
-void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
-{
-        struct dlm_lock_resource *res;
-        struct hlist_node *iter;
-        struct hlist_head *bucket;
-        int i;
-        mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
-                  dlm->name, dlm->node_num, dlm->key);
-        if (!dlm || !dlm->name) {
-                mlog(ML_ERROR, "dlm=%p\n", dlm);
-                return;
-        }
-        spin_lock(&dlm->spinlock);
-        for (i=0; i<DLM_HASH_BUCKETS; i++) {
-                bucket = dlm_lockres_hash(dlm, i);
-                hlist_for_each_entry(res, iter, bucket, hash_node)
-                        dlm_print_one_lock_resource(res);
-        }
-        spin_unlock(&dlm->spinlock);
-}
-#endif  /*  0  */
 static const char *dlm_errnames[] = {
        [DLM_NORMAL] =                  "DLM_NORMAL",
        [DLM_GRANTED] =                 "DLM_GRANTED",
@@ -266,3 +244,792 @@ const char *dlm_errname(enum dlm_status err)
        return dlm_errnames[err];
 }
 EXPORT_SYMBOL_GPL(dlm_errname);
+/* NOTE: This function converts a lockname into a string. It uses knowledge
+ * of the format of the lockname that should be outside the purview of the dlm.
+ * We are adding it only to make dlm debugging slightly easier.
+ *
+ * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
+ *
+ * A name starting with 'N' is printed as its first
+ * OCFS2_DENTRY_LOCK_INO_START - 1 characters followed by the big-endian
+ * 64-bit value stored at offset OCFS2_DENTRY_LOCK_INO_START, shown as eight
+ * hex digits (the value is narrowed to unsigned int).  Any other name is
+ * printed verbatim, up to locklen characters.
+ *
+ * Returns the number of characters actually stored in buf (not counting the
+ * trailing NUL), so callers may add the result to a running buffer offset
+ * without that offset ever passing len.
+ */
+int stringify_lockname(const char *lockname, int locklen, char *buf, int len)
+{
+        int out = 0;
+        __be64 inode_blkno_be;
+#define OCFS2_DENTRY_LOCK_INO_START     18
+        if (*lockname == 'N') {
+                /* copy out the binary block number; it may be unaligned */
+                memcpy(&inode_blkno_be,
+                       &lockname[OCFS2_DENTRY_LOCK_INO_START],
+                       sizeof(__be64));
+                /* scnprintf reports what was written, not what was wanted */
+                out += scnprintf(buf + out, len - out, "%.*s%08x",
+                                 OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
+                                 (unsigned int)be64_to_cpu(inode_blkno_be));
+        } else
+                out += scnprintf(buf + out, len - out, "%.*s",
+                                 locklen, lockname);
+        return out;
+}
+/*
+ * Append the index of every bit set in nodemap (bits 0 .. maxnodes - 1) to
+ * buf as decimal numbers, each followed by a space.
+ *
+ * Returns the number of characters actually stored in buf, so the result is
+ * always a valid offset even when the output had to be truncated.
+ */
+static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
+                             char *buf, int len)
+{
+        int out = 0;
+        int i = -1;
+        while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
+                out += scnprintf(buf + out, len - out, "%d ", i);
+        return out;
+}
+/*
+ * Format one master list entry into buf: the lock name, a tab-separated
+ * summary line (type, master, new master, whether hb_events is non-empty,
+ * inuse flag, refcount) and then the Maybe/Vote/Response/Node maps, one per
+ * line, followed by a blank line.
+ *
+ * Returns the number of characters actually stored in buf.  scnprintf is
+ * used throughout so the running offset can never move past len when the
+ * four node maps do not fit.
+ */
+static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
+{
+        int out = 0;
+        unsigned int namelen;
+        const char *name;
+        char *mle_type;
+        /* only a MASTER mle takes its name from the attached lock resource */
+        if (mle->type != DLM_MLE_MASTER) {
+                namelen = mle->u.name.len;
+                name = mle->u.name.name;
+        } else {
+                namelen = mle->u.res->lockname.len;
+                name = mle->u.res->lockname.name;
+        }
+        if (mle->type == DLM_MLE_BLOCK)
+                mle_type = "BLK";
+        else if (mle->type == DLM_MLE_MASTER)
+                mle_type = "MAS";
+        else
+                mle_type = "MIG";
+        out += stringify_lockname(name, namelen, buf + out, len - out);
+        out += scnprintf(buf + out, len - out,
+                         "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
+                         mle_type, mle->master, mle->new_master,
+                         !list_empty(&mle->hb_events),
+                         !!mle->inuse,
+                         atomic_read(&mle->mle_refs.refcount));
+        out += scnprintf(buf + out, len - out, "Maybe=");
+        out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
+                                 buf + out, len - out);
+        out += scnprintf(buf + out, len - out, "\n");
+        out += scnprintf(buf + out, len - out, "Vote=");
+        out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
+                                 buf + out, len - out);
+        out += scnprintf(buf + out, len - out, "\n");
+        out += scnprintf(buf + out, len - out, "Response=");
+        out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
+                                 buf + out, len - out);
+        out += scnprintf(buf + out, len - out, "\n");
+        out += scnprintf(buf + out, len - out, "Node=");
+        out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
+                                 buf + out, len - out);
+        out += scnprintf(buf + out, len - out, "\n");
+        out += scnprintf(buf + out, len - out, "\n");
+        return out;
+}
+/*
+ * Print a single master list entry to the kernel log.
+ *
+ * The entry is formatted into a temporary zeroed page by dump_mle() and
+ * then handed to printk.  If the page cannot be allocated nothing is
+ * printed.
+ */
+void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+{
+        char *buf;
+        buf = (char *) get_zeroed_page(GFP_NOFS);
+        if (buf) {
+                /* PAGE_SIZE - 1 keeps the last zeroed byte as terminator */
+                dump_mle(mle, buf, PAGE_SIZE - 1);
+                /* emit the text; without this the dump was thrown away */
+                printk("%s", buf);
+                free_page((unsigned long)buf);
+        }
+}
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *dlm_debugfs_root = NULL;
+#define DLM_DEBUGFS_DIR                         "o2dlm"
+#define DLM_DEBUGFS_DLM_STATE                   "dlm_state"
+#define DLM_DEBUGFS_LOCKING_STATE               "locking_state"
+#define DLM_DEBUGFS_MLE_STATE                   "mle_state"
+#define DLM_DEBUGFS_PURGE_LIST                  "purge_list"
+/* begin - utils funcs */
+/* kref release callback: free the dlm_debug_ctxt that embeds @kref. */
+static void dlm_debug_free(struct kref *kref)
+{
+        kfree(container_of(kref, struct dlm_debug_ctxt, debug_refcnt));
+}
+/* Drop one reference on @dc; a NULL @dc is silently ignored. */
+void dlm_debug_put(struct dlm_debug_ctxt *dc)
+{
+        if (!dc)
+                return;
+        kref_put(&dc->debug_refcnt, dlm_debug_free);
+}
+/* Take one additional reference on @dc, which must not be NULL. */
+static void dlm_debug_get(struct dlm_debug_ctxt *dc)
+{
+        struct kref *ref = &dc->debug_refcnt;
+        kref_get(ref);
+}
+/*
+ * Allocate a debug_buffer together with a PAGE_SIZE text area.
+ * Returns NULL if either allocation fails; nothing is leaked in that case.
+ */
+static struct debug_buffer *debug_buffer_allocate(void)
+{
+        struct debug_buffer *buffer;
+        buffer = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
+        if (!buffer)
+                return NULL;
+        buffer->len = PAGE_SIZE;
+        buffer->buf = kmalloc(buffer->len, GFP_KERNEL);
+        if (!buffer->buf) {
+                kfree(buffer);
+                return NULL;
+        }
+        return buffer;
+}
+/* read(): copy from the text snapshot stored in file->private_data. */
+static ssize_t debug_buffer_read(struct file *file, char __user *buf,
+                                 size_t nbytes, loff_t *ppos)
+{
+        struct debug_buffer *snapshot = file->private_data;
+        const void *text = snapshot->buf;
+        size_t avail = snapshot->len;
+        return simple_read_from_buffer(buf, nbytes, ppos, text, avail);
+}
+/*
+ * llseek(): only whence 0 (absolute) and 1 (relative to the current
+ * position) are accepted.  The resulting offset must lie within
+ * [0, db->len]; anything else yields -EINVAL and leaves f_pos untouched.
+ */
+static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
+{
+        struct debug_buffer *db = file->private_data;
+        loff_t target;
+        if (whence == 0)
+                target = off;
+        else if (whence == 1)
+                target = file->f_pos + off;
+        else
+                return -EINVAL;
+        if (target < 0 || target > db->len)
+                return -EINVAL;
+        file->f_pos = target;
+        return target;
+}
+/* release(): free the snapshot attached to the file, if there is one. */
+static int debug_buffer_release(struct inode *inode, struct file *file)
+{
+        struct debug_buffer *snapshot = file->private_data;
+        if (snapshot) {
+                kfree(snapshot->buf);
+                kfree(snapshot);
+        }
+        return 0;
+}
+/* end - util funcs */
+/* begin - purge list funcs */
+/*
+ * Fill db->buf with one line per lock resource on dlm->purge_list: the
+ * lock name, a tab, and (jiffies - last_used) / HZ.  A header line comes
+ * first and a "Total on list" line last.
+ *
+ * Once fewer than 100 bytes remain, further entries are still counted but
+ * no longer printed.  Returns the number of characters actually stored;
+ * scnprintf keeps that value below db->len, which matters because the
+ * caller stores it back into db->len for later reads.
+ */
+static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+        struct dlm_lock_resource *res;
+        int out = 0;
+        unsigned long total = 0;
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Dumping Purgelist for Domain: %s\n", dlm->name);
+        spin_lock(&dlm->spinlock);
+        list_for_each_entry(res, &dlm->purge_list, purge) {
+                ++total;
+                /* out of room: keep counting, stop printing */
+                if (db->len - out < 100)
+                        continue;
+                spin_lock(&res->spinlock);
+                out += stringify_lockname(res->lockname.name,
+                                          res->lockname.len,
+                                          db->buf + out, db->len - out);
+                out += scnprintf(db->buf + out, db->len - out, "\t%ld\n",
+                                 (jiffies - res->last_used)/HZ);
+                spin_unlock(&res->spinlock);
+        }
+        spin_unlock(&dlm->spinlock);
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Total on list: %lu\n", total);
+        return out;
+}
+/*
+ * open(): snapshot the purge list of the domain stored in i_private into a
+ * fresh buffer and attach it to the file.  Returns -ENOMEM if the buffer
+ * cannot be allocated.
+ */
+static int debug_purgelist_open(struct inode *inode, struct file *file)
+{
+        struct dlm_ctxt *dlm = inode->i_private;
+        struct debug_buffer *snapshot = debug_buffer_allocate();
+        if (!snapshot)
+                return -ENOMEM;
+        snapshot->len = debug_purgelist_print(dlm, snapshot);
+        file->private_data = snapshot;
+        return 0;
+}
+/* File operations for the purge_list debugfs file; never modified. */
+static const struct file_operations debug_purgelist_fops = {
+        .open =         debug_purgelist_open,
+        .release =      debug_buffer_release,
+        .read =         debug_buffer_read,
+        .llseek =       debug_buffer_llseek,
+};
+/* end - purge list funcs */
+/* begin - debug mle funcs */
+/*
+ * Fill db->buf with a dump of every entry on dlm->master_list, preceded by
+ * a header line and followed by a "Total on list" line.
+ *
+ * Once fewer than 200 bytes remain, further entries are still counted but
+ * no longer printed.  Returns the number of characters actually stored;
+ * scnprintf keeps that value below db->len, which matters because the
+ * caller stores it back into db->len for later reads.
+ */
+static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+        struct dlm_master_list_entry *mle;
+        int out = 0;
+        unsigned long total = 0;
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Dumping MLEs for Domain: %s\n", dlm->name);
+        spin_lock(&dlm->master_lock);
+        list_for_each_entry(mle, &dlm->master_list, list) {
+                ++total;
+                /* out of room: keep counting, stop printing */
+                if (db->len - out < 200)
+                        continue;
+                out += dump_mle(mle, db->buf + out, db->len - out);
+        }
+        spin_unlock(&dlm->master_lock);
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Total on list: %lu\n", total);
+        return out;
+}
+/*
+ * open(): snapshot the master list of the domain stored in i_private into
+ * a fresh buffer and attach it to the file.  Returns -ENOMEM if the buffer
+ * cannot be allocated.
+ */
+static int debug_mle_open(struct inode *inode, struct file *file)
+{
+        struct dlm_ctxt *dlm = inode->i_private;
+        struct debug_buffer *snapshot = debug_buffer_allocate();
+        if (!snapshot)
+                return -ENOMEM;
+        snapshot->len = debug_mle_print(dlm, snapshot);
+        file->private_data = snapshot;
+        return 0;
+}
+/* File operations for the mle_state debugfs file; never modified. */
+static const struct file_operations debug_mle_fops = {
+        .open =         debug_mle_open,
+        .release =      debug_buffer_release,
+        .read =         debug_buffer_read,
+        .llseek =       debug_buffer_llseek,
+};
+/* end - debug mle funcs */
+/* begin - debug lockres funcs */
+/*
+ * Format one lock as a single "LOCK:" record into buf while holding the
+ * lock's spinlock.  list_type is emitted as given by the caller
+ * (dump_lockres passes 0 for granted, 1 for converting, 2 for blocked).
+ *
+ * Returns the number of characters actually stored in buf, so callers can
+ * accumulate it as an offset without passing the end of the buffer.
+ */
+static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
+{
+        int out;
+#define DEBUG_LOCK_VERSION      1
+        spin_lock(&lock->spinlock);
+        out = scnprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+                       "%d,%d,%d,%d\n",
+                       DEBUG_LOCK_VERSION,
+                       list_type, lock->ml.type, lock->ml.convert_type,
+                       lock->ml.node,
+                       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+                       !list_empty(&lock->ast_list),
+                       !list_empty(&lock->bast_list),
+                       lock->ast_pending, lock->bast_pending,
+                       lock->convert_pending, lock->lock_pending,
+                       lock->cancel_pending, lock->unlock_pending,
+                       atomic_read(&lock->lock_refs.refcount));
+        spin_unlock(&lock->spinlock);
+        return out;
+}
+/*
+ * Format one lock resource into buf as a sequence of tagged lines:
+ * NAME, LRES (state summary), RMAP (refmap bits), LVBX (lvb bytes in hex),
+ * then one LOCK line per lock on the granted, converting and blocked
+ * lists, and finally a blank line.
+ *
+ * Returns the number of characters actually stored in buf.  A resource
+ * with many locks can exceed the buffer; scnprintf makes the output
+ * truncate cleanly instead of pushing the offset past len.
+ */
+static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
+{
+        struct dlm_lock *lock;
+        int i;
+        int out = 0;
+        out += scnprintf(buf + out, len - out, "NAME:");
+        out += stringify_lockname(res->lockname.name, res->lockname.len,
+                                  buf + out, len - out);
+        out += scnprintf(buf + out, len - out, "\n");
+#define DEBUG_LRES_VERSION      1
+        out += scnprintf(buf + out, len - out,
+                         "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
+                         DEBUG_LRES_VERSION,
+                         res->owner, res->state, res->last_used,
+                         !list_empty(&res->purge),
+                         !list_empty(&res->dirty),
+                         !list_empty(&res->recovering),
+                         res->inflight_locks, res->migration_pending,
+                         atomic_read(&res->asts_reserved),
+                         atomic_read(&res->refs.refcount));
+        /* refmap */
+        out += scnprintf(buf + out, len - out, "RMAP:");
+        out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
+                                 buf + out, len - out);
+        out += scnprintf(buf + out, len - out, "\n");
+        /* lvb */
+        out += scnprintf(buf + out, len - out, "LVBX:");
+        for (i = 0; i < DLM_LVB_LEN; i++)
+                out += scnprintf(buf + out, len - out,
+                                        "%02x", (unsigned char)res->lvb[i]);
+        out += scnprintf(buf + out, len - out, "\n");
+        /* granted */
+        list_for_each_entry(lock, &res->granted, list)
+                out += dump_lock(lock, 0, buf + out, len - out);
+        /* converting */
+        list_for_each_entry(lock, &res->converting, list)
+                out += dump_lock(lock, 1, buf + out, len - out);
+        /* blocked */
+        list_for_each_entry(lock, &res->blocked, list)
+                out += dump_lock(lock, 2, buf + out, len - out);
+        out += scnprintf(buf + out, len - out, "\n");
+        return out;
+}
+/*
+ * seq_file ->start hook: pick the next lock resource to show and format it
+ * into dl->dl_buf.
+ *
+ * The iteration position is kept in dl->dl_res (with a reference held)
+ * rather than in *pos, which this function does not read.  With no saved
+ * resource the first entry of dlm->tracking_list is used; otherwise the
+ * entry following the saved one.  Returns dl when a resource was dumped,
+ * or NULL when the list is empty or its end was reached.
+ *
+ * All work is done under dlm->spinlock; the chosen resource's own spinlock
+ * is held while it is dumped.  Both are released before returning.
+ */
+static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
+{
+        struct debug_lockres *dl = m->private;
+        struct dlm_ctxt *dlm = dl->dl_ctxt;
+        struct dlm_lock_resource *res = NULL;
+        spin_lock(&dlm->spinlock);
+        if (dl->dl_res) {
+                /* the loop body runs at most once: every path ends in break */
+                list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
+                        /* release the previously shown resource */
+                        if (dl->dl_res) {
+                                dlm_lockres_put(dl->dl_res);
+                                dl->dl_res = NULL;
+                        }
+                        /* "res" is really the list head: nothing left */
+                        if (&res->tracking == &dlm->tracking_list) {
+                                mlog(0, "End of list found, %p\n", res);
+                                dl = NULL;
+                                break;
+                        }
+                        /* remember the new position and pin it */
+                        dlm_lockres_get(res);
+                        dl->dl_res = res;
+                        break;
+                }
+                /*
+                 * NOTE(review): if the loop body never runs (the saved
+                 * resource's tracking node links only to itself), dl->dl_res
+                 * is left unchanged and is dumped again below -- confirm
+                 * whether that can happen.
+                 */
+        } else {
+                if (!list_empty(&dlm->tracking_list)) {
+                        /* break on the first iteration: res = first entry */
+                        list_for_each_entry(res, &dlm->tracking_list, tracking)
+                                break;
+                        dlm_lockres_get(res);
+                        dl->dl_res = res;
+                } else
+                        dl = NULL;
+        }
+        if (dl) {
+                spin_lock(&dl->dl_res->spinlock);
+                dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
+                spin_unlock(&dl->dl_res->spinlock);
+        }
+        spin_unlock(&dlm->spinlock);
+        return dl;
+}
+/*
+ * seq_file ->stop hook.  Intentionally empty: lockres_seq_start() releases
+ * every lock it takes before it returns, so there is nothing to undo here.
+ */
+static void lockres_seq_stop(struct seq_file *m, void *v)
+{
+}
+/*
+ * seq_file ->next hook.  Always returns NULL and leaves *pos untouched;
+ * advancing to the following lock resource is done by lockres_seq_start()
+ * through the position saved in the debug_lockres.
+ */
+static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+        return NULL;
+}
+/* seq_file ->show hook: emit the text prepared by lockres_seq_start(). */
+static int lockres_seq_show(struct seq_file *s, void *v)
+{
+        struct debug_lockres *cursor = v;
+        const char *text = cursor->dl_buf;
+        seq_printf(s, "%s", text);
+        return 0;
+}
+/* seq_file iterator for the locking_state debugfs file; never modified. */
+static const struct seq_operations debug_lockres_ops = {
+        .start =        lockres_seq_start,
+        .stop =         lockres_seq_stop,
+        .next =         lockres_seq_next,
+        .show =         lockres_seq_show,
+};
+/*
+ * open(): allocate the per-open iteration state (a debug_lockres plus a
+ * PAGE_SIZE text buffer), set up the seq_file, and grab the domain stored
+ * in i_private.  Returns 0 on success, -ENOMEM on allocation failure, or
+ * the error returned by seq_open(); all allocations are undone on failure.
+ */
+static int debug_lockres_open(struct inode *inode, struct file *file)
+{
+        struct dlm_ctxt *dlm = inode->i_private;
+        struct debug_lockres *dl;
+        struct seq_file *seq;
+        int ret = -ENOMEM;
+        dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
+        if (!dl) {
+                mlog_errno(ret);
+                return ret;
+        }
+        dl->dl_len = PAGE_SIZE;
+        dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
+        if (!dl->dl_buf) {
+                mlog_errno(ret);
+                goto free_dl;
+        }
+        ret = seq_open(file, &debug_lockres_ops);
+        if (ret) {
+                mlog_errno(ret);
+                goto free_buf;
+        }
+        /* seq_open() stored its seq_file in file->private_data */
+        seq = file->private_data;
+        seq->private = dl;
+        dlm_grab(dlm);
+        dl->dl_ctxt = dlm;
+        return 0;
+free_buf:
+        kfree(dl->dl_buf);
+free_dl:
+        kfree(dl);
+        return ret;
+}
+/*
+ * release(): drop the reference on the last shown lock resource (if any)
+ * and on the domain, free the text buffer, and let seq_release_private()
+ * dispose of the seq_file together with its private data.
+ */
+static int debug_lockres_release(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq = file->private_data;
+        struct debug_lockres *state = seq->private;
+        struct dlm_lock_resource *last = state->dl_res;
+        if (last)
+                dlm_lockres_put(last);
+        dlm_put(state->dl_ctxt);
+        kfree(state->dl_buf);
+        return seq_release_private(inode, file);
+}
+/* File operations for the locking_state debugfs file; never modified. */
+static const struct file_operations debug_lockres_fops = {
+        .open =         debug_lockres_open,
+        .release =      debug_lockres_release,
+        .read =         seq_read,
+        .llseek =       seq_lseek,
+};
+/* end - debug lockres funcs */
+/* begin - debug state funcs */
+/*
+ * Fill db->buf with a human-readable summary of the domain: name and key,
+ * thread pid, node number and state, join information, domain/live maps,
+ * resource counters, list occupancy, purge count and refcount, and the
+ * recovery state including one line per recovery node.
+ *
+ * The resource counters are sampled before dlm->spinlock is taken; the
+ * rest is read under that lock.  Returns the number of characters actually
+ * stored; scnprintf keeps that value below db->len, which matters because
+ * the caller stores it back into db->len for later reads.
+ */
+static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+        int out = 0;
+        struct dlm_reco_node_data *node;
+        char *state;
+        int lres, rres, ures, tres;
+        lres = atomic_read(&dlm->local_resources);
+        rres = atomic_read(&dlm->remote_resources);
+        ures = atomic_read(&dlm->unknown_resources);
+        tres = lres + rres + ures;
+        spin_lock(&dlm->spinlock);
+        switch (dlm->dlm_state) {
+        case DLM_CTXT_NEW:
+                state = "NEW"; break;
+        case DLM_CTXT_JOINED:
+                state = "JOINED"; break;
+        case DLM_CTXT_IN_SHUTDOWN:
+                state = "SHUTDOWN"; break;
+        case DLM_CTXT_LEAVING:
+                state = "LEAVING"; break;
+        default:
+                state = "UNKNOWN"; break;
+        }
+        /* Domain: xxxxxxxxxx  Key: 0xdfbac769 */
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Domain: %s  Key: 0x%08x\n", dlm->name, dlm->key);
+        /* Thread Pid: xxx  Node: xxx  State: xxxxx */
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Thread Pid: %d  Node: %d  State: %s\n",
+                         dlm->dlm_thread_task->pid, dlm->node_num, state);
+        /* Number of Joins: xxx  Joining Node: xxx */
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Number of Joins: %d  Joining Node: %d\n",
+                         dlm->num_joins, dlm->joining_node);
+        /* Domain Map: xx xx xx */
+        out += scnprintf(db->buf + out, db->len - out, "Domain Map: ");
+        out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
+                                 db->buf + out, db->len - out);
+        out += scnprintf(db->buf + out, db->len - out, "\n");
+        /* Live Map: xx xx xx */
+        out += scnprintf(db->buf + out, db->len - out, "Live Map: ");
+        out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
+                                 db->buf + out, db->len - out);
+        out += scnprintf(db->buf + out, db->len - out, "\n");
+        /* Mastered Resources Total: xxx  Locally: xxx  Remotely: ... */
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Mastered Resources Total: %d  Locally: %d  "
+                         "Remotely: %d  Unknown: %d\n",
+                         tres, lres, rres, ures);
+        /* Lists: Dirty=Empty  Purge=InUse  PendingASTs=Empty  ... */
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Lists: Dirty=%s  Purge=%s  PendingASTs=%s  "
+                         "PendingBASTs=%s  Master=%s\n",
+                         (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
+                         (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
+                         (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
+                         (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
+                         (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+        /* Purge Count: xxx  Refs: xxx */
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Purge Count: %d  Refs: %d\n", dlm->purge_count,
+                         atomic_read(&dlm->dlm_refs.refcount));
+        /* Dead Node: xxx */
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Dead Node: %d\n", dlm->reco.dead_node);
+        /* What about DLM_RECO_STATE_FINALIZE? */
+        if (dlm->reco.state == DLM_RECO_STATE_ACTIVE)
+                state = "ACTIVE";
+        else
+                state = "INACTIVE";
+        /* Recovery Pid: xxxx  Master: xxx  State: xxxx */
+        out += scnprintf(db->buf + out, db->len - out,
+                         "Recovery Pid: %d  Master: %d  State: %s\n",
+                         dlm->dlm_reco_thread_task->pid,
+                         dlm->reco.new_master, state);
+        /* Recovery Map: xx xx */
+        out += scnprintf(db->buf + out, db->len - out, "Recovery Map: ");
+        out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
+                                 db->buf + out, db->len - out);
+        out += scnprintf(db->buf + out, db->len - out, "\n");
+        /* Recovery Node State: */
+        out += scnprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
+        list_for_each_entry(node, &dlm->reco.node_data, list) {
+                switch (node->state) {
+                case DLM_RECO_NODE_DATA_INIT:
+                        state = "INIT";
+                        break;
+                case DLM_RECO_NODE_DATA_REQUESTING:
+                        state = "REQUESTING";
+                        break;
+                case DLM_RECO_NODE_DATA_DEAD:
+                        state = "DEAD";
+                        break;
+                case DLM_RECO_NODE_DATA_RECEIVING:
+                        state = "RECEIVING";
+                        break;
+                case DLM_RECO_NODE_DATA_REQUESTED:
+                        state = "REQUESTED";
+                        break;
+                case DLM_RECO_NODE_DATA_DONE:
+                        state = "DONE";
+                        break;
+                case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+                        state = "FINALIZE-SENT";
+                        break;
+                default:
+                        state = "BAD";
+                        break;
+                }
+                out += scnprintf(db->buf + out, db->len - out, "\t%u - %s\n",
+                                 node->node_num, state);
+        }
+        spin_unlock(&dlm->spinlock);
+        return out;
+}
+/*
+ * open(): snapshot the state of the domain stored in i_private into a
+ * fresh buffer and attach it to the file.  Returns -ENOMEM if the buffer
+ * cannot be allocated.
+ */
+static int debug_state_open(struct inode *inode, struct file *file)
+{
+        struct dlm_ctxt *dlm = inode->i_private;
+        struct debug_buffer *snapshot = debug_buffer_allocate();
+        if (!snapshot)
+                return -ENOMEM;
+        snapshot->len = debug_state_print(dlm, snapshot);
+        file->private_data = snapshot;
+        return 0;
+}
+/* File operations for the dlm_state debugfs file; never modified. */
+static const struct file_operations debug_state_fops = {
+        .open =         debug_state_open,
+        .release =      debug_buffer_release,
+        .read =         debug_buffer_read,
+        .llseek =       debug_buffer_llseek,
+};
+/* end  - debug state funcs */
+/* files in subroot */
+/*
+ * Create the four per-domain debugfs files (dlm_state, locking_state,
+ * mle_state, purge_list) inside dlm->dlm_debugfs_subroot.  Each file is
+ * readable by its owner only and carries dlm as its private data.
+ *
+ * On success an extra reference is taken on the debug context and 0 is
+ * returned.  If any file cannot be created, dlm_debug_shutdown() is called
+ * to remove whatever exists so far and -ENOMEM is returned.
+ *
+ * Assumes dlm->dlm_debug_ctxt is non-NULL (presumably set up earlier by
+ * dlm_create_debugfs_subroot() -- confirm against the caller).
+ *
+ * NOTE(review): dlm_debug_shutdown() also drops a reference on the debug
+ * context, whereas this function only takes one on the success path --
+ * confirm the failure path leaves the refcount as callers expect.
+ */
+int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+        struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+        /* for dumping dlm_ctxt */
+        dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
+                                                     S_IFREG|S_IRUSR,
+                                                     dlm->dlm_debugfs_subroot,
+                                                     dlm, &debug_state_fops);
+        if (!dc->debug_state_dentry) {
+                mlog_errno(-ENOMEM);
+                goto bail;
+        }
+        /* for dumping lockres */
+        dc->debug_lockres_dentry =
+                        debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
+                                            S_IFREG|S_IRUSR,
+                                            dlm->dlm_debugfs_subroot,
+                                            dlm, &debug_lockres_fops);
+        if (!dc->debug_lockres_dentry) {
+                mlog_errno(-ENOMEM);
+                goto bail;
+        }
+        /* for dumping mles */
+        dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
+                                                   S_IFREG|S_IRUSR,
+                                                   dlm->dlm_debugfs_subroot,
+                                                   dlm, &debug_mle_fops);
+        if (!dc->debug_mle_dentry) {
+                mlog_errno(-ENOMEM);
+                goto bail;
+        }
+        /* for dumping lockres on the purge list */
+        dc->debug_purgelist_dentry =
+                        debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
+                                            S_IFREG|S_IRUSR,
+                                            dlm->dlm_debugfs_subroot,
+                                            dlm, &debug_purgelist_fops);
+        if (!dc->debug_purgelist_dentry) {
+                mlog_errno(-ENOMEM);
+                goto bail;
+        }
+        /* all files exist: pin the debug context */
+        dlm_debug_get(dc);
+        return 0;
+bail:
+        /* removes only the dentries that were actually created */
+        dlm_debug_shutdown(dlm);
+        return -ENOMEM;
+}
+/*
+ * Remove whichever of the per-domain debugfs files exist (in reverse order
+ * of creation) and drop one reference on the debug context.  Does nothing
+ * when the domain has no debug context.
+ */
+void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+        struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+        if (!dc)
+                return;
+        if (dc->debug_purgelist_dentry)
+                debugfs_remove(dc->debug_purgelist_dentry);
+        if (dc->debug_mle_dentry)
+                debugfs_remove(dc->debug_mle_dentry);
+        if (dc->debug_lockres_dentry)
+                debugfs_remove(dc->debug_lockres_dentry);
+        if (dc->debug_state_dentry)
+                debugfs_remove(dc->debug_state_dentry);
+        dlm_debug_put(dc);
+}
+/* subroot - domain dir */
+/*
+ * Create the per-domain debugfs directory (named after the domain, under
+ * the o2dlm root) and allocate the domain's debug context with an initial
+ * reference.  On any failure the directory is torn down again via
+ * dlm_destroy_debugfs_subroot() and -ENOMEM is returned.
+ */
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+        struct dentry *dir = debugfs_create_dir(dlm->name, dlm_debugfs_root);
+        dlm->dlm_debugfs_subroot = dir;
+        if (dir) {
+                dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
+                                              GFP_KERNEL);
+                if (dlm->dlm_debug_ctxt) {
+                        kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
+                        return 0;
+                }
+        }
+        /* either the directory or the context could not be created */
+        mlog_errno(-ENOMEM);
+        dlm_destroy_debugfs_subroot(dlm);
+        return -ENOMEM;
+}
+/*
+ * Remove the per-domain debugfs directory, if one exists.
+ *
+ * The stored dentry is cleared afterwards so that calling this more than
+ * once is harmless: it is invoked both from the error path of
+ * dlm_create_debugfs_subroot() and from dlm_free_ctxt_mem().
+ */
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+        if (dlm->dlm_debugfs_subroot) {
+                debugfs_remove(dlm->dlm_debugfs_subroot);
+                /* forget the dentry so a repeated call is a no-op */
+                dlm->dlm_debugfs_subroot = NULL;
+        }
+}
+/* debugfs root */
+/*
+ * Create the top-level "o2dlm" debugfs directory that holds one
+ * subdirectory per domain.  Returns 0 on success or -ENOMEM on failure.
+ */
+int dlm_create_debugfs_root(void)
+{
+        dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
+        if (dlm_debugfs_root)
+                return 0;
+        mlog_errno(-ENOMEM);
+        return -ENOMEM;
+}
+/* Remove the top-level "o2dlm" debugfs directory, if it was created. */
+void dlm_destroy_debugfs_root(void)
+{
+        if (!dlm_debugfs_root)
+                return;
+        debugfs_remove(dlm_debugfs_root);
+}
+#endif  /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 000000000000..d34a62a3a625
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,86 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.h
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+#ifndef DLMDEBUG_H
+#define DLMDEBUG_H
+void dlm_print_one_mle(struct dlm_master_list_entry *mle);
+#ifdef CONFIG_DEBUG_FS
+/* Per-domain debugfs bookkeeping: a refcount plus one dentry per file. */
+struct dlm_debug_ctxt {
+        struct kref debug_refcnt;               /* freed when it drops to 0 */
+        struct dentry *debug_state_dentry;      /* "dlm_state" file */
+        struct dentry *debug_lockres_dentry;    /* "locking_state" file */
+        struct dentry *debug_mle_dentry;        /* "mle_state" file */
+        struct dentry *debug_purgelist_dentry;  /* "purge_list" file */
+};
+/* Text snapshot handed out by the buffer-backed debugfs files. */
+struct debug_buffer {
+        int len;        /* buffer size at allocation, later the text length */
+        char *buf;      /* kmalloc'ed text area */
+};
+/* Per-open iteration state of the "locking_state" seq_file. */
+struct debug_lockres {
+        int dl_len;                             /* size of dl_buf */
+        char *dl_buf;                           /* text of the current lockres */
+        struct dlm_ctxt *dl_ctxt;               /* domain being dumped */
+        struct dlm_lock_resource *dl_res;       /* last lockres shown (ref held) */
+};
+int dlm_debug_init(struct dlm_ctxt *dlm);
+void dlm_debug_shutdown(struct dlm_ctxt *dlm);
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
+int dlm_create_debugfs_root(void);
+void dlm_destroy_debugfs_root(void);
+#else
+/*
+ * No-op stand-ins used when CONFIG_DEBUG_FS is not set.  They are
+ * static inline so that a file including this header without calling every
+ * one of them does not get "defined but not used" warnings.
+ */
+static inline int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+        return 0;
+}
+static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+        return 0;
+}
+static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_root(void)
+{
+        return 0;
+}
+static inline void dlm_destroy_debugfs_root(void)
+{
+}
+#endif  /* CONFIG_DEBUG_FS */
+#endif  /* DLMDEBUG_H */
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 638d2ebb892b..63f8125824e8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -33,6 +33,7 @@
 #include <linux/spinlock.h>
 #include <linux/delay.h>
 #include <linux/err.h>
+#include <linux/debugfs.h>
 #include "cluster/heartbeat.h"
 #include "cluster/nodemanager.h"
@@ -40,8 +41,8 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
 #include "dlmdomain.h"
+#include "dlmdebug.h"
 #include "dlmver.h"
@@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
+        dlm_destroy_debugfs_subroot(dlm);
        if (dlm->lockres_hash)
                dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
@@ -395,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
 static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
 {
        dlm_unregister_domain_handlers(dlm);
+        dlm_debug_shutdown(dlm);
        dlm_complete_thread(dlm);
        dlm_complete_recovery_thread(dlm);
        dlm_destroy_dlm_worker(dlm);
@@ -644,6 +648,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm)
 void dlm_unregister_domain(struct dlm_ctxt *dlm)
 {
        int leave = 0;
+        struct dlm_lock_resource *res;
        spin_lock(&dlm_domain_lock);
        BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
@@ -673,6 +678,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
                        msleep(500);
                        mlog(0, "%s: more migration to do\n", dlm->name);
                }
+                /* This list should be empty. If not, print remaining lockres */
+                if (!list_empty(&dlm->tracking_list)) {
+                        mlog(ML_ERROR, "Following lockres' are still on the "
+                             "tracking list:\n");
+                        list_for_each_entry(res, &dlm->tracking_list, tracking)
+                                dlm_print_one_lock_resource(res);
+                }
                dlm_mark_domain_leaving(dlm);
                dlm_leave_domain(dlm);
                dlm_complete_dlm_shutdown(dlm);
@@ -713,14 +727,46 @@ static int dlm_query_join_proto_check(char *proto_type, int node,
        return rc;
 }
+/*
+ * struct dlm_query_join_packet is made up of four one-byte fields.  They
+ * are effectively in big-endian order already.  However, little-endian
+ * machines swap them before putting the packet on the wire (because
+ * query_join's response is a status, and that status is treated as a u32
+ * on the wire).  Thus, big-endian and little-endian machines will treat
+ * this structure differently.
+ *
+ * The solution is to have little-endian machines swap the structure when
+ * converting from the structure to the u32 representation.  This will
+ * result in the structure having the correct format on the wire no matter
+ * the host endian format.
+ */
+/* Serialize a join-response packet into the 32-bit status value that goes
+ * on the wire: the packet is viewed as a u32 through the response union
+ * and that u32 is passed through cpu_to_be32(). */
+static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
+                                          u32 *wire)
+{
+        union dlm_query_join_response conv;
+
+        conv.packet = *packet;
+        *wire = cpu_to_be32(conv.intval);
+}
+/* Inverse of dlm_query_join_packet_to_wire(): turn the 32-bit value taken
+ * off the wire back into a join-response packet.  The value is big-endian
+ * on the wire, so be32_to_cpu() is the conversion that states the intent;
+ * it performs the same byte swap as cpu_to_be32() on any given host. */
+static void dlm_query_join_wire_to_packet(u32 wire,
+                                          struct dlm_query_join_packet *packet)
+{
+        union dlm_query_join_response response;
+        response.intval = be32_to_cpu(wire);
+        *packet = response.packet;
+}
 static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
                                  void **ret_data)
 {
        struct dlm_query_join_request *query;
-        union dlm_query_join_response response = {
+        struct dlm_query_join_packet packet = {
-                .packet.code = JOIN_DISALLOW,
+                .code = JOIN_DISALLOW,
        };
        struct dlm_ctxt *dlm = NULL;
+        u32 response;
        u8 nodenum;
        query = (struct dlm_query_join_request *) msg->buf;
@@ -737,11 +783,11 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
                mlog(0, "node %u is not in our live map yet\n",
                     query->node_idx);
-                response.packet.code = JOIN_DISALLOW;
+                packet.code = JOIN_DISALLOW;
                goto respond;
        }
-        response.packet.code = JOIN_OK_NO_MAP;
+        packet.code = JOIN_OK_NO_MAP;
        spin_lock(&dlm_domain_lock);
        dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
@@ -760,7 +806,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
                                mlog(0, "disallow join as node %u does not "
                                     "have node %u in its nodemap\n",
                                     query->node_idx, nodenum);
-                                response.packet.code = JOIN_DISALLOW;
+                                packet.code = JOIN_DISALLOW;
                                goto unlock_respond;
                        }
                }
@@ -780,23 +826,23 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
                        /*If this is a brand new context and we
                         * haven't started our join process yet, then
                         * the other node won the race. */
-                        response.packet.code = JOIN_OK_NO_MAP;
+                        packet.code = JOIN_OK_NO_MAP;
                } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
                        /* Disallow parallel joins. */
-                        response.packet.code = JOIN_DISALLOW;
+                        packet.code = JOIN_DISALLOW;
                } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
                        mlog(0, "node %u trying to join, but recovery "
                             "is ongoing.\n", bit);
-                        response.packet.code = JOIN_DISALLOW;
+                        packet.code = JOIN_DISALLOW;
                } else if (test_bit(bit, dlm->recovery_map)) {
                        mlog(0, "node %u trying to join, but it "
                             "still needs recovery.\n", bit);
-                        response.packet.code = JOIN_DISALLOW;
+                        packet.code = JOIN_DISALLOW;
                } else if (test_bit(bit, dlm->domain_map)) {
                        mlog(0, "node %u trying to join, but it "
                             "is still in the domain! needs recovery?\n",
                             bit);
-                        response.packet.code = JOIN_DISALLOW;
+                        packet.code = JOIN_DISALLOW;
                } else {
                        /* Alright we're fully a part of this domain
                         * so we keep some state as to who's joining
@@ -807,19 +853,15 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
                        if (dlm_query_join_proto_check("DLM", bit,
                                                       &dlm->dlm_locking_proto,
                                                       &query->dlm_proto)) {
-                                response.packet.code =
+                                packet.code = JOIN_PROTOCOL_MISMATCH;
-                                        JOIN_PROTOCOL_MISMATCH;
                        } else if (dlm_query_join_proto_check("fs", bit,
                                                              &dlm->fs_locking_proto,
                                                              &query->fs_proto)) {
-                                response.packet.code =
+                                packet.code = JOIN_PROTOCOL_MISMATCH;
-                                        JOIN_PROTOCOL_MISMATCH;
                        } else {
-                                response.packet.dlm_minor =
+                                packet.dlm_minor = query->dlm_proto.pv_minor;
-                                        query->dlm_proto.pv_minor;
+                                packet.fs_minor = query->fs_proto.pv_minor;
-                                response.packet.fs_minor =
+                                packet.code = JOIN_OK;
-                                        query->fs_proto.pv_minor;
-                                response.packet.code = JOIN_OK;
                                __dlm_set_joining_node(dlm, query->node_idx);
                        }
                }
@@ -830,9 +872,10 @@ unlock_respond:
        spin_unlock(&dlm_domain_lock);
 respond:
-        mlog(0, "We respond with %u\n", response.packet.code);
+        mlog(0, "We respond with %u\n", packet.code);
-        return response.intval;
+        dlm_query_join_packet_to_wire(&packet, &response);
+        return response;
 }
 static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -937,7 +980,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
                         sizeof(unsigned long))) {
                mlog(ML_ERROR,
                     "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
-                     map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
+                     map_size, (unsigned)BITS_TO_LONGS(O2NM_MAX_NODES));
                return -EINVAL;
        }
@@ -968,7 +1011,8 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
 {
        int status;
        struct dlm_query_join_request join_msg;
-        union dlm_query_join_response join_resp;
+        struct dlm_query_join_packet packet;
+        u32 join_resp;
        mlog(0, "querying node %d\n", node);
@@ -984,11 +1028,12 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
        status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
                                    sizeof(join_msg), node,
-                                    &join_resp.intval);
+                                    &join_resp);
        if (status < 0 && status != -ENOPROTOOPT) {
                mlog_errno(status);
                goto bail;
        }
+        dlm_query_join_wire_to_packet(join_resp, &packet);
        /* -ENOPROTOOPT from the net code means the other side isn't
            listening for our message type -- that's fine, it means
@@ -997,10 +1042,10 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
        if (status == -ENOPROTOOPT) {
                status = 0;
                *response = JOIN_OK_NO_MAP;
-        } else if (join_resp.packet.code == JOIN_DISALLOW ||
+        } else if (packet.code == JOIN_DISALLOW ||
-                   join_resp.packet.code == JOIN_OK_NO_MAP) {
+                   packet.code == JOIN_OK_NO_MAP) {
-                *response = join_resp.packet.code;
+                *response = packet.code;
-        } else if (join_resp.packet.code == JOIN_PROTOCOL_MISMATCH) {
+        } else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
                mlog(ML_NOTICE,
                     "This node requested DLM locking protocol %u.%u and "
                     "filesystem locking protocol %u.%u.  At least one of "
@@ -1012,14 +1057,12 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
                     dlm->fs_locking_proto.pv_minor,
                     node);
                status = -EPROTO;
-                *response = join_resp.packet.code;
+                *response = packet.code;
-        } else if (join_resp.packet.code == JOIN_OK) {
+        } else if (packet.code == JOIN_OK) {
-                *response = join_resp.packet.code;
+                *response = packet.code;
                /* Use the same locking protocol as the remote node */
-                dlm->dlm_locking_proto.pv_minor =
+                dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
-                        join_resp.packet.dlm_minor;
+                dlm->fs_locking_proto.pv_minor = packet.fs_minor;
-                dlm->fs_locking_proto.pv_minor =
-                        join_resp.packet.fs_minor;
                mlog(0,
                     "Node %d responds JOIN_OK with DLM locking protocol "
                     "%u.%u and fs locking protocol %u.%u\n",
@@ -1031,11 +1074,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
        } else {
                status = -EINVAL;
                mlog(ML_ERROR, "invalid response %d from node %u\n",
-                     join_resp.packet.code, node);
+                     packet.code, node);
        }
        mlog(0, "status %d, node %d response is %d\n", status, node,
-                  *response);
+             *response);
 bail:
        return status;
@@ -1376,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
                goto bail;
        }
+        status = dlm_debug_init(dlm);
+        if (status < 0) {
+                mlog_errno(status);
+                goto bail;
+        }
        status = dlm_launch_thread(dlm);
        if (status < 0) {
                mlog_errno(status);
@@ -1443,6 +1492,7 @@ bail:
        if (status) {
                dlm_unregister_domain_handlers(dlm);
+                dlm_debug_shutdown(dlm);
                dlm_complete_thread(dlm);
                dlm_complete_recovery_thread(dlm);
                dlm_destroy_dlm_worker(dlm);
@@ -1455,6 +1505,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
                                u32 key)
 {
        int i;
+        int ret;
        struct dlm_ctxt *dlm = NULL;
        dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
@@ -1487,6 +1538,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        dlm->key = key;
        dlm->node_num = o2nm_this_node();
+        ret = dlm_create_debugfs_subroot(dlm);
+        if (ret < 0) {
+                dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+                kfree(dlm->name);
+                kfree(dlm);
+                dlm = NULL;
+                goto leave;
+        }
        spin_lock_init(&dlm->spinlock);
        spin_lock_init(&dlm->master_lock);
        spin_lock_init(&dlm->ast_lock);
@@ -1497,6 +1557,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        INIT_LIST_HEAD(&dlm->reco.node_data);
        INIT_LIST_HEAD(&dlm->purge_list);
        INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+        INIT_LIST_HEAD(&dlm->tracking_list);
        dlm->reco.state = 0;
        INIT_LIST_HEAD(&dlm->pending_asts);
@@ -1787,21 +1848,49 @@ static int __init dlm_init(void)
        dlm_print_version();
        status = dlm_init_mle_cache();
-        if (status)
+        if (status) {
-                return -1;
+                mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
+                goto error;
+        }
+        status = dlm_init_master_caches();
+        if (status) {
+                mlog(ML_ERROR, "Could not create o2dlm_lockres and "
+                     "o2dlm_lockname slabcaches\n");
+                goto error;
+        }
+        status = dlm_init_lock_cache();
+        if (status) {
+                mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
+                goto error;
+        }
        status = dlm_register_net_handlers();
        if (status) {
-                dlm_destroy_mle_cache();
+                mlog(ML_ERROR, "Unable to register network handlers\n");
-                return -1;
+                goto error;
        }
+        status = dlm_create_debugfs_root();
+        if (status)
+                goto error;
        return 0;
+error:
+        dlm_unregister_net_handlers();
+        dlm_destroy_lock_cache();
+        dlm_destroy_master_caches();
+        dlm_destroy_mle_cache();
+        return -1;
 }
 static void __exit dlm_exit (void)
 {
+        dlm_destroy_debugfs_root();
        dlm_unregister_net_handlers();
+        dlm_destroy_lock_cache();
+        dlm_destroy_master_caches();
        dlm_destroy_mle_cache();
 }
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 52578d907d9a..83a9f2972ac8 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,6 +53,8 @@
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
+static struct kmem_cache *dlm_lock_cache = NULL;
 static DEFINE_SPINLOCK(dlm_cookie_lock);
 static u64 dlm_next_cookie = 1;
@@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
 static void dlm_lock_release(struct kref *kref);
 static void dlm_lock_detach_lockres(struct dlm_lock *lock);
+/* Create the "o2dlm_lock" slab cache used for struct dlm_lock objects.
+ * Returns 0 on success, -ENOMEM if the cache could not be created. */
+int dlm_init_lock_cache(void)
+{
+        dlm_lock_cache = kmem_cache_create("o2dlm_lock",
+                                           sizeof(struct dlm_lock),
+                                           0, SLAB_HWCACHE_ALIGN, NULL);
+        return dlm_lock_cache ? 0 : -ENOMEM;
+}
+/* Destroy the dlm_lock slab cache; does nothing if it was never created. */
+void dlm_destroy_lock_cache(void)
+{
+        if (!dlm_lock_cache)
+                return;
+        kmem_cache_destroy(dlm_lock_cache);
+}
 /* Tell us whether we can grant a new lock request.
 * locking:
 *   caller needs:  res->spinlock
@@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref)
                mlog(0, "freeing kernel-allocated lksb\n");
                kfree(lock->lksb);
        }
-        kfree(lock);
+        kmem_cache_free(dlm_lock_cache, lock);
 }
 /* associate a lock with it's lockres, getting a ref on the lockres */
@@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
        struct dlm_lock *lock;
        int kernel_allocated = 0;
-        lock = kzalloc(sizeof(*lock), GFP_NOFS);
+        lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
        if (!lock)
                return NULL;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a54d33d95ada..efc015c6128a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -48,47 +48,11 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
 #include "dlmdomain.h"
+#include "dlmdebug.h"
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
 #include "cluster/masklog.h"
-enum dlm_mle_type {
-        DLM_MLE_BLOCK,
-        DLM_MLE_MASTER,
-        DLM_MLE_MIGRATION
-};
-struct dlm_lock_name
-{
-        u8 len;
-        u8 name[DLM_LOCKID_NAME_MAX];
-};
-struct dlm_master_list_entry
-{
-        struct list_head list;
-        struct list_head hb_events;
-        struct dlm_ctxt *dlm;
-        spinlock_t spinlock;
-        wait_queue_head_t wq;
-        atomic_t woken;
-        struct kref mle_refs;
-        int inuse;
-        unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-        unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-        unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-        unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-        u8 master;
-        u8 new_master;
-        enum dlm_mle_type type;
-        struct o2hb_callback_func mle_hb_up;
-        struct o2hb_callback_func mle_hb_down;
-        union {
-                struct dlm_lock_resource *res;
-                struct dlm_lock_name name;
-        } u;
-};
 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
                              struct dlm_master_list_entry *mle,
                              struct o2nm_node *node,
@@ -128,98 +92,10 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
        return 1;
 }
-#define dlm_print_nodemap(m)  _dlm_print_nodemap(m,#m)
+static struct kmem_cache *dlm_lockres_cache = NULL;
-static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
+static struct kmem_cache *dlm_lockname_cache = NULL;
-{
-        int i;
-        printk("%s=[ ", mapname);
-        for (i=0; i<O2NM_MAX_NODES; i++)
-                if (test_bit(i, map))
-                        printk("%d ", i);
-        printk("]");
-}
-static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
-{
-        int refs;
-        char *type;
-        char attached;
-        u8 master;
-        unsigned int namelen;
-        const char *name;
-        struct kref *k;
-        unsigned long *maybe = mle->maybe_map,
-                      *vote = mle->vote_map,
-                      *resp = mle->response_map,
-                      *node = mle->node_map;
-        k = &mle->mle_refs;
-        if (mle->type == DLM_MLE_BLOCK)
-                type = "BLK";
-        else if (mle->type == DLM_MLE_MASTER)
-                type = "MAS";
-        else
-                type = "MIG";
-        refs = atomic_read(&k->refcount);
-        master = mle->master;
-        attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
-        if (mle->type != DLM_MLE_MASTER) {
-                namelen = mle->u.name.len;
-                name = mle->u.name.name;
-        } else {
-                namelen = mle->u.res->lockname.len;
-                name = mle->u.res->lockname.name;
-        }
-        mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
-                  namelen, name, type, refs, master, mle->new_master, attached,
-                  mle->inuse);
-        dlm_print_nodemap(maybe);
-        printk(", ");
-        dlm_print_nodemap(vote);
-        printk(", ");
-        dlm_print_nodemap(resp);
-        printk(", ");
-        dlm_print_nodemap(node);
-        printk(", ");
-        printk("\n");
-}
-#if 0
-/* Code here is included but defined out as it aids debugging */
-static void dlm_dump_mles(struct dlm_ctxt *dlm)
-{
-        struct dlm_master_list_entry *mle;
-        
-        mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
-        spin_lock(&dlm->master_lock);
-        list_for_each_entry(mle, &dlm->master_list, list)
-                dlm_print_one_mle(mle);
-        spin_unlock(&dlm->master_lock);
-}
-int dlm_dump_all_mles(const char __user *data, unsigned int len)
-{
-        struct dlm_ctxt *dlm;
-        spin_lock(&dlm_domain_lock);
-        list_for_each_entry(dlm, &dlm_domains, list) {
-                mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
-                dlm_dump_mles(dlm);
-        }
-        spin_unlock(&dlm_domain_lock);
-        return len;
-}
-EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
-#endif  /*  0  */
 static struct kmem_cache *dlm_mle_cache = NULL;
 static void dlm_mle_release(struct kref *kref);
 static void dlm_init_mle(struct dlm_master_list_entry *mle,
                        enum dlm_mle_type type,
@@ -507,7 +383,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
 int dlm_init_mle_cache(void)
 {
-        dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
+        dlm_mle_cache = kmem_cache_create("o2dlm_mle",
                                          sizeof(struct dlm_master_list_entry),
                                          0, SLAB_HWCACHE_ALIGN,
                                          NULL);
@@ -560,6 +436,35 @@ static void dlm_mle_release(struct kref *kref)
 * LOCK RESOURCE FUNCTIONS
 */
+/* Create the "o2dlm_lockres" and "o2dlm_lockname" slab caches.
+ * Returns 0 when both exist; otherwise whatever was created is handed to
+ * dlm_destroy_master_caches() and -ENOMEM is returned. */
+int dlm_init_master_caches(void)
+{
+        dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
+                                              sizeof(struct dlm_lock_resource),
+                                              0, SLAB_HWCACHE_ALIGN, NULL);
+        if (dlm_lockres_cache) {
+                dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
+                                                       DLM_LOCKID_NAME_MAX, 0,
+                                                       SLAB_HWCACHE_ALIGN, NULL);
+                if (dlm_lockname_cache)
+                        return 0;
+        }
+        dlm_destroy_master_caches();
+        return -ENOMEM;
+}
+/*
+ * Destroy the lockname and lockres slab caches, if they exist.
+ *
+ * Each pointer is reset to NULL once its cache is gone so that calling
+ * this function a second time is harmless.  That happens when
+ * dlm_init_master_caches() fails: it cleans up through this function and
+ * then dlm_init()'s error path calls it again.
+ */
+void dlm_destroy_master_caches(void)
+{
+        if (dlm_lockname_cache) {
+                kmem_cache_destroy(dlm_lockname_cache);
+                dlm_lockname_cache = NULL;
+        }
+        if (dlm_lockres_cache) {
+                kmem_cache_destroy(dlm_lockres_cache);
+                dlm_lockres_cache = NULL;
+        }
+}
 static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
                                  struct dlm_lock_resource *res,
                                  u8 owner)
@@ -610,6 +515,14 @@ static void dlm_lockres_release(struct kref *kref)
        mlog(0, "destroying lockres %.*s\n", res->lockname.len,
             res->lockname.name);
+        if (!list_empty(&res->tracking))
+                list_del_init(&res->tracking);
+        else {
+                mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+                     res->lockname.len, res->lockname.name);
+                dlm_print_one_lock_resource(res);
+        }
        if (!hlist_unhashed(&res->hash_node) ||
            !list_empty(&res->granted) ||
            !list_empty(&res->converting) ||
@@ -642,9 +555,9 @@ static void dlm_lockres_release(struct kref *kref)
        BUG_ON(!list_empty(&res->recovering));
        BUG_ON(!list_empty(&res->purge));
-        kfree(res->lockname.name);
+        kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
-        kfree(res);
+        kmem_cache_free(dlm_lockres_cache, res);
 }
 void dlm_lockres_put(struct dlm_lock_resource *res)
@@ -677,6 +590,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        INIT_LIST_HEAD(&res->dirty);
        INIT_LIST_HEAD(&res->recovering);
        INIT_LIST_HEAD(&res->purge);
+        INIT_LIST_HEAD(&res->tracking);
        atomic_set(&res->asts_reserved, 0);
        res->migration_pending = 0;
        res->inflight_locks = 0;
@@ -692,6 +606,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        res->last_used = 0;
+        list_add_tail(&res->tracking, &dlm->tracking_list);
        memset(res->lvb, 0, DLM_LVB_LEN);
        memset(res->refmap, 0, sizeof(res->refmap));
 }
@@ -700,20 +616,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
                                   const char *name,
                                   unsigned int namelen)
 {
-        struct dlm_lock_resource *res;
+        struct dlm_lock_resource *res = NULL;
-        res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
+        res = (struct dlm_lock_resource *)
+                                kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
        if (!res)
-                return NULL;
+                goto error;
-        res->lockname.name = kmalloc(namelen, GFP_NOFS);
+        res->lockname.name = (char *)
-        if (!res->lockname.name) {
+                                kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
-                kfree(res);
+        if (!res->lockname.name)
-                return NULL;
+                goto error;
-        }
        dlm_init_lockres(dlm, res, name, namelen);
        return res;
+error:
+        if (res && res->lockname.name)
+                kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
+        if (res)
+                kmem_cache_free(dlm_lockres_cache, res);
+        return NULL;
 }
 void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
@@ -1663,7 +1587,12 @@ way_up_top:
                dlm_put_mle(tmpmle);
        }
 send_response:
+        /*
+         * __dlm_lookup_lockres() grabbed a reference to this lockres.
+         * The reference is released by dlm_assert_master_worker() under
+         * the call to dlm_dispatch_assert_master().  If
+         * dlm_assert_master_worker() isn't called, we drop it here.
+         */
        if (dispatch_assert) {
                if (response != DLM_MASTER_RESP_YES)
                        mlog(ML_ERROR, "invalid response %d\n", response);
@@ -1678,7 +1607,11 @@ send_response:
                if (ret < 0) {
                        mlog(ML_ERROR, "failed to dispatch assert master work\n");
                        response = DLM_MASTER_RESP_ERROR;
+                        dlm_lockres_put(res);
                }
+        } else {
+                if (res)
+                        dlm_lockres_put(res);
        }
        dlm_put(dlm);
@@ -1695,9 +1628,9 @@ send_response:
 * can periodically run all locks owned by this node
 * and re-assert across the cluster...
 */
-int dlm_do_assert_master(struct dlm_ctxt *dlm,
+static int dlm_do_assert_master(struct dlm_ctxt *dlm,
-                         struct dlm_lock_resource *res,
+                                struct dlm_lock_resource *res,
-                         void *nodemap, u32 flags)
+                                void *nodemap, u32 flags)
 {
        struct dlm_assert_master assert;
        int to, tmpret;
@@ -2348,7 +2281,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
                        mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
                        "but it is already dropped!\n", dlm->name,
                        res->lockname.len, res->lockname.name, node);
-                        __dlm_print_one_lock_resource(res);
+                        dlm_print_one_lock_resource(res);
                }
                ret = 0;
                goto done;
@@ -2408,7 +2341,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
                mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
                     "but it is already dropped!\n", dlm->name,
                     res->lockname.len, res->lockname.name, node);
-                __dlm_print_one_lock_resource(res);
+                dlm_print_one_lock_resource(res);
        }
        dlm_lockres_put(res);
@@ -2933,6 +2866,9 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
                                dlm_lockres_clear_refmap_bit(lock->ml.node, res);
                                list_del_init(&lock->list);
                                dlm_lock_put(lock);
+                                /* In a normal unlock, we would have added a
+                                 * DLM_UNLOCK_FREE_LOCK action. Force it. */
+                                dlm_lock_put(lock);
                        }
                }
                queue++;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 91f747b8a538..bcb9260c3735 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -519,9 +519,9 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
        return 0;
 master_here:
-        mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
+        mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node "
-             task_pid_nr(dlm->dlm_reco_thread_task),
+             "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
-             dlm->name, dlm->reco.dead_node, dlm->node_num);
+             dlm->node_num, dlm->reco.dead_node, dlm->name);
        status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
        if (status < 0) {
@@ -1191,7 +1191,7 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
                            (ml->type == LKM_EXMODE ||
                             memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
                                mlog(ML_ERROR, "mismatched lvbs!\n");
-                                __dlm_print_one_lock_resource(lock->lockres);
+                                dlm_print_one_lock_resource(lock->lockres);
                                BUG();
                        }
                        memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
@@ -1327,6 +1327,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
                (struct dlm_migratable_lockres *)msg->buf;
        int ret = 0;
        u8 real_master;
+        u8 extra_refs = 0;
        char *buf = NULL;
        struct dlm_work_item *item = NULL;
        struct dlm_lock_resource *res = NULL;
@@ -1404,16 +1405,28 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
                __dlm_insert_lockres(dlm, res);
                spin_unlock(&dlm->spinlock);
+                /* Add an extra ref for this lock-less lockres lest the
+                 * dlm_thread purges it before we get the chance to add
+                 * locks to it */
+                dlm_lockres_get(res);
+                /* There are three refs that need to be put.
+                 * 1. Taken above.
+                 * 2. kref_init in dlm_new_lockres()->dlm_init_lockres().
+                 * 3. dlm_lookup_lockres()
+                 * The first one is handled at the end of this function. The
+                 * other two are handled in the worker thread after locks have
+                 * been attached. Yes, we don't wait for purge time to match
+                 * kref_init. The lockres will still have at least one ref
+                 * added because it is in the hash __dlm_insert_lockres() */
+                extra_refs++;
                /* now that the new lockres is inserted,
                 * make it usable by other processes */
                spin_lock(&res->spinlock);
                res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
                spin_unlock(&res->spinlock);
                wake_up(&res->wq);
-                /* add an extra ref for just-allocated lockres 
-                 * otherwise the lockres will be purged immediately */
-                dlm_lockres_get(res);
        }
        /* at this point we have allocated everything we need,
@@ -1443,12 +1456,17 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
        dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
        item->u.ml.lockres = res; /* already have a ref */
        item->u.ml.real_master = real_master;
+        item->u.ml.extra_ref = extra_refs;
        spin_lock(&dlm->work_lock);
        list_add_tail(&item->list, &dlm->work_list);
        spin_unlock(&dlm->work_lock);
        queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 leave:
+        /* One extra ref taken needs to be put here */
+        if (extra_refs)
+                dlm_lockres_put(res);
        dlm_put(dlm);
        if (ret < 0) {
                if (buf)
@@ -1464,17 +1482,19 @@ leave:
 static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
 {
-        struct dlm_ctxt *dlm = data;
+        struct dlm_ctxt *dlm;
        struct dlm_migratable_lockres *mres;
        int ret = 0;
        struct dlm_lock_resource *res;
        u8 real_master;
+        u8 extra_ref;
        dlm = item->dlm;
        mres = (struct dlm_migratable_lockres *)data;
        res = item->u.ml.lockres;
        real_master = item->u.ml.real_master;
+        extra_ref = item->u.ml.extra_ref;
        if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
                /* this case is super-rare. only occurs if
@@ -1517,6 +1537,12 @@ again:
        }
 leave:
+        /* See comment in dlm_mig_lockres_handler() */
+        if (res) {
+                if (extra_ref)
+                        dlm_lockres_put(res);
+                dlm_lockres_put(res);
+        }
        kfree(data);
        mlog_exit(ret);
 }
@@ -1644,7 +1670,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
                                /* retry!? */
                                BUG();
                        }
-                }
+                } else /* put.. incase we are not the master */
+                        dlm_lockres_put(res);
                spin_unlock(&res->spinlock);
        }
        spin_unlock(&dlm->spinlock);
@@ -1921,6 +1948,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
                     "Recovering res %s:%.*s, is already on recovery list!\n",
                     dlm->name, res->lockname.len, res->lockname.name);
                list_del_init(&res->recovering);
+                dlm_lockres_put(res);
        }
        /* We need to hold a reference while on the recovery list */
        dlm_lockres_get(res);
@@ -2130,11 +2158,16 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&res->spinlock);
+        /* We call dlm_lock_put() twice per lock: once for removing it from the
+         * list, and once in place of the DLM_UNLOCK_FREE_LOCK action so that the lock is freed */
        /* TODO: check pending_asts, pending_basts here */
        list_for_each_entry_safe(lock, next, &res->granted, list) {
                if (lock->ml.node == dead_node) {
                        list_del_init(&lock->list);
                        dlm_lock_put(lock);
+                        /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
+                        dlm_lock_put(lock);
                        freed++;
                }
        }
@@ -2142,6 +2175,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
                if (lock->ml.node == dead_node) {
                        list_del_init(&lock->list);
                        dlm_lock_put(lock);
+                        /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
+                        dlm_lock_put(lock);
                        freed++;
                }
        }
@@ -2149,6 +2184,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
                if (lock->ml.node == dead_node) {
                        list_del_init(&lock->list);
                        dlm_lock_put(lock);
+                        /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
+                        dlm_lock_put(lock);
                        freed++;
                }
        }
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index cebd089f8955..4060bb328bc8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -176,12 +176,14 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
             res->lockname.name, master);
        if (!master) {
+                /* drop spinlock...  retake below */
+                spin_unlock(&dlm->spinlock);
                spin_lock(&res->spinlock);
                /* This ensures that clear refmap is sent after the set */
                __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
                spin_unlock(&res->spinlock);
-                /* drop spinlock to do messaging, retake below */
-                spin_unlock(&dlm->spinlock);
                /* clear our bit from the master's refmap, ignore errors */
                ret = dlm_drop_lockres_ref(dlm, res);
                if (ret < 0) {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 351130c9b734..394d25a131a5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -27,18 +27,11 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
-#include <linux/crc32.h>
 #include <linux/kthread.h>
 #include <linux/pagemap.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-#include <dlm/dlmapi.h>
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
@@ -53,6 +46,7 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
+#include "stackglue.h"
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
@@ -113,7 +107,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
                                     unsigned int line,
                                     struct ocfs2_lock_res *lockres)
 {
-        struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+        struct ocfs2_meta_lvb *lvb =
+                (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
        mlog(level, "LVB information for %s (called from %s:%u):\n",
             lockres->l_name, function, line);
@@ -259,31 +254,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
        .flags          = 0,
 };
-/*
- * This is the filesystem locking protocol version.
- *
- * Whenever the filesystem does new things with locks (adds or removes a
- * lock, orders them differently, does different things underneath a lock),
- * the version must be changed.  The protocol is negotiated when joining
- * the dlm domain.  A node may join the domain if its major version is
- * identical to all other nodes and its minor version is greater than
- * or equal to all other nodes.  When its minor version is greater than
- * the other nodes, it will run at the minor version specified by the
- * other nodes.
- *
- * If a locking change is made that will not be compatible with older
- * versions, the major number must be increased and the minor version set
- * to zero.  If a change merely adds a behavior that can be disabled when
- * speaking to older versions, the minor version must be increased.  If a
- * change adds a fully backwards compatible change (eg, LVB changes that
- * are just ignored by older versions), the version does not need to be
- * updated.
- */
-const struct dlm_protocol_version ocfs2_locking_protocol = {
-        .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
-        .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-};
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
        return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -316,7 +286,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l
 static int ocfs2_lock_create(struct ocfs2_super *osb,
                             struct ocfs2_lock_res *lockres,
                             int level,
-                             int dlm_flags);
+                             u32 dlm_flags);
 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
                                                     int wanted);
 static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
@@ -330,10 +300,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
                                        struct ocfs2_lock_res *lockres);
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
                                                int convert);
-#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {        \
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do {                 \
-        mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "  \
+        mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
-                "resource %s: %s\n", dlm_errname(_stat), _func, \
+             _err, _func, _lockres->l_name);                            \
-                _lockres->l_name, dlm_errmsg(_stat));           \
 } while (0)
 static int ocfs2_downconvert_thread(void *arg);
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -342,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode,
                                  struct buffer_head **bh);
 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
 static inline int ocfs2_highest_compat_lock_level(int level);
-static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
-                                      int new_level);
+                                              int new_level);
 static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
                                  struct ocfs2_lock_res *lockres,
                                  int new_level,
-                                  int lvb);
+                                  int lvb,
+                                  unsigned int generation);
 static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
                                        struct ocfs2_lock_res *lockres);
 static int ocfs2_cancel_convert(struct ocfs2_super *osb,
@@ -406,9 +376,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
        res->l_ops           = ops;
        res->l_priv          = priv;
-        res->l_level         = LKM_IVMODE;
+        res->l_level         = DLM_LOCK_IV;
-        res->l_requested     = LKM_IVMODE;
+        res->l_requested     = DLM_LOCK_IV;
-        res->l_blocking      = LKM_IVMODE;
+        res->l_blocking      = DLM_LOCK_IV;
        res->l_action        = OCFS2_AST_INVALID;
        res->l_unlock_action = OCFS2_UNLOCK_INVALID;
@@ -604,10 +574,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
        BUG_ON(!lockres);
        switch(level) {
-        case LKM_EXMODE:
+        case DLM_LOCK_EX:
                lockres->l_ex_holders++;
                break;
-        case LKM_PRMODE:
+        case DLM_LOCK_PR:
                lockres->l_ro_holders++;
                break;
        default:
@@ -625,11 +595,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
        BUG_ON(!lockres);
        switch(level) {
-        case LKM_EXMODE:
+        case DLM_LOCK_EX:
                BUG_ON(!lockres->l_ex_holders);
                lockres->l_ex_holders--;
                break;
-        case LKM_PRMODE:
+        case DLM_LOCK_PR:
                BUG_ON(!lockres->l_ro_holders);
                lockres->l_ro_holders--;
                break;
@@ -644,12 +614,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
 * lock types are added. */
 static inline int ocfs2_highest_compat_lock_level(int level)
 {
-        int new_level = LKM_EXMODE;
+        int new_level = DLM_LOCK_EX;
-        if (level == LKM_EXMODE)
+        if (level == DLM_LOCK_EX)
-                new_level = LKM_NLMODE;
+                new_level = DLM_LOCK_NL;
-        else if (level == LKM_PRMODE)
+        else if (level == DLM_LOCK_PR)
-                new_level = LKM_PRMODE;
+                new_level = DLM_LOCK_PR;
        return new_level;
 }
@@ -688,12 +658,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
        BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
        BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
        BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
-        BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+        BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
        lockres->l_level = lockres->l_requested;
        if (lockres->l_level <=
            ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
-                lockres->l_blocking = LKM_NLMODE;
+                lockres->l_blocking = DLM_LOCK_NL;
                lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
        }
        lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
@@ -712,7 +682,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
         * information is already up to data. Convert from NL to
         * *anything* however should mark ourselves as needing an
         * update */
-        if (lockres->l_level == LKM_NLMODE &&
+        if (lockres->l_level == DLM_LOCK_NL &&
            lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
                lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -729,7 +699,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
        BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
        BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
-        if (lockres->l_requested > LKM_NLMODE &&
+        if (lockres->l_requested > DLM_LOCK_NL &&
            !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
            lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
                lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -767,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
        return needs_downconvert;
 }
+/*
+ * OCFS2_LOCK_PENDING and l_pending_gen.
+ *
+ * Why does OCFS2_LOCK_PENDING exist?  To close a race between setting
+ * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock().  See ocfs2_unblock_lock()
+ * for more details on the race.
+ *
+ * OCFS2_LOCK_PENDING closes the race quite nicely.  However, it introduces
+ * a race of its own.  In o2dlm, we can get the ast before ocfs2_dlm_lock()
+ * returns.  The ast clears OCFS2_LOCK_BUSY, and must therefore clear
+ * OCFS2_LOCK_PENDING at the same time.  When ocfs2_dlm_lock() returns,
+ * the caller is going to try to clear PENDING again.  If nothing else is
+ * happening, __lockres_clear_pending() sees PENDING is unset and does
+ * nothing.
+ *
+ * But what if another path (eg downconvert thread) has just started a
+ * new locking action?  The other path has re-set PENDING.  Our path
+ * cannot clear PENDING, because that will re-open the original race
+ * window.
+ *
+ * [Example]
+ *
+ * ocfs2_meta_lock()
+ *  ocfs2_cluster_lock()
+ *   set BUSY
+ *   set PENDING
+ *   drop l_lock
+ *   ocfs2_dlm_lock()
+ *    ocfs2_locking_ast()               ocfs2_downconvert_thread()
+ *     clear PENDING                     ocfs2_unblock_lock()
+ *                                        take_l_lock
+ *                                        !BUSY
+ *                                        ocfs2_prepare_downconvert()
+ *                                         set BUSY
+ *                                         set PENDING
+ *                                        drop l_lock
+ *   take l_lock
+ *   clear PENDING
+ *   drop l_lock
+ *                      <window>
+ *                                        ocfs2_dlm_lock()
+ *
+ * So as you can see, we now have a window where l_lock is not held,
+ * PENDING is not set, and ocfs2_dlm_lock() has not been called.
+ *
+ * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
+ * set by ocfs2_prepare_downconvert().  That wasn't nice.
+ *
+ * To solve this we introduce l_pending_gen.  A call to
+ * lockres_clear_pending() will only do so when it is passed a generation
+ * number that matches the lockres.  lockres_set_pending() will return the
+ * current generation number.  When ocfs2_cluster_lock() goes to clear
+ * PENDING, it passes the generation it got from set_pending().  In our
+ * example above, the generation numbers will *not* match.  Thus,
+ * ocfs2_cluster_lock() will not clear the PENDING set by
+ * ocfs2_prepare_downconvert().
+ */
+/*
+ * Clear OCFS2_LOCK_PENDING on @lockres, but only if the flag is still set
+ * and @generation equals lockres->l_pending_gen; otherwise do nothing.
+ *
+ * This is the variant that does not take l_lock itself: the caller must
+ * already hold lockres->l_lock (asserted below).  ocfs2_locking_ast()
+ * calls it directly while holding the lock.
+ *
+ * When the clear happens, l_pending_gen is incremented, so a later call
+ * that still passes the old generation returns early.  If the lockres is
+ * OCFS2_LOCK_BLOCKED, the downconvert thread of @osb is woken afterwards.
+ */
+static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
+                                    unsigned int generation,
+                                    struct ocfs2_super *osb)
+{
+        assert_spin_locked(&lockres->l_lock);
+        /*
+         * The ast and locking functions can race us here.  The winner
+         * will clear pending, the loser will not: either PENDING is
+         * already gone, or the generation no longer matches.
+         */
+        if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
+            (lockres->l_pending_gen != generation))
+                return;
+        lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
+        lockres->l_pending_gen++;
+        /*
+         * The downconvert thread may have skipped us because we
+         * were PENDING.  Wake it up so it looks at this lockres again.
+         */
+        if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+                ocfs2_wake_downconvert_thread(osb);
+}
+/*
+ * Locking wrapper around __lockres_clear_pending() for callers of
+ * ocfs2_dlm_lock(): takes lockres->l_lock with spin_lock_irqsave(),
+ * attempts the generation-checked clear of OCFS2_LOCK_PENDING, then
+ * releases the lock again.
+ *
+ * @generation is the value the caller obtained from lockres_set_pending()
+ * before it dropped l_lock to call ocfs2_dlm_lock().  @osb is passed
+ * through to __lockres_clear_pending().
+ */
+static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
+                                  unsigned int generation,
+                                  struct ocfs2_super *osb)
+{
+        unsigned long flags;
+        spin_lock_irqsave(&lockres->l_lock, flags);
+        __lockres_clear_pending(lockres, generation, osb);
+        spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
+{
+        assert_spin_locked(&lockres->l_lock); /* caller must hold l_lock */
+        BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); /* BUSY must already be set */
+        lockres_or_flags(lockres, OCFS2_LOCK_PENDING); /* set PENDING on the lockres */
+        return lockres->l_pending_gen; /* generation to pass to lockres_clear_pending() */
+}
 static void ocfs2_blocking_ast(void *opaque, int level)
 {
        struct ocfs2_lock_res *lockres = opaque;
@@ -774,7 +851,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
        int needs_downconvert;
        unsigned long flags;
-        BUG_ON(level <= LKM_NLMODE);
+        BUG_ON(level <= DLM_LOCK_NL);
        mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
             lockres->l_name, level, lockres->l_level,
@@ -801,14 +878,22 @@ static void ocfs2_blocking_ast(void *opaque, int level)
 static void ocfs2_locking_ast(void *opaque)
 {
        struct ocfs2_lock_res *lockres = opaque;
-        struct dlm_lockstatus *lksb = &lockres->l_lksb;
+        struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
        unsigned long flags;
+        int status;
        spin_lock_irqsave(&lockres->l_lock, flags);
-        if (lksb->status != DLM_NORMAL) {
+        status = ocfs2_dlm_lock_status(&lockres->l_lksb);
-                mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
-                     lockres->l_name, lksb->status);
+        if (status == -EAGAIN) {
+                lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+                goto out;
+        }
+        if (status) {
+                mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
+                     lockres->l_name, status);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
                return;
        }
@@ -831,11 +916,23 @@ static void ocfs2_locking_ast(void *opaque)
                     lockres->l_unlock_action);
                BUG();
        }
+out:
        /* set it to something invalid so if we get called again we
         * can catch it. */
        lockres->l_action = OCFS2_AST_INVALID;
+        /* Did we try to cancel this lock?  Clear that state */
+        if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
+                lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+        /*
+         * We may have beaten the locking functions here.  We certainly
+         * know that dlm_lock() has been called :-)
+         * Because we can't have two lock calls in flight at once, we
+         * can use lockres->l_pending_gen.
+         */
+        __lockres_clear_pending(lockres, lockres->l_pending_gen,  osb);
        wake_up(&lockres->l_event);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
 }
@@ -865,15 +962,15 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 static int ocfs2_lock_create(struct ocfs2_super *osb,
                             struct ocfs2_lock_res *lockres,
                             int level,
-                             int dlm_flags)
+                             u32 dlm_flags)
 {
        int ret = 0;
-        enum dlm_status status = DLM_NORMAL;
        unsigned long flags;
+        unsigned int gen;
        mlog_entry_void();
-        mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
+        mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
             dlm_flags);
        spin_lock_irqsave(&lockres->l_lock, flags);
@@ -886,24 +983,23 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
        lockres->l_action = OCFS2_AST_ATTACH;
        lockres->l_requested = level;
        lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+        gen = lockres_set_pending(lockres);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
-        status = dlmlock(osb->dlm,
+        ret = ocfs2_dlm_lock(osb->cconn,
-                         level,
+                             level,
-                         &lockres->l_lksb,
+                             &lockres->l_lksb,
-                         dlm_flags,
+                             dlm_flags,
-                         lockres->l_name,
+                             lockres->l_name,
-                         OCFS2_LOCK_ID_MAX_LEN - 1,
+                             OCFS2_LOCK_ID_MAX_LEN - 1,
-                         ocfs2_locking_ast,
+                             lockres);
-                         lockres,
+        lockres_clear_pending(lockres, gen, osb);
-                         ocfs2_blocking_ast);
+        if (ret) {
-        if (status != DLM_NORMAL) {
+                ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
-                ocfs2_log_dlm_error("dlmlock", status, lockres);
-                ret = -EINVAL;
                ocfs2_recover_from_dlm_error(lockres, 1);
        }
-        mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
+        mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
 bail:
        mlog_exit(ret);
@@ -1016,21 +1112,22 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
 static int ocfs2_cluster_lock(struct ocfs2_super *osb,
                              struct ocfs2_lock_res *lockres,
                              int level,
-                              int lkm_flags,
+                              u32 lkm_flags,
                              int arg_flags)
 {
        struct ocfs2_mask_waiter mw;
-        enum dlm_status status;
        int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
        int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
        unsigned long flags;
+        unsigned int gen;
+        int noqueue_attempted = 0;
        mlog_entry_void();
        ocfs2_init_mask_waiter(&mw);
        if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
-                lkm_flags |= LKM_VALBLK;
+                lkm_flags |= DLM_LKF_VALBLK;
 again:
        wait = 0;
@@ -1068,52 +1165,56 @@ again:
        }
        if (level > lockres->l_level) {
+                if (noqueue_attempted > 0) {
+                        ret = -EAGAIN;
+                        goto unlock;
+                }
+                if (lkm_flags & DLM_LKF_NOQUEUE)
+                        noqueue_attempted = 1;
                if (lockres->l_action != OCFS2_AST_INVALID)
                        mlog(ML_ERROR, "lockres %s has action %u pending\n",
                             lockres->l_name, lockres->l_action);
                if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
                        lockres->l_action = OCFS2_AST_ATTACH;
-                        lkm_flags &= ~LKM_CONVERT;
+                        lkm_flags &= ~DLM_LKF_CONVERT;
                } else {
                        lockres->l_action = OCFS2_AST_CONVERT;
-                        lkm_flags |= LKM_CONVERT;
+                        lkm_flags |= DLM_LKF_CONVERT;
                }
                lockres->l_requested = level;
                lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+                gen = lockres_set_pending(lockres);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
-                BUG_ON(level == LKM_IVMODE);
+                BUG_ON(level == DLM_LOCK_IV);
-                BUG_ON(level == LKM_NLMODE);
+                BUG_ON(level == DLM_LOCK_NL);
                mlog(0, "lock %s, convert from %d to level = %d\n",
                     lockres->l_name, lockres->l_level, level);
                /* call dlm_lock to upgrade lock now */
-                status = dlmlock(osb->dlm,
+                ret = ocfs2_dlm_lock(osb->cconn,
-                                 level,
+                                     level,
-                                 &lockres->l_lksb,
+                                     &lockres->l_lksb,
-                                 lkm_flags,
+                                     lkm_flags,
-                                 lockres->l_name,
+                                     lockres->l_name,
-                                 OCFS2_LOCK_ID_MAX_LEN - 1,
+                                     OCFS2_LOCK_ID_MAX_LEN - 1,
-                                 ocfs2_locking_ast,
+                                     lockres);
-                                 lockres,
+                lockres_clear_pending(lockres, gen, osb);
-                                 ocfs2_blocking_ast);
+                if (ret) {
-                if (status != DLM_NORMAL) {
+                        if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
-                        if ((lkm_flags & LKM_NOQUEUE) &&
+                            (ret != -EAGAIN)) {
-                            (status == DLM_NOTQUEUED))
+                                ocfs2_log_dlm_error("ocfs2_dlm_lock",
-                                ret = -EAGAIN;
+                                                    ret, lockres);
-                        else {
-                                ocfs2_log_dlm_error("dlmlock", status,
-                                                    lockres);
-                                ret = -EINVAL;
                        }
                        ocfs2_recover_from_dlm_error(lockres, 1);
                        goto out;
                }
-                mlog(0, "lock %s, successfull return from dlmlock\n",
+                mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
                     lockres->l_name);
                /* At this point we've gone inside the dlm and need to
@@ -1177,9 +1278,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb,
                                 int ex,
                                 int local)
 {
-        int level =  ex ? LKM_EXMODE : LKM_PRMODE;
+        int level =  ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        unsigned long flags;
-        int lkm_flags = local ? LKM_LOCAL : 0;
+        u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
        spin_lock_irqsave(&lockres->l_lock, flags);
        BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -1222,7 +1323,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
        }
        /*
-         * We don't want to use LKM_LOCAL on a meta data lock as they
+         * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
         * don't use a generation in their lock names.
         */
        ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
@@ -1261,7 +1362,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
        lockres = &OCFS2_I(inode)->ip_rw_lockres;
-        level = write ? LKM_EXMODE : LKM_PRMODE;
+        level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
                                    0);
@@ -1274,7 +1375,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 void ocfs2_rw_unlock(struct inode *inode, int write)
 {
-        int level = write ? LKM_EXMODE : LKM_PRMODE;
+        int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1312,7 +1413,7 @@ int ocfs2_open_lock(struct inode *inode)
        lockres = &OCFS2_I(inode)->ip_open_lockres;
        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
-                                    LKM_PRMODE, 0, 0);
+                                    DLM_LOCK_PR, 0, 0);
        if (status < 0)
                mlog_errno(status);
@@ -1340,16 +1441,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
        lockres = &OCFS2_I(inode)->ip_open_lockres;
-        level = write ? LKM_EXMODE : LKM_PRMODE;
+        level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
        /*
         * The file system may already holding a PRMODE/EXMODE open lock.
-         * Since we pass LKM_NOQUEUE, the request won't block waiting on
+         * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
         * other nodes and the -EAGAIN will indicate to the caller that
         * this inode is still in use.
         */
        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
-                                    level, LKM_NOQUEUE, 0);
+                                    level, DLM_LKF_NOQUEUE, 0);
 out:
        mlog_exit(status);
@@ -1374,10 +1475,10 @@ void ocfs2_open_unlock(struct inode *inode)
        if(lockres->l_ro_holders)
                ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
-                                     LKM_PRMODE);
+                                     DLM_LOCK_PR);
        if(lockres->l_ex_holders)
                ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
-                                     LKM_EXMODE);
+                                     DLM_LOCK_EX);
 out:
        mlog_exit_void();
@@ -1464,7 +1565,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
        ocfs2_init_mask_waiter(&mw);
        if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
-            (lockres->l_level > LKM_NLMODE)) {
+            (lockres->l_level > DLM_LOCK_NL)) {
                mlog(ML_ERROR,
                     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
                     "level: %u\n", lockres->l_name, lockres->l_flags,
@@ -1503,14 +1604,12 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
        lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
-        ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
+        ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
-                      lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
+                             lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
-                      ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
+                             lockres);
-        if (ret != DLM_NORMAL) {
+        if (ret) {
-                if (trylock && ret == DLM_NOTQUEUED)
+                if (!trylock || (ret != -EAGAIN)) {
-                        ret = -EAGAIN;
+                        ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
-                else {
-                        ocfs2_log_dlm_error("dlmlock", ret, lockres);
                        ret = -EINVAL;
                }
@@ -1537,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
                 * to just bubble sucess back up to the user.
                 */
                ret = ocfs2_flock_handle_signal(lockres, level);
+        } else if (!ret && (level > lockres->l_level)) {
+                /* Trylock failed asynchronously */
+                BUG_ON(!trylock);
+                ret = -EAGAIN;
        }
 out:
@@ -1549,6 +1652,7 @@ out:
 void ocfs2_file_unlock(struct file *file)
 {
        int ret;
+        unsigned int gen;
        unsigned long flags;
        struct ocfs2_file_private *fp = file->private_data;
        struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1572,13 +1676,13 @@ void ocfs2_file_unlock(struct file *file)
         * Fake a blocking ast for the downconvert code.
         */
        lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
-        lockres->l_blocking = LKM_EXMODE;
+        lockres->l_blocking = DLM_LOCK_EX;
-        ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+        gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
        lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
-        ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
+        ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
        if (ret) {
                mlog_errno(ret);
                return;
@@ -1601,11 +1705,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
         * condition. */
        if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
                switch(lockres->l_blocking) {
-                case LKM_EXMODE:
+                case DLM_LOCK_EX:
                        if (!lockres->l_ex_holders && !lockres->l_ro_holders)
                                kick = 1;
                        break;
-                case LKM_PRMODE:
+                case DLM_LOCK_PR:
                        if (!lockres->l_ex_holders)
                                kick = 1;
                        break;
@@ -1648,7 +1752,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
        mlog_entry_void();
-        lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+        lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
        /*
         * Invalidate the LVB of a deleted inode - this way other
@@ -1700,7 +1804,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
        mlog_meta_lvb(0, lockres);
-        lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+        lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
        /* We're safe here without the lockres lock... */
        spin_lock(&oi->ip_lock);
@@ -1735,7 +1839,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
                                              struct ocfs2_lock_res *lockres)
 {
-        struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+        struct ocfs2_meta_lvb *lvb =
+                (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
        if (lvb->lvb_version == OCFS2_LVB_VERSION
            && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -1923,7 +2028,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
                         int ex,
                         int arg_flags)
 {
-        int status, level, dlm_flags, acquired;
+        int status, level, acquired;
+        u32 dlm_flags;
        struct ocfs2_lock_res *lockres = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct buffer_head *local_bh = NULL;
@@ -1950,14 +2056,13 @@ int ocfs2_inode_lock_full(struct inode *inode,
                goto local;
        if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
-                wait_event(osb->recovery_event,
+                ocfs2_wait_for_recovery(osb);
-                           ocfs2_node_map_is_empty(osb, &osb->recovery_map));
        lockres = &OCFS2_I(inode)->ip_inode_lockres;
-        level = ex ? LKM_EXMODE : LKM_PRMODE;
+        level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        dlm_flags = 0;
        if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
-                dlm_flags |= LKM_NOQUEUE;
+                dlm_flags |= DLM_LKF_NOQUEUE;
        status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
        if (status < 0) {
@@ -1974,8 +2079,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
         * committed to owning this lock so we don't allow signals to
         * abort the operation. */
        if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
-                wait_event(osb->recovery_event,
+                ocfs2_wait_for_recovery(osb);
-                           ocfs2_node_map_is_empty(osb, &osb->recovery_map));
 local:
        /*
@@ -2109,7 +2213,7 @@ int ocfs2_inode_lock_atime(struct inode *inode,
 void ocfs2_inode_unlock(struct inode *inode,
                       int ex)
 {
-        int level = ex ? LKM_EXMODE : LKM_PRMODE;
+        int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2130,10 +2234,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
                     int ex)
 {
        int status = 0;
-        int level = ex ? LKM_EXMODE : LKM_PRMODE;
+        int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
-        struct buffer_head *bh;
-        struct ocfs2_slot_info *si = osb->slot_info;
        mlog_entry_void();
@@ -2159,11 +2261,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
                goto bail;
        }
        if (status) {
-                bh = si->si_bh;
+                status = ocfs2_refresh_slot_info(osb);
-                status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
-                                          si->si_inode);
-                if (status == 0)
-                        ocfs2_update_slot_info(si);
                ocfs2_complete_lock_res_refresh(lockres, status);
@@ -2178,7 +2276,7 @@ bail:
 void ocfs2_super_unlock(struct ocfs2_super *osb,
                        int ex)
 {
-        int level = ex ? LKM_EXMODE : LKM_PRMODE;
+        int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
        if (!ocfs2_mount_local(osb))
@@ -2196,7 +2294,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb)
        if (ocfs2_mount_local(osb))
                return 0;
-        status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
+        status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
        if (status < 0)
                mlog_errno(status);
@@ -2208,13 +2306,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
        struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
        if (!ocfs2_mount_local(osb))
-                ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+                ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
 }
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
        int ret;
-        int level = ex ? LKM_EXMODE : LKM_PRMODE;
+        int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
@@ -2235,7 +2333,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
 {
-        int level = ex ? LKM_EXMODE : LKM_PRMODE;
+        int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
@@ -2400,7 +2498,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
                   lockres->l_blocking);
        /* Dump the raw LVB */
-        lvb = lockres->l_lksb.lvb;
+        lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
        for(i = 0; i < DLM_LVB_LEN; i++)
                seq_printf(m, "0x%x\t", lvb[i]);
@@ -2409,7 +2507,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
        return 0;
 }
-static struct seq_operations ocfs2_dlm_seq_ops = {
+static const struct seq_operations ocfs2_dlm_seq_ops = {
        .start =        ocfs2_dlm_seq_start,
        .stop =         ocfs2_dlm_seq_stop,
        .next =         ocfs2_dlm_seq_next,
@@ -2504,13 +2602,14 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
 int ocfs2_dlm_init(struct ocfs2_super *osb)
 {
        int status = 0;
-        u32 dlm_key;
+        struct ocfs2_cluster_connection *conn = NULL;
-        struct dlm_ctxt *dlm = NULL;
        mlog_entry_void();
-        if (ocfs2_mount_local(osb))
+        if (ocfs2_mount_local(osb)) {
+                osb->node_num = 0;
                goto local;
+        }
        status = ocfs2_dlm_init_debug(osb);
        if (status < 0) {
@@ -2527,26 +2626,31 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
                goto bail;
        }
-        /* used by the dlm code to make message headers unique, each
-         * node in this domain must agree on this. */
-        dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
        /* for now, uuid == domain */
-        dlm = dlm_register_domain(osb->uuid_str, dlm_key,
+        status = ocfs2_cluster_connect(osb->osb_cluster_stack,
-                                  &osb->osb_locking_proto);
+                                       osb->uuid_str,
-        if (IS_ERR(dlm)) {
+                                       strlen(osb->uuid_str),
-                status = PTR_ERR(dlm);
+                                       ocfs2_do_node_down, osb,
+                                       &conn);
+        if (status) {
                mlog_errno(status);
                goto bail;
        }
-        dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
+        status = ocfs2_cluster_this_node(&osb->node_num);
+        if (status < 0) {
+                mlog_errno(status);
+                mlog(ML_ERROR,
+                     "could not find this host's node number\n");
+                ocfs2_cluster_disconnect(conn, 0);
+                goto bail;
+        }
 local:
        ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
        ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
-        osb->dlm = dlm;
+        osb->cconn = conn;
        status = 0;
 bail:
@@ -2560,14 +2664,19 @@ bail:
        return status;
 }
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
+                        int hangup_pending)
 {
        mlog_entry_void();
-        dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
        ocfs2_drop_osb_locks(osb);
+        /*
+         * Now that we have dropped all locks and ocfs2_dismount_volume()
+         * has disabled recovery, the DLM won't be talking to us.  It's
+         * safe to tear things down before disconnecting the cluster.
+         */
        if (osb->dc_task) {
                kthread_stop(osb->dc_task);
                osb->dc_task = NULL;
@@ -2576,15 +2685,15 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
        ocfs2_lock_res_free(&osb->osb_super_lockres);
        ocfs2_lock_res_free(&osb->osb_rename_lockres);
-        dlm_unregister_domain(osb->dlm);
+        ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
-        osb->dlm = NULL;
+        osb->cconn = NULL;
        ocfs2_dlm_shutdown_debug(osb);
        mlog_exit_void();
 }
-static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
+static void ocfs2_unlock_ast(void *opaque, int error)
 {
        struct ocfs2_lock_res *lockres = opaque;
        unsigned long flags;
@@ -2595,24 +2704,9 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
             lockres->l_unlock_action);
        spin_lock_irqsave(&lockres->l_lock, flags);
-        /* We tried to cancel a convert request, but it was already
+        if (error) {
-         * granted. All we want to do here is clear our unlock
+                mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
-         * state. The wake_up call done at the bottom is redundant
+                     "unlock_action %d\n", error, lockres->l_name,
-         * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
-         * hurt anything anyway */
-        if (status == DLM_CANCELGRANT &&
-            lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
-                mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
-                /* We don't clear the busy flag in this case as it
-                 * should have been cleared by the ast which the dlm
-                 * has called. */
-                goto complete_unlock;
-        }
-        if (status != DLM_NORMAL) {
-                mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
-                     "unlock_action %d\n", status, lockres->l_name,
                     lockres->l_unlock_action);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
                return;
@@ -2624,14 +2718,13 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
                lockres->l_action = OCFS2_AST_INVALID;
                break;
        case OCFS2_UNLOCK_DROP_LOCK:
-                lockres->l_level = LKM_IVMODE;
+                lockres->l_level = DLM_LOCK_IV;
                break;
        default:
                BUG();
        }
        lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-complete_unlock:
        lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
        spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2643,16 +2736,16 @@ complete_unlock:
 static int ocfs2_drop_lock(struct ocfs2_super *osb,
                           struct ocfs2_lock_res *lockres)
 {
-        enum dlm_status status;
+        int ret;
        unsigned long flags;
-        int lkm_flags = 0;
+        u32 lkm_flags = 0;
        /* We didn't get anywhere near actually using this lockres. */
        if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
                goto out;
        if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
-                lkm_flags |= LKM_VALBLK;
+                lkm_flags |= DLM_LKF_VALBLK;
        spin_lock_irqsave(&lockres->l_lock, flags);
@@ -2678,7 +2771,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
        if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
                if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
-                    lockres->l_level == LKM_EXMODE &&
+                    lockres->l_level == DLM_LOCK_EX &&
                    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
                        lockres->l_ops->set_lvb(lockres);
        }
@@ -2707,15 +2800,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
        mlog(0, "lock %s\n", lockres->l_name);
-        status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags,
+        ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
-                           ocfs2_unlock_ast, lockres);
+                               lockres);
-        if (status != DLM_NORMAL) {
+        if (ret) {
-                ocfs2_log_dlm_error("dlmunlock", status, lockres);
+                ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
                mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
-                dlm_print_one_lock(lockres->l_lksb.lockid);
+                ocfs2_dlm_dump_lksb(&lockres->l_lksb);
                BUG();
        }
-        mlog(0, "lock %s, successfull return from dlmunlock\n",
+        mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
             lockres->l_name);
        ocfs2_wait_on_busy_lock(lockres);
@@ -2806,15 +2899,15 @@ int ocfs2_drop_inode_locks(struct inode *inode)
        return status;
 }
-static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
-                                      int new_level)
+                                              int new_level)
 {
        assert_spin_locked(&lockres->l_lock);
-        BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+        BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
        if (lockres->l_level <= new_level) {
-                mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
+                mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
                     lockres->l_level, new_level);
                BUG();
        }
@@ -2825,33 +2918,33 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
        lockres->l_action = OCFS2_AST_DOWNCONVERT;
        lockres->l_requested = new_level;
        lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+        return lockres_set_pending(lockres);
 }
 static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
                                  struct ocfs2_lock_res *lockres,
                                  int new_level,
-                                  int lvb)
+                                  int lvb,
+                                  unsigned int generation)
 {
-        int ret, dlm_flags = LKM_CONVERT;
+        int ret;
-        enum dlm_status status;
+        u32 dlm_flags = DLM_LKF_CONVERT;
        mlog_entry_void();
        if (lvb)
-                dlm_flags |= LKM_VALBLK;
+                dlm_flags |= DLM_LKF_VALBLK;
-        status = dlmlock(osb->dlm,
+        ret = ocfs2_dlm_lock(osb->cconn,
-                         new_level,
+                             new_level,
-                         &lockres->l_lksb,
+                             &lockres->l_lksb,
-                         dlm_flags,
+                             dlm_flags,
-                         lockres->l_name,
+                             lockres->l_name,
-                         OCFS2_LOCK_ID_MAX_LEN - 1,
+                             OCFS2_LOCK_ID_MAX_LEN - 1,
-                         ocfs2_locking_ast,
+                             lockres);
-                         lockres,
+        lockres_clear_pending(lockres, generation, osb);
-                         ocfs2_blocking_ast);
+        if (ret) {
-        if (status != DLM_NORMAL) {
+                ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
-                ocfs2_log_dlm_error("dlmlock", status, lockres);
-                ret = -EINVAL;
                ocfs2_recover_from_dlm_error(lockres, 1);
                goto bail;
        }
@@ -2862,7 +2955,7 @@ bail:
        return ret;
 }
-/* returns 1 when the caller should unlock and call dlmunlock */
+/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
 static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
                                        struct ocfs2_lock_res *lockres)
 {
@@ -2898,24 +2991,18 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
                                struct ocfs2_lock_res *lockres)
 {
        int ret;
-        enum dlm_status status;
        mlog_entry_void();
        mlog(0, "lock %s\n", lockres->l_name);
-        ret = 0;
+        ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
-        status = dlmunlock(osb->dlm,
+                               DLM_LKF_CANCEL, lockres);
-                           &lockres->l_lksb,
+        if (ret) {
-                           LKM_CANCEL,
+                ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
-                           ocfs2_unlock_ast,
-                           lockres);
-        if (status != DLM_NORMAL) {
-                ocfs2_log_dlm_error("dlmunlock", status, lockres);
-                ret = -EINVAL;
                ocfs2_recover_from_dlm_error(lockres, 0);
        }
-        mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+        mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
        mlog_exit(ret);
        return ret;
@@ -2930,6 +3017,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
        int new_level;
        int ret = 0;
        int set_lvb = 0;
+        unsigned int gen;
        mlog_entry_void();
@@ -2939,6 +3027,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
 recheck:
        if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+                /* XXX
+                 * This is a *big* race.  The OCFS2_LOCK_PENDING flag
+                 * exists entirely for one reason - another thread has set
+                 * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
+                 *
+                 * If we do ocfs2_cancel_convert() before the other thread
+                 * calls dlm_lock(), our cancel will do nothing.  We will
+                 * get no ast, and we will have no way of knowing the
+                 * cancel failed.  Meanwhile, the other thread will call
+                 * into dlm_lock() and wait...forever.
+                 *
+                 * Why forever?  Because another node has asked for the
+                 * lock first; that's why we're here in unblock_lock().
+                 *
+                 * The solution is OCFS2_LOCK_PENDING.  When PENDING is
+                 * set, we just requeue the unblock.  Only when the other
+                 * thread has called dlm_lock() and cleared PENDING will
+                 * we then cancel their request.
+                 *
+                 * All callers of dlm_lock() must set OCFS2_LOCK_PENDING
+                 * at the same time they set OCFS2_LOCK_BUSY.  They must
+                 * clear OCFS2_LOCK_PENDING after dlm_lock() returns.
+                 */
+                if (lockres->l_flags & OCFS2_LOCK_PENDING)
+                        goto leave_requeue;
                ctl->requeue = 1;
                ret = ocfs2_prepare_cancel_convert(osb, lockres);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2952,13 +3066,13 @@ recheck:
        /* if we're blocking an exclusive and we have *any* holders,
         * then requeue. */
-        if ((lockres->l_blocking == LKM_EXMODE)
+        if ((lockres->l_blocking == DLM_LOCK_EX)
            && (lockres->l_ex_holders || lockres->l_ro_holders))
                goto leave_requeue;
        /* If it's a PR we're blocking, then only
         * requeue if we've got any EX holders */
-        if (lockres->l_blocking == LKM_PRMODE &&
+        if (lockres->l_blocking == DLM_LOCK_PR &&
            lockres->l_ex_holders)
                goto leave_requeue;
@@ -3005,7 +3119,7 @@ downconvert:
        ctl->requeue = 0;
        if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
-                if (lockres->l_level == LKM_EXMODE)
+                if (lockres->l_level == DLM_LOCK_EX)
                        set_lvb = 1;
                /*
@@ -3018,9 +3132,11 @@ downconvert:
                        lockres->l_ops->set_lvb(lockres);
        }
-        ocfs2_prepare_downconvert(lockres, new_level);
+        gen = ocfs2_prepare_downconvert(lockres, new_level);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
-        ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+        ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
+                                     gen);
 leave:
        mlog_exit(ret);
        return ret;
@@ -3042,7 +3158,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
        inode = ocfs2_lock_res_inode(lockres);
        mapping = inode->i_mapping;
-        if (S_ISREG(inode->i_mode))
+        if (!S_ISREG(inode->i_mode))
                goto out;
        /*
@@ -3059,7 +3175,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
        }
        sync_mapping_buffers(mapping);
-        if (blocking == LKM_EXMODE) {
+        if (blocking == DLM_LOCK_EX) {
                truncate_inode_pages(mapping, 0);
        } else {
                /* We only need to wait on the I/O if we're not also
@@ -3080,8 +3196,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
        struct inode *inode = ocfs2_lock_res_inode(lockres);
        int checkpointed = ocfs2_inode_fully_checkpointed(inode);
-        BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
+        BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
-        BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed);
+        BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
        if (checkpointed)
                return 1;
@@ -3145,7 +3261,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
         * valid. The downconvert code will retain a PR for this node,
         * so there's no further work to do.
         */
-        if (blocking == LKM_PRMODE)
+        if (blocking == DLM_LOCK_PR)
                return UNBLOCK_CONTINUE;
        /*
@@ -3219,8 +3335,47 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
        return UNBLOCK_CONTINUE_POST;
 }
-void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+/*
-                                struct ocfs2_lock_res *lockres)
+ * This is the filesystem locking protocol.  It provides the lock handling
+ * hooks for the underlying DLM.  It has a maximum version number.
+ * The version number allows interoperability with systems running at
+ * the same major number and an equal or smaller minor number.
+ *
+ * Whenever the filesystem does new things with locks (adds or removes a
+ * lock, orders them differently, does different things underneath a lock),
+ * the version must be changed.  The protocol is negotiated when joining
+ * the dlm domain.  A node may join the domain if its major version is
+ * identical to all other nodes and its minor version is greater than
+ * or equal to all other nodes.  When its minor version is greater than
+ * the other nodes, it will run at the minor version specified by the
+ * other nodes.
+ *
+ * If a locking change is made that will not be compatible with older
+ * versions, the major number must be increased and the minor version set
+ * to zero.  If a change merely adds a behavior that can be disabled when
+ * speaking to older versions, the minor version must be increased.  If a
+ * change adds a fully backwards compatible change (eg, LVB changes that
+ * are just ignored by older versions), the version does not need to be
+ * updated.
+ */
+static struct ocfs2_locking_protocol lproto = {
+        .lp_max_version = {
+                .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+                .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+        },
+        .lp_lock_ast            = ocfs2_locking_ast,
+        .lp_blocking_ast        = ocfs2_blocking_ast,
+        .lp_unlock_ast          = ocfs2_unlock_ast,
+};
+void ocfs2_set_locking_protocol(void)
+{
+        ocfs2_stack_glue_set_locking_protocol(&lproto);
+}
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+                                       struct ocfs2_lock_res *lockres)
 {
        int status;
        struct ocfs2_unblock_ctl ctl = {0, 0,};
@@ -3356,7 +3511,7 @@ static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
        return should_wake;
 }
-int ocfs2_downconvert_thread(void *arg)
+static int ocfs2_downconvert_thread(void *arg)
 {
        int status = 0;
        struct ocfs2_super *osb = arg;
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 1d5b0699d0a9..2bb01f09c1b1 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -58,7 +58,7 @@ struct ocfs2_meta_lvb {
 #define OCFS2_LOCK_NONBLOCK             (0x04)
 int ocfs2_dlm_init(struct ocfs2_super *osb);
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
                               enum ocfs2_lock_type type,
@@ -109,12 +109,11 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
                               struct ocfs2_lock_res *lockres);
 /* for the downconvert thread */
-void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
-                                struct ocfs2_lock_res *lockres);
 void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
-extern const struct dlm_protocol_version ocfs2_locking_protocol;
+/* To set the locking protocol on module initialization */
+void ocfs2_set_locking_protocol(void);
 #endif  /* DLMGLUE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed5d5232e85d..9154c82d3258 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = {
        .open           = ocfs2_file_open,
        .aio_read       = ocfs2_file_aio_read,
        .aio_write      = ocfs2_file_aio_write,
-        .ioctl          = ocfs2_ioctl,
+        .unlocked_ioctl = ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
 #endif
@@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = {
        .fsync          = ocfs2_sync_file,
        .release        = ocfs2_dir_release,
        .open           = ocfs2_dir_open,
-        .ioctl          = ocfs2_ioctl,
+        .unlocked_ioctl = ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
 #endif
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c0efd9489fe8..c6e7213db868 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,9 +28,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/kmod.h>
-#include <dlm/dlmapi.h>
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -48,32 +45,36 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
                                            int bit);
 static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
                                              int bit);
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
-static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+/* special case -1 for now
-                                 struct ocfs2_node_map *from);
+ * TODO: should *really* make sure the calling func never passes -1!!  */
-static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+static void ocfs2_node_map_init(struct ocfs2_node_map *map)
-                                 struct ocfs2_node_map *from);
+{
+        map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
+        memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
+               sizeof(unsigned long));
+}
 void ocfs2_init_node_maps(struct ocfs2_super *osb)
 {
        spin_lock_init(&osb->node_map_lock);
-        ocfs2_node_map_init(&osb->recovery_map);
        ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
-static void ocfs2_do_node_down(int node_num,
+void ocfs2_do_node_down(int node_num, void *data)
-                               struct ocfs2_super *osb)
 {
+        struct ocfs2_super *osb = data;
        BUG_ON(osb->node_num == node_num);
        mlog(0, "ocfs2: node down event for %d\n", node_num);
-        if (!osb->dlm) {
+        if (!osb->cconn) {
                /*
-                 * No DLM means we're not even ready to participate yet.
+                 * No cluster connection means we're not even ready to
-                 * We check the slots after the DLM comes up, so we will
+                 * participate yet.  We check the slots after the cluster
-                 * notice the node death then.  We can safely ignore it
+                 * comes up, so we will notice the node death then.  We
-                 * here.
+                 * can safely ignore it here.
                 */
                return;
        }
@@ -81,70 +82,6 @@ static void ocfs2_do_node_down(int node_num,
        ocfs2_recovery_thread(osb, node_num);
 }
-/* Called from the dlm when it's about to evict a node. We may also
- * get a heartbeat callback later. */
-static void ocfs2_dlm_eviction_cb(int node_num,
-                                  void *data)
-{
-        struct ocfs2_super *osb = (struct ocfs2_super *) data;
-        struct super_block *sb = osb->sb;
-        mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
-             MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
-        ocfs2_do_node_down(node_num, osb);
-}
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
-{
-        /* Not exactly a heartbeat callback, but leads to essentially
-         * the same path so we set it up here. */
-        dlm_setup_eviction_cb(&osb->osb_eviction_cb,
-                              ocfs2_dlm_eviction_cb,
-                              osb);
-}
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
-{
-        int ret;
-        char *argv[5], *envp[3];
-        if (ocfs2_mount_local(osb))
-                return;
-        if (!osb->uuid_str) {
-                /* This can happen if we don't get far enough in mount... */
-                mlog(0, "No UUID with which to stop heartbeat!\n\n");
-                return;
-        }
-        argv[0] = (char *)o2nm_get_hb_ctl_path();
-        argv[1] = "-K";
-        argv[2] = "-u";
-        argv[3] = osb->uuid_str;
-        argv[4] = NULL;
-        mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
-        /* minimal command environment taken from cpu_run_sbin_hotplug */
-        envp[0] = "HOME=/";
-        envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-        envp[2] = NULL;
-        ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
-        if (ret < 0)
-                mlog_errno(ret);
-}
-/* special case -1 for now
- * TODO: should *really* make sure the calling func never passes -1!!  */
-void ocfs2_node_map_init(struct ocfs2_node_map *map)
-{
-        map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
-        memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
-               sizeof(unsigned long));
-}
 static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
                                            int bit)
 {
@@ -196,108 +133,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
        return ret;
 }
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
-{
-        int bit;
-        bit = find_next_bit(map->map, map->num_nodes, 0);
-        if (bit < map->num_nodes)
-                return 0;
-        return 1;
-}
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
-                            struct ocfs2_node_map *map)
-{
-        int ret;
-        BUG_ON(map->num_nodes == 0);
-        spin_lock(&osb->node_map_lock);
-        ret = __ocfs2_node_map_is_empty(map);
-        spin_unlock(&osb->node_map_lock);
-        return ret;
-}
-static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
-                                 struct ocfs2_node_map *from)
-{
-        BUG_ON(from->num_nodes == 0);
-        ocfs2_node_map_init(target);
-        __ocfs2_node_map_set(target, from);
-}
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs2_node_map_is_only(struct ocfs2_super *osb,
-                           struct ocfs2_node_map *target,
-                           int bit)
-{
-        struct ocfs2_node_map temp;
-        int ret;
-        spin_lock(&osb->node_map_lock);
-        __ocfs2_node_map_dup(&temp, target);
-        __ocfs2_node_map_clear_bit(&temp, bit);
-        ret = __ocfs2_node_map_is_empty(&temp);
-        spin_unlock(&osb->node_map_lock);
-        return ret;
-}
-static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
-                                 struct ocfs2_node_map *from)
-{
-        int num_longs, i;
-        BUG_ON(target->num_nodes != from->num_nodes);
-        BUG_ON(target->num_nodes == 0);
-        num_longs = BITS_TO_LONGS(target->num_nodes);
-        for (i = 0; i < num_longs; i++)
-                target->map[i] = from->map[i];
-}
-/* Returns whether the recovery bit was actually set - it may not be
- * if a node is still marked as needing recovery */
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
-                           int num)
-{
-        int set = 0;
-        spin_lock(&osb->node_map_lock);
-        if (!test_bit(num, osb->recovery_map.map)) {
-            __ocfs2_node_map_set_bit(&osb->recovery_map, num);
-            set = 1;
-        }
-        spin_unlock(&osb->node_map_lock);
-        return set;
-}
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
-                              int num)
-{
-        ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
-}
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
-                           struct ocfs2_node_map *map,
-                           int idx)
-{
-        int i = idx;
-        idx = O2NM_INVALID_NODE_NUM;
-        spin_lock(&osb->node_map_lock);
-        if ((i != O2NM_INVALID_NODE_NUM) &&
-            (i >= 0) &&
-            (i < map->num_nodes)) {
-                while(i < map->num_nodes) {
-                        if (test_bit(i, map->map)) {
-                                idx = i;
-                                break;
-                        }
-                        i++;
-                }
-        }
-        spin_unlock(&osb->node_map_lock);
-        return idx;
-}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index 56859211888a..74b9c5dda28d 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -28,14 +28,10 @@
 void ocfs2_init_node_maps(struct ocfs2_super *osb);
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_do_node_down(int node_num, void *data);
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
 /* node map functions - used to keep track of mounted and in-recovery
 * nodes. */
-void ocfs2_node_map_init(struct ocfs2_node_map *map);
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
-                            struct ocfs2_node_map *map);
 void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
                            struct ocfs2_node_map *map,
                            int bit);
@@ -45,21 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
 int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
                            struct ocfs2_node_map *map,
                            int bit);
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
-                           struct ocfs2_node_map *map,
-                           int idx);
-static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
-                                               struct ocfs2_node_map *map)
-{
-        return ocfs2_node_map_iterate(osb, map, 0);
-}
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
-                           int num);
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
-                              int num);
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs2_node_map_is_only(struct ocfs2_super *osb,
-                           struct ocfs2_node_map *target,
-                           int bit);
 #endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 5177fba5162b..7b142f0ce995 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/smp_lock.h>
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -59,10 +60,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
                goto bail;
        }
-        status = -EROFS;
-        if (IS_RDONLY(inode))
-                goto bail_unlock;
        status = -EACCES;
        if (!is_owner_or_cap(inode))
                goto bail_unlock;
@@ -112,9 +109,9 @@ bail:
        return status;
 }
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-        unsigned int cmd, unsigned long arg)
 {
+        struct inode *inode = filp->f_path.dentry->d_inode;
        unsigned int flags;
        int new_clusters;
        int status;
@@ -133,8 +130,13 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
                if (get_user(flags, (int __user *) arg))
                        return -EFAULT;
-                return ocfs2_set_inode_attr(inode, flags,
+                status = mnt_want_write(filp->f_path.mnt);
+                if (status)
+                        return status;
+                status = ocfs2_set_inode_attr(inode, flags,
                        OCFS2_FL_MODIFIABLE);
+                mnt_drop_write(filp->f_path.mnt);
+                return status;
        case OCFS2_IOC_RESVSP:
        case OCFS2_IOC_RESVSP64:
        case OCFS2_IOC_UNRESVSP:
@@ -168,9 +170,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 #ifdef CONFIG_COMPAT
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-        struct inode *inode = file->f_path.dentry->d_inode;
-        int ret;
        switch (cmd) {
        case OCFS2_IOC32_GETFLAGS:
                cmd = OCFS2_IOC_GETFLAGS;
@@ -190,9 +189,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
                return -ENOIOCTLCMD;
        }
-        lock_kernel();
+        return ocfs2_ioctl(file, cmd, arg);
-        ret = ocfs2_ioctl(inode, file, cmd, arg);
-        unlock_kernel();
-        return ret;
 }
 #endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4d6c4f430d0d..cf9a5ee30fef 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -10,8 +10,7 @@
 #ifndef OCFS2_IOCTL_H
 #define OCFS2_IOCTL_H
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-        unsigned int cmd, unsigned long arg);
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 #endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f31c7e8c19c3..9698338adc39 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                                 int slot);
 static int ocfs2_commit_thread(void *arg);
+/*
+ * The recovery map is a simple array of node numbers to recover.
+ * It is protected by the osb_lock.
+ */
+struct ocfs2_recovery_map {
+        unsigned int rm_used;
+        unsigned int *rm_entries;
+};
+int ocfs2_recovery_init(struct ocfs2_super *osb)
+{
+        struct ocfs2_recovery_map *rm;
+        mutex_init(&osb->recovery_lock);
+        osb->disable_recovery = 0;
+        osb->recovery_thread_task = NULL;
+        init_waitqueue_head(&osb->recovery_event);
+        rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
+                     osb->max_slots * sizeof(unsigned int),
+                     GFP_KERNEL);
+        if (!rm) {
+                mlog_errno(-ENOMEM);
+                return -ENOMEM;
+        }
+        rm->rm_entries = (unsigned int *)((char *)rm +
+                                          sizeof(struct ocfs2_recovery_map));
+        osb->recovery_map = rm;
+        return 0;
+}
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+        mb();
+        return osb->recovery_thread_task != NULL;
+}
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+        struct ocfs2_recovery_map *rm;
+        /* disable any new recovery threads and wait for any currently
+         * running ones to exit. Do this before setting the vol_state. */
+        mutex_lock(&osb->recovery_lock);
+        osb->disable_recovery = 1;
+        mutex_unlock(&osb->recovery_lock);
+        wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
+        /* At this point, we know that no more recovery threads can be
+         * launched, so wait for any recovery completion work to
+         * complete. */
+        flush_workqueue(ocfs2_wq);
+        /*
+         * Now that recovery is shut down, and the osb is about to be
+         * freed,  the osb_lock is not taken here.
+         */
+        rm = osb->recovery_map;
+        /* XXX: Should we bug if there are dirty entries? */
+        kfree(rm);
+}
+static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
+                                     unsigned int node_num)
+{
+        int i;
+        struct ocfs2_recovery_map *rm = osb->recovery_map;
+        assert_spin_locked(&osb->osb_lock);
+        for (i = 0; i < rm->rm_used; i++) {
+                if (rm->rm_entries[i] == node_num)
+                        return 1;
+        }
+        return 0;
+}
+/* Behaves like test-and-set.  Returns the previous value */
+static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+                                  unsigned int node_num)
+{
+        struct ocfs2_recovery_map *rm = osb->recovery_map;
+        spin_lock(&osb->osb_lock);
+        if (__ocfs2_recovery_map_test(osb, node_num)) {
+                spin_unlock(&osb->osb_lock);
+                return 1;
+        }
+        /* XXX: Can this be exploited? Not from o2dlm... */
+        BUG_ON(rm->rm_used >= osb->max_slots);
+        rm->rm_entries[rm->rm_used] = node_num;
+        rm->rm_used++;
+        spin_unlock(&osb->osb_lock);
+        return 0;
+}
+static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+                                     unsigned int node_num)
+{
+        int i;
+        struct ocfs2_recovery_map *rm = osb->recovery_map;
+        spin_lock(&osb->osb_lock);
+        for (i = 0; i < rm->rm_used; i++) {
+                if (rm->rm_entries[i] == node_num)
+                        break;
+        }
+        if (i < rm->rm_used) {
+                /* XXX: be careful with the pointer math */
+                memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
+                        (rm->rm_used - i - 1) * sizeof(unsigned int));
+                rm->rm_used--;
+        }
+        spin_unlock(&osb->osb_lock);
+}
 static int ocfs2_commit_cache(struct ocfs2_super *osb)
 {
        int status = 0;
@@ -586,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
        mlog_entry_void();
-        if (!journal)
+        BUG_ON(!journal);
-                BUG();
        osb = journal->j_osb;
@@ -650,6 +780,23 @@ bail:
        return status;
 }
+static int ocfs2_recovery_completed(struct ocfs2_super *osb)
+{
+        int empty;
+        struct ocfs2_recovery_map *rm = osb->recovery_map;
+        spin_lock(&osb->osb_lock);
+        empty = (rm->rm_used == 0);
+        spin_unlock(&osb->osb_lock);
+        return empty;
+}
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
+{
+        wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
+}
 /*
 * JBD Might read a cached version of another nodes journal file. We
 * don't want this as this file changes often and we get no
@@ -848,6 +995,7 @@ static int __ocfs2_recovery_thread(void *arg)
 {
        int status, node_num;
        struct ocfs2_super *osb = arg;
+        struct ocfs2_recovery_map *rm = osb->recovery_map;
        mlog_entry_void();
@@ -863,26 +1011,29 @@ restart:
                goto bail;
        }
-        while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+        spin_lock(&osb->osb_lock);
-                node_num = ocfs2_node_map_first_set_bit(osb,
+        while (rm->rm_used) {
-                                                        &osb->recovery_map);
+                /* It's always safe to remove entry zero, as we won't
-                if (node_num == O2NM_INVALID_NODE_NUM) {
+                 * clear it until ocfs2_recover_node() has succeeded. */
-                        mlog(0, "Out of nodes to recover.\n");
+                node_num = rm->rm_entries[0];
-                        break;
+                spin_unlock(&osb->osb_lock);
-                }
                status = ocfs2_recover_node(osb, node_num);
-                if (status < 0) {
+                if (!status) {
+                        ocfs2_recovery_map_clear(osb, node_num);
+                } else {
                        mlog(ML_ERROR,
                             "Error %d recovering node %d on device (%u,%u)!\n",
                             status, node_num,
                             MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
                        mlog(ML_ERROR, "Volume requires unmount.\n");
-                        continue;
                }
-                ocfs2_recovery_map_clear(osb, node_num);
+                spin_lock(&osb->osb_lock);
        }
+        spin_unlock(&osb->osb_lock);
+        mlog(0, "All nodes recovered\n");
        ocfs2_super_unlock(osb, 1);
        /* We always run recovery on our own orphan dir - the dead
@@ -893,8 +1044,7 @@ restart:
 bail:
        mutex_lock(&osb->recovery_lock);
-        if (!status &&
+        if (!status && !ocfs2_recovery_completed(osb)) {
-            !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
                mutex_unlock(&osb->recovery_lock);
                goto restart;
        }
@@ -924,8 +1074,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
        /* People waiting on recovery will wait on
         * the recovery map to empty. */
-        if (!ocfs2_recovery_map_set(osb, node_num))
+        if (ocfs2_recovery_map_set(osb, node_num))
-                mlog(0, "node %d already be in recovery.\n", node_num);
+                mlog(0, "node %d already in recovery map.\n", node_num);
        mlog(0, "starting recovery thread...\n");
@@ -1079,7 +1229,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 {
        int status = 0;
        int slot_num;
-        struct ocfs2_slot_info *si = osb->slot_info;
        struct ocfs2_dinode *la_copy = NULL;
        struct ocfs2_dinode *tl_copy = NULL;
@@ -1092,8 +1241,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
         * case we should've called ocfs2_journal_load instead. */
        BUG_ON(osb->node_num == node_num);
-        slot_num = ocfs2_node_num_to_slot(si, node_num);
+        slot_num = ocfs2_node_num_to_slot(osb, node_num);
-        if (slot_num == OCFS2_INVALID_SLOT) {
+        if (slot_num == -ENOENT) {
                status = 0;
                mlog(0, "no slot for this node, so no recovery required.\n");
                goto done;
@@ -1123,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
        /* Likewise, this would be a strange but ultimately not so
         * harmful place to get an error... */
-        ocfs2_clear_slot(si, slot_num);
+        status = ocfs2_clear_slot(osb, slot_num);
-        status = ocfs2_update_disk_slots(osb, si);
        if (status < 0)
                mlog_errno(status);
@@ -1184,23 +1332,24 @@ bail:
 * slot info struct has been updated from disk. */
 int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 {
-        int status, i, node_num;
+        unsigned int node_num;
-        struct ocfs2_slot_info *si = osb->slot_info;
+        int status, i;
        /* This is called with the super block cluster lock, so we
         * know that the slot map can't change underneath us. */
-        spin_lock(&si->si_lock);
+        spin_lock(&osb->osb_lock);
-        for(i = 0; i < si->si_num_slots; i++) {
+        for (i = 0; i < osb->max_slots; i++) {
                if (i == osb->slot_num)
                        continue;
-                if (ocfs2_is_empty_slot(si, i))
+                status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
+                if (status == -ENOENT)
                        continue;
-                node_num = si->si_global_node_nums[i];
+                if (__ocfs2_recovery_map_test(osb, node_num))
-                if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
                        continue;
-                spin_unlock(&si->si_lock);
+                spin_unlock(&osb->osb_lock);
                /* Ok, we have a slot occupied by another node which
                 * is not in the recovery map. We trylock his journal
@@ -1216,9 +1365,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
                        goto bail;
                }
-                spin_lock(&si->si_lock);
+                spin_lock(&osb->osb_lock);
        }
-        spin_unlock(&si->si_lock);
+        spin_unlock(&osb->osb_lock);
        status = 0;
 bail:
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 220f3e818e78..db82be2532ed 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
 /* Exported only for the journal struct init code in super.c. Do not call. */
 void ocfs2_complete_recovery(struct work_struct *work);
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
+int ocfs2_recovery_init(struct ocfs2_super *osb);
+void ocfs2_recovery_exit(struct ocfs2_super *osb);
 /*
 *  Journal Control:
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index add1ffdc5c6c..ce0dc147602a 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -120,9 +120,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
        mlog_entry_void();
-        if (ocfs2_mount_local(osb))
-                goto bail;
        if (osb->local_alloc_size == 0)
                goto bail;
@@ -450,6 +447,8 @@ out_mutex:
        iput(main_bm_inode);
 out:
+        if (!status)
+                ocfs2_init_inode_steal_slot(osb);
        mlog_exit(status);
        return status;
 }
@@ -526,6 +525,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
        }
        ac->ac_inode = local_alloc_inode;
+        /* We should never use localalloc from another slot */
+        ac->ac_alloc_slot = osb->slot_num;
        ac->ac_which = OCFS2_AC_USE_LOCAL;
        get_bh(osb->local_alloc_bh);
        ac->ac_bh = osb->local_alloc_bh;
@@ -588,8 +589,7 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
        while(bits_wanted--)
                ocfs2_set_bit(start++, bitmap);
-        alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
+        le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
-                                le32_to_cpu(alloc->id1.bitmap1.i_used));
        status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
        if (status < 0) {
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ae9ad9587516..d5d808fe0140 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
        fe->i_blkno = cpu_to_le64(fe_blkno);
        fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
-        fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
+        fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
        fe->i_uid = cpu_to_le32(current->fsuid);
        if (dir->i_mode & S_ISGID) {
                fe->i_gid = cpu_to_le32(dir->i_gid);
@@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir,
         *
         * And that's why, just like the VFS, we need a file system
         * rename lock. */
-        if (old_dentry != new_dentry) {
+        if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) {
                status = ocfs2_rename_lock(osb);
                if (status < 0) {
                        mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6546cef212e3..31692379c170 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -36,11 +36,8 @@
 #include <linux/mutex.h>
 #include <linux/jbd.h>
-#include "cluster/nodemanager.h"
+/* For union ocfs2_dlm_lksb */
-#include "cluster/heartbeat.h"
+#include "stackglue.h"
-#include "cluster/tcp.h"
-#include "dlm/dlmapi.h"
 #include "ocfs2_fs.h"
 #include "ocfs2_lockid.h"
@@ -101,6 +98,9 @@ enum ocfs2_unlock_action {
                                               * dropped. */
 #define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
 #define OCFS2_LOCK_NOCACHE       (0x00000200) /* don't use a holder count */
+#define OCFS2_LOCK_PENDING       (0x00000400) /* This lockres is pending a
+                                                 call to dlm_lock.  Only
+                                                 exists with BUSY set. */
 struct ocfs2_lock_res_ops;
@@ -120,13 +120,14 @@ struct ocfs2_lock_res {
        int                      l_level;
        unsigned int             l_ro_holders;
        unsigned int             l_ex_holders;
-        struct dlm_lockstatus    l_lksb;
+        union ocfs2_dlm_lksb     l_lksb;
        /* used from AST/BAST funcs. */
        enum ocfs2_ast_action    l_action;
        enum ocfs2_unlock_action l_unlock_action;
        int                      l_requested;
        int                      l_blocking;
+        unsigned int             l_pending_gen;
        wait_queue_head_t        l_event;
@@ -179,6 +180,8 @@ enum ocfs2_mount_options
 #define OCFS2_DEFAULT_ATIME_QUANTUM     60
 struct ocfs2_journal;
+struct ocfs2_slot_info;
+struct ocfs2_recovery_map;
 struct ocfs2_super
 {
        struct task_struct *commit_task;
@@ -190,7 +193,6 @@ struct ocfs2_super
        struct ocfs2_slot_info *slot_info;
        spinlock_t node_map_lock;
-        struct ocfs2_node_map recovery_map;
        u64 root_blkno;
        u64 system_dir_blkno;
@@ -206,25 +208,29 @@ struct ocfs2_super
        u32 s_feature_incompat;
        u32 s_feature_ro_compat;
-        /* Protects s_next_generaion, osb_flags. Could protect more on
+        /* Protects s_next_generation, osb_flags and s_inode_steal_slot.
-         * osb as it's very short lived. */
+         * Could protect more on osb as it's very short lived.
+         */
        spinlock_t osb_lock;
        u32 s_next_generation;
        unsigned long osb_flags;
+        s16 s_inode_steal_slot;
+        atomic_t s_num_inodes_stolen;
        unsigned long s_mount_opt;
        unsigned int s_atime_quantum;
-        u16 max_slots;
+        unsigned int max_slots;
-        s16 node_num;
+        unsigned int node_num;
-        s16 slot_num;
+        int slot_num;
-        s16 preferred_slot;
+        int preferred_slot;
        int s_sectsize_bits;
        int s_clustersize;
        int s_clustersize_bits;
        atomic_t vol_state;
        struct mutex recovery_lock;
+        struct ocfs2_recovery_map *recovery_map;
        struct task_struct *recovery_thread_task;
        int disable_recovery;
        wait_queue_head_t checkpoint_event;
@@ -245,12 +251,11 @@ struct ocfs2_super
        struct ocfs2_alloc_stats alloc_stats;
        char dev_str[20];               /* "major,minor" of the device */
-        struct dlm_ctxt *dlm;
+        char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+        struct ocfs2_cluster_connection *cconn;
        struct ocfs2_lock_res osb_super_lockres;
        struct ocfs2_lock_res osb_rename_lockres;
-        struct dlm_eviction_cb osb_eviction_cb;
        struct ocfs2_dlm_debug *osb_dlm_debug;
-        struct dlm_protocol_version osb_locking_proto;
        struct dentry *osb_debug_root;
@@ -367,11 +372,24 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
        return ret;
 }
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+        return (osb->s_feature_incompat &
+                OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+}
 static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 {
        return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
 }
+static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
+{
+        return (osb->s_feature_incompat &
+                OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
+}
 #define OCFS2_IS_VALID_DINODE(ptr)                                      \
        (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
@@ -522,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
        return pages_per_cluster;
 }
+static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+{
+        spin_lock(&osb->osb_lock);
+        osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
+        spin_unlock(&osb->osb_lock);
+        atomic_set(&osb->s_num_inodes_stolen, 0);
+}
+static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
+                                              s16 slot)
+{
+        spin_lock(&osb->osb_lock);
+        osb->s_inode_steal_slot = slot;
+        spin_unlock(&osb->osb_lock);
+}
+static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+{
+        s16 slot;
+        spin_lock(&osb->osb_lock);
+        slot = osb->s_inode_steal_slot;
+        spin_unlock(&osb->osb_lock);
+        return slot;
+}
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3633edd3982f..52c426665154 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,9 @@
 #define OCFS2_FEATURE_COMPAT_SUPP       OCFS2_FEATURE_COMPAT_BACKUP_SB
 #define OCFS2_FEATURE_INCOMPAT_SUPP     (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
                                         | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
-                                         | OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
+                                         | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
+                                         | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
+                                         | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP    OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
 /*
@@ -125,6 +127,21 @@
 /* Support for data packed into inode blocks */
 #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA      0x0040
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+/*
+ * Support for alternate, userspace cluster stacks.  If set, the superblock
+ * field s_cluster_info contains a tag for the alternate stack in use as
+ * well as the name of the cluster being joined.
+ * mount.ocfs2 must pass in a matching stack name.
+ *
+ * If not set, the classic stack will be used.  This is compatible with
+ * all older versions.
+ */
+#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK  0x0080
 /*
 * backup superblock flag is used to indicate that this volume
 * has backup superblocks.
@@ -267,6 +284,10 @@ struct ocfs2_new_group_input {
 #define OCFS2_VOL_UUID_LEN              16
 #define OCFS2_MAX_VOL_LABEL_LEN         64
+/* The alternate, userspace stack fields */
+#define OCFS2_STACK_LABEL_LEN           4
+#define OCFS2_CLUSTER_NAME_LEN          16
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE          (4 * 1024 * 1024)
@@ -475,6 +496,47 @@ struct ocfs2_extent_block
 };
 /*
+ * On disk slot map for OCFS2.  This defines the contents of the "slot_map"
+ * system file.  A slot is valid if it contains a node number >= 0.  The
+ * value -1 (0xFFFF) is OCFS2_INVALID_SLOT.  This marks a slot empty.
+ */
+struct ocfs2_slot_map {
+/*00*/  __le16 sm_slots[0];
+/*
+ * Actual on-disk size is one block.  OCFS2_MAX_SLOTS is 255,
+ * 255 * sizeof(__le16) == 510B, within the 512B block minimum blocksize.
+ */
+};
+struct ocfs2_extended_slot {
+/*00*/  __u8    es_valid;
+        __u8    es_reserved1[3];
+        __le32  es_node_num;
+/*08*/
+};
+/*
+ * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
+ * is set.  It separates out the valid marker from the node number, and
+ * has room to grow.  Unlike the old slot map, this format is defined by
+ * i_size.
+ */
+struct ocfs2_slot_map_extended {
+/*00*/  struct ocfs2_extended_slot se_slots[0];
+/*
+ * Actual size is i_size of the slot_map system file.  It should
+ * match s_max_slots * sizeof(struct ocfs2_extended_slot)
+ */
+};
+struct ocfs2_cluster_info {
+/*00*/  __u8   ci_stack[OCFS2_STACK_LABEL_LEN];
+        __le32 ci_reserved;
+/*08*/  __u8   ci_cluster[OCFS2_CLUSTER_NAME_LEN];
+/*18*/
+};
+/*
 * On disk superblock for OCFS2
 * Note that it is contained inside an ocfs2_dinode, so all offsets
 * are relative to the start of ocfs2_dinode.id2.
@@ -506,7 +568,20 @@ struct ocfs2_super_block {
                                         * group header */
 /*50*/  __u8  s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
 /*90*/  __u8  s_uuid[OCFS2_VOL_UUID_LEN];       /* 128-bit uuid */
-/*A0*/
+/*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
+                                                     stack.  Only valid
+                                                     with INCOMPAT flag. */
+/*B8*/  __le64 s_reserved2[17];         /* Fill out superblock */
+/*140*/
+        /*
+         * NOTE: As stated above, all offsets are relative to
+         * ocfs2_dinode.id2, which is at 0xC0 in the inode.
+         * 0xC0 + 0x140 = 0x200 or 512 bytes.  A superblock must fit within
+         * our smallest blocksize, which is 512 bytes.  To ensure this,
+         * we reserve the space in s_reserved2.  Anything past s_reserved2
+         * will not be available on the smallest blocksize.
+         */
 };
 /*
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 86f3e3799c2b..82c200f7a8f1 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = {
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
 {
 #ifdef __KERNEL__
-        mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+        BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
 #endif
        return ocfs2_lock_type_strings[type];
 }
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 37835ffcb039..8166968e9015 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -597,7 +597,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
                memset(cr, 0, sizeof(struct ocfs2_chain_rec));
        }
-        cr->c_blkno = le64_to_cpu(input->group);
+        cr->c_blkno = cpu_to_le64(input->group);
        le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
        le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 3a50ce555e64..bb5ff8939bf1 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -42,81 +42,244 @@
 #include "buffer_head_io.h"
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-                                    s16 global);
+struct ocfs2_slot {
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+        int sl_valid;
-                              s16 slot_num,
+        unsigned int sl_node_num;
-                              s16 node_num);
+};
-/* post the slot information on disk into our slot_info struct. */
+struct ocfs2_slot_info {
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+        int si_extended;
+        int si_slots_per_block;
+        struct inode *si_inode;
+        unsigned int si_blocks;
+        struct buffer_head **si_bh;
+        unsigned int si_num_slots;
+        struct ocfs2_slot *si_slots;
+};
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+                                    unsigned int node_num);
+/*
+ * Mark one entry of the in-memory slot array as unused.  slot_num must
+ * lie in [0, si_num_slots); anything else is a BUG.
+ */
+static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
+                                  int slot_num)
+{
+        struct ocfs2_slot *slot;
+        BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+        slot = &si->si_slots[slot_num];
+        slot->sl_valid = 0;
+}
+/*
+ * Record node_num as the owner of slot_num in the in-memory slot array
+ * and mark the entry valid.  slot_num must lie in [0, si_num_slots).
+ */
+static void ocfs2_set_slot(struct ocfs2_slot_info *si,
+                           int slot_num, unsigned int node_num)
+{
+        struct ocfs2_slot *slot;
+        BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+        slot = &si->si_slots[slot_num];
+        slot->sl_valid = 1;
+        slot->sl_node_num = node_num;
+}
+/*
+ * Extended slot map flavour: walk the cached map blocks in order and
+ * copy each on-disk entry into the in-memory slot array, stopping once
+ * si_num_slots entries have been processed.
+ */
+static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
+{
+        int blk, idx;
+        int slotno = 0;
+        struct ocfs2_slot_map_extended *se;
+        for (blk = 0; blk < si->si_blocks; blk++) {
+                se = (struct ocfs2_slot_map_extended *)si->si_bh[blk]->b_data;
+                for (idx = 0; idx < si->si_slots_per_block; idx++) {
+                        /* The last block may hold fewer live entries */
+                        if (slotno >= si->si_num_slots)
+                                break;
+                        if (!se->se_slots[idx].es_valid)
+                                ocfs2_invalidate_slot(si, slotno);
+                        else
+                                ocfs2_set_slot(si, slotno,
+                                               le32_to_cpu(se->se_slots[idx].es_node_num));
+                        slotno++;
+                }
+        }
+}
+/*
+ * Post the slot information on disk into our slot_info struct.
+ * Must be protected by osb_lock.
+ */
+static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
 {
        int i;
-        __le16 *disk_info;
+        struct ocfs2_slot_map *sm;
-        /* we don't read the slot block here as ocfs2_super_lock
+        sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
-         * should've made sure we have the most recent copy. */
-        spin_lock(&si->si_lock);
-        disk_info = (__le16 *) si->si_bh->b_data;
-        for (i = 0; i < si->si_size; i++)
+        for (i = 0; i < si->si_num_slots; i++) {
-                si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+                if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
+                        ocfs2_invalidate_slot(si, i);
+                else
+                        ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
+        }
+}
-        spin_unlock(&si->si_lock);
+static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+        /*
+         * The slot data will have been refreshed when ocfs2_super_lock
+         * was taken.
+         */
+        if (si->si_extended)
+                ocfs2_update_slot_info_extended(si);
+        else
+                ocfs2_update_slot_info_old(si);
+}
+/*
+ * Re-read all slot map blocks and, when the read succeeds, rebuild the
+ * in-memory slot array under osb_lock.  Returns 0 if the osb has no
+ * slot info attached, otherwise the status of ocfs2_read_blocks().
+ */
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
+{
+        int ret;
+        struct ocfs2_slot_info *si = osb->slot_info;
+        if (!si)
+                return 0;
+        BUG_ON(si->si_blocks == 0);
+        BUG_ON(si->si_bh == NULL);
+        mlog(0, "Refreshing slot map, reading %u block(s)\n",
+             si->si_blocks);
+        /*
+         * Every entry of si->si_bh is expected to be populated already,
+         * so ocfs2_read_blocks() ignores the block number and we pass -1.
+         * Should an entry be missing, reading block -1 (UINT64_MAX)
+         * fails instead.
+         */
+        ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
+                                si->si_inode);
+        if (ret)
+                return ret;
+        spin_lock(&osb->osb_lock);
+        ocfs2_update_slot_info(si);
+        spin_unlock(&osb->osb_lock);
+        return 0;
 }
 /* post the our slot info stuff into it's destination bh and write it
 * out. */
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
-                            struct ocfs2_slot_info *si)
+                                            int slot_num,
+                                            struct buffer_head **bh)
 {
-        int status, i;
+        int blkind = slot_num / si->si_slots_per_block;
-        __le16 *disk_info = (__le16 *) si->si_bh->b_data;
+        int slotno = slot_num % si->si_slots_per_block;
+        struct ocfs2_slot_map_extended *se;
+        BUG_ON(blkind >= si->si_blocks);
+        se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
+        se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
+        if (si->si_slots[slot_num].sl_valid)
+                se->se_slots[slotno].es_node_num =
+                        cpu_to_le32(si->si_slots[slot_num].sl_node_num);
+        *bh = si->si_bh[blkind];
+}
-        spin_lock(&si->si_lock);
+static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
-        for (i = 0; i < si->si_size; i++)
+                                       int slot_num,
-                disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
+                                       struct buffer_head **bh)
-        spin_unlock(&si->si_lock);
+{
+        int i;
+        struct ocfs2_slot_map *sm;
+        sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
+        for (i = 0; i < si->si_num_slots; i++) {
+                if (si->si_slots[i].sl_valid)
+                        sm->sm_slots[i] =
+                                cpu_to_le16(si->si_slots[i].sl_node_num);
+                else
+                        sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
+        }
+        *bh = si->si_bh[0];
+}
+static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
+                                  struct ocfs2_slot_info *si,
+                                  int slot_num)
+{
+        int status;
+        struct buffer_head *bh;
+        spin_lock(&osb->osb_lock);
+        if (si->si_extended)
+                ocfs2_update_disk_slot_extended(si, slot_num, &bh);
+        else
+                ocfs2_update_disk_slot_old(si, slot_num, &bh);
+        spin_unlock(&osb->osb_lock);
-        status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+        status = ocfs2_write_block(osb, bh, si->si_inode);
        if (status < 0)
                mlog_errno(status);
        return status;
 }
-/* try to find global node in the slot info. Returns
+/*
- * OCFS2_INVALID_SLOT if nothing is found. */
+ * Calculate how many bytes are needed by the slot map.  Returns
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+ * an error if the slot map file is too small.
-                                    s16 global)
+ */
+static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
+                                        struct inode *inode,
+                                        unsigned long long *bytes)
 {
-        int i;
+        unsigned long long bytes_needed;
-        s16 ret = OCFS2_INVALID_SLOT;
+        if (ocfs2_uses_extended_slot_map(osb)) {
+                bytes_needed = osb->max_slots *
+                        sizeof(struct ocfs2_extended_slot);
+        } else {
+                bytes_needed = osb->max_slots * sizeof(__le16);
+        }
+        if (bytes_needed > i_size_read(inode)) {
+                mlog(ML_ERROR,
+                     "Slot map file is too small!  (size %llu, needed %llu)\n",
+                     i_size_read(inode), bytes_needed);
+                return -ENOSPC;
+        }
+        *bytes = bytes_needed;
+        return 0;
+}
+/* try to find global node in the slot info. Returns -ENOENT
+ * if nothing is found. */
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+                                    unsigned int node_num)
+{
+        int i, ret = -ENOENT;
        for(i = 0; i < si->si_num_slots; i++) {
-                if (global == si->si_global_node_nums[i]) {
+                if (si->si_slots[i].sl_valid &&
-                        ret = (s16) i;
+                    (node_num == si->si_slots[i].sl_node_num)) {
+                        ret = i;
                        break;
                }
        }
        return ret;
 }
-static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
+static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
+                                   int preferred)
 {
-        int i;
+        int i, ret = -ENOSPC;
-        s16 ret = OCFS2_INVALID_SLOT;
-        if (preferred >= 0 && preferred < si->si_num_slots) {
+        if ((preferred >= 0) && (preferred < si->si_num_slots)) {
-                if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
+                if (!si->si_slots[preferred].sl_valid) {
                        ret = preferred;
                        goto out;
                }
        }
        for(i = 0; i < si->si_num_slots; i++) {
-                if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
+                if (!si->si_slots[i].sl_valid) {
-                        ret = (s16) i;
+                        ret = i;
                        break;
                }
        }
@@ -124,58 +287,155 @@ out:
        return ret;
 }
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
-                           s16 global)
 {
-        s16 ret;
+        int slot;
+        struct ocfs2_slot_info *si = osb->slot_info;
-        spin_lock(&si->si_lock);
+        spin_lock(&osb->osb_lock);
-        ret = __ocfs2_node_num_to_slot(si, global);
+        slot = __ocfs2_node_num_to_slot(si, node_num);
-        spin_unlock(&si->si_lock);
+        spin_unlock(&osb->osb_lock);
-        return ret;
+        return slot;
+}
+/*
+ * Look up the node number occupying slot_num.  The caller must hold
+ * osb_lock.  On success stores the node number in *node_num and returns
+ * 0; returns -ENOENT when the slot is not in use.
+ */
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+                                  unsigned int *node_num)
+{
+        struct ocfs2_slot_info *si = osb->slot_info;
+        assert_spin_locked(&osb->osb_lock);
+        BUG_ON(slot_num < 0);
+        /* si_slots holds exactly max_slots entries: valid indices end at max_slots - 1 */
+        BUG_ON(slot_num >= osb->max_slots);
+        if (!si->si_slots[slot_num].sl_valid)
+                return -ENOENT;
+        *node_num = si->si_slots[slot_num].sl_node_num;
+        return 0;
 }
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
-                              s16 slot_num,
-                              s16 node_num)
 {
-        BUG_ON(slot_num == OCFS2_INVALID_SLOT);
+        unsigned int i;
-        BUG_ON(slot_num >= si->si_num_slots);
-        BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
+        if (si == NULL)
-               (node_num >= O2NM_MAX_NODES));
+                return;
+        if (si->si_inode)
+                iput(si->si_inode);
+        if (si->si_bh) {
+                for (i = 0; i < si->si_blocks; i++) {
+                        if (si->si_bh[i]) {
+                                brelse(si->si_bh[i]);
+                                si->si_bh[i] = NULL;
+                        }
+                }
+                kfree(si->si_bh);
+        }
-        si->si_global_node_nums[slot_num] = node_num;
+        kfree(si);
 }
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
-                      s16 slot_num)
 {
-        spin_lock(&si->si_lock);
+        struct ocfs2_slot_info *si = osb->slot_info;
-        __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
-        spin_unlock(&si->si_lock);
+        if (si == NULL)
+                return 0;
+        spin_lock(&osb->osb_lock);
+        ocfs2_invalidate_slot(si, slot_num);
+        spin_unlock(&osb->osb_lock);
+        return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
 }
-int ocfs2_init_slot_info(struct ocfs2_super *osb)
+static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
+                                  struct ocfs2_slot_info *si)
 {
-        int status, i;
+        int status = 0;
        u64 blkno;
+        unsigned long long blocks, bytes;
+        unsigned int i;
+        struct buffer_head *bh;
+        status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
+        if (status)
+                goto bail;
+        blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
+        BUG_ON(blocks > UINT_MAX);
+        si->si_blocks = blocks;
+        if (!si->si_blocks)
+                goto bail;
+        if (si->si_extended)
+                si->si_slots_per_block =
+                        (osb->sb->s_blocksize /
+                         sizeof(struct ocfs2_extended_slot));
+        else
+                si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
+        /* The size checks above should ensure this */
+        BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
+        mlog(0, "Slot map needs %u buffers for %llu bytes\n",
+             si->si_blocks, bytes);
+        si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
+                            GFP_KERNEL);
+        if (!si->si_bh) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto bail;
+        }
+        for (i = 0; i < si->si_blocks; i++) {
+                status = ocfs2_extent_map_get_blocks(si->si_inode, i,
+                                                     &blkno, NULL, NULL);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+                mlog(0, "Reading slot map block %u at %llu\n", i,
+                     (unsigned long long)blkno);
+                bh = NULL;  /* Acquire a fresh bh */
+                status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+                si->si_bh[i] = bh;
+        }
+bail:
+        return status;
+}
+int ocfs2_init_slot_info(struct ocfs2_super *osb)
+{
+        int status;
        struct inode *inode = NULL;
-        struct buffer_head *bh = NULL;
        struct ocfs2_slot_info *si;
-        si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+        si = kzalloc(sizeof(struct ocfs2_slot_info) +
+                     (sizeof(struct ocfs2_slot) * osb->max_slots),
+                     GFP_KERNEL);
        if (!si) {
                status = -ENOMEM;
                mlog_errno(status);
                goto bail;
        }
-        spin_lock_init(&si->si_lock);
+        si->si_extended = ocfs2_uses_extended_slot_map(osb);
        si->si_num_slots = osb->max_slots;
-        si->si_size = OCFS2_MAX_SLOTS;
+        si->si_slots = (struct ocfs2_slot *)((char *)si +
+                                             sizeof(struct ocfs2_slot_info));
-        for(i = 0; i < si->si_num_slots; i++)
-                si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
        inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
                                            OCFS2_INVALID_SLOT);
@@ -185,61 +445,53 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
                goto bail;
        }
-        status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
+        si->si_inode = inode;
-        if (status < 0) {
+        status = ocfs2_map_slot_buffers(osb, si);
-                mlog_errno(status);
-                goto bail;
-        }
-        status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
-        si->si_inode = inode;
+        osb->slot_info = (struct ocfs2_slot_info *)si;
-        si->si_bh = bh;
-        osb->slot_info = si;
 bail:
        if (status < 0 && si)
-                ocfs2_free_slot_info(si);
+                __ocfs2_free_slot_info(si);
        return status;
 }
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+void ocfs2_free_slot_info(struct ocfs2_super *osb)
 {
-        if (si->si_inode)
+        struct ocfs2_slot_info *si = osb->slot_info;
-                iput(si->si_inode);
-        if (si->si_bh)
+        osb->slot_info = NULL;
-                brelse(si->si_bh);
+        __ocfs2_free_slot_info(si);
-        kfree(si);
 }
 int ocfs2_find_slot(struct ocfs2_super *osb)
 {
        int status;
-        s16 slot;
+        int slot;
        struct ocfs2_slot_info *si;
        mlog_entry_void();
        si = osb->slot_info;
+        spin_lock(&osb->osb_lock);
        ocfs2_update_slot_info(si);
-        spin_lock(&si->si_lock);
        /* search for ourselves first and take the slot if it already
         * exists. Perhaps we need to mark this in a variable for our
         * own journal recovery? Possibly not, though we certainly
         * need to warn to the user */
        slot = __ocfs2_node_num_to_slot(si, osb->node_num);
-        if (slot == OCFS2_INVALID_SLOT) {
+        if (slot < 0) {
                /* if no slot yet, then just take 1st available
                 * one. */
                slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
-                if (slot == OCFS2_INVALID_SLOT) {
+                if (slot < 0) {
-                        spin_unlock(&si->si_lock);
+                        spin_unlock(&osb->osb_lock);
                        mlog(ML_ERROR, "no free slots available!\n");
                        status = -EINVAL;
                        goto bail;
@@ -248,13 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
                mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
                     slot);
-        __ocfs2_fill_slot(si, slot, osb->node_num);
+        ocfs2_set_slot(si, slot, osb->node_num);
        osb->slot_num = slot;
-        spin_unlock(&si->si_lock);
+        spin_unlock(&osb->osb_lock);
        mlog(0, "taking node slot %d\n", osb->slot_num);
-        status = ocfs2_update_disk_slots(osb, si);
+        status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
        if (status < 0)
                mlog_errno(status);
@@ -265,27 +517,27 @@ bail:
 void ocfs2_put_slot(struct ocfs2_super *osb)
 {
-        int status;
+        int status, slot_num;
        struct ocfs2_slot_info *si = osb->slot_info;
        if (!si)
                return;
+        spin_lock(&osb->osb_lock);
        ocfs2_update_slot_info(si);
-        spin_lock(&si->si_lock);
+        slot_num = osb->slot_num;
-        __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+        ocfs2_invalidate_slot(si, osb->slot_num);
        osb->slot_num = OCFS2_INVALID_SLOT;
-        spin_unlock(&si->si_lock);
+        spin_unlock(&osb->osb_lock);
-        status = ocfs2_update_disk_slots(osb, si);
+        status = ocfs2_update_disk_slot(osb, si, slot_num);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
 bail:
-        osb->slot_info = NULL;
+        ocfs2_free_slot_info(osb);
-        ocfs2_free_slot_info(si);
 }
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 1025872aaade..601c95fd7003 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -27,38 +27,18 @@
 #ifndef SLOTMAP_H
 #define SLOTMAP_H
-struct ocfs2_slot_info {
-        spinlock_t si_lock;
-        struct inode *si_inode;
-        struct buffer_head *si_bh;
-        unsigned int si_num_slots;
-        unsigned int si_size;
-        s16 si_global_node_nums[OCFS2_MAX_SLOTS];
-};
 int ocfs2_init_slot_info(struct ocfs2_super *osb);
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+void ocfs2_free_slot_info(struct ocfs2_super *osb);
 int ocfs2_find_slot(struct ocfs2_super *osb);
 void ocfs2_put_slot(struct ocfs2_super *osb);
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
-                            struct ocfs2_slot_info *si);
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-                           s16 global);
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
-                      s16 slot_num);
-static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
-                                      int slot_num)
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
-{
+                                  unsigned int *node_num);
-        BUG_ON(slot_num == OCFS2_INVALID_SLOT);
-        assert_spin_locked(&si->si_lock);
-        return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
-}
 #endif
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
new file mode 100644
index 000000000000..ac1d74c63bf5
--- /dev/null
+++ b/fs/ocfs2/stack_o2cb.c
@@ -0,0 +1,420 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_o2cb.c
+ *
+ * Code which interfaces ocfs2 with the o2cb stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/crc32.h>
+#include <linux/module.h>
+/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
+#include <linux/fs.h>
+#include "cluster/masklog.h"
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "stackglue.h"
+/*
+ * Per-connection private state.  o2cb_cluster_connect() allocates one
+ * and stores it in conn->cc_private; o2cb_cluster_disconnect() frees it.
+ */
+struct o2dlm_private {
+        struct dlm_eviction_cb op_eviction_cb;
+};
+static struct ocfs2_stack_plugin o2cb_stack;
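+/*
+ * The generic DLM_LOCK_* modes must be numerically identical to o2dlm's
+ * LKM_* modes, because mode_to_o2dlm() passes the value through unchanged.
+ */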
+#if (DLM_LOCK_IV != LKM_IVMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_NL != LKM_NLMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CR != LKM_CRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CW != LKM_CWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PR != LKM_PRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PW != LKM_PWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_EX != LKM_EXMODE)
+# error Lock modes do not match
+#endif
+/*
+ * Convert a generic lock mode to an o2dlm mode.  The two sets of
+ * constants are checked for equality at compile time, so this is only
+ * an upper-bound check followed by a pass-through.
+ */
+static inline int mode_to_o2dlm(int mode)
+{
+        BUG_ON(LKM_MAXMODE < mode);
+        return mode;
+}
+/* Move one generic flag bit out of 'flags' and into its o2dlm equivalent. */
+#define map_flag(_generic, _o2dlm)                      \
+        do {                                            \
+                if (flags & (_generic)) {               \
+                        flags &= ~(_generic);           \
+                        mapped |= (_o2dlm);             \
+                }                                       \
+        } while (0)
+/*
+ * Translate a mask of generic DLM_LKF_* flags into the matching LKM_*
+ * mask.  Any bit without a mapping below is a BUG.
+ */
+static int flags_to_o2dlm(u32 flags)
+{
+        int mapped = 0;
+        map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
+        map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
+        map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
+        map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
+        map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
+        map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
+        map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
+        map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
+        map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
+        /* Each map_flag() clears the bit it handled; leftovers are unknown */
+        BUG_ON(flags != 0);
+        return mapped;
+}
+#undef map_flag
+/*
+ * Map an o2dlm status to standard errno values.
+ *
+ * o2dlm only uses a handful of these, and returns even fewer to the
+ * caller. Still, we try to assign sane values to each error.
+ *
+ * The following value pairs have special meanings to dlmglue, thus
+ * the right hand side needs to stay unique - never duplicate the
+ * mapping elsewhere in the table!
+ *
+ * DLM_NORMAL:          0
+ * DLM_NOTQUEUED:       -EAGAIN
+ * DLM_CANCELGRANT:     -EBUSY
+ * DLM_CANCEL:          -DLM_ECANCEL
+ */
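+/*
+ * Indexed by enum dlm_status; keep in sync with dlmapi.h.  Any index
+ * below the highest one that has no explicit entry is zero-initialized
+ * by C and would therefore be reported as success (0).
+ */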
+static int status_map[] = {
+        [DLM_NORMAL]                    = 0,            /* Success */
+        [DLM_GRANTED]                   = -EINVAL,
+        [DLM_DENIED]                    = -EACCES,
+        [DLM_DENIED_NOLOCKS]            = -EACCES,
+        [DLM_WORKING]                   = -EACCES,
+        [DLM_BLOCKED]                   = -EINVAL,
+        [DLM_BLOCKED_ORPHAN]            = -EINVAL,
+        [DLM_DENIED_GRACE_PERIOD]       = -EACCES,
+        [DLM_SYSERR]                    = -ENOMEM,      /* It is what it is */
+        [DLM_NOSUPPORT]                 = -EPROTO,
+        [DLM_CANCELGRANT]               = -EBUSY,       /* Cancel after grant */
+        [DLM_IVLOCKID]                  = -EINVAL,
+        [DLM_SYNC]                      = -EINVAL,
+        [DLM_BADTYPE]                   = -EINVAL,
+        [DLM_BADRESOURCE]               = -EINVAL,
+        [DLM_MAXHANDLES]                = -ENOMEM,
+        [DLM_NOCLINFO]                  = -EINVAL,
+        [DLM_NOLOCKMGR]                 = -EINVAL,
+        [DLM_NOPURGED]                  = -EINVAL,
+        [DLM_BADARGS]                   = -EINVAL,
+        [DLM_VOID]                      = -EINVAL,
+        [DLM_NOTQUEUED]                 = -EAGAIN,      /* Trylock failed */
+        [DLM_IVBUFLEN]                  = -EINVAL,
+        [DLM_CVTUNGRANT]                = -EPERM,
+        [DLM_BADPARAM]                  = -EINVAL,
+        [DLM_VALNOTVALID]               = -EINVAL,
+        [DLM_REJECTED]                  = -EPERM,
+        [DLM_ABORT]                     = -EINVAL,
+        [DLM_CANCEL]                    = -DLM_ECANCEL, /* Successful cancel */
+        [DLM_IVRESHANDLE]               = -EINVAL,
+        [DLM_DEADLOCK]                  = -EDEADLK,
+        [DLM_DENIED_NOASTS]             = -EINVAL,
+        [DLM_FORWARD]                   = -EINVAL,
+        [DLM_TIMEOUT]                   = -ETIMEDOUT,
+        [DLM_IVGROUPID]                 = -EINVAL,
+        [DLM_VERS_CONFLICT]             = -EOPNOTSUPP,
+        [DLM_BAD_DEVICE_PATH]           = -ENOENT,
+        [DLM_NO_DEVICE_PERMISSION]      = -EPERM,
+        [DLM_NO_CONTROL_DEVICE]         = -ENOENT,
+        [DLM_RECOVERING]                = -ENOTCONN,
+        [DLM_MIGRATING]                 = -ERESTART,
+        [DLM_MAXSTATS]                  = -EINVAL,
+};
+/*
+ * Convert an o2dlm status code into the errno value listed in
+ * status_map.  A status outside the table is a BUG.
+ */
+static int dlm_status_to_errno(enum dlm_status status)
+{
+        /* Valid indices are 0 .. count - 1, so 'status == count' is out of range too */
+        BUG_ON(status >= (sizeof(status_map) / sizeof(status_map[0])));
+        return status_map[status];
+}
+/* Hand an o2dlm lock AST to the lp_lock_ast hook of the stack's sp_proto. */
+static void o2dlm_lock_ast_wrapper(void *astarg)
+{
+        BUG_ON(!o2cb_stack.sp_proto);
+        (*o2cb_stack.sp_proto->lp_lock_ast)(astarg);
+}
+/* Hand an o2dlm blocking AST to the lp_blocking_ast hook of sp_proto. */
+static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
+{
+        BUG_ON(!o2cb_stack.sp_proto);
+        (*o2cb_stack.sp_proto->lp_blocking_ast)(astarg, level);
+}
+/*
+ * Hand an o2dlm unlock AST to the lp_unlock_ast hook of sp_proto,
+ * with the status converted to an errno.  DLM_CANCELGRANT is dropped.
+ */
+static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
+{
+        int error = dlm_status_to_errno(status);
+        BUG_ON(!o2cb_stack.sp_proto);
+        /*
+         * In o2dlm, you can get both the lock_ast() for the lock being
+         * granted and the unlock_ast() for the CANCEL failing.  A
+         * successful cancel sends DLM_NORMAL here.  If the lock grant
+         * happened before the cancel arrived, you get DLM_CANCELGRANT.
+         *
+         * The second AST is redundant: the lock_ast() is expected to
+         * handle the granted lock, so DLM_CANCELGRANT is not forwarded.
+         */
+        if (status != DLM_CANCELGRANT)
+                o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
+}
+/*
+ * ->dlm_lock() for the o2cb stack: convert the generic mode and flags,
+ * call dlmlock() on the connection's lockspace with the o2dlm half of
+ * the lksb, and return the resulting status as an errno.
+ */
+static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
+                         int mode,
+                         union ocfs2_dlm_lksb *lksb,
+                         u32 flags,
+                         void *name,
+                         unsigned int namelen,
+                         void *astarg)
+{
+        int o2dlm_mode = mode_to_o2dlm(mode);
+        int o2dlm_flags = flags_to_o2dlm(flags);
+        enum dlm_status status;
+        status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
+                         o2dlm_flags, name, namelen,
+                         o2dlm_lock_ast_wrapper, astarg,
+                         o2dlm_blocking_ast_wrapper);
+        return dlm_status_to_errno(status);
+}
+/*
+ * ->dlm_unlock() for the o2cb stack: convert the generic flags, call
+ * dlmunlock() on the connection's lockspace, and return the resulting
+ * status as an errno.
+ */
+static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
+                           union ocfs2_dlm_lksb *lksb,
+                           u32 flags,
+                           void *astarg)
+{
+        int o2dlm_flags = flags_to_o2dlm(flags);
+        enum dlm_status status;
+        status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
+                           o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
+        return dlm_status_to_errno(status);
+}
+/* Return the status recorded in the o2dlm half of the lksb as an errno. */
+static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+        enum dlm_status status = lksb->lksb_o2dlm.status;
+        return dlm_status_to_errno(status);
+}
+/* Return the lvb member of the o2dlm half of the lksb as an untyped pointer. */
+static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+        void *lvb = (void *)(lksb->lksb_o2dlm.lvb);
+        return lvb;
+}
+/* ->dump_lksb(): pass the lksb's o2dlm lockid to dlm_print_one_lock(). */
+static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+        dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
+}
+/*
+ * Eviction callback registered with the dlm; it runs when the dlm is
+ * about to evict a node, which is how the classic stack signals node
+ * death.  'data' is the cluster connection given to
+ * dlm_setup_eviction_cb(); the event is logged and passed on to the
+ * connection's recovery handler.
+ */
+static void o2dlm_eviction_cb(int node_num, void *data)
+{
+        struct ocfs2_cluster_connection *conn;
+        conn = data;
+        mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
+             node_num, conn->cc_namelen, conn->cc_name);
+        (*conn->cc_recovery_handler)(node_num, conn->cc_recovery_data);
+}
+/*
+ * ->connect() for the o2cb stack.
+ *
+ * Checks that the local node is heartbeating, allocates the private
+ * data holding the eviction callback, registers the dlm domain named by
+ * conn->cc_name, and then stores the lockspace and the protocol version
+ * left in dlm_version by dlm_register_domain() back into conn.
+ *
+ * Returns 0 on success or a negative errno.  On failure nothing is left
+ * allocated and conn->cc_private is not left pointing at freed memory.
+ */
+static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+        int rc = 0;
+        u32 dlm_key;
+        struct dlm_ctxt *dlm;
+        struct o2dlm_private *priv;
+        struct dlm_protocol_version dlm_version;
+        BUG_ON(conn == NULL);
+        BUG_ON(o2cb_stack.sp_proto == NULL);
+        /* for now we only have one cluster/node, make sure we see it
+         * in the heartbeat universe */
+        if (!o2hb_check_local_node_heartbeating()) {
+                rc = -EINVAL;
+                goto out;
+        }
+        priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
+        if (!priv) {
+                /* Nothing of ours to free yet; leave conn untouched */
+                rc = -ENOMEM;
+                goto out;
+        }
+        /* This just fills the structure in.  It is safe to pass conn. */
+        dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
+                              conn);
+        conn->cc_private = priv;
+        /* used by the dlm code to make message headers unique, each
+         * node in this domain must agree on this. */
+        dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
+        dlm_version.pv_major = conn->cc_version.pv_major;
+        dlm_version.pv_minor = conn->cc_version.pv_minor;
+        dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
+        if (IS_ERR(dlm)) {
+                rc = PTR_ERR(dlm);
+                mlog_errno(rc);
+                goto out_free;
+        }
+        conn->cc_version.pv_major = dlm_version.pv_major;
+        conn->cc_version.pv_minor = dlm_version.pv_minor;
+        conn->cc_lockspace = dlm;
+        dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
+out_free:
+        if (rc) {
+                /* Clear the pointer before freeing so conn never dangles */
+                conn->cc_private = NULL;
+                kfree(priv);
+        }
+out:
+        return rc;
+}
+/*
+ * ->disconnect() for the o2cb stack: undo o2cb_cluster_connect().
+ * hangup_pending is not used here.  Always returns 0.
+ */
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+                                   int hangup_pending)
+{
+        struct o2dlm_private *priv = conn->cc_private;
+        struct dlm_ctxt *dlm = conn->cc_lockspace;
+        /* The callback structure lives inside priv: unregister, then free */
+        dlm_unregister_eviction_cb(&priv->op_eviction_cb);
+        conn->cc_private = NULL;
+        kfree(priv);
+        dlm_unregister_domain(dlm);
+        conn->cc_lockspace = NULL;
+        return 0;
+}
+/*
+ * Run the heartbeat control helper from userspace as
+ * "<hb_ctl_path> -K -u <group>" and wait for it to finish.  A negative
+ * result is logged; nothing is returned to the caller.
+ */
+static void o2hb_stop(const char *group)
+{
+        int rc;
+        char *envp[3], *argv[5];
+        /* minimal command environment taken from cpu_run_sbin_hotplug */
+        envp[0] = "HOME=/";
+        envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+        envp[2] = NULL;
+        argv[0] = (char *)o2nm_get_hb_ctl_path();
+        argv[1] = "-K";
+        argv[2] = "-u";
+        argv[3] = (char *)group;
+        argv[4] = NULL;
+        mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+        rc = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+        if (rc < 0)
+                mlog_errno(rc);
+}
+/*
+ * Hangup is a hack for tools compatibility.  Older ocfs2-tools software
+ * expects the filesystem to call "ocfs2_hb_ctl" during unmount.  This
+ * happens regardless of whether the DLM got started, so we can't do it
+ * in ocfs2_cluster_disconnect().  We bring the o2hb_stop() function into
+ * the glue and provide a "hangup" API for super.c to call.
+ *
+ * Other stacks will eventually provide a NULL ->hangup() pointer.
+ *
+ * Only 'group' is used; 'grouplen' is accepted but ignored.
+ */
+static void o2cb_cluster_hangup(const char *group, int grouplen)
+{
+        o2hb_stop(group);
+}
+/*
+ * ->this_node() for the o2cb stack: fetch the local node number from
+ * o2nm_this_node().  Returns 0 and fills *node on success, -ENOENT if
+ * the number is O2NM_INVALID_NODE_NUM, or -EOVERFLOW if it is not below
+ * O2NM_MAX_NODES.  *node is left untouched on error.
+ */
+static int o2cb_cluster_this_node(unsigned int *node)
+{
+        int rc = 0;
+        int node_num = o2nm_this_node();
+        if (node_num == O2NM_INVALID_NODE_NUM)
+                rc = -ENOENT;
+        else if (node_num >= O2NM_MAX_NODES)
+                rc = -EOVERFLOW;
+        else
+                *node = node_num;
+        return rc;
+}
+/*
+ * Operations implemented by this plugin; o2cb_stack.sp_ops points at
+ * this table.
+ */
+struct ocfs2_stack_operations o2cb_stack_ops = {
+        .connect        = o2cb_cluster_connect,
+        .disconnect     = o2cb_cluster_disconnect,
+        .hangup         = o2cb_cluster_hangup,
+        .this_node      = o2cb_cluster_this_node,
+        .dlm_lock       = o2cb_dlm_lock,
+        .dlm_unlock     = o2cb_dlm_unlock,
+        .lock_status    = o2cb_dlm_lock_status,
+        .lock_lvb       = o2cb_dlm_lvb,
+        .dump_lksb      = o2cb_dump_lksb,
+};
+/*
+ * Plugin descriptor passed to ocfs2_stack_glue_register() and
+ * ocfs2_stack_glue_unregister().  sp_proto is not set here; the AST
+ * wrappers and o2cb_cluster_connect() BUG if it is still NULL when
+ * they run.
+ */
+static struct ocfs2_stack_plugin o2cb_stack = {
+        .sp_name        = "o2cb",
+        .sp_ops         = &o2cb_stack_ops,
+        .sp_owner       = THIS_MODULE,
+};
+/* Module load: register the o2cb plugin with the stack glue. */
+static int __init o2cb_stack_init(void)
+{
+        return ocfs2_stack_glue_register(&o2cb_stack);
+}
+/* Module unload: remove the o2cb plugin from the stack glue. */
+static void __exit o2cb_stack_exit(void)
+{
+        ocfs2_stack_glue_unregister(&o2cb_stack);
+}
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
+MODULE_LICENSE("GPL");
+module_init(o2cb_stack_init);
+module_exit(o2cb_stack_exit);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
new file mode 100644
index 000000000000..7428663f9cbb
--- /dev/null
+++ b/fs/ocfs2/stack_user.c
@@ -0,0 +1,883 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_user.c
+ *
+ * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/reboot.h>
+#include <asm/uaccess.h>
+#include "ocfs2.h"  /* For struct ocfs2_lock_res */
+#include "stackglue.h"
+/*
+ * The control protocol starts with a handshake.  Until the handshake
+ * is complete, the control device will fail all write(2)s.
+ *
+ * The handshake is simple.  First, the client reads until EOF.  Each line
+ * of output is a supported protocol tag.  All protocol tags are a single
+ * character followed by a two hex digit version number.  Currently the
+ * only thing supported is T01, for "Text-based version 0x01".  Next, the
+ * client writes the version they would like to use, including the newline.
+ * Thus, the protocol tag is 'T01\n'.  If the version tag written is
+ * unknown, -EINVAL is returned.  Once the negotiation is complete, the
+ * client can start sending messages.
+ *
+ * The T01 protocol has three messages.  First is the "SETN" message.
+ * It has the following syntax:
+ *
+ *  SETN<space><8-char-hex-nodenum><newline>
+ *
+ * This is 14 characters.
+ *
+ * The "SETN" message must be the first message following the protocol.
+ * It tells ocfs2_control the local node number.
+ *
+ * Next comes the "SETV" message.  It has the following syntax:
+ *
+ *  SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
+ *
+ * This is 11 characters.
+ *
+ * The "SETV" message sets the filesystem locking protocol version as
+ * negotiated by the client.  The client negotiates based on the maximum
+ * version advertised in /sys/fs/ocfs2/max_locking_protocol.  The major
+ * number from the "SETV" message must match
+ * user_stack.sp_proto->lp_max_version.pv_major, and the minor number
+ * must be less than or equal to ...->lp_max_version.pv_minor.
+ *
+ * Once this information has been set, mounts will be allowed.  From this
+ * point on, the "DOWN" message can be sent for node down notification.
+ * It has the following syntax:
+ *
+ *  DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
+ *
+ * eg:
+ *
+ *  DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
+ *
+ * This is 47 characters.
+ */
+/*
+ * The protocol tag offered to, and expected back from, the client during
+ * the handshake.  For now, we have just one protocol version.
+ * OCFS2_CONTROL_PROTO_LEN counts the tag's characters including the
+ * newline, but not the C string's terminating NUL.
+ */
+#define OCFS2_CONTROL_PROTO                     "T01\n"
+#define OCFS2_CONTROL_PROTO_LEN                 4
+/*
+ * Handshake states, kept per open file in ocfs2_control_private.op_state:
+ *
+ *   INVALID  - initial value (the private data is kzalloc()ed by
+ *              ocfs2_control_open()); write(2) fails with -EINVAL.
+ *   READ     - set by ocfs2_control_read() once the whole protocol list
+ *              has been read; the next write must be the protocol tag.
+ *   PROTOCOL - set by ocfs2_control_validate_protocol(); SETN and SETV
+ *              messages are accepted.
+ *   VALID    - set by ocfs2_control_install_private() once the node
+ *              number and version are installed; DOWN is accepted.
+ */
+#define OCFS2_CONTROL_HANDSHAKE_INVALID         (0)
+#define OCFS2_CONTROL_HANDSHAKE_READ            (1)
+#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL        (2)
+#define OCFS2_CONTROL_HANDSHAKE_VALID           (3)
+/*
+ * Messages.  Every message starts with a four character op.  The
+ * *_TOTAL_LEN values are the exact size of one message, counting the op,
+ * the separating spaces, the fixed-width hex fields and the trailing
+ * newline:
+ *
+ *   SETN: 4 + 1 + 8 + 1             = 14
+ *   SETV: 4 + 1 + 2 + 1 + 2 + 1     = 11
+ *   DOWN: 4 + 1 + 32 + 1 + 8 + 1    = 47
+ */
+#define OCFS2_CONTROL_MESSAGE_OP_LEN            4
+#define OCFS2_CONTROL_MESSAGE_SETNODE_OP        "SETN"
+#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP     "SETV"
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN      11
+#define OCFS2_CONTROL_MESSAGE_DOWN_OP           "DOWN"
+#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN    47
+#define OCFS2_TEXT_UUID_LEN                     32
+#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN        2
+#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN       8
+/*
+ * An ocfs2_live_connection ties one cluster connection to the control
+ * device.  It is allocated by ocfs2_live_connection_new() and freed by
+ * ocfs2_live_connection_drop().
+ *
+ * NOTE(review): this comment used to call the structure refcounted
+ * "because the filesystem and miscdevice sides can detach in different
+ * order", but there is no reference count field and
+ * ocfs2_live_connection_drop() frees it unconditionally.
+ */
+struct ocfs2_live_connection {
+        /* Entry in ocfs2_live_connection_list. */
+        struct list_head                oc_list;
+        /* The cluster connection; set to NULL when dropped. */
+        struct ocfs2_cluster_connection *oc_conn;
+};
+/*
+ * Per-open state of the control device, hung off file->private_data by
+ * ocfs2_control_open() and freed by ocfs2_control_release().
+ */
+struct ocfs2_control_private {
+        /* Entry in ocfs2_control_private_list. */
+        struct list_head op_list;
+        /* One of the OCFS2_CONTROL_HANDSHAKE_* states. */
+        int op_state;
+        /* Node number from the SETN message; -1 until one is received. */
+        int op_this_node;
+        /* Version from the SETV message; pv_major == 0 means "not set". */
+        struct ocfs2_protocol_version op_proto;
+};
+/*
+ * SETN<space><8-char-hex-nodenum><newline>
+ *
+ * The members mirror the message byte for byte.  The parser overwrites
+ * the separator members with '\0' so that nodestr can be handed to
+ * simple_strtol() as a terminated string.
+ */
+struct ocfs2_control_message_setn {
+        char    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+        char    space;
+        char    nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+        char    newline;
+};
+/*
+ * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
+ *
+ * The members mirror the message byte for byte.  The parser overwrites
+ * space2 and newline with '\0', which terminates major and minor
+ * respectively for simple_strtol().
+ */
+struct ocfs2_control_message_setv {
+        char    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+        char    space1;
+        char    major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+        char    space2;
+        char    minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+        char    newline;
+};
+/*
+ * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
+ *
+ * The members mirror the message byte for byte.  The parser overwrites
+ * space2 and newline with '\0', which turns uuid and nodestr into
+ * terminated strings.
+ */
+struct ocfs2_control_message_down {
+        char    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+        char    space1;
+        char    uuid[OCFS2_TEXT_UUID_LEN];
+        char    space2;
+        char    nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+        char    newline;
+};
+/*
+ * Receive buffer for one write(2) to the control device.  Its size is the
+ * upper bound ocfs2_control_cfu() enforces on a write; tag overlays the
+ * op at the start of every message so the message type can be checked
+ * before picking a member.
+ */
+union ocfs2_control_message {
+        char                                    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+        struct ocfs2_control_message_setn       u_setn;
+        struct ocfs2_control_message_setv       u_setv;
+        struct ocfs2_control_message_down       u_down;
+};
+/* Forward declaration; the initializer is near the end of the file. */
+static struct ocfs2_stack_plugin user_stack;
+/* Number of open control files that completed the handshake (VALID). */
+static atomic_t ocfs2_control_opened;
+/* Local node number installed by a control file; -1 when none is set. */
+static int ocfs2_control_this_node = -1;
+/* Locking protocol version installed by a control file. */
+static struct ocfs2_protocol_version running_proto;
+/* All ocfs2_live_connection structures, linked through oc_list. */
+static LIST_HEAD(ocfs2_live_connection_list);
+/* All ocfs2_control_private structures, linked through op_list. */
+static LIST_HEAD(ocfs2_control_private_list);
+/* Taken around accesses to the two lists and the installed node/version. */
+static DEFINE_MUTEX(ocfs2_control_lock);
+/* Record a new OCFS2_CONTROL_HANDSHAKE_* state in the file's private data. */
+static inline void ocfs2_control_set_handshake_state(struct file *file,
+                                                     int state)
+{
+        struct ocfs2_control_private *priv;
+        priv = file->private_data;
+        priv->op_state = state;
+}
+/* Return the OCFS2_CONTROL_HANDSHAKE_* state stored for this open file. */
+static inline int ocfs2_control_get_handshake_state(struct file *file)
+{
+        struct ocfs2_control_private *priv;
+        priv = file->private_data;
+        return priv->op_state;
+}
+/*
+ * Look up the live connection whose cluster connection name equals @name.
+ *
+ * @name must be NUL-terminated.  The caller must hold ocfs2_control_lock.
+ * Returns the matching entry, or NULL when no connection has that name.
+ */
+static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
+{
+        size_t len = strlen(name);
+        struct ocfs2_live_connection *c;
+        BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
+        list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
+                if ((c->oc_conn->cc_namelen == len) &&
+                    !strncmp(c->oc_conn->cc_name, name, len))
+                        return c;
+        }
+        /*
+         * When the loop runs to completion the cursor is not NULL; it is
+         * left pointing at a bogus entry computed from the list head.
+         * Report "not found" explicitly so that callers testing the
+         * result against NULL do not dereference it.
+         */
+        return NULL;
+}
+/*
+ * Allocate a live connection for @conn and link it onto
+ * ocfs2_live_connection_list.
+ *
+ * This runs underneath the ocfs2 mount path; because the VFS prevents
+ * multiple calls to fill_super(), duplicates cannot show up here.
+ *
+ * Returns 0 and stores the new structure in *c_ret on success, -ENOMEM if
+ * the allocation fails, or -ESRCH if no control file has completed the
+ * handshake.  *c_ret is not written on failure.
+ */
+static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
+                                     struct ocfs2_live_connection **c_ret)
+{
+        struct ocfs2_live_connection *live;
+        live = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+        if (live == NULL)
+                return -ENOMEM;
+        mutex_lock(&ocfs2_control_lock);
+        live->oc_conn = conn;
+        if (!atomic_read(&ocfs2_control_opened)) {
+                /* Nobody to deliver node-down events: refuse the mount. */
+                printk(KERN_ERR
+                       "ocfs2: Userspace control daemon is not present\n");
+                mutex_unlock(&ocfs2_control_lock);
+                kfree(live);
+                return -ESRCH;
+        }
+        list_add(&live->oc_list, &ocfs2_live_connection_list);
+        mutex_unlock(&ocfs2_control_lock);
+        *c_ret = live;
+        return 0;
+}
+/*
+ * Detach the cluster connection from ocfs2_control and free the live
+ * connection.  Once this returns, userspace can no longer affect the
+ * cluster connection through the control device.
+ */
+static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
+{
+        mutex_lock(&ocfs2_control_lock);
+        /* Both updates happen under the lock that lookups hold. */
+        c->oc_conn = NULL;
+        list_del_init(&c->oc_list);
+        mutex_unlock(&ocfs2_control_lock);
+        kfree(c);
+}
+/*
+ * Copy exactly one command from userspace.
+ *
+ * The T01 protocol expects each write(2) to carry exactly one command, so
+ * @count must equal @target_len and must not exceed the size of
+ * union ocfs2_control_message.  Returns 0 on success, -EINVAL on a size
+ * mismatch and -EFAULT if the user buffer cannot be read.
+ */
+static int ocfs2_control_cfu(void *target, size_t target_len,
+                             const char __user *buf, size_t count)
+{
+        if (count != target_len)
+                return -EINVAL;
+        if (count > sizeof(union ocfs2_control_message))
+                return -EINVAL;
+        return copy_from_user(target, buf, target_len) ? -EFAULT : 0;
+}
+/*
+ * Handle the write that selects the control protocol.
+ *
+ * The write must consist of exactly OCFS2_CONTROL_PROTO.  On success the
+ * file moves to OCFS2_CONTROL_HANDSHAKE_PROTOCOL and @count is returned;
+ * otherwise a negative errno is returned and the state is unchanged.
+ */
+static ssize_t ocfs2_control_validate_protocol(struct file *file,
+                                               const char __user *buf,
+                                               size_t count)
+{
+        char tag[OCFS2_CONTROL_PROTO_LEN];
+        ssize_t rc;
+        rc = ocfs2_control_cfu(tag, OCFS2_CONTROL_PROTO_LEN, buf, count);
+        if (!rc && strncmp(tag, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
+                rc = -EINVAL;
+        if (rc)
+                return rc;
+        ocfs2_control_set_handshake_state(file,
+                                          OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+        return count;
+}
+/*
+ * Deliver a node-down event to the connection named @uuid, if
+ * ocfs2_connection_find() reports one.  The recovery handler is invoked
+ * with ocfs2_control_lock held.
+ */
+static void ocfs2_control_send_down(const char *uuid,
+                                    int nodenum)
+{
+        struct ocfs2_live_connection *live;
+        mutex_lock(&ocfs2_control_lock);
+        live = ocfs2_connection_find(uuid);
+        if (live != NULL) {
+                struct ocfs2_cluster_connection *conn = live->oc_conn;
+                BUG_ON(conn == NULL);
+                conn->cc_recovery_handler(nodenum, conn->cc_recovery_data);
+        }
+        mutex_unlock(&ocfs2_control_lock);
+}
+/*
+ * Called after each configuration element (node number, protocol
+ * version) arrives on /dev/ocfs2_control.
+ *
+ * An element that is present is checked against the global state: the
+ * node number must agree with an already installed one, and while there
+ * are live connections the version must equal running_proto.  A conflict
+ * returns -EINVAL.  Missing elements are skipped.  Only when both
+ * elements are present and consistent are the globals written,
+ * ocfs2_control_opened bumped and the file moved to
+ * OCFS2_CONTROL_HANDSHAKE_VALID.  Returns 0 otherwise.
+ */
+static int ocfs2_control_install_private(struct file *file)
+{
+        struct ocfs2_control_private *priv = file->private_data;
+        int have_node;
+        int have_proto = 0;
+        int install;
+        int rc = 0;
+        BUG_ON(priv->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+        mutex_lock(&ocfs2_control_lock);
+        have_node = (priv->op_this_node >= 0);
+        if (have_node &&
+            (ocfs2_control_this_node >= 0) &&
+            (ocfs2_control_this_node != priv->op_this_node))
+                rc = -EINVAL;
+        if (!rc) {
+                have_proto = (priv->op_proto.pv_major != 0);
+                if (have_proto &&
+                    !list_empty(&ocfs2_live_connection_list) &&
+                    ((running_proto.pv_major != priv->op_proto.pv_major) ||
+                     (running_proto.pv_minor != priv->op_proto.pv_minor)))
+                        rc = -EINVAL;
+        }
+        install = !rc && have_node && have_proto;
+        if (install) {
+                ocfs2_control_this_node = priv->op_this_node;
+                running_proto.pv_major = priv->op_proto.pv_major;
+                running_proto.pv_minor = priv->op_proto.pv_minor;
+        }
+        mutex_unlock(&ocfs2_control_lock);
+        if (install) {
+                /* The global values are in place; this file now counts. */
+                atomic_inc(&ocfs2_control_opened);
+                ocfs2_control_set_handshake_state(file,
+                                        OCFS2_CONTROL_HANDSHAKE_VALID);
+        }
+        return rc;
+}
+/*
+ * Return the installed local node number, or -EINVAL if none has been
+ * installed (the global is negative).  The value is sampled under
+ * ocfs2_control_lock.
+ */
+static int ocfs2_control_get_this_node(void)
+{
+        int node;
+        mutex_lock(&ocfs2_control_lock);
+        node = ocfs2_control_this_node;
+        mutex_unlock(&ocfs2_control_lock);
+        return (node < 0) ? -EINVAL : node;
+}
+/*
+ * Process a "SETN" message: parse the hex node number, store it in the
+ * file's private data and try to install the configuration.
+ *
+ * Returns -EINVAL if the file is not in the PROTOCOL state or the message
+ * is malformed, -ERANGE if the number does not fit a non-negative int,
+ * and otherwise the result of ocfs2_control_install_private().  @msg is
+ * modified: its separators are replaced with '\0'.
+ */
+static int ocfs2_control_do_setnode_msg(struct file *file,
+                                        struct ocfs2_control_message_setn *msg)
+{
+        char *end = NULL;
+        long node;
+        struct ocfs2_control_private *priv = file->private_data;
+        if ((ocfs2_control_get_handshake_state(file) !=
+             OCFS2_CONTROL_HANDSHAKE_PROTOCOL) ||
+            strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+                    OCFS2_CONTROL_MESSAGE_OP_LEN) ||
+            (msg->space != ' ') ||
+            (msg->newline != '\n'))
+                return -EINVAL;
+        /* The newline slot follows nodestr, so this terminates it. */
+        msg->space = '\0';
+        msg->newline = '\0';
+        node = simple_strtol(msg->nodestr, &end, 16);
+        if (!end || *end)
+                return -EINVAL;
+        if ((node < 0) || (node > INT_MAX) ||
+            (node == LONG_MIN) || (node == LONG_MAX))
+                return -ERANGE;
+        priv->op_this_node = node;
+        return ocfs2_control_install_private(file);
+}
+/*
+ * Process a "SETV" message: parse the hex major and minor numbers, store
+ * them in the file's private data and try to install the configuration.
+ *
+ * Returns -EINVAL if the file is not in the PROTOCOL state, the message
+ * is malformed, or the version is not within the filesystem's maximum;
+ * -ERANGE if a number is outside its allowed range; otherwise the result
+ * of ocfs2_control_install_private().  @msg is modified: its separators
+ * are replaced with '\0'.
+ */
+static int ocfs2_control_do_setversion_msg(struct file *file,
+                                           struct ocfs2_control_message_setv *msg)
+{
+        char *end = NULL;
+        long ver_major, ver_minor;
+        struct ocfs2_control_private *priv = file->private_data;
+        struct ocfs2_protocol_version *limit =
+                &user_stack.sp_proto->lp_max_version;
+        if ((ocfs2_control_get_handshake_state(file) !=
+             OCFS2_CONTROL_HANDSHAKE_PROTOCOL) ||
+            strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+                    OCFS2_CONTROL_MESSAGE_OP_LEN) ||
+            (msg->space1 != ' ') ||
+            (msg->space2 != ' ') ||
+            (msg->newline != '\n'))
+                return -EINVAL;
+        /* space2 ends the major field and newline ends the minor field. */
+        msg->space1 = '\0';
+        msg->space2 = '\0';
+        msg->newline = '\0';
+        ver_major = simple_strtol(msg->major, &end, 16);
+        if (!end || *end)
+                return -EINVAL;
+        ver_minor = simple_strtol(msg->minor, &end, 16);
+        if (!end || *end)
+                return -EINVAL;
+        /*
+         * Major is limited to 1..255 and minor to 0..255, both inclusive.
+         * On top of that the version has to fit within the maximum the
+         * filesystem supports.
+         */
+        if ((ver_major < 1) || (ver_major > (u8)-1) ||
+            (ver_major == LONG_MIN) || (ver_major == LONG_MAX))
+                return -ERANGE;
+        if ((ver_minor < 0) || (ver_minor > (u8)-1) ||
+            (ver_minor == LONG_MIN) || (ver_minor == LONG_MAX))
+                return -ERANGE;
+        if ((ver_major != limit->pv_major) ||
+            (ver_minor > limit->pv_minor))
+                return -EINVAL;
+        priv->op_proto.pv_major = ver_major;
+        priv->op_proto.pv_minor = ver_minor;
+        return ocfs2_control_install_private(file);
+}
+/*
+ * Process a "DOWN" message: parse the hex node number and pass it, with
+ * the uuid, to ocfs2_control_send_down().
+ *
+ * Returns 0 on success, -EINVAL if the file is not in the VALID state or
+ * the message is malformed, and -ERANGE if the number does not fit a
+ * non-negative int.  @msg is modified: its separators are replaced with
+ * '\0', which also terminates the uuid.
+ */
+static int ocfs2_control_do_down_msg(struct file *file,
+                                     struct ocfs2_control_message_down *msg)
+{
+        char *end = NULL;
+        long node;
+        if ((ocfs2_control_get_handshake_state(file) !=
+             OCFS2_CONTROL_HANDSHAKE_VALID) ||
+            strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+                    OCFS2_CONTROL_MESSAGE_OP_LEN) ||
+            (msg->space1 != ' ') ||
+            (msg->space2 != ' ') ||
+            (msg->newline != '\n'))
+                return -EINVAL;
+        msg->space1 = '\0';
+        msg->space2 = '\0';
+        msg->newline = '\0';
+        node = simple_strtol(msg->nodestr, &end, 16);
+        if (!end || *end)
+                return -EINVAL;
+        if ((node < 0) || (node > INT_MAX) ||
+            (node == LONG_MIN) || (node == LONG_MAX))
+                return -ERANGE;
+        ocfs2_control_send_down(msg->uuid, node);
+        return 0;
+}
+/*
+ * Read one T01 message from userspace and dispatch it by its exact
+ * length and op.  Returns @count when the handler succeeds, the handler's
+ * (or the copy's) negative errno on failure, and -EINVAL when no message
+ * type matches.
+ */
+static ssize_t ocfs2_control_message(struct file *file,
+                                     const char __user *buf,
+                                     size_t count)
+{
+        ssize_t rc;
+        union ocfs2_control_message msg;
+        /* Try to catch padding issues */
+        WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
+                (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
+        memset(&msg, 0, sizeof(union ocfs2_control_message));
+        rc = ocfs2_control_cfu(&msg, count, buf, count);
+        if (rc)
+                return rc;
+        /* Unless a length/op pair matches below, the write is rejected. */
+        rc = -EINVAL;
+        switch (count) {
+        case OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN:
+                if (!strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+                             OCFS2_CONTROL_MESSAGE_OP_LEN))
+                        rc = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
+                break;
+        case OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN:
+                if (!strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+                             OCFS2_CONTROL_MESSAGE_OP_LEN))
+                        rc = ocfs2_control_do_setversion_msg(file,
+                                                             &msg.u_setv);
+                break;
+        case OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN:
+                if (!strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+                             OCFS2_CONTROL_MESSAGE_OP_LEN))
+                        rc = ocfs2_control_do_down_msg(file, &msg.u_down);
+                break;
+        default:
+                break;
+        }
+        return rc ? rc : count;
+}
+/*
+ * write(2) handler for the control device.  What a write means depends on
+ * the handshake state: nothing is accepted before the protocol list has
+ * been read, then the protocol tag is expected, and after that T01
+ * messages.  @ppos is not used.
+ */
+static ssize_t ocfs2_control_write(struct file *file,
+                                   const char __user *buf,
+                                   size_t count,
+                                   loff_t *ppos)
+{
+        int state = ocfs2_control_get_handshake_state(file);
+        if (state == OCFS2_CONTROL_HANDSHAKE_INVALID)
+                return -EINVAL;
+        if (state == OCFS2_CONTROL_HANDSHAKE_READ)
+                return ocfs2_control_validate_protocol(file, buf, count);
+        if ((state == OCFS2_CONTROL_HANDSHAKE_PROTOCOL) ||
+            (state == OCFS2_CONTROL_HANDSHAKE_VALID))
+                return ocfs2_control_message(file, buf, count);
+        /* Any other value means the state was corrupted. */
+        BUG();
+        return -EIO;
+}
+/*
+ * read(2) handler: hand out the list of supported protocol tags, which
+ * today is the single string OCFS2_CONTROL_PROTO.  Once the reader has
+ * consumed the whole list the file moves to OCFS2_CONTROL_HANDSHAKE_READ.
+ *
+ * This is deliberately naive.  If a second protocol ever appears it will
+ * be expanded, probably using seq_file.
+ */
+static ssize_t ocfs2_control_read(struct file *file,
+                                  char __user *buf,
+                                  size_t count,
+                                  loff_t *ppos)
+{
+        char *proto = OCFS2_CONTROL_PROTO;
+        size_t chunk;
+        /* Everything has been handed out already: EOF. */
+        if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+                return 0;
+        chunk = OCFS2_CONTROL_PROTO_LEN - *ppos;
+        chunk = (chunk > count) ? count : chunk;
+        if (copy_to_user(buf, proto + *ppos, chunk))
+                return -EFAULT;
+        *ppos += chunk;
+        /* Have we read the whole protocol list? */
+        if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+                ocfs2_control_set_handshake_state(file,
+                                                  OCFS2_CONTROL_HANDSHAKE_READ);
+        return chunk;
+}
+/*
+ * release handler for the control device.
+ *
+ * A file that never reached OCFS2_CONTROL_HANDSHAKE_VALID is simply
+ * unlinked and freed.  A valid one also drops its count in
+ * ocfs2_control_opened.  When the last valid file goes away while
+ * connections are still live, an emergency restart is triggered;
+ * otherwise the installed node number and locking protocol version are
+ * cleared.  Always returns 0.
+ */
+static int ocfs2_control_release(struct inode *inode, struct file *file)
+{
+        struct ocfs2_control_private *p = file->private_data;
+        mutex_lock(&ocfs2_control_lock);
+        if (ocfs2_control_get_handshake_state(file) !=
+            OCFS2_CONTROL_HANDSHAKE_VALID)
+                goto out;
+        if (atomic_dec_and_test(&ocfs2_control_opened)) {
+                if (!list_empty(&ocfs2_live_connection_list)) {
+                        /* XXX: Do bad things! */
+                        printk(KERN_ERR
+                               "ocfs2: Unexpected release of ocfs2_control!\n"
+                               "       Loss of cluster connection requires "
+                               "an emergency restart!\n");
+                        emergency_restart();
+                }
+                /*
+                 * Last valid close clears the node number and resets
+                 * the locking protocol version.  Both halves of the
+                 * version must be cleared; leaving pv_minor behind
+                 * would keep a stale minor number around.
+                 */
+                ocfs2_control_this_node = -1;
+                running_proto.pv_major = 0;
+                running_proto.pv_minor = 0;
+        }
+out:
+        list_del_init(&p->op_list);
+        file->private_data = NULL;
+        mutex_unlock(&ocfs2_control_lock);
+        kfree(p);
+        return 0;
+}
+/*
+ * open handler for the control device: allocate the per-file state,
+ * mark the node number as unset and link the state onto
+ * ocfs2_control_private_list.  Returns 0, or -ENOMEM if the allocation
+ * fails.
+ */
+static int ocfs2_control_open(struct inode *inode, struct file *file)
+{
+        struct ocfs2_control_private *priv;
+        priv = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
+        if (priv == NULL)
+                return -ENOMEM;
+        /* Zeroed memory leaves op_state at HANDSHAKE_INVALID. */
+        priv->op_this_node = -1;
+        mutex_lock(&ocfs2_control_lock);
+        file->private_data = priv;
+        list_add(&priv->op_list, &ocfs2_control_private_list);
+        mutex_unlock(&ocfs2_control_lock);
+        return 0;
+}
+/* File operations of the ocfs2_control misc device. */
+static const struct file_operations ocfs2_control_fops = {
+        .open    = ocfs2_control_open,
+        .release = ocfs2_control_release,
+        .read    = ocfs2_control_read,
+        .write   = ocfs2_control_write,
+        .owner   = THIS_MODULE,
+};
+/*
+ * The "ocfs2_control" misc device, registered with a dynamically
+ * assigned minor number by ocfs2_control_init().
+ *
+ * NOTE(review): not declared static although only this file is seen
+ * using it -- confirm whether external linkage is intended.
+ */
+struct miscdevice ocfs2_control_device = {
+        .minor          = MISC_DYNAMIC_MINOR,
+        .name           = "ocfs2_control",
+        .fops           = &ocfs2_control_fops,
+};
+/*
+ * Reset the count of valid control files and register the control
+ * device.  Returns the result of misc_register(); a failure is logged.
+ */
+static int ocfs2_control_init(void)
+{
+        int rc;
+        atomic_set(&ocfs2_control_opened, 0);
+        rc = misc_register(&ocfs2_control_device);
+        if (!rc)
+                return 0;
+        printk(KERN_ERR
+               "ocfs2: Unable to register ocfs2_control device "
+               "(errno %d)\n",
+               -rc);
+        return rc;
+}
+/* Deregister the control device; a failure is only logged. */
+static void ocfs2_control_exit(void)
+{
+        int rc = misc_deregister(&ocfs2_control_device);
+        if (!rc)
+                return;
+        printk(KERN_ERR
+               "ocfs2: Unable to deregister ocfs2_control device "
+               "(errno %d)\n",
+               -rc);
+}
+/*
+ * The AST argument handed to fs/dlm is treated as a
+ * struct ocfs2_lock_res; return the fs/dlm lock status block embedded
+ * in it.
+ */
+static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
+{
+        struct ocfs2_lock_res *lockres;
+        lockres = astarg;
+        return &lockres->l_lksb.lksb_fsdlm;
+}
+/*
+ * AST callback given to dlm_lock().  It inspects the lock status block
+ * and forwards to either the unlock AST or the lock AST of the
+ * registered locking protocol.
+ */
+static void fsdlm_lock_ast_wrapper(void *astarg)
+{
+        struct dlm_lksb *sb = fsdlm_astarg_to_lksb(astarg);
+        int status = sb->sb_status;
+        BUG_ON(user_stack.sp_proto == NULL);
+        /*
+         * Only -DLM_EUNLOCK and -DLM_ECANCEL are routed to the unlock
+         * AST.  Other non-standard errors, for which we cannot tell
+         * which AST is the right one, are punted on for now.  The main
+         * candidate is EINVAL, meaning invalid arguments, which should
+         * not be possible because the caller is under our control.
+         * Anything else is probably in the same category, or fatal, in
+         * which case we cannot carry on anyway.
+         */
+        if ((status != -DLM_EUNLOCK) && (status != -DLM_ECANCEL)) {
+                user_stack.sp_proto->lp_lock_ast(astarg);
+                return;
+        }
+        user_stack.sp_proto->lp_unlock_ast(astarg, 0);
+}
+/*
+ * Blocking AST callback given to dlm_lock(); forwards to the blocking
+ * AST of the registered locking protocol.
+ */
+static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
+{
+        BUG_ON(user_stack.sp_proto == NULL);
+        user_stack.sp_proto->lp_blocking_ast(astarg, level);
+}
+/*
+ * ->dlm_lock() for the user stack: issue the request through fs/dlm on
+ * the connection's lockspace.  DLM_LKF_NODLCKWT is always added to
+ * @flags.  Returns whatever dlm_lock() returns.
+ */
+static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
+                         int mode,
+                         union ocfs2_dlm_lksb *lksb,
+                         u32 flags,
+                         void *name,
+                         unsigned int namelen,
+                         void *astarg)
+{
+        /* On first use, aim the LVB pointer just past the dlm_lksb. */
+        if (lksb->lksb_fsdlm.sb_lvbptr == NULL)
+                lksb->lksb_fsdlm.sb_lvbptr =
+                        (char *)lksb + sizeof(struct dlm_lksb);
+        return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+                        flags | DLM_LKF_NODLCKWT, name, namelen, 0,
+                        fsdlm_lock_ast_wrapper, astarg,
+                        fsdlm_blocking_ast_wrapper);
+}
+/*
+ * ->dlm_unlock() for the user stack: release the lock identified by the
+ * status block's sb_lkid through fs/dlm.  Returns whatever dlm_unlock()
+ * returns.
+ */
+static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
+                           union ocfs2_dlm_lksb *lksb,
+                           u32 flags,
+                           void *astarg)
+{
+        return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+                          flags, &lksb->lksb_fsdlm, astarg);
+}
+/* ->lock_status(): return sb_status from the fs/dlm status block. */
+static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+        return lksb->lksb_fsdlm.sb_status;
+}
+/*
+ * ->lock_lvb(): return the LVB pointer stored in the fs/dlm status block.
+ * user_dlm_lock() fills the pointer in if it was NULL.
+ */
+static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+        return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
+}
+/* ->dump_lksb(): intentionally empty; nothing is dumped for this stack. */
+static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+}
+/*
+ * Check a requested locking protocol version against the one in use.
+ *
+ * Returns 1 (incompatible) when the major numbers differ or when the
+ * existing minor is greater than the requested one.  Returns 0
+ * (compatible) otherwise; if the request asked for a higher minor than
+ * the existing one, request->pv_minor is lowered to the existing minor
+ * so that the requester runs at that version.
+ */
+static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
+                               struct ocfs2_protocol_version *request)
+{
+        if ((existing->pv_major != request->pv_major) ||
+            (existing->pv_minor > request->pv_minor))
+                return 1;
+        if (request->pv_minor > existing->pv_minor)
+                request->pv_minor = existing->pv_minor;
+        return 0;
+}
+/*
+ * ->connect() for the user stack.
+ *
+ * Registers a live connection with the control device, checks the
+ * connection's locking protocol version against running_proto and
+ * creates the fs/dlm lockspace named after the connection.  On success
+ * conn->cc_private and conn->cc_lockspace are filled in and 0 is
+ * returned.  On failure the live connection is dropped again and a
+ * negative errno is returned (-EPROTO for a version mismatch).
+ */
+static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+        int rc;
+        dlm_lockspace_t *lockspace;
+        struct ocfs2_live_connection *live;
+        BUG_ON(conn == NULL);
+        rc = ocfs2_live_connection_new(conn, &live);
+        if (rc)
+                return rc;
+        /*
+         * No mount is allowed to get this far unless running_proto has
+         * already been set.
+         */
+        if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
+                printk(KERN_ERR
+                       "Unable to mount with fs locking protocol version "
+                       "%u.%u because the userspace control daemon has "
+                       "negotiated %u.%u\n",
+                       conn->cc_version.pv_major, conn->cc_version.pv_minor,
+                       running_proto.pv_major, running_proto.pv_minor);
+                rc = -EPROTO;
+                goto drop;
+        }
+        rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
+                               &lockspace, DLM_LSFL_FS, DLM_LVB_LEN);
+        if (rc)
+                goto drop;
+        conn->cc_private = live;
+        conn->cc_lockspace = lockspace;
+        return 0;
+drop:
+        ocfs2_live_connection_drop(live);
+        return rc;
+}
+/*
+ * ->disconnect() for the user stack: release the fs/dlm lockspace, then
+ * drop the live connection, clearing both pointers in @conn.  Always
+ * returns 0.  @hangup_pending is not used.
+ */
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+                                   int hangup_pending)
+{
+        /* NOTE(review): meaning of the force argument 2 -- confirm in fs/dlm. */
+        dlm_release_lockspace(conn->cc_lockspace, 2);
+        conn->cc_lockspace = NULL;
+        ocfs2_live_connection_drop(conn->cc_private);
+        conn->cc_private = NULL;
+        return 0;
+}
+/*
+ * ->this_node() for the user stack.  Stores the node number installed
+ * through the control device in *this_node and returns 0, or returns the
+ * negative errno from ocfs2_control_get_this_node() without touching
+ * *this_node.
+ */
+static int user_cluster_this_node(unsigned int *this_node)
+{
+        int node = ocfs2_control_get_this_node();
+        if (node >= 0) {
+                *this_node = node;
+                return 0;
+        }
+        return node;
+}
+/*
+ * Operations table for the user stack; user_stack.sp_ops points at it.
+ * No .hangup callback is set here.
+ */
+static struct ocfs2_stack_operations user_stack_ops = {
+        .connect        = user_cluster_connect,
+        .disconnect     = user_cluster_disconnect,
+        .this_node      = user_cluster_this_node,
+        .dlm_lock       = user_dlm_lock,
+        .dlm_unlock     = user_dlm_unlock,
+        .lock_status    = user_dlm_lock_status,
+        .lock_lvb       = user_dlm_lvb,
+        .dump_lksb      = user_dlm_dump_lksb,
+};
+/*
+ * Plugin descriptor for the "user" stack.  The code in this file reads
+ * user_stack.sp_proto for the locking protocol but never assigns it.
+ * NOTE(review): presumably the stack glue fills sp_proto in -- confirm.
+ */
+static struct ocfs2_stack_plugin user_stack = {
+        .sp_name        = "user",
+        .sp_ops         = &user_stack_ops,
+        .sp_owner       = THIS_MODULE,
+};
+/*
+ * Module init: bring up the control device, then register the plugin
+ * with the stack glue.  If registration fails the control device is torn
+ * down again.  Returns 0 or the first error encountered.
+ */
+static int __init user_stack_init(void)
+{
+        int rc = ocfs2_control_init();
+        if (rc)
+                return rc;
+        rc = ocfs2_stack_glue_register(&user_stack);
+        if (rc)
+                ocfs2_control_exit();
+        return rc;
+}
+/*
+ * Module exit: undo user_stack_init() in reverse order -- unregister the
+ * plugin first, then remove the control device.
+ */
+static void __exit user_stack_exit(void)
+{
+        ocfs2_stack_glue_unregister(&user_stack);
+        ocfs2_control_exit();
+}
+/* Module metadata and the init/exit entry points defined above. */
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
+MODULE_LICENSE("GPL");
+module_init(user_stack_init);
+module_exit(user_stack_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
new file mode 100644
index 000000000000..119f60cea9cc
--- /dev/null
+++ b/fs/ocfs2/stackglue.c
@@ -0,0 +1,568 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.c
+ *
+ * Code which implements an OCFS2 specific interface to underlying
+ * cluster stacks.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include "ocfs2_fs.h"
+#include "stackglue.h"
+#define OCFS2_STACK_PLUGIN_O2CB         "o2cb"
+#define OCFS2_STACK_PLUGIN_USER         "user"
+static struct ocfs2_locking_protocol *lproto;
+static DEFINE_SPINLOCK(ocfs2_stack_lock);
+static LIST_HEAD(ocfs2_stack_list);
+static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
+/*
+ * The stack currently in use.  If not null, active_stack->sp_count > 0,
+ * the module is pinned, and the locking protocol cannot be changed.
+ */
+static struct ocfs2_stack_plugin *active_stack;
+/*
+ * Find a registered plugin by sp_name.  The caller must hold
+ * ocfs2_stack_lock.  Returns the matching plugin, or NULL when nothing on
+ * ocfs2_stack_list carries that name.
+ */
+static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
+{
+        struct ocfs2_stack_plugin *cur, *found = NULL;
+        assert_spin_locked(&ocfs2_stack_lock);
+        list_for_each_entry(cur, &ocfs2_stack_list, sp_list) {
+                if (strcmp(cur->sp_name, name) == 0) {
+                        found = cur;
+                        break;
+                }
+        }
+        return found;
+}
+/*
+ * Try to make the plugin named @plugin_name the active stack on behalf of
+ * a filesystem that asked for cluster stack @stack_name.
+ *
+ * Returns 0 with a reference taken on the active stack (sp_count is
+ * incremented), -EBUSY if @stack_name is not the currently selected
+ * cluster_stack_name or a different plugin is already active, and
+ * -ENOENT if the plugin is not registered or its module cannot be pinned.
+ *
+ * Every successful call is balanced by ocfs2_stack_driver_put(), so the
+ * count has to be raised both when the stack is first activated and when
+ * an already-active stack is reused.  Otherwise the first put would drop
+ * the module and clear active_stack while other users still rely on it.
+ */
+static int ocfs2_stack_driver_request(const char *stack_name,
+                                      const char *plugin_name)
+{
+        int rc;
+        struct ocfs2_stack_plugin *p;
+        spin_lock(&ocfs2_stack_lock);
+        /*
+         * If the stack passed by the filesystem isn't the selected one,
+         * we can't continue.
+         */
+        if (strcmp(stack_name, cluster_stack_name)) {
+                rc = -EBUSY;
+                goto out;
+        }
+        if (active_stack) {
+                /*
+                 * If the active stack isn't the one we want, it cannot
+                 * be selected right now.
+                 */
+                if (!strcmp(active_stack->sp_name, plugin_name))
+                        rc = 0;
+                else
+                        rc = -EBUSY;
+                goto out;
+        }
+        p = ocfs2_stack_lookup(plugin_name);
+        if (!p || !try_module_get(p->sp_owner)) {
+                rc = -ENOENT;
+                goto out;
+        }
+        /* Ok, the module is pinned */
+        active_stack = p;
+        rc = 0;
+out:
+        /* Success on either path holds one more reference on the stack */
+        if (!rc)
+                active_stack->sp_count++;
+        spin_unlock(&ocfs2_stack_lock);
+        return rc;
+}
+/*
+ * Resolve a stack label to a plugin and make that plugin active.  An
+ * empty or missing label means the classic o2cb stack; any other label
+ * is served by the "user" plugin.  If the plugin is not registered yet,
+ * one attempt is made to load its module before giving up.  Fails with
+ * -EINVAL for a label of the wrong length, -ENOENT if the plugin still
+ * cannot be found and -EBUSY if a different stack is in use.
+ */
+static int ocfs2_stack_driver_get(const char *stack_name)
+{
+        int rc;
+        char *plugin_name;
+        /*
+         * Classic stack does not pass in a stack name.  This is
+         * compatible with older tools as well.
+         */
+        if (!stack_name || !*stack_name)
+                stack_name = OCFS2_STACK_PLUGIN_O2CB;
+        if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
+                printk(KERN_ERR
+                       "ocfs2 passed an invalid cluster stack label: \"%s\"\n",
+                       stack_name);
+                return -EINVAL;
+        }
+        /* Anything that isn't the classic stack is a user stack */
+        if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB) == 0)
+                plugin_name = OCFS2_STACK_PLUGIN_O2CB;
+        else
+                plugin_name = OCFS2_STACK_PLUGIN_USER;
+        rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+        if (rc == -ENOENT) {
+                request_module("ocfs2_stack_%s", plugin_name);
+                rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+        }
+        switch (rc) {
+        case -ENOENT:
+                printk(KERN_ERR
+                       "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
+                       plugin_name);
+                break;
+        case -EBUSY:
+                printk(KERN_ERR
+                       "ocfs2: A different cluster stack is in use\n");
+                break;
+        default:
+                break;
+        }
+        return rc;
+}
+/*
+ * Drop one reference on the active stack.  When the count reaches zero
+ * the plugin's module is released and active_stack is cleared.  Calling
+ * this with no active stack, or with a zero count, is a BUG.
+ */
+static void ocfs2_stack_driver_put(void)
+{
+        struct ocfs2_stack_plugin *stack;
+        spin_lock(&ocfs2_stack_lock);
+        stack = active_stack;
+        BUG_ON(stack == NULL);
+        BUG_ON(stack->sp_count == 0);
+        stack->sp_count--;
+        if (stack->sp_count == 0) {
+                module_put(stack->sp_owner);
+                active_stack = NULL;
+        }
+        spin_unlock(&ocfs2_stack_lock);
+}
+/*
+ * Add a stack plugin to the list of known stacks.  The glue-managed
+ * fields (sp_count, sp_proto, sp_list) are initialised here.  Returns 0
+ * on success or -EEXIST if a plugin with the same sp_name is already
+ * registered.
+ */
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
+{
+        int rc;
+        spin_lock(&ocfs2_stack_lock);
+        if (ocfs2_stack_lookup(plugin->sp_name)) {
+                printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
+                       plugin->sp_name);
+                rc = -EEXIST;
+                goto out;
+        }
+        plugin->sp_count = 0;
+        plugin->sp_proto = lproto;
+        list_add(&plugin->sp_list, &ocfs2_stack_list);
+        printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
+               plugin->sp_name);
+        rc = 0;
+out:
+        spin_unlock(&ocfs2_stack_lock);
+        return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
+/*
+ * Remove a plugin from the list of known stacks.  It is a BUG to
+ * unregister a plugin that is the active stack, that still has a
+ * non-zero sp_count, or whose name resolves to a different plugin.  An
+ * unknown plugin only produces an error message.
+ */
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
+{
+        struct ocfs2_stack_plugin *found;
+        spin_lock(&ocfs2_stack_lock);
+        found = ocfs2_stack_lookup(plugin->sp_name);
+        if (!found) {
+                printk(KERN_ERR "Stack \"%s\" is not registered\n",
+                       plugin->sp_name);
+                goto out;
+        }
+        BUG_ON(found != plugin);
+        BUG_ON(plugin == active_stack);
+        BUG_ON(plugin->sp_count != 0);
+        list_del_init(&plugin->sp_list);
+        printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
+               plugin->sp_name);
+out:
+        spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
+/*
+ * Install the locking protocol and propagate it to every plugin already
+ * on ocfs2_stack_list.  A NULL protocol, or a call made while a stack is
+ * active, is a BUG.
+ */
+void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
+{
+        struct ocfs2_stack_plugin *plugin;
+        BUG_ON(proto == NULL);
+        spin_lock(&ocfs2_stack_lock);
+        BUG_ON(active_stack != NULL);
+        lproto = proto;
+        list_for_each_entry(plugin, &ocfs2_stack_list, sp_list)
+                plugin->sp_proto = lproto;
+        spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
+/*
+ * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
+ * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
+ * underlying stack plugins need to pilfer the lksb off of the lock_res.
+ * If some other structure needs to be passed as an astarg, the plugins
+ * will need to be given a different avenue to the lksb.
+ */
+/* Forward a lock request to the active stack's ->dlm_lock() op. */
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+                   int mode,
+                   union ocfs2_dlm_lksb *lksb,
+                   u32 flags,
+                   void *name,
+                   unsigned int namelen,
+                   struct ocfs2_lock_res *astarg)
+{
+        struct ocfs2_stack_operations *ops;
+        BUG_ON(lproto == NULL);
+        ops = active_stack->sp_ops;
+        return ops->dlm_lock(conn, mode, lksb, flags, name, namelen, astarg);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
+/* Forward an unlock request to the active stack's ->dlm_unlock() op. */
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+                     union ocfs2_dlm_lksb *lksb,
+                     u32 flags,
+                     struct ocfs2_lock_res *astarg)
+{
+        struct ocfs2_stack_operations *ops;
+        BUG_ON(lproto == NULL);
+        ops = active_stack->sp_ops;
+        return ops->dlm_unlock(conn, lksb, flags, astarg);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
+/* Return the active stack's ->lock_status() result for @lksb. */
+int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+        struct ocfs2_stack_operations *ops = active_stack->sp_ops;
+        return ops->lock_status(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
+/*
+ * Why don't we cast to ocfs2_meta_lvb?  The "clean" answer is that we
+ * don't cast at the glue level.  The real answer is that the header
+ * ordering is nigh impossible.
+ */
+void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+        struct ocfs2_stack_operations *ops = active_stack->sp_ops;
+        /* The active stack knows where the lvb lives inside its lksb */
+        return ops->lock_lvb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
+/*
+ * Ask the active stack to dump debugging information about @lksb.
+ * stackglue.h documents ->dump_lksb() as an optional hook, so a stack
+ * that leaves it NULL is skipped instead of being called through a NULL
+ * function pointer.
+ */
+void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+        if (active_stack->sp_ops->dump_lksb)
+                active_stack->sp_ops->dump_lksb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
+/*
+ * Attach a filesystem to the cluster stack named by @stack_name.
+ *
+ * @group/@grouplen give the group name, which is copied into the new
+ * connection.  @recovery_handler and @recovery_data are stored on the
+ * connection for the stack to use.  On success the stack driver is
+ * pinned, the stack's ->connect() op has run, *@conn points at the new
+ * connection and 0 is returned.
+ *
+ * Returns -EINVAL if @grouplen is negative or larger than GROUP_NAME_MAX,
+ * -ENOMEM if the connection cannot be allocated, or the error from
+ * ocfs2_stack_driver_get() / ->connect().  *@conn is not written on
+ * failure.
+ */
+int ocfs2_cluster_connect(const char *stack_name,
+                          const char *group,
+                          int grouplen,
+                          void (*recovery_handler)(int node_num,
+                                                   void *recovery_data),
+                          void *recovery_data,
+                          struct ocfs2_cluster_connection **conn)
+{
+        int rc = 0;
+        struct ocfs2_cluster_connection *new_conn;
+        BUG_ON(group == NULL);
+        BUG_ON(conn == NULL);
+        BUG_ON(recovery_handler == NULL);
+        /*
+         * grouplen is a signed int; a negative value would turn into a
+         * huge length once it reaches memcpy() below.
+         */
+        if (grouplen < 0 || grouplen > GROUP_NAME_MAX) {
+                rc = -EINVAL;
+                goto out;
+        }
+        new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
+                           GFP_KERNEL);
+        if (!new_conn) {
+                rc = -ENOMEM;
+                goto out;
+        }
+        memcpy(new_conn->cc_name, group, grouplen);
+        new_conn->cc_namelen = grouplen;
+        new_conn->cc_recovery_handler = recovery_handler;
+        new_conn->cc_recovery_data = recovery_data;
+        /* Start the new connection at our maximum compatibility level */
+        new_conn->cc_version = lproto->lp_max_version;
+        /* This will pin the stack driver if successful */
+        rc = ocfs2_stack_driver_get(stack_name);
+        if (rc)
+                goto out_free;
+        rc = active_stack->sp_ops->connect(new_conn);
+        if (rc) {
+                /* Undo the pin taken by ocfs2_stack_driver_get() */
+                ocfs2_stack_driver_put();
+                goto out_free;
+        }
+        *conn = new_conn;
+out_free:
+        if (rc)
+                kfree(new_conn);
+out:
+        return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
+/*
+ * Detach @conn from the active stack.  If the stack's ->disconnect()
+ * fails, its error is returned and the connection is left allocated.
+ * Otherwise the connection is freed and, when hangup_pending is 0, the
+ * stack driver reference is dropped as well.
+ */
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+                             int hangup_pending)
+{
+        int ret;
+        BUG_ON(conn == NULL);
+        ret = active_stack->sp_ops->disconnect(conn, hangup_pending);
+        /* XXX Should we free it anyway? */
+        if (ret)
+                return ret;
+        kfree(conn);
+        if (!hangup_pending)
+                ocfs2_stack_driver_put();
+        return 0;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
+/*
+ * Call the active stack's ->hangup() op if it has one, then drop the
+ * stack driver reference.  @group must be NUL-terminated at @grouplen.
+ */
+void ocfs2_cluster_hangup(const char *group, int grouplen)
+{
+        void (*hangup_op)(const char *group, int grouplen);
+        BUG_ON(group == NULL);
+        BUG_ON(group[grouplen] != '\0');
+        hangup_op = active_stack->sp_ops->hangup;
+        if (hangup_op)
+                hangup_op(group, grouplen);
+        /* cluster_disconnect() was called with hangup_pending==1 */
+        ocfs2_stack_driver_put();
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
+/* Ask the active stack for the local node number via ->this_node(). */
+int ocfs2_cluster_this_node(unsigned int *node)
+{
+        struct ocfs2_stack_operations *ops = active_stack->sp_ops;
+        return ops->this_node(node);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
+/*
+ * Sysfs bits
+ */
+/*
+ * sysfs show: print the maximum locking protocol version as
+ * "major.minor".  Nothing is printed if no locking protocol is set.
+ */
+static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
+                                               struct kobj_attribute *attr,
+                                               char *buf)
+{
+        ssize_t len = 0;
+        struct ocfs2_protocol_version *max;
+        spin_lock(&ocfs2_stack_lock);
+        if (lproto) {
+                max = &lproto->lp_max_version;
+                len = snprintf(buf, PAGE_SIZE, "%u.%u\n",
+                               max->pv_major, max->pv_minor);
+        }
+        spin_unlock(&ocfs2_stack_lock);
+        return len;
+}
+static struct kobj_attribute ocfs2_attr_max_locking_protocol =
+        __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
+               ocfs2_max_locking_protocol_show, NULL);
+/*
+ * sysfs show: list the sp_name of every registered plugin, one per line.
+ *
+ * Each name is appended after the text already written (buf + total), so
+ * earlier names are not overwritten.  snprintf() returns the length the
+ * output would have needed, so any result >= the remaining space means
+ * the entry was truncated and -E2BIG is returned.
+ */
+static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
+                                                 struct kobj_attribute *attr,
+                                                 char *buf)
+{
+        ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+        struct ocfs2_stack_plugin *p;
+        spin_lock(&ocfs2_stack_lock);
+        list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+                ret = snprintf(buf + total, remain, "%s\n",
+                               p->sp_name);
+                if (ret < 0) {
+                        total = ret;
+                        break;
+                }
+                if (ret >= remain) {
+                        /* snprintf() didn't fit */
+                        total = -E2BIG;
+                        break;
+                }
+                total += ret;
+                remain -= ret;
+        }
+        spin_unlock(&ocfs2_stack_lock);
+        return total;
+}
+static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
+        __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
+               ocfs2_loaded_cluster_plugins_show, NULL);
+/*
+ * sysfs show: print the sp_name of the active stack, or nothing when no
+ * stack is active.  snprintf() reports the length the output would have
+ * needed, so any value >= PAGE_SIZE means truncation and yields -E2BIG.
+ */
+static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
+                                                struct kobj_attribute *attr,
+                                                char *buf)
+{
+        ssize_t ret = 0;
+        spin_lock(&ocfs2_stack_lock);
+        if (active_stack) {
+                ret = snprintf(buf, PAGE_SIZE, "%s\n",
+                               active_stack->sp_name);
+                if (ret >= PAGE_SIZE)
+                        ret = -E2BIG;
+        }
+        spin_unlock(&ocfs2_stack_lock);
+        return ret;
+}
+static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
+        __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
+               ocfs2_active_cluster_plugin_show, NULL);
+/* sysfs show: print the currently selected cluster stack label. */
+static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
+                                        struct kobj_attribute *attr,
+                                        char *buf)
+{
+        ssize_t len;
+        spin_lock(&ocfs2_stack_lock);
+        len = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
+        spin_unlock(&ocfs2_stack_lock);
+        return len;
+}
+/*
+ * sysfs store: select the cluster stack label.  One trailing newline is
+ * ignored.  The label must be exactly OCFS2_STACK_LABEL_LEN bytes with no
+ * embedded NUL, otherwise -EINVAL.  While a stack is active the label can
+ * only be "set" to its current value; anything else is -EBUSY.  Returns
+ * @count on success.
+ */
+static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
+                                         struct kobj_attribute *attr,
+                                         const char *buf, size_t count)
+{
+        size_t len = count;
+        ssize_t ret = count;
+        if (len == 0)
+                return len;
+        if (buf[len - 1] == '\n')
+                len--;
+        if (len != OCFS2_STACK_LABEL_LEN)
+                return -EINVAL;
+        if (strnlen(buf, len) != len)
+                return -EINVAL;
+        spin_lock(&ocfs2_stack_lock);
+        if (!active_stack)
+                memcpy(cluster_stack_name, buf, len);
+        else if (strncmp(buf, cluster_stack_name, len))
+                ret = -EBUSY;
+        spin_unlock(&ocfs2_stack_lock);
+        return ret;
+}
+static struct kobj_attribute ocfs2_attr_cluster_stack =
+        __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
+               ocfs2_cluster_stack_show,
+               ocfs2_cluster_stack_store);
+static struct attribute *ocfs2_attrs[] = {
+        &ocfs2_attr_max_locking_protocol.attr,
+        &ocfs2_attr_loaded_cluster_plugins.attr,
+        &ocfs2_attr_active_cluster_plugin.attr,
+        &ocfs2_attr_cluster_stack.attr,
+        NULL,
+};
+static struct attribute_group ocfs2_attr_group = {
+        .attrs = ocfs2_attrs,
+};
+static struct kset *ocfs2_kset;
+static void ocfs2_sysfs_exit(void)
+{
+        kset_unregister(ocfs2_kset);
+}
+/*
+ * Create the "ocfs2" kset under fs_kobj and attach the attribute group.
+ * Returns -ENOMEM if the kset cannot be created; if the group cannot be
+ * added, the kset is unregistered again and that error is returned.
+ */
+static int ocfs2_sysfs_init(void)
+{
+        int ret;
+        ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
+        if (!ocfs2_kset)
+                return -ENOMEM;
+        ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
+        if (ret)
+                kset_unregister(ocfs2_kset);
+        return ret;
+}
+static int __init ocfs2_stack_glue_init(void)
+{
+        strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
+        return ocfs2_sysfs_init();
+}
+static void __exit ocfs2_stack_glue_exit(void)
+{
+        lproto = NULL;
+        ocfs2_sysfs_exit();
+}
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 cluster stack glue layer");
+MODULE_LICENSE("GPL");
+module_init(ocfs2_stack_glue_init);
+module_exit(ocfs2_stack_glue_exit);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
new file mode 100644
index 000000000000..005e4f170e0f
--- /dev/null
+++ b/fs/ocfs2/stackglue.h
@@ -0,0 +1,261 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.h
+ *
+ * Glue to the underlying cluster stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef STACKGLUE_H
+#define STACKGLUE_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/dlmconstants.h>
+#include "dlm/dlmapi.h"
+#include <linux/dlm.h>
+/*
+ * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
+ * some day, but right now we need it.  Let's fake it.  This value is larger
+ * than any flag in dlmconstants.h.
+ */
+#define DLM_LKF_LOCAL           0x00100000
+/*
+ * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h.  That probably
+ * wants to be in a public header.
+ */
+#define GROUP_NAME_MAX          64
+/*
+ * ocfs2_protocol_version changes when ocfs2 does something different in
+ * its inter-node behavior.  See dlmglue.c for more information.
+ */
+struct ocfs2_protocol_version {
+        u8 pv_major;
+        u8 pv_minor;
+};
+/*
+ * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
+ */
+struct ocfs2_locking_protocol {
+        struct ocfs2_protocol_version lp_max_version;
+        void (*lp_lock_ast)(void *astarg);
+        void (*lp_blocking_ast)(void *astarg, int level);
+        void (*lp_unlock_ast)(void *astarg, int error);
+};
+/*
+ * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
+ * has a pointer to separately allocated lvb space.  This struct exists only to
+ * include in the lksb union to make space for a combined dlm_lksb and lvb.
+ */
+struct fsdlm_lksb_plus_lvb {
+        struct dlm_lksb lksb;
+        char lvb[DLM_LVB_LEN];
+};
+/*
+ * A union of all lock status structures.  We define it here so that the
+ * size of the union is known.  Lock status structures are embedded in
+ * ocfs2 inodes.
+ */
+union ocfs2_dlm_lksb {
+        struct dlm_lockstatus lksb_o2dlm;
+        struct dlm_lksb lksb_fsdlm;
+        struct fsdlm_lksb_plus_lvb padding;
+};
+/*
+ * A cluster connection.  Mostly opaque to ocfs2, the connection holds
+ * state for the underlying stack.  ocfs2 does use cc_version to determine
+ * locking compatibility.
+ *
+ * ocfs2_cluster_connect() copies the group name into cc_name with
+ * memcpy() and records its length in cc_namelen; it does not append a
+ * terminator itself.  It also stores the recovery handler and data and
+ * starts cc_version at the locking protocol's lp_max_version.
+ * cc_lockspace and cc_private are left for the stack's ->connect() to
+ * fill in.
+ */
+struct ocfs2_cluster_connection {
+        char cc_name[GROUP_NAME_MAX];
+        int cc_namelen;
+        struct ocfs2_protocol_version cc_version;
+        void (*cc_recovery_handler)(int node_num, void *recovery_data);
+        void *cc_recovery_data;
+        void *cc_lockspace;
+        void *cc_private;
+};
+/*
+ * Each cluster stack implements the stack operations structure.  Not used
+ * in the ocfs2 code, the stackglue code translates generic cluster calls
+ * into stack operations.
+ */
+struct ocfs2_stack_operations {
+        /*
+         * The fs code calls ocfs2_cluster_connect() to attach a new
+         * filesystem to the cluster stack.  The ->connect() op is passed
+         * an ocfs2_cluster_connection with the name and recovery field
+         * filled in.
+         *
+         * The stack must set up any notification mechanisms and create
+         * the filesystem lockspace in the DLM.  The lockspace should be
+         * stored on cc_lockspace.  Any other information can be stored on
+         * cc_private.
+         *
+         * ->connect() must not return until it is guaranteed that
+         *
+         *  - Node down notifications for the filesystem will be received
+         *    and passed to conn->cc_recovery_handler().
+         *  - Locking requests for the filesystem will be processed.
+         */
+        int (*connect)(struct ocfs2_cluster_connection *conn);
+        /*
+         * The fs code calls ocfs2_cluster_disconnect() when a filesystem
+         * no longer needs cluster services.  All DLM locks have been
+         * dropped, and recovery notification is being ignored by the
+         * fs code.  The stack must disengage from the DLM and discontinue
+         * recovery notification.
+         *
+         * Once ->disconnect() has returned, the connection structure will
+         * be freed.  Thus, a stack must not return from ->disconnect()
+         * until it will no longer reference the conn pointer.
+         *
+         * If hangup_pending is zero, ocfs2_cluster_disconnect() will also
+         * be dropping the reference on the module.
+         */
+        int (*disconnect)(struct ocfs2_cluster_connection *conn,
+                          int hangup_pending);
+        /*
+         * ocfs2_cluster_hangup() exists for compatibility with older
+         * ocfs2 tools.  Only the classic stack really needs it.  As such
+         * ->hangup() is not required of all stacks.  See the comment by
+         * ocfs2_cluster_hangup() for more details.
+         *
+         * Note that ocfs2_cluster_hangup() can only be called if
+         * hangup_pending was passed to ocfs2_cluster_disconnect().
+         */
+        void (*hangup)(const char *group, int grouplen);
+        /*
+         * ->this_node() returns the cluster's unique identifier for the
+         * local node.
+         */
+        int (*this_node)(unsigned int *node);
+        /*
+         * Call the underlying dlm lock function.  The ->dlm_lock()
+         * callback should convert the flags and mode as appropriate.
+         *
+         * ast and bast functions are not part of the call because the
+         * stack will likely want to wrap ast and bast calls before passing
+         * them to stack->sp_proto.
+         */
+        int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
+                        int mode,
+                        union ocfs2_dlm_lksb *lksb,
+                        u32 flags,
+                        void *name,
+                        unsigned int namelen,
+                        void *astarg);
+        /*
+         * Call the underlying dlm unlock function.  The ->dlm_unlock()
+         * function should convert the flags as appropriate.
+         *
+         * The unlock ast is not passed, as the stack will want to wrap
+         * it before calling stack->sp_proto->lp_unlock_ast().
+         */
+        int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
+                          union ocfs2_dlm_lksb *lksb,
+                          u32 flags,
+                          void *astarg);
+        /*
+         * Return the status of the current lock status block.  The fs
+         * code should never dereference the union.  The ->lock_status()
+         * callback pulls out the stack-specific lksb, converts the status
+         * to a proper errno, and returns it.
+         */
+        int (*lock_status)(union ocfs2_dlm_lksb *lksb);
+        /*
+         * Pull the lvb pointer off of the stack-specific lksb.
+         */
+        void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
+        /*
+         * This is an optional debugging hook.  If provided, the
+         * stack can dump debugging information about this lock.
+         */
+        void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
+};
+/*
+ * Each stack plugin must describe itself by registering a
+ * ocfs2_stack_plugin structure.  This is only seen by stackglue and the
+ * stack driver.
+ *
+ * The plugin supplies sp_name, sp_ops and sp_owner before calling
+ * ocfs2_stack_glue_register().  Registration resets sp_count to zero,
+ * points sp_proto at the current locking protocol and links sp_list onto
+ * the list of known stacks.
+ */
+struct ocfs2_stack_plugin {
+        char *sp_name;
+        struct ocfs2_stack_operations *sp_ops;
+        struct module *sp_owner;
+        /* These are managed by the stackglue code. */
+        struct list_head sp_list;
+        unsigned int sp_count;
+        struct ocfs2_locking_protocol *sp_proto;
+};
+/* Used by the filesystem */
+int ocfs2_cluster_connect(const char *stack_name,
+                          const char *group,
+                          int grouplen,
+                          void (*recovery_handler)(int node_num,
+                                                   void *recovery_data),
+                          void *recovery_data,
+                          struct ocfs2_cluster_connection **conn);
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+                             int hangup_pending);
+void ocfs2_cluster_hangup(const char *group, int grouplen);
+int ocfs2_cluster_this_node(unsigned int *node);
+struct ocfs2_lock_res;
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+                   int mode,
+                   union ocfs2_dlm_lksb *lksb,
+                   u32 flags,
+                   void *name,
+                   unsigned int namelen,
+                   struct ocfs2_lock_res *astarg);
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+                     union ocfs2_dlm_lksb *lksb,
+                     u32 flags,
+                     struct ocfs2_lock_res *astarg);
+int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
+void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
+void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
+void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
+/* Used by stack plugins */
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+#endif  /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 72c198a004df..d2d278fb9819 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -46,6 +46,11 @@
 #include "buffer_head_io.h"
+#define NOT_ALLOC_NEW_GROUP             0
+#define ALLOC_NEW_GROUP                 1
+#define OCFS2_MAX_INODES_TO_STEAL       1024
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -106,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
                                                u64 *bg_blkno,
                                                u16 *bg_bit_off);
-void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 {
        struct inode *inode = ac->ac_inode;
@@ -117,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
                mutex_unlock(&inode->i_mutex);
                iput(inode);
+                ac->ac_inode = NULL;
        }
-        if (ac->ac_bh)
+        if (ac->ac_bh) {
                brelse(ac->ac_bh);
+                ac->ac_bh = NULL;
+        }
+}
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+{
+        ocfs2_free_ac_resource(ac);
        kfree(ac);
 }
@@ -391,7 +404,8 @@ bail:
 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                                       struct ocfs2_alloc_context *ac,
                                       int type,
-                                       u32 slot)
+                                       u32 slot,
+                                       int alloc_new_group)
 {
        int status;
        u32 bits_wanted = ac->ac_bits_wanted;
@@ -420,6 +434,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
        }
        ac->ac_inode = alloc_inode;
+        ac->ac_alloc_slot = slot;
        fe = (struct ocfs2_dinode *) bh->b_data;
        if (!OCFS2_IS_VALID_DINODE(fe)) {
@@ -446,6 +461,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                        goto bail;
                }
+                if (alloc_new_group != ALLOC_NEW_GROUP) {
+                        mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
+                             "and we don't alloc a new group for it.\n",
+                             slot, bits_wanted, free_bits);
+                        status = -ENOSPC;
+                        goto bail;
+                }
                status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
                if (status < 0) {
                        if (status != -ENOSPC)
@@ -490,7 +513,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
        (*ac)->ac_group_search = ocfs2_block_group_search;
        status = ocfs2_reserve_suballoc_bits(osb, (*ac),
-                                             EXTENT_ALLOC_SYSTEM_INODE, slot);
+                                             EXTENT_ALLOC_SYSTEM_INODE,
+                                             slot, ALLOC_NEW_GROUP);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
@@ -508,10 +532,42 @@ bail:
        return status;
 }
+/*
+ * Try to reserve inode bits from another slot's inode allocator without
+ * growing it.  Starts at the remembered steal slot, or at the slot after
+ * ours if none is remembered, and makes at most max_slots passes,
+ * wrapping around and skipping our own slot.  The first slot that works
+ * is remembered via ocfs2_set_inode_steal_slot().  Returns the status of
+ * the last reservation attempt, or -ENOSPC if no slot was tried.
+ */
+static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
+                                              struct ocfs2_alloc_context *ac)
+{
+        int tried = 0, status = -ENOSPC;
+        s16 slot = ocfs2_get_inode_steal_slot(osb);
+        /* Start to steal inodes from the first slot after ours. */
+        if (slot == OCFS2_INVALID_SLOT)
+                slot = osb->slot_num + 1;
+        while (tried < osb->max_slots) {
+                if (slot == osb->max_slots)
+                        slot = 0;
+                if (slot != osb->slot_num) {
+                        status = ocfs2_reserve_suballoc_bits(osb, ac,
+                                                     INODE_ALLOC_SYSTEM_INODE,
+                                                     slot, NOT_ALLOC_NEW_GROUP);
+                        if (status >= 0) {
+                                ocfs2_set_inode_steal_slot(osb, slot);
+                                return status;
+                        }
+                        /* Release this slot's resources before trying the next */
+                        ocfs2_free_ac_resource(ac);
+                }
+                tried++;
+                slot++;
+        }
+        return status;
+}
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
                            struct ocfs2_alloc_context **ac)
 {
        int status;
+        s16 slot = ocfs2_get_inode_steal_slot(osb);
        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
        if (!(*ac)) {
@@ -525,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
        (*ac)->ac_group_search = ocfs2_block_group_search;
+        /*
+         * slot is set when we successfully steal inode from other nodes.
+         * It is reset in 3 places:
+         * 1. when we flush the truncate log
+         * 2. when we complete local alloc recovery.
+         * 3. when we successfully allocate from our own slot.
+         * After it is set, we will go on stealing inodes until we find the
+         * need to check our slots to see whether there is some space for us.
+         */
+        if (slot != OCFS2_INVALID_SLOT &&
+            atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
+                goto inode_steal;
+        atomic_set(&osb->s_num_inodes_stolen, 0);
        status = ocfs2_reserve_suballoc_bits(osb, *ac,
                                             INODE_ALLOC_SYSTEM_INODE,
-                                             osb->slot_num);
+                                             osb->slot_num, ALLOC_NEW_GROUP);
+        if (status >= 0) {
+                status = 0;
+                /*
+                 * Some inodes must be freed by us, so try to allocate
+                 * from our own next time.
+                 */
+                if (slot != OCFS2_INVALID_SLOT)
+                        ocfs2_init_inode_steal_slot(osb);
+                goto bail;
+        } else if (status < 0 && status != -ENOSPC) {
+                mlog_errno(status);
+                goto bail;
+        }
+        ocfs2_free_ac_resource(*ac);
+inode_steal:
+        status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
+        atomic_inc(&osb->s_num_inodes_stolen);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
@@ -557,7 +647,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
        status = ocfs2_reserve_suballoc_bits(osb, ac,
                                             GLOBAL_BITMAP_SYSTEM_INODE,
-                                             OCFS2_INVALID_SLOT);
+                                             OCFS2_INVALID_SLOT,
+                                             ALLOC_NEW_GROUP);
        if (status < 0 && status != -ENOSPC) {
                mlog_errno(status);
                goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8799033bb459..544c600662bd 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *,
 struct ocfs2_alloc_context {
        struct inode *ac_inode;    /* which bitmap are we allocating from? */
        struct buffer_head *ac_bh; /* file entry bh */
+        u32    ac_alloc_slot;   /* which slot are we allocating from? */
        u32    ac_bits_wanted;
        u32    ac_bits_given;
 #define OCFS2_AC_USE_LOCAL 1
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index bec75aff3d9f..df63ba20ae90 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -40,8 +40,7 @@
 #include <linux/crc32.h>
 #include <linux/debugfs.h>
 #include <linux/mount.h>
+#include <linux/seq_file.h>
-#include <cluster/nodemanager.h>
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -88,6 +87,7 @@ struct mount_options
        unsigned int    atime_quantum;
        signed short    slot;
        unsigned int    localalloc_opt;
+        char            cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
 };
 static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -109,7 +109,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
 static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
 static int ocfs2_check_volume(struct ocfs2_super *osb);
 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
                               struct buffer_head *bh,
@@ -154,6 +153,7 @@ enum {
        Opt_commit,
        Opt_localalloc,
        Opt_localflocks,
+        Opt_stack,
        Opt_err,
 };
@@ -172,6 +172,7 @@ static match_table_t tokens = {
        {Opt_commit, "commit=%u"},
        {Opt_localalloc, "localalloc=%d"},
        {Opt_localflocks, "localflocks"},
+        {Opt_stack, "cluster_stack=%s"},
        {Opt_err, NULL}
 };
@@ -551,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
                }
        }
+        if (ocfs2_userspace_stack(osb)) {
+                if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+                        mlog(ML_ERROR, "Userspace stack expected, but "
+                             "o2cb heartbeat arguments passed to mount\n");
+                        return -EINVAL;
+                }
+        }
        if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
-                if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) {
+                if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
+                    !ocfs2_userspace_stack(osb)) {
                        mlog(ML_ERROR, "Heartbeat has to be started to mount "
                             "a read-write clustered device.\n");
                        return -EINVAL;
@@ -562,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
        return 0;
 }
+/*
+ * If we're using a userspace stack, mount should have passed
+ * a name that matches the disk.  If not, mount should not
+ * have passed a stack.
+ *
+ * Returns 0 when the cluster_stack mount option in mopt is consistent
+ * with osb, and -EINVAL (after logging an ML_ERROR message) when:
+ *  - osb is not using a userspace stack but mopt->cluster_stack is
+ *    non-empty, or
+ *  - osb is using a userspace stack and the first OCFS2_STACK_LABEL_LEN
+ *    bytes of osb->osb_cluster_stack and mopt->cluster_stack differ.
+ *
+ * Because the second check is a plain strncmp(), an empty
+ * mopt->cluster_stack (option not given) is also rejected whenever
+ * osb->osb_cluster_stack is non-empty.
+ */
+static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
+                                        struct mount_options *mopt)
+{
+        if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
+                mlog(ML_ERROR,
+                     "cluster stack passed to mount, but this filesystem "
+                     "does not support it\n");
+                return -EINVAL;
+        }
+        if (ocfs2_userspace_stack(osb) &&
+            strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
+                    OCFS2_STACK_LABEL_LEN)) {
+                mlog(ML_ERROR,
+                     "cluster stack passed to mount (\"%s\") does not "
+                     "match the filesystem (\"%s\")\n",
+                     mopt->cluster_stack,
+                     osb->osb_cluster_stack);
+                return -EINVAL;
+        }
+        return 0;
+}
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct dentry *root;
@@ -579,15 +618,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
                goto read_super_error;
        }
-        /* for now we only have one cluster/node, make sure we see it
-         * in the heartbeat universe */
-        if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) {
-                if (!o2hb_check_local_node_heartbeating()) {
-                        status = -EINVAL;
-                        goto read_super_error;
-                }
-        }
        /* probe for superblock */
        status = ocfs2_sb_probe(sb, &bh, &sector_size);
        if (status < 0) {
@@ -609,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        osb->osb_commit_interval = parsed_options.commit_interval;
        osb->local_alloc_size = parsed_options.localalloc_opt;
+        status = ocfs2_verify_userspace_stack(osb, &parsed_options);
+        if (status)
+                goto read_super_error;
        sb->s_magic = OCFS2_SUPER_MAGIC;
        /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -694,7 +728,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        if (ocfs2_mount_local(osb))
                snprintf(nodestr, sizeof(nodestr), "local");
        else
-                snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+                snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
        printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
               "with %s data mode.\n",
@@ -763,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb,
        mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
        mopt->slot = OCFS2_INVALID_SLOT;
        mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+        mopt->cluster_stack[0] = '\0';
        if (!options) {
                status = 1;
@@ -864,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb,
                        if (!is_remount)
                                mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
                        break;
+                case Opt_stack:
+                        /* Check both that the option we were passed
+                         * is of the right length and that it is a proper
+                         * string of the right length.
+                         */
+                        if (((args[0].to - args[0].from) !=
+                             OCFS2_STACK_LABEL_LEN) ||
+                            (strnlen(args[0].from,
+                                     OCFS2_STACK_LABEL_LEN) !=
+                             OCFS2_STACK_LABEL_LEN)) {
+                                mlog(ML_ERROR,
+                                     "Invalid cluster_stack option\n");
+                                status = 0;
+                                goto bail;
+                        }
+                        memcpy(mopt->cluster_stack, args[0].from,
+                               OCFS2_STACK_LABEL_LEN);
+                        mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+                        break;
                default:
                        mlog(ML_ERROR,
                             "Unrecognized mount option \"%s\" "
@@ -922,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (opts & OCFS2_MOUNT_LOCALFLOCKS)
                seq_printf(s, ",localflocks,");
+        if (osb->osb_cluster_stack[0])
+                seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
+                           osb->osb_cluster_stack);
        return 0;
 }
@@ -957,6 +1015,8 @@ static int __init ocfs2_init(void)
                mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
        }
+        ocfs2_set_locking_protocol();
 leave:
        if (status < 0) {
                ocfs2_free_mem_caches();
@@ -1132,31 +1192,6 @@ static int ocfs2_get_sector(struct super_block *sb,
        return 0;
 }
-/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
-{
-        int status;
-        /* XXX hold a ref on the node while mounte?  easy enough, if
-         * desirable. */
-        if (ocfs2_mount_local(osb))
-                osb->node_num = 0;
-        else
-                osb->node_num = o2nm_this_node();
-        if (osb->node_num == O2NM_MAX_NODES) {
-                mlog(ML_ERROR, "could not find this host's node number\n");
-                status = -ENOENT;
-                goto bail;
-        }
-        mlog(0, "I am node %d\n", osb->node_num);
-        status = 0;
-bail:
-        return status;
-}
 static int ocfs2_mount_volume(struct super_block *sb)
 {
        int status = 0;
@@ -1168,12 +1203,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
        if (ocfs2_is_hard_readonly(osb))
                goto leave;
-        status = ocfs2_fill_local_node_info(osb);
-        if (status < 0) {
-                mlog_errno(status);
-                goto leave;
-        }
        status = ocfs2_dlm_init(osb);
        if (status < 0) {
                mlog_errno(status);
@@ -1224,18 +1253,9 @@ leave:
        return status;
 }
-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
-static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
-{
-        mb();
-        return osb->recovery_thread_task != NULL;
-}
 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 {
-        int tmp;
+        int tmp, hangup_needed = 0;
        struct ocfs2_super *osb = NULL;
        char nodestr[8];
@@ -1249,25 +1269,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        ocfs2_truncate_log_shutdown(osb);
-        /* disable any new recovery threads and wait for any currently
+        /* This will disable recovery and flush any recovery work. */
-         * running ones to exit. Do this before setting the vol_state. */
+        ocfs2_recovery_exit(osb);
-        mutex_lock(&osb->recovery_lock);
-        osb->disable_recovery = 1;
-        mutex_unlock(&osb->recovery_lock);
-        wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
-        /* At this point, we know that no more recovery threads can be
-         * launched, so wait for any recovery completion work to
-         * complete. */
-        flush_workqueue(ocfs2_wq);
        ocfs2_journal_shutdown(osb);
        ocfs2_sync_blockdev(sb);
-        /* No dlm means we've failed during mount, so skip all the
+        /* No cluster connection means we've failed during mount, so skip
-         * steps which depended on that to complete. */
+         * all the steps which depended on that to complete. */
-        if (osb->dlm) {
+        if (osb->cconn) {
                tmp = ocfs2_super_lock(osb, 1);
                if (tmp < 0) {
                        mlog_errno(tmp);
@@ -1278,25 +1289,34 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        if (osb->slot_num != OCFS2_INVALID_SLOT)
                ocfs2_put_slot(osb);
-        if (osb->dlm)
+        if (osb->cconn)
                ocfs2_super_unlock(osb, 1);
        ocfs2_release_system_inodes(osb);
-        if (osb->dlm)
+        /*
-                ocfs2_dlm_shutdown(osb);
+         * If we're dismounting due to mount error, mount.ocfs2 will clean
+         * up heartbeat.  If we're a local mount, there is no heartbeat.
+         * If we failed before we got a uuid_str yet, we can't stop
+         * heartbeat.  Otherwise, do it.
+         */
+        if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+                hangup_needed = 1;
+        if (osb->cconn)
+                ocfs2_dlm_shutdown(osb, hangup_needed);
        debugfs_remove(osb->osb_debug_root);
-        if (!mnt_err)
+        if (hangup_needed)
-                ocfs2_stop_heartbeat(osb);
+                ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
        atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
        if (ocfs2_mount_local(osb))
                snprintf(nodestr, sizeof(nodestr), "local");
        else
-                snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+                snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
        printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
               osb->dev_str, nodestr);
@@ -1355,7 +1375,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
        sb->s_fs_info = osb;
        sb->s_op = &ocfs2_sops;
        sb->s_export_op = &ocfs2_export_ops;
-        osb->osb_locking_proto = ocfs2_locking_protocol;
        sb->s_time_gran = 1;
        sb->s_flags |= MS_NOATIME;
        /* this is needed to support O_LARGEFILE */
@@ -1368,7 +1387,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
        osb->s_sectsize_bits = blksize_bits(sector_size);
        BUG_ON(!osb->s_sectsize_bits);
-        init_waitqueue_head(&osb->recovery_event);
        spin_lock_init(&osb->dc_task_lock);
        init_waitqueue_head(&osb->dc_event);
        osb->dc_work_sequence = 0;
@@ -1376,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
        INIT_LIST_HEAD(&osb->blocked_lock_list);
        osb->blocked_lock_count = 0;
        spin_lock_init(&osb->osb_lock);
+        ocfs2_init_inode_steal_slot(osb);
        atomic_set(&osb->alloc_stats.moves, 0);
        atomic_set(&osb->alloc_stats.local_data, 0);
@@ -1388,24 +1407,23 @@ static int ocfs2_initialize_super(struct super_block *sb,
        snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
                 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
-        mutex_init(&osb->recovery_lock);
+        status = ocfs2_recovery_init(osb);
+        if (status) {
-        osb->disable_recovery = 0;
+                mlog(ML_ERROR, "Unable to initialize recovery state\n");
-        osb->recovery_thread_task = NULL;
+                mlog_errno(status);
+                goto bail;
+        }
        init_waitqueue_head(&osb->checkpoint_event);
        atomic_set(&osb->needs_checkpoint, 0);
        osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
-        osb->node_num = O2NM_INVALID_NODE_NUM;
        osb->slot_num = OCFS2_INVALID_SLOT;
        osb->local_alloc_state = OCFS2_LA_UNUSED;
        osb->local_alloc_bh = NULL;
-        ocfs2_setup_hb_callbacks(osb);
        init_waitqueue_head(&osb->osb_mount_event);
        osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
@@ -1455,6 +1473,25 @@ static int ocfs2_initialize_super(struct super_block *sb,
                goto bail;
        }
+        if (ocfs2_userspace_stack(osb)) {
+                memcpy(osb->osb_cluster_stack,
+                       OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
+                       OCFS2_STACK_LABEL_LEN);
+                osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+                if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
+                        mlog(ML_ERROR,
+                             "couldn't mount because of an invalid "
+                             "cluster stack label (%s) \n",
+                             osb->osb_cluster_stack);
+                        status = -EINVAL;
+                        goto bail;
+                }
+        } else {
+                /* The empty string is identical with classic tools that
+                 * don't know about s_cluster_info. */
+                osb->osb_cluster_stack[0] = '\0';
+        }
        get_random_bytes(&osb->s_next_generation, sizeof(u32));
        /* FIXME
@@ -1724,8 +1761,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
        /* This function assumes that the caller has the main osb resource */
-        if (osb->slot_info)
+        ocfs2_free_slot_info(osb);
-                ocfs2_free_slot_info(osb->slot_info);
        kfree(osb->osb_orphan_wipes);
        /* FIXME
diff --git a/fs/open.c b/fs/open.c
index 54198538b67e..b70e7666bb2c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -244,21 +244,21 @@ static long do_sys_truncate(const char __user * path, loff_t length)
        if (!S_ISREG(inode->i_mode))
                goto dput_and_out;
-        error = vfs_permission(&nd, MAY_WRITE);
+        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto dput_and_out;
-        error = -EROFS;
+        error = vfs_permission(&nd, MAY_WRITE);
-        if (IS_RDONLY(inode))
+        if (error)
-                goto dput_and_out;
+                goto mnt_drop_write_and_out;
        error = -EPERM;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-                goto dput_and_out;
+                goto mnt_drop_write_and_out;
        error = get_write_access(inode);
        if (error)
-                goto dput_and_out;
+                goto mnt_drop_write_and_out;
        /*
         * Make sure that there are no leases.  get_write_access() protects
@@ -276,6 +276,8 @@ static long do_sys_truncate(const char __user * path, loff_t length)
 put_write_and_out:
        put_write_access(inode);
+mnt_drop_write_and_out:
+        mnt_drop_write(nd.path.mnt);
 dput_and_out:
        path_put(&nd.path);
 out:
@@ -335,7 +337,7 @@ asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
 {
        long ret = do_sys_ftruncate(fd, length, 1);
        /* avoid REGPARM breakage on x86: */
-        prevent_tail_call(ret);
+        asmlinkage_protect(2, ret, fd, length);
        return ret;
 }
@@ -350,7 +352,7 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
 {
        long ret = do_sys_ftruncate(fd, length, 0);
        /* avoid REGPARM breakage on x86: */
-        prevent_tail_call(ret);
+        asmlinkage_protect(2, ret, fd, length);
        return ret;
 }
 #endif
@@ -457,8 +459,17 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
        if(res || !(mode & S_IWOTH) ||
           special_file(nd.path.dentry->d_inode->i_mode))
                goto out_path_release;
+        /*
-        if(IS_RDONLY(nd.path.dentry->d_inode))
+         * This is a rare case where using __mnt_is_readonly()
+         * is OK without a mnt_want/drop_write() pair.  Since
+         * no actual write to the fs is performed here, we do
+         * not need to telegraph that to anyone.
+         *
+         * By doing this, we accept that this access is
+         * inherently racy and know that the fs may change
+         * state before we even see this result.
+         */
+        if (__mnt_is_readonly(nd.path.mnt))
                res = -EROFS;
 out_path_release:
@@ -567,12 +578,12 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
        audit_inode(NULL, dentry);
-        err = -EROFS;
+        err = mnt_want_write(file->f_path.mnt);
-        if (IS_RDONLY(inode))
+        if (err)
                goto out_putf;
        err = -EPERM;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-                goto out_putf;
+                goto out_drop_write;
        mutex_lock(&inode->i_mutex);
        if (mode == (mode_t) -1)
                mode = inode->i_mode;
@@ -581,6 +592,8 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
        err = notify_change(dentry, &newattrs);
        mutex_unlock(&inode->i_mutex);
+out_drop_write:
+        mnt_drop_write(file->f_path.mnt);
 out_putf:
        fput(file);
 out:
@@ -600,13 +613,13 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
                goto out;
        inode = nd.path.dentry->d_inode;
-        error = -EROFS;
+        error = mnt_want_write(nd.path.mnt);
-        if (IS_RDONLY(inode))
+        if (error)
                goto dput_and_out;
        error = -EPERM;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-                goto dput_and_out;
+                goto out_drop_write;
        mutex_lock(&inode->i_mutex);
        if (mode == (mode_t) -1)
@@ -616,6 +629,8 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
        error = notify_change(nd.path.dentry, &newattrs);
        mutex_unlock(&inode->i_mutex);
+out_drop_write:
+        mnt_drop_write(nd.path.mnt);
 dput_and_out:
        path_put(&nd.path);
 out:
@@ -638,9 +653,6 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
                printk(KERN_ERR "chown_common: NULL inode\n");
                goto out;
        }
-        error = -EROFS;
-        if (IS_RDONLY(inode))
-                goto out;
        error = -EPERM;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                goto out;
@@ -671,7 +683,12 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
        error = user_path_walk(filename, &nd);
        if (error)
                goto out;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto out_release;
        error = chown_common(nd.path.dentry, user, group);
+        mnt_drop_write(nd.path.mnt);
+out_release:
        path_put(&nd.path);
 out:
        return error;
@@ -691,7 +708,12 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
        error = __user_walk_fd(dfd, filename, follow, &nd);
        if (error)
                goto out;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto out_release;
        error = chown_common(nd.path.dentry, user, group);
+        mnt_drop_write(nd.path.mnt);
+out_release:
        path_put(&nd.path);
 out:
        return error;
@@ -705,7 +727,12 @@ asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group
        error = user_path_walk_link(filename, &nd);
        if (error)
                goto out;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto out_release;
        error = chown_common(nd.path.dentry, user, group);
+        mnt_drop_write(nd.path.mnt);
+out_release:
        path_put(&nd.path);
 out:
        return error;
@@ -722,14 +749,48 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
        if (!file)
                goto out;
+        error = mnt_want_write(file->f_path.mnt);
+        if (error)
+                goto out_fput;
        dentry = file->f_path.dentry;
        audit_inode(NULL, dentry);
        error = chown_common(dentry, user, group);
+        mnt_drop_write(file->f_path.mnt);
+out_fput:
        fput(file);
 out:
        return error;
 }
+/*
+ * You have to be very careful that these write
+ * counts get cleaned up in error cases and
+ * upon __fput().  This should probably never
+ * be called outside of __dentry_open().
+ *
+ * Takes write access on @inode with get_write_access() and,
+ * unless the inode is a special file, also takes a writer
+ * count on @mnt with mnt_want_write().
+ *
+ * Returns the error from get_write_access() if that fails.
+ * Otherwise returns 0 for special files, or the result of
+ * mnt_want_write() for everything else.  When a non-zero
+ * value is returned, no inode write access is left held.
+ */
+static inline int __get_file_write_access(struct inode *inode,
+                                          struct vfsmount *mnt)
+{
+        int error;
+        error = get_write_access(inode);
+        if (error)
+                return error;
+        /*
+         * Do not take mount writer counts on
+         * special files since no writes to
+         * the mount itself will occur.
+         */
+        if (!special_file(inode->i_mode)) {
+                /*
+                 * Balanced in __fput().  If the mount
+                 * writer count cannot be taken, drop the
+                 * inode write access acquired above so
+                 * the caller has nothing to undo.
+                 */
+                error = mnt_want_write(mnt);
+                if (error)
+                        put_write_access(inode);
+        }
+        return error;
+}
 static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
                                        int flags, struct file *f,
                                        int (*open)(struct inode *, struct file *))
@@ -742,9 +803,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
                                FMODE_PREAD | FMODE_PWRITE;
        inode = dentry->d_inode;
        if (f->f_mode & FMODE_WRITE) {
-                error = get_write_access(inode);
+                error = __get_file_write_access(inode, mnt);
                if (error)
                        goto cleanup_file;
+                if (!special_file(inode->i_mode))
+                        file_take_write(f);
        }
        f->f_mapping = inode->i_mapping;
@@ -784,8 +847,19 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 cleanup_all:
        fops_put(f->f_op);
-        if (f->f_mode & FMODE_WRITE)
+        if (f->f_mode & FMODE_WRITE) {
                put_write_access(inode);
+                if (!special_file(inode->i_mode)) {
+                        /*
+                         * We don't consider this a real
+                         * mnt_want/drop_write() pair
+                         * because it all happened right
+                         * here, so just reset the state.
+                         */
+                        file_reset_write(f);
+                        mnt_drop_write(mnt);
+                }
+        }
        file_kill(f);
        f->f_path.dentry = NULL;
        f->f_path.mnt = NULL;
@@ -796,43 +870,6 @@ cleanup_file:
        return ERR_PTR(error);
 }
-/*
- * Note that while the flag value (low two bits) for sys_open means:
- *      00 - read-only
- *      01 - write-only
- *      10 - read-write
- *      11 - special
- * it is changed into
- *      00 - no permissions needed
- *      01 - read-permission
- *      10 - write-permission
- *      11 - read-write
- * for the internal routines (ie open_namei()/follow_link() etc). 00 is
- * used by symlinks.
- */
-static struct file *do_filp_open(int dfd, const char *filename, int flags,
-                                 int mode)
-{
-        int namei_flags, error;
-        struct nameidata nd;
-        namei_flags = flags;
-        if ((namei_flags+1) & O_ACCMODE)
-                namei_flags++;
-        error = open_namei(dfd, filename, namei_flags, mode, &nd);
-        if (!error)
-                return nameidata_to_filp(&nd, flags);
-        return ERR_PTR(error);
-}
-struct file *filp_open(const char *filename, int flags, int mode)
-{
-        return do_filp_open(AT_FDCWD, filename, flags, mode);
-}
-EXPORT_SYMBOL(filp_open);
 /**
 * lookup_instantiate_filp - instantiates the open intent filp
 * @nd: pointer to nameidata
@@ -903,6 +940,18 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
        int error;
        struct file *f;
+        /*
+         * We must always pass in a valid mount pointer.   Historically
+         * callers got away with not passing it, but we must enforce this at
+         * the earliest possible point now to avoid strange problems deep in the
+         * filesystem stack.
+         */
+        if (!mnt) {
+                printk(KERN_WARNING "%s called with NULL vfsmount\n", __func__);
+                dump_stack();
+                return ERR_PTR(-EINVAL);
+        }
        error = -ENFILE;
        f = get_empty_filp();
        if (f == NULL) {
@@ -1055,7 +1104,7 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
        ret = do_sys_open(AT_FDCWD, filename, flags, mode);
        /* avoid REGPARM breakage on x86: */
-        prevent_tail_call(ret);
+        asmlinkage_protect(3, ret, filename, flags, mode);
        return ret;
 }
@@ -1069,7 +1118,7 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
        ret = do_sys_open(dfd, filename, flags, mode);
        /* avoid REGPARM breakage on x86: */
-        prevent_tail_call(ret);
+        asmlinkage_protect(4, ret, dfd, filename, flags, mode);
        return ret;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 03f808c5b79d..6149e4b58c88 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -473,6 +473,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
                return 0;
        if (IS_ERR(state))      /* I/O error reading the partition table */
                return -EIO;
+        /* tell userspace that the media / partition table may have changed */
+        kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
        for (p = 1; p < state->limit; p++) {
                sector_t size = state->parts[p].size;
                sector_t from = state->parts[p].from;
diff --git a/fs/pipe.c b/fs/pipe.c
index 3c185b6527bc..8be381bbcb54 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -957,13 +957,10 @@ struct file *create_write_pipe(void)
        struct dentry *dentry;
        struct qstr name = { .name = "" };
-        f = get_empty_filp();
-        if (!f)
-                return ERR_PTR(-ENFILE);
        err = -ENFILE;
        inode = get_pipe_inode();
        if (!inode)
-                goto err_file;
+                goto err;
        err = -ENOMEM;
        dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
@@ -978,22 +975,24 @@ struct file *create_write_pipe(void)
         */
        dentry->d_flags &= ~DCACHE_UNHASHED;
        d_instantiate(dentry, inode);
-        f->f_path.mnt = mntget(pipe_mnt);
-        f->f_path.dentry = dentry;
+        err = -ENFILE;
+        f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipe_fops);
+        if (!f)
+                goto err_dentry;
        f->f_mapping = inode->i_mapping;
        f->f_flags = O_WRONLY;
-        f->f_op = &write_pipe_fops;
-        f->f_mode = FMODE_WRITE;
        f->f_version = 0;
        return f;
+ err_dentry:
+        dput(dentry);
 err_inode:
        free_pipe_info(inode);
        iput(inode);
- err_file:
+ err:
-        put_filp(f);
        return ERR_PTR(err);
 }
diff --git a/fs/pnode.c b/fs/pnode.c
index 05ba692bc540..1d8f5447f3f7 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -225,7 +225,7 @@ out:
 */
 static inline int do_refcount_check(struct vfsmount *mnt, int count)
 {
-        int mycount = atomic_read(&mnt->mnt_count);
+        int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts;
        return (mycount > count);
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 88f8edf18258..81d7d145292a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -314,9 +314,12 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 static int lstats_show_proc(struct seq_file *m, void *v)
 {
        int i;
-        struct task_struct *task = m->private;
+        struct inode *inode = m->private;
-        seq_puts(m, "Latency Top version : v0.1\n");
+        struct task_struct *task = get_proc_task(inode);
+        if (!task)
+                return -ESRCH;
+        seq_puts(m, "Latency Top version : v0.1\n");
        for (i = 0; i < 32; i++) {
                if (task->latency_record[i].backtrace[0]) {
                        int q;
@@ -341,32 +344,24 @@ static int lstats_show_proc(struct seq_file *m, void *v)
                }
        }
+        put_task_struct(task);
        return 0;
 }
 static int lstats_open(struct inode *inode, struct file *file)
 {
-        int ret;
+        return single_open(file, lstats_show_proc, inode);
-        struct seq_file *m;
-        struct task_struct *task = get_proc_task(inode);
-        ret = single_open(file, lstats_show_proc, NULL);
-        if (!ret) {
-                m = file->private_data;
-                m->private = task;
-        }
-        return ret;
 }
 static ssize_t lstats_write(struct file *file, const char __user *buf,
                            size_t count, loff_t *offs)
 {
-        struct seq_file *m;
+        struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
-        struct task_struct *task;
-        m = file->private_data;
+        if (!task)
-        task = m->private;
+                return -ESRCH;
        clear_all_latency_tracing(task);
+        put_task_struct(task);
        return count;
 }
@@ -416,6 +411,7 @@ static const struct limit_names lnames[RLIM_NLIMITS] = {
        [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
        [RLIMIT_NICE] = {"Max nice priority", NULL},
        [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
+        [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
 };
 /* Display limits for a process */
@@ -1040,6 +1036,26 @@ static const struct file_operations proc_loginuid_operations = {
        .read           = proc_loginuid_read,
        .write          = proc_loginuid_write,
 };
+static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
+                                  size_t count, loff_t *ppos)
+{
+        struct inode * inode = file->f_path.dentry->d_inode;
+        struct task_struct *task = get_proc_task(inode);
+        ssize_t length;
+        char tmpbuf[TMPBUFLEN];
+        if (!task)
+                return -ESRCH;
+        length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
+                                audit_get_sessionid(task));
+        put_task_struct(task);
+        return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
+}
+static const struct file_operations proc_sessionid_operations = {
+        .read           = proc_sessionid_read,
+};
 #endif
 #ifdef CONFIG_FAULT_INJECTION
@@ -2273,6 +2289,9 @@ static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, task),
        DIR("fd",         S_IRUSR|S_IXUSR, fd),
        DIR("fdinfo",     S_IRUSR|S_IXUSR, fdinfo),
+#ifdef CONFIG_NET
+        DIR("net",        S_IRUGO|S_IXUGO, net),
+#endif
        REG("environ",    S_IRUSR, environ),
        INF("auxv",       S_IRUSR, pid_auxv),
        ONE("status",     S_IRUGO, pid_status),
@@ -2320,6 +2339,7 @@ static const struct pid_entry tgid_base_stuff[] = {
        REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",   S_IWUSR|S_IRUGO, loginuid),
+        REG("sessionid",  S_IRUSR, sessionid),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
@@ -2650,6 +2670,7 @@ static const struct pid_entry tid_base_stuff[] = {
        REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",  S_IWUSR|S_IRUGO, loginuid),
+        REG("sessionid",  S_IRUSR, sessionid),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 68971e66cd41..a36ad3c75cf4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -377,15 +377,14 @@ static struct dentry_operations proc_dentry_operations =
 * Don't create negative dentries here, return -ENOENT by hand
 * instead.
 */
-struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
+                struct dentry *dentry)
 {
        struct inode *inode = NULL;
-        struct proc_dir_entry * de;
        int error = -ENOENT;
        lock_kernel();
        spin_lock(&proc_subdir_lock);
-        de = PDE(dir);
        if (de) {
                for (de = de->subdir; de ; de = de->next) {
                        if (de->namelen != dentry->d_name.len)
@@ -393,8 +392,6 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
                        if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
                                unsigned int ino;
-                                if (de->shadow_proc)
-                                        de = de->shadow_proc(current, de);
                                ino = de->low_ino;
                                de_get(de);
                                spin_unlock(&proc_subdir_lock);
@@ -417,6 +414,12 @@ out_unlock:
        return ERR_PTR(error);
 }
+struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
+                struct nameidata *nd)
+{
+        return proc_lookup_de(PDE(dir), dir, dentry);
+}
 /*
 * This returns non-zero if at EOF, so that the /proc
 * root directory can use this and check if it should
@@ -426,10 +429,9 @@ out_unlock:
 * value of the readdir() call, as long as it's non-negative
 * for success..
 */
-int proc_readdir(struct file * filp,
+int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
-        void * dirent, filldir_t filldir)
+                filldir_t filldir)
 {
-        struct proc_dir_entry * de;
        unsigned int ino;
        int i;
        struct inode *inode = filp->f_path.dentry->d_inode;
@@ -438,7 +440,6 @@ int proc_readdir(struct file * filp,
        lock_kernel();
        ino = inode->i_ino;
-        de = PDE(inode);
        if (!de) {
                ret = -EINVAL;
                goto out;
@@ -499,6 +500,13 @@ out:	unlock_kernel();
        return ret;     
 }
+int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        return proc_readdir_de(PDE(inode), filp, dirent, filldir);
+}
 /*
 * These are the generic /proc directory operations. They
 * use the in-memory "struct proc_dir_entry" tree to parse
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1c81c8f1aeed..bc72f5c8c47d 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -64,6 +64,8 @@ extern const struct file_operations proc_numa_maps_operations;
 extern const struct file_operations proc_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_net_operations;
+extern const struct inode_operations proc_net_inode_operations;
 void free_proc_entry(struct proc_dir_entry *de);
@@ -83,3 +85,8 @@ static inline int proc_fd(struct inode *inode)
 {
        return PROC_I(inode)->fd;
 }
+struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
+                struct dentry *dentry);
+int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
+                filldir_t filldir);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 468805d40e2b..2d563979cb02 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -32,6 +32,7 @@
 #include <linux/interrupt.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <linux/genhd.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
 #include <linux/module.h>
@@ -377,7 +378,6 @@ static int stram_read_proc(char *page, char **start, off_t off,
 #endif
 #ifdef CONFIG_BLOCK
-extern const struct seq_operations partitions_op;
 static int partitions_open(struct inode *inode, struct file *file)
 {
        return seq_open(file, &partitions_op);
@@ -389,7 +389,6 @@ static const struct file_operations proc_partitions_operations = {
        .release        = seq_release,
 };
-extern const struct seq_operations diskstats_op;
 static int diskstats_open(struct inode *inode, struct file *file)
 {
        return seq_open(file, &diskstats_op);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 14e9b5aaf863..13cd7835d0df 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -44,7 +44,9 @@ int seq_open_net(struct inode *ino, struct file *f,
                put_net(net);
                return -ENOMEM;
        }
+#ifdef CONFIG_NET_NS
        p->net = net;
+#endif
        return 0;
 }
 EXPORT_SYMBOL_GPL(seq_open_net);
@@ -52,17 +54,91 @@ EXPORT_SYMBOL_GPL(seq_open_net);
 int seq_release_net(struct inode *ino, struct file *f)
 {
        struct seq_file *seq;
-        struct seq_net_private *p;
        seq = f->private_data;
-        p = seq->private;
-        put_net(p->net);
+        put_net(seq_file_net(seq));
        seq_release_private(ino, f);
        return 0;
 }
 EXPORT_SYMBOL_GPL(seq_release_net);
+static struct net *get_proc_task_net(struct inode *dir)
+{
+        struct task_struct *task;
+        struct nsproxy *ns;
+        struct net *net = NULL;
+        rcu_read_lock();
+        task = pid_task(proc_pid(dir), PIDTYPE_PID);
+        if (task != NULL) {
+                ns = task_nsproxy(task);
+                if (ns != NULL)
+                        net = get_net(ns->net_ns);
+        }
+        rcu_read_unlock();
+        return net;
+}
+static struct dentry *proc_tgid_net_lookup(struct inode *dir,
+                struct dentry *dentry, struct nameidata *nd)
+{
+        struct dentry *de;
+        struct net *net;
+        de = ERR_PTR(-ENOENT);
+        net = get_proc_task_net(dir);
+        if (net != NULL) {
+                de = proc_lookup_de(net->proc_net, dir, dentry);
+                put_net(net);
+        }
+        return de;
+}
+static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                struct kstat *stat)
+{
+        struct inode *inode = dentry->d_inode;
+        struct net *net;
+        net = get_proc_task_net(inode);
+        generic_fillattr(inode, stat);
+        if (net != NULL) {
+                stat->nlink = net->proc_net->nlink;
+                put_net(net);
+        }
+        return 0;
+}
+const struct inode_operations proc_net_inode_operations = {
+        .lookup         = proc_tgid_net_lookup,
+        .getattr        = proc_tgid_net_getattr,
+};
+static int proc_tgid_net_readdir(struct file *filp, void *dirent,
+                filldir_t filldir)
+{
+        int ret;
+        struct net *net;
+        ret = -EINVAL;
+        net = get_proc_task_net(filp->f_path.dentry->d_inode);
+        if (net != NULL) {
+                ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
+                put_net(net);
+        }
+        return ret;
+}
+const struct file_operations proc_net_operations = {
+        .read           = generic_read_dir,
+        .readdir        = proc_tgid_net_readdir,
+};
 struct proc_dir_entry *proc_net_fops_create(struct net *net,
        const char *name, mode_t mode, const struct file_operations *fops)
@@ -83,14 +159,6 @@ struct net *get_proc_net(const struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(get_proc_net);
-static struct proc_dir_entry *shadow_pde;
-static struct proc_dir_entry *proc_net_shadow(struct task_struct *task,
-                                                struct proc_dir_entry *de)
-{
-        return task->nsproxy->net_ns->proc_net;
-}
 struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
                struct proc_dir_entry *parent)
 {
@@ -104,45 +172,39 @@ EXPORT_SYMBOL_GPL(proc_net_mkdir);
 static __net_init int proc_net_ns_init(struct net *net)
 {
-        struct proc_dir_entry *root, *netd, *net_statd;
+        struct proc_dir_entry *netd, *net_statd;
        int err;
        err = -ENOMEM;
-        root = kzalloc(sizeof(*root), GFP_KERNEL);
+        netd = kzalloc(sizeof(*netd), GFP_KERNEL);
-        if (!root)
+        if (!netd)
                goto out;
-        err = -EEXIST;
+        netd->data = net;
-        netd = proc_net_mkdir(net, "net", root);
+        netd->nlink = 2;
-        if (!netd)
+        netd->name = "net";
-                goto free_root;
+        netd->namelen = 3;
+        netd->parent = &proc_root;
        err = -EEXIST;
        net_statd = proc_net_mkdir(net, "stat", netd);
        if (!net_statd)
                goto free_net;
-        root->data = net;
-        net->proc_net_root = root;
        net->proc_net = netd;
        net->proc_net_stat = net_statd;
-        err = 0;
+        return 0;
+free_net:
+        kfree(netd);
 out:
        return err;
-free_net:
-        remove_proc_entry("net", root);
-free_root:
-        kfree(root);
-        goto out;
 }
 static __net_exit void proc_net_ns_exit(struct net *net)
 {
        remove_proc_entry("stat", net->proc_net);
-        remove_proc_entry("net", net->proc_net_root);
+        kfree(net->proc_net);
-        kfree(net->proc_net_root);
 }
 static struct pernet_operations __net_initdata proc_net_ns_ops = {
@@ -152,8 +214,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {
 int __init proc_net_init(void)
 {
-        shadow_pde = proc_mkdir("net", NULL);
+        proc_symlink("net", NULL, "self/net");
-        shadow_pde->shadow_proc = proc_net_shadow;
        return register_pernet_subsys(&proc_net_ns_ops);
 }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 49958cffbd8d..9dfb5ff24209 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -527,13 +527,21 @@ struct pagemapread {
        char __user *out, *end;
 };
-#define PM_ENTRY_BYTES sizeof(u64)
+#define PM_ENTRY_BYTES      sizeof(u64)
-#define PM_RESERVED_BITS    3
+#define PM_STATUS_BITS      3
-#define PM_RESERVED_OFFSET  (64 - PM_RESERVED_BITS)
+#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
-#define PM_RESERVED_MASK    (((1LL<<PM_RESERVED_BITS)-1) << PM_RESERVED_OFFSET)
+#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_SPECIAL(nr)      (((nr) << PM_RESERVED_OFFSET) | PM_RESERVED_MASK)
+#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_NOT_PRESENT      PM_SPECIAL(1LL)
+#define PM_PSHIFT_BITS      6
-#define PM_SWAP             PM_SPECIAL(2LL)
+#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
+#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
+#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
+#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+#define PM_PRESENT          PM_STATUS(4LL)
+#define PM_SWAP             PM_STATUS(2LL)
+#define PM_NOT_PRESENT      PM_PSHIFT(PAGE_SHIFT)
 #define PM_END_OF_BUFFER    1
 static int add_to_pagemap(unsigned long addr, u64 pfn,
@@ -574,7 +582,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 u64 swap_pte_to_pagemap_entry(pte_t pte)
 {
        swp_entry_t e = pte_to_swp_entry(pte);
-        return PM_SWAP | swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
+        return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
 }
 static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
@@ -588,9 +596,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                u64 pfn = PM_NOT_PRESENT;
                pte = pte_offset_map(pmd, addr);
                if (is_swap_pte(*pte))
-                        pfn = swap_pte_to_pagemap_entry(*pte);
+                        pfn = PM_PFRAME(swap_pte_to_pagemap_entry(*pte))
+                                | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
                else if (pte_present(*pte))
-                        pfn = pte_pfn(*pte);
+                        pfn = PM_PFRAME(pte_pfn(*pte))
+                                | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
                /* unmap so we're not in atomic when we copy to userspace */
                pte_unmap(pte);
                err = add_to_pagemap(addr, pfn, pm);
@@ -611,12 +621,20 @@ static struct mm_walk pagemap_walk = {
 /*
 * /proc/pid/pagemap - an array mapping virtual pages to pfns
 *
- * For each page in the address space, this file contains one 64-bit
+ * For each page in the address space, this file contains one 64-bit entry
- * entry representing the corresponding physical page frame number
+ * consisting of the following:
- * (PFN) if the page is present. If there is a swap entry for the
+ *
- * physical page, then an encoding of the swap file number and the
+ * Bits 0-54  page frame number (PFN) if present
- * page's offset into the swap file are returned. If no page is
+ * Bits 0-4   swap type if swapped
- * present at all, PM_NOT_PRESENT is returned. This allows determining
+ * Bits 5-54  swap offset if swapped
+ * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit  61    reserved for future use
+ * Bit  62    page swapped
+ * Bit  63    page present
+ *
+ * If the page is not present but in swap, then the PFN contains an
+ * encoding of the swap file number and the page's offset into the
+ * swap. Unmapped pages return a null PFN. This allows determining
 * precisely which pages are mapped (or in swap) and comparing mapped
 * pages between processes.
 *
@@ -640,17 +658,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        ret = -EACCES;
        if (!ptrace_may_attach(task))
-                goto out;
+                goto out_task;
        ret = -EINVAL;
        /* file position must be aligned */
        if (*ppos % PM_ENTRY_BYTES)
-                goto out;
+                goto out_task;
        ret = 0;
        mm = get_task_mm(task);
        if (!mm)
-                goto out;
+                goto out_task;
        ret = -ENOMEM;
        uaddr = (unsigned long)buf & PAGE_MASK;
@@ -658,7 +676,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
        pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
        if (!pages)
-                goto out_task;
+                goto out_mm;
        down_read(&current->mm->mmap_sem);
        ret = get_user_pages(current, current->mm, uaddr, pagecount,
@@ -668,6 +686,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        if (ret < 0)
                goto out_free;
+        if (ret != pagecount) {
+                pagecount = ret;
+                ret = -EFAULT;
+                goto out_pages;
+        }
        pm.out = buf;
        pm.end = buf + count;
@@ -699,15 +723,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
                        ret = pm.out - buf;
        }
+out_pages:
        for (; pagecount; pagecount--) {
                page = pages[pagecount-1];
                if (!PageReserved(page))
                        SetPageDirty(page);
                page_cache_release(page);
        }
-        mmput(mm);
 out_free:
        kfree(pages);
+out_mm:
+        mmput(mm);
 out_task:
        put_task_struct(task);
 out:
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index f85c5cf4934c..7ee4208793b6 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -283,7 +283,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
                return balance_leaf_when_delete(tb, flag);
        zeros_num = 0;
-        if (flag == M_INSERT && body == 0)
+        if (flag == M_INSERT && !body)
                zeros_num = ih_item_len(ih);
        pos_in_item = tb->tb_path->pos_in_item;
@@ -1728,7 +1728,7 @@ struct buffer_head *get_FEB(struct tree_balance *tb)
        struct buffer_info bi;
        for (i = 0; i < MAX_FEB_SIZE; i++)
-                if (tb->FEB[i] != 0)
+                if (tb->FEB[i] != NULL)
                        break;
        if (i == MAX_FEB_SIZE)
@@ -1827,7 +1827,7 @@ int get_left_neighbor_position(struct tree_balance *tb, int h)
 {
        int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
-        RFALSE(PATH_H_PPARENT(tb->tb_path, h) == 0 || tb->FL[h] == 0,
+        RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL,
               "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
               h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h));
@@ -1841,7 +1841,7 @@ int get_right_neighbor_position(struct tree_balance *tb, int h)
 {
        int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
-        RFALSE(PATH_H_PPARENT(tb->tb_path, h) == 0 || tb->FR[h] == 0,
+        RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL,
               "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
               h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]);
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 0ee35c6c9b72..07d05e0842b7 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -153,7 +153,7 @@ static void create_virtual_node(struct tree_balance *tb, int h)
        if (vn->vn_mode == M_INSERT) {
                struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
-                RFALSE(vn->vn_ins_ih == 0,
+                RFALSE(vn->vn_ins_ih == NULL,
                       "vs-8040: item header of inserted item is not specified");
                vi->vi_item_len = tb->insert_size[0];
                vi->vi_ih = vn->vn_ins_ih;
@@ -857,7 +857,8 @@ static int get_lfree(struct tree_balance *tb, int h)
        struct buffer_head *l, *f;
        int order;
-        if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
+        if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
+            (l = tb->FL[h]) == NULL)
                return 0;
        if (f == l)
@@ -878,7 +879,8 @@ static int get_rfree(struct tree_balance *tb, int h)
        struct buffer_head *r, *f;
        int order;
-        if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
+        if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
+            (r = tb->FR[h]) == NULL)
                return 0;
        if (f == r)
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index e0f0f098a523..74363a7aacbc 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -4,6 +4,7 @@
 #include <linux/capability.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -25,6 +26,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                   unsigned long arg)
 {
        unsigned int flags;
+        int err = 0;
        switch (cmd) {
        case REISERFS_IOC_UNPACK:
@@ -48,50 +50,67 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                        if (!reiserfs_attrs(inode->i_sb))
                                return -ENOTTY;
-                        if (IS_RDONLY(inode))
+                        err = mnt_want_write(filp->f_path.mnt);
-                                return -EROFS;
+                        if (err)
+                                return err;
-                        if (!is_owner_or_cap(inode))
+                        if (!is_owner_or_cap(inode)) {
-                                return -EPERM;
+                                err = -EPERM;
+                                goto setflags_out;
-                        if (get_user(flags, (int __user *)arg))
+                        }
-                                return -EFAULT;
+                        if (get_user(flags, (int __user *)arg)) {
+                                err = -EFAULT;
-                        /* Is it quota file? Do not allow user to mess with it. */
+                                goto setflags_out;
-                        if (IS_NOQUOTA(inode))
+                        }
-                                return -EPERM;
+                        /*
+                         * Is it a quota file? Do not allow the user to mess with it
+                         */
+                        if (IS_NOQUOTA(inode)) {
+                                err = -EPERM;
+                                goto setflags_out;
+                        }
                        if (((flags ^ REISERFS_I(inode)->
                              i_attrs) & (REISERFS_IMMUTABLE_FL |
                                          REISERFS_APPEND_FL))
-                            && !capable(CAP_LINUX_IMMUTABLE))
+                            && !capable(CAP_LINUX_IMMUTABLE)) {
-                                return -EPERM;
+                                err = -EPERM;
+                                goto setflags_out;
+                        }
                        if ((flags & REISERFS_NOTAIL_FL) &&
                            S_ISREG(inode->i_mode)) {
                                int result;
                                result = reiserfs_unpack(inode, filp);
-                                if (result)
+                                if (result) {
-                                        return result;
+                                        err = result;
+                                        goto setflags_out;
+                                }
                        }
                        sd_attrs_to_i_attrs(flags, inode);
                        REISERFS_I(inode)->i_attrs = flags;
                        inode->i_ctime = CURRENT_TIME_SEC;
                        mark_inode_dirty(inode);
-                        return 0;
+setflags_out:
+                        mnt_drop_write(filp->f_path.mnt);
+                        return err;
                }
        case REISERFS_IOC_GETVERSION:
                return put_user(inode->i_generation, (int __user *)arg);
        case REISERFS_IOC_SETVERSION:
                if (!is_owner_or_cap(inode))
                        return -EPERM;
-                if (IS_RDONLY(inode))
+                err = mnt_want_write(filp->f_path.mnt);
-                        return -EROFS;
+                if (err)
-                if (get_user(inode->i_generation, (int __user *)arg))
+                        return err;
-                        return -EFAULT;
+                if (get_user(inode->i_generation, (int __user *)arg)) {
+                        err = -EFAULT;
+                        goto setversion_out;
+                }
                inode->i_ctime = CURRENT_TIME_SEC;
                mark_inode_dirty(inode);
-                return 0;
+setversion_out:
+                mnt_drop_write(filp->f_path.mnt);
+                return err;
        default:
                return -ENOTTY;
        }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index bb05a3e51b93..060eb3f598e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -38,7 +38,7 @@
 #include <asm/system.h>
 #include <linux/time.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
 #include <linux/vmalloc.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 281f8061ac58..6de060a6aa7f 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -626,7 +626,7 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
                               "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)",
                               shift_mode);
        }
-        RFALSE(src_bi->bi_bh == 0 || dest_bi->bi_bh == 0,
+        RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh,
               "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
               shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
 }
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index b378eea332ca..8867533cb727 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -452,7 +452,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
        buflen = DEH_SIZE + ROUND_UP(namelen);
        if (buflen > sizeof(small_buf)) {
                buffer = kmalloc(buflen, GFP_NOFS);
-                if (buffer == 0)
+                if (!buffer)
                        return -ENOMEM;
        } else
                buffer = small_buf;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 6841452e0dea..393cc22c1717 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2031,7 +2031,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
                return -EXDEV;
        }
        /* We must not pack tails for quota files on reiserfs for quota IO to work */
-        if (!REISERFS_I(nd.path.dentry->d_inode)->i_flags & i_nopack_mask) {
+        if (!(REISERFS_I(nd.path.dentry->d_inode)->i_flags & i_nopack_mask)) {
                reiserfs_warning(sb,
                                 "reiserfs: Quota file must have tail packing disabled.");
                path_put(&nd.path);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index eba037b3338f..d7c4935c1034 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -44,7 +44,6 @@
 #include <net/checksum.h>
 #include <linux/smp_lock.h>
 #include <linux/stat.h>
-#include <asm/semaphore.h>
 #define FL_READONLY 128
 #define FL_DIR_SEM_HELD 256
@@ -191,28 +190,11 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode,
        dput(xadir);
        if (err)
                xafile = ERR_PTR(err);
-        return xafile;
-}
-/* Opens a file pointer to the attribute associated with inode */
-static struct file *open_xa_file(const struct inode *inode, const char *name,
-                                 int flags)
-{
-        struct dentry *xafile;
-        struct file *fp;
-        xafile = get_xa_file_dentry(inode, name, flags);
-        if (IS_ERR(xafile))
-                return ERR_PTR(PTR_ERR(xafile));
        else if (!xafile->d_inode) {
                dput(xafile);
-                return ERR_PTR(-ENODATA);
+                xafile = ERR_PTR(-ENODATA);
        }
+        return xafile;
-        fp = dentry_open(xafile, NULL, O_RDWR);
-        /* dentry_open dputs the dentry if it fails */
-        return fp;
 }
 /*
@@ -228,9 +210,8 @@ static struct file *open_xa_file(const struct inode *inode, const char *name,
 * we're called with i_mutex held, so there are no worries about the directory
 * changing underneath us.
 */
-static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int __xattr_readdir(struct inode *inode, void *dirent, filldir_t filldir)
 {
-        struct inode *inode = filp->f_path.dentry->d_inode;
        struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */
        INITIALIZE_PATH(path_to_entry);
        struct buffer_head *bh;
@@ -374,23 +355,16 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir)
 *
 */
 static
-int xattr_readdir(struct file *file, filldir_t filler, void *buf)
+int xattr_readdir(struct inode *inode, filldir_t filler, void *buf)
 {
-        struct inode *inode = file->f_path.dentry->d_inode;
+        int res = -ENOENT;
-        int res = -ENOTDIR;
-        if (!file->f_op || !file->f_op->readdir)
-                goto out;
        mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR);
-//        down(&inode->i_zombie);
-        res = -ENOENT;
        if (!IS_DEADDIR(inode)) {
                lock_kernel();
-                res = __xattr_readdir(file, buf, filler);
+                res = __xattr_readdir(inode, buf, filler);
                unlock_kernel();
        }
-//        up(&inode->i_zombie);
        mutex_unlock(&inode->i_mutex);
-      out:
        return res;
 }
@@ -442,7 +416,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
                   size_t buffer_size, int flags)
 {
        int err = 0;
-        struct file *fp;
+        struct dentry *dentry;
        struct page *page;
        char *data;
        struct address_space *mapping;
@@ -460,18 +434,18 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
                xahash = xattr_hash(buffer, buffer_size);
      open_file:
-        fp = open_xa_file(inode, name, flags);
+        dentry = get_xa_file_dentry(inode, name, flags);
-        if (IS_ERR(fp)) {
+        if (IS_ERR(dentry)) {
-                err = PTR_ERR(fp);
+                err = PTR_ERR(dentry);
                goto out;
        }
-        xinode = fp->f_path.dentry->d_inode;
+        xinode = dentry->d_inode;
        REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
        /* we need to copy it off.. */
        if (xinode->i_nlink > 1) {
-                fput(fp);
+                dput(dentry);
                err = reiserfs_xattr_del(inode, name);
                if (err < 0)
                        goto out;
@@ -485,7 +459,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
        newattrs.ia_size = buffer_size;
        newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
        mutex_lock_nested(&xinode->i_mutex, I_MUTEX_XATTR);
-        err = notify_change(fp->f_path.dentry, &newattrs);
+        err = notify_change(dentry, &newattrs);
        if (err)
                goto out_filp;
@@ -518,15 +492,14 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
                        rxh->h_hash = cpu_to_le32(xahash);
                }
-                err = reiserfs_prepare_write(fp, page, page_offset,
+                err = reiserfs_prepare_write(NULL, page, page_offset,
                                            page_offset + chunk + skip);
                if (!err) {
                        if (buffer)
                                memcpy(data + skip, buffer + buffer_pos, chunk);
-                        err =
+                        err = reiserfs_commit_write(NULL, page, page_offset,
-                            reiserfs_commit_write(fp, page, page_offset,
+                                                    page_offset + chunk +
-                                                  page_offset + chunk +
+                                                    skip);
-                                                  skip);
                }
                unlock_page(page);
                reiserfs_put_page(page);
@@ -548,7 +521,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
      out_filp:
        mutex_unlock(&xinode->i_mutex);
-        fput(fp);
+        dput(dentry);
      out:
        return err;
@@ -562,7 +535,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
                   size_t buffer_size)
 {
        ssize_t err = 0;
-        struct file *fp;
+        struct dentry *dentry;
        size_t isize;
        size_t file_pos = 0;
        size_t buffer_pos = 0;
@@ -578,13 +551,13 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
        if (get_inode_sd_version(inode) == STAT_DATA_V1)
                return -EOPNOTSUPP;
-        fp = open_xa_file(inode, name, FL_READONLY);
+        dentry = get_xa_file_dentry(inode, name, FL_READONLY);
-        if (IS_ERR(fp)) {
+        if (IS_ERR(dentry)) {
-                err = PTR_ERR(fp);
+                err = PTR_ERR(dentry);
                goto out;
        }
-        xinode = fp->f_path.dentry->d_inode;
+        xinode = dentry->d_inode;
        isize = xinode->i_size;
        REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
@@ -652,7 +625,7 @@ reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
        }
      out_dput:
-        fput(fp);
+        dput(dentry);
      out:
        return err;
@@ -742,7 +715,6 @@ reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen,
 /* This is called w/ inode->i_mutex downed */
 int reiserfs_delete_xattrs(struct inode *inode)
 {
-        struct file *fp;
        struct dentry *dir, *root;
        int err = 0;
@@ -763,15 +735,8 @@ int reiserfs_delete_xattrs(struct inode *inode)
                return 0;
        }
-        fp = dentry_open(dir, NULL, O_RDWR);
-        if (IS_ERR(fp)) {
-                err = PTR_ERR(fp);
-                /* dentry_open dputs the dentry if it fails */
-                goto out;
-        }
        lock_kernel();
-        err = xattr_readdir(fp, reiserfs_delete_xattrs_filler, dir);
+        err = xattr_readdir(dir->d_inode, reiserfs_delete_xattrs_filler, dir);
        if (err) {
                unlock_kernel();
                goto out_dir;
@@ -791,7 +756,7 @@ int reiserfs_delete_xattrs(struct inode *inode)
        unlock_kernel();
      out_dir:
-        fput(fp);
+        dput(dir);
      out:
        if (!err)
@@ -833,7 +798,6 @@ reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen,
 int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
 {
-        struct file *fp;
        struct dentry *dir;
        int err = 0;
        struct reiserfs_chown_buf buf;
@@ -857,13 +821,6 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
                goto out;
        }
-        fp = dentry_open(dir, NULL, O_RDWR);
-        if (IS_ERR(fp)) {
-                err = PTR_ERR(fp);
-                /* dentry_open dputs the dentry if it fails */
-                goto out;
-        }
        lock_kernel();
        attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME);
@@ -871,7 +828,7 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
        buf.attrs = attrs;
        buf.inode = inode;
-        err = xattr_readdir(fp, reiserfs_chown_xattrs_filler, &buf);
+        err = xattr_readdir(dir->d_inode, reiserfs_chown_xattrs_filler, &buf);
        if (err) {
                unlock_kernel();
                goto out_dir;
@@ -881,7 +838,7 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
        unlock_kernel();
      out_dir:
-        fput(fp);
+        dput(dir);
      out:
        attrs->ia_valid = ia_valid;
@@ -1029,7 +986,6 @@ reiserfs_listxattr_filler(void *buf, const char *name, int namelen,
 */
 ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 {
-        struct file *fp;
        struct dentry *dir;
        int err = 0;
        struct reiserfs_listxattr_buf buf;
@@ -1052,13 +1008,6 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
                goto out;
        }
-        fp = dentry_open(dir, NULL, O_RDWR);
-        if (IS_ERR(fp)) {
-                err = PTR_ERR(fp);
-                /* dentry_open dputs the dentry if it fails */
-                goto out;
-        }
        buf.r_buf = buffer;
        buf.r_size = buffer ? size : 0;
        buf.r_pos = 0;
@@ -1066,7 +1015,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
        REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir;
-        err = xattr_readdir(fp, reiserfs_listxattr_filler, &buf);
+        err = xattr_readdir(dir->d_inode, reiserfs_listxattr_filler, &buf);
        if (err)
                goto out_dir;
@@ -1076,7 +1025,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
                err = buf.r_pos;
      out_dir:
-        fput(fp);
+        dput(dir);
      out:
        reiserfs_read_unlock_xattr_i(dentry->d_inode);
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 00b6f0a518c8..3f13d491c7c7 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -340,8 +340,9 @@ static struct dentry *
 romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
        unsigned long offset, maxoff;
-        int fslen, res;
+        long res;
-        struct inode *inode;
+        int fslen;
+        struct inode *inode = NULL;
        char fsname[ROMFS_MAXFN];       /* XXX dynamic? */
        struct romfs_inode ri;
        const char *name;               /* got from dentry */
@@ -351,7 +352,7 @@ romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
        offset = dir->i_ino & ROMFH_MASK;
        lock_kernel();
        if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
-                goto out;
+                goto error;
        maxoff = romfs_maxsize(dir->i_sb);
        offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
@@ -364,9 +365,9 @@ romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
        for(;;) {
                if (!offset || offset >= maxoff)
-                        goto out0;
+                        goto success; /* negative success */
                if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
-                        goto out;
+                        goto error;
                /* try to match the first 16 bytes of name */
                fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
@@ -397,23 +398,14 @@ romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
        inode = romfs_iget(dir->i_sb, offset);
        if (IS_ERR(inode)) {
                res = PTR_ERR(inode);
-                goto out;
+                goto error;
        }
-        /*
+success:
-         * it's a bit funky, _lookup needs to return an error code
+        d_add(dentry, inode);
-         * (negative) or a NULL, both as a dentry.  ENOENT should not
-         * be returned, instead we need to create a negative dentry by
-         * d_add(dentry, NULL); and return 0 as no error.
-         * (Although as I see, it only matters on writable file
-         * systems).
-         */
-out0:   inode = NULL;
        res = 0;
-        d_add (dentry, inode);
+error:
+        unlock_kernel();
-out:    unlock_kernel();
        return ERR_PTR(res);
 }
diff --git a/fs/select.c b/fs/select.c
index 5633fe980781..00f58c5c7e05 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -260,7 +260,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
                wait = NULL;
                if (retval || !*timeout || signal_pending(current))
                        break;
-                if(table.error) {
+                if (table.error) {
                        retval = table.error;
                        break;
                }
diff --git a/fs/signalfd.c b/fs/signalfd.c
index cb2b63ae0bf4..8ead0db35933 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -111,9 +111,14 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
                err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
                err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
                break;
-        default: /* this is just in case for now ... */
+        default:
+                /*
+                 * This case also catches the signals queued by sigqueue().
+                 */
                err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
                err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
+                err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+                err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
                break;
        }
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index fae8e85af0ed..6bd9b691a463 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -206,7 +206,7 @@ int smbiod_retry(struct smb_sb_info *server)
        smb_close_socket(server);
-        if (pid == 0) {
+        if (!pid) {
                /* FIXME: this is fatal, umount? */
                printk(KERN_ERR "smb_retry: no connection process\n");
                server->state = CONN_RETRIED;
diff --git a/fs/splice.c b/fs/splice.c
index 9b559ee711a8..eeb1a86a7014 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -320,7 +320,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                                break;
                        error = add_to_page_cache_lru(page, mapping, index,
-                                              GFP_KERNEL);
+                                                mapping_gfp_mask(mapping));
                        if (unlikely(error)) {
                                page_cache_release(page);
                                if (error == -EEXIST)
@@ -370,8 +370,10 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                         * for an in-flight io page
                         */
                        if (flags & SPLICE_F_NONBLOCK) {
-                                if (TestSetPageLocked(page))
+                                if (TestSetPageLocked(page)) {
+                                        error = -EAGAIN;
                                        break;
+                                }
                        } else
                                lock_page(page);
@@ -479,9 +481,8 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
                                 struct pipe_inode_info *pipe, size_t len,
                                 unsigned int flags)
 {
-        ssize_t spliced;
-        int ret;
        loff_t isize, left;
+        int ret;
        isize = i_size_read(in->f_mapping->host);
        if (unlikely(*ppos >= isize))
@@ -491,29 +492,9 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
        if (unlikely(left < len))
                len = left;
-        ret = 0;
+        ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
-        spliced = 0;
+        if (ret > 0)
-        while (len && !spliced) {
-                ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
-                if (ret < 0)
-                        break;
-                else if (!ret) {
-                        if (spliced)
-                                break;
-                        if (flags & SPLICE_F_NONBLOCK) {
-                                ret = -EAGAIN;
-                                break;
-                        }
-                }
                *ppos += ret;
-                len -= ret;
-                spliced += ret;
-        }
-        if (spliced)
-                return spliced;
        return ret;
 }
@@ -1669,6 +1650,13 @@ static int link_pipe(struct pipe_inode_info *ipipe,
                i++;
        } while (len);
+        /*
+         * return EAGAIN if we have the potential of some data in the
+         * future, otherwise just return 0
+         */
+        if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
+                ret = -EAGAIN;
        inode_double_unlock(ipipe->inode, opipe->inode);
        /*
@@ -1709,11 +1697,8 @@ static long do_tee(struct file *in, struct file *out, size_t len,
                ret = link_ipipe_prep(ipipe, flags);
                if (!ret) {
                        ret = link_opipe_prep(opipe, flags);
-                        if (!ret) {
+                        if (!ret)
                                ret = link_pipe(ipipe, opipe, len, flags);
-                                if (!ret && (flags & SPLICE_F_NONBLOCK))
-                                        ret = -EAGAIN;
-                        }
                }
        }
diff --git a/fs/super.c b/fs/super.c
index 88811f60c8de..1f8f05ede437 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,7 @@
 #include <linux/idr.h>
 #include <linux/kobject.h>
 #include <linux/mutex.h>
+#include <linux/file.h>
 #include <asm/uaccess.h>
@@ -556,21 +557,40 @@ out:
 }
 /**
- *      mark_files_ro
+ *      mark_files_ro - mark all files read-only
 *      @sb: superblock in question
 *
- *      All files are marked read/only.  We don't care about pending
+ *      All files are marked read-only.  We don't care about pending
- *      delete files so this should be used in 'force' mode only
+ *      delete files so this should be used in 'force' mode only.
 */
 static void mark_files_ro(struct super_block *sb)
 {
        struct file *f;
+retry:
        file_list_lock();
        list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-                if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
+                struct vfsmount *mnt;
-                        f->f_mode &= ~FMODE_WRITE;
+                if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+                       continue;
+                if (!file_count(f))
+                        continue;
+                if (!(f->f_mode & FMODE_WRITE))
+                        continue;
+                f->f_mode &= ~FMODE_WRITE;
+                if (file_check_writeable(f) != 0)
+                        continue;
+                file_release_write(f);
+                mnt = mntget(f->f_path.mnt);
+                file_list_unlock();
+                /*
+                 * This can sleep, so we can't hold
+                 * the file_list_lock() spinlock.
+                 */
+                mnt_drop_write(mnt);
+                mntput(mnt);
+                goto retry;
        }
        file_list_unlock();
 }
@@ -870,12 +890,12 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
        if (!mnt)
                goto out;
-        if (data) {
+        if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
                secdata = alloc_secdata();
                if (!secdata)
                        goto out_mnt;
-                error = security_sb_copy_data(type, data, secdata);
+                error = security_sb_copy_data(data, secdata);
                if (error)
                        goto out_free_secdata;
        }
@@ -945,6 +965,7 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
        put_filesystem(type);
        return mnt;
 }
+EXPORT_SYMBOL_GPL(do_kern_mount);
 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
 {
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4948d9bc405d..a1c3a1fab7f0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -20,6 +20,7 @@
 #include <linux/idr.h>
 #include <linux/completion.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include "sysfs.h"
 DEFINE_MUTEX(sysfs_mutex);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index a271c87c4472..ade9a7e6a757 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -12,6 +12,8 @@
 #include <linux/module.h>
 #include <linux/kobject.h>
+#include <linux/kallsyms.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/poll.h>
 #include <linux/list.h>
@@ -86,7 +88,12 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
         * The code works fine with PAGE_SIZE return but it's likely to
         * indicate truncated result or overflow in normal use cases.
         */
-        BUG_ON(count >= (ssize_t)PAGE_SIZE);
+        if (count >= (ssize_t)PAGE_SIZE) {
+                print_symbol("fill_read_buffer: %s returned bad count\n",
+                        (unsigned long)ops->show);
+                /* Try to struggle along */
+                count = PAGE_SIZE - 1;
+        }
        if (count >= 0) {
                buffer->needs_read_fill = 0;
                buffer->count = count;
@@ -122,7 +129,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
        ssize_t retval = 0;
        mutex_lock(&buffer->mutex);
-        if (buffer->needs_read_fill) {
+        if (buffer->needs_read_fill || *ppos == 0) {
                retval = fill_read_buffer(file->f_path.dentry,buffer);
                if (retval)
                        goto out;
@@ -403,8 +410,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
 * return POLLERR|POLLPRI, and select will return the fd whether
 * it is waiting for read, write, or exceptions.
 * Once poll/select indicates that the value has changed, you
- * need to close and re-open the file, as simply seeking and reading
+ * need to close and re-open the file, or seek to 0 and read again.
- * again will not get new data, or reset the state of 'poll'.
 * Reminder: this only works for attributes which actively support
 * it, and it is not possible to test an attribute from userspace
 * to see if it supports poll (Neither 'poll' nor 'select' return
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 5f66c4466151..817f5966edca 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -87,7 +87,14 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 void sysfs_remove_link(struct kobject * kobj, const char * name)
 {
-        sysfs_hash_and_remove(kobj->sd, name);
+        struct sysfs_dirent *parent_sd = NULL;
+        if (!kobj)
+                parent_sd = &sysfs_root;
+        else
+                parent_sd = kobj->sd;
+        sysfs_hash_and_remove(parent_sd, name);
 }
 static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 1fca381f0ce2..1e7598fb9787 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -315,8 +315,8 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
                        }
                        UFSD(" change from %llu to %llu, pos %u\n",
-                             (unsigned long long)pos + oldb,
+                             (unsigned long long)(pos + oldb),
-                             (unsigned long long)pos + newb, pos);
+                             (unsigned long long)(pos + newb), pos);
                        bh->b_blocknr = newb + pos;
                        unmap_underlying_metadata(bh->b_bdev,
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index b26fc4dec1e7..23ceed8c8fb9 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -58,7 +58,7 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
 {
        switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
        case UFS_ST_SUNOS:
-                if (fs32_to_cpu(sb, usb3->fs_postblformat == UFS_42POSTBLFMT)) {
+                if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) {
                        usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value);
                        break;
                }
diff --git a/fs/utimes.c b/fs/utimes.c
index b18da9c0b97f..a2bef77dc9c9 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -2,6 +2,7 @@
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/linkage.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/sched.h>
 #include <linux/stat.h>
@@ -59,6 +60,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
        struct inode *inode;
        struct iattr newattrs;
        struct file *f = NULL;
+        struct vfsmount *mnt;
        error = -EINVAL;
        if (times && (!nsec_valid(times[0].tv_nsec) ||
@@ -79,18 +81,20 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
                if (!f)
                        goto out;
                dentry = f->f_path.dentry;
+                mnt = f->f_path.mnt;
        } else {
                error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd);
                if (error)
                        goto out;
                dentry = nd.path.dentry;
+                mnt = nd.path.mnt;
        }
        inode = dentry->d_inode;
-        error = -EROFS;
+        error = mnt_want_write(mnt);
-        if (IS_RDONLY(inode))
+        if (error)
                goto dput_and_out;
        /* Don't worry, the checks are done in inode_change_ok() */
@@ -98,7 +102,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
        if (times) {
                error = -EPERM;
                if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-                        goto dput_and_out;
+                        goto mnt_drop_write_and_out;
                if (times[0].tv_nsec == UTIME_OMIT)
                        newattrs.ia_valid &= ~ATTR_ATIME;
@@ -118,22 +122,24 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
        } else {
                error = -EACCES;
                if (IS_IMMUTABLE(inode))
-                        goto dput_and_out;
+                        goto mnt_drop_write_and_out;
                if (!is_owner_or_cap(inode)) {
                        if (f) {
                                if (!(f->f_mode & FMODE_WRITE))
-                                        goto dput_and_out;
+                                        goto mnt_drop_write_and_out;
                        } else {
                                error = vfs_permission(&nd, MAY_WRITE);
                                if (error)
-                                        goto dput_and_out;
+                                        goto mnt_drop_write_and_out;
                        }
                }
        }
        mutex_lock(&inode->i_mutex);
        error = notify_change(dentry, &newattrs);
        mutex_unlock(&inode->i_mutex);
+mnt_drop_write_and_out:
+        mnt_drop_write(mnt);
 dput_and_out:
        if (f)
                fput(f);
diff --git a/fs/xattr.c b/fs/xattr.c
index 3acab1615460..f7062da505d4 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/xattr.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -32,8 +33,6 @@ xattr_permission(struct inode *inode, const char *name, int mask)
         * filesystem  or on an immutable / append-only inode.
         */
        if (mask & MAY_WRITE) {
-                if (IS_RDONLY(inode))
-                        return -EROFS;
                if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                        return -EPERM;
        }
@@ -262,7 +261,11 @@ sys_setxattr(char __user *path, char __user *name, void __user *value,
        error = user_path_walk(path, &nd);
        if (error)
                return error;
-        error = setxattr(nd.path.dentry, name, value, size, flags);
+        error = mnt_want_write(nd.path.mnt);
+        if (!error) {
+                error = setxattr(nd.path.dentry, name, value, size, flags);
+                mnt_drop_write(nd.path.mnt);
+        }
        path_put(&nd.path);
        return error;
 }
@@ -277,7 +280,11 @@ sys_lsetxattr(char __user *path, char __user *name, void __user *value,
        error = user_path_walk_link(path, &nd);
        if (error)
                return error;
-        error = setxattr(nd.path.dentry, name, value, size, flags);
+        error = mnt_want_write(nd.path.mnt);
+        if (!error) {
+                error = setxattr(nd.path.dentry, name, value, size, flags);
+                mnt_drop_write(nd.path.mnt);
+        }
        path_put(&nd.path);
        return error;
 }
@@ -295,7 +302,12 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
                return error;
        dentry = f->f_path.dentry;
        audit_inode(NULL, dentry);
-        error = setxattr(dentry, name, value, size, flags);
+        error = mnt_want_write(f->f_path.mnt);
+        if (!error) {
+                error = setxattr(dentry, name, value, size, flags);
+                mnt_drop_write(f->f_path.mnt);
+        }
+out_fput:
        fput(f);
        return error;
 }
@@ -482,7 +494,11 @@ sys_removexattr(char __user *path, char __user *name)
        error = user_path_walk(path, &nd);
        if (error)
                return error;
-        error = removexattr(nd.path.dentry, name);
+        error = mnt_want_write(nd.path.mnt);
+        if (!error) {
+                error = removexattr(nd.path.dentry, name);
+                mnt_drop_write(nd.path.mnt);
+        }
        path_put(&nd.path);
        return error;
 }
@@ -496,7 +512,11 @@ sys_lremovexattr(char __user *path, char __user *name)
        error = user_path_walk_link(path, &nd);
        if (error)
                return error;
-        error = removexattr(nd.path.dentry, name);
+        error = mnt_want_write(nd.path.mnt);
+        if (!error) {
+                error = removexattr(nd.path.dentry, name);
+                mnt_drop_write(nd.path.mnt);
+        }
        path_put(&nd.path);
        return error;
 }
@@ -513,7 +533,11 @@ sys_fremovexattr(int fd, char __user *name)
                return error;
        dentry = f->f_path.dentry;
        audit_inode(NULL, dentry);
-        error = removexattr(dentry, name);
+        error = mnt_want_write(f->f_path.mnt);
+        if (!error) {
+                error = removexattr(dentry, name);
+                mnt_drop_write(f->f_path.mnt);
+        }
        fput(f);
        return error;
 }
diff --git a/fs/xfs/Kbuild b/fs/xfs/Kbuild
deleted file mode 100644
index 2566e96706f1..000000000000
--- a/fs/xfs/Kbuild
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# The xfs people like to share Makefile with 2.6 and 2.4.
-# Utilise file named Kbuild file which has precedence over Makefile.
-#
-include $(srctree)/$(obj)/Makefile-linux-2.6
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 35115bca036e..524021ff5436 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -35,18 +35,6 @@ config XFS_QUOTA
          with or without the generic quota support enabled (CONFIG_QUOTA) -
          they are completely independent subsystems.
-config XFS_SECURITY
-        bool "XFS Security Label support"
-        depends on XFS_FS
-        help
-          Security labels support alternative access control models
-          implemented by security modules like SELinux.  This option
-          enables an extended attribute namespace for inode security
-          labels in the XFS filesystem.
-          If you are not using a security module that requires using
-          extended attributes for inode security labels, say N.
 config XFS_POSIX_ACL
        bool "XFS POSIX ACL support"
        depends on XFS_FS
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 49e3e7e5e3dc..36ec614e699a 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -1 +1,117 @@
-include $(TOPDIR)/fs/xfs/Makefile-linux-$(VERSION).$(PATCHLEVEL)
+#
+# Copyright (c) 2000-2005 Silicon Graphics, Inc.
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+EXTRA_CFLAGS +=  -I$(src) -I$(src)/linux-2.6 -funsigned-char
+XFS_LINUX := linux-2.6
+ifeq ($(CONFIG_XFS_DEBUG),y)
+        EXTRA_CFLAGS += -g
+endif
+obj-$(CONFIG_XFS_FS)            += xfs.o
+xfs-$(CONFIG_XFS_QUOTA)         += $(addprefix quota/, \
+                                   xfs_dquot.o \
+                                   xfs_dquot_item.o \
+                                   xfs_trans_dquot.o \
+                                   xfs_qm_syscalls.o \
+                                   xfs_qm_bhv.o \
+                                   xfs_qm.o)
+ifeq ($(CONFIG_XFS_QUOTA),y)
+xfs-$(CONFIG_PROC_FS)           += quota/xfs_qm_stats.o
+endif
+xfs-$(CONFIG_XFS_RT)            += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_POSIX_ACL)     += xfs_acl.o
+xfs-$(CONFIG_PROC_FS)           += $(XFS_LINUX)/xfs_stats.o
+xfs-$(CONFIG_SYSCTL)            += $(XFS_LINUX)/xfs_sysctl.o
+xfs-$(CONFIG_COMPAT)            += $(XFS_LINUX)/xfs_ioctl32.o
+xfs-y                           += xfs_alloc.o \
+                                   xfs_alloc_btree.o \
+                                   xfs_attr.o \
+                                   xfs_attr_leaf.o \
+                                   xfs_bit.o \
+                                   xfs_bmap.o \
+                                   xfs_bmap_btree.o \
+                                   xfs_btree.o \
+                                   xfs_buf_item.o \
+                                   xfs_da_btree.o \
+                                   xfs_dir2.o \
+                                   xfs_dir2_block.o \
+                                   xfs_dir2_data.o \
+                                   xfs_dir2_leaf.o \
+                                   xfs_dir2_node.o \
+                                   xfs_dir2_sf.o \
+                                   xfs_error.o \
+                                   xfs_extfree_item.o \
+                                   xfs_filestream.o \
+                                   xfs_fsops.o \
+                                   xfs_ialloc.o \
+                                   xfs_ialloc_btree.o \
+                                   xfs_iget.o \
+                                   xfs_inode.o \
+                                   xfs_inode_item.o \
+                                   xfs_iomap.o \
+                                   xfs_itable.o \
+                                   xfs_dfrag.o \
+                                   xfs_log.o \
+                                   xfs_log_recover.o \
+                                   xfs_mount.o \
+                                   xfs_mru_cache.o \
+                                   xfs_rename.o \
+                                   xfs_trans.o \
+                                   xfs_trans_ail.o \
+                                   xfs_trans_buf.o \
+                                   xfs_trans_extfree.o \
+                                   xfs_trans_inode.o \
+                                   xfs_trans_item.o \
+                                   xfs_utils.o \
+                                   xfs_vfsops.o \
+                                   xfs_vnodeops.o \
+                                   xfs_rw.o \
+                                   xfs_dmops.o \
+                                   xfs_qmops.o
+xfs-$(CONFIG_XFS_TRACE)         += xfs_dir2_trace.o
+# Objects in $(XFS_LINUX)/ (i.e. linux-2.6/)
+xfs-y                           += $(addprefix $(XFS_LINUX)/, \
+                                   kmem.o \
+                                   xfs_aops.o \
+                                   xfs_buf.o \
+                                   xfs_export.o \
+                                   xfs_file.o \
+                                   xfs_fs_subr.o \
+                                   xfs_globals.o \
+                                   xfs_ioctl.o \
+                                   xfs_iops.o \
+                                   xfs_lrw.o \
+                                   xfs_super.o \
+                                   xfs_vnode.o)
+# Objects in support/
+xfs-y                           += $(addprefix support/, \
+                                   debug.o \
+                                   uuid.o)
+xfs-$(CONFIG_XFS_TRACE)         += support/ktrace.o
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
deleted file mode 100644
index 97316451fc6d..000000000000
--- a/fs/xfs/Makefile-linux-2.6
+++ /dev/null
@@ -1,117 +0,0 @@
-#
-# Copyright (c) 2000-2005 Silicon Graphics, Inc.
-# All Rights Reserved.
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License as
-# published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it would be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write the Free Software Foundation,
-# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-#
-EXTRA_CFLAGS +=  -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char
-XFS_LINUX := linux-2.6
-ifeq ($(CONFIG_XFS_DEBUG),y)
-        EXTRA_CFLAGS += -g
-endif
-obj-$(CONFIG_XFS_FS)            += xfs.o
-xfs-$(CONFIG_XFS_QUOTA)         += $(addprefix quota/, \
-                                   xfs_dquot.o \
-                                   xfs_dquot_item.o \
-                                   xfs_trans_dquot.o \
-                                   xfs_qm_syscalls.o \
-                                   xfs_qm_bhv.o \
-                                   xfs_qm.o)
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS)           += quota/xfs_qm_stats.o
-endif
-xfs-$(CONFIG_XFS_RT)            += xfs_rtalloc.o
-xfs-$(CONFIG_XFS_POSIX_ACL)     += xfs_acl.o
-xfs-$(CONFIG_PROC_FS)           += $(XFS_LINUX)/xfs_stats.o
-xfs-$(CONFIG_SYSCTL)            += $(XFS_LINUX)/xfs_sysctl.o
-xfs-$(CONFIG_COMPAT)            += $(XFS_LINUX)/xfs_ioctl32.o
-xfs-y                           += xfs_alloc.o \
-                                   xfs_alloc_btree.o \
-                                   xfs_attr.o \
-                                   xfs_attr_leaf.o \
-                                   xfs_bit.o \
-                                   xfs_bmap.o \
-                                   xfs_bmap_btree.o \
-                                   xfs_btree.o \
-                                   xfs_buf_item.o \
-                                   xfs_da_btree.o \
-                                   xfs_dir2.o \
-                                   xfs_dir2_block.o \
-                                   xfs_dir2_data.o \
-                                   xfs_dir2_leaf.o \
-                                   xfs_dir2_node.o \
-                                   xfs_dir2_sf.o \
-                                   xfs_error.o \
-                                   xfs_extfree_item.o \
-                                   xfs_filestream.o \
-                                   xfs_fsops.o \
-                                   xfs_ialloc.o \
-                                   xfs_ialloc_btree.o \
-                                   xfs_iget.o \
-                                   xfs_inode.o \
-                                   xfs_inode_item.o \
-                                   xfs_iomap.o \
-                                   xfs_itable.o \
-                                   xfs_dfrag.o \
-                                   xfs_log.o \
-                                   xfs_log_recover.o \
-                                   xfs_mount.o \
-                                   xfs_mru_cache.o \
-                                   xfs_rename.o \
-                                   xfs_trans.o \
-                                   xfs_trans_ail.o \
-                                   xfs_trans_buf.o \
-                                   xfs_trans_extfree.o \
-                                   xfs_trans_inode.o \
-                                   xfs_trans_item.o \
-                                   xfs_utils.o \
-                                   xfs_vfsops.o \
-                                   xfs_vnodeops.o \
-                                   xfs_rw.o \
-                                   xfs_dmops.o \
-                                   xfs_qmops.o
-xfs-$(CONFIG_XFS_TRACE)         += xfs_dir2_trace.o
-# Objects in linux/
-xfs-y                           += $(addprefix $(XFS_LINUX)/, \
-                                   kmem.o \
-                                   xfs_aops.o \
-                                   xfs_buf.o \
-                                   xfs_export.o \
-                                   xfs_file.o \
-                                   xfs_fs_subr.o \
-                                   xfs_globals.o \
-                                   xfs_ioctl.o \
-                                   xfs_iops.o \
-                                   xfs_lrw.o \
-                                   xfs_super.o \
-                                   xfs_vnode.o)
-# Objects in support/
-xfs-y                           += $(addprefix support/, \
-                                   debug.o \
-                                   uuid.o)
-xfs-$(CONFIG_XFS_TRACE)         += support/ktrace.o
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index e040f1ce1b6a..9b1bb17a0501 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -37,7 +37,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 #ifdef DEBUG
        if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
                printk(KERN_WARNING "Large %s attempt, size=%ld\n",
-                        __FUNCTION__, (long)size);
+                        __func__, (long)size);
                dump_stack();
        }
 #endif
@@ -52,7 +52,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
                if (!(++retries % 100))
                        printk(KERN_ERR "XFS: possible memory allocation "
                                        "deadlock in %s (mode:0x%x)\n",
-                                        __FUNCTION__, lflags);
+                                        __func__, lflags);
                congestion_wait(WRITE, HZ/50);
        } while (1);
 }
@@ -129,7 +129,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
                if (!(++retries % 100))
                        printk(KERN_ERR "XFS: possible memory allocation "
                                        "deadlock in %s (mode:0x%x)\n",
-                                        __FUNCTION__, lflags);
+                                        __func__, lflags);
                congestion_wait(WRITE, HZ/50);
        } while (1);
 }
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index 2009e6d922ce..3abe7e9ceb33 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -20,8 +20,8 @@
 #include <linux/time.h>
 #include <linux/wait.h>
+#include <linux/semaphore.h>
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
 /*
 * sema_t structure just maps to struct semaphore in Linux kernel.
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index e0519529c26c..a55c3b26d840 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -243,8 +243,12 @@ xfs_end_bio_unwritten(
        size_t                  size = ioend->io_size;
        if (likely(!ioend->io_error)) {
-                if (!XFS_FORCED_SHUTDOWN(ip->i_mount))
+                if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                        xfs_iomap_write_unwritten(ip, offset, size);
+                        int error;
+                        error = xfs_iomap_write_unwritten(ip, offset, size);
+                        if (error)
+                                ioend->io_error = error;
+                }
                xfs_setfilesize(ioend);
        }
        xfs_destroy_ioend(ioend);
@@ -1532,9 +1536,9 @@ xfs_vm_bmap(
        struct xfs_inode        *ip = XFS_I(inode);
        xfs_itrace_entry(XFS_I(inode));
-        xfs_rwlock(ip, VRWLOCK_READ);
+        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
-        xfs_rwunlock(ip, VRWLOCK_READ);
+        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
        return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e347bfd47c91..52f6846101d5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -400,7 +400,7 @@ _xfs_buf_lookup_pages(
                                printk(KERN_ERR
                                        "XFS: possible memory allocation "
                                        "deadlock in %s (mode:0x%x)\n",
-                                        __FUNCTION__, gfp_mask);
+                                        __func__, gfp_mask);
                        XFS_STATS_INC(xb_page_retries);
                        xfsbufd_wakeup(0, gfp_mask);
@@ -598,7 +598,7 @@ xfs_buf_get_flags(
                error = _xfs_buf_map_pages(bp, flags);
                if (unlikely(error)) {
                        printk(KERN_WARNING "%s: failed to map pages\n",
-                                        __FUNCTION__);
+                                        __func__);
                        goto no_buffer;
                }
        }
@@ -778,7 +778,7 @@ xfs_buf_get_noaddr(
        error = _xfs_buf_map_pages(bp, XBF_MAPPED);
        if (unlikely(error)) {
                printk(KERN_WARNING "%s: failed to map pages\n",
-                                __FUNCTION__);
+                                __func__);
                goto fail_free_mem;
        }
@@ -1060,7 +1060,7 @@ xfs_buf_iostart(
                bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
                bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
                xfs_buf_delwri_queue(bp, 1);
-                return status;
+                return 0;
        }
        bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a3d207de48b8..841d7883528d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -387,11 +387,15 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
        return error;
 }
-static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
+/*
+ * Queue a buffer for delayed write.  The buffer's strategy routine is set
+ * to xfs_bdstrat_cb and mp is stashed in b_fspriv3 before the buffer is
+ * handed to xfs_buf_iostart() with XBF_DELWRI | XBF_ASYNC.
+ *
+ * No error can be returned from xfs_buf_iostart for delwri
+ * buffers as they are queued and no I/O is issued.
+ * That is why this helper returns void and explicitly discards the
+ * xfs_buf_iostart() result with a (void) cast.
+ */
+static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
 {
        bp->b_strat = xfs_bdstrat_cb;
        bp->b_fspriv3 = mp;
-        return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
+        (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
 }
 #define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index e7f3da61c6c3..652721ce0ea5 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -30,7 +30,7 @@ typedef struct cred {
 extern struct cred *sys_cred;
 /* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
-static __inline int capable_cred(cred_t *cr, int cid)
+static inline int capable_cred(cred_t *cr, int cid)
 {
        return (cr == sys_cred) ? 1 : capable(cid);
 }
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index ca4f66c4de16..265f0168ab76 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -22,6 +22,7 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
+#include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_export.h"
@@ -30,8 +31,6 @@
 #include "xfs_inode.h"
 #include "xfs_vfsops.h"
-static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, };
 /*
 * Note that we only accept fileids which are long enough rather than allow
 * the parent generation number to default to zero.  XFS considers zero a
@@ -66,7 +65,7 @@ xfs_fs_encode_fh(
        int                     len;
        /* Directories don't need their parent encoded, they have ".." */
-        if (S_ISDIR(inode->i_mode))
+        if (S_ISDIR(inode->i_mode) || !connectable)
                fileid_type = FILEID_INO32_GEN;
        else
                fileid_type = FILEID_INO32_GEN_PARENT;
@@ -213,17 +212,16 @@ xfs_fs_get_parent(
        struct dentry           *child)
 {
        int                     error;
-        bhv_vnode_t             *cvp;
+        struct xfs_inode        *cip;
        struct dentry           *parent;
-        cvp = NULL;
+        error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip);
-        error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cvp);
        if (unlikely(error))
                return ERR_PTR(-error);
-        parent = d_alloc_anon(vn_to_inode(cvp));
+        parent = d_alloc_anon(cip->i_vnode);
        if (unlikely(!parent)) {
-                VN_RELE(cvp);
+                iput(cip->i_vnode);
                return ERR_PTR(-ENOMEM);
        }
        return parent;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index edab1ffbb163..05905246434d 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -469,16 +469,11 @@ xfs_file_open_exec(
        struct inode    *inode)
 {
        struct xfs_mount *mp = XFS_M(inode->i_sb);
+        struct xfs_inode *ip = XFS_I(inode);
-        if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI)) {
+        if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI) &&
-                if (DM_EVENT_ENABLED(XFS_I(inode), DM_EVENT_READ)) {
+                     DM_EVENT_ENABLED(ip, DM_EVENT_READ))
-                        bhv_vnode_t *vp = vn_from_inode(inode);
+                return -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
-                        return -XFS_SEND_DATA(mp, DM_EVENT_READ,
-                                                vp, 0, 0, 0, NULL);
-                }
-        }
        return 0;
 }
 #endif /* HAVE_FOP_OPEN_EXEC */
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index ac6d34cc355d..1eefe61f0e10 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -17,18 +17,7 @@
 */
 #include "xfs.h"
 #include "xfs_vnodeops.h"
-/*
- * The following six includes are needed so that we can include
- * xfs_inode.h.  What a mess..
- */
 #include "xfs_bmap_btree.h"
-#include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 int  fs_noerr(void) { return 0; }
@@ -42,11 +31,10 @@ xfs_tosspages(
        xfs_off_t       last,
        int             fiopt)
 {
-        bhv_vnode_t     *vp = XFS_ITOV(ip);
+        struct address_space *mapping = ip->i_vnode->i_mapping;
-        struct inode    *inode = vn_to_inode(vp);
-        if (VN_CACHED(vp))
+        if (mapping->nrpages)
-                truncate_inode_pages(inode->i_mapping, first);
+                truncate_inode_pages(mapping, first);
 }
 int
@@ -56,15 +44,14 @@ xfs_flushinval_pages(
        xfs_off_t       last,
        int             fiopt)
 {
-        bhv_vnode_t     *vp = XFS_ITOV(ip);
+        struct address_space *mapping = ip->i_vnode->i_mapping;
-        struct inode    *inode = vn_to_inode(vp);
        int             ret = 0;
-        if (VN_CACHED(vp)) {
+        if (mapping->nrpages) {
                xfs_iflags_clear(ip, XFS_ITRUNCATED);
-                ret = filemap_write_and_wait(inode->i_mapping);
+                ret = filemap_write_and_wait(mapping);
                if (!ret)
-                        truncate_inode_pages(inode->i_mapping, first);
+                        truncate_inode_pages(mapping, first);
        }
        return ret;
 }
@@ -77,17 +64,16 @@ xfs_flush_pages(
        uint64_t        flags,
        int             fiopt)
 {
-        bhv_vnode_t     *vp = XFS_ITOV(ip);
+        struct address_space *mapping = ip->i_vnode->i_mapping;
-        struct inode    *inode = vn_to_inode(vp);
        int             ret = 0;
        int             ret2;
-        if (VN_DIRTY(vp)) {
+        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                xfs_iflags_clear(ip, XFS_ITRUNCATED);
-                ret = filemap_fdatawrite(inode->i_mapping);
+                ret = filemap_fdatawrite(mapping);
                if (flags & XFS_B_ASYNC)
                        return ret;
-                ret2 = filemap_fdatawait(inode->i_mapping);
+                ret2 = filemap_fdatawait(mapping);
                if (!ret)
                        ret = ret2;
        }
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a9952e490ac9..4ddb86b73c6b 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -535,8 +535,6 @@ xfs_attrmulti_attr_set(
        char                    *kbuf;
        int                     error = EFAULT;
-        if (IS_RDONLY(inode))
-                return -EROFS;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                return EPERM;
        if (len > XATTR_SIZE_MAX)
@@ -562,8 +560,6 @@ xfs_attrmulti_attr_remove(
        char                    *name,
        __uint32_t              flags)
 {
-        if (IS_RDONLY(inode))
-                return -EROFS;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                return EPERM;
        return xfs_attr_remove(XFS_I(inode), name, flags);
@@ -573,6 +569,7 @@ STATIC int
 xfs_attrmulti_by_handle(
        xfs_mount_t             *mp,
        void                    __user *arg,
+        struct file             *parfilp,
        struct inode            *parinode)
 {
        int                     error;
@@ -626,13 +623,21 @@ xfs_attrmulti_by_handle(
                                        &ops[i].am_length, ops[i].am_flags);
                        break;
                case ATTR_OP_SET:
+                        ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+                        if (ops[i].am_error)
+                                break;
                        ops[i].am_error = xfs_attrmulti_attr_set(inode,
                                        attr_name, ops[i].am_attrvalue,
                                        ops[i].am_length, ops[i].am_flags);
+                        mnt_drop_write(parfilp->f_path.mnt);
                        break;
                case ATTR_OP_REMOVE:
+                        ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+                        if (ops[i].am_error)
+                                break;
                        ops[i].am_error = xfs_attrmulti_attr_remove(inode,
                                        attr_name, ops[i].am_flags);
+                        mnt_drop_write(parfilp->f_path.mnt);
                        break;
                default:
                        ops[i].am_error = EINVAL;
@@ -651,314 +656,6 @@ xfs_attrmulti_by_handle(
        return -error;
 }
-/* prototypes for a few of the stack-hungry cases that have
- * their own functions.  Functions are defined after their use
- * so gcc doesn't get fancy and inline them with -03 */
-STATIC int
-xfs_ioc_space(
-        struct xfs_inode        *ip,
-        struct inode            *inode,
-        struct file             *filp,
-        int                     flags,
-        unsigned int            cmd,
-        void                    __user *arg);
-STATIC int
-xfs_ioc_bulkstat(
-        xfs_mount_t             *mp,
-        unsigned int            cmd,
-        void                    __user *arg);
-STATIC int
-xfs_ioc_fsgeometry_v1(
-        xfs_mount_t             *mp,
-        void                    __user *arg);
-STATIC int
-xfs_ioc_fsgeometry(
-        xfs_mount_t             *mp,
-        void                    __user *arg);
-STATIC int
-xfs_ioc_xattr(
-        xfs_inode_t             *ip,
-        struct file             *filp,
-        unsigned int            cmd,
-        void                    __user *arg);
-STATIC int
-xfs_ioc_fsgetxattr(
-        xfs_inode_t             *ip,
-        int                     attr,
-        void                    __user *arg);
-STATIC int
-xfs_ioc_getbmap(
-        struct xfs_inode        *ip,
-        int                     flags,
-        unsigned int            cmd,
-        void                    __user *arg);
-STATIC int
-xfs_ioc_getbmapx(
-        struct xfs_inode        *ip,
-        void                    __user *arg);
-int
-xfs_ioctl(
-        xfs_inode_t             *ip,
-        struct file             *filp,
-        int                     ioflags,
-        unsigned int            cmd,
-        void                    __user *arg)
-{
-        struct inode            *inode = filp->f_path.dentry->d_inode;
-        xfs_mount_t             *mp = ip->i_mount;
-        int                     error;
-        xfs_itrace_entry(XFS_I(inode));
-        switch (cmd) {
-        case XFS_IOC_ALLOCSP:
-        case XFS_IOC_FREESP:
-        case XFS_IOC_RESVSP:
-        case XFS_IOC_UNRESVSP:
-        case XFS_IOC_ALLOCSP64:
-        case XFS_IOC_FREESP64:
-        case XFS_IOC_RESVSP64:
-        case XFS_IOC_UNRESVSP64:
-                /*
-                 * Only allow the sys admin to reserve space unless
-                 * unwritten extents are enabled.
-                 */
-                if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) &&
-                    !capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
-        case XFS_IOC_DIOINFO: {
-                struct dioattr  da;
-                xfs_buftarg_t   *target =
-                        XFS_IS_REALTIME_INODE(ip) ?
-                        mp->m_rtdev_targp : mp->m_ddev_targp;
-                da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
-                da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-                if (copy_to_user(arg, &da, sizeof(da)))
-                        return -XFS_ERROR(EFAULT);
-                return 0;
-        }
-        case XFS_IOC_FSBULKSTAT_SINGLE:
-        case XFS_IOC_FSBULKSTAT:
-        case XFS_IOC_FSINUMBERS:
-                return xfs_ioc_bulkstat(mp, cmd, arg);
-        case XFS_IOC_FSGEOMETRY_V1:
-                return xfs_ioc_fsgeometry_v1(mp, arg);
-        case XFS_IOC_FSGEOMETRY:
-                return xfs_ioc_fsgeometry(mp, arg);
-        case XFS_IOC_GETVERSION:
-                return put_user(inode->i_generation, (int __user *)arg);
-        case XFS_IOC_FSGETXATTR:
-                return xfs_ioc_fsgetxattr(ip, 0, arg);
-        case XFS_IOC_FSGETXATTRA:
-                return xfs_ioc_fsgetxattr(ip, 1, arg);
-        case XFS_IOC_GETXFLAGS:
-        case XFS_IOC_SETXFLAGS:
-        case XFS_IOC_FSSETXATTR:
-                return xfs_ioc_xattr(ip, filp, cmd, arg);
-        case XFS_IOC_FSSETDM: {
-                struct fsdmidata        dmi;
-                if (copy_from_user(&dmi, arg, sizeof(dmi)))
-                        return -XFS_ERROR(EFAULT);
-                error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
-                                dmi.fsd_dmstate);
-                return -error;
-        }
-        case XFS_IOC_GETBMAP:
-        case XFS_IOC_GETBMAPA:
-                return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
-        case XFS_IOC_GETBMAPX:
-                return xfs_ioc_getbmapx(ip, arg);
-        case XFS_IOC_FD_TO_HANDLE:
-        case XFS_IOC_PATH_TO_HANDLE:
-        case XFS_IOC_PATH_TO_FSHANDLE:
-                return xfs_find_handle(cmd, arg);
-        case XFS_IOC_OPEN_BY_HANDLE:
-                return xfs_open_by_handle(mp, arg, filp, inode);
-        case XFS_IOC_FSSETDM_BY_HANDLE:
-                return xfs_fssetdm_by_handle(mp, arg, inode);
-        case XFS_IOC_READLINK_BY_HANDLE:
-                return xfs_readlink_by_handle(mp, arg, inode);
-        case XFS_IOC_ATTRLIST_BY_HANDLE:
-                return xfs_attrlist_by_handle(mp, arg, inode);
-        case XFS_IOC_ATTRMULTI_BY_HANDLE:
-                return xfs_attrmulti_by_handle(mp, arg, inode);
-        case XFS_IOC_SWAPEXT: {
-                error = xfs_swapext((struct xfs_swapext __user *)arg);
-                return -error;
-        }
-        case XFS_IOC_FSCOUNTS: {
-                xfs_fsop_counts_t out;
-                error = xfs_fs_counts(mp, &out);
-                if (error)
-                        return -error;
-                if (copy_to_user(arg, &out, sizeof(out)))
-                        return -XFS_ERROR(EFAULT);
-                return 0;
-        }
-        case XFS_IOC_SET_RESBLKS: {
-                xfs_fsop_resblks_t inout;
-                __uint64_t         in;
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (copy_from_user(&inout, arg, sizeof(inout)))
-                        return -XFS_ERROR(EFAULT);
-                /* input parameter is passed in resblks field of structure */
-                in = inout.resblks;
-                error = xfs_reserve_blocks(mp, &in, &inout);
-                if (error)
-                        return -error;
-                if (copy_to_user(arg, &inout, sizeof(inout)))
-                        return -XFS_ERROR(EFAULT);
-                return 0;
-        }
-        case XFS_IOC_GET_RESBLKS: {
-                xfs_fsop_resblks_t out;
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                error = xfs_reserve_blocks(mp, NULL, &out);
-                if (error)
-                        return -error;
-                if (copy_to_user(arg, &out, sizeof(out)))
-                        return -XFS_ERROR(EFAULT);
-                return 0;
-        }
-        case XFS_IOC_FSGROWFSDATA: {
-                xfs_growfs_data_t in;
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (copy_from_user(&in, arg, sizeof(in)))
-                        return -XFS_ERROR(EFAULT);
-                error = xfs_growfs_data(mp, &in);
-                return -error;
-        }
-        case XFS_IOC_FSGROWFSLOG: {
-                xfs_growfs_log_t in;
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (copy_from_user(&in, arg, sizeof(in)))
-                        return -XFS_ERROR(EFAULT);
-                error = xfs_growfs_log(mp, &in);
-                return -error;
-        }
-        case XFS_IOC_FSGROWFSRT: {
-                xfs_growfs_rt_t in;
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (copy_from_user(&in, arg, sizeof(in)))
-                        return -XFS_ERROR(EFAULT);
-                error = xfs_growfs_rt(mp, &in);
-                return -error;
-        }
-        case XFS_IOC_FREEZE:
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (inode->i_sb->s_frozen == SB_UNFROZEN)
-                        freeze_bdev(inode->i_sb->s_bdev);
-                return 0;
-        case XFS_IOC_THAW:
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (inode->i_sb->s_frozen != SB_UNFROZEN)
-                        thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
-                return 0;
-        case XFS_IOC_GOINGDOWN: {
-                __uint32_t in;
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (get_user(in, (__uint32_t __user *)arg))
-                        return -XFS_ERROR(EFAULT);
-                error = xfs_fs_goingdown(mp, in);
-                return -error;
-        }
-        case XFS_IOC_ERROR_INJECTION: {
-                xfs_error_injection_t in;
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (copy_from_user(&in, arg, sizeof(in)))
-                        return -XFS_ERROR(EFAULT);
-                error = xfs_errortag_add(in.errtag, mp);
-                return -error;
-        }
-        case XFS_IOC_ERROR_CLEARALL:
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                error = xfs_errortag_clearall(mp, 1);
-                return -error;
-        default:
-                return -ENOTTY;
-        }
-}
 STATIC int
 xfs_ioc_space(
        struct xfs_inode        *ip,
@@ -1179,85 +876,85 @@ xfs_ioc_fsgetxattr(
 }
 STATIC int
-xfs_ioc_xattr(
+xfs_ioc_fssetxattr(
        xfs_inode_t             *ip,
        struct file             *filp,
-        unsigned int            cmd,
        void                    __user *arg)
 {
        struct fsxattr          fa;
        struct bhv_vattr        *vattr;
-        int                     error = 0;
+        int                     error;
        int                     attr_flags;
-        unsigned int            flags;
+        if (copy_from_user(&fa, arg, sizeof(fa)))
+                return -EFAULT;
        vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
        if (unlikely(!vattr))
                return -ENOMEM;
-        switch (cmd) {
+        attr_flags = 0;
-        case XFS_IOC_FSSETXATTR: {
+        if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-                if (copy_from_user(&fa, arg, sizeof(fa))) {
+                attr_flags |= ATTR_NONBLOCK;
-                        error = -EFAULT;
-                        break;
-                }
-                attr_flags = 0;
+        vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
-                if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+        vattr->va_xflags  = fa.fsx_xflags;
-                        attr_flags |= ATTR_NONBLOCK;
+        vattr->va_extsize = fa.fsx_extsize;
+        vattr->va_projid  = fa.fsx_projid;
-                vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
+        error = -xfs_setattr(ip, vattr, attr_flags, NULL);
-                vattr->va_xflags  = fa.fsx_xflags;
+        if (!error)
-                vattr->va_extsize = fa.fsx_extsize;
+                vn_revalidate(XFS_ITOV(ip));    /* update flags */
-                vattr->va_projid  = fa.fsx_projid;
+        kfree(vattr);
+        return 0;
+}
-                error = xfs_setattr(ip, vattr, attr_flags, NULL);
+STATIC int
-                if (likely(!error))
+xfs_ioc_getxflags(
-                        vn_revalidate(XFS_ITOV(ip));    /* update flags */
+        xfs_inode_t             *ip,
-                error = -error;
+        void                    __user *arg)
-                break;
+{
-        }
+        unsigned int            flags;
-        case XFS_IOC_GETXFLAGS: {
+        flags = xfs_di2lxflags(ip->i_d.di_flags);
-                flags = xfs_di2lxflags(ip->i_d.di_flags);
+        if (copy_to_user(arg, &flags, sizeof(flags)))
-                if (copy_to_user(arg, &flags, sizeof(flags)))
+                return -EFAULT;
-                        error = -EFAULT;
+        return 0;
-                break;
+}
-        }
-        case XFS_IOC_SETXFLAGS: {
+STATIC int
-                if (copy_from_user(&flags, arg, sizeof(flags))) {
+xfs_ioc_setxflags(
-                        error = -EFAULT;
+        xfs_inode_t             *ip,
-                        break;
+        struct file             *filp,
-                }
+        void                    __user *arg)
+{
+        struct bhv_vattr        *vattr;
+        unsigned int            flags;
+        int                     attr_flags;
+        int                     error;
-                if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+        if (copy_from_user(&flags, arg, sizeof(flags)))
-                              FS_NOATIME_FL | FS_NODUMP_FL | \
+                return -EFAULT;
-                              FS_SYNC_FL)) {
-                        error = -EOPNOTSUPP;
-                        break;
-                }
-                attr_flags = 0;
+        if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
-                if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+                      FS_NOATIME_FL | FS_NODUMP_FL | \
-                        attr_flags |= ATTR_NONBLOCK;
+                      FS_SYNC_FL))
+                return -EOPNOTSUPP;
-                vattr->va_mask = XFS_AT_XFLAGS;
+        vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
-                vattr->va_xflags = xfs_merge_ioc_xflags(flags,
+        if (unlikely(!vattr))
-                                                        xfs_ip2xflags(ip));
+                return -ENOMEM;
-                error = xfs_setattr(ip, vattr, attr_flags, NULL);
+        attr_flags = 0;
-                if (likely(!error))
+        if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-                        vn_revalidate(XFS_ITOV(ip));    /* update flags */
+                attr_flags |= ATTR_NONBLOCK;
-                error = -error;
-                break;
-        }
-        default:
+        vattr->va_mask = XFS_AT_XFLAGS;
-                error = -ENOTTY;
+        vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
-                break;
-        }
+        error = -xfs_setattr(ip, vattr, attr_flags, NULL);
+        if (likely(!error))
+                vn_revalidate(XFS_ITOV(ip));    /* update flags */
        kfree(vattr);
        return error;
 }
@@ -1332,3 +1029,259 @@ xfs_ioc_getbmapx(
        return 0;
 }
+int
+xfs_ioctl(
+        xfs_inode_t             *ip,
+        struct file             *filp,
+        int                     ioflags,
+        unsigned int            cmd,
+        void                    __user *arg)
+{
+        struct inode            *inode = filp->f_path.dentry->d_inode;
+        xfs_mount_t             *mp = ip->i_mount;
+        int                     error;
+        xfs_itrace_entry(XFS_I(inode));
+        switch (cmd) {
+        case XFS_IOC_ALLOCSP:
+        case XFS_IOC_FREESP:
+        case XFS_IOC_RESVSP:
+        case XFS_IOC_UNRESVSP:
+        case XFS_IOC_ALLOCSP64:
+        case XFS_IOC_FREESP64:
+        case XFS_IOC_RESVSP64:
+        case XFS_IOC_UNRESVSP64:
+                /*
+                 * Only allow the sys admin to reserve space unless
+                 * unwritten extents are enabled.
+                 */
+                if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
+                    !capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
+        case XFS_IOC_DIOINFO: {
+                struct dioattr  da;
+                xfs_buftarg_t   *target =
+                        XFS_IS_REALTIME_INODE(ip) ?
+                        mp->m_rtdev_targp : mp->m_ddev_targp;
+                da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
+                da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
+                if (copy_to_user(arg, &da, sizeof(da)))
+                        return -XFS_ERROR(EFAULT);
+                return 0;
+        }
+        case XFS_IOC_FSBULKSTAT_SINGLE:
+        case XFS_IOC_FSBULKSTAT:
+        case XFS_IOC_FSINUMBERS:
+                return xfs_ioc_bulkstat(mp, cmd, arg);
+        case XFS_IOC_FSGEOMETRY_V1:
+                return xfs_ioc_fsgeometry_v1(mp, arg);
+        case XFS_IOC_FSGEOMETRY:
+                return xfs_ioc_fsgeometry(mp, arg);
+        case XFS_IOC_GETVERSION:
+                return put_user(inode->i_generation, (int __user *)arg);
+        case XFS_IOC_FSGETXATTR:
+                return xfs_ioc_fsgetxattr(ip, 0, arg);
+        case XFS_IOC_FSGETXATTRA:
+                return xfs_ioc_fsgetxattr(ip, 1, arg);
+        case XFS_IOC_FSSETXATTR:
+                return xfs_ioc_fssetxattr(ip, filp, arg);
+        case XFS_IOC_GETXFLAGS:
+                return xfs_ioc_getxflags(ip, arg);
+        case XFS_IOC_SETXFLAGS:
+                return xfs_ioc_setxflags(ip, filp, arg);
+        case XFS_IOC_FSSETDM: {
+                struct fsdmidata        dmi;
+                if (copy_from_user(&dmi, arg, sizeof(dmi)))
+                        return -XFS_ERROR(EFAULT);
+                error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
+                                dmi.fsd_dmstate);
+                return -error;
+        }
+        case XFS_IOC_GETBMAP:
+        case XFS_IOC_GETBMAPA:
+                return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+        case XFS_IOC_GETBMAPX:
+                return xfs_ioc_getbmapx(ip, arg);
+        case XFS_IOC_FD_TO_HANDLE:
+        case XFS_IOC_PATH_TO_HANDLE:
+        case XFS_IOC_PATH_TO_FSHANDLE:
+                return xfs_find_handle(cmd, arg);
+        case XFS_IOC_OPEN_BY_HANDLE:
+                return xfs_open_by_handle(mp, arg, filp, inode);
+        case XFS_IOC_FSSETDM_BY_HANDLE:
+                return xfs_fssetdm_by_handle(mp, arg, inode);
+        case XFS_IOC_READLINK_BY_HANDLE:
+                return xfs_readlink_by_handle(mp, arg, inode);
+        case XFS_IOC_ATTRLIST_BY_HANDLE:
+                return xfs_attrlist_by_handle(mp, arg, inode);
+        case XFS_IOC_ATTRMULTI_BY_HANDLE:
+                return xfs_attrmulti_by_handle(mp, arg, filp, inode);
+        case XFS_IOC_SWAPEXT: {
+                error = xfs_swapext((struct xfs_swapext __user *)arg);
+                return -error;
+        }
+        case XFS_IOC_FSCOUNTS: {
+                xfs_fsop_counts_t out;
+                error = xfs_fs_counts(mp, &out);
+                if (error)
+                        return -error;
+                if (copy_to_user(arg, &out, sizeof(out)))
+                        return -XFS_ERROR(EFAULT);
+                return 0;
+        }
+        case XFS_IOC_SET_RESBLKS: {
+                xfs_fsop_resblks_t inout;
+                __uint64_t         in;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (copy_from_user(&inout, arg, sizeof(inout)))
+                        return -XFS_ERROR(EFAULT);
+                /* input parameter is passed in resblks field of structure */
+                in = inout.resblks;
+                error = xfs_reserve_blocks(mp, &in, &inout);
+                if (error)
+                        return -error;
+                if (copy_to_user(arg, &inout, sizeof(inout)))
+                        return -XFS_ERROR(EFAULT);
+                return 0;
+        }
+        case XFS_IOC_GET_RESBLKS: {
+                xfs_fsop_resblks_t out;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                error = xfs_reserve_blocks(mp, NULL, &out);
+                if (error)
+                        return -error;
+                if (copy_to_user(arg, &out, sizeof(out)))
+                        return -XFS_ERROR(EFAULT);
+                return 0;
+        }
+        case XFS_IOC_FSGROWFSDATA: {
+                xfs_growfs_data_t in;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (copy_from_user(&in, arg, sizeof(in)))
+                        return -XFS_ERROR(EFAULT);
+                error = xfs_growfs_data(mp, &in);
+                return -error;
+        }
+        case XFS_IOC_FSGROWFSLOG: {
+                xfs_growfs_log_t in;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (copy_from_user(&in, arg, sizeof(in)))
+                        return -XFS_ERROR(EFAULT);
+                error = xfs_growfs_log(mp, &in);
+                return -error;
+        }
+        case XFS_IOC_FSGROWFSRT: {
+                xfs_growfs_rt_t in;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (copy_from_user(&in, arg, sizeof(in)))
+                        return -XFS_ERROR(EFAULT);
+                error = xfs_growfs_rt(mp, &in);
+                return -error;
+        }
+        case XFS_IOC_FREEZE:
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (inode->i_sb->s_frozen == SB_UNFROZEN)
+                        freeze_bdev(inode->i_sb->s_bdev);
+                return 0;
+        case XFS_IOC_THAW:
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (inode->i_sb->s_frozen != SB_UNFROZEN)
+                        thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
+                return 0;
+        case XFS_IOC_GOINGDOWN: {
+                __uint32_t in;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (get_user(in, (__uint32_t __user *)arg))
+                        return -XFS_ERROR(EFAULT);
+                error = xfs_fs_goingdown(mp, in);
+                return -error;
+        }
+        case XFS_IOC_ERROR_INJECTION: {
+                xfs_error_injection_t in;
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                if (copy_from_user(&in, arg, sizeof(in)))
+                        return -XFS_ERROR(EFAULT);
+                error = xfs_errortag_add(in.errtag, mp);
+                return -error;
+        }
+        case XFS_IOC_ERROR_CLEARALL:
+                if (!capable(CAP_SYS_ADMIN))
+                        return -EPERM;
+                error = xfs_errortag_clearall(mp, 1);
+                return -error;
+        default:
+                return -ENOTTY;
+        }
+}
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index cc4abd3daa49..a1237dad6430 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,12 +62,11 @@ void
 xfs_synchronize_atime(
        xfs_inode_t     *ip)
 {
-        bhv_vnode_t     *vp;
+        struct inode    *inode = ip->i_vnode;
-        vp = XFS_ITOV_NULL(ip);
+        if (inode) {
-        if (vp) {
+                ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
-                ip->i_d.di_atime.t_sec = (__int32_t)vp->i_atime.tv_sec;
+                ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
-                ip->i_d.di_atime.t_nsec = (__int32_t)vp->i_atime.tv_nsec;
        }
 }
@@ -80,11 +79,10 @@ void
 xfs_mark_inode_dirty_sync(
        xfs_inode_t     *ip)
 {
-        bhv_vnode_t     *vp;
+        struct inode    *inode = ip->i_vnode;
-        vp = XFS_ITOV_NULL(ip);
+        if (inode)
-        if (vp)
+                mark_inode_dirty_sync(inode);
-                mark_inode_dirty_sync(vn_to_inode(vp));
 }
 /*
@@ -157,13 +155,6 @@ xfs_ichgtime_fast(
         */
        ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
-        /*
-         * We're not supposed to change timestamps in readonly-mounted
-         * filesystems.  Throw it away if anyone asks us.
-         */
-        if (unlikely(IS_RDONLY(inode)))
-                return;
        if (flags & XFS_ICHGTIME_MOD) {
                tvp = &inode->i_mtime;
                ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
@@ -215,66 +206,62 @@ xfs_validate_fields(
 */
 STATIC int
 xfs_init_security(
-        bhv_vnode_t     *vp,
+        struct inode    *inode,
        struct inode    *dir)
 {
-        struct inode    *ip = vn_to_inode(vp);
+        struct xfs_inode *ip = XFS_I(inode);
        size_t          length;
        void            *value;
        char            *name;
        int             error;
-        error = security_inode_init_security(ip, dir, &name, &value, &length);
+        error = security_inode_init_security(inode, dir, &name,
+                                             &value, &length);
        if (error) {
                if (error == -EOPNOTSUPP)
                        return 0;
                return -error;
        }
-        error = xfs_attr_set(XFS_I(ip), name, value,
+        error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
-                        length, ATTR_SECURE);
        if (!error)
-                xfs_iflags_set(XFS_I(ip), XFS_IMODIFIED);
+                xfs_iflags_set(ip, XFS_IMODIFIED);
        kfree(name);
        kfree(value);
        return error;
 }
-/*
+static void
- * Determine whether a process has a valid fs_struct (kernel daemons
+xfs_dentry_to_name(
- * like knfsd don't have an fs_struct).
+        struct xfs_name *namep,
- *
+        struct dentry   *dentry)
- * XXX(hch):  nfsd is broken, better fix it instead.
- */
-STATIC_INLINE int
-xfs_has_fs_struct(struct task_struct *task)
 {
-        return (task->fs != init_task.fs);
+        namep->name = dentry->d_name.name;
+        namep->len = dentry->d_name.len;
 }
 STATIC void
 xfs_cleanup_inode(
        struct inode    *dir,
-        bhv_vnode_t     *vp,
+        struct inode    *inode,
        struct dentry   *dentry,
        int             mode)
 {
-        struct dentry   teardown = {};
+        struct xfs_name teardown;
        /* Oh, the horror.
         * If we can't add the ACL or we fail in
         * xfs_init_security we must back out.
         * ENOSPC can hit here, among other things.
         */
-        teardown.d_inode = vn_to_inode(vp);
+        xfs_dentry_to_name(&teardown, dentry);
-        teardown.d_name = dentry->d_name;
        if (S_ISDIR(mode))
-                xfs_rmdir(XFS_I(dir), &teardown);
+                xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode));
        else
-                xfs_remove(XFS_I(dir), &teardown);
+                xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
-        VN_RELE(vp);
+        iput(inode);
 }
 STATIC int
@@ -284,9 +271,10 @@ xfs_vn_mknod(
        int             mode,
        dev_t           rdev)
 {
-        struct inode    *ip;
+        struct inode    *inode;
-        bhv_vnode_t     *vp = NULL, *dvp = vn_from_inode(dir);
+        struct xfs_inode *ip = NULL;
        xfs_acl_t       *default_acl = NULL;
+        struct xfs_name name;
        attrexists_t    test_default_acl = _ACL_DEFAULT_EXISTS;
        int             error;
@@ -297,59 +285,67 @@ xfs_vn_mknod(
        if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
                return -EINVAL;
-        if (unlikely(test_default_acl && test_default_acl(dvp))) {
+        if (test_default_acl && test_default_acl(dir)) {
                if (!_ACL_ALLOC(default_acl)) {
                        return -ENOMEM;
                }
-                if (!_ACL_GET_DEFAULT(dvp, default_acl)) {
+                if (!_ACL_GET_DEFAULT(dir, default_acl)) {
                        _ACL_FREE(default_acl);
                        default_acl = NULL;
                }
        }
-        if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current))
+        xfs_dentry_to_name(&name, dentry);
+        if (IS_POSIXACL(dir) && !default_acl)
                mode &= ~current->fs->umask;
        switch (mode & S_IFMT) {
-        case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
+        case S_IFCHR:
+        case S_IFBLK:
+        case S_IFIFO:
+        case S_IFSOCK:
                rdev = sysv_encode_dev(rdev);
        case S_IFREG:
-                error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL);
+                error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
                break;
        case S_IFDIR:
-                error = xfs_mkdir(XFS_I(dir), dentry, mode, &vp, NULL);
+                error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
                break;
        default:
                error = EINVAL;
                break;
        }
-        if (unlikely(!error)) {
+        if (unlikely(error))
-                error = xfs_init_security(vp, dir);
+                goto out_free_acl;
-                if (error)
-                        xfs_cleanup_inode(dir, vp, dentry, mode);
-        }
-        if (unlikely(default_acl)) {
+        inode = ip->i_vnode;
-                if (!error) {
-                        error = _ACL_INHERIT(vp, mode, default_acl);
+        error = xfs_init_security(inode, dir);
-                        if (!error)
+        if (unlikely(error))
-                                xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED);
+                goto out_cleanup_inode;
-                        else
-                                xfs_cleanup_inode(dir, vp, dentry, mode);
+        if (default_acl) {
-                }
+                error = _ACL_INHERIT(inode, mode, default_acl);
+                if (unlikely(error))
+                        goto out_cleanup_inode;
+                xfs_iflags_set(ip, XFS_IMODIFIED);
                _ACL_FREE(default_acl);
        }
-        if (likely(!error)) {
-                ASSERT(vp);
-                ip = vn_to_inode(vp);
-                if (S_ISDIR(mode))
+        if (S_ISDIR(mode))
-                        xfs_validate_fields(ip);
+                xfs_validate_fields(inode);
-                d_instantiate(dentry, ip);
+        d_instantiate(dentry, inode);
-                xfs_validate_fields(dir);
+        xfs_validate_fields(dir);
-        }
+        return -error;
+ out_cleanup_inode:
+        xfs_cleanup_inode(dir, inode, dentry, mode);
+ out_free_acl:
+        if (default_acl)
+                _ACL_FREE(default_acl);
        return -error;
 }
@@ -378,13 +374,15 @@ xfs_vn_lookup(
        struct dentry   *dentry,
        struct nameidata *nd)
 {
-        bhv_vnode_t     *cvp;
+        struct xfs_inode *cip;
+        struct xfs_name name;
        int             error;
        if (dentry->d_name.len >= MAXNAMELEN)
                return ERR_PTR(-ENAMETOOLONG);
-        error = xfs_lookup(XFS_I(dir), dentry, &cvp);
+        xfs_dentry_to_name(&name, dentry);
+        error = xfs_lookup(XFS_I(dir), &name, &cip);
        if (unlikely(error)) {
                if (unlikely(error != ENOENT))
                        return ERR_PTR(-error);
@@ -392,7 +390,7 @@ xfs_vn_lookup(
                return NULL;
        }
-        return d_splice_alias(vn_to_inode(cvp), dentry);
+        return d_splice_alias(cip->i_vnode, dentry);
 }
 STATIC int
@@ -401,23 +399,24 @@ xfs_vn_link(
        struct inode    *dir,
        struct dentry   *dentry)
 {
-        struct inode    *ip;    /* inode of guy being linked to */
+        struct inode    *inode; /* inode of guy being linked to */
-        bhv_vnode_t     *vp;    /* vp of name being linked */
+        struct xfs_name name;
        int             error;
-        ip = old_dentry->d_inode;       /* inode being linked to */
+        inode = old_dentry->d_inode;
-        vp = vn_from_inode(ip);
+        xfs_dentry_to_name(&name, dentry);
-        VN_HOLD(vp);
+        igrab(inode);
-        error = xfs_link(XFS_I(dir), vp, dentry);
+        error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
        if (unlikely(error)) {
-                VN_RELE(vp);
+                iput(inode);
-        } else {
+                return -error;
-                xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
-                xfs_validate_fields(ip);
-                d_instantiate(dentry, ip);
        }
-        return -error;
+        xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
+        xfs_validate_fields(inode);
+        d_instantiate(dentry, inode);
+        return 0;
 }
 STATIC int
@@ -426,11 +425,13 @@ xfs_vn_unlink(
        struct dentry   *dentry)
 {
        struct inode    *inode;
+        struct xfs_name name;
        int             error;
        inode = dentry->d_inode;
+        xfs_dentry_to_name(&name, dentry);
-        error = xfs_remove(XFS_I(dir), dentry);
+        error = xfs_remove(XFS_I(dir), &name, XFS_I(inode));
        if (likely(!error)) {
                xfs_validate_fields(dir);       /* size needs update */
                xfs_validate_fields(inode);
@@ -444,29 +445,34 @@ xfs_vn_symlink(
        struct dentry   *dentry,
        const char      *symname)
 {
-        struct inode    *ip;
+        struct inode    *inode;
-        bhv_vnode_t     *cvp;   /* used to lookup symlink to put in dentry */
+        struct xfs_inode *cip = NULL;
+        struct xfs_name name;
        int             error;
        mode_t          mode;
-        cvp = NULL;
        mode = S_IFLNK |
                (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+        xfs_dentry_to_name(&name, dentry);
-        error = xfs_symlink(XFS_I(dir), dentry, (char *)symname, mode,
+        error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
-                            &cvp, NULL);
+        if (unlikely(error))
-        if (likely(!error && cvp)) {
+                goto out;
-                error = xfs_init_security(cvp, dir);
-                if (likely(!error)) {
+        inode = cip->i_vnode;
-                        ip = vn_to_inode(cvp);
-                        d_instantiate(dentry, ip);
+        error = xfs_init_security(inode, dir);
-                        xfs_validate_fields(dir);
+        if (unlikely(error))
-                        xfs_validate_fields(ip);
+                goto out_cleanup_inode;
-                } else {
-                        xfs_cleanup_inode(dir, cvp, dentry, 0);
+        d_instantiate(dentry, inode);
-                }
+        xfs_validate_fields(dir);
-        }
+        xfs_validate_fields(inode);
+        return 0;
+ out_cleanup_inode:
+        xfs_cleanup_inode(dir, inode, dentry, 0);
+ out:
        return -error;
 }
@@ -476,9 +482,12 @@ xfs_vn_rmdir(
        struct dentry   *dentry)
 {
        struct inode    *inode = dentry->d_inode;
+        struct xfs_name name;
        int             error;
-        error = xfs_rmdir(XFS_I(dir), dentry);
+        xfs_dentry_to_name(&name, dentry);
+        error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode));
        if (likely(!error)) {
                xfs_validate_fields(inode);
                xfs_validate_fields(dir);
@@ -494,12 +503,15 @@ xfs_vn_rename(
        struct dentry   *ndentry)
 {
        struct inode    *new_inode = ndentry->d_inode;
-        bhv_vnode_t     *tvp;   /* target directory */
+        struct xfs_name oname;
+        struct xfs_name nname;
        int             error;
-        tvp = vn_from_inode(ndir);
+        xfs_dentry_to_name(&oname, odentry);
+        xfs_dentry_to_name(&nname, ndentry);
-        error = xfs_rename(XFS_I(odir), odentry, tvp, ndentry);
+        error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+                                                        XFS_I(ndir), &nname);
        if (likely(!error)) {
                if (new_inode)
                        xfs_validate_fields(new_inode);
@@ -700,11 +712,19 @@ xfs_vn_setattr(
        return -error;
 }
+/*
+ * block_truncate_page can return an error, but we can't propagate it
+ * at all here. Leave a complaint + stack trace in the syslog because
+ * this could be bad. If it is bad, we need to propagate the error further.
+ */
 STATIC void
 xfs_vn_truncate(
        struct inode    *inode)
 {
-        block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks);
+        int     error;
+        error = block_truncate_page(inode->i_mapping, inode->i_size,
+                                                        xfs_get_blocks);
+        WARN_ON(error);
 }
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 3ca39c4e5d2a..e5143323e71f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -99,7 +99,6 @@
 /*
 * Feature macros (disable/enable)
 */
-#undef  HAVE_REFCACHE   /* reference cache not needed for NFS in 2.6 */
 #define HAVE_SPLICE     /* a splice(2) exists in 2.6, but not in 2.4 */
 #ifdef CONFIG_SMP
 #define HAVE_PERCPU_SB  /* per cpu superblock counters are a 2.6 feature */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 166353388490..1ebd8004469c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,6 +51,7 @@
 #include "xfs_vnodeops.h"
 #include <linux/capability.h>
+#include <linux/mount.h>
 #include <linux/writeback.h>
@@ -176,7 +177,6 @@ xfs_read(
 {
        struct file             *file = iocb->ki_filp;
        struct inode            *inode = file->f_mapping->host;
-        bhv_vnode_t             *vp = XFS_ITOV(ip);
        xfs_mount_t             *mp = ip->i_mount;
        size_t                  size = 0;
        ssize_t                 ret = 0;
@@ -228,11 +228,11 @@ xfs_read(
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-                bhv_vrwlock_t locktype = VRWLOCK_READ;
                int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
+                int iolock = XFS_IOLOCK_SHARED;
-                ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *offset, size,
+                ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
-                                        dmflags, &locktype);
+                                        dmflags, &iolock);
                if (ret) {
                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
                        if (unlikely(ioflags & IO_ISDIRECT))
@@ -242,7 +242,7 @@ xfs_read(
        }
        if (unlikely(ioflags & IO_ISDIRECT)) {
-                if (VN_CACHED(vp))
+                if (inode->i_mapping->nrpages)
                        ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
                                                    -1, FI_REMAPF_LOCKED);
                mutex_unlock(&inode->i_mutex);
@@ -276,7 +276,6 @@ xfs_splice_read(
        int                     flags,
        int                     ioflags)
 {
-        bhv_vnode_t             *vp = XFS_ITOV(ip);
        xfs_mount_t             *mp = ip->i_mount;
        ssize_t                 ret;
@@ -287,11 +286,11 @@ xfs_splice_read(
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-                bhv_vrwlock_t locktype = VRWLOCK_READ;
+                int iolock = XFS_IOLOCK_SHARED;
                int error;
-                error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *ppos, count,
+                error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
-                                        FILP_DELAY_FLAG(infilp), &locktype);
+                                        FILP_DELAY_FLAG(infilp), &iolock);
                if (error) {
                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
                        return -error;
@@ -317,7 +316,6 @@ xfs_splice_write(
        int                     flags,
        int                     ioflags)
 {
-        bhv_vnode_t             *vp = XFS_ITOV(ip);
        xfs_mount_t             *mp = ip->i_mount;
        ssize_t                 ret;
        struct inode            *inode = outfilp->f_mapping->host;
@@ -330,11 +328,11 @@ xfs_splice_write(
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
        if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
-                bhv_vrwlock_t locktype = VRWLOCK_WRITE;
+                int iolock = XFS_IOLOCK_EXCL;
                int error;
-                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, *ppos, count,
+                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
-                                        FILP_DELAY_FLAG(outfilp), &locktype);
+                                        FILP_DELAY_FLAG(outfilp), &iolock);
                if (error) {
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return -error;
@@ -573,14 +571,12 @@ xfs_write(
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct inode            *inode = mapping->host;
-        bhv_vnode_t             *vp = XFS_ITOV(xip);
        unsigned long           segs = nsegs;
        xfs_mount_t             *mp;
        ssize_t                 ret = 0, error = 0;
        xfs_fsize_t             isize, new_size;
        int                     iolock;
        int                     eventsent = 0;
-        bhv_vrwlock_t           locktype;
        size_t                  ocount = 0, count;
        loff_t                  pos;
        int                     need_i_mutex;
@@ -607,11 +603,9 @@ xfs_write(
 relock:
        if (ioflags & IO_ISDIRECT) {
                iolock = XFS_IOLOCK_SHARED;
-                locktype = VRWLOCK_WRITE_DIRECT;
                need_i_mutex = 0;
        } else {
                iolock = XFS_IOLOCK_EXCL;
-                locktype = VRWLOCK_WRITE;
                need_i_mutex = 1;
                mutex_lock(&inode->i_mutex);
        }
@@ -634,9 +628,8 @@ start:
                        dmflags |= DM_FLAGS_IMUX;
                xfs_iunlock(xip, XFS_ILOCK_EXCL);
-                error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
+                error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
-                                      pos, count,
+                                      pos, count, dmflags, &iolock);
-                                      dmflags, &locktype);
                if (error) {
                        goto out_unlock_internal;
                }
@@ -664,10 +657,9 @@ start:
                        return XFS_ERROR(-EINVAL);
                }
-                if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
+                if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
                        xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
                        iolock = XFS_IOLOCK_EXCL;
-                        locktype = VRWLOCK_WRITE;
                        need_i_mutex = 1;
                        mutex_lock(&inode->i_mutex);
                        xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
@@ -679,10 +671,16 @@ start:
        if (new_size > xip->i_size)
                xip->i_new_size = new_size;
-        if (likely(!(ioflags & IO_INVIS))) {
+        /*
+         * We're not supposed to change timestamps in readonly-mounted
+         * filesystems.  Throw it away if anyone asks us.
+         */
+        if (likely(!(ioflags & IO_INVIS) &&
+                   !mnt_want_write(file->f_path.mnt))) {
                file_update_time(file);
                xfs_ichgtime_fast(xip, inode,
                                  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+                mnt_drop_write(file->f_path.mnt);
        }
        /*
@@ -727,7 +725,7 @@ retry:
        current->backing_dev_info = mapping->backing_dev_info;
        if ((ioflags & IO_ISDIRECT)) {
-                if (VN_CACHED(vp)) {
+                if (mapping->nrpages) {
                        WARN_ON(need_i_mutex == 0);
                        xfs_inval_cached_trace(xip, pos, -1,
                                        (pos & PAGE_CACHE_MASK), -1);
@@ -744,7 +742,6 @@ retry:
                        mutex_unlock(&inode->i_mutex);
                        iolock = XFS_IOLOCK_SHARED;
-                        locktype = VRWLOCK_WRITE_DIRECT;
                        need_i_mutex = 0;
                }
@@ -781,15 +778,15 @@ retry:
        if (ret == -ENOSPC &&
            DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
-                xfs_rwunlock(xip, locktype);
+                xfs_iunlock(xip, iolock);
                if (need_i_mutex)
                        mutex_unlock(&inode->i_mutex);
-                error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
+                error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
-                                DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
+                                DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
                                0, 0, 0); /* Delay flag intentionally  unused */
                if (need_i_mutex)
                        mutex_lock(&inode->i_mutex);
-                xfs_rwlock(xip, locktype);
+                xfs_ilock(xip, iolock);
                if (error)
                        goto out_unlock_internal;
                pos = xip->i_size;
@@ -817,7 +814,8 @@ retry:
        /* Handle various SYNC-type writes */
        if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
                int error2;
-                xfs_rwunlock(xip, locktype);
+                xfs_iunlock(xip, iolock);
                if (need_i_mutex)
                        mutex_unlock(&inode->i_mutex);
                error2 = sync_page_range(inode, mapping, pos, ret);
@@ -825,7 +823,7 @@ retry:
                        error = error2;
                if (need_i_mutex)
                        mutex_lock(&inode->i_mutex);
-                xfs_rwlock(xip, locktype);
+                xfs_ilock(xip, iolock);
                error2 = xfs_write_sync_logforce(mp, xip);
                if (!error)
                        error = error2;
@@ -846,7 +844,7 @@ retry:
                        xip->i_d.di_size = xip->i_size;
                xfs_iunlock(xip, XFS_ILOCK_EXCL);
        }
-        xfs_rwunlock(xip, locktype);
+        xfs_iunlock(xip, iolock);
 out_unlock_mutex:
        if (need_i_mutex)
                mutex_unlock(&inode->i_mutex);
@@ -884,28 +882,23 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
 }
 /*
- * Wrapper around bdstrat so that we can stop data
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
- * from going to disk in case we are shutting down the filesystem.
+ * we are shutting down the filesystem.  Typically user data goes thru this
- * Typically user data goes thru this path; one of the exceptions
+ * path; one of the exceptions is the superblock.
- * is the superblock.
 */
-int
+void
 xfsbdstrat(
        struct xfs_mount        *mp,
        struct xfs_buf          *bp)
 {
        ASSERT(mp);
        if (!XFS_FORCED_SHUTDOWN(mp)) {
-                /* Grio redirection would go here
-                 * if (XFS_BUF_IS_GRIO(bp)) {
-                 */
                xfs_buf_iorequest(bp);
-                return 0;
+                return;
        }
        xfs_buftrace("XFSBDSTRAT IOERROR", bp);
-        return (xfs_bioerror_relse(bp));
+        xfs_bioerror_relse(bp);
 }
 /*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index e200253139cf..e1d498b4ba7a 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -68,7 +68,8 @@ extern void xfs_inval_cached_trace(struct xfs_inode *,
 #define xfs_inval_cached_trace(ip, offset, len, first, last)
 #endif
-extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
+/* errors from xfsbdstrat() must be extracted from the buffer */
+extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
 extern int xfs_bdstrat_cb(struct xfs_buf *);
 extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index 8ba7a2fa6c1d..afd0b0d5fdb2 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -144,8 +144,8 @@ extern void xfs_cleanup_procfs(void);
 # define XFS_STATS_DEC(count)
 # define XFS_STATS_ADD(count, inc)
-static __inline void xfs_init_procfs(void) { };
+static inline void xfs_init_procfs(void) { };
-static __inline void xfs_cleanup_procfs(void) { };
+static inline void xfs_cleanup_procfs(void) { };
 #endif  /* !CONFIG_PROC_FS */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 21dfc9da235e..865eb708aa95 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -171,7 +171,7 @@ xfs_parseargs(
        char                    *this_char, *value, *eov;
        int                     dsunit, dswidth, vol_dsunit, vol_dswidth;
        int                     iosize;
-        int                     ikeep = 0;
+        int                     dmapi_implies_ikeep = 1;
        args->flags |= XFSMNT_BARRIER;
        args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
@@ -302,10 +302,10 @@ xfs_parseargs(
                } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
                        args->flags &= ~XFSMNT_BARRIER;
                } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
-                        ikeep = 1;
+                        args->flags |= XFSMNT_IKEEP;
-                        args->flags &= ~XFSMNT_IDELETE;
                } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
-                        args->flags |= XFSMNT_IDELETE;
+                        dmapi_implies_ikeep = 0;
+                        args->flags &= ~XFSMNT_IKEEP;
                } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
                        args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE;
                } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
@@ -410,8 +410,8 @@ xfs_parseargs(
         * Note that if "ikeep" or "noikeep" mount options are
         * supplied, then they are honored.
         */
-        if (!(args->flags & XFSMNT_DMAPI) && !ikeep)
+        if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep)
-                args->flags |= XFSMNT_IDELETE;
+                args->flags |= XFSMNT_IKEEP;
        if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
                if (dsunit) {
@@ -446,6 +446,7 @@ xfs_showargs(
 {
        static struct proc_xfs_info xfs_info_set[] = {
                /* the few simple ones we can get from the mount struct */
+                { XFS_MOUNT_IKEEP,              "," MNTOPT_IKEEP },
                { XFS_MOUNT_WSYNC,              "," MNTOPT_WSYNC },
                { XFS_MOUNT_INO64,              "," MNTOPT_INO64 },
                { XFS_MOUNT_NOALIGN,            "," MNTOPT_NOALIGN },
@@ -461,7 +462,6 @@ xfs_showargs(
        };
        static struct proc_xfs_info xfs_info_unset[] = {
                /* the few simple ones we can get from the mount struct */
-                { XFS_MOUNT_IDELETE,            "," MNTOPT_IKEEP },
                { XFS_MOUNT_COMPAT_IOSIZE,      "," MNTOPT_LARGEIO },
                { XFS_MOUNT_BARRIER,            "," MNTOPT_NOBARRIER },
                { XFS_MOUNT_SMALL_INUMS,        "," MNTOPT_64BITINODE },
@@ -896,7 +896,8 @@ xfs_fs_write_inode(
        struct inode            *inode,
        int                     sync)
 {
-        int                     error = 0, flags = FLUSH_INODE;
+        int                     error = 0;
+        int                     flags = 0;
        xfs_itrace_entry(XFS_I(inode));
        if (sync) {
@@ -934,7 +935,7 @@ xfs_fs_clear_inode(
                xfs_inactive(ip);
                xfs_iflags_clear(ip, XFS_IMODIFIED);
                if (xfs_reclaim(ip))
-                        panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, inode);
+                        panic("%s: cannot reclaim 0x%p\n", __func__, inode);
        }
        ASSERT(XFS_I(inode) == NULL);
@@ -1027,8 +1028,7 @@ xfs_sync_worker(
        int             error;
        if (!(mp->m_flags & XFS_MOUNT_RDONLY))
-                error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR |
+                error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
-                                     SYNC_REFCACHE | SYNC_SUPER);
        mp->m_sync_seq++;
        wake_up(&mp->m_wait_single_sync_task);
 }
@@ -1306,7 +1306,7 @@ xfs_fs_fill_super(
        void                    *data,
        int                     silent)
 {
-        struct inode            *rootvp;
+        struct inode            *root;
        struct xfs_mount        *mp = NULL;
        struct xfs_mount_args   *args = xfs_args_allocate(sb, silent);
        int                     error;
@@ -1344,19 +1344,18 @@ xfs_fs_fill_super(
        sb->s_time_gran = 1;
        set_posix_acl_flag(sb);
-        rootvp = igrab(mp->m_rootip->i_vnode);
+        root = igrab(mp->m_rootip->i_vnode);
-        if (!rootvp) {
+        if (!root) {
                error = ENOENT;
                goto fail_unmount;
        }
+        if (is_bad_inode(root)) {
-        sb->s_root = d_alloc_root(vn_to_inode(rootvp));
+                error = EINVAL;
-        if (!sb->s_root) {
-                error = ENOMEM;
                goto fail_vnrele;
        }
-        if (is_bad_inode(sb->s_root->d_inode)) {
+        sb->s_root = d_alloc_root(root);
-                error = EINVAL;
+        if (!sb->s_root) {
+                error = ENOMEM;
                goto fail_vnrele;
        }
@@ -1378,7 +1377,7 @@ fail_vnrele:
                dput(sb->s_root);
                sb->s_root = NULL;
        } else {
-                VN_RELE(rootvp);
+                iput(root);
        }
 fail_unmount:
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 3efcf45b14ab..3efb7c6d3303 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -50,13 +50,7 @@ extern void xfs_qm_exit(void);
 # define set_posix_acl_flag(sb) do { } while (0)
 #endif
-#ifdef CONFIG_XFS_SECURITY
+#define XFS_SECURITY_STRING     "security attributes, "
-# define XFS_SECURITY_STRING    "security attributes, "
-# define ENOSECURITY            0
-#else
-# define XFS_SECURITY_STRING
-# define ENOSECURITY            EOPNOTSUPP
-#endif
 #ifdef CONFIG_XFS_RT
 # define XFS_REALTIME_STRING    "realtime, "
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 4da03a4e3520..7e60c7776b1c 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -49,7 +49,6 @@ typedef struct bhv_vfs_sync_work {
 #define SYNC_REFCACHE           0x0040  /* prune some of the nfs ref cache */
 #define SYNC_REMOUNT            0x0080  /* remount readonly, no dummy LRs */
 #define SYNC_IOWAIT             0x0100  /* wait for all I/O to complete */
-#define SYNC_SUPER              0x0200  /* flush superblock to disk */
 /*
 * When remounting a filesystem read-only or freezing the filesystem,
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index b5ea418693b1..8b4d63ce8694 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -23,8 +23,6 @@ struct bhv_vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
-typedef struct dentry   bhv_vname_t;
-typedef __u64           bhv_vnumber_t;
 typedef struct inode    bhv_vnode_t;
 #define VN_ISLNK(vp)    S_ISLNK((vp)->i_mode)
@@ -46,18 +44,6 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
 }
 /*
- * Values for the vop_rwlock/rwunlock flags parameter.
- */
-typedef enum bhv_vrwlock {
-        VRWLOCK_NONE,
-        VRWLOCK_READ,
-        VRWLOCK_WRITE,
-        VRWLOCK_WRITE_DIRECT,
-        VRWLOCK_TRY_READ,
-        VRWLOCK_TRY_WRITE
-} bhv_vrwlock_t;
-/*
 * Return values for xfs_inactive.  A return value of
 * VN_INACTIVE_NOCACHE implies that the file system behavior
 * has disassociated its state and bhv_desc_t from the vnode.
@@ -73,12 +59,9 @@ typedef enum bhv_vrwlock {
 #define IO_INVIS        0x00020         /* don't update inode timestamps */
 /*
- * Flags for vop_iflush call
+ * Flags for xfs_inode_flush
 */
 #define FLUSH_SYNC              1       /* wait for flush to complete   */
-#define FLUSH_INODE             2       /* flush the inode itself       */
-#define FLUSH_LOG               4       /* force the last log entry for
-                                         * this inode out to disk       */
 /*
 * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
@@ -226,13 +209,6 @@ static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
 }
 /*
- * Vname handling macros.
- */
-#define VNAME(dentry)           ((char *) (dentry)->d_name.name)
-#define VNAMELEN(dentry)        ((dentry)->d_name.len)
-#define VNAME_TO_VNODE(dentry)  (vn_from_inode((dentry)->d_inode))
-/*
 * Dealing with bad inodes
 */
 static inline int VN_BAD(bhv_vnode_t *vp)
@@ -303,9 +279,9 @@ extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
 extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
 extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
 #define xfs_itrace_entry(ip)    \
-        _xfs_itrace_entry(ip, __FUNCTION__, (inst_t *)__return_address)
+        _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
 #define xfs_itrace_exit(ip)     \
-        _xfs_itrace_exit(ip, __FUNCTION__, (inst_t *)__return_address)
+        _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
 #define xfs_itrace_exit_tag(ip, tag)    \
        _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
 #define xfs_itrace_ref(ip)      \
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 665babcca6a6..631ebb31b295 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1291,7 +1291,7 @@ xfs_qm_dqflush(
        if (flags & XFS_QMOPT_DELWRI) {
                xfs_bdwrite(mp, bp);
        } else if (flags & XFS_QMOPT_ASYNC) {
-                xfs_bawrite(mp, bp);
+                error = xfs_bawrite(mp, bp);
        } else {
                error = xfs_bwrite(mp, bp);
        }
@@ -1439,9 +1439,7 @@ xfs_qm_dqpurge(
        uint            flags)
 {
        xfs_dqhash_t    *thishash;
-        xfs_mount_t     *mp;
+        xfs_mount_t     *mp = dqp->q_mount;
-        mp = dqp->q_mount;
        ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
        ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
@@ -1485,6 +1483,7 @@ xfs_qm_dqpurge(
         * we're unmounting, we do care, so we flush it and wait.
         */
        if (XFS_DQ_IS_DIRTY(dqp)) {
+                int     error;
                xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
                /* dqflush unlocks dqflock */
                /*
@@ -1495,7 +1494,10 @@ xfs_qm_dqpurge(
                 * We don't care about getting disk errors here. We need
                 * to purge this dquot anyway, so we go ahead regardless.
                 */
-                (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+                error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+                if (error)
+                        xfs_fs_cmn_err(CE_WARN, mp,
+                                "xfs_qm_dqpurge: dquot %p flush failed", dqp);
                xfs_dqflock(dqp);
        }
        ASSERT(dqp->q_pincount == 0);
@@ -1580,12 +1582,18 @@ xfs_qm_dqflock_pushbuf_wait(
                    XFS_INCORE_TRYLOCK);
        if (bp != NULL) {
                if (XFS_BUF_ISDELAYWRITE(bp)) {
+                        int     error;
                        if (XFS_BUF_ISPINNED(bp)) {
                                xfs_log_force(dqp->q_mount,
                                              (xfs_lsn_t)0,
                                              XFS_LOG_FORCE);
                        }
-                        xfs_bawrite(dqp->q_mount, bp);
+                        error = xfs_bawrite(dqp->q_mount, bp);
+                        if (error)
+                                xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+                                        "xfs_qm_dqflock_pushbuf_wait: "
+                                        "pushbuf error %d on dqp %p, bp %p",
+                                        error, dqp, bp);
                } else {
                        xfs_buf_relse(bp);
                }
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1800e8d1f646..36e05ca78412 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -146,6 +146,7 @@ xfs_qm_dquot_logitem_push(
        xfs_dq_logitem_t        *logitem)
 {
        xfs_dquot_t     *dqp;
+        int             error;
        dqp = logitem->qli_dquot;
@@ -161,7 +162,11 @@ xfs_qm_dquot_logitem_push(
         * lock without sleeping, then there must not have been
         * anyone in the process of flushing the dquot.
         */
-        xfs_qm_dqflush(dqp, XFS_B_DELWRI);
+        error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+        if (error)
+                xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+                        "xfs_qm_dquot_logitem_push: push error %d on dqp %p",
+                        error, dqp);
        xfs_dqunlock(dqp);
 }
@@ -262,11 +267,16 @@ xfs_qm_dquot_logitem_pushbuf(
                                              XFS_LOG_FORCE);
                        }
                        if (dopush) {
+                                int     error;
 #ifdef XFSRACEDEBUG
                                delay_for_intr();
                                delay(300);
 #endif
-                                xfs_bawrite(mp, bp);
+                                error = xfs_bawrite(mp, bp);
+                                if (error)
+                                        xfs_fs_cmn_err(CE_WARN, mp,
+        "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
+                                                        error, qip, bp);
                        } else {
                                xfs_buf_relse(bp);
                        }
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 1f3da5b8657b..40ea56409561 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -304,8 +304,11 @@ xfs_qm_unmount_quotadestroy(
 * necessary data structures like quotainfo.  This is also responsible for
 * running a quotacheck as necessary.  We are guaranteed that the superblock
 * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to indicate success or failure at all.
 */
-int
+void
 xfs_qm_mount_quotas(
        xfs_mount_t     *mp,
        int             mfsi_flags)
@@ -313,7 +316,6 @@ xfs_qm_mount_quotas(
        int             error = 0;
        uint            sbf;
        /*
         * If quotas on realtime volumes is not supported, we disable
         * quotas immediately.
@@ -332,7 +334,8 @@ xfs_qm_mount_quotas(
         * Allocate the quotainfo structure inside the mount struct, and
         * create quotainode(s), and change/rev superblock if necessary.
         */
-        if ((error = xfs_qm_init_quotainfo(mp))) {
+        error = xfs_qm_init_quotainfo(mp);
+        if (error) {
                /*
                 * We must turn off quotas.
                 */
@@ -344,12 +347,11 @@ xfs_qm_mount_quotas(
         * If any of the quotas are not consistent, do a quotacheck.
         */
        if (XFS_QM_NEED_QUOTACHECK(mp) &&
-                !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
+            !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
-                if ((error = xfs_qm_quotacheck(mp))) {
+                error = xfs_qm_quotacheck(mp);
-                        /* Quotacheck has failed and quotas have
+                if (error) {
-                         * been disabled.
+                        /* Quotacheck failed and disabled quotas. */
-                         */
+                        return;
-                        return XFS_ERROR(error);
                }
        }
        /* 
@@ -357,12 +359,10 @@ xfs_qm_mount_quotas(
         * quotachecked status, since we won't be doing accounting for
         * that type anymore.
         */
-        if (!XFS_IS_UQUOTA_ON(mp)) {
+        if (!XFS_IS_UQUOTA_ON(mp))
                mp->m_qflags &= ~XFS_UQUOTA_CHKD;
-        }
+        if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
-        if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) {
                mp->m_qflags &= ~XFS_OQUOTA_CHKD;
-        }
 write_changes:
        /*
@@ -392,7 +392,7 @@ xfs_qm_mount_quotas(
                xfs_fs_cmn_err(CE_WARN, mp,
                        "Failed to initialize disk quotas.");
        }
-        return XFS_ERROR(error);
+        return;
 }
 /*
@@ -1405,13 +1405,13 @@ xfs_qm_qino_alloc(
 #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
                unsigned oldv = mp->m_sb.sb_versionnum;
 #endif
-                ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb));
+                ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
                ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
                                   XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
                       (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
                        XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
-                XFS_SB_VERSION_ADDQUOTA(&mp->m_sb);
+                xfs_sb_version_addquota(&mp->m_sb);
                mp->m_sb.sb_uquotino = NULLFSINO;
                mp->m_sb.sb_gquotino = NULLFSINO;
@@ -1438,7 +1438,7 @@ xfs_qm_qino_alloc(
 }
-STATIC int
+STATIC void
 xfs_qm_reset_dqcounts(
        xfs_mount_t     *mp,
        xfs_buf_t       *bp,
@@ -1478,8 +1478,6 @@ xfs_qm_reset_dqcounts(
                ddq->d_rtbwarns = 0;
                ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
        }
-        return 0;
 }
 STATIC int
@@ -1520,7 +1518,7 @@ xfs_qm_dqiter_bufs(
                if (error)
                        break;
-                (void) xfs_qm_reset_dqcounts(mp, bp, firstid, type);
+                xfs_qm_reset_dqcounts(mp, bp, firstid, type);
                xfs_bdwrite(mp, bp);
                /*
                 * goto the next block.
@@ -1810,7 +1808,7 @@ xfs_qm_dqusage_adjust(
         * Now release the inode. This will send it to 'inactive', and
         * possibly even free blocks.
         */
-        VN_RELE(XFS_ITOV(ip));
+        IRELE(ip);
        /*
         * Goto next inode.
@@ -1880,6 +1878,14 @@ xfs_qm_quotacheck(
        } while (! done);
        /*
+         * We've made all the changes that we need to make incore.
+         * Flush them down to disk buffers if everything was updated
+         * successfully.
+         */
+        if (!error)
+                error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
+        /*
         * We can get this error if we couldn't do a dquot allocation inside
         * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
         * dirty dquots that might be cached, we just want to get rid of them
@@ -1890,11 +1896,6 @@ xfs_qm_quotacheck(
                xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
                goto error_return;
        }
-        /*
-         * We've made all the changes that we need to make incore.
-         * Now flush_them down to disk buffers.
-         */
-        xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
        /*
         * We didn't log anything, because if we crashed, we'll have to
@@ -1926,7 +1927,10 @@ xfs_qm_quotacheck(
                ASSERT(mp->m_quotainfo != NULL);
                ASSERT(xfs_Gqm != NULL);
                xfs_qm_destroy_quotainfo(mp);
-                (void)xfs_mount_reset_sbqflags(mp);
+                if (xfs_mount_reset_sbqflags(mp)) {
+                        cmn_err(CE_WARN, "XFS quotacheck %s: "
+                                "Failed to reset quota flags.", mp->m_fsname);
+                }
        } else {
                cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
        }
@@ -1954,7 +1958,7 @@ xfs_qm_init_quotainos(
        /*
         * Get the uquota and gquota inodes
         */
-        if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
+        if (xfs_sb_version_hasquota(&mp->m_sb)) {
                if (XFS_IS_UQUOTA_ON(mp) &&
                    mp->m_sb.sb_uquotino != NULLFSINO) {
                        ASSERT(mp->m_sb.sb_uquotino > 0);
@@ -1968,7 +1972,7 @@ xfs_qm_init_quotainos(
                        if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
                                             0, 0, &gip, 0))) {
                                if (uip)
-                                        VN_RELE(XFS_ITOV(uip));
+                                        IRELE(uip);
                                return XFS_ERROR(error);
                        }
                }
@@ -1999,7 +2003,7 @@ xfs_qm_init_quotainos(
                                          sbflags | XFS_SB_GQUOTINO, flags);
                if (error) {
                        if (uip)
-                                VN_RELE(XFS_ITOV(uip));
+                                IRELE(uip);
                        return XFS_ERROR(error);
                }
@@ -2093,12 +2097,17 @@ xfs_qm_shake_freelist(
                 * dirty dquots.
                 */
                if (XFS_DQ_IS_DIRTY(dqp)) {
+                        int     error;
                        xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
                        /*
                         * We flush it delayed write, so don't bother
                         * releasing the mplock.
                         */
-                        (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+                        error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+                        if (error) {
+                                xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+                        "xfs_qm_dqflush_all: dquot %p flush failed", dqp);
+                        }
                        xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
                        dqp = dqp->dq_flnext;
                        continue;
@@ -2265,12 +2274,17 @@ xfs_qm_dqreclaim_one(void)
                 * dirty dquots.
                 */
                if (XFS_DQ_IS_DIRTY(dqp)) {
+                        int     error;
                        xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
                        /*
                         * We flush it delayed write, so don't bother
                         * releasing the freelist lock.
                         */
-                        (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+                        error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+                        if (error) {
+                                xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
+                        "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
+                        }
                        xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
                        continue;
                }
@@ -2378,9 +2392,9 @@ xfs_qm_write_sb_changes(
        }
        xfs_mod_sb(tp, flags);
-        (void) xfs_trans_commit(tp, 0);
+        error = xfs_trans_commit(tp, 0);
-        return 0;
+        return error;
 }
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index baf537c1c177..cd2300e374af 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_RELE(xqm)        ((xqm)->qm_nrefs--)
 extern void             xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int              xfs_qm_mount_quotas(xfs_mount_t *, int);
+extern void             xfs_qm_mount_quotas(xfs_mount_t *, int);
 extern int              xfs_qm_quotacheck(xfs_mount_t *);
 extern void             xfs_qm_unmount_quotadestroy(xfs_mount_t *);
 extern int              xfs_qm_unmount_quotas(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 97bb32937585..f4f6c4c861d7 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -118,7 +118,7 @@ xfs_qm_newmount(
        *quotaflags = 0;
        *needquotamount = B_FALSE;
-        quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
+        quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
                                (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
        if (quotaondisk) {
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
index a50ffabcf554..5b964fc0dc09 100644
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ b/fs/xfs/quota/xfs_qm_stats.h
@@ -45,8 +45,8 @@ extern void xfs_qm_cleanup_procfs(void);
 # define XQM_STATS_INC(count)   do { } while (0)
-static __inline void xfs_qm_init_procfs(void) { };
+static inline void xfs_qm_init_procfs(void) { };
-static __inline void xfs_qm_cleanup_procfs(void) { };
+static inline void xfs_qm_cleanup_procfs(void) { };
 #endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 2cc5886cfe85..8342823dbdc3 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -279,9 +279,12 @@ xfs_qm_scall_quotaoff(
        /*
         * Write the LI_QUOTAOFF log record, and do SB changes atomically,
-         * and synchronously.
+         * and synchronously. If we fail to write, we should abort the
+         * operation as it cannot be recovered safely if we crash.
         */
-        xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+        error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+        if (error)
+                goto out_error;
        /*
         * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -337,7 +340,12 @@ xfs_qm_scall_quotaoff(
         * So, we have QUOTAOFF start and end logitems; the start
         * logitem won't get overwritten until the end logitem appears...
         */
-        xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+        error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+        if (error) {
+                /* We're screwed now. Shutdown is the only option. */
+                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+                goto out_error;
+        }
        /*
         * If quotas is completely disabled, close shop.
@@ -361,6 +369,7 @@ xfs_qm_scall_quotaoff(
                XFS_PURGE_INODE(XFS_QI_GQIP(mp));
                XFS_QI_GQIP(mp) = NULL;
        }
+out_error:
        mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
        return (error);
@@ -371,35 +380,34 @@ xfs_qm_scall_trunc_qfiles(
        xfs_mount_t     *mp,
        uint            flags)
 {
-        int             error;
+        int             error = 0, error2 = 0;
        xfs_inode_t     *qip;
        if (!capable(CAP_SYS_ADMIN))
                return XFS_ERROR(EPERM);
-        error = 0;
+        if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
-        if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) {
                qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
                return XFS_ERROR(EINVAL);
        }
        if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
                error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
-                if (! error) {
+                if (!error) {
-                        (void) xfs_truncate_file(mp, qip);
+                        error = xfs_truncate_file(mp, qip);
-                        VN_RELE(XFS_ITOV(qip));
+                        IRELE(qip);
                }
        }
        if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
            mp->m_sb.sb_gquotino != NULLFSINO) {
-                error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
+                error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
-                if (! error) {
+                if (!error2) {
-                        (void) xfs_truncate_file(mp, qip);
+                        error2 = xfs_truncate_file(mp, qip);
-                        VN_RELE(XFS_ITOV(qip));
+                        IRELE(qip);
                }
        }
-        return (error);
+        return error ? error : error2;
 }
@@ -522,7 +530,7 @@ xfs_qm_scall_getqstat(
        memset(out, 0, sizeof(fs_quota_stat_t));
        out->qs_version = FS_QSTAT_VERSION;
-        if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
+        if (!xfs_sb_version_hasquota(&mp->m_sb)) {
                out->qs_uquota.qfs_ino = NULLFSINO;
                out->qs_gquota.qfs_ino = NULLFSINO;
                return (0);
@@ -552,13 +560,13 @@ xfs_qm_scall_getqstat(
                out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
                out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
                if (tempuqip)
-                        VN_RELE(XFS_ITOV(uip));
+                        IRELE(uip);
        }
        if (gip) {
                out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
                out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
                if (tempgqip)
-                        VN_RELE(XFS_ITOV(gip));
+                        IRELE(gip);
        }
        if (mp->m_quotainfo) {
                out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
@@ -726,12 +734,12 @@ xfs_qm_scall_setqlim(
        xfs_trans_log_dquot(tp, dqp);
        xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
-        xfs_trans_commit(tp, 0);
+        error = xfs_trans_commit(tp, 0);
        xfs_qm_dqprint(dqp);
        xfs_qm_dqrele(dqp);
        mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-        return (0);
+        return error;
 }
 STATIC int
@@ -1095,7 +1103,7 @@ again:
                 * inactive code in hell.
                 */
                if (vnode_refd)
-                        VN_RELE(vp);
+                        IRELE(ip);
                XFS_MOUNT_ILOCK(mp);
                /*
                 * If an inode was inserted or removed, we gotta
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 129067cfcb86..0b75d302508f 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -24,7 +24,7 @@ static int          ktrace_zentries;
 void __init
 ktrace_init(int zentries)
 {
-        ktrace_zentries = zentries;
+        ktrace_zentries = roundup_pow_of_two(zentries);
        ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
                                        "ktrace_hdr");
@@ -47,13 +47,16 @@ ktrace_uninit(void)
 * ktrace_alloc()
 *
 * Allocate a ktrace header and enough buffering for the given
- * number of entries.
+ * number of entries. Round the number of entries up to a
+ * power of 2 so we can do fast masking to get the index from
+ * the atomic index counter.
 */
 ktrace_t *
 ktrace_alloc(int nentries, unsigned int __nocast sleep)
 {
        ktrace_t        *ktp;
        ktrace_entry_t  *ktep;
+        int             entries;
        ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
@@ -70,11 +73,12 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
        /*
         * Special treatment for buffers with the ktrace_zentries entries
         */
-        if (nentries == ktrace_zentries) {
+        entries = roundup_pow_of_two(nentries);
+        if (entries == ktrace_zentries) {
                ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
                                                            sleep);
        } else {
-                ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
+                ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)),
                                                            sleep | KM_LARGE);
        }
@@ -91,8 +95,10 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
        }
        ktp->kt_entries  = ktep;
-        ktp->kt_nentries = nentries;
+        ktp->kt_nentries = entries;
-        ktp->kt_index    = 0;
+        ASSERT(is_power_of_2(entries));
+        ktp->kt_index_mask = entries - 1;
+        atomic_set(&ktp->kt_index, 0);
        ktp->kt_rollover = 0;
        return ktp;
 }
@@ -151,8 +157,6 @@ ktrace_enter(
        void            *val14,
        void            *val15)
 {
-        static DEFINE_SPINLOCK(wrap_lock);
-        unsigned long   flags;
        int             index;
        ktrace_entry_t  *ktep;
@@ -161,12 +165,8 @@ ktrace_enter(
        /*
         * Grab an entry by pushing the index up to the next one.
         */
-        spin_lock_irqsave(&wrap_lock, flags);
+        index = atomic_add_return(1, &ktp->kt_index);
-        index = ktp->kt_index;
+        index = (index - 1) & ktp->kt_index_mask;
-        if (++ktp->kt_index == ktp->kt_nentries)
-                ktp->kt_index = 0;
-        spin_unlock_irqrestore(&wrap_lock, flags);
        if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
                ktp->kt_rollover = 1;
@@ -199,11 +199,12 @@ int
 ktrace_nentries(
        ktrace_t        *ktp)
 {
-        if (ktp == NULL) {
+        int     index;  /* kt_index reduced to a slot number via kt_index_mask */
+        if (ktp == NULL)
                return 0;
-        }
-        return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
+        index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;     /* kt_index is not wrapped on increment, so mask it here */
+        return (ktp->kt_rollover ? ktp->kt_nentries : index);  /* whole buffer once kt_rollover is set, else the masked index */
 }
 /*
@@ -228,7 +229,7 @@ ktrace_first(ktrace_t   *ktp, ktrace_snap_t     *ktsp)
        int             nentries;
        if (ktp->kt_rollover)
-                index = ktp->kt_index;
+                index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
        else
                index = 0;
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
index 56e72b40a859..741d6947ca60 100644
--- a/fs/xfs/support/ktrace.h
+++ b/fs/xfs/support/ktrace.h
@@ -30,7 +30,8 @@ typedef struct ktrace_entry {
 */
 typedef struct ktrace {
        int             kt_nentries;    /* number of entries in trace buf */
-        int             kt_index;       /* current index in entries */
+        atomic_t        kt_index;       /* entry counter, incremented atomically; mask with kt_index_mask to get the slot */
+        unsigned int    kt_index_mask;  /* kt_nentries - 1 (kt_nentries is rounded up to a power of 2) */
        int             kt_rollover;
        ktrace_entry_t  *kt_entries;    /* buffer of entries */
 } ktrace_t;
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c989825..765aaf65e2d3 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -22,7 +22,7 @@
 #define STATIC
 #define DEBUG 1
 #define XFS_BUF_LOCK_TRACKING 1
-/* #define QUOTADEBUG 1 */
+#define QUOTADEBUG 1
 #endif
 #ifdef CONFIG_XFS_TRACE
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 7272fe39a92d..8e130b9720ae 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -307,12 +307,13 @@ xfs_acl_vset(
        VN_HOLD(vp);
        error = xfs_acl_allow_set(vp, kind);
-        if (error)
-                goto out;
        /* Incoming ACL exists, set file mode based on its value */
-        if (kind == _ACL_TYPE_ACCESS)
+        if (!error && kind == _ACL_TYPE_ACCESS)
-                xfs_acl_setmode(vp, xfs_acl, &basicperms);
+                error = xfs_acl_setmode(vp, xfs_acl, &basicperms);
+        if (error)
+                goto out;
        /*
         * If we have more than std unix permissions, set up the actual attr.
@@ -323,7 +324,7 @@ xfs_acl_vset(
        if (!basicperms) {
                xfs_acl_set_attr(vp, xfs_acl, kind, &error);
        } else {
-                xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
+                error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
        }
 out:
@@ -707,7 +708,9 @@ xfs_acl_inherit(
        memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
        xfs_acl_filter_mode(mode, cacl);
-        xfs_acl_setmode(vp, cacl, &basicperms);
+        error = xfs_acl_setmode(vp, cacl, &basicperms);
+        if (error)
+                goto out_error;
        /*
         * Set the Default and Access ACL on the file.  The mode is already
@@ -720,6 +723,7 @@ xfs_acl_inherit(
                xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
        if (!error && !basicperms)
                xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
+out_error:
        _ACL_FREE(cacl);
        return error;
 }
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bdbfbbee4959..1956f83489f1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -45,7 +45,7 @@
 #define XFSA_FIXUP_BNO_OK       1
 #define XFSA_FIXUP_CNT_OK       2
-STATIC int
+STATIC void
 xfs_alloc_search_busy(xfs_trans_t *tp,
                    xfs_agnumber_t agno,
                    xfs_agblock_t bno,
@@ -55,24 +55,24 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 ktrace_t *xfs_alloc_trace_buf;
 #define TRACE_ALLOC(s,a)        \
-        xfs_alloc_trace_alloc(__FUNCTION__, s, a, __LINE__)
+        xfs_alloc_trace_alloc(__func__, s, a, __LINE__)
 #define TRACE_FREE(s,a,b,x,f)   \
-        xfs_alloc_trace_free(__FUNCTION__, s, mp, a, b, x, f, __LINE__)
+        xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__)
 #define TRACE_MODAGF(s,a,f)     \
-        xfs_alloc_trace_modagf(__FUNCTION__, s, mp, a, f, __LINE__)
+        xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__)
-#define TRACE_BUSY(__FUNCTION__,s,ag,agb,l,sl,tp)       \
+#define TRACE_BUSY(__func__,s,ag,agb,l,sl,tp)   \
-        xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
+        xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
-#define TRACE_UNBUSY(__FUNCTION__,s,ag,sl,tp)   \
+#define TRACE_UNBUSY(__func__,s,ag,sl,tp)       \
-        xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
+        xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
-#define TRACE_BUSYSEARCH(__FUNCTION__,s,ag,agb,l,sl,tp) \
+#define TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp)        \
-        xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
+        xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
 #else
 #define TRACE_ALLOC(s,a)
 #define TRACE_FREE(s,a,b,x,f)
 #define TRACE_MODAGF(s,a,f)
 #define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
 #define TRACE_UNBUSY(fname,s,ag,sl,tp)
-#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
+#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp)
 #endif  /* XFS_ALLOC_TRACE */
 /*
@@ -93,7 +93,7 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
 * Compute aligned version of the found extent.
 * Takes alignment and min length into account.
 */
-STATIC int                              /* success (>= minlen) */
+STATIC void
 xfs_alloc_compute_aligned(
        xfs_agblock_t   foundbno,       /* starting block in found extent */
        xfs_extlen_t    foundlen,       /* length in found extent */
@@ -116,7 +116,6 @@ xfs_alloc_compute_aligned(
        }
        *resbno = bno;
        *reslen = len;
-        return len >= minlen;
 }
 /*
@@ -837,9 +836,9 @@ xfs_alloc_ag_vextent_near(
                        if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                        if (!xfs_alloc_compute_aligned(ltbno, ltlen,
+                        xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
-                                        args->alignment, args->minlen,
+                                        args->minlen, &ltbnoa, &ltlena);
-                                        &ltbnoa, &ltlena))
+                        if (ltlena < args->minlen)
                                continue;
                        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
                        xfs_alloc_fix_len(args);
@@ -958,9 +957,9 @@ xfs_alloc_ag_vextent_near(
                        if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                        if (xfs_alloc_compute_aligned(ltbno, ltlen,
+                        xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
-                                        args->alignment, args->minlen,
+                                        args->minlen, &ltbnoa, &ltlena);
-                                        &ltbnoa, &ltlena))
+                        if (ltlena >= args->minlen)
                                break;
                        if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
                                goto error0;
@@ -974,9 +973,9 @@ xfs_alloc_ag_vextent_near(
                        if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                        if (xfs_alloc_compute_aligned(gtbno, gtlen,
+                        xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment,
-                                        args->alignment, args->minlen,
+                                        args->minlen, &gtbnoa, &gtlena);
-                                        &gtbnoa, &gtlena))
+                        if (gtlena >= args->minlen)
                                break;
                        if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
                                goto error0;
@@ -2562,9 +2561,10 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
 /*
- * returns non-zero if any of (agno,bno):len is in a busy list
+ * If we find the extent in the busy list, force the log out to get the
+ * extent out of the busy list so the caller can use it straight away.
 */
-STATIC int
+STATIC void
 xfs_alloc_search_busy(xfs_trans_t *tp,
                    xfs_agnumber_t agno,
                    xfs_agblock_t bno,
@@ -2572,7 +2572,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 {
        xfs_mount_t             *mp;
        xfs_perag_busy_t        *bsy;
-        int                     n;
        xfs_agblock_t           uend, bend;
        xfs_lsn_t               lsn;
        int                     cnt;
@@ -2585,21 +2584,18 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
        uend = bno + len - 1;
        /* search pagb_list for this slot, skipping open slots */
-        for (bsy = mp->m_perag[agno].pagb_list, n = 0;
+        for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
-             cnt; bsy++, n++) {
                /*
                 * (start1,length1) within (start2, length2)
                 */
                if (bsy->busy_tp != NULL) {
                        bend = bsy->busy_start + bsy->busy_length - 1;
-                        if ((bno > bend) ||
+                        if ((bno > bend) || (uend < bsy->busy_start)) {
-                            (uend < bsy->busy_start)) {
                                cnt--;
                        } else {
                                TRACE_BUSYSEARCH("xfs_alloc_search_busy",
-                                                 "found1", agno, bno, len, n,
+                                         "found1", agno, bno, len, tp);
-                                                 tp);
                                break;
                        }
                }
@@ -2610,15 +2606,12 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
         * transaction that freed the block
         */
        if (cnt) {
-                TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
+                TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
                lsn = bsy->busy_tp->t_commit_lsn;
                spin_unlock(&mp->m_perag[agno].pagb_lock);
                xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
        } else {
-                TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
+                TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
-                n = -1;
                spin_unlock(&mp->m_perag[agno].pagb_lock);
        }
-        return n;
 }
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e58f321fdae9..36d781ee5fcc 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2647,14 +2647,6 @@ attr_trusted_capable(
 }
 STATIC int
-attr_secure_capable(
-        bhv_vnode_t     *vp,
-        cred_t          *cred)
-{
-        return -ENOSECURITY;
-}
-STATIC int
 attr_system_set(
        bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
@@ -2724,7 +2716,7 @@ struct attrnames attr_secure = {
        .attr_get       = attr_generic_get,
        .attr_set       = attr_generic_set,
        .attr_remove    = attr_generic_remove,
-        .attr_capable   = attr_secure_capable,
+        .attr_capable   = (attrcapable_t)fs_noerr,
 };
 struct attrnames attr_user = {
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index b08e2a2a8add..303d41e4217b 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -166,7 +166,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
        if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
                if (bytes <= XFS_IFORK_ASIZE(dp))
-                        return mp->m_attroffset >> 3;
+                        return dp->i_d.di_forkoff;
                return 0;
        }
@@ -227,10 +227,10 @@ STATIC void
 xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
 {
        if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
-            !(XFS_SB_VERSION_HASATTR2(&mp->m_sb))) {
+            !(xfs_sb_version_hasattr2(&mp->m_sb))) {
                spin_lock(&mp->m_sb_lock);
-                if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb)) {
+                if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
-                        XFS_SB_VERSION_ADDATTR2(&mp->m_sb);
+                        xfs_sb_version_addattr2(&mp->m_sb);
                        spin_unlock(&mp->m_sb_lock);
                        xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
                } else
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index 48228848f5ae..fab0b6d5a41b 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -25,6 +25,109 @@
 * XFS bit manipulation routines, used in non-realtime code.
 */
+#ifndef HAVE_ARCH_HIGHBIT
+/*
+ * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
+ *
+ * Lookup table indexed by a byte value (0x00..0xff). It is only
+ * compiled when HAVE_ARCH_HIGHBIT is not defined. xfs_highbit32()
+ * only indexes it with a byte it has already found to be non-zero,
+ * so the -1 stored in entry 0 is never read by that caller.
+ *
+ * NOTE(review): the element type is plain char, whose signedness is
+ * implementation-defined; confirm nothing depends on entry 0 reading
+ * back as -1 where char is unsigned.
+ */
+static const char xfs_highbit[256] = {
+       -1, 0, 1, 1, 2, 2, 2, 2,                 /* 00 .. 07 */
+        3, 3, 3, 3, 3, 3, 3, 3,                 /* 08 .. 0f */
+        4, 4, 4, 4, 4, 4, 4, 4,                 /* 10 .. 17 */
+        4, 4, 4, 4, 4, 4, 4, 4,                 /* 18 .. 1f */
+        5, 5, 5, 5, 5, 5, 5, 5,                 /* 20 .. 27 */
+        5, 5, 5, 5, 5, 5, 5, 5,                 /* 28 .. 2f */
+        5, 5, 5, 5, 5, 5, 5, 5,                 /* 30 .. 37 */
+        5, 5, 5, 5, 5, 5, 5, 5,                 /* 38 .. 3f */
+        6, 6, 6, 6, 6, 6, 6, 6,                 /* 40 .. 47 */
+        6, 6, 6, 6, 6, 6, 6, 6,                 /* 48 .. 4f */
+        6, 6, 6, 6, 6, 6, 6, 6,                 /* 50 .. 57 */
+        6, 6, 6, 6, 6, 6, 6, 6,                 /* 58 .. 5f */
+        6, 6, 6, 6, 6, 6, 6, 6,                 /* 60 .. 67 */
+        6, 6, 6, 6, 6, 6, 6, 6,                 /* 68 .. 6f */
+        6, 6, 6, 6, 6, 6, 6, 6,                 /* 70 .. 77 */
+        6, 6, 6, 6, 6, 6, 6, 6,                 /* 78 .. 7f */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* 80 .. 87 */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* 88 .. 8f */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* 90 .. 97 */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* 98 .. 9f */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* a0 .. a7 */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* a8 .. af */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* b0 .. b7 */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* b8 .. bf */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* c0 .. c7 */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* c8 .. cf */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* d0 .. d7 */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* d8 .. df */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* e0 .. e7 */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* e8 .. ef */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* f0 .. f7 */
+        7, 7, 7, 7, 7, 7, 7, 7,                 /* f8 .. ff */
+};
+#endif
+/*
+ * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
+ *
+ * With HAVE_ARCH_HIGHBIT defined this just returns highbit32(v).
+ * Otherwise it selects the most significant non-zero byte of v
+ * (bit offset i = 24, 16, 8 or 0) and adds the xfs_highbit[] entry
+ * for that byte to the offset. v == 0 returns -1 before the table
+ * is consulted.
+ */
+inline int
+xfs_highbit32(
+        __uint32_t      v)
+{
+#ifdef HAVE_ARCH_HIGHBIT
+        return highbit32(v);
+#else
+        int             i;      /* bit offset of the highest non-zero byte */
+        if (v & 0xffff0000)     /* a bit is set in the upper 16 bits */
+                if (v & 0xff000000)
+                        i = 24;
+                else
+                        i = 16;
+        else if (v & 0x0000ffff)        /* only the lower 16 bits have bits set */
+                if (v & 0x0000ff00)
+                        i = 8;
+                else
+                        i = 0;
+        else
+                return -1;      /* v == 0: no bit set */
+        return i + xfs_highbit[(v >> i) & 0xff];        /* selected byte is non-zero here */
+#endif
+}
+/*
+ * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
+ *
+ * The low 32 bits are checked first; the high 32 bits are examined
+ * only when the low word is zero, and 32 is added to that result.
+ * ffs() results are 1-based, hence the final "n - 1"; n stays 0 when
+ * v is zero, which yields the -1 return.
+ */
+int
+xfs_lowbit64(
+        __uint64_t      v)
+{
+        __uint32_t      w = (__uint32_t)v;      /* 32-bit half currently being examined */
+        int             n = 0;  /* 1-based bit position, 0 if no bit found */
+        if (w) {        /* lower bits */
+                n = ffs(w);
+        } else {        /* upper bits */
+                w = (__uint32_t)(v >> 32);
+                if (w && (n = ffs(w)))
+                        n += 32;
+        }
+        return n - 1;   /* convert to 0-based; -1 when v == 0 */
+}
+/*
+ * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
+ *
+ * Calls xfs_highbit32() on the upper 32 bits when any of them is set
+ * (adding 32 to the result), otherwise on the lower 32 bits.
+ */
+int
+xfs_highbit64(
+        __uint64_t      v)
+{
+        __uint32_t      h = (__uint32_t)(v >> 32);      /* upper 32 bits of v */
+        if (h)
+                return xfs_highbit32(h) + 32;
+        return xfs_highbit32((__uint32_t)v);    /* upper half is zero; result is -1 if v == 0 */
+}
 /*
 * Return whether bitmap is empty.
 * Size is number of words in the bitmap, which is padded to word boundary
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 325a007dec91..082641a9782c 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -47,30 +47,13 @@ static inline __uint64_t xfs_mask64lo(int n)
 }
 /* Get high bit set out of 32-bit argument, -1 if none set */
-static inline int xfs_highbit32(__uint32_t v)
+extern int xfs_highbit32(__uint32_t v);
-{
-        return fls(v) - 1;
-}
-/* Get high bit set out of 64-bit argument, -1 if none set */
-static inline int xfs_highbit64(__uint64_t v)
-{
-        return fls64(v) - 1;
-}
-/* Get low bit set out of 32-bit argument, -1 if none set */
-static inline int xfs_lowbit32(__uint32_t v)
-{
-        __uint32_t t = v;
-        return (t) ? find_first_bit((unsigned long *)&t, 32) : -1;
-}
 /* Get low bit set out of 64-bit argument, -1 if none set */
-static inline int xfs_lowbit64(__uint64_t v)
+extern int xfs_lowbit64(__uint64_t v);
-{
-        __uint64_t t = v;
+/* Get high bit set out of 64-bit argument, -1 if none set */
-        return (t) ? find_first_bit((unsigned long *)&t, 64) : -1;
+extern int xfs_highbit64(__uint64_t);
-}
 /* Return whether bitmap is empty (1 == empty) */
 extern int xfs_bitmap_empty(uint *map, uint size);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 1c0a5a585a82..eb198c01c35d 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -323,13 +323,13 @@ xfs_bmap_trace_pre_update(
        int             whichfork);     /* data or attr fork */
 #define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)       \
-        xfs_bmap_trace_delete(__FUNCTION__,d,ip,i,c,w)
+        xfs_bmap_trace_delete(__func__,d,ip,i,c,w)
 #define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \
-        xfs_bmap_trace_insert(__FUNCTION__,d,ip,i,c,r1,r2,w)
+        xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w)
 #define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w)    \
-        xfs_bmap_trace_post_update(__FUNCTION__,d,ip,i,w)
+        xfs_bmap_trace_post_update(__func__,d,ip,i,w)
 #define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w)     \
-        xfs_bmap_trace_pre_update(__FUNCTION__,d,ip,i,w)
+        xfs_bmap_trace_pre_update(__func__,d,ip,i,w)
 #else
 #define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)
 #define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)
@@ -2402,7 +2402,7 @@ xfs_bmap_extsize_align(
 #define XFS_ALLOC_GAP_UNITS     4
-STATIC int
+STATIC void
 xfs_bmap_adjacent(
        xfs_bmalloca_t  *ap)            /* bmap alloc argument struct */
 {
@@ -2548,7 +2548,6 @@ xfs_bmap_adjacent(
                        ap->rval = gotbno;
        }
 #undef ISVALID
-        return 0;
 }
 STATIC int
@@ -4047,17 +4046,17 @@ xfs_bmap_add_attrfork(
                xfs_trans_log_inode(tp, ip, logflags);
        if (error)
                goto error2;
-        if (!XFS_SB_VERSION_HASATTR(&mp->m_sb) ||
+        if (!xfs_sb_version_hasattr(&mp->m_sb) ||
-           (!XFS_SB_VERSION_HASATTR2(&mp->m_sb) && version == 2)) {
+           (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
                __int64_t sbfields = 0;
                spin_lock(&mp->m_sb_lock);
-                if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
+                if (!xfs_sb_version_hasattr(&mp->m_sb)) {
-                        XFS_SB_VERSION_ADDATTR(&mp->m_sb);
+                        xfs_sb_version_addattr(&mp->m_sb);
                        sbfields |= XFS_SB_VERSIONNUM;
                }
-                if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb) && version == 2) {
+                if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
-                        XFS_SB_VERSION_ADDATTR2(&mp->m_sb);
+                        xfs_sb_version_addattr2(&mp->m_sb);
                        sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
                }
                if (sbfields) {
@@ -4154,16 +4153,21 @@ xfs_bmap_compute_maxlevels(
         * number of leaf entries, is controlled by the type of di_nextents
         * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
         * (a signed 16-bit number, xfs_aextnum_t).
+         *
+         * Note that in ATTR1 mode we can no longer assume that the fork
+         * offset of every inode is (m_attroffset >> 3): the filesystem
+         * could have been mounted with ATTR2 and then mounted back
+         * with ATTR1, keeping each di_forkoff fixed but probably at
+         * various positions. Therefore, for both ATTR1 and ATTR2,
+         * we have to assume the worst case scenario of a minimum size
+         * available.
         */
        if (whichfork == XFS_DATA_FORK) {
                maxleafents = MAXEXTNUM;
-                sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
+                sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
-                        XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset;
        } else {
                maxleafents = MAXAEXTNUM;
-                sz = (mp->m_flags & XFS_MOUNT_ATTR2) ?
+                sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
-                        XFS_BMDR_SPACE_CALC(MINABTPTRS) :
-                        mp->m_sb.sb_inodesize - mp->m_attroffset;
        }
        maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
        minleafrecs = mp->m_bmap_dmnr[0];
@@ -5043,7 +5047,7 @@ xfs_bmapi(
                         * A wasdelay extent has been initialized, so
                         * shouldn't be flagged as unwritten.
                         */
-                        if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+                        if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) {
                                if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
                                        got.br_state = XFS_EXT_UNWRITTEN;
                        }
@@ -5483,7 +5487,7 @@ xfs_bunmapi(
                         * get rid of part of a realtime extent.
                         */
                        if (del.br_state == XFS_EXT_UNWRITTEN ||
-                            !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+                            !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
                                /*
                                 * This piece is unwritten, or we're not
                                 * using unwritten extents.  Skip over it.
@@ -5535,7 +5539,7 @@ xfs_bunmapi(
                        } else if ((del.br_startoff == start &&
                                    (del.br_state == XFS_EXT_UNWRITTEN ||
                                     xfs_trans_get_block_res(tp) == 0)) ||
-                                   !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+                                   !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
                                /*
                                 * Can't make it unwritten.  There isn't
                                 * a full extent here so just skip it.
@@ -5772,7 +5776,6 @@ xfs_getbmap(
        int                     error;          /* return value */
        __int64_t               fixlen;         /* length for -1 case */
        int                     i;              /* extent number */
-        bhv_vnode_t             *vp;            /* corresponding vnode */
        int                     lock;           /* lock state */
        xfs_bmbt_irec_t         *map;           /* buffer for user's data */
        xfs_mount_t             *mp;            /* file system mount point */
@@ -5789,7 +5792,6 @@ xfs_getbmap(
        int                     bmapi_flags;    /* flags for xfs_bmapi */
        __int32_t               oflags;         /* getbmapx bmv_oflags field */
-        vp = XFS_ITOV(ip);
        mp = ip->i_mount;
        whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
@@ -5811,7 +5813,7 @@ xfs_getbmap(
        if ((interface & BMV_IF_NO_DMAPI_READ) == 0 &&
            DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
            whichfork == XFS_DATA_FORK) {
-                error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL);
+                error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
                if (error)
                        return XFS_ERROR(error);
        }
@@ -5869,6 +5871,10 @@ xfs_getbmap(
                /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
                error = xfs_flush_pages(ip, (xfs_off_t)0,
                                               -1, 0, FI_REMAPF);
+                if (error) {
+                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+                return error;
+                }
        }
        ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
@@ -6162,10 +6168,10 @@ xfs_check_block(
                        }
                        if (*thispa == *pp) {
                                cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
-                                        __FUNCTION__, j, i,
+                                        __func__, j, i,
                                        (unsigned long long)be64_to_cpu(*thispa));
                                panic("%s: ptrs are equal in node\n",
-                                        __FUNCTION__);
+                                        __func__);
                        }
                }
        }
@@ -6192,7 +6198,7 @@ xfs_bmap_check_leaf_extents(
        xfs_mount_t             *mp;    /* file system mount structure */
        __be64                  *pp;    /* pointer to block address */
        xfs_bmbt_rec_t          *ep;    /* pointer to current extent */
-        xfs_bmbt_rec_t          *lastp; /* pointer to previous extent */
+        xfs_bmbt_rec_t          last = {0, 0}; /* last extent in prev block */
        xfs_bmbt_rec_t          *nextp; /* pointer to next extent */
        int                     bp_release = 0;
@@ -6262,7 +6268,6 @@ xfs_bmap_check_leaf_extents(
        /*
         * Loop over all leaf nodes checking that all extents are in the right order.
         */
-        lastp = NULL;
        for (;;) {
                xfs_fsblock_t   nextbno;
                xfs_extnum_t    num_recs;
@@ -6283,18 +6288,16 @@ xfs_bmap_check_leaf_extents(
                 */
                ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+                if (i) {
+                        xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep);
+                }
                for (j = 1; j < num_recs; j++) {
                        nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1);
-                        if (lastp) {
+                        xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp);
-                                xfs_btree_check_rec(XFS_BTNUM_BMAP,
-                                        (void *)lastp, (void *)ep);
-                        }
-                        xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
-                                (void *)(nextp));
-                        lastp = ep;
                        ep = nextp;
                }
+                last = *ep;
                i += num_recs;
                if (bp_release) {
                        bp_release = 0;
@@ -6325,13 +6328,13 @@ xfs_bmap_check_leaf_extents(
        return;
 error0:
-        cmn_err(CE_WARN, "%s: at error0", __FUNCTION__);
+        cmn_err(CE_WARN, "%s: at error0", __func__);
        if (bp_release)
                xfs_trans_brelse(NULL, bp);
 error_norelse:
        cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents",
-                __FUNCTION__, i);
+                __func__, i);
-        panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__);
+        panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
        return;
 }
 #endif
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 87224b7d7984..6ff70cda451c 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -151,7 +151,7 @@ xfs_bmap_trace_exlist(
        xfs_extnum_t            cnt,            /* count of entries in list */
        int                     whichfork);     /* data or attr fork */
 #define XFS_BMAP_TRACE_EXLIST(ip,c,w)   \
-        xfs_bmap_trace_exlist(__FUNCTION__,ip,c,w)
+        xfs_bmap_trace_exlist(__func__,ip,c,w)
 #else
 #define XFS_BMAP_TRACE_EXLIST(ip,c,w)
 #endif
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bd18987326a3..4f0e849d973e 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -275,21 +275,21 @@ xfs_bmbt_trace_cursor(
 }
 #define XFS_BMBT_TRACE_ARGBI(c,b,i)     \
-        xfs_bmbt_trace_argbi(__FUNCTION__, c, b, i, __LINE__)
+        xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
 #define XFS_BMBT_TRACE_ARGBII(c,b,i,j)  \
-        xfs_bmbt_trace_argbii(__FUNCTION__, c, b, i, j, __LINE__)
+        xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
 #define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)       \
-        xfs_bmbt_trace_argfffi(__FUNCTION__, c, o, b, i, j, __LINE__)
+        xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
 #define XFS_BMBT_TRACE_ARGI(c,i)        \
-        xfs_bmbt_trace_argi(__FUNCTION__, c, i, __LINE__)
+        xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
 #define XFS_BMBT_TRACE_ARGIFK(c,i,f,s)  \
-        xfs_bmbt_trace_argifk(__FUNCTION__, c, i, f, s, __LINE__)
+        xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
 #define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)  \
-        xfs_bmbt_trace_argifr(__FUNCTION__, c, i, f, r, __LINE__)
+        xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
 #define XFS_BMBT_TRACE_ARGIK(c,i,k)     \
-        xfs_bmbt_trace_argik(__FUNCTION__, c, i, k, __LINE__)
+        xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
 #define XFS_BMBT_TRACE_CURSOR(c,s)      \
-        xfs_bmbt_trace_cursor(__FUNCTION__, c, s, __LINE__)
+        xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
 #else
 #define XFS_BMBT_TRACE_ARGBI(c,b,i)
 #define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
@@ -2027,6 +2027,24 @@ xfs_bmbt_increment(
 /*
 * Insert the current record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor. It appears, however, that some callers assume that the cursor is
+ * always valid. Hence if we do a multi-level split we need to revalidate the
+ * cursor.
+ *
+ * When a split occurs, we will see a new cursor returned. Use that as a
+ * trigger to determine if we need to revalidate the original cursor. If we get
+ * a split, then use the original irec to look up the path of the record we
+ * just inserted.
+ *
+ * Note that the fact that the btree root is in the inode means that we can
+ * have the level of the tree change without a "split" occurring at the root
+ * level. What happens is that the root is migrated to an allocated block and
+ * the inode root is pointed to it. This means a single split can change the
+ * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence
+ * the level change should be accounted as a split so as to correctly trigger a
+ * revalidation of the old cursor.
 */
 int                                     /* error */
 xfs_bmbt_insert(
@@ -2039,11 +2057,14 @@ xfs_bmbt_insert(
        xfs_fsblock_t   nbno;
        xfs_btree_cur_t *ncur;
        xfs_bmbt_rec_t  nrec;
+        xfs_bmbt_irec_t oirec;          /* original irec */
        xfs_btree_cur_t *pcur;
+        int             splits = 0;
        XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
        level = 0;
        nbno = NULLFSBLOCK;
+        oirec = cur->bc_rec.b;
        xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
        ncur = NULL;
        pcur = cur;
@@ -2052,11 +2073,13 @@ xfs_bmbt_insert(
                                &i))) {
                        if (pcur != cur)
                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-                        XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+                        goto error0;
-                        return error;
                }
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
+                        /* allocating a new root is effectively a split */
+                        if (cur->bc_nlevels != pcur->bc_nlevels)
+                                splits++;
                        cur->bc_nlevels = pcur->bc_nlevels;
                        cur->bc_private.b.allocated +=
                                pcur->bc_private.b.allocated;
@@ -2070,10 +2093,21 @@ xfs_bmbt_insert(
                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
                }
                if (ncur) {
+                        splits++;
                        pcur = ncur;
                        ncur = NULL;
                }
        } while (nbno != NULLFSBLOCK);
+        if (splits > 1) {
+                /* revalidate the old cursor as we had a multi-level split */
+                error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff,
+                                oirec.br_startblock, oirec.br_blockcount, &i);
+                if (error)
+                        goto error0;
+                ASSERT(i == 1);
+        }
        XFS_BMBT_TRACE_CURSOR(cur, EXIT);
        *stat = i;
        return 0;
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 2d950e975918..cd0d4b4bb816 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -120,7 +120,7 @@ typedef enum {
 * Extent state and extent format macros.
 */
 #define XFS_EXTFMT_INODE(x)     \
-        (XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \
+        (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
                XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
 #define ISUNWRITTEN(x)  ((x)->br_state == XFS_EXT_UNWRITTEN)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 63debd147eb5..53a71c62025d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -645,7 +645,12 @@ xfs_buf_item_push(
        bp = bip->bli_buf;
        if (XFS_BUF_ISDELAYWRITE(bp)) {
-                xfs_bawrite(bip->bli_item.li_mountp, bp);
+                int     error;
+                error = xfs_bawrite(bip->bli_item.li_mountp, bp);
+                if (error)
+                        xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
+                        "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
+                                        error, bip, bp);
        } else {
                xfs_buf_relse(bp);
        }
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index d16c1b971074..d5d1e60ee224 100644
--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -86,7 +86,7 @@ struct xfs_mount_args {
 #define XFSMNT_NOUUID           0x01000000      /* Ignore fs uuid */
 #define XFSMNT_DMAPI            0x02000000      /* enable dmapi/xdsm */
 #define XFSMNT_BARRIER          0x04000000      /* use write barriers */
-#define XFSMNT_IDELETE          0x08000000      /* inode cluster delete */
+#define XFSMNT_IKEEP            0x08000000      /* keep empty inode clusters */
 #define XFSMNT_SWALLOC          0x10000000      /* turn on stripe width
                                                 * allocation */
 #define XFSMNT_DIRSYNC          0x40000000      /* sync creat,link,unlink,rename
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index be7c4251fa61..7cb26529766b 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,12 +44,13 @@
 #include "xfs_error.h"
 #include "xfs_vnodeops.h"
+struct xfs_name xfs_name_dotdot = {"..", 2};
 void
 xfs_dir_mount(
        xfs_mount_t     *mp)
 {
-        ASSERT(XFS_SB_VERSION_HASDIRV2(&mp->m_sb));
+        ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
        ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
               XFS_MAX_BLOCKSIZE);
        mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
@@ -146,8 +147,7 @@ int
 xfs_dir_createname(
        xfs_trans_t             *tp,
        xfs_inode_t             *dp,
-        char                    *name,
+        struct xfs_name         *name,
-        int                     namelen,
        xfs_ino_t               inum,           /* new entry inode number */
        xfs_fsblock_t           *first,         /* bmap's firstblock */
        xfs_bmap_free_t         *flist,         /* bmap's freeblock list */
@@ -162,9 +162,9 @@ xfs_dir_createname(
                return rval;
        XFS_STATS_INC(xs_dir_create);
-        args.name = name;
+        args.name = name->name;
-        args.namelen = namelen;
+        args.namelen = name->len;
-        args.hashval = xfs_da_hashname(name, namelen);
+        args.hashval = xfs_da_hashname(name->name, name->len);
        args.inumber = inum;
        args.dp = dp;
        args.firstblock = first;
@@ -197,8 +197,7 @@ int
 xfs_dir_lookup(
        xfs_trans_t     *tp,
        xfs_inode_t     *dp,
-        char            *name,
+        struct xfs_name *name,
-        int             namelen,
        xfs_ino_t       *inum)          /* out: inode number */
 {
        xfs_da_args_t   args;
@@ -207,18 +206,14 @@ xfs_dir_lookup(
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
        XFS_STATS_INC(xs_dir_lookup);
+        memset(&args, 0, sizeof(xfs_da_args_t));
-        args.name = name;
+        args.name = name->name;
-        args.namelen = namelen;
+        args.namelen = name->len;
-        args.hashval = xfs_da_hashname(name, namelen);
+        args.hashval = xfs_da_hashname(name->name, name->len);
-        args.inumber = 0;
        args.dp = dp;
-        args.firstblock = NULL;
-        args.flist = NULL;
-        args.total = 0;
        args.whichfork = XFS_DATA_FORK;
        args.trans = tp;
-        args.justcheck = args.addname = 0;
        args.oknoent = 1;
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
@@ -247,8 +242,7 @@ int
 xfs_dir_removename(
        xfs_trans_t     *tp,
        xfs_inode_t     *dp,
-        char            *name,
+        struct xfs_name *name,
-        int             namelen,
        xfs_ino_t       ino,
        xfs_fsblock_t   *first,         /* bmap's firstblock */
        xfs_bmap_free_t *flist,         /* bmap's freeblock list */
@@ -261,9 +255,9 @@ xfs_dir_removename(
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
        XFS_STATS_INC(xs_dir_remove);
-        args.name = name;
+        args.name = name->name;
-        args.namelen = namelen;
+        args.namelen = name->len;
-        args.hashval = xfs_da_hashname(name, namelen);
+        args.hashval = xfs_da_hashname(name->name, name->len);
        args.inumber = ino;
        args.dp = dp;
        args.firstblock = first;
@@ -329,8 +323,7 @@ int
 xfs_dir_replace(
        xfs_trans_t     *tp,
        xfs_inode_t     *dp,
-        char            *name,          /* name of entry to replace */
+        struct xfs_name *name,          /* name of entry to replace */
-        int             namelen,
        xfs_ino_t       inum,           /* new inode number */
        xfs_fsblock_t   *first,         /* bmap's firstblock */
        xfs_bmap_free_t *flist,         /* bmap's freeblock list */
@@ -345,9 +338,9 @@ xfs_dir_replace(
        if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
                return rval;
-        args.name = name;
+        args.name = name->name;
-        args.namelen = namelen;
+        args.namelen = name->len;
-        args.hashval = xfs_da_hashname(name, namelen);
+        args.hashval = xfs_da_hashname(name->name, name->len);
        args.inumber = inum;
        args.dp = dp;
        args.firstblock = first;
@@ -374,28 +367,29 @@ xfs_dir_replace(
 /*
 * See if this entry can be added to the directory without allocating space.
+ * First checks that the caller couldn't reserve enough space (resblks = 0).
 */
 int
 xfs_dir_canenter(
        xfs_trans_t     *tp,
        xfs_inode_t     *dp,
-        char            *name,          /* name of entry to add */
+        struct xfs_name *name,          /* name of entry to add */
-        int             namelen)
+        uint            resblks)
 {
        xfs_da_args_t   args;
        int             rval;
        int             v;              /* type-checking value */
+        if (resblks)
+                return 0;
        ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+        memset(&args, 0, sizeof(xfs_da_args_t));
-        args.name = name;
+        args.name = name->name;
-        args.namelen = namelen;
+        args.namelen = name->len;
-        args.hashval = xfs_da_hashname(name, namelen);
+        args.hashval = xfs_da_hashname(name->name, name->len);
-        args.inumber = 0;
        args.dp = dp;
-        args.firstblock = NULL;
-        args.flist = NULL;
-        args.total = 0;
        args.whichfork = XFS_DATA_FORK;
        args.trans = tp;
        args.justcheck = args.addname = args.oknoent = 1;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index b265197e74cf..6392f939029f 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -59,6 +59,8 @@ typedef	__uint32_t	xfs_dir2_db_t;
 */
 typedef xfs_off_t       xfs_dir2_off_t;
+extern struct xfs_name  xfs_name_dotdot;
 /*
 * Generic directory interface routines
 */
@@ -68,21 +70,21 @@ extern int xfs_dir_isempty(struct xfs_inode *dp);
 extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
                                struct xfs_inode *pdp);
 extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
-                                char *name, int namelen, xfs_ino_t inum,
+                                struct xfs_name *name, xfs_ino_t inum,
                                xfs_fsblock_t *first,
                                struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
-                                char *name, int namelen, xfs_ino_t *inum);
+                                struct xfs_name *name, xfs_ino_t *inum);
 extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
-                                char *name, int namelen, xfs_ino_t ino,
+                                struct xfs_name *name, xfs_ino_t ino,
                                xfs_fsblock_t *first,
                                struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
-                                char *name, int namelen, xfs_ino_t inum,
+                                struct xfs_name *name, xfs_ino_t inum,
                                xfs_fsblock_t *first,
                                struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
-                                char *name, int namelen);
+                                struct xfs_name *name, uint resblks);
 extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
 /*
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index eb03eab5ca52..3f3785b10804 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -73,7 +73,7 @@ xfs_filestreams_trace(
 #define TRACE4(mp,t,a0,a1,a2,a3)        TRACE6(mp,t,a0,a1,a2,a3,0,0)
 #define TRACE5(mp,t,a0,a1,a2,a3,a4)     TRACE6(mp,t,a0,a1,a2,a3,a4,0)
 #define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
-        xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \
+        xfs_filestreams_trace(mp, t, __func__, __LINE__, \
                                (__psunsigned_t)a0, (__psunsigned_t)a1, \
                                (__psunsigned_t)a2, (__psunsigned_t)a3, \
                                (__psunsigned_t)a4, (__psunsigned_t)a5)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index eadc1591c795..d3a0f538d6a6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -77,36 +77,36 @@ xfs_fs_geometry(
        if (new_version >= 3) {
                geo->version = XFS_FSOP_GEOM_VERSION;
                geo->flags =
-                        (XFS_SB_VERSION_HASATTR(&mp->m_sb) ?
+                        (xfs_sb_version_hasattr(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
-                        (XFS_SB_VERSION_HASNLINK(&mp->m_sb) ?
+                        (xfs_sb_version_hasnlink(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
-                        (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) ?
+                        (xfs_sb_version_hasquota(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
-                        (XFS_SB_VERSION_HASALIGN(&mp->m_sb) ?
+                        (xfs_sb_version_hasalign(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
-                        (XFS_SB_VERSION_HASDALIGN(&mp->m_sb) ?
+                        (xfs_sb_version_hasdalign(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
-                        (XFS_SB_VERSION_HASSHARED(&mp->m_sb) ?
+                        (xfs_sb_version_hasshared(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
-                        (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) ?
+                        (xfs_sb_version_hasextflgbit(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
-                        (XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
+                        (xfs_sb_version_hasdirv2(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
-                        (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
+                        (xfs_sb_version_hassector(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
                        (xfs_sb_version_haslazysbcount(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
-                        (XFS_SB_VERSION_HASATTR2(&mp->m_sb) ?
+                        (xfs_sb_version_hasattr2(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
-                geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
+                geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
                                mp->m_sb.sb_logsectsize : BBSIZE;
                geo->rtsectsize = mp->m_sb.sb_blocksize;
                geo->dirblocksize = mp->m_dirblksize;
        }
        if (new_version >= 4) {
                geo->flags |=
-                        (XFS_SB_VERSION_HASLOGV2(&mp->m_sb) ?
+                        (xfs_sb_version_haslogv2(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
                geo->logsunit = mp->m_sb.sb_logsunit;
        }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c5836b951d0c..a64dfbd565a5 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -107,6 +107,16 @@ xfs_ialloc_log_di(
 /*
 * Allocation group level functions.
 */
+static inline int
+xfs_ialloc_cluster_alignment(
+        xfs_alloc_arg_t *args)
+{
+        if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
+            args->mp->m_sb.sb_inoalignmt >=
+             XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
+                return args->mp->m_sb.sb_inoalignmt;
+        return 1;
+}
 /*
 * Allocate new inodes in the allocation group specified by agbp.
@@ -167,10 +177,24 @@ xfs_ialloc_ag_alloc(
                args.mod = args.total = args.wasdel = args.isfl =
                        args.userdata = args.minalignslop = 0;
                args.prod = 1;
-                args.alignment = 1;
                /*
-                 * Allow space for the inode btree to split.
+                 * We need to take into account alignment here to ensure that
+                 * we don't modify the free list if we fail to have an exact
+                 * block. If we don't have an exact match, and every other
+                 * allocation attempt fails, we'll end up cancelling
+                 * a dirty transaction and shutting down.
+                 *
+                 * For an exact allocation, alignment must be 1,
+                 * however we need to take cluster alignment into account when
+                 * fixing up the freelist. Use the minalignslop field to
+                 * indicate that extra blocks might be required for alignment,
+                 * but not to use them in the actual exact allocation.
                 */
+                args.alignment = 1;
+                args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+                /* Allow space for the inode btree to split. */
                args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
                if ((error = xfs_alloc_vextent(&args)))
                        return error;
@@ -191,13 +215,8 @@ xfs_ialloc_ag_alloc(
                        ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
                        args.alignment = args.mp->m_dalign;
                        isaligned = 1;
-                } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
+                } else
-                           args.mp->m_sb.sb_inoalignmt >=
+                        args.alignment = xfs_ialloc_cluster_alignment(&args);
-                           XFS_B_TO_FSBT(args.mp,
-                                XFS_INODE_CLUSTER_SIZE(args.mp)))
-                                args.alignment = args.mp->m_sb.sb_inoalignmt;
-                else
-                        args.alignment = 1;
                /*
                 * Need to figure out where to allocate the inode blocks.
                 * Ideally they should be spaced out through the a.g.
@@ -230,12 +249,7 @@ xfs_ialloc_ag_alloc(
                args.agbno = be32_to_cpu(agi->agi_root);
                args.fsbno = XFS_AGB_TO_FSB(args.mp,
                                be32_to_cpu(agi->agi_seqno), args.agbno);
-                if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
+                args.alignment = xfs_ialloc_cluster_alignment(&args);
-                        args.mp->m_sb.sb_inoalignmt >=
-                        XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
-                                args.alignment = args.mp->m_sb.sb_inoalignmt;
-                else
-                        args.alignment = 1;
                if ((error = xfs_alloc_vextent(&args)))
                        return error;
        }
@@ -271,7 +285,7 @@ xfs_ialloc_ag_alloc(
         * use the old version so that old kernels will continue to be
         * able to use the file system.
         */
-        if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb))
+        if (xfs_sb_version_hasnlink(&args.mp->m_sb))
                version = XFS_DINODE_VERSION_2;
        else
                version = XFS_DINODE_VERSION_1;
@@ -1053,7 +1067,7 @@ xfs_difree(
        /*
         * When an inode cluster is free, it becomes eligible for removal
         */
-        if ((mp->m_flags & XFS_MOUNT_IDELETE) &&
+        if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
            (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
                *delete = 1;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index f01b07687faf..e657c5128460 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -78,7 +78,6 @@ xfs_iget_core(
        xfs_inode_t     *ip;
        xfs_inode_t     *iq;
        int             error;
-        xfs_icluster_t  *icl, *new_icl = NULL;
        unsigned long   first_index, mask;
        xfs_perag_t     *pag;
        xfs_agino_t     agino;
@@ -229,29 +228,17 @@ finish_inode:
        }
        /*
-         * This is a bit messy - we preallocate everything we _might_
+         * Preload the radix tree so we can insert safely under the
-         * need before we pick up the ici lock. That way we don't have to
+         * write spinlock.
-         * juggle locks and go all the way back to the start.
         */
-        new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP);
        if (radix_tree_preload(GFP_KERNEL)) {
+                xfs_idestroy(ip);
                delay(1);
                goto again;
        }
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = agino & mask;
        write_lock(&pag->pag_ici_lock);
-        /*
-         * Find the cluster if it exists
-         */
-        icl = NULL;
-        if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
-                                                        first_index, 1)) {
-                if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index)
-                        icl = iq->i_cluster;
-        }
        /*
         * insert the new inode
         */
@@ -266,30 +253,13 @@ finish_inode:
        }
        /*
-         * These values _must_ be set before releasing ihlock!
+         * These values _must_ be set before releasing the radix tree lock!
         */
        ip->i_udquot = ip->i_gdquot = NULL;
        xfs_iflags_set(ip, XFS_INEW);
-        ASSERT(ip->i_cluster == NULL);
-        if (!icl) {
-                spin_lock_init(&new_icl->icl_lock);
-                INIT_HLIST_HEAD(&new_icl->icl_inodes);
-                icl = new_icl;
-                new_icl = NULL;
-        } else {
-                ASSERT(!hlist_empty(&icl->icl_inodes));
-        }
-        spin_lock(&icl->icl_lock);
-        hlist_add_head(&ip->i_cnode, &icl->icl_inodes);
-        ip->i_cluster = icl;
-        spin_unlock(&icl->icl_lock);
        write_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
-        if (new_icl)
-                kmem_zone_free(xfs_icluster_zone, new_icl);
        /*
         * Link ip to its mount and thread it on the mount's inode list.
@@ -528,18 +498,6 @@ xfs_iextract(
        xfs_put_perag(mp, pag);
        /*
-         * Remove from cluster list
-         */
-        mp = ip->i_mount;
-        spin_lock(&ip->i_cluster->icl_lock);
-        hlist_del(&ip->i_cnode);
-        spin_unlock(&ip->i_cluster->icl_lock);
-        /* was last inode in cluster? */
-        if (hlist_empty(&ip->i_cluster->icl_inodes))
-                kmem_zone_free(xfs_icluster_zone, ip->i_cluster);
-        /*
         * Remove from mount's inode list.
         */
        XFS_MOUNT_ILOCK(mp);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a550546a7083..ca12acb90394 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -55,7 +55,6 @@
 kmem_zone_t *xfs_ifork_zone;
 kmem_zone_t *xfs_inode_zone;
-kmem_zone_t *xfs_icluster_zone;
 /*
 * Used in xfs_itruncate().  This is the maximum number of extents
@@ -126,6 +125,90 @@ xfs_inobp_check(
 #endif
 /*
+ * Find the buffer associated with the given inode map.
+ * We do basic validation checks on the buffer once it has been
+ * retrieved from disk.
+ */
+STATIC int
+xfs_imap_to_bp(
+        xfs_mount_t     *mp,
+        xfs_trans_t     *tp,
+        xfs_imap_t      *imap,
+        xfs_buf_t       **bpp,
+        uint            buf_flags,
+        uint            imap_flags)
+{
+        int             error;
+        int             i;
+        int             ni;
+        xfs_buf_t       *bp;
+        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+                                   (int)imap->im_len, buf_flags, &bp);
+        if (error) {
+                if (error != EAGAIN) {
+                        cmn_err(CE_WARN,
+                                "xfs_imap_to_bp: xfs_trans_read_buf()returned "
+                                "an error %d on %s.  Returning error.",
+                                error, mp->m_fsname);
+                } else {
+                        ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+                }
+                return error;
+        }
+        /*
+         * Validate the magic number and version of every inode in the buffer
+         * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+         */
+#ifdef DEBUG
+        ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
+#else   /* usual case */
+        ni = 1;
+#endif
+        for (i = 0; i < ni; i++) {
+                int             di_ok;
+                xfs_dinode_t    *dip;
+                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+                                        (i << mp->m_sb.sb_inodelog));
+                di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
+                            XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+                                                XFS_ERRTAG_ITOBP_INOTOBP,
+                                                XFS_RANDOM_ITOBP_INOTOBP))) {
+                        if (imap_flags & XFS_IMAP_BULKSTAT) {
+                                xfs_trans_brelse(tp, bp);
+                                return XFS_ERROR(EINVAL);
+                        }
+                        XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
+                                                XFS_ERRLEVEL_HIGH, mp, dip);
+#ifdef DEBUG
+                        cmn_err(CE_PANIC,
+                                        "Device %s - bad inode magic/vsn "
+                                        "daddr %lld #%d (magic=%x)",
+                                XFS_BUFTARG_NAME(mp->m_ddev_targp),
+                                (unsigned long long)imap->im_blkno, i,
+                                be16_to_cpu(dip->di_core.di_magic));
+#endif
+                        xfs_trans_brelse(tp, bp);
+                        return XFS_ERROR(EFSCORRUPTED);
+                }
+        }
+        xfs_inobp_check(mp, bp);
+        /*
+         * Mark the buffer as an inode buffer now that it looks good
+         */
+        XFS_BUF_SET_VTYPE(bp, B_FS_INO);
+        *bpp = bp;
+        return 0;
+}
+/*
 * This routine is called to map an inode number within a file
 * system to the buffer containing the on-disk version of the
 * inode.  It returns a pointer to the buffer containing the
@@ -147,72 +230,19 @@ xfs_inotobp(
        xfs_buf_t       **bpp,
        int             *offset)
 {
-        int             di_ok;
        xfs_imap_t      imap;
        xfs_buf_t       *bp;
        int             error;
-        xfs_dinode_t    *dip;
-        /*
-         * Call the space management code to find the location of the
-         * inode on disk.
-         */
        imap.im_blkno = 0;
        error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
-        if (error != 0) {
+        if (error)
-                cmn_err(CE_WARN,
-        "xfs_inotobp: xfs_imap()  returned an "
-        "error %d on %s.  Returning error.", error, mp->m_fsname);
                return error;
-        }
-        /*
+        error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
-         * If the inode number maps to a block outside the bounds of the
+        if (error)
-         * file system then return NULL rather than calling read_buf
-         * and panicing when we get an error from the driver.
-         */
-        if ((imap.im_blkno + imap.im_len) >
-            XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-                cmn_err(CE_WARN,
-        "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds "
-        "of the file system %s.  Returning EINVAL.",
-                        (unsigned long long)imap.im_blkno,
-                        imap.im_len, mp->m_fsname);
-                return XFS_ERROR(EINVAL);
-        }
-        /*
-         * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
-         * default to just a read_buf() call.
-         */
-        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
-                                   (int)imap.im_len, XFS_BUF_LOCK, &bp);
-        if (error) {
-                cmn_err(CE_WARN,
-        "xfs_inotobp: xfs_trans_read_buf()  returned an "
-        "error %d on %s.  Returning error.", error, mp->m_fsname);
                return error;
-        }
-        dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
-        di_ok =
-                be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
-                XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
-        if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
-                        XFS_RANDOM_ITOBP_INOTOBP))) {
-                XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
-                xfs_trans_brelse(tp, bp);
-                cmn_err(CE_WARN,
-        "xfs_inotobp: XFS_TEST_ERROR()  returned an "
-        "error on %s.  Returning EFSCORRUPTED.",  mp->m_fsname);
-                return XFS_ERROR(EFSCORRUPTED);
-        }
-        xfs_inobp_check(mp, bp);
-        /*
-         * Set *dipp to point to the on-disk inode in the buffer.
-         */
        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
        *bpp = bp;
        *offset = imap.im_boffset;
@@ -248,46 +278,21 @@ xfs_itobp(
        xfs_dinode_t    **dipp,
        xfs_buf_t       **bpp,
        xfs_daddr_t     bno,
-        uint            imap_flags)
+        uint            imap_flags,
+        uint            buf_flags)
 {
        xfs_imap_t      imap;
        xfs_buf_t       *bp;
        int             error;
-        int             i;
-        int             ni;
        if (ip->i_blkno == (xfs_daddr_t)0) {
-                /*
-                 * Call the space management code to find the location of the
-                 * inode on disk.
-                 */
                imap.im_blkno = bno;
-                if ((error = xfs_imap(mp, tp, ip->i_ino, &imap,
+                error = xfs_imap(mp, tp, ip->i_ino, &imap,
-                                        XFS_IMAP_LOOKUP | imap_flags)))
+                                        XFS_IMAP_LOOKUP | imap_flags);
+                if (error)
                        return error;
                /*
-                 * If the inode number maps to a block outside the bounds
-                 * of the file system then return NULL rather than calling
-                 * read_buf and panicing when we get an error from the
-                 * driver.
-                 */
-                if ((imap.im_blkno + imap.im_len) >
-                    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-#ifdef DEBUG
-                        xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
-                                        "(imap.im_blkno (0x%llx) "
-                                        "+ imap.im_len (0x%llx)) > "
-                                        " XFS_FSB_TO_BB(mp, "
-                                        "mp->m_sb.sb_dblocks) (0x%llx)",
-                                        (unsigned long long) imap.im_blkno,
-                                        (unsigned long long) imap.im_len,
-                                        XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-#endif /* DEBUG */
-                        return XFS_ERROR(EINVAL);
-                }
-                /*
                 * Fill in the fields in the inode that will be used to
                 * map the inode to its buffer from now on.
                 */
@@ -305,76 +310,17 @@ xfs_itobp(
        }
        ASSERT(bno == 0 || bno == imap.im_blkno);
-        /*
+        error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
-         * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
+        if (error)
-         * default to just a read_buf() call.
-         */
-        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
-                                   (int)imap.im_len, XFS_BUF_LOCK, &bp);
-        if (error) {
-#ifdef DEBUG
-                xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
-                                "xfs_trans_read_buf() returned error %d, "
-                                "imap.im_blkno 0x%llx, imap.im_len 0x%llx",
-                                error, (unsigned long long) imap.im_blkno,
-                                (unsigned long long) imap.im_len);
-#endif /* DEBUG */
                return error;
-        }
-        /*
-         * Validate the magic number and version of every inode in the buffer
-         * (if DEBUG kernel) or the first inode in the buffer, otherwise.
-         * No validation is done here in userspace (xfs_repair).
-         */
-#if !defined(__KERNEL__)
-        ni = 0;
-#elif defined(DEBUG)
-        ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
-#else   /* usual case */
-        ni = 1;
-#endif
-        for (i = 0; i < ni; i++) {
-                int             di_ok;
-                xfs_dinode_t    *dip;
-                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+        if (!bp) {
-                                        (i << mp->m_sb.sb_inodelog));
+                ASSERT(buf_flags & XFS_BUF_TRYLOCK);
-                di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
+                ASSERT(tp == NULL);
-                            XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+                *bpp = NULL;
-                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+                return EAGAIN;
-                                                XFS_ERRTAG_ITOBP_INOTOBP,
-                                                XFS_RANDOM_ITOBP_INOTOBP))) {
-                        if (imap_flags & XFS_IMAP_BULKSTAT) {
-                                xfs_trans_brelse(tp, bp);
-                                return XFS_ERROR(EINVAL);
-                        }
-#ifdef DEBUG
-                        cmn_err(CE_ALERT,
-                                        "Device %s - bad inode magic/vsn "
-                                        "daddr %lld #%d (magic=%x)",
-                                XFS_BUFTARG_NAME(mp->m_ddev_targp),
-                                (unsigned long long)imap.im_blkno, i,
-                                be16_to_cpu(dip->di_core.di_magic));
-#endif
-                        XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
-                                             mp, dip);
-                        xfs_trans_brelse(tp, bp);
-                        return XFS_ERROR(EFSCORRUPTED);
-                }
        }
-        xfs_inobp_check(mp, bp);
-        /*
-         * Mark the buffer as an inode buffer now that it looks good
-         */
-        XFS_BUF_SET_VTYPE(bp, B_FS_INO);
-        /*
-         * Set *dipp to point to the on-disk inode in the buffer.
-         */
        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
        *bpp = bp;
        return 0;
@@ -878,7 +824,7 @@ xfs_iread(
         * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
         * know that this is a new incore inode.
         */
-        error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags);
+        error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
        if (error) {
                kmem_zone_free(xfs_inode_zone, ip);
                return error;
@@ -1147,7 +1093,7 @@ xfs_ialloc(
         * the inode version number now.  This way we only do the conversion
         * here rather than here and in the flush/logging code.
         */
-        if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
+        if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
            ip->i_d.di_version == XFS_DINODE_VERSION_1) {
                ip->i_d.di_version = XFS_DINODE_VERSION_2;
                /*
@@ -1518,51 +1464,50 @@ xfs_itruncate_start(
 }
 /*
- * Shrink the file to the given new_size.  The new
+ * Shrink the file to the given new_size.  The new size must be smaller than
- * size must be smaller than the current size.
+ * the current size.  This will free up the underlying blocks in the removed
- * This will free up the underlying blocks
+ * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
- * in the removed range after a call to xfs_itruncate_start()
- * or xfs_atruncate_start().
 *
- * The transaction passed to this routine must have made
+ * The transaction passed to this routine must have made a permanent log
- * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
+ * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
- * This routine may commit the given transaction and
+ * given transaction and start new ones, so make sure everything involved in
- * start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here.  Some transaction will be
- * the transaction is tidy before calling here.
+ * returned to the caller to be committed.  The incoming transaction must
- * Some transaction will be returned to the caller to be
+ * already include the inode, and both inode locks must be held exclusively.
- * committed.  The incoming transaction must already include
+ * The inode must also be "held" within the transaction.  On return the inode
- * the inode, and both inode locks must be held exclusively.
+ * will be "held" within the returned transaction.  This routine does NOT
- * The inode must also be "held" within the transaction.  On
+ * require any disk space to be reserved for it within the transaction.
- * return the inode will be "held" within the returned transaction.
- * This routine does NOT require any disk space to be reserved
- * for it within the transaction.
 *
- * The fork parameter must be either xfs_attr_fork or xfs_data_fork,
+ * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
- * and it indicates the fork which is to be truncated.  For the
+ * indicates the fork which is to be truncated.  For the attribute fork we only
- * attribute fork we only support truncation to size 0.
+ * support truncation to size 0.
 *
- * We use the sync parameter to indicate whether or not the first
+ * We use the sync parameter to indicate whether or not the first transaction
- * transaction we perform might have to be synchronous.  For the attr fork,
+ * we perform might have to be synchronous.  For the attr fork, it needs to be
- * it needs to be so if the unlink of the inode is not yet known to be
+ * so if the unlink of the inode is not yet known to be permanent in the log.
- * permanent in the log.  This keeps us from freeing and reusing the
+ * This keeps us from freeing and reusing the blocks of the attribute fork
- * blocks of the attribute fork before the unlink of the inode becomes
+ * before the unlink of the inode becomes permanent.
- * permanent.
 *
- * For the data fork, we normally have to run synchronously if we're
+ * For the data fork, we normally have to run synchronously if we're being
- * being called out of the inactive path or we're being called
+ * called out of the inactive path or we're being called out of the create path
- * out of the create path where we're truncating an existing file.
+ * where we're truncating an existing file.  Either way, the truncate needs to
- * Either way, the truncate needs to be sync so blocks don't reappear
+ * be sync so blocks don't reappear in the file with altered data in case of a
- * in the file with altered data in case of a crash.  wsync filesystems
+ * crash.  wsync filesystems can run the first case async because anything that
- * can run the first case async because anything that shrinks the inode
+ * shrinks the inode has to run sync so by the time we're called here from
- * has to run sync so by the time we're called here from inactive, the
+ * inactive, the inode size is permanently set to 0.
- * inode size is permanently set to 0.
 *
- * Calls from the truncate path always need to be sync unless we're
+ * Calls from the truncate path always need to be sync unless we're in a wsync
- * in a wsync filesystem and the file has already been unlinked.
+ * filesystem and the file has already been unlinked.
 *
- * The caller is responsible for correctly setting the sync parameter.
+ * The caller is responsible for correctly setting the sync parameter.  It gets
- * It gets too hard for us to guess here which path we're being called
+ * too hard for us to guess here which path we're being called out of just
- * out of just based on inode state.
+ * based on inode state.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not.  We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
 */
 int
 xfs_itruncate_finish(
@@ -1741,65 +1686,51 @@ xfs_itruncate_finish(
                 */
                error = xfs_bmap_finish(tp, &free_list, &committed);
                ntp = *tp;
+                if (committed) {
+                        /* link the inode into the next xact in the chain */
+                        xfs_trans_ijoin(ntp, ip,
+                                        XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+                        xfs_trans_ihold(ntp, ip);
+                }
                if (error) {
                        /*
-                         * If the bmap finish call encounters an error,
+                         * If the bmap finish call encounters an error, return
-                         * return to the caller where the transaction
+                         * to the caller where the transaction can be properly
-                         * can be properly aborted.  We just need to
+                         * aborted.  We just need to make sure we're not
-                         * make sure we're not holding any resources
+                         * holding any resources that we were not when we came
-                         * that we were not when we came in.
+                         * in.
                         *
-                         * Aborting from this point might lose some
+                         * Aborting from this point might lose some blocks in
-                         * blocks in the file system, but oh well.
+                         * the file system, but oh well.
                         */
                        xfs_bmap_cancel(&free_list);
-                        if (committed) {
-                                /*
-                                 * If the passed in transaction committed
-                                 * in xfs_bmap_finish(), then we want to
-                                 * add the inode to this one before returning.
-                                 * This keeps things simple for the higher
-                                 * level code, because it always knows that
-                                 * the inode is locked and held in the
-                                 * transaction that returns to it whether
-                                 * errors occur or not.  We don't mark the
-                                 * inode dirty so that this transaction can
-                                 * be easily aborted if possible.
-                                 */
-                                xfs_trans_ijoin(ntp, ip,
-                                        XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-                                xfs_trans_ihold(ntp, ip);
-                        }
                        return error;
                }
                if (committed) {
                        /*
-                         * The first xact was committed,
+                         * Mark the inode dirty so it will be logged and
-                         * so add the inode to the new one.
+                         * moved forward in the log as part of every commit.
-                         * Mark it dirty so it will be logged
-                         * and moved forward in the log as
-                         * part of every commit.
                         */
-                        xfs_trans_ijoin(ntp, ip,
-                                        XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-                        xfs_trans_ihold(ntp, ip);
                        xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
                }
                ntp = xfs_trans_dup(ntp);
-                (void) xfs_trans_commit(*tp, 0);
+                error = xfs_trans_commit(*tp, 0);
                *tp = ntp;
-                error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                          XFS_TRANS_PERM_LOG_RES,
+                /* link the inode into the next transaction in the chain */
-                                          XFS_ITRUNCATE_LOG_COUNT);
-                /*
-                 * Add the inode being truncated to the next chained
-                 * transaction.
-                 */
                xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                xfs_trans_ihold(ntp, ip);
+                if (!error)
+                        error = xfs_trans_reserve(ntp, 0,
+                                        XFS_ITRUNCATE_LOG_RES(mp), 0,
+                                        XFS_TRANS_PERM_LOG_RES,
+                                        XFS_ITRUNCATE_LOG_COUNT);
                if (error)
-                        return (error);
+                        return error;
        }
        /*
         * Only update the size in the case of the data fork, but
@@ -1967,7 +1898,7 @@ xfs_iunlink(
                 * Here we put the head pointer into our next pointer,
                 * and then we fall through to point the head at us.
                 */
-                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
                if (error)
                        return error;
@@ -2075,7 +2006,7 @@ xfs_iunlink_remove(
                 * of dealing with the buffer when there is no need to
                 * change it.
                 */
-                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
                if (error) {
                        cmn_err(CE_WARN,
                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2137,7 +2068,7 @@ xfs_iunlink_remove(
                 * Now last_ibp points to the buffer previous to us on
                 * the unlinked list.  Pull us from the list.
                 */
-                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
+                error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
                if (error) {
                        cmn_err(CE_WARN,
                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2172,13 +2103,6 @@ xfs_iunlink_remove(
        return 0;
 }
-STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip)
-{
-        return (((ip->i_itemp == NULL) ||
-                !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
-                (ip->i_update_core == 0));
-}
 STATIC void
 xfs_ifree_cluster(
        xfs_inode_t     *free_ip,
@@ -2400,7 +2324,7 @@ xfs_ifree(
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0);
+        error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
        if (error)
                return error;
@@ -2678,14 +2602,31 @@ xfs_imap(
        fsbno = imap->im_blkno ?
                XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
        error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
-        if (error != 0) {
+        if (error)
                return error;
-        }
        imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
        imap->im_len = XFS_FSB_TO_BB(mp, len);
        imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
        imap->im_ioffset = (ushort)off;
        imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
+        /*
+         * If the inode number maps to a block outside the bounds
+         * of the file system then return EINVAL rather than calling
+         * read_buf and panicking when we get an error from the
+         * driver.
+         */
+        if ((imap->im_blkno + imap->im_len) >
+            XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+                xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+                        "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
+                        " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
+                        (unsigned long long) imap->im_blkno,
+                        (unsigned long long) imap->im_len,
+                        XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+                return EINVAL;
+        }
        return 0;
 }
@@ -2826,38 +2767,41 @@ xfs_iunpin(
 }
 /*
- * This is called to wait for the given inode to be unpinned.
+ * This pushes the log to get an inode unpinned.  It can either wait or return
- * It will sleep until this happens.  The caller must have the
+ * immediately without waiting for the inode to be unpinned.  The caller must
- * inode locked in at least shared mode so that the buffer cannot
+ * have the inode locked in at least shared mode so that the buffer cannot be
- * be subsequently pinned once someone is waiting for it to be
+ * subsequently pinned once someone is waiting for it to be unpinned.
- * unpinned.
 */
 STATIC void
-xfs_iunpin_wait(
+__xfs_iunpin_wait(
-        xfs_inode_t     *ip)
+        xfs_inode_t     *ip,
+        int             wait)
 {
-        xfs_inode_log_item_t    *iip;
+        xfs_inode_log_item_t    *iip = ip->i_itemp;
-        xfs_lsn_t       lsn;
        ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
+        if (atomic_read(&ip->i_pincount) == 0)
-        if (atomic_read(&ip->i_pincount) == 0) {
                return;
-        }
-        iip = ip->i_itemp;
+        /* Give the log a push to start the unpinning I/O */
-        if (iip && iip->ili_last_lsn) {
+        xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
-                lsn = iip->ili_last_lsn;
+                                iip->ili_last_lsn : 0, XFS_LOG_FORCE);
-        } else {
+        if (wait)
-                lsn = (xfs_lsn_t)0;
+                wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
-        }
+}
-        /*
+static inline void
-         * Give the log a push so we don't wait here too long.
+xfs_iunpin_wait(
-         */
+        xfs_inode_t     *ip)
-        xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
+{
+        __xfs_iunpin_wait(ip, 1);
+}
-        wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+static inline void
+xfs_iunpin_nowait(
+        xfs_inode_t     *ip)
+{
+        __xfs_iunpin_wait(ip, 0);
 }
@@ -2932,7 +2876,7 @@ xfs_iextents_copy(
 * format indicates the current state of the fork.
 */
 /*ARGSUSED*/
-STATIC int
+STATIC void
 xfs_iflush_fork(
        xfs_inode_t             *ip,
        xfs_dinode_t            *dip,
@@ -2953,16 +2897,16 @@ xfs_iflush_fork(
        static const short      extflag[2] =
                { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
-        if (iip == NULL)
+        if (!iip)
-                return 0;
+                return;
        ifp = XFS_IFORK_PTR(ip, whichfork);
        /*
         * This can happen if we gave up in iformat in an error path,
         * for the attribute fork.
         */
-        if (ifp == NULL) {
+        if (!ifp) {
                ASSERT(whichfork == XFS_ATTR_FORK);
-                return 0;
+                return;
        }
        cp = XFS_DFORK_PTR(dip, whichfork);
        mp = ip->i_mount;
@@ -3023,8 +2967,145 @@ xfs_iflush_fork(
                ASSERT(0);
                break;
        }
+}
+STATIC int
+xfs_iflush_cluster(
+        xfs_inode_t     *ip,
+        xfs_buf_t       *bp)
+{
+        xfs_mount_t             *mp = ip->i_mount;
+        xfs_perag_t             *pag = xfs_get_perag(mp, ip->i_ino);
+        unsigned long           first_index, mask;
+        int                     ilist_size;
+        xfs_inode_t             **ilist;
+        xfs_inode_t             *iq;
+        int                     nr_found;
+        int                     clcount = 0;
+        int                     bufwasdelwri;
+        int                     i;
+        ASSERT(pag->pagi_inodeok);
+        ASSERT(pag->pag_ici_init);
+        ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
+        ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
+        if (!ilist)
+                return 0;
+        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
+        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
+        read_lock(&pag->pag_ici_lock);
+        /* really need a gang lookup range call here */
+        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+                                        first_index,
+                                        XFS_INODE_CLUSTER_SIZE(mp));
+        if (nr_found == 0)
+                goto out_free;
+        for (i = 0; i < nr_found; i++) {
+                iq = ilist[i];
+                if (iq == ip)
+                        continue;
+                /* if the inode lies outside this cluster, we're done. */
+                if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
+                        break;
+                /*
+                 * Do an un-protected check to see if the inode is dirty and
+                 * is a candidate for flushing.  These checks will be repeated
+                 * later after the appropriate locks are acquired.
+                 */
+                if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+                        continue;
+                /*
+                 * Try to get locks.  If any are unavailable or it is pinned,
+                 * then this inode cannot be flushed and is skipped.
+                 */
+                if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+                        continue;
+                if (!xfs_iflock_nowait(iq)) {
+                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                        continue;
+                }
+                if (xfs_ipincount(iq)) {
+                        xfs_ifunlock(iq);
+                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                        continue;
+                }
+                /*
+                 * arriving here means that this inode can be flushed.  First
+                 * re-check that it's dirty before flushing.
+                 */
+                if (!xfs_inode_clean(iq)) {
+                        int     error;
+                        error = xfs_iflush_int(iq, bp);
+                        if (error) {
+                                xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                                goto cluster_corrupt_out;
+                        }
+                        clcount++;
+                } else {
+                        xfs_ifunlock(iq);
+                }
+                xfs_iunlock(iq, XFS_ILOCK_SHARED);
+        }
+        if (clcount) {
+                XFS_STATS_INC(xs_icluster_flushcnt);
+                XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+        }
+out_free:
+        read_unlock(&pag->pag_ici_lock);
+        kmem_free(ilist, ilist_size);
        return 0;
+cluster_corrupt_out:
+        /*
+         * Corruption detected in the clustering loop.  Invalidate the
+         * inode buffer and shut down the filesystem.
+         */
+        read_unlock(&pag->pag_ici_lock);
+        /*
+         * Clean up the buffer.  If it was B_DELWRI, just release it --
+         * brelse can handle it with no problems.  If not, shut down the
+         * filesystem before releasing the buffer.
+         */
+        bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
+        if (bufwasdelwri)
+                xfs_buf_relse(bp);
+        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+        if (!bufwasdelwri) {
+                /*
+                 * Just like incore_relse: if we have b_iodone functions,
+                 * mark the buffer as an error and call them.  Otherwise
+                 * mark it as stale and brelse.
+                 */
+                if (XFS_BUF_IODONE_FUNC(bp)) {
+                        XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+                        XFS_BUF_UNDONE(bp);
+                        XFS_BUF_STALE(bp);
+                        XFS_BUF_SHUT(bp);
+                        XFS_BUF_ERROR(bp,EIO);
+                        xfs_biodone(bp);
+                } else {
+                        XFS_BUF_STALE(bp);
+                        xfs_buf_relse(bp);
+                }
+        }
+        /*
+         * Unlocks the flush lock
+         */
+        xfs_iflush_abort(iq);
+        kmem_free(ilist, ilist_size);
+        return XFS_ERROR(EFSCORRUPTED);
 }
 /*
@@ -3046,11 +3127,7 @@ xfs_iflush(
        xfs_dinode_t            *dip;
        xfs_mount_t             *mp;
        int                     error;
-        /* REFERENCED */
+        int                     noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
-        xfs_inode_t             *iq;
-        int                     clcount;        /* count of inodes clustered */
-        int                     bufwasdelwri;
-        struct hlist_node       *entry;
        enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
        XFS_STATS_INC(xs_iflush_count);
@@ -3067,8 +3144,7 @@ xfs_iflush(
         * If the inode isn't dirty, then just release the inode
         * flush lock and do nothing.
         */
-        if ((ip->i_update_core == 0) &&
+        if (xfs_inode_clean(ip)) {
-            ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
                ASSERT((iip != NULL) ?
                         !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
                xfs_ifunlock(ip);
@@ -3076,11 +3152,21 @@ xfs_iflush(
        }
        /*
-         * We can't flush the inode until it is unpinned, so
+         * We can't flush the inode until it is unpinned, so wait for it if we
-         * wait for it.  We know noone new can pin it, because
+         * are allowed to block.  We know noone new can pin it, because we are
-         * we are holding the inode lock shared and you need
+         * holding the inode lock shared and you need to hold it exclusively to
-         * to hold it exclusively to pin the inode.
+         * pin the inode.
+         *
+         * If we are not allowed to block, force the log out asynchronously so
+         * that when we come back the inode will be unpinned. If other inodes
+         * in the same cluster are dirty, they will probably write the inode
+         * out for us if they occur after the log force completes.
         */
+        if (noblock && xfs_ipincount(ip)) {
+                xfs_iunpin_nowait(ip);
+                xfs_ifunlock(ip);
+                return EAGAIN;
+        }
        xfs_iunpin_wait(ip);
        /*
@@ -3097,15 +3183,6 @@ xfs_iflush(
        }
        /*
-         * Get the buffer containing the on-disk inode.
-         */
-        error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
-        if (error) {
-                xfs_ifunlock(ip);
-                return error;
-        }
-        /*
         * Decide how buffer will be flushed out.  This is done before
         * the call to xfs_iflush_int because this field is zeroed by it.
         */
@@ -3121,6 +3198,7 @@ xfs_iflush(
                case XFS_IFLUSH_DELWRI_ELSE_SYNC:
                        flags = 0;
                        break;
+                case XFS_IFLUSH_ASYNC_NOBLOCK:
                case XFS_IFLUSH_ASYNC:
                case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
                        flags = INT_ASYNC;
@@ -3140,6 +3218,7 @@ xfs_iflush(
                case XFS_IFLUSH_DELWRI:
                        flags = INT_DELWRI;
                        break;
+                case XFS_IFLUSH_ASYNC_NOBLOCK:
                case XFS_IFLUSH_ASYNC:
                        flags = INT_ASYNC;
                        break;
@@ -3154,94 +3233,41 @@ xfs_iflush(
        }
        /*
-         * First flush out the inode that xfs_iflush was called with.
+         * Get the buffer containing the on-disk inode.
         */
-        error = xfs_iflush_int(ip, bp);
+        error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
-        if (error) {
+                                noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
-                goto corrupt_out;
+        if (error || !bp) {
+                xfs_ifunlock(ip);
+                return error;
        }
        /*
-         * inode clustering:
+         * First flush out the inode that xfs_iflush was called with.
-         * see if other inodes can be gathered into this write
         */
-        spin_lock(&ip->i_cluster->icl_lock);
+        error = xfs_iflush_int(ip, bp);
-        ip->i_cluster->icl_buf = bp;
+        if (error)
+                goto corrupt_out;
-        clcount = 0;
-        hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
-                if (iq == ip)
-                        continue;
-                /*
-                 * Do an un-protected check to see if the inode is dirty and
-                 * is a candidate for flushing.  These checks will be repeated
-                 * later after the appropriate locks are acquired.
-                 */
-                iip = iq->i_itemp;
-                if ((iq->i_update_core == 0) &&
-                    ((iip == NULL) ||
-                     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
-                      xfs_ipincount(iq) == 0) {
-                        continue;
-                }
-                /*
-                 * Try to get locks.  If any are unavailable,
-                 * then this inode cannot be flushed and is skipped.
-                 */
-                /* get inode locks (just i_lock) */
-                if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
-                        /* get inode flush lock */
-                        if (xfs_iflock_nowait(iq)) {
-                                /* check if pinned */
-                                if (xfs_ipincount(iq) == 0) {
-                                        /* arriving here means that
-                                         * this inode can be flushed.
-                                         * first re-check that it's
-                                         * dirty
-                                         */
-                                        iip = iq->i_itemp;
-                                        if ((iq->i_update_core != 0)||
-                                            ((iip != NULL) &&
-                                             (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
-                                                clcount++;
-                                                error = xfs_iflush_int(iq, bp);
-                                                if (error) {
-                                                        xfs_iunlock(iq,
-                                                                    XFS_ILOCK_SHARED);
-                                                        goto cluster_corrupt_out;
-                                                }
-                                        } else {
-                                                xfs_ifunlock(iq);
-                                        }
-                                } else {
-                                        xfs_ifunlock(iq);
-                                }
-                        }
-                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
-                }
-        }
-        spin_unlock(&ip->i_cluster->icl_lock);
-        if (clcount) {
-                XFS_STATS_INC(xs_icluster_flushcnt);
-                XFS_STATS_ADD(xs_icluster_flushinode, clcount);
-        }
        /*
-         * If the buffer is pinned then push on the log so we won't
+         * If the buffer is pinned then push on the log now so we won't
         * get stuck waiting in the write for too long.
         */
-        if (XFS_BUF_ISPINNED(bp)){
+        if (XFS_BUF_ISPINNED(bp))
                xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-        }
+        /*
+         * inode clustering:
+         * see if other inodes can be gathered into this write
+         */
+        error = xfs_iflush_cluster(ip, bp);
+        if (error)
+                goto cluster_corrupt_out;
        if (flags & INT_DELWRI) {
                xfs_bdwrite(mp, bp);
        } else if (flags & INT_ASYNC) {
-                xfs_bawrite(mp, bp);
+                error = xfs_bawrite(mp, bp);
        } else {
                error = xfs_bwrite(mp, bp);
        }
@@ -3250,52 +3276,11 @@ xfs_iflush(
 corrupt_out:
        xfs_buf_relse(bp);
        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-        xfs_iflush_abort(ip);
-        /*
-         * Unlocks the flush lock
-         */
-        return XFS_ERROR(EFSCORRUPTED);
 cluster_corrupt_out:
-        /* Corruption detected in the clustering loop.  Invalidate the
-         * inode buffer and shut down the filesystem.
-         */
-        spin_unlock(&ip->i_cluster->icl_lock);
-        /*
-         * Clean up the buffer.  If it was B_DELWRI, just release it --
-         * brelse can handle it with no problems.  If not, shut down the
-         * filesystem before releasing the buffer.
-         */
-        if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
-                xfs_buf_relse(bp);
-        }
-        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-        if(!bufwasdelwri)  {
-                /*
-                 * Just like incore_relse: if we have b_iodone functions,
-                 * mark the buffer as an error and call them.  Otherwise
-                 * mark it as stale and brelse.
-                 */
-                if (XFS_BUF_IODONE_FUNC(bp)) {
-                        XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-                        XFS_BUF_UNDONE(bp);
-                        XFS_BUF_STALE(bp);
-                        XFS_BUF_SHUT(bp);
-                        XFS_BUF_ERROR(bp,EIO);
-                        xfs_biodone(bp);
-                } else {
-                        XFS_BUF_STALE(bp);
-                        xfs_buf_relse(bp);
-                }
-        }
-        xfs_iflush_abort(iq);
        /*
         * Unlocks the flush lock
         */
+        xfs_iflush_abort(ip);
        return XFS_ERROR(EFSCORRUPTED);
 }
@@ -3325,8 +3310,7 @@ xfs_iflush_int(
         * If the inode isn't dirty, then just release the inode
         * flush lock and do nothing.
         */
-        if ((ip->i_update_core == 0) &&
+        if (xfs_inode_clean(ip)) {
-            ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
                xfs_ifunlock(ip);
                return 0;
        }
@@ -3434,9 +3418,9 @@ xfs_iflush_int(
         * has been updated, then make the conversion permanent.
         */
        ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
-               XFS_SB_VERSION_HASNLINK(&mp->m_sb));
+               xfs_sb_version_hasnlink(&mp->m_sb));
        if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
-                if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+                if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
                        /*
                         * Convert it back.
                         */
@@ -3459,16 +3443,9 @@ xfs_iflush_int(
                }
        }
-        if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
+        xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
-                goto corrupt_out;
+        if (XFS_IFORK_Q(ip))
-        }
+                xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
-        if (XFS_IFORK_Q(ip)) {
-                /*
-                 * The only error from xfs_iflush_fork is on the data fork.
-                 */
-                (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
-        }
        xfs_inobp_check(mp, bp);
        /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index bfcd72cbaeea..93c37697a72c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -133,19 +133,6 @@ typedef struct dm_attrs_s {
 } dm_attrs_t;
 /*
- * This is the xfs inode cluster structure.  This structure is used by
- * xfs_iflush to find inodes that share a cluster and can be flushed to disk at
- * the same time.
- */
-typedef struct xfs_icluster {
-        struct hlist_head       icl_inodes;     /* list of inodes on cluster */
-        xfs_daddr_t             icl_blkno;      /* starting block number of
-                                                 * the cluster */
-        struct xfs_buf          *icl_buf;       /* the inode buffer */
-        spinlock_t              icl_lock;       /* inode list lock */
-} xfs_icluster_t;
-/*
 * This is the xfs in-core inode structure.
 * Most of the on-disk inode is embedded in the i_d field.
 *
@@ -240,10 +227,6 @@ typedef struct xfs_inode {
        atomic_t                i_pincount;     /* inode pin count */
        wait_queue_head_t       i_ipin_wait;    /* inode pinning wait queue */
        spinlock_t              i_flags_lock;   /* inode i_flags lock */
-#ifdef HAVE_REFCACHE
-        struct xfs_inode        **i_refcache;   /* ptr to entry in ref cache */
-        struct xfs_inode        *i_release;     /* inode to unref */
-#endif
        /* Miscellaneous state. */
        unsigned short          i_flags;        /* see defined flags below */
        unsigned char           i_update_core;  /* timestamps/size is dirty */
@@ -252,8 +235,6 @@ typedef struct xfs_inode {
        unsigned int            i_delayed_blks; /* count of delay alloc blks */
        xfs_icdinode_t          i_d;            /* most of ondisk inode */
-        xfs_icluster_t          *i_cluster;     /* cluster list header */
-        struct hlist_node       i_cnode;        /* cluster link node */
        xfs_fsize_t             i_size;         /* in-memory size */
        xfs_fsize_t             i_new_size;     /* size when write completes */
@@ -461,6 +442,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 #define XFS_IFLUSH_SYNC                 3
 #define XFS_IFLUSH_ASYNC                4
 #define XFS_IFLUSH_DELWRI               5
+#define XFS_IFLUSH_ASYNC_NOBLOCK        6
 /*
 * Flags for xfs_itruncate_start().
@@ -515,7 +497,7 @@ int		xfs_finish_reclaim_all(struct xfs_mount *, int);
 */
 int             xfs_itobp(struct xfs_mount *, struct xfs_trans *,
                          xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
-                          xfs_daddr_t, uint);
+                          xfs_daddr_t, uint, uint);
 int             xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
                          xfs_inode_t **, xfs_daddr_t, uint);
 int             xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
@@ -597,7 +579,6 @@ void		xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
 #define xfs_inobp_check(mp, bp)
 #endif /* DEBUG */
-extern struct kmem_zone *xfs_icluster_zone;
 extern struct kmem_zone *xfs_ifork_zone;
 extern struct kmem_zone *xfs_inode_zone;
 extern struct kmem_zone *xfs_ili_zone;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 034ca7202295..93b5db453ea2 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -40,6 +40,7 @@
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_rw.h"
+#include "xfs_error.h"
 kmem_zone_t     *xfs_ili_zone;          /* inode log item zone */
@@ -296,9 +297,9 @@ xfs_inode_item_format(
         */
        mp = ip->i_mount;
        ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
-               XFS_SB_VERSION_HASNLINK(&mp->m_sb));
+               xfs_sb_version_hasnlink(&mp->m_sb));
        if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
-                if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+                if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
                        /*
                         * Convert it back.
                         */
@@ -813,7 +814,12 @@ xfs_inode_item_pushbuf(
                                              XFS_LOG_FORCE);
                        }
                        if (dopush) {
-                                xfs_bawrite(mp, bp);
+                                int     error;
+                                error = xfs_bawrite(mp, bp);
+                                if (error)
+                                        xfs_fs_cmn_err(CE_WARN, mp,
+                "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
+                                                        error, iip, bp);
                        } else {
                                xfs_buf_relse(bp);
                        }
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index bfe92ea17952..40513077ab36 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -168,6 +168,14 @@ static inline int xfs_ilog_fext(int w)
        return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
 }
+static inline int xfs_inode_clean(xfs_inode_t *ip)
+{
+        return (!ip->i_itemp ||
+                !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+               !ip->i_update_core;
+}
 #ifdef __KERNEL__
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index fde37f87d52f..fb3cf1191419 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,8 +802,11 @@ xfs_iomap_write_allocate(
                         */
                        nimaps = 1;
                        end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
-                        xfs_bmap_last_offset(NULL, ip, &last_block,
+                        error = xfs_bmap_last_offset(NULL, ip, &last_block,
-                                XFS_DATA_FORK);
+                                                        XFS_DATA_FORK);
+                        if (error)
+                                goto trans_cancel;
                        last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
                        if ((map_start_fsb + count_fsb) > last_block) {
                                count_fsb = last_block - map_start_fsb;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 658aab6b1bbf..eb85bdedad0c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -45,7 +45,7 @@ xfs_internal_inum(
        xfs_ino_t       ino)
 {
        return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
-                (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
+                (xfs_sb_version_hasquota(&mp->m_sb) &&
                 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
 }
@@ -129,7 +129,7 @@ xfs_bulkstat_one_iget(
        return error;
 }
-STATIC int
+STATIC void
 xfs_bulkstat_one_dinode(
        xfs_mount_t     *mp,            /* mount point for filesystem */
        xfs_ino_t       ino,            /* inode number to get data for */
@@ -198,8 +198,6 @@ xfs_bulkstat_one_dinode(
                buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
                break;
        }
-        return 0;
 }
 STATIC int
@@ -614,7 +612,8 @@ xfs_bulkstat(
                                                        xfs_buf_relse(bp);
                                                error = xfs_itobp(mp, NULL, ip,
                                                                &dip, &bp, bno,
-                                                                XFS_IMAP_BULKSTAT);
+                                                                XFS_IMAP_BULKSTAT,
+                                                                XFS_BUF_LOCK);
                                                if (!error)
                                                        clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
                                                kmem_zone_free(xfs_inode_zone, ip);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a75edca1860f..afaee301b0ee 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -41,6 +41,7 @@
 #include "xfs_inode.h"
 #include "xfs_rw.h"
+kmem_zone_t     *xfs_log_ticket_zone;
 #define xlog_write_adv_cnt(ptr, len, off, bytes) \
        { (ptr) += (bytes); \
@@ -73,8 +74,6 @@ STATIC int  xlog_state_get_iclog_space(xlog_t		*log,
                                       xlog_ticket_t    *ticket,
                                       int              *continued_write,
                                       int              *logoffsetp);
-STATIC void xlog_state_put_ticket(xlog_t        *log,
-                                  xlog_ticket_t *tic);
 STATIC int  xlog_state_release_iclog(xlog_t             *log,
                                     xlog_in_core_t     *iclog);
 STATIC void xlog_state_switch_iclogs(xlog_t             *log,
@@ -101,7 +100,6 @@ STATIC void xlog_ungrant_log_space(xlog_t	 *log,
 /* local ticket functions */
-STATIC void             xlog_state_ticket_alloc(xlog_t *log);
 STATIC xlog_ticket_t    *xlog_ticket_get(xlog_t *log,
                                         int    unit_bytes,
                                         int    count,
@@ -330,7 +328,7 @@ xfs_log_done(xfs_mount_t	*mp,
                 */
                xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
                xlog_ungrant_log_space(log, ticket);
-                xlog_state_put_ticket(log, ticket);
+                xlog_ticket_put(log, ticket);
        } else {
                xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
                xlog_regrant_reserve_log_space(log, ticket);
@@ -384,7 +382,27 @@ _xfs_log_force(
                return xlog_state_sync_all(log, flags, log_flushed);
        else
                return xlog_state_sync(log, lsn, flags, log_flushed);
-}       /* xfs_log_force */
+}       /* _xfs_log_force */
+/*
+ * Wrapper for _xfs_log_force(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force(
+        xfs_mount_t     *mp,
+        xfs_lsn_t       lsn,
+        uint            flags)
+{
+        int     error;
+        error = _xfs_log_force(mp, lsn, flags, NULL);
+        if (error) {
+                xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
+                        "error %d returned.", error);
+        }
+}
 /*
 * Attaches a new iclog I/O completion callback routine during
@@ -397,12 +415,10 @@ xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
               void               *iclog_hndl,  /* iclog to hang callback off */
               xfs_log_callback_t *cb)
 {
-        xlog_t *log = mp->m_log;
        xlog_in_core_t    *iclog = (xlog_in_core_t *)iclog_hndl;
        int     abortflg;
-        cb->cb_next = NULL;
+        spin_lock(&iclog->ic_callback_lock);
-        spin_lock(&log->l_icloglock);
        abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
        if (!abortflg) {
                ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
@@ -411,7 +427,7 @@ xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
                *(iclog->ic_callback_tail) = cb;
                iclog->ic_callback_tail = &(cb->cb_next);
        }
-        spin_unlock(&log->l_icloglock);
+        spin_unlock(&iclog->ic_callback_lock);
        return abortflg;
 }       /* xfs_log_notify */
@@ -471,6 +487,8 @@ xfs_log_reserve(xfs_mount_t	 *mp,
                /* may sleep if need to allocate more tickets */
                internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
                                                  client, flags);
+                if (!internal_ticket)
+                        return XFS_ERROR(ENOMEM);
                internal_ticket->t_trans_type = t_type;
                *ticket = internal_ticket;
                xlog_trace_loggrant(log, internal_ticket, 
@@ -636,7 +654,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
        if (mp->m_flags & XFS_MOUNT_RDONLY)
                return 0;
-        xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+        error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL);
+        ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
 #ifdef DEBUG
        first_iclog = iclog = log->l_iclog;
@@ -675,10 +694,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                spin_lock(&log->l_icloglock);
                iclog = log->l_iclog;
-                iclog->ic_refcnt++;
+                atomic_inc(&iclog->ic_refcnt);
                spin_unlock(&log->l_icloglock);
                xlog_state_want_sync(log, iclog);
-                (void) xlog_state_release_iclog(log, iclog);
+                error = xlog_state_release_iclog(log, iclog);
                spin_lock(&log->l_icloglock);
                if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
@@ -695,7 +714,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                if (tic) {
                        xlog_trace_loggrant(log, tic, "unmount rec");
                        xlog_ungrant_log_space(log, tic);
-                        xlog_state_put_ticket(log, tic);
+                        xlog_ticket_put(log, tic);
                }
        } else {
                /*
@@ -713,11 +732,11 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                 */
                spin_lock(&log->l_icloglock);
                iclog = log->l_iclog;
-                iclog->ic_refcnt++;
+                atomic_inc(&iclog->ic_refcnt);
                spin_unlock(&log->l_icloglock);
                xlog_state_want_sync(log, iclog);
-                (void) xlog_state_release_iclog(log, iclog);
+                error =  xlog_state_release_iclog(log, iclog);
                spin_lock(&log->l_icloglock);
@@ -732,7 +751,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                }
        }
-        return 0;
+        return error;
 }       /* xfs_log_unmount_write */
 /*
@@ -1090,7 +1109,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t	*mp,
                        size >>= 1;
                }
-                if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
+                if (xfs_sb_version_haslogv2(&mp->m_sb)) {
                        /* # headers = size / 32K
                         * one header holds cycles from 32K of data
                         */
@@ -1186,13 +1205,13 @@ xlog_alloc_log(xfs_mount_t	*mp,
        log->l_grant_reserve_cycle = 1;
        log->l_grant_write_cycle = 1;
-        if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) {
+        if (xfs_sb_version_hassector(&mp->m_sb)) {
                log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
                ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
                /* for larger sector sizes, must have v2 or external log */
                ASSERT(log->l_sectbb_log == 0 ||
                        log->l_logBBstart == 0 ||
-                        XFS_SB_VERSION_HASLOGV2(&mp->m_sb));
+                        xfs_sb_version_haslogv2(&mp->m_sb));
                ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT);
        }
        log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
@@ -1210,7 +1229,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
        spin_lock_init(&log->l_icloglock);
        spin_lock_init(&log->l_grant_lock);
        initnsema(&log->l_flushsema, 0, "ic-flush");
-        xlog_state_ticket_alloc(log);  /* wait until after icloglock inited */
        /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
        ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1240,23 +1258,24 @@ xlog_alloc_log(xfs_mount_t	*mp,
                XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
                iclog->ic_bp = bp;
                iclog->hic_data = bp->b_addr;
+#ifdef DEBUG
                log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+#endif
                head = &iclog->ic_header;
                memset(head, 0, sizeof(xlog_rec_header_t));
                head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
                head->h_version = cpu_to_be32(
-                        XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
+                        xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
                head->h_size = cpu_to_be32(log->l_iclog_size);
                /* new fields */
                head->h_fmt = cpu_to_be32(XLOG_FMT);
                memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
                iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
                iclog->ic_state = XLOG_STATE_ACTIVE;
                iclog->ic_log = log;
+                atomic_set(&iclog->ic_refcnt, 0);
+                spin_lock_init(&iclog->ic_callback_lock);
                iclog->ic_callback_tail = &(iclog->ic_callback);
                iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
@@ -1402,10 +1421,10 @@ xlog_sync(xlog_t		*log,
        int             roundoff;       /* roundoff to BB or stripe */
        int             split = 0;      /* split write into two regions */
        int             error;
-        int             v2 = XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb);
+        int             v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
        XFS_STATS_INC(xs_log_writes);
-        ASSERT(iclog->ic_refcnt == 0);
+        ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
        /* Add for LR header */
        count_init = log->l_iclog_hsize + iclog->ic_offset;
@@ -1538,7 +1557,6 @@ STATIC void
 xlog_dealloc_log(xlog_t *log)
 {
        xlog_in_core_t  *iclog, *next_iclog;
-        xlog_ticket_t   *tic, *next_tic;
        int             i;
        iclog = log->l_iclog;
@@ -1559,22 +1577,6 @@ xlog_dealloc_log(xlog_t *log)
        spinlock_destroy(&log->l_icloglock);
        spinlock_destroy(&log->l_grant_lock);
-        /* XXXsup take a look at this again. */
-        if ((log->l_ticket_cnt != log->l_ticket_tcnt)  &&
-            !XLOG_FORCED_SHUTDOWN(log)) {
-                xfs_fs_cmn_err(CE_WARN, log->l_mp,
-                        "xlog_dealloc_log: (cnt: %d, total: %d)",
-                        log->l_ticket_cnt, log->l_ticket_tcnt);
-                /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
-        } else {
-                tic = log->l_unmount_free;
-                while (tic) {
-                        next_tic = tic->t_next;
-                        kmem_free(tic, PAGE_SIZE);
-                        tic = next_tic;
-                }
-        }
        xfs_buf_free(log->l_xbuf);
 #ifdef XFS_LOG_TRACE
        if (log->l_trace != NULL) {
@@ -1987,7 +1989,7 @@ xlog_state_clean_log(xlog_t *log)
                if (iclog->ic_state == XLOG_STATE_DIRTY) {
                        iclog->ic_state = XLOG_STATE_ACTIVE;
                        iclog->ic_offset       = 0;
-                        iclog->ic_callback      = NULL;   /* don't need to free */
+                        ASSERT(iclog->ic_callback == NULL);
                        /*
                         * If the number of ops in this iclog indicate it just
                         * contains the dummy transaction, we can
@@ -2190,37 +2192,40 @@ xlog_state_do_callback(
                                        be64_to_cpu(iclog->ic_header.h_lsn);
                                spin_unlock(&log->l_grant_lock);
-                                /*
-                                 * Keep processing entries in the callback list
-                                 * until we come around and it is empty.  We
-                                 * need to atomically see that the list is
-                                 * empty and change the state to DIRTY so that
-                                 * we don't miss any more callbacks being added.
-                                 */
-                                spin_lock(&log->l_icloglock);
                        } else {
+                                spin_unlock(&log->l_icloglock);
                                ioerrors++;
                        }
-                        cb = iclog->ic_callback;
+                        /*
+                         * Keep processing entries in the callback list until
+                         * we come around and it is empty.  We need to
+                         * atomically see that the list is empty and change the
+                         * state to DIRTY so that we don't miss any more
+                         * callbacks being added.
+                         */
+                        spin_lock(&iclog->ic_callback_lock);
+                        cb = iclog->ic_callback;
                        while (cb) {
                                iclog->ic_callback_tail = &(iclog->ic_callback);
                                iclog->ic_callback = NULL;
-                                spin_unlock(&log->l_icloglock);
+                                spin_unlock(&iclog->ic_callback_lock);
                                /* perform callbacks in the order given */
                                for (; cb; cb = cb_next) {
                                        cb_next = cb->cb_next;
                                        cb->cb_func(cb->cb_arg, aborted);
                                }
-                                spin_lock(&log->l_icloglock);
+                                spin_lock(&iclog->ic_callback_lock);
                                cb = iclog->ic_callback;
                        }
                        loopdidcallbacks++;
                        funcdidcallbacks++;
+                        spin_lock(&log->l_icloglock);
                        ASSERT(iclog->ic_callback == NULL);
+                        spin_unlock(&iclog->ic_callback_lock);
                        if (!(iclog->ic_state & XLOG_STATE_IOERROR))
                                iclog->ic_state = XLOG_STATE_DIRTY;
@@ -2241,7 +2246,7 @@ xlog_state_do_callback(
                        repeats = 0;
                        xfs_fs_cmn_err(CE_WARN, log->l_mp,
                                "%s: possible infinite loop (%d iterations)",
-                                __FUNCTION__, flushcnt);
+                                __func__, flushcnt);
                }
        } while (!ioerrors && loopdidcallbacks);
@@ -2309,7 +2314,7 @@ xlog_state_done_syncing(
        ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
               iclog->ic_state == XLOG_STATE_IOERROR);
-        ASSERT(iclog->ic_refcnt == 0);
+        ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
        ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
@@ -2391,7 +2396,7 @@ restart:
        ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
        head = &iclog->ic_header;
-        iclog->ic_refcnt++;                     /* prevents sync */
+        atomic_inc(&iclog->ic_refcnt);  /* prevents sync */
        log_offset = iclog->ic_offset;
        /* On the 1st write to an iclog, figure out lsn.  This works
@@ -2423,12 +2428,12 @@ restart:
                xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
                /* If I'm the only one writing to this iclog, sync it to disk */
-                if (iclog->ic_refcnt == 1) {
+                if (atomic_read(&iclog->ic_refcnt) == 1) {
                        spin_unlock(&log->l_icloglock);
                        if ((error = xlog_state_release_iclog(log, iclog)))
                                return error;
                } else {
-                        iclog->ic_refcnt--;
+                        atomic_dec(&iclog->ic_refcnt);
                        spin_unlock(&log->l_icloglock);
                }
                goto restart;
@@ -2792,18 +2797,6 @@ xlog_ungrant_log_space(xlog_t	     *log,
 /*
- * Atomically put back used ticket.
- */
-STATIC void
-xlog_state_put_ticket(xlog_t        *log,
-                      xlog_ticket_t *tic)
-{
-        spin_lock(&log->l_icloglock);
-        xlog_ticket_put(log, tic);
-        spin_unlock(&log->l_icloglock);
-}       /* xlog_state_put_ticket */
-/*
 * Flush iclog to disk if this is the last reference to the given iclog and
 * the WANT_SYNC bit is set.
 *
@@ -2813,33 +2806,35 @@ xlog_state_put_ticket(xlog_t	    *log,
 *
 */
 STATIC int
-xlog_state_release_iclog(xlog_t         *log,
+xlog_state_release_iclog(
-                         xlog_in_core_t *iclog)
+        xlog_t          *log,
+        xlog_in_core_t  *iclog)
 {
        int             sync = 0;       /* do we sync? */
-        xlog_assign_tail_lsn(log->l_mp);
+        if (iclog->ic_state & XLOG_STATE_IOERROR)
+                return XFS_ERROR(EIO);
-        spin_lock(&log->l_icloglock);
+        ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
+        if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
+                return 0;
        if (iclog->ic_state & XLOG_STATE_IOERROR) {
                spin_unlock(&log->l_icloglock);
                return XFS_ERROR(EIO);
        }
-        ASSERT(iclog->ic_refcnt > 0);
        ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
               iclog->ic_state == XLOG_STATE_WANT_SYNC);
-        if (--iclog->ic_refcnt == 0 &&
+        if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
-            iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+                /* update tail before writing to iclog */
+                xlog_assign_tail_lsn(log->l_mp);
                sync++;
                iclog->ic_state = XLOG_STATE_SYNCING;
                iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
                xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
                /* cycle incremented when incrementing curr_block */
        }
        spin_unlock(&log->l_icloglock);
        /*
@@ -2849,11 +2844,9 @@ xlog_state_release_iclog(xlog_t		*log,
         * this iclog has consistent data, so we ignore IOERROR
         * flags after this point.
         */
-        if (sync) {
+        if (sync)
                return xlog_sync(log, iclog);
-        }
        return 0;
 }       /* xlog_state_release_iclog */
@@ -2881,7 +2874,7 @@ xlog_state_switch_iclogs(xlog_t		*log,
        log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
        /* Round up to next log-sunit */
-        if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
+        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
            log->l_mp->m_sb.sb_logsunit > 1) {
                __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
                log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
@@ -2953,7 +2946,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
                 * previous iclog and go to sleep.
                 */
                if (iclog->ic_state == XLOG_STATE_DIRTY ||
-                    (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
+                    (atomic_read(&iclog->ic_refcnt) == 0
+                     && iclog->ic_offset == 0)) {
                        iclog = iclog->ic_prev;
                        if (iclog->ic_state == XLOG_STATE_ACTIVE ||
                            iclog->ic_state == XLOG_STATE_DIRTY)
@@ -2961,14 +2955,14 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
                        else
                                goto maybe_sleep;
                } else {
-                        if (iclog->ic_refcnt == 0) {
+                        if (atomic_read(&iclog->ic_refcnt) == 0) {
                                /* We are the only one with access to this
                                 * iclog.  Flush it out now.  There should
                                 * be a roundoff of zero to show that someone
                                 * has already taken care of the roundoff from
                                 * the previous sync.
                                 */
-                                iclog->ic_refcnt++;
+                                atomic_inc(&iclog->ic_refcnt);
                                lsn = be64_to_cpu(iclog->ic_header.h_lsn);
                                xlog_state_switch_iclogs(log, iclog, 0);
                                spin_unlock(&log->l_icloglock);
@@ -3100,7 +3094,7 @@ try_again:
                        already_slept = 1;
                        goto try_again;
                } else {
-                        iclog->ic_refcnt++;
+                        atomic_inc(&iclog->ic_refcnt);
                        xlog_state_switch_iclogs(log, iclog, 0);
                        spin_unlock(&log->l_icloglock);
                        if (xlog_state_release_iclog(log, iclog))
@@ -3172,92 +3166,19 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
 */
 /*
- *      Algorithm doesn't take into account page size. ;-(
+ * Free a used ticket.
- */
-STATIC void
-xlog_state_ticket_alloc(xlog_t *log)
-{
-        xlog_ticket_t   *t_list;
-        xlog_ticket_t   *next;
-        xfs_caddr_t     buf;
-        uint            i = (PAGE_SIZE / sizeof(xlog_ticket_t)) - 2;
-        /*
-         * The kmem_zalloc may sleep, so we shouldn't be holding the
-         * global lock.  XXXmiken: may want to use zone allocator.
-         */
-        buf = (xfs_caddr_t) kmem_zalloc(PAGE_SIZE, KM_SLEEP);
-        spin_lock(&log->l_icloglock);
-        /* Attach 1st ticket to Q, so we can keep track of allocated memory */
-        t_list = (xlog_ticket_t *)buf;
-        t_list->t_next = log->l_unmount_free;
-        log->l_unmount_free = t_list++;
-        log->l_ticket_cnt++;
-        log->l_ticket_tcnt++;
-        /* Next ticket becomes first ticket attached to ticket free list */
-        if (log->l_freelist != NULL) {
-                ASSERT(log->l_tail != NULL);
-                log->l_tail->t_next = t_list;
-        } else {
-                log->l_freelist = t_list;
-        }
-        log->l_ticket_cnt++;
-        log->l_ticket_tcnt++;
-        /* Cycle through rest of alloc'ed memory, building up free Q */
-        for ( ; i > 0; i--) {
-                next = t_list + 1;
-                t_list->t_next = next;
-                t_list = next;
-                log->l_ticket_cnt++;
-                log->l_ticket_tcnt++;
-        }
-        t_list->t_next = NULL;
-        log->l_tail = t_list;
-        spin_unlock(&log->l_icloglock);
-}       /* xlog_state_ticket_alloc */
-/*
- * Put ticket into free list
- *
- * Assumption: log lock is held around this call.
 */
 STATIC void
 xlog_ticket_put(xlog_t          *log,
                xlog_ticket_t   *ticket)
 {
        sv_destroy(&ticket->t_sema);
+        kmem_zone_free(xfs_log_ticket_zone, ticket);
-        /*
-         * Don't think caching will make that much difference.  It's
-         * more important to make debug easier.
-         */
-#if 0
-        /* real code will want to use LIFO for caching */
-        ticket->t_next = log->l_freelist;
-        log->l_freelist = ticket;
-        /* no need to clear fields */
-#else
-        /* When we debug, it is easier if tickets are cycled */
-        ticket->t_next     = NULL;
-        if (log->l_tail) {
-                log->l_tail->t_next = ticket;
-        } else {
-                ASSERT(log->l_freelist == NULL);
-                log->l_freelist = ticket;
-        }
-        log->l_tail         = ticket;
-#endif /* DEBUG */
-        log->l_ticket_cnt++;
 }       /* xlog_ticket_put */
 /*
- * Grab ticket off freelist or allocation some more
+ * Allocate and initialise a new log ticket.
 */
 STATIC xlog_ticket_t *
 xlog_ticket_get(xlog_t          *log,
@@ -3269,21 +3190,9 @@ xlog_ticket_get(xlog_t		*log,
        xlog_ticket_t   *tic;
        uint            num_headers;
- alloc:
+        tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
-        if (log->l_freelist == NULL)
+        if (!tic)
-                xlog_state_ticket_alloc(log);           /* potentially sleep */
+                return NULL;
-        spin_lock(&log->l_icloglock);
-        if (log->l_freelist == NULL) {
-                spin_unlock(&log->l_icloglock);
-                goto alloc;
-        }
-        tic             = log->l_freelist;
-        log->l_freelist = tic->t_next;
-        if (log->l_freelist == NULL)
-                log->l_tail = NULL;
-        log->l_ticket_cnt--;
-        spin_unlock(&log->l_icloglock);
        /*
         * Permanent reservations have up to 'cnt'-1 active log operations
@@ -3334,7 +3243,7 @@ xlog_ticket_get(xlog_t		*log,
        unit_bytes += sizeof(xlog_op_header_t) * num_headers;
        /* for roundoff padding for transaction data and one for commit record */
-        if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
+        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
            log->l_mp->m_sb.sb_logsunit > 1) {
                /* log su roundoff */
                unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
@@ -3611,8 +3520,8 @@ xfs_log_force_umount(
         * before we mark the filesystem SHUTDOWN and wake
         * everybody up to tell the bad news.
         */
-        spin_lock(&log->l_grant_lock);
        spin_lock(&log->l_icloglock);
+        spin_lock(&log->l_grant_lock);
        mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
        XFS_BUF_DONE(mp->m_sb_bp);
        /*
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 4cdac048df5e..d1d678ecb63e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -142,8 +142,9 @@ int	  _xfs_log_force(struct xfs_mount *mp,
                         xfs_lsn_t      lsn,
                         uint           flags,
                         int            *log_forced);
-#define xfs_log_force(mp, lsn, flags) \
+void      xfs_log_force(struct xfs_mount        *mp,
-        _xfs_log_force(mp, lsn, flags, NULL);
+                        xfs_lsn_t               lsn,
+                        uint                    flags);
 int       xfs_log_mount(struct xfs_mount        *mp,
                        struct xfs_buftarg      *log_target,
                        xfs_daddr_t             start_block,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index e008233ee249..8952a392b5f3 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -49,10 +49,10 @@ struct xfs_mount;
 #define XLOG_HEADER_SIZE        512
 #define XLOG_REC_SHIFT(log) \
-        BTOBB(1 << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \
+        BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
         XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
 #define XLOG_TOTAL_REC_SHIFT(log) \
-        BTOBB(XLOG_MAX_ICLOGS << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \
+        BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
         XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
@@ -242,7 +242,7 @@ typedef struct xlog_res {
 typedef struct xlog_ticket {
        sv_t               t_sema;       /* sleep on this semaphore      : 20 */
-        struct xlog_ticket *t_next;      /*                              :4|8 */
+        struct xlog_ticket *t_next;      /*                              :4|8 */
        struct xlog_ticket *t_prev;      /*                              :4|8 */
        xlog_tid_t         t_tid;        /* transaction identifier       : 4  */
        int                t_curr_res;   /* current reservation in bytes : 4  */
@@ -324,6 +324,19 @@ typedef struct xlog_rec_ext_header {
 * - ic_offset is the current number of bytes written to in this iclog.
 * - ic_refcnt is bumped when someone is writing to the log.
 * - ic_state is the state of the iclog.
+ *
+ * Because of cacheline contention on large machines, we need to separate
+ * various resources onto different cachelines. To start with, make the
+ * structure cacheline aligned. The following fields can be contended on
+ * by independent processes:
+ *
+ *      - ic_callback_*
+ *      - ic_refcnt
+ *      - fields protected by the global l_icloglock
+ *
+ * so we need to ensure that these fields are located in separate cachelines.
+ * We'll put all the read-only and l_icloglock fields in the first cacheline,
+ * and move everything else out to subsequent cachelines.
 */
 typedef struct xlog_iclog_fields {
        sv_t                    ic_forcesema;
@@ -332,17 +345,22 @@ typedef struct xlog_iclog_fields {
        struct xlog_in_core     *ic_prev;
        struct xfs_buf          *ic_bp;
        struct log              *ic_log;
-        xfs_log_callback_t      *ic_callback;
-        xfs_log_callback_t      **ic_callback_tail;
-#ifdef XFS_LOG_TRACE
-        struct ktrace           *ic_trace;
-#endif
        int                     ic_size;
        int                     ic_offset;
-        int                     ic_refcnt;
        int                     ic_bwritecnt;
        ushort_t                ic_state;
        char                    *ic_datap;      /* pointer to iclog data */
+#ifdef XFS_LOG_TRACE
+        struct ktrace           *ic_trace;
+#endif
+        /* Callback structures need their own cacheline */
+        spinlock_t              ic_callback_lock ____cacheline_aligned_in_smp;
+        xfs_log_callback_t      *ic_callback;
+        xfs_log_callback_t      **ic_callback_tail;
+        /* reference counts need their own cacheline */
+        atomic_t                ic_refcnt ____cacheline_aligned_in_smp;
 } xlog_iclog_fields_t;
 typedef union xlog_in_core2 {
@@ -366,6 +384,7 @@ typedef struct xlog_in_core {
 #define ic_bp           hic_fields.ic_bp
 #define ic_log          hic_fields.ic_log
 #define ic_callback     hic_fields.ic_callback
+#define ic_callback_lock hic_fields.ic_callback_lock
 #define ic_callback_tail hic_fields.ic_callback_tail
 #define ic_trace        hic_fields.ic_trace
 #define ic_size         hic_fields.ic_size
@@ -383,43 +402,46 @@ typedef struct xlog_in_core {
 * that round off problems won't occur when releasing partial reservations.
 */
 typedef struct log {
+        /* The following fields don't need locking */
+        struct xfs_mount        *l_mp;          /* mount point */
+        struct xfs_buf          *l_xbuf;        /* extra buffer for log
+                                                 * wrapping */
+        struct xfs_buftarg      *l_targ;        /* buftarg of log */
+        uint                    l_flags;
+        uint                    l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
+        struct xfs_buf_cancel   **l_buf_cancel_table;
+        int                     l_iclog_hsize;  /* size of iclog header */
+        int                     l_iclog_heads;  /* # of iclog header sectors */
+        uint                    l_sectbb_log;   /* log2 of sector size in BBs */
+        uint                    l_sectbb_mask;  /* sector size (in BBs)
+                                                 * alignment mask */
+        int                     l_iclog_size;   /* size of log in bytes */
+        int                     l_iclog_size_log; /* log power size of log */
+        int                     l_iclog_bufs;   /* number of iclog buffers */
+        xfs_daddr_t             l_logBBstart;   /* start block of log */
+        int                     l_logsize;      /* size of log in bytes */
+        int                     l_logBBsize;    /* size of log in BB chunks */
        /* The following block of fields are changed while holding icloglock */
-        sema_t                  l_flushsema;    /* iclog flushing semaphore */
+        sema_t                  l_flushsema ____cacheline_aligned_in_smp;
+                                                /* iclog flushing semaphore */
        int                     l_flushcnt;     /* # of procs waiting on this
                                                 * sema */
-        int                     l_ticket_cnt;   /* free ticket count */
-        int                     l_ticket_tcnt;  /* total ticket count */
        int                     l_covered_state;/* state of "covering disk
                                                 * log entries" */
-        xlog_ticket_t           *l_freelist;    /* free list of tickets */
-        xlog_ticket_t           *l_unmount_free;/* kmem_free these addresses */
-        xlog_ticket_t           *l_tail;        /* free list of tickets */
        xlog_in_core_t          *l_iclog;       /* head log queue       */
        spinlock_t              l_icloglock;    /* grab to change iclog state */
        xfs_lsn_t               l_tail_lsn;     /* lsn of 1st LR with unflushed
                                                 * buffers */
        xfs_lsn_t               l_last_sync_lsn;/* lsn of last LR on disk */
-        struct xfs_mount        *l_mp;          /* mount point */
-        struct xfs_buf          *l_xbuf;        /* extra buffer for log
-                                                 * wrapping */
-        struct xfs_buftarg      *l_targ;        /* buftarg of log */
-        xfs_daddr_t             l_logBBstart;   /* start block of log */
-        int                     l_logsize;      /* size of log in bytes */
-        int                     l_logBBsize;    /* size of log in BB chunks */
        int                     l_curr_cycle;   /* Cycle number of log writes */
        int                     l_prev_cycle;   /* Cycle number before last
                                                 * block increment */
        int                     l_curr_block;   /* current logical log block */
        int                     l_prev_block;   /* previous logical log block */
-        int                     l_iclog_size;   /* size of log in bytes */
-        int                     l_iclog_size_log; /* log power size of log */
-        int                     l_iclog_bufs;   /* number of iclog buffers */
-        /* The following field are used for debugging; need to hold icloglock */
-        char                    *l_iclog_bak[XLOG_MAX_ICLOGS];
        /* The following block of fields are changed while holding grant_lock */
-        spinlock_t              l_grant_lock;
+        spinlock_t              l_grant_lock ____cacheline_aligned_in_smp;
        xlog_ticket_t           *l_reserve_headq;
        xlog_ticket_t           *l_write_headq;
        int                     l_grant_reserve_cycle;
@@ -427,19 +449,16 @@ typedef struct log {
        int                     l_grant_write_cycle;
        int                     l_grant_write_bytes;
-        /* The following fields don't need locking */
 #ifdef XFS_LOG_TRACE
        struct ktrace           *l_trace;
        struct ktrace           *l_grant_trace;
 #endif
-        uint                    l_flags;
-        uint                    l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
+        /* The following field are used for debugging; need to hold icloglock */
-        struct xfs_buf_cancel   **l_buf_cancel_table;
+#ifdef DEBUG
-        int                     l_iclog_hsize;  /* size of iclog header */
+        char                    *l_iclog_bak[XLOG_MAX_ICLOGS];
-        int                     l_iclog_heads;  /* # of iclog header sectors */
+#endif
-        uint                    l_sectbb_log;   /* log2 of sector size in BBs */
-        uint                    l_sectbb_mask;  /* sector size (in BBs)
-                                                 * alignment mask */
 } xlog_t;
 #define XLOG_FORCED_SHUTDOWN(log)       ((log)->l_flags & XLOG_IO_ERROR)
@@ -459,6 +478,8 @@ extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void      xlog_put_bp(struct xfs_buf *);
 extern int       xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
+extern kmem_zone_t      *xfs_log_ticket_zone;
 /* iclog tracing */
 #define XLOG_TRACE_GRAB_FLUSH  1
 #define XLOG_TRACE_REL_FLUSH   2
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b82d5d4d2462..e65ab4af0955 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -46,6 +46,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
 #include "xfs_rw.h"
+#include "xfs_utils.h"
 STATIC int      xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
 STATIC int      xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
@@ -120,7 +121,8 @@ xlog_bread(
        XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
        xfsbdstrat(log->l_mp, bp);
-        if ((error = xfs_iowait(bp)))
+        error = xfs_iowait(bp);
+        if (error)
                xfs_ioerror_alert("xlog_bread", log->l_mp,
                                  bp, XFS_BUF_ADDR(bp));
        return error;
@@ -191,7 +193,7 @@ xlog_header_check_dump(
 {
        int                     b;
-        cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __FUNCTION__);
+        cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __func__);
        for (b = 0; b < 16; b++)
                cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
        cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
@@ -478,7 +480,7 @@ xlog_find_verify_log_record(
         * reset last_blk.  Only when last_blk points in the middle of a log
         * record do we update last_blk.
         */
-        if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                uint    h_size = be32_to_cpu(head->h_size);
                xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
@@ -888,7 +890,7 @@ xlog_find_tail(
         * unmount record if there is one, so we pass the lsn of the
         * unmount record rather than the block after it.
         */
-        if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                int     h_size = be32_to_cpu(rhead->h_size);
                int     h_version = be32_to_cpu(rhead->h_version);
@@ -1101,7 +1103,7 @@ xlog_add_record(
        recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
        recp->h_cycle = cpu_to_be32(cycle);
        recp->h_version = cpu_to_be32(
-                        XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
+                        xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
        recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
        recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
        recp->h_fmt = cpu_to_be32(XLOG_FMT);
@@ -1160,10 +1162,14 @@ xlog_write_log_records(
                if (j == 0 && (start_block + endcount > ealign)) {
                        offset = XFS_BUF_PTR(bp);
                        balign = BBTOB(ealign - start_block);
-                        XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb));
+                        error = XFS_BUF_SET_PTR(bp, offset + balign,
-                        if ((error = xlog_bread(log, ealign, sectbb, bp)))
+                                                BBTOB(sectbb));
+                        if (!error)
+                                error = xlog_bread(log, ealign, sectbb, bp);
+                        if (!error)
+                                error = XFS_BUF_SET_PTR(bp, offset, bufblks);
+                        if (error)
                                break;
-                        XFS_BUF_SET_PTR(bp, offset, bufblks);
                }
                offset = xlog_align(log, start_block, endcount, bp);
@@ -2280,7 +2286,9 @@ xlog_recover_do_inode_trans(
                 * invalidate the buffer when we write it out below.
                 */
                imap.im_blkno = 0;
-                xfs_imap(log->l_mp, NULL, ino, &imap, 0);
+                error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
+                if (error)
+                        goto error;
        }
        /*
@@ -2964,7 +2972,7 @@ xlog_recover_process_data(
 * Process an extent free intent item that was recovered from
 * the log.  We need to free the extents that it describes.
 */
-STATIC void
+STATIC int
 xlog_recover_process_efi(
        xfs_mount_t             *mp,
        xfs_efi_log_item_t      *efip)
@@ -2972,6 +2980,7 @@ xlog_recover_process_efi(
        xfs_efd_log_item_t      *efdp;
        xfs_trans_t             *tp;
        int                     i;
+        int                     error = 0;
        xfs_extent_t            *extp;
        xfs_fsblock_t           startblock_fsb;
@@ -2995,23 +3004,32 @@ xlog_recover_process_efi(
                         * free the memory associated with it.
                         */
                        xfs_efi_release(efip, efip->efi_format.efi_nextents);
-                        return;
+                        return XFS_ERROR(EIO);
                }
        }
        tp = xfs_trans_alloc(mp, 0);
-        xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+        error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+        if (error)
+                goto abort_error;
        efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
        for (i = 0; i < efip->efi_format.efi_nextents; i++) {
                extp = &(efip->efi_format.efi_extents[i]);
-                xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+                error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+                if (error)
+                        goto abort_error;
                xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
                                         extp->ext_len);
        }
        efip->efi_flags |= XFS_EFI_RECOVERED;
-        xfs_trans_commit(tp, 0);
+        error = xfs_trans_commit(tp, 0);
+        return error;
+abort_error:
+        xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+        return error;
 }
 /*
@@ -3059,7 +3077,7 @@ xlog_recover_check_ail(
 * everything already in the AIL, we stop processing as soon as
 * we see something other than an EFI in the AIL.
 */
-STATIC void
+STATIC int
 xlog_recover_process_efis(
        xlog_t                  *log)
 {
@@ -3067,6 +3085,7 @@ xlog_recover_process_efis(
        xfs_efi_log_item_t      *efip;
        int                     gen;
        xfs_mount_t             *mp;
+        int                     error = 0;
        mp = log->l_mp;
        spin_lock(&mp->m_ail_lock);
@@ -3091,11 +3110,14 @@ xlog_recover_process_efis(
                }
                spin_unlock(&mp->m_ail_lock);
-                xlog_recover_process_efi(mp, efip);
+                error = xlog_recover_process_efi(mp, efip);
+                if (error)
+                        return error;
                spin_lock(&mp->m_ail_lock);
                lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
        }
        spin_unlock(&mp->m_ail_lock);
+        return error;
 }
 /*
@@ -3115,21 +3137,18 @@ xlog_recover_clear_agi_bucket(
        int             error;
        tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-        xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
+        error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
+        if (!error)
-        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
                                   XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
                                   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
-        if (error) {
+        if (error)
-                xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+                goto out_abort;
-                return;
-        }
+        error = EINVAL;
        agi = XFS_BUF_TO_AGI(agibp);
-        if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) {
+        if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
-                xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+                goto out_abort;
-                return;
-        }
        agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
        offset = offsetof(xfs_agi_t, agi_unlinked) +
@@ -3137,7 +3156,17 @@ xlog_recover_clear_agi_bucket(
        xfs_trans_log_buf(tp, agibp, offset,
                          (offset + sizeof(xfs_agino_t) - 1));
-        (void) xfs_trans_commit(tp, 0);
+        error = xfs_trans_commit(tp, 0);
+        if (error)
+                goto out_error;
+        return;
+out_abort:
+        xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+out_error:
+        xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
+                        "failed to clear agi %d. Continuing.", agno);
+        return;
 }
 /*
@@ -3214,7 +3243,8 @@ xlog_recover_process_iunlinks(
                                         * next inode in the bucket.
                                         */
                                        error = xfs_itobp(mp, NULL, ip, &dip,
-                                                        &ibp, 0, 0);
+                                                        &ibp, 0, 0,
+                                                        XFS_BUF_LOCK);
                                        ASSERT(error || (dip != NULL));
                                }
@@ -3247,7 +3277,7 @@ xlog_recover_process_iunlinks(
                                        if (ip->i_d.di_mode == 0)
                                                xfs_iput_new(ip, 0);
                                        else
-                                                VN_RELE(XFS_ITOV(ip));
+                                                IRELE(ip);
                                } else {
                                        /*
                                         * We can't read in the inode
@@ -3348,7 +3378,7 @@ xlog_pack_data(
                dp += BBSIZE;
        }
-        if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
                for ( ; i < BTOBB(size); i++) {
                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3388,7 +3418,7 @@ xlog_unpack_data_checksum(
                            be32_to_cpu(rhead->h_chksum), chksum);
                    cmn_err(CE_DEBUG,
 "XFS: Disregard message if filesystem was created with non-DEBUG kernel");
-                    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+                    if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                            cmn_err(CE_DEBUG,
                                "XFS: LogR this is a LogV2 filesystem\n");
                    }
@@ -3415,7 +3445,7 @@ xlog_unpack_data(
                dp += BBSIZE;
        }
-        if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                xhdr = (xlog_in_core_2_t *)rhead;
                for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3445,7 +3475,7 @@ xlog_valid_rec_header(
            (!rhead->h_version ||
            (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
                xlog_warn("XFS: %s: unrecognised log version (%d).",
-                        __FUNCTION__, be32_to_cpu(rhead->h_version));
+                        __func__, be32_to_cpu(rhead->h_version));
                return XFS_ERROR(EIO);
        }
@@ -3494,7 +3524,7 @@ xlog_do_recovery_pass(
         * Read the header of the tail block and get the iclog buffer size from
         * h_size.  Use this to tell how many sectors make up the log header.
         */
-        if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                /*
                 * When using variable length iclogs, read first sector of
                 * iclog header and extract the header size from it.  Get a
@@ -3604,15 +3634,19 @@ xlog_do_recovery_pass(
                                 *   _first_, then the log start (LR header end)
                                 *   - order is important.
                                 */
+                                wrapped_hblks = hblks - split_hblks;
                                bufaddr = XFS_BUF_PTR(hbp);
-                                XFS_BUF_SET_PTR(hbp,
+                                error = XFS_BUF_SET_PTR(hbp,
                                                bufaddr + BBTOB(split_hblks),
                                                BBTOB(hblks - split_hblks));
-                                wrapped_hblks = hblks - split_hblks;
+                                if (!error)
-                                error = xlog_bread(log, 0, wrapped_hblks, hbp);
+                                        error = xlog_bread(log, 0,
+                                                        wrapped_hblks, hbp);
+                                if (!error)
+                                        error = XFS_BUF_SET_PTR(hbp, bufaddr,
+                                                        BBTOB(hblks));
                                if (error)
                                        goto bread_err2;
-                                XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
                                if (!offset)
                                        offset = xlog_align(log, 0,
                                                        wrapped_hblks, hbp);
@@ -3664,13 +3698,18 @@ xlog_do_recovery_pass(
                                 *   - order is important.
                                 */
                                bufaddr = XFS_BUF_PTR(dbp);
-                                XFS_BUF_SET_PTR(dbp,
+                                error = XFS_BUF_SET_PTR(dbp,
                                                bufaddr + BBTOB(split_bblks),
                                                BBTOB(bblks - split_bblks));
-                                if ((error = xlog_bread(log, wrapped_hblks,
+                                if (!error)
-                                                bblks - split_bblks, dbp)))
+                                        error = xlog_bread(log, wrapped_hblks,
+                                                        bblks - split_bblks,
+                                                        dbp);
+                                if (!error)
+                                        error = XFS_BUF_SET_PTR(dbp, bufaddr,
+                                                        h_size);
+                                if (error)
                                        goto bread_err2;
-                                XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
                                if (!offset)
                                        offset = xlog_align(log, wrapped_hblks,
                                                bblks - split_bblks, dbp);
@@ -3826,7 +3865,8 @@ xlog_do_recover(
        XFS_BUF_READ(bp);
        XFS_BUF_UNASYNC(bp);
        xfsbdstrat(log->l_mp, bp);
-        if ((error = xfs_iowait(bp))) {
+        error = xfs_iowait(bp);
+        if (error) {
                xfs_ioerror_alert("xlog_do_recover",
                                  log->l_mp, bp, XFS_BUF_ADDR(bp));
                ASSERT(0);
@@ -3838,7 +3878,7 @@ xlog_do_recover(
        sbp = &log->l_mp->m_sb;
        xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
        ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
-        ASSERT(XFS_SB_GOOD_VERSION(sbp));
+        ASSERT(xfs_sb_good_version(sbp));
        xfs_buf_relse(bp);
        /* We've re-read the superblock so re-initialize per-cpu counters */
@@ -3917,7 +3957,14 @@ xlog_recover_finish(
         * rather than accepting new requests.
         */
        if (log->l_flags & XLOG_RECOVERY_NEEDED) {
-                xlog_recover_process_efis(log);
+                int     error;
+                error = xlog_recover_process_efis(log);
+                if (error) {
+                        cmn_err(CE_ALERT,
+                                "Failed to recover EFIs on filesystem: %s",
+                                log->l_mp->m_fsname);
+                        return error;
+                }
                /*
                 * Sync the log to get all the EFIs out of the AIL.
                 * This isn't absolutely necessary, but it helps in
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6409b3762995..2fec452afbcc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,8 +43,9 @@
 #include "xfs_rw.h"
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
+#include "xfs_utils.h"
-STATIC void     xfs_mount_log_sbunit(xfs_mount_t *, __int64_t);
+STATIC int      xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 STATIC int      xfs_uuid_mount(xfs_mount_t *);
 STATIC void     xfs_uuid_unmount(xfs_mount_t *mp);
 STATIC void     xfs_unmountfs_wait(xfs_mount_t *);
@@ -57,7 +58,7 @@ STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
 STATIC void     xfs_icsb_sync_counters(xfs_mount_t *);
 STATIC int      xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
                                                int64_t, int);
-STATIC int      xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
+STATIC void     xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
 #else
@@ -119,6 +120,7 @@ static const struct {
    { offsetof(xfs_sb_t, sb_logsectsize),0 },
    { offsetof(xfs_sb_t, sb_logsunit),   0 },
    { offsetof(xfs_sb_t, sb_features2),  0 },
+    { offsetof(xfs_sb_t, sb_bad_features2), 0 },
    { sizeof(xfs_sb_t),                  0 }
 };
@@ -225,7 +227,7 @@ xfs_mount_validate_sb(
                return XFS_ERROR(EWRONGFS);
        }
-        if (!XFS_SB_GOOD_VERSION(sbp)) {
+        if (!xfs_sb_good_version(sbp)) {
                xfs_fs_mount_cmn_err(flags, "bad version");
                return XFS_ERROR(EWRONGFS);
        }
@@ -300,7 +302,7 @@ xfs_mount_validate_sb(
        /*
         * Version 1 directory format has never worked on Linux.
         */
-        if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) {
+        if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
                xfs_fs_mount_cmn_err(flags,
                        "file system using version 1 directory format");
                return XFS_ERROR(ENOSYS);
@@ -449,6 +451,7 @@ xfs_sb_from_disk(
        to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
        to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
        to->sb_features2 = be32_to_cpu(from->sb_features2);
+        to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
 }
 /*
@@ -781,7 +784,7 @@ xfs_update_alignment(xfs_mount_t *mp, int mfsi_flags, __uint64_t *update_flags)
                 * Update superblock with new values
                 * and log changes
                 */
-                if (XFS_SB_VERSION_HASDALIGN(sbp)) {
+                if (xfs_sb_version_hasdalign(sbp)) {
                        if (sbp->sb_unit != mp->m_dalign) {
                                sbp->sb_unit = mp->m_dalign;
                                *update_flags |= XFS_SB_UNIT;
@@ -792,7 +795,7 @@ xfs_update_alignment(xfs_mount_t *mp, int mfsi_flags, __uint64_t *update_flags)
                        }
                }
        } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
-                    XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) {
+                    xfs_sb_version_hasdalign(&mp->m_sb)) {
                        mp->m_dalign = sbp->sb_unit;
                        mp->m_swidth = sbp->sb_width;
        }
@@ -869,7 +872,7 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
 STATIC void
 xfs_set_inoalignment(xfs_mount_t *mp)
 {
-        if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) &&
+        if (xfs_sb_version_hasalign(&mp->m_sb) &&
            mp->m_sb.sb_inoalignmt >=
            XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
                mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
@@ -954,7 +957,6 @@ xfs_mountfs(
 {
        xfs_sb_t        *sbp = &(mp->m_sb);
        xfs_inode_t     *rip;
-        bhv_vnode_t     *rvp = NULL;
        __uint64_t      resblks;
        __int64_t       update_flags = 0LL;
        uint            quotamount, quotaflags;
@@ -962,14 +964,41 @@ xfs_mountfs(
        int             uuid_mounted = 0;
        int             error = 0;
-        if (mp->m_sb_bp == NULL) {
-                error = xfs_readsb(mp, mfsi_flags);
-                if (error)
-                        return error;
-        }
        xfs_mount_common(mp, sbp);
        /*
+         * Check for mismatched features2 values.  Older kernels
+         * read & wrote into the wrong sb offset for sb_features2
+         * on some platforms due to xfs_sb_t not being 64bit size aligned
+         * when sb_features2 was added, which made older superblock
+         * reading/writing routines swap it as a 64-bit value.
+         *
+         * For backwards compatibility, we make both slots equal.
+         *
+         * If we detect a mismatched field, we OR the set bits into the
+         * existing features2 field in case it has already been modified; we
+         * don't want to lose any features.  We then update the bad location
+         * with the ORed value so that older kernels will see any features2
+         * flags, and mark the two fields as needing updates once the
+         * transaction subsystem is online.
+         */
+        if (xfs_sb_has_mismatched_features2(sbp)) {
+                cmn_err(CE_WARN,
+                        "XFS: correcting sb_features alignment problem");
+                sbp->sb_features2 |= sbp->sb_bad_features2;
+                sbp->sb_bad_features2 = sbp->sb_features2;
+                update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
+                /*
+                 * Re-check for ATTR2 in case it was found in bad_features2
+                 * slot.
+                 */
+                if (xfs_sb_version_hasattr2(&mp->m_sb))
+                        mp->m_flags |= XFS_MOUNT_ATTR2;
+        }
+        /*
         * Check if sb_agblocks is aligned at stripe boundary
         * If sb_agblocks is NOT aligned turn off m_dalign since
         * allocator alignment is within an ag, therefore ag has
@@ -1129,7 +1158,6 @@ xfs_mountfs(
        }
        ASSERT(rip != NULL);
-        rvp = XFS_ITOV(rip);
        if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
                cmn_err(CE_WARN, "XFS: corrupted root inode");
@@ -1159,11 +1187,15 @@ xfs_mountfs(
        }
        /*
-         * If fs is not mounted readonly, then update the superblock
+         * If fs is not mounted readonly, then write out the superblock changes.
-         * unit and width changes.
         */
-        if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY))
+        if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                xfs_mount_log_sbunit(mp, update_flags);
+                error = xfs_mount_log_sb(mp, update_flags);
+                if (error) {
+                        cmn_err(CE_WARN, "XFS: failed to write sb changes");
+                        goto error4;
+                }
+        }
        /*
         * Initialise the XFS quota management subsystem for this mount
@@ -1200,12 +1232,15 @@ xfs_mountfs(
         *
         * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
         * This may drive us straight to ENOSPC on mount, but that implies
-         * we were already there on the last unmount.
+         * we were already there on the last unmount. Warn if this occurs.
         */
        resblks = mp->m_sb.sb_dblocks;
        do_div(resblks, 20);
        resblks = min_t(__uint64_t, resblks, 1024);
-        xfs_reserve_blocks(mp, &resblks, NULL);
+        error = xfs_reserve_blocks(mp, &resblks, NULL);
+        if (error)
+                cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
+                                "Continuing without a reserve pool.");
        return 0;
@@ -1213,7 +1248,7 @@ xfs_mountfs(
        /*
         * Free up the root inode.
         */
-        VN_RELE(rvp);
+        IRELE(rip);
 error3:
        xfs_log_unmount_dealloc(mp);
 error2:
@@ -1241,6 +1276,7 @@ int
 xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 {
        __uint64_t      resblks;
+        int             error = 0;
        /*
         * We can potentially deadlock here if we have an inode cluster
@@ -1284,9 +1320,15 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
         * value does not matter....
         */
        resblks = 0;
-        xfs_reserve_blocks(mp, &resblks, NULL);
+        error = xfs_reserve_blocks(mp, &resblks, NULL);
+        if (error)
+                cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. "
+                                "Freespace may not be correct on next mount.");
-        xfs_log_sbcount(mp, 1);
+        error = xfs_log_sbcount(mp, 1);
+        if (error)
+                cmn_err(CE_WARN, "XFS: Unable to update superblock counters. "
+                                "Freespace may not be correct on next mount.");
        xfs_unmountfs_writesb(mp);
        xfs_unmountfs_wait(mp);                 /* wait for async bufs */
        xfs_log_unmount(mp);                    /* Done! No more fs ops. */
@@ -1378,9 +1420,8 @@ xfs_log_sbcount(
        xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
        if (sync)
                xfs_trans_set_sync(tp);
-        xfs_trans_commit(tp, 0);
+        error = xfs_trans_commit(tp, 0);
+        return error;
-        return 0;
 }
 STATIC void
@@ -1429,7 +1470,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
                XFS_BUF_UNASYNC(sbp);
                ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
                xfsbdstrat(mp, sbp);
-                /* Nevermind errors we might get here. */
                error = xfs_iowait(sbp);
                if (error)
                        xfs_ioerror_alert("xfs_unmountfs_writesb",
@@ -1875,25 +1915,30 @@ xfs_uuid_unmount(
 /*
 * Used to log changes to the superblock unit and width fields which could
- * be altered by the mount options. Only the first superblock is updated.
+ * be altered by the mount options, as well as any potential sb_features2
+ * fixup. Only the first superblock is updated.
 */
-STATIC void
+STATIC int
-xfs_mount_log_sbunit(
+xfs_mount_log_sb(
        xfs_mount_t     *mp,
        __int64_t       fields)
 {
        xfs_trans_t     *tp;
+        int             error;
-        ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID));
+        ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
+                         XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2));
        tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
-        if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+        error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-                                XFS_DEFAULT_LOG_COUNT)) {
+                                XFS_DEFAULT_LOG_COUNT);
+        if (error) {
                xfs_trans_cancel(tp, 0);
-                return;
+                return error;
        }
        xfs_mod_sb(tp, fields);
-        xfs_trans_commit(tp, 0);
+        error = xfs_trans_commit(tp, 0);
+        return error;
 }
@@ -2154,7 +2199,7 @@ xfs_icsb_counter_disabled(
        return test_bit(field, &mp->m_icsb_counters);
 }
-STATIC int
+STATIC void
 xfs_icsb_disable_counter(
        xfs_mount_t     *mp,
        xfs_sb_field_t  field)
@@ -2172,7 +2217,7 @@ xfs_icsb_disable_counter(
         * the m_icsb_mutex.
         */
        if (xfs_icsb_counter_disabled(mp, field))
-                return 0;
+                return;
        xfs_icsb_lock_all_counters(mp);
        if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
@@ -2195,8 +2240,6 @@ xfs_icsb_disable_counter(
        }
        xfs_icsb_unlock_all_counters(mp);
-        return 0;
 }
 STATIC void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f7c620ec6e69..1ed575110ff0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,17 +66,17 @@ struct xfs_mru_cache;
 * Prototypes and functions for the Data Migration subsystem.
 */
-typedef int     (*xfs_send_data_t)(int, bhv_vnode_t *,
+typedef int     (*xfs_send_data_t)(int, struct xfs_inode *,
-                        xfs_off_t, size_t, int, bhv_vrwlock_t *);
+                        xfs_off_t, size_t, int, int *);
 typedef int     (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int     (*xfs_send_destroy_t)(bhv_vnode_t *, dm_right_t);
+typedef int     (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
 typedef int     (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
-                        bhv_vnode_t *,
+                        struct xfs_inode *, dm_right_t,
-                        dm_right_t, bhv_vnode_t *, dm_right_t,
+                        struct xfs_inode *, dm_right_t,
-                        char *, char *, mode_t, int, int);
+                        const char *, const char *, mode_t, int, int);
 typedef int     (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
                        char *, char *);
-typedef void    (*xfs_send_unmount_t)(struct xfs_mount *, bhv_vnode_t *,
+typedef void    (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
                        dm_right_t, mode_t, int, int);
 typedef struct xfs_dmops {
@@ -88,20 +88,20 @@ typedef struct xfs_dmops {
        xfs_send_unmount_t      xfs_send_unmount;
 } xfs_dmops_t;
-#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \
+#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
-        (*(mp)->m_dm_ops->xfs_send_data)(ev,vp,off,len,fl,lock)
+        (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
 #define XFS_SEND_MMAP(mp, vma,fl) \
        (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
-#define XFS_SEND_DESTROY(mp, vp,right) \
+#define XFS_SEND_DESTROY(mp, ip,right) \
-        (*(mp)->m_dm_ops->xfs_send_destroy)(vp,right)
+        (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
 #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
        (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
 #define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
        (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl)
 #define XFS_SEND_MOUNT(mp,right,path,name) \
        (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_UNMOUNT(mp, vp,right,mode,rval,fl) \
+#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \
-        (*(mp)->m_dm_ops->xfs_send_unmount)(mp,vp,right,mode,rval,fl)
+        (*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
 /*
@@ -220,7 +220,7 @@ extern void	xfs_icsb_sync_counters_flags(struct xfs_mount *, int);
 #endif
 typedef struct xfs_ail {
-        xfs_ail_entry_t         xa_ail;
+        struct list_head        xa_ail;
        uint                    xa_gen;
        struct task_struct      *xa_task;
        xfs_lsn_t               xa_target;
@@ -366,7 +366,7 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_SMALL_INUMS   (1ULL << 15)    /* users wants 32bit inodes */
 #define XFS_MOUNT_NOUUID        (1ULL << 16)    /* ignore uuid during mount */
 #define XFS_MOUNT_BARRIER       (1ULL << 17)
-#define XFS_MOUNT_IDELETE       (1ULL << 18)    /* delete empty inode clusters*/
+#define XFS_MOUNT_IKEEP         (1ULL << 18)    /* keep empty inode clusters*/
 #define XFS_MOUNT_SWALLOC       (1ULL << 19)    /* turn on stripe width
                                                 * allocation */
 #define XFS_MOUNT_RDONLY        (1ULL << 20)    /* read-only fs */
@@ -401,7 +401,7 @@ typedef struct xfs_mount {
 /*
 * Allow large block sizes to be reported to userspace programs if the
- * "largeio" mount option is used. 
+ * "largeio" mount option is used.
 *
 * If compatibility mode is specified, simply return the basic unit of caching
 * so that we don't get inefficient read/modify/write I/O from user apps.
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 7eb157a59f9e..ee371890d85d 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -36,7 +36,6 @@
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
-#include "xfs_refcache.h"
 #include "xfs_utils.h"
 #include "xfs_trans_space.h"
 #include "xfs_vnodeops.h"
@@ -84,25 +83,23 @@ int xfs_rename_skip, xfs_rename_nskip;
 */
 STATIC int
 xfs_lock_for_rename(
-        xfs_inode_t     *dp1,   /* old (source) directory inode */
+        xfs_inode_t     *dp1,   /* in: old (source) directory inode */
-        xfs_inode_t     *dp2,   /* new (target) directory inode */
+        xfs_inode_t     *dp2,   /* in: new (target) directory inode */
-        bhv_vname_t     *vname1,/* old entry name */
+        xfs_inode_t     *ip1,   /* in: inode of old entry */
-        bhv_vname_t     *vname2,/* new entry name */
+        struct xfs_name *name2, /* in: new entry name */
-        xfs_inode_t     **ipp1, /* inode of old entry */
+        xfs_inode_t     **ipp2, /* out: inode of new entry, if it
-        xfs_inode_t     **ipp2, /* inode of new entry, if it
                                   already exists, NULL otherwise. */
-        xfs_inode_t     **i_tab,/* array of inode returned, sorted */
+        xfs_inode_t     **i_tab,/* out: array of inode returned, sorted */
-        int             *num_inodes)  /* number of inodes in array */
+        int             *num_inodes)  /* out: number of inodes in array */
 {
-        xfs_inode_t             *ip1, *ip2, *temp;
+        xfs_inode_t             *ip2 = NULL;
+        xfs_inode_t             *temp;
        xfs_ino_t               inum1, inum2;
        int                     error;
        int                     i, j;
        uint                    lock_mode;
        int                     diff_dirs = (dp1 != dp2);
-        ip2 = NULL;
        /*
         * First, find out the current inums of the entries so that we
         * can determine the initial locking order.  We'll have to
@@ -110,27 +107,20 @@ xfs_lock_for_rename(
         * to see if we still have the right inodes, directories, etc.
         */
        lock_mode = xfs_ilock_map_shared(dp1);
-        error = xfs_get_dir_entry(vname1, &ip1);
+        IHOLD(ip1);
-        if (error) {
+        xfs_itrace_ref(ip1);
-                xfs_iunlock_map_shared(dp1, lock_mode);
-                return error;
-        }
        inum1 = ip1->i_ino;
-        ASSERT(ip1);
-        xfs_itrace_ref(ip1);
        /*
         * Unlock dp1 and lock dp2 if they are different.
         */
        if (diff_dirs) {
                xfs_iunlock_map_shared(dp1, lock_mode);
                lock_mode = xfs_ilock_map_shared(dp2);
        }
-        error = xfs_dir_lookup_int(dp2, lock_mode, vname2, &inum2, &ip2);
+        error = xfs_dir_lookup_int(dp2, lock_mode, name2, &inum2, &ip2);
        if (error == ENOENT) {          /* target does not need to exist. */
                inum2 = 0;
        } else if (error) {
@@ -162,6 +152,7 @@ xfs_lock_for_rename(
                *num_inodes = 4;
                i_tab[3] = ip2;
        }
+        *ipp2 = i_tab[3];
        /*
         * Sort the elements via bubble sort.  (Remember, there are at
@@ -199,21 +190,6 @@ xfs_lock_for_rename(
                xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED);
        }
-        /*
-         * Set the return value. Null out any unused entries in i_tab.
-         */
-        *ipp1 = *ipp2 = NULL;
-        for (i=0; i < *num_inodes; i++) {
-                if (i_tab[i]->i_ino == inum1) {
-                        *ipp1 = i_tab[i];
-                }
-                if (i_tab[i]->i_ino == inum2) {
-                        *ipp2 = i_tab[i];
-                }
-        }
-        for (;i < 4; i++) {
-                i_tab[i] = NULL;
-        }
        return 0;
 }
@@ -223,13 +199,13 @@ xfs_lock_for_rename(
 int
 xfs_rename(
        xfs_inode_t     *src_dp,
-        bhv_vname_t     *src_vname,
+        struct xfs_name *src_name,
-        bhv_vnode_t     *target_dir_vp,
+        xfs_inode_t     *src_ip,
-        bhv_vname_t     *target_vname)
+        xfs_inode_t     *target_dp,
+        struct xfs_name *target_name)
 {
-        bhv_vnode_t     *src_dir_vp = XFS_ITOV(src_dp);
        xfs_trans_t     *tp;
-        xfs_inode_t     *target_dp, *src_ip, *target_ip;
+        xfs_inode_t     *target_ip;
        xfs_mount_t     *mp = src_dp->i_mount;
        int             new_parent;             /* moving to a new dir */
        int             src_is_directory;       /* src_name is a directory */
@@ -243,29 +219,16 @@ xfs_rename(
        int             spaceres;
        int             target_link_zero = 0;
        int             num_inodes;
-        char            *src_name = VNAME(src_vname);
-        char            *target_name = VNAME(target_vname);
-        int             src_namelen = VNAMELEN(src_vname);
-        int             target_namelen = VNAMELEN(target_vname);
        xfs_itrace_entry(src_dp);
-        xfs_itrace_entry(xfs_vtoi(target_dir_vp));
+        xfs_itrace_entry(target_dp);
-        /*
-         * Find the XFS behavior descriptor for the target directory
-         * vnode since it was not handed to us.
-         */
-        target_dp = xfs_vtoi(target_dir_vp);
-        if (target_dp == NULL) {
-                return XFS_ERROR(EXDEV);
-        }
        if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
            DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
-                                        src_dir_vp, DM_RIGHT_NULL,
+                                        src_dp, DM_RIGHT_NULL,
-                                        target_dir_vp, DM_RIGHT_NULL,
+                                        target_dp, DM_RIGHT_NULL,
-                                        src_name, target_name,
+                                        src_name->name, target_name->name,
                                        0, 0, 0);
                if (error) {
                        return error;
@@ -282,10 +245,8 @@ xfs_rename(
         * does not exist in the source directory.
         */
        tp = NULL;
-        error = xfs_lock_for_rename(src_dp, target_dp, src_vname,
+        error = xfs_lock_for_rename(src_dp, target_dp, src_ip, target_name,
-                        target_vname, &src_ip, &target_ip, inodes,
+                                        &target_ip, inodes, &num_inodes);
-                        &num_inodes);
        if (error) {
                /*
                 * We have nothing locked, no inode references, and
@@ -331,7 +292,7 @@ xfs_rename(
        XFS_BMAP_INIT(&free_list, &first_block);
        tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-        spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen);
+        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
        error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
        if (error == ENOSPC) {
@@ -365,10 +326,10 @@ xfs_rename(
         * them when they unlock the inodes.  Also, we need to be careful
         * not to add an inode to the transaction more than once.
         */
-        VN_HOLD(src_dir_vp);
+        IHOLD(src_dp);
        xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
        if (new_parent) {
-                VN_HOLD(target_dir_vp);
+                IHOLD(target_dp);
                xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
        }
        if ((src_ip != src_dp) && (src_ip != target_dp)) {
@@ -389,9 +350,8 @@ xfs_rename(
                 * If there's no space reservation, check the entry will
                 * fit before actually inserting it.
                 */
-                if (spaceres == 0 &&
+                error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
-                    (error = xfs_dir_canenter(tp, target_dp, target_name,
+                if (error)
-                                                target_namelen)))
                        goto error_return;
                /*
                 * If target does not exist and the rename crosses
@@ -399,8 +359,8 @@ xfs_rename(
                 * to account for the ".." reference from the new entry.
                 */
                error = xfs_dir_createname(tp, target_dp, target_name,
-                                           target_namelen, src_ip->i_ino,
+                                                src_ip->i_ino, &first_block,
-                                           &first_block, &free_list, spaceres);
+                                                &free_list, spaceres);
                if (error == ENOSPC)
                        goto error_return;
                if (error)
@@ -439,7 +399,7 @@ xfs_rename(
                 * name at the destination directory, remove it first.
                 */
                error = xfs_dir_replace(tp, target_dp, target_name,
-                                        target_namelen, src_ip->i_ino,
+                                        src_ip->i_ino,
                                        &first_block, &free_list, spaceres);
                if (error)
                        goto abort_return;
@@ -476,7 +436,8 @@ xfs_rename(
                 * Rewrite the ".." entry to point to the new
                 * directory.
                 */
-                error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino,
+                error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+                                        target_dp->i_ino,
                                        &first_block, &free_list, spaceres);
                ASSERT(error != EEXIST);
                if (error)
@@ -512,8 +473,8 @@ xfs_rename(
                        goto abort_return;
        }
-        error = xfs_dir_removename(tp, src_dp, src_name, src_namelen,
+        error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
-                        src_ip->i_ino, &first_block, &free_list, spaceres);
+                                        &first_block, &free_list, spaceres);
        if (error)
                goto abort_return;
        xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -580,10 +541,8 @@ xfs_rename(
         * the vnode references.
         */
        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-        if (target_ip != NULL) {
+        if (target_ip != NULL)
-                xfs_refcache_purge_ip(target_ip);
                IRELE(target_ip);
-        }
        /*
         * Let interposed file systems know about removed links.
         */
@@ -598,9 +557,9 @@ std_return:
        if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
            DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
                (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
-                                        src_dir_vp, DM_RIGHT_NULL,
+                                        src_dp, DM_RIGHT_NULL,
-                                        target_dir_vp, DM_RIGHT_NULL,
+                                        target_dp, DM_RIGHT_NULL,
-                                        src_name, target_name,
+                                        src_name->name, target_name->name,
                                        0, error, 0);
        }
        return error;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca83ddf72af4..a0dc6e5bc5b9 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -44,6 +44,7 @@
 #include "xfs_rw.h"
 #include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
+#include "xfs_utils.h"
 /*
@@ -73,6 +74,18 @@ STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
 */
 /*
+ * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
+ */
+STATIC int
+xfs_lowbit32(
+        __uint32_t      v)
+{
+        if (v)
+                return ffs(v) - 1;
+        return -1;
+}
+/*
 * Allocate space to the bitmap or summary file, and zero it, for growfs.
 */
 STATIC int                              /* error */
@@ -111,14 +124,14 @@ xfs_growfs_rt_alloc(
                                XFS_GROWRTALLOC_LOG_RES(mp), 0,
                                XFS_TRANS_PERM_LOG_RES,
                                XFS_DEFAULT_PERM_LOG_COUNT)))
-                        goto error_exit;
+                        goto error_cancel;
                cancelflags = XFS_TRANS_RELEASE_LOG_RES;
                /*
                 * Lock the inode.
                 */
                if ((error = xfs_trans_iget(mp, tp, ino, 0,
                                                XFS_ILOCK_EXCL, &ip)))
-                        goto error_exit;
+                        goto error_cancel;
                XFS_BMAP_INIT(&flist, &firstblock);
                /*
                 * Allocate blocks to the bitmap file.
@@ -131,14 +144,16 @@ xfs_growfs_rt_alloc(
                if (!error && nmap < 1)
                        error = XFS_ERROR(ENOSPC);
                if (error)
-                        goto error_exit;
+                        goto error_cancel;
                /*
                 * Free any blocks freed up in the transaction, then commit.
                 */
                error = xfs_bmap_finish(&tp, &flist, &committed);
                if (error)
-                        goto error_exit;
+                        goto error_cancel;
-                xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+                if (error)
+                        goto error;
                /*
                 * Now we need to clear the allocated blocks.
                 * Do this one block per transaction, to keep it simple.
@@ -153,13 +168,13 @@ xfs_growfs_rt_alloc(
                         */
                        if ((error = xfs_trans_reserve(tp, 0,
                                        XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
-                                goto error_exit;
+                                goto error_cancel;
                        /*
                         * Lock the bitmap inode.
                         */
                        if ((error = xfs_trans_iget(mp, tp, ino, 0,
                                                        XFS_ILOCK_EXCL, &ip)))
-                                goto error_exit;
+                                goto error_cancel;
                        /*
                         * Get a buffer for the block.
                         */
@@ -168,14 +183,16 @@ xfs_growfs_rt_alloc(
                                mp->m_bsize, 0);
                        if (bp == NULL) {
                                error = XFS_ERROR(EIO);
-                                goto error_exit;
+                                goto error_cancel;
                        }
                        memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
                        xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
                        /*
                         * Commit the transaction.
                         */
-                        xfs_trans_commit(tp, 0);
+                        error = xfs_trans_commit(tp, 0);
+                        if (error)
+                                goto error;
                }
                /*
                 * Go on to the next extent, if any.
@@ -183,8 +200,9 @@ xfs_growfs_rt_alloc(
                oblocks = map.br_startoff + map.br_blockcount;
        }
        return 0;
-error_exit:
+error_cancel:
        xfs_trans_cancel(tp, cancelflags);
+error:
        return error;
 }
@@ -432,7 +450,6 @@ xfs_rtallocate_extent_near(
        }
        bbno = XFS_BITTOBLOCK(mp, bno);
        i = 0;
-        ASSERT(minlen != 0);
        log2len = xfs_highbit32(minlen);
        /*
         * Loop over all bitmap blocks (bbno + i is current block).
@@ -601,8 +618,6 @@ xfs_rtallocate_extent_size(
        xfs_suminfo_t   sum;            /* summary information for extents */
        ASSERT(minlen % prod == 0 && maxlen % prod == 0);
-        ASSERT(maxlen != 0);
        /*
         * Loop over all the levels starting with maxlen.
         * At each level, look at all the bitmap blocks, to see if there
@@ -660,9 +675,6 @@ xfs_rtallocate_extent_size(
                *rtblock = NULLRTBLOCK;
                return 0;
        }
-        ASSERT(minlen != 0);
-        ASSERT(maxlen != 0);
        /*
         * Loop over sizes, from maxlen down to minlen.
         * This time, when we do the allocations, allow smaller ones
@@ -1869,6 +1881,7 @@ xfs_growfs_rt(
        xfs_trans_t     *tp;            /* transaction pointer */
        sbp = &mp->m_sb;
+        cancelflags = 0;
        /*
         * Initial error checking.
         */
@@ -1948,7 +1961,6 @@ xfs_growfs_rt(
                                  nsbp->sb_blocksize * nsbp->sb_rextsize);
                nsbp->sb_rextents = nsbp->sb_rblocks;
                do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
-                ASSERT(nsbp->sb_rextents != 0);
                nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
                nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
                nrsumsize =
@@ -2036,13 +2048,15 @@ xfs_growfs_rt(
                 */
                mp->m_rsumlevels = nrsumlevels;
                mp->m_rsumsize = nrsumsize;
-                /*
-                 * Commit the transaction.
+                error = xfs_trans_commit(tp, 0);
-                 */
+                if (error) {
-                xfs_trans_commit(tp, 0);
+                        tp = NULL;
+                        break;
+                }
        }
-        if (error)
+        if (error && tp)
                xfs_trans_cancel(tp, cancelflags);
        /*
@@ -2273,7 +2287,7 @@ xfs_rtmount_inodes(
        ASSERT(sbp->sb_rsumino != NULLFSINO);
        error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0);
        if (error) {
-                VN_RELE(XFS_ITOV(mp->m_rbmip));
+                IRELE(mp->m_rbmip);
                return error;
        }
        ASSERT(mp->m_rsumip != NULL);
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index cd3ece6cc918..b0f31c09a76d 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -126,11 +126,11 @@ xfs_write_sync_logforce(
                 * when we return.
                 */
                if (iip && iip->ili_last_lsn) {
-                        xfs_log_force(mp, iip->ili_last_lsn,
+                        error = _xfs_log_force(mp, iip->ili_last_lsn,
-                                        XFS_LOG_FORCE | XFS_LOG_SYNC);
+                                        XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
                } else if (xfs_ipincount(ip) > 0) {
-                        xfs_log_force(mp, (xfs_lsn_t)0,
+                        error = _xfs_log_force(mp, (xfs_lsn_t)0,
-                                        XFS_LOG_FORCE | XFS_LOG_SYNC);
+                                        XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
                }
        } else {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 94660b1a6ccc..d904efe7f871 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -89,6 +89,7 @@ struct xfs_mount;
 /*
 * Superblock - in core version.  Must match the ondisk version below.
+ * Must be padded to 64 bit alignment.
 */
 typedef struct xfs_sb {
        __uint32_t      sb_magicnum;    /* magic number == XFS_SB_MAGIC */
@@ -145,10 +146,21 @@ typedef struct xfs_sb {
        __uint16_t      sb_logsectsize; /* sector size for the log, bytes */
        __uint32_t      sb_logsunit;    /* stripe unit size for the log */
        __uint32_t      sb_features2;   /* additional feature bits */
+        /*
+         * bad features2 field as a result of failing to pad the sb
+         * structure to 64 bits. Some machines will be using this field
+         * for features2 bits. Easiest just to mark it bad and not use
+         * it for anything else.
+         */
+        __uint32_t      sb_bad_features2;
+        /* must be padded to 64 bit alignment */
 } xfs_sb_t;
 /*
- * Superblock - on disk version.  Must match the in core version below.
+ * Superblock - on disk version.  Must match the in core version above.
+ * Must be padded to 64 bit alignment.
 */
 typedef struct xfs_dsb {
        __be32          sb_magicnum;    /* magic number == XFS_SB_MAGIC */
@@ -205,6 +217,15 @@ typedef struct xfs_dsb {
        __be16          sb_logsectsize; /* sector size for the log, bytes */
        __be32          sb_logsunit;    /* stripe unit size for the log */
        __be32          sb_features2;   /* additional feature bits */
+        /*
+         * bad features2 field as a result of failing to pad the sb
+         * structure to 64 bits. Some machines will be using this field
+         * for features2 bits. Easiest just to mark it bad and not use
+         * it for anything else.
+         */
+        __be32  sb_bad_features2;
+        /* must be padded to 64 bit alignment */
 } xfs_dsb_t;
 /*
@@ -223,7 +244,7 @@ typedef enum {
        XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
        XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
        XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
-        XFS_SBS_FEATURES2,
+        XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2,
        XFS_SBS_FIELDCOUNT
 } xfs_sb_field_t;
@@ -248,13 +269,15 @@ typedef enum {
 #define XFS_SB_IFREE            XFS_SB_MVAL(IFREE)
 #define XFS_SB_FDBLOCKS         XFS_SB_MVAL(FDBLOCKS)
 #define XFS_SB_FEATURES2        XFS_SB_MVAL(FEATURES2)
+#define XFS_SB_BAD_FEATURES2    XFS_SB_MVAL(BAD_FEATURES2)
 #define XFS_SB_NUM_BITS         ((int)XFS_SBS_FIELDCOUNT)
 #define XFS_SB_ALL_BITS         ((1LL << XFS_SB_NUM_BITS) - 1)
 #define XFS_SB_MOD_BITS         \
        (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
         XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
         XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
-         XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2)
+         XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
+         XFS_SB_BAD_FEATURES2)
 /*
@@ -271,7 +294,6 @@ typedef enum {
 #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-#define XFS_SB_GOOD_VERSION(sbp)        xfs_sb_good_version(sbp)
 #ifdef __KERNEL__
 static inline int xfs_sb_good_version(xfs_sb_t *sbp)
 {
@@ -297,7 +319,15 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp)
 }
 #endif /* __KERNEL__ */
-#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v)
+/*
+ * Detect a mismatched features2 field.  Older kernels read/wrote
+ * this into the wrong slot, so to be safe we keep them in sync.
+ */
+static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
+{
+        return (sbp->sb_bad_features2 != sbp->sb_features2);
+}
 static inline unsigned xfs_sb_version_tonew(unsigned v)
 {
        return ((((v) == XFS_SB_VERSION_1) ? \
@@ -308,7 +338,6 @@ static inline unsigned xfs_sb_version_tonew(unsigned v)
                XFS_SB_VERSION_4);
 }
-#define XFS_SB_VERSION_TOOLD(v) xfs_sb_version_toold(v)
 static inline unsigned xfs_sb_version_toold(unsigned v)
 {
        return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \
@@ -320,7 +349,6 @@ static inline unsigned xfs_sb_version_toold(unsigned v)
                                XFS_SB_VERSION_1)));
 }
-#define XFS_SB_VERSION_HASATTR(sbp)     xfs_sb_version_hasattr(sbp)
 static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
 {
        return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \
@@ -329,7 +357,6 @@ static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
                  ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
 }
-#define XFS_SB_VERSION_ADDATTR(sbp)     xfs_sb_version_addattr(sbp)
 static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
 {
        (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \
@@ -339,7 +366,6 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
                        (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT)));
 }
-#define XFS_SB_VERSION_HASNLINK(sbp)    xfs_sb_version_hasnlink(sbp)
 static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
 {
        return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
@@ -347,7 +373,6 @@ static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
                  ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
 }
-#define XFS_SB_VERSION_ADDNLINK(sbp)    xfs_sb_version_addnlink(sbp)
 static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
 {
        (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \
@@ -355,115 +380,63 @@ static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
                ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT));
 }
-#define XFS_SB_VERSION_HASQUOTA(sbp)    xfs_sb_version_hasquota(sbp)
 static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
 {
        return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
                ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
 }
-#define XFS_SB_VERSION_ADDQUOTA(sbp)    xfs_sb_version_addquota(sbp)
 static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
 {
        (sbp)->sb_versionnum = \
                 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \
                        ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \
-                        (XFS_SB_VERSION_TONEW((sbp)->sb_versionnum) | \
+                        (xfs_sb_version_tonew((sbp)->sb_versionnum) | \
                         XFS_SB_VERSION_QUOTABIT));
 }
-#define XFS_SB_VERSION_HASALIGN(sbp)    xfs_sb_version_hasalign(sbp)
 static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
 {
        return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
                ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
 }
-#define XFS_SB_VERSION_SUBALIGN(sbp)    xfs_sb_version_subalign(sbp)
-static inline void xfs_sb_version_subalign(xfs_sb_t *sbp)
-{
-        (sbp)->sb_versionnum = \
-         XFS_SB_VERSION_TOOLD((sbp)->sb_versionnum & ~XFS_SB_VERSION_ALIGNBIT);
-}
-#define XFS_SB_VERSION_HASDALIGN(sbp)   xfs_sb_version_hasdalign(sbp)
 static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
 {
        return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
                ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
 }
-#define XFS_SB_VERSION_ADDDALIGN(sbp)   xfs_sb_version_adddalign(sbp)
-static inline int xfs_sb_version_adddalign(xfs_sb_t *sbp)
-{
-        return (sbp)->sb_versionnum = \
-                ((sbp)->sb_versionnum | XFS_SB_VERSION_DALIGNBIT);
-}
-#define XFS_SB_VERSION_HASSHARED(sbp)   xfs_sb_version_hasshared(sbp)
 static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
 {
        return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
                ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
 }
-#define XFS_SB_VERSION_ADDSHARED(sbp)   xfs_sb_version_addshared(sbp)
-static inline int xfs_sb_version_addshared(xfs_sb_t *sbp)
-{
-        return (sbp)->sb_versionnum = \
-                ((sbp)->sb_versionnum | XFS_SB_VERSION_SHAREDBIT);
-}
-#define XFS_SB_VERSION_SUBSHARED(sbp)   xfs_sb_version_subshared(sbp)
-static inline int xfs_sb_version_subshared(xfs_sb_t *sbp)
-{
-        return (sbp)->sb_versionnum = \
-                ((sbp)->sb_versionnum & ~XFS_SB_VERSION_SHAREDBIT);
-}
-#define XFS_SB_VERSION_HASDIRV2(sbp)    xfs_sb_version_hasdirv2(sbp)
 static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
 {
        return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
                ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
 }
-#define XFS_SB_VERSION_HASLOGV2(sbp)   xfs_sb_version_haslogv2(sbp)
 static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
 {
        return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
                ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
 }
-#define XFS_SB_VERSION_HASEXTFLGBIT(sbp)        xfs_sb_version_hasextflgbit(sbp)
 static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
 {
        return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
                ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
 }
-#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp)        xfs_sb_version_addextflgbit(sbp)
-static inline int xfs_sb_version_addextflgbit(xfs_sb_t *sbp)
-{
-        return (sbp)->sb_versionnum = \
-                ((sbp)->sb_versionnum | XFS_SB_VERSION_EXTFLGBIT);
-}
-#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp)        xfs_sb_version_subextflgbit(sbp)
-static inline int xfs_sb_version_subextflgbit(xfs_sb_t *sbp)
-{
-        return (sbp)->sb_versionnum = \
-                ((sbp)->sb_versionnum & ~XFS_SB_VERSION_EXTFLGBIT);
-}
-#define XFS_SB_VERSION_HASSECTOR(sbp)   xfs_sb_version_hassector(sbp)
 static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
 {
        return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
                ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
 }
-#define XFS_SB_VERSION_HASMOREBITS(sbp) xfs_sb_version_hasmorebits(sbp)
 static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
 {
        return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
@@ -476,24 +449,22 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
 * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro:
 *
 * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp)
- *      ((XFS_SB_VERSION_HASMOREBITS(sbp) &&
+ *      ((xfs_sb_version_hasmorebits(sbp) &&
 *       ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT)
 */
 static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
 {
-        return (XFS_SB_VERSION_HASMOREBITS(sbp) &&      \
+        return (xfs_sb_version_hasmorebits(sbp) &&      \
                ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
 }
-#define XFS_SB_VERSION_HASATTR2(sbp)    xfs_sb_version_hasattr2(sbp)
 static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
 {
-        return (XFS_SB_VERSION_HASMOREBITS(sbp)) &&     \
+        return (xfs_sb_version_hasmorebits(sbp)) &&     \
                ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
 }
-#define XFS_SB_VERSION_ADDATTR2(sbp)    xfs_sb_version_addattr2(sbp)
 static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
 {
        ((sbp)->sb_versionnum = \
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 7f40628d85c7..0804207c7391 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -113,13 +113,8 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
-typedef struct xfs_ail_entry {
-        struct xfs_log_item     *ail_forw;      /* AIL forw pointer */
-        struct xfs_log_item     *ail_back;      /* AIL back pointer */
-} xfs_ail_entry_t;
 typedef struct xfs_log_item {
-        xfs_ail_entry_t                 li_ail;         /* AIL pointers */
+        struct list_head                li_ail;         /* AIL pointers */
        xfs_lsn_t                       li_lsn;         /* last on-disk lsn */
        struct xfs_log_item_desc        *li_desc;       /* ptr to current desc*/
        struct xfs_mount                *li_mountp;     /* ptr to fs mount */
@@ -341,7 +336,6 @@ typedef struct xfs_trans {
        unsigned int            t_rtx_res;      /* # of rt extents resvd */
        unsigned int            t_rtx_res_used; /* # of resvd rt extents used */
        xfs_log_ticket_t        t_ticket;       /* log mgr ticket */
-        sema_t                  t_sema;         /* sema for commit completion */
        xfs_lsn_t               t_lsn;          /* log seq num of start of
                                                 * transaction. */
        xfs_lsn_t               t_commit_lsn;   /* log seq num of end of
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 4d6330eddc8d..1f77c00af566 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,13 +28,13 @@
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
-STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *);
+STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *);
-STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *);
 #ifdef DEBUG
-STATIC void xfs_ail_check(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
 #else
 #define xfs_ail_check(a,l)
 #endif /* DEBUG */
@@ -57,7 +57,7 @@ xfs_trans_tail_ail(
        xfs_log_item_t  *lip;
        spin_lock(&mp->m_ail_lock);
-        lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+        lip = xfs_ail_min(&mp->m_ail);
        if (lip == NULL) {
                lsn = (xfs_lsn_t)0;
        } else {
@@ -91,7 +91,7 @@ xfs_trans_push_ail(
 {
        xfs_log_item_t          *lip;
-        lip = xfs_ail_min(&mp->m_ail.xa_ail);
+        lip = xfs_ail_min(&mp->m_ail);
        if (lip && !XFS_FORCED_SHUTDOWN(mp)) {
                if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0)
                        xfsaild_wakeup(mp, threshold_lsn);
@@ -111,15 +111,17 @@ xfs_trans_first_push_ail(
 {
        xfs_log_item_t  *lip;
-        lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+        lip = xfs_ail_min(&mp->m_ail);
        *gen = (int)mp->m_ail.xa_gen;
        if (lsn == 0)
                return lip;
-        while (lip && (XFS_LSN_CMP(lip->li_lsn, lsn) < 0))
+        list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) {
-                lip = lip->li_ail.ail_forw;
+                if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
+                        return lip;
+        }
-        return lip;
+        return NULL;
 }
 /*
@@ -261,16 +263,19 @@ xfsaild_push(
                xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
        }
-        /*
+        if (!count) {
-         * We reached the target so wait a bit longer for I/O to complete and
+                /* We're past our target or empty, so idle */
-         * remove pushed items from the AIL before we start the next scan from
+                tout = 1000;
-         * the start of the AIL.
+        } else if (XFS_LSN_CMP(lsn, target) >= 0) {
-         */
+                /*
-        if ((XFS_LSN_CMP(lsn, target) >= 0)) {
+                 * We reached the target so wait a bit longer for I/O to
+                 * complete and remove pushed items from the AIL before we
+                 * start the next scan from the start of the AIL.
+                 */
                tout += 20;
                last_pushed_lsn = 0;
        } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) ||
-                   (count && ((stuck * 100) / count > 90))) {
+                   ((stuck * 100) / count > 90)) {
                /*
                 * Either there is a lot of contention on the AIL or we
                 * are stuck due to operations in progress. "Stuck" in this
@@ -326,7 +331,7 @@ xfs_trans_unlocked_item(
         * the call to xfs_log_move_tail() doesn't do anything if there's
         * not enough free space to wake people up so we're safe calling it.
         */
-        min_lip = xfs_ail_min(&mp->m_ail.xa_ail);
+        min_lip = xfs_ail_min(&mp->m_ail);
        if (min_lip == lip)
                xfs_log_move_tail(mp, 1);
@@ -354,15 +359,13 @@ xfs_trans_update_ail(
        xfs_log_item_t  *lip,
        xfs_lsn_t       lsn) __releases(mp->m_ail_lock)
 {
-        xfs_ail_entry_t         *ailp;
        xfs_log_item_t          *dlip=NULL;
        xfs_log_item_t          *mlip;  /* ptr to minimum lip */
-        ailp = &(mp->m_ail.xa_ail);
+        mlip = xfs_ail_min(&mp->m_ail);
-        mlip = xfs_ail_min(ailp);
        if (lip->li_flags & XFS_LI_IN_AIL) {
-                dlip = xfs_ail_delete(ailp, lip);
+                dlip = xfs_ail_delete(&mp->m_ail, lip);
                ASSERT(dlip == lip);
        } else {
                lip->li_flags |= XFS_LI_IN_AIL;
@@ -370,11 +373,11 @@ xfs_trans_update_ail(
        lip->li_lsn = lsn;
-        xfs_ail_insert(ailp, lip);
+        xfs_ail_insert(&mp->m_ail, lip);
        mp->m_ail.xa_gen++;
        if (mlip == dlip) {
-                mlip = xfs_ail_min(&(mp->m_ail.xa_ail));
+                mlip = xfs_ail_min(&mp->m_ail);
                spin_unlock(&mp->m_ail_lock);
                xfs_log_move_tail(mp, mlip->li_lsn);
        } else {
@@ -404,14 +407,12 @@ xfs_trans_delete_ail(
        xfs_mount_t     *mp,
        xfs_log_item_t  *lip) __releases(mp->m_ail_lock)
 {
-        xfs_ail_entry_t         *ailp;
        xfs_log_item_t          *dlip;
        xfs_log_item_t          *mlip;
        if (lip->li_flags & XFS_LI_IN_AIL) {
-                ailp = &(mp->m_ail.xa_ail);
+                mlip = xfs_ail_min(&mp->m_ail);
-                mlip = xfs_ail_min(ailp);
+                dlip = xfs_ail_delete(&mp->m_ail, lip);
-                dlip = xfs_ail_delete(ailp, lip);
                ASSERT(dlip == lip);
@@ -420,7 +421,7 @@ xfs_trans_delete_ail(
                mp->m_ail.xa_gen++;
                if (mlip == dlip) {
-                        mlip = xfs_ail_min(&(mp->m_ail.xa_ail));
+                        mlip = xfs_ail_min(&mp->m_ail);
                        spin_unlock(&mp->m_ail_lock);
                        xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
                } else {
@@ -437,7 +438,7 @@ xfs_trans_delete_ail(
                else {
                        xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
                "%s: attempting to delete a log item that is not in the AIL",
-                                        __FUNCTION__);
+                                        __func__);
                        spin_unlock(&mp->m_ail_lock);
                        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
                }
@@ -458,7 +459,7 @@ xfs_trans_first_ail(
 {
        xfs_log_item_t  *lip;
-        lip = xfs_ail_min(&(mp->m_ail.xa_ail));
+        lip = xfs_ail_min(&mp->m_ail);
        *gen = (int)mp->m_ail.xa_gen;
        return lip;
@@ -482,9 +483,9 @@ xfs_trans_next_ail(
        ASSERT(mp && lip && gen);
        if (mp->m_ail.xa_gen == *gen) {
-                nlip = xfs_ail_next(&(mp->m_ail.xa_ail), lip);
+                nlip = xfs_ail_next(&mp->m_ail, lip);
        } else {
-                nlip = xfs_ail_min(&(mp->m_ail).xa_ail);
+                nlip = xfs_ail_min(&mp->m_ail);
                *gen = (int)mp->m_ail.xa_gen;
                if (restarts != NULL) {
                        XFS_STATS_INC(xs_push_ail_restarts);
@@ -514,8 +515,7 @@ int
 xfs_trans_ail_init(
        xfs_mount_t     *mp)
 {
-        mp->m_ail.xa_ail.ail_forw = (xfs_log_item_t*)&mp->m_ail.xa_ail;
+        INIT_LIST_HEAD(&mp->m_ail.xa_ail);
-        mp->m_ail.xa_ail.ail_back = (xfs_log_item_t*)&mp->m_ail.xa_ail;
        return xfsaild_start(mp);
 }
@@ -534,7 +534,7 @@ xfs_trans_ail_destroy(
 */
 STATIC void
 xfs_ail_insert(
-        xfs_ail_entry_t *base,
+        xfs_ail_t       *ailp,
        xfs_log_item_t  *lip)
 /* ARGSUSED */
 {
@@ -543,27 +543,22 @@ xfs_ail_insert(
        /*
         * If the list is empty, just insert the item.
         */
-        if (base->ail_back == (xfs_log_item_t*)base) {
+        if (list_empty(&ailp->xa_ail)) {
-                base->ail_forw = lip;
+                list_add(&lip->li_ail, &ailp->xa_ail);
-                base->ail_back = lip;
-                lip->li_ail.ail_forw = (xfs_log_item_t*)base;
-                lip->li_ail.ail_back = (xfs_log_item_t*)base;
                return;
        }
-        next_lip = base->ail_back;
+        list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
-        while ((next_lip != (xfs_log_item_t*)base) &&
+                if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
-               (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) {
+                        break;
-                next_lip = next_lip->li_ail.ail_back;
        }
-        ASSERT((next_lip == (xfs_log_item_t*)base) ||
+        ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
               (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
-        lip->li_ail.ail_forw = next_lip->li_ail.ail_forw;
-        lip->li_ail.ail_back = next_lip;
-        next_lip->li_ail.ail_forw = lip;
-        lip->li_ail.ail_forw->li_ail.ail_back = lip;
-        xfs_ail_check(base, lip);
+        list_add(&lip->li_ail, &next_lip->li_ail);
+        xfs_ail_check(ailp, lip);
        return;
 }
@@ -573,15 +568,13 @@ xfs_ail_insert(
 /*ARGSUSED*/
 STATIC xfs_log_item_t *
 xfs_ail_delete(
-        xfs_ail_entry_t *base,
+        xfs_ail_t       *ailp,
        xfs_log_item_t  *lip)
 /* ARGSUSED */
 {
-        xfs_ail_check(base, lip);
+        xfs_ail_check(ailp, lip);
-        lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back;
-        lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw;
+        list_del(&lip->li_ail);
-        lip->li_ail.ail_forw = NULL;
-        lip->li_ail.ail_back = NULL;
        return lip;
 }
@@ -592,14 +585,13 @@ xfs_ail_delete(
 */
 STATIC xfs_log_item_t *
 xfs_ail_min(
-        xfs_ail_entry_t *base)
+        xfs_ail_t       *ailp)
 /* ARGSUSED */
 {
-        register xfs_log_item_t *forw = base->ail_forw;
+        if (list_empty(&ailp->xa_ail))
-        if (forw == (xfs_log_item_t*)base) {
                return NULL;
-        }
-        return forw;
+        return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
 }
 /*
@@ -609,15 +601,14 @@ xfs_ail_min(
 */
 STATIC xfs_log_item_t *
 xfs_ail_next(
-        xfs_ail_entry_t *base,
+        xfs_ail_t       *ailp,
        xfs_log_item_t  *lip)
 /* ARGSUSED */
 {
-        if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) {
+        if (lip->li_ail.next == &ailp->xa_ail)
                return NULL;
-        }
-        return lip->li_ail.ail_forw;
+        return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
 }
 #ifdef DEBUG
@@ -626,57 +617,40 @@ xfs_ail_next(
 */
 STATIC void
 xfs_ail_check(
-        xfs_ail_entry_t *base,
+        xfs_ail_t       *ailp,
        xfs_log_item_t  *lip)
 {
        xfs_log_item_t  *prev_lip;
-        prev_lip = base->ail_forw;
+        if (list_empty(&ailp->xa_ail))
-        if (prev_lip == (xfs_log_item_t*)base) {
-                /*
-                 * Make sure the pointers are correct when the list
-                 * is empty.
-                 */
-                ASSERT(base->ail_back == (xfs_log_item_t*)base);
                return;
-        }
        /*
         * Check the next and previous entries are valid.
         */
        ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
-        prev_lip = lip->li_ail.ail_back;
+        prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
-        if (prev_lip != (xfs_log_item_t*)base) {
+        if (&prev_lip->li_ail != &ailp->xa_ail)
-                ASSERT(prev_lip->li_ail.ail_forw == lip);
                ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-        }
-        prev_lip = lip->li_ail.ail_forw;
+        prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
-        if (prev_lip != (xfs_log_item_t*)base) {
+        if (&prev_lip->li_ail != &ailp->xa_ail)
-                ASSERT(prev_lip->li_ail.ail_back == lip);
                ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
-        }
 #ifdef XFS_TRANS_DEBUG
        /*
-         * Walk the list checking forward and backward pointers,
+         * Walk the list checking lsn ordering, and that every entry has the
-         * lsn ordering, and that every entry has the XFS_LI_IN_AIL
+         * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
-         * flag set. This is really expensive, so only do it when
+         * when specifically debugging the transaction subsystem.
-         * specifically debugging the transaction subsystem.
         */
-        prev_lip = (xfs_log_item_t*)base;
+        prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
-        while (lip != (xfs_log_item_t*)base) {
+        list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
-                if (prev_lip != (xfs_log_item_t*)base) {
+                if (&prev_lip->li_ail != &ailp->xa_ail)
-                        ASSERT(prev_lip->li_ail.ail_forw == lip);
                        ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-                }
-                ASSERT(lip->li_ail.ail_back == prev_lip);
                ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
                prev_lip = lip;
-                lip = lip->li_ail.ail_forw;
        }
-        ASSERT(lip == (xfs_log_item_t*)base);
-        ASSERT(base->ail_back == prev_lip);
 #endif /* XFS_TRANS_DEBUG */
 }
 #endif /* DEBUG */
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 60b6b898022b..cb0c5839154b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -304,7 +304,8 @@ xfs_trans_read_buf(
        if (tp == NULL) {
                bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
                if (!bp)
-                        return XFS_ERROR(ENOMEM);
+                        return (flags & XFS_BUF_TRYLOCK) ?
+                                        EAGAIN : XFS_ERROR(ENOMEM);
                if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
                        xfs_ioerror_alert("xfs_trans_read_buf", mp,
@@ -353,17 +354,15 @@ xfs_trans_read_buf(
                        ASSERT(!XFS_BUF_ISASYNC(bp));
                        XFS_BUF_READ(bp);
                        xfsbdstrat(tp->t_mountp, bp);
-                        xfs_iowait(bp);
+                        error = xfs_iowait(bp);
-                        if (XFS_BUF_GETERROR(bp) != 0) {
+                        if (error) {
                                xfs_ioerror_alert("xfs_trans_read_buf", mp,
                                                  bp, blkno);
-                                error = XFS_BUF_GETERROR(bp);
                                xfs_buf_relse(bp);
                                /*
-                                 * We can gracefully recover from most
+                                 * We can gracefully recover from most read
-                                 * read errors. Ones we can't are those
+                                 * errors. Ones we can't are those that happen
-                                 * that happen after the transaction's
+                                 * after the transaction's already dirty.
-                                 * already dirty.
                                 */
                                if (tp->t_flags & XFS_TRANS_DIRTY)
                                        xfs_force_shutdown(tp->t_mountp,
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 5c89be475464..0f5191644ab2 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -160,4 +160,9 @@ typedef enum {
        XFS_BTNUM_MAX
 } xfs_btnum_t;
+struct xfs_name {
+        const char      *name;
+        int             len;
+};
 #endif  /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 45d740df53b7..2b8dc7e40772 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -40,34 +40,12 @@
 #include "xfs_itable.h"
 #include "xfs_utils.h"
-/*
- * xfs_get_dir_entry is used to get a reference to an inode given
- * its parent directory inode and the name of the file.  It does
- * not lock the child inode, and it unlocks the directory before
- * returning.  The directory's generation number is returned for
- * use by a later call to xfs_lock_dir_and_entry.
- */
-int
-xfs_get_dir_entry(
-        bhv_vname_t     *dentry,
-        xfs_inode_t     **ipp)
-{
-        bhv_vnode_t     *vp;
-        vp = VNAME_TO_VNODE(dentry);
-        *ipp = xfs_vtoi(vp);
-        if (!*ipp)
-                return XFS_ERROR(ENOENT);
-        VN_HOLD(vp);
-        return 0;
-}
 int
 xfs_dir_lookup_int(
        xfs_inode_t     *dp,
        uint            lock_mode,
-        bhv_vname_t     *dentry,
+        struct xfs_name *name,
        xfs_ino_t       *inum,
        xfs_inode_t     **ipp)
 {
@@ -75,7 +53,7 @@ xfs_dir_lookup_int(
        xfs_itrace_entry(dp);
-        error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum);
+        error = xfs_dir_lookup(NULL, dp, name, inum);
        if (!error) {
                /*
                 * Unlock the directory. We do this because we can't
@@ -339,10 +317,10 @@ xfs_bump_ino_vers2(
        ip->i_d.di_onlink = 0;
        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
        mp = tp->t_mountp;
-        if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+        if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
                spin_lock(&mp->m_sb_lock);
-                if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+                if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
-                        XFS_SB_VERSION_ADDNLINK(&mp->m_sb);
+                        xfs_sb_version_addnlink(&mp->m_sb);
                        spin_unlock(&mp->m_sb_lock);
                        xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
                } else {
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f857fcccb723..175b126d2cab 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -21,15 +21,14 @@
 #define IRELE(ip)       VN_RELE(XFS_ITOV(ip))
 #define IHOLD(ip)       VN_HOLD(XFS_ITOV(ip))
-extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **);
+extern int xfs_dir_lookup_int(xfs_inode_t *, uint, struct xfs_name *,
-extern int xfs_dir_lookup_int (xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *,
+                                xfs_ino_t *, xfs_inode_t **);
-                                xfs_inode_t **);
+extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
-extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
+extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
-extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
                                xfs_dev_t, cred_t *, prid_t, int,
                                xfs_inode_t **, int *);
-extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *);
+extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
-extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *);
+extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
-extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *);
+extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
 #endif  /* __XFS_UTILS_H__ */
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 413587f02155..fc48158fe479 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -43,7 +43,6 @@
 #include "xfs_error.h"
 #include "xfs_bmap.h"
 #include "xfs_rw.h"
-#include "xfs_refcache.h"
 #include "xfs_buf_item.h"
 #include "xfs_log_priv.h"
 #include "xfs_dir2_trace.h"
@@ -56,6 +55,7 @@
 #include "xfs_fsops.h"
 #include "xfs_vnodeops.h"
 #include "xfs_vfsops.h"
+#include "xfs_utils.h"
 int __init
@@ -69,15 +69,17 @@ xfs_init(void)
        /*
         * Initialize all of the zone allocators we use.
         */
+        xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
+                                                "xfs_log_ticket");
        xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
-                                                 "xfs_bmap_free_item");
+                                                "xfs_bmap_free_item");
        xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
-                                            "xfs_btree_cur");
+                                                "xfs_btree_cur");
-        xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+        xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
-        xfs_da_state_zone =
+                                                "xfs_da_state");
-                kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
        xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
        xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+        xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
        xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
        xfs_mru_cache_init();
        xfs_filestream_init();
@@ -113,9 +115,6 @@ xfs_init(void)
        xfs_ili_zone =
                kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
                                        KM_ZONE_SPREAD, NULL);
-        xfs_icluster_zone =
-                kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster",
-                                        KM_ZONE_SPREAD, NULL);
        /*
         * Allocate global trace buffers.
@@ -153,11 +152,9 @@ xfs_cleanup(void)
        extern kmem_zone_t      *xfs_inode_zone;
        extern kmem_zone_t      *xfs_efd_zone;
        extern kmem_zone_t      *xfs_efi_zone;
-        extern kmem_zone_t      *xfs_icluster_zone;
        xfs_cleanup_procfs();
        xfs_sysctl_unregister();
-        xfs_refcache_destroy();
        xfs_filestream_uninit();
        xfs_mru_cache_uninit();
        xfs_acl_zone_destroy(xfs_acl_zone);
@@ -189,7 +186,6 @@ xfs_cleanup(void)
        kmem_zone_destroy(xfs_efi_zone);
        kmem_zone_destroy(xfs_ifork_zone);
        kmem_zone_destroy(xfs_ili_zone);
-        kmem_zone_destroy(xfs_icluster_zone);
 }
 /*
@@ -281,8 +277,8 @@ xfs_start_flags(
                mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
        }
-        if (ap->flags & XFSMNT_IDELETE)
+        if (ap->flags & XFSMNT_IKEEP)
-                mp->m_flags |= XFS_MOUNT_IDELETE;
+                mp->m_flags |= XFS_MOUNT_IKEEP;
        if (ap->flags & XFSMNT_DIRSYNC)
                mp->m_flags |= XFS_MOUNT_DIRSYNC;
        if (ap->flags & XFSMNT_ATTR2)
@@ -330,7 +326,7 @@ xfs_finish_flags(
        int                     ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
        /* Fail a mount where the logbuf is smaller then the log stripe */
-        if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
+        if (xfs_sb_version_haslogv2(&mp->m_sb)) {
                if ((ap->logbufsize <= 0) &&
                    (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
                        mp->m_logbsize = mp->m_sb.sb_logsunit;
@@ -349,9 +345,8 @@ xfs_finish_flags(
                }
        }
-        if (XFS_SB_VERSION_HASATTR2(&mp->m_sb)) {
+        if (xfs_sb_version_hasattr2(&mp->m_sb))
                mp->m_flags |= XFS_MOUNT_ATTR2;
-        }
        /*
         * prohibit r/w mounts of read-only filesystems
@@ -366,7 +361,7 @@ xfs_finish_flags(
         * check for shared mount.
         */
        if (ap->flags & XFSMNT_SHARED) {
-                if (!XFS_SB_VERSION_HASSHARED(&mp->m_sb))
+                if (!xfs_sb_version_hasshared(&mp->m_sb))
                        return XFS_ERROR(EINVAL);
                /*
@@ -512,7 +507,7 @@ xfs_mount(
        if (!error && logdev && logdev != ddev) {
                unsigned int    log_sector_size = BBSIZE;
-                if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb))
+                if (xfs_sb_version_hassector(&mp->m_sb))
                        log_sector_size = mp->m_sb.sb_logsectsize;
                error = xfs_setsize_buftarg(mp->m_logdev_targp,
                                            mp->m_sb.sb_blocksize,
@@ -574,7 +569,7 @@ xfs_unmount(
 #ifdef HAVE_DMAPI
        if (mp->m_flags & XFS_MOUNT_DMAPI) {
                error = XFS_SEND_PREUNMOUNT(mp,
-                                rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL,
+                                rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
                                NULL, NULL, 0, 0,
                                (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
                                        0:DM_FLAGS_UNWANTED);
@@ -585,11 +580,6 @@ xfs_unmount(
                                        0 : DM_FLAGS_UNWANTED;
        }
 #endif
-        /*
-         * First blow any referenced inode from this file system
-         * out of the reference cache, and delete the timer.
-         */
-        xfs_refcache_purge_mp(mp);
        /*
         * Blow away any referenced inode in the filestreams cache.
@@ -608,7 +598,7 @@ xfs_unmount(
        /*
         * Drop the reference count
         */
-        VN_RELE(rvp);
+        IRELE(rip);
        /*
         * If we're forcing a shutdown, typically because of a media error,
@@ -630,7 +620,7 @@ out:
                /* Note: mp structure must still exist for
                 * XFS_SEND_UNMOUNT() call.
                 */
-                XFS_SEND_UNMOUNT(mp, error == 0 ? rvp : NULL,
+                XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL,
                        DM_RIGHT_NULL, 0, error, unmount_event_flags);
        }
        if (xfs_unmountfs_needed) {
@@ -647,13 +637,12 @@ out:
        return XFS_ERROR(error);
 }
-STATIC int
+STATIC void
 xfs_quiesce_fs(
        xfs_mount_t             *mp)
 {
        int                     count = 0, pincount;
-        xfs_refcache_purge_mp(mp);
        xfs_flush_buftarg(mp->m_ddev_targp, 0);
        xfs_finish_reclaim_all(mp, 0);
@@ -672,8 +661,6 @@ xfs_quiesce_fs(
                        count++;
                }
        } while (count < 2);
-        return 0;
 }
 /*
@@ -685,6 +672,8 @@ void
 xfs_attr_quiesce(
        xfs_mount_t     *mp)
 {
+        int     error = 0;
        /* wait for all modifications to complete */
        while (atomic_read(&mp->m_active_trans) > 0)
                delay(100);
@@ -695,7 +684,11 @@ xfs_attr_quiesce(
        ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
        /* Push the superblock and write an unmount record */
-        xfs_log_sbcount(mp, 1);
+        error = xfs_log_sbcount(mp, 1);
+        if (error)
+                xfs_fs_cmn_err(CE_WARN, mp,
+                                "xfs_attr_quiesce: failed to log sb changes. "
+                                "Frozen image may not be consistent.");
        xfs_log_unmount_write(mp);
        xfs_unmountfs_writesb(mp);
 }
@@ -791,8 +784,8 @@ xfs_unmount_flush(
                goto fscorrupt_out2;
        if (rbmip) {
-                VN_RELE(XFS_ITOV(rbmip));
+                IRELE(rbmip);
-                VN_RELE(XFS_ITOV(rsumip));
+                IRELE(rsumip);
        }
        xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1170,10 +1163,10 @@ xfs_sync_inodes(
                         * above, then wait until after we've unlocked
                         * the inode to release the reference.  This is
                         * because we can be already holding the inode
-                         * lock when VN_RELE() calls xfs_inactive().
+                         * lock when IRELE() calls xfs_inactive().
                         *
                         * Make sure to drop the mount lock before calling
-                         * VN_RELE() so that we don't trip over ourselves if
+                         * IRELE() so that we don't trip over ourselves if
                         * we have to go for the mount lock again in the
                         * inactive code.
                         */
@@ -1181,7 +1174,7 @@ xfs_sync_inodes(
                                IPOINTER_INSERT(ip, mp);
                        }
-                        VN_RELE(vp);
+                        IRELE(ip);
                        vnode_refed = B_FALSE;
                }
@@ -1324,30 +1317,8 @@ xfs_syncsub(
        }
        /*
-         * If this is the periodic sync, then kick some entries out of
-         * the reference cache.  This ensures that idle entries are
-         * eventually kicked out of the cache.
-         */
-        if (flags & SYNC_REFCACHE) {
-                if (flags & SYNC_WAIT)
-                        xfs_refcache_purge_mp(mp);
-                else
-                        xfs_refcache_purge_some(mp);
-        }
-        /*
-         * If asked, update the disk superblock with incore counter values if we
-         * are using non-persistent counters so that they don't get too far out
-         * of sync if we crash or get a forced shutdown. We don't want to force
-         * this to disk, just get a transaction into the iclogs....
-         */
-        if (flags & SYNC_SUPER)
-                xfs_log_sbcount(mp, 0);
-        /*
         * Now check to see if the log needs a "dummy" transaction.
         */
        if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
                xfs_trans_t *tp;
                xfs_inode_t *ip;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 51305242ff8c..6650601c64f7 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -48,7 +48,6 @@
 #include "xfs_quota.h"
 #include "xfs_utils.h"
 #include "xfs_rtalloc.h"
-#include "xfs_refcache.h"
 #include "xfs_trans_space.h"
 #include "xfs_log_priv.h"
 #include "xfs_filestream.h"
@@ -327,7 +326,7 @@ xfs_setattr(
                if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
                    !(flags & ATTR_DMI)) {
                        int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
-                        code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
+                        code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
                                vap->va_size, 0, dmflags, NULL);
                        if (code) {
                                lock_flags = 0;
@@ -634,6 +633,15 @@ xfs_setattr(
         * Truncate file.  Must have write permission and not be a directory.
         */
        if (mask & XFS_AT_SIZE) {
+                /*
+                 * Only change the c/mtime if we are changing the size
+                 * or we are explicitly asked to change it. This handles
+                 * the semantic difference between truncate() and ftruncate()
+                 * as implemented in the VFS.
+                 */
+                if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
+                        timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
                if (vap->va_size > ip->i_size) {
                        xfs_igrow_finish(tp, ip, vap->va_size,
                            !(flags & ATTR_DMI));
@@ -662,10 +670,6 @@ xfs_setattr(
                         */
                        xfs_iflags_set(ip, XFS_ITRUNCATED);
                }
-                /*
-                 * Have to do this even if the file's size doesn't change.
-                 */
-                timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
        }
        /*
@@ -877,7 +881,7 @@ xfs_setattr(
        if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
            !(flags & ATTR_DMI)) {
-                (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
+                (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
                                        NULL, DM_RIGHT_NULL, NULL, NULL,
                                        0, 0, AT_DELAY_FLAG(flags));
        }
@@ -1443,28 +1447,22 @@ xfs_inactive_attrs(
        tp = *tpp;
        mp = ip->i_mount;
        ASSERT(ip->i_d.di_forkoff != 0);
-        xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        if (error)
+                goto error_unlock;
        error = xfs_attr_inactive(ip);
-        if (error) {
+        if (error)
-                *tpp = NULL;
+                goto error_unlock;
-                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                return error; /* goto out */
-        }
        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
        error = xfs_trans_reserve(tp, 0,
                                  XFS_IFREE_LOG_RES(mp),
                                  0, XFS_TRANS_PERM_LOG_RES,
                                  XFS_INACTIVE_LOG_COUNT);
-        if (error) {
+        if (error)
-                ASSERT(XFS_FORCED_SHUTDOWN(mp));
+                goto error_cancel;
-                xfs_trans_cancel(tp, 0);
-                *tpp = NULL;
-                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                return error;
-        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
@@ -1475,6 +1473,14 @@ xfs_inactive_attrs(
        *tpp = tp;
        return 0;
+error_cancel:
+        ASSERT(XFS_FORCED_SHUTDOWN(mp));
+        xfs_trans_cancel(tp, 0);
+error_unlock:
+        *tpp = NULL;
+        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+        return error;
 }
 int
@@ -1520,12 +1526,6 @@ xfs_release(
                        xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
        }
-#ifdef HAVE_REFCACHE
-        /* If we are in the NFS reference cache then don't do this now */
-        if (ip->i_refcache)
-                return 0;
-#endif
        if (ip->i_d.di_nlink != 0) {
                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
@@ -1588,9 +1588,8 @@ xfs_inactive(
        mp = ip->i_mount;
-        if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
+        if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
-                (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
+                XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
-        }
        error = 0;
@@ -1744,11 +1743,18 @@ xfs_inactive(
                XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
                /*
-                 * Just ignore errors at this point.  There is
+                 * Just ignore errors at this point.  There is nothing we can
-                 * nothing we can do except to try to keep going.
+                 * do except to try to keep going. Make sure it's not a silent
+                 * error.
                 */
-                (void) xfs_bmap_finish(&tp,  &free_list, &committed);
+                error = xfs_bmap_finish(&tp,  &free_list, &committed);
-                (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+                if (error)
+                        xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
+                                "xfs_bmap_finish() returned error %d", error);
+                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+                if (error)
+                        xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
+                                "xfs_trans_commit() returned error %d", error);
        }
        /*
         * Release the dquots held by inode, if any.
@@ -1765,8 +1771,8 @@ xfs_inactive(
 int
 xfs_lookup(
        xfs_inode_t             *dp,
-        bhv_vname_t             *dentry,
+        struct xfs_name         *name,
-        bhv_vnode_t             **vpp)
+        xfs_inode_t             **ipp)
 {
        xfs_inode_t             *ip;
        xfs_ino_t               e_inum;
@@ -1779,9 +1785,9 @@ xfs_lookup(
                return XFS_ERROR(EIO);
        lock_mode = xfs_ilock_map_shared(dp);
-        error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
+        error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip);
        if (!error) {
-                *vpp = XFS_ITOV(ip);
+                *ipp = ip;
                xfs_itrace_ref(ip);
        }
        xfs_iunlock_map_shared(dp, lock_mode);
@@ -1791,19 +1797,16 @@ xfs_lookup(
 int
 xfs_create(
        xfs_inode_t             *dp,
-        bhv_vname_t             *dentry,
+        struct xfs_name         *name,
        mode_t                  mode,
        xfs_dev_t               rdev,
-        bhv_vnode_t             **vpp,
+        xfs_inode_t             **ipp,
        cred_t                  *credp)
 {
-        char                    *name = VNAME(dentry);
+        xfs_mount_t             *mp = dp->i_mount;
-        xfs_mount_t             *mp = dp->i_mount;
-        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
        xfs_inode_t             *ip;
-        bhv_vnode_t             *vp = NULL;
        xfs_trans_t             *tp;
-        int                     error;
+        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        boolean_t               unlock_dp_on_error = B_FALSE;
@@ -1813,17 +1816,14 @@ xfs_create(
        xfs_prid_t              prid;
        struct xfs_dquot        *udqp, *gdqp;
        uint                    resblks;
-        int                     namelen;
-        ASSERT(!*vpp);
+        ASSERT(!*ipp);
        xfs_itrace_entry(dp);
-        namelen = VNAMELEN(dentry);
        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-                                dir_vp, DM_RIGHT_NULL, NULL,
+                                dp, DM_RIGHT_NULL, NULL,
-                                DM_RIGHT_NULL, name, NULL,
+                                DM_RIGHT_NULL, name->name, NULL,
                                mode, 0, 0);
                if (error)
@@ -1855,7 +1855,7 @@ xfs_create(
        tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-        resblks = XFS_CREATE_SPACE_RES(mp, namelen);
+        resblks = XFS_CREATE_SPACE_RES(mp, name->len);
        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
@@ -1888,7 +1888,8 @@ xfs_create(
        if (error)
                goto error_return;
-        if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
+        error = xfs_dir_canenter(tp, dp, name, resblks);
+        if (error)
                goto error_return;
        error = xfs_dir_ialloc(&tp, dp, mode, 1,
                        rdev, credp, prid, resblks > 0,
@@ -1914,11 +1915,11 @@ xfs_create(
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
-        VN_HOLD(dir_vp);
+        IHOLD(dp);
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = B_FALSE;
-        error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
+        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
                                        &first_block, &free_list, resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
@@ -1952,7 +1953,6 @@ xfs_create(
         * vnode to the caller, we bump the vnode ref count now.
         */
        IHOLD(ip);
-        vp = XFS_ITOV(ip);
        error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error) {
@@ -1970,17 +1970,17 @@ xfs_create(
        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);
-        *vpp = vp;
+        *ipp = ip;
        /* Fallthrough to std_return with error = 0  */
 std_return:
-        if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
+        if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
            DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-                        dir_vp, DM_RIGHT_NULL,
+                        dp, DM_RIGHT_NULL,
-                        *vpp ? vp:NULL,
+                        *ipp ? ip : NULL,
-                        DM_RIGHT_NULL, name, NULL,
+                        DM_RIGHT_NULL, name->name, NULL,
                        mode, error, 0);
        }
        return error;
@@ -2272,46 +2272,32 @@ int remove_which_error_return = 0;
 int
 xfs_remove(
        xfs_inode_t             *dp,
-        bhv_vname_t             *dentry)
+        struct xfs_name         *name,
+        xfs_inode_t             *ip)
 {
-        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
-        char                    *name = VNAME(dentry);
        xfs_mount_t             *mp = dp->i_mount;
-        xfs_inode_t             *ip;
        xfs_trans_t             *tp = NULL;
        int                     error = 0;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        int                     cancel_flags;
        int                     committed;
-        int                     dm_di_mode = 0;
        int                     link_zero;
        uint                    resblks;
-        int                     namelen;
        xfs_itrace_entry(dp);
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
-        namelen = VNAMELEN(dentry);
-        if (!xfs_get_dir_entry(dentry, &ip)) {
-                dm_di_mode = ip->i_d.di_mode;
-                IRELE(ip);
-        }
        if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-                error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
+                error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
-                                        DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
+                                        NULL, DM_RIGHT_NULL, name->name, NULL,
-                                        name, NULL, dm_di_mode, 0, 0);
+                                        ip->i_d.di_mode, 0, 0);
                if (error)
                        return error;
        }
-        /* From this point on, return through std_return */
-        ip = NULL;
        /*
         * We need to get a reference to ip before we get our log
         * reservation. The reason for this is that we cannot call
@@ -2324,13 +2310,7 @@ xfs_remove(
         * when we call xfs_iget.  Instead we get an unlocked reference
         * to the inode before getting our log reservation.
         */
-        error = xfs_get_dir_entry(dentry, &ip);
+        IHOLD(ip);
-        if (error) {
-                REMOVE_DEBUG_TRACE(__LINE__);
-                goto std_return;
-        }
-        dm_di_mode = ip->i_d.di_mode;
        xfs_itrace_entry(ip);
        xfs_itrace_ref(ip);
@@ -2398,7 +2378,7 @@ xfs_remove(
         * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
         */
        XFS_BMAP_INIT(&free_list, &first_block);
-        error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
+        error = xfs_dir_removename(tp, dp, name, ip->i_ino,
                                        &first_block, &free_list, 0);
        if (error) {
                ASSERT(error != ENOENT);
@@ -2449,14 +2429,6 @@ xfs_remove(
        }
        /*
-         * Before we drop our extra reference to the inode, purge it
-         * from the refcache if it is there.  By waiting until afterwards
-         * to do the IRELE, we ensure that we won't go inactive in the
-         * xfs_refcache_purge_ip routine (although that would be OK).
-         */
-        xfs_refcache_purge_ip(ip);
-        /*
         * If we are using filestreams, kill the stream association.
         * If the file is still open it may get a new one but that
         * will get killed on last close in xfs_close() so we don't
@@ -2472,9 +2444,9 @@ xfs_remove(
 std_return:
        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
-                                dir_vp, DM_RIGHT_NULL,
+                                dp, DM_RIGHT_NULL,
                                NULL, DM_RIGHT_NULL,
-                                name, NULL, dm_di_mode, error, 0);
+                                name->name, NULL, ip->i_d.di_mode, error, 0);
        }
        return error;
@@ -2495,14 +2467,6 @@ xfs_remove(
        cancel_flags |= XFS_TRANS_ABORT;
        xfs_trans_cancel(tp, cancel_flags);
-        /*
-         * Before we drop our extra reference to the inode, purge it
-         * from the refcache if it is there.  By waiting until afterwards
-         * to do the IRELE, we ensure that we won't go inactive in the
-         * xfs_refcache_purge_ip routine (although that would be OK).
-         */
-        xfs_refcache_purge_ip(ip);
        IRELE(ip);
        goto std_return;
@@ -2511,12 +2475,10 @@ xfs_remove(
 int
 xfs_link(
        xfs_inode_t             *tdp,
-        bhv_vnode_t             *src_vp,
+        xfs_inode_t             *sip,
-        bhv_vname_t             *dentry)
+        struct xfs_name         *target_name)
 {
-        bhv_vnode_t             *target_dir_vp = XFS_ITOV(tdp);
        xfs_mount_t             *mp = tdp->i_mount;
-        xfs_inode_t             *sip = xfs_vtoi(src_vp);
        xfs_trans_t             *tp;
        xfs_inode_t             *ips[2];
        int                     error;
@@ -2525,23 +2487,20 @@ xfs_link(
        int                     cancel_flags;
        int                     committed;
        int                     resblks;
-        char                    *target_name = VNAME(dentry);
-        int                     target_namelen;
        xfs_itrace_entry(tdp);
-        xfs_itrace_entry(xfs_vtoi(src_vp));
+        xfs_itrace_entry(sip);
-        target_namelen = VNAMELEN(dentry);
+        ASSERT(!S_ISDIR(sip->i_d.di_mode));
-        ASSERT(!VN_ISDIR(src_vp));
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
        if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
-                                        target_dir_vp, DM_RIGHT_NULL,
+                                        tdp, DM_RIGHT_NULL,
-                                        src_vp, DM_RIGHT_NULL,
+                                        sip, DM_RIGHT_NULL,
-                                        target_name, NULL, 0, 0, 0);
+                                        target_name->name, NULL, 0, 0, 0);
                if (error)
                        return error;
        }
@@ -2556,7 +2515,7 @@ xfs_link(
        tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-        resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
+        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
        error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
        if (error == ENOSPC) {
@@ -2584,8 +2543,8 @@ xfs_link(
         * xfs_trans_cancel will both unlock the inodes and
         * decrement the associated ref counts.
         */
-        VN_HOLD(src_vp);
+        IHOLD(sip);
-        VN_HOLD(target_dir_vp);
+        IHOLD(tdp);
        xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
@@ -2608,15 +2567,14 @@ xfs_link(
                goto error_return;
        }
-        if (resblks == 0 &&
+        error = xfs_dir_canenter(tp, tdp, target_name, resblks);
-            (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
+        if (error)
                goto error_return;
        XFS_BMAP_INIT(&free_list, &first_block);
-        error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
+        error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
-                                   sip->i_ino, &first_block, &free_list,
+                                        &first_block, &free_list, resblks);
-                                   resblks);
        if (error)
                goto abort_return;
        xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2650,9 +2608,9 @@ xfs_link(
 std_return:
        if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
-                                target_dir_vp, DM_RIGHT_NULL,
+                                tdp, DM_RIGHT_NULL,
-                                src_vp, DM_RIGHT_NULL,
+                                sip, DM_RIGHT_NULL,
-                                target_name, NULL, 0, error, 0);
+                                target_name->name, NULL, 0, error, 0);
        }
        return error;
@@ -2669,17 +2627,13 @@ std_return:
 int
 xfs_mkdir(
        xfs_inode_t             *dp,
-        bhv_vname_t             *dentry,
+        struct xfs_name         *dir_name,
        mode_t                  mode,
-        bhv_vnode_t             **vpp,
+        xfs_inode_t             **ipp,
        cred_t                  *credp)
 {
-        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
-        char                    *dir_name = VNAME(dentry);
-        int                     dir_namelen = VNAMELEN(dentry);
        xfs_mount_t             *mp = dp->i_mount;
        xfs_inode_t             *cdp;   /* inode of created dir */
-        bhv_vnode_t             *cvp;   /* vnode of created dir */
        xfs_trans_t             *tp;
        int                     cancel_flags;
        int                     error;
@@ -2700,8 +2654,8 @@ xfs_mkdir(
        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-                                        dir_vp, DM_RIGHT_NULL, NULL,
+                                        dp, DM_RIGHT_NULL, NULL,
-                                        DM_RIGHT_NULL, dir_name, NULL,
+                                        DM_RIGHT_NULL, dir_name->name, NULL,
                                        mode, 0, 0);
                if (error)
                        return error;
@@ -2730,7 +2684,7 @@ xfs_mkdir(
        tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-        resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
+        resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
        error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
                                  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
        if (error == ENOSPC) {
@@ -2762,8 +2716,8 @@ xfs_mkdir(
        if (error)
                goto error_return;
-        if (resblks == 0 &&
+        error = xfs_dir_canenter(tp, dp, dir_name, resblks);
-            (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
+        if (error)
                goto error_return;
        /*
         * create the directory inode.
@@ -2786,15 +2740,15 @@ xfs_mkdir(
         * from here on will result in the transaction cancel
         * unlocking dp so don't do it explicitly in the error path.
         */
-        VN_HOLD(dir_vp);
+        IHOLD(dp);
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = B_FALSE;
        XFS_BMAP_INIT(&free_list, &first_block);
-        error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
+        error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
-                                   &first_block, &free_list, resblks ?
+                                        &first_block, &free_list, resblks ?
-                                   resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != ENOSPC);
                goto error1;
@@ -2817,11 +2771,9 @@ xfs_mkdir(
        if (error)
                goto error2;
-        cvp = XFS_ITOV(cdp);
        created = B_TRUE;
-        *vpp = cvp;
+        *ipp = cdp;
        IHOLD(cdp);
        /*
@@ -2858,10 +2810,10 @@ std_return:
        if ((created || (error != 0 && dm_event_sent != 0)) &&
            DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-                                        dir_vp, DM_RIGHT_NULL,
+                                        dp, DM_RIGHT_NULL,
-                                        created ? XFS_ITOV(cdp):NULL,
+                                        created ? cdp : NULL,
                                        DM_RIGHT_NULL,
-                                        dir_name, NULL,
+                                        dir_name->name, NULL,
                                        mode, error, 0);
        }
        return error;
@@ -2885,20 +2837,17 @@ std_return:
 int
 xfs_rmdir(
        xfs_inode_t             *dp,
-        bhv_vname_t             *dentry)
+        struct xfs_name         *name,
+        xfs_inode_t             *cdp)
 {
        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
-        char                    *name = VNAME(dentry);
-        int                     namelen = VNAMELEN(dentry);
        xfs_mount_t             *mp = dp->i_mount;
-        xfs_inode_t             *cdp;   /* child directory */
        xfs_trans_t             *tp;
        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        int                     cancel_flags;
        int                     committed;
-        int                     dm_di_mode = S_IFDIR;
        int                     last_cdp_link;
        uint                    resblks;
@@ -2907,24 +2856,15 @@ xfs_rmdir(
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
-        if (!xfs_get_dir_entry(dentry, &cdp)) {
-                dm_di_mode = cdp->i_d.di_mode;
-                IRELE(cdp);
-        }
        if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
-                                        dir_vp, DM_RIGHT_NULL,
+                                        dp, DM_RIGHT_NULL,
-                                        NULL, DM_RIGHT_NULL,
+                                        NULL, DM_RIGHT_NULL, name->name,
-                                        name, NULL, dm_di_mode, 0, 0);
+                                        NULL, cdp->i_d.di_mode, 0, 0);
                if (error)
                        return XFS_ERROR(error);
        }
-        /* Return through std_return after this point. */
-        cdp = NULL;
        /*
         * We need to get a reference to cdp before we get our log
         * reservation.  The reason for this is that we cannot call
@@ -2937,13 +2877,7 @@ xfs_rmdir(
         * when we call xfs_iget.  Instead we get an unlocked reference
         * to the inode before getting our log reservation.
         */
-        error = xfs_get_dir_entry(dentry, &cdp);
+        IHOLD(cdp);
-        if (error) {
-                REMOVE_DEBUG_TRACE(__LINE__);
-                goto std_return;
-        }
-        mp = dp->i_mount;
-        dm_di_mode = cdp->i_d.di_mode;
        /*
         * Get the dquots for the inodes.
@@ -3020,7 +2954,7 @@ xfs_rmdir(
                goto error_return;
        }
-        error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
+        error = xfs_dir_removename(tp, dp, name, cdp->i_ino,
                                        &first_block, &free_list, resblks);
        if (error)
                goto error1;
@@ -3098,9 +3032,9 @@ xfs_rmdir(
 std_return:
        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
-                                        dir_vp, DM_RIGHT_NULL,
+                                        dp, DM_RIGHT_NULL,
                                        NULL, DM_RIGHT_NULL,
-                                        name, NULL, dm_di_mode,
+                                        name->name, NULL, cdp->i_d.di_mode,
                                        error, 0);
        }
        return error;
@@ -3118,13 +3052,12 @@ xfs_rmdir(
 int
 xfs_symlink(
        xfs_inode_t             *dp,
-        bhv_vname_t             *dentry,
+        struct xfs_name         *link_name,
-        char                    *target_path,
+        const char              *target_path,
        mode_t                  mode,
-        bhv_vnode_t             **vpp,
+        xfs_inode_t             **ipp,
        cred_t                  *credp)
 {
-        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
        xfs_mount_t             *mp = dp->i_mount;
        xfs_trans_t             *tp;
        xfs_inode_t             *ip;
@@ -3140,17 +3073,15 @@ xfs_symlink(
        int                     nmaps;
        xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
        xfs_daddr_t             d;
-        char                    *cur_chunk;
+        const char              *cur_chunk;
        int                     byte_cnt;
        int                     n;
        xfs_buf_t               *bp;
        xfs_prid_t              prid;
        struct xfs_dquot        *udqp, *gdqp;
        uint                    resblks;
-        char                    *link_name = VNAME(dentry);
-        int                     link_namelen;
-        *vpp = NULL;
+        *ipp = NULL;
        error = 0;
        ip = NULL;
        tp = NULL;
@@ -3160,44 +3091,17 @@ xfs_symlink(
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
-        link_namelen = VNAMELEN(dentry);
        /*
         * Check component lengths of the target path name.
         */
        pathlen = strlen(target_path);
        if (pathlen >= MAXPATHLEN)      /* total string too long */
                return XFS_ERROR(ENAMETOOLONG);
-        if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
-                int len, total;
-                char *path;
-                for (total = 0, path = target_path; total < pathlen;) {
-                        /*
-                         * Skip any slashes.
-                         */
-                        while(*path == '/') {
-                                total++;
-                                path++;
-                        }
-                        /*
-                         * Count up to the next slash or end of path.
-                         * Error out if the component is bigger than MAXNAMELEN.
-                         */
-                        for(len = 0; *path != '/' && total < pathlen;total++, path++) {
-                                if (++len >= MAXNAMELEN) {
-                                        error = ENAMETOOLONG;
-                                        return error;
-                                }
-                        }
-                }
-        }
        if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
-                error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
+                error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
                                        DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-                                        link_name, target_path, 0, 0, 0);
+                                        link_name->name, target_path, 0, 0, 0);
                if (error)
                        return error;
        }
@@ -3229,7 +3133,7 @@ xfs_symlink(
                fs_blocks = 0;
        else
                fs_blocks = XFS_B_TO_FSB(mp, pathlen);
-        resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
+        resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
        error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
        if (error == ENOSPC && fs_blocks == 0) {
@@ -3263,8 +3167,8 @@ xfs_symlink(
        /*
         * Check for ability to enter directory entry, if no space reserved.
         */
-        if (resblks == 0 &&
+        error = xfs_dir_canenter(tp, dp, link_name, resblks);
-            (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
+        if (error)
                goto error_return;
        /*
         * Initialize the bmap freelist prior to calling either
@@ -3289,7 +3193,7 @@ xfs_symlink(
         * transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
-        VN_HOLD(dir_vp);
+        IHOLD(dp);
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = B_FALSE;
@@ -3356,8 +3260,8 @@ xfs_symlink(
        /*
         * Create the directory entry for the symlink.
         */
-        error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
+        error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
-                                   &first_block, &free_list, resblks);
+                                        &first_block, &free_list, resblks);
        if (error)
                goto error1;
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3399,19 +3303,14 @@ xfs_symlink(
 std_return:
        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
-                                        dir_vp, DM_RIGHT_NULL,
+                                        dp, DM_RIGHT_NULL,
-                                        error ? NULL : XFS_ITOV(ip),
+                                        error ? NULL : ip,
-                                        DM_RIGHT_NULL, link_name, target_path,
+                                        DM_RIGHT_NULL, link_name->name,
-                                        0, error, 0);
+                                        target_path, 0, error, 0);
        }
-        if (!error) {
+        if (!error)
-                bhv_vnode_t *vp;
+                *ipp = ip;
-                ASSERT(ip);
-                vp = XFS_ITOV(ip);
-                *vpp = vp;
-        }
        return error;
 error2:
@@ -3431,60 +3330,11 @@ std_return:
 }
 int
-xfs_rwlock(
-        xfs_inode_t     *ip,
-        bhv_vrwlock_t   locktype)
-{
-        if (S_ISDIR(ip->i_d.di_mode))
-                return 1;
-        if (locktype == VRWLOCK_WRITE) {
-                xfs_ilock(ip, XFS_IOLOCK_EXCL);
-        } else if (locktype == VRWLOCK_TRY_READ) {
-                return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
-        } else if (locktype == VRWLOCK_TRY_WRITE) {
-                return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
-        } else {
-                ASSERT((locktype == VRWLOCK_READ) ||
-                       (locktype == VRWLOCK_WRITE_DIRECT));
-                xfs_ilock(ip, XFS_IOLOCK_SHARED);
-        }
-        return 1;
-}
-void
-xfs_rwunlock(
-        xfs_inode_t     *ip,
-        bhv_vrwlock_t   locktype)
-{
-        if (S_ISDIR(ip->i_d.di_mode))
-                return;
-        if (locktype == VRWLOCK_WRITE) {
-                /*
-                 * In the write case, we may have added a new entry to
-                 * the reference cache.  This might store a pointer to
-                 * an inode to be released in this inode.  If it is there,
-                 * clear the pointer and release the inode after unlocking
-                 * this one.
-                 */
-                xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
-        } else {
-                ASSERT((locktype == VRWLOCK_READ) ||
-                       (locktype == VRWLOCK_WRITE_DIRECT));
-                xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-        }
-        return;
-}
-int
 xfs_inode_flush(
        xfs_inode_t     *ip,
        int             flags)
 {
        xfs_mount_t     *mp = ip->i_mount;
-        xfs_inode_log_item_t *iip = ip->i_itemp;
        int             error = 0;
        if (XFS_FORCED_SHUTDOWN(mp))
@@ -3494,33 +3344,9 @@ xfs_inode_flush(
         * Bypass inodes which have already been cleaned by
         * the inode flush clustering code inside xfs_iflush
         */
-        if ((ip->i_update_core == 0) &&
+        if (xfs_inode_clean(ip))
-            ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
                return 0;
-        if (flags & FLUSH_LOG) {
-                if (iip && iip->ili_last_lsn) {
-                        xlog_t          *log = mp->m_log;
-                        xfs_lsn_t       sync_lsn;
-                        int             log_flags = XFS_LOG_FORCE;
-                        spin_lock(&log->l_grant_lock);
-                        sync_lsn = log->l_last_sync_lsn;
-                        spin_unlock(&log->l_grant_lock);
-                        if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
-                                if (flags & FLUSH_SYNC)
-                                        log_flags |= XFS_LOG_SYNC;
-                                error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
-                                if (error)
-                                        return error;
-                        }
-                        if (ip->i_update_core == 0)
-                                return 0;
-                }
-        }
        /*
         * We make this non-blocking if the inode is contended,
         * return EAGAIN to indicate to the caller that they
@@ -3528,30 +3354,22 @@ xfs_inode_flush(
         * blocking on inodes inside another operation right
         * now, they get caught later by xfs_sync.
         */
-        if (flags & FLUSH_INODE) {
+        if (flags & FLUSH_SYNC) {
-                int     flush_flags;
+                xfs_ilock(ip, XFS_ILOCK_SHARED);
+                xfs_iflock(ip);
-                if (flags & FLUSH_SYNC) {
+        } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-                        xfs_ilock(ip, XFS_ILOCK_SHARED);
+                if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-                        xfs_iflock(ip);
+                        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-                } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-                        if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-                                xfs_iunlock(ip, XFS_ILOCK_SHARED);
-                                return EAGAIN;
-                        }
-                } else {
                        return EAGAIN;
                }
+        } else {
-                if (flags & FLUSH_SYNC)
+                return EAGAIN;
-                        flush_flags = XFS_IFLUSH_SYNC;
-                else
-                        flush_flags = XFS_IFLUSH_ASYNC;
-                error = xfs_iflush(ip, flush_flags);
-                xfs_iunlock(ip, XFS_ILOCK_SHARED);
        }
+        error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
+                                                    : XFS_IFLUSH_ASYNC_NOBLOCK);
+        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return error;
 }
@@ -3694,12 +3512,12 @@ xfs_finish_reclaim(
         * We get the flush lock regardless, though, just to make sure
         * we don't free it while it is being flushed.
         */
-        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+        if (!locked) {
-                if (!locked) {
+                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                        xfs_ilock(ip, XFS_ILOCK_EXCL);
+                xfs_iflock(ip);
-                        xfs_iflock(ip);
+        }
-                }
+        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                if (ip->i_update_core ||
                    ((ip->i_itemp != NULL) &&
                     (ip->i_itemp->ili_format.ilf_fields != 0))) {
@@ -3719,17 +3537,11 @@ xfs_finish_reclaim(
                ASSERT(ip->i_update_core == 0);
                ASSERT(ip->i_itemp == NULL ||
                       ip->i_itemp->ili_format.ilf_fields == 0);
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        } else if (locked) {
-                /*
-                 * We are not interested in doing an iflush if we're
-                 * in the process of shutting down the filesystem forcibly.
-                 * So, just reclaim the inode.
-                 */
-                xfs_ifunlock(ip);
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
+        xfs_ifunlock(ip);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 reclaim:
        xfs_ireclaim(ip);
        return 0;
@@ -3845,9 +3657,8 @@ xfs_alloc_file_space(
                end_dmi_offset = offset+len;
                if (end_dmi_offset > ip->i_size)
                        end_dmi_offset = ip->i_size;
-                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
+                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
-                        offset, end_dmi_offset - offset,
+                                      end_dmi_offset - offset, 0, NULL);
-                        0, NULL);
                if (error)
                        return error;
        }
@@ -3956,8 +3767,8 @@ dmapi_enospc_check:
        if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
            DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
-                                XFS_ITOV(ip), DM_RIGHT_NULL,
+                                ip, DM_RIGHT_NULL,
-                                XFS_ITOV(ip), DM_RIGHT_NULL,
+                                ip, DM_RIGHT_NULL,
                                NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
                if (error == 0)
                        goto retry;     /* Maybe DMAPI app. has made space */
@@ -4021,7 +3832,8 @@ xfs_zero_remaining_bytes(
                XFS_BUF_READ(bp);
                XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
                xfsbdstrat(mp, bp);
-                if ((error = xfs_iowait(bp))) {
+                error = xfs_iowait(bp);
+                if (error) {
                        xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
                                          mp, bp, XFS_BUF_ADDR(bp));
                        break;
@@ -4033,7 +3845,8 @@ xfs_zero_remaining_bytes(
                XFS_BUF_UNREAD(bp);
                XFS_BUF_WRITE(bp);
                xfsbdstrat(mp, bp);
-                if ((error = xfs_iowait(bp))) {
+                error = xfs_iowait(bp);
+                if (error) {
                        xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
                                          mp, bp, XFS_BUF_ADDR(bp));
                        break;
@@ -4102,7 +3915,7 @@ xfs_free_file_space(
            DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
                if (end_dmi_offset > ip->i_size)
                        end_dmi_offset = ip->i_size;
-                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
+                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
                                offset, end_dmi_offset - offset,
                                AT_DELAY_FLAG(attr_flags), NULL);
                if (error)
@@ -4132,7 +3945,7 @@ xfs_free_file_space(
         * actually need to zero the extent edges.  Otherwise xfs_bunmapi
         * will take care of it for us.
         */
-        if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+        if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
                nimap = 1;
                error = xfs_bmapi(NULL, ip, startoffset_fsb,
                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 4e3970f0e5e3..24c53923dc2c 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -23,31 +23,32 @@ int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
                xfs_off_t stop);
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry,
+int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
-                bhv_vnode_t **vpp);
+                struct xfs_inode **ipp);
-int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode,
+int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
-                xfs_dev_t rdev, bhv_vnode_t **vpp, struct cred *credp);
+                xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
-int xfs_remove(struct xfs_inode *dp, bhv_vname_t        *dentry);
+int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
-int xfs_link(struct xfs_inode *tdp, bhv_vnode_t *src_vp,
+                struct xfs_inode *ip);
-                bhv_vname_t *dentry);
+int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
-int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry,
+                struct xfs_name *target_name);
-                mode_t mode, bhv_vnode_t **vpp, struct cred *credp);
+int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
-int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry);
+                mode_t mode, struct xfs_inode **ipp, struct cred *credp);
+int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name,
+                struct xfs_inode *cdp);
 int xfs_readdir(struct xfs_inode        *dp, void *dirent, size_t bufsize,
                       xfs_off_t *offset, filldir_t filldir);
-int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry,
+int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-                char *target_path, mode_t mode, bhv_vnode_t **vpp,
+                const char *target_path, mode_t mode, struct xfs_inode **ipp,
                struct cred *credp);
-int xfs_rwlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
-void xfs_rwunlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
 int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
                xfs_flock64_t *bf, xfs_off_t offset,
                struct cred *credp, int attr_flags);
-int xfs_rename(struct xfs_inode *src_dp, bhv_vname_t *src_vname,
+int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
-                bhv_vnode_t *target_dir_vp, bhv_vname_t *target_vname);
+                struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+                struct xfs_name *target_name);
 int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
                int *valuelenp, int flags, cred_t *cred);
 int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
author	David Woodhouse <dwmw2@infradead.org>	2008-04-22 07:34:25 -0400
committer	David Woodhouse <dwmw2@infradead.org>	2008-04-22 07:34:25 -0400
commit	f838bad1b3be8ca0c785ee0e0c570dfda74cf377 (patch)
tree	5a842a8056a708cfad55a20fa8ab733dd94b0903 /fs
parent	dd919660aacdf4adfcd279556aa03e595f7f0fc2 (diff)
parent	807501475fce0ebe68baedf87f202c3e4ee0d12c (diff)