aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-12-31 02:14:29 -0500
committerIngo Molnar <mingo@elte.hu>2008-12-31 02:14:29 -0500
commit5fdf7e5975a0b0f6a0370655612c5dca3fd6311b (patch)
tree639c536e818c6ace974aa285ba94576df0353b01 /fs
parent7a51cffbd10886c0557677dd916c090097c691ef (diff)
parent6a94cb73064c952255336cc57731904174b2c58f (diff)
Merge branch 'linus' into tracing/kmemtrace
Conflicts: mm/slub.c
Diffstat (limited to 'fs')
-rw-r--r--fs/aio.c100
-rw-r--r--fs/bio-integrity.c2
-rw-r--r--fs/bio.c320
-rw-r--r--fs/buffer.c19
-rw-r--r--fs/exec.c8
-rw-r--r--fs/ext4/super.c8
-rw-r--r--fs/inode.c209
-rw-r--r--fs/jfs/inode.c8
-rw-r--r--fs/lockd/clntlock.c23
-rw-r--r--fs/lockd/host.c10
-rw-r--r--fs/lockd/svc.c6
-rw-r--r--fs/nfs/callback.c36
-rw-r--r--fs/nfs/client.c95
-rw-r--r--fs/nfs/delegation.c260
-rw-r--r--fs/nfs/delegation.h33
-rw-r--r--fs/nfs/dir.c24
-rw-r--r--fs/nfs/inode.c13
-rw-r--r--fs/nfs/internal.h14
-rw-r--r--fs/nfs/mount_clnt.c34
-rw-r--r--fs/nfs/nfs4_fs.h32
-rw-r--r--fs/nfs/nfs4proc.c431
-rw-r--r--fs/nfs/nfs4renewd.c22
-rw-r--r--fs/nfs/nfs4state.c415
-rw-r--r--fs/nfs/nfs4xdr.c1235
-rw-r--r--fs/nfs/nfsroot.c27
-rw-r--r--fs/nfs/read.c6
-rw-r--r--fs/nfs/super.c44
-rw-r--r--fs/nfs_common/nfsacl.c4
-rw-r--r--fs/nfsd/nfs4callback.c9
-rw-r--r--fs/nfsd/nfs4state.c12
-rw-r--r--fs/proc/stat.c16
-rw-r--r--fs/xfs/Makefile6
-rw-r--r--fs/xfs/linux-2.6/sv.h22
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c66
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c87
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h30
-rw-r--r--fs/xfs/linux-2.6/xfs_cred.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c189
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c223
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h82
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c849
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.h214
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c122
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h13
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c50
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.h65
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c884
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h15
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c762
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h55
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_vfs.h77
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.c145
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h72
-rw-r--r--fs/xfs/quota/xfs_dquot.c39
-rw-r--r--fs/xfs/quota/xfs_dquot.h4
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c45
-rw-r--r--fs/xfs/quota/xfs_qm.c57
-rw-r--r--fs/xfs/quota/xfs_qm.h3
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c5
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c151
-rw-r--r--fs/xfs/support/debug.c39
-rw-r--r--fs/xfs/support/debug.h2
-rw-r--r--fs/xfs/support/ktrace.c9
-rw-r--r--fs/xfs/xfs.h2
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_ag.h15
-rw-r--r--fs/xfs/xfs_alloc.c264
-rw-r--r--fs/xfs/xfs_alloc.h27
-rw-r--r--fs/xfs/xfs_alloc_btree.c2387
-rw-r--r--fs/xfs/xfs_alloc_btree.h107
-rw-r--r--fs/xfs/xfs_arch.h39
-rw-r--r--fs/xfs/xfs_bit.h3
-rw-r--r--fs/xfs/xfs_bmap.c410
-rw-r--r--fs/xfs/xfs_bmap.h72
-rw-r--r--fs/xfs/xfs_bmap_btree.c2617
-rw-r--r--fs/xfs/xfs_bmap_btree.h171
-rw-r--r--fs/xfs/xfs_btree.c3596
-rw-r--r--fs/xfs/xfs_btree.h392
-rw-r--r--fs/xfs/xfs_btree_trace.c249
-rw-r--r--fs/xfs/xfs_btree_trace.h116
-rw-r--r--fs/xfs/xfs_buf_item.c45
-rw-r--r--fs/xfs/xfs_clnt.h105
-rw-r--r--fs/xfs/xfs_da_btree.h24
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dfrag.h2
-rw-r--r--fs/xfs/xfs_dinode.h148
-rw-r--r--fs/xfs/xfs_dir2_sf.h7
-rw-r--r--fs/xfs/xfs_dmops.c5
-rw-r--r--fs/xfs/xfs_error.c15
-rw-r--r--fs/xfs/xfs_error.h12
-rw-r--r--fs/xfs/xfs_extfree_item.c45
-rw-r--r--fs/xfs/xfs_fs.h22
-rw-r--r--fs/xfs/xfs_fsops.c30
-rw-r--r--fs/xfs/xfs_ialloc.c449
-rw-r--r--fs/xfs/xfs_ialloc.h31
-rw-r--r--fs/xfs/xfs_ialloc_btree.c2193
-rw-r--r--fs/xfs/xfs_ialloc_btree.h111
-rw-r--r--fs/xfs/xfs_iget.c735
-rw-r--r--fs/xfs/xfs_imap.h40
-rw-r--r--fs/xfs/xfs_inode.c587
-rw-r--r--fs/xfs/xfs_inode.h375
-rw-r--r--fs/xfs/xfs_inode_item.c45
-rw-r--r--fs/xfs/xfs_inode_item.h41
-rw-r--r--fs/xfs/xfs_iomap.c28
-rw-r--r--fs/xfs/xfs_itable.c102
-rw-r--r--fs/xfs/xfs_itable.h14
-rw-r--r--fs/xfs/xfs_log.c81
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_priv.h48
-rw-r--r--fs/xfs/xfs_log_recover.c416
-rw-r--r--fs/xfs/xfs_mount.c81
-rw-r--r--fs/xfs/xfs_mount.h73
-rw-r--r--fs/xfs/xfs_qmops.c5
-rw-r--r--fs/xfs/xfs_quota.h8
-rw-r--r--fs/xfs/xfs_rename.c61
-rw-r--r--fs/xfs/xfs_rtalloc.c41
-rw-r--r--fs/xfs/xfs_rw.c2
-rw-r--r--fs/xfs/xfs_sb.h167
-rw-r--r--fs/xfs/xfs_trans.c22
-rw-r--r--fs/xfs/xfs_trans.h322
-rw-r--r--fs/xfs/xfs_trans_ail.c362
-rw-r--r--fs/xfs/xfs_trans_buf.c7
-rw-r--r--fs/xfs/xfs_trans_inode.c30
-rw-r--r--fs/xfs/xfs_trans_item.c10
-rw-r--r--fs/xfs/xfs_trans_priv.h98
-rw-r--r--fs/xfs/xfs_utils.c12
-rw-r--r--fs/xfs/xfs_vfsops.c757
-rw-r--r--fs/xfs/xfs_vfsops.h16
-rw-r--r--fs/xfs/xfs_vnodeops.c354
-rw-r--r--fs/xfs/xfs_vnodeops.h10
139 files changed, 12319 insertions, 14278 deletions
diff --git a/fs/aio.c b/fs/aio.c
index f658441d5666..d6f89d3c15e8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx)
191 kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ 191 kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
192} while(0) 192} while(0)
193 193
194static void ctx_rcu_free(struct rcu_head *head)
195{
196 struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
197 unsigned nr_events = ctx->max_reqs;
198
199 kmem_cache_free(kioctx_cachep, ctx);
200
201 if (nr_events) {
202 spin_lock(&aio_nr_lock);
203 BUG_ON(aio_nr - nr_events > aio_nr);
204 aio_nr -= nr_events;
205 spin_unlock(&aio_nr_lock);
206 }
207}
194 208
195/* __put_ioctx 209/* __put_ioctx
196 * Called when the last user of an aio context has gone away, 210 * Called when the last user of an aio context has gone away,
@@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx)
198 */ 212 */
199static void __put_ioctx(struct kioctx *ctx) 213static void __put_ioctx(struct kioctx *ctx)
200{ 214{
201 unsigned nr_events = ctx->max_reqs;
202
203 BUG_ON(ctx->reqs_active); 215 BUG_ON(ctx->reqs_active);
204 216
205 cancel_delayed_work(&ctx->wq); 217 cancel_delayed_work(&ctx->wq);
@@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx)
208 mmdrop(ctx->mm); 220 mmdrop(ctx->mm);
209 ctx->mm = NULL; 221 ctx->mm = NULL;
210 pr_debug("__put_ioctx: freeing %p\n", ctx); 222 pr_debug("__put_ioctx: freeing %p\n", ctx);
211 kmem_cache_free(kioctx_cachep, ctx); 223 call_rcu(&ctx->rcu_head, ctx_rcu_free);
212
213 if (nr_events) {
214 spin_lock(&aio_nr_lock);
215 BUG_ON(aio_nr - nr_events > aio_nr);
216 aio_nr -= nr_events;
217 spin_unlock(&aio_nr_lock);
218 }
219} 224}
220 225
221#define get_ioctx(kioctx) do { \ 226#define get_ioctx(kioctx) do { \
@@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
235{ 240{
236 struct mm_struct *mm; 241 struct mm_struct *mm;
237 struct kioctx *ctx; 242 struct kioctx *ctx;
243 int did_sync = 0;
238 244
239 /* Prevent overflows */ 245 /* Prevent overflows */
240 if ((nr_events > (0x10000000U / sizeof(struct io_event))) || 246 if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
267 goto out_freectx; 273 goto out_freectx;
268 274
269 /* limit the number of system wide aios */ 275 /* limit the number of system wide aios */
270 spin_lock(&aio_nr_lock); 276 do {
271 if (aio_nr + ctx->max_reqs > aio_max_nr || 277 spin_lock_bh(&aio_nr_lock);
272 aio_nr + ctx->max_reqs < aio_nr) 278 if (aio_nr + nr_events > aio_max_nr ||
273 ctx->max_reqs = 0; 279 aio_nr + nr_events < aio_nr)
274 else 280 ctx->max_reqs = 0;
275 aio_nr += ctx->max_reqs; 281 else
276 spin_unlock(&aio_nr_lock); 282 aio_nr += ctx->max_reqs;
283 spin_unlock_bh(&aio_nr_lock);
284 if (ctx->max_reqs || did_sync)
285 break;
286
287 /* wait for rcu callbacks to have completed before giving up */
288 synchronize_rcu();
289 did_sync = 1;
290 ctx->max_reqs = nr_events;
291 } while (1);
292
277 if (ctx->max_reqs == 0) 293 if (ctx->max_reqs == 0)
278 goto out_cleanup; 294 goto out_cleanup;
279 295
280 /* now link into global list. */ 296 /* now link into global list. */
281 write_lock(&mm->ioctx_list_lock); 297 spin_lock(&mm->ioctx_lock);
282 ctx->next = mm->ioctx_list; 298 hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
283 mm->ioctx_list = ctx; 299 spin_unlock(&mm->ioctx_lock);
284 write_unlock(&mm->ioctx_list_lock);
285 300
286 dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", 301 dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
287 ctx, ctx->user_id, current->mm, ctx->ring_info.nr); 302 ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
@@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
375 */ 390 */
376void exit_aio(struct mm_struct *mm) 391void exit_aio(struct mm_struct *mm)
377{ 392{
378 struct kioctx *ctx = mm->ioctx_list; 393 struct kioctx *ctx;
379 mm->ioctx_list = NULL; 394
380 while (ctx) { 395 while (!hlist_empty(&mm->ioctx_list)) {
381 struct kioctx *next = ctx->next; 396 ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
382 ctx->next = NULL; 397 hlist_del_rcu(&ctx->list);
398
383 aio_cancel_all(ctx); 399 aio_cancel_all(ctx);
384 400
385 wait_for_all_aios(ctx); 401 wait_for_all_aios(ctx);
@@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm)
394 atomic_read(&ctx->users), ctx->dead, 410 atomic_read(&ctx->users), ctx->dead,
395 ctx->reqs_active); 411 ctx->reqs_active);
396 put_ioctx(ctx); 412 put_ioctx(ctx);
397 ctx = next;
398 } 413 }
399} 414}
400 415
@@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req)
555 570
556static struct kioctx *lookup_ioctx(unsigned long ctx_id) 571static struct kioctx *lookup_ioctx(unsigned long ctx_id)
557{ 572{
558 struct kioctx *ioctx; 573 struct mm_struct *mm = current->mm;
559 struct mm_struct *mm; 574 struct kioctx *ctx = NULL;
575 struct hlist_node *n;
560 576
561 mm = current->mm; 577 rcu_read_lock();
562 read_lock(&mm->ioctx_list_lock); 578
563 for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) 579 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
564 if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { 580 if (ctx->user_id == ctx_id && !ctx->dead) {
565 get_ioctx(ioctx); 581 get_ioctx(ctx);
566 break; 582 break;
567 } 583 }
568 read_unlock(&mm->ioctx_list_lock); 584 }
569 585
570 return ioctx; 586 rcu_read_unlock();
587 return ctx;
571} 588}
572 589
573/* 590/*
@@ -1215,19 +1232,14 @@ out:
1215static void io_destroy(struct kioctx *ioctx) 1232static void io_destroy(struct kioctx *ioctx)
1216{ 1233{
1217 struct mm_struct *mm = current->mm; 1234 struct mm_struct *mm = current->mm;
1218 struct kioctx **tmp;
1219 int was_dead; 1235 int was_dead;
1220 1236
1221 /* delete the entry from the list is someone else hasn't already */ 1237 /* delete the entry from the list is someone else hasn't already */
1222 write_lock(&mm->ioctx_list_lock); 1238 spin_lock(&mm->ioctx_lock);
1223 was_dead = ioctx->dead; 1239 was_dead = ioctx->dead;
1224 ioctx->dead = 1; 1240 ioctx->dead = 1;
1225 for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx; 1241 hlist_del_rcu(&ioctx->list);
1226 tmp = &(*tmp)->next) 1242 spin_unlock(&mm->ioctx_lock);
1227 ;
1228 if (*tmp)
1229 *tmp = ioctx->next;
1230 write_unlock(&mm->ioctx_list_lock);
1231 1243
1232 dprintk("aio_release(%p)\n", ioctx); 1244 dprintk("aio_release(%p)\n", ioctx);
1233 if (likely(!was_dead)) 1245 if (likely(!was_dead))
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 19caf7c962ac..77ebc3c263d6 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -111,7 +111,7 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
111 && bip->bip_buf != NULL) 111 && bip->bip_buf != NULL)
112 kfree(bip->bip_buf); 112 kfree(bip->bip_buf);
113 113
114 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); 114 bvec_free_bs(bs, bip->bip_vec, bip->bip_pool);
115 mempool_free(bip, bs->bio_integrity_pool); 115 mempool_free(bip, bs->bio_integrity_pool);
116 116
117 bio->bi_integrity = NULL; 117 bio->bi_integrity = NULL;
diff --git a/fs/bio.c b/fs/bio.c
index df99c882b807..711cee103602 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -31,7 +31,11 @@
31 31
32DEFINE_TRACE(block_split); 32DEFINE_TRACE(block_split);
33 33
34static struct kmem_cache *bio_slab __read_mostly; 34/*
35 * Test patch to inline a certain number of bi_io_vec's inside the bio
36 * itself, to shrink a bio data allocation from two mempool calls to one
37 */
38#define BIO_INLINE_VECS 4
35 39
36static mempool_t *bio_split_pool __read_mostly; 40static mempool_t *bio_split_pool __read_mostly;
37 41
@@ -40,9 +44,8 @@ static mempool_t *bio_split_pool __read_mostly;
40 * break badly! cannot be bigger than what you can fit into an 44 * break badly! cannot be bigger than what you can fit into an
41 * unsigned short 45 * unsigned short
42 */ 46 */
43
44#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } 47#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
45static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { 48struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
46 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), 49 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
47}; 50};
48#undef BV 51#undef BV
@@ -53,12 +56,121 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
53 */ 56 */
54struct bio_set *fs_bio_set; 57struct bio_set *fs_bio_set;
55 58
59/*
60 * Our slab pool management
61 */
62struct bio_slab {
63 struct kmem_cache *slab;
64 unsigned int slab_ref;
65 unsigned int slab_size;
66 char name[8];
67};
68static DEFINE_MUTEX(bio_slab_lock);
69static struct bio_slab *bio_slabs;
70static unsigned int bio_slab_nr, bio_slab_max;
71
72static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
73{
74 unsigned int sz = sizeof(struct bio) + extra_size;
75 struct kmem_cache *slab = NULL;
76 struct bio_slab *bslab;
77 unsigned int i, entry = -1;
78
79 mutex_lock(&bio_slab_lock);
80
81 i = 0;
82 while (i < bio_slab_nr) {
83 struct bio_slab *bslab = &bio_slabs[i];
84
85 if (!bslab->slab && entry == -1)
86 entry = i;
87 else if (bslab->slab_size == sz) {
88 slab = bslab->slab;
89 bslab->slab_ref++;
90 break;
91 }
92 i++;
93 }
94
95 if (slab)
96 goto out_unlock;
97
98 if (bio_slab_nr == bio_slab_max && entry == -1) {
99 bio_slab_max <<= 1;
100 bio_slabs = krealloc(bio_slabs,
101 bio_slab_max * sizeof(struct bio_slab),
102 GFP_KERNEL);
103 if (!bio_slabs)
104 goto out_unlock;
105 }
106 if (entry == -1)
107 entry = bio_slab_nr++;
108
109 bslab = &bio_slabs[entry];
110
111 snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
112 slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
113 if (!slab)
114 goto out_unlock;
115
116 printk("bio: create slab <%s> at %d\n", bslab->name, entry);
117 bslab->slab = slab;
118 bslab->slab_ref = 1;
119 bslab->slab_size = sz;
120out_unlock:
121 mutex_unlock(&bio_slab_lock);
122 return slab;
123}
124
125static void bio_put_slab(struct bio_set *bs)
126{
127 struct bio_slab *bslab = NULL;
128 unsigned int i;
129
130 mutex_lock(&bio_slab_lock);
131
132 for (i = 0; i < bio_slab_nr; i++) {
133 if (bs->bio_slab == bio_slabs[i].slab) {
134 bslab = &bio_slabs[i];
135 break;
136 }
137 }
138
139 if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
140 goto out;
141
142 WARN_ON(!bslab->slab_ref);
143
144 if (--bslab->slab_ref)
145 goto out;
146
147 kmem_cache_destroy(bslab->slab);
148 bslab->slab = NULL;
149
150out:
151 mutex_unlock(&bio_slab_lock);
152}
153
56unsigned int bvec_nr_vecs(unsigned short idx) 154unsigned int bvec_nr_vecs(unsigned short idx)
57{ 155{
58 return bvec_slabs[idx].nr_vecs; 156 return bvec_slabs[idx].nr_vecs;
59} 157}
60 158
61struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) 159void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
160{
161 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
162
163 if (idx == BIOVEC_MAX_IDX)
164 mempool_free(bv, bs->bvec_pool);
165 else {
166 struct biovec_slab *bvs = bvec_slabs + idx;
167
168 kmem_cache_free(bvs->slab, bv);
169 }
170}
171
172struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
173 struct bio_set *bs)
62{ 174{
63 struct bio_vec *bvl; 175 struct bio_vec *bvl;
64 176
@@ -67,60 +179,85 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
67 * If not, this is a bio_kmalloc() allocation and just do a 179 * If not, this is a bio_kmalloc() allocation and just do a
68 * kzalloc() for the exact number of vecs right away. 180 * kzalloc() for the exact number of vecs right away.
69 */ 181 */
70 if (bs) { 182 if (!bs)
183 bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
184
185 /*
186 * see comment near bvec_array define!
187 */
188 switch (nr) {
189 case 1:
190 *idx = 0;
191 break;
192 case 2 ... 4:
193 *idx = 1;
194 break;
195 case 5 ... 16:
196 *idx = 2;
197 break;
198 case 17 ... 64:
199 *idx = 3;
200 break;
201 case 65 ... 128:
202 *idx = 4;
203 break;
204 case 129 ... BIO_MAX_PAGES:
205 *idx = 5;
206 break;
207 default:
208 return NULL;
209 }
210
211 /*
212 * idx now points to the pool we want to allocate from. only the
213 * 1-vec entry pool is mempool backed.
214 */
215 if (*idx == BIOVEC_MAX_IDX) {
216fallback:
217 bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
218 } else {
219 struct biovec_slab *bvs = bvec_slabs + *idx;
220 gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
221
71 /* 222 /*
72 * see comment near bvec_array define! 223 * Make this allocation restricted and don't dump info on
224 * allocation failures, since we'll fallback to the mempool
225 * in case of failure.
73 */ 226 */
74 switch (nr) { 227 __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
75 case 1:
76 *idx = 0;
77 break;
78 case 2 ... 4:
79 *idx = 1;
80 break;
81 case 5 ... 16:
82 *idx = 2;
83 break;
84 case 17 ... 64:
85 *idx = 3;
86 break;
87 case 65 ... 128:
88 *idx = 4;
89 break;
90 case 129 ... BIO_MAX_PAGES:
91 *idx = 5;
92 break;
93 default:
94 return NULL;
95 }
96 228
97 /* 229 /*
98 * idx now points to the pool we want to allocate from 230 * Try a slab allocation. If this fails and __GFP_WAIT
231 * is set, retry with the 1-entry mempool
99 */ 232 */
100 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); 233 bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
101 if (bvl) 234 if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
102 memset(bvl, 0, 235 *idx = BIOVEC_MAX_IDX;
103 bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); 236 goto fallback;
104 } else 237 }
105 bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); 238 }
106 239
107 return bvl; 240 return bvl;
108} 241}
109 242
110void bio_free(struct bio *bio, struct bio_set *bio_set) 243void bio_free(struct bio *bio, struct bio_set *bs)
111{ 244{
112 if (bio->bi_io_vec) { 245 void *p;
113 const int pool_idx = BIO_POOL_IDX(bio);
114 246
115 BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS); 247 if (bio_has_allocated_vec(bio))
116 248 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
117 mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
118 }
119 249
120 if (bio_integrity(bio)) 250 if (bio_integrity(bio))
121 bio_integrity_free(bio, bio_set); 251 bio_integrity_free(bio, bs);
252
253 /*
254 * If we have front padding, adjust the bio pointer before freeing
255 */
256 p = bio;
257 if (bs->front_pad)
258 p -= bs->front_pad;
122 259
123 mempool_free(bio, bio_set->bio_pool); 260 mempool_free(p, bs->bio_pool);
124} 261}
125 262
126/* 263/*
@@ -133,7 +270,8 @@ static void bio_fs_destructor(struct bio *bio)
133 270
134static void bio_kmalloc_destructor(struct bio *bio) 271static void bio_kmalloc_destructor(struct bio *bio)
135{ 272{
136 kfree(bio->bi_io_vec); 273 if (bio_has_allocated_vec(bio))
274 kfree(bio->bi_io_vec);
137 kfree(bio); 275 kfree(bio);
138} 276}
139 277
@@ -157,16 +295,20 @@ void bio_init(struct bio *bio)
157 * for a &struct bio to become free. If a %NULL @bs is passed in, we will 295 * for a &struct bio to become free. If a %NULL @bs is passed in, we will
158 * fall back to just using @kmalloc to allocate the required memory. 296 * fall back to just using @kmalloc to allocate the required memory.
159 * 297 *
160 * allocate bio and iovecs from the memory pools specified by the 298 * Note that the caller must set ->bi_destructor on succesful return
161 * bio_set structure, or @kmalloc if none given. 299 * of a bio, to do the appropriate freeing of the bio once the reference
300 * count drops to zero.
162 **/ 301 **/
163struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
164{ 303{
165 struct bio *bio; 304 struct bio *bio = NULL;
305
306 if (bs) {
307 void *p = mempool_alloc(bs->bio_pool, gfp_mask);
166 308
167 if (bs) 309 if (p)
168 bio = mempool_alloc(bs->bio_pool, gfp_mask); 310 bio = p + bs->front_pad;
169 else 311 } else
170 bio = kmalloc(sizeof(*bio), gfp_mask); 312 bio = kmalloc(sizeof(*bio), gfp_mask);
171 313
172 if (likely(bio)) { 314 if (likely(bio)) {
@@ -176,7 +318,15 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
176 if (likely(nr_iovecs)) { 318 if (likely(nr_iovecs)) {
177 unsigned long uninitialized_var(idx); 319 unsigned long uninitialized_var(idx);
178 320
179 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 321 if (nr_iovecs <= BIO_INLINE_VECS) {
322 idx = 0;
323 bvl = bio->bi_inline_vecs;
324 nr_iovecs = BIO_INLINE_VECS;
325 } else {
326 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx,
327 bs);
328 nr_iovecs = bvec_nr_vecs(idx);
329 }
180 if (unlikely(!bvl)) { 330 if (unlikely(!bvl)) {
181 if (bs) 331 if (bs)
182 mempool_free(bio, bs->bio_pool); 332 mempool_free(bio, bs->bio_pool);
@@ -186,7 +336,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
186 goto out; 336 goto out;
187 } 337 }
188 bio->bi_flags |= idx << BIO_POOL_OFFSET; 338 bio->bi_flags |= idx << BIO_POOL_OFFSET;
189 bio->bi_max_vecs = bvec_nr_vecs(idx); 339 bio->bi_max_vecs = nr_iovecs;
190 } 340 }
191 bio->bi_io_vec = bvl; 341 bio->bi_io_vec = bvl;
192 } 342 }
@@ -1346,30 +1496,18 @@ EXPORT_SYMBOL(bio_sector_offset);
1346 */ 1496 */
1347static int biovec_create_pools(struct bio_set *bs, int pool_entries) 1497static int biovec_create_pools(struct bio_set *bs, int pool_entries)
1348{ 1498{
1349 int i; 1499 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1350 1500
1351 for (i = 0; i < BIOVEC_NR_POOLS; i++) { 1501 bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
1352 struct biovec_slab *bp = bvec_slabs + i; 1502 if (!bs->bvec_pool)
1353 mempool_t **bvp = bs->bvec_pools + i; 1503 return -ENOMEM;
1354 1504
1355 *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
1356 if (!*bvp)
1357 return -ENOMEM;
1358 }
1359 return 0; 1505 return 0;
1360} 1506}
1361 1507
1362static void biovec_free_pools(struct bio_set *bs) 1508static void biovec_free_pools(struct bio_set *bs)
1363{ 1509{
1364 int i; 1510 mempool_destroy(bs->bvec_pool);
1365
1366 for (i = 0; i < BIOVEC_NR_POOLS; i++) {
1367 mempool_t *bvp = bs->bvec_pools[i];
1368
1369 if (bvp)
1370 mempool_destroy(bvp);
1371 }
1372
1373} 1511}
1374 1512
1375void bioset_free(struct bio_set *bs) 1513void bioset_free(struct bio_set *bs)
@@ -1379,25 +1517,49 @@ void bioset_free(struct bio_set *bs)
1379 1517
1380 bioset_integrity_free(bs); 1518 bioset_integrity_free(bs);
1381 biovec_free_pools(bs); 1519 biovec_free_pools(bs);
1520 bio_put_slab(bs);
1382 1521
1383 kfree(bs); 1522 kfree(bs);
1384} 1523}
1385 1524
1386struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) 1525/**
1526 * bioset_create - Create a bio_set
1527 * @pool_size: Number of bio and bio_vecs to cache in the mempool
1528 * @front_pad: Number of bytes to allocate in front of the returned bio
1529 *
1530 * Description:
1531 * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
1532 * to ask for a number of bytes to be allocated in front of the bio.
1533 * Front pad allocation is useful for embedding the bio inside
1534 * another structure, to avoid allocating extra data to go with the bio.
1535 * Note that the bio must be embedded at the END of that structure always,
1536 * or things will break badly.
1537 */
1538struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1387{ 1539{
1388 struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL); 1540 unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
1541 struct bio_set *bs;
1389 1542
1543 bs = kzalloc(sizeof(*bs), GFP_KERNEL);
1390 if (!bs) 1544 if (!bs)
1391 return NULL; 1545 return NULL;
1392 1546
1393 bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab); 1547 bs->front_pad = front_pad;
1548
1549 bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
1550 if (!bs->bio_slab) {
1551 kfree(bs);
1552 return NULL;
1553 }
1554
1555 bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
1394 if (!bs->bio_pool) 1556 if (!bs->bio_pool)
1395 goto bad; 1557 goto bad;
1396 1558
1397 if (bioset_integrity_create(bs, bio_pool_size)) 1559 if (bioset_integrity_create(bs, pool_size))
1398 goto bad; 1560 goto bad;
1399 1561
1400 if (!biovec_create_pools(bs, bvec_pool_size)) 1562 if (!biovec_create_pools(bs, pool_size))
1401 return bs; 1563 return bs;
1402 1564
1403bad: 1565bad:
@@ -1421,12 +1583,16 @@ static void __init biovec_init_slabs(void)
1421 1583
1422static int __init init_bio(void) 1584static int __init init_bio(void)
1423{ 1585{
1424 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 1586 bio_slab_max = 2;
1587 bio_slab_nr = 0;
1588 bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
1589 if (!bio_slabs)
1590 panic("bio: can't allocate bios\n");
1425 1591
1426 bio_integrity_init_slab(); 1592 bio_integrity_init_slab();
1427 biovec_init_slabs(); 1593 biovec_init_slabs();
1428 1594
1429 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); 1595 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
1430 if (!fs_bio_set) 1596 if (!fs_bio_set)
1431 panic("bio: can't allocate bios\n"); 1597 panic("bio: can't allocate bios\n");
1432 1598
diff --git a/fs/buffer.c b/fs/buffer.c
index 10179cfa1152..776ae091d3b0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -99,10 +99,18 @@ __clear_page_buffers(struct page *page)
99 page_cache_release(page); 99 page_cache_release(page);
100} 100}
101 101
102
103static int quiet_error(struct buffer_head *bh)
104{
105 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
106 return 0;
107 return 1;
108}
109
110
102static void buffer_io_error(struct buffer_head *bh) 111static void buffer_io_error(struct buffer_head *bh)
103{ 112{
104 char b[BDEVNAME_SIZE]; 113 char b[BDEVNAME_SIZE];
105
106 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", 114 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
107 bdevname(bh->b_bdev, b), 115 bdevname(bh->b_bdev, b),
108 (unsigned long long)bh->b_blocknr); 116 (unsigned long long)bh->b_blocknr);
@@ -144,7 +152,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
144 if (uptodate) { 152 if (uptodate) {
145 set_buffer_uptodate(bh); 153 set_buffer_uptodate(bh);
146 } else { 154 } else {
147 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 155 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
148 buffer_io_error(bh); 156 buffer_io_error(bh);
149 printk(KERN_WARNING "lost page write due to " 157 printk(KERN_WARNING "lost page write due to "
150 "I/O error on %s\n", 158 "I/O error on %s\n",
@@ -394,7 +402,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
394 set_buffer_uptodate(bh); 402 set_buffer_uptodate(bh);
395 } else { 403 } else {
396 clear_buffer_uptodate(bh); 404 clear_buffer_uptodate(bh);
397 if (printk_ratelimit()) 405 if (!quiet_error(bh))
398 buffer_io_error(bh); 406 buffer_io_error(bh);
399 SetPageError(page); 407 SetPageError(page);
400 } 408 }
@@ -455,7 +463,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
455 if (uptodate) { 463 if (uptodate) {
456 set_buffer_uptodate(bh); 464 set_buffer_uptodate(bh);
457 } else { 465 } else {
458 if (printk_ratelimit()) { 466 if (!quiet_error(bh)) {
459 buffer_io_error(bh); 467 buffer_io_error(bh);
460 printk(KERN_WARNING "lost page write due to " 468 printk(KERN_WARNING "lost page write due to "
461 "I/O error on %s\n", 469 "I/O error on %s\n",
@@ -2913,6 +2921,9 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2913 set_bit(BH_Eopnotsupp, &bh->b_state); 2921 set_bit(BH_Eopnotsupp, &bh->b_state);
2914 } 2922 }
2915 2923
2924 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2925 set_bit(BH_Quiet, &bh->b_state);
2926
2916 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); 2927 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2917 bio_put(bio); 2928 bio_put(bio);
2918} 2929}
diff --git a/fs/exec.c b/fs/exec.c
index 1f59ea079cbb..02d2e120542d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -773,7 +773,6 @@ static int de_thread(struct task_struct *tsk)
773 struct signal_struct *sig = tsk->signal; 773 struct signal_struct *sig = tsk->signal;
774 struct sighand_struct *oldsighand = tsk->sighand; 774 struct sighand_struct *oldsighand = tsk->sighand;
775 spinlock_t *lock = &oldsighand->siglock; 775 spinlock_t *lock = &oldsighand->siglock;
776 struct task_struct *leader = NULL;
777 int count; 776 int count;
778 777
779 if (thread_group_empty(tsk)) 778 if (thread_group_empty(tsk))
@@ -811,7 +810,7 @@ static int de_thread(struct task_struct *tsk)
811 * and to assume its PID: 810 * and to assume its PID:
812 */ 811 */
813 if (!thread_group_leader(tsk)) { 812 if (!thread_group_leader(tsk)) {
814 leader = tsk->group_leader; 813 struct task_struct *leader = tsk->group_leader;
815 814
816 sig->notify_count = -1; /* for exit_notify() */ 815 sig->notify_count = -1; /* for exit_notify() */
817 for (;;) { 816 for (;;) {
@@ -863,8 +862,9 @@ static int de_thread(struct task_struct *tsk)
863 862
864 BUG_ON(leader->exit_state != EXIT_ZOMBIE); 863 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
865 leader->exit_state = EXIT_DEAD; 864 leader->exit_state = EXIT_DEAD;
866
867 write_unlock_irq(&tasklist_lock); 865 write_unlock_irq(&tasklist_lock);
866
867 release_task(leader);
868 } 868 }
869 869
870 sig->group_exit_task = NULL; 870 sig->group_exit_task = NULL;
@@ -873,8 +873,6 @@ static int de_thread(struct task_struct *tsk)
873no_thread_group: 873no_thread_group:
874 exit_itimers(sig); 874 exit_itimers(sig);
875 flush_itimer_signals(); 875 flush_itimer_signals();
876 if (leader)
877 release_task(leader);
878 876
879 if (atomic_read(&oldsighand->count) != 1) { 877 if (atomic_read(&oldsighand->count) != 1) {
880 struct sighand_struct *newsighand; 878 struct sighand_struct *newsighand;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4a241c65dbe..04158ad74dbb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1721,7 +1721,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
1721 /* small i_blocks in vfs inode? */ 1721 /* small i_blocks in vfs inode? */
1722 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1722 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1723 /* 1723 /*
1724 * CONFIG_LSF is not enabled implies the inode 1724 * CONFIG_LBD is not enabled implies the inode
1725 * i_block represent total blocks in 512 bytes 1725 * i_block represent total blocks in 512 bytes
1726 * 32 == size of vfs inode i_blocks * 8 1726 * 32 == size of vfs inode i_blocks * 8
1727 */ 1727 */
@@ -1764,7 +1764,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
1764 1764
1765 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1765 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1766 /* 1766 /*
1767 * !has_huge_files or CONFIG_LSF is not enabled 1767 * !has_huge_files or CONFIG_LBD is not enabled
1768 * implies the inode i_block represent total blocks in 1768 * implies the inode i_block represent total blocks in
1769 * 512 bytes 32 == size of vfs inode i_blocks * 8 1769 * 512 bytes 32 == size of vfs inode i_blocks * 8
1770 */ 1770 */
@@ -2021,13 +2021,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2021 if (has_huge_files) { 2021 if (has_huge_files) {
2022 /* 2022 /*
2023 * Large file size enabled file system can only be 2023 * Large file size enabled file system can only be
2024 * mount if kernel is build with CONFIG_LSF 2024 * mount if kernel is build with CONFIG_LBD
2025 */ 2025 */
2026 if (sizeof(root->i_blocks) < sizeof(u64) && 2026 if (sizeof(root->i_blocks) < sizeof(u64) &&
2027 !(sb->s_flags & MS_RDONLY)) { 2027 !(sb->s_flags & MS_RDONLY)) {
2028 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " 2028 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
2029 "files cannot be mounted read-write " 2029 "files cannot be mounted read-write "
2030 "without CONFIG_LSF.\n", sb->s_id); 2030 "without CONFIG_LBD.\n", sb->s_id);
2031 goto failed_mount; 2031 goto failed_mount;
2032 } 2032 }
2033 } 2033 }
diff --git a/fs/inode.c b/fs/inode.c
index 0487ddba1397..098a2443196f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -108,84 +108,100 @@ static void wake_up_inode(struct inode *inode)
108 wake_up_bit(&inode->i_state, __I_LOCK); 108 wake_up_bit(&inode->i_state, __I_LOCK);
109} 109}
110 110
111static struct inode *alloc_inode(struct super_block *sb) 111/**
112 * inode_init_always - perform inode structure intialisation
113 * @sb - superblock inode belongs to.
114 * @inode - inode to initialise
115 *
116 * These are initializations that need to be done on every inode
117 * allocation as the fields are not initialised by slab allocation.
118 */
119struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
112{ 120{
113 static const struct address_space_operations empty_aops; 121 static const struct address_space_operations empty_aops;
114 static struct inode_operations empty_iops; 122 static struct inode_operations empty_iops;
115 static const struct file_operations empty_fops; 123 static const struct file_operations empty_fops;
116 struct inode *inode;
117
118 if (sb->s_op->alloc_inode)
119 inode = sb->s_op->alloc_inode(sb);
120 else
121 inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);
122 124
123 if (inode) { 125 struct address_space * const mapping = &inode->i_data;
124 struct address_space * const mapping = &inode->i_data; 126
125 127 inode->i_sb = sb;
126 inode->i_sb = sb; 128 inode->i_blkbits = sb->s_blocksize_bits;
127 inode->i_blkbits = sb->s_blocksize_bits; 129 inode->i_flags = 0;
128 inode->i_flags = 0; 130 atomic_set(&inode->i_count, 1);
129 atomic_set(&inode->i_count, 1); 131 inode->i_op = &empty_iops;
130 inode->i_op = &empty_iops; 132 inode->i_fop = &empty_fops;
131 inode->i_fop = &empty_fops; 133 inode->i_nlink = 1;
132 inode->i_nlink = 1; 134 atomic_set(&inode->i_writecount, 0);
133 atomic_set(&inode->i_writecount, 0); 135 inode->i_size = 0;
134 inode->i_size = 0; 136 inode->i_blocks = 0;
135 inode->i_blocks = 0; 137 inode->i_bytes = 0;
136 inode->i_bytes = 0; 138 inode->i_generation = 0;
137 inode->i_generation = 0;
138#ifdef CONFIG_QUOTA 139#ifdef CONFIG_QUOTA
139 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); 140 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
140#endif 141#endif
141 inode->i_pipe = NULL; 142 inode->i_pipe = NULL;
142 inode->i_bdev = NULL; 143 inode->i_bdev = NULL;
143 inode->i_cdev = NULL; 144 inode->i_cdev = NULL;
144 inode->i_rdev = 0; 145 inode->i_rdev = 0;
145 inode->dirtied_when = 0; 146 inode->dirtied_when = 0;
146 if (security_inode_alloc(inode)) { 147 if (security_inode_alloc(inode)) {
147 if (inode->i_sb->s_op->destroy_inode) 148 if (inode->i_sb->s_op->destroy_inode)
148 inode->i_sb->s_op->destroy_inode(inode); 149 inode->i_sb->s_op->destroy_inode(inode);
149 else 150 else
150 kmem_cache_free(inode_cachep, (inode)); 151 kmem_cache_free(inode_cachep, (inode));
151 return NULL; 152 return NULL;
152 } 153 }
153 154
154 spin_lock_init(&inode->i_lock); 155 spin_lock_init(&inode->i_lock);
155 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 156 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
156 157
157 mutex_init(&inode->i_mutex); 158 mutex_init(&inode->i_mutex);
158 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 159 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
159 160
160 init_rwsem(&inode->i_alloc_sem); 161 init_rwsem(&inode->i_alloc_sem);
161 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); 162 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
162 163
163 mapping->a_ops = &empty_aops; 164 mapping->a_ops = &empty_aops;
164 mapping->host = inode; 165 mapping->host = inode;
165 mapping->flags = 0; 166 mapping->flags = 0;
166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); 167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
167 mapping->assoc_mapping = NULL; 168 mapping->assoc_mapping = NULL;
168 mapping->backing_dev_info = &default_backing_dev_info; 169 mapping->backing_dev_info = &default_backing_dev_info;
169 mapping->writeback_index = 0; 170 mapping->writeback_index = 0;
170 171
171 /* 172 /*
172 * If the block_device provides a backing_dev_info for client 173 * If the block_device provides a backing_dev_info for client
173 * inodes then use that. Otherwise the inode share the bdev's 174 * inodes then use that. Otherwise the inode share the bdev's
174 * backing_dev_info. 175 * backing_dev_info.
175 */ 176 */
176 if (sb->s_bdev) { 177 if (sb->s_bdev) {
177 struct backing_dev_info *bdi; 178 struct backing_dev_info *bdi;
178 179
179 bdi = sb->s_bdev->bd_inode_backing_dev_info; 180 bdi = sb->s_bdev->bd_inode_backing_dev_info;
180 if (!bdi) 181 if (!bdi)
181 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 182 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
182 mapping->backing_dev_info = bdi; 183 mapping->backing_dev_info = bdi;
183 }
184 inode->i_private = NULL;
185 inode->i_mapping = mapping;
186 } 184 }
185 inode->i_private = NULL;
186 inode->i_mapping = mapping;
187
187 return inode; 188 return inode;
188} 189}
190EXPORT_SYMBOL(inode_init_always);
191
192static struct inode *alloc_inode(struct super_block *sb)
193{
194 struct inode *inode;
195
196 if (sb->s_op->alloc_inode)
197 inode = sb->s_op->alloc_inode(sb);
198 else
199 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
200
201 if (inode)
202 return inode_init_always(sb, inode);
203 return NULL;
204}
189 205
190void destroy_inode(struct inode *inode) 206void destroy_inode(struct inode *inode)
191{ 207{
@@ -196,6 +212,7 @@ void destroy_inode(struct inode *inode)
196 else 212 else
197 kmem_cache_free(inode_cachep, (inode)); 213 kmem_cache_free(inode_cachep, (inode));
198} 214}
215EXPORT_SYMBOL(destroy_inode);
199 216
200 217
201/* 218/*
@@ -534,6 +551,49 @@ repeat:
534 return node ? inode : NULL; 551 return node ? inode : NULL;
535} 552}
536 553
554static unsigned long hash(struct super_block *sb, unsigned long hashval)
555{
556 unsigned long tmp;
557
558 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
559 L1_CACHE_BYTES;
560 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
561 return tmp & I_HASHMASK;
562}
563
564static inline void
565__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
566 struct inode *inode)
567{
568 inodes_stat.nr_inodes++;
569 list_add(&inode->i_list, &inode_in_use);
570 list_add(&inode->i_sb_list, &sb->s_inodes);
571 if (head)
572 hlist_add_head(&inode->i_hash, head);
573}
574
575/**
576 * inode_add_to_lists - add a new inode to relevant lists
577 * @sb - superblock inode belongs to.
578 * @inode - inode to mark in use
579 *
580 * When an inode is allocated it needs to be accounted for, added to the in use
581 * list, the owning superblock and the inode hash. This needs to be done under
582 * the inode_lock, so export a function to do this rather than the inode lock
583 * itself. We calculate the hash list to add to here so it is all internal
584 * which requires the caller to have already set up the inode number in the
585 * inode to add.
586 */
587void inode_add_to_lists(struct super_block *sb, struct inode *inode)
588{
589 struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
590
591 spin_lock(&inode_lock);
592 __inode_add_to_lists(sb, head, inode);
593 spin_unlock(&inode_lock);
594}
595EXPORT_SYMBOL_GPL(inode_add_to_lists);
596
537/** 597/**
538 * new_inode - obtain an inode 598 * new_inode - obtain an inode
539 * @sb: superblock 599 * @sb: superblock
@@ -561,9 +621,7 @@ struct inode *new_inode(struct super_block *sb)
561 inode = alloc_inode(sb); 621 inode = alloc_inode(sb);
562 if (inode) { 622 if (inode) {
563 spin_lock(&inode_lock); 623 spin_lock(&inode_lock);
564 inodes_stat.nr_inodes++; 624 __inode_add_to_lists(sb, NULL, inode);
565 list_add(&inode->i_list, &inode_in_use);
566 list_add(&inode->i_sb_list, &sb->s_inodes);
567 inode->i_ino = ++last_ino; 625 inode->i_ino = ++last_ino;
568 inode->i_state = 0; 626 inode->i_state = 0;
569 spin_unlock(&inode_lock); 627 spin_unlock(&inode_lock);
@@ -622,10 +680,7 @@ static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *h
622 if (set(inode, data)) 680 if (set(inode, data))
623 goto set_failed; 681 goto set_failed;
624 682
625 inodes_stat.nr_inodes++; 683 __inode_add_to_lists(sb, head, inode);
626 list_add(&inode->i_list, &inode_in_use);
627 list_add(&inode->i_sb_list, &sb->s_inodes);
628 hlist_add_head(&inode->i_hash, head);
629 inode->i_state = I_LOCK|I_NEW; 684 inode->i_state = I_LOCK|I_NEW;
630 spin_unlock(&inode_lock); 685 spin_unlock(&inode_lock);
631 686
@@ -671,10 +726,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
671 old = find_inode_fast(sb, head, ino); 726 old = find_inode_fast(sb, head, ino);
672 if (!old) { 727 if (!old) {
673 inode->i_ino = ino; 728 inode->i_ino = ino;
674 inodes_stat.nr_inodes++; 729 __inode_add_to_lists(sb, head, inode);
675 list_add(&inode->i_list, &inode_in_use);
676 list_add(&inode->i_sb_list, &sb->s_inodes);
677 hlist_add_head(&inode->i_hash, head);
678 inode->i_state = I_LOCK|I_NEW; 730 inode->i_state = I_LOCK|I_NEW;
679 spin_unlock(&inode_lock); 731 spin_unlock(&inode_lock);
680 732
@@ -698,16 +750,6 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
698 return inode; 750 return inode;
699} 751}
700 752
701static unsigned long hash(struct super_block *sb, unsigned long hashval)
702{
703 unsigned long tmp;
704
705 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
706 L1_CACHE_BYTES;
707 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
708 return tmp & I_HASHMASK;
709}
710
711/** 753/**
712 * iunique - get a unique inode number 754 * iunique - get a unique inode number
713 * @sb: superblock 755 * @sb: superblock
@@ -1292,6 +1334,7 @@ int inode_wait(void *word)
1292 schedule(); 1334 schedule();
1293 return 0; 1335 return 0;
1294} 1336}
1337EXPORT_SYMBOL(inode_wait);
1295 1338
1296/* 1339/*
1297 * If we try to find an inode in the inode hash while it is being 1340 * If we try to find an inode in the inode hash while it is being
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 210339784b56..b00ee9f05a06 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -59,8 +59,14 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
59 if (inode->i_size >= IDATASIZE) { 59 if (inode->i_size >= IDATASIZE) {
60 inode->i_op = &page_symlink_inode_operations; 60 inode->i_op = &page_symlink_inode_operations;
61 inode->i_mapping->a_ops = &jfs_aops; 61 inode->i_mapping->a_ops = &jfs_aops;
62 } else 62 } else {
63 inode->i_op = &jfs_symlink_inode_operations; 63 inode->i_op = &jfs_symlink_inode_operations;
64 /*
65 * The inline data should be null-terminated, but
66 * don't let on-disk corruption crash the kernel
67 */
68 JFS_IP(inode)->i_inline[inode->i_size] = '\0';
69 }
64 } else { 70 } else {
65 inode->i_op = &jfs_file_inode_operations; 71 inode->i_op = &jfs_file_inode_operations;
66 init_special_inode(inode, inode->i_mode, inode->i_rdev); 72 init_special_inode(inode, inode->i_mode, inode->i_rdev);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 8307dd64bf46..1f3b0fc0d351 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,6 +14,7 @@
14#include <linux/sunrpc/svc.h> 14#include <linux/sunrpc/svc.h>
15#include <linux/lockd/lockd.h> 15#include <linux/lockd/lockd.h>
16#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
17#include <linux/kthread.h>
17 18
18#define NLMDBG_FACILITY NLMDBG_CLIENT 19#define NLMDBG_FACILITY NLMDBG_CLIENT
19 20
@@ -60,7 +61,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
60 61
61 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, 62 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
62 nlm_init->protocol, nlm_version, 63 nlm_init->protocol, nlm_version,
63 nlm_init->hostname); 64 nlm_init->hostname, nlm_init->noresvport);
64 if (host == NULL) { 65 if (host == NULL) {
65 lockd_down(); 66 lockd_down();
66 return ERR_PTR(-ENOLCK); 67 return ERR_PTR(-ENOLCK);
@@ -191,11 +192,15 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
191void 192void
192nlmclnt_recovery(struct nlm_host *host) 193nlmclnt_recovery(struct nlm_host *host)
193{ 194{
195 struct task_struct *task;
196
194 if (!host->h_reclaiming++) { 197 if (!host->h_reclaiming++) {
195 nlm_get_host(host); 198 nlm_get_host(host);
196 __module_get(THIS_MODULE); 199 task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name);
197 if (kernel_thread(reclaimer, host, CLONE_FS | CLONE_FILES) < 0) 200 if (IS_ERR(task))
198 module_put(THIS_MODULE); 201 printk(KERN_ERR "lockd: unable to spawn reclaimer "
202 "thread. Locks for %s won't be reclaimed! "
203 "(%ld)\n", host->h_name, PTR_ERR(task));
199 } 204 }
200} 205}
201 206
@@ -207,7 +212,6 @@ reclaimer(void *ptr)
207 struct file_lock *fl, *next; 212 struct file_lock *fl, *next;
208 u32 nsmstate; 213 u32 nsmstate;
209 214
210 daemonize("%s-reclaim", host->h_name);
211 allow_signal(SIGKILL); 215 allow_signal(SIGKILL);
212 216
213 down_write(&host->h_rwsem); 217 down_write(&host->h_rwsem);
@@ -233,7 +237,12 @@ restart:
233 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { 237 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
234 list_del_init(&fl->fl_u.nfs_fl.list); 238 list_del_init(&fl->fl_u.nfs_fl.list);
235 239
236 /* Why are we leaking memory here? --okir */ 240 /*
241 * sending this thread a SIGKILL will result in any unreclaimed
242 * locks being removed from the h_granted list. This means that
243 * the kernel will not attempt to reclaim them again if a new
244 * reclaimer thread is spawned for this host.
245 */
237 if (signalled()) 246 if (signalled())
238 continue; 247 continue;
239 if (nlmclnt_reclaim(host, fl) != 0) 248 if (nlmclnt_reclaim(host, fl) != 0)
@@ -261,5 +270,5 @@ restart:
261 nlm_release_host(host); 270 nlm_release_host(host);
262 lockd_down(); 271 lockd_down();
263 unlock_kernel(); 272 unlock_kernel();
264 module_put_and_exit(0); 273 return 0;
265} 274}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index e05d04416037..abdebf76b820 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -48,6 +48,7 @@ struct nlm_lookup_host_info {
48 const size_t hostname_len; /* it's length */ 48 const size_t hostname_len; /* it's length */
49 const struct sockaddr *src_sap; /* our address (optional) */ 49 const struct sockaddr *src_sap; /* our address (optional) */
50 const size_t src_len; /* it's length */ 50 const size_t src_len; /* it's length */
51 const int noresvport; /* use non-priv port */
51}; 52};
52 53
53/* 54/*
@@ -222,6 +223,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
222 host->h_nsmstate = 0; /* real NSM state */ 223 host->h_nsmstate = 0; /* real NSM state */
223 host->h_nsmhandle = nsm; 224 host->h_nsmhandle = nsm;
224 host->h_server = ni->server; 225 host->h_server = ni->server;
226 host->h_noresvport = ni->noresvport;
225 hlist_add_head(&host->h_hash, chain); 227 hlist_add_head(&host->h_hash, chain);
226 INIT_LIST_HEAD(&host->h_lockowners); 228 INIT_LIST_HEAD(&host->h_lockowners);
227 spin_lock_init(&host->h_lock); 229 spin_lock_init(&host->h_lock);
@@ -272,6 +274,7 @@ nlm_destroy_host(struct nlm_host *host)
272 * @protocol: transport protocol to use 274 * @protocol: transport protocol to use
273 * @version: NLM protocol version 275 * @version: NLM protocol version
274 * @hostname: '\0'-terminated hostname of server 276 * @hostname: '\0'-terminated hostname of server
277 * @noresvport: 1 if non-privileged port should be used
275 * 278 *
276 * Returns an nlm_host structure that matches the passed-in 279 * Returns an nlm_host structure that matches the passed-in
277 * [server address, transport protocol, NLM version, server hostname]. 280 * [server address, transport protocol, NLM version, server hostname].
@@ -281,7 +284,9 @@ nlm_destroy_host(struct nlm_host *host)
281struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, 284struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
282 const size_t salen, 285 const size_t salen,
283 const unsigned short protocol, 286 const unsigned short protocol,
284 const u32 version, const char *hostname) 287 const u32 version,
288 const char *hostname,
289 int noresvport)
285{ 290{
286 const struct sockaddr source = { 291 const struct sockaddr source = {
287 .sa_family = AF_UNSPEC, 292 .sa_family = AF_UNSPEC,
@@ -296,6 +301,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
296 .hostname_len = strlen(hostname), 301 .hostname_len = strlen(hostname),
297 .src_sap = &source, 302 .src_sap = &source,
298 .src_len = sizeof(source), 303 .src_len = sizeof(source),
304 .noresvport = noresvport,
299 }; 305 };
300 306
301 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, 307 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
@@ -417,6 +423,8 @@ nlm_bind_host(struct nlm_host *host)
417 */ 423 */
418 if (!host->h_server) 424 if (!host->h_server)
419 args.flags |= RPC_CLNT_CREATE_HARDRTRY; 425 args.flags |= RPC_CLNT_CREATE_HARDRTRY;
426 if (host->h_noresvport)
427 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
420 428
421 clnt = rpc_create(&args); 429 clnt = rpc_create(&args);
422 if (!IS_ERR(clnt)) 430 if (!IS_ERR(clnt))
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 56b076736b56..252d80163d02 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -45,7 +45,7 @@
45static struct svc_program nlmsvc_program; 45static struct svc_program nlmsvc_program;
46 46
47struct nlmsvc_binding * nlmsvc_ops; 47struct nlmsvc_binding * nlmsvc_ops;
48EXPORT_SYMBOL(nlmsvc_ops); 48EXPORT_SYMBOL_GPL(nlmsvc_ops);
49 49
50static DEFINE_MUTEX(nlmsvc_mutex); 50static DEFINE_MUTEX(nlmsvc_mutex);
51static unsigned int nlmsvc_users; 51static unsigned int nlmsvc_users;
@@ -300,7 +300,7 @@ out:
300 mutex_unlock(&nlmsvc_mutex); 300 mutex_unlock(&nlmsvc_mutex);
301 return error; 301 return error;
302} 302}
303EXPORT_SYMBOL(lockd_up); 303EXPORT_SYMBOL_GPL(lockd_up);
304 304
305/* 305/*
306 * Decrement the user count and bring down lockd if we're the last. 306 * Decrement the user count and bring down lockd if we're the last.
@@ -329,7 +329,7 @@ lockd_down(void)
329out: 329out:
330 mutex_unlock(&nlmsvc_mutex); 330 mutex_unlock(&nlmsvc_mutex);
331} 331}
332EXPORT_SYMBOL(lockd_down); 332EXPORT_SYMBOL_GPL(lockd_down);
333 333
334#ifdef CONFIG_SYSCTL 334#ifdef CONFIG_SYSCTL
335 335
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c2e9cfd9e5a4..3e634f2a1083 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -16,6 +16,7 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/sunrpc/svcauth_gss.h>
19 20
20#include <net/inet_sock.h> 21#include <net/inet_sock.h>
21 22
@@ -182,10 +183,34 @@ void nfs_callback_down(void)
182 mutex_unlock(&nfs_callback_mutex); 183 mutex_unlock(&nfs_callback_mutex);
183} 184}
184 185
186static int check_gss_callback_principal(struct nfs_client *clp,
187 struct svc_rqst *rqstp)
188{
189 struct rpc_clnt *r = clp->cl_rpcclient;
190 char *p = svc_gss_principal(rqstp);
191
192 /*
193 * It might just be a normal user principal, in which case
194 * userspace won't bother to tell us the name at all.
195 */
196 if (p == NULL)
197 return SVC_DENIED;
198
199 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
200
201 if (memcmp(p, "nfs@", 4) != 0)
202 return SVC_DENIED;
203 p += 4;
204 if (strcmp(p, r->cl_server) != 0)
205 return SVC_DENIED;
206 return SVC_OK;
207}
208
185static int nfs_callback_authenticate(struct svc_rqst *rqstp) 209static int nfs_callback_authenticate(struct svc_rqst *rqstp)
186{ 210{
187 struct nfs_client *clp; 211 struct nfs_client *clp;
188 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 212 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
213 int ret = SVC_OK;
189 214
190 /* Don't talk to strangers */ 215 /* Don't talk to strangers */
191 clp = nfs_find_client(svc_addr(rqstp), 4); 216 clp = nfs_find_client(svc_addr(rqstp), 4);
@@ -194,21 +219,22 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
194 219
195 dprintk("%s: %s NFSv4 callback!\n", __func__, 220 dprintk("%s: %s NFSv4 callback!\n", __func__,
196 svc_print_addr(rqstp, buf, sizeof(buf))); 221 svc_print_addr(rqstp, buf, sizeof(buf)));
197 nfs_put_client(clp);
198 222
199 switch (rqstp->rq_authop->flavour) { 223 switch (rqstp->rq_authop->flavour) {
200 case RPC_AUTH_NULL: 224 case RPC_AUTH_NULL:
201 if (rqstp->rq_proc != CB_NULL) 225 if (rqstp->rq_proc != CB_NULL)
202 return SVC_DENIED; 226 ret = SVC_DENIED;
203 break; 227 break;
204 case RPC_AUTH_UNIX: 228 case RPC_AUTH_UNIX:
205 break; 229 break;
206 case RPC_AUTH_GSS: 230 case RPC_AUTH_GSS:
207 /* FIXME: RPCSEC_GSS handling? */ 231 ret = check_gss_callback_principal(clp, rqstp);
232 break;
208 default: 233 default:
209 return SVC_DENIED; 234 ret = SVC_DENIED;
210 } 235 }
211 return SVC_OK; 236 nfs_put_client(clp);
237 return ret;
212} 238}
213 239
214/* 240/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 7547600b6174..9b728f3565a1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -143,7 +143,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
143 clp->cl_proto = cl_init->proto; 143 clp->cl_proto = cl_init->proto;
144 144
145#ifdef CONFIG_NFS_V4 145#ifdef CONFIG_NFS_V4
146 init_rwsem(&clp->cl_sem);
147 INIT_LIST_HEAD(&clp->cl_delegations); 146 INIT_LIST_HEAD(&clp->cl_delegations);
148 spin_lock_init(&clp->cl_lock); 147 spin_lock_init(&clp->cl_lock);
149 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 148 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
@@ -224,31 +223,54 @@ void nfs_put_client(struct nfs_client *clp)
224 } 223 }
225} 224}
226 225
227static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, 226#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
228 const struct sockaddr_in *sa2) 227static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
229{ 228{
230 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; 229 switch (sa->sa_family) {
230 default:
231 return NULL;
232 case AF_INET6:
233 return &((const struct sockaddr_in6 *)sa)->sin6_addr;
234 break;
235 case AF_INET:
236 ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
237 addr_mapped);
238 return addr_mapped;
239 }
231} 240}
232 241
233static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1, 242static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
234 const struct sockaddr_in6 *sa2) 243 const struct sockaddr *sa2)
244{
245 const struct in6_addr *addr1;
246 const struct in6_addr *addr2;
247 struct in6_addr addr1_mapped;
248 struct in6_addr addr2_mapped;
249
250 addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
251 if (likely(addr1 != NULL)) {
252 addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
253 if (likely(addr2 != NULL))
254 return ipv6_addr_equal(addr1, addr2);
255 }
256 return 0;
257}
258#else
259static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
260 const struct sockaddr_in *sa2)
235{ 261{
236 return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr); 262 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
237} 263}
238 264
239static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, 265static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
240 const struct sockaddr *sa2) 266 const struct sockaddr *sa2)
241{ 267{
242 switch (sa1->sa_family) { 268 if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
243 case AF_INET: 269 return 0;
244 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, 270 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
245 (const struct sockaddr_in *)sa2); 271 (const struct sockaddr_in *)sa2);
246 case AF_INET6:
247 return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1,
248 (const struct sockaddr_in6 *)sa2);
249 }
250 BUG();
251} 272}
273#endif
252 274
253/* 275/*
254 * Find a client by IP address and protocol version 276 * Find a client by IP address and protocol version
@@ -270,8 +292,6 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
270 if (clp->rpc_ops->version != nfsversion) 292 if (clp->rpc_ops->version != nfsversion)
271 continue; 293 continue;
272 294
273 if (addr->sa_family != clap->sa_family)
274 continue;
275 /* Match only the IP address, not the port number */ 295 /* Match only the IP address, not the port number */
276 if (!nfs_sockaddr_match_ipaddr(addr, clap)) 296 if (!nfs_sockaddr_match_ipaddr(addr, clap))
277 continue; 297 continue;
@@ -305,8 +325,6 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
305 if (clp->rpc_ops->version != nfsvers) 325 if (clp->rpc_ops->version != nfsvers)
306 continue; 326 continue;
307 327
308 if (sap->sa_family != clap->sa_family)
309 continue;
310 /* Match only the IP address, not the port number */ 328 /* Match only the IP address, not the port number */
311 if (!nfs_sockaddr_match_ipaddr(sap, clap)) 329 if (!nfs_sockaddr_match_ipaddr(sap, clap))
312 continue; 330 continue;
@@ -470,7 +488,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
470static int nfs_create_rpc_client(struct nfs_client *clp, 488static int nfs_create_rpc_client(struct nfs_client *clp,
471 const struct rpc_timeout *timeparms, 489 const struct rpc_timeout *timeparms,
472 rpc_authflavor_t flavor, 490 rpc_authflavor_t flavor,
473 int flags) 491 int discrtry, int noresvport)
474{ 492{
475 struct rpc_clnt *clnt = NULL; 493 struct rpc_clnt *clnt = NULL;
476 struct rpc_create_args args = { 494 struct rpc_create_args args = {
@@ -482,9 +500,13 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
482 .program = &nfs_program, 500 .program = &nfs_program,
483 .version = clp->rpc_ops->version, 501 .version = clp->rpc_ops->version,
484 .authflavor = flavor, 502 .authflavor = flavor,
485 .flags = flags,
486 }; 503 };
487 504
505 if (discrtry)
506 args.flags |= RPC_CLNT_CREATE_DISCRTRY;
507 if (noresvport)
508 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
509
488 if (!IS_ERR(clp->cl_rpcclient)) 510 if (!IS_ERR(clp->cl_rpcclient))
489 return 0; 511 return 0;
490 512
@@ -522,6 +544,8 @@ static int nfs_start_lockd(struct nfs_server *server)
522 .protocol = server->flags & NFS_MOUNT_TCP ? 544 .protocol = server->flags & NFS_MOUNT_TCP ?
523 IPPROTO_TCP : IPPROTO_UDP, 545 IPPROTO_TCP : IPPROTO_UDP,
524 .nfs_version = clp->rpc_ops->version, 546 .nfs_version = clp->rpc_ops->version,
547 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
548 1 : 0,
525 }; 549 };
526 550
527 if (nlm_init.nfs_version > 3) 551 if (nlm_init.nfs_version > 3)
@@ -623,7 +647,8 @@ static int nfs_init_client(struct nfs_client *clp,
623 * Create a client RPC handle for doing FSSTAT with UNIX auth only 647 * Create a client RPC handle for doing FSSTAT with UNIX auth only
624 * - RFC 2623, sec 2.3.2 648 * - RFC 2623, sec 2.3.2
625 */ 649 */
626 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0); 650 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
651 0, data->flags & NFS_MOUNT_NORESVPORT);
627 if (error < 0) 652 if (error < 0)
628 goto error; 653 goto error;
629 nfs_mark_client_ready(clp, NFS_CS_READY); 654 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -965,7 +990,8 @@ error:
965static int nfs4_init_client(struct nfs_client *clp, 990static int nfs4_init_client(struct nfs_client *clp,
966 const struct rpc_timeout *timeparms, 991 const struct rpc_timeout *timeparms,
967 const char *ip_addr, 992 const char *ip_addr,
968 rpc_authflavor_t authflavour) 993 rpc_authflavor_t authflavour,
994 int flags)
969{ 995{
970 int error; 996 int error;
971 997
@@ -979,7 +1005,7 @@ static int nfs4_init_client(struct nfs_client *clp,
979 clp->rpc_ops = &nfs_v4_clientops; 1005 clp->rpc_ops = &nfs_v4_clientops;
980 1006
981 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1007 error = nfs_create_rpc_client(clp, timeparms, authflavour,
982 RPC_CLNT_CREATE_DISCRTRY); 1008 1, flags & NFS_MOUNT_NORESVPORT);
983 if (error < 0) 1009 if (error < 0)
984 goto error; 1010 goto error;
985 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1011 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1030,7 +1056,8 @@ static int nfs4_set_client(struct nfs_server *server,
1030 error = PTR_ERR(clp); 1056 error = PTR_ERR(clp);
1031 goto error; 1057 goto error;
1032 } 1058 }
1033 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour); 1059 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
1060 server->flags);
1034 if (error < 0) 1061 if (error < 0)
1035 goto error_put; 1062 goto error_put;
1036 1063
@@ -1059,6 +1086,10 @@ static int nfs4_init_server(struct nfs_server *server,
1059 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, 1086 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
1060 data->timeo, data->retrans); 1087 data->timeo, data->retrans);
1061 1088
1089 /* Initialise the client representation from the mount data */
1090 server->flags = data->flags;
1091 server->caps |= NFS_CAP_ATOMIC_OPEN;
1092
1062 /* Get a client record */ 1093 /* Get a client record */
1063 error = nfs4_set_client(server, 1094 error = nfs4_set_client(server,
1064 data->nfs_server.hostname, 1095 data->nfs_server.hostname,
@@ -1071,10 +1102,6 @@ static int nfs4_init_server(struct nfs_server *server,
1071 if (error < 0) 1102 if (error < 0)
1072 goto error; 1103 goto error;
1073 1104
1074 /* Initialise the client representation from the mount data */
1075 server->flags = data->flags;
1076 server->caps |= NFS_CAP_ATOMIC_OPEN;
1077
1078 if (data->rsize) 1105 if (data->rsize)
1079 server->rsize = nfs_block_size(data->rsize, NULL); 1106 server->rsize = nfs_block_size(data->rsize, NULL);
1080 if (data->wsize) 1107 if (data->wsize)
@@ -1177,6 +1204,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1177 parent_server = NFS_SB(data->sb); 1204 parent_server = NFS_SB(data->sb);
1178 parent_client = parent_server->nfs_client; 1205 parent_client = parent_server->nfs_client;
1179 1206
1207 /* Initialise the client representation from the parent server */
1208 nfs_server_copy_userdata(server, parent_server);
1209 server->caps |= NFS_CAP_ATOMIC_OPEN;
1210
1180 /* Get a client representation. 1211 /* Get a client representation.
1181 * Note: NFSv4 always uses TCP, */ 1212 * Note: NFSv4 always uses TCP, */
1182 error = nfs4_set_client(server, data->hostname, 1213 error = nfs4_set_client(server, data->hostname,
@@ -1189,10 +1220,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1189 if (error < 0) 1220 if (error < 0)
1190 goto error; 1221 goto error;
1191 1222
1192 /* Initialise the client representation from the parent server */
1193 nfs_server_copy_userdata(server, parent_server);
1194 server->caps |= NFS_CAP_ATOMIC_OPEN;
1195
1196 error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); 1223 error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
1197 if (error < 0) 1224 if (error < 0)
1198 goto error; 1225 goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cc563cfa6940..968225a88015 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -43,6 +43,27 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
43 put_rpccred(cred); 43 put_rpccred(cred);
44} 44}
45 45
46void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
47{
48 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
49}
50
51int nfs_have_delegation(struct inode *inode, fmode_t flags)
52{
53 struct nfs_delegation *delegation;
54 int ret = 0;
55
56 flags &= FMODE_READ|FMODE_WRITE;
57 rcu_read_lock();
58 delegation = rcu_dereference(NFS_I(inode)->delegation);
59 if (delegation != NULL && (delegation->type & flags) == flags) {
60 nfs_mark_delegation_referenced(delegation);
61 ret = 1;
62 }
63 rcu_read_unlock();
64 return ret;
65}
66
46static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) 67static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state)
47{ 68{
48 struct inode *inode = state->inode; 69 struct inode *inode = state->inode;
@@ -119,7 +140,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
119 delegation->maxsize = res->maxsize; 140 delegation->maxsize = res->maxsize;
120 oldcred = delegation->cred; 141 oldcred = delegation->cred;
121 delegation->cred = get_rpccred(cred); 142 delegation->cred = get_rpccred(cred);
122 delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM; 143 clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
123 NFS_I(inode)->delegation_state = delegation->type; 144 NFS_I(inode)->delegation_state = delegation->type;
124 smp_wmb(); 145 smp_wmb();
125 put_rpccred(oldcred); 146 put_rpccred(oldcred);
@@ -134,19 +155,35 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
134 return res; 155 return res;
135} 156}
136 157
158static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation)
159{
160 struct inode *inode = NULL;
161
162 spin_lock(&delegation->lock);
163 if (delegation->inode != NULL)
164 inode = igrab(delegation->inode);
165 spin_unlock(&delegation->lock);
166 return inode;
167}
168
137static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) 169static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
138{ 170{
139 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); 171 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
140 172
141 if (delegation == NULL) 173 if (delegation == NULL)
142 goto nomatch; 174 goto nomatch;
175 spin_lock(&delegation->lock);
143 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, 176 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
144 sizeof(delegation->stateid.data)) != 0) 177 sizeof(delegation->stateid.data)) != 0)
145 goto nomatch; 178 goto nomatch_unlock;
146 list_del_rcu(&delegation->super_list); 179 list_del_rcu(&delegation->super_list);
180 delegation->inode = NULL;
147 nfsi->delegation_state = 0; 181 nfsi->delegation_state = 0;
148 rcu_assign_pointer(nfsi->delegation, NULL); 182 rcu_assign_pointer(nfsi->delegation, NULL);
183 spin_unlock(&delegation->lock);
149 return delegation; 184 return delegation;
185nomatch_unlock:
186 spin_unlock(&delegation->lock);
150nomatch: 187nomatch:
151 return NULL; 188 return NULL;
152} 189}
@@ -172,6 +209,8 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
172 delegation->change_attr = nfsi->change_attr; 209 delegation->change_attr = nfsi->change_attr;
173 delegation->cred = get_rpccred(cred); 210 delegation->cred = get_rpccred(cred);
174 delegation->inode = inode; 211 delegation->inode = inode;
212 delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
213 spin_lock_init(&delegation->lock);
175 214
176 spin_lock(&clp->cl_lock); 215 spin_lock(&clp->cl_lock);
177 if (rcu_dereference(nfsi->delegation) != NULL) { 216 if (rcu_dereference(nfsi->delegation) != NULL) {
@@ -226,22 +265,47 @@ static void nfs_msync_inode(struct inode *inode)
226 */ 265 */
227static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation) 266static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
228{ 267{
229 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
230 struct nfs_inode *nfsi = NFS_I(inode); 268 struct nfs_inode *nfsi = NFS_I(inode);
231 269
232 nfs_msync_inode(inode); 270 nfs_msync_inode(inode);
233 down_read(&clp->cl_sem);
234 /* Guard against new delegated open calls */ 271 /* Guard against new delegated open calls */
235 down_write(&nfsi->rwsem); 272 down_write(&nfsi->rwsem);
236 nfs_delegation_claim_opens(inode, &delegation->stateid); 273 nfs_delegation_claim_opens(inode, &delegation->stateid);
237 up_write(&nfsi->rwsem); 274 up_write(&nfsi->rwsem);
238 up_read(&clp->cl_sem);
239 nfs_msync_inode(inode); 275 nfs_msync_inode(inode);
240 276
241 return nfs_do_return_delegation(inode, delegation, 1); 277 return nfs_do_return_delegation(inode, delegation, 1);
242} 278}
243 279
244/* 280/*
281 * Return all delegations that have been marked for return
282 */
283void nfs_client_return_marked_delegations(struct nfs_client *clp)
284{
285 struct nfs_delegation *delegation;
286 struct inode *inode;
287
288restart:
289 rcu_read_lock();
290 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
291 if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
292 continue;
293 inode = nfs_delegation_grab_inode(delegation);
294 if (inode == NULL)
295 continue;
296 spin_lock(&clp->cl_lock);
297 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
298 spin_unlock(&clp->cl_lock);
299 rcu_read_unlock();
300 if (delegation != NULL)
301 __nfs_inode_return_delegation(inode, delegation);
302 iput(inode);
303 goto restart;
304 }
305 rcu_read_unlock();
306}
307
308/*
245 * This function returns the delegation without reclaiming opens 309 * This function returns the delegation without reclaiming opens
246 * or protecting against delegation reclaims. 310 * or protecting against delegation reclaims.
247 * It is therefore really only safe to be called from 311 * It is therefore really only safe to be called from
@@ -279,83 +343,55 @@ int nfs_inode_return_delegation(struct inode *inode)
279 return err; 343 return err;
280} 344}
281 345
346static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation)
347{
348 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
349 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
350}
351
282/* 352/*
283 * Return all delegations associated to a super block 353 * Return all delegations associated to a super block
284 */ 354 */
285void nfs_return_all_delegations(struct super_block *sb) 355void nfs_super_return_all_delegations(struct super_block *sb)
286{ 356{
287 struct nfs_client *clp = NFS_SB(sb)->nfs_client; 357 struct nfs_client *clp = NFS_SB(sb)->nfs_client;
288 struct nfs_delegation *delegation; 358 struct nfs_delegation *delegation;
289 struct inode *inode;
290 359
291 if (clp == NULL) 360 if (clp == NULL)
292 return; 361 return;
293restart:
294 rcu_read_lock(); 362 rcu_read_lock();
295 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 363 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
296 if (delegation->inode->i_sb != sb) 364 spin_lock(&delegation->lock);
297 continue; 365 if (delegation->inode != NULL && delegation->inode->i_sb == sb)
298 inode = igrab(delegation->inode); 366 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
299 if (inode == NULL) 367 spin_unlock(&delegation->lock);
300 continue;
301 spin_lock(&clp->cl_lock);
302 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
303 spin_unlock(&clp->cl_lock);
304 rcu_read_unlock();
305 if (delegation != NULL)
306 __nfs_inode_return_delegation(inode, delegation);
307 iput(inode);
308 goto restart;
309 } 368 }
310 rcu_read_unlock(); 369 rcu_read_unlock();
370 nfs_client_return_marked_delegations(clp);
311} 371}
312 372
313static int nfs_do_expire_all_delegations(void *ptr) 373static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
314{ 374{
315 struct nfs_client *clp = ptr;
316 struct nfs_delegation *delegation; 375 struct nfs_delegation *delegation;
317 struct inode *inode;
318 376
319 allow_signal(SIGKILL);
320restart:
321 if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0)
322 goto out;
323 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0)
324 goto out;
325 rcu_read_lock(); 377 rcu_read_lock();
326 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 378 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
327 inode = igrab(delegation->inode); 379 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
328 if (inode == NULL) 380 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
329 continue;
330 spin_lock(&clp->cl_lock);
331 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
332 spin_unlock(&clp->cl_lock);
333 rcu_read_unlock();
334 if (delegation)
335 __nfs_inode_return_delegation(inode, delegation);
336 iput(inode);
337 goto restart;
338 } 381 }
339 rcu_read_unlock(); 382 rcu_read_unlock();
340out: 383}
341 nfs_put_client(clp); 384
342 module_put_and_exit(0); 385static void nfs_delegation_run_state_manager(struct nfs_client *clp)
386{
387 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
388 nfs4_schedule_state_manager(clp);
343} 389}
344 390
345void nfs_expire_all_delegations(struct nfs_client *clp) 391void nfs_expire_all_delegations(struct nfs_client *clp)
346{ 392{
347 struct task_struct *task; 393 nfs_client_mark_return_all_delegations(clp);
348 394 nfs_delegation_run_state_manager(clp);
349 __module_get(THIS_MODULE);
350 atomic_inc(&clp->cl_count);
351 task = kthread_run(nfs_do_expire_all_delegations, clp,
352 "%s-delegreturn",
353 rpc_peeraddr2str(clp->cl_rpcclient,
354 RPC_DISPLAY_ADDR));
355 if (!IS_ERR(task))
356 return;
357 nfs_put_client(clp);
358 module_put(THIS_MODULE);
359} 395}
360 396
361/* 397/*
@@ -363,68 +399,29 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
363 */ 399 */
364void nfs_handle_cb_pathdown(struct nfs_client *clp) 400void nfs_handle_cb_pathdown(struct nfs_client *clp)
365{ 401{
366 struct nfs_delegation *delegation;
367 struct inode *inode;
368
369 if (clp == NULL) 402 if (clp == NULL)
370 return; 403 return;
371restart: 404 nfs_client_mark_return_all_delegations(clp);
405}
406
407static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp)
408{
409 struct nfs_delegation *delegation;
410
372 rcu_read_lock(); 411 rcu_read_lock();
373 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 412 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
374 inode = igrab(delegation->inode); 413 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
375 if (inode == NULL)
376 continue; 414 continue;
377 spin_lock(&clp->cl_lock); 415 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
378 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 416 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
379 spin_unlock(&clp->cl_lock);
380 rcu_read_unlock();
381 if (delegation != NULL)
382 __nfs_inode_return_delegation(inode, delegation);
383 iput(inode);
384 goto restart;
385 } 417 }
386 rcu_read_unlock(); 418 rcu_read_unlock();
387} 419}
388 420
389struct recall_threadargs { 421void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
390 struct inode *inode;
391 struct nfs_client *clp;
392 const nfs4_stateid *stateid;
393
394 struct completion started;
395 int result;
396};
397
398static int recall_thread(void *data)
399{ 422{
400 struct recall_threadargs *args = (struct recall_threadargs *)data; 423 nfs_client_mark_return_unreferenced_delegations(clp);
401 struct inode *inode = igrab(args->inode); 424 nfs_delegation_run_state_manager(clp);
402 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
403 struct nfs_inode *nfsi = NFS_I(inode);
404 struct nfs_delegation *delegation;
405
406 daemonize("nfsv4-delegreturn");
407
408 nfs_msync_inode(inode);
409 down_read(&clp->cl_sem);
410 down_write(&nfsi->rwsem);
411 spin_lock(&clp->cl_lock);
412 delegation = nfs_detach_delegation_locked(nfsi, args->stateid);
413 if (delegation != NULL)
414 args->result = 0;
415 else
416 args->result = -ENOENT;
417 spin_unlock(&clp->cl_lock);
418 complete(&args->started);
419 nfs_delegation_claim_opens(inode, args->stateid);
420 up_write(&nfsi->rwsem);
421 up_read(&clp->cl_sem);
422 nfs_msync_inode(inode);
423
424 if (delegation != NULL)
425 nfs_do_return_delegation(inode, delegation, 1);
426 iput(inode);
427 module_put_and_exit(0);
428} 425}
429 426
430/* 427/*
@@ -432,22 +429,20 @@ static int recall_thread(void *data)
432 */ 429 */
433int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 430int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
434{ 431{
435 struct recall_threadargs data = { 432 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
436 .inode = inode, 433 struct nfs_delegation *delegation;
437 .stateid = stateid,
438 };
439 int status;
440 434
441 init_completion(&data.started); 435 rcu_read_lock();
442 __module_get(THIS_MODULE); 436 delegation = rcu_dereference(NFS_I(inode)->delegation);
443 status = kernel_thread(recall_thread, &data, CLONE_KERNEL); 437 if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
444 if (status < 0) 438 sizeof(delegation->stateid.data)) != 0) {
445 goto out_module_put; 439 rcu_read_unlock();
446 wait_for_completion(&data.started); 440 return -ENOENT;
447 return data.result; 441 }
448out_module_put: 442 nfs_mark_return_delegation(clp, delegation);
449 module_put(THIS_MODULE); 443 rcu_read_unlock();
450 return status; 444 nfs_delegation_run_state_manager(clp);
445 return 0;
451} 446}
452 447
453/* 448/*
@@ -459,10 +454,14 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
459 struct inode *res = NULL; 454 struct inode *res = NULL;
460 rcu_read_lock(); 455 rcu_read_lock();
461 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 456 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
462 if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { 457 spin_lock(&delegation->lock);
458 if (delegation->inode != NULL &&
459 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
463 res = igrab(delegation->inode); 460 res = igrab(delegation->inode);
464 break;
465 } 461 }
462 spin_unlock(&delegation->lock);
463 if (res != NULL)
464 break;
466 } 465 }
467 rcu_read_unlock(); 466 rcu_read_unlock();
468 return res; 467 return res;
@@ -476,7 +475,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
476 struct nfs_delegation *delegation; 475 struct nfs_delegation *delegation;
477 rcu_read_lock(); 476 rcu_read_lock();
478 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) 477 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
479 delegation->flags |= NFS_DELEGATION_NEED_RECLAIM; 478 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
480 rcu_read_unlock(); 479 rcu_read_unlock();
481} 480}
482 481
@@ -486,17 +485,22 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
486void nfs_delegation_reap_unclaimed(struct nfs_client *clp) 485void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
487{ 486{
488 struct nfs_delegation *delegation; 487 struct nfs_delegation *delegation;
488 struct inode *inode;
489restart: 489restart:
490 rcu_read_lock(); 490 rcu_read_lock();
491 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 491 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
492 if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) 492 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0)
493 continue;
494 inode = nfs_delegation_grab_inode(delegation);
495 if (inode == NULL)
493 continue; 496 continue;
494 spin_lock(&clp->cl_lock); 497 spin_lock(&clp->cl_lock);
495 delegation = nfs_detach_delegation_locked(NFS_I(delegation->inode), NULL); 498 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
496 spin_unlock(&clp->cl_lock); 499 spin_unlock(&clp->cl_lock);
497 rcu_read_unlock(); 500 rcu_read_unlock();
498 if (delegation != NULL) 501 if (delegation != NULL)
499 nfs_free_delegation(delegation); 502 nfs_free_delegation(delegation);
503 iput(inode);
500 goto restart; 504 goto restart;
501 } 505 }
502 rcu_read_unlock(); 506 rcu_read_unlock();
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index f1c5e2a5d88e..09f383795174 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -17,14 +17,20 @@ struct nfs_delegation {
17 struct rpc_cred *cred; 17 struct rpc_cred *cred;
18 struct inode *inode; 18 struct inode *inode;
19 nfs4_stateid stateid; 19 nfs4_stateid stateid;
20 int type; 20 fmode_t type;
21#define NFS_DELEGATION_NEED_RECLAIM 1
22 long flags;
23 loff_t maxsize; 21 loff_t maxsize;
24 __u64 change_attr; 22 __u64 change_attr;
23 unsigned long flags;
24 spinlock_t lock;
25 struct rcu_head rcu; 25 struct rcu_head rcu;
26}; 26};
27 27
28enum {
29 NFS_DELEGATION_NEED_RECLAIM = 0,
30 NFS_DELEGATION_RETURN,
31 NFS_DELEGATION_REFERENCED,
32};
33
28int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
29void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
30int nfs_inode_return_delegation(struct inode *inode); 36int nfs_inode_return_delegation(struct inode *inode);
@@ -32,9 +38,11 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
32void nfs_inode_return_delegation_noreclaim(struct inode *inode); 38void nfs_inode_return_delegation_noreclaim(struct inode *inode);
33 39
34struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); 40struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
35void nfs_return_all_delegations(struct super_block *sb); 41void nfs_super_return_all_delegations(struct super_block *sb);
36void nfs_expire_all_delegations(struct nfs_client *clp); 42void nfs_expire_all_delegations(struct nfs_client *clp);
43void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
37void nfs_handle_cb_pathdown(struct nfs_client *clp); 44void nfs_handle_cb_pathdown(struct nfs_client *clp);
45void nfs_client_return_marked_delegations(struct nfs_client *clp);
38 46
39void nfs_delegation_mark_reclaim(struct nfs_client *clp); 47void nfs_delegation_mark_reclaim(struct nfs_client *clp);
40void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 48void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -45,22 +53,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
45int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); 53int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
46int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); 54int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
47 55
48static inline int nfs_have_delegation(struct inode *inode, int flags) 56void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
49{ 57int nfs_have_delegation(struct inode *inode, fmode_t flags);
50 struct nfs_delegation *delegation;
51 int ret = 0;
52
53 flags &= FMODE_READ|FMODE_WRITE;
54 rcu_read_lock();
55 delegation = rcu_dereference(NFS_I(inode)->delegation);
56 if (delegation != NULL && (delegation->type & flags) == flags)
57 ret = 1;
58 rcu_read_unlock();
59 return ret;
60}
61 58
62#else 59#else
63static inline int nfs_have_delegation(struct inode *inode, int flags) 60static inline int nfs_have_delegation(struct inode *inode, fmode_t flags)
64{ 61{
65 return 0; 62 return 0;
66} 63}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3e64b98f3a93..e35c8199f82f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -799,6 +799,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
799 goto out_bad; 799 goto out_bad;
800 } 800 }
801 801
802 if (nfs_have_delegation(inode, FMODE_READ))
803 goto out_set_verifier;
804
802 /* Force a full look up iff the parent directory has changed */ 805 /* Force a full look up iff the parent directory has changed */
803 if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) { 806 if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
804 if (nfs_lookup_verify_inode(inode, nd)) 807 if (nfs_lookup_verify_inode(inode, nd))
@@ -817,6 +820,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
817 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 820 if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
818 goto out_bad; 821 goto out_bad;
819 822
823out_set_verifier:
820 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 824 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
821 out_valid: 825 out_valid:
822 dput(parent); 826 dput(parent);
@@ -973,7 +977,7 @@ struct dentry_operations nfs4_dentry_operations = {
973 * Use intent information to determine whether we need to substitute 977 * Use intent information to determine whether we need to substitute
974 * the NFSv4-style stateful OPEN for the LOOKUP call 978 * the NFSv4-style stateful OPEN for the LOOKUP call
975 */ 979 */
976static int is_atomic_open(struct inode *dir, struct nameidata *nd) 980static int is_atomic_open(struct nameidata *nd)
977{ 981{
978 if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0) 982 if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0)
979 return 0; 983 return 0;
@@ -996,7 +1000,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
996 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1000 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
997 1001
998 /* Check that we are indeed trying to open this file */ 1002 /* Check that we are indeed trying to open this file */
999 if (!is_atomic_open(dir, nd)) 1003 if (!is_atomic_open(nd))
1000 goto no_open; 1004 goto no_open;
1001 1005
1002 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { 1006 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) {
@@ -1047,10 +1051,10 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1047 struct inode *dir; 1051 struct inode *dir;
1048 int openflags, ret = 0; 1052 int openflags, ret = 0;
1049 1053
1054 if (!is_atomic_open(nd))
1055 goto no_open;
1050 parent = dget_parent(dentry); 1056 parent = dget_parent(dentry);
1051 dir = parent->d_inode; 1057 dir = parent->d_inode;
1052 if (!is_atomic_open(dir, nd))
1053 goto no_open;
1054 /* We can't create new files in nfs_open_revalidate(), so we 1058 /* We can't create new files in nfs_open_revalidate(), so we
1055 * optimize away revalidation of negative dentries. 1059 * optimize away revalidation of negative dentries.
1056 */ 1060 */
@@ -1062,11 +1066,11 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1062 1066
1063 /* NFS only supports OPEN on regular files */ 1067 /* NFS only supports OPEN on regular files */
1064 if (!S_ISREG(inode->i_mode)) 1068 if (!S_ISREG(inode->i_mode))
1065 goto no_open; 1069 goto no_open_dput;
1066 openflags = nd->intent.open.flags; 1070 openflags = nd->intent.open.flags;
1067 /* We cannot do exclusive creation on a positive dentry */ 1071 /* We cannot do exclusive creation on a positive dentry */
1068 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) 1072 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1069 goto no_open; 1073 goto no_open_dput;
1070 /* We can't create new files, or truncate existing ones here */ 1074 /* We can't create new files, or truncate existing ones here */
1071 openflags &= ~(O_CREAT|O_TRUNC); 1075 openflags &= ~(O_CREAT|O_TRUNC);
1072 1076
@@ -1081,10 +1085,9 @@ out:
1081 if (!ret) 1085 if (!ret)
1082 d_drop(dentry); 1086 d_drop(dentry);
1083 return ret; 1087 return ret;
1084no_open: 1088no_open_dput:
1085 dput(parent); 1089 dput(parent);
1086 if (inode != NULL && nfs_have_delegation(inode, FMODE_READ)) 1090no_open:
1087 return 1;
1088 return nfs_lookup_revalidate(dentry, nd); 1091 return nfs_lookup_revalidate(dentry, nd);
1089} 1092}
1090#endif /* CONFIG_NFSV4 */ 1093#endif /* CONFIG_NFSV4 */
@@ -1794,7 +1797,8 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1794 cache = nfs_access_search_rbtree(inode, cred); 1797 cache = nfs_access_search_rbtree(inode, cred);
1795 if (cache == NULL) 1798 if (cache == NULL)
1796 goto out; 1799 goto out;
1797 if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1800 if (!nfs_have_delegation(inode, FMODE_READ) &&
1801 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1798 goto out_stale; 1802 goto out_stale;
1799 res->jiffies = cache->jiffies; 1803 res->jiffies = cache->jiffies;
1800 res->cred = cache->cred; 1804 res->cred = cache->cred;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d22eb383e1cf..0c381686171e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -592,7 +592,7 @@ static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context
592/* 592/*
593 * Given an inode, search for an open context with the desired characteristics 593 * Given an inode, search for an open context with the desired characteristics
594 */ 594 */
595struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode) 595struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
596{ 596{
597 struct nfs_inode *nfsi = NFS_I(inode); 597 struct nfs_inode *nfsi = NFS_I(inode);
598 struct nfs_open_context *pos, *ctx = NULL; 598 struct nfs_open_context *pos, *ctx = NULL;
@@ -712,14 +712,7 @@ int nfs_attribute_timeout(struct inode *inode)
712 712
713 if (nfs_have_delegation(inode, FMODE_READ)) 713 if (nfs_have_delegation(inode, FMODE_READ))
714 return 0; 714 return 0;
715 /* 715 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
716 * Special case: if the attribute timeout is set to 0, then always
717 * treat the cache as having expired (unless holding
718 * a delegation).
719 */
720 if (nfsi->attrtimeo == 0)
721 return 1;
722 return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
723} 716}
724 717
725/** 718/**
@@ -1182,7 +1175,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1182 nfsi->attrtimeo_timestamp = now; 1175 nfsi->attrtimeo_timestamp = now;
1183 nfsi->attr_gencount = nfs_inc_attr_generation_counter(); 1176 nfsi->attr_gencount = nfs_inc_attr_generation_counter();
1184 } else { 1177 } else {
1185 if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { 1178 if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
1186 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) 1179 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
1187 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); 1180 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
1188 nfsi->attrtimeo_timestamp = now; 1181 nfsi->attrtimeo_timestamp = now;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d212ee41caf2..340ede8f608f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,20 @@ struct nfs_parsed_mount_data {
63 struct security_mnt_opts lsm_opts; 63 struct security_mnt_opts lsm_opts;
64}; 64};
65 65
66/* mount_clnt.c */
67struct nfs_mount_request {
68 struct sockaddr *sap;
69 size_t salen;
70 char *hostname;
71 char *dirpath;
72 u32 version;
73 unsigned short protocol;
74 struct nfs_fh *fh;
75 int noresvport;
76};
77
78extern int nfs_mount(struct nfs_mount_request *info);
79
66/* client.c */ 80/* client.c */
67extern struct rpc_program nfs_program; 81extern struct rpc_program nfs_program;
68 82
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 086a6830d785..ca905a5bb1ba 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -29,47 +29,43 @@ struct mnt_fhstatus {
29 29
30/** 30/**
31 * nfs_mount - Obtain an NFS file handle for the given host and path 31 * nfs_mount - Obtain an NFS file handle for the given host and path
32 * @addr: pointer to server's address 32 * @info: pointer to mount request arguments
33 * @len: size of server's address
34 * @hostname: name of server host, or NULL
35 * @path: pointer to string containing export path to mount
36 * @version: mount version to use for this request
37 * @protocol: transport protocol to use for thie request
38 * @fh: pointer to location to place returned file handle
39 * 33 *
40 * Uses default timeout parameters specified by underlying transport. 34 * Uses default timeout parameters specified by underlying transport.
41 */ 35 */
42int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, 36int nfs_mount(struct nfs_mount_request *info)
43 int version, int protocol, struct nfs_fh *fh)
44{ 37{
45 struct mnt_fhstatus result = { 38 struct mnt_fhstatus result = {
46 .fh = fh 39 .fh = info->fh
47 }; 40 };
48 struct rpc_message msg = { 41 struct rpc_message msg = {
49 .rpc_argp = path, 42 .rpc_argp = info->dirpath,
50 .rpc_resp = &result, 43 .rpc_resp = &result,
51 }; 44 };
52 struct rpc_create_args args = { 45 struct rpc_create_args args = {
53 .protocol = protocol, 46 .protocol = info->protocol,
54 .address = addr, 47 .address = info->sap,
55 .addrsize = len, 48 .addrsize = info->salen,
56 .servername = hostname, 49 .servername = info->hostname,
57 .program = &mnt_program, 50 .program = &mnt_program,
58 .version = version, 51 .version = info->version,
59 .authflavor = RPC_AUTH_UNIX, 52 .authflavor = RPC_AUTH_UNIX,
60 .flags = 0,
61 }; 53 };
62 struct rpc_clnt *mnt_clnt; 54 struct rpc_clnt *mnt_clnt;
63 int status; 55 int status;
64 56
65 dprintk("NFS: sending MNT request for %s:%s\n", 57 dprintk("NFS: sending MNT request for %s:%s\n",
66 (hostname ? hostname : "server"), path); 58 (info->hostname ? info->hostname : "server"),
59 info->dirpath);
60
61 if (info->noresvport)
62 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
67 63
68 mnt_clnt = rpc_create(&args); 64 mnt_clnt = rpc_create(&args);
69 if (IS_ERR(mnt_clnt)) 65 if (IS_ERR(mnt_clnt))
70 goto out_clnt_err; 66 goto out_clnt_err;
71 67
72 if (version == NFS_MNT3_VERSION) 68 if (info->version == NFS_MNT3_VERSION)
73 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; 69 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
74 else 70 else
75 msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; 71 msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea790645fda6..4e4d33204376 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -38,8 +38,12 @@ struct idmap;
38 ((err) != NFSERR_NOFILEHANDLE)) 38 ((err) != NFSERR_NOFILEHANDLE))
39 39
40enum nfs4_client_state { 40enum nfs4_client_state {
41 NFS4CLNT_STATE_RECOVER = 0, 41 NFS4CLNT_MANAGER_RUNNING = 0,
42 NFS4CLNT_CHECK_LEASE,
42 NFS4CLNT_LEASE_EXPIRED, 43 NFS4CLNT_LEASE_EXPIRED,
44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN,
43}; 47};
44 48
45/* 49/*
@@ -90,12 +94,18 @@ struct nfs4_state_owner {
90 94
91 spinlock_t so_lock; 95 spinlock_t so_lock;
92 atomic_t so_count; 96 atomic_t so_count;
97 unsigned long so_flags;
93 struct list_head so_states; 98 struct list_head so_states;
94 struct list_head so_delegations; 99 struct list_head so_delegations;
95 struct nfs_seqid_counter so_seqid; 100 struct nfs_seqid_counter so_seqid;
96 struct rpc_sequence so_sequence; 101 struct rpc_sequence so_sequence;
97}; 102};
98 103
104enum {
105 NFS_OWNER_RECLAIM_REBOOT,
106 NFS_OWNER_RECLAIM_NOGRACE
107};
108
99/* 109/*
100 * struct nfs4_state maintains the client-side state for a given 110 * struct nfs4_state maintains the client-side state for a given
101 * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). 111 * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
@@ -128,6 +138,8 @@ enum {
128 NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ 138 NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */
129 NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ 139 NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */
130 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ 140 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
141 NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
142 NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
131}; 143};
132 144
133struct nfs4_state { 145struct nfs4_state {
@@ -149,7 +161,7 @@ struct nfs4_state {
149 unsigned int n_rdonly; /* Number of read-only references */ 161 unsigned int n_rdonly; /* Number of read-only references */
150 unsigned int n_wronly; /* Number of write-only references */ 162 unsigned int n_wronly; /* Number of write-only references */
151 unsigned int n_rdwr; /* Number of read/write references */ 163 unsigned int n_rdwr; /* Number of read/write references */
152 int state; /* State on the server (R,W, or RW) */ 164 fmode_t state; /* State on the server (R,W, or RW) */
153 atomic_t count; 165 atomic_t count;
154}; 166};
155 167
@@ -157,9 +169,12 @@ struct nfs4_state {
157struct nfs4_exception { 169struct nfs4_exception {
158 long timeout; 170 long timeout;
159 int retry; 171 int retry;
172 struct nfs4_state *state;
160}; 173};
161 174
162struct nfs4_state_recovery_ops { 175struct nfs4_state_recovery_ops {
176 int owner_flag_bit;
177 int state_flag_bit;
163 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); 178 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
164 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 179 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
165}; 180};
@@ -174,7 +189,6 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
174 189
175 190
176/* nfs4proc.c */ 191/* nfs4proc.c */
177extern int nfs4_map_errors(int err);
178extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); 192extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
179extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 193extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
180extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 194extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
@@ -187,7 +201,7 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
187 struct nfs4_fs_locations *fs_locations, struct page *page); 201 struct nfs4_fs_locations *fs_locations, struct page *page);
188 202
189extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; 203extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
190extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; 204extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops;
191 205
192extern const u32 nfs4_fattr_bitmap[2]; 206extern const u32 nfs4_fattr_bitmap[2];
193extern const u32 nfs4_statfs_bitmap[2]; 207extern const u32 nfs4_statfs_bitmap[2];
@@ -202,16 +216,18 @@ extern void nfs4_kill_renewd(struct nfs_client *);
202extern void nfs4_renew_state(struct work_struct *); 216extern void nfs4_renew_state(struct work_struct *);
203 217
204/* nfs4state.c */ 218/* nfs4state.c */
205struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp); 219struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
206 220
207extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 221extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
208extern void nfs4_put_state_owner(struct nfs4_state_owner *); 222extern void nfs4_put_state_owner(struct nfs4_state_owner *);
209extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); 223extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
210extern void nfs4_put_open_state(struct nfs4_state *); 224extern void nfs4_put_open_state(struct nfs4_state *);
211extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t); 225extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
212extern void nfs4_close_sync(struct path *, struct nfs4_state *, mode_t); 226extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
213extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t); 227extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
214extern void nfs4_schedule_state_recovery(struct nfs_client *); 228extern void nfs4_schedule_state_recovery(struct nfs_client *);
229extern void nfs4_schedule_state_manager(struct nfs_client *);
230extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
215extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 231extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
216extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 232extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
217extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 233extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 83e700a2b0c0..8dde84b988d9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -62,14 +62,12 @@
62struct nfs4_opendata; 62struct nfs4_opendata;
63static int _nfs4_proc_open(struct nfs4_opendata *data); 63static int _nfs4_proc_open(struct nfs4_opendata *data);
64static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 64static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
65static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); 65static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
68static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 66static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
69static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 67static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
70 68
71/* Prevent leaks of NFSv4 errors into userland */ 69/* Prevent leaks of NFSv4 errors into userland */
72int nfs4_map_errors(int err) 70static int nfs4_map_errors(int err)
73{ 71{
74 if (err < -1000) { 72 if (err < -1000) {
75 dprintk("%s could not handle NFSv4 error %d\n", 73 dprintk("%s could not handle NFSv4 error %d\n",
@@ -195,6 +193,83 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
195 kunmap_atomic(start, KM_USER0); 193 kunmap_atomic(start, KM_USER0);
196} 194}
197 195
196static int nfs4_wait_bit_killable(void *word)
197{
198 if (fatal_signal_pending(current))
199 return -ERESTARTSYS;
200 schedule();
201 return 0;
202}
203
204static int nfs4_wait_clnt_recover(struct nfs_client *clp)
205{
206 int res;
207
208 might_sleep();
209
210 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
211 nfs4_wait_bit_killable, TASK_KILLABLE);
212 return res;
213}
214
215static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
216{
217 int res = 0;
218
219 might_sleep();
220
221 if (*timeout <= 0)
222 *timeout = NFS4_POLL_RETRY_MIN;
223 if (*timeout > NFS4_POLL_RETRY_MAX)
224 *timeout = NFS4_POLL_RETRY_MAX;
225 schedule_timeout_killable(*timeout);
226 if (fatal_signal_pending(current))
227 res = -ERESTARTSYS;
228 *timeout <<= 1;
229 return res;
230}
231
232/* This is the error handling routine for processes that are allowed
233 * to sleep.
234 */
235static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
236{
237 struct nfs_client *clp = server->nfs_client;
238 struct nfs4_state *state = exception->state;
239 int ret = errorcode;
240
241 exception->retry = 0;
242 switch(errorcode) {
243 case 0:
244 return 0;
245 case -NFS4ERR_ADMIN_REVOKED:
246 case -NFS4ERR_BAD_STATEID:
247 case -NFS4ERR_OPENMODE:
248 if (state == NULL)
249 break;
250 nfs4_state_mark_reclaim_nograce(clp, state);
251 case -NFS4ERR_STALE_CLIENTID:
252 case -NFS4ERR_STALE_STATEID:
253 case -NFS4ERR_EXPIRED:
254 nfs4_schedule_state_recovery(clp);
255 ret = nfs4_wait_clnt_recover(clp);
256 if (ret == 0)
257 exception->retry = 1;
258 break;
259 case -NFS4ERR_FILE_OPEN:
260 case -NFS4ERR_GRACE:
261 case -NFS4ERR_DELAY:
262 ret = nfs4_delay(server->client, &exception->timeout);
263 if (ret != 0)
264 break;
265 case -NFS4ERR_OLD_STATEID:
266 exception->retry = 1;
267 }
268 /* We failed to handle the error */
269 return nfs4_map_errors(ret);
270}
271
272
198static void renew_lease(const struct nfs_server *server, unsigned long timestamp) 273static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
199{ 274{
200 struct nfs_client *clp = server->nfs_client; 275 struct nfs_client *clp = server->nfs_client;
@@ -248,7 +323,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
248} 323}
249 324
250static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 325static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
251 struct nfs4_state_owner *sp, int flags, 326 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
252 const struct iattr *attrs) 327 const struct iattr *attrs)
253{ 328{
254 struct dentry *parent = dget_parent(path->dentry); 329 struct dentry *parent = dget_parent(path->dentry);
@@ -268,7 +343,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
268 p->owner = sp; 343 p->owner = sp;
269 atomic_inc(&sp->so_count); 344 atomic_inc(&sp->so_count);
270 p->o_arg.fh = NFS_FH(dir); 345 p->o_arg.fh = NFS_FH(dir);
271 p->o_arg.open_flags = flags, 346 p->o_arg.open_flags = flags;
347 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
272 p->o_arg.clientid = server->nfs_client->cl_clientid; 348 p->o_arg.clientid = server->nfs_client->cl_clientid;
273 p->o_arg.id = sp->so_owner_id.id; 349 p->o_arg.id = sp->so_owner_id.id;
274 p->o_arg.name = &p->path.dentry->d_name; 350 p->o_arg.name = &p->path.dentry->d_name;
@@ -324,10 +400,13 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
324 return ret; 400 return ret;
325} 401}
326 402
327static int can_open_cached(struct nfs4_state *state, int mode) 403static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
328{ 404{
329 int ret = 0; 405 int ret = 0;
330 switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) { 406
407 if (open_mode & O_EXCL)
408 goto out;
409 switch (mode & (FMODE_READ|FMODE_WRITE)) {
331 case FMODE_READ: 410 case FMODE_READ:
332 ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0; 411 ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
333 break; 412 break;
@@ -337,21 +416,23 @@ static int can_open_cached(struct nfs4_state *state, int mode)
337 case FMODE_READ|FMODE_WRITE: 416 case FMODE_READ|FMODE_WRITE:
338 ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0; 417 ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
339 } 418 }
419out:
340 return ret; 420 return ret;
341} 421}
342 422
343static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags) 423static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
344{ 424{
345 if ((delegation->type & open_flags) != open_flags) 425 if ((delegation->type & fmode) != fmode)
346 return 0; 426 return 0;
347 if (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) 427 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
348 return 0; 428 return 0;
429 nfs_mark_delegation_referenced(delegation);
349 return 1; 430 return 1;
350} 431}
351 432
352static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) 433static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
353{ 434{
354 switch (open_flags) { 435 switch (fmode) {
355 case FMODE_WRITE: 436 case FMODE_WRITE:
356 state->n_wronly++; 437 state->n_wronly++;
357 break; 438 break;
@@ -361,15 +442,15 @@ static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
361 case FMODE_READ|FMODE_WRITE: 442 case FMODE_READ|FMODE_WRITE:
362 state->n_rdwr++; 443 state->n_rdwr++;
363 } 444 }
364 nfs4_state_set_mode_locked(state, state->state | open_flags); 445 nfs4_state_set_mode_locked(state, state->state | fmode);
365} 446}
366 447
367static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) 448static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
368{ 449{
369 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 450 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
370 memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); 451 memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
371 memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); 452 memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
372 switch (open_flags) { 453 switch (fmode) {
373 case FMODE_READ: 454 case FMODE_READ:
374 set_bit(NFS_O_RDONLY_STATE, &state->flags); 455 set_bit(NFS_O_RDONLY_STATE, &state->flags);
375 break; 456 break;
@@ -381,16 +462,15 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
381 } 462 }
382} 463}
383 464
384static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) 465static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
385{ 466{
386 write_seqlock(&state->seqlock); 467 write_seqlock(&state->seqlock);
387 nfs_set_open_stateid_locked(state, stateid, open_flags); 468 nfs_set_open_stateid_locked(state, stateid, fmode);
388 write_sequnlock(&state->seqlock); 469 write_sequnlock(&state->seqlock);
389} 470}
390 471
391static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags) 472static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
392{ 473{
393 open_flags &= (FMODE_READ|FMODE_WRITE);
394 /* 474 /*
395 * Protect the call to nfs4_state_set_mode_locked and 475 * Protect the call to nfs4_state_set_mode_locked and
396 * serialise the stateid update 476 * serialise the stateid update
@@ -401,20 +481,60 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_sta
401 set_bit(NFS_DELEGATED_STATE, &state->flags); 481 set_bit(NFS_DELEGATED_STATE, &state->flags);
402 } 482 }
403 if (open_stateid != NULL) 483 if (open_stateid != NULL)
404 nfs_set_open_stateid_locked(state, open_stateid, open_flags); 484 nfs_set_open_stateid_locked(state, open_stateid, fmode);
405 write_sequnlock(&state->seqlock); 485 write_sequnlock(&state->seqlock);
406 spin_lock(&state->owner->so_lock); 486 spin_lock(&state->owner->so_lock);
407 update_open_stateflags(state, open_flags); 487 update_open_stateflags(state, fmode);
408 spin_unlock(&state->owner->so_lock); 488 spin_unlock(&state->owner->so_lock);
409} 489}
410 490
411static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags) 491static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
492{
493 struct nfs_inode *nfsi = NFS_I(state->inode);
494 struct nfs_delegation *deleg_cur;
495 int ret = 0;
496
497 fmode &= (FMODE_READ|FMODE_WRITE);
498
499 rcu_read_lock();
500 deleg_cur = rcu_dereference(nfsi->delegation);
501 if (deleg_cur == NULL)
502 goto no_delegation;
503
504 spin_lock(&deleg_cur->lock);
505 if (nfsi->delegation != deleg_cur ||
506 (deleg_cur->type & fmode) != fmode)
507 goto no_delegation_unlock;
508
509 if (delegation == NULL)
510 delegation = &deleg_cur->stateid;
511 else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0)
512 goto no_delegation_unlock;
513
514 nfs_mark_delegation_referenced(deleg_cur);
515 __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
516 ret = 1;
517no_delegation_unlock:
518 spin_unlock(&deleg_cur->lock);
519no_delegation:
520 rcu_read_unlock();
521
522 if (!ret && open_stateid != NULL) {
523 __update_open_stateid(state, open_stateid, NULL, fmode);
524 ret = 1;
525 }
526
527 return ret;
528}
529
530
531static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
412{ 532{
413 struct nfs_delegation *delegation; 533 struct nfs_delegation *delegation;
414 534
415 rcu_read_lock(); 535 rcu_read_lock();
416 delegation = rcu_dereference(NFS_I(inode)->delegation); 536 delegation = rcu_dereference(NFS_I(inode)->delegation);
417 if (delegation == NULL || (delegation->type & open_flags) == open_flags) { 537 if (delegation == NULL || (delegation->type & fmode) == fmode) {
418 rcu_read_unlock(); 538 rcu_read_unlock();
419 return; 539 return;
420 } 540 }
@@ -427,27 +547,28 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
427 struct nfs4_state *state = opendata->state; 547 struct nfs4_state *state = opendata->state;
428 struct nfs_inode *nfsi = NFS_I(state->inode); 548 struct nfs_inode *nfsi = NFS_I(state->inode);
429 struct nfs_delegation *delegation; 549 struct nfs_delegation *delegation;
430 int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL); 550 int open_mode = opendata->o_arg.open_flags & O_EXCL;
551 fmode_t fmode = opendata->o_arg.fmode;
431 nfs4_stateid stateid; 552 nfs4_stateid stateid;
432 int ret = -EAGAIN; 553 int ret = -EAGAIN;
433 554
434 rcu_read_lock();
435 delegation = rcu_dereference(nfsi->delegation);
436 for (;;) { 555 for (;;) {
437 if (can_open_cached(state, open_mode)) { 556 if (can_open_cached(state, fmode, open_mode)) {
438 spin_lock(&state->owner->so_lock); 557 spin_lock(&state->owner->so_lock);
439 if (can_open_cached(state, open_mode)) { 558 if (can_open_cached(state, fmode, open_mode)) {
440 update_open_stateflags(state, open_mode); 559 update_open_stateflags(state, fmode);
441 spin_unlock(&state->owner->so_lock); 560 spin_unlock(&state->owner->so_lock);
442 rcu_read_unlock();
443 goto out_return_state; 561 goto out_return_state;
444 } 562 }
445 spin_unlock(&state->owner->so_lock); 563 spin_unlock(&state->owner->so_lock);
446 } 564 }
447 if (delegation == NULL) 565 rcu_read_lock();
448 break; 566 delegation = rcu_dereference(nfsi->delegation);
449 if (!can_open_delegated(delegation, open_mode)) 567 if (delegation == NULL ||
568 !can_open_delegated(delegation, fmode)) {
569 rcu_read_unlock();
450 break; 570 break;
571 }
451 /* Save the delegation */ 572 /* Save the delegation */
452 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); 573 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
453 rcu_read_unlock(); 574 rcu_read_unlock();
@@ -455,19 +576,11 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
455 if (ret != 0) 576 if (ret != 0)
456 goto out; 577 goto out;
457 ret = -EAGAIN; 578 ret = -EAGAIN;
458 rcu_read_lock(); 579
459 delegation = rcu_dereference(nfsi->delegation); 580 /* Try to update the stateid using the delegation */
460 /* If no delegation, try a cached open */ 581 if (update_open_stateid(state, NULL, &stateid, fmode))
461 if (delegation == NULL) 582 goto out_return_state;
462 continue;
463 /* Is the delegation still valid? */
464 if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0)
465 continue;
466 rcu_read_unlock();
467 update_open_stateid(state, NULL, &stateid, open_mode);
468 goto out_return_state;
469 } 583 }
470 rcu_read_unlock();
471out: 584out:
472 return ERR_PTR(ret); 585 return ERR_PTR(ret);
473out_return_state: 586out_return_state:
@@ -480,7 +593,6 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
480 struct inode *inode; 593 struct inode *inode;
481 struct nfs4_state *state = NULL; 594 struct nfs4_state *state = NULL;
482 struct nfs_delegation *delegation; 595 struct nfs_delegation *delegation;
483 nfs4_stateid *deleg_stateid = NULL;
484 int ret; 596 int ret;
485 597
486 if (!data->rpc_done) { 598 if (!data->rpc_done) {
@@ -507,7 +619,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
507 if (delegation) 619 if (delegation)
508 delegation_flags = delegation->flags; 620 delegation_flags = delegation->flags;
509 rcu_read_unlock(); 621 rcu_read_unlock();
510 if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM)) 622 if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
511 nfs_inode_set_delegation(state->inode, 623 nfs_inode_set_delegation(state->inode,
512 data->owner->so_cred, 624 data->owner->so_cred,
513 &data->o_res); 625 &data->o_res);
@@ -516,12 +628,9 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
516 data->owner->so_cred, 628 data->owner->so_cred,
517 &data->o_res); 629 &data->o_res);
518 } 630 }
519 rcu_read_lock(); 631
520 delegation = rcu_dereference(NFS_I(inode)->delegation); 632 update_open_stateid(state, &data->o_res.stateid, NULL,
521 if (delegation != NULL) 633 data->o_arg.fmode);
522 deleg_stateid = &delegation->stateid;
523 update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags);
524 rcu_read_unlock();
525 iput(inode); 634 iput(inode);
526out: 635out:
527 return state; 636 return state;
@@ -552,7 +661,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
552{ 661{
553 struct nfs4_opendata *opendata; 662 struct nfs4_opendata *opendata;
554 663
555 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL); 664 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL);
556 if (opendata == NULL) 665 if (opendata == NULL)
557 return ERR_PTR(-ENOMEM); 666 return ERR_PTR(-ENOMEM);
558 opendata->state = state; 667 opendata->state = state;
@@ -560,12 +669,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
560 return opendata; 669 return opendata;
561} 670}
562 671
563static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res) 672static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res)
564{ 673{
565 struct nfs4_state *newstate; 674 struct nfs4_state *newstate;
566 int ret; 675 int ret;
567 676
568 opendata->o_arg.open_flags = openflags; 677 opendata->o_arg.open_flags = 0;
678 opendata->o_arg.fmode = fmode;
569 memset(&opendata->o_res, 0, sizeof(opendata->o_res)); 679 memset(&opendata->o_res, 0, sizeof(opendata->o_res));
570 memset(&opendata->c_res, 0, sizeof(opendata->c_res)); 680 memset(&opendata->c_res, 0, sizeof(opendata->c_res));
571 nfs4_init_opendata_res(opendata); 681 nfs4_init_opendata_res(opendata);
@@ -575,7 +685,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openf
575 newstate = nfs4_opendata_to_nfs4_state(opendata); 685 newstate = nfs4_opendata_to_nfs4_state(opendata);
576 if (IS_ERR(newstate)) 686 if (IS_ERR(newstate))
577 return PTR_ERR(newstate); 687 return PTR_ERR(newstate);
578 nfs4_close_state(&opendata->path, newstate, openflags); 688 nfs4_close_state(&opendata->path, newstate, fmode);
579 *res = newstate; 689 *res = newstate;
580 return 0; 690 return 0;
581} 691}
@@ -631,7 +741,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
631{ 741{
632 struct nfs_delegation *delegation; 742 struct nfs_delegation *delegation;
633 struct nfs4_opendata *opendata; 743 struct nfs4_opendata *opendata;
634 int delegation_type = 0; 744 fmode_t delegation_type = 0;
635 int status; 745 int status;
636 746
637 opendata = nfs4_open_recoverdata_alloc(ctx, state); 747 opendata = nfs4_open_recoverdata_alloc(ctx, state);
@@ -641,7 +751,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
641 opendata->o_arg.fh = NFS_FH(state->inode); 751 opendata->o_arg.fh = NFS_FH(state->inode);
642 rcu_read_lock(); 752 rcu_read_lock();
643 delegation = rcu_dereference(NFS_I(state->inode)->delegation); 753 delegation = rcu_dereference(NFS_I(state->inode)->delegation);
644 if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0) 754 if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
645 delegation_type = delegation->type; 755 delegation_type = delegation->type;
646 rcu_read_unlock(); 756 rcu_read_unlock();
647 opendata->o_arg.u.delegation_type = delegation_type; 757 opendata->o_arg.u.delegation_type = delegation_type;
@@ -744,7 +854,7 @@ static void nfs4_open_confirm_release(void *calldata)
744 goto out_free; 854 goto out_free;
745 state = nfs4_opendata_to_nfs4_state(data); 855 state = nfs4_opendata_to_nfs4_state(data);
746 if (!IS_ERR(state)) 856 if (!IS_ERR(state))
747 nfs4_close_state(&data->path, state, data->o_arg.open_flags); 857 nfs4_close_state(&data->path, state, data->o_arg.fmode);
748out_free: 858out_free:
749 nfs4_opendata_put(data); 859 nfs4_opendata_put(data);
750} 860}
@@ -808,12 +918,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
808 if (data->state != NULL) { 918 if (data->state != NULL) {
809 struct nfs_delegation *delegation; 919 struct nfs_delegation *delegation;
810 920
811 if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL))) 921 if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
812 goto out_no_action; 922 goto out_no_action;
813 rcu_read_lock(); 923 rcu_read_lock();
814 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); 924 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
815 if (delegation != NULL && 925 if (delegation != NULL &&
816 (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) { 926 test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) {
817 rcu_read_unlock(); 927 rcu_read_unlock();
818 goto out_no_action; 928 goto out_no_action;
819 } 929 }
@@ -877,7 +987,7 @@ static void nfs4_open_release(void *calldata)
877 goto out_free; 987 goto out_free;
878 state = nfs4_opendata_to_nfs4_state(data); 988 state = nfs4_opendata_to_nfs4_state(data);
879 if (!IS_ERR(state)) 989 if (!IS_ERR(state))
880 nfs4_close_state(&data->path, state, data->o_arg.open_flags); 990 nfs4_close_state(&data->path, state, data->o_arg.fmode);
881out_free: 991out_free:
882 nfs4_opendata_put(data); 992 nfs4_opendata_put(data);
883} 993}
@@ -955,10 +1065,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
955 int ret; 1065 int ret;
956 1066
957 for (;;) { 1067 for (;;) {
958 ret = nfs4_wait_clnt_recover(server->client, clp); 1068 ret = nfs4_wait_clnt_recover(clp);
959 if (ret != 0) 1069 if (ret != 0)
960 return ret; 1070 return ret;
961 if (!test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) 1071 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1072 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
962 break; 1073 break;
963 nfs4_schedule_state_recovery(clp); 1074 nfs4_schedule_state_recovery(clp);
964 } 1075 }
@@ -993,8 +1104,9 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4
993 1104
994 do { 1105 do {
995 err = _nfs4_open_expired(ctx, state); 1106 err = _nfs4_open_expired(ctx, state);
996 if (err == -NFS4ERR_DELAY) 1107 if (err != -NFS4ERR_DELAY)
997 nfs4_handle_exception(server, err, &exception); 1108 break;
1109 nfs4_handle_exception(server, err, &exception);
998 } while (exception.retry); 1110 } while (exception.retry);
999 return err; 1111 return err;
1000} 1112}
@@ -1031,12 +1143,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
1031/* 1143/*
1032 * Returns a referenced nfs4_state 1144 * Returns a referenced nfs4_state
1033 */ 1145 */
1034static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) 1146static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
1035{ 1147{
1036 struct nfs4_state_owner *sp; 1148 struct nfs4_state_owner *sp;
1037 struct nfs4_state *state = NULL; 1149 struct nfs4_state *state = NULL;
1038 struct nfs_server *server = NFS_SERVER(dir); 1150 struct nfs_server *server = NFS_SERVER(dir);
1039 struct nfs_client *clp = server->nfs_client;
1040 struct nfs4_opendata *opendata; 1151 struct nfs4_opendata *opendata;
1041 int status; 1152 int status;
1042 1153
@@ -1050,12 +1161,11 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
1050 if (status != 0) 1161 if (status != 0)
1051 goto err_put_state_owner; 1162 goto err_put_state_owner;
1052 if (path->dentry->d_inode != NULL) 1163 if (path->dentry->d_inode != NULL)
1053 nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE)); 1164 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
1054 down_read(&clp->cl_sem);
1055 status = -ENOMEM; 1165 status = -ENOMEM;
1056 opendata = nfs4_opendata_alloc(path, sp, flags, sattr); 1166 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr);
1057 if (opendata == NULL) 1167 if (opendata == NULL)
1058 goto err_release_rwsem; 1168 goto err_put_state_owner;
1059 1169
1060 if (path->dentry->d_inode != NULL) 1170 if (path->dentry->d_inode != NULL)
1061 opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); 1171 opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp);
@@ -1073,13 +1183,10 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
1073 goto err_opendata_put; 1183 goto err_opendata_put;
1074 nfs4_opendata_put(opendata); 1184 nfs4_opendata_put(opendata);
1075 nfs4_put_state_owner(sp); 1185 nfs4_put_state_owner(sp);
1076 up_read(&clp->cl_sem);
1077 *res = state; 1186 *res = state;
1078 return 0; 1187 return 0;
1079err_opendata_put: 1188err_opendata_put:
1080 nfs4_opendata_put(opendata); 1189 nfs4_opendata_put(opendata);
1081err_release_rwsem:
1082 up_read(&clp->cl_sem);
1083err_put_state_owner: 1190err_put_state_owner:
1084 nfs4_put_state_owner(sp); 1191 nfs4_put_state_owner(sp);
1085out_err: 1192out_err:
@@ -1088,14 +1195,14 @@ out_err:
1088} 1195}
1089 1196
1090 1197
1091static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred) 1198static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
1092{ 1199{
1093 struct nfs4_exception exception = { }; 1200 struct nfs4_exception exception = { };
1094 struct nfs4_state *res; 1201 struct nfs4_state *res;
1095 int status; 1202 int status;
1096 1203
1097 do { 1204 do {
1098 status = _nfs4_do_open(dir, path, flags, sattr, cred, &res); 1205 status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res);
1099 if (status == 0) 1206 if (status == 0)
1100 break; 1207 break;
1101 /* NOTE: BAD_SEQID means the server and client disagree about the 1208 /* NOTE: BAD_SEQID means the server and client disagree about the
@@ -1230,10 +1337,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1230 renew_lease(server, calldata->timestamp); 1337 renew_lease(server, calldata->timestamp);
1231 break; 1338 break;
1232 case -NFS4ERR_STALE_STATEID: 1339 case -NFS4ERR_STALE_STATEID:
1340 case -NFS4ERR_OLD_STATEID:
1341 case -NFS4ERR_BAD_STATEID:
1233 case -NFS4ERR_EXPIRED: 1342 case -NFS4ERR_EXPIRED:
1234 break; 1343 if (calldata->arg.fmode == 0)
1344 break;
1235 default: 1345 default:
1236 if (nfs4_async_handle_error(task, server) == -EAGAIN) { 1346 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
1237 rpc_restart_call(task); 1347 rpc_restart_call(task);
1238 return; 1348 return;
1239 } 1349 }
@@ -1272,10 +1382,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1272 nfs_fattr_init(calldata->res.fattr); 1382 nfs_fattr_init(calldata->res.fattr);
1273 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) { 1383 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
1274 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 1384 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
1275 calldata->arg.open_flags = FMODE_READ; 1385 calldata->arg.fmode = FMODE_READ;
1276 } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) { 1386 } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
1277 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 1387 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
1278 calldata->arg.open_flags = FMODE_WRITE; 1388 calldata->arg.fmode = FMODE_WRITE;
1279 } 1389 }
1280 calldata->timestamp = jiffies; 1390 calldata->timestamp = jiffies;
1281 rpc_call_start(task); 1391 rpc_call_start(task);
@@ -1328,6 +1438,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1328 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1438 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
1329 if (calldata->arg.seqid == NULL) 1439 if (calldata->arg.seqid == NULL)
1330 goto out_free_calldata; 1440 goto out_free_calldata;
1441 calldata->arg.fmode = 0;
1331 calldata->arg.bitmask = server->attr_bitmask; 1442 calldata->arg.bitmask = server->attr_bitmask;
1332 calldata->res.fattr = &calldata->fattr; 1443 calldata->res.fattr = &calldata->fattr;
1333 calldata->res.seqid = calldata->arg.seqid; 1444 calldata->res.seqid = calldata->arg.seqid;
@@ -1354,13 +1465,13 @@ out:
1354 return status; 1465 return status;
1355} 1466}
1356 1467
1357static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state) 1468static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
1358{ 1469{
1359 struct file *filp; 1470 struct file *filp;
1360 int ret; 1471 int ret;
1361 1472
1362 /* If the open_intent is for execute, we have an extra check to make */ 1473 /* If the open_intent is for execute, we have an extra check to make */
1363 if (nd->intent.open.flags & FMODE_EXEC) { 1474 if (fmode & FMODE_EXEC) {
1364 ret = nfs_may_open(state->inode, 1475 ret = nfs_may_open(state->inode,
1365 state->owner->so_cred, 1476 state->owner->so_cred,
1366 nd->intent.open.flags); 1477 nd->intent.open.flags);
@@ -1376,7 +1487,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
1376 } 1487 }
1377 ret = PTR_ERR(filp); 1488 ret = PTR_ERR(filp);
1378out_close: 1489out_close:
1379 nfs4_close_sync(path, state, nd->intent.open.flags); 1490 nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
1380 return ret; 1491 return ret;
1381} 1492}
1382 1493
@@ -1392,6 +1503,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1392 struct rpc_cred *cred; 1503 struct rpc_cred *cred;
1393 struct nfs4_state *state; 1504 struct nfs4_state *state;
1394 struct dentry *res; 1505 struct dentry *res;
1506 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
1395 1507
1396 if (nd->flags & LOOKUP_CREATE) { 1508 if (nd->flags & LOOKUP_CREATE) {
1397 attr.ia_mode = nd->intent.open.create_mode; 1509 attr.ia_mode = nd->intent.open.create_mode;
@@ -1409,7 +1521,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1409 parent = dentry->d_parent; 1521 parent = dentry->d_parent;
1410 /* Protect against concurrent sillydeletes */ 1522 /* Protect against concurrent sillydeletes */
1411 nfs_block_sillyrename(parent); 1523 nfs_block_sillyrename(parent);
1412 state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred); 1524 state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
1413 put_rpccred(cred); 1525 put_rpccred(cred);
1414 if (IS_ERR(state)) { 1526 if (IS_ERR(state)) {
1415 if (PTR_ERR(state) == -ENOENT) { 1527 if (PTR_ERR(state) == -ENOENT) {
@@ -1424,7 +1536,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1424 path.dentry = res; 1536 path.dentry = res;
1425 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir)); 1537 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
1426 nfs_unblock_sillyrename(parent); 1538 nfs_unblock_sillyrename(parent);
1427 nfs4_intent_set_file(nd, &path, state); 1539 nfs4_intent_set_file(nd, &path, state, fmode);
1428 return res; 1540 return res;
1429} 1541}
1430 1542
@@ -1437,11 +1549,12 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1437 }; 1549 };
1438 struct rpc_cred *cred; 1550 struct rpc_cred *cred;
1439 struct nfs4_state *state; 1551 struct nfs4_state *state;
1552 fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
1440 1553
1441 cred = rpc_lookup_cred(); 1554 cred = rpc_lookup_cred();
1442 if (IS_ERR(cred)) 1555 if (IS_ERR(cred))
1443 return PTR_ERR(cred); 1556 return PTR_ERR(cred);
1444 state = nfs4_do_open(dir, &path, openflags, NULL, cred); 1557 state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
1445 put_rpccred(cred); 1558 put_rpccred(cred);
1446 if (IS_ERR(state)) { 1559 if (IS_ERR(state)) {
1447 switch (PTR_ERR(state)) { 1560 switch (PTR_ERR(state)) {
@@ -1458,10 +1571,10 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1458 } 1571 }
1459 if (state->inode == dentry->d_inode) { 1572 if (state->inode == dentry->d_inode) {
1460 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1573 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1461 nfs4_intent_set_file(nd, &path, state); 1574 nfs4_intent_set_file(nd, &path, state, fmode);
1462 return 1; 1575 return 1;
1463 } 1576 }
1464 nfs4_close_sync(&path, state, openflags); 1577 nfs4_close_sync(&path, state, fmode);
1465out_drop: 1578out_drop:
1466 d_drop(dentry); 1579 d_drop(dentry);
1467 return 0; 1580 return 0;
@@ -1887,6 +2000,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1887 }; 2000 };
1888 struct nfs4_state *state; 2001 struct nfs4_state *state;
1889 struct rpc_cred *cred; 2002 struct rpc_cred *cred;
2003 fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
1890 int status = 0; 2004 int status = 0;
1891 2005
1892 cred = rpc_lookup_cred(); 2006 cred = rpc_lookup_cred();
@@ -1894,7 +2008,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1894 status = PTR_ERR(cred); 2008 status = PTR_ERR(cred);
1895 goto out; 2009 goto out;
1896 } 2010 }
1897 state = nfs4_do_open(dir, &path, flags, sattr, cred); 2011 state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
1898 d_drop(dentry); 2012 d_drop(dentry);
1899 if (IS_ERR(state)) { 2013 if (IS_ERR(state)) {
1900 status = PTR_ERR(state); 2014 status = PTR_ERR(state);
@@ -1910,9 +2024,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1910 nfs_post_op_update_inode(state->inode, &fattr); 2024 nfs_post_op_update_inode(state->inode, &fattr);
1911 } 2025 }
1912 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2026 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
1913 status = nfs4_intent_set_file(nd, &path, state); 2027 status = nfs4_intent_set_file(nd, &path, state, fmode);
1914 else 2028 else
1915 nfs4_close_sync(&path, state, flags); 2029 nfs4_close_sync(&path, state, fmode);
1916out_putcred: 2030out_putcred:
1917 put_rpccred(cred); 2031 put_rpccred(cred);
1918out: 2032out:
@@ -1974,7 +2088,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
1974{ 2088{
1975 struct nfs_removeres *res = task->tk_msg.rpc_resp; 2089 struct nfs_removeres *res = task->tk_msg.rpc_resp;
1976 2090
1977 if (nfs4_async_handle_error(task, res->server) == -EAGAIN) 2091 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
1978 return 0; 2092 return 0;
1979 update_changeattr(dir, &res->cinfo); 2093 update_changeattr(dir, &res->cinfo);
1980 nfs_post_op_update_inode(dir, &res->dir_attr); 2094 nfs_post_op_update_inode(dir, &res->dir_attr);
@@ -2402,7 +2516,7 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
2402{ 2516{
2403 struct nfs_server *server = NFS_SERVER(data->inode); 2517 struct nfs_server *server = NFS_SERVER(data->inode);
2404 2518
2405 if (nfs4_async_handle_error(task, server) == -EAGAIN) { 2519 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
2406 rpc_restart_call(task); 2520 rpc_restart_call(task);
2407 return -EAGAIN; 2521 return -EAGAIN;
2408 } 2522 }
@@ -2423,7 +2537,7 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
2423{ 2537{
2424 struct inode *inode = data->inode; 2538 struct inode *inode = data->inode;
2425 2539
2426 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { 2540 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
2427 rpc_restart_call(task); 2541 rpc_restart_call(task);
2428 return -EAGAIN; 2542 return -EAGAIN;
2429 } 2543 }
@@ -2449,7 +2563,7 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
2449{ 2563{
2450 struct inode *inode = data->inode; 2564 struct inode *inode = data->inode;
2451 2565
2452 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { 2566 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
2453 rpc_restart_call(task); 2567 rpc_restart_call(task);
2454 return -EAGAIN; 2568 return -EAGAIN;
2455 } 2569 }
@@ -2742,19 +2856,25 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
2742} 2856}
2743 2857
2744static int 2858static int
2745nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) 2859nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
2746{ 2860{
2747 struct nfs_client *clp = server->nfs_client; 2861 struct nfs_client *clp = server->nfs_client;
2748 2862
2749 if (!clp || task->tk_status >= 0) 2863 if (!clp || task->tk_status >= 0)
2750 return 0; 2864 return 0;
2751 switch(task->tk_status) { 2865 switch(task->tk_status) {
2866 case -NFS4ERR_ADMIN_REVOKED:
2867 case -NFS4ERR_BAD_STATEID:
2868 case -NFS4ERR_OPENMODE:
2869 if (state == NULL)
2870 break;
2871 nfs4_state_mark_reclaim_nograce(clp, state);
2752 case -NFS4ERR_STALE_CLIENTID: 2872 case -NFS4ERR_STALE_CLIENTID:
2753 case -NFS4ERR_STALE_STATEID: 2873 case -NFS4ERR_STALE_STATEID:
2754 case -NFS4ERR_EXPIRED: 2874 case -NFS4ERR_EXPIRED:
2755 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); 2875 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
2756 nfs4_schedule_state_recovery(clp); 2876 nfs4_schedule_state_recovery(clp);
2757 if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) 2877 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
2758 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); 2878 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
2759 task->tk_status = 0; 2879 task->tk_status = 0;
2760 return -EAGAIN; 2880 return -EAGAIN;
@@ -2772,79 +2892,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
2772 return 0; 2892 return 0;
2773} 2893}
2774 2894
2775static int nfs4_wait_bit_killable(void *word)
2776{
2777 if (fatal_signal_pending(current))
2778 return -ERESTARTSYS;
2779 schedule();
2780 return 0;
2781}
2782
2783static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
2784{
2785 int res;
2786
2787 might_sleep();
2788
2789 rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_);
2790
2791 res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER,
2792 nfs4_wait_bit_killable, TASK_KILLABLE);
2793
2794 rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_);
2795 return res;
2796}
2797
2798static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
2799{
2800 int res = 0;
2801
2802 might_sleep();
2803
2804 if (*timeout <= 0)
2805 *timeout = NFS4_POLL_RETRY_MIN;
2806 if (*timeout > NFS4_POLL_RETRY_MAX)
2807 *timeout = NFS4_POLL_RETRY_MAX;
2808 schedule_timeout_killable(*timeout);
2809 if (fatal_signal_pending(current))
2810 res = -ERESTARTSYS;
2811 *timeout <<= 1;
2812 return res;
2813}
2814
2815/* This is the error handling routine for processes that are allowed
2816 * to sleep.
2817 */
2818static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
2819{
2820 struct nfs_client *clp = server->nfs_client;
2821 int ret = errorcode;
2822
2823 exception->retry = 0;
2824 switch(errorcode) {
2825 case 0:
2826 return 0;
2827 case -NFS4ERR_STALE_CLIENTID:
2828 case -NFS4ERR_STALE_STATEID:
2829 case -NFS4ERR_EXPIRED:
2830 nfs4_schedule_state_recovery(clp);
2831 ret = nfs4_wait_clnt_recover(server->client, clp);
2832 if (ret == 0)
2833 exception->retry = 1;
2834 break;
2835 case -NFS4ERR_FILE_OPEN:
2836 case -NFS4ERR_GRACE:
2837 case -NFS4ERR_DELAY:
2838 ret = nfs4_delay(server->client, &exception->timeout);
2839 if (ret != 0)
2840 break;
2841 case -NFS4ERR_OLD_STATEID:
2842 exception->retry = 1;
2843 }
2844 /* We failed to handle the error */
2845 return nfs4_map_errors(ret);
2846}
2847
2848int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) 2895int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
2849{ 2896{
2850 nfs4_verifier sc_verifier; 2897 nfs4_verifier sc_verifier;
@@ -2916,7 +2963,6 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
2916 spin_lock(&clp->cl_lock); 2963 spin_lock(&clp->cl_lock);
2917 clp->cl_lease_time = fsinfo.lease_time * HZ; 2964 clp->cl_lease_time = fsinfo.lease_time * HZ;
2918 clp->cl_last_renewal = now; 2965 clp->cl_last_renewal = now;
2919 clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
2920 spin_unlock(&clp->cl_lock); 2966 spin_unlock(&clp->cl_lock);
2921 } 2967 }
2922 return status; 2968 return status;
@@ -3074,7 +3120,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3074 struct nfs4_lock_state *lsp; 3120 struct nfs4_lock_state *lsp;
3075 int status; 3121 int status;
3076 3122
3077 down_read(&clp->cl_sem);
3078 arg.lock_owner.clientid = clp->cl_clientid; 3123 arg.lock_owner.clientid = clp->cl_clientid;
3079 status = nfs4_set_lock_state(state, request); 3124 status = nfs4_set_lock_state(state, request);
3080 if (status != 0) 3125 if (status != 0)
@@ -3091,7 +3136,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3091 } 3136 }
3092 request->fl_ops->fl_release_private(request); 3137 request->fl_ops->fl_release_private(request);
3093out: 3138out:
3094 up_read(&clp->cl_sem);
3095 return status; 3139 return status;
3096} 3140}
3097 3141
@@ -3181,11 +3225,13 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3181 sizeof(calldata->lsp->ls_stateid.data)); 3225 sizeof(calldata->lsp->ls_stateid.data));
3182 renew_lease(calldata->server, calldata->timestamp); 3226 renew_lease(calldata->server, calldata->timestamp);
3183 break; 3227 break;
3228 case -NFS4ERR_BAD_STATEID:
3229 case -NFS4ERR_OLD_STATEID:
3184 case -NFS4ERR_STALE_STATEID: 3230 case -NFS4ERR_STALE_STATEID:
3185 case -NFS4ERR_EXPIRED: 3231 case -NFS4ERR_EXPIRED:
3186 break; 3232 break;
3187 default: 3233 default:
3188 if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) 3234 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
3189 rpc_restart_call(task); 3235 rpc_restart_call(task);
3190 } 3236 }
3191} 3237}
@@ -3248,6 +3294,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3248 3294
3249static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) 3295static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
3250{ 3296{
3297 struct nfs_inode *nfsi = NFS_I(state->inode);
3251 struct nfs_seqid *seqid; 3298 struct nfs_seqid *seqid;
3252 struct nfs4_lock_state *lsp; 3299 struct nfs4_lock_state *lsp;
3253 struct rpc_task *task; 3300 struct rpc_task *task;
@@ -3257,8 +3304,12 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
3257 status = nfs4_set_lock_state(state, request); 3304 status = nfs4_set_lock_state(state, request);
3258 /* Unlock _before_ we do the RPC call */ 3305 /* Unlock _before_ we do the RPC call */
3259 request->fl_flags |= FL_EXISTS; 3306 request->fl_flags |= FL_EXISTS;
3260 if (do_vfs_lock(request->fl_file, request) == -ENOENT) 3307 down_read(&nfsi->rwsem);
3308 if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
3309 up_read(&nfsi->rwsem);
3261 goto out; 3310 goto out;
3311 }
3312 up_read(&nfsi->rwsem);
3262 if (status != 0) 3313 if (status != 0)
3263 goto out; 3314 goto out;
3264 /* Is this a delegated lock? */ 3315 /* Is this a delegated lock? */
@@ -3484,7 +3535,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
3484 3535
3485static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 3536static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
3486{ 3537{
3487 struct nfs_client *clp = state->owner->so_client; 3538 struct nfs_inode *nfsi = NFS_I(state->inode);
3488 unsigned char fl_flags = request->fl_flags; 3539 unsigned char fl_flags = request->fl_flags;
3489 int status; 3540 int status;
3490 3541
@@ -3496,19 +3547,13 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
3496 status = do_vfs_lock(request->fl_file, request); 3547 status = do_vfs_lock(request->fl_file, request);
3497 if (status < 0) 3548 if (status < 0)
3498 goto out; 3549 goto out;
3499 down_read(&clp->cl_sem); 3550 down_read(&nfsi->rwsem);
3500 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { 3551 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
3501 struct nfs_inode *nfsi = NFS_I(state->inode);
3502 /* Yes: cache locks! */ 3552 /* Yes: cache locks! */
3503 down_read(&nfsi->rwsem);
3504 /* ...but avoid races with delegation recall... */ 3553 /* ...but avoid races with delegation recall... */
3505 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { 3554 request->fl_flags = fl_flags & ~FL_SLEEP;
3506 request->fl_flags = fl_flags & ~FL_SLEEP; 3555 status = do_vfs_lock(request->fl_file, request);
3507 status = do_vfs_lock(request->fl_file, request); 3556 goto out_unlock;
3508 up_read(&nfsi->rwsem);
3509 goto out_unlock;
3510 }
3511 up_read(&nfsi->rwsem);
3512 } 3557 }
3513 status = _nfs4_do_setlk(state, cmd, request, 0); 3558 status = _nfs4_do_setlk(state, cmd, request, 0);
3514 if (status != 0) 3559 if (status != 0)
@@ -3518,7 +3563,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
3518 if (do_vfs_lock(request->fl_file, request) < 0) 3563 if (do_vfs_lock(request->fl_file, request) < 0)
3519 printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); 3564 printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
3520out_unlock: 3565out_unlock:
3521 up_read(&clp->cl_sem); 3566 up_read(&nfsi->rwsem);
3522out: 3567out:
3523 request->fl_flags = fl_flags; 3568 request->fl_flags = fl_flags;
3524 return status; 3569 return status;
@@ -3664,11 +3709,15 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3664} 3709}
3665 3710
3666struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { 3711struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
3712 .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
3713 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
3667 .recover_open = nfs4_open_reclaim, 3714 .recover_open = nfs4_open_reclaim,
3668 .recover_lock = nfs4_lock_reclaim, 3715 .recover_lock = nfs4_lock_reclaim,
3669}; 3716};
3670 3717
3671struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops = { 3718struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops = {
3719 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
3720 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
3672 .recover_open = nfs4_open_expired, 3721 .recover_open = nfs4_open_expired,
3673 .recover_lock = nfs4_lock_expired, 3722 .recover_lock = nfs4_lock_expired,
3674}; 3723};
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 3305acbbe2ae..f524e932ff7b 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -65,7 +65,6 @@ nfs4_renew_state(struct work_struct *work)
65 long lease, timeout; 65 long lease, timeout;
66 unsigned long last, now; 66 unsigned long last, now;
67 67
68 down_read(&clp->cl_sem);
69 dprintk("%s: start\n", __func__); 68 dprintk("%s: start\n", __func__);
70 /* Are there any active superblocks? */ 69 /* Are there any active superblocks? */
71 if (list_empty(&clp->cl_superblocks)) 70 if (list_empty(&clp->cl_superblocks))
@@ -77,17 +76,19 @@ nfs4_renew_state(struct work_struct *work)
77 timeout = (2 * lease) / 3 + (long)last - (long)now; 76 timeout = (2 * lease) / 3 + (long)last - (long)now;
78 /* Are we close to a lease timeout? */ 77 /* Are we close to a lease timeout? */
79 if (time_after(now, last + lease/3)) { 78 if (time_after(now, last + lease/3)) {
80 cred = nfs4_get_renew_cred(clp); 79 cred = nfs4_get_renew_cred_locked(clp);
80 spin_unlock(&clp->cl_lock);
81 if (cred == NULL) { 81 if (cred == NULL) {
82 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 82 if (list_empty(&clp->cl_delegations)) {
83 spin_unlock(&clp->cl_lock); 83 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
84 goto out;
85 }
84 nfs_expire_all_delegations(clp); 86 nfs_expire_all_delegations(clp);
85 goto out; 87 } else {
88 /* Queue an asynchronous RENEW. */
89 nfs4_proc_async_renew(clp, cred);
90 put_rpccred(cred);
86 } 91 }
87 spin_unlock(&clp->cl_lock);
88 /* Queue an asynchronous RENEW. */
89 nfs4_proc_async_renew(clp, cred);
90 put_rpccred(cred);
91 timeout = (2 * lease) / 3; 92 timeout = (2 * lease) / 3;
92 spin_lock(&clp->cl_lock); 93 spin_lock(&clp->cl_lock);
93 } else 94 } else
@@ -100,12 +101,11 @@ nfs4_renew_state(struct work_struct *work)
100 cancel_delayed_work(&clp->cl_renewd); 101 cancel_delayed_work(&clp->cl_renewd);
101 schedule_delayed_work(&clp->cl_renewd, timeout); 102 schedule_delayed_work(&clp->cl_renewd, timeout);
102 spin_unlock(&clp->cl_lock); 103 spin_unlock(&clp->cl_lock);
104 nfs_expire_unreferenced_delegations(clp);
103out: 105out:
104 up_read(&clp->cl_sem);
105 dprintk("%s: done\n", __func__); 106 dprintk("%s: done\n", __func__);
106} 107}
107 108
108/* Must be called with clp->cl_sem locked for writes */
109void 109void
110nfs4_schedule_state_renewal(struct nfs_client *clp) 110nfs4_schedule_state_renewal(struct nfs_client *clp)
111{ 111{
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 401ef8b28f97..2022fe47966f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -71,14 +71,12 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
71 return status; 71 return status;
72} 72}
73 73
74static struct rpc_cred *nfs4_get_machine_cred(struct nfs_client *clp) 74static struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
75{ 75{
76 struct rpc_cred *cred = NULL; 76 struct rpc_cred *cred = NULL;
77 77
78 spin_lock(&clp->cl_lock);
79 if (clp->cl_machine_cred != NULL) 78 if (clp->cl_machine_cred != NULL)
80 cred = get_rpccred(clp->cl_machine_cred); 79 cred = get_rpccred(clp->cl_machine_cred);
81 spin_unlock(&clp->cl_lock);
82 return cred; 80 return cred;
83} 81}
84 82
@@ -94,7 +92,7 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
94 put_rpccred(cred); 92 put_rpccred(cred);
95} 93}
96 94
97struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) 95struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
98{ 96{
99 struct nfs4_state_owner *sp; 97 struct nfs4_state_owner *sp;
100 struct rb_node *pos; 98 struct rb_node *pos;
@@ -110,13 +108,24 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
110 return cred; 108 return cred;
111} 109}
112 110
111static struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
112{
113 struct rpc_cred *cred;
114
115 spin_lock(&clp->cl_lock);
116 cred = nfs4_get_renew_cred_locked(clp);
117 spin_unlock(&clp->cl_lock);
118 return cred;
119}
120
113static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 121static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
114{ 122{
115 struct nfs4_state_owner *sp; 123 struct nfs4_state_owner *sp;
116 struct rb_node *pos; 124 struct rb_node *pos;
117 struct rpc_cred *cred; 125 struct rpc_cred *cred;
118 126
119 cred = nfs4_get_machine_cred(clp); 127 spin_lock(&clp->cl_lock);
128 cred = nfs4_get_machine_cred_locked(clp);
120 if (cred != NULL) 129 if (cred != NULL)
121 goto out; 130 goto out;
122 pos = rb_first(&clp->cl_state_owners); 131 pos = rb_first(&clp->cl_state_owners);
@@ -125,6 +134,7 @@ static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
125 cred = get_rpccred(sp->so_cred); 134 cred = get_rpccred(sp->so_cred);
126 } 135 }
127out: 136out:
137 spin_unlock(&clp->cl_lock);
128 return cred; 138 return cred;
129} 139}
130 140
@@ -295,10 +305,6 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
295 } 305 }
296} 306}
297 307
298/*
299 * Note: must be called with clp->cl_sem held in order to prevent races
300 * with reboot recovery!
301 */
302struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) 308struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
303{ 309{
304 struct nfs_client *clp = server->nfs_client; 310 struct nfs_client *clp = server->nfs_client;
@@ -327,10 +333,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
327 return sp; 333 return sp;
328} 334}
329 335
330/*
331 * Must be called with clp->cl_sem held in order to avoid races
332 * with state recovery...
333 */
334void nfs4_put_state_owner(struct nfs4_state_owner *sp) 336void nfs4_put_state_owner(struct nfs4_state_owner *sp)
335{ 337{
336 struct nfs_client *clp = sp->so_client; 338 struct nfs_client *clp = sp->so_client;
@@ -361,18 +363,18 @@ nfs4_alloc_open_state(void)
361} 363}
362 364
363void 365void
364nfs4_state_set_mode_locked(struct nfs4_state *state, mode_t mode) 366nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode)
365{ 367{
366 if (state->state == mode) 368 if (state->state == fmode)
367 return; 369 return;
368 /* NB! List reordering - see the reclaim code for why. */ 370 /* NB! List reordering - see the reclaim code for why. */
369 if ((mode & FMODE_WRITE) != (state->state & FMODE_WRITE)) { 371 if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
370 if (mode & FMODE_WRITE) 372 if (fmode & FMODE_WRITE)
371 list_move(&state->open_states, &state->owner->so_states); 373 list_move(&state->open_states, &state->owner->so_states);
372 else 374 else
373 list_move_tail(&state->open_states, &state->owner->so_states); 375 list_move_tail(&state->open_states, &state->owner->so_states);
374 } 376 }
375 state->state = mode; 377 state->state = fmode;
376} 378}
377 379
378static struct nfs4_state * 380static struct nfs4_state *
@@ -432,10 +434,6 @@ out:
432 return state; 434 return state;
433} 435}
434 436
435/*
436 * Beware! Caller must be holding exactly one
437 * reference to clp->cl_sem!
438 */
439void nfs4_put_open_state(struct nfs4_state *state) 437void nfs4_put_open_state(struct nfs4_state *state)
440{ 438{
441 struct inode *inode = state->inode; 439 struct inode *inode = state->inode;
@@ -456,16 +454,16 @@ void nfs4_put_open_state(struct nfs4_state *state)
456/* 454/*
457 * Close the current file. 455 * Close the current file.
458 */ 456 */
459static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mode, int wait) 457static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
460{ 458{
461 struct nfs4_state_owner *owner = state->owner; 459 struct nfs4_state_owner *owner = state->owner;
462 int call_close = 0; 460 int call_close = 0;
463 int newstate; 461 fmode_t newstate;
464 462
465 atomic_inc(&owner->so_count); 463 atomic_inc(&owner->so_count);
466 /* Protect against nfs4_find_state() */ 464 /* Protect against nfs4_find_state() */
467 spin_lock(&owner->so_lock); 465 spin_lock(&owner->so_lock);
468 switch (mode & (FMODE_READ | FMODE_WRITE)) { 466 switch (fmode & (FMODE_READ | FMODE_WRITE)) {
469 case FMODE_READ: 467 case FMODE_READ:
470 state->n_rdonly--; 468 state->n_rdonly--;
471 break; 469 break;
@@ -500,14 +498,14 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mod
500 nfs4_do_close(path, state, wait); 498 nfs4_do_close(path, state, wait);
501} 499}
502 500
503void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) 501void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
504{ 502{
505 __nfs4_close(path, state, mode, 0); 503 __nfs4_close(path, state, fmode, 0);
506} 504}
507 505
508void nfs4_close_sync(struct path *path, struct nfs4_state *state, mode_t mode) 506void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
509{ 507{
510 __nfs4_close(path, state, mode, 1); 508 __nfs4_close(path, state, fmode, 1);
511} 509}
512 510
513/* 511/*
@@ -568,7 +566,6 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
568 * Return a compatible lock_state. If no initialized lock_state structure 566 * Return a compatible lock_state. If no initialized lock_state structure
569 * exists, return an uninitialized one. 567 * exists, return an uninitialized one.
570 * 568 *
571 * The caller must be holding clp->cl_sem
572 */ 569 */
573static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) 570static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
574{ 571{
@@ -770,32 +767,34 @@ unlock:
770 return status; 767 return status;
771} 768}
772 769
773static int reclaimer(void *); 770static int nfs4_run_state_manager(void *);
774 771
775static inline void nfs4_clear_recover_bit(struct nfs_client *clp) 772static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
776{ 773{
777 smp_mb__before_clear_bit(); 774 smp_mb__before_clear_bit();
778 clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state); 775 clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
779 smp_mb__after_clear_bit(); 776 smp_mb__after_clear_bit();
780 wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER); 777 wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
781 rpc_wake_up(&clp->cl_rpcwaitq); 778 rpc_wake_up(&clp->cl_rpcwaitq);
782} 779}
783 780
784/* 781/*
785 * State recovery routine 782 * Schedule the nfs_client asynchronous state management routine
786 */ 783 */
787static void nfs4_recover_state(struct nfs_client *clp) 784void nfs4_schedule_state_manager(struct nfs_client *clp)
788{ 785{
789 struct task_struct *task; 786 struct task_struct *task;
790 787
788 if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
789 return;
791 __module_get(THIS_MODULE); 790 __module_get(THIS_MODULE);
792 atomic_inc(&clp->cl_count); 791 atomic_inc(&clp->cl_count);
793 task = kthread_run(reclaimer, clp, "%s-reclaim", 792 task = kthread_run(nfs4_run_state_manager, clp, "%s-manager",
794 rpc_peeraddr2str(clp->cl_rpcclient, 793 rpc_peeraddr2str(clp->cl_rpcclient,
795 RPC_DISPLAY_ADDR)); 794 RPC_DISPLAY_ADDR));
796 if (!IS_ERR(task)) 795 if (!IS_ERR(task))
797 return; 796 return;
798 nfs4_clear_recover_bit(clp); 797 nfs4_clear_state_manager_bit(clp);
799 nfs_put_client(clp); 798 nfs_put_client(clp);
800 module_put(THIS_MODULE); 799 module_put(THIS_MODULE);
801} 800}
@@ -807,16 +806,42 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
807{ 806{
808 if (!clp) 807 if (!clp)
809 return; 808 return;
810 if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) 809 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
811 nfs4_recover_state(clp); 810 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
811 nfs4_schedule_state_manager(clp);
812} 812}
813 813
814static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state) 814static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
815{
816
817 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
818 /* Don't recover state that expired before the reboot */
819 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
820 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
821 return 0;
822 }
823 set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags);
824 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
825 return 1;
826}
827
828int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
829{
830 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
831 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
832 set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
833 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
834 return 1;
835}
836
837static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
815{ 838{
816 struct inode *inode = state->inode; 839 struct inode *inode = state->inode;
840 struct nfs_inode *nfsi = NFS_I(inode);
817 struct file_lock *fl; 841 struct file_lock *fl;
818 int status = 0; 842 int status = 0;
819 843
844 down_write(&nfsi->rwsem);
820 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 845 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
821 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 846 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
822 continue; 847 continue;
@@ -839,12 +864,14 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s
839 goto out_err; 864 goto out_err;
840 } 865 }
841 } 866 }
867 up_write(&nfsi->rwsem);
842 return 0; 868 return 0;
843out_err: 869out_err:
870 up_write(&nfsi->rwsem);
844 return status; 871 return status;
845} 872}
846 873
847static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct nfs4_state_owner *sp) 874static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
848{ 875{
849 struct nfs4_state *state; 876 struct nfs4_state *state;
850 struct nfs4_lock_state *lock; 877 struct nfs4_lock_state *lock;
@@ -858,28 +885,34 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
858 * recovering after a network partition or a reboot from a 885 * recovering after a network partition or a reboot from a
859 * server that doesn't support a grace period. 886 * server that doesn't support a grace period.
860 */ 887 */
888restart:
889 spin_lock(&sp->so_lock);
861 list_for_each_entry(state, &sp->so_states, open_states) { 890 list_for_each_entry(state, &sp->so_states, open_states) {
891 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
892 continue;
862 if (state->state == 0) 893 if (state->state == 0)
863 continue; 894 continue;
895 atomic_inc(&state->count);
896 spin_unlock(&sp->so_lock);
864 status = ops->recover_open(sp, state); 897 status = ops->recover_open(sp, state);
865 if (status >= 0) { 898 if (status >= 0) {
866 status = nfs4_reclaim_locks(ops, state); 899 status = nfs4_reclaim_locks(state, ops);
867 if (status < 0) 900 if (status >= 0) {
868 goto out_err; 901 list_for_each_entry(lock, &state->lock_states, ls_locks) {
869 list_for_each_entry(lock, &state->lock_states, ls_locks) { 902 if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
870 if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) 903 printk("%s: Lock reclaim failed!\n",
871 printk("%s: Lock reclaim failed!\n",
872 __func__); 904 __func__);
905 }
906 nfs4_put_open_state(state);
907 goto restart;
873 } 908 }
874 continue;
875 } 909 }
876 switch (status) { 910 switch (status) {
877 default: 911 default:
878 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 912 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
879 __func__, status); 913 __func__, status);
880 case -ENOENT: 914 case -ENOENT:
881 case -NFS4ERR_RECLAIM_BAD: 915 case -ESTALE:
882 case -NFS4ERR_RECLAIM_CONFLICT:
883 /* 916 /*
884 * Open state on this file cannot be recovered 917 * Open state on this file cannot be recovered
885 * All we can do is revert to using the zero stateid. 918 * All we can do is revert to using the zero stateid.
@@ -889,84 +922,176 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
889 /* Mark the file as being 'closed' */ 922 /* Mark the file as being 'closed' */
890 state->state = 0; 923 state->state = 0;
891 break; 924 break;
925 case -NFS4ERR_RECLAIM_BAD:
926 case -NFS4ERR_RECLAIM_CONFLICT:
927 nfs4_state_mark_reclaim_nograce(sp->so_client, state);
928 break;
892 case -NFS4ERR_EXPIRED: 929 case -NFS4ERR_EXPIRED:
893 case -NFS4ERR_NO_GRACE: 930 case -NFS4ERR_NO_GRACE:
931 nfs4_state_mark_reclaim_nograce(sp->so_client, state);
894 case -NFS4ERR_STALE_CLIENTID: 932 case -NFS4ERR_STALE_CLIENTID:
895 goto out_err; 933 goto out_err;
896 } 934 }
935 nfs4_put_open_state(state);
936 goto restart;
897 } 937 }
938 spin_unlock(&sp->so_lock);
898 return 0; 939 return 0;
899out_err: 940out_err:
941 nfs4_put_open_state(state);
900 return status; 942 return status;
901} 943}
902 944
903static void nfs4_state_mark_reclaim(struct nfs_client *clp) 945static void nfs4_clear_open_state(struct nfs4_state *state)
946{
947 struct nfs4_lock_state *lock;
948
949 clear_bit(NFS_DELEGATED_STATE, &state->flags);
950 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
951 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
952 clear_bit(NFS_O_RDWR_STATE, &state->flags);
953 list_for_each_entry(lock, &state->lock_states, ls_locks) {
954 lock->ls_seqid.flags = 0;
955 lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
956 }
957}
958
959static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
904{ 960{
905 struct nfs4_state_owner *sp; 961 struct nfs4_state_owner *sp;
906 struct rb_node *pos; 962 struct rb_node *pos;
907 struct nfs4_state *state; 963 struct nfs4_state *state;
908 struct nfs4_lock_state *lock;
909 964
910 /* Reset all sequence ids to zero */ 965 /* Reset all sequence ids to zero */
911 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 966 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
912 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 967 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
913 sp->so_seqid.counter = 0;
914 sp->so_seqid.flags = 0; 968 sp->so_seqid.flags = 0;
915 spin_lock(&sp->so_lock); 969 spin_lock(&sp->so_lock);
916 list_for_each_entry(state, &sp->so_states, open_states) { 970 list_for_each_entry(state, &sp->so_states, open_states) {
917 clear_bit(NFS_DELEGATED_STATE, &state->flags); 971 if (mark_reclaim(clp, state))
918 clear_bit(NFS_O_RDONLY_STATE, &state->flags); 972 nfs4_clear_open_state(state);
919 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
920 clear_bit(NFS_O_RDWR_STATE, &state->flags);
921 list_for_each_entry(lock, &state->lock_states, ls_locks) {
922 lock->ls_seqid.counter = 0;
923 lock->ls_seqid.flags = 0;
924 lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
925 }
926 } 973 }
927 spin_unlock(&sp->so_lock); 974 spin_unlock(&sp->so_lock);
928 } 975 }
929} 976}
930 977
931static int reclaimer(void *ptr) 978static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
979{
980 /* Mark all delegations for reclaim */
981 nfs_delegation_mark_reclaim(clp);
982 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
983}
984
985static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
932{ 986{
933 struct nfs_client *clp = ptr;
934 struct nfs4_state_owner *sp; 987 struct nfs4_state_owner *sp;
935 struct rb_node *pos; 988 struct rb_node *pos;
936 struct nfs4_state_recovery_ops *ops; 989 struct nfs4_state *state;
937 struct rpc_cred *cred; 990
991 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
992 return;
993
994 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
995 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
996 spin_lock(&sp->so_lock);
997 list_for_each_entry(state, &sp->so_states, open_states) {
998 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags))
999 continue;
1000 nfs4_state_mark_reclaim_nograce(clp, state);
1001 }
1002 spin_unlock(&sp->so_lock);
1003 }
1004
1005 nfs_delegation_reap_unclaimed(clp);
1006}
1007
1008static void nfs_delegation_clear_all(struct nfs_client *clp)
1009{
1010 nfs_delegation_mark_reclaim(clp);
1011 nfs_delegation_reap_unclaimed(clp);
1012}
1013
1014static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1015{
1016 nfs_delegation_clear_all(clp);
1017 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1018}
1019
1020static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp)
1021{
1022 clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
1023}
1024
1025static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1026{
1027 switch (error) {
1028 case -NFS4ERR_CB_PATH_DOWN:
1029 nfs_handle_cb_pathdown(clp);
1030 break;
1031 case -NFS4ERR_STALE_CLIENTID:
1032 case -NFS4ERR_LEASE_MOVED:
1033 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1034 nfs4_state_start_reclaim_reboot(clp);
1035 break;
1036 case -NFS4ERR_EXPIRED:
1037 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1038 nfs4_state_start_reclaim_nograce(clp);
1039 }
1040}
1041
1042static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
1043{
1044 struct rb_node *pos;
938 int status = 0; 1045 int status = 0;
939 1046
940 allow_signal(SIGKILL); 1047restart:
1048 spin_lock(&clp->cl_lock);
1049 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
1050 struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
1051 if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags))
1052 continue;
1053 atomic_inc(&sp->so_count);
1054 spin_unlock(&clp->cl_lock);
1055 status = nfs4_reclaim_open_state(sp, ops);
1056 if (status < 0) {
1057 set_bit(ops->owner_flag_bit, &sp->so_flags);
1058 nfs4_put_state_owner(sp);
1059 nfs4_recovery_handle_error(clp, status);
1060 return status;
1061 }
1062 nfs4_put_state_owner(sp);
1063 goto restart;
1064 }
1065 spin_unlock(&clp->cl_lock);
1066 return status;
1067}
941 1068
942 /* Ensure exclusive access to NFSv4 state */ 1069static int nfs4_check_lease(struct nfs_client *clp)
943 down_write(&clp->cl_sem); 1070{
944 /* Are there any NFS mounts out there? */ 1071 struct rpc_cred *cred;
945 if (list_empty(&clp->cl_superblocks)) 1072 int status = -NFS4ERR_EXPIRED;
946 goto out; 1073
947restart_loop: 1074 /* Is the client already known to have an expired lease? */
948 ops = &nfs4_network_partition_recovery_ops; 1075 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
949 /* Are there any open files on this volume? */ 1076 return 0;
950 cred = nfs4_get_renew_cred(clp); 1077 cred = nfs4_get_renew_cred(clp);
951 if (cred != NULL) { 1078 if (cred == NULL) {
952 /* Yes there are: try to renew the old lease */ 1079 cred = nfs4_get_setclientid_cred(clp);
953 status = nfs4_proc_renew(clp, cred); 1080 if (cred == NULL)
954 put_rpccred(cred); 1081 goto out;
955 switch (status) {
956 case 0:
957 case -NFS4ERR_CB_PATH_DOWN:
958 goto out;
959 case -NFS4ERR_STALE_CLIENTID:
960 case -NFS4ERR_LEASE_MOVED:
961 ops = &nfs4_reboot_recovery_ops;
962 }
963 } else {
964 /* "reboot" to ensure we clear all state on the server */
965 clp->cl_boot_time = CURRENT_TIME;
966 } 1082 }
967 /* We're going to have to re-establish a clientid */ 1083 status = nfs4_proc_renew(clp, cred);
968 nfs4_state_mark_reclaim(clp); 1084 put_rpccred(cred);
969 status = -ENOENT; 1085out:
1086 nfs4_recovery_handle_error(clp, status);
1087 return status;
1088}
1089
1090static int nfs4_reclaim_lease(struct nfs_client *clp)
1091{
1092 struct rpc_cred *cred;
1093 int status = -ENOENT;
1094
970 cred = nfs4_get_setclientid_cred(clp); 1095 cred = nfs4_get_setclientid_cred(clp);
971 if (cred != NULL) { 1096 if (cred != NULL) {
972 status = nfs4_init_client(clp, cred); 1097 status = nfs4_init_client(clp, cred);
@@ -974,42 +1099,90 @@ restart_loop:
974 /* Handle case where the user hasn't set up machine creds */ 1099 /* Handle case where the user hasn't set up machine creds */
975 if (status == -EACCES && cred == clp->cl_machine_cred) { 1100 if (status == -EACCES && cred == clp->cl_machine_cred) {
976 nfs4_clear_machine_cred(clp); 1101 nfs4_clear_machine_cred(clp);
977 goto restart_loop; 1102 status = -EAGAIN;
978 } 1103 }
979 } 1104 }
980 if (status) 1105 return status;
981 goto out_error; 1106}
982 /* Mark all delegations for reclaim */ 1107
983 nfs_delegation_mark_reclaim(clp); 1108static void nfs4_state_manager(struct nfs_client *clp)
984 /* Note: list is protected by exclusive lock on cl->cl_sem */ 1109{
985 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1110 int status = 0;
986 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1111
987 status = nfs4_reclaim_open_state(ops, sp); 1112 /* Ensure exclusive access to NFSv4 state */
988 if (status < 0) { 1113 for(;;) {
989 if (status == -NFS4ERR_NO_GRACE) { 1114 if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
990 ops = &nfs4_network_partition_recovery_ops; 1115 /* We're going to have to re-establish a clientid */
991 status = nfs4_reclaim_open_state(ops, sp); 1116 status = nfs4_reclaim_lease(clp);
1117 if (status) {
1118 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1119 if (status == -EAGAIN)
1120 continue;
1121 goto out_error;
992 } 1122 }
1123 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1124 }
1125
1126 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
1127 status = nfs4_check_lease(clp);
1128 if (status != 0)
1129 continue;
1130 }
1131
1132 /* First recover reboot state... */
1133 if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
1134 status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops);
993 if (status == -NFS4ERR_STALE_CLIENTID) 1135 if (status == -NFS4ERR_STALE_CLIENTID)
994 goto restart_loop; 1136 continue;
995 if (status == -NFS4ERR_EXPIRED) 1137 nfs4_state_end_reclaim_reboot(clp);
996 goto restart_loop; 1138 continue;
1139 }
1140
1141 /* Now recover expired state... */
1142 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
1143 status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops);
1144 if (status < 0) {
1145 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
1146 if (status == -NFS4ERR_STALE_CLIENTID)
1147 continue;
1148 if (status == -NFS4ERR_EXPIRED)
1149 continue;
1150 goto out_error;
1151 } else
1152 nfs4_state_end_reclaim_nograce(clp);
1153 continue;
997 } 1154 }
1155
1156 if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
1157 nfs_client_return_marked_delegations(clp);
1158 continue;
1159 }
1160
1161 nfs4_clear_state_manager_bit(clp);
1162 /* Did we race with an attempt to give us more work? */
1163 if (clp->cl_state == 0)
1164 break;
1165 if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
1166 break;
998 } 1167 }
999 nfs_delegation_reap_unclaimed(clp); 1168 return;
1000out: 1169out_error:
1001 up_write(&clp->cl_sem); 1170 printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
1002 if (status == -NFS4ERR_CB_PATH_DOWN) 1171 " with error %d\n", clp->cl_hostname, -status);
1003 nfs_handle_cb_pathdown(clp); 1172 if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1004 nfs4_clear_recover_bit(clp); 1173 nfs4_state_end_reclaim_reboot(clp);
1174 nfs4_clear_state_manager_bit(clp);
1175}
1176
1177static int nfs4_run_state_manager(void *ptr)
1178{
1179 struct nfs_client *clp = ptr;
1180
1181 allow_signal(SIGKILL);
1182 nfs4_state_manager(clp);
1005 nfs_put_client(clp); 1183 nfs_put_client(clp);
1006 module_put_and_exit(0); 1184 module_put_and_exit(0);
1007 return 0; 1185 return 0;
1008out_error:
1009 printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s"
1010 " with error %d\n", clp->cl_hostname, -status);
1011 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1012 goto out;
1013} 1186}
1014 1187
1015/* 1188/*
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b916297d2334..d1e4c8f8a0a9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -8,7 +8,7 @@
8 * 8 *
9 * Kendrick Smith <kmsmith@umich.edu> 9 * Kendrick Smith <kmsmith@umich.edu>
10 * Andy Adamson <andros@umich.edu> 10 * Andy Adamson <andros@umich.edu>
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
14 * are met: 14 * are met:
@@ -67,7 +67,7 @@ static int nfs4_stat_to_errno(int);
67#define NFS4_MAXTAGLEN 0 67#define NFS4_MAXTAGLEN 0
68#endif 68#endif
69 69
70/* lock,open owner id: 70/* lock,open owner id:
71 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) 71 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
72 */ 72 */
73#define open_owner_id_maxsz (1 + 4) 73#define open_owner_id_maxsz (1 + 4)
@@ -541,6 +541,7 @@ static struct {
541struct compound_hdr { 541struct compound_hdr {
542 int32_t status; 542 int32_t status;
543 uint32_t nops; 543 uint32_t nops;
544 __be32 * nops_p;
544 uint32_t taglen; 545 uint32_t taglen;
545 char * tag; 546 char * tag;
546}; 547};
@@ -578,7 +579,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
578 xdr_encode_opaque(p, str, len); 579 xdr_encode_opaque(p, str, len);
579} 580}
580 581
581static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 582static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
582{ 583{
583 __be32 *p; 584 __be32 *p;
584 585
@@ -588,8 +589,13 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
588 WRITE32(hdr->taglen); 589 WRITE32(hdr->taglen);
589 WRITEMEM(hdr->tag, hdr->taglen); 590 WRITEMEM(hdr->tag, hdr->taglen);
590 WRITE32(NFS4_MINOR_VERSION); 591 WRITE32(NFS4_MINOR_VERSION);
592 hdr->nops_p = p;
591 WRITE32(hdr->nops); 593 WRITE32(hdr->nops);
592 return 0; 594}
595
596static void encode_nops(struct compound_hdr *hdr)
597{
598 *hdr->nops_p = htonl(hdr->nops);
593} 599}
594 600
595static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) 601static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
@@ -601,7 +607,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
601 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); 607 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE);
602} 608}
603 609
604static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) 610static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
605{ 611{
606 char owner_name[IDMAP_NAMESZ]; 612 char owner_name[IDMAP_NAMESZ];
607 char owner_group[IDMAP_NAMESZ]; 613 char owner_group[IDMAP_NAMESZ];
@@ -612,7 +618,6 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
612 int len; 618 int len;
613 uint32_t bmval0 = 0; 619 uint32_t bmval0 = 0;
614 uint32_t bmval1 = 0; 620 uint32_t bmval1 = 0;
615 int status;
616 621
617 /* 622 /*
618 * We reserve enough space to write the entire attribute buffer at once. 623 * We reserve enough space to write the entire attribute buffer at once.
@@ -709,7 +714,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
709 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 714 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
710 WRITE32(NFS4_SET_TO_SERVER_TIME); 715 WRITE32(NFS4_SET_TO_SERVER_TIME);
711 } 716 }
712 717
713 /* 718 /*
714 * Now we backfill the bitmap and the attribute buffer length. 719 * Now we backfill the bitmap and the attribute buffer length.
715 */ 720 */
@@ -723,23 +728,20 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
723 *q++ = htonl(bmval1); 728 *q++ = htonl(bmval1);
724 *q++ = htonl(len); 729 *q++ = htonl(len);
725 730
726 status = 0;
727/* out: */ 731/* out: */
728 return status;
729} 732}
730 733
731static int encode_access(struct xdr_stream *xdr, u32 access) 734static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
732{ 735{
733 __be32 *p; 736 __be32 *p;
734 737
735 RESERVE_SPACE(8); 738 RESERVE_SPACE(8);
736 WRITE32(OP_ACCESS); 739 WRITE32(OP_ACCESS);
737 WRITE32(access); 740 WRITE32(access);
738 741 hdr->nops++;
739 return 0;
740} 742}
741 743
742static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 744static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
743{ 745{
744 __be32 *p; 746 __be32 *p;
745 747
@@ -747,26 +749,24 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
747 WRITE32(OP_CLOSE); 749 WRITE32(OP_CLOSE);
748 WRITE32(arg->seqid->sequence->counter); 750 WRITE32(arg->seqid->sequence->counter);
749 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 751 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
750 752 hdr->nops++;
751 return 0;
752} 753}
753 754
754static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args) 755static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
755{ 756{
756 __be32 *p; 757 __be32 *p;
757
758 RESERVE_SPACE(16);
759 WRITE32(OP_COMMIT);
760 WRITE64(args->offset);
761 WRITE32(args->count);
762 758
763 return 0; 759 RESERVE_SPACE(16);
760 WRITE32(OP_COMMIT);
761 WRITE64(args->offset);
762 WRITE32(args->count);
763 hdr->nops++;
764} 764}
765 765
766static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create) 766static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
767{ 767{
768 __be32 *p; 768 __be32 *p;
769 769
770 RESERVE_SPACE(8); 770 RESERVE_SPACE(8);
771 WRITE32(OP_CREATE); 771 WRITE32(OP_CREATE);
772 WRITE32(create->ftype); 772 WRITE32(create->ftype);
@@ -791,64 +791,62 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
791 RESERVE_SPACE(4 + create->name->len); 791 RESERVE_SPACE(4 + create->name->len);
792 WRITE32(create->name->len); 792 WRITE32(create->name->len);
793 WRITEMEM(create->name->name, create->name->len); 793 WRITEMEM(create->name->name, create->name->len);
794 hdr->nops++;
794 795
795 return encode_attrs(xdr, create->attrs, create->server); 796 encode_attrs(xdr, create->attrs, create->server);
796} 797}
797 798
798static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap) 799static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
799{ 800{
800 __be32 *p; 801 __be32 *p;
801 802
802 RESERVE_SPACE(12); 803 RESERVE_SPACE(12);
803 WRITE32(OP_GETATTR); 804 WRITE32(OP_GETATTR);
804 WRITE32(1); 805 WRITE32(1);
805 WRITE32(bitmap); 806 WRITE32(bitmap);
806 return 0; 807 hdr->nops++;
807} 808}
808 809
809static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1) 810static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
810{ 811{
811 __be32 *p; 812 __be32 *p;
812 813
813 RESERVE_SPACE(16); 814 RESERVE_SPACE(16);
814 WRITE32(OP_GETATTR); 815 WRITE32(OP_GETATTR);
815 WRITE32(2); 816 WRITE32(2);
816 WRITE32(bm0); 817 WRITE32(bm0);
817 WRITE32(bm1); 818 WRITE32(bm1);
818 return 0; 819 hdr->nops++;
819} 820}
820 821
821static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask) 822static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
822{ 823{
823 return encode_getattr_two(xdr, 824 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
824 bitmask[0] & nfs4_fattr_bitmap[0], 825 bitmask[1] & nfs4_fattr_bitmap[1], hdr);
825 bitmask[1] & nfs4_fattr_bitmap[1]);
826} 826}
827 827
828static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask) 828static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
829{ 829{
830 return encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], 830 encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
831 bitmask[1] & nfs4_fsinfo_bitmap[1]); 831 bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
832} 832}
833 833
834static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask) 834static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
835{ 835{
836 return encode_getattr_two(xdr, 836 encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0],
837 bitmask[0] & nfs4_fs_locations_bitmap[0], 837 bitmask[1] & nfs4_fs_locations_bitmap[1], hdr);
838 bitmask[1] & nfs4_fs_locations_bitmap[1]);
839} 838}
840 839
841static int encode_getfh(struct xdr_stream *xdr) 840static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
842{ 841{
843 __be32 *p; 842 __be32 *p;
844 843
845 RESERVE_SPACE(4); 844 RESERVE_SPACE(4);
846 WRITE32(OP_GETFH); 845 WRITE32(OP_GETFH);
847 846 hdr->nops++;
848 return 0;
849} 847}
850 848
851static int encode_link(struct xdr_stream *xdr, const struct qstr *name) 849static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
852{ 850{
853 __be32 *p; 851 __be32 *p;
854 852
@@ -856,8 +854,7 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
856 WRITE32(OP_LINK); 854 WRITE32(OP_LINK);
857 WRITE32(name->len); 855 WRITE32(name->len);
858 WRITEMEM(name->name, name->len); 856 WRITEMEM(name->name, name->len);
859 857 hdr->nops++;
860 return 0;
861} 858}
862 859
863static inline int nfs4_lock_type(struct file_lock *fl, int block) 860static inline int nfs4_lock_type(struct file_lock *fl, int block)
@@ -878,7 +875,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
878 * opcode,type,reclaim,offset,length,new_lock_owner = 32 875 * opcode,type,reclaim,offset,length,new_lock_owner = 32
879 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 876 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
880 */ 877 */
881static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) 878static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr)
882{ 879{
883 __be32 *p; 880 __be32 *p;
884 881
@@ -904,11 +901,10 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
904 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); 901 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
905 WRITE32(args->lock_seqid->sequence->counter); 902 WRITE32(args->lock_seqid->sequence->counter);
906 } 903 }
907 904 hdr->nops++;
908 return 0;
909} 905}
910 906
911static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) 907static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
912{ 908{
913 __be32 *p; 909 __be32 *p;
914 910
@@ -921,11 +917,10 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
921 WRITE32(16); 917 WRITE32(16);
922 WRITEMEM("lock id:", 8); 918 WRITEMEM("lock id:", 8);
923 WRITE64(args->lock_owner.id); 919 WRITE64(args->lock_owner.id);
924 920 hdr->nops++;
925 return 0;
926} 921}
927 922
928static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) 923static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
929{ 924{
930 __be32 *p; 925 __be32 *p;
931 926
@@ -936,11 +931,10 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
936 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); 931 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
937 WRITE64(args->fl->fl_start); 932 WRITE64(args->fl->fl_start);
938 WRITE64(nfs4_lock_length(args->fl)); 933 WRITE64(nfs4_lock_length(args->fl));
939 934 hdr->nops++;
940 return 0;
941} 935}
942 936
943static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name) 937static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
944{ 938{
945 int len = name->len; 939 int len = name->len;
946 __be32 *p; 940 __be32 *p;
@@ -949,27 +943,26 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
949 WRITE32(OP_LOOKUP); 943 WRITE32(OP_LOOKUP);
950 WRITE32(len); 944 WRITE32(len);
951 WRITEMEM(name->name, len); 945 WRITEMEM(name->name, len);
952 946 hdr->nops++;
953 return 0;
954} 947}
955 948
956static void encode_share_access(struct xdr_stream *xdr, int open_flags) 949static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
957{ 950{
958 __be32 *p; 951 __be32 *p;
959 952
960 RESERVE_SPACE(8); 953 RESERVE_SPACE(8);
961 switch (open_flags & (FMODE_READ|FMODE_WRITE)) { 954 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
962 case FMODE_READ: 955 case FMODE_READ:
963 WRITE32(NFS4_SHARE_ACCESS_READ); 956 WRITE32(NFS4_SHARE_ACCESS_READ);
964 break; 957 break;
965 case FMODE_WRITE: 958 case FMODE_WRITE:
966 WRITE32(NFS4_SHARE_ACCESS_WRITE); 959 WRITE32(NFS4_SHARE_ACCESS_WRITE);
967 break; 960 break;
968 case FMODE_READ|FMODE_WRITE: 961 case FMODE_READ|FMODE_WRITE:
969 WRITE32(NFS4_SHARE_ACCESS_BOTH); 962 WRITE32(NFS4_SHARE_ACCESS_BOTH);
970 break; 963 break;
971 default: 964 default:
972 BUG(); 965 WRITE32(0);
973 } 966 }
974 WRITE32(0); /* for linux, share_deny = 0 always */ 967 WRITE32(0); /* for linux, share_deny = 0 always */
975} 968}
@@ -984,7 +977,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
984 RESERVE_SPACE(8); 977 RESERVE_SPACE(8);
985 WRITE32(OP_OPEN); 978 WRITE32(OP_OPEN);
986 WRITE32(arg->seqid->sequence->counter); 979 WRITE32(arg->seqid->sequence->counter);
987 encode_share_access(xdr, arg->open_flags); 980 encode_share_access(xdr, arg->fmode);
988 RESERVE_SPACE(28); 981 RESERVE_SPACE(28);
989 WRITE64(arg->clientid); 982 WRITE64(arg->clientid);
990 WRITE32(16); 983 WRITE32(16);
@@ -998,13 +991,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
998 991
999 RESERVE_SPACE(4); 992 RESERVE_SPACE(4);
1000 switch(arg->open_flags & O_EXCL) { 993 switch(arg->open_flags & O_EXCL) {
1001 case 0: 994 case 0:
1002 WRITE32(NFS4_CREATE_UNCHECKED); 995 WRITE32(NFS4_CREATE_UNCHECKED);
1003 encode_attrs(xdr, arg->u.attrs, arg->server); 996 encode_attrs(xdr, arg->u.attrs, arg->server);
1004 break; 997 break;
1005 default: 998 default:
1006 WRITE32(NFS4_CREATE_EXCLUSIVE); 999 WRITE32(NFS4_CREATE_EXCLUSIVE);
1007 encode_nfs4_verifier(xdr, &arg->u.verifier); 1000 encode_nfs4_verifier(xdr, &arg->u.verifier);
1008 } 1001 }
1009} 1002}
1010 1003
@@ -1014,33 +1007,33 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1014 1007
1015 RESERVE_SPACE(4); 1008 RESERVE_SPACE(4);
1016 switch (arg->open_flags & O_CREAT) { 1009 switch (arg->open_flags & O_CREAT) {
1017 case 0: 1010 case 0:
1018 WRITE32(NFS4_OPEN_NOCREATE); 1011 WRITE32(NFS4_OPEN_NOCREATE);
1019 break; 1012 break;
1020 default: 1013 default:
1021 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); 1014 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1022 WRITE32(NFS4_OPEN_CREATE); 1015 WRITE32(NFS4_OPEN_CREATE);
1023 encode_createmode(xdr, arg); 1016 encode_createmode(xdr, arg);
1024 } 1017 }
1025} 1018}
1026 1019
1027static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type) 1020static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
1028{ 1021{
1029 __be32 *p; 1022 __be32 *p;
1030 1023
1031 RESERVE_SPACE(4); 1024 RESERVE_SPACE(4);
1032 switch (delegation_type) { 1025 switch (delegation_type) {
1033 case 0: 1026 case 0:
1034 WRITE32(NFS4_OPEN_DELEGATE_NONE); 1027 WRITE32(NFS4_OPEN_DELEGATE_NONE);
1035 break; 1028 break;
1036 case FMODE_READ: 1029 case FMODE_READ:
1037 WRITE32(NFS4_OPEN_DELEGATE_READ); 1030 WRITE32(NFS4_OPEN_DELEGATE_READ);
1038 break; 1031 break;
1039 case FMODE_WRITE|FMODE_READ: 1032 case FMODE_WRITE|FMODE_READ:
1040 WRITE32(NFS4_OPEN_DELEGATE_WRITE); 1033 WRITE32(NFS4_OPEN_DELEGATE_WRITE);
1041 break; 1034 break;
1042 default: 1035 default:
1043 BUG(); 1036 BUG();
1044 } 1037 }
1045} 1038}
1046 1039
@@ -1053,7 +1046,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
1053 encode_string(xdr, name->len, name->name); 1046 encode_string(xdr, name->len, name->name);
1054} 1047}
1055 1048
1056static inline void encode_claim_previous(struct xdr_stream *xdr, int type) 1049static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
1057{ 1050{
1058 __be32 *p; 1051 __be32 *p;
1059 1052
@@ -1072,27 +1065,27 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
1072 encode_string(xdr, name->len, name->name); 1065 encode_string(xdr, name->len, name->name);
1073} 1066}
1074 1067
1075static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1068static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
1076{ 1069{
1077 encode_openhdr(xdr, arg); 1070 encode_openhdr(xdr, arg);
1078 encode_opentype(xdr, arg); 1071 encode_opentype(xdr, arg);
1079 switch (arg->claim) { 1072 switch (arg->claim) {
1080 case NFS4_OPEN_CLAIM_NULL: 1073 case NFS4_OPEN_CLAIM_NULL:
1081 encode_claim_null(xdr, arg->name); 1074 encode_claim_null(xdr, arg->name);
1082 break; 1075 break;
1083 case NFS4_OPEN_CLAIM_PREVIOUS: 1076 case NFS4_OPEN_CLAIM_PREVIOUS:
1084 encode_claim_previous(xdr, arg->u.delegation_type); 1077 encode_claim_previous(xdr, arg->u.delegation_type);
1085 break; 1078 break;
1086 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 1079 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
1087 encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); 1080 encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
1088 break; 1081 break;
1089 default: 1082 default:
1090 BUG(); 1083 BUG();
1091 } 1084 }
1092 return 0; 1085 hdr->nops++;
1093} 1086}
1094 1087
1095static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) 1088static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
1096{ 1089{
1097 __be32 *p; 1090 __be32 *p;
1098 1091
@@ -1100,11 +1093,10 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
1100 WRITE32(OP_OPEN_CONFIRM); 1093 WRITE32(OP_OPEN_CONFIRM);
1101 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1094 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1102 WRITE32(arg->seqid->sequence->counter); 1095 WRITE32(arg->seqid->sequence->counter);
1103 1096 hdr->nops++;
1104 return 0;
1105} 1097}
1106 1098
1107static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 1099static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1108{ 1100{
1109 __be32 *p; 1101 __be32 *p;
1110 1102
@@ -1112,12 +1104,12 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea
1112 WRITE32(OP_OPEN_DOWNGRADE); 1104 WRITE32(OP_OPEN_DOWNGRADE);
1113 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1105 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1114 WRITE32(arg->seqid->sequence->counter); 1106 WRITE32(arg->seqid->sequence->counter);
1115 encode_share_access(xdr, arg->open_flags); 1107 encode_share_access(xdr, arg->fmode);
1116 return 0; 1108 hdr->nops++;
1117} 1109}
1118 1110
1119static int 1111static void
1120encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh) 1112encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
1121{ 1113{
1122 int len = fh->size; 1114 int len = fh->size;
1123 __be32 *p; 1115 __be32 *p;
@@ -1126,18 +1118,16 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
1126 WRITE32(OP_PUTFH); 1118 WRITE32(OP_PUTFH);
1127 WRITE32(len); 1119 WRITE32(len);
1128 WRITEMEM(fh->data, len); 1120 WRITEMEM(fh->data, len);
1129 1121 hdr->nops++;
1130 return 0;
1131} 1122}
1132 1123
1133static int encode_putrootfh(struct xdr_stream *xdr) 1124static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1134{ 1125{
1135 __be32 *p; 1126 __be32 *p;
1136
1137 RESERVE_SPACE(4);
1138 WRITE32(OP_PUTROOTFH);
1139 1127
1140 return 0; 1128 RESERVE_SPACE(4);
1129 WRITE32(OP_PUTROOTFH);
1130 hdr->nops++;
1141} 1131}
1142 1132
1143static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1133static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
@@ -1153,7 +1143,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1153 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1143 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
1154} 1144}
1155 1145
1156static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) 1146static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
1157{ 1147{
1158 __be32 *p; 1148 __be32 *p;
1159 1149
@@ -1165,11 +1155,10 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
1165 RESERVE_SPACE(12); 1155 RESERVE_SPACE(12);
1166 WRITE64(args->offset); 1156 WRITE64(args->offset);
1167 WRITE32(args->count); 1157 WRITE32(args->count);
1168 1158 hdr->nops++;
1169 return 0;
1170} 1159}
1171 1160
1172static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req) 1161static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
1173{ 1162{
1174 uint32_t attrs[2] = { 1163 uint32_t attrs[2] = {
1175 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, 1164 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
@@ -1191,6 +1180,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1191 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 1180 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1192 WRITE32(attrs[0] & readdir->bitmask[0]); 1181 WRITE32(attrs[0] & readdir->bitmask[0]);
1193 WRITE32(attrs[1] & readdir->bitmask[1]); 1182 WRITE32(attrs[1] & readdir->bitmask[1]);
1183 hdr->nops++;
1194 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1184 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
1195 __func__, 1185 __func__,
1196 (unsigned long long)readdir->cookie, 1186 (unsigned long long)readdir->cookie,
@@ -1198,21 +1188,18 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1198 ((u32 *)readdir->verifier.data)[1], 1188 ((u32 *)readdir->verifier.data)[1],
1199 attrs[0] & readdir->bitmask[0], 1189 attrs[0] & readdir->bitmask[0],
1200 attrs[1] & readdir->bitmask[1]); 1190 attrs[1] & readdir->bitmask[1]);
1201
1202 return 0;
1203} 1191}
1204 1192
1205static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req) 1193static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
1206{ 1194{
1207 __be32 *p; 1195 __be32 *p;
1208 1196
1209 RESERVE_SPACE(4); 1197 RESERVE_SPACE(4);
1210 WRITE32(OP_READLINK); 1198 WRITE32(OP_READLINK);
1211 1199 hdr->nops++;
1212 return 0;
1213} 1200}
1214 1201
1215static int encode_remove(struct xdr_stream *xdr, const struct qstr *name) 1202static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1216{ 1203{
1217 __be32 *p; 1204 __be32 *p;
1218 1205
@@ -1220,11 +1207,10 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
1220 WRITE32(OP_REMOVE); 1207 WRITE32(OP_REMOVE);
1221 WRITE32(name->len); 1208 WRITE32(name->len);
1222 WRITEMEM(name->name, name->len); 1209 WRITEMEM(name->name, name->len);
1223 1210 hdr->nops++;
1224 return 0;
1225} 1211}
1226 1212
1227static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname) 1213static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
1228{ 1214{
1229 __be32 *p; 1215 __be32 *p;
1230 1216
@@ -1232,38 +1218,35 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
1232 WRITE32(OP_RENAME); 1218 WRITE32(OP_RENAME);
1233 WRITE32(oldname->len); 1219 WRITE32(oldname->len);
1234 WRITEMEM(oldname->name, oldname->len); 1220 WRITEMEM(oldname->name, oldname->len);
1235 1221
1236 RESERVE_SPACE(4 + newname->len); 1222 RESERVE_SPACE(4 + newname->len);
1237 WRITE32(newname->len); 1223 WRITE32(newname->len);
1238 WRITEMEM(newname->name, newname->len); 1224 WRITEMEM(newname->name, newname->len);
1239 1225 hdr->nops++;
1240 return 0;
1241} 1226}
1242 1227
1243static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid) 1228static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr)
1244{ 1229{
1245 __be32 *p; 1230 __be32 *p;
1246 1231
1247 RESERVE_SPACE(12); 1232 RESERVE_SPACE(12);
1248 WRITE32(OP_RENEW); 1233 WRITE32(OP_RENEW);
1249 WRITE64(client_stateid->cl_clientid); 1234 WRITE64(client_stateid->cl_clientid);
1250 1235 hdr->nops++;
1251 return 0;
1252} 1236}
1253 1237
1254static int 1238static void
1255encode_restorefh(struct xdr_stream *xdr) 1239encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1256{ 1240{
1257 __be32 *p; 1241 __be32 *p;
1258 1242
1259 RESERVE_SPACE(4); 1243 RESERVE_SPACE(4);
1260 WRITE32(OP_RESTOREFH); 1244 WRITE32(OP_RESTOREFH);
1261 1245 hdr->nops++;
1262 return 0;
1263} 1246}
1264 1247
1265static int 1248static int
1266encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) 1249encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
1267{ 1250{
1268 __be32 *p; 1251 __be32 *p;
1269 1252
@@ -1278,36 +1261,32 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
1278 RESERVE_SPACE(4); 1261 RESERVE_SPACE(4);
1279 WRITE32(arg->acl_len); 1262 WRITE32(arg->acl_len);
1280 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1263 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1264 hdr->nops++;
1281 return 0; 1265 return 0;
1282} 1266}
1283 1267
1284static int 1268static void
1285encode_savefh(struct xdr_stream *xdr) 1269encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1286{ 1270{
1287 __be32 *p; 1271 __be32 *p;
1288 1272
1289 RESERVE_SPACE(4); 1273 RESERVE_SPACE(4);
1290 WRITE32(OP_SAVEFH); 1274 WRITE32(OP_SAVEFH);
1291 1275 hdr->nops++;
1292 return 0;
1293} 1276}
1294 1277
1295static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server) 1278static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
1296{ 1279{
1297 int status;
1298 __be32 *p; 1280 __be32 *p;
1299
1300 RESERVE_SPACE(4+NFS4_STATEID_SIZE);
1301 WRITE32(OP_SETATTR);
1302 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
1303 1281
1304 if ((status = encode_attrs(xdr, arg->iap, server))) 1282 RESERVE_SPACE(4+NFS4_STATEID_SIZE);
1305 return status; 1283 WRITE32(OP_SETATTR);
1306 1284 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
1307 return 0; 1285 hdr->nops++;
1286 encode_attrs(xdr, arg->iap, server);
1308} 1287}
1309 1288
1310static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid) 1289static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
1311{ 1290{
1312 __be32 *p; 1291 __be32 *p;
1313 1292
@@ -1322,23 +1301,21 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
1322 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); 1301 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
1323 RESERVE_SPACE(4); 1302 RESERVE_SPACE(4);
1324 WRITE32(setclientid->sc_cb_ident); 1303 WRITE32(setclientid->sc_cb_ident);
1325 1304 hdr->nops++;
1326 return 0;
1327} 1305}
1328 1306
1329static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state) 1307static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
1330{ 1308{
1331 __be32 *p; 1309 __be32 *p;
1332
1333 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
1334 WRITE32(OP_SETCLIENTID_CONFIRM);
1335 WRITE64(client_state->cl_clientid);
1336 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1337 1310
1338 return 0; 1311 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
1312 WRITE32(OP_SETCLIENTID_CONFIRM);
1313 WRITE64(client_state->cl_clientid);
1314 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1315 hdr->nops++;
1339} 1316}
1340 1317
1341static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args) 1318static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
1342{ 1319{
1343 __be32 *p; 1320 __be32 *p;
1344 1321
@@ -1353,11 +1330,10 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args
1353 WRITE32(args->count); 1330 WRITE32(args->count);
1354 1331
1355 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1332 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1356 1333 hdr->nops++;
1357 return 0;
1358} 1334}
1359 1335
1360static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid) 1336static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
1361{ 1337{
1362 __be32 *p; 1338 __be32 *p;
1363 1339
@@ -1365,8 +1341,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
1365 1341
1366 WRITE32(OP_DELEGRETURN); 1342 WRITE32(OP_DELEGRETURN);
1367 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1343 WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
1368 return 0; 1344 hdr->nops++;
1369
1370} 1345}
1371/* 1346/*
1372 * END OF "GENERIC" ENCODE ROUTINES. 1347 * END OF "GENERIC" ENCODE ROUTINES.
@@ -1379,21 +1354,16 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
1379{ 1354{
1380 struct xdr_stream xdr; 1355 struct xdr_stream xdr;
1381 struct compound_hdr hdr = { 1356 struct compound_hdr hdr = {
1382 .nops = 3, 1357 .nops = 0,
1383 }; 1358 };
1384 int status;
1385 1359
1386 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1360 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1387 encode_compound_hdr(&xdr, &hdr); 1361 encode_compound_hdr(&xdr, &hdr);
1388 status = encode_putfh(&xdr, args->fh); 1362 encode_putfh(&xdr, args->fh, &hdr);
1389 if (status != 0) 1363 encode_access(&xdr, args->access, &hdr);
1390 goto out; 1364 encode_getfattr(&xdr, args->bitmask, &hdr);
1391 status = encode_access(&xdr, args->access); 1365 encode_nops(&hdr);
1392 if (status != 0) 1366 return 0;
1393 goto out;
1394 status = encode_getfattr(&xdr, args->bitmask);
1395out:
1396 return status;
1397} 1367}
1398 1368
1399/* 1369/*
@@ -1403,21 +1373,17 @@ static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs
1403{ 1373{
1404 struct xdr_stream xdr; 1374 struct xdr_stream xdr;
1405 struct compound_hdr hdr = { 1375 struct compound_hdr hdr = {
1406 .nops = 4, 1376 .nops = 0,
1407 }; 1377 };
1408 int status;
1409 1378
1410 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1379 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1411 encode_compound_hdr(&xdr, &hdr); 1380 encode_compound_hdr(&xdr, &hdr);
1412 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1381 encode_putfh(&xdr, args->dir_fh, &hdr);
1413 goto out; 1382 encode_lookup(&xdr, args->name, &hdr);
1414 if ((status = encode_lookup(&xdr, args->name)) != 0) 1383 encode_getfh(&xdr, &hdr);
1415 goto out; 1384 encode_getfattr(&xdr, args->bitmask, &hdr);
1416 if ((status = encode_getfh(&xdr)) != 0) 1385 encode_nops(&hdr);
1417 goto out; 1386 return 0;
1418 status = encode_getfattr(&xdr, args->bitmask);
1419out:
1420 return status;
1421} 1387}
1422 1388
1423/* 1389/*
@@ -1427,18 +1393,16 @@ static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struc
1427{ 1393{
1428 struct xdr_stream xdr; 1394 struct xdr_stream xdr;
1429 struct compound_hdr hdr = { 1395 struct compound_hdr hdr = {
1430 .nops = 3, 1396 .nops = 0,
1431 }; 1397 };
1432 int status;
1433 1398
1434 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1399 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1435 encode_compound_hdr(&xdr, &hdr); 1400 encode_compound_hdr(&xdr, &hdr);
1436 if ((status = encode_putrootfh(&xdr)) != 0) 1401 encode_putrootfh(&xdr, &hdr);
1437 goto out; 1402 encode_getfh(&xdr, &hdr);
1438 if ((status = encode_getfh(&xdr)) == 0) 1403 encode_getfattr(&xdr, args->bitmask, &hdr);
1439 status = encode_getfattr(&xdr, args->bitmask); 1404 encode_nops(&hdr);
1440out: 1405 return 0;
1441 return status;
1442} 1406}
1443 1407
1444/* 1408/*
@@ -1448,19 +1412,16 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
1448{ 1412{
1449 struct xdr_stream xdr; 1413 struct xdr_stream xdr;
1450 struct compound_hdr hdr = { 1414 struct compound_hdr hdr = {
1451 .nops = 3, 1415 .nops = 0,
1452 }; 1416 };
1453 int status;
1454 1417
1455 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1418 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1456 encode_compound_hdr(&xdr, &hdr); 1419 encode_compound_hdr(&xdr, &hdr);
1457 if ((status = encode_putfh(&xdr, args->fh)) != 0) 1420 encode_putfh(&xdr, args->fh, &hdr);
1458 goto out; 1421 encode_remove(&xdr, &args->name, &hdr);
1459 if ((status = encode_remove(&xdr, &args->name)) != 0) 1422 encode_getfattr(&xdr, args->bitmask, &hdr);
1460 goto out; 1423 encode_nops(&hdr);
1461 status = encode_getfattr(&xdr, args->bitmask); 1424 return 0;
1462out:
1463 return status;
1464} 1425}
1465 1426
1466/* 1427/*
@@ -1470,27 +1431,20 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs
1470{ 1431{
1471 struct xdr_stream xdr; 1432 struct xdr_stream xdr;
1472 struct compound_hdr hdr = { 1433 struct compound_hdr hdr = {
1473 .nops = 7, 1434 .nops = 0,
1474 }; 1435 };
1475 int status;
1476 1436
1477 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1437 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1478 encode_compound_hdr(&xdr, &hdr); 1438 encode_compound_hdr(&xdr, &hdr);
1479 if ((status = encode_putfh(&xdr, args->old_dir)) != 0) 1439 encode_putfh(&xdr, args->old_dir, &hdr);
1480 goto out; 1440 encode_savefh(&xdr, &hdr);
1481 if ((status = encode_savefh(&xdr)) != 0) 1441 encode_putfh(&xdr, args->new_dir, &hdr);
1482 goto out; 1442 encode_rename(&xdr, args->old_name, args->new_name, &hdr);
1483 if ((status = encode_putfh(&xdr, args->new_dir)) != 0) 1443 encode_getfattr(&xdr, args->bitmask, &hdr);
1484 goto out; 1444 encode_restorefh(&xdr, &hdr);
1485 if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0) 1445 encode_getfattr(&xdr, args->bitmask, &hdr);
1486 goto out; 1446 encode_nops(&hdr);
1487 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1447 return 0;
1488 goto out;
1489 if ((status = encode_restorefh(&xdr)) != 0)
1490 goto out;
1491 status = encode_getfattr(&xdr, args->bitmask);
1492out:
1493 return status;
1494} 1448}
1495 1449
1496/* 1450/*
@@ -1500,27 +1454,20 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_
1500{ 1454{
1501 struct xdr_stream xdr; 1455 struct xdr_stream xdr;
1502 struct compound_hdr hdr = { 1456 struct compound_hdr hdr = {
1503 .nops = 7, 1457 .nops = 0,
1504 }; 1458 };
1505 int status;
1506 1459
1507 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1460 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1508 encode_compound_hdr(&xdr, &hdr); 1461 encode_compound_hdr(&xdr, &hdr);
1509 if ((status = encode_putfh(&xdr, args->fh)) != 0) 1462 encode_putfh(&xdr, args->fh, &hdr);
1510 goto out; 1463 encode_savefh(&xdr, &hdr);
1511 if ((status = encode_savefh(&xdr)) != 0) 1464 encode_putfh(&xdr, args->dir_fh, &hdr);
1512 goto out; 1465 encode_link(&xdr, args->name, &hdr);
1513 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1466 encode_getfattr(&xdr, args->bitmask, &hdr);
1514 goto out; 1467 encode_restorefh(&xdr, &hdr);
1515 if ((status = encode_link(&xdr, args->name)) != 0) 1468 encode_getfattr(&xdr, args->bitmask, &hdr);
1516 goto out; 1469 encode_nops(&hdr);
1517 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1470 return 0;
1518 goto out;
1519 if ((status = encode_restorefh(&xdr)) != 0)
1520 goto out;
1521 status = encode_getfattr(&xdr, args->bitmask);
1522out:
1523 return status;
1524} 1471}
1525 1472
1526/* 1473/*
@@ -1530,27 +1477,20 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs
1530{ 1477{
1531 struct xdr_stream xdr; 1478 struct xdr_stream xdr;
1532 struct compound_hdr hdr = { 1479 struct compound_hdr hdr = {
1533 .nops = 7, 1480 .nops = 0,
1534 }; 1481 };
1535 int status;
1536 1482
1537 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1483 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1538 encode_compound_hdr(&xdr, &hdr); 1484 encode_compound_hdr(&xdr, &hdr);
1539 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1485 encode_putfh(&xdr, args->dir_fh, &hdr);
1540 goto out; 1486 encode_savefh(&xdr, &hdr);
1541 if ((status = encode_savefh(&xdr)) != 0) 1487 encode_create(&xdr, args, &hdr);
1542 goto out; 1488 encode_getfh(&xdr, &hdr);
1543 if ((status = encode_create(&xdr, args)) != 0) 1489 encode_getfattr(&xdr, args->bitmask, &hdr);
1544 goto out; 1490 encode_restorefh(&xdr, &hdr);
1545 if ((status = encode_getfh(&xdr)) != 0) 1491 encode_getfattr(&xdr, args->bitmask, &hdr);
1546 goto out; 1492 encode_nops(&hdr);
1547 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1493 return 0;
1548 goto out;
1549 if ((status = encode_restorefh(&xdr)) != 0)
1550 goto out;
1551 status = encode_getfattr(&xdr, args->bitmask);
1552out:
1553 return status;
1554} 1494}
1555 1495
1556/* 1496/*
@@ -1568,15 +1508,15 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1568{ 1508{
1569 struct xdr_stream xdr; 1509 struct xdr_stream xdr;
1570 struct compound_hdr hdr = { 1510 struct compound_hdr hdr = {
1571 .nops = 2, 1511 .nops = 0,
1572 }; 1512 };
1573 int status;
1574 1513
1575 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1514 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1576 encode_compound_hdr(&xdr, &hdr); 1515 encode_compound_hdr(&xdr, &hdr);
1577 if ((status = encode_putfh(&xdr, args->fh)) == 0) 1516 encode_putfh(&xdr, args->fh, &hdr);
1578 status = encode_getfattr(&xdr, args->bitmask); 1517 encode_getfattr(&xdr, args->bitmask, &hdr);
1579 return status; 1518 encode_nops(&hdr);
1519 return 0;
1580} 1520}
1581 1521
1582/* 1522/*
@@ -1584,23 +1524,18 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1584 */ 1524 */
1585static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 1525static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
1586{ 1526{
1587 struct xdr_stream xdr; 1527 struct xdr_stream xdr;
1588 struct compound_hdr hdr = { 1528 struct compound_hdr hdr = {
1589 .nops = 3, 1529 .nops = 0,
1590 }; 1530 };
1591 int status; 1531
1592 1532 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1593 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1533 encode_compound_hdr(&xdr, &hdr);
1594 encode_compound_hdr(&xdr, &hdr); 1534 encode_putfh(&xdr, args->fh, &hdr);
1595 status = encode_putfh(&xdr, args->fh); 1535 encode_close(&xdr, args, &hdr);
1596 if(status) 1536 encode_getfattr(&xdr, args->bitmask, &hdr);
1597 goto out; 1537 encode_nops(&hdr);
1598 status = encode_close(&xdr, args); 1538 return 0;
1599 if (status != 0)
1600 goto out;
1601 status = encode_getfattr(&xdr, args->bitmask);
1602out:
1603 return status;
1604} 1539}
1605 1540
1606/* 1541/*
@@ -1610,33 +1545,20 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openarg
1610{ 1545{
1611 struct xdr_stream xdr; 1546 struct xdr_stream xdr;
1612 struct compound_hdr hdr = { 1547 struct compound_hdr hdr = {
1613 .nops = 7, 1548 .nops = 0,
1614 }; 1549 };
1615 int status;
1616 1550
1617 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1551 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1618 encode_compound_hdr(&xdr, &hdr); 1552 encode_compound_hdr(&xdr, &hdr);
1619 status = encode_putfh(&xdr, args->fh); 1553 encode_putfh(&xdr, args->fh, &hdr);
1620 if (status) 1554 encode_savefh(&xdr, &hdr);
1621 goto out; 1555 encode_open(&xdr, args, &hdr);
1622 status = encode_savefh(&xdr); 1556 encode_getfh(&xdr, &hdr);
1623 if (status) 1557 encode_getfattr(&xdr, args->bitmask, &hdr);
1624 goto out; 1558 encode_restorefh(&xdr, &hdr);
1625 status = encode_open(&xdr, args); 1559 encode_getfattr(&xdr, args->bitmask, &hdr);
1626 if (status) 1560 encode_nops(&hdr);
1627 goto out; 1561 return 0;
1628 status = encode_getfh(&xdr);
1629 if (status)
1630 goto out;
1631 status = encode_getfattr(&xdr, args->bitmask);
1632 if (status)
1633 goto out;
1634 status = encode_restorefh(&xdr);
1635 if (status)
1636 goto out;
1637 status = encode_getfattr(&xdr, args->bitmask);
1638out:
1639 return status;
1640} 1562}
1641 1563
1642/* 1564/*
@@ -1646,18 +1568,15 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs
1646{ 1568{
1647 struct xdr_stream xdr; 1569 struct xdr_stream xdr;
1648 struct compound_hdr hdr = { 1570 struct compound_hdr hdr = {
1649 .nops = 2, 1571 .nops = 0,
1650 }; 1572 };
1651 int status;
1652 1573
1653 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1574 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1654 encode_compound_hdr(&xdr, &hdr); 1575 encode_compound_hdr(&xdr, &hdr);
1655 status = encode_putfh(&xdr, args->fh); 1576 encode_putfh(&xdr, args->fh, &hdr);
1656 if(status) 1577 encode_open_confirm(&xdr, args, &hdr);
1657 goto out; 1578 encode_nops(&hdr);
1658 status = encode_open_confirm(&xdr, args); 1579 return 0;
1659out:
1660 return status;
1661} 1580}
1662 1581
1663/* 1582/*
@@ -1667,21 +1586,16 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_
1667{ 1586{
1668 struct xdr_stream xdr; 1587 struct xdr_stream xdr;
1669 struct compound_hdr hdr = { 1588 struct compound_hdr hdr = {
1670 .nops = 3, 1589 .nops = 0,
1671 }; 1590 };
1672 int status;
1673 1591
1674 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1592 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1675 encode_compound_hdr(&xdr, &hdr); 1593 encode_compound_hdr(&xdr, &hdr);
1676 status = encode_putfh(&xdr, args->fh); 1594 encode_putfh(&xdr, args->fh, &hdr);
1677 if (status) 1595 encode_open(&xdr, args, &hdr);
1678 goto out; 1596 encode_getfattr(&xdr, args->bitmask, &hdr);
1679 status = encode_open(&xdr, args); 1597 encode_nops(&hdr);
1680 if (status) 1598 return 0;
1681 goto out;
1682 status = encode_getfattr(&xdr, args->bitmask);
1683out:
1684 return status;
1685} 1599}
1686 1600
1687/* 1601/*
@@ -1691,21 +1605,16 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct n
1691{ 1605{
1692 struct xdr_stream xdr; 1606 struct xdr_stream xdr;
1693 struct compound_hdr hdr = { 1607 struct compound_hdr hdr = {
1694 .nops = 3, 1608 .nops = 0,
1695 }; 1609 };
1696 int status;
1697 1610
1698 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1611 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1699 encode_compound_hdr(&xdr, &hdr); 1612 encode_compound_hdr(&xdr, &hdr);
1700 status = encode_putfh(&xdr, args->fh); 1613 encode_putfh(&xdr, args->fh, &hdr);
1701 if (status) 1614 encode_open_downgrade(&xdr, args, &hdr);
1702 goto out; 1615 encode_getfattr(&xdr, args->bitmask, &hdr);
1703 status = encode_open_downgrade(&xdr, args); 1616 encode_nops(&hdr);
1704 if (status != 0) 1617 return 0;
1705 goto out;
1706 status = encode_getfattr(&xdr, args->bitmask);
1707out:
1708 return status;
1709} 1618}
1710 1619
1711/* 1620/*
@@ -1715,18 +1624,15 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_ar
1715{ 1624{
1716 struct xdr_stream xdr; 1625 struct xdr_stream xdr;
1717 struct compound_hdr hdr = { 1626 struct compound_hdr hdr = {
1718 .nops = 2, 1627 .nops = 0,
1719 }; 1628 };
1720 int status;
1721 1629
1722 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1630 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1723 encode_compound_hdr(&xdr, &hdr); 1631 encode_compound_hdr(&xdr, &hdr);
1724 status = encode_putfh(&xdr, args->fh); 1632 encode_putfh(&xdr, args->fh, &hdr);
1725 if(status) 1633 encode_lock(&xdr, args, &hdr);
1726 goto out; 1634 encode_nops(&hdr);
1727 status = encode_lock(&xdr, args); 1635 return 0;
1728out:
1729 return status;
1730} 1636}
1731 1637
1732/* 1638/*
@@ -1736,18 +1642,15 @@ static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_
1736{ 1642{
1737 struct xdr_stream xdr; 1643 struct xdr_stream xdr;
1738 struct compound_hdr hdr = { 1644 struct compound_hdr hdr = {
1739 .nops = 2, 1645 .nops = 0,
1740 }; 1646 };
1741 int status;
1742 1647
1743 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1648 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1744 encode_compound_hdr(&xdr, &hdr); 1649 encode_compound_hdr(&xdr, &hdr);
1745 status = encode_putfh(&xdr, args->fh); 1650 encode_putfh(&xdr, args->fh, &hdr);
1746 if(status) 1651 encode_lockt(&xdr, args, &hdr);
1747 goto out; 1652 encode_nops(&hdr);
1748 status = encode_lockt(&xdr, args); 1653 return 0;
1749out:
1750 return status;
1751} 1654}
1752 1655
1753/* 1656/*
@@ -1757,18 +1660,15 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
1757{ 1660{
1758 struct xdr_stream xdr; 1661 struct xdr_stream xdr;
1759 struct compound_hdr hdr = { 1662 struct compound_hdr hdr = {
1760 .nops = 2, 1663 .nops = 0,
1761 }; 1664 };
1762 int status;
1763 1665
1764 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1666 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1765 encode_compound_hdr(&xdr, &hdr); 1667 encode_compound_hdr(&xdr, &hdr);
1766 status = encode_putfh(&xdr, args->fh); 1668 encode_putfh(&xdr, args->fh, &hdr);
1767 if(status) 1669 encode_locku(&xdr, args, &hdr);
1768 goto out; 1670 encode_nops(&hdr);
1769 status = encode_locku(&xdr, args); 1671 return 0;
1770out:
1771 return status;
1772} 1672}
1773 1673
1774/* 1674/*
@@ -1778,18 +1678,15 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1778{ 1678{
1779 struct xdr_stream xdr; 1679 struct xdr_stream xdr;
1780 struct compound_hdr hdr = { 1680 struct compound_hdr hdr = {
1781 .nops = 2, 1681 .nops = 0,
1782 }; 1682 };
1783 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1683 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1784 unsigned int replen; 1684 unsigned int replen;
1785 int status;
1786 1685
1787 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1686 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1788 encode_compound_hdr(&xdr, &hdr); 1687 encode_compound_hdr(&xdr, &hdr);
1789 status = encode_putfh(&xdr, args->fh); 1688 encode_putfh(&xdr, args->fh, &hdr);
1790 if(status) 1689 encode_readlink(&xdr, args, req, &hdr);
1791 goto out;
1792 status = encode_readlink(&xdr, args, req);
1793 1690
1794 /* set up reply kvec 1691 /* set up reply kvec
1795 * toplevel_status + taglen + rescount + OP_PUTFH + status 1692 * toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1798,9 +1695,8 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1798 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2; 1695 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2;
1799 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 1696 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
1800 args->pgbase, args->pglen); 1697 args->pgbase, args->pglen);
1801 1698 encode_nops(&hdr);
1802out: 1699 return 0;
1803 return status;
1804} 1700}
1805 1701
1806/* 1702/*
@@ -1810,18 +1706,15 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1810{ 1706{
1811 struct xdr_stream xdr; 1707 struct xdr_stream xdr;
1812 struct compound_hdr hdr = { 1708 struct compound_hdr hdr = {
1813 .nops = 2, 1709 .nops = 0,
1814 }; 1710 };
1815 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1711 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1816 int replen; 1712 int replen;
1817 int status;
1818 1713
1819 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1714 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1820 encode_compound_hdr(&xdr, &hdr); 1715 encode_compound_hdr(&xdr, &hdr);
1821 status = encode_putfh(&xdr, args->fh); 1716 encode_putfh(&xdr, args->fh, &hdr);
1822 if(status) 1717 encode_readdir(&xdr, args, req, &hdr);
1823 goto out;
1824 status = encode_readdir(&xdr, args, req);
1825 1718
1826 /* set up reply kvec 1719 /* set up reply kvec
1827 * toplevel_status + taglen + rescount + OP_PUTFH + status 1720 * toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1833,9 +1726,8 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1833 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", 1726 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
1834 __func__, replen, args->pages, 1727 __func__, replen, args->pages,
1835 args->pgbase, args->count); 1728 args->pgbase, args->count);
1836 1729 encode_nops(&hdr);
1837out: 1730 return 0;
1838 return status;
1839} 1731}
1840 1732
1841/* 1733/*
@@ -1846,18 +1738,14 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
1846 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1738 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1847 struct xdr_stream xdr; 1739 struct xdr_stream xdr;
1848 struct compound_hdr hdr = { 1740 struct compound_hdr hdr = {
1849 .nops = 2, 1741 .nops = 0,
1850 }; 1742 };
1851 int replen, status; 1743 int replen;
1852 1744
1853 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1745 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1854 encode_compound_hdr(&xdr, &hdr); 1746 encode_compound_hdr(&xdr, &hdr);
1855 status = encode_putfh(&xdr, args->fh); 1747 encode_putfh(&xdr, args->fh, &hdr);
1856 if (status) 1748 encode_read(&xdr, args, &hdr);
1857 goto out;
1858 status = encode_read(&xdr, args);
1859 if (status)
1860 goto out;
1861 1749
1862 /* set up reply kvec 1750 /* set up reply kvec
1863 * toplevel status + taglen=0 + rescount + OP_PUTFH + status 1751 * toplevel status + taglen=0 + rescount + OP_PUTFH + status
@@ -1867,33 +1755,27 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
1867 xdr_inline_pages(&req->rq_rcv_buf, replen, 1755 xdr_inline_pages(&req->rq_rcv_buf, replen,
1868 args->pages, args->pgbase, args->count); 1756 args->pages, args->pgbase, args->count);
1869 req->rq_rcv_buf.flags |= XDRBUF_READ; 1757 req->rq_rcv_buf.flags |= XDRBUF_READ;
1870out: 1758 encode_nops(&hdr);
1871 return status; 1759 return 0;
1872} 1760}
1873 1761
1874/* 1762/*
1875 * Encode an SETATTR request 1763 * Encode an SETATTR request
1876 */ 1764 */
1877static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) 1765static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
1878
1879{ 1766{
1880 struct xdr_stream xdr; 1767 struct xdr_stream xdr;
1881 struct compound_hdr hdr = { 1768 struct compound_hdr hdr = {
1882 .nops = 3, 1769 .nops = 0,
1883 }; 1770 };
1884 int status; 1771
1885 1772 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1886 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1773 encode_compound_hdr(&xdr, &hdr);
1887 encode_compound_hdr(&xdr, &hdr); 1774 encode_putfh(&xdr, args->fh, &hdr);
1888 status = encode_putfh(&xdr, args->fh); 1775 encode_setattr(&xdr, args, args->server, &hdr);
1889 if(status) 1776 encode_getfattr(&xdr, args->bitmask, &hdr);
1890 goto out; 1777 encode_nops(&hdr);
1891 status = encode_setattr(&xdr, args, args->server); 1778 return 0;
1892 if(status)
1893 goto out;
1894 status = encode_getfattr(&xdr, args->bitmask);
1895out:
1896 return status;
1897} 1779}
1898 1780
1899/* 1781/*
@@ -1906,22 +1788,21 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
1906 struct xdr_stream xdr; 1788 struct xdr_stream xdr;
1907 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1789 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1908 struct compound_hdr hdr = { 1790 struct compound_hdr hdr = {
1909 .nops = 2, 1791 .nops = 0,
1910 }; 1792 };
1911 int replen, status; 1793 int replen;
1912 1794
1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1795 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1914 encode_compound_hdr(&xdr, &hdr); 1796 encode_compound_hdr(&xdr, &hdr);
1915 status = encode_putfh(&xdr, args->fh); 1797 encode_putfh(&xdr, args->fh, &hdr);
1916 if (status) 1798 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
1917 goto out; 1799
1918 status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0);
1919 /* set up reply buffer: */ 1800 /* set up reply buffer: */
1920 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2; 1801 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
1921 xdr_inline_pages(&req->rq_rcv_buf, replen, 1802 xdr_inline_pages(&req->rq_rcv_buf, replen,
1922 args->acl_pages, args->acl_pgbase, args->acl_len); 1803 args->acl_pages, args->acl_pgbase, args->acl_len);
1923out: 1804 encode_nops(&hdr);
1924 return status; 1805 return 0;
1925} 1806}
1926 1807
1927/* 1808/*
@@ -1931,22 +1812,17 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
1931{ 1812{
1932 struct xdr_stream xdr; 1813 struct xdr_stream xdr;
1933 struct compound_hdr hdr = { 1814 struct compound_hdr hdr = {
1934 .nops = 3, 1815 .nops = 0,
1935 }; 1816 };
1936 int status;
1937 1817
1938 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1818 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1939 encode_compound_hdr(&xdr, &hdr); 1819 encode_compound_hdr(&xdr, &hdr);
1940 status = encode_putfh(&xdr, args->fh); 1820 encode_putfh(&xdr, args->fh, &hdr);
1941 if (status) 1821 encode_write(&xdr, args, &hdr);
1942 goto out;
1943 status = encode_write(&xdr, args);
1944 if (status)
1945 goto out;
1946 req->rq_snd_buf.flags |= XDRBUF_WRITE; 1822 req->rq_snd_buf.flags |= XDRBUF_WRITE;
1947 status = encode_getfattr(&xdr, args->bitmask); 1823 encode_getfattr(&xdr, args->bitmask, &hdr);
1948out: 1824 encode_nops(&hdr);
1949 return status; 1825 return 0;
1950} 1826}
1951 1827
1952/* 1828/*
@@ -1956,21 +1832,16 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_write
1956{ 1832{
1957 struct xdr_stream xdr; 1833 struct xdr_stream xdr;
1958 struct compound_hdr hdr = { 1834 struct compound_hdr hdr = {
1959 .nops = 3, 1835 .nops = 0,
1960 }; 1836 };
1961 int status;
1962 1837
1963 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1838 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1964 encode_compound_hdr(&xdr, &hdr); 1839 encode_compound_hdr(&xdr, &hdr);
1965 status = encode_putfh(&xdr, args->fh); 1840 encode_putfh(&xdr, args->fh, &hdr);
1966 if (status) 1841 encode_commit(&xdr, args, &hdr);
1967 goto out; 1842 encode_getfattr(&xdr, args->bitmask, &hdr);
1968 status = encode_commit(&xdr, args); 1843 encode_nops(&hdr);
1969 if (status) 1844 return 0;
1970 goto out;
1971 status = encode_getfattr(&xdr, args->bitmask);
1972out:
1973 return status;
1974} 1845}
1975 1846
1976/* 1847/*
@@ -1980,16 +1851,15 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsin
1980{ 1851{
1981 struct xdr_stream xdr; 1852 struct xdr_stream xdr;
1982 struct compound_hdr hdr = { 1853 struct compound_hdr hdr = {
1983 .nops = 2, 1854 .nops = 0,
1984 }; 1855 };
1985 int status;
1986 1856
1987 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1857 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1988 encode_compound_hdr(&xdr, &hdr); 1858 encode_compound_hdr(&xdr, &hdr);
1989 status = encode_putfh(&xdr, args->fh); 1859 encode_putfh(&xdr, args->fh, &hdr);
1990 if (!status) 1860 encode_fsinfo(&xdr, args->bitmask, &hdr);
1991 status = encode_fsinfo(&xdr, args->bitmask); 1861 encode_nops(&hdr);
1992 return status; 1862 return 0;
1993} 1863}
1994 1864
1995/* 1865/*
@@ -1999,17 +1869,16 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct n
1999{ 1869{
2000 struct xdr_stream xdr; 1870 struct xdr_stream xdr;
2001 struct compound_hdr hdr = { 1871 struct compound_hdr hdr = {
2002 .nops = 2, 1872 .nops = 0,
2003 }; 1873 };
2004 int status;
2005 1874
2006 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1875 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2007 encode_compound_hdr(&xdr, &hdr); 1876 encode_compound_hdr(&xdr, &hdr);
2008 status = encode_putfh(&xdr, args->fh); 1877 encode_putfh(&xdr, args->fh, &hdr);
2009 if (!status) 1878 encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2010 status = encode_getattr_one(&xdr, 1879 &hdr);
2011 args->bitmask[0] & nfs4_pathconf_bitmap[0]); 1880 encode_nops(&hdr);
2012 return status; 1881 return 0;
2013} 1882}
2014 1883
2015/* 1884/*
@@ -2019,18 +1888,16 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
2019{ 1888{
2020 struct xdr_stream xdr; 1889 struct xdr_stream xdr;
2021 struct compound_hdr hdr = { 1890 struct compound_hdr hdr = {
2022 .nops = 2, 1891 .nops = 0,
2023 }; 1892 };
2024 int status;
2025 1893
2026 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1894 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2027 encode_compound_hdr(&xdr, &hdr); 1895 encode_compound_hdr(&xdr, &hdr);
2028 status = encode_putfh(&xdr, args->fh); 1896 encode_putfh(&xdr, args->fh, &hdr);
2029 if (status == 0) 1897 encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2030 status = encode_getattr_two(&xdr, 1898 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
2031 args->bitmask[0] & nfs4_statfs_bitmap[0], 1899 encode_nops(&hdr);
2032 args->bitmask[1] & nfs4_statfs_bitmap[1]); 1900 return 0;
2033 return status;
2034} 1901}
2035 1902
2036/* 1903/*
@@ -2040,19 +1907,18 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struc
2040{ 1907{
2041 struct xdr_stream xdr; 1908 struct xdr_stream xdr;
2042 struct compound_hdr hdr = { 1909 struct compound_hdr hdr = {
2043 .nops = 2, 1910 .nops = 0,
2044 }; 1911 };
2045 int status;
2046 1912
2047 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2048 encode_compound_hdr(&xdr, &hdr); 1914 encode_compound_hdr(&xdr, &hdr);
2049 status = encode_putfh(&xdr, fhandle); 1915 encode_putfh(&xdr, fhandle, &hdr);
2050 if (status == 0) 1916 encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2051 status = encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| 1917 FATTR4_WORD0_LINK_SUPPORT|
2052 FATTR4_WORD0_LINK_SUPPORT| 1918 FATTR4_WORD0_SYMLINK_SUPPORT|
2053 FATTR4_WORD0_SYMLINK_SUPPORT| 1919 FATTR4_WORD0_ACLSUPPORT, &hdr);
2054 FATTR4_WORD0_ACLSUPPORT); 1920 encode_nops(&hdr);
2055 return status; 1921 return 0;
2056} 1922}
2057 1923
2058/* 1924/*
@@ -2062,12 +1928,14 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client
2062{ 1928{
2063 struct xdr_stream xdr; 1929 struct xdr_stream xdr;
2064 struct compound_hdr hdr = { 1930 struct compound_hdr hdr = {
2065 .nops = 1, 1931 .nops = 0,
2066 }; 1932 };
2067 1933
2068 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1934 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2069 encode_compound_hdr(&xdr, &hdr); 1935 encode_compound_hdr(&xdr, &hdr);
2070 return encode_renew(&xdr, clp); 1936 encode_renew(&xdr, clp, &hdr);
1937 encode_nops(&hdr);
1938 return 0;
2071} 1939}
2072 1940
2073/* 1941/*
@@ -2077,12 +1945,14 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
2077{ 1945{
2078 struct xdr_stream xdr; 1946 struct xdr_stream xdr;
2079 struct compound_hdr hdr = { 1947 struct compound_hdr hdr = {
2080 .nops = 1, 1948 .nops = 0,
2081 }; 1949 };
2082 1950
2083 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1951 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2084 encode_compound_hdr(&xdr, &hdr); 1952 encode_compound_hdr(&xdr, &hdr);
2085 return encode_setclientid(&xdr, sc); 1953 encode_setclientid(&xdr, sc, &hdr);
1954 encode_nops(&hdr);
1955 return 0;
2086} 1956}
2087 1957
2088/* 1958/*
@@ -2092,19 +1962,17 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
2092{ 1962{
2093 struct xdr_stream xdr; 1963 struct xdr_stream xdr;
2094 struct compound_hdr hdr = { 1964 struct compound_hdr hdr = {
2095 .nops = 3, 1965 .nops = 0,
2096 }; 1966 };
2097 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 1967 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2098 int status;
2099 1968
2100 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1969 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2101 encode_compound_hdr(&xdr, &hdr); 1970 encode_compound_hdr(&xdr, &hdr);
2102 status = encode_setclientid_confirm(&xdr, clp); 1971 encode_setclientid_confirm(&xdr, clp, &hdr);
2103 if (!status) 1972 encode_putrootfh(&xdr, &hdr);
2104 status = encode_putrootfh(&xdr); 1973 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2105 if (!status) 1974 encode_nops(&hdr);
2106 status = encode_fsinfo(&xdr, lease_bitmap); 1975 return 0;
2107 return status;
2108} 1976}
2109 1977
2110/* 1978/*
@@ -2114,21 +1982,16 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struc
2114{ 1982{
2115 struct xdr_stream xdr; 1983 struct xdr_stream xdr;
2116 struct compound_hdr hdr = { 1984 struct compound_hdr hdr = {
2117 .nops = 3, 1985 .nops = 0,
2118 }; 1986 };
2119 int status;
2120 1987
2121 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1988 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2122 encode_compound_hdr(&xdr, &hdr); 1989 encode_compound_hdr(&xdr, &hdr);
2123 status = encode_putfh(&xdr, args->fhandle); 1990 encode_putfh(&xdr, args->fhandle, &hdr);
2124 if (status != 0) 1991 encode_delegreturn(&xdr, args->stateid, &hdr);
2125 goto out; 1992 encode_getfattr(&xdr, args->bitmask, &hdr);
2126 status = encode_delegreturn(&xdr, args->stateid); 1993 encode_nops(&hdr);
2127 if (status != 0) 1994 return 0;
2128 goto out;
2129 status = encode_getfattr(&xdr, args->bitmask);
2130out:
2131 return status;
2132} 1995}
2133 1996
2134/* 1997/*
@@ -2138,20 +2001,17 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
2138{ 2001{
2139 struct xdr_stream xdr; 2002 struct xdr_stream xdr;
2140 struct compound_hdr hdr = { 2003 struct compound_hdr hdr = {
2141 .nops = 3, 2004 .nops = 0,
2142 }; 2005 };
2143 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 2006 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
2144 int replen; 2007 int replen;
2145 int status;
2146 2008
2147 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2009 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2148 encode_compound_hdr(&xdr, &hdr); 2010 encode_compound_hdr(&xdr, &hdr);
2149 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 2011 encode_putfh(&xdr, args->dir_fh, &hdr);
2150 goto out; 2012 encode_lookup(&xdr, args->name, &hdr);
2151 if ((status = encode_lookup(&xdr, args->name)) != 0) 2013 encode_fs_locations(&xdr, args->bitmask, &hdr);
2152 goto out; 2014
2153 if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0)
2154 goto out;
2155 /* set up reply 2015 /* set up reply
2156 * toplevel_status + OP_PUTFH + status 2016 * toplevel_status + OP_PUTFH + status
2157 * + OP_LOOKUP + status + OP_GETATTR + status = 7 2017 * + OP_LOOKUP + status + OP_GETATTR + status = 7
@@ -2159,8 +2019,8 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
2159 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2; 2019 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
2160 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page, 2020 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
2161 0, PAGE_SIZE); 2021 0, PAGE_SIZE);
2162out: 2022 encode_nops(&hdr);
2163 return status; 2023 return 0;
2164} 2024}
2165 2025
2166/* 2026/*
@@ -2217,11 +2077,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2217 READ_BUF(8); 2077 READ_BUF(8);
2218 READ32(hdr->status); 2078 READ32(hdr->status);
2219 READ32(hdr->taglen); 2079 READ32(hdr->taglen);
2220 2080
2221 READ_BUF(hdr->taglen + 4); 2081 READ_BUF(hdr->taglen + 4);
2222 hdr->tag = (char *)p; 2082 hdr->tag = (char *)p;
2223 p += XDR_QUADLEN(hdr->taglen); 2083 p += XDR_QUADLEN(hdr->taglen);
2224 READ32(hdr->nops); 2084 READ32(hdr->nops);
2085 if (unlikely(hdr->nops < 1))
2086 return nfs4_stat_to_errno(hdr->status);
2225 return 0; 2087 return 0;
2226} 2088}
2227 2089
@@ -3047,8 +2909,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3047static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 2909static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
3048{ 2910{
3049 __be32 *savep; 2911 __be32 *savep;
3050 uint32_t attrlen, 2912 uint32_t attrlen, bitmap[2] = {0};
3051 bitmap[2] = {0};
3052 int status; 2913 int status;
3053 2914
3054 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2915 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -3070,14 +2931,13 @@ xdr_error:
3070 dprintk("%s: xdr returned %d!\n", __func__, -status); 2931 dprintk("%s: xdr returned %d!\n", __func__, -status);
3071 return status; 2932 return status;
3072} 2933}
3073 2934
3074static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) 2935static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
3075{ 2936{
3076 __be32 *savep; 2937 __be32 *savep;
3077 uint32_t attrlen, 2938 uint32_t attrlen, bitmap[2] = {0};
3078 bitmap[2] = {0};
3079 int status; 2939 int status;
3080 2940
3081 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2941 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
3082 goto xdr_error; 2942 goto xdr_error;
3083 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 2943 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3107,10 +2967,9 @@ xdr_error:
3107static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) 2967static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
3108{ 2968{
3109 __be32 *savep; 2969 __be32 *savep;
3110 uint32_t attrlen, 2970 uint32_t attrlen, bitmap[2] = {0};
3111 bitmap[2] = {0};
3112 int status; 2971 int status;
3113 2972
3114 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2973 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
3115 goto xdr_error; 2974 goto xdr_error;
3116 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 2975 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3256,7 +3115,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
3256static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 3115static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3257{ 3116{
3258 int status; 3117 int status;
3259 3118
3260 status = decode_op_hdr(xdr, OP_LINK); 3119 status = decode_op_hdr(xdr, OP_LINK);
3261 if (status) 3120 if (status)
3262 return status; 3121 return status;
@@ -3344,27 +3203,27 @@ static int decode_lookup(struct xdr_stream *xdr)
3344/* This is too sick! */ 3203/* This is too sick! */
3345static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) 3204static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
3346{ 3205{
3347 __be32 *p; 3206 __be32 *p;
3348 uint32_t limit_type, nblocks, blocksize; 3207 uint32_t limit_type, nblocks, blocksize;
3349 3208
3350 READ_BUF(12); 3209 READ_BUF(12);
3351 READ32(limit_type); 3210 READ32(limit_type);
3352 switch (limit_type) { 3211 switch (limit_type) {
3353 case 1: 3212 case 1:
3354 READ64(*maxsize); 3213 READ64(*maxsize);
3355 break; 3214 break;
3356 case 2: 3215 case 2:
3357 READ32(nblocks); 3216 READ32(nblocks);
3358 READ32(blocksize); 3217 READ32(blocksize);
3359 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; 3218 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
3360 } 3219 }
3361 return 0; 3220 return 0;
3362} 3221}
3363 3222
3364static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) 3223static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3365{ 3224{
3366 __be32 *p; 3225 __be32 *p;
3367 uint32_t delegation_type; 3226 uint32_t delegation_type;
3368 3227
3369 READ_BUF(4); 3228 READ_BUF(4);
3370 READ32(delegation_type); 3229 READ32(delegation_type);
@@ -3375,13 +3234,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3375 READ_BUF(NFS4_STATEID_SIZE+4); 3234 READ_BUF(NFS4_STATEID_SIZE+4);
3376 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); 3235 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
3377 READ32(res->do_recall); 3236 READ32(res->do_recall);
3237
3378 switch (delegation_type) { 3238 switch (delegation_type) {
3379 case NFS4_OPEN_DELEGATE_READ: 3239 case NFS4_OPEN_DELEGATE_READ:
3380 res->delegation_type = FMODE_READ; 3240 res->delegation_type = FMODE_READ;
3381 break; 3241 break;
3382 case NFS4_OPEN_DELEGATE_WRITE: 3242 case NFS4_OPEN_DELEGATE_WRITE:
3383 res->delegation_type = FMODE_WRITE|FMODE_READ; 3243 res->delegation_type = FMODE_WRITE|FMODE_READ;
3384 if (decode_space_limit(xdr, &res->maxsize) < 0) 3244 if (decode_space_limit(xdr, &res->maxsize) < 0)
3385 return -EIO; 3245 return -EIO;
3386 } 3246 }
3387 return decode_ace(xdr, NULL, res->server->nfs_client); 3247 return decode_ace(xdr, NULL, res->server->nfs_client);
@@ -3389,27 +3249,27 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3389 3249
3390static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 3250static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3391{ 3251{
3392 __be32 *p; 3252 __be32 *p;
3393 uint32_t savewords, bmlen, i; 3253 uint32_t savewords, bmlen, i;
3394 int status; 3254 int status;
3395 3255
3396 status = decode_op_hdr(xdr, OP_OPEN); 3256 status = decode_op_hdr(xdr, OP_OPEN);
3397 if (status != -EIO) 3257 if (status != -EIO)
3398 nfs_increment_open_seqid(status, res->seqid); 3258 nfs_increment_open_seqid(status, res->seqid);
3399 if (status) 3259 if (status)
3400 return status; 3260 return status;
3401 READ_BUF(NFS4_STATEID_SIZE); 3261 READ_BUF(NFS4_STATEID_SIZE);
3402 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3262 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3403 3263
3404 decode_change_info(xdr, &res->cinfo); 3264 decode_change_info(xdr, &res->cinfo);
3405 3265
3406 READ_BUF(8); 3266 READ_BUF(8);
3407 READ32(res->rflags); 3267 READ32(res->rflags);
3408 READ32(bmlen); 3268 READ32(bmlen);
3409 if (bmlen > 10) 3269 if (bmlen > 10)
3410 goto xdr_error; 3270 goto xdr_error;
3411 3271
3412 READ_BUF(bmlen << 2); 3272 READ_BUF(bmlen << 2);
3413 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); 3273 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
3414 for (i = 0; i < savewords; ++i) 3274 for (i = 0; i < savewords; ++i)
3415 READ32(res->attrset[i]); 3275 READ32(res->attrset[i]);
@@ -3424,17 +3284,17 @@ xdr_error:
3424 3284
3425static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) 3285static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
3426{ 3286{
3427 __be32 *p; 3287 __be32 *p;
3428 int status; 3288 int status;
3429 3289
3430 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); 3290 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
3431 if (status != -EIO) 3291 if (status != -EIO)
3432 nfs_increment_open_seqid(status, res->seqid); 3292 nfs_increment_open_seqid(status, res->seqid);
3433 if (status) 3293 if (status)
3434 return status; 3294 return status;
3435 READ_BUF(NFS4_STATEID_SIZE); 3295 READ_BUF(NFS4_STATEID_SIZE);
3436 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3296 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3437 return 0; 3297 return 0;
3438} 3298}
3439 3299
3440static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) 3300static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
@@ -3562,7 +3422,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3562 dprintk("NFS: readdir reply truncated!\n"); 3422 dprintk("NFS: readdir reply truncated!\n");
3563 entry[1] = 1; 3423 entry[1] = 1;
3564 } 3424 }
3565out: 3425out:
3566 kunmap_atomic(kaddr, KM_USER0); 3426 kunmap_atomic(kaddr, KM_USER0);
3567 return 0; 3427 return 0;
3568short_pkt: 3428short_pkt:
@@ -3718,7 +3578,6 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
3718 uint32_t bmlen; 3578 uint32_t bmlen;
3719 int status; 3579 int status;
3720 3580
3721
3722 status = decode_op_hdr(xdr, OP_SETATTR); 3581 status = decode_op_hdr(xdr, OP_SETATTR);
3723 if (status) 3582 if (status)
3724 return status; 3583 return status;
@@ -3738,7 +3597,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
3738 READ32(opnum); 3597 READ32(opnum);
3739 if (opnum != OP_SETCLIENTID) { 3598 if (opnum != OP_SETCLIENTID) {
3740 dprintk("nfs: decode_setclientid: Server returned operation" 3599 dprintk("nfs: decode_setclientid: Server returned operation"
3741 " %d\n", opnum); 3600 " %d\n", opnum);
3742 return -EIO; 3601 return -EIO;
3743 } 3602 }
3744 READ32(nfserr); 3603 READ32(nfserr);
@@ -3792,34 +3651,34 @@ static int decode_delegreturn(struct xdr_stream *xdr)
3792} 3651}
3793 3652
3794/* 3653/*
3654 * END OF "GENERIC" DECODE ROUTINES.
3655 */
3656
3657/*
3795 * Decode OPEN_DOWNGRADE response 3658 * Decode OPEN_DOWNGRADE response
3796 */ 3659 */
3797static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 3660static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
3798{ 3661{
3799 struct xdr_stream xdr; 3662 struct xdr_stream xdr;
3800 struct compound_hdr hdr; 3663 struct compound_hdr hdr;
3801 int status; 3664 int status;
3802 3665
3803 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3666 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3804 status = decode_compound_hdr(&xdr, &hdr); 3667 status = decode_compound_hdr(&xdr, &hdr);
3805 if (status) 3668 if (status)
3806 goto out; 3669 goto out;
3807 status = decode_putfh(&xdr); 3670 status = decode_putfh(&xdr);
3808 if (status) 3671 if (status)
3809 goto out; 3672 goto out;
3810 status = decode_open_downgrade(&xdr, res); 3673 status = decode_open_downgrade(&xdr, res);
3811 if (status != 0) 3674 if (status != 0)
3812 goto out; 3675 goto out;
3813 decode_getfattr(&xdr, res->fattr, res->server); 3676 decode_getfattr(&xdr, res->fattr, res->server);
3814out: 3677out:
3815 return status; 3678 return status;
3816} 3679}
3817 3680
3818/* 3681/*
3819 * END OF "GENERIC" DECODE ROUTINES.
3820 */
3821
3822/*
3823 * Decode ACCESS response 3682 * Decode ACCESS response
3824 */ 3683 */
3825static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) 3684static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
@@ -3827,7 +3686,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
3827 struct xdr_stream xdr; 3686 struct xdr_stream xdr;
3828 struct compound_hdr hdr; 3687 struct compound_hdr hdr;
3829 int status; 3688 int status;
3830 3689
3831 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3690 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3832 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3691 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3833 goto out; 3692 goto out;
@@ -3850,7 +3709,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
3850 struct xdr_stream xdr; 3709 struct xdr_stream xdr;
3851 struct compound_hdr hdr; 3710 struct compound_hdr hdr;
3852 int status; 3711 int status;
3853 3712
3854 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3713 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3855 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3714 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3856 goto out; 3715 goto out;
@@ -3873,7 +3732,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
3873 struct xdr_stream xdr; 3732 struct xdr_stream xdr;
3874 struct compound_hdr hdr; 3733 struct compound_hdr hdr;
3875 int status; 3734 int status;
3876 3735
3877 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3736 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3878 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3737 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3879 goto out; 3738 goto out;
@@ -3893,7 +3752,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
3893 struct xdr_stream xdr; 3752 struct xdr_stream xdr;
3894 struct compound_hdr hdr; 3753 struct compound_hdr hdr;
3895 int status; 3754 int status;
3896 3755
3897 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3756 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3898 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3757 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3899 goto out; 3758 goto out;
@@ -3914,7 +3773,7 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
3914 struct xdr_stream xdr; 3773 struct xdr_stream xdr;
3915 struct compound_hdr hdr; 3774 struct compound_hdr hdr;
3916 int status; 3775 int status;
3917 3776
3918 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3777 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3919 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3778 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3920 goto out; 3779 goto out;
@@ -3944,7 +3803,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
3944 struct xdr_stream xdr; 3803 struct xdr_stream xdr;
3945 struct compound_hdr hdr; 3804 struct compound_hdr hdr;
3946 int status; 3805 int status;
3947 3806
3948 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3807 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3949 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3808 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3950 goto out; 3809 goto out;
@@ -3977,7 +3836,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
3977 struct xdr_stream xdr; 3836 struct xdr_stream xdr;
3978 struct compound_hdr hdr; 3837 struct compound_hdr hdr;
3979 int status; 3838 int status;
3980 3839
3981 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3840 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3982 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3841 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3983 goto out; 3842 goto out;
@@ -4014,7 +3873,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4014 struct xdr_stream xdr; 3873 struct xdr_stream xdr;
4015 struct compound_hdr hdr; 3874 struct compound_hdr hdr;
4016 int status; 3875 int status;
4017 3876
4018 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3877 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4019 status = decode_compound_hdr(&xdr, &hdr); 3878 status = decode_compound_hdr(&xdr, &hdr);
4020 if (status) 3879 if (status)
@@ -4025,7 +3884,6 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4025 status = decode_getfattr(&xdr, res->fattr, res->server); 3884 status = decode_getfattr(&xdr, res->fattr, res->server);
4026out: 3885out:
4027 return status; 3886 return status;
4028
4029} 3887}
4030 3888
4031/* 3889/*
@@ -4034,21 +3892,20 @@ out:
4034static int 3892static int
4035nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) 3893nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
4036{ 3894{
4037 struct xdr_stream xdr; 3895 struct xdr_stream xdr;
4038 struct compound_hdr hdr = { 3896 struct compound_hdr hdr = {
4039 .nops = 2, 3897 .nops = 0,
4040 }; 3898 };
4041 int status; 3899 int status;
4042 3900
4043 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 3901 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
4044 encode_compound_hdr(&xdr, &hdr); 3902 encode_compound_hdr(&xdr, &hdr);
4045 status = encode_putfh(&xdr, args->fh); 3903 encode_putfh(&xdr, args->fh, &hdr);
4046 if (status) 3904 status = encode_setacl(&xdr, args, &hdr);
4047 goto out; 3905 encode_nops(&hdr);
4048 status = encode_setacl(&xdr, args); 3906 return status;
4049out:
4050 return status;
4051} 3907}
3908
4052/* 3909/*
4053 * Decode SETACL response 3910 * Decode SETACL response
4054 */ 3911 */
@@ -4099,18 +3956,18 @@ out:
4099 */ 3956 */
4100static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 3957static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
4101{ 3958{
4102 struct xdr_stream xdr; 3959 struct xdr_stream xdr;
4103 struct compound_hdr hdr; 3960 struct compound_hdr hdr;
4104 int status; 3961 int status;
4105 3962
4106 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3963 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4107 status = decode_compound_hdr(&xdr, &hdr); 3964 status = decode_compound_hdr(&xdr, &hdr);
4108 if (status) 3965 if (status)
4109 goto out; 3966 goto out;
4110 status = decode_putfh(&xdr); 3967 status = decode_putfh(&xdr);
4111 if (status) 3968 if (status)
4112 goto out; 3969 goto out;
4113 status = decode_close(&xdr, res); 3970 status = decode_close(&xdr, res);
4114 if (status != 0) 3971 if (status != 0)
4115 goto out; 3972 goto out;
4116 /* 3973 /*
@@ -4121,7 +3978,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
4121 */ 3978 */
4122 decode_getfattr(&xdr, res->fattr, res->server); 3979 decode_getfattr(&xdr, res->fattr, res->server);
4123out: 3980out:
4124 return status; 3981 return status;
4125} 3982}
4126 3983
4127/* 3984/*
@@ -4129,23 +3986,23 @@ out:
4129 */ 3986 */
4130static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 3987static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
4131{ 3988{
4132 struct xdr_stream xdr; 3989 struct xdr_stream xdr;
4133 struct compound_hdr hdr; 3990 struct compound_hdr hdr;
4134 int status; 3991 int status;
4135 3992
4136 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3993 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4137 status = decode_compound_hdr(&xdr, &hdr); 3994 status = decode_compound_hdr(&xdr, &hdr);
4138 if (status) 3995 if (status)
4139 goto out; 3996 goto out;
4140 status = decode_putfh(&xdr); 3997 status = decode_putfh(&xdr);
4141 if (status) 3998 if (status)
4142 goto out; 3999 goto out;
4143 status = decode_savefh(&xdr); 4000 status = decode_savefh(&xdr);
4001 if (status)
4002 goto out;
4003 status = decode_open(&xdr, res);
4144 if (status) 4004 if (status)
4145 goto out; 4005 goto out;
4146 status = decode_open(&xdr, res);
4147 if (status)
4148 goto out;
4149 if (decode_getfh(&xdr, &res->fh) != 0) 4006 if (decode_getfh(&xdr, &res->fh) != 0)
4150 goto out; 4007 goto out;
4151 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) 4008 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
@@ -4154,7 +4011,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
4154 goto out; 4011 goto out;
4155 decode_getfattr(&xdr, res->dir_attr, res->server); 4012 decode_getfattr(&xdr, res->dir_attr, res->server);
4156out: 4013out:
4157 return status; 4014 return status;
4158} 4015}
4159 4016
4160/* 4017/*
@@ -4162,20 +4019,20 @@ out:
4162 */ 4019 */
4163static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) 4020static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
4164{ 4021{
4165 struct xdr_stream xdr; 4022 struct xdr_stream xdr;
4166 struct compound_hdr hdr; 4023 struct compound_hdr hdr;
4167 int status; 4024 int status;
4168 4025
4169 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4026 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4170 status = decode_compound_hdr(&xdr, &hdr); 4027 status = decode_compound_hdr(&xdr, &hdr);
4171 if (status) 4028 if (status)
4172 goto out; 4029 goto out;
4173 status = decode_putfh(&xdr); 4030 status = decode_putfh(&xdr);
4174 if (status) 4031 if (status)
4175 goto out; 4032 goto out;
4176 status = decode_open_confirm(&xdr, res); 4033 status = decode_open_confirm(&xdr, res);
4177out: 4034out:
4178 return status; 4035 return status;
4179} 4036}
4180 4037
4181/* 4038/*
@@ -4183,23 +4040,23 @@ out:
4183 */ 4040 */
4184static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 4041static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
4185{ 4042{
4186 struct xdr_stream xdr; 4043 struct xdr_stream xdr;
4187 struct compound_hdr hdr; 4044 struct compound_hdr hdr;
4188 int status; 4045 int status;
4189 4046
4190 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4047 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4191 status = decode_compound_hdr(&xdr, &hdr); 4048 status = decode_compound_hdr(&xdr, &hdr);
4192 if (status) 4049 if (status)
4193 goto out; 4050 goto out;
4194 status = decode_putfh(&xdr); 4051 status = decode_putfh(&xdr);
4195 if (status) 4052 if (status)
4196 goto out; 4053 goto out;
4197 status = decode_open(&xdr, res); 4054 status = decode_open(&xdr, res);
4198 if (status) 4055 if (status)
4199 goto out; 4056 goto out;
4200 decode_getfattr(&xdr, res->f_attr, res->server); 4057 decode_getfattr(&xdr, res->f_attr, res->server);
4201out: 4058out:
4202 return status; 4059 return status;
4203} 4060}
4204 4061
4205/* 4062/*
@@ -4207,25 +4064,25 @@ out:
4207 */ 4064 */
4208static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) 4065static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
4209{ 4066{
4210 struct xdr_stream xdr; 4067 struct xdr_stream xdr;
4211 struct compound_hdr hdr; 4068 struct compound_hdr hdr;
4212 int status; 4069 int status;
4213 4070
4214 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4071 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4215 status = decode_compound_hdr(&xdr, &hdr); 4072 status = decode_compound_hdr(&xdr, &hdr);
4216 if (status) 4073 if (status)
4217 goto out; 4074 goto out;
4218 status = decode_putfh(&xdr); 4075 status = decode_putfh(&xdr);
4219 if (status) 4076 if (status)
4220 goto out; 4077 goto out;
4221 status = decode_setattr(&xdr, res); 4078 status = decode_setattr(&xdr, res);
4222 if (status) 4079 if (status)
4223 goto out; 4080 goto out;
4224 status = decode_getfattr(&xdr, res->fattr, res->server); 4081 status = decode_getfattr(&xdr, res->fattr, res->server);
4225 if (status == NFS4ERR_DELAY) 4082 if (status == NFS4ERR_DELAY)
4226 status = 0; 4083 status = 0;
4227out: 4084out:
4228 return status; 4085 return status;
4229} 4086}
4230 4087
4231/* 4088/*
@@ -4421,8 +4278,6 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
4421 status = decode_putfh(&xdr); 4278 status = decode_putfh(&xdr);
4422 if (!status) 4279 if (!status)
4423 status = decode_fsinfo(&xdr, fsinfo); 4280 status = decode_fsinfo(&xdr, fsinfo);
4424 if (!status)
4425 status = nfs4_stat_to_errno(hdr.status);
4426 return status; 4281 return status;
4427} 4282}
4428 4283
@@ -4511,8 +4366,6 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
4511 status = decode_compound_hdr(&xdr, &hdr); 4366 status = decode_compound_hdr(&xdr, &hdr);
4512 if (!status) 4367 if (!status)
4513 status = decode_setclientid(&xdr, clp); 4368 status = decode_setclientid(&xdr, clp);
4514 if (!status)
4515 status = nfs4_stat_to_errno(hdr.status);
4516 return status; 4369 return status;
4517} 4370}
4518 4371
@@ -4533,8 +4386,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
4533 status = decode_putrootfh(&xdr); 4386 status = decode_putrootfh(&xdr);
4534 if (!status) 4387 if (!status)
4535 status = decode_fsinfo(&xdr, fsinfo); 4388 status = decode_fsinfo(&xdr, fsinfo);
4536 if (!status)
4537 status = nfs4_stat_to_errno(hdr.status);
4538 return status; 4389 return status;
4539} 4390}
4540 4391
@@ -4715,7 +4566,7 @@ nfs4_stat_to_errno(int stat)
4715 .p_replen = NFS4_##restype##_sz, \ 4566 .p_replen = NFS4_##restype##_sz, \
4716 .p_statidx = NFSPROC4_CLNT_##proc, \ 4567 .p_statidx = NFSPROC4_CLNT_##proc, \
4717 .p_name = #proc, \ 4568 .p_name = #proc, \
4718 } 4569}
4719 4570
4720struct rpc_procinfo nfs4_procedures[] = { 4571struct rpc_procinfo nfs4_procedures[] = {
4721 PROC(READ, enc_read, dec_read), 4572 PROC(READ, enc_read, dec_read),
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index d74d16ce0d49..d9ef602fbc5a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,6 +86,8 @@
86#include <net/ipconfig.h> 86#include <net/ipconfig.h>
87#include <linux/parser.h> 87#include <linux/parser.h>
88 88
89#include "internal.h"
90
89/* Define this to allow debugging output */ 91/* Define this to allow debugging output */
90#undef NFSROOT_DEBUG 92#undef NFSROOT_DEBUG
91#define NFSDBG_FACILITY NFSDBG_ROOT 93#define NFSDBG_FACILITY NFSDBG_ROOT
@@ -100,7 +102,7 @@ static char nfs_root_name[256] __initdata = "";
100static __be32 servaddr __initdata = 0; 102static __be32 servaddr __initdata = 0;
101 103
102/* Name of directory to mount */ 104/* Name of directory to mount */
103static char nfs_path[NFS_MAXPATHLEN] __initdata = { 0, }; 105static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, };
104 106
105/* NFS-related data */ 107/* NFS-related data */
106static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ 108static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
@@ -312,7 +314,7 @@ static int __init root_nfs_name(char *name)
312 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); 314 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
313 return -1; 315 return -1;
314 } 316 }
315 sprintf(nfs_path, buf, cp); 317 sprintf(nfs_export_path, buf, cp);
316 318
317 return 1; 319 return 1;
318} 320}
@@ -340,7 +342,7 @@ static int __init root_nfs_addr(void)
340static void __init root_nfs_print(void) 342static void __init root_nfs_print(void)
341{ 343{
342 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", 344 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n",
343 nfs_path, nfs_data.hostname); 345 nfs_export_path, nfs_data.hostname);
344 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n", 346 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
345 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans); 347 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
346 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n", 348 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
@@ -485,18 +487,23 @@ static int __init root_nfs_get_handle(void)
485{ 487{
486 struct nfs_fh fh; 488 struct nfs_fh fh;
487 struct sockaddr_in sin; 489 struct sockaddr_in sin;
490 struct nfs_mount_request request = {
491 .sap = (struct sockaddr *)&sin,
492 .salen = sizeof(sin),
493 .dirpath = nfs_export_path,
494 .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
495 NFS_MNT3_VERSION : NFS_MNT_VERSION,
496 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
497 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
498 .fh = &fh,
499 };
488 int status; 500 int status;
489 int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
490 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP;
491 int version = (nfs_data.flags & NFS_MOUNT_VER3) ?
492 NFS_MNT3_VERSION : NFS_MNT_VERSION;
493 501
494 set_sockaddr(&sin, servaddr, htons(mount_port)); 502 set_sockaddr(&sin, servaddr, htons(mount_port));
495 status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, 503 status = nfs_mount(&request);
496 nfs_path, version, protocol, &fh);
497 if (status < 0) 504 if (status < 0)
498 printk(KERN_ERR "Root-NFS: Server returned error %d " 505 printk(KERN_ERR "Root-NFS: Server returned error %d "
499 "while mounting %s\n", status, nfs_path); 506 "while mounting %s\n", status, nfs_export_path);
500 else { 507 else {
501 nfs_data.root.size = fh.size; 508 nfs_data.root.size = fh.size;
502 memcpy(nfs_data.root.data, fh.data, fh.size); 509 memcpy(nfs_data.root.data, fh.data, fh.size);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 40d17987d0e8..f856004bb7fa 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -533,12 +533,6 @@ readpage_async_filler(void *data, struct page *page)
533 unsigned int len; 533 unsigned int len;
534 int error; 534 int error;
535 535
536 error = nfs_wb_page(inode, page);
537 if (error)
538 goto out_unlock;
539 if (PageUptodate(page))
540 goto out_unlock;
541
542 len = nfs_page_length(page); 536 len = nfs_page_length(page);
543 if (len == 0) 537 if (len == 0)
544 return nfs_return_empty_page(page); 538 return nfs_return_empty_page(page);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index bb0313ac9e1f..d6686f4786dc 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -75,6 +75,7 @@ enum {
75 Opt_acl, Opt_noacl, 75 Opt_acl, Opt_noacl,
76 Opt_rdirplus, Opt_nordirplus, 76 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 77 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport,
78 79
79 /* Mount options that take integer arguments */ 80 /* Mount options that take integer arguments */
80 Opt_port, 81 Opt_port,
@@ -129,6 +130,8 @@ static const match_table_t nfs_mount_option_tokens = {
129 { Opt_nordirplus, "nordirplus" }, 130 { Opt_nordirplus, "nordirplus" },
130 { Opt_sharecache, "sharecache" }, 131 { Opt_sharecache, "sharecache" },
131 { Opt_nosharecache, "nosharecache" }, 132 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" },
132 135
133 { Opt_port, "port=%u" }, 136 { Opt_port, "port=%u" },
134 { Opt_rsize, "rsize=%u" }, 137 { Opt_rsize, "rsize=%u" },
@@ -512,7 +515,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
512 { NFS_MOUNT_NONLM, ",nolock", "" }, 515 { NFS_MOUNT_NONLM, ",nolock", "" },
513 { NFS_MOUNT_NOACL, ",noacl", "" }, 516 { NFS_MOUNT_NOACL, ",noacl", "" },
514 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, 517 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
515 { NFS_MOUNT_UNSHARED, ",nosharecache", ""}, 518 { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
519 { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
516 { 0, NULL, NULL } 520 { 0, NULL, NULL }
517 }; 521 };
518 const struct proc_nfs_info *nfs_infop; 522 const struct proc_nfs_info *nfs_infop;
@@ -1033,6 +1037,12 @@ static int nfs_parse_mount_options(char *raw,
1033 case Opt_nosharecache: 1037 case Opt_nosharecache:
1034 mnt->flags |= NFS_MOUNT_UNSHARED; 1038 mnt->flags |= NFS_MOUNT_UNSHARED;
1035 break; 1039 break;
1040 case Opt_resvport:
1041 mnt->flags &= ~NFS_MOUNT_NORESVPORT;
1042 break;
1043 case Opt_noresvport:
1044 mnt->flags |= NFS_MOUNT_NORESVPORT;
1045 break;
1036 1046
1037 /* 1047 /*
1038 * options that take numeric values 1048 * options that take numeric values
@@ -1327,8 +1337,14 @@ out_security_failure:
1327static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1337static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1328 struct nfs_fh *root_fh) 1338 struct nfs_fh *root_fh)
1329{ 1339{
1330 struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address; 1340 struct nfs_mount_request request = {
1331 char *hostname; 1341 .sap = (struct sockaddr *)
1342 &args->mount_server.address,
1343 .dirpath = args->nfs_server.export_path,
1344 .protocol = args->mount_server.protocol,
1345 .fh = root_fh,
1346 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1347 };
1332 int status; 1348 int status;
1333 1349
1334 if (args->mount_server.version == 0) { 1350 if (args->mount_server.version == 0) {
@@ -1337,42 +1353,38 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1337 else 1353 else
1338 args->mount_server.version = NFS_MNT_VERSION; 1354 args->mount_server.version = NFS_MNT_VERSION;
1339 } 1355 }
1356 request.version = args->mount_server.version;
1340 1357
1341 if (args->mount_server.hostname) 1358 if (args->mount_server.hostname)
1342 hostname = args->mount_server.hostname; 1359 request.hostname = args->mount_server.hostname;
1343 else 1360 else
1344 hostname = args->nfs_server.hostname; 1361 request.hostname = args->nfs_server.hostname;
1345 1362
1346 /* 1363 /*
1347 * Construct the mount server's address. 1364 * Construct the mount server's address.
1348 */ 1365 */
1349 if (args->mount_server.address.ss_family == AF_UNSPEC) { 1366 if (args->mount_server.address.ss_family == AF_UNSPEC) {
1350 memcpy(sap, &args->nfs_server.address, 1367 memcpy(request.sap, &args->nfs_server.address,
1351 args->nfs_server.addrlen); 1368 args->nfs_server.addrlen);
1352 args->mount_server.addrlen = args->nfs_server.addrlen; 1369 args->mount_server.addrlen = args->nfs_server.addrlen;
1353 } 1370 }
1371 request.salen = args->mount_server.addrlen;
1354 1372
1355 /* 1373 /*
1356 * autobind will be used if mount_server.port == 0 1374 * autobind will be used if mount_server.port == 0
1357 */ 1375 */
1358 nfs_set_port(sap, args->mount_server.port); 1376 nfs_set_port(request.sap, args->mount_server.port);
1359 1377
1360 /* 1378 /*
1361 * Now ask the mount server to map our export path 1379 * Now ask the mount server to map our export path
1362 * to a file handle. 1380 * to a file handle.
1363 */ 1381 */
1364 status = nfs_mount(sap, 1382 status = nfs_mount(&request);
1365 args->mount_server.addrlen,
1366 hostname,
1367 args->nfs_server.export_path,
1368 args->mount_server.version,
1369 args->mount_server.protocol,
1370 root_fh);
1371 if (status == 0) 1383 if (status == 0)
1372 return 0; 1384 return 0;
1373 1385
1374 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", 1386 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
1375 hostname, status); 1387 request.hostname, status);
1376 return status; 1388 return status;
1377} 1389}
1378 1390
@@ -2419,7 +2431,7 @@ static void nfs4_kill_super(struct super_block *sb)
2419{ 2431{
2420 struct nfs_server *server = NFS_SB(sb); 2432 struct nfs_server *server = NFS_SB(sb);
2421 2433
2422 nfs_return_all_delegations(sb); 2434 nfs_super_return_all_delegations(sb);
2423 kill_anon_super(sb); 2435 kill_anon_super(sb);
2424 2436
2425 nfs4_renewd_prepare_shutdown(server); 2437 nfs4_renewd_prepare_shutdown(server);
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index c11f5375d7c1..04133aacb1e5 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -29,8 +29,8 @@
29 29
30MODULE_LICENSE("GPL"); 30MODULE_LICENSE("GPL");
31 31
32EXPORT_SYMBOL(nfsacl_encode); 32EXPORT_SYMBOL_GPL(nfsacl_encode);
33EXPORT_SYMBOL(nfsacl_decode); 33EXPORT_SYMBOL_GPL(nfsacl_decode);
34 34
35struct nfsacl_encode_desc { 35struct nfsacl_encode_desc {
36 struct xdr_array2_desc desc; 36 struct xdr_array2_desc desc;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 094747a1227c..6d7d8c02c197 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -358,6 +358,7 @@ static struct rpc_program cb_program = {
358 .nrvers = ARRAY_SIZE(nfs_cb_version), 358 .nrvers = ARRAY_SIZE(nfs_cb_version),
359 .version = nfs_cb_version, 359 .version = nfs_cb_version,
360 .stats = &cb_stats, 360 .stats = &cb_stats,
361 .pipe_dir_name = "/nfsd4_cb",
361}; 362};
362 363
363/* Reference counting, callback cleanup, etc., all look racy as heck. 364/* Reference counting, callback cleanup, etc., all look racy as heck.
@@ -382,8 +383,9 @@ static int do_probe_callback(void *data)
382 .program = &cb_program, 383 .program = &cb_program,
383 .prognumber = cb->cb_prog, 384 .prognumber = cb->cb_prog,
384 .version = nfs_cb_version[1]->number, 385 .version = nfs_cb_version[1]->number,
385 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 386 .authflavor = clp->cl_flavor,
386 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 387 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
388 .client_name = clp->cl_principal,
387 }; 389 };
388 struct rpc_message msg = { 390 struct rpc_message msg = {
389 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 391 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
@@ -392,6 +394,11 @@ static int do_probe_callback(void *data)
392 struct rpc_clnt *client; 394 struct rpc_clnt *client;
393 int status; 395 int status;
394 396
397 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
398 status = nfserr_cb_path_down;
399 goto out_err;
400 }
401
395 /* Initialize address */ 402 /* Initialize address */
396 memset(&addr, 0, sizeof(addr)); 403 memset(&addr, 0, sizeof(addr));
397 addr.sin_family = AF_INET; 404 addr.sin_family = AF_INET;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bf4cd46a5a11..13e0e074dbb8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -54,6 +54,7 @@
54#include <linux/mutex.h> 54#include <linux/mutex.h>
55#include <linux/lockd/bind.h> 55#include <linux/lockd/bind.h>
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sunrpc/svcauth_gss.h>
57 58
58#define NFSDDBG_FACILITY NFSDDBG_PROC 59#define NFSDDBG_FACILITY NFSDDBG_PROC
59 60
@@ -377,6 +378,7 @@ free_client(struct nfs4_client *clp)
377 shutdown_callback_client(clp); 378 shutdown_callback_client(clp);
378 if (clp->cl_cred.cr_group_info) 379 if (clp->cl_cred.cr_group_info)
379 put_group_info(clp->cl_cred.cr_group_info); 380 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal);
380 kfree(clp->cl_name.data); 382 kfree(clp->cl_name.data);
381 kfree(clp); 383 kfree(clp);
382} 384}
@@ -696,6 +698,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
696 unsigned int strhashval; 698 unsigned int strhashval;
697 struct nfs4_client *conf, *unconf, *new; 699 struct nfs4_client *conf, *unconf, *new;
698 __be32 status; 700 __be32 status;
701 char *princ;
699 char dname[HEXDIR_LEN]; 702 char dname[HEXDIR_LEN];
700 703
701 if (!check_name(clname)) 704 if (!check_name(clname))
@@ -783,6 +786,15 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
783 } 786 }
784 copy_verf(new, &clverifier); 787 copy_verf(new, &clverifier);
785 new->cl_addr = sin->sin_addr.s_addr; 788 new->cl_addr = sin->sin_addr.s_addr;
789 new->cl_flavor = rqstp->rq_flavor;
790 princ = svc_gss_principal(rqstp);
791 if (princ) {
792 new->cl_principal = kstrdup(princ, GFP_KERNEL);
793 if (new->cl_principal == NULL) {
794 free_client(new);
795 goto out;
796 }
797 }
786 copy_cred(&new->cl_cred, &rqstp->rq_cred); 798 copy_cred(&new->cl_cred, &rqstp->rq_cred);
787 gen_confirm(new); 799 gen_confirm(new);
788 gen_callback(new, setclid); 800 gen_callback(new, setclid);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81904f07679d..3bb1cf1e7425 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -44,10 +44,13 @@ static int show_stat(struct seq_file *p, void *v)
44 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 44 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
45 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 45 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
46 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 46 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
47 47 for_each_irq_nr(j) {
48 for_each_irq_nr(j) 48#ifdef CONFIG_SPARSE_IRQ
49 if (!irq_to_desc(j))
50 continue;
51#endif
49 sum += kstat_irqs_cpu(j, i); 52 sum += kstat_irqs_cpu(j, i);
50 53 }
51 sum += arch_irq_stat_cpu(i); 54 sum += arch_irq_stat_cpu(i);
52 } 55 }
53 sum += arch_irq_stat(); 56 sum += arch_irq_stat();
@@ -92,7 +95,12 @@ static int show_stat(struct seq_file *p, void *v)
92 /* sum again ? it could be updated? */ 95 /* sum again ? it could be updated? */
93 for_each_irq_nr(j) { 96 for_each_irq_nr(j) {
94 per_irq_sum = 0; 97 per_irq_sum = 0;
95 98#ifdef CONFIG_SPARSE_IRQ
99 if (!irq_to_desc(j)) {
100 seq_printf(p, " %u", per_irq_sum);
101 continue;
102 }
103#endif
96 for_each_possible_cpu(i) 104 for_each_possible_cpu(i)
97 per_irq_sum += kstat_irqs_cpu(j, i); 105 per_irq_sum += kstat_irqs_cpu(j, i);
98 106
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 737c9a425361..c3dc491fff89 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -85,13 +85,13 @@ xfs-y += xfs_alloc.o \
85 xfs_trans_inode.o \ 85 xfs_trans_inode.o \
86 xfs_trans_item.o \ 86 xfs_trans_item.o \
87 xfs_utils.o \ 87 xfs_utils.o \
88 xfs_vfsops.o \
89 xfs_vnodeops.o \ 88 xfs_vnodeops.o \
90 xfs_rw.o \ 89 xfs_rw.o \
91 xfs_dmops.o \ 90 xfs_dmops.o \
92 xfs_qmops.o 91 xfs_qmops.o
93 92
94xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o 93xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \
94 xfs_dir2_trace.o
95 95
96# Objects in linux/ 96# Objects in linux/
97xfs-y += $(addprefix $(XFS_LINUX)/, \ 97xfs-y += $(addprefix $(XFS_LINUX)/, \
@@ -106,7 +106,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
106 xfs_iops.o \ 106 xfs_iops.o \
107 xfs_lrw.o \ 107 xfs_lrw.o \
108 xfs_super.o \ 108 xfs_super.o \
109 xfs_vnode.o \ 109 xfs_sync.o \
110 xfs_xattr.o) 110 xfs_xattr.o)
111 111
112# Objects in support/ 112# Objects in support/
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
index 351a8f454bd1..4dfc7c370819 100644
--- a/fs/xfs/linux-2.6/sv.h
+++ b/fs/xfs/linux-2.6/sv.h
@@ -32,23 +32,15 @@ typedef struct sv_s {
32 wait_queue_head_t waiters; 32 wait_queue_head_t waiters;
33} sv_t; 33} sv_t;
34 34
35#define SV_FIFO 0x0 /* sv_t is FIFO type */ 35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36#define SV_LIFO 0x2 /* sv_t is LIFO type */
37#define SV_PRIO 0x4 /* sv_t is PRIO type */
38#define SV_KEYED 0x6 /* sv_t is KEYED type */
39#define SV_DEFAULT SV_FIFO
40
41
42static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
43 unsigned long timeout)
44{ 36{
45 DECLARE_WAITQUEUE(wait, current); 37 DECLARE_WAITQUEUE(wait, current);
46 38
47 add_wait_queue_exclusive(&sv->waiters, &wait); 39 add_wait_queue_exclusive(&sv->waiters, &wait);
48 __set_current_state(state); 40 __set_current_state(TASK_UNINTERRUPTIBLE);
49 spin_unlock(lock); 41 spin_unlock(lock);
50 42
51 schedule_timeout(timeout); 43 schedule();
52 44
53 remove_wait_queue(&sv->waiters, &wait); 45 remove_wait_queue(&sv->waiters, &wait);
54} 46}
@@ -58,13 +50,7 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
58#define sv_destroy(sv) \ 50#define sv_destroy(sv) \
59 /*NOTHING*/ 51 /*NOTHING*/
60#define sv_wait(sv, pri, lock, s) \ 52#define sv_wait(sv, pri, lock, s) \
61 _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) 53 _sv_wait(sv, lock)
62#define sv_wait_sig(sv, pri, lock, s) \
63 _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
64#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
65 _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
66#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
67 _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
68#define sv_signal(sv) \ 54#define sv_signal(sv) \
69 wake_up(&(sv)->waiters) 55 wake_up(&(sv)->waiters)
70#define sv_broadcast(sv) \ 56#define sv_broadcast(sv) \
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a44d68eb50b5..de3a198f771e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -42,6 +42,40 @@
42#include <linux/pagevec.h> 42#include <linux/pagevec.h>
43#include <linux/writeback.h> 43#include <linux/writeback.h>
44 44
45
46/*
47 * Prime number of hash buckets since address is used as the key.
48 */
49#define NVSYNC 37
50#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
51static wait_queue_head_t xfs_ioend_wq[NVSYNC];
52
53void __init
54xfs_ioend_init(void)
55{
56 int i;
57
58 for (i = 0; i < NVSYNC; i++)
59 init_waitqueue_head(&xfs_ioend_wq[i]);
60}
61
62void
63xfs_ioend_wait(
64 xfs_inode_t *ip)
65{
66 wait_queue_head_t *wq = to_ioend_wq(ip);
67
68 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
69}
70
71STATIC void
72xfs_ioend_wake(
73 xfs_inode_t *ip)
74{
75 if (atomic_dec_and_test(&ip->i_iocount))
76 wake_up(to_ioend_wq(ip));
77}
78
45STATIC void 79STATIC void
46xfs_count_page_state( 80xfs_count_page_state(
47 struct page *page, 81 struct page *page,
@@ -146,16 +180,25 @@ xfs_destroy_ioend(
146 xfs_ioend_t *ioend) 180 xfs_ioend_t *ioend)
147{ 181{
148 struct buffer_head *bh, *next; 182 struct buffer_head *bh, *next;
183 struct xfs_inode *ip = XFS_I(ioend->io_inode);
149 184
150 for (bh = ioend->io_buffer_head; bh; bh = next) { 185 for (bh = ioend->io_buffer_head; bh; bh = next) {
151 next = bh->b_private; 186 next = bh->b_private;
152 bh->b_end_io(bh, !ioend->io_error); 187 bh->b_end_io(bh, !ioend->io_error);
153 } 188 }
154 if (unlikely(ioend->io_error)) { 189
155 vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error, 190 /*
156 __FILE__,__LINE__); 191 * Volume managers supporting multiple paths can send back ENODEV
192 * when the final path disappears. In this case continuing to fill
193 * the page cache with dirty data which cannot be written out is
194 * evil, so prevent that.
195 */
196 if (unlikely(ioend->io_error == -ENODEV)) {
197 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
198 __FILE__, __LINE__);
157 } 199 }
158 vn_iowake(XFS_I(ioend->io_inode)); 200
201 xfs_ioend_wake(ip);
159 mempool_free(ioend, xfs_ioend_pool); 202 mempool_free(ioend, xfs_ioend_pool);
160} 203}
161 204
@@ -191,7 +234,7 @@ xfs_setfilesize(
191 ip->i_d.di_size = isize; 234 ip->i_d.di_size = isize;
192 ip->i_update_core = 1; 235 ip->i_update_core = 1;
193 ip->i_update_size = 1; 236 ip->i_update_size = 1;
194 mark_inode_dirty_sync(ioend->io_inode); 237 xfs_mark_inode_dirty_sync(ip);
195 } 238 }
196 239
197 xfs_iunlock(ip, XFS_ILOCK_EXCL); 240 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -317,14 +360,9 @@ xfs_map_blocks(
317 xfs_iomap_t *mapp, 360 xfs_iomap_t *mapp,
318 int flags) 361 int flags)
319{ 362{
320 xfs_inode_t *ip = XFS_I(inode); 363 int nmaps = 1;
321 int error, nmaps = 1; 364
322 365 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
323 error = xfs_iomap(ip, offset, count,
324 flags, mapp, &nmaps);
325 if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
326 xfs_iflags_set(ip, XFS_IMODIFIED);
327 return -error;
328} 366}
329 367
330STATIC_INLINE int 368STATIC_INLINE int
@@ -512,7 +550,7 @@ xfs_cancel_ioend(
512 unlock_buffer(bh); 550 unlock_buffer(bh);
513 } while ((bh = next_bh) != NULL); 551 } while ((bh = next_bh) != NULL);
514 552
515 vn_iowake(XFS_I(ioend->io_inode)); 553 xfs_ioend_wake(XFS_I(ioend->io_inode));
516 mempool_free(ioend, xfs_ioend_pool); 554 mempool_free(ioend, xfs_ioend_pool);
517 } while ((ioend = next) != NULL); 555 } while ((ioend = next) != NULL);
518} 556}
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 3ba0631a3818..7b26f5ff9692 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -43,4 +43,7 @@ typedef struct xfs_ioend {
43extern const struct address_space_operations xfs_address_space_operations; 43extern const struct address_space_operations xfs_address_space_operations;
44extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); 44extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
45 45
46extern void xfs_ioend_init(void);
47extern void xfs_ioend_wait(struct xfs_inode *);
48
46#endif /* __XFS_AOPS_H__ */ 49#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 36d5fcd3f593..cb329edc925b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -630,6 +630,29 @@ xfs_buf_get_flags(
630 return NULL; 630 return NULL;
631} 631}
632 632
633STATIC int
634_xfs_buf_read(
635 xfs_buf_t *bp,
636 xfs_buf_flags_t flags)
637{
638 int status;
639
640 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
641
642 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
643 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
644
645 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
646 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
647 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
648 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
649
650 status = xfs_buf_iorequest(bp);
651 if (!status && !(flags & XBF_ASYNC))
652 status = xfs_buf_iowait(bp);
653 return status;
654}
655
633xfs_buf_t * 656xfs_buf_t *
634xfs_buf_read_flags( 657xfs_buf_read_flags(
635 xfs_buftarg_t *target, 658 xfs_buftarg_t *target,
@@ -646,7 +669,7 @@ xfs_buf_read_flags(
646 if (!XFS_BUF_ISDONE(bp)) { 669 if (!XFS_BUF_ISDONE(bp)) {
647 XB_TRACE(bp, "read", (unsigned long)flags); 670 XB_TRACE(bp, "read", (unsigned long)flags);
648 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
649 xfs_buf_iostart(bp, flags); 672 _xfs_buf_read(bp, flags);
650 } else if (flags & XBF_ASYNC) { 673 } else if (flags & XBF_ASYNC) {
651 XB_TRACE(bp, "read_async", (unsigned long)flags); 674 XB_TRACE(bp, "read_async", (unsigned long)flags);
652 /* 675 /*
@@ -1048,50 +1071,39 @@ xfs_buf_ioerror(
1048 XB_TRACE(bp, "ioerror", (unsigned long)error); 1071 XB_TRACE(bp, "ioerror", (unsigned long)error);
1049} 1072}
1050 1073
1051/*
1052 * Initiate I/O on a buffer, based on the flags supplied.
1053 * The b_iodone routine in the buffer supplied will only be called
1054 * when all of the subsidiary I/O requests, if any, have been completed.
1055 */
1056int 1074int
1057xfs_buf_iostart( 1075xfs_bawrite(
1058 xfs_buf_t *bp, 1076 void *mp,
1059 xfs_buf_flags_t flags) 1077 struct xfs_buf *bp)
1060{ 1078{
1061 int status = 0; 1079 XB_TRACE(bp, "bawrite", 0);
1062 1080
1063 XB_TRACE(bp, "iostart", (unsigned long)flags); 1081 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
1064 1082
1065 if (flags & XBF_DELWRI) { 1083 xfs_buf_delwri_dequeue(bp);
1066 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
1067 bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
1068 xfs_buf_delwri_queue(bp, 1);
1069 return 0;
1070 }
1071 1084
1072 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ 1085 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
1073 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1086 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
1074 bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ 1087
1075 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1088 bp->b_mount = mp;
1089 bp->b_strat = xfs_bdstrat_cb;
1090 return xfs_bdstrat_cb(bp);
1091}
1076 1092
1077 BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); 1093void
1094xfs_bdwrite(
1095 void *mp,
1096 struct xfs_buf *bp)
1097{
1098 XB_TRACE(bp, "bdwrite", 0);
1078 1099
1079 /* For writes allow an alternate strategy routine to precede 1100 bp->b_strat = xfs_bdstrat_cb;
1080 * the actual I/O request (which may not be issued at all in 1101 bp->b_mount = mp;
1081 * a shutdown situation, for example).
1082 */
1083 status = (flags & XBF_WRITE) ?
1084 xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);
1085 1102
1086 /* Wait for I/O if we are not an async request. 1103 bp->b_flags &= ~XBF_READ;
1087 * Note: async I/O request completion will release the buffer, 1104 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1088 * and that can already be done by this point. So using the
1089 * buffer pointer from here on, after async I/O, is invalid.
1090 */
1091 if (!status && !(flags & XBF_ASYNC))
1092 status = xfs_buf_iowait(bp);
1093 1105
1094 return status; 1106 xfs_buf_delwri_queue(bp, 1);
1095} 1107}
1096 1108
1097STATIC_INLINE void 1109STATIC_INLINE void
@@ -1114,8 +1126,7 @@ xfs_buf_bio_end_io(
1114 unsigned int blocksize = bp->b_target->bt_bsize; 1126 unsigned int blocksize = bp->b_target->bt_bsize;
1115 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1127 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1116 1128
1117 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1129 xfs_buf_ioerror(bp, -error);
1118 bp->b_error = EIO;
1119 1130
1120 do { 1131 do {
1121 struct page *page = bvec->bv_page; 1132 struct page *page = bvec->bv_page;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 456519a088c7..288ae7c4c800 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -168,7 +168,7 @@ typedef struct xfs_buf {
168 struct completion b_iowait; /* queue for I/O waiters */ 168 struct completion b_iowait; /* queue for I/O waiters */
169 void *b_fspriv; 169 void *b_fspriv;
170 void *b_fspriv2; 170 void *b_fspriv2;
171 void *b_fspriv3; 171 struct xfs_mount *b_mount;
172 unsigned short b_error; /* error code on I/O */ 172 unsigned short b_error; /* error code on I/O */
173 unsigned int b_page_count; /* size of page array */ 173 unsigned int b_page_count; /* size of page array */
174 unsigned int b_offset; /* page offset in first page */ 174 unsigned int b_offset; /* page offset in first page */
@@ -214,9 +214,10 @@ extern void xfs_buf_lock(xfs_buf_t *);
214extern void xfs_buf_unlock(xfs_buf_t *); 214extern void xfs_buf_unlock(xfs_buf_t *);
215 215
216/* Buffer Read and Write Routines */ 216/* Buffer Read and Write Routines */
217extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
218extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
217extern void xfs_buf_ioend(xfs_buf_t *, int); 219extern void xfs_buf_ioend(xfs_buf_t *, int);
218extern void xfs_buf_ioerror(xfs_buf_t *, int); 220extern void xfs_buf_ioerror(xfs_buf_t *, int);
219extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t);
220extern int xfs_buf_iorequest(xfs_buf_t *); 221extern int xfs_buf_iorequest(xfs_buf_t *);
221extern int xfs_buf_iowait(xfs_buf_t *); 222extern int xfs_buf_iowait(xfs_buf_t *);
222extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 223extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
@@ -311,10 +312,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
311#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) 312#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED)
312#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) 313#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED)
313 314
314#define XFS_BUF_SHUT(bp) do { } while (0)
315#define XFS_BUF_UNSHUT(bp) do { } while (0)
316#define XFS_BUF_ISSHUT(bp) (0)
317
318#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) 315#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
319#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) 316#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
320#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) 317#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
@@ -334,8 +331,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
334#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) 331#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
335#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 332#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
336#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 333#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
337#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3)
338#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val))
339#define XFS_BUF_SET_START(bp) do { } while (0) 334#define XFS_BUF_SET_START(bp) do { } while (0)
340#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) 335#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
341 336
@@ -366,14 +361,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
366#define XFS_BUF_TARGET(bp) ((bp)->b_target) 361#define XFS_BUF_TARGET(bp) ((bp)->b_target)
367#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) 362#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
368 363
369static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
370{
371 bp->b_fspriv3 = mp;
372 bp->b_strat = xfs_bdstrat_cb;
373 xfs_buf_delwri_dequeue(bp);
374 return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
375}
376
377static inline void xfs_buf_relse(xfs_buf_t *bp) 364static inline void xfs_buf_relse(xfs_buf_t *bp)
378{ 365{
379 if (!bp->b_relse) 366 if (!bp->b_relse)
@@ -414,17 +401,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
414 return error; 401 return error;
415} 402}
416 403
417/*
418 * No error can be returned from xfs_buf_iostart for delwri
419 * buffers as they are queued and no I/O is issued.
420 */
421static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
422{
423 bp->b_strat = xfs_bdstrat_cb;
424 bp->b_fspriv3 = mp;
425 (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
426}
427
428#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) 404#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
429 405
430#define xfs_iowait(bp) xfs_buf_iowait(bp) 406#define xfs_iowait(bp) xfs_buf_iowait(bp)
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index 8c022cd0ad67..55bddf3b6091 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -25,12 +25,4 @@
25 */ 25 */
26typedef const struct cred cred_t; 26typedef const struct cred cred_t;
27 27
28extern cred_t *sys_cred;
29
30/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
31static inline int capable_cred(cred_t *cr, int cid)
32{
33 return (cr == sys_cred) ? 1 : capable(cid);
34}
35
36#endif /* __XFS_CRED_H__ */ 28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 7f7abec25e14..595751f78350 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,7 +29,6 @@
29#include "xfs_vnodeops.h" 29#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_vfsops.h"
33 32
34/* 33/*
35 * Note that we only accept fileids which are long enough rather than allow 34 * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 3fee790f138b..e14c4e3aea0c 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -36,89 +36,54 @@
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_rw.h" 38#include "xfs_rw.h"
39#include "xfs_ioctl32.h"
40#include "xfs_vnodeops.h" 39#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h"
41#include "xfs_ioctl.h"
41 42
42#include <linux/dcache.h> 43#include <linux/dcache.h>
43#include <linux/smp_lock.h> 44#include <linux/smp_lock.h>
44 45
45static struct vm_operations_struct xfs_file_vm_ops; 46static struct vm_operations_struct xfs_file_vm_ops;
46 47
47STATIC_INLINE ssize_t 48STATIC ssize_t
48__xfs_file_read( 49xfs_file_aio_read(
49 struct kiocb *iocb, 50 struct kiocb *iocb,
50 const struct iovec *iov, 51 const struct iovec *iov,
51 unsigned long nr_segs, 52 unsigned long nr_segs,
52 int ioflags,
53 loff_t pos) 53 loff_t pos)
54{ 54{
55 struct file *file = iocb->ki_filp; 55 struct file *file = iocb->ki_filp;
56 int ioflags = IO_ISAIO;
56 57
57 BUG_ON(iocb->ki_pos != pos); 58 BUG_ON(iocb->ki_pos != pos);
58 if (unlikely(file->f_flags & O_DIRECT)) 59 if (unlikely(file->f_flags & O_DIRECT))
59 ioflags |= IO_ISDIRECT; 60 ioflags |= IO_ISDIRECT;
61 if (file->f_mode & FMODE_NOCMTIME)
62 ioflags |= IO_INVIS;
60 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 63 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
61 nr_segs, &iocb->ki_pos, ioflags); 64 nr_segs, &iocb->ki_pos, ioflags);
62} 65}
63 66
64STATIC ssize_t 67STATIC ssize_t
65xfs_file_aio_read( 68xfs_file_aio_write(
66 struct kiocb *iocb,
67 const struct iovec *iov,
68 unsigned long nr_segs,
69 loff_t pos)
70{
71 return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos);
72}
73
74STATIC ssize_t
75xfs_file_aio_read_invis(
76 struct kiocb *iocb,
77 const struct iovec *iov,
78 unsigned long nr_segs,
79 loff_t pos)
80{
81 return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
82}
83
84STATIC_INLINE ssize_t
85__xfs_file_write(
86 struct kiocb *iocb, 69 struct kiocb *iocb,
87 const struct iovec *iov, 70 const struct iovec *iov,
88 unsigned long nr_segs, 71 unsigned long nr_segs,
89 int ioflags,
90 loff_t pos) 72 loff_t pos)
91{ 73{
92 struct file *file = iocb->ki_filp; 74 struct file *file = iocb->ki_filp;
75 int ioflags = IO_ISAIO;
93 76
94 BUG_ON(iocb->ki_pos != pos); 77 BUG_ON(iocb->ki_pos != pos);
95 if (unlikely(file->f_flags & O_DIRECT)) 78 if (unlikely(file->f_flags & O_DIRECT))
96 ioflags |= IO_ISDIRECT; 79 ioflags |= IO_ISDIRECT;
80 if (file->f_mode & FMODE_NOCMTIME)
81 ioflags |= IO_INVIS;
97 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 82 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs,
98 &iocb->ki_pos, ioflags); 83 &iocb->ki_pos, ioflags);
99} 84}
100 85
101STATIC ssize_t 86STATIC ssize_t
102xfs_file_aio_write(
103 struct kiocb *iocb,
104 const struct iovec *iov,
105 unsigned long nr_segs,
106 loff_t pos)
107{
108 return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos);
109}
110
111STATIC ssize_t
112xfs_file_aio_write_invis(
113 struct kiocb *iocb,
114 const struct iovec *iov,
115 unsigned long nr_segs,
116 loff_t pos)
117{
118 return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
119}
120
121STATIC ssize_t
122xfs_file_splice_read( 87xfs_file_splice_read(
123 struct file *infilp, 88 struct file *infilp,
124 loff_t *ppos, 89 loff_t *ppos,
@@ -126,20 +91,13 @@ xfs_file_splice_read(
126 size_t len, 91 size_t len,
127 unsigned int flags) 92 unsigned int flags)
128{ 93{
129 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 94 int ioflags = 0;
130 infilp, ppos, pipe, len, flags, 0); 95
131} 96 if (infilp->f_mode & FMODE_NOCMTIME)
97 ioflags |= IO_INVIS;
132 98
133STATIC ssize_t
134xfs_file_splice_read_invis(
135 struct file *infilp,
136 loff_t *ppos,
137 struct pipe_inode_info *pipe,
138 size_t len,
139 unsigned int flags)
140{
141 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 99 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
142 infilp, ppos, pipe, len, flags, IO_INVIS); 100 infilp, ppos, pipe, len, flags, ioflags);
143} 101}
144 102
145STATIC ssize_t 103STATIC ssize_t
@@ -150,30 +108,49 @@ xfs_file_splice_write(
150 size_t len, 108 size_t len,
151 unsigned int flags) 109 unsigned int flags)
152{ 110{
153 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 111 int ioflags = 0;
154 pipe, outfilp, ppos, len, flags, 0); 112
155} 113 if (outfilp->f_mode & FMODE_NOCMTIME)
114 ioflags |= IO_INVIS;
156 115
157STATIC ssize_t
158xfs_file_splice_write_invis(
159 struct pipe_inode_info *pipe,
160 struct file *outfilp,
161 loff_t *ppos,
162 size_t len,
163 unsigned int flags)
164{
165 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 116 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
166 pipe, outfilp, ppos, len, flags, IO_INVIS); 117 pipe, outfilp, ppos, len, flags, ioflags);
167} 118}
168 119
169STATIC int 120STATIC int
170xfs_file_open( 121xfs_file_open(
171 struct inode *inode, 122 struct inode *inode,
172 struct file *filp) 123 struct file *file)
173{ 124{
174 if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 125 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
175 return -EFBIG; 126 return -EFBIG;
176 return -xfs_open(XFS_I(inode)); 127 if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
128 return -EIO;
129 return 0;
130}
131
132STATIC int
133xfs_dir_open(
134 struct inode *inode,
135 struct file *file)
136{
137 struct xfs_inode *ip = XFS_I(inode);
138 int mode;
139 int error;
140
141 error = xfs_file_open(inode, file);
142 if (error)
143 return error;
144
145 /*
146 * If there are any blocks, read-ahead block 0 as we're almost
147 * certain to have the next operation be a read there.
148 */
149 mode = xfs_ilock_map_shared(ip);
150 if (ip->i_d.di_nextents > 0)
151 xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
152 xfs_iunlock(ip, mode);
153 return 0;
177} 154}
178 155
179STATIC int 156STATIC int
@@ -227,7 +204,7 @@ xfs_file_readdir(
227 * point we can change the ->readdir prototype to include the 204 * point we can change the ->readdir prototype to include the
228 * buffer size. 205 * buffer size.
229 */ 206 */
230 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, inode->i_size); 207 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size);
231 208
232 error = xfs_readdir(ip, dirent, bufsize, 209 error = xfs_readdir(ip, dirent, bufsize,
233 (xfs_off_t *)&filp->f_pos, filldir); 210 (xfs_off_t *)&filp->f_pos, filldir);
@@ -248,48 +225,6 @@ xfs_file_mmap(
248 return 0; 225 return 0;
249} 226}
250 227
251STATIC long
252xfs_file_ioctl(
253 struct file *filp,
254 unsigned int cmd,
255 unsigned long p)
256{
257 int error;
258 struct inode *inode = filp->f_path.dentry->d_inode;
259
260 error = xfs_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p);
261 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
262
263 /* NOTE: some of the ioctl's return positive #'s as a
264 * byte count indicating success, such as
265 * readlink_by_handle. So we don't "sign flip"
266 * like most other routines. This means true
267 * errors need to be returned as a negative value.
268 */
269 return error;
270}
271
272STATIC long
273xfs_file_ioctl_invis(
274 struct file *filp,
275 unsigned int cmd,
276 unsigned long p)
277{
278 int error;
279 struct inode *inode = filp->f_path.dentry->d_inode;
280
281 error = xfs_ioctl(XFS_I(inode), filp, IO_INVIS, cmd, (void __user *)p);
282 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
283
284 /* NOTE: some of the ioctl's return positive #'s as a
285 * byte count indicating success, such as
286 * readlink_by_handle. So we don't "sign flip"
287 * like most other routines. This means true
288 * errors need to be returned as a negative value.
289 */
290 return error;
291}
292
293/* 228/*
294 * mmap()d file has taken write protection fault and is being made 229 * mmap()d file has taken write protection fault and is being made
295 * writable. We can set the page state up correctly for a writable 230 * writable. We can set the page state up correctly for a writable
@@ -325,26 +260,8 @@ const struct file_operations xfs_file_operations = {
325#endif 260#endif
326}; 261};
327 262
328const struct file_operations xfs_invis_file_operations = {
329 .llseek = generic_file_llseek,
330 .read = do_sync_read,
331 .write = do_sync_write,
332 .aio_read = xfs_file_aio_read_invis,
333 .aio_write = xfs_file_aio_write_invis,
334 .splice_read = xfs_file_splice_read_invis,
335 .splice_write = xfs_file_splice_write_invis,
336 .unlocked_ioctl = xfs_file_ioctl_invis,
337#ifdef CONFIG_COMPAT
338 .compat_ioctl = xfs_file_compat_invis_ioctl,
339#endif
340 .mmap = xfs_file_mmap,
341 .open = xfs_file_open,
342 .release = xfs_file_release,
343 .fsync = xfs_file_fsync,
344};
345
346
347const struct file_operations xfs_dir_file_operations = { 263const struct file_operations xfs_dir_file_operations = {
264 .open = xfs_dir_open,
348 .read = generic_read_dir, 265 .read = generic_read_dir,
349 .readdir = xfs_file_readdir, 266 .readdir = xfs_file_readdir,
350 .llseek = generic_file_llseek, 267 .llseek = generic_file_llseek,
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 36caa6d957df..5aeb77776961 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -24,6 +24,10 @@ int fs_noerr(void) { return 0; }
24int fs_nosys(void) { return ENOSYS; } 24int fs_nosys(void) { return ENOSYS; }
25void fs_noval(void) { return; } 25void fs_noval(void) { return; }
26 26
27/*
28 * note: all filemap functions return negative error codes. These
29 * need to be inverted before returning to the xfs core functions.
30 */
27void 31void
28xfs_tosspages( 32xfs_tosspages(
29 xfs_inode_t *ip, 33 xfs_inode_t *ip,
@@ -53,7 +57,7 @@ xfs_flushinval_pages(
53 if (!ret) 57 if (!ret)
54 truncate_inode_pages(mapping, first); 58 truncate_inode_pages(mapping, first);
55 } 59 }
56 return ret; 60 return -ret;
57} 61}
58 62
59int 63int
@@ -72,10 +76,23 @@ xfs_flush_pages(
72 xfs_iflags_clear(ip, XFS_ITRUNCATED); 76 xfs_iflags_clear(ip, XFS_ITRUNCATED);
73 ret = filemap_fdatawrite(mapping); 77 ret = filemap_fdatawrite(mapping);
74 if (flags & XFS_B_ASYNC) 78 if (flags & XFS_B_ASYNC)
75 return ret; 79 return -ret;
76 ret2 = filemap_fdatawait(mapping); 80 ret2 = filemap_fdatawait(mapping);
77 if (!ret) 81 if (!ret)
78 ret = ret2; 82 ret = ret2;
79 } 83 }
80 return ret; 84 return -ret;
85}
86
87int
88xfs_wait_on_pages(
89 xfs_inode_t *ip,
90 xfs_off_t first,
91 xfs_off_t last)
92{
93 struct address_space *mapping = VFS_I(ip)->i_mapping;
94
95 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
96 return -filemap_fdatawait(mapping);
97 return 0;
81} 98}
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index ef90e64641e6..2ae8b1ccb02e 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -26,7 +26,6 @@
26 */ 26 */
27xfs_param_t xfs_params = { 27xfs_param_t xfs_params = {
28 /* MIN DFLT MAX */ 28 /* MIN DFLT MAX */
29 .restrict_chown = { 0, 1, 1 },
30 .sgid_inherit = { 0, 0, 1 }, 29 .sgid_inherit = { 0, 0, 1 },
31 .symlink_mode = { 0, 0, 1 }, 30 .symlink_mode = { 0, 0, 1 },
32 .panic_mask = { 0, 0, 255 }, 31 .panic_mask = { 0, 0, 255 },
@@ -43,10 +42,3 @@ xfs_param_t xfs_params = {
43 .inherit_nodfrg = { 0, 1, 1 }, 42 .inherit_nodfrg = { 0, 1, 1 },
44 .fstrm_timer = { 1, 30*100, 3600*100}, 43 .fstrm_timer = { 1, 30*100, 3600*100},
45}; 44};
46
47/*
48 * Global system credential structure.
49 */
50static cred_t sys_cred_val;
51cred_t *sys_cred = &sys_cred_val;
52
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
index 6eda8a3eb6f1..69f71caf061c 100644
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ b/fs/xfs/linux-2.6/xfs_globals.h
@@ -19,6 +19,5 @@
19#define __XFS_GLOBALS_H__ 19#define __XFS_GLOBALS_H__
20 20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */ 21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22extern cred_t *sys_cred;
23 22
24#endif /* __XFS_GLOBALS_H__ */ 23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 281cbd5a25cf..67205f6198ba 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -68,26 +68,22 @@
68 * XFS_IOC_PATH_TO_HANDLE 68 * XFS_IOC_PATH_TO_HANDLE
69 * returns full handle for a path 69 * returns full handle for a path
70 */ 70 */
71STATIC int 71int
72xfs_find_handle( 72xfs_find_handle(
73 unsigned int cmd, 73 unsigned int cmd,
74 void __user *arg) 74 xfs_fsop_handlereq_t *hreq)
75{ 75{
76 int hsize; 76 int hsize;
77 xfs_handle_t handle; 77 xfs_handle_t handle;
78 xfs_fsop_handlereq_t hreq;
79 struct inode *inode; 78 struct inode *inode;
80 79
81 if (copy_from_user(&hreq, arg, sizeof(hreq)))
82 return -XFS_ERROR(EFAULT);
83
84 memset((char *)&handle, 0, sizeof(handle)); 80 memset((char *)&handle, 0, sizeof(handle));
85 81
86 switch (cmd) { 82 switch (cmd) {
87 case XFS_IOC_PATH_TO_FSHANDLE: 83 case XFS_IOC_PATH_TO_FSHANDLE:
88 case XFS_IOC_PATH_TO_HANDLE: { 84 case XFS_IOC_PATH_TO_HANDLE: {
89 struct path path; 85 struct path path;
90 int error = user_lpath((const char __user *)hreq.path, &path); 86 int error = user_lpath((const char __user *)hreq->path, &path);
91 if (error) 87 if (error)
92 return error; 88 return error;
93 89
@@ -101,7 +97,7 @@ xfs_find_handle(
101 case XFS_IOC_FD_TO_HANDLE: { 97 case XFS_IOC_FD_TO_HANDLE: {
102 struct file *file; 98 struct file *file;
103 99
104 file = fget(hreq.fd); 100 file = fget(hreq->fd);
105 if (!file) 101 if (!file)
106 return -EBADF; 102 return -EBADF;
107 103
@@ -158,8 +154,8 @@ xfs_find_handle(
158 } 154 }
159 155
160 /* now copy our handle into the user buffer & write out the size */ 156 /* now copy our handle into the user buffer & write out the size */
161 if (copy_to_user(hreq.ohandle, &handle, hsize) || 157 if (copy_to_user(hreq->ohandle, &handle, hsize) ||
162 copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) { 158 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) {
163 iput(inode); 159 iput(inode);
164 return -XFS_ERROR(EFAULT); 160 return -XFS_ERROR(EFAULT);
165 } 161 }
@@ -249,10 +245,10 @@ xfs_vget_fsop_handlereq(
249 return 0; 245 return 0;
250} 246}
251 247
252STATIC int 248int
253xfs_open_by_handle( 249xfs_open_by_handle(
254 xfs_mount_t *mp, 250 xfs_mount_t *mp,
255 void __user *arg, 251 xfs_fsop_handlereq_t *hreq,
256 struct file *parfilp, 252 struct file *parfilp,
257 struct inode *parinode) 253 struct inode *parinode)
258{ 254{
@@ -263,14 +259,11 @@ xfs_open_by_handle(
263 struct file *filp; 259 struct file *filp;
264 struct inode *inode; 260 struct inode *inode;
265 struct dentry *dentry; 261 struct dentry *dentry;
266 xfs_fsop_handlereq_t hreq;
267 262
268 if (!capable(CAP_SYS_ADMIN)) 263 if (!capable(CAP_SYS_ADMIN))
269 return -XFS_ERROR(EPERM); 264 return -XFS_ERROR(EPERM);
270 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
271 return -XFS_ERROR(EFAULT);
272 265
273 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); 266 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
274 if (error) 267 if (error)
275 return -error; 268 return -error;
276 269
@@ -281,10 +274,10 @@ xfs_open_by_handle(
281 } 274 }
282 275
283#if BITS_PER_LONG != 32 276#if BITS_PER_LONG != 32
284 hreq.oflags |= O_LARGEFILE; 277 hreq->oflags |= O_LARGEFILE;
285#endif 278#endif
286 /* Put open permission in namei format. */ 279 /* Put open permission in namei format. */
287 permflag = hreq.oflags; 280 permflag = hreq->oflags;
288 if ((permflag+1) & O_ACCMODE) 281 if ((permflag+1) & O_ACCMODE)
289 permflag++; 282 permflag++;
290 if (permflag & O_TRUNC) 283 if (permflag & O_TRUNC)
@@ -322,15 +315,16 @@ xfs_open_by_handle(
322 mntget(parfilp->f_path.mnt); 315 mntget(parfilp->f_path.mnt);
323 316
324 /* Create file pointer. */ 317 /* Create file pointer. */
325 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags, cred); 318 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
326 if (IS_ERR(filp)) { 319 if (IS_ERR(filp)) {
327 put_unused_fd(new_fd); 320 put_unused_fd(new_fd);
328 return -XFS_ERROR(-PTR_ERR(filp)); 321 return -XFS_ERROR(-PTR_ERR(filp));
329 } 322 }
323
330 if (inode->i_mode & S_IFREG) { 324 if (inode->i_mode & S_IFREG) {
331 /* invisible operation should not change atime */ 325 /* invisible operation should not change atime */
332 filp->f_flags |= O_NOATIME; 326 filp->f_flags |= O_NOATIME;
333 filp->f_op = &xfs_invis_file_operations; 327 filp->f_mode |= FMODE_NOCMTIME;
334 } 328 }
335 329
336 fd_install(new_fd, filp); 330 fd_install(new_fd, filp);
@@ -363,24 +357,21 @@ do_readlink(
363} 357}
364 358
365 359
366STATIC int 360int
367xfs_readlink_by_handle( 361xfs_readlink_by_handle(
368 xfs_mount_t *mp, 362 xfs_mount_t *mp,
369 void __user *arg, 363 xfs_fsop_handlereq_t *hreq,
370 struct inode *parinode) 364 struct inode *parinode)
371{ 365{
372 struct inode *inode; 366 struct inode *inode;
373 xfs_fsop_handlereq_t hreq;
374 __u32 olen; 367 __u32 olen;
375 void *link; 368 void *link;
376 int error; 369 int error;
377 370
378 if (!capable(CAP_SYS_ADMIN)) 371 if (!capable(CAP_SYS_ADMIN))
379 return -XFS_ERROR(EPERM); 372 return -XFS_ERROR(EPERM);
380 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
381 return -XFS_ERROR(EFAULT);
382 373
383 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); 374 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
384 if (error) 375 if (error)
385 return -error; 376 return -error;
386 377
@@ -390,7 +381,7 @@ xfs_readlink_by_handle(
390 goto out_iput; 381 goto out_iput;
391 } 382 }
392 383
393 if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { 384 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
394 error = -XFS_ERROR(EFAULT); 385 error = -XFS_ERROR(EFAULT);
395 goto out_iput; 386 goto out_iput;
396 } 387 }
@@ -402,7 +393,7 @@ xfs_readlink_by_handle(
402 error = -xfs_readlink(XFS_I(inode), link); 393 error = -xfs_readlink(XFS_I(inode), link);
403 if (error) 394 if (error)
404 goto out_kfree; 395 goto out_kfree;
405 error = do_readlink(hreq.ohandle, olen, link); 396 error = do_readlink(hreq->ohandle, olen, link);
406 if (error) 397 if (error)
407 goto out_kfree; 398 goto out_kfree;
408 399
@@ -501,7 +492,7 @@ xfs_attrlist_by_handle(
501 return -error; 492 return -error;
502} 493}
503 494
504STATIC int 495int
505xfs_attrmulti_attr_get( 496xfs_attrmulti_attr_get(
506 struct inode *inode, 497 struct inode *inode,
507 char *name, 498 char *name,
@@ -530,7 +521,7 @@ xfs_attrmulti_attr_get(
530 return error; 521 return error;
531} 522}
532 523
533STATIC int 524int
534xfs_attrmulti_attr_set( 525xfs_attrmulti_attr_set(
535 struct inode *inode, 526 struct inode *inode,
536 char *name, 527 char *name,
@@ -560,7 +551,7 @@ xfs_attrmulti_attr_set(
560 return error; 551 return error;
561} 552}
562 553
563STATIC int 554int
564xfs_attrmulti_attr_remove( 555xfs_attrmulti_attr_remove(
565 struct inode *inode, 556 struct inode *inode,
566 char *name, 557 char *name,
@@ -662,19 +653,26 @@ xfs_attrmulti_by_handle(
662 return -error; 653 return -error;
663} 654}
664 655
665STATIC int 656int
666xfs_ioc_space( 657xfs_ioc_space(
667 struct xfs_inode *ip, 658 struct xfs_inode *ip,
668 struct inode *inode, 659 struct inode *inode,
669 struct file *filp, 660 struct file *filp,
670 int ioflags, 661 int ioflags,
671 unsigned int cmd, 662 unsigned int cmd,
672 void __user *arg) 663 xfs_flock64_t *bf)
673{ 664{
674 xfs_flock64_t bf;
675 int attr_flags = 0; 665 int attr_flags = 0;
676 int error; 666 int error;
677 667
668 /*
669 * Only allow the sys admin to reserve space unless
670 * unwritten extents are enabled.
671 */
672 if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
673 !capable(CAP_SYS_ADMIN))
674 return -XFS_ERROR(EPERM);
675
678 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) 676 if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
679 return -XFS_ERROR(EPERM); 677 return -XFS_ERROR(EPERM);
680 678
@@ -684,16 +682,12 @@ xfs_ioc_space(
684 if (!S_ISREG(inode->i_mode)) 682 if (!S_ISREG(inode->i_mode))
685 return -XFS_ERROR(EINVAL); 683 return -XFS_ERROR(EINVAL);
686 684
687 if (copy_from_user(&bf, arg, sizeof(bf)))
688 return -XFS_ERROR(EFAULT);
689
690 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 685 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
691 attr_flags |= XFS_ATTR_NONBLOCK; 686 attr_flags |= XFS_ATTR_NONBLOCK;
692 if (ioflags & IO_INVIS) 687 if (ioflags & IO_INVIS)
693 attr_flags |= XFS_ATTR_DMI; 688 attr_flags |= XFS_ATTR_DMI;
694 689
695 error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, 690 error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
696 NULL, attr_flags);
697 return -error; 691 return -error;
698} 692}
699 693
@@ -1105,10 +1099,6 @@ xfs_ioctl_setattr(
1105 1099
1106 /* 1100 /*
1107 * Change file ownership. Must be the owner or privileged. 1101 * Change file ownership. Must be the owner or privileged.
1108 * If the system was configured with the "restricted_chown"
1109 * option, the owner is not permitted to give away the file,
1110 * and can change the group id only to a group of which he
1111 * or she is a member.
1112 */ 1102 */
1113 if (mask & FSX_PROJID) { 1103 if (mask & FSX_PROJID) {
1114 /* 1104 /*
@@ -1137,7 +1127,7 @@ xfs_ioctl_setattr(
1137 * the superblock version number since projids didn't 1127 * the superblock version number since projids didn't
1138 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. 1128 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
1139 */ 1129 */
1140 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) 1130 if (ip->i_d.di_version == 1)
1141 xfs_bump_ino_vers2(tp, ip); 1131 xfs_bump_ino_vers2(tp, ip);
1142 } 1132 }
1143 1133
@@ -1256,43 +1246,67 @@ xfs_ioc_setxflags(
1256} 1246}
1257 1247
1258STATIC int 1248STATIC int
1249xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1250{
1251 struct getbmap __user *base = *ap;
1252
1253 /* copy only getbmap portion (not getbmapx) */
1254 if (copy_to_user(base, bmv, sizeof(struct getbmap)))
1255 return XFS_ERROR(EFAULT);
1256
1257 *ap += sizeof(struct getbmap);
1258 return 0;
1259}
1260
1261STATIC int
1259xfs_ioc_getbmap( 1262xfs_ioc_getbmap(
1260 struct xfs_inode *ip, 1263 struct xfs_inode *ip,
1261 int ioflags, 1264 int ioflags,
1262 unsigned int cmd, 1265 unsigned int cmd,
1263 void __user *arg) 1266 void __user *arg)
1264{ 1267{
1265 struct getbmap bm; 1268 struct getbmapx bmx;
1266 int iflags;
1267 int error; 1269 int error;
1268 1270
1269 if (copy_from_user(&bm, arg, sizeof(bm))) 1271 if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
1270 return -XFS_ERROR(EFAULT); 1272 return -XFS_ERROR(EFAULT);
1271 1273
1272 if (bm.bmv_count < 2) 1274 if (bmx.bmv_count < 2)
1273 return -XFS_ERROR(EINVAL); 1275 return -XFS_ERROR(EINVAL);
1274 1276
1275 iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); 1277 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1276 if (ioflags & IO_INVIS) 1278 if (ioflags & IO_INVIS)
1277 iflags |= BMV_IF_NO_DMAPI_READ; 1279 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1278 1280
1279 error = xfs_getbmap(ip, &bm, (struct getbmap __user *)arg+1, iflags); 1281 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
1282 (struct getbmap *)arg+1);
1280 if (error) 1283 if (error)
1281 return -error; 1284 return -error;
1282 1285
1283 if (copy_to_user(arg, &bm, sizeof(bm))) 1286 /* copy back header - only size of getbmap */
1287 if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
1284 return -XFS_ERROR(EFAULT); 1288 return -XFS_ERROR(EFAULT);
1285 return 0; 1289 return 0;
1286} 1290}
1287 1291
1288STATIC int 1292STATIC int
1293xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
1294{
1295 struct getbmapx __user *base = *ap;
1296
1297 if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
1298 return XFS_ERROR(EFAULT);
1299
1300 *ap += sizeof(struct getbmapx);
1301 return 0;
1302}
1303
1304STATIC int
1289xfs_ioc_getbmapx( 1305xfs_ioc_getbmapx(
1290 struct xfs_inode *ip, 1306 struct xfs_inode *ip,
1291 void __user *arg) 1307 void __user *arg)
1292{ 1308{
1293 struct getbmapx bmx; 1309 struct getbmapx bmx;
1294 struct getbmap bm;
1295 int iflags;
1296 int error; 1310 int error;
1297 1311
1298 if (copy_from_user(&bmx, arg, sizeof(bmx))) 1312 if (copy_from_user(&bmx, arg, sizeof(bmx)))
@@ -1301,46 +1315,46 @@ xfs_ioc_getbmapx(
1301 if (bmx.bmv_count < 2) 1315 if (bmx.bmv_count < 2)
1302 return -XFS_ERROR(EINVAL); 1316 return -XFS_ERROR(EINVAL);
1303 1317
1304 /* 1318 if (bmx.bmv_iflags & (~BMV_IF_VALID))
1305 * Map input getbmapx structure to a getbmap
1306 * structure for xfs_getbmap.
1307 */
1308 GETBMAP_CONVERT(bmx, bm);
1309
1310 iflags = bmx.bmv_iflags;
1311
1312 if (iflags & (~BMV_IF_VALID))
1313 return -XFS_ERROR(EINVAL); 1319 return -XFS_ERROR(EINVAL);
1314 1320
1315 iflags |= BMV_IF_EXTENDED; 1321 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
1316 1322 (struct getbmapx *)arg+1);
1317 error = xfs_getbmap(ip, &bm, (struct getbmapx __user *)arg+1, iflags);
1318 if (error) 1323 if (error)
1319 return -error; 1324 return -error;
1320 1325
1321 GETBMAP_CONVERT(bm, bmx); 1326 /* copy back header */
1322 1327 if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
1323 if (copy_to_user(arg, &bmx, sizeof(bmx)))
1324 return -XFS_ERROR(EFAULT); 1328 return -XFS_ERROR(EFAULT);
1325 1329
1326 return 0; 1330 return 0;
1327} 1331}
1328 1332
1329int 1333/*
1330xfs_ioctl( 1334 * Note: some of the ioctl's return positive numbers as a
1331 xfs_inode_t *ip, 1335 * byte count indicating success, such as readlink_by_handle.
1336 * So we don't "sign flip" like most other routines. This means
1337 * true errors need to be returned as a negative value.
1338 */
1339long
1340xfs_file_ioctl(
1332 struct file *filp, 1341 struct file *filp,
1333 int ioflags,
1334 unsigned int cmd, 1342 unsigned int cmd,
1335 void __user *arg) 1343 unsigned long p)
1336{ 1344{
1337 struct inode *inode = filp->f_path.dentry->d_inode; 1345 struct inode *inode = filp->f_path.dentry->d_inode;
1338 xfs_mount_t *mp = ip->i_mount; 1346 struct xfs_inode *ip = XFS_I(inode);
1347 struct xfs_mount *mp = ip->i_mount;
1348 void __user *arg = (void __user *)p;
1349 int ioflags = 0;
1339 int error; 1350 int error;
1340 1351
1341 xfs_itrace_entry(XFS_I(inode)); 1352 if (filp->f_mode & FMODE_NOCMTIME)
1342 switch (cmd) { 1353 ioflags |= IO_INVIS;
1343 1354
1355 xfs_itrace_entry(ip);
1356
1357 switch (cmd) {
1344 case XFS_IOC_ALLOCSP: 1358 case XFS_IOC_ALLOCSP:
1345 case XFS_IOC_FREESP: 1359 case XFS_IOC_FREESP:
1346 case XFS_IOC_RESVSP: 1360 case XFS_IOC_RESVSP:
@@ -1348,17 +1362,13 @@ xfs_ioctl(
1348 case XFS_IOC_ALLOCSP64: 1362 case XFS_IOC_ALLOCSP64:
1349 case XFS_IOC_FREESP64: 1363 case XFS_IOC_FREESP64:
1350 case XFS_IOC_RESVSP64: 1364 case XFS_IOC_RESVSP64:
1351 case XFS_IOC_UNRESVSP64: 1365 case XFS_IOC_UNRESVSP64: {
1352 /* 1366 xfs_flock64_t bf;
1353 * Only allow the sys admin to reserve space unless
1354 * unwritten extents are enabled.
1355 */
1356 if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
1357 !capable(CAP_SYS_ADMIN))
1358 return -EPERM;
1359
1360 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
1361 1367
1368 if (copy_from_user(&bf, arg, sizeof(bf)))
1369 return -XFS_ERROR(EFAULT);
1370 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
1371 }
1362 case XFS_IOC_DIOINFO: { 1372 case XFS_IOC_DIOINFO: {
1363 struct dioattr da; 1373 struct dioattr da;
1364 xfs_buftarg_t *target = 1374 xfs_buftarg_t *target =
@@ -1418,18 +1428,30 @@ xfs_ioctl(
1418 1428
1419 case XFS_IOC_FD_TO_HANDLE: 1429 case XFS_IOC_FD_TO_HANDLE:
1420 case XFS_IOC_PATH_TO_HANDLE: 1430 case XFS_IOC_PATH_TO_HANDLE:
1421 case XFS_IOC_PATH_TO_FSHANDLE: 1431 case XFS_IOC_PATH_TO_FSHANDLE: {
1422 return xfs_find_handle(cmd, arg); 1432 xfs_fsop_handlereq_t hreq;
1423 1433
1424 case XFS_IOC_OPEN_BY_HANDLE: 1434 if (copy_from_user(&hreq, arg, sizeof(hreq)))
1425 return xfs_open_by_handle(mp, arg, filp, inode); 1435 return -XFS_ERROR(EFAULT);
1436 return xfs_find_handle(cmd, &hreq);
1437 }
1438 case XFS_IOC_OPEN_BY_HANDLE: {
1439 xfs_fsop_handlereq_t hreq;
1426 1440
1441 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1442 return -XFS_ERROR(EFAULT);
1443 return xfs_open_by_handle(mp, &hreq, filp, inode);
1444 }
1427 case XFS_IOC_FSSETDM_BY_HANDLE: 1445 case XFS_IOC_FSSETDM_BY_HANDLE:
1428 return xfs_fssetdm_by_handle(mp, arg, inode); 1446 return xfs_fssetdm_by_handle(mp, arg, inode);
1429 1447
1430 case XFS_IOC_READLINK_BY_HANDLE: 1448 case XFS_IOC_READLINK_BY_HANDLE: {
1431 return xfs_readlink_by_handle(mp, arg, inode); 1449 xfs_fsop_handlereq_t hreq;
1432 1450
1451 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1452 return -XFS_ERROR(EFAULT);
1453 return xfs_readlink_by_handle(mp, &hreq, inode);
1454 }
1433 case XFS_IOC_ATTRLIST_BY_HANDLE: 1455 case XFS_IOC_ATTRLIST_BY_HANDLE:
1434 return xfs_attrlist_by_handle(mp, arg, inode); 1456 return xfs_attrlist_by_handle(mp, arg, inode);
1435 1457
@@ -1437,7 +1459,11 @@ xfs_ioctl(
1437 return xfs_attrmulti_by_handle(mp, arg, filp, inode); 1459 return xfs_attrmulti_by_handle(mp, arg, filp, inode);
1438 1460
1439 case XFS_IOC_SWAPEXT: { 1461 case XFS_IOC_SWAPEXT: {
1440 error = xfs_swapext((struct xfs_swapext __user *)arg); 1462 struct xfs_swapext sxp;
1463
1464 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
1465 return -XFS_ERROR(EFAULT);
1466 error = xfs_swapext(&sxp);
1441 return -error; 1467 return -error;
1442 } 1468 }
1443 1469
@@ -1493,9 +1519,6 @@ xfs_ioctl(
1493 case XFS_IOC_FSGROWFSDATA: { 1519 case XFS_IOC_FSGROWFSDATA: {
1494 xfs_growfs_data_t in; 1520 xfs_growfs_data_t in;
1495 1521
1496 if (!capable(CAP_SYS_ADMIN))
1497 return -EPERM;
1498
1499 if (copy_from_user(&in, arg, sizeof(in))) 1522 if (copy_from_user(&in, arg, sizeof(in)))
1500 return -XFS_ERROR(EFAULT); 1523 return -XFS_ERROR(EFAULT);
1501 1524
@@ -1506,9 +1529,6 @@ xfs_ioctl(
1506 case XFS_IOC_FSGROWFSLOG: { 1529 case XFS_IOC_FSGROWFSLOG: {
1507 xfs_growfs_log_t in; 1530 xfs_growfs_log_t in;
1508 1531
1509 if (!capable(CAP_SYS_ADMIN))
1510 return -EPERM;
1511
1512 if (copy_from_user(&in, arg, sizeof(in))) 1532 if (copy_from_user(&in, arg, sizeof(in)))
1513 return -XFS_ERROR(EFAULT); 1533 return -XFS_ERROR(EFAULT);
1514 1534
@@ -1519,9 +1539,6 @@ xfs_ioctl(
1519 case XFS_IOC_FSGROWFSRT: { 1539 case XFS_IOC_FSGROWFSRT: {
1520 xfs_growfs_rt_t in; 1540 xfs_growfs_rt_t in;
1521 1541
1522 if (!capable(CAP_SYS_ADMIN))
1523 return -EPERM;
1524
1525 if (copy_from_user(&in, arg, sizeof(in))) 1542 if (copy_from_user(&in, arg, sizeof(in)))
1526 return -XFS_ERROR(EFAULT); 1543 return -XFS_ERROR(EFAULT);
1527 1544
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
new file mode 100644
index 000000000000..8c16bf2d7e03
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -0,0 +1,82 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_IOCTL_H__
19#define __XFS_IOCTL_H__
20
21extern int
22xfs_ioc_space(
23 struct xfs_inode *ip,
24 struct inode *inode,
25 struct file *filp,
26 int ioflags,
27 unsigned int cmd,
28 xfs_flock64_t *bf);
29
30extern int
31xfs_find_handle(
32 unsigned int cmd,
33 xfs_fsop_handlereq_t *hreq);
34
35extern int
36xfs_open_by_handle(
37 xfs_mount_t *mp,
38 xfs_fsop_handlereq_t *hreq,
39 struct file *parfilp,
40 struct inode *parinode);
41
42extern int
43xfs_readlink_by_handle(
44 xfs_mount_t *mp,
45 xfs_fsop_handlereq_t *hreq,
46 struct inode *parinode);
47
48extern int
49xfs_attrmulti_attr_get(
50 struct inode *inode,
51 char *name,
52 char __user *ubuf,
53 __uint32_t *len,
54 __uint32_t flags);
55
56extern int
57 xfs_attrmulti_attr_set(
58 struct inode *inode,
59 char *name,
60 const char __user *ubuf,
61 __uint32_t len,
62 __uint32_t flags);
63
64extern int
65xfs_attrmulti_attr_remove(
66 struct inode *inode,
67 char *name,
68 __uint32_t flags);
69
70extern long
71xfs_file_ioctl(
72 struct file *filp,
73 unsigned int cmd,
74 unsigned long p);
75
76extern long
77xfs_file_compat_ioctl(
78 struct file *file,
79 unsigned int cmd,
80 unsigned long arg);
81
82#endif
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index a4b254eb43b2..0504cece9f66 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -16,11 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/init.h>
20#include <linux/ioctl.h> 19#include <linux/ioctl.h>
21#include <linux/syscalls.h>
22#include <linux/types.h>
23#include <linux/fs.h>
24#include <asm/uaccess.h> 20#include <asm/uaccess.h>
25#include "xfs.h" 21#include "xfs.h"
26#include "xfs_fs.h" 22#include "xfs_fs.h"
@@ -36,7 +32,6 @@
36#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
37#include "xfs_attr_sf.h" 33#include "xfs_attr_sf.h"
38#include "xfs_dir2_sf.h" 34#include "xfs_dir2_sf.h"
39#include "xfs_vfs.h"
40#include "xfs_vnode.h" 35#include "xfs_vnode.h"
41#include "xfs_dinode.h" 36#include "xfs_dinode.h"
42#include "xfs_inode.h" 37#include "xfs_inode.h"
@@ -44,221 +39,219 @@
44#include "xfs_error.h" 39#include "xfs_error.h"
45#include "xfs_dfrag.h" 40#include "xfs_dfrag.h"
46#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_fsops.h"
43#include "xfs_alloc.h"
44#include "xfs_rtalloc.h"
45#include "xfs_attr.h"
46#include "xfs_ioctl.h"
47#include "xfs_ioctl32.h" 47#include "xfs_ioctl32.h"
48 48
49#define _NATIVE_IOC(cmd, type) \ 49#define _NATIVE_IOC(cmd, type) \
50 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) 50 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
51 51
52#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 52#ifdef BROKEN_X86_ALIGNMENT
53#define BROKEN_X86_ALIGNMENT 53STATIC int
54#define _PACKED __attribute__((packed)) 54xfs_compat_flock64_copyin(
55/* on ia32 l_start is on a 32-bit boundary */ 55 xfs_flock64_t *bf,
56typedef struct xfs_flock64_32 { 56 compat_xfs_flock64_t __user *arg32)
57 __s16 l_type;
58 __s16 l_whence;
59 __s64 l_start __attribute__((packed));
60 /* len == 0 means until end of file */
61 __s64 l_len __attribute__((packed));
62 __s32 l_sysid;
63 __u32 l_pid;
64 __s32 l_pad[4]; /* reserve area */
65} xfs_flock64_32_t;
66
67#define XFS_IOC_ALLOCSP_32 _IOW ('X', 10, struct xfs_flock64_32)
68#define XFS_IOC_FREESP_32 _IOW ('X', 11, struct xfs_flock64_32)
69#define XFS_IOC_ALLOCSP64_32 _IOW ('X', 36, struct xfs_flock64_32)
70#define XFS_IOC_FREESP64_32 _IOW ('X', 37, struct xfs_flock64_32)
71#define XFS_IOC_RESVSP_32 _IOW ('X', 40, struct xfs_flock64_32)
72#define XFS_IOC_UNRESVSP_32 _IOW ('X', 41, struct xfs_flock64_32)
73#define XFS_IOC_RESVSP64_32 _IOW ('X', 42, struct xfs_flock64_32)
74#define XFS_IOC_UNRESVSP64_32 _IOW ('X', 43, struct xfs_flock64_32)
75
76/* just account for different alignment */
77STATIC unsigned long
78xfs_ioctl32_flock(
79 unsigned long arg)
80{ 57{
81 xfs_flock64_32_t __user *p32 = (void __user *)arg; 58 if (get_user(bf->l_type, &arg32->l_type) ||
82 xfs_flock64_t __user *p = compat_alloc_user_space(sizeof(*p)); 59 get_user(bf->l_whence, &arg32->l_whence) ||
83 60 get_user(bf->l_start, &arg32->l_start) ||
84 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || 61 get_user(bf->l_len, &arg32->l_len) ||
85 copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || 62 get_user(bf->l_sysid, &arg32->l_sysid) ||
86 copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || 63 get_user(bf->l_pid, &arg32->l_pid) ||
87 copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || 64 copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
88 copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || 65 return -XFS_ERROR(EFAULT);
89 copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || 66 return 0;
90 copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
91 return -EFAULT;
92
93 return (unsigned long)p;
94} 67}
95 68
96typedef struct compat_xfs_fsop_geom_v1 { 69STATIC int
97 __u32 blocksize; /* filesystem (data) block size */ 70xfs_compat_ioc_fsgeometry_v1(
98 __u32 rtextsize; /* realtime extent size */ 71 struct xfs_mount *mp,
99 __u32 agblocks; /* fsblocks in an AG */ 72 compat_xfs_fsop_geom_v1_t __user *arg32)
100 __u32 agcount; /* number of allocation groups */
101 __u32 logblocks; /* fsblocks in the log */
102 __u32 sectsize; /* (data) sector size, bytes */
103 __u32 inodesize; /* inode size in bytes */
104 __u32 imaxpct; /* max allowed inode space(%) */
105 __u64 datablocks; /* fsblocks in data subvolume */
106 __u64 rtblocks; /* fsblocks in realtime subvol */
107 __u64 rtextents; /* rt extents in realtime subvol*/
108 __u64 logstart; /* starting fsblock of the log */
109 unsigned char uuid[16]; /* unique id of the filesystem */
110 __u32 sunit; /* stripe unit, fsblocks */
111 __u32 swidth; /* stripe width, fsblocks */
112 __s32 version; /* structure version */
113 __u32 flags; /* superblock version flags */
114 __u32 logsectsize; /* log sector size, bytes */
115 __u32 rtsectsize; /* realtime sector size, bytes */
116 __u32 dirblocksize; /* directory block size, bytes */
117} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
118
119#define XFS_IOC_FSGEOMETRY_V1_32 \
120 _IOR ('X', 100, struct compat_xfs_fsop_geom_v1)
121
122STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg)
123{ 73{
124 compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg; 74 xfs_fsop_geom_t fsgeo;
125 xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p)); 75 int error;
126 76
127 if (copy_in_user(p, p32, sizeof(*p32))) 77 error = xfs_fs_geometry(mp, &fsgeo, 3);
128 return -EFAULT; 78 if (error)
129 return (unsigned long)p; 79 return -error;
80 /* The 32-bit variant simply has some padding at the end */
81 if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
82 return -XFS_ERROR(EFAULT);
83 return 0;
130} 84}
131 85
132typedef struct compat_xfs_inogrp { 86STATIC int
133 __u64 xi_startino; /* starting inode number */ 87xfs_compat_growfs_data_copyin(
134 __s32 xi_alloccount; /* # bits set in allocmask */ 88 struct xfs_growfs_data *in,
135 __u64 xi_allocmask; /* mask of allocated inodes */ 89 compat_xfs_growfs_data_t __user *arg32)
136} __attribute__((packed)) compat_xfs_inogrp_t;
137
138STATIC int xfs_inumbers_fmt_compat(
139 void __user *ubuffer,
140 const xfs_inogrp_t *buffer,
141 long count,
142 long *written)
143{ 90{
144 compat_xfs_inogrp_t __user *p32 = ubuffer; 91 if (get_user(in->newblocks, &arg32->newblocks) ||
145 long i; 92 get_user(in->imaxpct, &arg32->imaxpct))
93 return -XFS_ERROR(EFAULT);
94 return 0;
95}
96
97STATIC int
98xfs_compat_growfs_rt_copyin(
99 struct xfs_growfs_rt *in,
100 compat_xfs_growfs_rt_t __user *arg32)
101{
102 if (get_user(in->newblocks, &arg32->newblocks) ||
103 get_user(in->extsize, &arg32->extsize))
104 return -XFS_ERROR(EFAULT);
105 return 0;
106}
107
108STATIC int
109xfs_inumbers_fmt_compat(
110 void __user *ubuffer,
111 const xfs_inogrp_t *buffer,
112 long count,
113 long *written)
114{
115 compat_xfs_inogrp_t __user *p32 = ubuffer;
116 long i;
146 117
147 for (i = 0; i < count; i++) { 118 for (i = 0; i < count; i++) {
148 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || 119 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
149 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || 120 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
150 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) 121 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
151 return -EFAULT; 122 return -XFS_ERROR(EFAULT);
152 } 123 }
153 *written = count * sizeof(*p32); 124 *written = count * sizeof(*p32);
154 return 0; 125 return 0;
155} 126}
156 127
157#else 128#else
158
159#define xfs_inumbers_fmt_compat xfs_inumbers_fmt 129#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
160#define _PACKED 130#endif /* BROKEN_X86_ALIGNMENT */
161 131
162#endif 132STATIC int
133xfs_ioctl32_bstime_copyin(
134 xfs_bstime_t *bstime,
135 compat_xfs_bstime_t __user *bstime32)
136{
137 compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */
163 138
164/* XFS_IOC_FSBULKSTAT and friends */ 139 if (get_user(sec32, &bstime32->tv_sec) ||
140 get_user(bstime->tv_nsec, &bstime32->tv_nsec))
141 return -XFS_ERROR(EFAULT);
142 bstime->tv_sec = sec32;
143 return 0;
144}
145
146/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
147STATIC int
148xfs_ioctl32_bstat_copyin(
149 xfs_bstat_t *bstat,
150 compat_xfs_bstat_t __user *bstat32)
151{
152 if (get_user(bstat->bs_ino, &bstat32->bs_ino) ||
153 get_user(bstat->bs_mode, &bstat32->bs_mode) ||
154 get_user(bstat->bs_nlink, &bstat32->bs_nlink) ||
155 get_user(bstat->bs_uid, &bstat32->bs_uid) ||
156 get_user(bstat->bs_gid, &bstat32->bs_gid) ||
157 get_user(bstat->bs_rdev, &bstat32->bs_rdev) ||
158 get_user(bstat->bs_blksize, &bstat32->bs_blksize) ||
159 get_user(bstat->bs_size, &bstat32->bs_size) ||
160 xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
161 xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
162 xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
163 get_user(bstat->bs_blocks, &bstat32->bs_size) ||
164 get_user(bstat->bs_xflags, &bstat32->bs_size) ||
165 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
166 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
167 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
168 get_user(bstat->bs_projid, &bstat32->bs_projid) ||
169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
172 return -XFS_ERROR(EFAULT);
173 return 0;
174}
165 175
166typedef struct compat_xfs_bstime { 176/* XFS_IOC_FSBULKSTAT and friends */
167 __s32 tv_sec; /* seconds */
168 __s32 tv_nsec; /* and nanoseconds */
169} compat_xfs_bstime_t;
170 177
171STATIC int xfs_bstime_store_compat( 178STATIC int
172 compat_xfs_bstime_t __user *p32, 179xfs_bstime_store_compat(
173 const xfs_bstime_t *p) 180 compat_xfs_bstime_t __user *p32,
181 const xfs_bstime_t *p)
174{ 182{
175 __s32 sec32; 183 __s32 sec32;
176 184
177 sec32 = p->tv_sec; 185 sec32 = p->tv_sec;
178 if (put_user(sec32, &p32->tv_sec) || 186 if (put_user(sec32, &p32->tv_sec) ||
179 put_user(p->tv_nsec, &p32->tv_nsec)) 187 put_user(p->tv_nsec, &p32->tv_nsec))
180 return -EFAULT; 188 return -XFS_ERROR(EFAULT);
181 return 0; 189 return 0;
182} 190}
183 191
184typedef struct compat_xfs_bstat { 192/* Return 0 on success or positive error (to xfs_bulkstat()) */
185 __u64 bs_ino; /* inode number */ 193STATIC int
186 __u16 bs_mode; /* type and mode */ 194xfs_bulkstat_one_fmt_compat(
187 __u16 bs_nlink; /* number of links */
188 __u32 bs_uid; /* user id */
189 __u32 bs_gid; /* group id */
190 __u32 bs_rdev; /* device value */
191 __s32 bs_blksize; /* block size */
192 __s64 bs_size; /* file size */
193 compat_xfs_bstime_t bs_atime; /* access time */
194 compat_xfs_bstime_t bs_mtime; /* modify time */
195 compat_xfs_bstime_t bs_ctime; /* inode change time */
196 int64_t bs_blocks; /* number of blocks */
197 __u32 bs_xflags; /* extended flags */
198 __s32 bs_extsize; /* extent size */
199 __s32 bs_extents; /* number of extents */
200 __u32 bs_gen; /* generation count */
201 __u16 bs_projid; /* project id */
202 unsigned char bs_pad[14]; /* pad space, unused */
203 __u32 bs_dmevmask; /* DMIG event mask */
204 __u16 bs_dmstate; /* DMIG state info */
205 __u16 bs_aextents; /* attribute number of extents */
206} _PACKED compat_xfs_bstat_t;
207
208STATIC int xfs_bulkstat_one_fmt_compat(
209 void __user *ubuffer, 195 void __user *ubuffer,
196 int ubsize,
197 int *ubused,
210 const xfs_bstat_t *buffer) 198 const xfs_bstat_t *buffer)
211{ 199{
212 compat_xfs_bstat_t __user *p32 = ubuffer; 200 compat_xfs_bstat_t __user *p32 = ubuffer;
213 201
214 if (put_user(buffer->bs_ino, &p32->bs_ino) || 202 if (ubsize < sizeof(*p32))
215 put_user(buffer->bs_mode, &p32->bs_mode) || 203 return XFS_ERROR(ENOMEM);
216 put_user(buffer->bs_nlink, &p32->bs_nlink) || 204
217 put_user(buffer->bs_uid, &p32->bs_uid) || 205 if (put_user(buffer->bs_ino, &p32->bs_ino) ||
218 put_user(buffer->bs_gid, &p32->bs_gid) || 206 put_user(buffer->bs_mode, &p32->bs_mode) ||
219 put_user(buffer->bs_rdev, &p32->bs_rdev) || 207 put_user(buffer->bs_nlink, &p32->bs_nlink) ||
220 put_user(buffer->bs_blksize, &p32->bs_blksize) || 208 put_user(buffer->bs_uid, &p32->bs_uid) ||
221 put_user(buffer->bs_size, &p32->bs_size) || 209 put_user(buffer->bs_gid, &p32->bs_gid) ||
210 put_user(buffer->bs_rdev, &p32->bs_rdev) ||
211 put_user(buffer->bs_blksize, &p32->bs_blksize) ||
212 put_user(buffer->bs_size, &p32->bs_size) ||
222 xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || 213 xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
223 xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || 214 xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
224 xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || 215 xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
225 put_user(buffer->bs_blocks, &p32->bs_blocks) || 216 put_user(buffer->bs_blocks, &p32->bs_blocks) ||
226 put_user(buffer->bs_xflags, &p32->bs_xflags) || 217 put_user(buffer->bs_xflags, &p32->bs_xflags) ||
227 put_user(buffer->bs_extsize, &p32->bs_extsize) || 218 put_user(buffer->bs_extsize, &p32->bs_extsize) ||
228 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
229 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
230 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
231 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 222 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
232 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 223 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
233 put_user(buffer->bs_aextents, &p32->bs_aextents)) 224 put_user(buffer->bs_aextents, &p32->bs_aextents))
234 return -EFAULT; 225 return XFS_ERROR(EFAULT);
235 return sizeof(*p32); 226 if (ubused)
227 *ubused = sizeof(*p32);
228 return 0;
236} 229}
237 230
238 231STATIC int
239 232xfs_bulkstat_one_compat(
240typedef struct compat_xfs_fsop_bulkreq { 233 xfs_mount_t *mp, /* mount point for filesystem */
241 compat_uptr_t lastip; /* last inode # pointer */ 234 xfs_ino_t ino, /* inode number to get data for */
242 __s32 icount; /* count of entries in buffer */ 235 void __user *buffer, /* buffer to place output in */
243 compat_uptr_t ubuffer; /* user buffer for inode desc. */ 236 int ubsize, /* size of buffer */
244 compat_uptr_t ocount; /* output count pointer */ 237 void *private_data, /* my private data */
245} compat_xfs_fsop_bulkreq_t; 238 xfs_daddr_t bno, /* starting bno of inode cluster */
246 239 int *ubused, /* bytes used by me */
247#define XFS_IOC_FSBULKSTAT_32 \ 240 void *dibuff, /* on-disk inode buffer */
248 _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) 241 int *stat) /* BULKSTAT_RV_... */
249#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ 242{
250 _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) 243 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
251#define XFS_IOC_FSINUMBERS_32 \ 244 xfs_bulkstat_one_fmt_compat, bno,
252 _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) 245 ubused, dibuff, stat);
246}
253 247
254/* copied from xfs_ioctl.c */ 248/* copied from xfs_ioctl.c */
255STATIC int 249STATIC int
256xfs_ioc_bulkstat_compat( 250xfs_compat_ioc_bulkstat(
257 xfs_mount_t *mp, 251 xfs_mount_t *mp,
258 unsigned int cmd, 252 unsigned int cmd,
259 void __user *arg) 253 compat_xfs_fsop_bulkreq_t __user *p32)
260{ 254{
261 compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg;
262 u32 addr; 255 u32 addr;
263 xfs_fsop_bulkreq_t bulkreq; 256 xfs_fsop_bulkreq_t bulkreq;
264 int count; /* # of records returned */ 257 int count; /* # of records returned */
@@ -270,20 +263,20 @@ xfs_ioc_bulkstat_compat(
270 /* should be called again (unused here, but used in dmapi) */ 263 /* should be called again (unused here, but used in dmapi) */
271 264
272 if (!capable(CAP_SYS_ADMIN)) 265 if (!capable(CAP_SYS_ADMIN))
273 return -EPERM; 266 return -XFS_ERROR(EPERM);
274 267
275 if (XFS_FORCED_SHUTDOWN(mp)) 268 if (XFS_FORCED_SHUTDOWN(mp))
276 return -XFS_ERROR(EIO); 269 return -XFS_ERROR(EIO);
277 270
278 if (get_user(addr, &p32->lastip)) 271 if (get_user(addr, &p32->lastip))
279 return -EFAULT; 272 return -XFS_ERROR(EFAULT);
280 bulkreq.lastip = compat_ptr(addr); 273 bulkreq.lastip = compat_ptr(addr);
281 if (get_user(bulkreq.icount, &p32->icount) || 274 if (get_user(bulkreq.icount, &p32->icount) ||
282 get_user(addr, &p32->ubuffer)) 275 get_user(addr, &p32->ubuffer))
283 return -EFAULT; 276 return -XFS_ERROR(EFAULT);
284 bulkreq.ubuffer = compat_ptr(addr); 277 bulkreq.ubuffer = compat_ptr(addr);
285 if (get_user(addr, &p32->ocount)) 278 if (get_user(addr, &p32->ocount))
286 return -EFAULT; 279 return -XFS_ERROR(EFAULT);
287 bulkreq.ocount = compat_ptr(addr); 280 bulkreq.ocount = compat_ptr(addr);
288 281
289 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) 282 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
@@ -295,17 +288,22 @@ xfs_ioc_bulkstat_compat(
295 if (bulkreq.ubuffer == NULL) 288 if (bulkreq.ubuffer == NULL)
296 return -XFS_ERROR(EINVAL); 289 return -XFS_ERROR(EINVAL);
297 290
298 if (cmd == XFS_IOC_FSINUMBERS) 291 if (cmd == XFS_IOC_FSINUMBERS_32) {
299 error = xfs_inumbers(mp, &inlast, &count, 292 error = xfs_inumbers(mp, &inlast, &count,
300 bulkreq.ubuffer, xfs_inumbers_fmt_compat); 293 bulkreq.ubuffer, xfs_inumbers_fmt_compat);
301 else { 294 } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
302 /* declare a var to get a warning in case the type changes */ 295 int res;
303 bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat; 296
297 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
298 sizeof(compat_xfs_bstat_t),
299 NULL, 0, NULL, NULL, &res);
300 } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
304 error = xfs_bulkstat(mp, &inlast, &count, 301 error = xfs_bulkstat(mp, &inlast, &count,
305 xfs_bulkstat_one, formatter, 302 xfs_bulkstat_one_compat, NULL,
306 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, 303 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer,
307 BULKSTAT_FG_QUICK, &done); 304 BULKSTAT_FG_QUICK, &done);
308 } 305 } else
306 error = XFS_ERROR(EINVAL);
309 if (error) 307 if (error)
310 return -error; 308 return -error;
311 309
@@ -321,63 +319,306 @@ xfs_ioc_bulkstat_compat(
321 return 0; 319 return 0;
322} 320}
323 321
322STATIC int
323xfs_compat_handlereq_copyin(
324 xfs_fsop_handlereq_t *hreq,
325 compat_xfs_fsop_handlereq_t __user *arg32)
326{
327 compat_xfs_fsop_handlereq_t hreq32;
328
329 if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
330 return -XFS_ERROR(EFAULT);
331
332 hreq->fd = hreq32.fd;
333 hreq->path = compat_ptr(hreq32.path);
334 hreq->oflags = hreq32.oflags;
335 hreq->ihandle = compat_ptr(hreq32.ihandle);
336 hreq->ihandlen = hreq32.ihandlen;
337 hreq->ohandle = compat_ptr(hreq32.ohandle);
338 hreq->ohandlen = compat_ptr(hreq32.ohandlen);
324 339
340 return 0;
341}
325 342
326typedef struct compat_xfs_fsop_handlereq { 343/*
327 __u32 fd; /* fd for FD_TO_HANDLE */ 344 * Convert userspace handle data into inode.
328 compat_uptr_t path; /* user pathname */ 345 *
329 __u32 oflags; /* open flags */ 346 * We use the fact that all the fsop_handlereq ioctl calls have a data
330 compat_uptr_t ihandle; /* user supplied handle */ 347 * structure argument whose first component is always a xfs_fsop_handlereq_t,
331 __u32 ihandlen; /* user supplied length */ 348 * so we can pass that sub structure into this handy, shared routine.
332 compat_uptr_t ohandle; /* user buffer for handle */ 349 *
333 compat_uptr_t ohandlen; /* user buffer length */ 350 * If no error, caller must always iput the returned inode.
334} compat_xfs_fsop_handlereq_t; 351 */
335 352STATIC int
336#define XFS_IOC_PATH_TO_FSHANDLE_32 \ 353xfs_vget_fsop_handlereq_compat(
337 _IOWR('X', 104, struct compat_xfs_fsop_handlereq) 354 xfs_mount_t *mp,
338#define XFS_IOC_PATH_TO_HANDLE_32 \ 355 struct inode *parinode, /* parent inode pointer */
339 _IOWR('X', 105, struct compat_xfs_fsop_handlereq) 356 compat_xfs_fsop_handlereq_t *hreq,
340#define XFS_IOC_FD_TO_HANDLE_32 \ 357 struct inode **inode)
341 _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
342#define XFS_IOC_OPEN_BY_HANDLE_32 \
343 _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
344#define XFS_IOC_READLINK_BY_HANDLE_32 \
345 _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
346
347STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg)
348{ 358{
349 compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg; 359 void __user *hanp;
350 xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p)); 360 size_t hlen;
351 u32 addr; 361 xfs_fid_t *xfid;
352 362 xfs_handle_t *handlep;
353 if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) || 363 xfs_handle_t handle;
354 get_user(addr, &p32->path) || 364 xfs_inode_t *ip;
355 put_user(compat_ptr(addr), &p->path) || 365 xfs_ino_t ino;
356 copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) || 366 __u32 igen;
357 get_user(addr, &p32->ihandle) || 367 int error;
358 put_user(compat_ptr(addr), &p->ihandle) || 368
359 copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) || 369 /*
360 get_user(addr, &p32->ohandle) || 370 * Only allow handle opens under a directory.
361 put_user(compat_ptr(addr), &p->ohandle) || 371 */
362 get_user(addr, &p32->ohandlen) || 372 if (!S_ISDIR(parinode->i_mode))
363 put_user(compat_ptr(addr), &p->ohandlen)) 373 return XFS_ERROR(ENOTDIR);
364 return -EFAULT; 374
365 375 hanp = compat_ptr(hreq->ihandle);
366 return (unsigned long)p; 376 hlen = hreq->ihandlen;
377 handlep = &handle;
378
379 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
380 return XFS_ERROR(EINVAL);
381 if (copy_from_user(handlep, hanp, hlen))
382 return XFS_ERROR(EFAULT);
383 if (hlen < sizeof(*handlep))
384 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
385 if (hlen > sizeof(handlep->ha_fsid)) {
386 if (handlep->ha_fid.fid_len !=
387 (hlen - sizeof(handlep->ha_fsid) -
388 sizeof(handlep->ha_fid.fid_len)) ||
389 handlep->ha_fid.fid_pad)
390 return XFS_ERROR(EINVAL);
391 }
392
393 /*
394 * Crack the handle, obtain the inode # & generation #
395 */
396 xfid = (struct xfs_fid *)&handlep->ha_fid;
397 if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
398 ino = xfid->fid_ino;
399 igen = xfid->fid_gen;
400 } else {
401 return XFS_ERROR(EINVAL);
402 }
403
404 /*
405 * Get the XFS inode, building a Linux inode to go with it.
406 */
407 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
408 if (error)
409 return error;
410 if (ip == NULL)
411 return XFS_ERROR(EIO);
412 if (ip->i_d.di_gen != igen) {
413 xfs_iput_new(ip, XFS_ILOCK_SHARED);
414 return XFS_ERROR(ENOENT);
415 }
416
417 xfs_iunlock(ip, XFS_ILOCK_SHARED);
418
419 *inode = VFS_I(ip);
420 return 0;
367} 421}
368 422
423STATIC int
424xfs_compat_attrlist_by_handle(
425 xfs_mount_t *mp,
426 void __user *arg,
427 struct inode *parinode)
428{
429 int error;
430 attrlist_cursor_kern_t *cursor;
431 compat_xfs_fsop_attrlist_handlereq_t al_hreq;
432 struct inode *inode;
433 char *kbuf;
434
435 if (!capable(CAP_SYS_ADMIN))
436 return -XFS_ERROR(EPERM);
437 if (copy_from_user(&al_hreq, arg,
438 sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
439 return -XFS_ERROR(EFAULT);
440 if (al_hreq.buflen > XATTR_LIST_MAX)
441 return -XFS_ERROR(EINVAL);
442
443 /*
444 * Reject flags, only allow namespaces.
445 */
446 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
447 return -XFS_ERROR(EINVAL);
448
449 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq,
450 &inode);
451 if (error)
452 goto out;
453
454 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
455 if (!kbuf)
456 goto out_vn_rele;
457
458 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
459 error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
460 al_hreq.flags, cursor);
461 if (error)
462 goto out_kfree;
463
464 if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
465 error = -EFAULT;
466
467 out_kfree:
468 kfree(kbuf);
469 out_vn_rele:
470 iput(inode);
471 out:
472 return -error;
473}
369 474
370STATIC long 475STATIC int
371xfs_compat_ioctl( 476xfs_compat_attrmulti_by_handle(
372 int mode, 477 xfs_mount_t *mp,
373 struct file *file, 478 void __user *arg,
374 unsigned cmd, 479 struct inode *parinode)
375 unsigned long arg) 480{
481 int error;
482 compat_xfs_attr_multiop_t *ops;
483 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
484 struct inode *inode;
485 unsigned int i, size;
486 char *attr_name;
487
488 if (!capable(CAP_SYS_ADMIN))
489 return -XFS_ERROR(EPERM);
490 if (copy_from_user(&am_hreq, arg,
491 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
492 return -XFS_ERROR(EFAULT);
493
494 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq,
495 &inode);
496 if (error)
497 goto out;
498
499 error = E2BIG;
500 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
501 if (!size || size > 16 * PAGE_SIZE)
502 goto out_vn_rele;
503
504 error = ENOMEM;
505 ops = kmalloc(size, GFP_KERNEL);
506 if (!ops)
507 goto out_vn_rele;
508
509 error = EFAULT;
510 if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
511 goto out_kfree_ops;
512
513 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
514 if (!attr_name)
515 goto out_kfree_ops;
516
517
518 error = 0;
519 for (i = 0; i < am_hreq.opcount; i++) {
520 ops[i].am_error = strncpy_from_user(attr_name,
521 compat_ptr(ops[i].am_attrname),
522 MAXNAMELEN);
523 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
524 error = -ERANGE;
525 if (ops[i].am_error < 0)
526 break;
527
528 switch (ops[i].am_opcode) {
529 case ATTR_OP_GET:
530 ops[i].am_error = xfs_attrmulti_attr_get(inode,
531 attr_name,
532 compat_ptr(ops[i].am_attrvalue),
533 &ops[i].am_length, ops[i].am_flags);
534 break;
535 case ATTR_OP_SET:
536 ops[i].am_error = xfs_attrmulti_attr_set(inode,
537 attr_name,
538 compat_ptr(ops[i].am_attrvalue),
539 ops[i].am_length, ops[i].am_flags);
540 break;
541 case ATTR_OP_REMOVE:
542 ops[i].am_error = xfs_attrmulti_attr_remove(inode,
543 attr_name, ops[i].am_flags);
544 break;
545 default:
546 ops[i].am_error = EINVAL;
547 }
548 }
549
550 if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
551 error = XFS_ERROR(EFAULT);
552
553 kfree(attr_name);
554 out_kfree_ops:
555 kfree(ops);
556 out_vn_rele:
557 iput(inode);
558 out:
559 return -error;
560}
561
562STATIC int
563xfs_compat_fssetdm_by_handle(
564 xfs_mount_t *mp,
565 void __user *arg,
566 struct inode *parinode)
567{
568 int error;
569 struct fsdmidata fsd;
570 compat_xfs_fsop_setdm_handlereq_t dmhreq;
571 struct inode *inode;
572
573 if (!capable(CAP_MKNOD))
574 return -XFS_ERROR(EPERM);
575 if (copy_from_user(&dmhreq, arg,
576 sizeof(compat_xfs_fsop_setdm_handlereq_t)))
577 return -XFS_ERROR(EFAULT);
578
579 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq,
580 &inode);
581 if (error)
582 return -error;
583
584 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
585 error = -XFS_ERROR(EPERM);
586 goto out;
587 }
588
589 if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
590 error = -XFS_ERROR(EFAULT);
591 goto out;
592 }
593
594 error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
595 fsd.fsd_dmstate);
596
597out:
598 iput(inode);
599 return error;
600}
601
602long
603xfs_file_compat_ioctl(
604 struct file *filp,
605 unsigned cmd,
606 unsigned long p)
376{ 607{
377 struct inode *inode = file->f_path.dentry->d_inode; 608 struct inode *inode = filp->f_path.dentry->d_inode;
378 int error; 609 struct xfs_inode *ip = XFS_I(inode);
610 struct xfs_mount *mp = ip->i_mount;
611 void __user *arg = (void __user *)p;
612 int ioflags = 0;
613 int error;
614
615 if (filp->f_mode & FMODE_NOCMTIME)
616 ioflags |= IO_INVIS;
617
618 xfs_itrace_entry(ip);
379 619
380 switch (cmd) { 620 switch (cmd) {
621 /* No size or alignment issues on any arch */
381 case XFS_IOC_DIOINFO: 622 case XFS_IOC_DIOINFO:
382 case XFS_IOC_FSGEOMETRY: 623 case XFS_IOC_FSGEOMETRY:
383 case XFS_IOC_FSGETXATTR: 624 case XFS_IOC_FSGETXATTR:
@@ -387,48 +628,18 @@ xfs_compat_ioctl(
387 case XFS_IOC_GETBMAP: 628 case XFS_IOC_GETBMAP:
388 case XFS_IOC_GETBMAPA: 629 case XFS_IOC_GETBMAPA:
389 case XFS_IOC_GETBMAPX: 630 case XFS_IOC_GETBMAPX:
390/* not handled
391 case XFS_IOC_FSSETDM_BY_HANDLE:
392 case XFS_IOC_ATTRLIST_BY_HANDLE:
393 case XFS_IOC_ATTRMULTI_BY_HANDLE:
394*/
395 case XFS_IOC_FSCOUNTS: 631 case XFS_IOC_FSCOUNTS:
396 case XFS_IOC_SET_RESBLKS: 632 case XFS_IOC_SET_RESBLKS:
397 case XFS_IOC_GET_RESBLKS: 633 case XFS_IOC_GET_RESBLKS:
398 case XFS_IOC_FSGROWFSDATA:
399 case XFS_IOC_FSGROWFSLOG: 634 case XFS_IOC_FSGROWFSLOG:
400 case XFS_IOC_FSGROWFSRT:
401 case XFS_IOC_FREEZE: 635 case XFS_IOC_FREEZE:
402 case XFS_IOC_THAW: 636 case XFS_IOC_THAW:
403 case XFS_IOC_GOINGDOWN: 637 case XFS_IOC_GOINGDOWN:
404 case XFS_IOC_ERROR_INJECTION: 638 case XFS_IOC_ERROR_INJECTION:
405 case XFS_IOC_ERROR_CLEARALL: 639 case XFS_IOC_ERROR_CLEARALL:
406 break; 640 return xfs_file_ioctl(filp, cmd, p);
407 641#ifndef BROKEN_X86_ALIGNMENT
408 case XFS_IOC32_GETXFLAGS: 642 /* These are handled fine if no alignment issues */
409 case XFS_IOC32_SETXFLAGS:
410 case XFS_IOC32_GETVERSION:
411 cmd = _NATIVE_IOC(cmd, long);
412 break;
413#ifdef BROKEN_X86_ALIGNMENT
414 /* xfs_flock_t has wrong u32 vs u64 alignment */
415 case XFS_IOC_ALLOCSP_32:
416 case XFS_IOC_FREESP_32:
417 case XFS_IOC_ALLOCSP64_32:
418 case XFS_IOC_FREESP64_32:
419 case XFS_IOC_RESVSP_32:
420 case XFS_IOC_UNRESVSP_32:
421 case XFS_IOC_RESVSP64_32:
422 case XFS_IOC_UNRESVSP64_32:
423 arg = xfs_ioctl32_flock(arg);
424 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
425 break;
426 case XFS_IOC_FSGEOMETRY_V1_32:
427 arg = xfs_ioctl32_geom_v1(arg);
428 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1);
429 break;
430
431#else /* These are handled fine if no alignment issues */
432 case XFS_IOC_ALLOCSP: 643 case XFS_IOC_ALLOCSP:
433 case XFS_IOC_FREESP: 644 case XFS_IOC_FREESP:
434 case XFS_IOC_RESVSP: 645 case XFS_IOC_RESVSP:
@@ -438,51 +649,97 @@ xfs_compat_ioctl(
438 case XFS_IOC_RESVSP64: 649 case XFS_IOC_RESVSP64:
439 case XFS_IOC_UNRESVSP64: 650 case XFS_IOC_UNRESVSP64:
440 case XFS_IOC_FSGEOMETRY_V1: 651 case XFS_IOC_FSGEOMETRY_V1:
441 break; 652 case XFS_IOC_FSGROWFSDATA:
653 case XFS_IOC_FSGROWFSRT:
654 return xfs_file_ioctl(filp, cmd, p);
655#else
656 case XFS_IOC_ALLOCSP_32:
657 case XFS_IOC_FREESP_32:
658 case XFS_IOC_ALLOCSP64_32:
659 case XFS_IOC_FREESP64_32:
660 case XFS_IOC_RESVSP_32:
661 case XFS_IOC_UNRESVSP_32:
662 case XFS_IOC_RESVSP64_32:
663 case XFS_IOC_UNRESVSP64_32: {
664 struct xfs_flock64 bf;
442 665
443 /* xfs_bstat_t still has wrong u32 vs u64 alignment */ 666 if (xfs_compat_flock64_copyin(&bf, arg))
444 case XFS_IOC_SWAPEXT: 667 return -XFS_ERROR(EFAULT);
445 break; 668 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
669 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
670 }
671 case XFS_IOC_FSGEOMETRY_V1_32:
672 return xfs_compat_ioc_fsgeometry_v1(mp, arg);
673 case XFS_IOC_FSGROWFSDATA_32: {
674 struct xfs_growfs_data in;
675
676 if (xfs_compat_growfs_data_copyin(&in, arg))
677 return -XFS_ERROR(EFAULT);
678 error = xfs_growfs_data(mp, &in);
679 return -error;
680 }
681 case XFS_IOC_FSGROWFSRT_32: {
682 struct xfs_growfs_rt in;
446 683
684 if (xfs_compat_growfs_rt_copyin(&in, arg))
685 return -XFS_ERROR(EFAULT);
686 error = xfs_growfs_rt(mp, &in);
687 return -error;
688 }
447#endif 689#endif
690 /* long changes size, but xfs only copiese out 32 bits */
691 case XFS_IOC_GETXFLAGS_32:
692 case XFS_IOC_SETXFLAGS_32:
693 case XFS_IOC_GETVERSION_32:
694 cmd = _NATIVE_IOC(cmd, long);
695 return xfs_file_ioctl(filp, cmd, p);
696 case XFS_IOC_SWAPEXT: {
697 struct xfs_swapext sxp;
698 struct compat_xfs_swapext __user *sxu = arg;
699
700 /* Bulk copy in up to the sx_stat field, then copy bstat */
701 if (copy_from_user(&sxp, sxu,
702 offsetof(struct xfs_swapext, sx_stat)) ||
703 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
704 return -XFS_ERROR(EFAULT);
705 error = xfs_swapext(&sxp);
706 return -error;
707 }
448 case XFS_IOC_FSBULKSTAT_32: 708 case XFS_IOC_FSBULKSTAT_32:
449 case XFS_IOC_FSBULKSTAT_SINGLE_32: 709 case XFS_IOC_FSBULKSTAT_SINGLE_32:
450 case XFS_IOC_FSINUMBERS_32: 710 case XFS_IOC_FSINUMBERS_32:
451 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq); 711 return xfs_compat_ioc_bulkstat(mp, cmd, arg);
452 return xfs_ioc_bulkstat_compat(XFS_I(inode)->i_mount,
453 cmd, (void __user*)arg);
454 case XFS_IOC_FD_TO_HANDLE_32: 712 case XFS_IOC_FD_TO_HANDLE_32:
455 case XFS_IOC_PATH_TO_HANDLE_32: 713 case XFS_IOC_PATH_TO_HANDLE_32:
456 case XFS_IOC_PATH_TO_FSHANDLE_32: 714 case XFS_IOC_PATH_TO_FSHANDLE_32: {
457 case XFS_IOC_OPEN_BY_HANDLE_32: 715 struct xfs_fsop_handlereq hreq;
458 case XFS_IOC_READLINK_BY_HANDLE_32: 716
459 arg = xfs_ioctl32_fshandle(arg); 717 if (xfs_compat_handlereq_copyin(&hreq, arg))
718 return -XFS_ERROR(EFAULT);
460 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); 719 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
461 break; 720 return xfs_find_handle(cmd, &hreq);
462 default:
463 return -ENOIOCTLCMD;
464 } 721 }
722 case XFS_IOC_OPEN_BY_HANDLE_32: {
723 struct xfs_fsop_handlereq hreq;
465 724
466 error = xfs_ioctl(XFS_I(inode), file, mode, cmd, (void __user *)arg); 725 if (xfs_compat_handlereq_copyin(&hreq, arg))
467 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); 726 return -XFS_ERROR(EFAULT);
468 727 return xfs_open_by_handle(mp, &hreq, filp, inode);
469 return error; 728 }
470} 729 case XFS_IOC_READLINK_BY_HANDLE_32: {
471 730 struct xfs_fsop_handlereq hreq;
472long
473xfs_file_compat_ioctl(
474 struct file *file,
475 unsigned cmd,
476 unsigned long arg)
477{
478 return xfs_compat_ioctl(0, file, cmd, arg);
479}
480 731
481long 732 if (xfs_compat_handlereq_copyin(&hreq, arg))
482xfs_file_compat_invis_ioctl( 733 return -XFS_ERROR(EFAULT);
483 struct file *file, 734 return xfs_readlink_by_handle(mp, &hreq, inode);
484 unsigned cmd, 735 }
485 unsigned long arg) 736 case XFS_IOC_ATTRLIST_BY_HANDLE_32:
486{ 737 return xfs_compat_attrlist_by_handle(mp, arg, inode);
487 return xfs_compat_ioctl(IO_INVIS, file, cmd, arg); 738 case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
739 return xfs_compat_attrmulti_by_handle(mp, arg, inode);
740 case XFS_IOC_FSSETDM_BY_HANDLE_32:
741 return xfs_compat_fssetdm_by_handle(mp, arg, inode);
742 default:
743 return -XFS_ERROR(ENOIOCTLCMD);
744 }
488} 745}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 02de6e62ee37..1024c4f8ba0d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -18,7 +18,217 @@
18#ifndef __XFS_IOCTL32_H__ 18#ifndef __XFS_IOCTL32_H__
19#define __XFS_IOCTL32_H__ 19#define __XFS_IOCTL32_H__
20 20
21extern long xfs_file_compat_ioctl(struct file *, unsigned, unsigned long); 21#include <linux/compat.h>
22extern long xfs_file_compat_invis_ioctl(struct file *, unsigned, unsigned long); 22
23/*
24 * on 32-bit arches, ioctl argument structures may have different sizes
25 * and/or alignment. We define compat structures which match the
26 * 32-bit sizes/alignments here, and their associated ioctl numbers.
27 *
28 * xfs_ioctl32.c contains routines to copy these structures in and out.
29 */
30
31/* stock kernel-level ioctls we support */
32#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS
33#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS
34#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION
35
36/*
37 * On intel, even if sizes match, alignment and/or padding may differ.
38 */
39#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
40#define BROKEN_X86_ALIGNMENT
41#define __compat_packed __attribute__((packed))
42#else
43#define __compat_packed
44#endif
45
46typedef struct compat_xfs_bstime {
47 compat_time_t tv_sec; /* seconds */
48 __s32 tv_nsec; /* and nanoseconds */
49} compat_xfs_bstime_t;
50
51typedef struct compat_xfs_bstat {
52 __u64 bs_ino; /* inode number */
53 __u16 bs_mode; /* type and mode */
54 __u16 bs_nlink; /* number of links */
55 __u32 bs_uid; /* user id */
56 __u32 bs_gid; /* group id */
57 __u32 bs_rdev; /* device value */
58 __s32 bs_blksize; /* block size */
59 __s64 bs_size; /* file size */
60 compat_xfs_bstime_t bs_atime; /* access time */
61 compat_xfs_bstime_t bs_mtime; /* modify time */
62 compat_xfs_bstime_t bs_ctime; /* inode change time */
63 int64_t bs_blocks; /* number of blocks */
64 __u32 bs_xflags; /* extended flags */
65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */
69 unsigned char bs_pad[14]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */
73} __compat_packed compat_xfs_bstat_t;
74
75typedef struct compat_xfs_fsop_bulkreq {
76 compat_uptr_t lastip; /* last inode # pointer */
77 __s32 icount; /* count of entries in buffer */
78 compat_uptr_t ubuffer; /* user buffer for inode desc. */
79 compat_uptr_t ocount; /* output count pointer */
80} compat_xfs_fsop_bulkreq_t;
81
82#define XFS_IOC_FSBULKSTAT_32 \
83 _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
84#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
85 _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
86#define XFS_IOC_FSINUMBERS_32 \
87 _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
88
89typedef struct compat_xfs_fsop_handlereq {
90 __u32 fd; /* fd for FD_TO_HANDLE */
91 compat_uptr_t path; /* user pathname */
92 __u32 oflags; /* open flags */
93 compat_uptr_t ihandle; /* user supplied handle */
94 __u32 ihandlen; /* user supplied length */
95 compat_uptr_t ohandle; /* user buffer for handle */
96 compat_uptr_t ohandlen; /* user buffer length */
97} compat_xfs_fsop_handlereq_t;
98
99#define XFS_IOC_PATH_TO_FSHANDLE_32 \
100 _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
101#define XFS_IOC_PATH_TO_HANDLE_32 \
102 _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
103#define XFS_IOC_FD_TO_HANDLE_32 \
104 _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
105#define XFS_IOC_OPEN_BY_HANDLE_32 \
106 _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
107#define XFS_IOC_READLINK_BY_HANDLE_32 \
108 _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
109
110/* The bstat field in the swapext struct needs translation */
111typedef struct compat_xfs_swapext {
112 __int64_t sx_version; /* version */
113 __int64_t sx_fdtarget; /* fd of target file */
114 __int64_t sx_fdtmp; /* fd of tmp file */
115 xfs_off_t sx_offset; /* offset into file */
116 xfs_off_t sx_length; /* leng from offset */
117 char sx_pad[16]; /* pad space, unused */
118 compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */
119} __compat_packed compat_xfs_swapext_t;
120
121#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
122
123typedef struct compat_xfs_fsop_attrlist_handlereq {
124 struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
125 struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
126 __u32 flags; /* which namespace to use */
127 __u32 buflen; /* length of buffer supplied */
128 compat_uptr_t buffer; /* returned names */
129} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
130
131/* Note: actually this is read/write */
132#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
133 _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
134
135/* am_opcodes defined in xfs_fs.h */
136typedef struct compat_xfs_attr_multiop {
137 __u32 am_opcode;
138 __s32 am_error;
139 compat_uptr_t am_attrname;
140 compat_uptr_t am_attrvalue;
141 __u32 am_length;
142 __u32 am_flags;
143} compat_xfs_attr_multiop_t;
144
145typedef struct compat_xfs_fsop_attrmulti_handlereq {
146 struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
147 __u32 opcount;/* count of following multiop */
148 /* ptr to compat_xfs_attr_multiop */
149 compat_uptr_t ops; /* attr_multi data */
150} compat_xfs_fsop_attrmulti_handlereq_t;
151
152#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
153 _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
154
155typedef struct compat_xfs_fsop_setdm_handlereq {
156 struct compat_xfs_fsop_handlereq hreq; /* handle information */
157 /* ptr to struct fsdmidata */
158 compat_uptr_t data; /* DMAPI data */
159} compat_xfs_fsop_setdm_handlereq_t;
160
161#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
162 _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
163
164#ifdef BROKEN_X86_ALIGNMENT
165/* on ia32 l_start is on a 32-bit boundary */
166typedef struct compat_xfs_flock64 {
167 __s16 l_type;
168 __s16 l_whence;
169 __s64 l_start __attribute__((packed));
170 /* len == 0 means until end of file */
171 __s64 l_len __attribute__((packed));
172 __s32 l_sysid;
173 __u32 l_pid;
174 __s32 l_pad[4]; /* reserve area */
175} compat_xfs_flock64_t;
176
177#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64)
178#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64)
179#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64)
180#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64)
181#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64)
182#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
183#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
184#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
185
186typedef struct compat_xfs_fsop_geom_v1 {
187 __u32 blocksize; /* filesystem (data) block size */
188 __u32 rtextsize; /* realtime extent size */
189 __u32 agblocks; /* fsblocks in an AG */
190 __u32 agcount; /* number of allocation groups */
191 __u32 logblocks; /* fsblocks in the log */
192 __u32 sectsize; /* (data) sector size, bytes */
193 __u32 inodesize; /* inode size in bytes */
194 __u32 imaxpct; /* max allowed inode space(%) */
195 __u64 datablocks; /* fsblocks in data subvolume */
196 __u64 rtblocks; /* fsblocks in realtime subvol */
197 __u64 rtextents; /* rt extents in realtime subvol*/
198 __u64 logstart; /* starting fsblock of the log */
199 unsigned char uuid[16]; /* unique id of the filesystem */
200 __u32 sunit; /* stripe unit, fsblocks */
201 __u32 swidth; /* stripe width, fsblocks */
202 __s32 version; /* structure version */
203 __u32 flags; /* superblock version flags */
204 __u32 logsectsize; /* log sector size, bytes */
205 __u32 rtsectsize; /* realtime sector size, bytes */
206 __u32 dirblocksize; /* directory block size, bytes */
207} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
208
209#define XFS_IOC_FSGEOMETRY_V1_32 \
210 _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
211
212typedef struct compat_xfs_inogrp {
213 __u64 xi_startino; /* starting inode number */
214 __s32 xi_alloccount; /* # bits set in allocmask */
215 __u64 xi_allocmask; /* mask of allocated inodes */
216} __attribute__((packed)) compat_xfs_inogrp_t;
217
218/* These growfs input structures have padding on the end, so must translate */
219typedef struct compat_xfs_growfs_data {
220 __u64 newblocks; /* new data subvol size, fsblocks */
221 __u32 imaxpct; /* new inode space percentage limit */
222} __attribute__((packed)) compat_xfs_growfs_data_t;
223
224typedef struct compat_xfs_growfs_rt {
225 __u64 newblocks; /* new realtime size, fsblocks */
226 __u32 extsize; /* new realtime extent size, fsblocks */
227} __attribute__((packed)) compat_xfs_growfs_rt_t;
228
229#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
230#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt)
231
232#endif /* BROKEN_X86_ALIGNMENT */
23 233
24#endif /* __XFS_IOCTL32_H__ */ 234#endif /* __XFS_IOCTL32_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 095d271f3434..7aa53fefc67f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -53,6 +53,7 @@
53#include <linux/namei.h> 53#include <linux/namei.h>
54#include <linux/security.h> 54#include <linux/security.h>
55#include <linux/falloc.h> 55#include <linux/falloc.h>
56#include <linux/fiemap.h>
56 57
57/* 58/*
58 * Bring the atime in the XFS inode uptodate. 59 * Bring the atime in the XFS inode uptodate.
@@ -64,14 +65,14 @@ xfs_synchronize_atime(
64{ 65{
65 struct inode *inode = VFS_I(ip); 66 struct inode *inode = VFS_I(ip);
66 67
67 if (inode) { 68 if (!(inode->i_state & I_CLEAR)) {
68 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; 69 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
69 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; 70 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
70 } 71 }
71} 72}
72 73
73/* 74/*
74 * If the linux inode exists, mark it dirty. 75 * If the linux inode is valid, mark it dirty.
75 * Used when commiting a dirty inode into a transaction so that 76 * Used when commiting a dirty inode into a transaction so that
76 * the inode will get written back by the linux code 77 * the inode will get written back by the linux code
77 */ 78 */
@@ -81,7 +82,7 @@ xfs_mark_inode_dirty_sync(
81{ 82{
82 struct inode *inode = VFS_I(ip); 83 struct inode *inode = VFS_I(ip);
83 84
84 if (inode) 85 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
85 mark_inode_dirty_sync(inode); 86 mark_inode_dirty_sync(inode);
86} 87}
87 88
@@ -128,7 +129,7 @@ xfs_ichgtime(
128 if (sync_it) { 129 if (sync_it) {
129 SYNCHRONIZE(); 130 SYNCHRONIZE();
130 ip->i_update_core = 1; 131 ip->i_update_core = 1;
131 mark_inode_dirty_sync(inode); 132 xfs_mark_inode_dirty_sync(ip);
132 } 133 }
133} 134}
134 135
@@ -158,8 +159,6 @@ xfs_init_security(
158 } 159 }
159 160
160 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); 161 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
161 if (!error)
162 xfs_iflags_set(ip, XFS_IMODIFIED);
163 162
164 kfree(name); 163 kfree(name);
165 kfree(value); 164 kfree(value);
@@ -260,7 +259,6 @@ xfs_vn_mknod(
260 error = _ACL_INHERIT(inode, mode, default_acl); 259 error = _ACL_INHERIT(inode, mode, default_acl);
261 if (unlikely(error)) 260 if (unlikely(error))
262 goto out_cleanup_inode; 261 goto out_cleanup_inode;
263 xfs_iflags_set(ip, XFS_IMODIFIED);
264 _ACL_FREE(default_acl); 262 _ACL_FREE(default_acl);
265 } 263 }
266 264
@@ -366,21 +364,17 @@ xfs_vn_link(
366 struct inode *dir, 364 struct inode *dir,
367 struct dentry *dentry) 365 struct dentry *dentry)
368{ 366{
369 struct inode *inode; /* inode of guy being linked to */ 367 struct inode *inode = old_dentry->d_inode;
370 struct xfs_name name; 368 struct xfs_name name;
371 int error; 369 int error;
372 370
373 inode = old_dentry->d_inode;
374 xfs_dentry_to_name(&name, dentry); 371 xfs_dentry_to_name(&name, dentry);
375 372
376 igrab(inode);
377 error = xfs_link(XFS_I(dir), XFS_I(inode), &name); 373 error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
378 if (unlikely(error)) { 374 if (unlikely(error))
379 iput(inode);
380 return -error; 375 return -error;
381 }
382 376
383 xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); 377 atomic_inc(&inode->i_count);
384 d_instantiate(dentry, inode); 378 d_instantiate(dentry, inode);
385 return 0; 379 return 0;
386} 380}
@@ -601,7 +595,7 @@ xfs_vn_setattr(
601 struct dentry *dentry, 595 struct dentry *dentry,
602 struct iattr *iattr) 596 struct iattr *iattr)
603{ 597{
604 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL); 598 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
605} 599}
606 600
607/* 601/*
@@ -642,7 +636,7 @@ xfs_vn_fallocate(
642 636
643 xfs_ilock(ip, XFS_IOLOCK_EXCL); 637 xfs_ilock(ip, XFS_IOLOCK_EXCL);
644 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 638 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
645 0, NULL, XFS_ATTR_NOLOCK); 639 0, XFS_ATTR_NOLOCK);
646 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 640 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
647 offset + len > i_size_read(inode)) 641 offset + len > i_size_read(inode))
648 new_size = offset + len; 642 new_size = offset + len;
@@ -653,7 +647,7 @@ xfs_vn_fallocate(
653 647
654 iattr.ia_valid = ATTR_SIZE; 648 iattr.ia_valid = ATTR_SIZE;
655 iattr.ia_size = new_size; 649 iattr.ia_size = new_size;
656 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL); 650 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
657 } 651 }
658 652
659 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 653 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -661,6 +655,88 @@ out_error:
661 return error; 655 return error;
662} 656}
663 657
658#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
659
660/*
661 * Call fiemap helper to fill in user data.
662 * Returns positive errors to xfs_getbmap.
663 */
664STATIC int
665xfs_fiemap_format(
666 void **arg,
667 struct getbmapx *bmv,
668 int *full)
669{
670 int error;
671 struct fiemap_extent_info *fieinfo = *arg;
672 u32 fiemap_flags = 0;
673 u64 logical, physical, length;
674
675 /* Do nothing for a hole */
676 if (bmv->bmv_block == -1LL)
677 return 0;
678
679 logical = BBTOB(bmv->bmv_offset);
680 physical = BBTOB(bmv->bmv_block);
681 length = BBTOB(bmv->bmv_length);
682
683 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
684 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
685 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
686 fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
687 physical = 0; /* no block yet */
688 }
689 if (bmv->bmv_oflags & BMV_OF_LAST)
690 fiemap_flags |= FIEMAP_EXTENT_LAST;
691
692 error = fiemap_fill_next_extent(fieinfo, logical, physical,
693 length, fiemap_flags);
694 if (error > 0) {
695 error = 0;
696 *full = 1; /* user array now full */
697 }
698
699 return -error;
700}
701
702STATIC int
703xfs_vn_fiemap(
704 struct inode *inode,
705 struct fiemap_extent_info *fieinfo,
706 u64 start,
707 u64 length)
708{
709 xfs_inode_t *ip = XFS_I(inode);
710 struct getbmapx bm;
711 int error;
712
713 error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
714 if (error)
715 return error;
716
717 /* Set up bmap header for xfs internal routine */
718 bm.bmv_offset = BTOBB(start);
719 /* Special case for whole file */
720 if (length == FIEMAP_MAX_OFFSET)
721 bm.bmv_length = -1LL;
722 else
723 bm.bmv_length = BTOBB(length);
724
725 /* our formatter will tell xfs_getbmap when to stop. */
726 bm.bmv_count = MAXEXTNUM;
727 bm.bmv_iflags = BMV_IF_PREALLOC;
728 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
729 bm.bmv_iflags |= BMV_IF_ATTRFORK;
730 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
731 bm.bmv_iflags |= BMV_IF_DELALLOC;
732
733 error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
734 if (error)
735 return -error;
736
737 return 0;
738}
739
664static const struct inode_operations xfs_inode_operations = { 740static const struct inode_operations xfs_inode_operations = {
665 .permission = xfs_vn_permission, 741 .permission = xfs_vn_permission,
666 .truncate = xfs_vn_truncate, 742 .truncate = xfs_vn_truncate,
@@ -671,6 +747,7 @@ static const struct inode_operations xfs_inode_operations = {
671 .removexattr = generic_removexattr, 747 .removexattr = generic_removexattr,
672 .listxattr = xfs_vn_listxattr, 748 .listxattr = xfs_vn_listxattr,
673 .fallocate = xfs_vn_fallocate, 749 .fallocate = xfs_vn_fallocate,
750 .fiemap = xfs_vn_fiemap,
674}; 751};
675 752
676static const struct inode_operations xfs_dir_inode_operations = { 753static const struct inode_operations xfs_dir_inode_operations = {
@@ -766,12 +843,20 @@ xfs_diflags_to_iflags(
766 * When reading existing inodes from disk this is called directly 843 * When reading existing inodes from disk this is called directly
767 * from xfs_iget, when creating a new inode it is called from 844 * from xfs_iget, when creating a new inode it is called from
768 * xfs_ialloc after setting up the inode. 845 * xfs_ialloc after setting up the inode.
846 *
847 * We are always called with an uninitialised linux inode here.
848 * We need to initialise the necessary fields and take a reference
849 * on it.
769 */ 850 */
770void 851void
771xfs_setup_inode( 852xfs_setup_inode(
772 struct xfs_inode *ip) 853 struct xfs_inode *ip)
773{ 854{
774 struct inode *inode = ip->i_vnode; 855 struct inode *inode = &ip->i_vnode;
856
857 inode->i_ino = ip->i_ino;
858 inode->i_state = I_NEW|I_LOCK;
859 inode_add_to_lists(ip->i_mount->m_super, inode);
775 860
776 inode->i_mode = ip->i_d.di_mode; 861 inode->i_mode = ip->i_d.di_mode;
777 inode->i_nlink = ip->i_d.di_nlink; 862 inode->i_nlink = ip->i_d.di_nlink;
@@ -799,7 +884,6 @@ xfs_setup_inode(
799 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; 884 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
800 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; 885 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
801 xfs_diflags_to_iflags(inode, ip); 886 xfs_diflags_to_iflags(inode, ip);
802 xfs_iflags_clear(ip, XFS_IMODIFIED);
803 887
804 switch (inode->i_mode & S_IFMT) { 888 switch (inode->i_mode & S_IFMT) {
805 case S_IFREG: 889 case S_IFREG:
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index 8b1a1e31dc21..ef41c92ce66e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -22,7 +22,6 @@ struct xfs_inode;
22 22
23extern const struct file_operations xfs_file_operations; 23extern const struct file_operations xfs_file_operations;
24extern const struct file_operations xfs_dir_file_operations; 24extern const struct file_operations xfs_dir_file_operations;
25extern const struct file_operations xfs_invis_file_operations;
26 25
27extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
28 27
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index cc0f7b3a9795..507492d6dccd 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -21,18 +21,12 @@
21#include <linux/types.h> 21#include <linux/types.h>
22 22
23/* 23/*
24 * Some types are conditional depending on the target system.
25 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. 24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
26 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well 25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
27 * as requiring XFS_BIG_BLKNOS to be set.
28 */ 26 */
29#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 27#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
30# define XFS_BIG_BLKNOS 1 28# define XFS_BIG_BLKNOS 1
31# if BITS_PER_LONG == 64 29# define XFS_BIG_INUMS 1
32# define XFS_BIG_INUMS 1
33# else
34# define XFS_BIG_INUMS 0
35# endif
36#else 30#else
37# define XFS_BIG_BLKNOS 0 31# define XFS_BIG_BLKNOS 0
38# define XFS_BIG_INUMS 0 32# define XFS_BIG_INUMS 0
@@ -77,6 +71,7 @@
77#include <linux/spinlock.h> 71#include <linux/spinlock.h>
78#include <linux/random.h> 72#include <linux/random.h>
79#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/writeback.h>
80 75
81#include <asm/page.h> 76#include <asm/page.h>
82#include <asm/div64.h> 77#include <asm/div64.h>
@@ -85,7 +80,6 @@
85#include <asm/byteorder.h> 80#include <asm/byteorder.h>
86#include <asm/unaligned.h> 81#include <asm/unaligned.h>
87 82
88#include <xfs_vfs.h>
89#include <xfs_cred.h> 83#include <xfs_cred.h>
90#include <xfs_vnode.h> 84#include <xfs_vnode.h>
91#include <xfs_stats.h> 85#include <xfs_stats.h>
@@ -107,7 +101,6 @@
107#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ 101#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
108#endif 102#endif
109 103
110#define restricted_chown xfs_params.restrict_chown.val
111#define irix_sgid_inherit xfs_params.sgid_inherit.val 104#define irix_sgid_inherit xfs_params.sgid_inherit.val
112#define irix_symlink_mode xfs_params.symlink_mode.val 105#define irix_symlink_mode xfs_params.symlink_mode.val
113#define xfs_panic_mask xfs_params.panic_mask.val 106#define xfs_panic_mask xfs_params.panic_mask.val
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 1957e5357d04..7e90daa0d1d1 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,7 +51,6 @@
51#include "xfs_vnodeops.h" 51#include "xfs_vnodeops.h"
52 52
53#include <linux/capability.h> 53#include <linux/capability.h>
54#include <linux/mount.h>
55#include <linux/writeback.h> 54#include <linux/writeback.h>
56 55
57 56
@@ -243,7 +242,7 @@ xfs_read(
243 242
244 if (unlikely(ioflags & IO_ISDIRECT)) { 243 if (unlikely(ioflags & IO_ISDIRECT)) {
245 if (inode->i_mapping->nrpages) 244 if (inode->i_mapping->nrpages)
246 ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), 245 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
247 -1, FI_REMAPF_LOCKED); 246 -1, FI_REMAPF_LOCKED);
248 mutex_unlock(&inode->i_mutex); 247 mutex_unlock(&inode->i_mutex);
249 if (ret) { 248 if (ret) {
@@ -668,15 +667,8 @@ start:
668 if (new_size > xip->i_size) 667 if (new_size > xip->i_size)
669 xip->i_new_size = new_size; 668 xip->i_new_size = new_size;
670 669
671 /* 670 if (likely(!(ioflags & IO_INVIS)))
672 * We're not supposed to change timestamps in readonly-mounted
673 * filesystems. Throw it away if anyone asks us.
674 */
675 if (likely(!(ioflags & IO_INVIS) &&
676 !mnt_want_write(file->f_path.mnt))) {
677 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 671 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
678 mnt_drop_write(file->f_path.mnt);
679 }
680 672
681 /* 673 /*
682 * If the offset is beyond the size of the file, we have a couple 674 * If the offset is beyond the size of the file, we have a couple
@@ -715,7 +707,6 @@ start:
715 } 707 }
716 } 708 }
717 709
718retry:
719 /* We can write back this queue in page reclaim */ 710 /* We can write back this queue in page reclaim */
720 current->backing_dev_info = mapping->backing_dev_info; 711 current->backing_dev_info = mapping->backing_dev_info;
721 712
@@ -771,6 +762,17 @@ retry:
771 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) 762 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
772 ret = wait_on_sync_kiocb(iocb); 763 ret = wait_on_sync_kiocb(iocb);
773 764
765 isize = i_size_read(inode);
766 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
767 *offset = isize;
768
769 if (*offset > xip->i_size) {
770 xfs_ilock(xip, XFS_ILOCK_EXCL);
771 if (*offset > xip->i_size)
772 xip->i_size = *offset;
773 xfs_iunlock(xip, XFS_ILOCK_EXCL);
774 }
775
774 if (ret == -ENOSPC && 776 if (ret == -ENOSPC &&
775 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { 777 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
776 xfs_iunlock(xip, iolock); 778 xfs_iunlock(xip, iolock);
@@ -784,20 +786,7 @@ retry:
784 xfs_ilock(xip, iolock); 786 xfs_ilock(xip, iolock);
785 if (error) 787 if (error)
786 goto out_unlock_internal; 788 goto out_unlock_internal;
787 pos = xip->i_size; 789 goto start;
788 ret = 0;
789 goto retry;
790 }
791
792 isize = i_size_read(inode);
793 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
794 *offset = isize;
795
796 if (*offset > xip->i_size) {
797 xfs_ilock(xip, XFS_ILOCK_EXCL);
798 if (*offset > xip->i_size)
799 xip->i_size = *offset;
800 xfs_iunlock(xip, XFS_ILOCK_EXCL);
801 } 790 }
802 791
803 error = -ret; 792 error = -ret;
@@ -855,13 +844,7 @@ retry:
855int 844int
856xfs_bdstrat_cb(struct xfs_buf *bp) 845xfs_bdstrat_cb(struct xfs_buf *bp)
857{ 846{
858 xfs_mount_t *mp; 847 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
859
860 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
861 if (!XFS_FORCED_SHUTDOWN(mp)) {
862 xfs_buf_iorequest(bp);
863 return 0;
864 } else {
865 xfs_buftrace("XFS__BDSTRAT IOERROR", bp); 848 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
866 /* 849 /*
867 * Metadata write that didn't get logged but 850 * Metadata write that didn't get logged but
@@ -874,6 +857,9 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
874 else 857 else
875 return (xfs_bioerror(bp)); 858 return (xfs_bioerror(bp));
876 } 859 }
860
861 xfs_buf_iorequest(bp);
862 return 0;
877} 863}
878 864
879/* 865/*
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 3d5b67c075c7..c3526d445f6a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -53,11 +53,15 @@ xfs_read_xfsstats(
53 { "icluster", XFSSTAT_END_INODE_CLUSTER }, 53 { "icluster", XFSSTAT_END_INODE_CLUSTER },
54 { "vnodes", XFSSTAT_END_VNODE_OPS }, 54 { "vnodes", XFSSTAT_END_VNODE_OPS },
55 { "buf", XFSSTAT_END_BUF }, 55 { "buf", XFSSTAT_END_BUF },
56 { "abtb2", XFSSTAT_END_ABTB_V2 },
57 { "abtc2", XFSSTAT_END_ABTC_V2 },
58 { "bmbt2", XFSSTAT_END_BMBT_V2 },
59 { "ibt2", XFSSTAT_END_IBT_V2 },
56 }; 60 };
57 61
58 /* Loop over all stats groups */ 62 /* Loop over all stats groups */
59 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { 63 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
60 len += sprintf(buffer + len, xstats[i].desc); 64 len += sprintf(buffer + len, "%s", xstats[i].desc);
61 /* inner loop does each group */ 65 /* inner loop does each group */
62 while (j < xstats[i].endpoint) { 66 while (j < xstats[i].endpoint) {
63 val = 0; 67 val = 0;
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index e83820febc9f..736854b1ca1a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -118,6 +118,71 @@ struct xfsstats {
118 __uint32_t xb_page_retries; 118 __uint32_t xb_page_retries;
119 __uint32_t xb_page_found; 119 __uint32_t xb_page_found;
120 __uint32_t xb_get_read; 120 __uint32_t xb_get_read;
121/* Version 2 btree counters */
122#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15)
123 __uint32_t xs_abtb_2_lookup;
124 __uint32_t xs_abtb_2_compare;
125 __uint32_t xs_abtb_2_insrec;
126 __uint32_t xs_abtb_2_delrec;
127 __uint32_t xs_abtb_2_newroot;
128 __uint32_t xs_abtb_2_killroot;
129 __uint32_t xs_abtb_2_increment;
130 __uint32_t xs_abtb_2_decrement;
131 __uint32_t xs_abtb_2_lshift;
132 __uint32_t xs_abtb_2_rshift;
133 __uint32_t xs_abtb_2_split;
134 __uint32_t xs_abtb_2_join;
135 __uint32_t xs_abtb_2_alloc;
136 __uint32_t xs_abtb_2_free;
137 __uint32_t xs_abtb_2_moves;
138#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15)
139 __uint32_t xs_abtc_2_lookup;
140 __uint32_t xs_abtc_2_compare;
141 __uint32_t xs_abtc_2_insrec;
142 __uint32_t xs_abtc_2_delrec;
143 __uint32_t xs_abtc_2_newroot;
144 __uint32_t xs_abtc_2_killroot;
145 __uint32_t xs_abtc_2_increment;
146 __uint32_t xs_abtc_2_decrement;
147 __uint32_t xs_abtc_2_lshift;
148 __uint32_t xs_abtc_2_rshift;
149 __uint32_t xs_abtc_2_split;
150 __uint32_t xs_abtc_2_join;
151 __uint32_t xs_abtc_2_alloc;
152 __uint32_t xs_abtc_2_free;
153 __uint32_t xs_abtc_2_moves;
154#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15)
155 __uint32_t xs_bmbt_2_lookup;
156 __uint32_t xs_bmbt_2_compare;
157 __uint32_t xs_bmbt_2_insrec;
158 __uint32_t xs_bmbt_2_delrec;
159 __uint32_t xs_bmbt_2_newroot;
160 __uint32_t xs_bmbt_2_killroot;
161 __uint32_t xs_bmbt_2_increment;
162 __uint32_t xs_bmbt_2_decrement;
163 __uint32_t xs_bmbt_2_lshift;
164 __uint32_t xs_bmbt_2_rshift;
165 __uint32_t xs_bmbt_2_split;
166 __uint32_t xs_bmbt_2_join;
167 __uint32_t xs_bmbt_2_alloc;
168 __uint32_t xs_bmbt_2_free;
169 __uint32_t xs_bmbt_2_moves;
170#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15)
171 __uint32_t xs_ibt_2_lookup;
172 __uint32_t xs_ibt_2_compare;
173 __uint32_t xs_ibt_2_insrec;
174 __uint32_t xs_ibt_2_delrec;
175 __uint32_t xs_ibt_2_newroot;
176 __uint32_t xs_ibt_2_killroot;
177 __uint32_t xs_ibt_2_increment;
178 __uint32_t xs_ibt_2_decrement;
179 __uint32_t xs_ibt_2_lshift;
180 __uint32_t xs_ibt_2_rshift;
181 __uint32_t xs_ibt_2_split;
182 __uint32_t xs_ibt_2_join;
183 __uint32_t xs_ibt_2_alloc;
184 __uint32_t xs_ibt_2_free;
185 __uint32_t xs_ibt_2_moves;
121/* Extra precision counters */ 186/* Extra precision counters */
122 __uint64_t xs_xstrat_bytes; 187 __uint64_t xs_xstrat_bytes;
123 __uint64_t xs_write_bytes; 188 __uint64_t xs_write_bytes;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 37ebe36056eb..36f6cc703ef2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -18,7 +18,6 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_bit.h" 19#include "xfs_bit.h"
20#include "xfs_log.h" 20#include "xfs_log.h"
21#include "xfs_clnt.h"
22#include "xfs_inum.h" 21#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -36,6 +35,7 @@
36#include "xfs_dinode.h" 35#include "xfs_dinode.h"
37#include "xfs_inode.h" 36#include "xfs_inode.h"
38#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
39#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
40#include "xfs_bmap.h" 40#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 41#include "xfs_rtalloc.h"
@@ -48,7 +48,6 @@
48#include "xfs_buf_item.h" 48#include "xfs_buf_item.h"
49#include "xfs_utils.h" 49#include "xfs_utils.h"
50#include "xfs_vnodeops.h" 50#include "xfs_vnodeops.h"
51#include "xfs_vfsops.h"
52#include "xfs_version.h" 51#include "xfs_version.h"
53#include "xfs_log_priv.h" 52#include "xfs_log_priv.h"
54#include "xfs_trans_priv.h" 53#include "xfs_trans_priv.h"
@@ -58,6 +57,7 @@
58#include "xfs_extfree_item.h" 57#include "xfs_extfree_item.h"
59#include "xfs_mru_cache.h" 58#include "xfs_mru_cache.h"
60#include "xfs_inode_item.h" 59#include "xfs_inode_item.h"
60#include "xfs_sync.h"
61 61
62#include <linux/namei.h> 62#include <linux/namei.h>
63#include <linux/init.h> 63#include <linux/init.h>
@@ -70,36 +70,9 @@
70 70
71static struct quotactl_ops xfs_quotactl_operations; 71static struct quotactl_ops xfs_quotactl_operations;
72static struct super_operations xfs_super_operations; 72static struct super_operations xfs_super_operations;
73static kmem_zone_t *xfs_vnode_zone;
74static kmem_zone_t *xfs_ioend_zone; 73static kmem_zone_t *xfs_ioend_zone;
75mempool_t *xfs_ioend_pool; 74mempool_t *xfs_ioend_pool;
76 75
77STATIC struct xfs_mount_args *
78xfs_args_allocate(
79 struct super_block *sb,
80 int silent)
81{
82 struct xfs_mount_args *args;
83
84 args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
85 if (!args)
86 return NULL;
87
88 args->logbufs = args->logbufsize = -1;
89 strncpy(args->fsname, sb->s_id, MAXNAMELEN);
90
91 /* Copy the already-parsed mount(2) flags we're interested in */
92 if (sb->s_flags & MS_DIRSYNC)
93 args->flags |= XFSMNT_DIRSYNC;
94 if (sb->s_flags & MS_SYNCHRONOUS)
95 args->flags |= XFSMNT_WSYNC;
96 if (silent)
97 args->flags |= XFSMNT_QUIET;
98 args->flags |= XFSMNT_32BITINODES;
99
100 return args;
101}
102
103#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ 76#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
104#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ 77#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
105#define MNTOPT_LOGDEV "logdev" /* log device */ 78#define MNTOPT_LOGDEV "logdev" /* log device */
@@ -188,26 +161,54 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
188 return simple_strtoul((const char *)s, endp, base) << shift_left_factor; 161 return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
189} 162}
190 163
164/*
165 * This function fills in xfs_mount_t fields based on mount args.
166 * Note: the superblock has _not_ yet been read in.
167 *
168 * Note that this function leaks the various device name allocations on
169 * failure. The caller takes care of them.
170 */
191STATIC int 171STATIC int
192xfs_parseargs( 172xfs_parseargs(
193 struct xfs_mount *mp, 173 struct xfs_mount *mp,
194 char *options, 174 char *options,
195 struct xfs_mount_args *args, 175 char **mtpt)
196 int update)
197{ 176{
177 struct super_block *sb = mp->m_super;
198 char *this_char, *value, *eov; 178 char *this_char, *value, *eov;
199 int dsunit, dswidth, vol_dsunit, vol_dswidth; 179 int dsunit = 0;
200 int iosize; 180 int dswidth = 0;
181 int iosize = 0;
201 int dmapi_implies_ikeep = 1; 182 int dmapi_implies_ikeep = 1;
183 uchar_t iosizelog = 0;
184
185 /*
186 * Copy binary VFS mount flags we are interested in.
187 */
188 if (sb->s_flags & MS_RDONLY)
189 mp->m_flags |= XFS_MOUNT_RDONLY;
190 if (sb->s_flags & MS_DIRSYNC)
191 mp->m_flags |= XFS_MOUNT_DIRSYNC;
192 if (sb->s_flags & MS_SYNCHRONOUS)
193 mp->m_flags |= XFS_MOUNT_WSYNC;
194
195 /*
196 * Set some default flags that could be cleared by the mount option
197 * parsing.
198 */
199 mp->m_flags |= XFS_MOUNT_BARRIER;
200 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
201 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
202 202
203 args->flags |= XFSMNT_BARRIER; 203 /*
204 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 204 * These can be overridden by the mount option parsing.
205 */
206 mp->m_logbufs = -1;
207 mp->m_logbsize = -1;
205 208
206 if (!options) 209 if (!options)
207 goto done; 210 goto done;
208 211
209 iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
210
211 while ((this_char = strsep(&options, ",")) != NULL) { 212 while ((this_char = strsep(&options, ",")) != NULL) {
212 if (!*this_char) 213 if (!*this_char)
213 continue; 214 continue;
@@ -221,7 +222,7 @@ xfs_parseargs(
221 this_char); 222 this_char);
222 return EINVAL; 223 return EINVAL;
223 } 224 }
224 args->logbufs = simple_strtoul(value, &eov, 10); 225 mp->m_logbufs = simple_strtoul(value, &eov, 10);
225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 226 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
226 if (!value || !*value) { 227 if (!value || !*value) {
227 cmn_err(CE_WARN, 228 cmn_err(CE_WARN,
@@ -229,7 +230,7 @@ xfs_parseargs(
229 this_char); 230 this_char);
230 return EINVAL; 231 return EINVAL;
231 } 232 }
232 args->logbufsize = suffix_strtoul(value, &eov, 10); 233 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
233 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 234 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
234 if (!value || !*value) { 235 if (!value || !*value) {
235 cmn_err(CE_WARN, 236 cmn_err(CE_WARN,
@@ -237,7 +238,9 @@ xfs_parseargs(
237 this_char); 238 this_char);
238 return EINVAL; 239 return EINVAL;
239 } 240 }
240 strncpy(args->logname, value, MAXNAMELEN); 241 mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
242 if (!mp->m_logname)
243 return ENOMEM;
241 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 244 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
242 if (!value || !*value) { 245 if (!value || !*value) {
243 cmn_err(CE_WARN, 246 cmn_err(CE_WARN,
@@ -245,7 +248,9 @@ xfs_parseargs(
245 this_char); 248 this_char);
246 return EINVAL; 249 return EINVAL;
247 } 250 }
248 strncpy(args->mtpt, value, MAXNAMELEN); 251 *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
252 if (!*mtpt)
253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 254 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 cmn_err(CE_WARN,
@@ -253,7 +258,9 @@ xfs_parseargs(
253 this_char); 258 this_char);
254 return EINVAL; 259 return EINVAL;
255 } 260 }
256 strncpy(args->rtname, value, MAXNAMELEN); 261 mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
262 if (!mp->m_rtname)
263 return ENOMEM;
257 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 264 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
258 if (!value || !*value) { 265 if (!value || !*value) {
259 cmn_err(CE_WARN, 266 cmn_err(CE_WARN,
@@ -262,8 +269,7 @@ xfs_parseargs(
262 return EINVAL; 269 return EINVAL;
263 } 270 }
264 iosize = simple_strtoul(value, &eov, 10); 271 iosize = simple_strtoul(value, &eov, 10);
265 args->flags |= XFSMNT_IOSIZE; 272 iosizelog = ffs(iosize) - 1;
266 args->iosizelog = (uint8_t) iosize;
267 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 273 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
268 if (!value || !*value) { 274 if (!value || !*value) {
269 cmn_err(CE_WARN, 275 cmn_err(CE_WARN,
@@ -272,8 +278,7 @@ xfs_parseargs(
272 return EINVAL; 278 return EINVAL;
273 } 279 }
274 iosize = suffix_strtoul(value, &eov, 10); 280 iosize = suffix_strtoul(value, &eov, 10);
275 args->flags |= XFSMNT_IOSIZE; 281 iosizelog = ffs(iosize) - 1;
276 args->iosizelog = ffs(iosize) - 1;
277 } else if (!strcmp(this_char, MNTOPT_GRPID) || 282 } else if (!strcmp(this_char, MNTOPT_GRPID) ||
278 !strcmp(this_char, MNTOPT_BSDGROUPS)) { 283 !strcmp(this_char, MNTOPT_BSDGROUPS)) {
279 mp->m_flags |= XFS_MOUNT_GRPID; 284 mp->m_flags |= XFS_MOUNT_GRPID;
@@ -281,23 +286,25 @@ xfs_parseargs(
281 !strcmp(this_char, MNTOPT_SYSVGROUPS)) { 286 !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
282 mp->m_flags &= ~XFS_MOUNT_GRPID; 287 mp->m_flags &= ~XFS_MOUNT_GRPID;
283 } else if (!strcmp(this_char, MNTOPT_WSYNC)) { 288 } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
284 args->flags |= XFSMNT_WSYNC; 289 mp->m_flags |= XFS_MOUNT_WSYNC;
285 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { 290 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
286 args->flags |= XFSMNT_OSYNCISOSYNC; 291 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
287 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 292 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
288 args->flags |= XFSMNT_NORECOVERY; 293 mp->m_flags |= XFS_MOUNT_NORECOVERY;
289 } else if (!strcmp(this_char, MNTOPT_INO64)) { 294 } else if (!strcmp(this_char, MNTOPT_INO64)) {
290 args->flags |= XFSMNT_INO64; 295#if XFS_BIG_INUMS
291#if !XFS_BIG_INUMS 296 mp->m_flags |= XFS_MOUNT_INO64;
297 mp->m_inoadd = XFS_INO64_OFFSET;
298#else
292 cmn_err(CE_WARN, 299 cmn_err(CE_WARN,
293 "XFS: %s option not allowed on this system", 300 "XFS: %s option not allowed on this system",
294 this_char); 301 this_char);
295 return EINVAL; 302 return EINVAL;
296#endif 303#endif
297 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 304 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
298 args->flags |= XFSMNT_NOALIGN; 305 mp->m_flags |= XFS_MOUNT_NOALIGN;
299 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { 306 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
300 args->flags |= XFSMNT_SWALLOC; 307 mp->m_flags |= XFS_MOUNT_SWALLOC;
301 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 308 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
302 if (!value || !*value) { 309 if (!value || !*value) {
303 cmn_err(CE_WARN, 310 cmn_err(CE_WARN,
@@ -315,7 +322,7 @@ xfs_parseargs(
315 } 322 }
316 dswidth = simple_strtoul(value, &eov, 10); 323 dswidth = simple_strtoul(value, &eov, 10);
317 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 324 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
318 args->flags &= ~XFSMNT_32BITINODES; 325 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
319#if !XFS_BIG_INUMS 326#if !XFS_BIG_INUMS
320 cmn_err(CE_WARN, 327 cmn_err(CE_WARN,
321 "XFS: %s option not allowed on this system", 328 "XFS: %s option not allowed on this system",
@@ -323,56 +330,61 @@ xfs_parseargs(
323 return EINVAL; 330 return EINVAL;
324#endif 331#endif
325 } else if (!strcmp(this_char, MNTOPT_NOUUID)) { 332 } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
326 args->flags |= XFSMNT_NOUUID; 333 mp->m_flags |= XFS_MOUNT_NOUUID;
327 } else if (!strcmp(this_char, MNTOPT_BARRIER)) { 334 } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
328 args->flags |= XFSMNT_BARRIER; 335 mp->m_flags |= XFS_MOUNT_BARRIER;
329 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { 336 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
330 args->flags &= ~XFSMNT_BARRIER; 337 mp->m_flags &= ~XFS_MOUNT_BARRIER;
331 } else if (!strcmp(this_char, MNTOPT_IKEEP)) { 338 } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
332 args->flags |= XFSMNT_IKEEP; 339 mp->m_flags |= XFS_MOUNT_IKEEP;
333 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { 340 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
334 dmapi_implies_ikeep = 0; 341 dmapi_implies_ikeep = 0;
335 args->flags &= ~XFSMNT_IKEEP; 342 mp->m_flags &= ~XFS_MOUNT_IKEEP;
336 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { 343 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
337 args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE; 344 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
338 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { 345 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
339 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 346 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
340 } else if (!strcmp(this_char, MNTOPT_ATTR2)) { 347 } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
341 args->flags |= XFSMNT_ATTR2; 348 mp->m_flags |= XFS_MOUNT_ATTR2;
342 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { 349 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
343 args->flags &= ~XFSMNT_ATTR2; 350 mp->m_flags &= ~XFS_MOUNT_ATTR2;
344 args->flags |= XFSMNT_NOATTR2; 351 mp->m_flags |= XFS_MOUNT_NOATTR2;
345 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { 352 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
346 args->flags2 |= XFSMNT2_FILESTREAMS; 353 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
347 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { 354 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
348 args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA); 355 mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
349 args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA); 356 XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
357 XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
358 XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
350 } else if (!strcmp(this_char, MNTOPT_QUOTA) || 359 } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
351 !strcmp(this_char, MNTOPT_UQUOTA) || 360 !strcmp(this_char, MNTOPT_UQUOTA) ||
352 !strcmp(this_char, MNTOPT_USRQUOTA)) { 361 !strcmp(this_char, MNTOPT_USRQUOTA)) {
353 args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF; 362 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
363 XFS_UQUOTA_ENFD);
354 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || 364 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
355 !strcmp(this_char, MNTOPT_UQUOTANOENF)) { 365 !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
356 args->flags |= XFSMNT_UQUOTA; 366 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
357 args->flags &= ~XFSMNT_UQUOTAENF; 367 mp->m_qflags &= ~XFS_UQUOTA_ENFD;
358 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 368 } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
359 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 369 !strcmp(this_char, MNTOPT_PRJQUOTA)) {
360 args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF; 370 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
371 XFS_OQUOTA_ENFD);
361 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 372 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
362 args->flags |= XFSMNT_PQUOTA; 373 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
363 args->flags &= ~XFSMNT_PQUOTAENF; 374 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
364 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 375 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
365 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 376 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
366 args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; 377 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
378 XFS_OQUOTA_ENFD);
367 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 379 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
368 args->flags |= XFSMNT_GQUOTA; 380 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
369 args->flags &= ~XFSMNT_GQUOTAENF; 381 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
370 } else if (!strcmp(this_char, MNTOPT_DMAPI)) { 382 } else if (!strcmp(this_char, MNTOPT_DMAPI)) {
371 args->flags |= XFSMNT_DMAPI; 383 mp->m_flags |= XFS_MOUNT_DMAPI;
372 } else if (!strcmp(this_char, MNTOPT_XDSM)) { 384 } else if (!strcmp(this_char, MNTOPT_XDSM)) {
373 args->flags |= XFSMNT_DMAPI; 385 mp->m_flags |= XFS_MOUNT_DMAPI;
374 } else if (!strcmp(this_char, MNTOPT_DMI)) { 386 } else if (!strcmp(this_char, MNTOPT_DMI)) {
375 args->flags |= XFSMNT_DMAPI; 387 mp->m_flags |= XFS_MOUNT_DMAPI;
376 } else if (!strcmp(this_char, "ihashsize")) { 388 } else if (!strcmp(this_char, "ihashsize")) {
377 cmn_err(CE_WARN, 389 cmn_err(CE_WARN,
378 "XFS: ihashsize no longer used, option is deprecated."); 390 "XFS: ihashsize no longer used, option is deprecated.");
@@ -390,27 +402,29 @@ xfs_parseargs(
390 } 402 }
391 } 403 }
392 404
393 if (args->flags & XFSMNT_NORECOVERY) { 405 /*
394 if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) { 406 * no recovery flag requires a read-only mount
395 cmn_err(CE_WARN, 407 */
396 "XFS: no-recovery mounts must be read-only."); 408 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
397 return EINVAL; 409 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
398 } 410 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only.");
411 return EINVAL;
399 } 412 }
400 413
401 if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) { 414 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
402 cmn_err(CE_WARN, 415 cmn_err(CE_WARN,
403 "XFS: sunit and swidth options incompatible with the noalign option"); 416 "XFS: sunit and swidth options incompatible with the noalign option");
404 return EINVAL; 417 return EINVAL;
405 } 418 }
406 419
407 if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) { 420 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
421 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
408 cmn_err(CE_WARN, 422 cmn_err(CE_WARN,
409 "XFS: cannot mount with both project and group quota"); 423 "XFS: cannot mount with both project and group quota");
410 return EINVAL; 424 return EINVAL;
411 } 425 }
412 426
413 if ((args->flags & XFSMNT_DMAPI) && *args->mtpt == '\0') { 427 if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
414 printk("XFS: %s option needs the mount point option as well\n", 428 printk("XFS: %s option needs the mount point option as well\n",
415 MNTOPT_DMAPI); 429 MNTOPT_DMAPI);
416 return EINVAL; 430 return EINVAL;
@@ -438,27 +452,66 @@ xfs_parseargs(
438 * Note that if "ikeep" or "noikeep" mount options are 452 * Note that if "ikeep" or "noikeep" mount options are
439 * supplied, then they are honored. 453 * supplied, then they are honored.
440 */ 454 */
441 if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep) 455 if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
442 args->flags |= XFSMNT_IKEEP; 456 mp->m_flags |= XFS_MOUNT_IKEEP;
443 457
444 if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { 458done:
459 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
460 /*
461 * At this point the superblock has not been read
462 * in, therefore we do not know the block size.
463 * Before the mount call ends we will convert
464 * these to FSBs.
465 */
445 if (dsunit) { 466 if (dsunit) {
446 args->sunit = dsunit; 467 mp->m_dalign = dsunit;
447 args->flags |= XFSMNT_RETERR; 468 mp->m_flags |= XFS_MOUNT_RETERR;
448 } else {
449 args->sunit = vol_dsunit;
450 } 469 }
451 dswidth ? (args->swidth = dswidth) : 470
452 (args->swidth = vol_dswidth); 471 if (dswidth)
453 } else { 472 mp->m_swidth = dswidth;
454 args->sunit = args->swidth = 0; 473 }
474
475 if (mp->m_logbufs != -1 &&
476 mp->m_logbufs != 0 &&
477 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
478 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
479 cmn_err(CE_WARN,
480 "XFS: invalid logbufs value: %d [not %d-%d]",
481 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
482 return XFS_ERROR(EINVAL);
483 }
484 if (mp->m_logbsize != -1 &&
485 mp->m_logbsize != 0 &&
486 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
487 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
488 !is_power_of_2(mp->m_logbsize))) {
489 cmn_err(CE_WARN,
490 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
491 mp->m_logbsize);
492 return XFS_ERROR(EINVAL);
493 }
494
495 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
496 if (!mp->m_fsname)
497 return ENOMEM;
498 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
499
500 if (iosizelog) {
501 if (iosizelog > XFS_MAX_IO_LOG ||
502 iosizelog < XFS_MIN_IO_LOG) {
503 cmn_err(CE_WARN,
504 "XFS: invalid log iosize: %d [not %d-%d]",
505 iosizelog, XFS_MIN_IO_LOG,
506 XFS_MAX_IO_LOG);
507 return XFS_ERROR(EINVAL);
508 }
509
510 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
511 mp->m_readio_log = iosizelog;
512 mp->m_writeio_log = iosizelog;
455 } 513 }
456 514
457done:
458 if (args->flags & XFSMNT_32BITINODES)
459 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
460 if (args->flags2)
461 args->flags |= XFSMNT_FLAGS2;
462 return 0; 515 return 0;
463} 516}
464 517
@@ -704,8 +757,7 @@ xfs_close_devices(
704 */ 757 */
705STATIC int 758STATIC int
706xfs_open_devices( 759xfs_open_devices(
707 struct xfs_mount *mp, 760 struct xfs_mount *mp)
708 struct xfs_mount_args *args)
709{ 761{
710 struct block_device *ddev = mp->m_super->s_bdev; 762 struct block_device *ddev = mp->m_super->s_bdev;
711 struct block_device *logdev = NULL, *rtdev = NULL; 763 struct block_device *logdev = NULL, *rtdev = NULL;
@@ -714,14 +766,14 @@ xfs_open_devices(
714 /* 766 /*
715 * Open real time and log devices - order is important. 767 * Open real time and log devices - order is important.
716 */ 768 */
717 if (args->logname[0]) { 769 if (mp->m_logname) {
718 error = xfs_blkdev_get(mp, args->logname, &logdev); 770 error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
719 if (error) 771 if (error)
720 goto out; 772 goto out;
721 } 773 }
722 774
723 if (args->rtname[0]) { 775 if (mp->m_rtname) {
724 error = xfs_blkdev_get(mp, args->rtname, &rtdev); 776 error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
725 if (error) 777 if (error)
726 goto out_close_logdev; 778 goto out_close_logdev;
727 779
@@ -813,18 +865,18 @@ xfs_setup_devices(
813 */ 865 */
814void 866void
815xfsaild_wakeup( 867xfsaild_wakeup(
816 xfs_mount_t *mp, 868 struct xfs_ail *ailp,
817 xfs_lsn_t threshold_lsn) 869 xfs_lsn_t threshold_lsn)
818{ 870{
819 mp->m_ail.xa_target = threshold_lsn; 871 ailp->xa_target = threshold_lsn;
820 wake_up_process(mp->m_ail.xa_task); 872 wake_up_process(ailp->xa_task);
821} 873}
822 874
823int 875int
824xfsaild( 876xfsaild(
825 void *data) 877 void *data)
826{ 878{
827 xfs_mount_t *mp = (xfs_mount_t *)data; 879 struct xfs_ail *ailp = data;
828 xfs_lsn_t last_pushed_lsn = 0; 880 xfs_lsn_t last_pushed_lsn = 0;
829 long tout = 0; 881 long tout = 0;
830 882
@@ -836,11 +888,11 @@ xfsaild(
836 /* swsusp */ 888 /* swsusp */
837 try_to_freeze(); 889 try_to_freeze();
838 890
839 ASSERT(mp->m_log); 891 ASSERT(ailp->xa_mount->m_log);
840 if (XFS_FORCED_SHUTDOWN(mp)) 892 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
841 continue; 893 continue;
842 894
843 tout = xfsaild_push(mp, &last_pushed_lsn); 895 tout = xfsaild_push(ailp, &last_pushed_lsn);
844 } 896 }
845 897
846 return 0; 898 return 0;
@@ -848,43 +900,82 @@ xfsaild(
848 900
849int 901int
850xfsaild_start( 902xfsaild_start(
851 xfs_mount_t *mp) 903 struct xfs_ail *ailp)
852{ 904{
853 mp->m_ail.xa_target = 0; 905 ailp->xa_target = 0;
854 mp->m_ail.xa_task = kthread_run(xfsaild, mp, "xfsaild"); 906 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild");
855 if (IS_ERR(mp->m_ail.xa_task)) 907 if (IS_ERR(ailp->xa_task))
856 return -PTR_ERR(mp->m_ail.xa_task); 908 return -PTR_ERR(ailp->xa_task);
857 return 0; 909 return 0;
858} 910}
859 911
860void 912void
861xfsaild_stop( 913xfsaild_stop(
862 xfs_mount_t *mp) 914 struct xfs_ail *ailp)
863{ 915{
864 kthread_stop(mp->m_ail.xa_task); 916 kthread_stop(ailp->xa_task);
865} 917}
866 918
867 919
868 920/* Catch misguided souls that try to use this interface on XFS */
869STATIC struct inode * 921STATIC struct inode *
870xfs_fs_alloc_inode( 922xfs_fs_alloc_inode(
871 struct super_block *sb) 923 struct super_block *sb)
872{ 924{
873 return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); 925 BUG();
926 return NULL;
874} 927}
875 928
929/*
930 * Now that the generic code is guaranteed not to be accessing
931 * the linux inode, we can reclaim the inode.
932 */
876STATIC void 933STATIC void
877xfs_fs_destroy_inode( 934xfs_fs_destroy_inode(
878 struct inode *inode) 935 struct inode *inode)
879{ 936{
880 kmem_zone_free(xfs_vnode_zone, inode); 937 xfs_inode_t *ip = XFS_I(inode);
938
939 XFS_STATS_INC(vn_reclaim);
940 if (xfs_reclaim(ip))
941 panic("%s: cannot reclaim 0x%p\n", __func__, inode);
881} 942}
882 943
944/*
945 * Slab object creation initialisation for the XFS inode.
946 * This covers only the idempotent fields in the XFS inode;
947 * all other fields need to be initialised on allocation
948 * from the slab. This avoids the need to repeatedly intialise
949 * fields in the xfs inode that left in the initialise state
950 * when freeing the inode.
951 */
883STATIC void 952STATIC void
884xfs_fs_inode_init_once( 953xfs_fs_inode_init_once(
885 void *vnode) 954 void *inode)
886{ 955{
887 inode_init_once((struct inode *)vnode); 956 struct xfs_inode *ip = inode;
957
958 memset(ip, 0, sizeof(struct xfs_inode));
959
960 /* vfs inode */
961 inode_init_once(VFS_I(ip));
962
963 /* xfs inode */
964 atomic_set(&ip->i_iocount, 0);
965 atomic_set(&ip->i_pincount, 0);
966 spin_lock_init(&ip->i_flags_lock);
967 init_waitqueue_head(&ip->i_ipin_wait);
968 /*
969 * Because we want to use a counting completion, complete
970 * the flush completion once to allow a single access to
971 * the flush completion without blocking.
972 */
973 init_completion(&ip->i_flush);
974 complete(&ip->i_flush);
975
976 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
977 "xfsino", ip->i_ino);
978 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
888} 979}
889 980
890/* 981/*
@@ -898,21 +989,26 @@ xfs_fs_write_inode(
898 struct inode *inode, 989 struct inode *inode,
899 int sync) 990 int sync)
900{ 991{
992 struct xfs_inode *ip = XFS_I(inode);
901 int error = 0; 993 int error = 0;
902 int flags = 0; 994 int flags = 0;
903 995
904 xfs_itrace_entry(XFS_I(inode)); 996 xfs_itrace_entry(ip);
905 if (sync) { 997 if (sync) {
906 filemap_fdatawait(inode->i_mapping); 998 error = xfs_wait_on_pages(ip, 0, -1);
999 if (error)
1000 goto out_error;
907 flags |= FLUSH_SYNC; 1001 flags |= FLUSH_SYNC;
908 } 1002 }
909 error = xfs_inode_flush(XFS_I(inode), flags); 1003 error = xfs_inode_flush(ip, flags);
1004
1005out_error:
910 /* 1006 /*
911 * if we failed to write out the inode then mark 1007 * if we failed to write out the inode then mark
912 * it dirty again so we'll try again later. 1008 * it dirty again so we'll try again later.
913 */ 1009 */
914 if (error) 1010 if (error)
915 mark_inode_dirty_sync(inode); 1011 xfs_mark_inode_dirty_sync(ip);
916 1012
917 return -error; 1013 return -error;
918} 1014}
@@ -923,164 +1019,12 @@ xfs_fs_clear_inode(
923{ 1019{
924 xfs_inode_t *ip = XFS_I(inode); 1020 xfs_inode_t *ip = XFS_I(inode);
925 1021
926 /* 1022 xfs_itrace_entry(ip);
927 * ip can be null when xfs_iget_core calls xfs_idestroy if we 1023 XFS_STATS_INC(vn_rele);
928 * find an inode with di_mode == 0 but without IGET_CREATE set. 1024 XFS_STATS_INC(vn_remove);
929 */ 1025 XFS_STATS_DEC(vn_active);
930 if (ip) {
931 xfs_itrace_entry(ip);
932 XFS_STATS_INC(vn_rele);
933 XFS_STATS_INC(vn_remove);
934 XFS_STATS_INC(vn_reclaim);
935 XFS_STATS_DEC(vn_active);
936
937 xfs_inactive(ip);
938 xfs_iflags_clear(ip, XFS_IMODIFIED);
939 if (xfs_reclaim(ip))
940 panic("%s: cannot reclaim 0x%p\n", __func__, inode);
941 }
942
943 ASSERT(XFS_I(inode) == NULL);
944}
945 1026
946/* 1027 xfs_inactive(ip);
947 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
948 * Doing this has two advantages:
949 * - It saves on stack space, which is tight in certain situations
950 * - It can be used (with care) as a mechanism to avoid deadlocks.
951 * Flushing while allocating in a full filesystem requires both.
952 */
953STATIC void
954xfs_syncd_queue_work(
955 struct xfs_mount *mp,
956 void *data,
957 void (*syncer)(struct xfs_mount *, void *))
958{
959 struct bhv_vfs_sync_work *work;
960
961 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
962 INIT_LIST_HEAD(&work->w_list);
963 work->w_syncer = syncer;
964 work->w_data = data;
965 work->w_mount = mp;
966 spin_lock(&mp->m_sync_lock);
967 list_add_tail(&work->w_list, &mp->m_sync_list);
968 spin_unlock(&mp->m_sync_lock);
969 wake_up_process(mp->m_sync_task);
970}
971
972/*
973 * Flush delayed allocate data, attempting to free up reserved space
974 * from existing allocations. At this point a new allocation attempt
975 * has failed with ENOSPC and we are in the process of scratching our
976 * heads, looking about for more room...
977 */
978STATIC void
979xfs_flush_inode_work(
980 struct xfs_mount *mp,
981 void *arg)
982{
983 struct inode *inode = arg;
984 filemap_flush(inode->i_mapping);
985 iput(inode);
986}
987
988void
989xfs_flush_inode(
990 xfs_inode_t *ip)
991{
992 struct inode *inode = VFS_I(ip);
993
994 igrab(inode);
995 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
996 delay(msecs_to_jiffies(500));
997}
998
999/*
1000 * This is the "bigger hammer" version of xfs_flush_inode_work...
1001 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
1002 */
1003STATIC void
1004xfs_flush_device_work(
1005 struct xfs_mount *mp,
1006 void *arg)
1007{
1008 struct inode *inode = arg;
1009 sync_blockdev(mp->m_super->s_bdev);
1010 iput(inode);
1011}
1012
1013void
1014xfs_flush_device(
1015 xfs_inode_t *ip)
1016{
1017 struct inode *inode = VFS_I(ip);
1018
1019 igrab(inode);
1020 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
1021 delay(msecs_to_jiffies(500));
1022 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
1023}
1024
1025STATIC void
1026xfs_sync_worker(
1027 struct xfs_mount *mp,
1028 void *unused)
1029{
1030 int error;
1031
1032 if (!(mp->m_flags & XFS_MOUNT_RDONLY))
1033 error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
1034 mp->m_sync_seq++;
1035 wake_up(&mp->m_wait_single_sync_task);
1036}
1037
1038STATIC int
1039xfssyncd(
1040 void *arg)
1041{
1042 struct xfs_mount *mp = arg;
1043 long timeleft;
1044 bhv_vfs_sync_work_t *work, *n;
1045 LIST_HEAD (tmp);
1046
1047 set_freezable();
1048 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
1049 for (;;) {
1050 timeleft = schedule_timeout_interruptible(timeleft);
1051 /* swsusp */
1052 try_to_freeze();
1053 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
1054 break;
1055
1056 spin_lock(&mp->m_sync_lock);
1057 /*
1058 * We can get woken by laptop mode, to do a sync -
1059 * that's the (only!) case where the list would be
1060 * empty with time remaining.
1061 */
1062 if (!timeleft || list_empty(&mp->m_sync_list)) {
1063 if (!timeleft)
1064 timeleft = xfs_syncd_centisecs *
1065 msecs_to_jiffies(10);
1066 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
1067 list_add_tail(&mp->m_sync_work.w_list,
1068 &mp->m_sync_list);
1069 }
1070 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
1071 list_move(&work->w_list, &tmp);
1072 spin_unlock(&mp->m_sync_lock);
1073
1074 list_for_each_entry_safe(work, n, &tmp, w_list) {
1075 (*work->w_syncer)(mp, work->w_data);
1076 list_del(&work->w_list);
1077 if (work == &mp->m_sync_work)
1078 continue;
1079 kmem_free(work);
1080 }
1081 }
1082
1083 return 0;
1084} 1028}
1085 1029
1086STATIC void 1030STATIC void
@@ -1099,11 +1043,9 @@ xfs_fs_put_super(
1099 struct xfs_mount *mp = XFS_M(sb); 1043 struct xfs_mount *mp = XFS_M(sb);
1100 struct xfs_inode *rip = mp->m_rootip; 1044 struct xfs_inode *rip = mp->m_rootip;
1101 int unmount_event_flags = 0; 1045 int unmount_event_flags = 0;
1102 int error;
1103 1046
1104 kthread_stop(mp->m_sync_task); 1047 xfs_syncd_stop(mp);
1105 1048 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI);
1106 xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI);
1107 1049
1108#ifdef HAVE_DMAPI 1050#ifdef HAVE_DMAPI
1109 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1051 if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1128,18 +1070,6 @@ xfs_fs_put_super(
1128 xfs_filestream_unmount(mp); 1070 xfs_filestream_unmount(mp);
1129 1071
1130 XFS_bflush(mp->m_ddev_targp); 1072 XFS_bflush(mp->m_ddev_targp);
1131 error = xfs_unmount_flush(mp, 0);
1132 WARN_ON(error);
1133
1134 /*
1135 * If we're forcing a shutdown, typically because of a media error,
1136 * we want to make sure we invalidate dirty pages that belong to
1137 * referenced vnodes as well.
1138 */
1139 if (XFS_FORCED_SHUTDOWN(mp)) {
1140 error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
1141 ASSERT(error != EFSCORRUPTED);
1142 }
1143 1073
1144 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1074 if (mp->m_flags & XFS_MOUNT_DMAPI) {
1145 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, 1075 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
@@ -1161,7 +1091,7 @@ xfs_fs_write_super(
1161 struct super_block *sb) 1091 struct super_block *sb)
1162{ 1092{
1163 if (!(sb->s_flags & MS_RDONLY)) 1093 if (!(sb->s_flags & MS_RDONLY))
1164 xfs_sync(XFS_M(sb), SYNC_FSDATA); 1094 xfs_sync_fsdata(XFS_M(sb), 0);
1165 sb->s_dirt = 0; 1095 sb->s_dirt = 0;
1166} 1096}
1167 1097
@@ -1172,7 +1102,6 @@ xfs_fs_sync_super(
1172{ 1102{
1173 struct xfs_mount *mp = XFS_M(sb); 1103 struct xfs_mount *mp = XFS_M(sb);
1174 int error; 1104 int error;
1175 int flags;
1176 1105
1177 /* 1106 /*
1178 * Treat a sync operation like a freeze. This is to work 1107 * Treat a sync operation like a freeze. This is to work
@@ -1186,20 +1115,10 @@ xfs_fs_sync_super(
1186 * dirty the Linux inode until after the transaction I/O 1115 * dirty the Linux inode until after the transaction I/O
1187 * completes. 1116 * completes.
1188 */ 1117 */
1189 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) { 1118 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
1190 /* 1119 error = xfs_quiesce_data(mp);
1191 * First stage of freeze - no more writers will make progress 1120 else
1192 * now we are here, so we flush delwri and delalloc buffers 1121 error = xfs_sync_fsdata(mp, 0);
1193 * here, then wait for all I/O to complete. Data is frozen at
1194 * that point. Metadata is not frozen, transactions can still
1195 * occur here so don't bother flushing the buftarg (i.e
1196 * SYNC_QUIESCE) because it'll just get dirty again.
1197 */
1198 flags = SYNC_DATA_QUIESCE;
1199 } else
1200 flags = SYNC_FSDATA;
1201
1202 error = xfs_sync(mp, flags);
1203 sb->s_dirt = 0; 1122 sb->s_dirt = 0;
1204 1123
1205 if (unlikely(laptop_mode)) { 1124 if (unlikely(laptop_mode)) {
@@ -1337,9 +1256,8 @@ xfs_fs_remount(
1337 1256
1338 /* rw -> ro */ 1257 /* rw -> ro */
1339 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1258 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1340 xfs_filestream_flush(mp); 1259 xfs_quiesce_data(mp);
1341 xfs_sync(mp, SYNC_DATA_QUIESCE); 1260 xfs_quiesce_attr(mp);
1342 xfs_attr_quiesce(mp);
1343 mp->m_flags |= XFS_MOUNT_RDONLY; 1261 mp->m_flags |= XFS_MOUNT_RDONLY;
1344 } 1262 }
1345 1263
@@ -1348,7 +1266,7 @@ xfs_fs_remount(
1348 1266
1349/* 1267/*
1350 * Second stage of a freeze. The data is already frozen so we only 1268 * Second stage of a freeze. The data is already frozen so we only
1351 * need to take care of themetadata. Once that's done write a dummy 1269 * need to take care of the metadata. Once that's done write a dummy
1352 * record to dirty the log in case of a crash while frozen. 1270 * record to dirty the log in case of a crash while frozen.
1353 */ 1271 */
1354STATIC void 1272STATIC void
@@ -1357,7 +1275,7 @@ xfs_fs_lockfs(
1357{ 1275{
1358 struct xfs_mount *mp = XFS_M(sb); 1276 struct xfs_mount *mp = XFS_M(sb);
1359 1277
1360 xfs_attr_quiesce(mp); 1278 xfs_quiesce_attr(mp);
1361 xfs_fs_log_dummy(mp); 1279 xfs_fs_log_dummy(mp);
1362} 1280}
1363 1281
@@ -1422,175 +1340,28 @@ xfs_fs_setxquota(
1422 1340
1423/* 1341/*
1424 * This function fills in xfs_mount_t fields based on mount args. 1342 * This function fills in xfs_mount_t fields based on mount args.
1425 * Note: the superblock has _not_ yet been read in.
1426 */
1427STATIC int
1428xfs_start_flags(
1429 struct xfs_mount_args *ap,
1430 struct xfs_mount *mp)
1431{
1432 int error;
1433
1434 /* Values are in BBs */
1435 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
1436 /*
1437 * At this point the superblock has not been read
1438 * in, therefore we do not know the block size.
1439 * Before the mount call ends we will convert
1440 * these to FSBs.
1441 */
1442 mp->m_dalign = ap->sunit;
1443 mp->m_swidth = ap->swidth;
1444 }
1445
1446 if (ap->logbufs != -1 &&
1447 ap->logbufs != 0 &&
1448 (ap->logbufs < XLOG_MIN_ICLOGS ||
1449 ap->logbufs > XLOG_MAX_ICLOGS)) {
1450 cmn_err(CE_WARN,
1451 "XFS: invalid logbufs value: %d [not %d-%d]",
1452 ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
1453 return XFS_ERROR(EINVAL);
1454 }
1455 mp->m_logbufs = ap->logbufs;
1456 if (ap->logbufsize != -1 &&
1457 ap->logbufsize != 0 &&
1458 (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
1459 ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
1460 !is_power_of_2(ap->logbufsize))) {
1461 cmn_err(CE_WARN,
1462 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
1463 ap->logbufsize);
1464 return XFS_ERROR(EINVAL);
1465 }
1466
1467 error = ENOMEM;
1468
1469 mp->m_logbsize = ap->logbufsize;
1470 mp->m_fsname_len = strlen(ap->fsname) + 1;
1471
1472 mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
1473 if (!mp->m_fsname)
1474 goto out;
1475
1476 if (ap->rtname[0]) {
1477 mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
1478 if (!mp->m_rtname)
1479 goto out_free_fsname;
1480
1481 }
1482
1483 if (ap->logname[0]) {
1484 mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
1485 if (!mp->m_logname)
1486 goto out_free_rtname;
1487 }
1488
1489 if (ap->flags & XFSMNT_WSYNC)
1490 mp->m_flags |= XFS_MOUNT_WSYNC;
1491#if XFS_BIG_INUMS
1492 if (ap->flags & XFSMNT_INO64) {
1493 mp->m_flags |= XFS_MOUNT_INO64;
1494 mp->m_inoadd = XFS_INO64_OFFSET;
1495 }
1496#endif
1497 if (ap->flags & XFSMNT_RETERR)
1498 mp->m_flags |= XFS_MOUNT_RETERR;
1499 if (ap->flags & XFSMNT_NOALIGN)
1500 mp->m_flags |= XFS_MOUNT_NOALIGN;
1501 if (ap->flags & XFSMNT_SWALLOC)
1502 mp->m_flags |= XFS_MOUNT_SWALLOC;
1503 if (ap->flags & XFSMNT_OSYNCISOSYNC)
1504 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
1505 if (ap->flags & XFSMNT_32BITINODES)
1506 mp->m_flags |= XFS_MOUNT_32BITINODES;
1507
1508 if (ap->flags & XFSMNT_IOSIZE) {
1509 if (ap->iosizelog > XFS_MAX_IO_LOG ||
1510 ap->iosizelog < XFS_MIN_IO_LOG) {
1511 cmn_err(CE_WARN,
1512 "XFS: invalid log iosize: %d [not %d-%d]",
1513 ap->iosizelog, XFS_MIN_IO_LOG,
1514 XFS_MAX_IO_LOG);
1515 return XFS_ERROR(EINVAL);
1516 }
1517
1518 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
1519 mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
1520 }
1521
1522 if (ap->flags & XFSMNT_IKEEP)
1523 mp->m_flags |= XFS_MOUNT_IKEEP;
1524 if (ap->flags & XFSMNT_DIRSYNC)
1525 mp->m_flags |= XFS_MOUNT_DIRSYNC;
1526 if (ap->flags & XFSMNT_ATTR2)
1527 mp->m_flags |= XFS_MOUNT_ATTR2;
1528 if (ap->flags & XFSMNT_NOATTR2)
1529 mp->m_flags |= XFS_MOUNT_NOATTR2;
1530
1531 if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
1532 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
1533
1534 /*
1535 * no recovery flag requires a read-only mount
1536 */
1537 if (ap->flags & XFSMNT_NORECOVERY) {
1538 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1539 cmn_err(CE_WARN,
1540 "XFS: tried to mount a FS read-write without recovery!");
1541 return XFS_ERROR(EINVAL);
1542 }
1543 mp->m_flags |= XFS_MOUNT_NORECOVERY;
1544 }
1545
1546 if (ap->flags & XFSMNT_NOUUID)
1547 mp->m_flags |= XFS_MOUNT_NOUUID;
1548 if (ap->flags & XFSMNT_BARRIER)
1549 mp->m_flags |= XFS_MOUNT_BARRIER;
1550 else
1551 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1552
1553 if (ap->flags2 & XFSMNT2_FILESTREAMS)
1554 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
1555
1556 if (ap->flags & XFSMNT_DMAPI)
1557 mp->m_flags |= XFS_MOUNT_DMAPI;
1558 return 0;
1559
1560
1561 out_free_rtname:
1562 kfree(mp->m_rtname);
1563 out_free_fsname:
1564 kfree(mp->m_fsname);
1565 out:
1566 return error;
1567}
1568
1569/*
1570 * This function fills in xfs_mount_t fields based on mount args.
1571 * Note: the superblock _has_ now been read in. 1343 * Note: the superblock _has_ now been read in.
1572 */ 1344 */
1573STATIC int 1345STATIC int
1574xfs_finish_flags( 1346xfs_finish_flags(
1575 struct xfs_mount_args *ap,
1576 struct xfs_mount *mp) 1347 struct xfs_mount *mp)
1577{ 1348{
1578 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); 1349 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
1579 1350
1580 /* Fail a mount where the logbuf is smaller then the log stripe */ 1351 /* Fail a mount where the logbuf is smaller then the log stripe */
1581 if (xfs_sb_version_haslogv2(&mp->m_sb)) { 1352 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1582 if ((ap->logbufsize <= 0) && 1353 if (mp->m_logbsize <= 0 &&
1583 (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { 1354 mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
1584 mp->m_logbsize = mp->m_sb.sb_logsunit; 1355 mp->m_logbsize = mp->m_sb.sb_logsunit;
1585 } else if (ap->logbufsize > 0 && 1356 } else if (mp->m_logbsize > 0 &&
1586 ap->logbufsize < mp->m_sb.sb_logsunit) { 1357 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1587 cmn_err(CE_WARN, 1358 cmn_err(CE_WARN,
1588 "XFS: logbuf size must be greater than or equal to log stripe size"); 1359 "XFS: logbuf size must be greater than or equal to log stripe size");
1589 return XFS_ERROR(EINVAL); 1360 return XFS_ERROR(EINVAL);
1590 } 1361 }
1591 } else { 1362 } else {
1592 /* Fail a mount if the logbuf is larger than 32K */ 1363 /* Fail a mount if the logbuf is larger than 32K */
1593 if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { 1364 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1594 cmn_err(CE_WARN, 1365 cmn_err(CE_WARN,
1595 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1366 "XFS: logbuf size for version 1 logs must be 16K or 32K");
1596 return XFS_ERROR(EINVAL); 1367 return XFS_ERROR(EINVAL);
@@ -1602,7 +1373,7 @@ xfs_finish_flags(
1602 * told by noattr2 to turn it off 1373 * told by noattr2 to turn it off
1603 */ 1374 */
1604 if (xfs_sb_version_hasattr2(&mp->m_sb) && 1375 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
1605 !(ap->flags & XFSMNT_NOATTR2)) 1376 !(mp->m_flags & XFS_MOUNT_NOATTR2))
1606 mp->m_flags |= XFS_MOUNT_ATTR2; 1377 mp->m_flags |= XFS_MOUNT_ATTR2;
1607 1378
1608 /* 1379 /*
@@ -1614,48 +1385,6 @@ xfs_finish_flags(
1614 return XFS_ERROR(EROFS); 1385 return XFS_ERROR(EROFS);
1615 } 1386 }
1616 1387
1617 /*
1618 * check for shared mount.
1619 */
1620 if (ap->flags & XFSMNT_SHARED) {
1621 if (!xfs_sb_version_hasshared(&mp->m_sb))
1622 return XFS_ERROR(EINVAL);
1623
1624 /*
1625 * For IRIX 6.5, shared mounts must have the shared
1626 * version bit set, have the persistent readonly
1627 * field set, must be version 0 and can only be mounted
1628 * read-only.
1629 */
1630 if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
1631 (mp->m_sb.sb_shared_vn != 0))
1632 return XFS_ERROR(EINVAL);
1633
1634 mp->m_flags |= XFS_MOUNT_SHARED;
1635
1636 /*
1637 * Shared XFS V0 can't deal with DMI. Return EINVAL.
1638 */
1639 if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
1640 return XFS_ERROR(EINVAL);
1641 }
1642
1643 if (ap->flags & XFSMNT_UQUOTA) {
1644 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
1645 if (ap->flags & XFSMNT_UQUOTAENF)
1646 mp->m_qflags |= XFS_UQUOTA_ENFD;
1647 }
1648
1649 if (ap->flags & XFSMNT_GQUOTA) {
1650 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
1651 if (ap->flags & XFSMNT_GQUOTAENF)
1652 mp->m_qflags |= XFS_OQUOTA_ENFD;
1653 } else if (ap->flags & XFSMNT_PQUOTA) {
1654 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
1655 if (ap->flags & XFSMNT_PQUOTAENF)
1656 mp->m_qflags |= XFS_OQUOTA_ENFD;
1657 }
1658
1659 return 0; 1388 return 0;
1660} 1389}
1661 1390
@@ -1667,19 +1396,14 @@ xfs_fs_fill_super(
1667{ 1396{
1668 struct inode *root; 1397 struct inode *root;
1669 struct xfs_mount *mp = NULL; 1398 struct xfs_mount *mp = NULL;
1670 struct xfs_mount_args *args;
1671 int flags = 0, error = ENOMEM; 1399 int flags = 0, error = ENOMEM;
1672 1400 char *mtpt = NULL;
1673 args = xfs_args_allocate(sb, silent);
1674 if (!args)
1675 return -ENOMEM;
1676 1401
1677 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 1402 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
1678 if (!mp) 1403 if (!mp)
1679 goto out_free_args; 1404 goto out;
1680 1405
1681 spin_lock_init(&mp->m_sb_lock); 1406 spin_lock_init(&mp->m_sb_lock);
1682 mutex_init(&mp->m_ilock);
1683 mutex_init(&mp->m_growlock); 1407 mutex_init(&mp->m_growlock);
1684 atomic_set(&mp->m_active_trans, 0); 1408 atomic_set(&mp->m_active_trans, 0);
1685 INIT_LIST_HEAD(&mp->m_sync_list); 1409 INIT_LIST_HEAD(&mp->m_sync_list);
@@ -1689,12 +1413,9 @@ xfs_fs_fill_super(
1689 mp->m_super = sb; 1413 mp->m_super = sb;
1690 sb->s_fs_info = mp; 1414 sb->s_fs_info = mp;
1691 1415
1692 if (sb->s_flags & MS_RDONLY) 1416 error = xfs_parseargs(mp, (char *)data, &mtpt);
1693 mp->m_flags |= XFS_MOUNT_RDONLY;
1694
1695 error = xfs_parseargs(mp, (char *)data, args, 0);
1696 if (error) 1417 if (error)
1697 goto out_free_mp; 1418 goto out_free_fsname;
1698 1419
1699 sb_min_blocksize(sb, BBSIZE); 1420 sb_min_blocksize(sb, BBSIZE);
1700 sb->s_xattr = xfs_xattr_handlers; 1421 sb->s_xattr = xfs_xattr_handlers;
@@ -1702,33 +1423,28 @@ xfs_fs_fill_super(
1702 sb->s_qcop = &xfs_quotactl_operations; 1423 sb->s_qcop = &xfs_quotactl_operations;
1703 sb->s_op = &xfs_super_operations; 1424 sb->s_op = &xfs_super_operations;
1704 1425
1705 error = xfs_dmops_get(mp, args); 1426 error = xfs_dmops_get(mp);
1706 if (error) 1427 if (error)
1707 goto out_free_mp; 1428 goto out_free_fsname;
1708 error = xfs_qmops_get(mp, args); 1429 error = xfs_qmops_get(mp);
1709 if (error) 1430 if (error)
1710 goto out_put_dmops; 1431 goto out_put_dmops;
1711 1432
1712 if (args->flags & XFSMNT_QUIET) 1433 if (silent)
1713 flags |= XFS_MFSI_QUIET; 1434 flags |= XFS_MFSI_QUIET;
1714 1435
1715 error = xfs_open_devices(mp, args); 1436 error = xfs_open_devices(mp);
1716 if (error) 1437 if (error)
1717 goto out_put_qmops; 1438 goto out_put_qmops;
1718 1439
1719 if (xfs_icsb_init_counters(mp)) 1440 if (xfs_icsb_init_counters(mp))
1720 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1441 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
1721 1442
1722 /*
1723 * Setup flags based on mount(2) options and then the superblock
1724 */
1725 error = xfs_start_flags(args, mp);
1726 if (error)
1727 goto out_free_fsname;
1728 error = xfs_readsb(mp, flags); 1443 error = xfs_readsb(mp, flags);
1729 if (error) 1444 if (error)
1730 goto out_free_fsname; 1445 goto out_destroy_counters;
1731 error = xfs_finish_flags(args, mp); 1446
1447 error = xfs_finish_flags(mp);
1732 if (error) 1448 if (error)
1733 goto out_free_sb; 1449 goto out_free_sb;
1734 1450
@@ -1747,7 +1463,7 @@ xfs_fs_fill_super(
1747 if (error) 1463 if (error)
1748 goto out_filestream_unmount; 1464 goto out_filestream_unmount;
1749 1465
1750 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); 1466 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
1751 1467
1752 sb->s_dirt = 1; 1468 sb->s_dirt = 1;
1753 sb->s_magic = XFS_SB_MAGIC; 1469 sb->s_magic = XFS_SB_MAGIC;
@@ -1772,35 +1488,31 @@ xfs_fs_fill_super(
1772 goto fail_vnrele; 1488 goto fail_vnrele;
1773 } 1489 }
1774 1490
1775 mp->m_sync_work.w_syncer = xfs_sync_worker; 1491 error = xfs_syncd_init(mp);
1776 mp->m_sync_work.w_mount = mp; 1492 if (error)
1777 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
1778 if (IS_ERR(mp->m_sync_task)) {
1779 error = -PTR_ERR(mp->m_sync_task);
1780 goto fail_vnrele; 1493 goto fail_vnrele;
1781 }
1782 1494
1783 xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); 1495 kfree(mtpt);
1784 1496
1785 kfree(args); 1497 xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
1786 return 0; 1498 return 0;
1787 1499
1788 out_filestream_unmount: 1500 out_filestream_unmount:
1789 xfs_filestream_unmount(mp); 1501 xfs_filestream_unmount(mp);
1790 out_free_sb: 1502 out_free_sb:
1791 xfs_freesb(mp); 1503 xfs_freesb(mp);
1792 out_free_fsname: 1504 out_destroy_counters:
1793 xfs_free_fsname(mp);
1794 xfs_icsb_destroy_counters(mp); 1505 xfs_icsb_destroy_counters(mp);
1795 xfs_close_devices(mp); 1506 xfs_close_devices(mp);
1796 out_put_qmops: 1507 out_put_qmops:
1797 xfs_qmops_put(mp); 1508 xfs_qmops_put(mp);
1798 out_put_dmops: 1509 out_put_dmops:
1799 xfs_dmops_put(mp); 1510 xfs_dmops_put(mp);
1800 out_free_mp: 1511 out_free_fsname:
1512 xfs_free_fsname(mp);
1513 kfree(mtpt);
1801 kfree(mp); 1514 kfree(mp);
1802 out_free_args: 1515 out:
1803 kfree(args);
1804 return -error; 1516 return -error;
1805 1517
1806 fail_vnrele: 1518 fail_vnrele:
@@ -1820,8 +1532,6 @@ xfs_fs_fill_super(
1820 xfs_filestream_unmount(mp); 1532 xfs_filestream_unmount(mp);
1821 1533
1822 XFS_bflush(mp->m_ddev_targp); 1534 XFS_bflush(mp->m_ddev_targp);
1823 error = xfs_unmount_flush(mp, 0);
1824 WARN_ON(error);
1825 1535
1826 xfs_unmountfs(mp); 1536 xfs_unmountfs(mp);
1827 goto out_free_sb; 1537 goto out_free_sb;
@@ -1882,10 +1592,19 @@ xfs_alloc_trace_bufs(void)
1882 if (!xfs_bmap_trace_buf) 1592 if (!xfs_bmap_trace_buf)
1883 goto out_free_alloc_trace; 1593 goto out_free_alloc_trace;
1884#endif 1594#endif
1885#ifdef XFS_BMBT_TRACE 1595#ifdef XFS_BTREE_TRACE
1596 xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
1597 KM_MAYFAIL);
1598 if (!xfs_allocbt_trace_buf)
1599 goto out_free_bmap_trace;
1600
1601 xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
1602 if (!xfs_inobt_trace_buf)
1603 goto out_free_allocbt_trace;
1604
1886 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL); 1605 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
1887 if (!xfs_bmbt_trace_buf) 1606 if (!xfs_bmbt_trace_buf)
1888 goto out_free_bmap_trace; 1607 goto out_free_inobt_trace;
1889#endif 1608#endif
1890#ifdef XFS_ATTR_TRACE 1609#ifdef XFS_ATTR_TRACE
1891 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL); 1610 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
@@ -1907,8 +1626,12 @@ xfs_alloc_trace_bufs(void)
1907 ktrace_free(xfs_attr_trace_buf); 1626 ktrace_free(xfs_attr_trace_buf);
1908 out_free_bmbt_trace: 1627 out_free_bmbt_trace:
1909#endif 1628#endif
1910#ifdef XFS_BMBT_TRACE 1629#ifdef XFS_BTREE_TRACE
1911 ktrace_free(xfs_bmbt_trace_buf); 1630 ktrace_free(xfs_bmbt_trace_buf);
1631 out_free_inobt_trace:
1632 ktrace_free(xfs_inobt_trace_buf);
1633 out_free_allocbt_trace:
1634 ktrace_free(xfs_allocbt_trace_buf);
1912 out_free_bmap_trace: 1635 out_free_bmap_trace:
1913#endif 1636#endif
1914#ifdef XFS_BMAP_TRACE 1637#ifdef XFS_BMAP_TRACE
@@ -1931,8 +1654,10 @@ xfs_free_trace_bufs(void)
1931#ifdef XFS_ATTR_TRACE 1654#ifdef XFS_ATTR_TRACE
1932 ktrace_free(xfs_attr_trace_buf); 1655 ktrace_free(xfs_attr_trace_buf);
1933#endif 1656#endif
1934#ifdef XFS_BMBT_TRACE 1657#ifdef XFS_BTREE_TRACE
1935 ktrace_free(xfs_bmbt_trace_buf); 1658 ktrace_free(xfs_bmbt_trace_buf);
1659 ktrace_free(xfs_inobt_trace_buf);
1660 ktrace_free(xfs_allocbt_trace_buf);
1936#endif 1661#endif
1937#ifdef XFS_BMAP_TRACE 1662#ifdef XFS_BMAP_TRACE
1938 ktrace_free(xfs_bmap_trace_buf); 1663 ktrace_free(xfs_bmap_trace_buf);
@@ -1945,16 +1670,10 @@ xfs_free_trace_bufs(void)
1945STATIC int __init 1670STATIC int __init
1946xfs_init_zones(void) 1671xfs_init_zones(void)
1947{ 1672{
1948 xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
1949 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
1950 KM_ZONE_SPREAD,
1951 xfs_fs_inode_init_once);
1952 if (!xfs_vnode_zone)
1953 goto out;
1954 1673
1955 xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); 1674 xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
1956 if (!xfs_ioend_zone) 1675 if (!xfs_ioend_zone)
1957 goto out_destroy_vnode_zone; 1676 goto out;
1958 1677
1959 xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, 1678 xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
1960 xfs_ioend_zone); 1679 xfs_ioend_zone);
@@ -1970,6 +1689,7 @@ xfs_init_zones(void)
1970 "xfs_bmap_free_item"); 1689 "xfs_bmap_free_item");
1971 if (!xfs_bmap_free_item_zone) 1690 if (!xfs_bmap_free_item_zone)
1972 goto out_destroy_log_ticket_zone; 1691 goto out_destroy_log_ticket_zone;
1692
1973 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), 1693 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
1974 "xfs_btree_cur"); 1694 "xfs_btree_cur");
1975 if (!xfs_btree_cur_zone) 1695 if (!xfs_btree_cur_zone)
@@ -2017,8 +1737,8 @@ xfs_init_zones(void)
2017 1737
2018 xfs_inode_zone = 1738 xfs_inode_zone =
2019 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", 1739 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
2020 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | 1740 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
2021 KM_ZONE_SPREAD, NULL); 1741 xfs_fs_inode_init_once);
2022 if (!xfs_inode_zone) 1742 if (!xfs_inode_zone)
2023 goto out_destroy_efi_zone; 1743 goto out_destroy_efi_zone;
2024 1744
@@ -2066,8 +1786,6 @@ xfs_init_zones(void)
2066 mempool_destroy(xfs_ioend_pool); 1786 mempool_destroy(xfs_ioend_pool);
2067 out_destroy_ioend_zone: 1787 out_destroy_ioend_zone:
2068 kmem_zone_destroy(xfs_ioend_zone); 1788 kmem_zone_destroy(xfs_ioend_zone);
2069 out_destroy_vnode_zone:
2070 kmem_zone_destroy(xfs_vnode_zone);
2071 out: 1789 out:
2072 return -ENOMEM; 1790 return -ENOMEM;
2073} 1791}
@@ -2092,7 +1810,6 @@ xfs_destroy_zones(void)
2092 kmem_zone_destroy(xfs_log_ticket_zone); 1810 kmem_zone_destroy(xfs_log_ticket_zone);
2093 mempool_destroy(xfs_ioend_pool); 1811 mempool_destroy(xfs_ioend_pool);
2094 kmem_zone_destroy(xfs_ioend_zone); 1812 kmem_zone_destroy(xfs_ioend_zone);
2095 kmem_zone_destroy(xfs_vnode_zone);
2096 1813
2097} 1814}
2098 1815
@@ -2100,13 +1817,12 @@ STATIC int __init
2100init_xfs_fs(void) 1817init_xfs_fs(void)
2101{ 1818{
2102 int error; 1819 int error;
2103 static char message[] __initdata = KERN_INFO \
2104 XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
2105 1820
2106 printk(message); 1821 printk(KERN_INFO XFS_VERSION_STRING " with "
1822 XFS_BUILD_OPTIONS " enabled\n");
2107 1823
2108 ktrace_init(64); 1824 ktrace_init(64);
2109 vn_init(); 1825 xfs_ioend_init();
2110 xfs_dir_startup(); 1826 xfs_dir_startup();
2111 1827
2112 error = xfs_init_zones(); 1828 error = xfs_init_zones();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index fe2ef4e6a0f9..d5d776d4cd67 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -20,24 +20,12 @@
20 20
21#include <linux/exportfs.h> 21#include <linux/exportfs.h>
22 22
23#ifdef CONFIG_XFS_DMAPI
24# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops)
25# define vfs_initdmapi() dmapi_init()
26# define vfs_exitdmapi() dmapi_uninit()
27#else
28# define vfs_insertdmapi(vfs) do { } while (0)
29# define vfs_initdmapi() do { } while (0)
30# define vfs_exitdmapi() do { } while (0)
31#endif
32
33#ifdef CONFIG_XFS_QUOTA 23#ifdef CONFIG_XFS_QUOTA
34# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops)
35extern void xfs_qm_init(void); 24extern void xfs_qm_init(void);
36extern void xfs_qm_exit(void); 25extern void xfs_qm_exit(void);
37# define vfs_initquota() xfs_qm_init() 26# define vfs_initquota() xfs_qm_init()
38# define vfs_exitquota() xfs_qm_exit() 27# define vfs_exitquota() xfs_qm_exit()
39#else 28#else
40# define vfs_insertquota(vfs) do { } while (0)
41# define vfs_initquota() do { } while (0) 29# define vfs_initquota() do { } while (0)
42# define vfs_exitquota() do { } while (0) 30# define vfs_exitquota() do { } while (0)
43#endif 31#endif
@@ -101,9 +89,6 @@ struct block_device;
101 89
102extern __uint64_t xfs_max_file_offset(unsigned int); 90extern __uint64_t xfs_max_file_offset(unsigned int);
103 91
104extern void xfs_flush_inode(struct xfs_inode *);
105extern void xfs_flush_device(struct xfs_inode *);
106
107extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 92extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
108 93
109extern const struct export_operations xfs_export_operations; 94extern const struct export_operations xfs_export_operations;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
new file mode 100644
index 000000000000..2ed035354c26
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -0,0 +1,762 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_inode.h"
37#include "xfs_dinode.h"
38#include "xfs_error.h"
39#include "xfs_mru_cache.h"
40#include "xfs_filestream.h"
41#include "xfs_vnodeops.h"
42#include "xfs_utils.h"
43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h"
45#include "xfs_rw.h"
46
47#include <linux/kthread.h>
48#include <linux/freezer.h>
49
50/*
51 * Sync all the inodes in the given AG according to the
52 * direction given by the flags.
53 */
54STATIC int
55xfs_sync_inodes_ag(
56 xfs_mount_t *mp,
57 int ag,
58 int flags)
59{
60 xfs_perag_t *pag = &mp->m_perag[ag];
61 int nr_found;
62 uint32_t first_index = 0;
63 int error = 0;
64 int last_error = 0;
65 int fflag = XFS_B_ASYNC;
66
67 if (flags & SYNC_DELWRI)
68 fflag = XFS_B_DELWRI;
69 if (flags & SYNC_WAIT)
70 fflag = 0; /* synchronous overrides all */
71
72 do {
73 struct inode *inode;
74 xfs_inode_t *ip = NULL;
75 int lock_flags = XFS_ILOCK_SHARED;
76
77 /*
78 * use a gang lookup to find the next inode in the tree
79 * as the tree is sparse and a gang lookup walks to find
80 * the number of objects requested.
81 */
82 read_lock(&pag->pag_ici_lock);
83 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
84 (void**)&ip, first_index, 1);
85
86 if (!nr_found) {
87 read_unlock(&pag->pag_ici_lock);
88 break;
89 }
90
91 /*
92 * Update the index for the next lookup. Catch overflows
93 * into the next AG range which can occur if we have inodes
94 * in the last block of the AG and we are currently
95 * pointing to the last inode.
96 */
97 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
98 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
99 read_unlock(&pag->pag_ici_lock);
100 break;
101 }
102
103 /* nothing to sync during shutdown */
104 if (XFS_FORCED_SHUTDOWN(mp)) {
105 read_unlock(&pag->pag_ici_lock);
106 return 0;
107 }
108
109 /*
110 * If we can't get a reference on the inode, it must be
111 * in reclaim. Leave it for the reclaim code to flush.
112 */
113 inode = VFS_I(ip);
114 if (!igrab(inode)) {
115 read_unlock(&pag->pag_ici_lock);
116 continue;
117 }
118 read_unlock(&pag->pag_ici_lock);
119
120 /* avoid new or bad inodes */
121 if (is_bad_inode(inode) ||
122 xfs_iflags_test(ip, XFS_INEW)) {
123 IRELE(ip);
124 continue;
125 }
126
127 /*
128 * If we have to flush data or wait for I/O completion
129 * we need to hold the iolock.
130 */
131 if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
132 xfs_ilock(ip, XFS_IOLOCK_SHARED);
133 lock_flags |= XFS_IOLOCK_SHARED;
134 error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
135 if (flags & SYNC_IOWAIT)
136 xfs_ioend_wait(ip);
137 }
138 xfs_ilock(ip, XFS_ILOCK_SHARED);
139
140 if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
141 if (flags & SYNC_WAIT) {
142 xfs_iflock(ip);
143 if (!xfs_inode_clean(ip))
144 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
145 else
146 xfs_ifunlock(ip);
147 } else if (xfs_iflock_nowait(ip)) {
148 if (!xfs_inode_clean(ip))
149 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
150 else
151 xfs_ifunlock(ip);
152 }
153 }
154 xfs_iput(ip, lock_flags);
155
156 if (error)
157 last_error = error;
158 /*
159 * bail out if the filesystem is corrupted.
160 */
161 if (error == EFSCORRUPTED)
162 return XFS_ERROR(error);
163
164 } while (nr_found);
165
166 return last_error;
167}
168
169int
170xfs_sync_inodes(
171 xfs_mount_t *mp,
172 int flags)
173{
174 int error;
175 int last_error;
176 int i;
177 int lflags = XFS_LOG_FORCE;
178
179 if (mp->m_flags & XFS_MOUNT_RDONLY)
180 return 0;
181 error = 0;
182 last_error = 0;
183
184 if (flags & SYNC_WAIT)
185 lflags |= XFS_LOG_SYNC;
186
187 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
188 if (!mp->m_perag[i].pag_ici_init)
189 continue;
190 error = xfs_sync_inodes_ag(mp, i, flags);
191 if (error)
192 last_error = error;
193 if (error == EFSCORRUPTED)
194 break;
195 }
196 if (flags & SYNC_DELWRI)
197 xfs_log_force(mp, 0, lflags);
198
199 return XFS_ERROR(last_error);
200}
201
202STATIC int
203xfs_commit_dummy_trans(
204 struct xfs_mount *mp,
205 uint log_flags)
206{
207 struct xfs_inode *ip = mp->m_rootip;
208 struct xfs_trans *tp;
209 int error;
210
211 /*
212 * Put a dummy transaction in the log to tell recovery
213 * that all others are OK.
214 */
215 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
216 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
217 if (error) {
218 xfs_trans_cancel(tp, 0);
219 return error;
220 }
221
222 xfs_ilock(ip, XFS_ILOCK_EXCL);
223
224 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
225 xfs_trans_ihold(tp, ip);
226 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
227 /* XXX(hch): ignoring the error here.. */
228 error = xfs_trans_commit(tp, 0);
229
230 xfs_iunlock(ip, XFS_ILOCK_EXCL);
231
232 xfs_log_force(mp, 0, log_flags);
233 return 0;
234}
235
236int
237xfs_sync_fsdata(
238 struct xfs_mount *mp,
239 int flags)
240{
241 struct xfs_buf *bp;
242 struct xfs_buf_log_item *bip;
243 int error = 0;
244
245 /*
246 * If this is xfssyncd() then only sync the superblock if we can
247 * lock it without sleeping and it is not pinned.
248 */
249 if (flags & SYNC_BDFLUSH) {
250 ASSERT(!(flags & SYNC_WAIT));
251
252 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
253 if (!bp)
254 goto out;
255
256 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
257 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
258 goto out_brelse;
259 } else {
260 bp = xfs_getsb(mp, 0);
261
262 /*
263 * If the buffer is pinned then push on the log so we won't
264 * get stuck waiting in the write for someone, maybe
265 * ourselves, to flush the log.
266 *
267 * Even though we just pushed the log above, we did not have
268 * the superblock buffer locked at that point so it can
269 * become pinned in between there and here.
270 */
271 if (XFS_BUF_ISPINNED(bp))
272 xfs_log_force(mp, 0, XFS_LOG_FORCE);
273 }
274
275
276 if (flags & SYNC_WAIT)
277 XFS_BUF_UNASYNC(bp);
278 else
279 XFS_BUF_ASYNC(bp);
280
281 return xfs_bwrite(mp, bp);
282
283 out_brelse:
284 xfs_buf_relse(bp);
285 out:
286 return error;
287}
288
289/*
290 * When remounting a filesystem read-only or freezing the filesystem, we have
291 * two phases to execute. This first phase is syncing the data before we
292 * quiesce the filesystem, and the second is flushing all the inodes out after
293 * we've waited for all the transactions created by the first phase to
294 * complete. The second phase ensures that the inodes are written to their
295 * location on disk rather than just existing in transactions in the log. This
296 * means after a quiesce there is no log replay required to write the inodes to
297 * disk (this is the main difference between a sync and a quiesce).
298 */
299/*
300 * First stage of freeze - no writers will make progress now we are here,
301 * so we flush delwri and delalloc buffers here, then wait for all I/O to
302 * complete. Data is frozen at that point. Metadata is not frozen,
303 * transactions can still occur here so don't bother flushing the buftarg
304 * because it'll just get dirty again.
305 */
306int
307xfs_quiesce_data(
308 struct xfs_mount *mp)
309{
310 int error;
311
312 /* push non-blocking */
313 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
314 XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
315 xfs_filestream_flush(mp);
316
317 /* push and block */
318 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
319 XFS_QM_DQSYNC(mp, SYNC_WAIT);
320
321 /* write superblock and hoover up shutdown errors */
322 error = xfs_sync_fsdata(mp, 0);
323
324 /* flush data-only devices */
325 if (mp->m_rtdev_targp)
326 XFS_bflush(mp->m_rtdev_targp);
327
328 return error;
329}
330
331STATIC void
332xfs_quiesce_fs(
333 struct xfs_mount *mp)
334{
335 int count = 0, pincount;
336
337 xfs_flush_buftarg(mp->m_ddev_targp, 0);
338 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
339
340 /*
341 * This loop must run at least twice. The first instance of the loop
342 * will flush most meta data but that will generate more meta data
343 * (typically directory updates). Which then must be flushed and
344 * logged before we can write the unmount record.
345 */
346 do {
347 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
348 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
349 if (!pincount) {
350 delay(50);
351 count++;
352 }
353 } while (count < 2);
354}
355
356/*
357 * Second stage of a quiesce. The data is already synced, now we have to take
358 * care of the metadata. New transactions are already blocked, so we need to
359 * wait for any remaining transactions to drain out before proceding.
360 */
361void
362xfs_quiesce_attr(
363 struct xfs_mount *mp)
364{
365 int error = 0;
366
367 /* wait for all modifications to complete */
368 while (atomic_read(&mp->m_active_trans) > 0)
369 delay(100);
370
371 /* flush inodes and push all remaining buffers out to disk */
372 xfs_quiesce_fs(mp);
373
374 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
375
376 /* Push the superblock and write an unmount record */
377 error = xfs_log_sbcount(mp, 1);
378 if (error)
379 xfs_fs_cmn_err(CE_WARN, mp,
380 "xfs_attr_quiesce: failed to log sb changes. "
381 "Frozen image may not be consistent.");
382 xfs_log_unmount_write(mp);
383 xfs_unmountfs_writesb(mp);
384}
385
386/*
387 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
388 * Doing this has two advantages:
389 * - It saves on stack space, which is tight in certain situations
390 * - It can be used (with care) as a mechanism to avoid deadlocks.
391 * Flushing while allocating in a full filesystem requires both.
392 */
393STATIC void
394xfs_syncd_queue_work(
395 struct xfs_mount *mp,
396 void *data,
397 void (*syncer)(struct xfs_mount *, void *))
398{
399 struct bhv_vfs_sync_work *work;
400
401 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
402 INIT_LIST_HEAD(&work->w_list);
403 work->w_syncer = syncer;
404 work->w_data = data;
405 work->w_mount = mp;
406 spin_lock(&mp->m_sync_lock);
407 list_add_tail(&work->w_list, &mp->m_sync_list);
408 spin_unlock(&mp->m_sync_lock);
409 wake_up_process(mp->m_sync_task);
410}
411
412/*
413 * Flush delayed allocate data, attempting to free up reserved space
414 * from existing allocations. At this point a new allocation attempt
415 * has failed with ENOSPC and we are in the process of scratching our
416 * heads, looking about for more room...
417 */
418STATIC void
419xfs_flush_inode_work(
420 struct xfs_mount *mp,
421 void *arg)
422{
423 struct inode *inode = arg;
424 filemap_flush(inode->i_mapping);
425 iput(inode);
426}
427
428void
429xfs_flush_inode(
430 xfs_inode_t *ip)
431{
432 struct inode *inode = VFS_I(ip);
433
434 igrab(inode);
435 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
436 delay(msecs_to_jiffies(500));
437}
438
439/*
440 * This is the "bigger hammer" version of xfs_flush_inode_work...
441 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
442 */
443STATIC void
444xfs_flush_device_work(
445 struct xfs_mount *mp,
446 void *arg)
447{
448 struct inode *inode = arg;
449 sync_blockdev(mp->m_super->s_bdev);
450 iput(inode);
451}
452
453void
454xfs_flush_device(
455 xfs_inode_t *ip)
456{
457 struct inode *inode = VFS_I(ip);
458
459 igrab(inode);
460 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
461 delay(msecs_to_jiffies(500));
462 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
463}
464
465/*
466 * Every sync period we need to unpin all items, reclaim inodes, sync
467 * quota and write out the superblock. We might need to cover the log
468 * to indicate it is idle.
469 */
470STATIC void
471xfs_sync_worker(
472 struct xfs_mount *mp,
473 void *unused)
474{
475 int error;
476
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
479 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
480 /* dgc: errors ignored here */
481 error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
482 error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
483 if (xfs_log_need_covered(mp))
484 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
485 }
486 mp->m_sync_seq++;
487 wake_up(&mp->m_wait_single_sync_task);
488}
489
490STATIC int
491xfssyncd(
492 void *arg)
493{
494 struct xfs_mount *mp = arg;
495 long timeleft;
496 bhv_vfs_sync_work_t *work, *n;
497 LIST_HEAD (tmp);
498
499 set_freezable();
500 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
501 for (;;) {
502 timeleft = schedule_timeout_interruptible(timeleft);
503 /* swsusp */
504 try_to_freeze();
505 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
506 break;
507
508 spin_lock(&mp->m_sync_lock);
509 /*
510 * We can get woken by laptop mode, to do a sync -
511 * that's the (only!) case where the list would be
512 * empty with time remaining.
513 */
514 if (!timeleft || list_empty(&mp->m_sync_list)) {
515 if (!timeleft)
516 timeleft = xfs_syncd_centisecs *
517 msecs_to_jiffies(10);
518 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
519 list_add_tail(&mp->m_sync_work.w_list,
520 &mp->m_sync_list);
521 }
522 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
523 list_move(&work->w_list, &tmp);
524 spin_unlock(&mp->m_sync_lock);
525
526 list_for_each_entry_safe(work, n, &tmp, w_list) {
527 (*work->w_syncer)(mp, work->w_data);
528 list_del(&work->w_list);
529 if (work == &mp->m_sync_work)
530 continue;
531 kmem_free(work);
532 }
533 }
534
535 return 0;
536}
537
538int
539xfs_syncd_init(
540 struct xfs_mount *mp)
541{
542 mp->m_sync_work.w_syncer = xfs_sync_worker;
543 mp->m_sync_work.w_mount = mp;
544 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
545 if (IS_ERR(mp->m_sync_task))
546 return -PTR_ERR(mp->m_sync_task);
547 return 0;
548}
549
550void
551xfs_syncd_stop(
552 struct xfs_mount *mp)
553{
554 kthread_stop(mp->m_sync_task);
555}
556
557int
558xfs_reclaim_inode(
559 xfs_inode_t *ip,
560 int locked,
561 int sync_mode)
562{
563 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
564
565 /* The hash lock here protects a thread in xfs_iget_core from
566 * racing with us on linking the inode back with a vnode.
567 * Once we have the XFS_IRECLAIM flag set it will not touch
568 * us.
569 */
570 write_lock(&pag->pag_ici_lock);
571 spin_lock(&ip->i_flags_lock);
572 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
573 !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
574 spin_unlock(&ip->i_flags_lock);
575 write_unlock(&pag->pag_ici_lock);
576 if (locked) {
577 xfs_ifunlock(ip);
578 xfs_iunlock(ip, XFS_ILOCK_EXCL);
579 }
580 return 1;
581 }
582 __xfs_iflags_set(ip, XFS_IRECLAIM);
583 spin_unlock(&ip->i_flags_lock);
584 write_unlock(&pag->pag_ici_lock);
585 xfs_put_perag(ip->i_mount, pag);
586
587 /*
588 * If the inode is still dirty, then flush it out. If the inode
589 * is not in the AIL, then it will be OK to flush it delwri as
590 * long as xfs_iflush() does not keep any references to the inode.
591 * We leave that decision up to xfs_iflush() since it has the
592 * knowledge of whether it's OK to simply do a delwri flush of
593 * the inode or whether we need to wait until the inode is
594 * pulled from the AIL.
595 * We get the flush lock regardless, though, just to make sure
596 * we don't free it while it is being flushed.
597 */
598 if (!locked) {
599 xfs_ilock(ip, XFS_ILOCK_EXCL);
600 xfs_iflock(ip);
601 }
602
603 /*
604 * In the case of a forced shutdown we rely on xfs_iflush() to
605 * wait for the inode to be unpinned before returning an error.
606 */
607 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
608 /* synchronize with xfs_iflush_done */
609 xfs_iflock(ip);
610 xfs_ifunlock(ip);
611 }
612
613 xfs_iunlock(ip, XFS_ILOCK_EXCL);
614 xfs_ireclaim(ip);
615 return 0;
616}
617
618/*
619 * We set the inode flag atomically with the radix tree tag.
620 * Once we get tag lookups on the radix tree, this inode flag
621 * can go away.
622 */
623void
624xfs_inode_set_reclaim_tag(
625 xfs_inode_t *ip)
626{
627 xfs_mount_t *mp = ip->i_mount;
628 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
629
630 read_lock(&pag->pag_ici_lock);
631 spin_lock(&ip->i_flags_lock);
632 radix_tree_tag_set(&pag->pag_ici_root,
633 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
634 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
635 spin_unlock(&ip->i_flags_lock);
636 read_unlock(&pag->pag_ici_lock);
637 xfs_put_perag(mp, pag);
638}
639
640void
641__xfs_inode_clear_reclaim_tag(
642 xfs_mount_t *mp,
643 xfs_perag_t *pag,
644 xfs_inode_t *ip)
645{
646 radix_tree_tag_clear(&pag->pag_ici_root,
647 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
648}
649
650void
651xfs_inode_clear_reclaim_tag(
652 xfs_inode_t *ip)
653{
654 xfs_mount_t *mp = ip->i_mount;
655 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
656
657 read_lock(&pag->pag_ici_lock);
658 spin_lock(&ip->i_flags_lock);
659 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
660 spin_unlock(&ip->i_flags_lock);
661 read_unlock(&pag->pag_ici_lock);
662 xfs_put_perag(mp, pag);
663}
664
665
666STATIC void
667xfs_reclaim_inodes_ag(
668 xfs_mount_t *mp,
669 int ag,
670 int noblock,
671 int mode)
672{
673 xfs_inode_t *ip = NULL;
674 xfs_perag_t *pag = &mp->m_perag[ag];
675 int nr_found;
676 uint32_t first_index;
677 int skipped;
678
679restart:
680 first_index = 0;
681 skipped = 0;
682 do {
683 /*
684 * use a gang lookup to find the next inode in the tree
685 * as the tree is sparse and a gang lookup walks to find
686 * the number of objects requested.
687 */
688 read_lock(&pag->pag_ici_lock);
689 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
690 (void**)&ip, first_index, 1,
691 XFS_ICI_RECLAIM_TAG);
692
693 if (!nr_found) {
694 read_unlock(&pag->pag_ici_lock);
695 break;
696 }
697
698 /*
699 * Update the index for the next lookup. Catch overflows
700 * into the next AG range which can occur if we have inodes
701 * in the last block of the AG and we are currently
702 * pointing to the last inode.
703 */
704 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
705 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
706 read_unlock(&pag->pag_ici_lock);
707 break;
708 }
709
710 /* ignore if already under reclaim */
711 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
712 read_unlock(&pag->pag_ici_lock);
713 continue;
714 }
715
716 if (noblock) {
717 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
718 read_unlock(&pag->pag_ici_lock);
719 continue;
720 }
721 if (xfs_ipincount(ip) ||
722 !xfs_iflock_nowait(ip)) {
723 xfs_iunlock(ip, XFS_ILOCK_EXCL);
724 read_unlock(&pag->pag_ici_lock);
725 continue;
726 }
727 }
728 read_unlock(&pag->pag_ici_lock);
729
730 /*
731 * hmmm - this is an inode already in reclaim. Do
732 * we even bother catching it here?
733 */
734 if (xfs_reclaim_inode(ip, noblock, mode))
735 skipped++;
736 } while (nr_found);
737
738 if (skipped) {
739 delay(1);
740 goto restart;
741 }
742 return;
743
744}
745
746int
747xfs_reclaim_inodes(
748 xfs_mount_t *mp,
749 int noblock,
750 int mode)
751{
752 int i;
753
754 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
755 if (!mp->m_perag[i].pag_ici_init)
756 continue;
757 xfs_reclaim_inodes_ag(mp, i, noblock, mode);
758 }
759 return 0;
760}
761
762
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
new file mode 100644
index 000000000000..5f6de1efe1f6
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef XFS_SYNC_H
19#define XFS_SYNC_H 1
20
21struct xfs_mount;
22
23typedef struct bhv_vfs_sync_work {
24 struct list_head w_list;
25 struct xfs_mount *w_mount;
26 void *w_data; /* syncer routine argument */
27 void (*w_syncer)(struct xfs_mount *, void *);
28} bhv_vfs_sync_work_t;
29
30#define SYNC_ATTR 0x0001 /* sync attributes */
31#define SYNC_DELWRI 0x0002 /* look at delayed writes */
32#define SYNC_WAIT 0x0004 /* wait for i/o to complete */
33#define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */
34#define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */
35
36int xfs_syncd_init(struct xfs_mount *mp);
37void xfs_syncd_stop(struct xfs_mount *mp);
38
39int xfs_sync_inodes(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41
42int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp);
44
45void xfs_flush_inode(struct xfs_inode *ip);
46void xfs_flush_device(struct xfs_inode *ip);
47
48int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
49int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
50
51void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54 struct xfs_inode *ip);
55#endif
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7dacb5bbde3f..916c0ffb6083 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -56,17 +56,6 @@ xfs_stats_clear_proc_handler(
56 56
57static ctl_table xfs_table[] = { 57static ctl_table xfs_table[] = {
58 { 58 {
59 .ctl_name = XFS_RESTRICT_CHOWN,
60 .procname = "restrict_chown",
61 .data = &xfs_params.restrict_chown.val,
62 .maxlen = sizeof(int),
63 .mode = 0644,
64 .proc_handler = &proc_dointvec_minmax,
65 .strategy = &sysctl_intvec,
66 .extra1 = &xfs_params.restrict_chown.min,
67 .extra2 = &xfs_params.restrict_chown.max
68 },
69 {
70 .ctl_name = XFS_SGID_INHERIT, 59 .ctl_name = XFS_SGID_INHERIT,
71 .procname = "irix_sgid_inherit", 60 .procname = "irix_sgid_inherit",
72 .data = &xfs_params.sgid_inherit.val, 61 .data = &xfs_params.sgid_inherit.val,
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index 4aadb8056c37..b9937d450f8e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -31,7 +31,6 @@ typedef struct xfs_sysctl_val {
31} xfs_sysctl_val_t; 31} xfs_sysctl_val_t;
32 32
33typedef struct xfs_param { 33typedef struct xfs_param {
34 xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/
35 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is 34 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
36 * not a member of parent dir GID. */ 35 * not a member of parent dir GID. */
37 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ 36 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
@@ -68,7 +67,7 @@ typedef struct xfs_param {
68enum { 67enum {
69 /* XFS_REFCACHE_SIZE = 1 */ 68 /* XFS_REFCACHE_SIZE = 1 */
70 /* XFS_REFCACHE_PURGE = 2 */ 69 /* XFS_REFCACHE_PURGE = 2 */
71 XFS_RESTRICT_CHOWN = 3, 70 /* XFS_RESTRICT_CHOWN = 3 */
72 XFS_SGID_INHERIT = 4, 71 XFS_SGID_INHERIT = 4,
73 XFS_SYMLINK_MODE = 5, 72 XFS_SYMLINK_MODE = 5,
74 XFS_PANIC_MASK = 6, 73 XFS_PANIC_MASK = 6,
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
deleted file mode 100644
index 7e60c7776b1c..000000000000
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ /dev/null
@@ -1,77 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VFS_H__
19#define __XFS_VFS_H__
20
21#include <linux/vfs.h>
22#include "xfs_fs.h"
23
24struct inode;
25
26struct fid;
27struct cred;
28struct seq_file;
29struct super_block;
30struct xfs_inode;
31struct xfs_mount;
32struct xfs_mount_args;
33
34typedef struct kstatfs bhv_statvfs_t;
35
36typedef struct bhv_vfs_sync_work {
37 struct list_head w_list;
38 struct xfs_mount *w_mount;
39 void *w_data; /* syncer routine argument */
40 void (*w_syncer)(struct xfs_mount *, void *);
41} bhv_vfs_sync_work_t;
42
43#define SYNC_ATTR 0x0001 /* sync attributes */
44#define SYNC_CLOSE 0x0002 /* close file system down */
45#define SYNC_DELWRI 0x0004 /* look at delayed writes */
46#define SYNC_WAIT 0x0008 /* wait for i/o to complete */
47#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */
48#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */
49#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
50#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
51#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */
52
53/*
54 * When remounting a filesystem read-only or freezing the filesystem,
55 * we have two phases to execute. This first phase is syncing the data
56 * before we quiesce the fielsystem, and the second is flushing all the
57 * inodes out after we've waited for all the transactions created by
58 * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE
59 * to ensure that the inodes are written to their location on disk
60 * rather than just existing in transactions in the log. This means
61 * after a quiesce there is no log replay required to write the inodes
62 * to disk (this is the main difference between a sync and a quiesce).
63 */
64#define SYNC_DATA_QUIESCE (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT)
65#define SYNC_INODE_QUIESCE (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
66
67#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
68#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
69#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
70#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
71#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
72#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
73
74#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
75#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
76
77#endif /* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
deleted file mode 100644
index b52528bbbfff..000000000000
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ /dev/null
@@ -1,145 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_vnodeops.h"
20#include "xfs_bmap_btree.h"
21#include "xfs_inode.h"
22
23/*
24 * And this gunk is needed for xfs_mount.h"
25 */
26#include "xfs_log.h"
27#include "xfs_trans.h"
28#include "xfs_sb.h"
29#include "xfs_dmapi.h"
30#include "xfs_inum.h"
31#include "xfs_ag.h"
32#include "xfs_mount.h"
33
34
35/*
36 * Dedicated vnode inactive/reclaim sync wait queues.
37 * Prime number of hash buckets since address is used as the key.
38 */
39#define NVSYNC 37
40#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC])
41static wait_queue_head_t vsync[NVSYNC];
42
43void __init
44vn_init(void)
45{
46 int i;
47
48 for (i = 0; i < NVSYNC; i++)
49 init_waitqueue_head(&vsync[i]);
50}
51
52void
53vn_iowait(
54 xfs_inode_t *ip)
55{
56 wait_queue_head_t *wq = vptosync(ip);
57
58 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
59}
60
61void
62vn_iowake(
63 xfs_inode_t *ip)
64{
65 if (atomic_dec_and_test(&ip->i_iocount))
66 wake_up(vptosync(ip));
67}
68
69/*
70 * Volume managers supporting multiple paths can send back ENODEV when the
71 * final path disappears. In this case continuing to fill the page cache
72 * with dirty data which cannot be written out is evil, so prevent that.
73 */
74void
75vn_ioerror(
76 xfs_inode_t *ip,
77 int error,
78 char *f,
79 int l)
80{
81 if (unlikely(error == -ENODEV))
82 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
83}
84
85#ifdef XFS_INODE_TRACE
86
87/*
88 * Reference count of Linux inode if present, -1 if the xfs_inode
89 * has no associated Linux inode.
90 */
91static inline int xfs_icount(struct xfs_inode *ip)
92{
93 struct inode *vp = VFS_I(ip);
94
95 if (vp)
96 return vn_count(vp);
97 return -1;
98}
99
100#define KTRACE_ENTER(ip, vk, s, line, ra) \
101 ktrace_enter( (ip)->i_trace, \
102/* 0 */ (void *)(__psint_t)(vk), \
103/* 1 */ (void *)(s), \
104/* 2 */ (void *)(__psint_t) line, \
105/* 3 */ (void *)(__psint_t)xfs_icount(ip), \
106/* 4 */ (void *)(ra), \
107/* 5 */ NULL, \
108/* 6 */ (void *)(__psint_t)current_cpu(), \
109/* 7 */ (void *)(__psint_t)current_pid(), \
110/* 8 */ (void *)__return_address, \
111/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
112
113/*
114 * Vnode tracing code.
115 */
116void
117_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
118{
119 KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
120}
121
122void
123_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
124{
125 KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
126}
127
128void
129xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
130{
131 KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
132}
133
134void
135_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
136{
137 KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
138}
139
140void
141xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
142{
143 KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
144}
145#endif /* XFS_INODE_TRACE */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 683ce16210ff..f65983a230d3 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -18,7 +18,10 @@
18#ifndef __XFS_VNODE_H__ 18#ifndef __XFS_VNODE_H__
19#define __XFS_VNODE_H__ 19#define __XFS_VNODE_H__
20 20
21#include "xfs_fs.h"
22
21struct file; 23struct file;
24struct xfs_inode;
22struct xfs_iomap; 25struct xfs_iomap;
23struct attrlist_cursor_kern; 26struct attrlist_cursor_kern;
24 27
@@ -51,40 +54,6 @@ struct attrlist_cursor_kern;
51 Prevent VM access to the pages until 54 Prevent VM access to the pages until
52 the operation completes. */ 55 the operation completes. */
53 56
54
55extern void vn_init(void);
56
57/*
58 * Yeah, these don't take vnode anymore at all, all this should be
59 * cleaned up at some point.
60 */
61extern void vn_iowait(struct xfs_inode *ip);
62extern void vn_iowake(struct xfs_inode *ip);
63extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
64
65static inline int vn_count(struct inode *vp)
66{
67 return atomic_read(&vp->i_count);
68}
69
70#define IHOLD(ip) \
71do { \
72 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
73 atomic_inc(&(VFS_I(ip)->i_count)); \
74 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
75} while (0)
76
77#define IRELE(ip) \
78do { \
79 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
80 iput(VFS_I(ip)); \
81} while (0)
82
83static inline struct inode *vn_grab(struct inode *vp)
84{
85 return igrab(vp);
86}
87
88/* 57/*
89 * Dealing with bad inodes 58 * Dealing with bad inodes
90 */ 59 */
@@ -121,39 +90,4 @@ static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
121 PAGECACHE_TAG_DIRTY) 90 PAGECACHE_TAG_DIRTY)
122 91
123 92
124/*
125 * Tracking vnode activity.
126 */
127#if defined(XFS_INODE_TRACE)
128
129#define INODE_TRACE_SIZE 16 /* number of trace entries */
130#define INODE_KTRACE_ENTRY 1
131#define INODE_KTRACE_EXIT 2
132#define INODE_KTRACE_HOLD 3
133#define INODE_KTRACE_REF 4
134#define INODE_KTRACE_RELE 5
135
136extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
137extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
138extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
139extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
140extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
141#define xfs_itrace_entry(ip) \
142 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
143#define xfs_itrace_exit(ip) \
144 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
145#define xfs_itrace_exit_tag(ip, tag) \
146 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
147#define xfs_itrace_ref(ip) \
148 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
149
150#else
151#define xfs_itrace_entry(a)
152#define xfs_itrace_exit(a)
153#define xfs_itrace_exit_tag(a, b)
154#define xfs_itrace_hold(a, b, c, d)
155#define xfs_itrace_ref(a)
156#define xfs_itrace_rele(a, b, c, d)
157#endif
158
159#endif /* __XFS_VNODE_H__ */ 93#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index f2705f2fd43c..591ca6602bfb 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 if (brandnewdquot) { 101 if (brandnewdquot) {
102 dqp->dq_flnext = dqp->dq_flprev = dqp; 102 dqp->dq_flnext = dqp->dq_flprev = dqp;
103 mutex_init(&dqp->q_qlock); 103 mutex_init(&dqp->q_qlock);
104 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); 104 init_waitqueue_head(&dqp->q_pinwait);
105 105
106 /* 106 /*
107 * Because we want to use a counting completion, complete 107 * Because we want to use a counting completion, complete
@@ -131,7 +131,7 @@ xfs_qm_dqinit(
131 dqp->q_res_bcount = 0; 131 dqp->q_res_bcount = 0;
132 dqp->q_res_icount = 0; 132 dqp->q_res_icount = 0;
133 dqp->q_res_rtbcount = 0; 133 dqp->q_res_rtbcount = 0;
134 dqp->q_pincount = 0; 134 atomic_set(&dqp->q_pincount, 0);
135 dqp->q_hash = NULL; 135 dqp->q_hash = NULL;
136 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 136 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
137 137
@@ -1221,16 +1221,14 @@ xfs_qm_dqflush(
1221 xfs_dqtrace_entry(dqp, "DQFLUSH"); 1221 xfs_dqtrace_entry(dqp, "DQFLUSH");
1222 1222
1223 /* 1223 /*
1224 * If not dirty, nada. 1224 * If not dirty, or it's pinned and we are not supposed to
1225 * block, nada.
1225 */ 1226 */
1226 if (!XFS_DQ_IS_DIRTY(dqp)) { 1227 if (!XFS_DQ_IS_DIRTY(dqp) ||
1228 (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
1227 xfs_dqfunlock(dqp); 1229 xfs_dqfunlock(dqp);
1228 return (0); 1230 return 0;
1229 } 1231 }
1230
1231 /*
1232 * Cant flush a pinned dquot. Wait for it.
1233 */
1234 xfs_qm_dqunpin_wait(dqp); 1232 xfs_qm_dqunpin_wait(dqp);
1235 1233
1236 /* 1234 /*
@@ -1274,10 +1272,8 @@ xfs_qm_dqflush(
1274 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1272 dqp->dq_flags &= ~(XFS_DQ_DIRTY);
1275 mp = dqp->q_mount; 1273 mp = dqp->q_mount;
1276 1274
1277 /* lsn is 64 bits */ 1275 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1278 spin_lock(&mp->m_ail_lock); 1276 &dqp->q_logitem.qli_item.li_lsn);
1279 dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
1280 spin_unlock(&mp->m_ail_lock);
1281 1277
1282 /* 1278 /*
1283 * Attach an iodone routine so that we can remove this dquot from the 1279 * Attach an iodone routine so that we can remove this dquot from the
@@ -1323,8 +1319,10 @@ xfs_qm_dqflush_done(
1323 xfs_dq_logitem_t *qip) 1319 xfs_dq_logitem_t *qip)
1324{ 1320{
1325 xfs_dquot_t *dqp; 1321 xfs_dquot_t *dqp;
1322 struct xfs_ail *ailp;
1326 1323
1327 dqp = qip->qli_dquot; 1324 dqp = qip->qli_dquot;
1325 ailp = qip->qli_item.li_ailp;
1328 1326
1329 /* 1327 /*
1330 * We only want to pull the item from the AIL if its 1328 * We only want to pull the item from the AIL if its
@@ -1337,15 +1335,12 @@ xfs_qm_dqflush_done(
1337 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && 1335 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
1338 qip->qli_item.li_lsn == qip->qli_flush_lsn) { 1336 qip->qli_item.li_lsn == qip->qli_flush_lsn) {
1339 1337
1340 spin_lock(&dqp->q_mount->m_ail_lock); 1338 /* xfs_trans_ail_delete() drops the AIL lock. */
1341 /* 1339 spin_lock(&ailp->xa_lock);
1342 * xfs_trans_delete_ail() drops the AIL lock.
1343 */
1344 if (qip->qli_item.li_lsn == qip->qli_flush_lsn) 1340 if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
1345 xfs_trans_delete_ail(dqp->q_mount, 1341 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
1346 (xfs_log_item_t*)qip);
1347 else 1342 else
1348 spin_unlock(&dqp->q_mount->m_ail_lock); 1343 spin_unlock(&ailp->xa_lock);
1349 } 1344 }
1350 1345
1351 /* 1346 /*
@@ -1375,7 +1370,7 @@ xfs_dqunlock(
1375 mutex_unlock(&(dqp->q_qlock)); 1370 mutex_unlock(&(dqp->q_qlock));
1376 if (dqp->q_logitem.qli_dquot == dqp) { 1371 if (dqp->q_logitem.qli_dquot == dqp) {
1377 /* Once was dqp->q_mount, but might just have been cleared */ 1372 /* Once was dqp->q_mount, but might just have been cleared */
1378 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp, 1373 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
1379 (xfs_log_item_t*)&(dqp->q_logitem)); 1374 (xfs_log_item_t*)&(dqp->q_logitem));
1380 } 1375 }
1381} 1376}
@@ -1489,7 +1484,7 @@ xfs_qm_dqpurge(
1489 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1484 "xfs_qm_dqpurge: dquot %p flush failed", dqp);
1490 xfs_dqflock(dqp); 1485 xfs_dqflock(dqp);
1491 } 1486 }
1492 ASSERT(dqp->q_pincount == 0); 1487 ASSERT(atomic_read(&dqp->q_pincount) == 0);
1493 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1488 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1494 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1489 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1495 1490
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 8958d0faf8d3..7e455337e2ba 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -83,8 +83,8 @@ typedef struct xfs_dquot {
83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ 83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
84 mutex_t q_qlock; /* quota lock */ 84 mutex_t q_qlock; /* quota lock */
85 struct completion q_flush; /* flush completion queue */ 85 struct completion q_flush; /* flush completion queue */
86 uint q_pincount; /* pin count for this dquot */ 86 atomic_t q_pincount; /* dquot pin count */
87 sv_t q_pinwait; /* sync var for pinning */ 87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88#ifdef XFS_DQUOT_TRACE 88#ifdef XFS_DQUOT_TRACE
89 struct ktrace *q_trace; /* trace header structure */ 89 struct ktrace *q_trace; /* trace header structure */
90#endif 90#endif
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index f028644caa5e..1728f6a7c4f5 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -88,25 +88,22 @@ xfs_qm_dquot_logitem_format(
88 88
89/* 89/*
90 * Increment the pin count of the given dquot. 90 * Increment the pin count of the given dquot.
91 * This value is protected by pinlock spinlock in the xQM structure.
92 */ 91 */
93STATIC void 92STATIC void
94xfs_qm_dquot_logitem_pin( 93xfs_qm_dquot_logitem_pin(
95 xfs_dq_logitem_t *logitem) 94 xfs_dq_logitem_t *logitem)
96{ 95{
97 xfs_dquot_t *dqp; 96 xfs_dquot_t *dqp = logitem->qli_dquot;
98 97
99 dqp = logitem->qli_dquot;
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 98 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 99 atomic_inc(&dqp->q_pincount);
102 dqp->q_pincount++;
103 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
104} 100}
105 101
106/* 102/*
107 * Decrement the pin count of the given dquot, and wake up 103 * Decrement the pin count of the given dquot, and wake up
108 * anyone in xfs_dqwait_unpin() if the count goes to 0. The 104 * anyone in xfs_dqwait_unpin() if the count goes to 0. The
109 * dquot must have been previously pinned with a call to xfs_dqpin(). 105 * dquot must have been previously pinned with a call to
106 * xfs_qm_dquot_logitem_pin().
110 */ 107 */
111/* ARGSUSED */ 108/* ARGSUSED */
112STATIC void 109STATIC void
@@ -114,16 +111,11 @@ xfs_qm_dquot_logitem_unpin(
114 xfs_dq_logitem_t *logitem, 111 xfs_dq_logitem_t *logitem,
115 int stale) 112 int stale)
116{ 113{
117 xfs_dquot_t *dqp; 114 xfs_dquot_t *dqp = logitem->qli_dquot;
118 115
119 dqp = logitem->qli_dquot; 116 ASSERT(atomic_read(&dqp->q_pincount) > 0);
120 ASSERT(dqp->q_pincount > 0); 117 if (atomic_dec_and_test(&dqp->q_pincount))
121 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 118 wake_up(&dqp->q_pinwait);
122 dqp->q_pincount--;
123 if (dqp->q_pincount == 0) {
124 sv_broadcast(&dqp->q_pinwait);
125 }
126 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
127} 119}
128 120
129/* ARGSUSED */ 121/* ARGSUSED */
@@ -193,21 +185,14 @@ xfs_qm_dqunpin_wait(
193 xfs_dquot_t *dqp) 185 xfs_dquot_t *dqp)
194{ 186{
195 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 187 ASSERT(XFS_DQ_IS_LOCKED(dqp));
196 if (dqp->q_pincount == 0) { 188 if (atomic_read(&dqp->q_pincount) == 0)
197 return; 189 return;
198 }
199 190
200 /* 191 /*
201 * Give the log a push so we don't wait here too long. 192 * Give the log a push so we don't wait here too long.
202 */ 193 */
203 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); 194 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
204 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 195 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
205 if (dqp->q_pincount == 0) {
206 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
207 return;
208 }
209 sv_wait(&(dqp->q_pinwait), PINOD,
210 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
211} 196}
212 197
213/* 198/*
@@ -310,7 +295,7 @@ xfs_qm_dquot_logitem_trylock(
310 uint retval; 295 uint retval;
311 296
312 dqp = qip->qli_dquot; 297 dqp = qip->qli_dquot;
313 if (dqp->q_pincount > 0) 298 if (atomic_read(&dqp->q_pincount) > 0)
314 return (XFS_ITEM_PINNED); 299 return (XFS_ITEM_PINNED);
315 300
316 if (! xfs_qm_dqlock_nowait(dqp)) 301 if (! xfs_qm_dqlock_nowait(dqp))
@@ -568,14 +553,16 @@ xfs_qm_qoffend_logitem_committed(
568 xfs_lsn_t lsn) 553 xfs_lsn_t lsn)
569{ 554{
570 xfs_qoff_logitem_t *qfs; 555 xfs_qoff_logitem_t *qfs;
556 struct xfs_ail *ailp;
571 557
572 qfs = qfe->qql_start_lip; 558 qfs = qfe->qql_start_lip;
573 spin_lock(&qfs->qql_item.li_mountp->m_ail_lock); 559 ailp = qfs->qql_item.li_ailp;
560 spin_lock(&ailp->xa_lock);
574 /* 561 /*
575 * Delete the qoff-start logitem from the AIL. 562 * Delete the qoff-start logitem from the AIL.
576 * xfs_trans_delete_ail() drops the AIL lock. 563 * xfs_trans_ail_delete() drops the AIL lock.
577 */ 564 */
578 xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs); 565 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
579 kmem_free(qfs); 566 kmem_free(qfs);
580 kmem_free(qfe); 567 kmem_free(qfe);
581 return (xfs_lsn_t)-1; 568 return (xfs_lsn_t)-1;
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index df0ffef9775a..6b13960cf318 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -20,7 +20,6 @@
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_clnt.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -396,13 +395,10 @@ xfs_qm_mount_quotas(
396/* 395/*
397 * Called from the vfsops layer. 396 * Called from the vfsops layer.
398 */ 397 */
399int 398void
400xfs_qm_unmount_quotas( 399xfs_qm_unmount_quotas(
401 xfs_mount_t *mp) 400 xfs_mount_t *mp)
402{ 401{
403 xfs_inode_t *uqp, *gqp;
404 int error = 0;
405
406 /* 402 /*
407 * Release the dquots that root inode, et al might be holding, 403 * Release the dquots that root inode, et al might be holding,
408 * before we flush quotas and blow away the quotainfo structure. 404 * before we flush quotas and blow away the quotainfo structure.
@@ -415,43 +411,18 @@ xfs_qm_unmount_quotas(
415 xfs_qm_dqdetach(mp->m_rsumip); 411 xfs_qm_dqdetach(mp->m_rsumip);
416 412
417 /* 413 /*
418 * Flush out the quota inodes. 414 * Release the quota inodes.
419 */ 415 */
420 uqp = gqp = NULL;
421 if (mp->m_quotainfo) { 416 if (mp->m_quotainfo) {
422 if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) { 417 if (mp->m_quotainfo->qi_uquotaip) {
423 xfs_ilock(uqp, XFS_ILOCK_EXCL); 418 IRELE(mp->m_quotainfo->qi_uquotaip);
424 xfs_iflock(uqp); 419 mp->m_quotainfo->qi_uquotaip = NULL;
425 error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
426 xfs_iunlock(uqp, XFS_ILOCK_EXCL);
427 if (unlikely(error == EFSCORRUPTED)) {
428 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
429 XFS_ERRLEVEL_LOW, mp);
430 goto out;
431 }
432 } 420 }
433 if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) { 421 if (mp->m_quotainfo->qi_gquotaip) {
434 xfs_ilock(gqp, XFS_ILOCK_EXCL); 422 IRELE(mp->m_quotainfo->qi_gquotaip);
435 xfs_iflock(gqp); 423 mp->m_quotainfo->qi_gquotaip = NULL;
436 error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
437 xfs_iunlock(gqp, XFS_ILOCK_EXCL);
438 if (unlikely(error == EFSCORRUPTED)) {
439 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
440 XFS_ERRLEVEL_LOW, mp);
441 goto out;
442 }
443 } 424 }
444 } 425 }
445 if (uqp) {
446 IRELE(uqp);
447 mp->m_quotainfo->qi_uquotaip = NULL;
448 }
449 if (gqp) {
450 IRELE(gqp);
451 mp->m_quotainfo->qi_gquotaip = NULL;
452 }
453out:
454 return XFS_ERROR(error);
455} 426}
456 427
457/* 428/*
@@ -987,14 +958,10 @@ xfs_qm_dqdetach(
987} 958}
988 959
989/* 960/*
990 * This is called by VFS_SYNC and flags arg determines the caller, 961 * This is called to sync quotas. We can be told to use non-blocking
991 * and its motives, as done in xfs_sync. 962 * semantics by either the SYNC_BDFLUSH flag or the absence of the
992 * 963 * SYNC_WAIT flag.
993 * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
994 * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
995 * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
996 */ 964 */
997
998int 965int
999xfs_qm_sync( 966xfs_qm_sync(
1000 xfs_mount_t *mp, 967 xfs_mount_t *mp,
@@ -1137,7 +1104,6 @@ xfs_qm_init_quotainfo(
1137 return error; 1104 return error;
1138 } 1105 }
1139 1106
1140 spin_lock_init(&qinf->qi_pinlock);
1141 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1107 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
1142 qinf->qi_dqreclaims = 0; 1108 qinf->qi_dqreclaims = 0;
1143 1109
@@ -1234,7 +1200,6 @@ xfs_qm_destroy_quotainfo(
1234 */ 1200 */
1235 xfs_qm_rele_quotafs_ref(mp); 1201 xfs_qm_rele_quotafs_ref(mp);
1236 1202
1237 spinlock_destroy(&qi->qi_pinlock);
1238 xfs_qm_list_destroy(&qi->qi_dqlist); 1203 xfs_qm_list_destroy(&qi->qi_dqlist);
1239 1204
1240 if (qi->qi_uquotaip) { 1205 if (qi->qi_uquotaip) {
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 44f25349e478..ddf09166387c 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -106,7 +106,6 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 106typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 107 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 108 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 spinlock_t qi_pinlock; /* dquot pinning lock */
110 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */
111 int qi_dqreclaims; /* a change here indicates 110 int qi_dqreclaims; /* a change here indicates
112 a removal in the dqlist */ 111 a removal in the dqlist */
@@ -168,7 +167,7 @@ extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
168extern void xfs_qm_mount_quotas(xfs_mount_t *); 167extern void xfs_qm_mount_quotas(xfs_mount_t *);
169extern int xfs_qm_quotacheck(xfs_mount_t *); 168extern int xfs_qm_quotacheck(xfs_mount_t *);
170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); 169extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
171extern int xfs_qm_unmount_quotas(xfs_mount_t *); 170extern void xfs_qm_unmount_quotas(xfs_mount_t *);
172extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 171extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
173extern int xfs_qm_sync(xfs_mount_t *, int); 172extern int xfs_qm_sync(xfs_mount_t *, int);
174 173
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index eea2e60b456b..bc6c5cca3e12 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -20,7 +20,6 @@
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_clnt.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -51,7 +50,7 @@
51 50
52STATIC void 51STATIC void
53xfs_fill_statvfs_from_dquot( 52xfs_fill_statvfs_from_dquot(
54 bhv_statvfs_t *statp, 53 struct kstatfs *statp,
55 xfs_disk_dquot_t *dp) 54 xfs_disk_dquot_t *dp)
56{ 55{
57 __uint64_t limit; 56 __uint64_t limit;
@@ -88,7 +87,7 @@ xfs_fill_statvfs_from_dquot(
88STATIC void 87STATIC void
89xfs_qm_statvfs( 88xfs_qm_statvfs(
90 xfs_inode_t *ip, 89 xfs_inode_t *ip,
91 bhv_statvfs_t *statp) 90 struct kstatfs *statp)
92{ 91{
93 xfs_mount_t *mp = ip->i_mount; 92 xfs_mount_t *mp = ip->i_mount;
94 xfs_dquot_t *dqp; 93 xfs_dquot_t *dqp;
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 1a3b803dfa55..68139b38aede 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -127,7 +127,7 @@ xfs_qm_quotactl(
127 break; 127 break;
128 128
129 case Q_XQUOTASYNC: 129 case Q_XQUOTASYNC:
130 return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL)); 130 return xfs_sync_inodes(mp, SYNC_DELWRI);
131 131
132 default: 132 default:
133 break; 133 break;
@@ -1022,101 +1022,104 @@ xfs_qm_export_flags(
1022 1022
1023 1023
1024/* 1024/*
1025 * Go thru all the inodes in the file system, releasing their dquots. 1025 * Release all the dquots on the inodes in an AG.
1026 * Note that the mount structure gets modified to indicate that quotas are off
1027 * AFTER this, in the case of quotaoff. This also gets called from
1028 * xfs_rootumount.
1029 */ 1026 */
1030void 1027STATIC void
1031xfs_qm_dqrele_all_inodes( 1028xfs_qm_dqrele_inodes_ag(
1032 struct xfs_mount *mp, 1029 xfs_mount_t *mp,
1033 uint flags) 1030 int ag,
1031 uint flags)
1034{ 1032{
1035 xfs_inode_t *ip, *topino; 1033 xfs_inode_t *ip = NULL;
1036 uint ireclaims; 1034 xfs_perag_t *pag = &mp->m_perag[ag];
1037 struct inode *vp; 1035 int first_index = 0;
1038 boolean_t vnode_refd; 1036 int nr_found;
1039 1037
1040 ASSERT(mp->m_quotainfo);
1041
1042 XFS_MOUNT_ILOCK(mp);
1043again:
1044 ip = mp->m_inodes;
1045 if (ip == NULL) {
1046 XFS_MOUNT_IUNLOCK(mp);
1047 return;
1048 }
1049 do { 1038 do {
1050 /* Skip markers inserted by xfs_sync */ 1039 /*
1051 if (ip->i_mount == NULL) { 1040 * use a gang lookup to find the next inode in the tree
1052 ip = ip->i_mnext; 1041 * as the tree is sparse and a gang lookup walks to find
1053 continue; 1042 * the number of objects requested.
1043 */
1044 read_lock(&pag->pag_ici_lock);
1045 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
1046 (void**)&ip, first_index, 1);
1047
1048 if (!nr_found) {
1049 read_unlock(&pag->pag_ici_lock);
1050 break;
1054 } 1051 }
1055 /* Root inode, rbmip and rsumip have associated blocks */ 1052
1053 /*
1054 * Update the index for the next lookup. Catch overflows
1055 * into the next AG range which can occur if we have inodes
1056 * in the last block of the AG and we are currently
1057 * pointing to the last inode.
1058 */
1059 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1060 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
1061 read_unlock(&pag->pag_ici_lock);
1062 break;
1063 }
1064
1065 /* skip quota inodes */
1056 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) { 1066 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
1057 ASSERT(ip->i_udquot == NULL); 1067 ASSERT(ip->i_udquot == NULL);
1058 ASSERT(ip->i_gdquot == NULL); 1068 ASSERT(ip->i_gdquot == NULL);
1059 ip = ip->i_mnext; 1069 read_unlock(&pag->pag_ici_lock);
1060 continue; 1070 continue;
1061 } 1071 }
1062 vp = VFS_I(ip); 1072
1063 if (!vp) { 1073 /*
1064 ASSERT(ip->i_udquot == NULL); 1074 * If we can't get a reference on the inode, it must be
1065 ASSERT(ip->i_gdquot == NULL); 1075 * in reclaim. Leave it for the reclaim code to flush.
1066 ip = ip->i_mnext; 1076 */
1077 if (!igrab(VFS_I(ip))) {
1078 read_unlock(&pag->pag_ici_lock);
1067 continue; 1079 continue;
1068 } 1080 }
1069 vnode_refd = B_FALSE; 1081 read_unlock(&pag->pag_ici_lock);
1070 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { 1082
1071 ireclaims = mp->m_ireclaims; 1083 /* avoid new inodes though we shouldn't find any here */
1072 topino = mp->m_inodes; 1084 if (xfs_iflags_test(ip, XFS_INEW)) {
1073 vp = vn_grab(vp); 1085 IRELE(ip);
1074 if (!vp) 1086 continue;
1075 goto again;
1076
1077 XFS_MOUNT_IUNLOCK(mp);
1078 /* XXX restart limit ? */
1079 xfs_ilock(ip, XFS_ILOCK_EXCL);
1080 vnode_refd = B_TRUE;
1081 } else {
1082 ireclaims = mp->m_ireclaims;
1083 topino = mp->m_inodes;
1084 XFS_MOUNT_IUNLOCK(mp);
1085 } 1087 }
1086 1088
1087 /* 1089 xfs_ilock(ip, XFS_ILOCK_EXCL);
1088 * We don't keep the mountlock across the dqrele() call,
1089 * since it can take a while..
1090 */
1091 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 1090 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
1092 xfs_qm_dqrele(ip->i_udquot); 1091 xfs_qm_dqrele(ip->i_udquot);
1093 ip->i_udquot = NULL; 1092 ip->i_udquot = NULL;
1094 } 1093 }
1095 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { 1094 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) &&
1095 ip->i_gdquot) {
1096 xfs_qm_dqrele(ip->i_gdquot); 1096 xfs_qm_dqrele(ip->i_gdquot);
1097 ip->i_gdquot = NULL; 1097 ip->i_gdquot = NULL;
1098 } 1098 }
1099 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1099 xfs_iput(ip, XFS_ILOCK_EXCL);
1100 /* 1100
1101 * Wait until we've dropped the ilock and mountlock to 1101 } while (nr_found);
1102 * do the vn_rele. Or be condemned to an eternity in the 1102}
1103 * inactive code in hell. 1103
1104 */ 1104/*
1105 if (vnode_refd) 1105 * Go thru all the inodes in the file system, releasing their dquots.
1106 IRELE(ip); 1106 * Note that the mount structure gets modified to indicate that quotas are off
1107 XFS_MOUNT_ILOCK(mp); 1107 * AFTER this, in the case of quotaoff. This also gets called from
1108 /* 1108 * xfs_rootumount.
1109 * If an inode was inserted or removed, we gotta 1109 */
1110 * start over again. 1110void
1111 */ 1111xfs_qm_dqrele_all_inodes(
1112 if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) { 1112 struct xfs_mount *mp,
1113 /* XXX use a sentinel */ 1113 uint flags)
1114 goto again; 1114{
1115 } 1115 int i;
1116 ip = ip->i_mnext;
1117 } while (ip != mp->m_inodes);
1118 1116
1119 XFS_MOUNT_IUNLOCK(mp); 1117 ASSERT(mp->m_quotainfo);
1118 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
1119 if (!mp->m_perag[i].pag_ici_init)
1120 continue;
1121 xfs_qm_dqrele_inodes_ag(mp, i, flags);
1122 }
1120} 1123}
1121 1124
1122/*------------------------------------------------------------------------*/ 1125/*------------------------------------------------------------------------*/
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index c27abef7b84f..ae5482965424 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -18,6 +18,13 @@
18#include <xfs.h> 18#include <xfs.h>
19#include "debug.h" 19#include "debug.h"
20 20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_dmapi.h"
26#include "xfs_mount.h"
27
21static char message[1024]; /* keep it off the stack */ 28static char message[1024]; /* keep it off the stack */
22static DEFINE_SPINLOCK(xfs_err_lock); 29static DEFINE_SPINLOCK(xfs_err_lock);
23 30
@@ -55,22 +62,42 @@ cmn_err(register int level, char *fmt, ...)
55} 62}
56 63
57void 64void
58icmn_err(register int level, char *fmt, va_list ap) 65xfs_fs_vcmn_err(
66 int level,
67 struct xfs_mount *mp,
68 char *fmt,
69 va_list ap)
59{ 70{
60 ulong flags; 71 unsigned long flags;
61 int len; 72 int len = 0;
62 73
63 level &= XFS_ERR_MASK; 74 level &= XFS_ERR_MASK;
64 if(level > XFS_MAX_ERR_LEVEL) 75 if (level > XFS_MAX_ERR_LEVEL)
65 level = XFS_MAX_ERR_LEVEL; 76 level = XFS_MAX_ERR_LEVEL;
77
66 spin_lock_irqsave(&xfs_err_lock,flags); 78 spin_lock_irqsave(&xfs_err_lock,flags);
67 len = vsnprintf(message, sizeof(message), fmt, ap); 79
80 if (mp) {
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
82
83 /*
84 * Skip the printk if we can't print anything useful
85 * due to an over-long device name.
86 */
87 if (len >= sizeof(message))
88 goto out;
89 }
90
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
68 if (len >= sizeof(message)) 92 if (len >= sizeof(message))
69 len = sizeof(message) - 1; 93 len = sizeof(message) - 1;
70 if (message[len-1] == '\n') 94 if (message[len-1] == '\n')
71 message[len-1] = 0; 95 message[len-1] = 0;
96
72 printk("%s%s\n", err_level[level], message); 97 printk("%s%s\n", err_level[level], message);
98 out:
73 spin_unlock_irqrestore(&xfs_err_lock,flags); 99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100
74 BUG_ON(level == CE_PANIC); 101 BUG_ON(level == CE_PANIC);
75} 102}
76 103
@@ -84,5 +111,5 @@ assfail(char *expr, char *file, int line)
84void 111void
85xfs_hex_dump(void *p, int length) 112xfs_hex_dump(void *p, int length)
86{ 113{
87 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); 114 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
88} 115}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 75845f950814..6f4fd37c67af 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -27,8 +27,6 @@
27#define CE_ALERT 1 /* alert */ 27#define CE_ALERT 1 /* alert */
28#define CE_PANIC 0 /* panic */ 28#define CE_PANIC 0 /* panic */
29 29
30extern void icmn_err(int, char *, va_list)
31 __attribute__ ((format (printf, 2, 0)));
32extern void cmn_err(int, char *, ...) 30extern void cmn_err(int, char *, ...)
33 __attribute__ ((format (printf, 2, 3))); 31 __attribute__ ((format (printf, 2, 3)));
34extern void assfail(char *expr, char *f, int l); 32extern void assfail(char *expr, char *f, int l);
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index a34ef05489b1..2d494c26717f 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -113,21 +113,16 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
113void 113void
114ktrace_free(ktrace_t *ktp) 114ktrace_free(ktrace_t *ktp)
115{ 115{
116 int entries_size;
117
118 if (ktp == (ktrace_t *)NULL) 116 if (ktp == (ktrace_t *)NULL)
119 return; 117 return;
120 118
121 /* 119 /*
122 * Special treatment for the Vnode trace buffer. 120 * Special treatment for the Vnode trace buffer.
123 */ 121 */
124 if (ktp->kt_nentries == ktrace_zentries) { 122 if (ktp->kt_nentries == ktrace_zentries)
125 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries); 123 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
126 } else { 124 else
127 entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
128
129 kmem_free(ktp->kt_entries); 125 kmem_free(ktp->kt_entries);
130 }
131 126
132 kmem_zone_free(ktrace_hdr_zone, ktp); 127 kmem_zone_free(ktrace_hdr_zone, ktp);
133} 128}
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c989825..17254b529c54 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -30,7 +30,7 @@
30#define XFS_ATTR_TRACE 1 30#define XFS_ATTR_TRACE 1
31#define XFS_BLI_TRACE 1 31#define XFS_BLI_TRACE 1
32#define XFS_BMAP_TRACE 1 32#define XFS_BMAP_TRACE 1
33#define XFS_BMBT_TRACE 1 33#define XFS_BTREE_TRACE 1
34#define XFS_DIR2_TRACE 1 34#define XFS_DIR2_TRACE 1
35#define XFS_DQUOT_TRACE 1 35#define XFS_DQUOT_TRACE 1
36#define XFS_ILOCK_TRACE 1 36#define XFS_ILOCK_TRACE 1
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 91d69338d3b2..a8cdd73999a4 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -758,7 +758,7 @@ xfs_acl_setmode(
758 if (gap && nomask) 758 if (gap && nomask)
759 iattr.ia_mode |= gap->ae_perm << 3; 759 iattr.ia_mode |= gap->ae_perm << 3;
760 760
761 return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred); 761 return xfs_setattr(XFS_I(vp), &iattr, 0);
762} 762}
763 763
764/* 764/*
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 61b292a9fb41..f2e21817a226 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -91,6 +91,8 @@ typedef struct xfs_agf {
91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) 91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
92#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) 92#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp))
93 93
94extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
95 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
94 96
95/* 97/*
96 * Size of the unlinked inode hash table in the agi. 98 * Size of the unlinked inode hash table in the agi.
@@ -142,6 +144,9 @@ typedef struct xfs_agi {
142#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) 144#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
143#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) 145#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp))
144 146
147extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
148 xfs_agnumber_t agno, struct xfs_buf **bpp);
149
145/* 150/*
146 * The third a.g. block contains the a.g. freelist, an array 151 * The third a.g. block contains the a.g. freelist, an array
147 * of block pointers to blocks owned by the allocation btree code. 152 * of block pointers to blocks owned by the allocation btree code.
@@ -192,17 +197,23 @@ typedef struct xfs_perag
192 xfs_agino_t pagi_freecount; /* number of free inodes */ 197 xfs_agino_t pagi_freecount; /* number of free inodes */
193 xfs_agino_t pagi_count; /* number of allocated inodes */ 198 xfs_agino_t pagi_count; /* number of allocated inodes */
194 int pagb_count; /* pagb slots in use */ 199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
195#ifdef __KERNEL__ 201#ifdef __KERNEL__
196 spinlock_t pagb_lock; /* lock for pagb_list */ 202 spinlock_t pagb_lock; /* lock for pagb_list */
197#endif 203
198 xfs_perag_busy_t *pagb_list; /* unstable blocks */
199 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 204 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
200 205
201 int pag_ici_init; /* incore inode cache initialised */ 206 int pag_ici_init; /* incore inode cache initialised */
202 rwlock_t pag_ici_lock; /* incore inode lock */ 207 rwlock_t pag_ici_lock; /* incore inode lock */
203 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 208 struct radix_tree_root pag_ici_root; /* incore inode cache root */
209#endif
204} xfs_perag_t; 210} xfs_perag_t;
205 211
212/*
213 * tags for inode radix tree
214 */
215#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
216
206#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 217#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
207#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ 218#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
208 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) 219 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 1956f83489f1..028e44e58ea9 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -90,6 +90,92 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
90 */ 90 */
91 91
92/* 92/*
93 * Lookup the record equal to [bno, len] in the btree given by cur.
94 */
95STATIC int /* error */
96xfs_alloc_lookup_eq(
97 struct xfs_btree_cur *cur, /* btree cursor */
98 xfs_agblock_t bno, /* starting block of extent */
99 xfs_extlen_t len, /* length of extent */
100 int *stat) /* success/failure */
101{
102 cur->bc_rec.a.ar_startblock = bno;
103 cur->bc_rec.a.ar_blockcount = len;
104 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
105}
106
107/*
108 * Lookup the first record greater than or equal to [bno, len]
109 * in the btree given by cur.
110 */
111STATIC int /* error */
112xfs_alloc_lookup_ge(
113 struct xfs_btree_cur *cur, /* btree cursor */
114 xfs_agblock_t bno, /* starting block of extent */
115 xfs_extlen_t len, /* length of extent */
116 int *stat) /* success/failure */
117{
118 cur->bc_rec.a.ar_startblock = bno;
119 cur->bc_rec.a.ar_blockcount = len;
120 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
121}
122
123/*
124 * Lookup the first record less than or equal to [bno, len]
125 * in the btree given by cur.
126 */
127STATIC int /* error */
128xfs_alloc_lookup_le(
129 struct xfs_btree_cur *cur, /* btree cursor */
130 xfs_agblock_t bno, /* starting block of extent */
131 xfs_extlen_t len, /* length of extent */
132 int *stat) /* success/failure */
133{
134 cur->bc_rec.a.ar_startblock = bno;
135 cur->bc_rec.a.ar_blockcount = len;
136 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
137}
138
139/*
140 * Update the record referred to by cur to the value given
141 * by [bno, len].
142 * This either works (return 0) or gets an EFSCORRUPTED error.
143 */
144STATIC int /* error */
145xfs_alloc_update(
146 struct xfs_btree_cur *cur, /* btree cursor */
147 xfs_agblock_t bno, /* starting block of extent */
148 xfs_extlen_t len) /* length of extent */
149{
150 union xfs_btree_rec rec;
151
152 rec.alloc.ar_startblock = cpu_to_be32(bno);
153 rec.alloc.ar_blockcount = cpu_to_be32(len);
154 return xfs_btree_update(cur, &rec);
155}
156
157/*
158 * Get the data from the pointed-to record.
159 */
160STATIC int /* error */
161xfs_alloc_get_rec(
162 struct xfs_btree_cur *cur, /* btree cursor */
163 xfs_agblock_t *bno, /* output: starting block of extent */
164 xfs_extlen_t *len, /* output: length of extent */
165 int *stat) /* output: success/failure */
166{
167 union xfs_btree_rec *rec;
168 int error;
169
170 error = xfs_btree_get_rec(cur, &rec, stat);
171 if (!error && *stat == 1) {
172 *bno = be32_to_cpu(rec->alloc.ar_startblock);
173 *len = be32_to_cpu(rec->alloc.ar_blockcount);
174 }
175 return error;
176}
177
178/*
93 * Compute aligned version of the found extent. 179 * Compute aligned version of the found extent.
94 * Takes alignment and min length into account. 180 * Takes alignment and min length into account.
95 */ 181 */
@@ -294,21 +380,20 @@ xfs_alloc_fixup_trees(
294 return error; 380 return error;
295 XFS_WANT_CORRUPTED_RETURN(i == 1); 381 XFS_WANT_CORRUPTED_RETURN(i == 1);
296 } 382 }
383
297#ifdef DEBUG 384#ifdef DEBUG
298 { 385 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
299 xfs_alloc_block_t *bnoblock; 386 struct xfs_btree_block *bnoblock;
300 xfs_alloc_block_t *cntblock; 387 struct xfs_btree_block *cntblock;
301 388
302 if (bno_cur->bc_nlevels == 1 && 389 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
303 cnt_cur->bc_nlevels == 1) { 390 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
304 bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]); 391
305 cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]); 392 XFS_WANT_CORRUPTED_RETURN(
306 XFS_WANT_CORRUPTED_RETURN( 393 bnoblock->bb_numrecs == cntblock->bb_numrecs);
307 be16_to_cpu(bnoblock->bb_numrecs) ==
308 be16_to_cpu(cntblock->bb_numrecs));
309 }
310 } 394 }
311#endif 395#endif
396
312 /* 397 /*
313 * Deal with all four cases: the allocated record is contained 398 * Deal with all four cases: the allocated record is contained
314 * within the freespace record, so we can have new freespace 399 * within the freespace record, so we can have new freespace
@@ -333,7 +418,7 @@ xfs_alloc_fixup_trees(
333 /* 418 /*
334 * Delete the entry from the by-size btree. 419 * Delete the entry from the by-size btree.
335 */ 420 */
336 if ((error = xfs_alloc_delete(cnt_cur, &i))) 421 if ((error = xfs_btree_delete(cnt_cur, &i)))
337 return error; 422 return error;
338 XFS_WANT_CORRUPTED_RETURN(i == 1); 423 XFS_WANT_CORRUPTED_RETURN(i == 1);
339 /* 424 /*
@@ -343,7 +428,7 @@ xfs_alloc_fixup_trees(
343 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 428 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
344 return error; 429 return error;
345 XFS_WANT_CORRUPTED_RETURN(i == 0); 430 XFS_WANT_CORRUPTED_RETURN(i == 0);
346 if ((error = xfs_alloc_insert(cnt_cur, &i))) 431 if ((error = xfs_btree_insert(cnt_cur, &i)))
347 return error; 432 return error;
348 XFS_WANT_CORRUPTED_RETURN(i == 1); 433 XFS_WANT_CORRUPTED_RETURN(i == 1);
349 } 434 }
@@ -351,7 +436,7 @@ xfs_alloc_fixup_trees(
351 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 436 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
352 return error; 437 return error;
353 XFS_WANT_CORRUPTED_RETURN(i == 0); 438 XFS_WANT_CORRUPTED_RETURN(i == 0);
354 if ((error = xfs_alloc_insert(cnt_cur, &i))) 439 if ((error = xfs_btree_insert(cnt_cur, &i)))
355 return error; 440 return error;
356 XFS_WANT_CORRUPTED_RETURN(i == 1); 441 XFS_WANT_CORRUPTED_RETURN(i == 1);
357 } 442 }
@@ -362,7 +447,7 @@ xfs_alloc_fixup_trees(
362 /* 447 /*
363 * No remaining freespace, just delete the by-block tree entry. 448 * No remaining freespace, just delete the by-block tree entry.
364 */ 449 */
365 if ((error = xfs_alloc_delete(bno_cur, &i))) 450 if ((error = xfs_btree_delete(bno_cur, &i)))
366 return error; 451 return error;
367 XFS_WANT_CORRUPTED_RETURN(i == 1); 452 XFS_WANT_CORRUPTED_RETURN(i == 1);
368 } else { 453 } else {
@@ -379,7 +464,7 @@ xfs_alloc_fixup_trees(
379 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 464 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
380 return error; 465 return error;
381 XFS_WANT_CORRUPTED_RETURN(i == 0); 466 XFS_WANT_CORRUPTED_RETURN(i == 0);
382 if ((error = xfs_alloc_insert(bno_cur, &i))) 467 if ((error = xfs_btree_insert(bno_cur, &i)))
383 return error; 468 return error;
384 XFS_WANT_CORRUPTED_RETURN(i == 1); 469 XFS_WANT_CORRUPTED_RETURN(i == 1);
385 } 470 }
@@ -640,8 +725,8 @@ xfs_alloc_ag_vextent_exact(
640 /* 725 /*
641 * Allocate/initialize a cursor for the by-number freespace btree. 726 * Allocate/initialize a cursor for the by-number freespace btree.
642 */ 727 */
643 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 728 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
644 args->agno, XFS_BTNUM_BNO, NULL, 0); 729 args->agno, XFS_BTNUM_BNO);
645 /* 730 /*
646 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 731 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
647 * Look for the closest free block <= bno, it must contain bno 732 * Look for the closest free block <= bno, it must contain bno
@@ -696,8 +781,8 @@ xfs_alloc_ag_vextent_exact(
696 * We are allocating agbno for rlen [agbno .. end] 781 * We are allocating agbno for rlen [agbno .. end]
697 * Allocate/initialize a cursor for the by-size btree. 782 * Allocate/initialize a cursor for the by-size btree.
698 */ 783 */
699 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 784 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
700 args->agno, XFS_BTNUM_CNT, NULL, 0); 785 args->agno, XFS_BTNUM_CNT);
701 ASSERT(args->agbno + args->len <= 786 ASSERT(args->agbno + args->len <=
702 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 787 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
703 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 788 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
@@ -759,8 +844,8 @@ xfs_alloc_ag_vextent_near(
759 /* 844 /*
760 * Get a cursor for the by-size btree. 845 * Get a cursor for the by-size btree.
761 */ 846 */
762 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 847 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
763 args->agno, XFS_BTNUM_CNT, NULL, 0); 848 args->agno, XFS_BTNUM_CNT);
764 ltlen = 0; 849 ltlen = 0;
765 bno_cur_lt = bno_cur_gt = NULL; 850 bno_cur_lt = bno_cur_gt = NULL;
766 /* 851 /*
@@ -818,7 +903,7 @@ xfs_alloc_ag_vextent_near(
818 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 903 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
819 if (ltlen >= args->minlen) 904 if (ltlen >= args->minlen)
820 break; 905 break;
821 if ((error = xfs_alloc_increment(cnt_cur, 0, &i))) 906 if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
822 goto error0; 907 goto error0;
823 } while (i); 908 } while (i);
824 ASSERT(ltlen >= args->minlen); 909 ASSERT(ltlen >= args->minlen);
@@ -828,7 +913,7 @@ xfs_alloc_ag_vextent_near(
828 i = cnt_cur->bc_ptrs[0]; 913 i = cnt_cur->bc_ptrs[0];
829 for (j = 1, blen = 0, bdiff = 0; 914 for (j = 1, blen = 0, bdiff = 0;
830 !error && j && (blen < args->maxlen || bdiff > 0); 915 !error && j && (blen < args->maxlen || bdiff > 0);
831 error = xfs_alloc_increment(cnt_cur, 0, &j)) { 916 error = xfs_btree_increment(cnt_cur, 0, &j)) {
832 /* 917 /*
833 * For each entry, decide if it's better than 918 * For each entry, decide if it's better than
834 * the previous best entry. 919 * the previous best entry.
@@ -886,8 +971,8 @@ xfs_alloc_ag_vextent_near(
886 /* 971 /*
887 * Set up a cursor for the by-bno tree. 972 * Set up a cursor for the by-bno tree.
888 */ 973 */
889 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, 974 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
890 args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0); 975 args->agbp, args->agno, XFS_BTNUM_BNO);
891 /* 976 /*
892 * Fix up the btree entries. 977 * Fix up the btree entries.
893 */ 978 */
@@ -914,8 +999,8 @@ xfs_alloc_ag_vextent_near(
914 /* 999 /*
915 * Allocate and initialize the cursor for the leftward search. 1000 * Allocate and initialize the cursor for the leftward search.
916 */ 1001 */
917 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1002 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
918 args->agno, XFS_BTNUM_BNO, NULL, 0); 1003 args->agno, XFS_BTNUM_BNO);
919 /* 1004 /*
920 * Lookup <= bno to find the leftward search's starting point. 1005 * Lookup <= bno to find the leftward search's starting point.
921 */ 1006 */
@@ -938,7 +1023,7 @@ xfs_alloc_ag_vextent_near(
938 * Increment the cursor, so we will point at the entry just right 1023 * Increment the cursor, so we will point at the entry just right
939 * of the leftward entry if any, or to the leftmost entry. 1024 * of the leftward entry if any, or to the leftmost entry.
940 */ 1025 */
941 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) 1026 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
942 goto error0; 1027 goto error0;
943 if (!i) { 1028 if (!i) {
944 /* 1029 /*
@@ -961,7 +1046,7 @@ xfs_alloc_ag_vextent_near(
961 args->minlen, &ltbnoa, &ltlena); 1046 args->minlen, &ltbnoa, &ltlena);
962 if (ltlena >= args->minlen) 1047 if (ltlena >= args->minlen)
963 break; 1048 break;
964 if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) 1049 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
965 goto error0; 1050 goto error0;
966 if (!i) { 1051 if (!i) {
967 xfs_btree_del_cursor(bno_cur_lt, 1052 xfs_btree_del_cursor(bno_cur_lt,
@@ -977,7 +1062,7 @@ xfs_alloc_ag_vextent_near(
977 args->minlen, &gtbnoa, &gtlena); 1062 args->minlen, &gtbnoa, &gtlena);
978 if (gtlena >= args->minlen) 1063 if (gtlena >= args->minlen)
979 break; 1064 break;
980 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) 1065 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
981 goto error0; 1066 goto error0;
982 if (!i) { 1067 if (!i) {
983 xfs_btree_del_cursor(bno_cur_gt, 1068 xfs_btree_del_cursor(bno_cur_gt,
@@ -1066,7 +1151,7 @@ xfs_alloc_ag_vextent_near(
1066 /* 1151 /*
1067 * Fell off the right end. 1152 * Fell off the right end.
1068 */ 1153 */
1069 if ((error = xfs_alloc_increment( 1154 if ((error = xfs_btree_increment(
1070 bno_cur_gt, 0, &i))) 1155 bno_cur_gt, 0, &i)))
1071 goto error0; 1156 goto error0;
1072 if (!i) { 1157 if (!i) {
@@ -1162,7 +1247,7 @@ xfs_alloc_ag_vextent_near(
1162 /* 1247 /*
1163 * Fell off the left end. 1248 * Fell off the left end.
1164 */ 1249 */
1165 if ((error = xfs_alloc_decrement( 1250 if ((error = xfs_btree_decrement(
1166 bno_cur_lt, 0, &i))) 1251 bno_cur_lt, 0, &i)))
1167 goto error0; 1252 goto error0;
1168 if (!i) { 1253 if (!i) {
@@ -1267,8 +1352,8 @@ xfs_alloc_ag_vextent_size(
1267 /* 1352 /*
1268 * Allocate and initialize a cursor for the by-size btree. 1353 * Allocate and initialize a cursor for the by-size btree.
1269 */ 1354 */
1270 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1355 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1271 args->agno, XFS_BTNUM_CNT, NULL, 0); 1356 args->agno, XFS_BTNUM_CNT);
1272 bno_cur = NULL; 1357 bno_cur = NULL;
1273 /* 1358 /*
1274 * Look for an entry >= maxlen+alignment-1 blocks. 1359 * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1321,7 +1406,7 @@ xfs_alloc_ag_vextent_size(
1321 bestflen = flen; 1406 bestflen = flen;
1322 bestfbno = fbno; 1407 bestfbno = fbno;
1323 for (;;) { 1408 for (;;) {
1324 if ((error = xfs_alloc_decrement(cnt_cur, 0, &i))) 1409 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1325 goto error0; 1410 goto error0;
1326 if (i == 0) 1411 if (i == 0)
1327 break; 1412 break;
@@ -1372,8 +1457,8 @@ xfs_alloc_ag_vextent_size(
1372 /* 1457 /*
1373 * Allocate and initialize a cursor for the by-block tree. 1458 * Allocate and initialize a cursor for the by-block tree.
1374 */ 1459 */
1375 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1460 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1376 args->agno, XFS_BTNUM_BNO, NULL, 0); 1461 args->agno, XFS_BTNUM_BNO);
1377 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 1462 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1378 rbno, rlen, XFSA_FIXUP_CNT_OK))) 1463 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1379 goto error0; 1464 goto error0;
@@ -1416,7 +1501,7 @@ xfs_alloc_ag_vextent_small(
1416 xfs_extlen_t flen; 1501 xfs_extlen_t flen;
1417 int i; 1502 int i;
1418 1503
1419 if ((error = xfs_alloc_decrement(ccur, 0, &i))) 1504 if ((error = xfs_btree_decrement(ccur, 0, &i)))
1420 goto error0; 1505 goto error0;
1421 if (i) { 1506 if (i) {
1422 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) 1507 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
@@ -1515,8 +1600,7 @@ xfs_free_ag_extent(
1515 /* 1600 /*
1516 * Allocate and initialize a cursor for the by-block btree. 1601 * Allocate and initialize a cursor for the by-block btree.
1517 */ 1602 */
1518 bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL, 1603 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
1519 0);
1520 cnt_cur = NULL; 1604 cnt_cur = NULL;
1521 /* 1605 /*
1522 * Look for a neighboring block on the left (lower block numbers) 1606 * Look for a neighboring block on the left (lower block numbers)
@@ -1549,7 +1633,7 @@ xfs_free_ag_extent(
1549 * Look for a neighboring block on the right (higher block numbers) 1633 * Look for a neighboring block on the right (higher block numbers)
1550 * that is contiguous with this space. 1634 * that is contiguous with this space.
1551 */ 1635 */
1552 if ((error = xfs_alloc_increment(bno_cur, 0, &haveright))) 1636 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1553 goto error0; 1637 goto error0;
1554 if (haveright) { 1638 if (haveright) {
1555 /* 1639 /*
@@ -1575,8 +1659,7 @@ xfs_free_ag_extent(
1575 /* 1659 /*
1576 * Now allocate and initialize a cursor for the by-size tree. 1660 * Now allocate and initialize a cursor for the by-size tree.
1577 */ 1661 */
1578 cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL, 1662 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1579 0);
1580 /* 1663 /*
1581 * Have both left and right contiguous neighbors. 1664 * Have both left and right contiguous neighbors.
1582 * Merge all three into a single free block. 1665 * Merge all three into a single free block.
@@ -1588,7 +1671,7 @@ xfs_free_ag_extent(
1588 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1671 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1589 goto error0; 1672 goto error0;
1590 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1673 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1591 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1674 if ((error = xfs_btree_delete(cnt_cur, &i)))
1592 goto error0; 1675 goto error0;
1593 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1676 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1594 /* 1677 /*
@@ -1597,19 +1680,19 @@ xfs_free_ag_extent(
1597 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1680 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1598 goto error0; 1681 goto error0;
1599 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1682 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1600 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1683 if ((error = xfs_btree_delete(cnt_cur, &i)))
1601 goto error0; 1684 goto error0;
1602 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1685 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1603 /* 1686 /*
1604 * Delete the old by-block entry for the right block. 1687 * Delete the old by-block entry for the right block.
1605 */ 1688 */
1606 if ((error = xfs_alloc_delete(bno_cur, &i))) 1689 if ((error = xfs_btree_delete(bno_cur, &i)))
1607 goto error0; 1690 goto error0;
1608 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1691 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1609 /* 1692 /*
1610 * Move the by-block cursor back to the left neighbor. 1693 * Move the by-block cursor back to the left neighbor.
1611 */ 1694 */
1612 if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) 1695 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1613 goto error0; 1696 goto error0;
1614 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1697 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1615#ifdef DEBUG 1698#ifdef DEBUG
@@ -1648,14 +1731,14 @@ xfs_free_ag_extent(
1648 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1731 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1649 goto error0; 1732 goto error0;
1650 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1733 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1651 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1734 if ((error = xfs_btree_delete(cnt_cur, &i)))
1652 goto error0; 1735 goto error0;
1653 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1736 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1654 /* 1737 /*
1655 * Back up the by-block cursor to the left neighbor, and 1738 * Back up the by-block cursor to the left neighbor, and
1656 * update its length. 1739 * update its length.
1657 */ 1740 */
1658 if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) 1741 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1659 goto error0; 1742 goto error0;
1660 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1743 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1661 nbno = ltbno; 1744 nbno = ltbno;
@@ -1674,7 +1757,7 @@ xfs_free_ag_extent(
1674 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1757 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1675 goto error0; 1758 goto error0;
1676 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1759 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1677 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1760 if ((error = xfs_btree_delete(cnt_cur, &i)))
1678 goto error0; 1761 goto error0;
1679 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1762 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1680 /* 1763 /*
@@ -1693,7 +1776,7 @@ xfs_free_ag_extent(
1693 else { 1776 else {
1694 nbno = bno; 1777 nbno = bno;
1695 nlen = len; 1778 nlen = len;
1696 if ((error = xfs_alloc_insert(bno_cur, &i))) 1779 if ((error = xfs_btree_insert(bno_cur, &i)))
1697 goto error0; 1780 goto error0;
1698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1781 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1699 } 1782 }
@@ -1705,7 +1788,7 @@ xfs_free_ag_extent(
1705 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 1788 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
1706 goto error0; 1789 goto error0;
1707 XFS_WANT_CORRUPTED_GOTO(i == 0, error0); 1790 XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
1708 if ((error = xfs_alloc_insert(cnt_cur, &i))) 1791 if ((error = xfs_btree_insert(cnt_cur, &i)))
1709 goto error0; 1792 goto error0;
1710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1793 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1711 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1794 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -2150,51 +2233,83 @@ xfs_alloc_put_freelist(
2150 * Read in the allocation group header (free/alloc section). 2233 * Read in the allocation group header (free/alloc section).
2151 */ 2234 */
2152int /* error */ 2235int /* error */
2153xfs_alloc_read_agf( 2236xfs_read_agf(
2154 xfs_mount_t *mp, /* mount point structure */ 2237 struct xfs_mount *mp, /* mount point structure */
2155 xfs_trans_t *tp, /* transaction pointer */ 2238 struct xfs_trans *tp, /* transaction pointer */
2156 xfs_agnumber_t agno, /* allocation group number */ 2239 xfs_agnumber_t agno, /* allocation group number */
2157 int flags, /* XFS_ALLOC_FLAG_... */ 2240 int flags, /* XFS_BUF_ */
2158 xfs_buf_t **bpp) /* buffer for the ag freelist header */ 2241 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2159{ 2242{
2160 xfs_agf_t *agf; /* ag freelist header */ 2243 struct xfs_agf *agf; /* ag freelist header */
2161 int agf_ok; /* set if agf is consistent */ 2244 int agf_ok; /* set if agf is consistent */
2162 xfs_buf_t *bp; /* return value */
2163 xfs_perag_t *pag; /* per allocation group data */
2164 int error; 2245 int error;
2165 2246
2166 ASSERT(agno != NULLAGNUMBER); 2247 ASSERT(agno != NULLAGNUMBER);
2167 error = xfs_trans_read_buf( 2248 error = xfs_trans_read_buf(
2168 mp, tp, mp->m_ddev_targp, 2249 mp, tp, mp->m_ddev_targp,
2169 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 2250 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2170 XFS_FSS_TO_BB(mp, 1), 2251 XFS_FSS_TO_BB(mp, 1), flags, bpp);
2171 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
2172 &bp);
2173 if (error) 2252 if (error)
2174 return error; 2253 return error;
2175 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 2254 if (!*bpp)
2176 if (!bp) {
2177 *bpp = NULL;
2178 return 0; 2255 return 0;
2179 } 2256
2257 ASSERT(!XFS_BUF_GETERROR(*bpp));
2258 agf = XFS_BUF_TO_AGF(*bpp);
2259
2180 /* 2260 /*
2181 * Validate the magic number of the agf block. 2261 * Validate the magic number of the agf block.
2182 */ 2262 */
2183 agf = XFS_BUF_TO_AGF(bp);
2184 agf_ok = 2263 agf_ok =
2185 be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC && 2264 be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC &&
2186 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && 2265 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2187 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && 2266 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2188 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && 2267 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2189 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && 2268 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2190 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); 2269 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
2270 be32_to_cpu(agf->agf_seqno) == agno;
2271 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2272 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2273 be32_to_cpu(agf->agf_length);
2191 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, 2274 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2192 XFS_RANDOM_ALLOC_READ_AGF))) { 2275 XFS_RANDOM_ALLOC_READ_AGF))) {
2193 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", 2276 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2194 XFS_ERRLEVEL_LOW, mp, agf); 2277 XFS_ERRLEVEL_LOW, mp, agf);
2195 xfs_trans_brelse(tp, bp); 2278 xfs_trans_brelse(tp, *bpp);
2196 return XFS_ERROR(EFSCORRUPTED); 2279 return XFS_ERROR(EFSCORRUPTED);
2197 } 2280 }
2281
2282 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
2283 return 0;
2284}
2285
2286/*
2287 * Read in the allocation group header (free/alloc section).
2288 */
2289int /* error */
2290xfs_alloc_read_agf(
2291 struct xfs_mount *mp, /* mount point structure */
2292 struct xfs_trans *tp, /* transaction pointer */
2293 xfs_agnumber_t agno, /* allocation group number */
2294 int flags, /* XFS_ALLOC_FLAG_... */
2295 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2296{
2297 struct xfs_agf *agf; /* ag freelist header */
2298 struct xfs_perag *pag; /* per allocation group data */
2299 int error;
2300
2301 ASSERT(agno != NULLAGNUMBER);
2302
2303 error = xfs_read_agf(mp, tp, agno,
2304 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
2305 bpp);
2306 if (error)
2307 return error;
2308 if (!*bpp)
2309 return 0;
2310 ASSERT(!XFS_BUF_GETERROR(*bpp));
2311
2312 agf = XFS_BUF_TO_AGF(*bpp);
2198 pag = &mp->m_perag[agno]; 2313 pag = &mp->m_perag[agno];
2199 if (!pag->pagf_init) { 2314 if (!pag->pagf_init) {
2200 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 2315 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
@@ -2213,6 +2328,7 @@ xfs_alloc_read_agf(
2213#ifdef DEBUG 2328#ifdef DEBUG
2214 else if (!XFS_FORCED_SHUTDOWN(mp)) { 2329 else if (!XFS_FORCED_SHUTDOWN(mp)) {
2215 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 2330 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
2331 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
2216 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 2332 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
2217 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 2333 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
2218 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == 2334 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
@@ -2221,8 +2337,6 @@ xfs_alloc_read_agf(
2221 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 2337 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2222 } 2338 }
2223#endif 2339#endif
2224 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
2225 *bpp = bp;
2226 return 0; 2340 return 0;
2227} 2341}
2228 2342
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 5aec15d0651e..588172796f7b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -121,6 +121,19 @@ extern ktrace_t *xfs_alloc_trace_buf;
121#define XFS_ALLOC_KTRACE_BUSYSEARCH 6 121#define XFS_ALLOC_KTRACE_BUSYSEARCH 6
122#endif 122#endif
123 123
124void
125xfs_alloc_mark_busy(xfs_trans_t *tp,
126 xfs_agnumber_t agno,
127 xfs_agblock_t bno,
128 xfs_extlen_t len);
129
130void
131xfs_alloc_clear_busy(xfs_trans_t *tp,
132 xfs_agnumber_t ag,
133 int idx);
134
135#endif /* __KERNEL__ */
136
124/* 137/*
125 * Compute and fill in value of m_ag_maxlevels. 138 * Compute and fill in value of m_ag_maxlevels.
126 */ 139 */
@@ -196,18 +209,4 @@ xfs_free_extent(
196 xfs_fsblock_t bno, /* starting block number of extent */ 209 xfs_fsblock_t bno, /* starting block number of extent */
197 xfs_extlen_t len); /* length of extent */ 210 xfs_extlen_t len); /* length of extent */
198 211
199void
200xfs_alloc_mark_busy(xfs_trans_t *tp,
201 xfs_agnumber_t agno,
202 xfs_agblock_t bno,
203 xfs_extlen_t len);
204
205void
206xfs_alloc_clear_busy(xfs_trans_t *tp,
207 xfs_agnumber_t ag,
208 int idx);
209
210
211#endif /* __KERNEL__ */
212
213#endif /* __XFS_ALLOC_H__ */ 212#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3ce2645508ae..733cb75a8c5d 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -35,2177 +35,464 @@
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 40#include "xfs_alloc.h"
40#include "xfs_error.h" 41#include "xfs_error.h"
41 42
42/*
43 * Prototypes for internal functions.
44 */
45 43
46STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int); 44STATIC struct xfs_btree_cur *
47STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); 45xfs_allocbt_dup_cursor(
48STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); 46 struct xfs_btree_cur *cur)
49STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); 47{
50STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *); 48 return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
51STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *); 49 cur->bc_private.a.agbp, cur->bc_private.a.agno,
52STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *); 50 cur->bc_btnum);
53STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *, 51}
54 xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
55STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
56 52
57/* 53STATIC void
58 * Internal functions. 54xfs_allocbt_set_root(
59 */ 55 struct xfs_btree_cur *cur,
56 union xfs_btree_ptr *ptr,
57 int inc)
58{
59 struct xfs_buf *agbp = cur->bc_private.a.agbp;
60 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
61 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
62 int btnum = cur->bc_btnum;
60 63
61/* 64 ASSERT(ptr->s != 0);
62 * Single level of the xfs_alloc_delete record deletion routine. 65
63 * Delete record pointed to by cur/level. 66 agf->agf_roots[btnum] = ptr->s;
64 * Remove the record from its block then rebalance the tree. 67 be32_add_cpu(&agf->agf_levels[btnum], inc);
65 * Return 0 for error, 1 for done, 2 to go on to the next level. 68 cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
66 */ 69
67STATIC int /* error */ 70 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
68xfs_alloc_delrec( 71}
69 xfs_btree_cur_t *cur, /* btree cursor */ 72
70 int level, /* level removing record from */ 73STATIC int
71 int *stat) /* fail/done/go-on */ 74xfs_allocbt_alloc_block(
75 struct xfs_btree_cur *cur,
76 union xfs_btree_ptr *start,
77 union xfs_btree_ptr *new,
78 int length,
79 int *stat)
72{ 80{
73 xfs_agf_t *agf; /* allocation group freelist header */ 81 int error;
74 xfs_alloc_block_t *block; /* btree block record/key lives in */ 82 xfs_agblock_t bno;
75 xfs_agblock_t bno; /* btree block number */
76 xfs_buf_t *bp; /* buffer for block */
77 int error; /* error return value */
78 int i; /* loop index */
79 xfs_alloc_key_t key; /* kp points here if block is level 0 */
80 xfs_agblock_t lbno; /* left block's block number */
81 xfs_buf_t *lbp; /* left block's buffer pointer */
82 xfs_alloc_block_t *left; /* left btree block */
83 xfs_alloc_key_t *lkp=NULL; /* left block key pointer */
84 xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */
85 int lrecs=0; /* number of records in left block */
86 xfs_alloc_rec_t *lrp; /* left block record pointer */
87 xfs_mount_t *mp; /* mount structure */
88 int ptr; /* index in btree block for this rec */
89 xfs_agblock_t rbno; /* right block's block number */
90 xfs_buf_t *rbp; /* right block's buffer pointer */
91 xfs_alloc_block_t *right; /* right btree block */
92 xfs_alloc_key_t *rkp; /* right block key pointer */
93 xfs_alloc_ptr_t *rpp; /* right block address pointer */
94 int rrecs=0; /* number of records in right block */
95 int numrecs;
96 xfs_alloc_rec_t *rrp; /* right block record pointer */
97 xfs_btree_cur_t *tcur; /* temporary btree cursor */
98 83
99 /* 84 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
100 * Get the index of the entry being deleted, check for nothing there. 85
101 */ 86 /* Allocate the new block from the freelist. If we can't, give up. */
102 ptr = cur->bc_ptrs[level]; 87 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
103 if (ptr == 0) { 88 &bno, 1);
104 *stat = 0; 89 if (error) {
105 return 0; 90 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
106 }
107 /*
108 * Get the buffer & block containing the record or key/ptr.
109 */
110 bp = cur->bc_bufs[level];
111 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
112#ifdef DEBUG
113 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
114 return error; 91 return error;
115#endif 92 }
116 /* 93
117 * Fail if we're off the end of the block. 94 if (bno == NULLAGBLOCK) {
118 */ 95 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
119 numrecs = be16_to_cpu(block->bb_numrecs);
120 if (ptr > numrecs) {
121 *stat = 0; 96 *stat = 0;
122 return 0; 97 return 0;
123 } 98 }
124 XFS_STATS_INC(xs_abt_delrec);
125 /*
126 * It's a nonleaf. Excise the key and ptr being deleted, by
127 * sliding the entries past them down one.
128 * Log the changed areas of the block.
129 */
130 if (level > 0) {
131 lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
132 lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
133#ifdef DEBUG
134 for (i = ptr; i < numrecs; i++) {
135 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
136 return error;
137 }
138#endif
139 if (ptr < numrecs) {
140 memmove(&lkp[ptr - 1], &lkp[ptr],
141 (numrecs - ptr) * sizeof(*lkp));
142 memmove(&lpp[ptr - 1], &lpp[ptr],
143 (numrecs - ptr) * sizeof(*lpp));
144 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1);
145 xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1);
146 }
147 }
148 /*
149 * It's a leaf. Excise the record being deleted, by sliding the
150 * entries past it down one. Log the changed areas of the block.
151 */
152 else {
153 lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
154 if (ptr < numrecs) {
155 memmove(&lrp[ptr - 1], &lrp[ptr],
156 (numrecs - ptr) * sizeof(*lrp));
157 xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1);
158 }
159 /*
160 * If it's the first record in the block, we'll need a key
161 * structure to pass up to the next level (updkey).
162 */
163 if (ptr == 1) {
164 key.ar_startblock = lrp->ar_startblock;
165 key.ar_blockcount = lrp->ar_blockcount;
166 lkp = &key;
167 }
168 }
169 /*
170 * Decrement and log the number of entries in the block.
171 */
172 numrecs--;
173 block->bb_numrecs = cpu_to_be16(numrecs);
174 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
175 /*
176 * See if the longest free extent in the allocation group was
177 * changed by this operation. True if it's the by-size btree, and
178 * this is the leaf level, and there is no right sibling block,
179 * and this was the last record.
180 */
181 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
182 mp = cur->bc_mp;
183 99
184 if (level == 0 && 100 xfs_trans_agbtree_delta(cur->bc_tp, 1);
185 cur->bc_btnum == XFS_BTNUM_CNT && 101 new->s = cpu_to_be32(bno);
186 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
187 ptr > numrecs) {
188 ASSERT(ptr == numrecs + 1);
189 /*
190 * There are still records in the block. Grab the size
191 * from the last one.
192 */
193 if (numrecs) {
194 rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur);
195 agf->agf_longest = rrp->ar_blockcount;
196 }
197 /*
198 * No free extents left.
199 */
200 else
201 agf->agf_longest = 0;
202 mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest =
203 be32_to_cpu(agf->agf_longest);
204 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
205 XFS_AGF_LONGEST);
206 }
207 /*
208 * Is this the root level? If so, we're almost done.
209 */
210 if (level == cur->bc_nlevels - 1) {
211 /*
212 * If this is the root level,
213 * and there's only one entry left,
214 * and it's NOT the leaf level,
215 * then we can get rid of this level.
216 */
217 if (numrecs == 1 && level > 0) {
218 /*
219 * lpp is still set to the first pointer in the block.
220 * Make it the new root of the btree.
221 */
222 bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
223 agf->agf_roots[cur->bc_btnum] = *lpp;
224 be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1);
225 mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
226 /*
227 * Put this buffer/block on the ag's freelist.
228 */
229 error = xfs_alloc_put_freelist(cur->bc_tp,
230 cur->bc_private.a.agbp, NULL, bno, 1);
231 if (error)
232 return error;
233 /*
234 * Since blocks move to the free list without the
235 * coordination used in xfs_bmap_finish, we can't allow
236 * block to be available for reallocation and
237 * non-transaction writing (user data) until we know
238 * that the transaction that moved it to the free list
239 * is permanently on disk. We track the blocks by
240 * declaring these blocks as "busy"; the busy list is
241 * maintained on a per-ag basis and each transaction
242 * records which entries should be removed when the
243 * iclog commits to disk. If a busy block is
244 * allocated, the iclog is pushed up to the LSN
245 * that freed the block.
246 */
247 xfs_alloc_mark_busy(cur->bc_tp,
248 be32_to_cpu(agf->agf_seqno), bno, 1);
249 102
250 xfs_trans_agbtree_delta(cur->bc_tp, -1); 103 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
251 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, 104 *stat = 1;
252 XFS_AGF_ROOTS | XFS_AGF_LEVELS); 105 return 0;
253 /* 106}
254 * Update the cursor so there's one fewer level.
255 */
256 xfs_btree_setbuf(cur, level, NULL);
257 cur->bc_nlevels--;
258 } else if (level > 0 &&
259 (error = xfs_alloc_decrement(cur, level, &i)))
260 return error;
261 *stat = 1;
262 return 0;
263 }
264 /*
265 * If we deleted the leftmost entry in the block, update the
266 * key values above us in the tree.
267 */
268 if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
269 return error;
270 /*
271 * If the number of records remaining in the block is at least
272 * the minimum, we're done.
273 */
274 if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
275 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
276 return error;
277 *stat = 1;
278 return 0;
279 }
280 /*
281 * Otherwise, we have to move some records around to keep the
282 * tree balanced. Look at the left and right sibling blocks to
283 * see if we can re-balance by moving only one record.
284 */
285 rbno = be32_to_cpu(block->bb_rightsib);
286 lbno = be32_to_cpu(block->bb_leftsib);
287 bno = NULLAGBLOCK;
288 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
289 /*
290 * Duplicate the cursor so our btree manipulations here won't
291 * disrupt the next level up.
292 */
293 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
294 return error;
295 /*
296 * If there's a right sibling, see if it's ok to shift an entry
297 * out of it.
298 */
299 if (rbno != NULLAGBLOCK) {
300 /*
301 * Move the temp cursor to the last entry in the next block.
302 * Actually any entry but the first would suffice.
303 */
304 i = xfs_btree_lastrec(tcur, level);
305 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
306 if ((error = xfs_alloc_increment(tcur, level, &i)))
307 goto error0;
308 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
309 i = xfs_btree_lastrec(tcur, level);
310 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
311 /*
312 * Grab a pointer to the block.
313 */
314 rbp = tcur->bc_bufs[level];
315 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
316#ifdef DEBUG
317 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
318 goto error0;
319#endif
320 /*
321 * Grab the current block number, for future use.
322 */
323 bno = be32_to_cpu(right->bb_leftsib);
324 /*
325 * If right block is full enough so that removing one entry
326 * won't make it too empty, and left-shifting an entry out
327 * of right to us works, we're done.
328 */
329 if (be16_to_cpu(right->bb_numrecs) - 1 >=
330 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
331 if ((error = xfs_alloc_lshift(tcur, level, &i)))
332 goto error0;
333 if (i) {
334 ASSERT(be16_to_cpu(block->bb_numrecs) >=
335 XFS_ALLOC_BLOCK_MINRECS(level, cur));
336 xfs_btree_del_cursor(tcur,
337 XFS_BTREE_NOERROR);
338 if (level > 0 &&
339 (error = xfs_alloc_decrement(cur, level,
340 &i)))
341 return error;
342 *stat = 1;
343 return 0;
344 }
345 }
346 /*
347 * Otherwise, grab the number of records in right for
348 * future reference, and fix up the temp cursor to point
349 * to our block again (last record).
350 */
351 rrecs = be16_to_cpu(right->bb_numrecs);
352 if (lbno != NULLAGBLOCK) {
353 i = xfs_btree_firstrec(tcur, level);
354 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
355 if ((error = xfs_alloc_decrement(tcur, level, &i)))
356 goto error0;
357 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
358 }
359 }
360 /*
361 * If there's a left sibling, see if it's ok to shift an entry
362 * out of it.
363 */
364 if (lbno != NULLAGBLOCK) {
365 /*
366 * Move the temp cursor to the first entry in the
367 * previous block.
368 */
369 i = xfs_btree_firstrec(tcur, level);
370 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
371 if ((error = xfs_alloc_decrement(tcur, level, &i)))
372 goto error0;
373 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
374 xfs_btree_firstrec(tcur, level);
375 /*
376 * Grab a pointer to the block.
377 */
378 lbp = tcur->bc_bufs[level];
379 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
380#ifdef DEBUG
381 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
382 goto error0;
383#endif
384 /*
385 * Grab the current block number, for future use.
386 */
387 bno = be32_to_cpu(left->bb_rightsib);
388 /*
389 * If left block is full enough so that removing one entry
390 * won't make it too empty, and right-shifting an entry out
391 * of left to us works, we're done.
392 */
393 if (be16_to_cpu(left->bb_numrecs) - 1 >=
394 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
395 if ((error = xfs_alloc_rshift(tcur, level, &i)))
396 goto error0;
397 if (i) {
398 ASSERT(be16_to_cpu(block->bb_numrecs) >=
399 XFS_ALLOC_BLOCK_MINRECS(level, cur));
400 xfs_btree_del_cursor(tcur,
401 XFS_BTREE_NOERROR);
402 if (level == 0)
403 cur->bc_ptrs[0]++;
404 *stat = 1;
405 return 0;
406 }
407 }
408 /*
409 * Otherwise, grab the number of records in right for
410 * future reference.
411 */
412 lrecs = be16_to_cpu(left->bb_numrecs);
413 }
414 /*
415 * Delete the temp cursor, we're done with it.
416 */
417 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
418 /*
419 * If here, we need to do a join to keep the tree balanced.
420 */
421 ASSERT(bno != NULLAGBLOCK);
422 /*
423 * See if we can join with the left neighbor block.
424 */
425 if (lbno != NULLAGBLOCK &&
426 lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
427 /*
428 * Set "right" to be the starting block,
429 * "left" to be the left neighbor.
430 */
431 rbno = bno;
432 right = block;
433 rrecs = be16_to_cpu(right->bb_numrecs);
434 rbp = bp;
435 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
436 cur->bc_private.a.agno, lbno, 0, &lbp,
437 XFS_ALLOC_BTREE_REF)))
438 return error;
439 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
440 lrecs = be16_to_cpu(left->bb_numrecs);
441 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
442 return error;
443 }
444 /*
445 * If that won't work, see if we can join with the right neighbor block.
446 */
447 else if (rbno != NULLAGBLOCK &&
448 rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
449 /*
450 * Set "left" to be the starting block,
451 * "right" to be the right neighbor.
452 */
453 lbno = bno;
454 left = block;
455 lrecs = be16_to_cpu(left->bb_numrecs);
456 lbp = bp;
457 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
458 cur->bc_private.a.agno, rbno, 0, &rbp,
459 XFS_ALLOC_BTREE_REF)))
460 return error;
461 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
462 rrecs = be16_to_cpu(right->bb_numrecs);
463 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
464 return error;
465 }
466 /*
467 * Otherwise, we can't fix the imbalance.
468 * Just return. This is probably a logic error, but it's not fatal.
469 */
470 else {
471 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
472 return error;
473 *stat = 1;
474 return 0;
475 }
476 /*
477 * We're now going to join "left" and "right" by moving all the stuff
478 * in "right" to "left" and deleting "right".
479 */
480 if (level > 0) {
481 /*
482 * It's a non-leaf. Move keys and pointers.
483 */
484 lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur);
485 lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur);
486 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
487 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
488#ifdef DEBUG
489 for (i = 0; i < rrecs; i++) {
490 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
491 return error;
492 }
493#endif
494 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
495 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
496 xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
497 xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
498 } else {
499 /*
500 * It's a leaf. Move records.
501 */
502 lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur);
503 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
504 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
505 xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
506 }
507 /*
508 * If we joined with the left neighbor, set the buffer in the
509 * cursor to the left block, and fix up the index.
510 */
511 if (bp != lbp) {
512 xfs_btree_setbuf(cur, level, lbp);
513 cur->bc_ptrs[level] += lrecs;
514 }
515 /*
516 * If we joined with the right neighbor and there's a level above
517 * us, increment the cursor at that level.
518 */
519 else if (level + 1 < cur->bc_nlevels &&
520 (error = xfs_alloc_increment(cur, level + 1, &i)))
521 return error;
522 /*
523 * Fix up the number of records in the surviving block.
524 */
525 lrecs += rrecs;
526 left->bb_numrecs = cpu_to_be16(lrecs);
527 /*
528 * Fix up the right block pointer in the surviving block, and log it.
529 */
530 left->bb_rightsib = right->bb_rightsib;
531 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
532 /*
533 * If there is a right sibling now, make it point to the
534 * remaining block.
535 */
536 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
537 xfs_alloc_block_t *rrblock;
538 xfs_buf_t *rrbp;
539 107
540 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 108STATIC int
541 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, 109xfs_allocbt_free_block(
542 &rrbp, XFS_ALLOC_BTREE_REF))) 110 struct xfs_btree_cur *cur,
543 return error; 111 struct xfs_buf *bp)
544 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); 112{
545 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) 113 struct xfs_buf *agbp = cur->bc_private.a.agbp;
546 return error; 114 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
547 rrblock->bb_leftsib = cpu_to_be32(lbno); 115 xfs_agblock_t bno;
548 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); 116 int error;
549 } 117
550 /* 118 bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
551 * Free the deleting block by putting it on the freelist. 119 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
552 */
553 error = xfs_alloc_put_freelist(cur->bc_tp,
554 cur->bc_private.a.agbp, NULL, rbno, 1);
555 if (error) 120 if (error)
556 return error; 121 return error;
122
557 /* 123 /*
558 * Since blocks move to the free list without the coordination 124 * Since blocks move to the free list without the coordination used in
559 * used in xfs_bmap_finish, we can't allow block to be available 125 * xfs_bmap_finish, we can't allow block to be available for
560 * for reallocation and non-transaction writing (user data) 126 * reallocation and non-transaction writing (user data) until we know
561 * until we know that the transaction that moved it to the free 127 * that the transaction that moved it to the free list is permanently
562 * list is permanently on disk. We track the blocks by declaring 128 * on disk. We track the blocks by declaring these blocks as "busy";
563 * these blocks as "busy"; the busy list is maintained on a 129 * the busy list is maintained on a per-ag basis and each transaction
564 * per-ag basis and each transaction records which entries 130 * records which entries should be removed when the iclog commits to
565 * should be removed when the iclog commits to disk. If a 131 * disk. If a busy block is allocated, the iclog is pushed up to the
566 * busy block is allocated, the iclog is pushed up to the
567 * LSN that freed the block. 132 * LSN that freed the block.
568 */ 133 */
569 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 134 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
570 xfs_trans_agbtree_delta(cur->bc_tp, -1); 135 xfs_trans_agbtree_delta(cur->bc_tp, -1);
571
572 /*
573 * Adjust the current level's cursor so that we're left referring
574 * to the right node, after we're done.
575 * If this leaves the ptr value 0 our caller will fix it up.
576 */
577 if (level > 0)
578 cur->bc_ptrs[level]--;
579 /*
580 * Return value means the next level up has something to do.
581 */
582 *stat = 2;
583 return 0; 136 return 0;
584
585error0:
586 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
587 return error;
588} 137}
589 138
590/* 139/*
591 * Insert one record/level. Return information to the caller 140 * Update the longest extent in the AGF
592 * allowing the next level up to proceed if necessary.
593 */ 141 */
594STATIC int /* error */ 142STATIC void
595xfs_alloc_insrec( 143xfs_allocbt_update_lastrec(
596 xfs_btree_cur_t *cur, /* btree cursor */ 144 struct xfs_btree_cur *cur,
597 int level, /* level to insert record at */ 145 struct xfs_btree_block *block,
598 xfs_agblock_t *bnop, /* i/o: block number inserted */ 146 union xfs_btree_rec *rec,
599 xfs_alloc_rec_t *recp, /* i/o: record data inserted */ 147 int ptr,
600 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */ 148 int reason)
601 int *stat) /* output: success/failure */
602{ 149{
603 xfs_agf_t *agf; /* allocation group freelist header */ 150 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
604 xfs_alloc_block_t *block; /* btree block record/key lives in */ 151 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
605 xfs_buf_t *bp; /* buffer for block */ 152 __be32 len;
606 int error; /* error return value */
607 int i; /* loop index */
608 xfs_alloc_key_t key; /* key value being inserted */
609 xfs_alloc_key_t *kp; /* pointer to btree keys */
610 xfs_agblock_t nbno; /* block number of allocated block */
611 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
612 xfs_alloc_key_t nkey; /* new key value, from split */
613 xfs_alloc_rec_t nrec; /* new record value, for caller */
614 int numrecs; 153 int numrecs;
615 int optr; /* old ptr value */
616 xfs_alloc_ptr_t *pp; /* pointer to btree addresses */
617 int ptr; /* index in btree block for this rec */
618 xfs_alloc_rec_t *rp; /* pointer to btree records */
619 154
620 ASSERT(be32_to_cpu(recp->ar_blockcount) > 0); 155 ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
156
157 switch (reason) {
158 case LASTREC_UPDATE:
159 /*
160 * If this is the last leaf block and it's the last record,
161 * then update the size of the longest extent in the AG.
162 */
163 if (ptr != xfs_btree_get_numrecs(block))
164 return;
165 len = rec->alloc.ar_blockcount;
166 break;
167 case LASTREC_INSREC:
168 if (be32_to_cpu(rec->alloc.ar_blockcount) <=
169 be32_to_cpu(agf->agf_longest))
170 return;
171 len = rec->alloc.ar_blockcount;
172 break;
173 case LASTREC_DELREC:
174 numrecs = xfs_btree_get_numrecs(block);
175 if (ptr <= numrecs)
176 return;
177 ASSERT(ptr == numrecs + 1);
621 178
622 /* 179 if (numrecs) {
623 * GCC doesn't understand the (arguably complex) control flow in 180 xfs_alloc_rec_t *rrp;
624 * this function and complains about uninitialized structure fields
625 * without this.
626 */
627 memset(&nrec, 0, sizeof(nrec));
628 181
629 /* 182 rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
630 * If we made it to the root level, allocate a new root block 183 len = rrp->ar_blockcount;
631 * and we're done.
632 */
633 if (level >= cur->bc_nlevels) {
634 XFS_STATS_INC(xs_abt_insrec);
635 if ((error = xfs_alloc_newroot(cur, &i)))
636 return error;
637 *bnop = NULLAGBLOCK;
638 *stat = i;
639 return 0;
640 }
641 /*
642 * Make a key out of the record data to be inserted, and save it.
643 */
644 key.ar_startblock = recp->ar_startblock;
645 key.ar_blockcount = recp->ar_blockcount;
646 optr = ptr = cur->bc_ptrs[level];
647 /*
648 * If we're off the left edge, return failure.
649 */
650 if (ptr == 0) {
651 *stat = 0;
652 return 0;
653 }
654 XFS_STATS_INC(xs_abt_insrec);
655 /*
656 * Get pointers to the btree buffer and block.
657 */
658 bp = cur->bc_bufs[level];
659 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
660 numrecs = be16_to_cpu(block->bb_numrecs);
661#ifdef DEBUG
662 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
663 return error;
664 /*
665 * Check that the new entry is being inserted in the right place.
666 */
667 if (ptr <= numrecs) {
668 if (level == 0) {
669 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
670 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
671 } else { 184 } else {
672 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); 185 len = 0;
673 xfs_btree_check_key(cur->bc_btnum, &key, kp);
674 }
675 }
676#endif
677 nbno = NULLAGBLOCK;
678 ncur = NULL;
679 /*
680 * If the block is full, we can't insert the new entry until we
681 * make the block un-full.
682 */
683 if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
684 /*
685 * First, try shifting an entry to the right neighbor.
686 */
687 if ((error = xfs_alloc_rshift(cur, level, &i)))
688 return error;
689 if (i) {
690 /* nothing */
691 }
692 /*
693 * Next, try shifting an entry to the left neighbor.
694 */
695 else {
696 if ((error = xfs_alloc_lshift(cur, level, &i)))
697 return error;
698 if (i)
699 optr = ptr = cur->bc_ptrs[level];
700 else {
701 /*
702 * Next, try splitting the current block in
703 * half. If this works we have to re-set our
704 * variables because we could be in a
705 * different block now.
706 */
707 if ((error = xfs_alloc_split(cur, level, &nbno,
708 &nkey, &ncur, &i)))
709 return error;
710 if (i) {
711 bp = cur->bc_bufs[level];
712 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
713#ifdef DEBUG
714 if ((error =
715 xfs_btree_check_sblock(cur,
716 block, level, bp)))
717 return error;
718#endif
719 ptr = cur->bc_ptrs[level];
720 nrec.ar_startblock = nkey.ar_startblock;
721 nrec.ar_blockcount = nkey.ar_blockcount;
722 }
723 /*
724 * Otherwise the insert fails.
725 */
726 else {
727 *stat = 0;
728 return 0;
729 }
730 }
731 }
732 }
733 /*
734 * At this point we know there's room for our new entry in the block
735 * we're pointing at.
736 */
737 numrecs = be16_to_cpu(block->bb_numrecs);
738 if (level > 0) {
739 /*
740 * It's a non-leaf entry. Make a hole for the new data
741 * in the key and ptr regions of the block.
742 */
743 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
744 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
745#ifdef DEBUG
746 for (i = numrecs; i >= ptr; i--) {
747 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
748 return error;
749 } 186 }
750#endif
751 memmove(&kp[ptr], &kp[ptr - 1],
752 (numrecs - ptr + 1) * sizeof(*kp));
753 memmove(&pp[ptr], &pp[ptr - 1],
754 (numrecs - ptr + 1) * sizeof(*pp));
755#ifdef DEBUG
756 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
757 return error;
758#endif
759 /*
760 * Now stuff the new data in, bump numrecs and log the new data.
761 */
762 kp[ptr - 1] = key;
763 pp[ptr - 1] = cpu_to_be32(*bnop);
764 numrecs++;
765 block->bb_numrecs = cpu_to_be16(numrecs);
766 xfs_alloc_log_keys(cur, bp, ptr, numrecs);
767 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs);
768#ifdef DEBUG
769 if (ptr < numrecs)
770 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
771 kp + ptr);
772#endif
773 } else {
774 /*
775 * It's a leaf entry. Make a hole for the new record.
776 */
777 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
778 memmove(&rp[ptr], &rp[ptr - 1],
779 (numrecs - ptr + 1) * sizeof(*rp));
780 /*
781 * Now stuff the new record in, bump numrecs
782 * and log the new data.
783 */
784 rp[ptr - 1] = *recp;
785 numrecs++;
786 block->bb_numrecs = cpu_to_be16(numrecs);
787 xfs_alloc_log_recs(cur, bp, ptr, numrecs);
788#ifdef DEBUG
789 if (ptr < numrecs)
790 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
791 rp + ptr);
792#endif
793 }
794 /*
795 * Log the new number of records in the btree header.
796 */
797 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
798 /*
799 * If we inserted at the start of a block, update the parents' keys.
800 */
801 if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
802 return error;
803 /*
804 * Look to see if the longest extent in the allocation group
805 * needs to be updated.
806 */
807 187
808 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); 188 break;
809 if (level == 0 && 189 default:
810 cur->bc_btnum == XFS_BTNUM_CNT && 190 ASSERT(0);
811 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && 191 return;
812 be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) {
813 /*
814 * If this is a leaf in the by-size btree and there
815 * is no right sibling block and this block is bigger
816 * than the previous longest block, update it.
817 */
818 agf->agf_longest = recp->ar_blockcount;
819 cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest
820 = be32_to_cpu(recp->ar_blockcount);
821 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
822 XFS_AGF_LONGEST);
823 } 192 }
824 /* 193
825 * Return the new block number, if any. 194 agf->agf_longest = len;
826 * If there is one, give back a record value and a cursor too. 195 cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
827 */ 196 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
828 *bnop = nbno;
829 if (nbno != NULLAGBLOCK) {
830 *recp = nrec;
831 *curp = ncur;
832 }
833 *stat = 1;
834 return 0;
835} 197}
836 198
837/* 199STATIC int
838 * Log header fields from a btree block. 200xfs_allocbt_get_minrecs(
839 */ 201 struct xfs_btree_cur *cur,
840STATIC void 202 int level)
841xfs_alloc_log_block(
842 xfs_trans_t *tp, /* transaction pointer */
843 xfs_buf_t *bp, /* buffer containing btree block */
844 int fields) /* mask of fields: XFS_BB_... */
845{ 203{
846 int first; /* first byte offset logged */ 204 return cur->bc_mp->m_alloc_mnr[level != 0];
847 int last; /* last byte offset logged */ 205}
848 static const short offsets[] = { /* table of offsets */
849 offsetof(xfs_alloc_block_t, bb_magic),
850 offsetof(xfs_alloc_block_t, bb_level),
851 offsetof(xfs_alloc_block_t, bb_numrecs),
852 offsetof(xfs_alloc_block_t, bb_leftsib),
853 offsetof(xfs_alloc_block_t, bb_rightsib),
854 sizeof(xfs_alloc_block_t)
855 };
856 206
857 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); 207STATIC int
858 xfs_trans_log_buf(tp, bp, first, last); 208xfs_allocbt_get_maxrecs(
209 struct xfs_btree_cur *cur,
210 int level)
211{
212 return cur->bc_mp->m_alloc_mxr[level != 0];
859} 213}
860 214
861/*
862 * Log keys from a btree block (nonleaf).
863 */
864STATIC void 215STATIC void
865xfs_alloc_log_keys( 216xfs_allocbt_init_key_from_rec(
866 xfs_btree_cur_t *cur, /* btree cursor */ 217 union xfs_btree_key *key,
867 xfs_buf_t *bp, /* buffer containing btree block */ 218 union xfs_btree_rec *rec)
868 int kfirst, /* index of first key to log */
869 int klast) /* index of last key to log */
870{ 219{
871 xfs_alloc_block_t *block; /* btree block to log from */ 220 ASSERT(rec->alloc.ar_startblock != 0);
872 int first; /* first byte offset logged */
873 xfs_alloc_key_t *kp; /* key pointer in btree block */
874 int last; /* last byte offset logged */
875 221
876 block = XFS_BUF_TO_ALLOC_BLOCK(bp); 222 key->alloc.ar_startblock = rec->alloc.ar_startblock;
877 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); 223 key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
878 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
879 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
880 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
881} 224}
882 225
883/*
884 * Log block pointer fields from a btree block (nonleaf).
885 */
886STATIC void 226STATIC void
887xfs_alloc_log_ptrs( 227xfs_allocbt_init_rec_from_key(
888 xfs_btree_cur_t *cur, /* btree cursor */ 228 union xfs_btree_key *key,
889 xfs_buf_t *bp, /* buffer containing btree block */ 229 union xfs_btree_rec *rec)
890 int pfirst, /* index of first pointer to log */
891 int plast) /* index of last pointer to log */
892{ 230{
893 xfs_alloc_block_t *block; /* btree block to log from */ 231 ASSERT(key->alloc.ar_startblock != 0);
894 int first; /* first byte offset logged */
895 int last; /* last byte offset logged */
896 xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */
897 232
898 block = XFS_BUF_TO_ALLOC_BLOCK(bp); 233 rec->alloc.ar_startblock = key->alloc.ar_startblock;
899 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); 234 rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
900 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
901 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
902 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
903} 235}
904 236
905/*
906 * Log records from a btree block (leaf).
907 */
908STATIC void 237STATIC void
909xfs_alloc_log_recs( 238xfs_allocbt_init_rec_from_cur(
910 xfs_btree_cur_t *cur, /* btree cursor */ 239 struct xfs_btree_cur *cur,
911 xfs_buf_t *bp, /* buffer containing btree block */ 240 union xfs_btree_rec *rec)
912 int rfirst, /* index of first record to log */
913 int rlast) /* index of last record to log */
914{ 241{
915 xfs_alloc_block_t *block; /* btree block to log from */ 242 ASSERT(cur->bc_rec.a.ar_startblock != 0);
916 int first; /* first byte offset logged */
917 int last; /* last byte offset logged */
918 xfs_alloc_rec_t *rp; /* record pointer for btree block */
919
920 243
921 block = XFS_BUF_TO_ALLOC_BLOCK(bp); 244 rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
922 rp = XFS_ALLOC_REC_ADDR(block, 1, cur); 245 rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
923#ifdef DEBUG
924 {
925 xfs_agf_t *agf;
926 xfs_alloc_rec_t *p;
927
928 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
929 for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
930 ASSERT(be32_to_cpu(p->ar_startblock) +
931 be32_to_cpu(p->ar_blockcount) <=
932 be32_to_cpu(agf->agf_length));
933 }
934#endif
935 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
936 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
937 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
938} 246}
939 247
940/* 248STATIC void
941 * Lookup the record. The cursor is made to point to it, based on dir. 249xfs_allocbt_init_ptr_from_cur(
942 * Return 0 if can't find any such record, 1 for success. 250 struct xfs_btree_cur *cur,
943 */ 251 union xfs_btree_ptr *ptr)
944STATIC int /* error */
945xfs_alloc_lookup(
946 xfs_btree_cur_t *cur, /* btree cursor */
947 xfs_lookup_t dir, /* <=, ==, or >= */
948 int *stat) /* success/failure */
949{ 252{
950 xfs_agblock_t agbno; /* a.g. relative btree block number */ 253 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
951 xfs_agnumber_t agno; /* allocation group number */
952 xfs_alloc_block_t *block=NULL; /* current btree block */
953 int diff; /* difference for the current key */
954 int error; /* error return value */
955 int keyno=0; /* current key number */
956 int level; /* level in the btree */
957 xfs_mount_t *mp; /* file system mount point */
958
959 XFS_STATS_INC(xs_abt_lookup);
960 /*
961 * Get the allocation group header, and the root block number.
962 */
963 mp = cur->bc_mp;
964
965 {
966 xfs_agf_t *agf; /* a.g. freespace header */
967
968 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
969 agno = be32_to_cpu(agf->agf_seqno);
970 agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
971 }
972 /*
973 * Iterate over each level in the btree, starting at the root.
974 * For each level above the leaves, find the key we need, based
975 * on the lookup record, then follow the corresponding block
976 * pointer down to the next level.
977 */
978 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
979 xfs_buf_t *bp; /* buffer pointer for btree block */
980 xfs_daddr_t d; /* disk address of btree block */
981
982 /*
983 * Get the disk address we're looking for.
984 */
985 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
986 /*
987 * If the old buffer at this level is for a different block,
988 * throw it away, otherwise just use it.
989 */
990 bp = cur->bc_bufs[level];
991 if (bp && XFS_BUF_ADDR(bp) != d)
992 bp = NULL;
993 if (!bp) {
994 /*
995 * Need to get a new buffer. Read it, then
996 * set it in the cursor, releasing the old one.
997 */
998 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
999 agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
1000 return error;
1001 xfs_btree_setbuf(cur, level, bp);
1002 /*
1003 * Point to the btree block, now that we have the buffer
1004 */
1005 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1006 if ((error = xfs_btree_check_sblock(cur, block, level,
1007 bp)))
1008 return error;
1009 } else
1010 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1011 /*
1012 * If we already had a key match at a higher level, we know
1013 * we need to use the first entry in this block.
1014 */
1015 if (diff == 0)
1016 keyno = 1;
1017 /*
1018 * Otherwise we need to search this block. Do a binary search.
1019 */
1020 else {
1021 int high; /* high entry number */
1022 xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
1023 xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
1024 int low; /* low entry number */
1025
1026 /*
1027 * Get a pointer to keys or records.
1028 */
1029 if (level > 0)
1030 kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
1031 else
1032 krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
1033 /*
1034 * Set low and high entry numbers, 1-based.
1035 */
1036 low = 1;
1037 if (!(high = be16_to_cpu(block->bb_numrecs))) {
1038 /*
1039 * If the block is empty, the tree must
1040 * be an empty leaf.
1041 */
1042 ASSERT(level == 0 && cur->bc_nlevels == 1);
1043 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1044 *stat = 0;
1045 return 0;
1046 }
1047 /*
1048 * Binary search the block.
1049 */
1050 while (low <= high) {
1051 xfs_extlen_t blockcount; /* key value */
1052 xfs_agblock_t startblock; /* key value */
1053
1054 XFS_STATS_INC(xs_abt_compare);
1055 /*
1056 * keyno is average of low and high.
1057 */
1058 keyno = (low + high) >> 1;
1059 /*
1060 * Get startblock & blockcount.
1061 */
1062 if (level > 0) {
1063 xfs_alloc_key_t *kkp;
1064
1065 kkp = kkbase + keyno - 1;
1066 startblock = be32_to_cpu(kkp->ar_startblock);
1067 blockcount = be32_to_cpu(kkp->ar_blockcount);
1068 } else {
1069 xfs_alloc_rec_t *krp;
1070 254
1071 krp = krbase + keyno - 1; 255 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
1072 startblock = be32_to_cpu(krp->ar_startblock); 256 ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
1073 blockcount = be32_to_cpu(krp->ar_blockcount);
1074 }
1075 /*
1076 * Compute difference to get next direction.
1077 */
1078 if (cur->bc_btnum == XFS_BTNUM_BNO)
1079 diff = (int)startblock -
1080 (int)cur->bc_rec.a.ar_startblock;
1081 else if (!(diff = (int)blockcount -
1082 (int)cur->bc_rec.a.ar_blockcount))
1083 diff = (int)startblock -
1084 (int)cur->bc_rec.a.ar_startblock;
1085 /*
1086 * Less than, move right.
1087 */
1088 if (diff < 0)
1089 low = keyno + 1;
1090 /*
1091 * Greater than, move left.
1092 */
1093 else if (diff > 0)
1094 high = keyno - 1;
1095 /*
1096 * Equal, we're done.
1097 */
1098 else
1099 break;
1100 }
1101 }
1102 /*
1103 * If there are more levels, set up for the next level
1104 * by getting the block number and filling in the cursor.
1105 */
1106 if (level > 0) {
1107 /*
1108 * If we moved left, need the previous key number,
1109 * unless there isn't one.
1110 */
1111 if (diff > 0 && --keyno < 1)
1112 keyno = 1;
1113 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur));
1114#ifdef DEBUG
1115 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
1116 return error;
1117#endif
1118 cur->bc_ptrs[level] = keyno;
1119 }
1120 }
1121 /*
1122 * Done with the search.
1123 * See if we need to adjust the results.
1124 */
1125 if (dir != XFS_LOOKUP_LE && diff < 0) {
1126 keyno++;
1127 /*
1128 * If ge search and we went off the end of the block, but it's
1129 * not the last block, we're in the wrong block.
1130 */
1131 if (dir == XFS_LOOKUP_GE &&
1132 keyno > be16_to_cpu(block->bb_numrecs) &&
1133 be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1134 int i;
1135 257
1136 cur->bc_ptrs[0] = keyno; 258 ptr->s = agf->agf_roots[cur->bc_btnum];
1137 if ((error = xfs_alloc_increment(cur, 0, &i)))
1138 return error;
1139 XFS_WANT_CORRUPTED_RETURN(i == 1);
1140 *stat = 1;
1141 return 0;
1142 }
1143 }
1144 else if (dir == XFS_LOOKUP_LE && diff > 0)
1145 keyno--;
1146 cur->bc_ptrs[0] = keyno;
1147 /*
1148 * Return if we succeeded or not.
1149 */
1150 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
1151 *stat = 0;
1152 else
1153 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1154 return 0;
1155} 259}
1156 260
1157/* 261STATIC __int64_t
1158 * Move 1 record left from cur/level if possible. 262xfs_allocbt_key_diff(
1159 * Update cur to reflect the new path. 263 struct xfs_btree_cur *cur,
1160 */ 264 union xfs_btree_key *key)
1161STATIC int /* error */
1162xfs_alloc_lshift(
1163 xfs_btree_cur_t *cur, /* btree cursor */
1164 int level, /* level to shift record on */
1165 int *stat) /* success/failure */
1166{ 265{
1167 int error; /* error return value */ 266 xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
1168#ifdef DEBUG 267 xfs_alloc_key_t *kp = &key->alloc;
1169 int i; /* loop index */ 268 __int64_t diff;
1170#endif
1171 xfs_alloc_key_t key; /* key value for leaf level upward */
1172 xfs_buf_t *lbp; /* buffer for left neighbor block */
1173 xfs_alloc_block_t *left; /* left neighbor btree block */
1174 int nrec; /* new number of left block entries */
1175 xfs_buf_t *rbp; /* buffer for right (current) block */
1176 xfs_alloc_block_t *right; /* right (current) btree block */
1177 xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */
1178 xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */
1179 xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
1180 269
1181 /* 270 if (cur->bc_btnum == XFS_BTNUM_BNO) {
1182 * Set up variables for this block as "right". 271 return (__int64_t)be32_to_cpu(kp->ar_startblock) -
1183 */ 272 rec->ar_startblock;
1184 rbp = cur->bc_bufs[level];
1185 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1186#ifdef DEBUG
1187 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1188 return error;
1189#endif
1190 /*
1191 * If we've got no left sibling then we can't shift an entry left.
1192 */
1193 if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
1194 *stat = 0;
1195 return 0;
1196 }
1197 /*
1198 * If the cursor entry is the one that would be moved, don't
1199 * do it... it's too complicated.
1200 */
1201 if (cur->bc_ptrs[level] <= 1) {
1202 *stat = 0;
1203 return 0;
1204 }
1205 /*
1206 * Set up the left neighbor as "left".
1207 */
1208 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1209 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1210 0, &lbp, XFS_ALLOC_BTREE_REF)))
1211 return error;
1212 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1213 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1214 return error;
1215 /*
1216 * If it's full, it can't take another entry.
1217 */
1218 if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1219 *stat = 0;
1220 return 0;
1221 } 273 }
1222 nrec = be16_to_cpu(left->bb_numrecs) + 1;
1223 /*
1224 * If non-leaf, copy a key and a ptr to the left block.
1225 */
1226 if (level > 0) {
1227 xfs_alloc_key_t *lkp; /* key pointer for left block */
1228 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1229 274
1230 lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur); 275 diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
1231 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); 276 if (diff)
1232 *lkp = *rkp; 277 return diff;
1233 xfs_alloc_log_keys(cur, lbp, nrec, nrec);
1234 lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
1235 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1236#ifdef DEBUG
1237 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1238 return error;
1239#endif
1240 *lpp = *rpp;
1241 xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
1242 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1243 }
1244 /*
1245 * If leaf, copy a record to the left block.
1246 */
1247 else {
1248 xfs_alloc_rec_t *lrp; /* record pointer for left block */
1249 278
1250 lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur); 279 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
1251 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1252 *lrp = *rrp;
1253 xfs_alloc_log_recs(cur, lbp, nrec, nrec);
1254 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1255 }
1256 /*
1257 * Bump and log left's numrecs, decrement and log right's numrecs.
1258 */
1259 be16_add_cpu(&left->bb_numrecs, 1);
1260 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1261 be16_add_cpu(&right->bb_numrecs, -1);
1262 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1263 /*
1264 * Slide the contents of right down one entry.
1265 */
1266 if (level > 0) {
1267#ifdef DEBUG
1268 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1269 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
1270 level)))
1271 return error;
1272 }
1273#endif
1274 memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1275 memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1276 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1277 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1278 } else {
1279 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1280 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1281 key.ar_startblock = rrp->ar_startblock;
1282 key.ar_blockcount = rrp->ar_blockcount;
1283 rkp = &key;
1284 }
1285 /*
1286 * Update the parent key values of right.
1287 */
1288 if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
1289 return error;
1290 /*
1291 * Slide the cursor value left one.
1292 */
1293 cur->bc_ptrs[level]--;
1294 *stat = 1;
1295 return 0;
1296} 280}
1297 281
1298/* 282STATIC int
1299 * Allocate a new root block, fill it in. 283xfs_allocbt_kill_root(
1300 */ 284 struct xfs_btree_cur *cur,
1301STATIC int /* error */ 285 struct xfs_buf *bp,
1302xfs_alloc_newroot( 286 int level,
1303 xfs_btree_cur_t *cur, /* btree cursor */ 287 union xfs_btree_ptr *newroot)
1304 int *stat) /* success/failure */
1305{ 288{
1306 int error; /* error return value */ 289 int error;
1307 xfs_agblock_t lbno; /* left block number */
1308 xfs_buf_t *lbp; /* left btree buffer */
1309 xfs_alloc_block_t *left; /* left btree block */
1310 xfs_mount_t *mp; /* mount structure */
1311 xfs_agblock_t nbno; /* new block number */
1312 xfs_buf_t *nbp; /* new (root) buffer */
1313 xfs_alloc_block_t *new; /* new (root) btree block */
1314 int nptr; /* new value for key index, 1 or 2 */
1315 xfs_agblock_t rbno; /* right block number */
1316 xfs_buf_t *rbp; /* right btree buffer */
1317 xfs_alloc_block_t *right; /* right btree block */
1318
1319 mp = cur->bc_mp;
1320 290
1321 ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp)); 291 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1322 /* 292 XFS_BTREE_STATS_INC(cur, killroot);
1323 * Get a buffer from the freelist blocks, for the new root.
1324 */
1325 error = xfs_alloc_get_freelist(cur->bc_tp,
1326 cur->bc_private.a.agbp, &nbno, 1);
1327 if (error)
1328 return error;
1329 /*
1330 * None available, we fail.
1331 */
1332 if (nbno == NULLAGBLOCK) {
1333 *stat = 0;
1334 return 0;
1335 }
1336 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1337 nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
1338 0);
1339 new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
1340 /*
1341 * Set the root data in the a.g. freespace structure.
1342 */
1343 {
1344 xfs_agf_t *agf; /* a.g. freespace header */
1345 xfs_agnumber_t seqno;
1346 293
1347 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
1348 agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
1349 be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1);
1350 seqno = be32_to_cpu(agf->agf_seqno);
1351 mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
1352 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
1353 XFS_AGF_ROOTS | XFS_AGF_LEVELS);
1354 }
1355 /* 294 /*
1356 * At the previous root level there are now two blocks: the old 295 * Update the root pointer, decreasing the level by 1 and then
1357 * root, and the new block generated when it was split. 296 * free the old root.
1358 * We don't know which one the cursor is pointing at, so we
1359 * set up variables "left" and "right" for each case.
1360 */ 297 */
1361 lbp = cur->bc_bufs[cur->bc_nlevels - 1]; 298 xfs_allocbt_set_root(cur, newroot, -1);
1362 left = XFS_BUF_TO_ALLOC_BLOCK(lbp); 299 error = xfs_allocbt_free_block(cur, bp);
1363#ifdef DEBUG 300 if (error) {
1364 if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp))) 301 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1365 return error; 302 return error;
1366#endif
1367 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
1368 /*
1369 * Our block is left, pick up the right block.
1370 */
1371 lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
1372 rbno = be32_to_cpu(left->bb_rightsib);
1373 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1374 cur->bc_private.a.agno, rbno, 0, &rbp,
1375 XFS_ALLOC_BTREE_REF)))
1376 return error;
1377 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1378 if ((error = xfs_btree_check_sblock(cur, right,
1379 cur->bc_nlevels - 1, rbp)))
1380 return error;
1381 nptr = 1;
1382 } else {
1383 /*
1384 * Our block is right, pick up the left block.
1385 */
1386 rbp = lbp;
1387 right = left;
1388 rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
1389 lbno = be32_to_cpu(right->bb_leftsib);
1390 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1391 cur->bc_private.a.agno, lbno, 0, &lbp,
1392 XFS_ALLOC_BTREE_REF)))
1393 return error;
1394 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1395 if ((error = xfs_btree_check_sblock(cur, left,
1396 cur->bc_nlevels - 1, lbp)))
1397 return error;
1398 nptr = 2;
1399 } 303 }
1400 /*
1401 * Fill in the new block's btree header and log it.
1402 */
1403 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1404 new->bb_level = cpu_to_be16(cur->bc_nlevels);
1405 new->bb_numrecs = cpu_to_be16(2);
1406 new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1407 new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1408 xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
1409 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1410 /*
1411 * Fill in the key data in the new root.
1412 */
1413 {
1414 xfs_alloc_key_t *kp; /* btree key pointer */
1415 304
1416 kp = XFS_ALLOC_KEY_ADDR(new, 1, cur); 305 XFS_BTREE_STATS_INC(cur, free);
1417 if (be16_to_cpu(left->bb_level) > 0) {
1418 kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur);
1419 kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);
1420 } else {
1421 xfs_alloc_rec_t *rp; /* btree record pointer */
1422 306
1423 rp = XFS_ALLOC_REC_ADDR(left, 1, cur); 307 xfs_btree_setbuf(cur, level, NULL);
1424 kp[0].ar_startblock = rp->ar_startblock; 308 cur->bc_nlevels--;
1425 kp[0].ar_blockcount = rp->ar_blockcount;
1426 rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1427 kp[1].ar_startblock = rp->ar_startblock;
1428 kp[1].ar_blockcount = rp->ar_blockcount;
1429 }
1430 }
1431 xfs_alloc_log_keys(cur, nbp, 1, 2);
1432 /*
1433 * Fill in the pointer data in the new root.
1434 */
1435 {
1436 xfs_alloc_ptr_t *pp; /* btree address pointer */
1437 309
1438 pp = XFS_ALLOC_PTR_ADDR(new, 1, cur); 310 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1439 pp[0] = cpu_to_be32(lbno);
1440 pp[1] = cpu_to_be32(rbno);
1441 }
1442 xfs_alloc_log_ptrs(cur, nbp, 1, 2);
1443 /*
1444 * Fix up the cursor.
1445 */
1446 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1447 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1448 cur->bc_nlevels++;
1449 *stat = 1;
1450 return 0; 311 return 0;
1451} 312}
1452 313
1453/*
1454 * Move 1 record right from cur/level if possible.
1455 * Update cur to reflect the new path.
1456 */
1457STATIC int /* error */
1458xfs_alloc_rshift(
1459 xfs_btree_cur_t *cur, /* btree cursor */
1460 int level, /* level to shift record on */
1461 int *stat) /* success/failure */
1462{
1463 int error; /* error return value */
1464 int i; /* loop index */
1465 xfs_alloc_key_t key; /* key value for leaf level upward */
1466 xfs_buf_t *lbp; /* buffer for left (current) block */
1467 xfs_alloc_block_t *left; /* left (current) btree block */
1468 xfs_buf_t *rbp; /* buffer for right neighbor block */
1469 xfs_alloc_block_t *right; /* right neighbor btree block */
1470 xfs_alloc_key_t *rkp; /* key pointer for right block */
1471 xfs_btree_cur_t *tcur; /* temporary cursor */
1472
1473 /*
1474 * Set up variables for this block as "left".
1475 */
1476 lbp = cur->bc_bufs[level];
1477 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1478#ifdef DEBUG
1479 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1480 return error;
1481#endif
1482 /*
1483 * If we've got no right sibling then we can't shift an entry right.
1484 */
1485 if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
1486 *stat = 0;
1487 return 0;
1488 }
1489 /*
1490 * If the cursor entry is the one that would be moved, don't
1491 * do it... it's too complicated.
1492 */
1493 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1494 *stat = 0;
1495 return 0;
1496 }
1497 /*
1498 * Set up the right neighbor as "right".
1499 */
1500 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1501 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1502 0, &rbp, XFS_ALLOC_BTREE_REF)))
1503 return error;
1504 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1505 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1506 return error;
1507 /*
1508 * If it's full, it can't take another entry.
1509 */
1510 if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1511 *stat = 0;
1512 return 0;
1513 }
1514 /*
1515 * Make a hole at the start of the right neighbor block, then
1516 * copy the last left block entry to the hole.
1517 */
1518 if (level > 0) {
1519 xfs_alloc_key_t *lkp; /* key pointer for left block */
1520 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1521 xfs_alloc_ptr_t *rpp; /* address pointer for right block */
1522
1523 lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1524 lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1525 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1526 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1527#ifdef DEBUG 314#ifdef DEBUG
1528 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { 315STATIC int
1529 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) 316xfs_allocbt_keys_inorder(
1530 return error; 317 struct xfs_btree_cur *cur,
1531 } 318 union xfs_btree_key *k1,
1532#endif 319 union xfs_btree_key *k2)
1533 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); 320{
1534 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); 321 if (cur->bc_btnum == XFS_BTNUM_BNO) {
1535#ifdef DEBUG 322 return be32_to_cpu(k1->alloc.ar_startblock) <
1536 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) 323 be32_to_cpu(k2->alloc.ar_startblock);
1537 return error;
1538#endif
1539 *rkp = *lkp;
1540 *rpp = *lpp;
1541 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1542 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1543 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1544 } else { 324 } else {
1545 xfs_alloc_rec_t *lrp; /* record pointer for left block */ 325 return be32_to_cpu(k1->alloc.ar_blockcount) <
1546 xfs_alloc_rec_t *rrp; /* record pointer for right block */ 326 be32_to_cpu(k2->alloc.ar_blockcount) ||
1547 327 (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
1548 lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); 328 be32_to_cpu(k1->alloc.ar_startblock) <
1549 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); 329 be32_to_cpu(k2->alloc.ar_startblock));
1550 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1551 *rrp = *lrp;
1552 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1553 key.ar_startblock = rrp->ar_startblock;
1554 key.ar_blockcount = rrp->ar_blockcount;
1555 rkp = &key;
1556 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1557 } 330 }
1558 /*
1559 * Decrement and log left's numrecs, bump and log right's numrecs.
1560 */
1561 be16_add_cpu(&left->bb_numrecs, -1);
1562 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1563 be16_add_cpu(&right->bb_numrecs, 1);
1564 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1565 /*
1566 * Using a temporary cursor, update the parent key values of the
1567 * block on the right.
1568 */
1569 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1570 return error;
1571 i = xfs_btree_lastrec(tcur, level);
1572 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1573 if ((error = xfs_alloc_increment(tcur, level, &i)) ||
1574 (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
1575 goto error0;
1576 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1577 *stat = 1;
1578 return 0;
1579error0:
1580 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1581 return error;
1582} 331}
1583 332
1584/* 333STATIC int
1585 * Split cur/level block in half. 334xfs_allocbt_recs_inorder(
1586 * Return new block number and its first record (to be inserted into parent). 335 struct xfs_btree_cur *cur,
1587 */ 336 union xfs_btree_rec *r1,
1588STATIC int /* error */ 337 union xfs_btree_rec *r2)
1589xfs_alloc_split(
1590 xfs_btree_cur_t *cur, /* btree cursor */
1591 int level, /* level to split */
1592 xfs_agblock_t *bnop, /* output: block number allocated */
1593 xfs_alloc_key_t *keyp, /* output: first key of new block */
1594 xfs_btree_cur_t **curp, /* output: new cursor */
1595 int *stat) /* success/failure */
1596{ 338{
1597 int error; /* error return value */ 339 if (cur->bc_btnum == XFS_BTNUM_BNO) {
1598 int i; /* loop index/record number */ 340 return be32_to_cpu(r1->alloc.ar_startblock) +
1599 xfs_agblock_t lbno; /* left (current) block number */ 341 be32_to_cpu(r1->alloc.ar_blockcount) <=
1600 xfs_buf_t *lbp; /* buffer for left block */ 342 be32_to_cpu(r2->alloc.ar_startblock);
1601 xfs_alloc_block_t *left; /* left (current) btree block */ 343 } else {
1602 xfs_agblock_t rbno; /* right (new) block number */ 344 return be32_to_cpu(r1->alloc.ar_blockcount) <
1603 xfs_buf_t *rbp; /* buffer for right block */ 345 be32_to_cpu(r2->alloc.ar_blockcount) ||
1604 xfs_alloc_block_t *right; /* right (new) btree block */ 346 (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
1605 347 be32_to_cpu(r1->alloc.ar_startblock) <
1606 /* 348 be32_to_cpu(r2->alloc.ar_startblock));
1607 * Allocate the new block from the freelist.
1608 * If we can't do it, we're toast. Give up.
1609 */
1610 error = xfs_alloc_get_freelist(cur->bc_tp,
1611 cur->bc_private.a.agbp, &rbno, 1);
1612 if (error)
1613 return error;
1614 if (rbno == NULLAGBLOCK) {
1615 *stat = 0;
1616 return 0;
1617 }
1618 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1619 rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
1620 rbno, 0);
1621 /*
1622 * Set up the new block as "right".
1623 */
1624 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1625 /*
1626 * "Left" is the current (according to the cursor) block.
1627 */
1628 lbp = cur->bc_bufs[level];
1629 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1630#ifdef DEBUG
1631 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1632 return error;
1633#endif
1634 /*
1635 * Fill in the btree header for the new block.
1636 */
1637 right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1638 right->bb_level = left->bb_level;
1639 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1640 /*
1641 * Make sure that if there's an odd number of entries now, that
1642 * each new block will have the same number of entries.
1643 */
1644 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1645 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1646 be16_add_cpu(&right->bb_numrecs, 1);
1647 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1648 /*
1649 * For non-leaf blocks, copy keys and addresses over to the new block.
1650 */
1651 if (level > 0) {
1652 xfs_alloc_key_t *lkp; /* left btree key pointer */
1653 xfs_alloc_ptr_t *lpp; /* left btree address pointer */
1654 xfs_alloc_key_t *rkp; /* right btree key pointer */
1655 xfs_alloc_ptr_t *rpp; /* right btree address pointer */
1656
1657 lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
1658 lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
1659 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1660 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1661#ifdef DEBUG
1662 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1663 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
1664 return error;
1665 }
1666#endif
1667 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1668 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1669 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1670 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1671 *keyp = *rkp;
1672 } 349 }
1673 /* 350}
1674 * For leaf blocks, copy records over to the new block. 351#endif /* DEBUG */
1675 */
1676 else {
1677 xfs_alloc_rec_t *lrp; /* left btree record pointer */
1678 xfs_alloc_rec_t *rrp; /* right btree record pointer */
1679 352
1680 lrp = XFS_ALLOC_REC_ADDR(left, i, cur); 353#ifdef XFS_BTREE_TRACE
1681 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); 354ktrace_t *xfs_allocbt_trace_buf;
1682 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1683 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1684 keyp->ar_startblock = rrp->ar_startblock;
1685 keyp->ar_blockcount = rrp->ar_blockcount;
1686 }
1687 /*
1688 * Find the left block number by looking in the buffer.
1689 * Adjust numrecs, sibling pointers.
1690 */
1691 lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
1692 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1693 right->bb_rightsib = left->bb_rightsib;
1694 left->bb_rightsib = cpu_to_be32(rbno);
1695 right->bb_leftsib = cpu_to_be32(lbno);
1696 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
1697 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1698 /*
1699 * If there's a block to the new block's right, make that block
1700 * point back to right instead of to left.
1701 */
1702 if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
1703 xfs_alloc_block_t *rrblock; /* rr btree block */
1704 xfs_buf_t *rrbp; /* buffer for rrblock */
1705 355
1706 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 356STATIC void
1707 cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0, 357xfs_allocbt_trace_enter(
1708 &rrbp, XFS_ALLOC_BTREE_REF))) 358 struct xfs_btree_cur *cur,
1709 return error; 359 const char *func,
1710 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); 360 char *s,
1711 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) 361 int type,
1712 return error; 362 int line,
1713 rrblock->bb_leftsib = cpu_to_be32(rbno); 363 __psunsigned_t a0,
1714 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); 364 __psunsigned_t a1,
1715 } 365 __psunsigned_t a2,
1716 /* 366 __psunsigned_t a3,
1717 * If the cursor is really in the right block, move it there. 367 __psunsigned_t a4,
1718 * If it's just pointing past the last entry in left, then we'll 368 __psunsigned_t a5,
1719 * insert there, so don't change anything in that case. 369 __psunsigned_t a6,
1720 */ 370 __psunsigned_t a7,
1721 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) { 371 __psunsigned_t a8,
1722 xfs_btree_setbuf(cur, level, rbp); 372 __psunsigned_t a9,
1723 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs); 373 __psunsigned_t a10)
1724 } 374{
1725 /* 375 ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type,
1726 * If there are more levels, we'll need another cursor which refers to 376 (void *)func, (void *)s, NULL, (void *)cur,
1727 * the right block, no matter where this cursor was. 377 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
1728 */ 378 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
1729 if (level + 1 < cur->bc_nlevels) { 379 (void *)a8, (void *)a9, (void *)a10);
1730 if ((error = xfs_btree_dup_cursor(cur, curp)))
1731 return error;
1732 (*curp)->bc_ptrs[level + 1]++;
1733 }
1734 *bnop = rbno;
1735 *stat = 1;
1736 return 0;
1737} 380}
1738 381
1739/* 382STATIC void
1740 * Update keys at all levels from here to the root along the cursor's path. 383xfs_allocbt_trace_cursor(
1741 */ 384 struct xfs_btree_cur *cur,
1742STATIC int /* error */ 385 __uint32_t *s0,
1743xfs_alloc_updkey( 386 __uint64_t *l0,
1744 xfs_btree_cur_t *cur, /* btree cursor */ 387 __uint64_t *l1)
1745 xfs_alloc_key_t *keyp, /* new key value to update to */
1746 int level) /* starting level for update */
1747{ 388{
1748 int ptr; /* index of key in block */ 389 *s0 = cur->bc_private.a.agno;
1749 390 *l0 = cur->bc_rec.a.ar_startblock;
1750 /* 391 *l1 = cur->bc_rec.a.ar_blockcount;
1751 * Go up the tree from this level toward the root.
1752 * At each level, update the key value to the value input.
1753 * Stop when we reach a level where the cursor isn't pointing
1754 * at the first entry in the block.
1755 */
1756 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1757 xfs_alloc_block_t *block; /* btree block */
1758 xfs_buf_t *bp; /* buffer for block */
1759#ifdef DEBUG
1760 int error; /* error return value */
1761#endif
1762 xfs_alloc_key_t *kp; /* ptr to btree block keys */
1763
1764 bp = cur->bc_bufs[level];
1765 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1766#ifdef DEBUG
1767 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1768 return error;
1769#endif
1770 ptr = cur->bc_ptrs[level];
1771 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
1772 *kp = *keyp;
1773 xfs_alloc_log_keys(cur, bp, ptr, ptr);
1774 }
1775 return 0;
1776} 392}
1777 393
1778/* 394STATIC void
1779 * Externally visible routines. 395xfs_allocbt_trace_key(
1780 */ 396 struct xfs_btree_cur *cur,
1781 397 union xfs_btree_key *key,
1782/* 398 __uint64_t *l0,
1783 * Decrement cursor by one record at the level. 399 __uint64_t *l1)
1784 * For nonzero levels the leaf-ward information is untouched.
1785 */
1786int /* error */
1787xfs_alloc_decrement(
1788 xfs_btree_cur_t *cur, /* btree cursor */
1789 int level, /* level in btree, 0 is leaf */
1790 int *stat) /* success/failure */
1791{ 400{
1792 xfs_alloc_block_t *block; /* btree block */ 401 *l0 = be32_to_cpu(key->alloc.ar_startblock);
1793 int error; /* error return value */ 402 *l1 = be32_to_cpu(key->alloc.ar_blockcount);
1794 int lev; /* btree level */
1795
1796 ASSERT(level < cur->bc_nlevels);
1797 /*
1798 * Read-ahead to the left at this level.
1799 */
1800 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1801 /*
1802 * Decrement the ptr at this level. If we're still in the block
1803 * then we're done.
1804 */
1805 if (--cur->bc_ptrs[level] > 0) {
1806 *stat = 1;
1807 return 0;
1808 }
1809 /*
1810 * Get a pointer to the btree block.
1811 */
1812 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
1813#ifdef DEBUG
1814 if ((error = xfs_btree_check_sblock(cur, block, level,
1815 cur->bc_bufs[level])))
1816 return error;
1817#endif
1818 /*
1819 * If we just went off the left edge of the tree, return failure.
1820 */
1821 if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
1822 *stat = 0;
1823 return 0;
1824 }
1825 /*
1826 * March up the tree decrementing pointers.
1827 * Stop when we don't go off the left edge of a block.
1828 */
1829 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1830 if (--cur->bc_ptrs[lev] > 0)
1831 break;
1832 /*
1833 * Read-ahead the left block, we're going to read it
1834 * in the next loop.
1835 */
1836 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1837 }
1838 /*
1839 * If we went off the root then we are seriously confused.
1840 */
1841 ASSERT(lev < cur->bc_nlevels);
1842 /*
1843 * Now walk back down the tree, fixing up the cursor's buffer
1844 * pointers and key numbers.
1845 */
1846 for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1847 xfs_agblock_t agbno; /* block number of btree block */
1848 xfs_buf_t *bp; /* buffer pointer for block */
1849
1850 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1851 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1852 cur->bc_private.a.agno, agbno, 0, &bp,
1853 XFS_ALLOC_BTREE_REF)))
1854 return error;
1855 lev--;
1856 xfs_btree_setbuf(cur, lev, bp);
1857 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1858 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1859 return error;
1860 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1861 }
1862 *stat = 1;
1863 return 0;
1864} 403}
1865 404
1866/* 405STATIC void
1867 * Delete the record pointed to by cur. 406xfs_allocbt_trace_record(
1868 * The cursor refers to the place where the record was (could be inserted) 407 struct xfs_btree_cur *cur,
1869 * when the operation returns. 408 union xfs_btree_rec *rec,
1870 */ 409 __uint64_t *l0,
1871int /* error */ 410 __uint64_t *l1,
1872xfs_alloc_delete( 411 __uint64_t *l2)
1873 xfs_btree_cur_t *cur, /* btree cursor */
1874 int *stat) /* success/failure */
1875{ 412{
1876 int error; /* error return value */ 413 *l0 = be32_to_cpu(rec->alloc.ar_startblock);
1877 int i; /* result code */ 414 *l1 = be32_to_cpu(rec->alloc.ar_blockcount);
1878 int level; /* btree level */ 415 *l2 = 0;
1879
1880 /*
1881 * Go up the tree, starting at leaf level.
1882 * If 2 is returned then a join was done; go to the next level.
1883 * Otherwise we are done.
1884 */
1885 for (level = 0, i = 2; i == 2; level++) {
1886 if ((error = xfs_alloc_delrec(cur, level, &i)))
1887 return error;
1888 }
1889 if (i == 0) {
1890 for (level = 1; level < cur->bc_nlevels; level++) {
1891 if (cur->bc_ptrs[level] == 0) {
1892 if ((error = xfs_alloc_decrement(cur, level, &i)))
1893 return error;
1894 break;
1895 }
1896 }
1897 }
1898 *stat = i;
1899 return 0;
1900} 416}
417#endif /* XFS_BTREE_TRACE */
418
419static const struct xfs_btree_ops xfs_allocbt_ops = {
420 .rec_len = sizeof(xfs_alloc_rec_t),
421 .key_len = sizeof(xfs_alloc_key_t),
422
423 .dup_cursor = xfs_allocbt_dup_cursor,
424 .set_root = xfs_allocbt_set_root,
425 .kill_root = xfs_allocbt_kill_root,
426 .alloc_block = xfs_allocbt_alloc_block,
427 .free_block = xfs_allocbt_free_block,
428 .update_lastrec = xfs_allocbt_update_lastrec,
429 .get_minrecs = xfs_allocbt_get_minrecs,
430 .get_maxrecs = xfs_allocbt_get_maxrecs,
431 .init_key_from_rec = xfs_allocbt_init_key_from_rec,
432 .init_rec_from_key = xfs_allocbt_init_rec_from_key,
433 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
434 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
435 .key_diff = xfs_allocbt_key_diff,
1901 436
1902/*
1903 * Get the data from the pointed-to record.
1904 */
1905int /* error */
1906xfs_alloc_get_rec(
1907 xfs_btree_cur_t *cur, /* btree cursor */
1908 xfs_agblock_t *bno, /* output: starting block of extent */
1909 xfs_extlen_t *len, /* output: length of extent */
1910 int *stat) /* output: success/failure */
1911{
1912 xfs_alloc_block_t *block; /* btree block */
1913#ifdef DEBUG 437#ifdef DEBUG
1914 int error; /* error return value */ 438 .keys_inorder = xfs_allocbt_keys_inorder,
439 .recs_inorder = xfs_allocbt_recs_inorder,
1915#endif 440#endif
1916 int ptr; /* record number */
1917 441
1918 ptr = cur->bc_ptrs[0]; 442#ifdef XFS_BTREE_TRACE
1919 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); 443 .trace_enter = xfs_allocbt_trace_enter,
1920#ifdef DEBUG 444 .trace_cursor = xfs_allocbt_trace_cursor,
1921 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) 445 .trace_key = xfs_allocbt_trace_key,
1922 return error; 446 .trace_record = xfs_allocbt_trace_record,
1923#endif 447#endif
1924 /* 448};
1925 * Off the right end or left end, return failure.
1926 */
1927 if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
1928 *stat = 0;
1929 return 0;
1930 }
1931 /*
1932 * Point to the record and extract its data.
1933 */
1934 {
1935 xfs_alloc_rec_t *rec; /* record data */
1936
1937 rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
1938 *bno = be32_to_cpu(rec->ar_startblock);
1939 *len = be32_to_cpu(rec->ar_blockcount);
1940 }
1941 *stat = 1;
1942 return 0;
1943}
1944 449
1945/* 450/*
1946 * Increment cursor by one record at the level. 451 * Allocate a new allocation btree cursor.
1947 * For nonzero levels the leaf-ward information is untouched.
1948 */ 452 */
1949int /* error */ 453struct xfs_btree_cur * /* new alloc btree cursor */
1950xfs_alloc_increment( 454xfs_allocbt_init_cursor(
1951 xfs_btree_cur_t *cur, /* btree cursor */ 455 struct xfs_mount *mp, /* file system mount point */
1952 int level, /* level in btree, 0 is leaf */ 456 struct xfs_trans *tp, /* transaction pointer */
1953 int *stat) /* success/failure */ 457 struct xfs_buf *agbp, /* buffer for agf structure */
458 xfs_agnumber_t agno, /* allocation group number */
459 xfs_btnum_t btnum) /* btree identifier */
1954{ 460{
1955 xfs_alloc_block_t *block; /* btree block */ 461 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
1956 xfs_buf_t *bp; /* tree block buffer */ 462 struct xfs_btree_cur *cur;
1957 int error; /* error return value */
1958 int lev; /* btree level */
1959
1960 ASSERT(level < cur->bc_nlevels);
1961 /*
1962 * Read-ahead to the right at this level.
1963 */
1964 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1965 /*
1966 * Get a pointer to the btree block.
1967 */
1968 bp = cur->bc_bufs[level];
1969 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1970#ifdef DEBUG
1971 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1972 return error;
1973#endif
1974 /*
1975 * Increment the ptr at this level. If we're still in the block
1976 * then we're done.
1977 */
1978 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
1979 *stat = 1;
1980 return 0;
1981 }
1982 /*
1983 * If we just went off the right edge of the tree, return failure.
1984 */
1985 if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
1986 *stat = 0;
1987 return 0;
1988 }
1989 /*
1990 * March up the tree incrementing pointers.
1991 * Stop when we don't go off the right edge of a block.
1992 */
1993 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1994 bp = cur->bc_bufs[lev];
1995 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1996#ifdef DEBUG
1997 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1998 return error;
1999#endif
2000 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
2001 break;
2002 /*
2003 * Read-ahead the right block, we're going to read it
2004 * in the next loop.
2005 */
2006 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2007 }
2008 /*
2009 * If we went off the root then we are seriously confused.
2010 */
2011 ASSERT(lev < cur->bc_nlevels);
2012 /*
2013 * Now walk back down the tree, fixing up the cursor's buffer
2014 * pointers and key numbers.
2015 */
2016 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2017 lev > level; ) {
2018 xfs_agblock_t agbno; /* block number of btree block */
2019 463
2020 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 464 ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
2021 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
2022 cur->bc_private.a.agno, agbno, 0, &bp,
2023 XFS_ALLOC_BTREE_REF)))
2024 return error;
2025 lev--;
2026 xfs_btree_setbuf(cur, lev, bp);
2027 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2028 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
2029 return error;
2030 cur->bc_ptrs[lev] = 1;
2031 }
2032 *stat = 1;
2033 return 0;
2034}
2035 465
2036/* 466 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
2037 * Insert the current record at the point referenced by cur.
2038 * The cursor may be inconsistent on return if splits have been done.
2039 */
2040int /* error */
2041xfs_alloc_insert(
2042 xfs_btree_cur_t *cur, /* btree cursor */
2043 int *stat) /* success/failure */
2044{
2045 int error; /* error return value */
2046 int i; /* result value, 0 for failure */
2047 int level; /* current level number in btree */
2048 xfs_agblock_t nbno; /* new block number (split result) */
2049 xfs_btree_cur_t *ncur; /* new cursor (split result) */
2050 xfs_alloc_rec_t nrec; /* record being inserted this level */
2051 xfs_btree_cur_t *pcur; /* previous level's cursor */
2052 467
2053 level = 0; 468 cur->bc_tp = tp;
2054 nbno = NULLAGBLOCK; 469 cur->bc_mp = mp;
2055 nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); 470 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]);
2056 nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); 471 cur->bc_btnum = btnum;
2057 ncur = NULL; 472 cur->bc_blocklog = mp->m_sb.sb_blocklog;
2058 pcur = cur;
2059 /*
2060 * Loop going up the tree, starting at the leaf level.
2061 * Stop when we don't get a split block, that must mean that
2062 * the insert is finished with this level.
2063 */
2064 do {
2065 /*
2066 * Insert nrec/nbno into this level of the tree.
2067 * Note if we fail, nbno will be null.
2068 */
2069 if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
2070 &i))) {
2071 if (pcur != cur)
2072 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2073 return error;
2074 }
2075 /*
2076 * See if the cursor we just used is trash.
2077 * Can't trash the caller's cursor, but otherwise we should
2078 * if ncur is a new cursor or we're about to be done.
2079 */
2080 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
2081 cur->bc_nlevels = pcur->bc_nlevels;
2082 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2083 }
2084 /*
2085 * If we got a new cursor, switch to it.
2086 */
2087 if (ncur) {
2088 pcur = ncur;
2089 ncur = NULL;
2090 }
2091 } while (nbno != NULLAGBLOCK);
2092 *stat = i;
2093 return 0;
2094}
2095 473
2096/* 474 cur->bc_ops = &xfs_allocbt_ops;
2097 * Lookup the record equal to [bno, len] in the btree given by cur. 475 if (btnum == XFS_BTNUM_CNT)
2098 */ 476 cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
2099int /* error */
2100xfs_alloc_lookup_eq(
2101 xfs_btree_cur_t *cur, /* btree cursor */
2102 xfs_agblock_t bno, /* starting block of extent */
2103 xfs_extlen_t len, /* length of extent */
2104 int *stat) /* success/failure */
2105{
2106 cur->bc_rec.a.ar_startblock = bno;
2107 cur->bc_rec.a.ar_blockcount = len;
2108 return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
2109}
2110 477
2111/* 478 cur->bc_private.a.agbp = agbp;
2112 * Lookup the first record greater than or equal to [bno, len] 479 cur->bc_private.a.agno = agno;
2113 * in the btree given by cur.
2114 */
2115int /* error */
2116xfs_alloc_lookup_ge(
2117 xfs_btree_cur_t *cur, /* btree cursor */
2118 xfs_agblock_t bno, /* starting block of extent */
2119 xfs_extlen_t len, /* length of extent */
2120 int *stat) /* success/failure */
2121{
2122 cur->bc_rec.a.ar_startblock = bno;
2123 cur->bc_rec.a.ar_blockcount = len;
2124 return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
2125}
2126 480
2127/* 481 return cur;
2128 * Lookup the first record less than or equal to [bno, len]
2129 * in the btree given by cur.
2130 */
2131int /* error */
2132xfs_alloc_lookup_le(
2133 xfs_btree_cur_t *cur, /* btree cursor */
2134 xfs_agblock_t bno, /* starting block of extent */
2135 xfs_extlen_t len, /* length of extent */
2136 int *stat) /* success/failure */
2137{
2138 cur->bc_rec.a.ar_startblock = bno;
2139 cur->bc_rec.a.ar_blockcount = len;
2140 return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
2141} 482}
2142 483
2143/* 484/*
2144 * Update the record referred to by cur, to the value given by [bno, len]. 485 * Calculate number of records in an alloc btree block.
2145 * This either works (return 0) or gets an EFSCORRUPTED error.
2146 */ 486 */
2147int /* error */ 487int
2148xfs_alloc_update( 488xfs_allocbt_maxrecs(
2149 xfs_btree_cur_t *cur, /* btree cursor */ 489 struct xfs_mount *mp,
2150 xfs_agblock_t bno, /* starting block of extent */ 490 int blocklen,
2151 xfs_extlen_t len) /* length of extent */ 491 int leaf)
2152{ 492{
2153 xfs_alloc_block_t *block; /* btree block to update */ 493 blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
2154 int error; /* error return value */
2155 int ptr; /* current record number (updating) */
2156 494
2157 ASSERT(len > 0); 495 if (leaf)
2158 /* 496 return blocklen / sizeof(xfs_alloc_rec_t);
2159 * Pick up the a.g. freelist struct and the current block. 497 return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
2160 */
2161 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
2162#ifdef DEBUG
2163 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
2164 return error;
2165#endif
2166 /*
2167 * Get the address of the rec to be updated.
2168 */
2169 ptr = cur->bc_ptrs[0];
2170 {
2171 xfs_alloc_rec_t *rp; /* pointer to updated record */
2172
2173 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
2174 /*
2175 * Fill in the new contents and log them.
2176 */
2177 rp->ar_startblock = cpu_to_be32(bno);
2178 rp->ar_blockcount = cpu_to_be32(len);
2179 xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
2180 }
2181 /*
2182 * If it's the by-size btree and it's the last leaf block and
2183 * it's the last record... then update the size of the longest
2184 * extent in the a.g., which we cache in the a.g. freelist header.
2185 */
2186 if (cur->bc_btnum == XFS_BTNUM_CNT &&
2187 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
2188 ptr == be16_to_cpu(block->bb_numrecs)) {
2189 xfs_agf_t *agf; /* a.g. freespace header */
2190 xfs_agnumber_t seqno;
2191
2192 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
2193 seqno = be32_to_cpu(agf->agf_seqno);
2194 cur->bc_mp->m_perag[seqno].pagf_longest = len;
2195 agf->agf_longest = cpu_to_be32(len);
2196 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
2197 XFS_AGF_LONGEST);
2198 }
2199 /*
2200 * Updating first record in leaf. Pass new key value up to our parent.
2201 */
2202 if (ptr == 1) {
2203 xfs_alloc_key_t key; /* key containing [bno, len] */
2204
2205 key.ar_startblock = cpu_to_be32(bno);
2206 key.ar_blockcount = cpu_to_be32(len);
2207 if ((error = xfs_alloc_updkey(cur, &key, 1)))
2208 return error;
2209 }
2210 return 0;
2211} 498}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 5bd1a2c8bd07..a6caa0022c9b 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -24,7 +24,6 @@
24 24
25struct xfs_buf; 25struct xfs_buf;
26struct xfs_btree_cur; 26struct xfs_btree_cur;
27struct xfs_btree_sblock;
28struct xfs_mount; 27struct xfs_mount;
29 28
30/* 29/*
@@ -50,16 +49,6 @@ typedef struct xfs_alloc_rec_incore {
50 49
51/* btree pointer type */ 50/* btree pointer type */
52typedef __be32 xfs_alloc_ptr_t; 51typedef __be32 xfs_alloc_ptr_t;
53/* btree block header type */
54typedef struct xfs_btree_sblock xfs_alloc_block_t;
55
56#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)XFS_BUF_PTR(bp))
57
58/*
59 * Real block structures have a size equal to the disk block size.
60 */
61#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0])
62#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0])
63 52
64/* 53/*
65 * Minimum and maximum blocksize and sectorsize. 54 * Minimum and maximum blocksize and sectorsize.
@@ -83,73 +72,39 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t;
83#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) 72#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
84 73
85/* 74/*
86 * Record, key, and pointer address macros for btree blocks. 75 * Btree block header size depends on a superblock flag.
87 */ 76 *
88#define XFS_ALLOC_REC_ADDR(bb,i,cur) \ 77 * (not quite yet, but soon)
89 XFS_BTREE_REC_ADDR(xfs_alloc, bb, i)
90
91#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \
92 XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i)
93
94#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \
95 XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
96
97/*
98 * Decrement cursor by one record at the level.
99 * For nonzero levels the leaf-ward information is untouched.
100 */
101extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat);
102
103/*
104 * Delete the record pointed to by cur.
105 * The cursor refers to the place where the record was (could be inserted)
106 * when the operation returns.
107 */
108extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat);
109
110/*
111 * Get the data from the pointed-to record.
112 */
113extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur, xfs_agblock_t *bno,
114 xfs_extlen_t *len, int *stat);
115
116/*
117 * Increment cursor by one record at the level.
118 * For nonzero levels the leaf-ward information is untouched.
119 */
120extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat);
121
122/*
123 * Insert the current record at the point referenced by cur.
124 * The cursor may be inconsistent on return if splits have been done.
125 */
126extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat);
127
128/*
129 * Lookup the record equal to [bno, len] in the btree given by cur.
130 */
131extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
132 xfs_extlen_t len, int *stat);
133
134/*
135 * Lookup the first record greater than or equal to [bno, len]
136 * in the btree given by cur.
137 */
138extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno,
139 xfs_extlen_t len, int *stat);
140
141/*
142 * Lookup the first record less than or equal to [bno, len]
143 * in the btree given by cur.
144 */ 78 */
145extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, 79#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
146 xfs_extlen_t len, int *stat);
147 80
148/* 81/*
149 * Update the record referred to by cur, to the value given by [bno, len]. 82 * Record, key, and pointer address macros for btree blocks.
150 * This either works (return 0) or gets an EFSCORRUPTED error. 83 *
151 */ 84 * (note that some of these may appear unused, but they are used in userspace)
152extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno, 85 */
153 xfs_extlen_t len); 86#define XFS_ALLOC_REC_ADDR(mp, block, index) \
87 ((xfs_alloc_rec_t *) \
88 ((char *)(block) + \
89 XFS_ALLOC_BLOCK_LEN(mp) + \
90 (((index) - 1) * sizeof(xfs_alloc_rec_t))))
91
92#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
93 ((xfs_alloc_key_t *) \
94 ((char *)(block) + \
95 XFS_ALLOC_BLOCK_LEN(mp) + \
96 ((index) - 1) * sizeof(xfs_alloc_key_t)))
97
98#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
99 ((xfs_alloc_ptr_t *) \
100 ((char *)(block) + \
101 XFS_ALLOC_BLOCK_LEN(mp) + \
102 (maxrecs) * sizeof(xfs_alloc_key_t) + \
103 ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
104
105extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
106 struct xfs_trans *, struct xfs_buf *,
107 xfs_agnumber_t, xfs_btnum_t);
108extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
154 109
155#endif /* __XFS_ALLOC_BTREE_H__ */ 110#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 0b3b5efe848c..53d5e70d1360 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -41,21 +41,36 @@
41#endif 41#endif
42 42
43#ifdef XFS_NATIVE_HOST 43#ifdef XFS_NATIVE_HOST
44#define cpu_to_be16(val) ((__be16)(val)) 44#define cpu_to_be16(val) ((__force __be16)(__u16)(val))
45#define cpu_to_be32(val) ((__be32)(val)) 45#define cpu_to_be32(val) ((__force __be32)(__u32)(val))
46#define cpu_to_be64(val) ((__be64)(val)) 46#define cpu_to_be64(val) ((__force __be64)(__u64)(val))
47#define be16_to_cpu(val) ((__uint16_t)(val)) 47#define be16_to_cpu(val) ((__force __u16)(__be16)(val))
48#define be32_to_cpu(val) ((__uint32_t)(val)) 48#define be32_to_cpu(val) ((__force __u32)(__be32)(val))
49#define be64_to_cpu(val) ((__uint64_t)(val)) 49#define be64_to_cpu(val) ((__force __u64)(__be64)(val))
50#else 50#else
51#define cpu_to_be16(val) (__swab16((__uint16_t)(val))) 51#define cpu_to_be16(val) ((__force __be16)__swab16((__u16)(val)))
52#define cpu_to_be32(val) (__swab32((__uint32_t)(val))) 52#define cpu_to_be32(val) ((__force __be32)__swab32((__u32)(val)))
53#define cpu_to_be64(val) (__swab64((__uint64_t)(val))) 53#define cpu_to_be64(val) ((__force __be64)__swab64((__u64)(val)))
54#define be16_to_cpu(val) (__swab16((__be16)(val))) 54#define be16_to_cpu(val) (__swab16((__force __u16)(__be16)(val)))
55#define be32_to_cpu(val) (__swab32((__be32)(val))) 55#define be32_to_cpu(val) (__swab32((__force __u32)(__be32)(val)))
56#define be64_to_cpu(val) (__swab64((__be64)(val))) 56#define be64_to_cpu(val) (__swab64((__force __u64)(__be64)(val)))
57#endif 57#endif
58 58
59static inline void be16_add_cpu(__be16 *a, __s16 b)
60{
61 *a = cpu_to_be16(be16_to_cpu(*a) + b);
62}
63
64static inline void be32_add_cpu(__be32 *a, __s32 b)
65{
66 *a = cpu_to_be32(be32_to_cpu(*a) + b);
67}
68
69static inline void be64_add_cpu(__be64 *a, __s64 b)
70{
71 *a = cpu_to_be64(be64_to_cpu(*a) + b);
72}
73
59#endif /* __KERNEL__ */ 74#endif /* __KERNEL__ */
60 75
61/* do we need conversion? */ 76/* do we need conversion? */
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 8e0e463dae2d..bca7b243c319 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -61,8 +61,7 @@ static inline int xfs_highbit64(__uint64_t v)
61/* Get low bit set out of 32-bit argument, -1 if none set */ 61/* Get low bit set out of 32-bit argument, -1 if none set */
62static inline int xfs_lowbit32(__uint32_t v) 62static inline int xfs_lowbit32(__uint32_t v)
63{ 63{
64 unsigned long t = v; 64 return ffs(v) - 1;
65 return (v) ? find_first_bit(&t, 32) : -1;
66} 65}
67 66
68/* Get low bit set out of 64-bit argument, -1 if none set */ 67/* Get low bit set out of 64-bit argument, -1 if none set */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index a1aab9275d5a..138308e70d14 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -393,8 +393,8 @@ xfs_bmap_count_leaves(
393 393
394STATIC void 394STATIC void
395xfs_bmap_disk_count_leaves( 395xfs_bmap_disk_count_leaves(
396 xfs_extnum_t idx, 396 struct xfs_mount *mp,
397 xfs_bmbt_block_t *block, 397 struct xfs_btree_block *block,
398 int numrecs, 398 int numrecs,
399 int *count); 399 int *count);
400 400
@@ -402,6 +402,53 @@ xfs_bmap_disk_count_leaves(
402 * Bmap internal routines. 402 * Bmap internal routines.
403 */ 403 */
404 404
405STATIC int /* error */
406xfs_bmbt_lookup_eq(
407 struct xfs_btree_cur *cur,
408 xfs_fileoff_t off,
409 xfs_fsblock_t bno,
410 xfs_filblks_t len,
411 int *stat) /* success/failure */
412{
413 cur->bc_rec.b.br_startoff = off;
414 cur->bc_rec.b.br_startblock = bno;
415 cur->bc_rec.b.br_blockcount = len;
416 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
417}
418
419STATIC int /* error */
420xfs_bmbt_lookup_ge(
421 struct xfs_btree_cur *cur,
422 xfs_fileoff_t off,
423 xfs_fsblock_t bno,
424 xfs_filblks_t len,
425 int *stat) /* success/failure */
426{
427 cur->bc_rec.b.br_startoff = off;
428 cur->bc_rec.b.br_startblock = bno;
429 cur->bc_rec.b.br_blockcount = len;
430 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
431}
432
433/*
434* Update the record referred to by cur to the value given
435 * by [off, bno, len, state].
436 * This either works (return 0) or gets an EFSCORRUPTED error.
437 */
438STATIC int
439xfs_bmbt_update(
440 struct xfs_btree_cur *cur,
441 xfs_fileoff_t off,
442 xfs_fsblock_t bno,
443 xfs_filblks_t len,
444 xfs_exntst_t state)
445{
446 union xfs_btree_rec rec;
447
448 xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
449 return xfs_btree_update(cur, &rec);
450}
451
405/* 452/*
406 * Called from xfs_bmap_add_attrfork to handle btree format files. 453 * Called from xfs_bmap_add_attrfork to handle btree format files.
407 */ 454 */
@@ -422,15 +469,14 @@ xfs_bmap_add_attrfork_btree(
422 if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) 469 if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
423 *flags |= XFS_ILOG_DBROOT; 470 *flags |= XFS_ILOG_DBROOT;
424 else { 471 else {
425 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, 472 cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
426 XFS_DATA_FORK);
427 cur->bc_private.b.flist = flist; 473 cur->bc_private.b.flist = flist;
428 cur->bc_private.b.firstblock = *firstblock; 474 cur->bc_private.b.firstblock = *firstblock;
429 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) 475 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
430 goto error0; 476 goto error0;
431 /* must be at least one entry */ 477 /* must be at least one entry */
432 XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); 478 XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
433 if ((error = xfs_bmbt_newroot(cur, flags, &stat))) 479 if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
434 goto error0; 480 goto error0;
435 if (stat == 0) { 481 if (stat == 0) {
436 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 482 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -818,10 +864,10 @@ xfs_bmap_add_extent_delay_real(
818 RIGHT.br_blockcount, &i))) 864 RIGHT.br_blockcount, &i)))
819 goto done; 865 goto done;
820 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 866 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
821 if ((error = xfs_bmbt_delete(cur, &i))) 867 if ((error = xfs_btree_delete(cur, &i)))
822 goto done; 868 goto done;
823 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 869 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
824 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 870 if ((error = xfs_btree_decrement(cur, 0, &i)))
825 goto done; 871 goto done;
826 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 872 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
827 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 873 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -931,7 +977,7 @@ xfs_bmap_add_extent_delay_real(
931 goto done; 977 goto done;
932 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 978 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
933 cur->bc_rec.b.br_state = XFS_EXT_NORM; 979 cur->bc_rec.b.br_state = XFS_EXT_NORM;
934 if ((error = xfs_bmbt_insert(cur, &i))) 980 if ((error = xfs_btree_insert(cur, &i)))
935 goto done; 981 goto done;
936 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 982 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
937 } 983 }
@@ -1007,7 +1053,7 @@ xfs_bmap_add_extent_delay_real(
1007 goto done; 1053 goto done;
1008 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1054 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1009 cur->bc_rec.b.br_state = XFS_EXT_NORM; 1055 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1010 if ((error = xfs_bmbt_insert(cur, &i))) 1056 if ((error = xfs_btree_insert(cur, &i)))
1011 goto done; 1057 goto done;
1012 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1058 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1013 } 1059 }
@@ -1097,7 +1143,7 @@ xfs_bmap_add_extent_delay_real(
1097 goto done; 1143 goto done;
1098 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1144 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1099 cur->bc_rec.b.br_state = XFS_EXT_NORM; 1145 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1100 if ((error = xfs_bmbt_insert(cur, &i))) 1146 if ((error = xfs_btree_insert(cur, &i)))
1101 goto done; 1147 goto done;
1102 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1148 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1103 } 1149 }
@@ -1152,7 +1198,7 @@ xfs_bmap_add_extent_delay_real(
1152 goto done; 1198 goto done;
1153 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1199 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1154 cur->bc_rec.b.br_state = XFS_EXT_NORM; 1200 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1155 if ((error = xfs_bmbt_insert(cur, &i))) 1201 if ((error = xfs_btree_insert(cur, &i)))
1156 goto done; 1202 goto done;
1157 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1203 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1158 } 1204 }
@@ -1379,16 +1425,16 @@ xfs_bmap_add_extent_unwritten_real(
1379 RIGHT.br_blockcount, &i))) 1425 RIGHT.br_blockcount, &i)))
1380 goto done; 1426 goto done;
1381 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1427 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1382 if ((error = xfs_bmbt_delete(cur, &i))) 1428 if ((error = xfs_btree_delete(cur, &i)))
1383 goto done; 1429 goto done;
1384 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1430 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1385 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1431 if ((error = xfs_btree_decrement(cur, 0, &i)))
1386 goto done; 1432 goto done;
1387 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1433 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1388 if ((error = xfs_bmbt_delete(cur, &i))) 1434 if ((error = xfs_btree_delete(cur, &i)))
1389 goto done; 1435 goto done;
1390 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1436 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1391 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1437 if ((error = xfs_btree_decrement(cur, 0, &i)))
1392 goto done; 1438 goto done;
1393 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1439 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1394 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 1440 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1428,10 +1474,10 @@ xfs_bmap_add_extent_unwritten_real(
1428 &i))) 1474 &i)))
1429 goto done; 1475 goto done;
1430 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1476 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1431 if ((error = xfs_bmbt_delete(cur, &i))) 1477 if ((error = xfs_btree_delete(cur, &i)))
1432 goto done; 1478 goto done;
1433 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1479 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1434 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1480 if ((error = xfs_btree_decrement(cur, 0, &i)))
1435 goto done; 1481 goto done;
1436 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1482 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1437 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 1483 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1471,10 +1517,10 @@ xfs_bmap_add_extent_unwritten_real(
1471 RIGHT.br_blockcount, &i))) 1517 RIGHT.br_blockcount, &i)))
1472 goto done; 1518 goto done;
1473 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1519 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1474 if ((error = xfs_bmbt_delete(cur, &i))) 1520 if ((error = xfs_btree_delete(cur, &i)))
1475 goto done; 1521 goto done;
1476 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1522 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1477 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1523 if ((error = xfs_btree_decrement(cur, 0, &i)))
1478 goto done; 1524 goto done;
1479 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1525 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1480 if ((error = xfs_bmbt_update(cur, new->br_startoff, 1526 if ((error = xfs_bmbt_update(cur, new->br_startoff,
@@ -1557,7 +1603,7 @@ xfs_bmap_add_extent_unwritten_real(
1557 PREV.br_blockcount - new->br_blockcount, 1603 PREV.br_blockcount - new->br_blockcount,
1558 oldext))) 1604 oldext)))
1559 goto done; 1605 goto done;
1560 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1606 if ((error = xfs_btree_decrement(cur, 0, &i)))
1561 goto done; 1607 goto done;
1562 if (xfs_bmbt_update(cur, LEFT.br_startoff, 1608 if (xfs_bmbt_update(cur, LEFT.br_startoff,
1563 LEFT.br_startblock, 1609 LEFT.br_startblock,
@@ -1605,7 +1651,7 @@ xfs_bmap_add_extent_unwritten_real(
1605 oldext))) 1651 oldext)))
1606 goto done; 1652 goto done;
1607 cur->bc_rec.b = *new; 1653 cur->bc_rec.b = *new;
1608 if ((error = xfs_bmbt_insert(cur, &i))) 1654 if ((error = xfs_btree_insert(cur, &i)))
1609 goto done; 1655 goto done;
1610 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1656 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1611 } 1657 }
@@ -1647,7 +1693,7 @@ xfs_bmap_add_extent_unwritten_real(
1647 PREV.br_blockcount - new->br_blockcount, 1693 PREV.br_blockcount - new->br_blockcount,
1648 oldext))) 1694 oldext)))
1649 goto done; 1695 goto done;
1650 if ((error = xfs_bmbt_increment(cur, 0, &i))) 1696 if ((error = xfs_btree_increment(cur, 0, &i)))
1651 goto done; 1697 goto done;
1652 if ((error = xfs_bmbt_update(cur, new->br_startoff, 1698 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1653 new->br_startblock, 1699 new->br_startblock,
@@ -1695,7 +1741,7 @@ xfs_bmap_add_extent_unwritten_real(
1695 goto done; 1741 goto done;
1696 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1742 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1697 cur->bc_rec.b.br_state = XFS_EXT_NORM; 1743 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1698 if ((error = xfs_bmbt_insert(cur, &i))) 1744 if ((error = xfs_btree_insert(cur, &i)))
1699 goto done; 1745 goto done;
1700 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1746 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1701 } 1747 }
@@ -1743,7 +1789,7 @@ xfs_bmap_add_extent_unwritten_real(
1743 cur->bc_rec.b = PREV; 1789 cur->bc_rec.b = PREV;
1744 cur->bc_rec.b.br_blockcount = 1790 cur->bc_rec.b.br_blockcount =
1745 new->br_startoff - PREV.br_startoff; 1791 new->br_startoff - PREV.br_startoff;
1746 if ((error = xfs_bmbt_insert(cur, &i))) 1792 if ((error = xfs_btree_insert(cur, &i)))
1747 goto done; 1793 goto done;
1748 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1794 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1749 /* 1795 /*
@@ -1758,7 +1804,7 @@ xfs_bmap_add_extent_unwritten_real(
1758 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1804 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1759 /* new middle extent - newext */ 1805 /* new middle extent - newext */
1760 cur->bc_rec.b.br_state = new->br_state; 1806 cur->bc_rec.b.br_state = new->br_state;
1761 if ((error = xfs_bmbt_insert(cur, &i))) 1807 if ((error = xfs_btree_insert(cur, &i)))
1762 goto done; 1808 goto done;
1763 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1809 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1764 } 1810 }
@@ -2106,10 +2152,10 @@ xfs_bmap_add_extent_hole_real(
2106 right.br_blockcount, &i))) 2152 right.br_blockcount, &i)))
2107 goto done; 2153 goto done;
2108 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2154 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2109 if ((error = xfs_bmbt_delete(cur, &i))) 2155 if ((error = xfs_btree_delete(cur, &i)))
2110 goto done; 2156 goto done;
2111 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2157 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2112 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 2158 if ((error = xfs_btree_decrement(cur, 0, &i)))
2113 goto done; 2159 goto done;
2114 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2160 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2115 if ((error = xfs_bmbt_update(cur, left.br_startoff, 2161 if ((error = xfs_bmbt_update(cur, left.br_startoff,
@@ -2218,7 +2264,7 @@ xfs_bmap_add_extent_hole_real(
2218 goto done; 2264 goto done;
2219 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2265 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
2220 cur->bc_rec.b.br_state = new->br_state; 2266 cur->bc_rec.b.br_state = new->br_state;
2221 if ((error = xfs_bmbt_insert(cur, &i))) 2267 if ((error = xfs_btree_insert(cur, &i)))
2222 goto done; 2268 goto done;
2223 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2269 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2224 } 2270 }
@@ -2996,24 +3042,24 @@ xfs_bmap_btree_to_extents(
2996 int whichfork) /* data or attr fork */ 3042 int whichfork) /* data or attr fork */
2997{ 3043{
2998 /* REFERENCED */ 3044 /* REFERENCED */
2999 xfs_bmbt_block_t *cblock;/* child btree block */ 3045 struct xfs_btree_block *cblock;/* child btree block */
3000 xfs_fsblock_t cbno; /* child block number */ 3046 xfs_fsblock_t cbno; /* child block number */
3001 xfs_buf_t *cbp; /* child block's buffer */ 3047 xfs_buf_t *cbp; /* child block's buffer */
3002 int error; /* error return value */ 3048 int error; /* error return value */
3003 xfs_ifork_t *ifp; /* inode fork data */ 3049 xfs_ifork_t *ifp; /* inode fork data */
3004 xfs_mount_t *mp; /* mount point structure */ 3050 xfs_mount_t *mp; /* mount point structure */
3005 __be64 *pp; /* ptr to block address */ 3051 __be64 *pp; /* ptr to block address */
3006 xfs_bmbt_block_t *rblock;/* root btree block */ 3052 struct xfs_btree_block *rblock;/* root btree block */
3007 3053
3054 mp = ip->i_mount;
3008 ifp = XFS_IFORK_PTR(ip, whichfork); 3055 ifp = XFS_IFORK_PTR(ip, whichfork);
3009 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3056 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3010 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); 3057 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
3011 rblock = ifp->if_broot; 3058 rblock = ifp->if_broot;
3012 ASSERT(be16_to_cpu(rblock->bb_level) == 1); 3059 ASSERT(be16_to_cpu(rblock->bb_level) == 1);
3013 ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); 3060 ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
3014 ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1); 3061 ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
3015 mp = ip->i_mount; 3062 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
3016 pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
3017 cbno = be64_to_cpu(*pp); 3063 cbno = be64_to_cpu(*pp);
3018 *logflagsp = 0; 3064 *logflagsp = 0;
3019#ifdef DEBUG 3065#ifdef DEBUG
@@ -3023,8 +3069,8 @@ xfs_bmap_btree_to_extents(
3023 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, 3069 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
3024 XFS_BMAP_BTREE_REF))) 3070 XFS_BMAP_BTREE_REF)))
3025 return error; 3071 return error;
3026 cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); 3072 cblock = XFS_BUF_TO_BLOCK(cbp);
3027 if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp))) 3073 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
3028 return error; 3074 return error;
3029 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); 3075 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
3030 ip->i_d.di_nblocks--; 3076 ip->i_d.di_nblocks--;
@@ -3170,7 +3216,7 @@ xfs_bmap_del_extent(
3170 flags |= XFS_ILOG_FEXT(whichfork); 3216 flags |= XFS_ILOG_FEXT(whichfork);
3171 break; 3217 break;
3172 } 3218 }
3173 if ((error = xfs_bmbt_delete(cur, &i))) 3219 if ((error = xfs_btree_delete(cur, &i)))
3174 goto done; 3220 goto done;
3175 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3221 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3176 break; 3222 break;
@@ -3254,10 +3300,10 @@ xfs_bmap_del_extent(
3254 got.br_startblock, temp, 3300 got.br_startblock, temp,
3255 got.br_state))) 3301 got.br_state)))
3256 goto done; 3302 goto done;
3257 if ((error = xfs_bmbt_increment(cur, 0, &i))) 3303 if ((error = xfs_btree_increment(cur, 0, &i)))
3258 goto done; 3304 goto done;
3259 cur->bc_rec.b = new; 3305 cur->bc_rec.b = new;
3260 error = xfs_bmbt_insert(cur, &i); 3306 error = xfs_btree_insert(cur, &i);
3261 if (error && error != ENOSPC) 3307 if (error && error != ENOSPC)
3262 goto done; 3308 goto done;
3263 /* 3309 /*
@@ -3404,11 +3450,11 @@ xfs_bmap_extents_to_btree(
3404 int *logflagsp, /* inode logging flags */ 3450 int *logflagsp, /* inode logging flags */
3405 int whichfork) /* data or attr fork */ 3451 int whichfork) /* data or attr fork */
3406{ 3452{
3407 xfs_bmbt_block_t *ablock; /* allocated (child) bt block */ 3453 struct xfs_btree_block *ablock; /* allocated (child) bt block */
3408 xfs_buf_t *abp; /* buffer for ablock */ 3454 xfs_buf_t *abp; /* buffer for ablock */
3409 xfs_alloc_arg_t args; /* allocation arguments */ 3455 xfs_alloc_arg_t args; /* allocation arguments */
3410 xfs_bmbt_rec_t *arp; /* child record pointer */ 3456 xfs_bmbt_rec_t *arp; /* child record pointer */
3411 xfs_bmbt_block_t *block; /* btree root block */ 3457 struct xfs_btree_block *block; /* btree root block */
3412 xfs_btree_cur_t *cur; /* bmap btree cursor */ 3458 xfs_btree_cur_t *cur; /* bmap btree cursor */
3413 xfs_bmbt_rec_host_t *ep; /* extent record pointer */ 3459 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
3414 int error; /* error return value */ 3460 int error; /* error return value */
@@ -3428,6 +3474,7 @@ xfs_bmap_extents_to_btree(
3428 */ 3474 */
3429 xfs_iroot_realloc(ip, 1, whichfork); 3475 xfs_iroot_realloc(ip, 1, whichfork);
3430 ifp->if_flags |= XFS_IFBROOT; 3476 ifp->if_flags |= XFS_IFBROOT;
3477
3431 /* 3478 /*
3432 * Fill in the root. 3479 * Fill in the root.
3433 */ 3480 */
@@ -3435,14 +3482,14 @@ xfs_bmap_extents_to_btree(
3435 block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 3482 block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
3436 block->bb_level = cpu_to_be16(1); 3483 block->bb_level = cpu_to_be16(1);
3437 block->bb_numrecs = cpu_to_be16(1); 3484 block->bb_numrecs = cpu_to_be16(1);
3438 block->bb_leftsib = cpu_to_be64(NULLDFSBNO); 3485 block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
3439 block->bb_rightsib = cpu_to_be64(NULLDFSBNO); 3486 block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
3487
3440 /* 3488 /*
3441 * Need a cursor. Can't allocate until bb_level is filled in. 3489 * Need a cursor. Can't allocate until bb_level is filled in.
3442 */ 3490 */
3443 mp = ip->i_mount; 3491 mp = ip->i_mount;
3444 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, 3492 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
3445 whichfork);
3446 cur->bc_private.b.firstblock = *firstblock; 3493 cur->bc_private.b.firstblock = *firstblock;
3447 cur->bc_private.b.flist = flist; 3494 cur->bc_private.b.flist = flist;
3448 cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; 3495 cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
@@ -3489,12 +3536,12 @@ xfs_bmap_extents_to_btree(
3489 /* 3536 /*
3490 * Fill in the child block. 3537 * Fill in the child block.
3491 */ 3538 */
3492 ablock = XFS_BUF_TO_BMBT_BLOCK(abp); 3539 ablock = XFS_BUF_TO_BLOCK(abp);
3493 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 3540 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
3494 ablock->bb_level = 0; 3541 ablock->bb_level = 0;
3495 ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO); 3542 ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
3496 ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO); 3543 ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
3497 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); 3544 arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
3498 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3545 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3499 for (cnt = i = 0; i < nextents; i++) { 3546 for (cnt = i = 0; i < nextents; i++) {
3500 ep = xfs_iext_get_ext(ifp, i); 3547 ep = xfs_iext_get_ext(ifp, i);
@@ -3505,21 +3552,24 @@ xfs_bmap_extents_to_btree(
3505 } 3552 }
3506 } 3553 }
3507 ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); 3554 ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
3508 ablock->bb_numrecs = cpu_to_be16(cnt); 3555 xfs_btree_set_numrecs(ablock, cnt);
3556
3509 /* 3557 /*
3510 * Fill in the root key and pointer. 3558 * Fill in the root key and pointer.
3511 */ 3559 */
3512 kp = XFS_BMAP_KEY_IADDR(block, 1, cur); 3560 kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
3513 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); 3561 arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
3514 kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); 3562 kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
3515 pp = XFS_BMAP_PTR_IADDR(block, 1, cur); 3563 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
3564 be16_to_cpu(block->bb_level)));
3516 *pp = cpu_to_be64(args.fsbno); 3565 *pp = cpu_to_be64(args.fsbno);
3566
3517 /* 3567 /*
3518 * Do all this logging at the end so that 3568 * Do all this logging at the end so that
3519 * the root is at the right level. 3569 * the root is at the right level.
3520 */ 3570 */
3521 xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS); 3571 xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
3522 xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); 3572 xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
3523 ASSERT(*curp == NULL); 3573 ASSERT(*curp == NULL);
3524 *curp = cur; 3574 *curp = cur;
3525 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork); 3575 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
@@ -4176,7 +4226,7 @@ xfs_bmap_compute_maxlevels(
4176 maxleafents = MAXAEXTNUM; 4226 maxleafents = MAXAEXTNUM;
4177 sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); 4227 sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
4178 } 4228 }
4179 maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); 4229 maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
4180 minleafrecs = mp->m_bmap_dmnr[0]; 4230 minleafrecs = mp->m_bmap_dmnr[0];
4181 minnoderecs = mp->m_bmap_dmnr[1]; 4231 minnoderecs = mp->m_bmap_dmnr[1];
4182 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; 4232 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
@@ -4242,9 +4292,15 @@ xfs_bmap_finish(
4242 * We have a new transaction, so we should return committed=1, 4292 * We have a new transaction, so we should return committed=1,
4243 * even though we're returning an error. 4293 * even though we're returning an error.
4244 */ 4294 */
4245 if (error) { 4295 if (error)
4246 return error; 4296 return error;
4247 } 4297
4298 /*
4299 * transaction commit worked ok so we can drop the extra ticket
4300 * reference that we gained in xfs_trans_dup()
4301 */
4302 xfs_log_ticket_put(ntp->t_ticket);
4303
4248 if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, 4304 if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
4249 logcount))) 4305 logcount)))
4250 return error; 4306 return error;
@@ -4474,6 +4530,22 @@ xfs_bmap_one_block(
4474 return rval; 4530 return rval;
4475} 4531}
4476 4532
4533STATIC int
4534xfs_bmap_sanity_check(
4535 struct xfs_mount *mp,
4536 struct xfs_buf *bp,
4537 int level)
4538{
4539 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
4540
4541 if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC ||
4542 be16_to_cpu(block->bb_level) != level ||
4543 be16_to_cpu(block->bb_numrecs) == 0 ||
4544 be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
4545 return 0;
4546 return 1;
4547}
4548
4477/* 4549/*
4478 * Read in the extents to if_extents. 4550 * Read in the extents to if_extents.
4479 * All inode fields are set up by caller, we just traverse the btree 4551 * All inode fields are set up by caller, we just traverse the btree
@@ -4486,7 +4558,7 @@ xfs_bmap_read_extents(
4486 xfs_inode_t *ip, /* incore inode */ 4558 xfs_inode_t *ip, /* incore inode */
4487 int whichfork) /* data or attr fork */ 4559 int whichfork) /* data or attr fork */
4488{ 4560{
4489 xfs_bmbt_block_t *block; /* current btree block */ 4561 struct xfs_btree_block *block; /* current btree block */
4490 xfs_fsblock_t bno; /* block # of "block" */ 4562 xfs_fsblock_t bno; /* block # of "block" */
4491 xfs_buf_t *bp; /* buffer for "block" */ 4563 xfs_buf_t *bp; /* buffer for "block" */
4492 int error; /* error return value */ 4564 int error; /* error return value */
@@ -4510,7 +4582,7 @@ xfs_bmap_read_extents(
4510 */ 4582 */
4511 level = be16_to_cpu(block->bb_level); 4583 level = be16_to_cpu(block->bb_level);
4512 ASSERT(level > 0); 4584 ASSERT(level > 0);
4513 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 4585 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
4514 bno = be64_to_cpu(*pp); 4586 bno = be64_to_cpu(*pp);
4515 ASSERT(bno != NULLDFSBNO); 4587 ASSERT(bno != NULLDFSBNO);
4516 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 4588 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -4523,13 +4595,13 @@ xfs_bmap_read_extents(
4523 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4595 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4524 XFS_BMAP_BTREE_REF))) 4596 XFS_BMAP_BTREE_REF)))
4525 return error; 4597 return error;
4526 block = XFS_BUF_TO_BMBT_BLOCK(bp); 4598 block = XFS_BUF_TO_BLOCK(bp);
4527 XFS_WANT_CORRUPTED_GOTO( 4599 XFS_WANT_CORRUPTED_GOTO(
4528 XFS_BMAP_SANITY_CHECK(mp, block, level), 4600 xfs_bmap_sanity_check(mp, bp, level),
4529 error0); 4601 error0);
4530 if (level == 0) 4602 if (level == 0)
4531 break; 4603 break;
4532 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 4604 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
4533 bno = be64_to_cpu(*pp); 4605 bno = be64_to_cpu(*pp);
4534 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 4606 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
4535 xfs_trans_brelse(tp, bp); 4607 xfs_trans_brelse(tp, bp);
@@ -4549,7 +4621,7 @@ xfs_bmap_read_extents(
4549 xfs_extnum_t start; 4621 xfs_extnum_t start;
4550 4622
4551 4623
4552 num_recs = be16_to_cpu(block->bb_numrecs); 4624 num_recs = xfs_btree_get_numrecs(block);
4553 if (unlikely(i + num_recs > room)) { 4625 if (unlikely(i + num_recs > room)) {
4554 ASSERT(i + num_recs <= room); 4626 ASSERT(i + num_recs <= room);
4555 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 4627 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
@@ -4561,18 +4633,18 @@ xfs_bmap_read_extents(
4561 goto error0; 4633 goto error0;
4562 } 4634 }
4563 XFS_WANT_CORRUPTED_GOTO( 4635 XFS_WANT_CORRUPTED_GOTO(
4564 XFS_BMAP_SANITY_CHECK(mp, block, 0), 4636 xfs_bmap_sanity_check(mp, bp, 0),
4565 error0); 4637 error0);
4566 /* 4638 /*
4567 * Read-ahead the next leaf block, if any. 4639 * Read-ahead the next leaf block, if any.
4568 */ 4640 */
4569 nextbno = be64_to_cpu(block->bb_rightsib); 4641 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
4570 if (nextbno != NULLFSBLOCK) 4642 if (nextbno != NULLFSBLOCK)
4571 xfs_btree_reada_bufl(mp, nextbno, 1); 4643 xfs_btree_reada_bufl(mp, nextbno, 1);
4572 /* 4644 /*
4573 * Copy records into the extent records. 4645 * Copy records into the extent records.
4574 */ 4646 */
4575 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); 4647 frp = XFS_BMBT_REC_ADDR(mp, block, 1);
4576 start = i; 4648 start = i;
4577 for (j = 0; j < num_recs; j++, i++, frp++) { 4649 for (j = 0; j < num_recs; j++, i++, frp++) {
4578 xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); 4650 xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
@@ -4603,7 +4675,7 @@ xfs_bmap_read_extents(
4603 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4675 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4604 XFS_BMAP_BTREE_REF))) 4676 XFS_BMAP_BTREE_REF)))
4605 return error; 4677 return error;
4606 block = XFS_BUF_TO_BMBT_BLOCK(bp); 4678 block = XFS_BUF_TO_BLOCK(bp);
4607 } 4679 }
4608 ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); 4680 ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
4609 ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); 4681 ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
@@ -5029,8 +5101,7 @@ xfs_bmapi(
5029 if (abno == NULLFSBLOCK) 5101 if (abno == NULLFSBLOCK)
5030 break; 5102 break;
5031 if ((ifp->if_flags & XFS_IFBROOT) && !cur) { 5103 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
5032 cur = xfs_btree_init_cursor(mp, 5104 cur = xfs_bmbt_init_cursor(mp, tp,
5033 tp, NULL, 0, XFS_BTNUM_BMAP,
5034 ip, whichfork); 5105 ip, whichfork);
5035 cur->bc_private.b.firstblock = 5106 cur->bc_private.b.firstblock =
5036 *firstblock; 5107 *firstblock;
@@ -5147,9 +5218,8 @@ xfs_bmapi(
5147 */ 5218 */
5148 ASSERT(mval->br_blockcount <= len); 5219 ASSERT(mval->br_blockcount <= len);
5149 if ((ifp->if_flags & XFS_IFBROOT) && !cur) { 5220 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
5150 cur = xfs_btree_init_cursor(mp, 5221 cur = xfs_bmbt_init_cursor(mp,
5151 tp, NULL, 0, XFS_BTNUM_BMAP, 5222 tp, ip, whichfork);
5152 ip, whichfork);
5153 cur->bc_private.b.firstblock = 5223 cur->bc_private.b.firstblock =
5154 *firstblock; 5224 *firstblock;
5155 cur->bc_private.b.flist = flist; 5225 cur->bc_private.b.flist = flist;
@@ -5440,8 +5510,7 @@ xfs_bunmapi(
5440 logflags = 0; 5510 logflags = 0;
5441 if (ifp->if_flags & XFS_IFBROOT) { 5511 if (ifp->if_flags & XFS_IFBROOT) {
5442 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); 5512 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
5443 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, 5513 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5444 whichfork);
5445 cur->bc_private.b.firstblock = *firstblock; 5514 cur->bc_private.b.firstblock = *firstblock;
5446 cur->bc_private.b.flist = flist; 5515 cur->bc_private.b.flist = flist;
5447 cur->bc_private.b.flags = 0; 5516 cur->bc_private.b.flags = 0;
@@ -5742,14 +5811,17 @@ error0:
5742STATIC int 5811STATIC int
5743xfs_getbmapx_fix_eof_hole( 5812xfs_getbmapx_fix_eof_hole(
5744 xfs_inode_t *ip, /* xfs incore inode pointer */ 5813 xfs_inode_t *ip, /* xfs incore inode pointer */
5745 struct getbmap *out, /* output structure */ 5814 struct getbmapx *out, /* output structure */
5746 int prealloced, /* this is a file with 5815 int prealloced, /* this is a file with
5747 * preallocated data space */ 5816 * preallocated data space */
5748 __int64_t end, /* last block requested */ 5817 __int64_t end, /* last block requested */
5749 xfs_fsblock_t startblock) 5818 xfs_fsblock_t startblock)
5750{ 5819{
5751 __int64_t fixlen; 5820 __int64_t fixlen;
5752 xfs_mount_t *mp; /* file system mount point */ 5821 xfs_mount_t *mp; /* file system mount point */
5822 xfs_ifork_t *ifp; /* inode fork pointer */
5823 xfs_extnum_t lastx; /* last extent pointer */
5824 xfs_fileoff_t fileblock;
5753 5825
5754 if (startblock == HOLESTARTBLOCK) { 5826 if (startblock == HOLESTARTBLOCK) {
5755 mp = ip->i_mount; 5827 mp = ip->i_mount;
@@ -5763,21 +5835,33 @@ xfs_getbmapx_fix_eof_hole(
5763 out->bmv_length = fixlen; 5835 out->bmv_length = fixlen;
5764 } 5836 }
5765 } else { 5837 } else {
5766 out->bmv_block = XFS_FSB_TO_DB(ip, startblock); 5838 if (startblock == DELAYSTARTBLOCK)
5839 out->bmv_block = -2;
5840 else
5841 out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
5842 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
5843 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
5844 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
5845 (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
5846 out->bmv_oflags |= BMV_OF_LAST;
5767 } 5847 }
5768 5848
5769 return 1; 5849 return 1;
5770} 5850}
5771 5851
5772/* 5852/*
5773 * Fcntl interface to xfs_bmapi. 5853 * Get inode's extents as described in bmv, and format for output.
5854 * Calls formatter to fill the user's buffer until all extents
5855 * are mapped, until the passed-in bmv->bmv_count slots have
5856 * been filled, or until the formatter short-circuits the loop,
5857 * if it is tracking filled-in extents on its own.
5774 */ 5858 */
5775int /* error code */ 5859int /* error code */
5776xfs_getbmap( 5860xfs_getbmap(
5777 xfs_inode_t *ip, 5861 xfs_inode_t *ip,
5778 struct getbmap *bmv, /* user bmap structure */ 5862 struct getbmapx *bmv, /* user bmap structure */
5779 void __user *ap, /* pointer to user's array */ 5863 xfs_bmap_format_t formatter, /* format to user */
5780 int interface) /* interface flags */ 5864 void *arg) /* formatter arg */
5781{ 5865{
5782 __int64_t bmvend; /* last block requested */ 5866 __int64_t bmvend; /* last block requested */
5783 int error; /* return value */ 5867 int error; /* return value */
@@ -5790,19 +5874,17 @@ xfs_getbmap(
5790 int nexleft; /* # of user extents left */ 5874 int nexleft; /* # of user extents left */
5791 int subnex; /* # of bmapi's can do */ 5875 int subnex; /* # of bmapi's can do */
5792 int nmap; /* number of map entries */ 5876 int nmap; /* number of map entries */
5793 struct getbmap out; /* output structure */ 5877 struct getbmapx out; /* output structure */
5794 int whichfork; /* data or attr fork */ 5878 int whichfork; /* data or attr fork */
5795 int prealloced; /* this is a file with 5879 int prealloced; /* this is a file with
5796 * preallocated data space */ 5880 * preallocated data space */
5797 int sh_unwritten; /* true, if unwritten */ 5881 int iflags; /* interface flags */
5798 /* extents listed separately */
5799 int bmapi_flags; /* flags for xfs_bmapi */ 5882 int bmapi_flags; /* flags for xfs_bmapi */
5800 __int32_t oflags; /* getbmapx bmv_oflags field */
5801 5883
5802 mp = ip->i_mount; 5884 mp = ip->i_mount;
5885 iflags = bmv->bmv_iflags;
5803 5886
5804 whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; 5887 whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
5805 sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
5806 5888
5807 /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not 5889 /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
5808 * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ 5890 * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ
@@ -5817,7 +5899,7 @@ xfs_getbmap(
5817 * could misinterpret holes in a DMAPI file as true holes, 5899 * could misinterpret holes in a DMAPI file as true holes,
5818 * when in fact they may represent offline user data. 5900 * when in fact they may represent offline user data.
5819 */ 5901 */
5820 if ((interface & BMV_IF_NO_DMAPI_READ) == 0 && 5902 if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 &&
5821 DM_EVENT_ENABLED(ip, DM_EVENT_READ) && 5903 DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
5822 whichfork == XFS_DATA_FORK) { 5904 whichfork == XFS_DATA_FORK) {
5823 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); 5905 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
@@ -5873,8 +5955,9 @@ xfs_getbmap(
5873 5955
5874 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5956 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5875 5957
5876 if (whichfork == XFS_DATA_FORK && 5958 if (((iflags & BMV_IF_DELALLOC) == 0) &&
5877 (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) { 5959 (whichfork == XFS_DATA_FORK) &&
5960 (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
5878 /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ 5961 /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
5879 error = xfs_flush_pages(ip, (xfs_off_t)0, 5962 error = xfs_flush_pages(ip, (xfs_off_t)0,
5880 -1, 0, FI_REMAPF); 5963 -1, 0, FI_REMAPF);
@@ -5884,7 +5967,8 @@ xfs_getbmap(
5884 } 5967 }
5885 } 5968 }
5886 5969
5887 ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); 5970 ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) ||
5971 ip->i_delayed_blks == 0);
5888 5972
5889 lock = xfs_ilock_map_shared(ip); 5973 lock = xfs_ilock_map_shared(ip);
5890 5974
@@ -5896,7 +5980,7 @@ xfs_getbmap(
5896 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; 5980 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5897 5981
5898 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | 5982 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
5899 ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE); 5983 ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
5900 5984
5901 /* 5985 /*
5902 * Allocate enough space to handle "subnex" maps at a time. 5986 * Allocate enough space to handle "subnex" maps at a time.
@@ -5906,9 +5990,12 @@ xfs_getbmap(
5906 5990
5907 bmv->bmv_entries = 0; 5991 bmv->bmv_entries = 0;
5908 5992
5909 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) { 5993 if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) {
5910 error = 0; 5994 if (((iflags & BMV_IF_DELALLOC) == 0) ||
5911 goto unlock_and_return; 5995 whichfork == XFS_ATTR_FORK) {
5996 error = 0;
5997 goto unlock_and_return;
5998 }
5912 } 5999 }
5913 6000
5914 nexleft = nex; 6001 nexleft = nex;
@@ -5924,52 +6011,40 @@ xfs_getbmap(
5924 ASSERT(nmap <= subnex); 6011 ASSERT(nmap <= subnex);
5925 6012
5926 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { 6013 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
5927 nexleft--; 6014 out.bmv_oflags = 0;
5928 oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ? 6015 if (map[i].br_state == XFS_EXT_UNWRITTEN)
5929 BMV_OF_PREALLOC : 0; 6016 out.bmv_oflags |= BMV_OF_PREALLOC;
6017 else if (map[i].br_startblock == DELAYSTARTBLOCK)
6018 out.bmv_oflags |= BMV_OF_DELALLOC;
5930 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); 6019 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
5931 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 6020 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
5932 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); 6021 out.bmv_unused1 = out.bmv_unused2 = 0;
6022 ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
6023 (map[i].br_startblock != DELAYSTARTBLOCK));
5933 if (map[i].br_startblock == HOLESTARTBLOCK && 6024 if (map[i].br_startblock == HOLESTARTBLOCK &&
5934 whichfork == XFS_ATTR_FORK) { 6025 whichfork == XFS_ATTR_FORK) {
5935 /* came to the end of attribute fork */ 6026 /* came to the end of attribute fork */
6027 out.bmv_oflags |= BMV_OF_LAST;
5936 goto unlock_and_return; 6028 goto unlock_and_return;
5937 } else { 6029 } else {
6030 int full = 0; /* user array is full */
6031
5938 if (!xfs_getbmapx_fix_eof_hole(ip, &out, 6032 if (!xfs_getbmapx_fix_eof_hole(ip, &out,
5939 prealloced, bmvend, 6033 prealloced, bmvend,
5940 map[i].br_startblock)) { 6034 map[i].br_startblock)) {
5941 goto unlock_and_return; 6035 goto unlock_and_return;
5942 } 6036 }
5943 6037
5944 /* return either getbmap/getbmapx structure. */ 6038 /* format results & advance arg */
5945 if (interface & BMV_IF_EXTENDED) { 6039 error = formatter(&arg, &out, &full);
5946 struct getbmapx outx; 6040 if (error || full)
5947 6041 goto unlock_and_return;
5948 GETBMAP_CONVERT(out,outx); 6042 nexleft--;
5949 outx.bmv_oflags = oflags;
5950 outx.bmv_unused1 = outx.bmv_unused2 = 0;
5951 if (copy_to_user(ap, &outx,
5952 sizeof(outx))) {
5953 error = XFS_ERROR(EFAULT);
5954 goto unlock_and_return;
5955 }
5956 } else {
5957 if (copy_to_user(ap, &out,
5958 sizeof(out))) {
5959 error = XFS_ERROR(EFAULT);
5960 goto unlock_and_return;
5961 }
5962 }
5963 bmv->bmv_offset = 6043 bmv->bmv_offset =
5964 out.bmv_offset + out.bmv_length; 6044 out.bmv_offset + out.bmv_length;
5965 bmv->bmv_length = MAX((__int64_t)0, 6045 bmv->bmv_length = MAX((__int64_t)0,
5966 (__int64_t)(bmvend - bmv->bmv_offset)); 6046 (__int64_t)(bmvend - bmv->bmv_offset));
5967 bmv->bmv_entries++; 6047 bmv->bmv_entries++;
5968 ap = (interface & BMV_IF_EXTENDED) ?
5969 (void __user *)
5970 ((struct getbmapx __user *)ap + 1) :
5971 (void __user *)
5972 ((struct getbmap __user *)ap + 1);
5973 } 6048 }
5974 } 6049 }
5975 } while (nmap && nexleft && bmv->bmv_length); 6050 } while (nmap && nexleft && bmv->bmv_length);
@@ -6131,7 +6206,7 @@ xfs_bmap_get_bp(
6131 6206
6132void 6207void
6133xfs_check_block( 6208xfs_check_block(
6134 xfs_bmbt_block_t *block, 6209 struct xfs_btree_block *block,
6135 xfs_mount_t *mp, 6210 xfs_mount_t *mp,
6136 int root, 6211 int root,
6137 short sz) 6212 short sz)
@@ -6143,36 +6218,29 @@ xfs_check_block(
6143 ASSERT(be16_to_cpu(block->bb_level) > 0); 6218 ASSERT(be16_to_cpu(block->bb_level) > 0);
6144 6219
6145 prevp = NULL; 6220 prevp = NULL;
6146 for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) { 6221 for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
6147 dmxr = mp->m_bmap_dmxr[0]; 6222 dmxr = mp->m_bmap_dmxr[0];
6148 6223 keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
6149 if (root) {
6150 keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
6151 } else {
6152 keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i);
6153 }
6154 6224
6155 if (prevp) { 6225 if (prevp) {
6156 xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp); 6226 ASSERT(be64_to_cpu(prevp->br_startoff) <
6227 be64_to_cpu(keyp->br_startoff));
6157 } 6228 }
6158 prevp = keyp; 6229 prevp = keyp;
6159 6230
6160 /* 6231 /*
6161 * Compare the block numbers to see if there are dups. 6232 * Compare the block numbers to see if there are dups.
6162 */ 6233 */
6234 if (root)
6235 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
6236 else
6237 pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
6163 6238
6164 if (root) {
6165 pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
6166 } else {
6167 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr);
6168 }
6169 for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { 6239 for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
6170 if (root) { 6240 if (root)
6171 thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); 6241 thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
6172 } else { 6242 else
6173 thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j, 6243 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
6174 dmxr);
6175 }
6176 if (*thispa == *pp) { 6244 if (*thispa == *pp) {
6177 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 6245 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
6178 __func__, j, i, 6246 __func__, j, i,
@@ -6195,7 +6263,7 @@ xfs_bmap_check_leaf_extents(
6195 xfs_inode_t *ip, /* incore inode pointer */ 6263 xfs_inode_t *ip, /* incore inode pointer */
6196 int whichfork) /* data or attr fork */ 6264 int whichfork) /* data or attr fork */
6197{ 6265{
6198 xfs_bmbt_block_t *block; /* current btree block */ 6266 struct xfs_btree_block *block; /* current btree block */
6199 xfs_fsblock_t bno; /* block # of "block" */ 6267 xfs_fsblock_t bno; /* block # of "block" */
6200 xfs_buf_t *bp; /* buffer for "block" */ 6268 xfs_buf_t *bp; /* buffer for "block" */
6201 int error; /* error return value */ 6269 int error; /* error return value */
@@ -6223,7 +6291,7 @@ xfs_bmap_check_leaf_extents(
6223 level = be16_to_cpu(block->bb_level); 6291 level = be16_to_cpu(block->bb_level);
6224 ASSERT(level > 0); 6292 ASSERT(level > 0);
6225 xfs_check_block(block, mp, 1, ifp->if_broot_bytes); 6293 xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
6226 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6294 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
6227 bno = be64_to_cpu(*pp); 6295 bno = be64_to_cpu(*pp);
6228 6296
6229 ASSERT(bno != NULLDFSBNO); 6297 ASSERT(bno != NULLDFSBNO);
@@ -6245,9 +6313,9 @@ xfs_bmap_check_leaf_extents(
6245 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, 6313 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6246 XFS_BMAP_BTREE_REF))) 6314 XFS_BMAP_BTREE_REF)))
6247 goto error_norelse; 6315 goto error_norelse;
6248 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6316 block = XFS_BUF_TO_BLOCK(bp);
6249 XFS_WANT_CORRUPTED_GOTO( 6317 XFS_WANT_CORRUPTED_GOTO(
6250 XFS_BMAP_SANITY_CHECK(mp, block, level), 6318 xfs_bmap_sanity_check(mp, bp, level),
6251 error0); 6319 error0);
6252 if (level == 0) 6320 if (level == 0)
6253 break; 6321 break;
@@ -6258,7 +6326,7 @@ xfs_bmap_check_leaf_extents(
6258 */ 6326 */
6259 6327
6260 xfs_check_block(block, mp, 0, 0); 6328 xfs_check_block(block, mp, 0, 0);
6261 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 6329 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
6262 bno = be64_to_cpu(*pp); 6330 bno = be64_to_cpu(*pp);
6263 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 6331 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
6264 if (bp_release) { 6332 if (bp_release) {
@@ -6280,13 +6348,13 @@ xfs_bmap_check_leaf_extents(
6280 xfs_extnum_t num_recs; 6348 xfs_extnum_t num_recs;
6281 6349
6282 6350
6283 num_recs = be16_to_cpu(block->bb_numrecs); 6351 num_recs = xfs_btree_get_numrecs(block);
6284 6352
6285 /* 6353 /*
6286 * Read-ahead the next leaf block, if any. 6354 * Read-ahead the next leaf block, if any.
6287 */ 6355 */
6288 6356
6289 nextbno = be64_to_cpu(block->bb_rightsib); 6357 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6290 6358
6291 /* 6359 /*
6292 * Check all the extents to make sure they are OK. 6360 * Check all the extents to make sure they are OK.
@@ -6294,13 +6362,17 @@ xfs_bmap_check_leaf_extents(
6294 * conform with the first entry in this one. 6362 * conform with the first entry in this one.
6295 */ 6363 */
6296 6364
6297 ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); 6365 ep = XFS_BMBT_REC_ADDR(mp, block, 1);
6298 if (i) { 6366 if (i) {
6299 xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep); 6367 ASSERT(xfs_bmbt_disk_get_startoff(&last) +
6368 xfs_bmbt_disk_get_blockcount(&last) <=
6369 xfs_bmbt_disk_get_startoff(ep));
6300 } 6370 }
6301 for (j = 1; j < num_recs; j++) { 6371 for (j = 1; j < num_recs; j++) {
6302 nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); 6372 nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
6303 xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp); 6373 ASSERT(xfs_bmbt_disk_get_startoff(ep) +
6374 xfs_bmbt_disk_get_blockcount(ep) <=
6375 xfs_bmbt_disk_get_startoff(nextp));
6304 ep = nextp; 6376 ep = nextp;
6305 } 6377 }
6306 6378
@@ -6326,7 +6398,7 @@ xfs_bmap_check_leaf_extents(
6326 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, 6398 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6327 XFS_BMAP_BTREE_REF))) 6399 XFS_BMAP_BTREE_REF)))
6328 goto error_norelse; 6400 goto error_norelse;
6329 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6401 block = XFS_BUF_TO_BLOCK(bp);
6330 } 6402 }
6331 if (bp_release) { 6403 if (bp_release) {
6332 bp_release = 0; 6404 bp_release = 0;
@@ -6356,7 +6428,7 @@ xfs_bmap_count_blocks(
6356 int whichfork, /* data or attr fork */ 6428 int whichfork, /* data or attr fork */
6357 int *count) /* out: count of blocks */ 6429 int *count) /* out: count of blocks */
6358{ 6430{
6359 xfs_bmbt_block_t *block; /* current btree block */ 6431 struct xfs_btree_block *block; /* current btree block */
6360 xfs_fsblock_t bno; /* block # of "block" */ 6432 xfs_fsblock_t bno; /* block # of "block" */
6361 xfs_ifork_t *ifp; /* fork structure */ 6433 xfs_ifork_t *ifp; /* fork structure */
6362 int level; /* btree level, for checking */ 6434 int level; /* btree level, for checking */
@@ -6379,7 +6451,7 @@ xfs_bmap_count_blocks(
6379 block = ifp->if_broot; 6451 block = ifp->if_broot;
6380 level = be16_to_cpu(block->bb_level); 6452 level = be16_to_cpu(block->bb_level);
6381 ASSERT(level > 0); 6453 ASSERT(level > 0);
6382 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6454 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
6383 bno = be64_to_cpu(*pp); 6455 bno = be64_to_cpu(*pp);
6384 ASSERT(bno != NULLDFSBNO); 6456 ASSERT(bno != NULLDFSBNO);
6385 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 6457 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -6413,29 +6485,29 @@ xfs_bmap_count_tree(
6413 __be64 *pp; 6485 __be64 *pp;
6414 xfs_fsblock_t bno = blockno; 6486 xfs_fsblock_t bno = blockno;
6415 xfs_fsblock_t nextbno; 6487 xfs_fsblock_t nextbno;
6416 xfs_bmbt_block_t *block, *nextblock; 6488 struct xfs_btree_block *block, *nextblock;
6417 int numrecs; 6489 int numrecs;
6418 6490
6419 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) 6491 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
6420 return error; 6492 return error;
6421 *count += 1; 6493 *count += 1;
6422 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6494 block = XFS_BUF_TO_BLOCK(bp);
6423 6495
6424 if (--level) { 6496 if (--level) {
6425 /* Not at node above leafs, count this level of nodes */ 6497 /* Not at node above leafs, count this level of nodes */
6426 nextbno = be64_to_cpu(block->bb_rightsib); 6498 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6427 while (nextbno != NULLFSBLOCK) { 6499 while (nextbno != NULLFSBLOCK) {
6428 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6500 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
6429 0, &nbp, XFS_BMAP_BTREE_REF))) 6501 0, &nbp, XFS_BMAP_BTREE_REF)))
6430 return error; 6502 return error;
6431 *count += 1; 6503 *count += 1;
6432 nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp); 6504 nextblock = XFS_BUF_TO_BLOCK(nbp);
6433 nextbno = be64_to_cpu(nextblock->bb_rightsib); 6505 nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
6434 xfs_trans_brelse(tp, nbp); 6506 xfs_trans_brelse(tp, nbp);
6435 } 6507 }
6436 6508
6437 /* Dive to the next level */ 6509 /* Dive to the next level */
6438 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 6510 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
6439 bno = be64_to_cpu(*pp); 6511 bno = be64_to_cpu(*pp);
6440 if (unlikely((error = 6512 if (unlikely((error =
6441 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { 6513 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
@@ -6448,9 +6520,9 @@ xfs_bmap_count_tree(
6448 } else { 6520 } else {
6449 /* count all level 1 nodes and their leaves */ 6521 /* count all level 1 nodes and their leaves */
6450 for (;;) { 6522 for (;;) {
6451 nextbno = be64_to_cpu(block->bb_rightsib); 6523 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6452 numrecs = be16_to_cpu(block->bb_numrecs); 6524 numrecs = be16_to_cpu(block->bb_numrecs);
6453 xfs_bmap_disk_count_leaves(0, block, numrecs, count); 6525 xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
6454 xfs_trans_brelse(tp, bp); 6526 xfs_trans_brelse(tp, bp);
6455 if (nextbno == NULLFSBLOCK) 6527 if (nextbno == NULLFSBLOCK)
6456 break; 6528 break;
@@ -6459,7 +6531,7 @@ xfs_bmap_count_tree(
6459 XFS_BMAP_BTREE_REF))) 6531 XFS_BMAP_BTREE_REF)))
6460 return error; 6532 return error;
6461 *count += 1; 6533 *count += 1;
6462 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6534 block = XFS_BUF_TO_BLOCK(bp);
6463 } 6535 }
6464 } 6536 }
6465 return 0; 6537 return 0;
@@ -6489,8 +6561,8 @@ xfs_bmap_count_leaves(
6489 */ 6561 */
6490STATIC void 6562STATIC void
6491xfs_bmap_disk_count_leaves( 6563xfs_bmap_disk_count_leaves(
6492 xfs_extnum_t idx, 6564 struct xfs_mount *mp,
6493 xfs_bmbt_block_t *block, 6565 struct xfs_btree_block *block,
6494 int numrecs, 6566 int numrecs,
6495 int *count) 6567 int *count)
6496{ 6568{
@@ -6498,7 +6570,7 @@ xfs_bmap_disk_count_leaves(
6498 xfs_bmbt_rec_t *frp; 6570 xfs_bmbt_rec_t *frp;
6499 6571
6500 for (b = 1; b <= numrecs; b++) { 6572 for (b = 1; b <= numrecs; b++) {
6501 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); 6573 frp = XFS_BMBT_REC_ADDR(mp, block, b);
6502 *count += xfs_bmbt_disk_get_blockcount(frp); 6574 *count += xfs_bmbt_disk_get_blockcount(frp);
6503 } 6575 }
6504} 6576}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 9f3e3a836d15..284571c05ed0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -137,9 +137,7 @@ typedef struct xfs_bmalloca {
137 char conv; /* overwriting unwritten extents */ 137 char conv; /* overwriting unwritten extents */
138} xfs_bmalloca_t; 138} xfs_bmalloca_t;
139 139
140#ifdef __KERNEL__ 140#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
141
142#if defined(XFS_BMAP_TRACE)
143/* 141/*
144 * Trace operations for bmap extent tracing 142 * Trace operations for bmap extent tracing
145 */ 143 */
@@ -163,9 +161,12 @@ xfs_bmap_trace_exlist(
163 int whichfork); /* data or attr fork */ 161 int whichfork); /* data or attr fork */
164#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 162#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
165 xfs_bmap_trace_exlist(__func__,ip,c,w) 163 xfs_bmap_trace_exlist(__func__,ip,c,w)
166#else 164
165#else /* __KERNEL__ && XFS_BMAP_TRACE */
166
167#define XFS_BMAP_TRACE_EXLIST(ip,c,w) 167#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
168#endif 168
169#endif /* __KERNEL__ && XFS_BMAP_TRACE */
169 170
170/* 171/*
171 * Convert inode from non-attributed to attributed. 172 * Convert inode from non-attributed to attributed.
@@ -206,20 +207,6 @@ xfs_bmap_compute_maxlevels(
206 int whichfork); /* data or attr fork */ 207 int whichfork); /* data or attr fork */
207 208
208/* 209/*
209 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
210 * caller. Frees all the extents that need freeing, which must be done
211 * last due to locking considerations.
212 *
213 * Return 1 if the given transaction was committed and a new one allocated,
214 * and 0 otherwise.
215 */
216int /* error */
217xfs_bmap_finish(
218 struct xfs_trans **tp, /* transaction pointer addr */
219 xfs_bmap_free_t *flist, /* i/o: list extents to free */
220 int *committed); /* xact committed or not */
221
222/*
223 * Returns the file-relative block number of the first unused block in the file. 210 * Returns the file-relative block number of the first unused block in the file.
224 * This is the lowest-address hole if the file has holes, else the first block 211 * This is the lowest-address hole if the file has holes, else the first block
225 * past the end of file. 212 * past the end of file.
@@ -344,14 +331,43 @@ xfs_bunmapi(
344 int *done); /* set if not done yet */ 331 int *done); /* set if not done yet */
345 332
346/* 333/*
347 * Fcntl interface to xfs_bmapi. 334 * Check an extent list, which has just been read, for
335 * any bit in the extent flag field.
336 */
337int
338xfs_check_nostate_extents(
339 struct xfs_ifork *ifp,
340 xfs_extnum_t idx,
341 xfs_extnum_t num);
342
343#ifdef __KERNEL__
344
345/*
346 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
347 * caller. Frees all the extents that need freeing, which must be done
348 * last due to locking considerations.
349 *
350 * Return 1 if the given transaction was committed and a new one allocated,
351 * and 0 otherwise.
352 */
353int /* error */
354xfs_bmap_finish(
355 struct xfs_trans **tp, /* transaction pointer addr */
356 xfs_bmap_free_t *flist, /* i/o: list extents to free */
357 int *committed); /* xact committed or not */
358
359/* bmap to userspace formatter - copy to user & advance pointer */
360typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
361
362/*
363 * Get inode's extents as described in bmv, and format for output.
348 */ 364 */
349int /* error code */ 365int /* error code */
350xfs_getbmap( 366xfs_getbmap(
351 xfs_inode_t *ip, 367 xfs_inode_t *ip,
352 struct getbmap *bmv, /* user bmap structure */ 368 struct getbmapx *bmv, /* user bmap structure */
353 void __user *ap, /* pointer to user's array */ 369 xfs_bmap_format_t formatter, /* format to user */
354 int iflags); /* interface flags */ 370 void *arg); /* formatter arg */
355 371
356/* 372/*
357 * Check if the endoff is outside the last extent. If so the caller will grow 373 * Check if the endoff is outside the last extent. If so the caller will grow
@@ -375,16 +391,6 @@ xfs_bmap_count_blocks(
375 int *count); 391 int *count);
376 392
377/* 393/*
378 * Check an extent list, which has just been read, for
379 * any bit in the extent flag field.
380 */
381int
382xfs_check_nostate_extents(
383 struct xfs_ifork *ifp,
384 xfs_extnum_t idx,
385 xfs_extnum_t num);
386
387/*
388 * Search the extent records for the entry containing block bno. 394 * Search the extent records for the entry containing block bno.
389 * If bno lies in a hole, point to the next entry. If bno lies 395 * If bno lies in a hole, point to the next entry. If bno lies
390 * past eof, *eofp will be set, and *prevp will contain the last 396 * past eof, *eofp will be set, and *prevp will contain the last
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 23efad29a5cd..8f1ec73725d3 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -37,1406 +37,13 @@
37#include "xfs_inode_item.h" 37#include "xfs_inode_item.h"
38#include "xfs_alloc.h" 38#include "xfs_alloc.h"
39#include "xfs_btree.h" 39#include "xfs_btree.h"
40#include "xfs_btree_trace.h"
40#include "xfs_ialloc.h" 41#include "xfs_ialloc.h"
41#include "xfs_itable.h" 42#include "xfs_itable.h"
42#include "xfs_bmap.h" 43#include "xfs_bmap.h"
43#include "xfs_error.h" 44#include "xfs_error.h"
44#include "xfs_quota.h" 45#include "xfs_quota.h"
45 46
46#if defined(XFS_BMBT_TRACE)
47ktrace_t *xfs_bmbt_trace_buf;
48#endif
49
50/*
51 * Prototypes for internal btree functions.
52 */
53
54
55STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
56STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
57STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
58STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
59STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
60STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
61 __uint64_t *, xfs_btree_cur_t **, int *);
62STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
63
64
65#if defined(XFS_BMBT_TRACE)
66
67static char ARGS[] = "args";
68static char ENTRY[] = "entry";
69static char ERROR[] = "error";
70#undef EXIT
71static char EXIT[] = "exit";
72
73/*
74 * Add a trace buffer entry for the arguments given to the routine,
75 * generic form.
76 */
77STATIC void
78xfs_bmbt_trace_enter(
79 const char *func,
80 xfs_btree_cur_t *cur,
81 char *s,
82 int type,
83 int line,
84 __psunsigned_t a0,
85 __psunsigned_t a1,
86 __psunsigned_t a2,
87 __psunsigned_t a3,
88 __psunsigned_t a4,
89 __psunsigned_t a5,
90 __psunsigned_t a6,
91 __psunsigned_t a7,
92 __psunsigned_t a8,
93 __psunsigned_t a9,
94 __psunsigned_t a10)
95{
96 xfs_inode_t *ip;
97 int whichfork;
98
99 ip = cur->bc_private.b.ip;
100 whichfork = cur->bc_private.b.whichfork;
101 ktrace_enter(xfs_bmbt_trace_buf,
102 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
103 (void *)func, (void *)s, (void *)ip, (void *)cur,
104 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
105 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
106 (void *)a8, (void *)a9, (void *)a10);
107 ASSERT(ip->i_btrace);
108 ktrace_enter(ip->i_btrace,
109 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
110 (void *)func, (void *)s, (void *)ip, (void *)cur,
111 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
112 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
113 (void *)a8, (void *)a9, (void *)a10);
114}
115/*
116 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
117 */
118STATIC void
119xfs_bmbt_trace_argbi(
120 const char *func,
121 xfs_btree_cur_t *cur,
122 xfs_buf_t *b,
123 int i,
124 int line)
125{
126 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
127 (__psunsigned_t)b, i, 0, 0,
128 0, 0, 0, 0,
129 0, 0, 0);
130}
131
132/*
133 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
134 */
135STATIC void
136xfs_bmbt_trace_argbii(
137 const char *func,
138 xfs_btree_cur_t *cur,
139 xfs_buf_t *b,
140 int i0,
141 int i1,
142 int line)
143{
144 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
145 (__psunsigned_t)b, i0, i1, 0,
146 0, 0, 0, 0,
147 0, 0, 0);
148}
149
150/*
151 * Add a trace buffer entry for arguments, for 3 block-length args
152 * and an integer arg.
153 */
154STATIC void
155xfs_bmbt_trace_argfffi(
156 const char *func,
157 xfs_btree_cur_t *cur,
158 xfs_dfiloff_t o,
159 xfs_dfsbno_t b,
160 xfs_dfilblks_t i,
161 int j,
162 int line)
163{
164 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
165 o >> 32, (int)o, b >> 32, (int)b,
166 i >> 32, (int)i, (int)j, 0,
167 0, 0, 0);
168}
169
170/*
171 * Add a trace buffer entry for arguments, for one integer arg.
172 */
173STATIC void
174xfs_bmbt_trace_argi(
175 const char *func,
176 xfs_btree_cur_t *cur,
177 int i,
178 int line)
179{
180 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
181 i, 0, 0, 0,
182 0, 0, 0, 0,
183 0, 0, 0);
184}
185
186/*
187 * Add a trace buffer entry for arguments, for int, fsblock, key.
188 */
189STATIC void
190xfs_bmbt_trace_argifk(
191 const char *func,
192 xfs_btree_cur_t *cur,
193 int i,
194 xfs_fsblock_t f,
195 xfs_dfiloff_t o,
196 int line)
197{
198 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
199 i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32,
200 (int)o, 0, 0, 0,
201 0, 0, 0);
202}
203
204/*
205 * Add a trace buffer entry for arguments, for int, fsblock, rec.
206 */
207STATIC void
208xfs_bmbt_trace_argifr(
209 const char *func,
210 xfs_btree_cur_t *cur,
211 int i,
212 xfs_fsblock_t f,
213 xfs_bmbt_rec_t *r,
214 int line)
215{
216 xfs_dfsbno_t b;
217 xfs_dfilblks_t c;
218 xfs_dfsbno_t d;
219 xfs_dfiloff_t o;
220 xfs_bmbt_irec_t s;
221
222 d = (xfs_dfsbno_t)f;
223 xfs_bmbt_disk_get_all(r, &s);
224 o = (xfs_dfiloff_t)s.br_startoff;
225 b = (xfs_dfsbno_t)s.br_startblock;
226 c = s.br_blockcount;
227 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
228 i, d >> 32, (int)d, o >> 32,
229 (int)o, b >> 32, (int)b, c >> 32,
230 (int)c, 0, 0);
231}
232
233/*
234 * Add a trace buffer entry for arguments, for int, key.
235 */
236STATIC void
237xfs_bmbt_trace_argik(
238 const char *func,
239 xfs_btree_cur_t *cur,
240 int i,
241 xfs_bmbt_key_t *k,
242 int line)
243{
244 xfs_dfiloff_t o;
245
246 o = be64_to_cpu(k->br_startoff);
247 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
248 i, o >> 32, (int)o, 0,
249 0, 0, 0, 0,
250 0, 0, 0);
251}
252
253/*
254 * Add a trace buffer entry for the cursor/operation.
255 */
256STATIC void
257xfs_bmbt_trace_cursor(
258 const char *func,
259 xfs_btree_cur_t *cur,
260 char *s,
261 int line)
262{
263 xfs_bmbt_rec_host_t r;
264
265 xfs_bmbt_set_all(&r, &cur->bc_rec.b);
266 xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
267 (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
268 cur->bc_private.b.allocated,
269 r.l0 >> 32, (int)r.l0,
270 r.l1 >> 32, (int)r.l1,
271 (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
272 (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
273 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
274 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
275}
276
277#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
278 xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
279#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
280 xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
281#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
282 xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
283#define XFS_BMBT_TRACE_ARGI(c,i) \
284 xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
285#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \
286 xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
287#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
288 xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
289#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
290 xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
291#define XFS_BMBT_TRACE_CURSOR(c,s) \
292 xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
293#else
294#define XFS_BMBT_TRACE_ARGBI(c,b,i)
295#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
296#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
297#define XFS_BMBT_TRACE_ARGI(c,i)
298#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s)
299#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
300#define XFS_BMBT_TRACE_ARGIK(c,i,k)
301#define XFS_BMBT_TRACE_CURSOR(c,s)
302#endif /* XFS_BMBT_TRACE */
303
304
305/*
306 * Internal functions.
307 */
308
309/*
310 * Delete record pointed to by cur/level.
311 */
312STATIC int /* error */
313xfs_bmbt_delrec(
314 xfs_btree_cur_t *cur,
315 int level,
316 int *stat) /* success/failure */
317{
318 xfs_bmbt_block_t *block; /* bmap btree block */
319 xfs_fsblock_t bno; /* fs-relative block number */
320 xfs_buf_t *bp; /* buffer for block */
321 int error; /* error return value */
322 int i; /* loop counter */
323 int j; /* temp state */
324 xfs_bmbt_key_t key; /* bmap btree key */
325 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
326 xfs_fsblock_t lbno; /* left sibling block number */
327 xfs_buf_t *lbp; /* left buffer pointer */
328 xfs_bmbt_block_t *left; /* left btree block */
329 xfs_bmbt_key_t *lkp; /* left btree key */
330 xfs_bmbt_ptr_t *lpp; /* left address pointer */
331 int lrecs=0; /* left record count */
332 xfs_bmbt_rec_t *lrp; /* left record pointer */
333 xfs_mount_t *mp; /* file system mount point */
334 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
335 int ptr; /* key/record index */
336 xfs_fsblock_t rbno; /* right sibling block number */
337 xfs_buf_t *rbp; /* right buffer pointer */
338 xfs_bmbt_block_t *right; /* right btree block */
339 xfs_bmbt_key_t *rkp; /* right btree key */
340 xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */
341 xfs_bmbt_ptr_t *rpp; /* right address pointer */
342 xfs_bmbt_block_t *rrblock; /* right-right btree block */
343 xfs_buf_t *rrbp; /* right-right buffer pointer */
344 int rrecs=0; /* right record count */
345 xfs_bmbt_rec_t *rrp; /* right record pointer */
346 xfs_btree_cur_t *tcur; /* temporary btree cursor */
347 int numrecs; /* temporary numrec count */
348 int numlrecs, numrrecs;
349
350 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
351 XFS_BMBT_TRACE_ARGI(cur, level);
352 ptr = cur->bc_ptrs[level];
353 tcur = NULL;
354 if (ptr == 0) {
355 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
356 *stat = 0;
357 return 0;
358 }
359 block = xfs_bmbt_get_block(cur, level, &bp);
360 numrecs = be16_to_cpu(block->bb_numrecs);
361#ifdef DEBUG
362 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
363 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
364 goto error0;
365 }
366#endif
367 if (ptr > numrecs) {
368 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
369 *stat = 0;
370 return 0;
371 }
372 XFS_STATS_INC(xs_bmbt_delrec);
373 if (level > 0) {
374 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
375 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
376#ifdef DEBUG
377 for (i = ptr; i < numrecs; i++) {
378 if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
379 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
380 goto error0;
381 }
382 }
383#endif
384 if (ptr < numrecs) {
385 memmove(&kp[ptr - 1], &kp[ptr],
386 (numrecs - ptr) * sizeof(*kp));
387 memmove(&pp[ptr - 1], &pp[ptr],
388 (numrecs - ptr) * sizeof(*pp));
389 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
390 xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
391 }
392 } else {
393 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
394 if (ptr < numrecs) {
395 memmove(&rp[ptr - 1], &rp[ptr],
396 (numrecs - ptr) * sizeof(*rp));
397 xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
398 }
399 if (ptr == 1) {
400 key.br_startoff =
401 cpu_to_be64(xfs_bmbt_disk_get_startoff(rp));
402 kp = &key;
403 }
404 }
405 numrecs--;
406 block->bb_numrecs = cpu_to_be16(numrecs);
407 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
408 /*
409 * We're at the root level.
410 * First, shrink the root block in-memory.
411 * Try to get rid of the next level down.
412 * If we can't then there's nothing left to do.
413 */
414 if (level == cur->bc_nlevels - 1) {
415 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
416 cur->bc_private.b.whichfork);
417 if ((error = xfs_bmbt_killroot(cur))) {
418 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
419 goto error0;
420 }
421 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
422 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
423 goto error0;
424 }
425 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
426 *stat = 1;
427 return 0;
428 }
429 if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
430 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
431 goto error0;
432 }
433 if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
434 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
435 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
436 goto error0;
437 }
438 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
439 *stat = 1;
440 return 0;
441 }
442 rbno = be64_to_cpu(block->bb_rightsib);
443 lbno = be64_to_cpu(block->bb_leftsib);
444 /*
445 * One child of root, need to get a chance to copy its contents
446 * into the root and delete it. Can't go up to next level,
447 * there's nothing to delete there.
448 */
449 if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
450 level == cur->bc_nlevels - 2) {
451 if ((error = xfs_bmbt_killroot(cur))) {
452 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
453 goto error0;
454 }
455 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
456 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
457 goto error0;
458 }
459 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
460 *stat = 1;
461 return 0;
462 }
463 ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
464 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
465 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
466 goto error0;
467 }
468 bno = NULLFSBLOCK;
469 if (rbno != NULLFSBLOCK) {
470 i = xfs_btree_lastrec(tcur, level);
471 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
472 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
473 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
474 goto error0;
475 }
476 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
477 i = xfs_btree_lastrec(tcur, level);
478 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
479 rbp = tcur->bc_bufs[level];
480 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
481#ifdef DEBUG
482 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
483 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
484 goto error0;
485 }
486#endif
487 bno = be64_to_cpu(right->bb_leftsib);
488 if (be16_to_cpu(right->bb_numrecs) - 1 >=
489 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
490 if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
491 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
492 goto error0;
493 }
494 if (i) {
495 ASSERT(be16_to_cpu(block->bb_numrecs) >=
496 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
497 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
498 tcur = NULL;
499 if (level > 0) {
500 if ((error = xfs_bmbt_decrement(cur,
501 level, &i))) {
502 XFS_BMBT_TRACE_CURSOR(cur,
503 ERROR);
504 goto error0;
505 }
506 }
507 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
508 *stat = 1;
509 return 0;
510 }
511 }
512 rrecs = be16_to_cpu(right->bb_numrecs);
513 if (lbno != NULLFSBLOCK) {
514 i = xfs_btree_firstrec(tcur, level);
515 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
516 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
517 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
518 goto error0;
519 }
520 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
521 }
522 }
523 if (lbno != NULLFSBLOCK) {
524 i = xfs_btree_firstrec(tcur, level);
525 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
526 /*
527 * decrement to last in block
528 */
529 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
530 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
531 goto error0;
532 }
533 i = xfs_btree_firstrec(tcur, level);
534 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
535 lbp = tcur->bc_bufs[level];
536 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
537#ifdef DEBUG
538 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
539 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
540 goto error0;
541 }
542#endif
543 bno = be64_to_cpu(left->bb_rightsib);
544 if (be16_to_cpu(left->bb_numrecs) - 1 >=
545 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
546 if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
547 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
548 goto error0;
549 }
550 if (i) {
551 ASSERT(be16_to_cpu(block->bb_numrecs) >=
552 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
553 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
554 tcur = NULL;
555 if (level == 0)
556 cur->bc_ptrs[0]++;
557 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
558 *stat = 1;
559 return 0;
560 }
561 }
562 lrecs = be16_to_cpu(left->bb_numrecs);
563 }
564 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
565 tcur = NULL;
566 mp = cur->bc_mp;
567 ASSERT(bno != NULLFSBLOCK);
568 if (lbno != NULLFSBLOCK &&
569 lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
570 rbno = bno;
571 right = block;
572 rbp = bp;
573 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
574 XFS_BMAP_BTREE_REF))) {
575 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
576 goto error0;
577 }
578 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
579 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
580 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
581 goto error0;
582 }
583 } else if (rbno != NULLFSBLOCK &&
584 rrecs + be16_to_cpu(block->bb_numrecs) <=
585 XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
586 lbno = bno;
587 left = block;
588 lbp = bp;
589 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
590 XFS_BMAP_BTREE_REF))) {
591 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
592 goto error0;
593 }
594 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
595 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
596 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
597 goto error0;
598 }
599 lrecs = be16_to_cpu(left->bb_numrecs);
600 } else {
601 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
602 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
603 goto error0;
604 }
605 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
606 *stat = 1;
607 return 0;
608 }
609 numlrecs = be16_to_cpu(left->bb_numrecs);
610 numrrecs = be16_to_cpu(right->bb_numrecs);
611 if (level > 0) {
612 lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
613 lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
614 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
615 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
616#ifdef DEBUG
617 for (i = 0; i < numrrecs; i++) {
618 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
619 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
620 goto error0;
621 }
622 }
623#endif
624 memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
625 memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
626 xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
627 xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
628 } else {
629 lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
630 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
631 memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
632 xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
633 }
634 be16_add_cpu(&left->bb_numrecs, numrrecs);
635 left->bb_rightsib = right->bb_rightsib;
636 xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
637 if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) {
638 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
639 be64_to_cpu(left->bb_rightsib),
640 0, &rrbp, XFS_BMAP_BTREE_REF))) {
641 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
642 goto error0;
643 }
644 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
645 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
646 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
647 goto error0;
648 }
649 rrblock->bb_leftsib = cpu_to_be64(lbno);
650 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
651 }
652 xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
653 cur->bc_private.b.flist, mp);
654 cur->bc_private.b.ip->i_d.di_nblocks--;
655 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
656 XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
657 XFS_TRANS_DQ_BCOUNT, -1L);
658 xfs_trans_binval(cur->bc_tp, rbp);
659 if (bp != lbp) {
660 cur->bc_bufs[level] = lbp;
661 cur->bc_ptrs[level] += lrecs;
662 cur->bc_ra[level] = 0;
663 } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
664 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
665 goto error0;
666 }
667 if (level > 0)
668 cur->bc_ptrs[level]--;
669 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
670 *stat = 2;
671 return 0;
672
673error0:
674 if (tcur)
675 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
676 return error;
677}
678
/*
 * Insert one record/level. Return information to the caller
 * allowing the next level up to proceed if necessary.
 *
 * On success *stat is 1; *stat is 0 when the insert could not be done
 * (no valid insertion point, or a required root split failed).  If the
 * block had to be split, *bnop returns the new right-hand block (for
 * insertion into the parent), *recp the key/record for that entry, and
 * *curp a second cursor for the caller to continue with; otherwise
 * *bnop returns NULLFSBLOCK.
 */
STATIC int					/* error */
xfs_bmbt_insrec(
	xfs_btree_cur_t		*cur,
	int			level,
	xfs_fsblock_t		*bnop,
	xfs_bmbt_rec_t		*recp,
	xfs_btree_cur_t		**curp,
	int			*stat)		/* no-go/done/continue */
{
	xfs_bmbt_block_t	*block;		/* bmap btree block */
	xfs_buf_t		*bp;		/* buffer for block */
	int			error;		/* error return value */
	int			i;		/* loop index */
	xfs_bmbt_key_t		key;		/* bmap btree key */
	xfs_bmbt_key_t		*kp=NULL;	/* pointer to bmap btree key */
	int			logflags;	/* inode logging flags */
	xfs_fsblock_t		nbno;		/* new block number */
	struct xfs_btree_cur	*ncur;		/* new btree cursor */
	__uint64_t		startoff;	/* new btree key value */
	xfs_bmbt_rec_t		nrec;		/* new record count */
	int			optr;		/* old key/record index */
	xfs_bmbt_ptr_t		*pp;		/* pointer to bmap block addr */
	int			ptr;		/* key/record index */
	xfs_bmbt_rec_t		*rp=NULL;	/* pointer to bmap btree rec */
	int			numrecs;

	ASSERT(level < cur->bc_nlevels);
	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
	ncur = NULL;
	key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp));
	optr = ptr = cur->bc_ptrs[level];
	/* A cursor position of zero means there is no insertion point. */
	if (ptr == 0) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	XFS_STATS_INC(xs_bmbt_insrec);
	block = xfs_bmbt_get_block(cur, level, &bp);
	numrecs = be16_to_cpu(block->bb_numrecs);
#ifdef DEBUG
	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	/* The new entry must sort before the one currently at ptr. */
	if (ptr <= numrecs) {
		if (level == 0) {
			rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
			xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
		} else {
			kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
			xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
		}
	}
#endif
	nbno = NULLFSBLOCK;
	/*
	 * Block is full.  Make room by (in order of preference) growing
	 * the inode root, starting a new root level, shifting a record
	 * into a sibling, or finally splitting this block.
	 */
	if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
		if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
			/*
			 * A root block, that can be made bigger.
			 */
			xfs_iroot_realloc(cur->bc_private.b.ip, 1,
				cur->bc_private.b.whichfork);
			block = xfs_bmbt_get_block(cur, level, &bp);
		} else if (level == cur->bc_nlevels - 1) {
			/* Root at maximum size: push a new level above it. */
			if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
			    *stat == 0) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
			xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
				logflags);
			block = xfs_bmbt_get_block(cur, level, &bp);
		} else {
			/* Try to shift one record right, then left. */
			if ((error = xfs_bmbt_rshift(cur, level, &i))) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
			if (i) {
				/* nothing */
			} else {
				if ((error = xfs_bmbt_lshift(cur, level, &i))) {
					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
					return error;
				}
				if (i) {
					/* lshift moved the insertion point. */
					optr = ptr = cur->bc_ptrs[level];
				} else {
					/* No sibling room: split the block. */
					if ((error = xfs_bmbt_split(cur, level,
							&nbno, &startoff, &ncur,
							&i))) {
						XFS_BMBT_TRACE_CURSOR(cur,
							ERROR);
						return error;
					}
					if (i) {
						block = xfs_bmbt_get_block(
							    cur, level, &bp);
#ifdef DEBUG
						if ((error =
						    xfs_btree_check_lblock(cur,
							    block, level, bp))) {
							XFS_BMBT_TRACE_CURSOR(
								cur, ERROR);
							return error;
						}
#endif
						ptr = cur->bc_ptrs[level];
						/*
						 * Build the record to hand
						 * up to the parent level.
						 */
						xfs_bmbt_disk_set_allf(&nrec,
							startoff, 0, 0,
							XFS_EXT_NORM);
					} else {
						XFS_BMBT_TRACE_CURSOR(cur,
							EXIT);
						*stat = 0;
						return 0;
					}
				}
			}
		}
	}
	numrecs = be16_to_cpu(block->bb_numrecs);
	if (level > 0) {
		/* Interior node: open a slot and insert key + pointer. */
		kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
		pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
#ifdef DEBUG
		for (i = numrecs; i >= ptr; i--) {
			if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1],
					level))) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
		}
#endif
		memmove(&kp[ptr], &kp[ptr - 1],
			(numrecs - ptr + 1) * sizeof(*kp));
		memmove(&pp[ptr], &pp[ptr - 1],
			(numrecs - ptr + 1) * sizeof(*pp));
#ifdef DEBUG
		if ((error = xfs_btree_check_lptr(cur, *bnop, level))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
#endif
		kp[ptr - 1] = key;
		pp[ptr - 1] = cpu_to_be64(*bnop);
		numrecs++;
		block->bb_numrecs = cpu_to_be16(numrecs);
		xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
		xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
	} else {
		/* Leaf: open a slot and insert the record. */
		rp = XFS_BMAP_REC_IADDR(block, 1, cur);
		memmove(&rp[ptr], &rp[ptr - 1],
			(numrecs - ptr + 1) * sizeof(*rp));
		rp[ptr - 1] = *recp;
		numrecs++;
		block->bb_numrecs = cpu_to_be16(numrecs);
		xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
	}
	xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
#ifdef DEBUG
	if (ptr < numrecs) {
		if (level == 0)
			xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
				rp + ptr);
		else
			xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
				kp + ptr);
	}
#endif
	/* Inserted in slot 1: the key in the parent must be updated. */
	if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	*bnop = nbno;
	if (nbno != NULLFSBLOCK) {
		/* A split happened: pass the new block/record/cursor back. */
		*recp = nrec;
		*curp = ncur;
	}
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = 1;
	return 0;
}
866
/*
 * Reduce the height of the tree by one when possible: if the root
 * block (held in the inode fork) has exactly one child and that
 * child's contents fit in the root, copy the child's keys and
 * pointers up into the root and free the child block.  Returns 0
 * without changing anything when any precondition does not hold.
 */
STATIC int
xfs_bmbt_killroot(
	xfs_btree_cur_t		*cur)
{
	xfs_bmbt_block_t	*block;
	xfs_bmbt_block_t	*cblock;
	xfs_buf_t		*cbp;
	xfs_bmbt_key_t		*ckp;
	xfs_bmbt_ptr_t		*cpp;
#ifdef DEBUG
	int			error;
#endif
	int			i;
	xfs_bmbt_key_t		*kp;
	xfs_inode_t		*ip;
	xfs_ifork_t		*ifp;
	int			level;
	xfs_bmbt_ptr_t		*pp;

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	level = cur->bc_nlevels - 1;
	ASSERT(level >= 1);
	/*
	 * Don't deal with the root block needs to be a leaf case.
	 * We're just going to turn the thing back into extents anyway.
	 */
	if (level == 1) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		return 0;
	}
	block = xfs_bmbt_get_block(cur, level, &cbp);
	/*
	 * Give up if the root has multiple children.
	 */
	if (be16_to_cpu(block->bb_numrecs) != 1) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		return 0;
	}
	/*
	 * Only do this if the next level will fit.
	 * Then the data must be copied up to the inode,
	 * instead of freeing the root you free the next level.
	 */
	cbp = cur->bc_bufs[level - 1];
	cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
	if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		return 0;
	}
	/* An only child has no siblings. */
	ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO);
	ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO);
	ip = cur->bc_private.b.ip;
	ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
	ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
	       XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
	/* Resize the inode root to exactly hold the child's records. */
	i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
	if (i) {
		xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
		block = ifp->if_broot;
	}
	be16_add_cpu(&block->bb_numrecs, i);
	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
	/* Copy the child's keys and pointers up into the root. */
	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
	ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
	memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp));
	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
	cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
#ifdef DEBUG
	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
		if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
	}
#endif
	memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp));
	/* Free the now-redundant child block and account for it. */
	xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
			cur->bc_private.b.flist, cur->bc_mp);
	ip->i_d.di_nblocks--;
	XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
			XFS_TRANS_DQ_BCOUNT, -1L);
	xfs_trans_binval(cur->bc_tp, cbp);
	cur->bc_bufs[level - 1] = NULL;
	/* The tree is now one level shorter. */
	be16_add_cpu(&block->bb_level, -1);
	xfs_trans_log_inode(cur->bc_tp, ip,
		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
	cur->bc_nlevels--;
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	return 0;
}
957
958/*
959 * Log key values from the btree block.
960 */
961STATIC void
962xfs_bmbt_log_keys(
963 xfs_btree_cur_t *cur,
964 xfs_buf_t *bp,
965 int kfirst,
966 int klast)
967{
968 xfs_trans_t *tp;
969
970 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
971 XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
972 tp = cur->bc_tp;
973 if (bp) {
974 xfs_bmbt_block_t *block;
975 int first;
976 xfs_bmbt_key_t *kp;
977 int last;
978
979 block = XFS_BUF_TO_BMBT_BLOCK(bp);
980 kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
981 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
982 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
983 xfs_trans_log_buf(tp, bp, first, last);
984 } else {
985 xfs_inode_t *ip;
986
987 ip = cur->bc_private.b.ip;
988 xfs_trans_log_inode(tp, ip,
989 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
990 }
991 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
992}
993
994/*
995 * Log pointer values from the btree block.
996 */
997STATIC void
998xfs_bmbt_log_ptrs(
999 xfs_btree_cur_t *cur,
1000 xfs_buf_t *bp,
1001 int pfirst,
1002 int plast)
1003{
1004 xfs_trans_t *tp;
1005
1006 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1007 XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
1008 tp = cur->bc_tp;
1009 if (bp) {
1010 xfs_bmbt_block_t *block;
1011 int first;
1012 int last;
1013 xfs_bmbt_ptr_t *pp;
1014
1015 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1016 pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
1017 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
1018 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
1019 xfs_trans_log_buf(tp, bp, first, last);
1020 } else {
1021 xfs_inode_t *ip;
1022
1023 ip = cur->bc_private.b.ip;
1024 xfs_trans_log_inode(tp, ip,
1025 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1026 }
1027 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1028}
1029
/*
 * Lookup the record. The cursor is made to point to it, based on dir.
 *
 * Walks from the root down to the leaf, binary-searching the key
 * (cur->bc_rec.b.br_startoff) at every level and remembering the path
 * in cur->bc_ptrs[].  dir selects <= / == / >= matching; on return
 * *stat is 1 if a record satisfying dir was found, 0 otherwise.
 */
STATIC int					/* error */
xfs_bmbt_lookup(
	xfs_btree_cur_t		*cur,
	xfs_lookup_t		dir,
	int			*stat)		/* success/failure */
{
	xfs_bmbt_block_t	*block=NULL;
	xfs_buf_t		*bp;
	xfs_daddr_t		d;
	xfs_sfiloff_t		diff;		/* signed key comparison result */
	int			error;		/* error return value */
	xfs_fsblock_t		fsbno=0;
	int			high;
	int			i;
	int			keyno=0;
	xfs_bmbt_key_t		*kkbase=NULL;
	xfs_bmbt_key_t		*kkp;
	xfs_bmbt_rec_t		*krbase=NULL;
	xfs_bmbt_rec_t		*krp;
	int			level;
	int			low;
	xfs_mount_t		*mp;
	xfs_bmbt_ptr_t		*pp;
	xfs_bmbt_irec_t		*rp;
	xfs_fileoff_t		startoff;
	xfs_trans_t		*tp;

	XFS_STATS_INC(xs_bmbt_lookup);
	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGI(cur, (int)dir);
	tp = cur->bc_tp;
	mp = cur->bc_mp;
	/* The search key was stored in the cursor by the caller. */
	rp = &cur->bc_rec.b;
	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
		if (level < cur->bc_nlevels - 1) {
			/*
			 * Below the root: get the block at fsbno, reusing
			 * the buffer already cached in the cursor if it is
			 * the right one.
			 */
			d = XFS_FSB_TO_DADDR(mp, fsbno);
			bp = cur->bc_bufs[level];
			if (bp && XFS_BUF_ADDR(bp) != d)
				bp = NULL;
			if (!bp) {
				if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
						0, &bp, XFS_BMAP_BTREE_REF))) {
					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
					return error;
				}
				xfs_btree_setbuf(cur, level, bp);
				block = XFS_BUF_TO_BMBT_BLOCK(bp);
				if ((error = xfs_btree_check_lblock(cur, block,
						level, bp))) {
					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
					return error;
				}
			} else
				block = XFS_BUF_TO_BMBT_BLOCK(bp);
		} else
			block = xfs_bmbt_get_block(cur, level, &bp);
		/* An exact match at a higher level descends via entry 1. */
		if (diff == 0)
			keyno = 1;
		else {
			/* Binary search the keys (interior) or records (leaf). */
			if (level > 0)
				kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
			else
				krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
			low = 1;
			if (!(high = be16_to_cpu(block->bb_numrecs))) {
				/* Only the leaf of an empty tree can be empty. */
				ASSERT(level == 0);
				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
				XFS_BMBT_TRACE_CURSOR(cur, EXIT);
				*stat = 0;
				return 0;
			}
			while (low <= high) {
				XFS_STATS_INC(xs_bmbt_compare);
				keyno = (low + high) >> 1;
				if (level > 0) {
					kkp = kkbase + keyno - 1;
					startoff = be64_to_cpu(kkp->br_startoff);
				} else {
					krp = krbase + keyno - 1;
					startoff = xfs_bmbt_disk_get_startoff(krp);
				}
				diff = (xfs_sfiloff_t)
						(startoff - rp->br_startoff);
				if (diff < 0)
					low = keyno + 1;
				else if (diff > 0)
					high = keyno - 1;
				else
					break;
			}
		}
		if (level > 0) {
			/*
			 * Descend through the entry whose key is <= the
			 * search key (clamped to the first entry).
			 */
			if (diff > 0 && --keyno < 1)
				keyno = 1;
			pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
			fsbno = be64_to_cpu(*pp);
#ifdef DEBUG
			if ((error = xfs_btree_check_lptr(cur, fsbno, level))) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
#endif
			cur->bc_ptrs[level] = keyno;
		}
	}
	if (dir != XFS_LOOKUP_LE && diff < 0) {
		keyno++;
		/*
		 * If ge search and we went off the end of the block, but it's
		 * not the last block, we're in the wrong block.
		 */
		if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) &&
		    be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) {
			cur->bc_ptrs[0] = keyno;
			if ((error = xfs_bmbt_increment(cur, 0, &i))) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
			XFS_WANT_CORRUPTED_RETURN(i == 1);
			XFS_BMBT_TRACE_CURSOR(cur, EXIT);
			*stat = 1;
			return 0;
		}
	}
	else if (dir == XFS_LOOKUP_LE && diff > 0)
		keyno--;
	cur->bc_ptrs[0] = keyno;
	/* Off either end of the leaf means no matching record. */
	if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
	} else {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		/* For EQ lookups, success requires an exact key match. */
		*stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
	}
	return 0;
}
1169
/*
 * Move 1 record left from cur/level if possible.
 * Update cur to reflect the new path.
 *
 * Sets *stat to 1 if a record was moved to the left sibling, 0 when
 * the shift is not possible (at the root, no left sibling, cursor at
 * the first entry, or the left sibling is already full).
 */
STATIC int					/* error */
xfs_bmbt_lshift(
	xfs_btree_cur_t		*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	int			error;		/* error return value */
#ifdef DEBUG
	int			i;		/* loop counter */
#endif
	xfs_bmbt_key_t		key;		/* bmap btree key */
	xfs_buf_t		*lbp;		/* left buffer pointer */
	xfs_bmbt_block_t	*left;		/* left btree block */
	xfs_bmbt_key_t		*lkp=NULL;	/* left btree key */
	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
	int			lrecs;		/* left record count */
	xfs_bmbt_rec_t		*lrp=NULL;	/* left record pointer */
	xfs_mount_t		*mp;		/* file system mount point */
	xfs_buf_t		*rbp;		/* right buffer pointer */
	xfs_bmbt_block_t	*right;		/* right btree block */
	xfs_bmbt_key_t		*rkp=NULL;	/* right btree key */
	xfs_bmbt_ptr_t		*rpp=NULL;	/* right address pointer */
	xfs_bmbt_rec_t		*rrp=NULL;	/* right record pointer */
	int			rrecs;		/* right record count */

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGI(cur, level);
	/* The root has no siblings to shift into. */
	if (level == cur->bc_nlevels - 1) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	rbp = cur->bc_bufs[level];
	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
#ifdef DEBUG
	if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
#endif
	/* No left sibling: nothing to shift into. */
	if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/* Can't shift away the entry the cursor points at. */
	if (cur->bc_ptrs[level] <= 1) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	mp = cur->bc_mp;
	if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0,
			&lbp, XFS_BMAP_BTREE_REF))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
	if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	/* Left sibling already full: can't take another entry. */
	if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/* Index of the slot being appended to the left block. */
	lrecs = be16_to_cpu(left->bb_numrecs) + 1;
	if (level > 0) {
		/* Interior: move the first key/pointer pair. */
		lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
		*lkp = *rkp;
		xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
		lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
#ifdef DEBUG
		if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
#endif
		*lpp = *rpp;
		xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
	} else {
		/* Leaf: move the first record. */
		lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
		*lrp = *rrp;
		xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
	}
	left->bb_numrecs = cpu_to_be16(lrecs);
	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
#ifdef DEBUG
	if (level > 0)
		xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
	else
		xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
#endif
	rrecs = be16_to_cpu(right->bb_numrecs) - 1;
	right->bb_numrecs = cpu_to_be16(rrecs);
	xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
	/* Close the gap left at the front of the right block. */
	if (level > 0) {
#ifdef DEBUG
		for (i = 0; i < rrecs; i++) {
			if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1],
					level))) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
		}
#endif
		memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
		memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
		xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
		xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
	} else {
		memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
		xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
		/* Build the new first key of the right block for the parent. */
		key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
		rkp = &key;
	}
	/* The right block's first key changed: fix the parent. */
	if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	/* Cursor entry slid down by one within the right block. */
	cur->bc_ptrs[level]--;
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = 1;
	return 0;
}
1302
/*
 * Move 1 record right from cur/level if possible.
 * Update cur to reflect the new path.
 *
 * Sets *stat to 1 if a record was moved to the right sibling, 0 when
 * the shift is not possible (at the root, no right sibling, cursor at
 * the last entry, or the right sibling is already full).
 */
STATIC int					/* error */
xfs_bmbt_rshift(
	xfs_btree_cur_t		*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	int			error;		/* error return value */
	int			i;		/* loop counter */
	xfs_bmbt_key_t		key;		/* bmap btree key */
	xfs_buf_t		*lbp;		/* left buffer pointer */
	xfs_bmbt_block_t	*left;		/* left btree block */
	xfs_bmbt_key_t		*lkp;		/* left btree key */
	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
	xfs_bmbt_rec_t		*lrp;		/* left record pointer */
	xfs_mount_t		*mp;		/* file system mount point */
	xfs_buf_t		*rbp;		/* right buffer pointer */
	xfs_bmbt_block_t	*right;		/* right btree block */
	xfs_bmbt_key_t		*rkp;		/* right btree key */
	xfs_bmbt_ptr_t		*rpp;		/* right address pointer */
	xfs_bmbt_rec_t		*rrp=NULL;	/* right record pointer */
	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGI(cur, level);
	/* The root has no siblings to shift into. */
	if (level == cur->bc_nlevels - 1) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	lbp = cur->bc_bufs[level];
	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
#ifdef DEBUG
	if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
#endif
	/* No right sibling: nothing to shift into. */
	if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/* Can't shift away the entry the cursor points at. */
	if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	mp = cur->bc_mp;
	if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0,
			&rbp, XFS_BMAP_BTREE_REF))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
	if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	/* Right sibling already full: can't take another entry. */
	if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	if (level > 0) {
		/*
		 * Interior: make room at the front of the right block
		 * and move the left block's last key/pointer into it.
		 */
		lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
		lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
#ifdef DEBUG
		for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
			if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
		}
#endif
		memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
		memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
#ifdef DEBUG
		if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
#endif
		*rkp = *lkp;
		*rpp = *lpp;
		xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
		xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
	} else {
		/* Leaf: move the left block's last record. */
		lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
		memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
		*rrp = *lrp;
		xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
		/* Build the new first key of the right block for the parent. */
		key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
		rkp = &key;
	}
	be16_add_cpu(&left->bb_numrecs, -1);
	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
	be16_add_cpu(&right->bb_numrecs, 1);
#ifdef DEBUG
	if (level > 0)
		xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
	else
		xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
#endif
	xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
	/*
	 * Use a duplicate cursor to walk to the right block's position
	 * and update its key in the parent; cur itself stays put.
	 */
	if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	i = xfs_btree_lastrec(tcur, level);
	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
	if ((error = xfs_bmbt_increment(tcur, level, &i))) {
		XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
		goto error1;
	}
	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
	if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
		XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
		goto error1;
	}
	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = 1;
	return 0;
error0:
	/* Corruption detected: trace against cur, then fall through. */
	XFS_BMBT_TRACE_CURSOR(cur, ERROR);
error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}
1439
1440/* 47/*
1441 * Determine the extent state. 48 * Determine the extent state.
1442 */ 49 */
@@ -1453,229 +60,15 @@ xfs_extent_state(
1453 return XFS_EXT_NORM; 60 return XFS_EXT_NORM;
1454} 61}
1455 62
1456
1457/*
1458 * Split cur/level block in half.
1459 * Return new block number and its first record (to be inserted into parent).
1460 */
1461STATIC int /* error */
1462xfs_bmbt_split(
1463 xfs_btree_cur_t *cur,
1464 int level,
1465 xfs_fsblock_t *bnop,
1466 __uint64_t *startoff,
1467 xfs_btree_cur_t **curp,
1468 int *stat) /* success/failure */
1469{
1470 xfs_alloc_arg_t args; /* block allocation args */
1471 int error; /* error return value */
1472 int i; /* loop counter */
1473 xfs_fsblock_t lbno; /* left sibling block number */
1474 xfs_buf_t *lbp; /* left buffer pointer */
1475 xfs_bmbt_block_t *left; /* left btree block */
1476 xfs_bmbt_key_t *lkp; /* left btree key */
1477 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1478 xfs_bmbt_rec_t *lrp; /* left record pointer */
1479 xfs_buf_t *rbp; /* right buffer pointer */
1480 xfs_bmbt_block_t *right; /* right btree block */
1481 xfs_bmbt_key_t *rkp; /* right btree key */
1482 xfs_bmbt_ptr_t *rpp; /* right address pointer */
1483 xfs_bmbt_block_t *rrblock; /* right-right btree block */
1484 xfs_buf_t *rrbp; /* right-right buffer pointer */
1485 xfs_bmbt_rec_t *rrp; /* right record pointer */
1486
1487 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1488 XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff);
1489 args.tp = cur->bc_tp;
1490 args.mp = cur->bc_mp;
1491 lbp = cur->bc_bufs[level];
1492 lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
1493 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1494 args.fsbno = cur->bc_private.b.firstblock;
1495 args.firstblock = args.fsbno;
1496 args.minleft = 0;
1497 if (args.fsbno == NULLFSBLOCK) {
1498 args.fsbno = lbno;
1499 args.type = XFS_ALLOCTYPE_START_BNO;
1500 /*
1501 * Make sure there is sufficient room left in the AG to
1502 * complete a full tree split for an extent insert. If
1503 * we are converting the middle part of an extent then
1504 * we may need space for two tree splits.
1505 *
1506 * We are relying on the caller to make the correct block
1507 * reservation for this operation to succeed. If the
1508 * reservation amount is insufficient then we may fail a
1509 * block allocation here and corrupt the filesystem.
1510 */
1511 args.minleft = xfs_trans_get_block_res(args.tp);
1512 } else if (cur->bc_private.b.flist->xbf_low)
1513 args.type = XFS_ALLOCTYPE_START_BNO;
1514 else
1515 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1516 args.mod = args.alignment = args.total = args.isfl =
1517 args.userdata = args.minalignslop = 0;
1518 args.minlen = args.maxlen = args.prod = 1;
1519 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
1520 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
1521 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1522 return XFS_ERROR(ENOSPC);
1523 }
1524 if ((error = xfs_alloc_vextent(&args))) {
1525 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1526 return error;
1527 }
1528 if (args.fsbno == NULLFSBLOCK && args.minleft) {
1529 /*
1530 * Could not find an AG with enough free space to satisfy
1531 * a full btree split. Try again without minleft and if
1532 * successful activate the lowspace algorithm.
1533 */
1534 args.fsbno = 0;
1535 args.type = XFS_ALLOCTYPE_FIRST_AG;
1536 args.minleft = 0;
1537 if ((error = xfs_alloc_vextent(&args))) {
1538 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1539 return error;
1540 }
1541 cur->bc_private.b.flist->xbf_low = 1;
1542 }
1543 if (args.fsbno == NULLFSBLOCK) {
1544 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1545 *stat = 0;
1546 return 0;
1547 }
1548 ASSERT(args.len == 1);
1549 cur->bc_private.b.firstblock = args.fsbno;
1550 cur->bc_private.b.allocated++;
1551 cur->bc_private.b.ip->i_d.di_nblocks++;
1552 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
1553 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
1554 XFS_TRANS_DQ_BCOUNT, 1L);
1555 rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
1556 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1557#ifdef DEBUG
1558 if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
1559 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1560 return error;
1561 }
1562#endif
1563 right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
1564 right->bb_level = left->bb_level;
1565 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1566 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1567 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1568 be16_add_cpu(&right->bb_numrecs, 1);
1569 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1570 if (level > 0) {
1571 lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
1572 lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
1573 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1574 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1575#ifdef DEBUG
1576 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1577 if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) {
1578 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1579 return error;
1580 }
1581 }
1582#endif
1583 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1584 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1585 xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1586 xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1587 *startoff = be64_to_cpu(rkp->br_startoff);
1588 } else {
1589 lrp = XFS_BMAP_REC_IADDR(left, i, cur);
1590 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1591 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1592 xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1593 *startoff = xfs_bmbt_disk_get_startoff(rrp);
1594 }
1595 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1596 right->bb_rightsib = left->bb_rightsib;
1597 left->bb_rightsib = cpu_to_be64(args.fsbno);
1598 right->bb_leftsib = cpu_to_be64(lbno);
1599 xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
1600 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1601 if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) {
1602 if ((error = xfs_btree_read_bufl(args.mp, args.tp,
1603 be64_to_cpu(right->bb_rightsib), 0, &rrbp,
1604 XFS_BMAP_BTREE_REF))) {
1605 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1606 return error;
1607 }
1608 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
1609 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
1610 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1611 return error;
1612 }
1613 rrblock->bb_leftsib = cpu_to_be64(args.fsbno);
1614 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
1615 }
1616 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
1617 xfs_btree_setbuf(cur, level, rbp);
1618 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
1619 }
1620 if (level + 1 < cur->bc_nlevels) {
1621 if ((error = xfs_btree_dup_cursor(cur, curp))) {
1622 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1623 return error;
1624 }
1625 (*curp)->bc_ptrs[level + 1]++;
1626 }
1627 *bnop = args.fsbno;
1628 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1629 *stat = 1;
1630 return 0;
1631}
1632
1633
1634/*
1635 * Update keys for the record.
1636 */
1637STATIC int
1638xfs_bmbt_updkey(
1639 xfs_btree_cur_t *cur,
1640 xfs_bmbt_key_t *keyp, /* on-disk format */
1641 int level)
1642{
1643 xfs_bmbt_block_t *block;
1644 xfs_buf_t *bp;
1645#ifdef DEBUG
1646 int error;
1647#endif
1648 xfs_bmbt_key_t *kp;
1649 int ptr;
1650
1651 ASSERT(level >= 1);
1652 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1653 XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
1654 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1655 block = xfs_bmbt_get_block(cur, level, &bp);
1656#ifdef DEBUG
1657 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1658 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1659 return error;
1660 }
1661#endif
1662 ptr = cur->bc_ptrs[level];
1663 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
1664 *kp = *keyp;
1665 xfs_bmbt_log_keys(cur, bp, ptr, ptr);
1666 }
1667 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1668 return 0;
1669}
1670
1671/* 63/*
1672 * Convert on-disk form of btree root to in-memory form. 64 * Convert on-disk form of btree root to in-memory form.
1673 */ 65 */
1674void 66void
1675xfs_bmdr_to_bmbt( 67xfs_bmdr_to_bmbt(
68 struct xfs_mount *mp,
1676 xfs_bmdr_block_t *dblock, 69 xfs_bmdr_block_t *dblock,
1677 int dblocklen, 70 int dblocklen,
1678 xfs_bmbt_block_t *rblock, 71 struct xfs_btree_block *rblock,
1679 int rblocklen) 72 int rblocklen)
1680{ 73{
1681 int dmxr; 74 int dmxr;
@@ -1688,129 +81,19 @@ xfs_bmdr_to_bmbt(
1688 rblock->bb_level = dblock->bb_level; 81 rblock->bb_level = dblock->bb_level;
1689 ASSERT(be16_to_cpu(rblock->bb_level) > 0); 82 ASSERT(be16_to_cpu(rblock->bb_level) > 0);
1690 rblock->bb_numrecs = dblock->bb_numrecs; 83 rblock->bb_numrecs = dblock->bb_numrecs;
1691 rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO); 84 rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
1692 rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO); 85 rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
1693 dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); 86 dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
1694 fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); 87 fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
1695 tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); 88 tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
1696 fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); 89 fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
1697 tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); 90 tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
1698 dmxr = be16_to_cpu(dblock->bb_numrecs); 91 dmxr = be16_to_cpu(dblock->bb_numrecs);
1699 memcpy(tkp, fkp, sizeof(*fkp) * dmxr); 92 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
1700 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); 93 memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
1701} 94}
1702 95
1703/* 96/*
1704 * Decrement cursor by one record at the level.
1705 * For nonzero levels the leaf-ward information is untouched.
1706 */
1707int /* error */
1708xfs_bmbt_decrement(
1709 xfs_btree_cur_t *cur,
1710 int level,
1711 int *stat) /* success/failure */
1712{
1713 xfs_bmbt_block_t *block;
1714 xfs_buf_t *bp;
1715 int error; /* error return value */
1716 xfs_fsblock_t fsbno;
1717 int lev;
1718 xfs_mount_t *mp;
1719 xfs_trans_t *tp;
1720
1721 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1722 XFS_BMBT_TRACE_ARGI(cur, level);
1723 ASSERT(level < cur->bc_nlevels);
1724 if (level < cur->bc_nlevels - 1)
1725 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1726 if (--cur->bc_ptrs[level] > 0) {
1727 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1728 *stat = 1;
1729 return 0;
1730 }
1731 block = xfs_bmbt_get_block(cur, level, &bp);
1732#ifdef DEBUG
1733 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1734 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1735 return error;
1736 }
1737#endif
1738 if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) {
1739 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1740 *stat = 0;
1741 return 0;
1742 }
1743 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1744 if (--cur->bc_ptrs[lev] > 0)
1745 break;
1746 if (lev < cur->bc_nlevels - 1)
1747 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1748 }
1749 if (lev == cur->bc_nlevels) {
1750 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1751 *stat = 0;
1752 return 0;
1753 }
1754 tp = cur->bc_tp;
1755 mp = cur->bc_mp;
1756 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
1757 fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
1758 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
1759 XFS_BMAP_BTREE_REF))) {
1760 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1761 return error;
1762 }
1763 lev--;
1764 xfs_btree_setbuf(cur, lev, bp);
1765 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1766 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
1767 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1768 return error;
1769 }
1770 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1771 }
1772 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1773 *stat = 1;
1774 return 0;
1775}
1776
1777/*
1778 * Delete the record pointed to by cur.
1779 */
1780int /* error */
1781xfs_bmbt_delete(
1782 xfs_btree_cur_t *cur,
1783 int *stat) /* success/failure */
1784{
1785 int error; /* error return value */
1786 int i;
1787 int level;
1788
1789 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1790 for (level = 0, i = 2; i == 2; level++) {
1791 if ((error = xfs_bmbt_delrec(cur, level, &i))) {
1792 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1793 return error;
1794 }
1795 }
1796 if (i == 0) {
1797 for (level = 1; level < cur->bc_nlevels; level++) {
1798 if (cur->bc_ptrs[level] == 0) {
1799 if ((error = xfs_bmbt_decrement(cur, level,
1800 &i))) {
1801 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1802 return error;
1803 }
1804 break;
1805 }
1806 }
1807 }
1808 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1809 *stat = i;
1810 return 0;
1811}
1812
1813/*
1814 * Convert a compressed bmap extent record to an uncompressed form. 97 * Convert a compressed bmap extent record to an uncompressed form.
1815 * This code must be in sync with the routines xfs_bmbt_get_startoff, 98 * This code must be in sync with the routines xfs_bmbt_get_startoff,
1816 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. 99 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
@@ -1864,31 +147,6 @@ xfs_bmbt_get_all(
1864} 147}
1865 148
1866/* 149/*
1867 * Get the block pointer for the given level of the cursor.
1868 * Fill in the buffer pointer, if applicable.
1869 */
1870xfs_bmbt_block_t *
1871xfs_bmbt_get_block(
1872 xfs_btree_cur_t *cur,
1873 int level,
1874 xfs_buf_t **bpp)
1875{
1876 xfs_ifork_t *ifp;
1877 xfs_bmbt_block_t *rval;
1878
1879 if (level < cur->bc_nlevels - 1) {
1880 *bpp = cur->bc_bufs[level];
1881 rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
1882 } else {
1883 *bpp = NULL;
1884 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
1885 cur->bc_private.b.whichfork);
1886 rval = ifp->if_broot;
1887 }
1888 return rval;
1889}
1890
1891/*
1892 * Extract the blockcount field from an in memory bmap extent record. 150 * Extract the blockcount field from an in memory bmap extent record.
1893 */ 151 */
1894xfs_filblks_t 152xfs_filblks_t
@@ -1950,7 +208,8 @@ xfs_bmbt_disk_get_all(
1950 xfs_bmbt_rec_t *r, 208 xfs_bmbt_rec_t *r,
1951 xfs_bmbt_irec_t *s) 209 xfs_bmbt_irec_t *s)
1952{ 210{
1953 __xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s); 211 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
212 get_unaligned_be64(&r->l1), s);
1954} 213}
1955 214
1956/* 215/*
@@ -1974,348 +233,6 @@ xfs_bmbt_disk_get_startoff(
1974 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; 233 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
1975} 234}
1976 235
1977/*
1978 * Increment cursor by one record at the level.
1979 * For nonzero levels the leaf-ward information is untouched.
1980 */
1981int /* error */
1982xfs_bmbt_increment(
1983 xfs_btree_cur_t *cur,
1984 int level,
1985 int *stat) /* success/failure */
1986{
1987 xfs_bmbt_block_t *block;
1988 xfs_buf_t *bp;
1989 int error; /* error return value */
1990 xfs_fsblock_t fsbno;
1991 int lev;
1992 xfs_mount_t *mp;
1993 xfs_trans_t *tp;
1994
1995 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1996 XFS_BMBT_TRACE_ARGI(cur, level);
1997 ASSERT(level < cur->bc_nlevels);
1998 if (level < cur->bc_nlevels - 1)
1999 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
2000 block = xfs_bmbt_get_block(cur, level, &bp);
2001#ifdef DEBUG
2002 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
2003 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2004 return error;
2005 }
2006#endif
2007 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
2008 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2009 *stat = 1;
2010 return 0;
2011 }
2012 if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) {
2013 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2014 *stat = 0;
2015 return 0;
2016 }
2017 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
2018 block = xfs_bmbt_get_block(cur, lev, &bp);
2019#ifdef DEBUG
2020 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2021 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2022 return error;
2023 }
2024#endif
2025 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
2026 break;
2027 if (lev < cur->bc_nlevels - 1)
2028 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2029 }
2030 if (lev == cur->bc_nlevels) {
2031 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2032 *stat = 0;
2033 return 0;
2034 }
2035 tp = cur->bc_tp;
2036 mp = cur->bc_mp;
2037 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
2038 fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
2039 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
2040 XFS_BMAP_BTREE_REF))) {
2041 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2042 return error;
2043 }
2044 lev--;
2045 xfs_btree_setbuf(cur, lev, bp);
2046 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2047 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2048 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2049 return error;
2050 }
2051 cur->bc_ptrs[lev] = 1;
2052 }
2053 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2054 *stat = 1;
2055 return 0;
2056}
2057
2058/*
2059 * Insert the current record at the point referenced by cur.
2060 *
2061 * A multi-level split of the tree on insert will invalidate the original
2062 * cursor. All callers of this function should assume that the cursor is
2063 * no longer valid and revalidate it.
2064 */
2065int /* error */
2066xfs_bmbt_insert(
2067 xfs_btree_cur_t *cur,
2068 int *stat) /* success/failure */
2069{
2070 int error; /* error return value */
2071 int i;
2072 int level;
2073 xfs_fsblock_t nbno;
2074 xfs_btree_cur_t *ncur;
2075 xfs_bmbt_rec_t nrec;
2076 xfs_btree_cur_t *pcur;
2077
2078 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2079 level = 0;
2080 nbno = NULLFSBLOCK;
2081 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
2082 ncur = NULL;
2083 pcur = cur;
2084 do {
2085 if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
2086 &i))) {
2087 if (pcur != cur)
2088 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2089 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2090 return error;
2091 }
2092 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2093 if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
2094 cur->bc_nlevels = pcur->bc_nlevels;
2095 cur->bc_private.b.allocated +=
2096 pcur->bc_private.b.allocated;
2097 pcur->bc_private.b.allocated = 0;
2098 ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
2099 XFS_IS_REALTIME_INODE(cur->bc_private.b.ip));
2100 cur->bc_private.b.firstblock =
2101 pcur->bc_private.b.firstblock;
2102 ASSERT(cur->bc_private.b.flist ==
2103 pcur->bc_private.b.flist);
2104 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2105 }
2106 if (ncur) {
2107 pcur = ncur;
2108 ncur = NULL;
2109 }
2110 } while (nbno != NULLFSBLOCK);
2111 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2112 *stat = i;
2113 return 0;
2114error0:
2115 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2116 return error;
2117}
2118
2119/*
2120 * Log fields from the btree block header.
2121 */
2122void
2123xfs_bmbt_log_block(
2124 xfs_btree_cur_t *cur,
2125 xfs_buf_t *bp,
2126 int fields)
2127{
2128 int first;
2129 int last;
2130 xfs_trans_t *tp;
2131 static const short offsets[] = {
2132 offsetof(xfs_bmbt_block_t, bb_magic),
2133 offsetof(xfs_bmbt_block_t, bb_level),
2134 offsetof(xfs_bmbt_block_t, bb_numrecs),
2135 offsetof(xfs_bmbt_block_t, bb_leftsib),
2136 offsetof(xfs_bmbt_block_t, bb_rightsib),
2137 sizeof(xfs_bmbt_block_t)
2138 };
2139
2140 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2141 XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
2142 tp = cur->bc_tp;
2143 if (bp) {
2144 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
2145 &last);
2146 xfs_trans_log_buf(tp, bp, first, last);
2147 } else
2148 xfs_trans_log_inode(tp, cur->bc_private.b.ip,
2149 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
2150 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2151}
2152
2153/*
2154 * Log record values from the btree block.
2155 */
2156void
2157xfs_bmbt_log_recs(
2158 xfs_btree_cur_t *cur,
2159 xfs_buf_t *bp,
2160 int rfirst,
2161 int rlast)
2162{
2163 xfs_bmbt_block_t *block;
2164 int first;
2165 int last;
2166 xfs_bmbt_rec_t *rp;
2167 xfs_trans_t *tp;
2168
2169 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2170 XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
2171 ASSERT(bp);
2172 tp = cur->bc_tp;
2173 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2174 rp = XFS_BMAP_REC_DADDR(block, 1, cur);
2175 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
2176 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
2177 xfs_trans_log_buf(tp, bp, first, last);
2178 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2179}
2180
2181int /* error */
2182xfs_bmbt_lookup_eq(
2183 xfs_btree_cur_t *cur,
2184 xfs_fileoff_t off,
2185 xfs_fsblock_t bno,
2186 xfs_filblks_t len,
2187 int *stat) /* success/failure */
2188{
2189 cur->bc_rec.b.br_startoff = off;
2190 cur->bc_rec.b.br_startblock = bno;
2191 cur->bc_rec.b.br_blockcount = len;
2192 return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
2193}
2194
2195int /* error */
2196xfs_bmbt_lookup_ge(
2197 xfs_btree_cur_t *cur,
2198 xfs_fileoff_t off,
2199 xfs_fsblock_t bno,
2200 xfs_filblks_t len,
2201 int *stat) /* success/failure */
2202{
2203 cur->bc_rec.b.br_startoff = off;
2204 cur->bc_rec.b.br_startblock = bno;
2205 cur->bc_rec.b.br_blockcount = len;
2206 return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
2207}
2208
2209/*
2210 * Give the bmap btree a new root block. Copy the old broot contents
2211 * down into a real block and make the broot point to it.
2212 */
2213int /* error */
2214xfs_bmbt_newroot(
2215 xfs_btree_cur_t *cur, /* btree cursor */
2216 int *logflags, /* logging flags for inode */
2217 int *stat) /* return status - 0 fail */
2218{
2219 xfs_alloc_arg_t args; /* allocation arguments */
2220 xfs_bmbt_block_t *block; /* bmap btree block */
2221 xfs_buf_t *bp; /* buffer for block */
2222 xfs_bmbt_block_t *cblock; /* child btree block */
2223 xfs_bmbt_key_t *ckp; /* child key pointer */
2224 xfs_bmbt_ptr_t *cpp; /* child ptr pointer */
2225 int error; /* error return code */
2226#ifdef DEBUG
2227 int i; /* loop counter */
2228#endif
2229 xfs_bmbt_key_t *kp; /* pointer to bmap btree key */
2230 int level; /* btree level */
2231 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
2232
2233 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2234 level = cur->bc_nlevels - 1;
2235 block = xfs_bmbt_get_block(cur, level, &bp);
2236 /*
2237 * Copy the root into a real block.
2238 */
2239 args.mp = cur->bc_mp;
2240 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
2241 args.tp = cur->bc_tp;
2242 args.fsbno = cur->bc_private.b.firstblock;
2243 args.mod = args.minleft = args.alignment = args.total = args.isfl =
2244 args.userdata = args.minalignslop = 0;
2245 args.minlen = args.maxlen = args.prod = 1;
2246 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
2247 args.firstblock = args.fsbno;
2248 if (args.fsbno == NULLFSBLOCK) {
2249#ifdef DEBUG
2250 if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) {
2251 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2252 return error;
2253 }
2254#endif
2255 args.fsbno = be64_to_cpu(*pp);
2256 args.type = XFS_ALLOCTYPE_START_BNO;
2257 } else if (cur->bc_private.b.flist->xbf_low)
2258 args.type = XFS_ALLOCTYPE_START_BNO;
2259 else
2260 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2261 if ((error = xfs_alloc_vextent(&args))) {
2262 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2263 return error;
2264 }
2265 if (args.fsbno == NULLFSBLOCK) {
2266 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2267 *stat = 0;
2268 return 0;
2269 }
2270 ASSERT(args.len == 1);
2271 cur->bc_private.b.firstblock = args.fsbno;
2272 cur->bc_private.b.allocated++;
2273 cur->bc_private.b.ip->i_d.di_nblocks++;
2274 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
2275 XFS_TRANS_DQ_BCOUNT, 1L);
2276 bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
2277 cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
2278 *cblock = *block;
2279 be16_add_cpu(&block->bb_level, 1);
2280 block->bb_numrecs = cpu_to_be16(1);
2281 cur->bc_nlevels++;
2282 cur->bc_ptrs[level + 1] = 1;
2283 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
2284 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
2285 memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp));
2286 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
2287#ifdef DEBUG
2288 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2289 if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
2290 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2291 return error;
2292 }
2293 }
2294#endif
2295 memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
2296#ifdef DEBUG
2297 if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) {
2298 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2299 return error;
2300 }
2301#endif
2302 *pp = cpu_to_be64(args.fsbno);
2303 xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
2304 cur->bc_private.b.whichfork);
2305 xfs_btree_setbuf(cur, level, bp);
2306 /*
2307 * Do all this logging at the end so that
2308 * the root is at the right level.
2309 */
2310 xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
2311 xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
2312 xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
2313 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2314 *logflags |=
2315 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
2316 *stat = 1;
2317 return 0;
2318}
2319 236
2320/* 237/*
2321 * Set all the fields in a bmap extent record from the arguments. 238 * Set all the fields in a bmap extent record from the arguments.
@@ -2512,7 +429,8 @@ xfs_bmbt_set_state(
2512 */ 429 */
2513void 430void
2514xfs_bmbt_to_bmdr( 431xfs_bmbt_to_bmdr(
2515 xfs_bmbt_block_t *rblock, 432 struct xfs_mount *mp,
433 struct xfs_btree_block *rblock,
2516 int rblocklen, 434 int rblocklen,
2517 xfs_bmdr_block_t *dblock, 435 xfs_bmdr_block_t *dblock,
2518 int dblocklen) 436 int dblocklen)
@@ -2524,67 +442,22 @@ xfs_bmbt_to_bmdr(
2524 __be64 *tpp; 442 __be64 *tpp;
2525 443
2526 ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); 444 ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
2527 ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO); 445 ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO);
2528 ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO); 446 ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO);
2529 ASSERT(be16_to_cpu(rblock->bb_level) > 0); 447 ASSERT(be16_to_cpu(rblock->bb_level) > 0);
2530 dblock->bb_level = rblock->bb_level; 448 dblock->bb_level = rblock->bb_level;
2531 dblock->bb_numrecs = rblock->bb_numrecs; 449 dblock->bb_numrecs = rblock->bb_numrecs;
2532 dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); 450 dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
2533 fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); 451 fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
2534 tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); 452 tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
2535 fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); 453 fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
2536 tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); 454 tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
2537 dmxr = be16_to_cpu(dblock->bb_numrecs); 455 dmxr = be16_to_cpu(dblock->bb_numrecs);
2538 memcpy(tkp, fkp, sizeof(*fkp) * dmxr); 456 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
2539 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); 457 memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
2540} 458}
2541 459
2542/* 460/*
2543 * Update the record to the passed values.
2544 */
2545int
2546xfs_bmbt_update(
2547 xfs_btree_cur_t *cur,
2548 xfs_fileoff_t off,
2549 xfs_fsblock_t bno,
2550 xfs_filblks_t len,
2551 xfs_exntst_t state)
2552{
2553 xfs_bmbt_block_t *block;
2554 xfs_buf_t *bp;
2555 int error;
2556 xfs_bmbt_key_t key;
2557 int ptr;
2558 xfs_bmbt_rec_t *rp;
2559
2560 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2561 XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
2562 (xfs_dfilblks_t)len, (int)state);
2563 block = xfs_bmbt_get_block(cur, 0, &bp);
2564#ifdef DEBUG
2565 if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
2566 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2567 return error;
2568 }
2569#endif
2570 ptr = cur->bc_ptrs[0];
2571 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
2572 xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
2573 xfs_bmbt_log_recs(cur, bp, ptr, ptr);
2574 if (ptr > 1) {
2575 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2576 return 0;
2577 }
2578 key.br_startoff = cpu_to_be64(off);
2579 if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
2580 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2581 return error;
2582 }
2583 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2584 return 0;
2585}
2586
2587/*
2588 * Check extent records, which have just been read, for 461 * Check extent records, which have just been read, for
2589 * any bit in the extent flag field. ASSERT on debug 462 * any bit in the extent flag field. ASSERT on debug
2590 * kernels, as this condition should not occur. 463 * kernels, as this condition should not occur.
@@ -2608,3 +481,451 @@ xfs_check_nostate_extents(
2608 } 481 }
2609 return 0; 482 return 0;
2610} 483}
484
485
486STATIC struct xfs_btree_cur *
487xfs_bmbt_dup_cursor(
488 struct xfs_btree_cur *cur)
489{
490 struct xfs_btree_cur *new;
491
492 new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
493 cur->bc_private.b.ip, cur->bc_private.b.whichfork);
494
495 /*
496 * Copy the firstblock, flist, and flags values,
497 * since init cursor doesn't get them.
498 */
499 new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
500 new->bc_private.b.flist = cur->bc_private.b.flist;
501 new->bc_private.b.flags = cur->bc_private.b.flags;
502
503 return new;
504}
505
506STATIC void
507xfs_bmbt_update_cursor(
508 struct xfs_btree_cur *src,
509 struct xfs_btree_cur *dst)
510{
511 ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
512 (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
513 ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
514
515 dst->bc_private.b.allocated += src->bc_private.b.allocated;
516 dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
517
518 src->bc_private.b.allocated = 0;
519}
520
521STATIC int
522xfs_bmbt_alloc_block(
523 struct xfs_btree_cur *cur,
524 union xfs_btree_ptr *start,
525 union xfs_btree_ptr *new,
526 int length,
527 int *stat)
528{
529 xfs_alloc_arg_t args; /* block allocation args */
530 int error; /* error return value */
531
532 memset(&args, 0, sizeof(args));
533 args.tp = cur->bc_tp;
534 args.mp = cur->bc_mp;
535 args.fsbno = cur->bc_private.b.firstblock;
536 args.firstblock = args.fsbno;
537
538 if (args.fsbno == NULLFSBLOCK) {
539 args.fsbno = be64_to_cpu(start->l);
540 args.type = XFS_ALLOCTYPE_START_BNO;
541 /*
542 * Make sure there is sufficient room left in the AG to
543 * complete a full tree split for an extent insert. If
544 * we are converting the middle part of an extent then
545 * we may need space for two tree splits.
546 *
547 * We are relying on the caller to make the correct block
548 * reservation for this operation to succeed. If the
549 * reservation amount is insufficient then we may fail a
550 * block allocation here and corrupt the filesystem.
551 */
552 args.minleft = xfs_trans_get_block_res(args.tp);
553 } else if (cur->bc_private.b.flist->xbf_low) {
554 args.type = XFS_ALLOCTYPE_START_BNO;
555 } else {
556 args.type = XFS_ALLOCTYPE_NEAR_BNO;
557 }
558
559 args.minlen = args.maxlen = args.prod = 1;
560 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
561 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
562 error = XFS_ERROR(ENOSPC);
563 goto error0;
564 }
565 error = xfs_alloc_vextent(&args);
566 if (error)
567 goto error0;
568
569 if (args.fsbno == NULLFSBLOCK && args.minleft) {
570 /*
571 * Could not find an AG with enough free space to satisfy
572 * a full btree split. Try again without minleft and if
573 * successful activate the lowspace algorithm.
574 */
575 args.fsbno = 0;
576 args.type = XFS_ALLOCTYPE_FIRST_AG;
577 args.minleft = 0;
578 error = xfs_alloc_vextent(&args);
579 if (error)
580 goto error0;
581 cur->bc_private.b.flist->xbf_low = 1;
582 }
583 if (args.fsbno == NULLFSBLOCK) {
584 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
585 *stat = 0;
586 return 0;
587 }
588 ASSERT(args.len == 1);
589 cur->bc_private.b.firstblock = args.fsbno;
590 cur->bc_private.b.allocated++;
591 cur->bc_private.b.ip->i_d.di_nblocks++;
592 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
593 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
594 XFS_TRANS_DQ_BCOUNT, 1L);
595
596 new->l = cpu_to_be64(args.fsbno);
597
598 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
599 *stat = 1;
600 return 0;
601
602 error0:
603 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
604 return error;
605}
606
607STATIC int
608xfs_bmbt_free_block(
609 struct xfs_btree_cur *cur,
610 struct xfs_buf *bp)
611{
612 struct xfs_mount *mp = cur->bc_mp;
613 struct xfs_inode *ip = cur->bc_private.b.ip;
614 struct xfs_trans *tp = cur->bc_tp;
615 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
616
617 xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
618 ip->i_d.di_nblocks--;
619
620 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
621 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
622 xfs_trans_binval(tp, bp);
623 return 0;
624}
625
626STATIC int
627xfs_bmbt_get_minrecs(
628 struct xfs_btree_cur *cur,
629 int level)
630{
631 if (level == cur->bc_nlevels - 1) {
632 struct xfs_ifork *ifp;
633
634 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
635 cur->bc_private.b.whichfork);
636
637 return xfs_bmbt_maxrecs(cur->bc_mp,
638 ifp->if_broot_bytes, level == 0) / 2;
639 }
640
641 return cur->bc_mp->m_bmap_dmnr[level != 0];
642}
643
644int
645xfs_bmbt_get_maxrecs(
646 struct xfs_btree_cur *cur,
647 int level)
648{
649 if (level == cur->bc_nlevels - 1) {
650 struct xfs_ifork *ifp;
651
652 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
653 cur->bc_private.b.whichfork);
654
655 return xfs_bmbt_maxrecs(cur->bc_mp,
656 ifp->if_broot_bytes, level == 0);
657 }
658
659 return cur->bc_mp->m_bmap_dmxr[level != 0];
660
661}
662
663/*
664 * Get the maximum records we could store in the on-disk format.
665 *
666 * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
667 * for the root node this checks the available space in the dinode fork
668 * so that we can resize the in-memory buffer to match it. After a
669 * resize to the maximum size this function returns the same value
670 * as xfs_bmbt_get_maxrecs for the root node, too.
671 */
672STATIC int
673xfs_bmbt_get_dmaxrecs(
674 struct xfs_btree_cur *cur,
675 int level)
676{
677 if (level != cur->bc_nlevels - 1)
678 return cur->bc_mp->m_bmap_dmxr[level != 0];
679 return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
680 level == 0);
681}
682
683STATIC void
684xfs_bmbt_init_key_from_rec(
685 union xfs_btree_key *key,
686 union xfs_btree_rec *rec)
687{
688 key->bmbt.br_startoff =
689 cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
690}
691
692STATIC void
693xfs_bmbt_init_rec_from_key(
694 union xfs_btree_key *key,
695 union xfs_btree_rec *rec)
696{
697 ASSERT(key->bmbt.br_startoff != 0);
698
699 xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
700 0, 0, XFS_EXT_NORM);
701}
702
703STATIC void
704xfs_bmbt_init_rec_from_cur(
705 struct xfs_btree_cur *cur,
706 union xfs_btree_rec *rec)
707{
708 xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
709}
710
711STATIC void
712xfs_bmbt_init_ptr_from_cur(
713 struct xfs_btree_cur *cur,
714 union xfs_btree_ptr *ptr)
715{
716 ptr->l = 0;
717}
718
719STATIC __int64_t
720xfs_bmbt_key_diff(
721 struct xfs_btree_cur *cur,
722 union xfs_btree_key *key)
723{
724 return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
725 cur->bc_rec.b.br_startoff;
726}
727
728#ifdef DEBUG
729STATIC int
730xfs_bmbt_keys_inorder(
731 struct xfs_btree_cur *cur,
732 union xfs_btree_key *k1,
733 union xfs_btree_key *k2)
734{
735 return be64_to_cpu(k1->bmbt.br_startoff) <
736 be64_to_cpu(k2->bmbt.br_startoff);
737}
738
739STATIC int
740xfs_bmbt_recs_inorder(
741 struct xfs_btree_cur *cur,
742 union xfs_btree_rec *r1,
743 union xfs_btree_rec *r2)
744{
745 return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
746 xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
747 xfs_bmbt_disk_get_startoff(&r2->bmbt);
748}
749#endif /* DEBUG */
750
751#ifdef XFS_BTREE_TRACE
752ktrace_t *xfs_bmbt_trace_buf;
753
754STATIC void
755xfs_bmbt_trace_enter(
756 struct xfs_btree_cur *cur,
757 const char *func,
758 char *s,
759 int type,
760 int line,
761 __psunsigned_t a0,
762 __psunsigned_t a1,
763 __psunsigned_t a2,
764 __psunsigned_t a3,
765 __psunsigned_t a4,
766 __psunsigned_t a5,
767 __psunsigned_t a6,
768 __psunsigned_t a7,
769 __psunsigned_t a8,
770 __psunsigned_t a9,
771 __psunsigned_t a10)
772{
773 struct xfs_inode *ip = cur->bc_private.b.ip;
774 int whichfork = cur->bc_private.b.whichfork;
775
776 ktrace_enter(xfs_bmbt_trace_buf,
777 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
778 (void *)func, (void *)s, (void *)ip, (void *)cur,
779 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
780 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
781 (void *)a8, (void *)a9, (void *)a10);
782 ktrace_enter(ip->i_btrace,
783 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
784 (void *)func, (void *)s, (void *)ip, (void *)cur,
785 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
786 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
787 (void *)a8, (void *)a9, (void *)a10);
788}
789
790STATIC void
791xfs_bmbt_trace_cursor(
792 struct xfs_btree_cur *cur,
793 __uint32_t *s0,
794 __uint64_t *l0,
795 __uint64_t *l1)
796{
797 struct xfs_bmbt_rec_host r;
798
799 xfs_bmbt_set_all(&r, &cur->bc_rec.b);
800
801 *s0 = (cur->bc_nlevels << 24) |
802 (cur->bc_private.b.flags << 16) |
803 cur->bc_private.b.allocated;
804 *l0 = r.l0;
805 *l1 = r.l1;
806}
807
808STATIC void
809xfs_bmbt_trace_key(
810 struct xfs_btree_cur *cur,
811 union xfs_btree_key *key,
812 __uint64_t *l0,
813 __uint64_t *l1)
814{
815 *l0 = be64_to_cpu(key->bmbt.br_startoff);
816 *l1 = 0;
817}
818
819STATIC void
820xfs_bmbt_trace_record(
821 struct xfs_btree_cur *cur,
822 union xfs_btree_rec *rec,
823 __uint64_t *l0,
824 __uint64_t *l1,
825 __uint64_t *l2)
826{
827 struct xfs_bmbt_irec irec;
828
829 xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
830 *l0 = irec.br_startoff;
831 *l1 = irec.br_startblock;
832 *l2 = irec.br_blockcount;
833}
834#endif /* XFS_BTREE_TRACE */
835
836static const struct xfs_btree_ops xfs_bmbt_ops = {
837 .rec_len = sizeof(xfs_bmbt_rec_t),
838 .key_len = sizeof(xfs_bmbt_key_t),
839
840 .dup_cursor = xfs_bmbt_dup_cursor,
841 .update_cursor = xfs_bmbt_update_cursor,
842 .alloc_block = xfs_bmbt_alloc_block,
843 .free_block = xfs_bmbt_free_block,
844 .get_maxrecs = xfs_bmbt_get_maxrecs,
845 .get_minrecs = xfs_bmbt_get_minrecs,
846 .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
847 .init_key_from_rec = xfs_bmbt_init_key_from_rec,
848 .init_rec_from_key = xfs_bmbt_init_rec_from_key,
849 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
850 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
851 .key_diff = xfs_bmbt_key_diff,
852
853#ifdef DEBUG
854 .keys_inorder = xfs_bmbt_keys_inorder,
855 .recs_inorder = xfs_bmbt_recs_inorder,
856#endif
857
858#ifdef XFS_BTREE_TRACE
859 .trace_enter = xfs_bmbt_trace_enter,
860 .trace_cursor = xfs_bmbt_trace_cursor,
861 .trace_key = xfs_bmbt_trace_key,
862 .trace_record = xfs_bmbt_trace_record,
863#endif
864};
865
866/*
867 * Allocate a new bmap btree cursor.
868 */
869struct xfs_btree_cur * /* new bmap btree cursor */
870xfs_bmbt_init_cursor(
871 struct xfs_mount *mp, /* file system mount point */
872 struct xfs_trans *tp, /* transaction pointer */
873 struct xfs_inode *ip, /* inode owning the btree */
874 int whichfork) /* data or attr fork */
875{
876 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
877 struct xfs_btree_cur *cur;
878
879 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
880
881 cur->bc_tp = tp;
882 cur->bc_mp = mp;
883 cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
884 cur->bc_btnum = XFS_BTNUM_BMAP;
885 cur->bc_blocklog = mp->m_sb.sb_blocklog;
886
887 cur->bc_ops = &xfs_bmbt_ops;
888 cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
889
890 cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
891 cur->bc_private.b.ip = ip;
892 cur->bc_private.b.firstblock = NULLFSBLOCK;
893 cur->bc_private.b.flist = NULL;
894 cur->bc_private.b.allocated = 0;
895 cur->bc_private.b.flags = 0;
896 cur->bc_private.b.whichfork = whichfork;
897
898 return cur;
899}
900
901/*
902 * Calculate number of records in a bmap btree block.
903 */
904int
905xfs_bmbt_maxrecs(
906 struct xfs_mount *mp,
907 int blocklen,
908 int leaf)
909{
910 blocklen -= XFS_BMBT_BLOCK_LEN(mp);
911
912 if (leaf)
913 return blocklen / sizeof(xfs_bmbt_rec_t);
914 return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
915}
916
917/*
918 * Calculate number of records in a bmap btree inode root.
919 */
920int
921xfs_bmdr_maxrecs(
922 struct xfs_mount *mp,
923 int blocklen,
924 int leaf)
925{
926 blocklen -= sizeof(xfs_bmdr_block_t);
927
928 if (leaf)
929 return blocklen / sizeof(xfs_bmdr_rec_t);
930 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
931}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cd0d4b4bb816..a4555abb6622 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -21,9 +21,10 @@
21#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ 21#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
22 22
23struct xfs_btree_cur; 23struct xfs_btree_cur;
24struct xfs_btree_lblock; 24struct xfs_btree_block;
25struct xfs_mount; 25struct xfs_mount;
26struct xfs_inode; 26struct xfs_inode;
27struct xfs_trans;
27 28
28/* 29/*
29 * Bmap root header, on-disk form only. 30 * Bmap root header, on-disk form only.
@@ -145,71 +146,60 @@ typedef struct xfs_bmbt_key {
145/* btree pointer type */ 146/* btree pointer type */
146typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; 147typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
147 148
148/* btree block header type */ 149/*
149typedef struct xfs_btree_lblock xfs_bmbt_block_t; 150 * Btree block header size depends on a superblock flag.
150 151 *
151#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp)) 152 * (not quite yet, but soon)
152 153 */
153#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize) 154#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
154#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \ 155
155 ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \ 156#define XFS_BMBT_REC_ADDR(mp, block, index) \
156 (cur)->bc_private.b.whichfork)->if_broot_bytes) 157 ((xfs_bmbt_rec_t *) \
157 158 ((char *)(block) + \
158#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \ 159 XFS_BMBT_BLOCK_LEN(mp) + \
159 (((lev) == (cur)->bc_nlevels - 1 ? \ 160 ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
160 XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \ 161
161 xfs_bmdr, (lev) == 0) : \ 162#define XFS_BMBT_KEY_ADDR(mp, block, index) \
162 ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))) 163 ((xfs_bmbt_key_t *) \
163#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \ 164 ((char *)(block) + \
164 (((lev) == (cur)->bc_nlevels - 1 ? \ 165 XFS_BMBT_BLOCK_LEN(mp) + \
165 XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\ 166 ((index) - 1) * sizeof(xfs_bmbt_key_t)))
166 xfs_bmbt, (lev) == 0) : \ 167
167 ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))) 168#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
168 169 ((xfs_bmbt_ptr_t *) \
169#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \ 170 ((char *)(block) + \
170 (((lev) == (cur)->bc_nlevels - 1 ? \ 171 XFS_BMBT_BLOCK_LEN(mp) + \
171 XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\ 172 (maxrecs) * sizeof(xfs_bmbt_key_t) + \
172 xfs_bmdr, (lev) == 0) : \ 173 ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
173 ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) 174
174#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \ 175#define XFS_BMDR_REC_ADDR(block, index) \
175 (((lev) == (cur)->bc_nlevels - 1 ? \ 176 ((xfs_bmdr_rec_t *) \
176 XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\ 177 ((char *)(block) + \
177 xfs_bmbt, (lev) == 0) : \ 178 sizeof(struct xfs_bmdr_block) + \
178 ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) 179 ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
179 180
180#define XFS_BMAP_REC_DADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) 181#define XFS_BMDR_KEY_ADDR(block, index) \
181 182 ((xfs_bmdr_key_t *) \
182#define XFS_BMAP_REC_IADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) 183 ((char *)(block) + \
183 184 sizeof(struct xfs_bmdr_block) + \
184#define XFS_BMAP_KEY_DADDR(bb,i,cur) \ 185 ((index) - 1) * sizeof(xfs_bmdr_key_t)))
185 (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) 186
186 187#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
187#define XFS_BMAP_KEY_IADDR(bb,i,cur) \ 188 ((xfs_bmdr_ptr_t *) \
188 (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) 189 ((char *)(block) + \
189 190 sizeof(struct xfs_bmdr_block) + \
190#define XFS_BMAP_PTR_DADDR(bb,i,cur) \ 191 (maxrecs) * sizeof(xfs_bmdr_key_t) + \
191 (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ 192 ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
192 be16_to_cpu((bb)->bb_level), cur)))
193#define XFS_BMAP_PTR_IADDR(bb,i,cur) \
194 (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
195 be16_to_cpu((bb)->bb_level), cur)))
196 193
197/* 194/*
198 * These are to be used when we know the size of the block and 195 * These are to be used when we know the size of the block and
199 * we don't have a cursor. 196 * we don't have a cursor.
200 */ 197 */
201#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \ 198#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
202 (XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i)) 199 XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
203#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
204 (XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i))
205#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
206 (XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
207
208#define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs)
209#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
210 200
211#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ 201#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
212 (int)(sizeof(xfs_bmbt_block_t) + \ 202 (int)(XFS_BTREE_LBLOCK_LEN + \
213 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) 203 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
214 204
215#define XFS_BMAP_BROOT_SPACE(bb) \ 205#define XFS_BMAP_BROOT_SPACE(bb) \
@@ -223,42 +213,12 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t;
223 */ 213 */
224#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) 214#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
225 215
226#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
227 (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \
228 be16_to_cpu((bb)->bb_level) == level && \
229 be16_to_cpu((bb)->bb_numrecs) > 0 && \
230 be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0])
231
232
233#ifdef __KERNEL__
234
235#if defined(XFS_BMBT_TRACE)
236/*
237 * Trace buffer entry types.
238 */
239#define XFS_BMBT_KTRACE_ARGBI 1
240#define XFS_BMBT_KTRACE_ARGBII 2
241#define XFS_BMBT_KTRACE_ARGFFFI 3
242#define XFS_BMBT_KTRACE_ARGI 4
243#define XFS_BMBT_KTRACE_ARGIFK 5
244#define XFS_BMBT_KTRACE_ARGIFR 6
245#define XFS_BMBT_KTRACE_ARGIK 7
246#define XFS_BMBT_KTRACE_CUR 8
247
248#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
249#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
250extern ktrace_t *xfs_bmbt_trace_buf;
251#endif
252
253/* 216/*
254 * Prototypes for xfs_bmap.c to call. 217 * Prototypes for xfs_bmap.c to call.
255 */ 218 */
256extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int); 219extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
257extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *); 220 struct xfs_btree_block *, int);
258extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *);
259extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); 221extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
260extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur,
261 int, struct xfs_buf **bpp);
262extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); 222extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
263extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); 223extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
264extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); 224extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
@@ -268,22 +228,6 @@ extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
268extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); 228extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
269extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); 229extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
270 230
271extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *);
272extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *);
273extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
274extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int,
275 int);
276extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t,
277 xfs_fsblock_t, xfs_filblks_t, int *);
278extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t,
279 xfs_fsblock_t, xfs_filblks_t, int *);
280
281/*
282 * Give the bmap btree a new root block. Copy the old broot contents
283 * down into a real block and make the broot point to it.
284 */
285extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat);
286
287extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); 231extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
288extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, 232extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
289 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); 233 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
@@ -296,10 +240,15 @@ extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
296extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, 240extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
297 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); 241 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
298 242
299extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int); 243extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
300extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t, 244 xfs_bmdr_block_t *, int);
301 xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t); 245
246extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
247extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
248extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
249
250extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
251 struct xfs_trans *, struct xfs_inode *, int);
302 252
303#endif /* __KERNEL__ */
304 253
305#endif /* __XFS_BMAP_BTREE_H__ */ 254#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index cc593a84c345..7ed59267420d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -34,7 +34,9 @@
34#include "xfs_attr_sf.h" 34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
37#include "xfs_btree.h" 38#include "xfs_btree.h"
39#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
39#include "xfs_error.h" 41#include "xfs_error.h"
40 42
@@ -50,135 +52,33 @@ const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
50 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC 52 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
51}; 53};
52 54
53/*
54 * Checking routine: return maxrecs for the block.
55 */
56STATIC int /* number of records fitting in block */
57xfs_btree_maxrecs(
58 xfs_btree_cur_t *cur, /* btree cursor */
59 xfs_btree_block_t *block) /* generic btree block pointer */
60{
61 switch (cur->bc_btnum) {
62 case XFS_BTNUM_BNO:
63 case XFS_BTNUM_CNT:
64 return (int)XFS_ALLOC_BLOCK_MAXRECS(
65 be16_to_cpu(block->bb_h.bb_level), cur);
66 case XFS_BTNUM_BMAP:
67 return (int)XFS_BMAP_BLOCK_IMAXRECS(
68 be16_to_cpu(block->bb_h.bb_level), cur);
69 case XFS_BTNUM_INO:
70 return (int)XFS_INOBT_BLOCK_MAXRECS(
71 be16_to_cpu(block->bb_h.bb_level), cur);
72 default:
73 ASSERT(0);
74 return 0;
75 }
76}
77
78/*
79 * External routines.
80 */
81
82#ifdef DEBUG
83/*
84 * Debug routine: check that block header is ok.
85 */
86void
87xfs_btree_check_block(
88 xfs_btree_cur_t *cur, /* btree cursor */
89 xfs_btree_block_t *block, /* generic btree block pointer */
90 int level, /* level of the btree block */
91 xfs_buf_t *bp) /* buffer containing block, if any */
92{
93 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
94 xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
95 bp);
96 else
97 xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
98 bp);
99}
100
101/*
102 * Debug routine: check that keys are in the right order.
103 */
104void
105xfs_btree_check_key(
106 xfs_btnum_t btnum, /* btree identifier */
107 void *ak1, /* pointer to left (lower) key */
108 void *ak2) /* pointer to right (higher) key */
109{
110 switch (btnum) {
111 case XFS_BTNUM_BNO: {
112 xfs_alloc_key_t *k1;
113 xfs_alloc_key_t *k2;
114
115 k1 = ak1;
116 k2 = ak2;
117 ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock));
118 break;
119 }
120 case XFS_BTNUM_CNT: {
121 xfs_alloc_key_t *k1;
122 xfs_alloc_key_t *k2;
123
124 k1 = ak1;
125 k2 = ak2;
126 ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) ||
127 (k1->ar_blockcount == k2->ar_blockcount &&
128 be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)));
129 break;
130 }
131 case XFS_BTNUM_BMAP: {
132 xfs_bmbt_key_t *k1;
133 xfs_bmbt_key_t *k2;
134
135 k1 = ak1;
136 k2 = ak2;
137 ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff));
138 break;
139 }
140 case XFS_BTNUM_INO: {
141 xfs_inobt_key_t *k1;
142 xfs_inobt_key_t *k2;
143
144 k1 = ak1;
145 k2 = ak2;
146 ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino));
147 break;
148 }
149 default:
150 ASSERT(0);
151 }
152}
153#endif /* DEBUG */
154 55
155/* 56STATIC int /* error (0 or EFSCORRUPTED) */
156 * Checking routine: check that long form block header is ok.
157 */
158/* ARGSUSED */
159int /* error (0 or EFSCORRUPTED) */
160xfs_btree_check_lblock( 57xfs_btree_check_lblock(
161 xfs_btree_cur_t *cur, /* btree cursor */ 58 struct xfs_btree_cur *cur, /* btree cursor */
162 xfs_btree_lblock_t *block, /* btree long form block pointer */ 59 struct xfs_btree_block *block, /* btree long form block pointer */
163 int level, /* level of the btree block */ 60 int level, /* level of the btree block */
164 xfs_buf_t *bp) /* buffer for block, if any */ 61 struct xfs_buf *bp) /* buffer for block, if any */
165{ 62{
166 int lblock_ok; /* block passes checks */ 63 int lblock_ok; /* block passes checks */
167 xfs_mount_t *mp; /* file system mount point */ 64 struct xfs_mount *mp; /* file system mount point */
168 65
169 mp = cur->bc_mp; 66 mp = cur->bc_mp;
170 lblock_ok = 67 lblock_ok =
171 be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && 68 be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
172 be16_to_cpu(block->bb_level) == level && 69 be16_to_cpu(block->bb_level) == level &&
173 be16_to_cpu(block->bb_numrecs) <= 70 be16_to_cpu(block->bb_numrecs) <=
174 xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && 71 cur->bc_ops->get_maxrecs(cur, level) &&
175 block->bb_leftsib && 72 block->bb_u.l.bb_leftsib &&
176 (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO || 73 (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO ||
177 XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) && 74 XFS_FSB_SANITY_CHECK(mp,
178 block->bb_rightsib && 75 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
179 (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO || 76 block->bb_u.l.bb_rightsib &&
180 XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib))); 77 (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO ||
181 if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, 78 XFS_FSB_SANITY_CHECK(mp,
79 be64_to_cpu(block->bb_u.l.bb_rightsib)));
80 if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
81 XFS_ERRTAG_BTREE_CHECK_LBLOCK,
182 XFS_RANDOM_BTREE_CHECK_LBLOCK))) { 82 XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
183 if (bp) 83 if (bp)
184 xfs_buftrace("LBTREE ERROR", bp); 84 xfs_buftrace("LBTREE ERROR", bp);
@@ -189,98 +89,15 @@ xfs_btree_check_lblock(
189 return 0; 89 return 0;
190} 90}
191 91
192/* 92STATIC int /* error (0 or EFSCORRUPTED) */
193 * Checking routine: check that (long) pointer is ok.
194 */
195int /* error (0 or EFSCORRUPTED) */
196xfs_btree_check_lptr(
197 xfs_btree_cur_t *cur, /* btree cursor */
198 xfs_dfsbno_t ptr, /* btree block disk address */
199 int level) /* btree block level */
200{
201 xfs_mount_t *mp; /* file system mount point */
202
203 mp = cur->bc_mp;
204 XFS_WANT_CORRUPTED_RETURN(
205 level > 0 &&
206 ptr != NULLDFSBNO &&
207 XFS_FSB_SANITY_CHECK(mp, ptr));
208 return 0;
209}
210
211#ifdef DEBUG
212/*
213 * Debug routine: check that records are in the right order.
214 */
215void
216xfs_btree_check_rec(
217 xfs_btnum_t btnum, /* btree identifier */
218 void *ar1, /* pointer to left (lower) record */
219 void *ar2) /* pointer to right (higher) record */
220{
221 switch (btnum) {
222 case XFS_BTNUM_BNO: {
223 xfs_alloc_rec_t *r1;
224 xfs_alloc_rec_t *r2;
225
226 r1 = ar1;
227 r2 = ar2;
228 ASSERT(be32_to_cpu(r1->ar_startblock) +
229 be32_to_cpu(r1->ar_blockcount) <=
230 be32_to_cpu(r2->ar_startblock));
231 break;
232 }
233 case XFS_BTNUM_CNT: {
234 xfs_alloc_rec_t *r1;
235 xfs_alloc_rec_t *r2;
236
237 r1 = ar1;
238 r2 = ar2;
239 ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) ||
240 (r1->ar_blockcount == r2->ar_blockcount &&
241 be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock)));
242 break;
243 }
244 case XFS_BTNUM_BMAP: {
245 xfs_bmbt_rec_t *r1;
246 xfs_bmbt_rec_t *r2;
247
248 r1 = ar1;
249 r2 = ar2;
250 ASSERT(xfs_bmbt_disk_get_startoff(r1) +
251 xfs_bmbt_disk_get_blockcount(r1) <=
252 xfs_bmbt_disk_get_startoff(r2));
253 break;
254 }
255 case XFS_BTNUM_INO: {
256 xfs_inobt_rec_t *r1;
257 xfs_inobt_rec_t *r2;
258
259 r1 = ar1;
260 r2 = ar2;
261 ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <=
262 be32_to_cpu(r2->ir_startino));
263 break;
264 }
265 default:
266 ASSERT(0);
267 }
268}
269#endif /* DEBUG */
270
271/*
272 * Checking routine: check that block header is ok.
273 */
274/* ARGSUSED */
275int /* error (0 or EFSCORRUPTED) */
276xfs_btree_check_sblock( 93xfs_btree_check_sblock(
277 xfs_btree_cur_t *cur, /* btree cursor */ 94 struct xfs_btree_cur *cur, /* btree cursor */
278 xfs_btree_sblock_t *block, /* btree short form block pointer */ 95 struct xfs_btree_block *block, /* btree short form block pointer */
279 int level, /* level of the btree block */ 96 int level, /* level of the btree block */
280 xfs_buf_t *bp) /* buffer containing block */ 97 struct xfs_buf *bp) /* buffer containing block */
281{ 98{
282 xfs_buf_t *agbp; /* buffer for ag. freespace struct */ 99 struct xfs_buf *agbp; /* buffer for ag. freespace struct */
283 xfs_agf_t *agf; /* ag. freespace structure */ 100 struct xfs_agf *agf; /* ag. freespace structure */
284 xfs_agblock_t agflen; /* native ag. freespace length */ 101 xfs_agblock_t agflen; /* native ag. freespace length */
285 int sblock_ok; /* block passes checks */ 102 int sblock_ok; /* block passes checks */
286 103
@@ -291,13 +108,13 @@ xfs_btree_check_sblock(
291 be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && 108 be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
292 be16_to_cpu(block->bb_level) == level && 109 be16_to_cpu(block->bb_level) == level &&
293 be16_to_cpu(block->bb_numrecs) <= 110 be16_to_cpu(block->bb_numrecs) <=
294 xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && 111 cur->bc_ops->get_maxrecs(cur, level) &&
295 (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK || 112 (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK ||
296 be32_to_cpu(block->bb_leftsib) < agflen) && 113 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
297 block->bb_leftsib && 114 block->bb_u.s.bb_leftsib &&
298 (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK || 115 (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK ||
299 be32_to_cpu(block->bb_rightsib) < agflen) && 116 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
300 block->bb_rightsib; 117 block->bb_u.s.bb_rightsib;
301 if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, 118 if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
302 XFS_ERRTAG_BTREE_CHECK_SBLOCK, 119 XFS_ERRTAG_BTREE_CHECK_SBLOCK,
303 XFS_RANDOM_BTREE_CHECK_SBLOCK))) { 120 XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
@@ -311,27 +128,78 @@ xfs_btree_check_sblock(
311} 128}
312 129
313/* 130/*
314 * Checking routine: check that (short) pointer is ok. 131 * Debug routine: check that block header is ok.
132 */
133int
134xfs_btree_check_block(
135 struct xfs_btree_cur *cur, /* btree cursor */
136 struct xfs_btree_block *block, /* generic btree block pointer */
137 int level, /* level of the btree block */
138 struct xfs_buf *bp) /* buffer containing block, if any */
139{
140 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
141 return xfs_btree_check_lblock(cur, block, level, bp);
142 else
143 return xfs_btree_check_sblock(cur, block, level, bp);
144}
145
146/*
147 * Check that (long) pointer is ok.
315 */ 148 */
316int /* error (0 or EFSCORRUPTED) */ 149int /* error (0 or EFSCORRUPTED) */
150xfs_btree_check_lptr(
151 struct xfs_btree_cur *cur, /* btree cursor */
152 xfs_dfsbno_t bno, /* btree block disk address */
153 int level) /* btree block level */
154{
155 XFS_WANT_CORRUPTED_RETURN(
156 level > 0 &&
157 bno != NULLDFSBNO &&
158 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
159 return 0;
160}
161
162#ifdef DEBUG
163/*
164 * Check that (short) pointer is ok.
165 */
166STATIC int /* error (0 or EFSCORRUPTED) */
317xfs_btree_check_sptr( 167xfs_btree_check_sptr(
318 xfs_btree_cur_t *cur, /* btree cursor */ 168 struct xfs_btree_cur *cur, /* btree cursor */
319 xfs_agblock_t ptr, /* btree block disk address */ 169 xfs_agblock_t bno, /* btree block disk address */
320 int level) /* btree block level */ 170 int level) /* btree block level */
321{ 171{
322 xfs_buf_t *agbp; /* buffer for ag. freespace struct */ 172 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
323 xfs_agf_t *agf; /* ag. freespace structure */
324 173
325 agbp = cur->bc_private.a.agbp;
326 agf = XFS_BUF_TO_AGF(agbp);
327 XFS_WANT_CORRUPTED_RETURN( 174 XFS_WANT_CORRUPTED_RETURN(
328 level > 0 && 175 level > 0 &&
329 ptr != NULLAGBLOCK && ptr != 0 && 176 bno != NULLAGBLOCK &&
330 ptr < be32_to_cpu(agf->agf_length)); 177 bno != 0 &&
178 bno < agblocks);
331 return 0; 179 return 0;
332} 180}
333 181
334/* 182/*
183 * Check that block ptr is ok.
184 */
185STATIC int /* error (0 or EFSCORRUPTED) */
186xfs_btree_check_ptr(
187 struct xfs_btree_cur *cur, /* btree cursor */
188 union xfs_btree_ptr *ptr, /* btree block disk address */
189 int index, /* offset from ptr to check */
190 int level) /* btree block level */
191{
192 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
193 return xfs_btree_check_lptr(cur,
194 be64_to_cpu((&ptr->l)[index]), level);
195 } else {
196 return xfs_btree_check_sptr(cur,
197 be32_to_cpu((&ptr->s)[index]), level);
198 }
199}
200#endif
201
202/*
335 * Delete the btree cursor. 203 * Delete the btree cursor.
336 */ 204 */
337void 205void
@@ -387,16 +255,17 @@ xfs_btree_dup_cursor(
387 255
388 tp = cur->bc_tp; 256 tp = cur->bc_tp;
389 mp = cur->bc_mp; 257 mp = cur->bc_mp;
258
390 /* 259 /*
391 * Allocate a new cursor like the old one. 260 * Allocate a new cursor like the old one.
392 */ 261 */
393 new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp, 262 new = cur->bc_ops->dup_cursor(cur);
394 cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip, 263
395 cur->bc_private.b.whichfork);
396 /* 264 /*
397 * Copy the record currently in the cursor. 265 * Copy the record currently in the cursor.
398 */ 266 */
399 new->bc_rec = cur->bc_rec; 267 new->bc_rec = cur->bc_rec;
268
400 /* 269 /*
401 * For each level current, re-get the buffer and copy the ptr value. 270 * For each level current, re-get the buffer and copy the ptr value.
402 */ 271 */
@@ -416,46 +285,174 @@ xfs_btree_dup_cursor(
416 } else 285 } else
417 new->bc_bufs[i] = NULL; 286 new->bc_bufs[i] = NULL;
418 } 287 }
419 /*
420 * For bmap btrees, copy the firstblock, flist, and flags values,
421 * since init cursor doesn't get them.
422 */
423 if (new->bc_btnum == XFS_BTNUM_BMAP) {
424 new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
425 new->bc_private.b.flist = cur->bc_private.b.flist;
426 new->bc_private.b.flags = cur->bc_private.b.flags;
427 }
428 *ncur = new; 288 *ncur = new;
429 return 0; 289 return 0;
430} 290}
431 291
432/* 292/*
293 * XFS btree block layout and addressing:
294 *
295 * There are two types of blocks in the btree: leaf and non-leaf blocks.
296 *
297 * The leaf record start with a header then followed by records containing
298 * the values. A non-leaf block also starts with the same header, and
299 * then first contains lookup keys followed by an equal number of pointers
300 * to the btree blocks at the previous level.
301 *
302 * +--------+-------+-------+-------+-------+-------+-------+
303 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
304 * +--------+-------+-------+-------+-------+-------+-------+
305 *
306 * +--------+-------+-------+-------+-------+-------+-------+
307 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
308 * +--------+-------+-------+-------+-------+-------+-------+
309 *
310 * The header is called struct xfs_btree_block for reasons better left unknown
311 * and comes in different versions for short (32bit) and long (64bit) block
312 * pointers. The record and key structures are defined by the btree instances
313 * and opaque to the btree core. The block pointers are simple disk endian
314 * integers, available in a short (32bit) and long (64bit) variant.
315 *
316 * The helpers below calculate the offset of a given record, key or pointer
317 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
318 * record, key or pointer (xfs_btree_*_addr). Note that all addressing
319 * inside the btree block is done using indices starting at one, not zero!
320 */
321
322/*
323 * Return size of the btree block header for this btree instance.
324 */
325static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
326{
327 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
328 XFS_BTREE_LBLOCK_LEN :
329 XFS_BTREE_SBLOCK_LEN;
330}
331
332/*
333 * Return size of btree block pointers for this btree instance.
334 */
335static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
336{
337 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
338 sizeof(__be64) : sizeof(__be32);
339}
340
341/*
342 * Calculate offset of the n-th record in a btree block.
343 */
344STATIC size_t
345xfs_btree_rec_offset(
346 struct xfs_btree_cur *cur,
347 int n)
348{
349 return xfs_btree_block_len(cur) +
350 (n - 1) * cur->bc_ops->rec_len;
351}
352
353/*
354 * Calculate offset of the n-th key in a btree block.
355 */
356STATIC size_t
357xfs_btree_key_offset(
358 struct xfs_btree_cur *cur,
359 int n)
360{
361 return xfs_btree_block_len(cur) +
362 (n - 1) * cur->bc_ops->key_len;
363}
364
365/*
366 * Calculate offset of the n-th block pointer in a btree block.
367 */
368STATIC size_t
369xfs_btree_ptr_offset(
370 struct xfs_btree_cur *cur,
371 int n,
372 int level)
373{
374 return xfs_btree_block_len(cur) +
375 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
376 (n - 1) * xfs_btree_ptr_len(cur);
377}
378
379/*
380 * Return a pointer to the n-th record in the btree block.
381 */
382STATIC union xfs_btree_rec *
383xfs_btree_rec_addr(
384 struct xfs_btree_cur *cur,
385 int n,
386 struct xfs_btree_block *block)
387{
388 return (union xfs_btree_rec *)
389 ((char *)block + xfs_btree_rec_offset(cur, n));
390}
391
392/*
393 * Return a pointer to the n-th key in the btree block.
394 */
395STATIC union xfs_btree_key *
396xfs_btree_key_addr(
397 struct xfs_btree_cur *cur,
398 int n,
399 struct xfs_btree_block *block)
400{
401 return (union xfs_btree_key *)
402 ((char *)block + xfs_btree_key_offset(cur, n));
403}
404
405/*
406 * Return a pointer to the n-th block pointer in the btree block.
407 */
408STATIC union xfs_btree_ptr *
409xfs_btree_ptr_addr(
410 struct xfs_btree_cur *cur,
411 int n,
412 struct xfs_btree_block *block)
413{
414 int level = xfs_btree_get_level(block);
415
416 ASSERT(block->bb_level != 0);
417
418 return (union xfs_btree_ptr *)
419 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
420}
421
422/*
423 * Get a the root block which is stored in the inode.
424 *
425 * For now this btree implementation assumes the btree root is always
426 * stored in the if_broot field of an inode fork.
427 */
428STATIC struct xfs_btree_block *
429xfs_btree_get_iroot(
430 struct xfs_btree_cur *cur)
431{
432 struct xfs_ifork *ifp;
433
434 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
435 return (struct xfs_btree_block *)ifp->if_broot;
436}
437
438/*
433 * Retrieve the block pointer from the cursor at the given level. 439 * Retrieve the block pointer from the cursor at the given level.
434 * This may be a bmap btree root or from a buffer. 440 * This may be an inode btree root or from a buffer.
435 */ 441 */
436STATIC xfs_btree_block_t * /* generic btree block pointer */ 442STATIC struct xfs_btree_block * /* generic btree block pointer */
437xfs_btree_get_block( 443xfs_btree_get_block(
438 xfs_btree_cur_t *cur, /* btree cursor */ 444 struct xfs_btree_cur *cur, /* btree cursor */
439 int level, /* level in btree */ 445 int level, /* level in btree */
440 xfs_buf_t **bpp) /* buffer containing the block */ 446 struct xfs_buf **bpp) /* buffer containing the block */
441{ 447{
442 xfs_btree_block_t *block; /* return value */ 448 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
443 xfs_buf_t *bp; /* return buffer */ 449 (level == cur->bc_nlevels - 1)) {
444 xfs_ifork_t *ifp; /* inode fork pointer */ 450 *bpp = NULL;
445 int whichfork; /* data or attr fork */ 451 return xfs_btree_get_iroot(cur);
446
447 if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
448 whichfork = cur->bc_private.b.whichfork;
449 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
450 block = (xfs_btree_block_t *)ifp->if_broot;
451 bp = NULL;
452 } else {
453 bp = cur->bc_bufs[level];
454 block = XFS_BUF_TO_BLOCK(bp);
455 } 452 }
456 ASSERT(block != NULL); 453
457 *bpp = bp; 454 *bpp = cur->bc_bufs[level];
458 return block; 455 return XFS_BUF_TO_BLOCK(*bpp);
459} 456}
460 457
461/* 458/*
@@ -505,97 +502,6 @@ xfs_btree_get_bufs(
505} 502}
506 503
507/* 504/*
508 * Allocate a new btree cursor.
509 * The cursor is either for allocation (A) or bmap (B) or inodes (I).
510 */
511xfs_btree_cur_t * /* new btree cursor */
512xfs_btree_init_cursor(
513 xfs_mount_t *mp, /* file system mount point */
514 xfs_trans_t *tp, /* transaction pointer */
515 xfs_buf_t *agbp, /* (A only) buffer for agf structure */
516 /* (I only) buffer for agi structure */
517 xfs_agnumber_t agno, /* (AI only) allocation group number */
518 xfs_btnum_t btnum, /* btree identifier */
519 xfs_inode_t *ip, /* (B only) inode owning the btree */
520 int whichfork) /* (B only) data or attr fork */
521{
522 xfs_agf_t *agf; /* (A) allocation group freespace */
523 xfs_agi_t *agi; /* (I) allocation group inodespace */
524 xfs_btree_cur_t *cur; /* return value */
525 xfs_ifork_t *ifp; /* (I) inode fork pointer */
526 int nlevels=0; /* number of levels in the btree */
527
528 ASSERT(xfs_btree_cur_zone != NULL);
529 /*
530 * Allocate a new cursor.
531 */
532 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
533 /*
534 * Deduce the number of btree levels from the arguments.
535 */
536 switch (btnum) {
537 case XFS_BTNUM_BNO:
538 case XFS_BTNUM_CNT:
539 agf = XFS_BUF_TO_AGF(agbp);
540 nlevels = be32_to_cpu(agf->agf_levels[btnum]);
541 break;
542 case XFS_BTNUM_BMAP:
543 ifp = XFS_IFORK_PTR(ip, whichfork);
544 nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
545 break;
546 case XFS_BTNUM_INO:
547 agi = XFS_BUF_TO_AGI(agbp);
548 nlevels = be32_to_cpu(agi->agi_level);
549 break;
550 default:
551 ASSERT(0);
552 }
553 /*
554 * Fill in the common fields.
555 */
556 cur->bc_tp = tp;
557 cur->bc_mp = mp;
558 cur->bc_nlevels = nlevels;
559 cur->bc_btnum = btnum;
560 cur->bc_blocklog = mp->m_sb.sb_blocklog;
561 /*
562 * Fill in private fields.
563 */
564 switch (btnum) {
565 case XFS_BTNUM_BNO:
566 case XFS_BTNUM_CNT:
567 /*
568 * Allocation btree fields.
569 */
570 cur->bc_private.a.agbp = agbp;
571 cur->bc_private.a.agno = agno;
572 break;
573 case XFS_BTNUM_INO:
574 /*
575 * Inode allocation btree fields.
576 */
577 cur->bc_private.a.agbp = agbp;
578 cur->bc_private.a.agno = agno;
579 break;
580 case XFS_BTNUM_BMAP:
581 /*
582 * Bmap btree fields.
583 */
584 cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
585 cur->bc_private.b.ip = ip;
586 cur->bc_private.b.firstblock = NULLFSBLOCK;
587 cur->bc_private.b.flist = NULL;
588 cur->bc_private.b.allocated = 0;
589 cur->bc_private.b.flags = 0;
590 cur->bc_private.b.whichfork = whichfork;
591 break;
592 default:
593 ASSERT(0);
594 }
595 return cur;
596}
597
598/*
599 * Check for the cursor referring to the last block at the given level. 505 * Check for the cursor referring to the last block at the given level.
600 */ 506 */
601int /* 1=is last block, 0=not last block */ 507int /* 1=is last block, 0=not last block */
@@ -603,12 +509,12 @@ xfs_btree_islastblock(
603 xfs_btree_cur_t *cur, /* btree cursor */ 509 xfs_btree_cur_t *cur, /* btree cursor */
604 int level) /* level to check */ 510 int level) /* level to check */
605{ 511{
606 xfs_btree_block_t *block; /* generic btree block pointer */ 512 struct xfs_btree_block *block; /* generic btree block pointer */
607 xfs_buf_t *bp; /* buffer containing block */ 513 xfs_buf_t *bp; /* buffer containing block */
608 514
609 block = xfs_btree_get_block(cur, level, &bp); 515 block = xfs_btree_get_block(cur, level, &bp);
610 xfs_btree_check_block(cur, block, level, bp); 516 xfs_btree_check_block(cur, block, level, bp);
611 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) 517 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
612 return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO; 518 return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO;
613 else 519 else
614 return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK; 520 return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK;
@@ -618,12 +524,12 @@ xfs_btree_islastblock(
618 * Change the cursor to point to the first record at the given level. 524 * Change the cursor to point to the first record at the given level.
619 * Other levels are unaffected. 525 * Other levels are unaffected.
620 */ 526 */
621int /* success=1, failure=0 */ 527STATIC int /* success=1, failure=0 */
622xfs_btree_firstrec( 528xfs_btree_firstrec(
623 xfs_btree_cur_t *cur, /* btree cursor */ 529 xfs_btree_cur_t *cur, /* btree cursor */
624 int level) /* level to change */ 530 int level) /* level to change */
625{ 531{
626 xfs_btree_block_t *block; /* generic btree block pointer */ 532 struct xfs_btree_block *block; /* generic btree block pointer */
627 xfs_buf_t *bp; /* buffer containing block */ 533 xfs_buf_t *bp; /* buffer containing block */
628 534
629 /* 535 /*
@@ -634,7 +540,7 @@ xfs_btree_firstrec(
634 /* 540 /*
635 * It's empty, there is no such record. 541 * It's empty, there is no such record.
636 */ 542 */
637 if (!block->bb_h.bb_numrecs) 543 if (!block->bb_numrecs)
638 return 0; 544 return 0;
639 /* 545 /*
640 * Set the ptr value to 1, that's the first record/key. 546 * Set the ptr value to 1, that's the first record/key.
@@ -647,12 +553,12 @@ xfs_btree_firstrec(
647 * Change the cursor to point to the last record in the current block 553 * Change the cursor to point to the last record in the current block
648 * at the given level. Other levels are unaffected. 554 * at the given level. Other levels are unaffected.
649 */ 555 */
650int /* success=1, failure=0 */ 556STATIC int /* success=1, failure=0 */
651xfs_btree_lastrec( 557xfs_btree_lastrec(
652 xfs_btree_cur_t *cur, /* btree cursor */ 558 xfs_btree_cur_t *cur, /* btree cursor */
653 int level) /* level to change */ 559 int level) /* level to change */
654{ 560{
655 xfs_btree_block_t *block; /* generic btree block pointer */ 561 struct xfs_btree_block *block; /* generic btree block pointer */
656 xfs_buf_t *bp; /* buffer containing block */ 562 xfs_buf_t *bp; /* buffer containing block */
657 563
658 /* 564 /*
@@ -663,12 +569,12 @@ xfs_btree_lastrec(
663 /* 569 /*
664 * It's empty, there is no such record. 570 * It's empty, there is no such record.
665 */ 571 */
666 if (!block->bb_h.bb_numrecs) 572 if (!block->bb_numrecs)
667 return 0; 573 return 0;
668 /* 574 /*
669 * Set the ptr value to numrecs, that's the last record/key. 575 * Set the ptr value to numrecs, that's the last record/key.
670 */ 576 */
671 cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs); 577 cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
672 return 1; 578 return 1;
673} 579}
674 580
@@ -817,66 +723,84 @@ xfs_btree_reada_bufs(
817 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 723 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
818} 724}
819 725
726STATIC int
727xfs_btree_readahead_lblock(
728 struct xfs_btree_cur *cur,
729 int lr,
730 struct xfs_btree_block *block)
731{
732 int rval = 0;
733 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
734 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
735
736 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
737 xfs_btree_reada_bufl(cur->bc_mp, left, 1);
738 rval++;
739 }
740
741 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
742 xfs_btree_reada_bufl(cur->bc_mp, right, 1);
743 rval++;
744 }
745
746 return rval;
747}
748
749STATIC int
750xfs_btree_readahead_sblock(
751 struct xfs_btree_cur *cur,
752 int lr,
753 struct xfs_btree_block *block)
754{
755 int rval = 0;
756 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
757 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
758
759
760 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
761 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
762 left, 1);
763 rval++;
764 }
765
766 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
767 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
768 right, 1);
769 rval++;
770 }
771
772 return rval;
773}
774
820/* 775/*
821 * Read-ahead btree blocks, at the given level. 776 * Read-ahead btree blocks, at the given level.
822 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 777 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
823 */ 778 */
824int 779STATIC int
825xfs_btree_readahead_core( 780xfs_btree_readahead(
826 xfs_btree_cur_t *cur, /* btree cursor */ 781 struct xfs_btree_cur *cur, /* btree cursor */
827 int lev, /* level in btree */ 782 int lev, /* level in btree */
828 int lr) /* left/right bits */ 783 int lr) /* left/right bits */
829{ 784{
830 xfs_alloc_block_t *a; 785 struct xfs_btree_block *block;
831 xfs_bmbt_block_t *b; 786
832 xfs_inobt_block_t *i; 787 /*
833 int rval = 0; 788 * No readahead needed if we are at the root level and the
789 * btree root is stored in the inode.
790 */
791 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
792 (lev == cur->bc_nlevels - 1))
793 return 0;
794
795 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
796 return 0;
834 797
835 ASSERT(cur->bc_bufs[lev] != NULL);
836 cur->bc_ra[lev] |= lr; 798 cur->bc_ra[lev] |= lr;
837 switch (cur->bc_btnum) { 799 block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
838 case XFS_BTNUM_BNO: 800
839 case XFS_BTNUM_CNT: 801 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
840 a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); 802 return xfs_btree_readahead_lblock(cur, lr, block);
841 if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) { 803 return xfs_btree_readahead_sblock(cur, lr, block);
842 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
843 be32_to_cpu(a->bb_leftsib), 1);
844 rval++;
845 }
846 if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) {
847 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
848 be32_to_cpu(a->bb_rightsib), 1);
849 rval++;
850 }
851 break;
852 case XFS_BTNUM_BMAP:
853 b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
854 if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) {
855 xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1);
856 rval++;
857 }
858 if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) {
859 xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1);
860 rval++;
861 }
862 break;
863 case XFS_BTNUM_INO:
864 i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
865 if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
866 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
867 be32_to_cpu(i->bb_leftsib), 1);
868 rval++;
869 }
870 if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
871 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
872 be32_to_cpu(i->bb_rightsib), 1);
873 rval++;
874 }
875 break;
876 default:
877 ASSERT(0);
878 }
879 return rval;
880} 804}
881 805
882/* 806/*
@@ -889,7 +813,7 @@ xfs_btree_setbuf(
889 int lev, /* level in btree */ 813 int lev, /* level in btree */
890 xfs_buf_t *bp) /* new buffer to set */ 814 xfs_buf_t *bp) /* new buffer to set */
891{ 815{
892 xfs_btree_block_t *b; /* btree block */ 816 struct xfs_btree_block *b; /* btree block */
893 xfs_buf_t *obp; /* old buffer pointer */ 817 xfs_buf_t *obp; /* old buffer pointer */
894 818
895 obp = cur->bc_bufs[lev]; 819 obp = cur->bc_bufs[lev];
@@ -900,7 +824,7 @@ xfs_btree_setbuf(
900 if (!bp) 824 if (!bp)
901 return; 825 return;
902 b = XFS_BUF_TO_BLOCK(bp); 826 b = XFS_BUF_TO_BLOCK(bp);
903 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) { 827 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
904 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) 828 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
905 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; 829 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
906 if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO) 830 if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO)
@@ -912,3 +836,2855 @@ xfs_btree_setbuf(
912 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; 836 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
913 } 837 }
914} 838}
839
840STATIC int
841xfs_btree_ptr_is_null(
842 struct xfs_btree_cur *cur,
843 union xfs_btree_ptr *ptr)
844{
845 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
846 return be64_to_cpu(ptr->l) == NULLFSBLOCK;
847 else
848 return be32_to_cpu(ptr->s) == NULLAGBLOCK;
849}
850
851STATIC void
852xfs_btree_set_ptr_null(
853 struct xfs_btree_cur *cur,
854 union xfs_btree_ptr *ptr)
855{
856 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
857 ptr->l = cpu_to_be64(NULLFSBLOCK);
858 else
859 ptr->s = cpu_to_be32(NULLAGBLOCK);
860}
861
862/*
863 * Get/set/init sibling pointers
864 */
865STATIC void
866xfs_btree_get_sibling(
867 struct xfs_btree_cur *cur,
868 struct xfs_btree_block *block,
869 union xfs_btree_ptr *ptr,
870 int lr)
871{
872 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
873
874 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
875 if (lr == XFS_BB_RIGHTSIB)
876 ptr->l = block->bb_u.l.bb_rightsib;
877 else
878 ptr->l = block->bb_u.l.bb_leftsib;
879 } else {
880 if (lr == XFS_BB_RIGHTSIB)
881 ptr->s = block->bb_u.s.bb_rightsib;
882 else
883 ptr->s = block->bb_u.s.bb_leftsib;
884 }
885}
886
887STATIC void
888xfs_btree_set_sibling(
889 struct xfs_btree_cur *cur,
890 struct xfs_btree_block *block,
891 union xfs_btree_ptr *ptr,
892 int lr)
893{
894 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
895
896 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
897 if (lr == XFS_BB_RIGHTSIB)
898 block->bb_u.l.bb_rightsib = ptr->l;
899 else
900 block->bb_u.l.bb_leftsib = ptr->l;
901 } else {
902 if (lr == XFS_BB_RIGHTSIB)
903 block->bb_u.s.bb_rightsib = ptr->s;
904 else
905 block->bb_u.s.bb_leftsib = ptr->s;
906 }
907}
908
909STATIC void
910xfs_btree_init_block(
911 struct xfs_btree_cur *cur,
912 int level,
913 int numrecs,
914 struct xfs_btree_block *new) /* new block */
915{
916 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
917 new->bb_level = cpu_to_be16(level);
918 new->bb_numrecs = cpu_to_be16(numrecs);
919
920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
921 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
922 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
923 } else {
924 new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
925 new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
926 }
927}
928
929/*
930 * Return true if ptr is the last record in the btree and
931 * we need to track updateѕ to this record. The decision
932 * will be further refined in the update_lastrec method.
933 */
934STATIC int
935xfs_btree_is_lastrec(
936 struct xfs_btree_cur *cur,
937 struct xfs_btree_block *block,
938 int level)
939{
940 union xfs_btree_ptr ptr;
941
942 if (level > 0)
943 return 0;
944 if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
945 return 0;
946
947 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
948 if (!xfs_btree_ptr_is_null(cur, &ptr))
949 return 0;
950 return 1;
951}
952
953STATIC void
954xfs_btree_buf_to_ptr(
955 struct xfs_btree_cur *cur,
956 struct xfs_buf *bp,
957 union xfs_btree_ptr *ptr)
958{
959 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
960 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
961 XFS_BUF_ADDR(bp)));
962 else {
963 ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
964 XFS_BUF_ADDR(bp)));
965 }
966}
967
968STATIC xfs_daddr_t
969xfs_btree_ptr_to_daddr(
970 struct xfs_btree_cur *cur,
971 union xfs_btree_ptr *ptr)
972{
973 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
974 ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
975
976 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
977 } else {
978 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
979 ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK);
980
981 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
982 be32_to_cpu(ptr->s));
983 }
984}
985
986STATIC void
987xfs_btree_set_refs(
988 struct xfs_btree_cur *cur,
989 struct xfs_buf *bp)
990{
991 switch (cur->bc_btnum) {
992 case XFS_BTNUM_BNO:
993 case XFS_BTNUM_CNT:
994 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
995 break;
996 case XFS_BTNUM_INO:
997 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
998 break;
999 case XFS_BTNUM_BMAP:
1000 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
1001 break;
1002 default:
1003 ASSERT(0);
1004 }
1005}
1006
1007STATIC int
1008xfs_btree_get_buf_block(
1009 struct xfs_btree_cur *cur,
1010 union xfs_btree_ptr *ptr,
1011 int flags,
1012 struct xfs_btree_block **block,
1013 struct xfs_buf **bpp)
1014{
1015 struct xfs_mount *mp = cur->bc_mp;
1016 xfs_daddr_t d;
1017
1018 /* need to sort out how callers deal with failures first */
1019 ASSERT(!(flags & XFS_BUF_TRYLOCK));
1020
1021 d = xfs_btree_ptr_to_daddr(cur, ptr);
1022 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1023 mp->m_bsize, flags);
1024
1025 ASSERT(*bpp);
1026 ASSERT(!XFS_BUF_GETERROR(*bpp));
1027
1028 *block = XFS_BUF_TO_BLOCK(*bpp);
1029 return 0;
1030}
1031
1032/*
1033 * Read in the buffer at the given ptr and return the buffer and
1034 * the block pointer within the buffer.
1035 */
1036STATIC int
1037xfs_btree_read_buf_block(
1038 struct xfs_btree_cur *cur,
1039 union xfs_btree_ptr *ptr,
1040 int level,
1041 int flags,
1042 struct xfs_btree_block **block,
1043 struct xfs_buf **bpp)
1044{
1045 struct xfs_mount *mp = cur->bc_mp;
1046 xfs_daddr_t d;
1047 int error;
1048
1049 /* need to sort out how callers deal with failures first */
1050 ASSERT(!(flags & XFS_BUF_TRYLOCK));
1051
1052 d = xfs_btree_ptr_to_daddr(cur, ptr);
1053 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1054 mp->m_bsize, flags, bpp);
1055 if (error)
1056 return error;
1057
1058 ASSERT(*bpp != NULL);
1059 ASSERT(!XFS_BUF_GETERROR(*bpp));
1060
1061 xfs_btree_set_refs(cur, *bpp);
1062 *block = XFS_BUF_TO_BLOCK(*bpp);
1063
1064 error = xfs_btree_check_block(cur, *block, level, *bpp);
1065 if (error)
1066 xfs_trans_brelse(cur->bc_tp, *bpp);
1067 return error;
1068}
1069
1070/*
1071 * Copy keys from one btree block to another.
1072 */
1073STATIC void
1074xfs_btree_copy_keys(
1075 struct xfs_btree_cur *cur,
1076 union xfs_btree_key *dst_key,
1077 union xfs_btree_key *src_key,
1078 int numkeys)
1079{
1080 ASSERT(numkeys >= 0);
1081 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1082}
1083
1084/*
1085 * Copy records from one btree block to another.
1086 */
1087STATIC void
1088xfs_btree_copy_recs(
1089 struct xfs_btree_cur *cur,
1090 union xfs_btree_rec *dst_rec,
1091 union xfs_btree_rec *src_rec,
1092 int numrecs)
1093{
1094 ASSERT(numrecs >= 0);
1095 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1096}
1097
1098/*
1099 * Copy block pointers from one btree block to another.
1100 */
1101STATIC void
1102xfs_btree_copy_ptrs(
1103 struct xfs_btree_cur *cur,
1104 union xfs_btree_ptr *dst_ptr,
1105 union xfs_btree_ptr *src_ptr,
1106 int numptrs)
1107{
1108 ASSERT(numptrs >= 0);
1109 memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1110}
1111
1112/*
1113 * Shift keys one index left/right inside a single btree block.
1114 */
1115STATIC void
1116xfs_btree_shift_keys(
1117 struct xfs_btree_cur *cur,
1118 union xfs_btree_key *key,
1119 int dir,
1120 int numkeys)
1121{
1122 char *dst_key;
1123
1124 ASSERT(numkeys >= 0);
1125 ASSERT(dir == 1 || dir == -1);
1126
1127 dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1128 memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1129}
1130
1131/*
1132 * Shift records one index left/right inside a single btree block.
1133 */
1134STATIC void
1135xfs_btree_shift_recs(
1136 struct xfs_btree_cur *cur,
1137 union xfs_btree_rec *rec,
1138 int dir,
1139 int numrecs)
1140{
1141 char *dst_rec;
1142
1143 ASSERT(numrecs >= 0);
1144 ASSERT(dir == 1 || dir == -1);
1145
1146 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1147 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1148}
1149
1150/*
1151 * Shift block pointers one index left/right inside a single btree block.
1152 */
1153STATIC void
1154xfs_btree_shift_ptrs(
1155 struct xfs_btree_cur *cur,
1156 union xfs_btree_ptr *ptr,
1157 int dir,
1158 int numptrs)
1159{
1160 char *dst_ptr;
1161
1162 ASSERT(numptrs >= 0);
1163 ASSERT(dir == 1 || dir == -1);
1164
1165 dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1166 memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1167}
1168
1169/*
1170 * Log key values from the btree block.
1171 */
1172STATIC void
1173xfs_btree_log_keys(
1174 struct xfs_btree_cur *cur,
1175 struct xfs_buf *bp,
1176 int first,
1177 int last)
1178{
1179 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1180 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1181
1182 if (bp) {
1183 xfs_trans_log_buf(cur->bc_tp, bp,
1184 xfs_btree_key_offset(cur, first),
1185 xfs_btree_key_offset(cur, last + 1) - 1);
1186 } else {
1187 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1188 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1189 }
1190
1191 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1192}
1193
1194/*
1195 * Log record values from the btree block.
1196 */
1197void
1198xfs_btree_log_recs(
1199 struct xfs_btree_cur *cur,
1200 struct xfs_buf *bp,
1201 int first,
1202 int last)
1203{
1204 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1205 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1206
1207 xfs_trans_log_buf(cur->bc_tp, bp,
1208 xfs_btree_rec_offset(cur, first),
1209 xfs_btree_rec_offset(cur, last + 1) - 1);
1210
1211 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1212}
1213
1214/*
1215 * Log block pointer fields from a btree block (nonleaf).
1216 */
1217STATIC void
1218xfs_btree_log_ptrs(
1219 struct xfs_btree_cur *cur, /* btree cursor */
1220 struct xfs_buf *bp, /* buffer containing btree block */
1221 int first, /* index of first pointer to log */
1222 int last) /* index of last pointer to log */
1223{
1224 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1225 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1226
1227 if (bp) {
1228 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
1229 int level = xfs_btree_get_level(block);
1230
1231 xfs_trans_log_buf(cur->bc_tp, bp,
1232 xfs_btree_ptr_offset(cur, first, level),
1233 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1234 } else {
1235 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1236 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1237 }
1238
1239 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1240}
1241
1242/*
1243 * Log fields from a btree block header.
1244 */
1245void
1246xfs_btree_log_block(
1247 struct xfs_btree_cur *cur, /* btree cursor */
1248 struct xfs_buf *bp, /* buffer containing btree block */
1249 int fields) /* mask of fields: XFS_BB_... */
1250{
1251 int first; /* first byte offset logged */
1252 int last; /* last byte offset logged */
1253 static const short soffsets[] = { /* table of offsets (short) */
1254 offsetof(struct xfs_btree_block, bb_magic),
1255 offsetof(struct xfs_btree_block, bb_level),
1256 offsetof(struct xfs_btree_block, bb_numrecs),
1257 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1258 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1259 XFS_BTREE_SBLOCK_LEN
1260 };
1261 static const short loffsets[] = { /* table of offsets (long) */
1262 offsetof(struct xfs_btree_block, bb_magic),
1263 offsetof(struct xfs_btree_block, bb_level),
1264 offsetof(struct xfs_btree_block, bb_numrecs),
1265 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1266 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1267 XFS_BTREE_LBLOCK_LEN
1268 };
1269
1270 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1271 XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1272
1273 if (bp) {
1274 xfs_btree_offsets(fields,
1275 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1276 loffsets : soffsets,
1277 XFS_BB_NUM_BITS, &first, &last);
1278 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1279 } else {
1280 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1281 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1282 }
1283
1284 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1285}
1286
1287/*
1288 * Increment cursor by one record at the level.
1289 * For nonzero levels the leaf-ward information is untouched.
1290 */
1291int /* error */
1292xfs_btree_increment(
1293 struct xfs_btree_cur *cur,
1294 int level,
1295 int *stat) /* success/failure */
1296{
1297 struct xfs_btree_block *block;
1298 union xfs_btree_ptr ptr;
1299 struct xfs_buf *bp;
1300 int error; /* error return value */
1301 int lev;
1302
1303 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1304 XFS_BTREE_TRACE_ARGI(cur, level);
1305
1306 ASSERT(level < cur->bc_nlevels);
1307
1308 /* Read-ahead to the right at this level. */
1309 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1310
1311 /* Get a pointer to the btree block. */
1312 block = xfs_btree_get_block(cur, level, &bp);
1313
1314#ifdef DEBUG
1315 error = xfs_btree_check_block(cur, block, level, bp);
1316 if (error)
1317 goto error0;
1318#endif
1319
1320 /* We're done if we remain in the block after the increment. */
1321 if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1322 goto out1;
1323
1324 /* Fail if we just went off the right edge of the tree. */
1325 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1326 if (xfs_btree_ptr_is_null(cur, &ptr))
1327 goto out0;
1328
1329 XFS_BTREE_STATS_INC(cur, increment);
1330
1331 /*
1332 * March up the tree incrementing pointers.
1333 * Stop when we don't go off the right edge of a block.
1334 */
1335 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1336 block = xfs_btree_get_block(cur, lev, &bp);
1337
1338#ifdef DEBUG
1339 error = xfs_btree_check_block(cur, block, lev, bp);
1340 if (error)
1341 goto error0;
1342#endif
1343
1344 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1345 break;
1346
1347 /* Read-ahead the right block for the next loop. */
1348 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1349 }
1350
1351 /*
1352 * If we went off the root then we are either seriously
1353 * confused or have the tree root in an inode.
1354 */
1355 if (lev == cur->bc_nlevels) {
1356 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1357 goto out0;
1358 ASSERT(0);
1359 error = EFSCORRUPTED;
1360 goto error0;
1361 }
1362 ASSERT(lev < cur->bc_nlevels);
1363
1364 /*
1365 * Now walk back down the tree, fixing up the cursor's buffer
1366 * pointers and key numbers.
1367 */
1368 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1369 union xfs_btree_ptr *ptrp;
1370
1371 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1372 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1373 0, &block, &bp);
1374 if (error)
1375 goto error0;
1376
1377 xfs_btree_setbuf(cur, lev, bp);
1378 cur->bc_ptrs[lev] = 1;
1379 }
1380out1:
1381 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1382 *stat = 1;
1383 return 0;
1384
1385out0:
1386 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1387 *stat = 0;
1388 return 0;
1389
1390error0:
1391 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1392 return error;
1393}
1394
1395/*
1396 * Decrement cursor by one record at the level.
1397 * For nonzero levels the leaf-ward information is untouched.
1398 */
1399int /* error */
1400xfs_btree_decrement(
1401 struct xfs_btree_cur *cur,
1402 int level,
1403 int *stat) /* success/failure */
1404{
1405 struct xfs_btree_block *block;
1406 xfs_buf_t *bp;
1407 int error; /* error return value */
1408 int lev;
1409 union xfs_btree_ptr ptr;
1410
1411 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1412 XFS_BTREE_TRACE_ARGI(cur, level);
1413
1414 ASSERT(level < cur->bc_nlevels);
1415
1416 /* Read-ahead to the left at this level. */
1417 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1418
1419 /* We're done if we remain in the block after the decrement. */
1420 if (--cur->bc_ptrs[level] > 0)
1421 goto out1;
1422
1423 /* Get a pointer to the btree block. */
1424 block = xfs_btree_get_block(cur, level, &bp);
1425
1426#ifdef DEBUG
1427 error = xfs_btree_check_block(cur, block, level, bp);
1428 if (error)
1429 goto error0;
1430#endif
1431
1432 /* Fail if we just went off the left edge of the tree. */
1433 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1434 if (xfs_btree_ptr_is_null(cur, &ptr))
1435 goto out0;
1436
1437 XFS_BTREE_STATS_INC(cur, decrement);
1438
1439 /*
1440 * March up the tree decrementing pointers.
1441 * Stop when we don't go off the left edge of a block.
1442 */
1443 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1444 if (--cur->bc_ptrs[lev] > 0)
1445 break;
1446 /* Read-ahead the left block for the next loop. */
1447 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1448 }
1449
1450 /*
1451 * If we went off the root then we are seriously confused.
1452 * or the root of the tree is in an inode.
1453 */
1454 if (lev == cur->bc_nlevels) {
1455 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1456 goto out0;
1457 ASSERT(0);
1458 error = EFSCORRUPTED;
1459 goto error0;
1460 }
1461 ASSERT(lev < cur->bc_nlevels);
1462
1463 /*
1464 * Now walk back down the tree, fixing up the cursor's buffer
1465 * pointers and key numbers.
1466 */
1467 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1468 union xfs_btree_ptr *ptrp;
1469
1470 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1471 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1472 0, &block, &bp);
1473 if (error)
1474 goto error0;
1475 xfs_btree_setbuf(cur, lev, bp);
1476 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1477 }
1478out1:
1479 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1480 *stat = 1;
1481 return 0;
1482
1483out0:
1484 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1485 *stat = 0;
1486 return 0;
1487
1488error0:
1489 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1490 return error;
1491}
1492
1493STATIC int
1494xfs_btree_lookup_get_block(
1495 struct xfs_btree_cur *cur, /* btree cursor */
1496 int level, /* level in the btree */
1497 union xfs_btree_ptr *pp, /* ptr to btree block */
1498 struct xfs_btree_block **blkp) /* return btree block */
1499{
1500 struct xfs_buf *bp; /* buffer pointer for btree block */
1501 int error = 0;
1502
1503 /* special case the root block if in an inode */
1504 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1505 (level == cur->bc_nlevels - 1)) {
1506 *blkp = xfs_btree_get_iroot(cur);
1507 return 0;
1508 }
1509
1510 /*
1511 * If the old buffer at this level for the disk address we are
1512 * looking for re-use it.
1513 *
1514 * Otherwise throw it away and get a new one.
1515 */
1516 bp = cur->bc_bufs[level];
1517 if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1518 *blkp = XFS_BUF_TO_BLOCK(bp);
1519 return 0;
1520 }
1521
1522 error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
1523 if (error)
1524 return error;
1525
1526 xfs_btree_setbuf(cur, level, bp);
1527 return 0;
1528}
1529
1530/*
1531 * Get current search key. For level 0 we don't actually have a key
1532 * structure so we make one up from the record. For all other levels
1533 * we just return the right key.
1534 */
1535STATIC union xfs_btree_key *
1536xfs_lookup_get_search_key(
1537 struct xfs_btree_cur *cur,
1538 int level,
1539 int keyno,
1540 struct xfs_btree_block *block,
1541 union xfs_btree_key *kp)
1542{
1543 if (level == 0) {
1544 cur->bc_ops->init_key_from_rec(kp,
1545 xfs_btree_rec_addr(cur, keyno, block));
1546 return kp;
1547 }
1548
1549 return xfs_btree_key_addr(cur, keyno, block);
1550}
1551
1552/*
1553 * Lookup the record. The cursor is made to point to it, based on dir.
1554 * Return 0 if can't find any such record, 1 for success.
1555 */
1556int /* error */
1557xfs_btree_lookup(
1558 struct xfs_btree_cur *cur, /* btree cursor */
1559 xfs_lookup_t dir, /* <=, ==, or >= */
1560 int *stat) /* success/failure */
1561{
1562 struct xfs_btree_block *block; /* current btree block */
1563 __int64_t diff; /* difference for the current key */
1564 int error; /* error return value */
1565 int keyno; /* current key number */
1566 int level; /* level in the btree */
1567 union xfs_btree_ptr *pp; /* ptr to btree block */
1568 union xfs_btree_ptr ptr; /* ptr to btree block */
1569
1570 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1571 XFS_BTREE_TRACE_ARGI(cur, dir);
1572
1573 XFS_BTREE_STATS_INC(cur, lookup);
1574
1575 block = NULL;
1576 keyno = 0;
1577
1578 /* initialise start pointer from cursor */
1579 cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1580 pp = &ptr;
1581
1582 /*
1583 * Iterate over each level in the btree, starting at the root.
1584 * For each level above the leaves, find the key we need, based
1585 * on the lookup record, then follow the corresponding block
1586 * pointer down to the next level.
1587 */
1588 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1589 /* Get the block we need to do the lookup on. */
1590 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1591 if (error)
1592 goto error0;
1593
1594 if (diff == 0) {
1595 /*
1596 * If we already had a key match at a higher level, we
1597 * know we need to use the first entry in this block.
1598 */
1599 keyno = 1;
1600 } else {
1601 /* Otherwise search this block. Do a binary search. */
1602
1603 int high; /* high entry number */
1604 int low; /* low entry number */
1605
1606 /* Set low and high entry numbers, 1-based. */
1607 low = 1;
1608 high = xfs_btree_get_numrecs(block);
1609 if (!high) {
1610 /* Block is empty, must be an empty leaf. */
1611 ASSERT(level == 0 && cur->bc_nlevels == 1);
1612
1613 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1614 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1615 *stat = 0;
1616 return 0;
1617 }
1618
1619 /* Binary search the block. */
1620 while (low <= high) {
1621 union xfs_btree_key key;
1622 union xfs_btree_key *kp;
1623
1624 XFS_BTREE_STATS_INC(cur, compare);
1625
1626 /* keyno is average of low and high. */
1627 keyno = (low + high) >> 1;
1628
1629 /* Get current search key */
1630 kp = xfs_lookup_get_search_key(cur, level,
1631 keyno, block, &key);
1632
1633 /*
1634 * Compute difference to get next direction:
1635 * - less than, move right
1636 * - greater than, move left
1637 * - equal, we're done
1638 */
1639 diff = cur->bc_ops->key_diff(cur, kp);
1640 if (diff < 0)
1641 low = keyno + 1;
1642 else if (diff > 0)
1643 high = keyno - 1;
1644 else
1645 break;
1646 }
1647 }
1648
1649 /*
1650 * If there are more levels, set up for the next level
1651 * by getting the block number and filling in the cursor.
1652 */
1653 if (level > 0) {
1654 /*
1655 * If we moved left, need the previous key number,
1656 * unless there isn't one.
1657 */
1658 if (diff > 0 && --keyno < 1)
1659 keyno = 1;
1660 pp = xfs_btree_ptr_addr(cur, keyno, block);
1661
1662#ifdef DEBUG
1663 error = xfs_btree_check_ptr(cur, pp, 0, level);
1664 if (error)
1665 goto error0;
1666#endif
1667 cur->bc_ptrs[level] = keyno;
1668 }
1669 }
1670
1671 /* Done with the search. See if we need to adjust the results. */
1672 if (dir != XFS_LOOKUP_LE && diff < 0) {
1673 keyno++;
1674 /*
1675 * If ge search and we went off the end of the block, but it's
1676 * not the last block, we're in the wrong block.
1677 */
1678 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1679 if (dir == XFS_LOOKUP_GE &&
1680 keyno > xfs_btree_get_numrecs(block) &&
1681 !xfs_btree_ptr_is_null(cur, &ptr)) {
1682 int i;
1683
1684 cur->bc_ptrs[0] = keyno;
1685 error = xfs_btree_increment(cur, 0, &i);
1686 if (error)
1687 goto error0;
1688 XFS_WANT_CORRUPTED_RETURN(i == 1);
1689 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1690 *stat = 1;
1691 return 0;
1692 }
1693 } else if (dir == XFS_LOOKUP_LE && diff > 0)
1694 keyno--;
1695 cur->bc_ptrs[0] = keyno;
1696
1697 /* Return if we succeeded or not. */
1698 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1699 *stat = 0;
1700 else if (dir != XFS_LOOKUP_EQ || diff == 0)
1701 *stat = 1;
1702 else
1703 *stat = 0;
1704 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1705 return 0;
1706
1707error0:
1708 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1709 return error;
1710}
1711
1712/*
1713 * Update keys at all levels from here to the root along the cursor's path.
1714 */
1715STATIC int
1716xfs_btree_updkey(
1717 struct xfs_btree_cur *cur,
1718 union xfs_btree_key *keyp,
1719 int level)
1720{
1721 struct xfs_btree_block *block;
1722 struct xfs_buf *bp;
1723 union xfs_btree_key *kp;
1724 int ptr;
1725
1726 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1727 XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1728
1729 ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1730
1731 /*
1732 * Go up the tree from this level toward the root.
1733 * At each level, update the key value to the value input.
1734 * Stop when we reach a level where the cursor isn't pointing
1735 * at the first entry in the block.
1736 */
1737 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1738#ifdef DEBUG
1739 int error;
1740#endif
1741 block = xfs_btree_get_block(cur, level, &bp);
1742#ifdef DEBUG
1743 error = xfs_btree_check_block(cur, block, level, bp);
1744 if (error) {
1745 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1746 return error;
1747 }
1748#endif
1749 ptr = cur->bc_ptrs[level];
1750 kp = xfs_btree_key_addr(cur, ptr, block);
1751 xfs_btree_copy_keys(cur, kp, keyp, 1);
1752 xfs_btree_log_keys(cur, bp, ptr, ptr);
1753 }
1754
1755 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1756 return 0;
1757}
1758
1759/*
1760 * Update the record referred to by cur to the value in the
1761 * given record. This either works (return 0) or gets an
1762 * EFSCORRUPTED error.
1763 */
1764int
1765xfs_btree_update(
1766 struct xfs_btree_cur *cur,
1767 union xfs_btree_rec *rec)
1768{
1769 struct xfs_btree_block *block;
1770 struct xfs_buf *bp;
1771 int error;
1772 int ptr;
1773 union xfs_btree_rec *rp;
1774
1775 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1776 XFS_BTREE_TRACE_ARGR(cur, rec);
1777
1778 /* Pick up the current block. */
1779 block = xfs_btree_get_block(cur, 0, &bp);
1780
1781#ifdef DEBUG
1782 error = xfs_btree_check_block(cur, block, 0, bp);
1783 if (error)
1784 goto error0;
1785#endif
1786 /* Get the address of the rec to be updated. */
1787 ptr = cur->bc_ptrs[0];
1788 rp = xfs_btree_rec_addr(cur, ptr, block);
1789
1790 /* Fill in the new contents and log them. */
1791 xfs_btree_copy_recs(cur, rp, rec, 1);
1792 xfs_btree_log_recs(cur, bp, ptr, ptr);
1793
1794 /*
1795 * If we are tracking the last record in the tree and
1796 * we are at the far right edge of the tree, update it.
1797 */
1798 if (xfs_btree_is_lastrec(cur, block, 0)) {
1799 cur->bc_ops->update_lastrec(cur, block, rec,
1800 ptr, LASTREC_UPDATE);
1801 }
1802
1803 /* Updating first rec in leaf. Pass new key value up to our parent. */
1804 if (ptr == 1) {
1805 union xfs_btree_key key;
1806
1807 cur->bc_ops->init_key_from_rec(&key, rec);
1808 error = xfs_btree_updkey(cur, &key, 1);
1809 if (error)
1810 goto error0;
1811 }
1812
1813 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1814 return 0;
1815
1816error0:
1817 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1818 return error;
1819}
1820
1821/*
1822 * Move 1 record left from cur/level if possible.
1823 * Update cur to reflect the new path.
1824 */
1825STATIC int /* error */
1826xfs_btree_lshift(
1827 struct xfs_btree_cur *cur,
1828 int level,
1829 int *stat) /* success/failure */
1830{
1831 union xfs_btree_key key; /* btree key */
1832 struct xfs_buf *lbp; /* left buffer pointer */
1833 struct xfs_btree_block *left; /* left btree block */
1834 int lrecs; /* left record count */
1835 struct xfs_buf *rbp; /* right buffer pointer */
1836 struct xfs_btree_block *right; /* right btree block */
1837 int rrecs; /* right record count */
1838 union xfs_btree_ptr lptr; /* left btree pointer */
1839 union xfs_btree_key *rkp = NULL; /* right btree key */
1840 union xfs_btree_ptr *rpp = NULL; /* right address pointer */
1841 union xfs_btree_rec *rrp = NULL; /* right record pointer */
1842 int error; /* error return value */
1843
1844 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1845 XFS_BTREE_TRACE_ARGI(cur, level);
1846
1847 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1848 level == cur->bc_nlevels - 1)
1849 goto out0;
1850
1851 /* Set up variables for this block as "right". */
1852 right = xfs_btree_get_block(cur, level, &rbp);
1853
1854#ifdef DEBUG
1855 error = xfs_btree_check_block(cur, right, level, rbp);
1856 if (error)
1857 goto error0;
1858#endif
1859
1860 /* If we've got no left sibling then we can't shift an entry left. */
1861 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
1862 if (xfs_btree_ptr_is_null(cur, &lptr))
1863 goto out0;
1864
1865 /*
1866 * If the cursor entry is the one that would be moved, don't
1867 * do it... it's too complicated.
1868 */
1869 if (cur->bc_ptrs[level] <= 1)
1870 goto out0;
1871
1872 /* Set up the left neighbor as "left". */
1873 error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
1874 if (error)
1875 goto error0;
1876
1877 /* If it's full, it can't take another entry. */
1878 lrecs = xfs_btree_get_numrecs(left);
1879 if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
1880 goto out0;
1881
1882 rrecs = xfs_btree_get_numrecs(right);
1883
1884 /*
1885 * We add one entry to the left side and remove one for the right side.
1886 * Accout for it here, the changes will be updated on disk and logged
1887 * later.
1888 */
1889 lrecs++;
1890 rrecs--;
1891
1892 XFS_BTREE_STATS_INC(cur, lshift);
1893 XFS_BTREE_STATS_ADD(cur, moves, 1);
1894
1895 /*
1896 * If non-leaf, copy a key and a ptr to the left block.
1897 * Log the changes to the left block.
1898 */
1899 if (level > 0) {
1900 /* It's a non-leaf. Move keys and pointers. */
1901 union xfs_btree_key *lkp; /* left btree key */
1902 union xfs_btree_ptr *lpp; /* left address pointer */
1903
1904 lkp = xfs_btree_key_addr(cur, lrecs, left);
1905 rkp = xfs_btree_key_addr(cur, 1, right);
1906
1907 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
1908 rpp = xfs_btree_ptr_addr(cur, 1, right);
1909#ifdef DEBUG
1910 error = xfs_btree_check_ptr(cur, rpp, 0, level);
1911 if (error)
1912 goto error0;
1913#endif
1914 xfs_btree_copy_keys(cur, lkp, rkp, 1);
1915 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
1916
1917 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
1918 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
1919
1920 ASSERT(cur->bc_ops->keys_inorder(cur,
1921 xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
1922 } else {
1923 /* It's a leaf. Move records. */
1924 union xfs_btree_rec *lrp; /* left record pointer */
1925
1926 lrp = xfs_btree_rec_addr(cur, lrecs, left);
1927 rrp = xfs_btree_rec_addr(cur, 1, right);
1928
1929 xfs_btree_copy_recs(cur, lrp, rrp, 1);
1930 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
1931
1932 ASSERT(cur->bc_ops->recs_inorder(cur,
1933 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
1934 }
1935
1936 xfs_btree_set_numrecs(left, lrecs);
1937 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
1938
1939 xfs_btree_set_numrecs(right, rrecs);
1940 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
1941
1942 /*
1943 * Slide the contents of right down one entry.
1944 */
1945 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
1946 if (level > 0) {
1947 /* It's a nonleaf. operate on keys and ptrs */
1948#ifdef DEBUG
1949 int i; /* loop index */
1950
1951 for (i = 0; i < rrecs; i++) {
1952 error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
1953 if (error)
1954 goto error0;
1955 }
1956#endif
1957 xfs_btree_shift_keys(cur,
1958 xfs_btree_key_addr(cur, 2, right),
1959 -1, rrecs);
1960 xfs_btree_shift_ptrs(cur,
1961 xfs_btree_ptr_addr(cur, 2, right),
1962 -1, rrecs);
1963
1964 xfs_btree_log_keys(cur, rbp, 1, rrecs);
1965 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
1966 } else {
1967 /* It's a leaf. operate on records */
1968 xfs_btree_shift_recs(cur,
1969 xfs_btree_rec_addr(cur, 2, right),
1970 -1, rrecs);
1971 xfs_btree_log_recs(cur, rbp, 1, rrecs);
1972
1973 /*
1974 * If it's the first record in the block, we'll need a key
1975 * structure to pass up to the next level (updkey).
1976 */
1977 cur->bc_ops->init_key_from_rec(&key,
1978 xfs_btree_rec_addr(cur, 1, right));
1979 rkp = &key;
1980 }
1981
1982 /* Update the parent key values of right. */
1983 error = xfs_btree_updkey(cur, rkp, level + 1);
1984 if (error)
1985 goto error0;
1986
1987 /* Slide the cursor value left one. */
1988 cur->bc_ptrs[level]--;
1989
1990 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1991 *stat = 1;
1992 return 0;
1993
1994out0:
1995 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1996 *stat = 0;
1997 return 0;
1998
1999error0:
2000 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2001 return error;
2002}
2003
2004/*
2005 * Move 1 record right from cur/level if possible.
2006 * Update cur to reflect the new path.
2007 */
2008STATIC int /* error */
2009xfs_btree_rshift(
2010 struct xfs_btree_cur *cur,
2011 int level,
2012 int *stat) /* success/failure */
2013{
2014 union xfs_btree_key key; /* btree key */
2015 struct xfs_buf *lbp; /* left buffer pointer */
2016 struct xfs_btree_block *left; /* left btree block */
2017 struct xfs_buf *rbp; /* right buffer pointer */
2018 struct xfs_btree_block *right; /* right btree block */
2019 struct xfs_btree_cur *tcur; /* temporary btree cursor */
2020 union xfs_btree_ptr rptr; /* right block pointer */
2021 union xfs_btree_key *rkp; /* right btree key */
2022 int rrecs; /* right record count */
2023 int lrecs; /* left record count */
2024 int error; /* error return value */
2025 int i; /* loop counter */
2026
2027 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2028 XFS_BTREE_TRACE_ARGI(cur, level);
2029
2030 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2031 (level == cur->bc_nlevels - 1))
2032 goto out0;
2033
2034 /* Set up variables for this block as "left". */
2035 left = xfs_btree_get_block(cur, level, &lbp);
2036
2037#ifdef DEBUG
2038 error = xfs_btree_check_block(cur, left, level, lbp);
2039 if (error)
2040 goto error0;
2041#endif
2042
2043 /* If we've got no right sibling then we can't shift an entry right. */
2044 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2045 if (xfs_btree_ptr_is_null(cur, &rptr))
2046 goto out0;
2047
2048 /*
2049 * If the cursor entry is the one that would be moved, don't
2050 * do it... it's too complicated.
2051 */
2052 lrecs = xfs_btree_get_numrecs(left);
2053 if (cur->bc_ptrs[level] >= lrecs)
2054 goto out0;
2055
2056 /* Set up the right neighbor as "right". */
2057 error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
2058 if (error)
2059 goto error0;
2060
2061 /* If it's full, it can't take another entry. */
2062 rrecs = xfs_btree_get_numrecs(right);
2063 if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2064 goto out0;
2065
2066 XFS_BTREE_STATS_INC(cur, rshift);
2067 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2068
2069 /*
2070 * Make a hole at the start of the right neighbor block, then
2071 * copy the last left block entry to the hole.
2072 */
2073 if (level > 0) {
2074 /* It's a nonleaf. make a hole in the keys and ptrs */
2075 union xfs_btree_key *lkp;
2076 union xfs_btree_ptr *lpp;
2077 union xfs_btree_ptr *rpp;
2078
2079 lkp = xfs_btree_key_addr(cur, lrecs, left);
2080 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2081 rkp = xfs_btree_key_addr(cur, 1, right);
2082 rpp = xfs_btree_ptr_addr(cur, 1, right);
2083
2084#ifdef DEBUG
2085 for (i = rrecs - 1; i >= 0; i--) {
2086 error = xfs_btree_check_ptr(cur, rpp, i, level);
2087 if (error)
2088 goto error0;
2089 }
2090#endif
2091
2092 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2093 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2094
2095#ifdef DEBUG
2096 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2097 if (error)
2098 goto error0;
2099#endif
2100
2101 /* Now put the new data in, and log it. */
2102 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2103 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2104
2105 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2106 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2107
2108 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2109 xfs_btree_key_addr(cur, 2, right)));
2110 } else {
2111 /* It's a leaf. make a hole in the records */
2112 union xfs_btree_rec *lrp;
2113 union xfs_btree_rec *rrp;
2114
2115 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2116 rrp = xfs_btree_rec_addr(cur, 1, right);
2117
2118 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2119
2120 /* Now put the new data in, and log it. */
2121 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2122 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2123
2124 cur->bc_ops->init_key_from_rec(&key, rrp);
2125 rkp = &key;
2126
2127 ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2128 xfs_btree_rec_addr(cur, 2, right)));
2129 }
2130
2131 /*
2132 * Decrement and log left's numrecs, bump and log right's numrecs.
2133 */
2134 xfs_btree_set_numrecs(left, --lrecs);
2135 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2136
2137 xfs_btree_set_numrecs(right, ++rrecs);
2138 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2139
2140 /*
2141 * Using a temporary cursor, update the parent key values of the
2142 * block on the right.
2143 */
2144 error = xfs_btree_dup_cursor(cur, &tcur);
2145 if (error)
2146 goto error0;
2147 i = xfs_btree_lastrec(tcur, level);
2148 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2149
2150 error = xfs_btree_increment(tcur, level, &i);
2151 if (error)
2152 goto error1;
2153
2154 error = xfs_btree_updkey(tcur, rkp, level + 1);
2155 if (error)
2156 goto error1;
2157
2158 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2159
2160 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2161 *stat = 1;
2162 return 0;
2163
2164out0:
2165 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2166 *stat = 0;
2167 return 0;
2168
2169error0:
2170 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2171 return error;
2172
2173error1:
2174 XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2175 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2176 return error;
2177}
2178
2179/*
2180 * Split cur/level block in half.
2181 * Return new block number and the key to its first
2182 * record (to be inserted into parent).
2183 */
2184STATIC int /* error */
2185xfs_btree_split(
2186 struct xfs_btree_cur *cur,
2187 int level,
2188 union xfs_btree_ptr *ptrp,
2189 union xfs_btree_key *key,
2190 struct xfs_btree_cur **curp,
2191 int *stat) /* success/failure */
2192{
2193 union xfs_btree_ptr lptr; /* left sibling block ptr */
2194 struct xfs_buf *lbp; /* left buffer pointer */
2195 struct xfs_btree_block *left; /* left btree block */
2196 union xfs_btree_ptr rptr; /* right sibling block ptr */
2197 struct xfs_buf *rbp; /* right buffer pointer */
2198 struct xfs_btree_block *right; /* right btree block */
2199 union xfs_btree_ptr rrptr; /* right-right sibling ptr */
2200 struct xfs_buf *rrbp; /* right-right buffer pointer */
2201 struct xfs_btree_block *rrblock; /* right-right btree block */
2202 int lrecs;
2203 int rrecs;
2204 int src_index;
2205 int error; /* error return value */
2206#ifdef DEBUG
2207 int i;
2208#endif
2209
2210 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2211 XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2212
2213 XFS_BTREE_STATS_INC(cur, split);
2214
2215 /* Set up left block (current one). */
2216 left = xfs_btree_get_block(cur, level, &lbp);
2217
2218#ifdef DEBUG
2219 error = xfs_btree_check_block(cur, left, level, lbp);
2220 if (error)
2221 goto error0;
2222#endif
2223
2224 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2225
2226 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2227 error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
2228 if (error)
2229 goto error0;
2230 if (*stat == 0)
2231 goto out0;
2232 XFS_BTREE_STATS_INC(cur, alloc);
2233
2234 /* Set up the new block as "right". */
2235 error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2236 if (error)
2237 goto error0;
2238
2239 /* Fill in the btree header for the new right block. */
2240 xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
2241
2242 /*
2243 * Split the entries between the old and the new block evenly.
2244 * Make sure that if there's an odd number of entries now, that
2245 * each new block will have the same number of entries.
2246 */
2247 lrecs = xfs_btree_get_numrecs(left);
2248 rrecs = lrecs / 2;
2249 if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2250 rrecs++;
2251 src_index = (lrecs - rrecs + 1);
2252
2253 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2254
2255 /*
2256 * Copy btree block entries from the left block over to the
2257 * new block, the right. Update the right block and log the
2258 * changes.
2259 */
2260 if (level > 0) {
2261 /* It's a non-leaf. Move keys and pointers. */
2262 union xfs_btree_key *lkp; /* left btree key */
2263 union xfs_btree_ptr *lpp; /* left address pointer */
2264 union xfs_btree_key *rkp; /* right btree key */
2265 union xfs_btree_ptr *rpp; /* right address pointer */
2266
2267 lkp = xfs_btree_key_addr(cur, src_index, left);
2268 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2269 rkp = xfs_btree_key_addr(cur, 1, right);
2270 rpp = xfs_btree_ptr_addr(cur, 1, right);
2271
2272#ifdef DEBUG
2273 for (i = src_index; i < rrecs; i++) {
2274 error = xfs_btree_check_ptr(cur, lpp, i, level);
2275 if (error)
2276 goto error0;
2277 }
2278#endif
2279
2280 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2281 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2282
2283 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2284 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2285
2286 /* Grab the keys to the entries moved to the right block */
2287 xfs_btree_copy_keys(cur, key, rkp, 1);
2288 } else {
2289 /* It's a leaf. Move records. */
2290 union xfs_btree_rec *lrp; /* left record pointer */
2291 union xfs_btree_rec *rrp; /* right record pointer */
2292
2293 lrp = xfs_btree_rec_addr(cur, src_index, left);
2294 rrp = xfs_btree_rec_addr(cur, 1, right);
2295
2296 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2297 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2298
2299 cur->bc_ops->init_key_from_rec(key,
2300 xfs_btree_rec_addr(cur, 1, right));
2301 }
2302
2303
2304 /*
2305 * Find the left block number by looking in the buffer.
2306 * Adjust numrecs, sibling pointers.
2307 */
2308 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2309 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2310 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2311 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2312
2313 lrecs -= rrecs;
2314 xfs_btree_set_numrecs(left, lrecs);
2315 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2316
2317 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2318 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2319
2320 /*
2321 * If there's a block to the new block's right, make that block
2322 * point back to right instead of to left.
2323 */
2324 if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2325 error = xfs_btree_read_buf_block(cur, &rrptr, level,
2326 0, &rrblock, &rrbp);
2327 if (error)
2328 goto error0;
2329 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2330 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2331 }
2332 /*
2333 * If the cursor is really in the right block, move it there.
2334 * If it's just pointing past the last entry in left, then we'll
2335 * insert there, so don't change anything in that case.
2336 */
2337 if (cur->bc_ptrs[level] > lrecs + 1) {
2338 xfs_btree_setbuf(cur, level, rbp);
2339 cur->bc_ptrs[level] -= lrecs;
2340 }
2341 /*
2342 * If there are more levels, we'll need another cursor which refers
2343 * the right block, no matter where this cursor was.
2344 */
2345 if (level + 1 < cur->bc_nlevels) {
2346 error = xfs_btree_dup_cursor(cur, curp);
2347 if (error)
2348 goto error0;
2349 (*curp)->bc_ptrs[level + 1]++;
2350 }
2351 *ptrp = rptr;
2352 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2353 *stat = 1;
2354 return 0;
2355out0:
2356 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2357 *stat = 0;
2358 return 0;
2359
2360error0:
2361 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2362 return error;
2363}
2364
2365/*
2366 * Copy the old inode root contents into a real block and make the
2367 * broot point to it.
2368 */
2369int /* error */
2370xfs_btree_new_iroot(
2371 struct xfs_btree_cur *cur, /* btree cursor */
2372 int *logflags, /* logging flags for inode */
2373 int *stat) /* return status - 0 fail */
2374{
2375 struct xfs_buf *cbp; /* buffer for cblock */
2376 struct xfs_btree_block *block; /* btree block */
2377 struct xfs_btree_block *cblock; /* child btree block */
2378 union xfs_btree_key *ckp; /* child key pointer */
2379 union xfs_btree_ptr *cpp; /* child ptr pointer */
2380 union xfs_btree_key *kp; /* pointer to btree key */
2381 union xfs_btree_ptr *pp; /* pointer to block addr */
2382 union xfs_btree_ptr nptr; /* new block addr */
2383 int level; /* btree level */
2384 int error; /* error return code */
2385#ifdef DEBUG
2386 int i; /* loop counter */
2387#endif
2388
2389 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2390 XFS_BTREE_STATS_INC(cur, newroot);
2391
2392 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2393
2394 level = cur->bc_nlevels - 1;
2395
2396 block = xfs_btree_get_iroot(cur);
2397 pp = xfs_btree_ptr_addr(cur, 1, block);
2398
2399 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2400 error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
2401 if (error)
2402 goto error0;
2403 if (*stat == 0) {
2404 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2405 return 0;
2406 }
2407 XFS_BTREE_STATS_INC(cur, alloc);
2408
2409 /* Copy the root into a real block. */
2410 error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2411 if (error)
2412 goto error0;
2413
2414 memcpy(cblock, block, xfs_btree_block_len(cur));
2415
2416 be16_add_cpu(&block->bb_level, 1);
2417 xfs_btree_set_numrecs(block, 1);
2418 cur->bc_nlevels++;
2419 cur->bc_ptrs[level + 1] = 1;
2420
2421 kp = xfs_btree_key_addr(cur, 1, block);
2422 ckp = xfs_btree_key_addr(cur, 1, cblock);
2423 xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2424
2425 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2426#ifdef DEBUG
2427 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2428 error = xfs_btree_check_ptr(cur, pp, i, level);
2429 if (error)
2430 goto error0;
2431 }
2432#endif
2433 xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2434
2435#ifdef DEBUG
2436 error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2437 if (error)
2438 goto error0;
2439#endif
2440 xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2441
2442 xfs_iroot_realloc(cur->bc_private.b.ip,
2443 1 - xfs_btree_get_numrecs(cblock),
2444 cur->bc_private.b.whichfork);
2445
2446 xfs_btree_setbuf(cur, level, cbp);
2447
2448 /*
2449 * Do all this logging at the end so that
2450 * the root is at the right level.
2451 */
2452 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2453 xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2454 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2455
2456 *logflags |=
2457 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
2458 *stat = 1;
2459 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2460 return 0;
2461error0:
2462 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2463 return error;
2464}
2465
2466/*
2467 * Allocate a new root block, fill it in.
2468 */
2469STATIC int /* error */
2470xfs_btree_new_root(
2471 struct xfs_btree_cur *cur, /* btree cursor */
2472 int *stat) /* success/failure */
2473{
2474 struct xfs_btree_block *block; /* one half of the old root block */
2475 struct xfs_buf *bp; /* buffer containing block */
2476 int error; /* error return value */
2477 struct xfs_buf *lbp; /* left buffer pointer */
2478 struct xfs_btree_block *left; /* left btree block */
2479 struct xfs_buf *nbp; /* new (root) buffer */
2480 struct xfs_btree_block *new; /* new (root) btree block */
2481 int nptr; /* new value for key index, 1 or 2 */
2482 struct xfs_buf *rbp; /* right buffer pointer */
2483 struct xfs_btree_block *right; /* right btree block */
2484 union xfs_btree_ptr rptr;
2485 union xfs_btree_ptr lptr;
2486
2487 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2488 XFS_BTREE_STATS_INC(cur, newroot);
2489
2490 /* initialise our start point from the cursor */
2491 cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2492
2493 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2494 error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
2495 if (error)
2496 goto error0;
2497 if (*stat == 0)
2498 goto out0;
2499 XFS_BTREE_STATS_INC(cur, alloc);
2500
2501 /* Set up the new block. */
2502 error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2503 if (error)
2504 goto error0;
2505
2506 /* Set the root in the holding structure increasing the level by 1. */
2507 cur->bc_ops->set_root(cur, &lptr, 1);
2508
2509 /*
2510 * At the previous root level there are now two blocks: the old root,
2511 * and the new block generated when it was split. We don't know which
2512 * one the cursor is pointing at, so we set up variables "left" and
2513 * "right" for each case.
2514 */
2515 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2516
2517#ifdef DEBUG
2518 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2519 if (error)
2520 goto error0;
2521#endif
2522
2523 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2524 if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2525 /* Our block is left, pick up the right block. */
2526 lbp = bp;
2527 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2528 left = block;
2529 error = xfs_btree_read_buf_block(cur, &rptr,
2530 cur->bc_nlevels - 1, 0, &right, &rbp);
2531 if (error)
2532 goto error0;
2533 bp = rbp;
2534 nptr = 1;
2535 } else {
2536 /* Our block is right, pick up the left block. */
2537 rbp = bp;
2538 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2539 right = block;
2540 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2541 error = xfs_btree_read_buf_block(cur, &lptr,
2542 cur->bc_nlevels - 1, 0, &left, &lbp);
2543 if (error)
2544 goto error0;
2545 bp = lbp;
2546 nptr = 2;
2547 }
2548 /* Fill in the new block's btree header and log it. */
2549 xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
2550 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2551 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2552 !xfs_btree_ptr_is_null(cur, &rptr));
2553
2554 /* Fill in the key data in the new root. */
2555 if (xfs_btree_get_level(left) > 0) {
2556 xfs_btree_copy_keys(cur,
2557 xfs_btree_key_addr(cur, 1, new),
2558 xfs_btree_key_addr(cur, 1, left), 1);
2559 xfs_btree_copy_keys(cur,
2560 xfs_btree_key_addr(cur, 2, new),
2561 xfs_btree_key_addr(cur, 1, right), 1);
2562 } else {
2563 cur->bc_ops->init_key_from_rec(
2564 xfs_btree_key_addr(cur, 1, new),
2565 xfs_btree_rec_addr(cur, 1, left));
2566 cur->bc_ops->init_key_from_rec(
2567 xfs_btree_key_addr(cur, 2, new),
2568 xfs_btree_rec_addr(cur, 1, right));
2569 }
2570 xfs_btree_log_keys(cur, nbp, 1, 2);
2571
2572 /* Fill in the pointer data in the new root. */
2573 xfs_btree_copy_ptrs(cur,
2574 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2575 xfs_btree_copy_ptrs(cur,
2576 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2577 xfs_btree_log_ptrs(cur, nbp, 1, 2);
2578
2579 /* Fix up the cursor. */
2580 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2581 cur->bc_ptrs[cur->bc_nlevels] = nptr;
2582 cur->bc_nlevels++;
2583 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2584 *stat = 1;
2585 return 0;
2586error0:
2587 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2588 return error;
2589out0:
2590 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2591 *stat = 0;
2592 return 0;
2593}
2594
2595STATIC int
2596xfs_btree_make_block_unfull(
2597 struct xfs_btree_cur *cur, /* btree cursor */
2598 int level, /* btree level */
2599 int numrecs,/* # of recs in block */
2600 int *oindex,/* old tree index */
2601 int *index, /* new tree index */
2602 union xfs_btree_ptr *nptr, /* new btree ptr */
2603 struct xfs_btree_cur **ncur, /* new btree cursor */
2604 union xfs_btree_rec *nrec, /* new record */
2605 int *stat)
2606{
2607 union xfs_btree_key key; /* new btree key value */
2608 int error = 0;
2609
2610 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2611 level == cur->bc_nlevels - 1) {
2612 struct xfs_inode *ip = cur->bc_private.b.ip;
2613
2614 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2615 /* A root block that can be made bigger. */
2616
2617 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2618 } else {
2619 /* A root block that needs replacing */
2620 int logflags = 0;
2621
2622 error = xfs_btree_new_iroot(cur, &logflags, stat);
2623 if (error || *stat == 0)
2624 return error;
2625
2626 xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2627 }
2628
2629 return 0;
2630 }
2631
2632 /* First, try shifting an entry to the right neighbor. */
2633 error = xfs_btree_rshift(cur, level, stat);
2634 if (error || *stat)
2635 return error;
2636
2637 /* Next, try shifting an entry to the left neighbor. */
2638 error = xfs_btree_lshift(cur, level, stat);
2639 if (error)
2640 return error;
2641
2642 if (*stat) {
2643 *oindex = *index = cur->bc_ptrs[level];
2644 return 0;
2645 }
2646
2647 /*
2648 * Next, try splitting the current block in half.
2649 *
2650 * If this works we have to re-set our variables because we
2651 * could be in a different block now.
2652 */
2653 error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2654 if (error || *stat == 0)
2655 return error;
2656
2657
2658 *index = cur->bc_ptrs[level];
2659 cur->bc_ops->init_rec_from_key(&key, nrec);
2660 return 0;
2661}
2662
2663/*
2664 * Insert one record/level. Return information to the caller
2665 * allowing the next level up to proceed if necessary.
2666 */
2667STATIC int
2668xfs_btree_insrec(
2669 struct xfs_btree_cur *cur, /* btree cursor */
2670 int level, /* level to insert record at */
2671 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
2672 union xfs_btree_rec *recp, /* i/o: record data inserted */
2673 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
2674 int *stat) /* success/failure */
2675{
2676 struct xfs_btree_block *block; /* btree block */
2677 struct xfs_buf *bp; /* buffer for block */
2678 union xfs_btree_key key; /* btree key */
2679 union xfs_btree_ptr nptr; /* new block ptr */
2680 struct xfs_btree_cur *ncur; /* new btree cursor */
2681 union xfs_btree_rec nrec; /* new record count */
2682 int optr; /* old key/record index */
2683 int ptr; /* key/record index */
2684 int numrecs;/* number of records */
2685 int error; /* error return value */
2686#ifdef DEBUG
2687 int i;
2688#endif
2689
2690 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2691 XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2692
2693 ncur = NULL;
2694
2695 /*
2696 * If we have an external root pointer, and we've made it to the
2697 * root level, allocate a new root block and we're done.
2698 */
2699 if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2700 (level >= cur->bc_nlevels)) {
2701 error = xfs_btree_new_root(cur, stat);
2702 xfs_btree_set_ptr_null(cur, ptrp);
2703
2704 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2705 return error;
2706 }
2707
2708 /* If we're off the left edge, return failure. */
2709 ptr = cur->bc_ptrs[level];
2710 if (ptr == 0) {
2711 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2712 *stat = 0;
2713 return 0;
2714 }
2715
2716 /* Make a key out of the record data to be inserted, and save it. */
2717 cur->bc_ops->init_key_from_rec(&key, recp);
2718
2719 optr = ptr;
2720
2721 XFS_BTREE_STATS_INC(cur, insrec);
2722
2723 /* Get pointers to the btree buffer and block. */
2724 block = xfs_btree_get_block(cur, level, &bp);
2725 numrecs = xfs_btree_get_numrecs(block);
2726
2727#ifdef DEBUG
2728 error = xfs_btree_check_block(cur, block, level, bp);
2729 if (error)
2730 goto error0;
2731
2732 /* Check that the new entry is being inserted in the right place. */
2733 if (ptr <= numrecs) {
2734 if (level == 0) {
2735 ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2736 xfs_btree_rec_addr(cur, ptr, block)));
2737 } else {
2738 ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2739 xfs_btree_key_addr(cur, ptr, block)));
2740 }
2741 }
2742#endif
2743
2744 /*
2745 * If the block is full, we can't insert the new entry until we
2746 * make the block un-full.
2747 */
2748 xfs_btree_set_ptr_null(cur, &nptr);
2749 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2750 error = xfs_btree_make_block_unfull(cur, level, numrecs,
2751 &optr, &ptr, &nptr, &ncur, &nrec, stat);
2752 if (error || *stat == 0)
2753 goto error0;
2754 }
2755
2756 /*
2757 * The current block may have changed if the block was
2758 * previously full and we have just made space in it.
2759 */
2760 block = xfs_btree_get_block(cur, level, &bp);
2761 numrecs = xfs_btree_get_numrecs(block);
2762
2763#ifdef DEBUG
2764 error = xfs_btree_check_block(cur, block, level, bp);
2765 if (error)
2766 return error;
2767#endif
2768
2769 /*
2770 * At this point we know there's room for our new entry in the block
2771 * we're pointing at.
2772 */
2773 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
2774
2775 if (level > 0) {
2776 /* It's a nonleaf. make a hole in the keys and ptrs */
2777 union xfs_btree_key *kp;
2778 union xfs_btree_ptr *pp;
2779
2780 kp = xfs_btree_key_addr(cur, ptr, block);
2781 pp = xfs_btree_ptr_addr(cur, ptr, block);
2782
2783#ifdef DEBUG
2784 for (i = numrecs - ptr; i >= 0; i--) {
2785 error = xfs_btree_check_ptr(cur, pp, i, level);
2786 if (error)
2787 return error;
2788 }
2789#endif
2790
2791 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
2792 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
2793
2794#ifdef DEBUG
2795 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
2796 if (error)
2797 goto error0;
2798#endif
2799
2800 /* Now put the new data in, bump numrecs and log it. */
2801 xfs_btree_copy_keys(cur, kp, &key, 1);
2802 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
2803 numrecs++;
2804 xfs_btree_set_numrecs(block, numrecs);
2805 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
2806 xfs_btree_log_keys(cur, bp, ptr, numrecs);
2807#ifdef DEBUG
2808 if (ptr < numrecs) {
2809 ASSERT(cur->bc_ops->keys_inorder(cur, kp,
2810 xfs_btree_key_addr(cur, ptr + 1, block)));
2811 }
2812#endif
2813 } else {
2814 /* It's a leaf. make a hole in the records */
2815 union xfs_btree_rec *rp;
2816
2817 rp = xfs_btree_rec_addr(cur, ptr, block);
2818
2819 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
2820
2821 /* Now put the new data in, bump numrecs and log it. */
2822 xfs_btree_copy_recs(cur, rp, recp, 1);
2823 xfs_btree_set_numrecs(block, ++numrecs);
2824 xfs_btree_log_recs(cur, bp, ptr, numrecs);
2825#ifdef DEBUG
2826 if (ptr < numrecs) {
2827 ASSERT(cur->bc_ops->recs_inorder(cur, rp,
2828 xfs_btree_rec_addr(cur, ptr + 1, block)));
2829 }
2830#endif
2831 }
2832
2833 /* Log the new number of records in the btree header. */
2834 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
2835
2836 /* If we inserted at the start of a block, update the parents' keys. */
2837 if (optr == 1) {
2838 error = xfs_btree_updkey(cur, &key, level + 1);
2839 if (error)
2840 goto error0;
2841 }
2842
2843 /*
2844 * If we are tracking the last record in the tree and
2845 * we are at the far right edge of the tree, update it.
2846 */
2847 if (xfs_btree_is_lastrec(cur, block, level)) {
2848 cur->bc_ops->update_lastrec(cur, block, recp,
2849 ptr, LASTREC_INSREC);
2850 }
2851
2852 /*
2853 * Return the new block number, if any.
2854 * If there is one, give back a record value and a cursor too.
2855 */
2856 *ptrp = nptr;
2857 if (!xfs_btree_ptr_is_null(cur, &nptr)) {
2858 *recp = nrec;
2859 *curp = ncur;
2860 }
2861
2862 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2863 *stat = 1;
2864 return 0;
2865
2866error0:
2867 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2868 return error;
2869}
2870
2871/*
2872 * Insert the record at the point referenced by cur.
2873 *
2874 * A multi-level split of the tree on insert will invalidate the original
2875 * cursor. All callers of this function should assume that the cursor is
2876 * no longer valid and revalidate it.
2877 */
2878int
2879xfs_btree_insert(
2880 struct xfs_btree_cur *cur,
2881 int *stat)
2882{
2883 int error; /* error return value */
2884 int i; /* result value, 0 for failure */
2885 int level; /* current level number in btree */
2886 union xfs_btree_ptr nptr; /* new block number (split result) */
2887 struct xfs_btree_cur *ncur; /* new cursor (split result) */
2888 struct xfs_btree_cur *pcur; /* previous level's cursor */
2889 union xfs_btree_rec rec; /* record to insert */
2890
2891 level = 0;
2892 ncur = NULL;
2893 pcur = cur;
2894
2895 xfs_btree_set_ptr_null(cur, &nptr);
2896 cur->bc_ops->init_rec_from_cur(cur, &rec);
2897
2898 /*
2899 * Loop going up the tree, starting at the leaf level.
2900 * Stop when we don't get a split block, that must mean that
2901 * the insert is finished with this level.
2902 */
2903 do {
2904 /*
2905 * Insert nrec/nptr into this level of the tree.
2906 * Note if we fail, nptr will be null.
2907 */
2908 error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
2909 if (error) {
2910 if (pcur != cur)
2911 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2912 goto error0;
2913 }
2914
2915 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2916 level++;
2917
2918 /*
2919 * See if the cursor we just used is trash.
2920 * Can't trash the caller's cursor, but otherwise we should
2921 * if ncur is a new cursor or we're about to be done.
2922 */
2923 if (pcur != cur &&
2924 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
2925 /* Save the state from the cursor before we trash it */
2926 if (cur->bc_ops->update_cursor)
2927 cur->bc_ops->update_cursor(pcur, cur);
2928 cur->bc_nlevels = pcur->bc_nlevels;
2929 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2930 }
2931 /* If we got a new cursor, switch to it. */
2932 if (ncur) {
2933 pcur = ncur;
2934 ncur = NULL;
2935 }
2936 } while (!xfs_btree_ptr_is_null(cur, &nptr));
2937
2938 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2939 *stat = i;
2940 return 0;
2941error0:
2942 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2943 return error;
2944}
2945
2946/*
2947 * Try to merge a non-leaf block back into the inode root.
2948 *
2949 * Note: the killroot names comes from the fact that we're effectively
2950 * killing the old root block. But because we can't just delete the
2951 * inode we have to copy the single block it was pointing to into the
2952 * inode.
2953 */
2954int
2955xfs_btree_kill_iroot(
2956 struct xfs_btree_cur *cur)
2957{
2958 int whichfork = cur->bc_private.b.whichfork;
2959 struct xfs_inode *ip = cur->bc_private.b.ip;
2960 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
2961 struct xfs_btree_block *block;
2962 struct xfs_btree_block *cblock;
2963 union xfs_btree_key *kp;
2964 union xfs_btree_key *ckp;
2965 union xfs_btree_ptr *pp;
2966 union xfs_btree_ptr *cpp;
2967 struct xfs_buf *cbp;
2968 int level;
2969 int index;
2970 int numrecs;
2971#ifdef DEBUG
2972 union xfs_btree_ptr ptr;
2973 int i;
2974#endif
2975
2976 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2977
2978 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2979 ASSERT(cur->bc_nlevels > 1);
2980
2981 /*
2982 * Don't deal with the root block needs to be a leaf case.
2983 * We're just going to turn the thing back into extents anyway.
2984 */
2985 level = cur->bc_nlevels - 1;
2986 if (level == 1)
2987 goto out0;
2988
2989 /*
2990 * Give up if the root has multiple children.
2991 */
2992 block = xfs_btree_get_iroot(cur);
2993 if (xfs_btree_get_numrecs(block) != 1)
2994 goto out0;
2995
2996 cblock = xfs_btree_get_block(cur, level - 1, &cbp);
2997 numrecs = xfs_btree_get_numrecs(cblock);
2998
2999 /*
3000 * Only do this if the next level will fit.
3001 * Then the data must be copied up to the inode,
3002 * instead of freeing the root you free the next level.
3003 */
3004 if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3005 goto out0;
3006
3007 XFS_BTREE_STATS_INC(cur, killroot);
3008
3009#ifdef DEBUG
3010 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3011 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3012 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3013 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3014#endif
3015
3016 index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3017 if (index) {
3018 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3019 cur->bc_private.b.whichfork);
3020 block = ifp->if_broot;
3021 }
3022
3023 be16_add_cpu(&block->bb_numrecs, index);
3024 ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3025
3026 kp = xfs_btree_key_addr(cur, 1, block);
3027 ckp = xfs_btree_key_addr(cur, 1, cblock);
3028 xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3029
3030 pp = xfs_btree_ptr_addr(cur, 1, block);
3031 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3032#ifdef DEBUG
3033 for (i = 0; i < numrecs; i++) {
3034 int error;
3035
3036 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3037 if (error) {
3038 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3039 return error;
3040 }
3041 }
3042#endif
3043 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3044
3045 cur->bc_ops->free_block(cur, cbp);
3046 XFS_BTREE_STATS_INC(cur, free);
3047
3048 cur->bc_bufs[level - 1] = NULL;
3049 be16_add_cpu(&block->bb_level, -1);
3050 xfs_trans_log_inode(cur->bc_tp, ip,
3051 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
3052 cur->bc_nlevels--;
3053out0:
3054 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3055 return 0;
3056}
3057
3058STATIC int
3059xfs_btree_dec_cursor(
3060 struct xfs_btree_cur *cur,
3061 int level,
3062 int *stat)
3063{
3064 int error;
3065 int i;
3066
3067 if (level > 0) {
3068 error = xfs_btree_decrement(cur, level, &i);
3069 if (error)
3070 return error;
3071 }
3072
3073 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3074 *stat = 1;
3075 return 0;
3076}
3077
3078/*
3079 * Single level of the btree record deletion routine.
3080 * Delete record pointed to by cur/level.
3081 * Remove the record from its block then rebalance the tree.
3082 * Return 0 for error, 1 for done, 2 to go on to the next level.
3083 */
3084STATIC int /* error */
3085xfs_btree_delrec(
3086 struct xfs_btree_cur *cur, /* btree cursor */
3087 int level, /* level removing record from */
3088 int *stat) /* fail/done/go-on */
3089{
3090 struct xfs_btree_block *block; /* btree block */
3091 union xfs_btree_ptr cptr; /* current block ptr */
3092 struct xfs_buf *bp; /* buffer for block */
3093 int error; /* error return value */
3094 int i; /* loop counter */
3095 union xfs_btree_key key; /* storage for keyp */
3096 union xfs_btree_key *keyp = &key; /* passed to the next level */
3097 union xfs_btree_ptr lptr; /* left sibling block ptr */
3098 struct xfs_buf *lbp; /* left buffer pointer */
3099 struct xfs_btree_block *left; /* left btree block */
3100 int lrecs = 0; /* left record count */
3101 int ptr; /* key/record index */
3102 union xfs_btree_ptr rptr; /* right sibling block ptr */
3103 struct xfs_buf *rbp; /* right buffer pointer */
3104 struct xfs_btree_block *right; /* right btree block */
3105 struct xfs_btree_block *rrblock; /* right-right btree block */
3106 struct xfs_buf *rrbp; /* right-right buffer pointer */
3107 int rrecs = 0; /* right record count */
3108 struct xfs_btree_cur *tcur; /* temporary btree cursor */
3109 int numrecs; /* temporary numrec count */
3110
3111 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3112 XFS_BTREE_TRACE_ARGI(cur, level);
3113
3114 tcur = NULL;
3115
3116 /* Get the index of the entry being deleted, check for nothing there. */
3117 ptr = cur->bc_ptrs[level];
3118 if (ptr == 0) {
3119 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3120 *stat = 0;
3121 return 0;
3122 }
3123
3124 /* Get the buffer & block containing the record or key/ptr. */
3125 block = xfs_btree_get_block(cur, level, &bp);
3126 numrecs = xfs_btree_get_numrecs(block);
3127
3128#ifdef DEBUG
3129 error = xfs_btree_check_block(cur, block, level, bp);
3130 if (error)
3131 goto error0;
3132#endif
3133
3134 /* Fail if we're off the end of the block. */
3135 if (ptr > numrecs) {
3136 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3137 *stat = 0;
3138 return 0;
3139 }
3140
3141 XFS_BTREE_STATS_INC(cur, delrec);
3142 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3143
3144 /* Excise the entries being deleted. */
3145 if (level > 0) {
3146 /* It's a nonleaf. operate on keys and ptrs */
3147 union xfs_btree_key *lkp;
3148 union xfs_btree_ptr *lpp;
3149
3150 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3151 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3152
3153#ifdef DEBUG
3154 for (i = 0; i < numrecs - ptr; i++) {
3155 error = xfs_btree_check_ptr(cur, lpp, i, level);
3156 if (error)
3157 goto error0;
3158 }
3159#endif
3160
3161 if (ptr < numrecs) {
3162 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3163 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3164 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3165 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3166 }
3167
3168 /*
3169 * If it's the first record in the block, we'll need to pass a
3170 * key up to the next level (updkey).
3171 */
3172 if (ptr == 1)
3173 keyp = xfs_btree_key_addr(cur, 1, block);
3174 } else {
3175 /* It's a leaf. operate on records */
3176 if (ptr < numrecs) {
3177 xfs_btree_shift_recs(cur,
3178 xfs_btree_rec_addr(cur, ptr + 1, block),
3179 -1, numrecs - ptr);
3180 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3181 }
3182
3183 /*
3184 * If it's the first record in the block, we'll need a key
3185 * structure to pass up to the next level (updkey).
3186 */
3187 if (ptr == 1) {
3188 cur->bc_ops->init_key_from_rec(&key,
3189 xfs_btree_rec_addr(cur, 1, block));
3190 keyp = &key;
3191 }
3192 }
3193
3194 /*
3195 * Decrement and log the number of entries in the block.
3196 */
3197 xfs_btree_set_numrecs(block, --numrecs);
3198 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3199
3200 /*
3201 * If we are tracking the last record in the tree and
3202 * we are at the far right edge of the tree, update it.
3203 */
3204 if (xfs_btree_is_lastrec(cur, block, level)) {
3205 cur->bc_ops->update_lastrec(cur, block, NULL,
3206 ptr, LASTREC_DELREC);
3207 }
3208
3209 /*
3210 * We're at the root level. First, shrink the root block in-memory.
3211 * Try to get rid of the next level down. If we can't then there's
3212 * nothing left to do.
3213 */
3214 if (level == cur->bc_nlevels - 1) {
3215 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3216 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3217 cur->bc_private.b.whichfork);
3218
3219 error = xfs_btree_kill_iroot(cur);
3220 if (error)
3221 goto error0;
3222
3223 error = xfs_btree_dec_cursor(cur, level, stat);
3224 if (error)
3225 goto error0;
3226 *stat = 1;
3227 return 0;
3228 }
3229
3230 /*
3231 * If this is the root level, and there's only one entry left,
3232 * and it's NOT the leaf level, then we can get rid of this
3233 * level.
3234 */
3235 if (numrecs == 1 && level > 0) {
3236 union xfs_btree_ptr *pp;
3237 /*
3238 * pp is still set to the first pointer in the block.
3239 * Make it the new root of the btree.
3240 */
3241 pp = xfs_btree_ptr_addr(cur, 1, block);
3242 error = cur->bc_ops->kill_root(cur, bp, level, pp);
3243 if (error)
3244 goto error0;
3245 } else if (level > 0) {
3246 error = xfs_btree_dec_cursor(cur, level, stat);
3247 if (error)
3248 goto error0;
3249 }
3250 *stat = 1;
3251 return 0;
3252 }
3253
3254 /*
3255 * If we deleted the leftmost entry in the block, update the
3256 * key values above us in the tree.
3257 */
3258 if (ptr == 1) {
3259 error = xfs_btree_updkey(cur, keyp, level + 1);
3260 if (error)
3261 goto error0;
3262 }
3263
3264 /*
3265 * If the number of records remaining in the block is at least
3266 * the minimum, we're done.
3267 */
3268 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3269 error = xfs_btree_dec_cursor(cur, level, stat);
3270 if (error)
3271 goto error0;
3272 return 0;
3273 }
3274
3275 /*
3276 * Otherwise, we have to move some records around to keep the
3277 * tree balanced. Look at the left and right sibling blocks to
3278 * see if we can re-balance by moving only one record.
3279 */
3280 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3281 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3282
3283 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3284 /*
3285 * One child of root, need to get a chance to copy its contents
3286 * into the root and delete it. Can't go up to next level,
3287 * there's nothing to delete there.
3288 */
3289 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3290 xfs_btree_ptr_is_null(cur, &lptr) &&
3291 level == cur->bc_nlevels - 2) {
3292 error = xfs_btree_kill_iroot(cur);
3293 if (!error)
3294 error = xfs_btree_dec_cursor(cur, level, stat);
3295 if (error)
3296 goto error0;
3297 return 0;
3298 }
3299 }
3300
3301 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3302 !xfs_btree_ptr_is_null(cur, &lptr));
3303
3304 /*
3305 * Duplicate the cursor so our btree manipulations here won't
3306 * disrupt the next level up.
3307 */
3308 error = xfs_btree_dup_cursor(cur, &tcur);
3309 if (error)
3310 goto error0;
3311
3312 /*
3313 * If there's a right sibling, see if it's ok to shift an entry
3314 * out of it.
3315 */
3316 if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3317 /*
3318 * Move the temp cursor to the last entry in the next block.
3319 * Actually any entry but the first would suffice.
3320 */
3321 i = xfs_btree_lastrec(tcur, level);
3322 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3323
3324 error = xfs_btree_increment(tcur, level, &i);
3325 if (error)
3326 goto error0;
3327 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3328
3329 i = xfs_btree_lastrec(tcur, level);
3330 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3331
3332 /* Grab a pointer to the block. */
3333 right = xfs_btree_get_block(tcur, level, &rbp);
3334#ifdef DEBUG
3335 error = xfs_btree_check_block(tcur, right, level, rbp);
3336 if (error)
3337 goto error0;
3338#endif
3339 /* Grab the current block number, for future use. */
3340 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3341
3342 /*
3343 * If right block is full enough so that removing one entry
3344 * won't make it too empty, and left-shifting an entry out
3345 * of right to us works, we're done.
3346 */
3347 if (xfs_btree_get_numrecs(right) - 1 >=
3348 cur->bc_ops->get_minrecs(tcur, level)) {
3349 error = xfs_btree_lshift(tcur, level, &i);
3350 if (error)
3351 goto error0;
3352 if (i) {
3353 ASSERT(xfs_btree_get_numrecs(block) >=
3354 cur->bc_ops->get_minrecs(tcur, level));
3355
3356 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3357 tcur = NULL;
3358
3359 error = xfs_btree_dec_cursor(cur, level, stat);
3360 if (error)
3361 goto error0;
3362 return 0;
3363 }
3364 }
3365
3366 /*
3367 * Otherwise, grab the number of records in right for
3368 * future reference, and fix up the temp cursor to point
3369 * to our block again (last record).
3370 */
3371 rrecs = xfs_btree_get_numrecs(right);
3372 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3373 i = xfs_btree_firstrec(tcur, level);
3374 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3375
3376 error = xfs_btree_decrement(tcur, level, &i);
3377 if (error)
3378 goto error0;
3379 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3380 }
3381 }
3382
3383 /*
3384 * If there's a left sibling, see if it's ok to shift an entry
3385 * out of it.
3386 */
3387 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3388 /*
3389 * Move the temp cursor to the first entry in the
3390 * previous block.
3391 */
3392 i = xfs_btree_firstrec(tcur, level);
3393 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3394
3395 error = xfs_btree_decrement(tcur, level, &i);
3396 if (error)
3397 goto error0;
3398 i = xfs_btree_firstrec(tcur, level);
3399 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3400
3401 /* Grab a pointer to the block. */
3402 left = xfs_btree_get_block(tcur, level, &lbp);
3403#ifdef DEBUG
3404 error = xfs_btree_check_block(cur, left, level, lbp);
3405 if (error)
3406 goto error0;
3407#endif
3408 /* Grab the current block number, for future use. */
3409 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3410
3411 /*
3412 * If left block is full enough so that removing one entry
3413 * won't make it too empty, and right-shifting an entry out
3414 * of left to us works, we're done.
3415 */
3416 if (xfs_btree_get_numrecs(left) - 1 >=
3417 cur->bc_ops->get_minrecs(tcur, level)) {
3418 error = xfs_btree_rshift(tcur, level, &i);
3419 if (error)
3420 goto error0;
3421 if (i) {
3422 ASSERT(xfs_btree_get_numrecs(block) >=
3423 cur->bc_ops->get_minrecs(tcur, level));
3424 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3425 tcur = NULL;
3426 if (level == 0)
3427 cur->bc_ptrs[0]++;
3428 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3429 *stat = 1;
3430 return 0;
3431 }
3432 }
3433
3434 /*
3435		 * Otherwise, grab the number of records in left for
3436 * future reference.
3437 */
3438 lrecs = xfs_btree_get_numrecs(left);
3439 }
3440
3441 /* Delete the temp cursor, we're done with it. */
3442 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3443 tcur = NULL;
3444
3445 /* If here, we need to do a join to keep the tree balanced. */
3446 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3447
3448 if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3449 lrecs + xfs_btree_get_numrecs(block) <=
3450 cur->bc_ops->get_maxrecs(cur, level)) {
3451 /*
3452 * Set "right" to be the starting block,
3453 * "left" to be the left neighbor.
3454 */
3455 rptr = cptr;
3456 right = block;
3457 rbp = bp;
3458 error = xfs_btree_read_buf_block(cur, &lptr, level,
3459 0, &left, &lbp);
3460 if (error)
3461 goto error0;
3462
3463 /*
3464 * If that won't work, see if we can join with the right neighbor block.
3465 */
3466 } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3467 rrecs + xfs_btree_get_numrecs(block) <=
3468 cur->bc_ops->get_maxrecs(cur, level)) {
3469 /*
3470 * Set "left" to be the starting block,
3471 * "right" to be the right neighbor.
3472 */
3473 lptr = cptr;
3474 left = block;
3475 lbp = bp;
3476 error = xfs_btree_read_buf_block(cur, &rptr, level,
3477 0, &right, &rbp);
3478 if (error)
3479 goto error0;
3480
3481 /*
3482 * Otherwise, we can't fix the imbalance.
3483 * Just return. This is probably a logic error, but it's not fatal.
3484 */
3485 } else {
3486 error = xfs_btree_dec_cursor(cur, level, stat);
3487 if (error)
3488 goto error0;
3489 return 0;
3490 }
3491
3492 rrecs = xfs_btree_get_numrecs(right);
3493 lrecs = xfs_btree_get_numrecs(left);
3494
3495 /*
3496 * We're now going to join "left" and "right" by moving all the stuff
3497 * in "right" to "left" and deleting "right".
3498 */
3499 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3500 if (level > 0) {
3501 /* It's a non-leaf. Move keys and pointers. */
3502 union xfs_btree_key *lkp; /* left btree key */
3503 union xfs_btree_ptr *lpp; /* left address pointer */
3504 union xfs_btree_key *rkp; /* right btree key */
3505 union xfs_btree_ptr *rpp; /* right address pointer */
3506
3507 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3508 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3509 rkp = xfs_btree_key_addr(cur, 1, right);
3510 rpp = xfs_btree_ptr_addr(cur, 1, right);
3511#ifdef DEBUG
3512 for (i = 1; i < rrecs; i++) {
3513 error = xfs_btree_check_ptr(cur, rpp, i, level);
3514 if (error)
3515 goto error0;
3516 }
3517#endif
3518 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3519 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3520
3521 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3522 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3523 } else {
3524 /* It's a leaf. Move records. */
3525 union xfs_btree_rec *lrp; /* left record pointer */
3526 union xfs_btree_rec *rrp; /* right record pointer */
3527
3528 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3529 rrp = xfs_btree_rec_addr(cur, 1, right);
3530
3531 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3532 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3533 }
3534
3535 XFS_BTREE_STATS_INC(cur, join);
3536
3537 /*
3538	 * Fix up the number of records and right block pointer in the
3539 * surviving block, and log it.
3540 */
3541 xfs_btree_set_numrecs(left, lrecs + rrecs);
3542 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3543 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3544 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3545
3546 /* If there is a right sibling, point it to the remaining block. */
3547 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3548 if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3549 error = xfs_btree_read_buf_block(cur, &cptr, level,
3550 0, &rrblock, &rrbp);
3551 if (error)
3552 goto error0;
3553 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3554 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3555 }
3556
3557 /* Free the deleted block. */
3558 error = cur->bc_ops->free_block(cur, rbp);
3559 if (error)
3560 goto error0;
3561 XFS_BTREE_STATS_INC(cur, free);
3562
3563 /*
3564 * If we joined with the left neighbor, set the buffer in the
3565 * cursor to the left block, and fix up the index.
3566 */
3567 if (bp != lbp) {
3568 cur->bc_bufs[level] = lbp;
3569 cur->bc_ptrs[level] += lrecs;
3570 cur->bc_ra[level] = 0;
3571 }
3572 /*
3573 * If we joined with the right neighbor and there's a level above
3574 * us, increment the cursor at that level.
3575 */
3576 else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3577 (level + 1 < cur->bc_nlevels)) {
3578 error = xfs_btree_increment(cur, level + 1, &i);
3579 if (error)
3580 goto error0;
3581 }
3582
3583 /*
3584 * Readjust the ptr at this level if it's not a leaf, since it's
3585 * still pointing at the deletion point, which makes the cursor
3586 * inconsistent. If this makes the ptr 0, the caller fixes it up.
3587 * We can't use decrement because it would change the next level up.
3588 */
3589 if (level > 0)
3590 cur->bc_ptrs[level]--;
3591
3592 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3593 /* Return value means the next level up has something to do. */
3594 *stat = 2;
3595 return 0;
3596
3597error0:
3598 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3599 if (tcur)
3600 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3601 return error;
3602}
3603
3604/*
3605 * Delete the record pointed to by cur.
3606 * The cursor refers to the place where the record was (could be inserted)
3607 * when the operation returns.
3608 */
3609int /* error */
3610xfs_btree_delete(
3611 struct xfs_btree_cur *cur,
3612 int *stat) /* success/failure */
3613{
3614 int error; /* error return value */
3615 int level;
3616 int i;
3617
3618 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3619
3620 /*
3621 * Go up the tree, starting at leaf level.
3622 *
3623 * If 2 is returned then a join was done; go to the next level.
3624 * Otherwise we are done.
3625 */
3626 for (level = 0, i = 2; i == 2; level++) {
3627 error = xfs_btree_delrec(cur, level, &i);
3628 if (error)
3629 goto error0;
3630 }
3631
3632 if (i == 0) {
3633 for (level = 1; level < cur->bc_nlevels; level++) {
3634 if (cur->bc_ptrs[level] == 0) {
3635 error = xfs_btree_decrement(cur, level, &i);
3636 if (error)
3637 goto error0;
3638 break;
3639 }
3640 }
3641 }
3642
3643 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3644 *stat = i;
3645 return 0;
3646error0:
3647 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3648 return error;
3649}
3650
3651/*
3652 * Get the data from the pointed-to record.
3653 */
3654int /* error */
3655xfs_btree_get_rec(
3656 struct xfs_btree_cur *cur, /* btree cursor */
3657 union xfs_btree_rec **recp, /* output: btree record */
3658 int *stat) /* output: success/failure */
3659{
3660 struct xfs_btree_block *block; /* btree block */
3661 struct xfs_buf *bp; /* buffer pointer */
3662 int ptr; /* record number */
3663#ifdef DEBUG
3664 int error; /* error return value */
3665#endif
3666
3667 ptr = cur->bc_ptrs[0];
3668 block = xfs_btree_get_block(cur, 0, &bp);
3669
3670#ifdef DEBUG
3671 error = xfs_btree_check_block(cur, block, 0, bp);
3672 if (error)
3673 return error;
3674#endif
3675
3676 /*
3677 * Off the right end or left end, return failure.
3678 */
3679 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3680 *stat = 0;
3681 return 0;
3682 }
3683
3684 /*
3685 * Point to the record and extract its data.
3686 */
3687 *recp = xfs_btree_rec_addr(cur, ptr, block);
3688 *stat = 1;
3689 return 0;
3690}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 1f528a2a3754..789fffdf8b2f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -39,39 +39,19 @@ extern kmem_zone_t *xfs_btree_cur_zone;
39#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) 39#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
40 40
41/* 41/*
42 * Short form header: space allocation btrees. 42 * Generic btree header.
43 */ 43 *
44typedef struct xfs_btree_sblock { 44 * This is a combination of the actual format used on disk for short and long
45 __be32 bb_magic; /* magic number for block type */ 45 * format btrees. The first three fields are shared by both formats, but
46 __be16 bb_level; /* 0 is a leaf */ 46 * the pointers are different and should be used with care.
47 __be16 bb_numrecs; /* current # of data records */ 47 *
48 __be32 bb_leftsib; /* left sibling block or NULLAGBLOCK */ 48 * To get the size of the actual short or long form headers please use
49 __be32 bb_rightsib; /* right sibling block or NULLAGBLOCK */ 49 * the size macros below. Never use sizeof(xfs_btree_block).
50} xfs_btree_sblock_t;
51
52/*
53 * Long form header: bmap btrees.
54 */
55typedef struct xfs_btree_lblock {
56 __be32 bb_magic; /* magic number for block type */
57 __be16 bb_level; /* 0 is a leaf */
58 __be16 bb_numrecs; /* current # of data records */
59 __be64 bb_leftsib; /* left sibling block or NULLDFSBNO */
60 __be64 bb_rightsib; /* right sibling block or NULLDFSBNO */
61} xfs_btree_lblock_t;
62
63/*
64 * Combined header and structure, used by common code.
65 */ 50 */
66typedef struct xfs_btree_hdr 51struct xfs_btree_block {
67{
68 __be32 bb_magic; /* magic number for block type */ 52 __be32 bb_magic; /* magic number for block type */
69 __be16 bb_level; /* 0 is a leaf */ 53 __be16 bb_level; /* 0 is a leaf */
70 __be16 bb_numrecs; /* current # of data records */ 54 __be16 bb_numrecs; /* current # of data records */
71} xfs_btree_hdr_t;
72
73typedef struct xfs_btree_block {
74 xfs_btree_hdr_t bb_h; /* header */
75 union { 55 union {
76 struct { 56 struct {
77 __be32 bb_leftsib; 57 __be32 bb_leftsib;
@@ -82,7 +62,36 @@ typedef struct xfs_btree_block {
82 __be64 bb_rightsib; 62 __be64 bb_rightsib;
83 } l; /* long form pointers */ 63 } l; /* long form pointers */
84 } bb_u; /* rest */ 64 } bb_u; /* rest */
85} xfs_btree_block_t; 65};
66
67#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
68#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
69
70
71/*
72 * Generic key, ptr and record wrapper structures.
73 *
74 * These are disk format structures, and are converted where necessary
75 * by the btree specific code that needs to interpret them.
76 */
77union xfs_btree_ptr {
78 __be32 s; /* short form ptr */
79 __be64 l; /* long form ptr */
80};
81
82union xfs_btree_key {
83 xfs_bmbt_key_t bmbt;
84 xfs_bmdr_key_t bmbr; /* bmbt root block */
85 xfs_alloc_key_t alloc;
86 xfs_inobt_key_t inobt;
87};
88
89union xfs_btree_rec {
90 xfs_bmbt_rec_t bmbt;
91 xfs_bmdr_rec_t bmbr; /* bmbt root block */
92 xfs_alloc_rec_t alloc;
93 xfs_inobt_rec_t inobt;
94};
86 95
87/* 96/*
88 * For logging record fields. 97 * For logging record fields.
@@ -96,46 +105,131 @@ typedef struct xfs_btree_block {
96#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) 105#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
97 106
98/* 107/*
99 * Boolean to select which form of xfs_btree_block_t.bb_u to use.
100 */
101#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
102
103/*
104 * Magic numbers for btree blocks. 108 * Magic numbers for btree blocks.
105 */ 109 */
106extern const __uint32_t xfs_magics[]; 110extern const __uint32_t xfs_magics[];
107 111
108/* 112/*
109 * Maximum and minimum records in a btree block. 113 * Generic stats interface
110 * Given block size, type prefix, and leaf flag (0 or 1). 114 */
111 * The divisor below is equivalent to lf ? (e1) : (e2) but that produces 115#define __XFS_BTREE_STATS_INC(type, stat) \
112 * compiler warnings. 116 XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
113 */ 117#define XFS_BTREE_STATS_INC(cur, stat) \
114#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \ 118do { \
115 ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \ 119 switch (cur->bc_btnum) { \
116 (((lf) * (uint)sizeof(t ## _rec_t)) + \ 120 case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
117 ((1 - (lf)) * \ 121 case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
118 ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t)))))) 122 case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
119#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \ 123 case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
120 (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2) 124 case XFS_BTNUM_MAX: ASSERT(0); /* placate gcc */ ; break; \
121 125 } \
122/* 126} while (0)
123 * Record, key, and pointer address calculation macros. 127
124 * Given block size, type prefix, block pointer, and index of requested entry 128#define __XFS_BTREE_STATS_ADD(type, stat, val) \
125 * (first entry numbered 1). 129 XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
126 */ 130#define XFS_BTREE_STATS_ADD(cur, stat, val) \
127#define XFS_BTREE_REC_ADDR(t,bb,i) \ 131do { \
128 ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 132 switch (cur->bc_btnum) { \
129 ((i) - 1) * sizeof(t ## _rec_t))) 133 case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
130#define XFS_BTREE_KEY_ADDR(t,bb,i) \ 134 case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
131 ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 135 case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
132 ((i) - 1) * sizeof(t ## _key_t))) 136 case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
133#define XFS_BTREE_PTR_ADDR(t,bb,i,mxr) \ 137 case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
134 ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 137 case XFS_BTNUM_MAX: ASSERT(0); /* placate gcc */ ; break; \
135 (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) 139} while (0)
136 140
137#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ 141#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
138 142
143struct xfs_btree_ops {
144 /* size of the key and record structures */
145 size_t key_len;
146 size_t rec_len;
147
148 /* cursor operations */
149 struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
150 void (*update_cursor)(struct xfs_btree_cur *src,
151 struct xfs_btree_cur *dst);
152
153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158
159 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur,
161 union xfs_btree_ptr *start_bno,
162 union xfs_btree_ptr *new_bno,
163 int length, int *stat);
164 int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
165
166 /* update last record information */
167 void (*update_lastrec)(struct xfs_btree_cur *cur,
168 struct xfs_btree_block *block,
169 union xfs_btree_rec *rec,
170 int ptr, int reason);
171
172 /* records in block/level */
173 int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
174 int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
175
176 /* records on disk. Matters for the root-in-inode case. */
177 int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
178
179 /* init values of btree structures */
180 void (*init_key_from_rec)(union xfs_btree_key *key,
181 union xfs_btree_rec *rec);
182 void (*init_rec_from_key)(union xfs_btree_key *key,
183 union xfs_btree_rec *rec);
184 void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
185 union xfs_btree_rec *rec);
186 void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
187 union xfs_btree_ptr *ptr);
188
189 /* difference between key value and cursor value */
190 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
191 union xfs_btree_key *key);
192
193#ifdef DEBUG
194 /* check that k1 is lower than k2 */
195 int (*keys_inorder)(struct xfs_btree_cur *cur,
196 union xfs_btree_key *k1,
197 union xfs_btree_key *k2);
198
199 /* check that r1 is lower than r2 */
200 int (*recs_inorder)(struct xfs_btree_cur *cur,
201 union xfs_btree_rec *r1,
202 union xfs_btree_rec *r2);
203#endif
204
205 /* btree tracing */
206#ifdef XFS_BTREE_TRACE
207 void (*trace_enter)(struct xfs_btree_cur *, const char *,
208 char *, int, int, __psunsigned_t,
209 __psunsigned_t, __psunsigned_t,
210 __psunsigned_t, __psunsigned_t,
211 __psunsigned_t, __psunsigned_t,
212 __psunsigned_t, __psunsigned_t,
213 __psunsigned_t, __psunsigned_t);
214 void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *,
215 __uint64_t *, __uint64_t *);
216 void (*trace_key)(struct xfs_btree_cur *,
217 union xfs_btree_key *, __uint64_t *,
218 __uint64_t *);
219 void (*trace_record)(struct xfs_btree_cur *,
220 union xfs_btree_rec *, __uint64_t *,
221 __uint64_t *, __uint64_t *);
222#endif
223};
224
225/*
226 * Reasons for the update_lastrec method to be called.
227 */
228#define LASTREC_UPDATE 0
229#define LASTREC_INSREC 1
230#define LASTREC_DELREC 2
231
232
139/* 233/*
140 * Btree cursor structure. 234 * Btree cursor structure.
141 * This collects all information needed by the btree code in one place. 235 * This collects all information needed by the btree code in one place.
@@ -144,6 +238,8 @@ typedef struct xfs_btree_cur
144{ 238{
145 struct xfs_trans *bc_tp; /* transaction we're in, if any */ 239 struct xfs_trans *bc_tp; /* transaction we're in, if any */
146 struct xfs_mount *bc_mp; /* file system mount struct */ 240 struct xfs_mount *bc_mp; /* file system mount struct */
241 const struct xfs_btree_ops *bc_ops;
242 uint bc_flags; /* btree features - below */
147 union { 243 union {
148 xfs_alloc_rec_incore_t a; 244 xfs_alloc_rec_incore_t a;
149 xfs_bmbt_irec_t b; 245 xfs_bmbt_irec_t b;
@@ -175,94 +271,40 @@ typedef struct xfs_btree_cur
175 } bc_private; /* per-btree type data */ 271 } bc_private; /* per-btree type data */
176} xfs_btree_cur_t; 272} xfs_btree_cur_t;
177 273
274/* cursor flags */
275#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
276#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
277#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
278
279
178#define XFS_BTREE_NOERROR 0 280#define XFS_BTREE_NOERROR 0
179#define XFS_BTREE_ERROR 1 281#define XFS_BTREE_ERROR 1
180 282
181/* 283/*
182 * Convert from buffer to btree block header. 284 * Convert from buffer to btree block header.
183 */ 285 */
184#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)XFS_BUF_PTR(bp)) 286#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp))
185#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp))
186#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp))
187 287
188 288
189#ifdef __KERNEL__
190
191#ifdef DEBUG
192/* 289/*
193 * Debug routine: check that block header is ok. 290 * Check that block header is ok.
194 */ 291 */
195void 292int
196xfs_btree_check_block( 293xfs_btree_check_block(
197 xfs_btree_cur_t *cur, /* btree cursor */ 294 struct xfs_btree_cur *cur, /* btree cursor */
198 xfs_btree_block_t *block, /* generic btree block pointer */ 295 struct xfs_btree_block *block, /* generic btree block pointer */
199 int level, /* level of the btree block */
200 struct xfs_buf *bp); /* buffer containing block, if any */
201
202/*
203 * Debug routine: check that keys are in the right order.
204 */
205void
206xfs_btree_check_key(
207 xfs_btnum_t btnum, /* btree identifier */
208 void *ak1, /* pointer to left (lower) key */
209 void *ak2); /* pointer to right (higher) key */
210
211/*
212 * Debug routine: check that records are in the right order.
213 */
214void
215xfs_btree_check_rec(
216 xfs_btnum_t btnum, /* btree identifier */
217 void *ar1, /* pointer to left (lower) record */
218 void *ar2); /* pointer to right (higher) record */
219#else
220#define xfs_btree_check_block(a,b,c,d)
221#define xfs_btree_check_key(a,b,c)
222#define xfs_btree_check_rec(a,b,c)
223#endif /* DEBUG */
224
225/*
226 * Checking routine: check that long form block header is ok.
227 */
228int /* error (0 or EFSCORRUPTED) */
229xfs_btree_check_lblock(
230 xfs_btree_cur_t *cur, /* btree cursor */
231 xfs_btree_lblock_t *block, /* btree long form block pointer */
232 int level, /* level of the btree block */ 296 int level, /* level of the btree block */
233 struct xfs_buf *bp); /* buffer containing block, if any */ 297 struct xfs_buf *bp); /* buffer containing block, if any */
234 298
235/* 299/*
236 * Checking routine: check that (long) pointer is ok. 300 * Check that (long) pointer is ok.
237 */ 301 */
238int /* error (0 or EFSCORRUPTED) */ 302int /* error (0 or EFSCORRUPTED) */
239xfs_btree_check_lptr( 303xfs_btree_check_lptr(
240 xfs_btree_cur_t *cur, /* btree cursor */ 304 struct xfs_btree_cur *cur, /* btree cursor */
241 xfs_dfsbno_t ptr, /* btree block disk address */ 305 xfs_dfsbno_t ptr, /* btree block disk address */
242 int level); /* btree block level */ 306 int level); /* btree block level */
243 307
244#define xfs_btree_check_lptr_disk(cur, ptr, level) \
245 xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level)
246
247/*
248 * Checking routine: check that short form block header is ok.
249 */
250int /* error (0 or EFSCORRUPTED) */
251xfs_btree_check_sblock(
252 xfs_btree_cur_t *cur, /* btree cursor */
253 xfs_btree_sblock_t *block, /* btree short form block pointer */
254 int level, /* level of the btree block */
255 struct xfs_buf *bp); /* buffer containing block */
256
257/*
258 * Checking routine: check that (short) pointer is ok.
259 */
260int /* error (0 or EFSCORRUPTED) */
261xfs_btree_check_sptr(
262 xfs_btree_cur_t *cur, /* btree cursor */
263 xfs_agblock_t ptr, /* btree block disk address */
264 int level); /* btree block level */
265
266/* 308/*
267 * Delete the btree cursor. 309 * Delete the btree cursor.
268 */ 310 */
@@ -281,15 +323,6 @@ xfs_btree_dup_cursor(
281 xfs_btree_cur_t **ncur);/* output cursor */ 323 xfs_btree_cur_t **ncur);/* output cursor */
282 324
283/* 325/*
284 * Change the cursor to point to the first record in the current block
285 * at the given level. Other levels are unaffected.
286 */
287int /* success=1, failure=0 */
288xfs_btree_firstrec(
289 xfs_btree_cur_t *cur, /* btree cursor */
290 int level); /* level to change */
291
292/*
293 * Get a buffer for the block, return it with no data read. 326 * Get a buffer for the block, return it with no data read.
294 * Long-form addressing. 327 * Long-form addressing.
295 */ 328 */
@@ -313,20 +346,6 @@ xfs_btree_get_bufs(
313 uint lock); /* lock flags for get_buf */ 346 uint lock); /* lock flags for get_buf */
314 347
315/* 348/*
316 * Allocate a new btree cursor.
317 * The cursor is either for allocation (A) or bmap (B).
318 */
319xfs_btree_cur_t * /* new btree cursor */
320xfs_btree_init_cursor(
321 struct xfs_mount *mp, /* file system mount point */
322 struct xfs_trans *tp, /* transaction pointer */
323 struct xfs_buf *agbp, /* (A only) buffer for agf structure */
324 xfs_agnumber_t agno, /* (A only) allocation group number */
325 xfs_btnum_t btnum, /* btree identifier */
326 struct xfs_inode *ip, /* (B only) inode owning the btree */
327 int whichfork); /* (B only) data/attr fork */
328
329/*
330 * Check for the cursor referring to the last block at the given level. 349 * Check for the cursor referring to the last block at the given level.
331 */ 350 */
332int /* 1=is last block, 0=not last block */ 351int /* 1=is last block, 0=not last block */
@@ -335,15 +354,6 @@ xfs_btree_islastblock(
335 int level); /* level to check */ 354 int level); /* level to check */
336 355
337/* 356/*
338 * Change the cursor to point to the last record in the current block
339 * at the given level. Other levels are unaffected.
340 */
341int /* success=1, failure=0 */
342xfs_btree_lastrec(
343 xfs_btree_cur_t *cur, /* btree cursor */
344 int level); /* level to change */
345
346/*
347 * Compute first and last byte offsets for the fields given. 357 * Compute first and last byte offsets for the fields given.
348 * Interprets the offsets table, which contains struct field offsets. 358 * Interprets the offsets table, which contains struct field offsets.
349 */ 359 */
@@ -404,39 +414,53 @@ xfs_btree_reada_bufs(
404 xfs_extlen_t count); /* count of filesystem blocks */ 414 xfs_extlen_t count); /* count of filesystem blocks */
405 415
406/* 416/*
407 * Read-ahead btree blocks, at the given level. 417 * Set the buffer for level "lev" in the cursor to bp, releasing
408 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 418 * any previous buffer.
409 */ 419 */
410int /* readahead block count */ 420void
411xfs_btree_readahead_core( 421xfs_btree_setbuf(
412 xfs_btree_cur_t *cur, /* btree cursor */ 422 xfs_btree_cur_t *cur, /* btree cursor */
413 int lev, /* level in btree */ 423 int lev, /* level in btree */
414 int lr); /* left/right bits */ 424 struct xfs_buf *bp); /* new buffer to set */
415 425
416static inline int /* readahead block count */
417xfs_btree_readahead(
418 xfs_btree_cur_t *cur, /* btree cursor */
419 int lev, /* level in btree */
420 int lr) /* left/right bits */
421{
422 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
423 return 0;
424 426
425 return xfs_btree_readahead_core(cur, lev, lr); 427/*
426} 428 * Common btree core entry points.
429 */
430int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
431int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
432int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
433int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
434int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
435int xfs_btree_kill_iroot(struct xfs_btree_cur *);
436int xfs_btree_insert(struct xfs_btree_cur *, int *);
437int xfs_btree_delete(struct xfs_btree_cur *, int *);
438int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
427 439
440/*
441 * Internal btree helpers also used by xfs_bmap.c.
442 */
443void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
444void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
428 445
429/* 446/*
430 * Set the buffer for level "lev" in the cursor to bp, releasing 447 * Helpers.
431 * any previous buffer.
432 */ 448 */
433void 449static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
434xfs_btree_setbuf( 450{
435 xfs_btree_cur_t *cur, /* btree cursor */ 451 return be16_to_cpu(block->bb_numrecs);
436 int lev, /* level in btree */ 452}
437 struct xfs_buf *bp); /* new buffer to set */ 453
454static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
455 __uint16_t numrecs)
456{
457 block->bb_numrecs = cpu_to_be16(numrecs);
458}
438 459
439#endif /* __KERNEL__ */ 460static inline int xfs_btree_get_level(struct xfs_btree_block *block)
461{
462 return be16_to_cpu(block->bb_level);
463}
440 464
441 465
442/* 466/*
diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c
new file mode 100644
index 000000000000..44ff942a0fda
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.c
@@ -0,0 +1,249 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_types.h"
20#include "xfs_inum.h"
21#include "xfs_bmap_btree.h"
22#include "xfs_alloc_btree.h"
23#include "xfs_ialloc_btree.h"
24#include "xfs_inode.h"
25#include "xfs_btree.h"
26#include "xfs_btree_trace.h"
27
28STATIC void
29xfs_btree_trace_ptr(
30 struct xfs_btree_cur *cur,
31 union xfs_btree_ptr ptr,
32 __psunsigned_t *high,
33 __psunsigned_t *low)
34{
35 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
36 __u64 val = be64_to_cpu(ptr.l);
37 *high = val >> 32;
38 *low = (int)val;
39 } else {
40 *high = 0;
41 *low = be32_to_cpu(ptr.s);
42 }
43}
44
45/*
46 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
47 */
48void
49xfs_btree_trace_argbi(
50 const char *func,
51 struct xfs_btree_cur *cur,
52 struct xfs_buf *b,
53 int i,
54 int line)
55{
56 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI,
57 line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0,
58 0, 0, 0, 0);
59}
60
61/*
62 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
63 */
64void
65xfs_btree_trace_argbii(
66 const char *func,
67 struct xfs_btree_cur *cur,
68 struct xfs_buf *b,
69 int i0,
70 int i1,
71 int line)
72{
73 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII,
74 line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0,
75 0, 0, 0, 0);
76}
77
78/*
79 * Add a trace buffer entry for arguments, for 3 block-length args
80 * and an integer arg.
81 */
82void
83xfs_btree_trace_argfffi(
84 const char *func,
85 struct xfs_btree_cur *cur,
86 xfs_dfiloff_t o,
87 xfs_dfsbno_t b,
88 xfs_dfilblks_t i,
89 int j,
90 int line)
91{
92 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI,
93 line,
94 o >> 32, (int)o,
95 b >> 32, (int)b,
96 i >> 32, (int)i,
97 (int)j, 0, 0, 0, 0);
98}
99
100/*
101 * Add a trace buffer entry for arguments, for one integer arg.
102 */
103void
104xfs_btree_trace_argi(
105 const char *func,
106 struct xfs_btree_cur *cur,
107 int i,
108 int line)
109{
110 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI,
111 line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
112}
113
114/*
115 * Add a trace buffer entry for arguments, for int, fsblock, key.
116 */
117void
118xfs_btree_trace_argipk(
119 const char *func,
120 struct xfs_btree_cur *cur,
121 int i,
122 union xfs_btree_ptr ptr,
123 union xfs_btree_key *key,
124 int line)
125{
126 __psunsigned_t high, low;
127 __uint64_t l0, l1;
128
129 xfs_btree_trace_ptr(cur, ptr, &high, &low);
130 cur->bc_ops->trace_key(cur, key, &l0, &l1);
131 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK,
132 line, i, high, low,
133 l0 >> 32, (int)l0,
134 l1 >> 32, (int)l1,
135 0, 0, 0, 0);
136}
137
138/*
139 * Add a trace buffer entry for arguments, for int, fsblock, rec.
140 */
141void
142xfs_btree_trace_argipr(
143 const char *func,
144 struct xfs_btree_cur *cur,
145 int i,
146 union xfs_btree_ptr ptr,
147 union xfs_btree_rec *rec,
148 int line)
149{
150 __psunsigned_t high, low;
151 __uint64_t l0, l1, l2;
152
153 xfs_btree_trace_ptr(cur, ptr, &high, &low);
154 cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
155 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR,
156 line, i,
157 high, low,
158 l0 >> 32, (int)l0,
159 l1 >> 32, (int)l1,
160 l2 >> 32, (int)l2,
161 0, 0);
162}
163
164/*
165 * Add a trace buffer entry for arguments, for int, key.
166 */
167void
168xfs_btree_trace_argik(
169 const char *func,
170 struct xfs_btree_cur *cur,
171 int i,
172 union xfs_btree_key *key,
173 int line)
174{
175 __uint64_t l0, l1;
176
177 cur->bc_ops->trace_key(cur, key, &l0, &l1);
178 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK,
179 line, i,
180 l0 >> 32, (int)l0,
181 l1 >> 32, (int)l1,
182 0, 0, 0, 0, 0, 0);
183}
184
185/*
186 * Add a trace buffer entry for arguments, for record.
187 */
188void
189xfs_btree_trace_argr(
190 const char *func,
191 struct xfs_btree_cur *cur,
192 union xfs_btree_rec *rec,
193 int line)
194{
195 __uint64_t l0, l1, l2;
196
197 cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
198 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR,
199 line,
200 l0 >> 32, (int)l0,
201 l1 >> 32, (int)l1,
202 l2 >> 32, (int)l2,
203 0, 0, 0, 0, 0);
204}
205
206/*
207 * Add a trace buffer entry for the cursor/operation.
208 */
209void
210xfs_btree_trace_cursor(
211 const char *func,
212 struct xfs_btree_cur *cur,
213 int type,
214 int line)
215{
216 __uint32_t s0;
217 __uint64_t l0, l1;
218 char *s;
219
220 switch (type) {
221 case XBT_ARGS:
222 s = "args";
223 break;
224 case XBT_ENTRY:
225 s = "entry";
226 break;
227 case XBT_ERROR:
228 s = "error";
229 break;
230 case XBT_EXIT:
231 s = "exit";
232 break;
233 default:
234 s = "unknown";
235 break;
236 }
237
238 cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1);
239 cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line,
240 s0,
241 l0 >> 32, (int)l0,
242 l1 >> 32, (int)l1,
243 (__psunsigned_t)cur->bc_bufs[0],
244 (__psunsigned_t)cur->bc_bufs[1],
245 (__psunsigned_t)cur->bc_bufs[2],
246 (__psunsigned_t)cur->bc_bufs[3],
247 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
248 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
249}
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
new file mode 100644
index 000000000000..b3f5eb3c3c6c
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.h
@@ -0,0 +1,116 @@
/*
 * Copyright (c) 2008 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#ifndef __XFS_BTREE_TRACE_H__
#define __XFS_BTREE_TRACE_H__

struct xfs_btree_cur;
struct xfs_buf;


/*
 * Trace hooks.
 * i,j = integer (32 bit)
 * b = btree block buffer (xfs_buf_t)
 * p = btree ptr
 * r = btree record
 * k = btree key
 */

#ifdef XFS_BTREE_TRACE

/*
 * Trace buffer entry types.
 */
#define XFS_BTREE_KTRACE_ARGBI	1
#define XFS_BTREE_KTRACE_ARGBII	2
#define XFS_BTREE_KTRACE_ARGFFFI 3
#define XFS_BTREE_KTRACE_ARGI	4
#define XFS_BTREE_KTRACE_ARGIPK	5
#define XFS_BTREE_KTRACE_ARGIPR	6
#define XFS_BTREE_KTRACE_ARGIK	7
#define XFS_BTREE_KTRACE_ARGR	8
#define XFS_BTREE_KTRACE_CUR	9

/*
 * Sub-types for cursor traces.
 */
#define XBT_ARGS	0
#define XBT_ENTRY	1
#define XBT_ERROR	2
#define XBT_EXIT	3

void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
		struct xfs_buf *, int, int);
void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
		struct xfs_buf *, int, int, int);
void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
		xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
		union xfs_btree_ptr, union xfs_btree_key *, int);
void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int,
		union xfs_btree_ptr, union xfs_btree_rec *, int);
void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int,
		union xfs_btree_key *, int);
void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
		union xfs_btree_rec *, int);
void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);


#define XFS_ALLOCBT_TRACE_SIZE	4096	/* size of global trace buffer */
extern ktrace_t	*xfs_allocbt_trace_buf;

#define XFS_INOBT_TRACE_SIZE	4096	/* size of global trace buffer */
extern ktrace_t	*xfs_inobt_trace_buf;

#define XFS_BMBT_TRACE_SIZE	4096	/* size of global trace buffer */
#define XFS_BMBT_KTRACE_SIZE	32	/* size of per-inode trace buffer */
extern ktrace_t	*xfs_bmbt_trace_buf;


/*
 * Wrappers that pass the caller's function name and line number to the
 * tracing helpers above.
 */
#define	XFS_BTREE_TRACE_ARGBI(c, b, i)	\
	xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
#define	XFS_BTREE_TRACE_ARGBII(c, b, i, j)	\
	xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
#define	XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)	\
	xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
#define	XFS_BTREE_TRACE_ARGI(c, i)	\
	xfs_btree_trace_argi(__func__, c, i, __LINE__)
#define	XFS_BTREE_TRACE_ARGIPK(c, i, p, k)	\
	xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__)
#define	XFS_BTREE_TRACE_ARGIPR(c, i, p, r)	\
	xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__)
#define	XFS_BTREE_TRACE_ARGIK(c, i, k)	\
	xfs_btree_trace_argik(__func__, c, i, k, __LINE__)
#define	XFS_BTREE_TRACE_ARGR(c, r)	\
	xfs_btree_trace_argr(__func__, c, r, __LINE__)
#define	XFS_BTREE_TRACE_CURSOR(c, t)	\
	xfs_btree_trace_cursor(__func__, c, t, __LINE__)
#else
/*
 * Stubs that compile away to nothing when tracing is disabled.
 * Parameter names mirror the enabled variants above (the ARGIPK stub
 * previously named its key argument "s" instead of "k").
 */
#define	XFS_BTREE_TRACE_ARGBI(c, b, i)
#define	XFS_BTREE_TRACE_ARGBII(c, b, i, j)
#define	XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
#define	XFS_BTREE_TRACE_ARGI(c, i)
#define	XFS_BTREE_TRACE_ARGIPK(c, i, p, k)
#define	XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
#define	XFS_BTREE_TRACE_ARGIK(c, i, k)
#define	XFS_BTREE_TRACE_ARGR(c, r)
#define	XFS_BTREE_TRACE_CURSOR(c, t)
#endif	/* XFS_BTREE_TRACE */

#endif	/* __XFS_BTREE_TRACE_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 002fc2617c8e..92af4098c7e8 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -375,7 +375,7 @@ xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 375 xfs_buf_log_item_t *bip,
376 int stale) 376 int stale)
377{ 377{
378 xfs_mount_t *mp; 378 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 379 xfs_buf_t *bp;
380 int freed; 380 int freed;
381 381
@@ -387,7 +387,7 @@ xfs_buf_item_unpin(
387 xfs_buftrace("XFS_UNPIN", bp); 387 xfs_buftrace("XFS_UNPIN", bp);
388 388
389 freed = atomic_dec_and_test(&bip->bli_refcount); 389 freed = atomic_dec_and_test(&bip->bli_refcount);
390 mp = bip->bli_item.li_mountp; 390 ailp = bip->bli_item.li_ailp;
391 xfs_bunpin(bp); 391 xfs_bunpin(bp);
392 if (freed && stale) { 392 if (freed && stale) {
393 ASSERT(bip->bli_flags & XFS_BLI_STALE); 393 ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -399,17 +399,17 @@ xfs_buf_item_unpin(
399 xfs_buftrace("XFS_UNPIN STALE", bp); 399 xfs_buftrace("XFS_UNPIN STALE", bp);
400 /* 400 /*
401 * If we get called here because of an IO error, we may 401 * If we get called here because of an IO error, we may
402 * or may not have the item on the AIL. xfs_trans_delete_ail() 402 * or may not have the item on the AIL. xfs_trans_ail_delete()
403 * will take care of that situation. 403 * will take care of that situation.
404 * xfs_trans_delete_ail() drops the AIL lock. 404 * xfs_trans_ail_delete() drops the AIL lock.
405 */ 405 */
406 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 406 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
408 XFS_BUF_SET_FSPRIVATE(bp, NULL); 408 XFS_BUF_SET_FSPRIVATE(bp, NULL);
409 XFS_BUF_CLR_IODONE_FUNC(bp); 409 XFS_BUF_CLR_IODONE_FUNC(bp);
410 } else { 410 } else {
411 spin_lock(&mp->m_ail_lock); 411 spin_lock(&ailp->xa_lock);
412 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 412 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
413 xfs_buf_item_relse(bp); 413 xfs_buf_item_relse(bp);
414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); 414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
415 } 415 }
@@ -707,8 +707,8 @@ xfs_buf_item_init(
707 * the first. If we do already have one, there is 707 * the first. If we do already have one, there is
708 * nothing to do here so return. 708 * nothing to do here so return.
709 */ 709 */
710 if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) 710 if (bp->b_mount != mp)
711 XFS_BUF_SET_FSPRIVATE3(bp, mp); 711 bp->b_mount = mp;
712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -731,6 +731,7 @@ xfs_buf_item_init(
731 bip->bli_item.li_type = XFS_LI_BUF; 731 bip->bli_item.li_type = XFS_LI_BUF;
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_item.li_ailp = mp->m_ail;
734 bip->bli_buf = bp; 735 bip->bli_buf = bp;
735 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
736 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
@@ -997,21 +998,7 @@ xfs_buf_iodone_callbacks(
997 xfs_buf_do_callbacks(bp, lip); 998 xfs_buf_do_callbacks(bp, lip);
998 XFS_BUF_SET_FSPRIVATE(bp, NULL); 999 XFS_BUF_SET_FSPRIVATE(bp, NULL);
999 XFS_BUF_CLR_IODONE_FUNC(bp); 1000 XFS_BUF_CLR_IODONE_FUNC(bp);
1000 1001 xfs_biodone(bp);
1001 /*
1002 * XFS_SHUT flag gets set when we go thru the
1003 * entire buffer cache and deliberately start
1004 * throwing away delayed write buffers.
1005 * Since there's no biowait done on those,
1006 * we should just brelse them.
1007 */
1008 if (XFS_BUF_ISSHUT(bp)) {
1009 XFS_BUF_UNSHUT(bp);
1010 xfs_buf_relse(bp);
1011 } else {
1012 xfs_biodone(bp);
1013 }
1014
1015 return; 1002 return;
1016 } 1003 }
1017 1004
@@ -1122,27 +1109,23 @@ xfs_buf_iodone(
1122 xfs_buf_t *bp, 1109 xfs_buf_t *bp,
1123 xfs_buf_log_item_t *bip) 1110 xfs_buf_log_item_t *bip)
1124{ 1111{
1125 struct xfs_mount *mp; 1112 struct xfs_ail *ailp = bip->bli_item.li_ailp;
1126 1113
1127 ASSERT(bip->bli_buf == bp); 1114 ASSERT(bip->bli_buf == bp);
1128 1115
1129 xfs_buf_rele(bp); 1116 xfs_buf_rele(bp);
1130 mp = bip->bli_item.li_mountp;
1131 1117
1132 /* 1118 /*
1133 * If we are forcibly shutting down, this may well be 1119 * If we are forcibly shutting down, this may well be
1134 * off the AIL already. That's because we simulate the 1120 * off the AIL already. That's because we simulate the
1135 * log-committed callbacks to unpin these buffers. Or we may never 1121 * log-committed callbacks to unpin these buffers. Or we may never
1136 * have put this item on AIL because of the transaction was 1122 * have put this item on AIL because of the transaction was
1137 * aborted forcibly. xfs_trans_delete_ail() takes care of these. 1123 * aborted forcibly. xfs_trans_ail_delete() takes care of these.
1138 * 1124 *
1139 * Either way, AIL is useless if we're forcing a shutdown. 1125 * Either way, AIL is useless if we're forcing a shutdown.
1140 */ 1126 */
1141 spin_lock(&mp->m_ail_lock); 1127 spin_lock(&ailp->xa_lock);
1142 /* 1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
1143 * xfs_trans_delete_ail() drops the AIL lock.
1144 */
1145 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
1146 xfs_buf_item_free(bip); 1129 xfs_buf_item_free(bip);
1147} 1130}
1148 1131
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
deleted file mode 100644
index d2ce5dd70d87..000000000000
--- a/fs/xfs/xfs_clnt.h
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CLNT_H__
19#define __XFS_CLNT_H__
20
21/*
22 * XFS arguments structure, constructed from the arguments we
23 * are passed via the mount system call.
24 *
25 * NOTE: The mount system call is handled differently between
26 * Linux and IRIX. In IRIX we worked work with a binary data
27 * structure coming in across the syscall interface from user
28 * space (the mount userspace knows about each filesystem type
29 * and the set of valid options for it, and converts the users
30 * argument string into a binary structure _before_ making the
31 * system call), and the ABI issues that this implies.
32 *
33 * In Linux, we are passed a comma separated set of options;
34 * ie. a NULL terminated string of characters. Userspace mount
35 * code does not have any knowledge of mount options expected by
36 * each filesystem type and so each filesystem parses its mount
37 * options in kernel space.
38 *
39 * For the Linux port, we kept this structure pretty much intact
40 * and use it internally (because the existing code groks it).
41 */
42struct xfs_mount_args {
43 int flags; /* flags -> see XFSMNT_... macros below */
44 int flags2; /* flags -> see XFSMNT2_... macros below */
45 int logbufs; /* Number of log buffers, -1 to default */
46 int logbufsize; /* Size of log buffers, -1 to default */
47 char fsname[MAXNAMELEN+1]; /* data device name */
48 char rtname[MAXNAMELEN+1]; /* realtime device filename */
49 char logname[MAXNAMELEN+1]; /* journal device filename */
50 char mtpt[MAXNAMELEN+1]; /* filesystem mount point */
51 int sunit; /* stripe unit (BBs) */
52 int swidth; /* stripe width (BBs), multiple of sunit */
53 uchar_t iosizelog; /* log2 of the preferred I/O size */
54 int ihashsize; /* inode hash table size (buckets) */
55};
56
57/*
58 * XFS mount option flags -- args->flags1
59 */
60#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */
61#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount
62 * compatible */
63#define XFSMNT_INO64 0x00000004 /* move inode numbers up
64 * past 2^32 */
65#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */
66#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */
67#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit
68 * enforcement */
69#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit
70 * enforcement */
71#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */
72#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at
73 * stripe boundaries*/
74#define XFSMNT_RETERR 0x00000400 /* return error to user */
75#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies
76 * read-only mount */
77#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */
78#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */
79#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */
80 /* (osyncisdsync is default) */
81#define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */
82#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32
83 * bits of address space */
84#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */
85#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit
86 * enforcement */
87#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */
88#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */
89#define XFSMNT_BARRIER 0x04000000 /* use write barriers */
90#define XFSMNT_IKEEP 0x08000000 /* inode cluster delete */
91#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
92 * allocation */
93#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
94 * symlink,mkdir,rmdir,mknod */
95#define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */
96
97/*
98 * XFS mount option flags -- args->flags2
99 */
100#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred
101 * I/O size in stat(2) */
102#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams
103 * allocator */
104
105#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 8be0b00ede9a..70b710c1792d 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -72,27 +72,7 @@ typedef struct xfs_da_intnode {
72typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; 72typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
73typedef struct xfs_da_node_entry xfs_da_node_entry_t; 73typedef struct xfs_da_node_entry xfs_da_node_entry_t;
74 74
75#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
76
77#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize 75#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
78#define XFS_LBLOG(mp) (mp)->m_sb.sb_blocklog
79
80#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
81 (((bno) << (mp)->m_dircook_elog) | (entry))
82#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
83 (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
84#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)cookie)
85#define XFS_DA_COOKIE_BNO(mp,cookie) \
86 ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
87 (xfs_dablk_t)0 : \
88 (xfs_dablk_t)((xfs_off_t)(cookie) >> \
89 ((mp)->m_dircook_elog + 32))))
90#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
91 ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
92 (xfs_dablk_t)0 : \
93 (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
94 ((1 << (mp)->m_dircook_elog) - 1))))
95
96 76
97/*======================================================================== 77/*========================================================================
98 * Btree searching and modification structure definitions. 78 * Btree searching and modification structure definitions.
@@ -226,9 +206,8 @@ struct xfs_nameops {
226}; 206};
227 207
228 208
229#ifdef __KERNEL__
230/*======================================================================== 209/*========================================================================
231 * Function prototypes for the kernel. 210 * Function prototypes.
232 *========================================================================*/ 211 *========================================================================*/
233 212
234/* 213/*
@@ -289,6 +268,5 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
289 268
290extern struct kmem_zone *xfs_da_state_zone; 269extern struct kmem_zone *xfs_da_state_zone;
291extern struct kmem_zone *xfs_dabuf_zone; 270extern struct kmem_zone *xfs_dabuf_zone;
292#endif /* __KERNEL__ */
293 271
294#endif /* __XFS_DA_BTREE_H__ */ 272#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 75b0cd4da0ea..b4c1ee713492 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -49,9 +49,8 @@
49 */ 49 */
50int 50int
51xfs_swapext( 51xfs_swapext(
52 xfs_swapext_t __user *sxu) 52 xfs_swapext_t *sxp)
53{ 53{
54 xfs_swapext_t *sxp;
55 xfs_inode_t *ip, *tip; 54 xfs_inode_t *ip, *tip;
56 struct file *file, *target_file; 55 struct file *file, *target_file;
57 int error = 0; 56 int error = 0;
@@ -62,11 +61,6 @@ xfs_swapext(
62 goto out; 61 goto out;
63 } 62 }
64 63
65 if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) {
66 error = XFS_ERROR(EFAULT);
67 goto out_free_sxp;
68 }
69
70 /* Pull information for the target fd */ 64 /* Pull information for the target fd */
71 file = fget((int)sxp->sx_fdtarget); 65 file = fget((int)sxp->sx_fdtarget);
72 if (!file) { 66 if (!file) {
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index da178205be68..4f55a6306558 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -46,7 +46,7 @@ typedef struct xfs_swapext
46/* 46/*
47 * Syscall interface for xfs_swapext 47 * Syscall interface for xfs_swapext
48 */ 48 */
49int xfs_swapext(struct xfs_swapext __user *sx); 49int xfs_swapext(struct xfs_swapext *sx);
50 50
51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, 51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
52 struct xfs_swapext *sxp); 52 struct xfs_swapext *sxp);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index c9065eaf2a4d..162e8726df5e 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -18,32 +18,29 @@
18#ifndef __XFS_DINODE_H__ 18#ifndef __XFS_DINODE_H__
19#define __XFS_DINODE_H__ 19#define __XFS_DINODE_H__
20 20
21struct xfs_buf; 21#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
22struct xfs_mount; 22#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2))
23 23
24#define XFS_DINODE_VERSION_1 1
25#define XFS_DINODE_VERSION_2 2
26#define XFS_DINODE_GOOD_VERSION(v) \
27 (((v) == XFS_DINODE_VERSION_1 || (v) == XFS_DINODE_VERSION_2))
28#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
29
30/*
31 * Disk inode structure.
32 * This is just the header; the inode is expanded to fill a variable size
33 * with the last field expanding. It is split into the core and "other"
34 * because we only need the core part in the in-core inode.
35 */
36typedef struct xfs_timestamp { 24typedef struct xfs_timestamp {
37 __be32 t_sec; /* timestamp seconds */ 25 __be32 t_sec; /* timestamp seconds */
38 __be32 t_nsec; /* timestamp nanoseconds */ 26 __be32 t_nsec; /* timestamp nanoseconds */
39} xfs_timestamp_t; 27} xfs_timestamp_t;
40 28
41/* 29/*
42 * Note: Coordinate changes to this structure with the XFS_DI_* #defines 30 * On-disk inode structure.
43 * below, the offsets table in xfs_ialloc_log_di() and struct xfs_icdinode 31 *
44 * in xfs_inode.h. 32 * This is just the header or "dinode core", the inode is expanded to fill a
33 * variable size the leftover area split into a data and an attribute fork.
34 * The format of the data and attribute fork depends on the format of the
35 * inode as indicated by di_format and di_aformat. To access the data and
36 * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros
37 * below.
38 *
39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian.
45 */ 42 */
46typedef struct xfs_dinode_core { 43typedef struct xfs_dinode {
47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 44 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
48 __be16 di_mode; /* mode and type of file */ 45 __be16 di_mode; /* mode and type of file */
49 __u8 di_version; /* inode version */ 46 __u8 di_version; /* inode version */
@@ -69,34 +66,12 @@ typedef struct xfs_dinode_core {
69 __be16 di_dmstate; /* DMIG state info */ 66 __be16 di_dmstate; /* DMIG state info */
70 __be16 di_flags; /* random flags, XFS_DIFLAG_... */ 67 __be16 di_flags; /* random flags, XFS_DIFLAG_... */
71 __be32 di_gen; /* generation number */ 68 __be32 di_gen; /* generation number */
72} xfs_dinode_core_t;
73 69
74#define DI_MAX_FLUSH 0xffff 70 /* di_next_unlinked is the only non-core field in the old dinode */
71 __be32 di_next_unlinked;/* agi unlinked list ptr */
72} __attribute__((packed)) xfs_dinode_t;
75 73
76typedef struct xfs_dinode 74#define DI_MAX_FLUSH 0xffff
77{
78 xfs_dinode_core_t di_core;
79 /*
80 * In adding anything between the core and the union, be
81 * sure to update the macros like XFS_LITINO below and
82 * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
83 */
84 __be32 di_next_unlinked;/* agi unlinked list ptr */
85 union {
86 xfs_bmdr_block_t di_bmbt; /* btree root block */
87 xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */
88 xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */
89 char di_c[1]; /* local contents */
90 __be32 di_dev; /* device for S_IFCHR/S_IFBLK */
91 uuid_t di_muuid; /* mount point value */
92 char di_symlink[1]; /* local symbolic link */
93 } di_u;
94 union {
95 xfs_bmdr_block_t di_abmbt; /* btree root block */
96 xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */
97 xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
98 } di_a;
99} xfs_dinode_t;
100 75
101/* 76/*
102 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. 77 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
@@ -107,50 +82,14 @@ typedef struct xfs_dinode
107#define XFS_MAXLINK_1 65535U 82#define XFS_MAXLINK_1 65535U
108 83
109/* 84/*
110 * Bit names for logging disk inodes only
111 */
112#define XFS_DI_MAGIC 0x0000001
113#define XFS_DI_MODE 0x0000002
114#define XFS_DI_VERSION 0x0000004
115#define XFS_DI_FORMAT 0x0000008
116#define XFS_DI_ONLINK 0x0000010
117#define XFS_DI_UID 0x0000020
118#define XFS_DI_GID 0x0000040
119#define XFS_DI_NLINK 0x0000080
120#define XFS_DI_PROJID 0x0000100
121#define XFS_DI_PAD 0x0000200
122#define XFS_DI_ATIME 0x0000400
123#define XFS_DI_MTIME 0x0000800
124#define XFS_DI_CTIME 0x0001000
125#define XFS_DI_SIZE 0x0002000
126#define XFS_DI_NBLOCKS 0x0004000
127#define XFS_DI_EXTSIZE 0x0008000
128#define XFS_DI_NEXTENTS 0x0010000
129#define XFS_DI_NAEXTENTS 0x0020000
130#define XFS_DI_FORKOFF 0x0040000
131#define XFS_DI_AFORMAT 0x0080000
132#define XFS_DI_DMEVMASK 0x0100000
133#define XFS_DI_DMSTATE 0x0200000
134#define XFS_DI_FLAGS 0x0400000
135#define XFS_DI_GEN 0x0800000
136#define XFS_DI_NEXT_UNLINKED 0x1000000
137#define XFS_DI_U 0x2000000
138#define XFS_DI_A 0x4000000
139#define XFS_DI_NUM_BITS 27
140#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1)
141#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
142
143/*
144 * Values for di_format 85 * Values for di_format
145 */ 86 */
146typedef enum xfs_dinode_fmt 87typedef enum xfs_dinode_fmt {
147{ 88 XFS_DINODE_FMT_DEV, /* xfs_dev_t */
148 XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */ 89 XFS_DINODE_FMT_LOCAL, /* bulk data */
149 XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */ 90 XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
150 /* LNK: di_symlink */ 91 XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
151 XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */ 92 XFS_DINODE_FMT_UUID /* uuid_t */
152 XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */
153 XFS_DINODE_FMT_UUID /* MNT: di_uuid */
154} xfs_dinode_fmt_t; 93} xfs_dinode_fmt_t;
155 94
156/* 95/*
@@ -166,13 +105,13 @@ typedef enum xfs_dinode_fmt
166 */ 105 */
167#define XFS_LITINO(mp) ((mp)->m_litino) 106#define XFS_LITINO(mp) ((mp)->m_litino)
168#define XFS_BROOT_SIZE_ADJ \ 107#define XFS_BROOT_SIZE_ADJ \
169 (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t)) 108 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
170 109
171/* 110/*
172 * Inode data & attribute fork sizes, per inode. 111 * Inode data & attribute fork sizes, per inode.
173 */ 112 */
174#define XFS_DFORK_Q(dip) ((dip)->di_core.di_forkoff != 0) 113#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
175#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_core.di_forkoff << 3)) 114#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
176 115
177#define XFS_DFORK_DSIZE(dip,mp) \ 116#define XFS_DFORK_DSIZE(dip,mp) \
178 (XFS_DFORK_Q(dip) ? \ 117 (XFS_DFORK_Q(dip) ? \
@@ -187,23 +126,42 @@ typedef enum xfs_dinode_fmt
187 XFS_DFORK_DSIZE(dip, mp) : \ 126 XFS_DFORK_DSIZE(dip, mp) : \
188 XFS_DFORK_ASIZE(dip, mp)) 127 XFS_DFORK_ASIZE(dip, mp))
189 128
190#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c) 129/*
130 * Return pointers to the data or attribute forks.
131 */
132#define XFS_DFORK_DPTR(dip) \
133 ((char *)(dip) + sizeof(struct xfs_dinode))
191#define XFS_DFORK_APTR(dip) \ 134#define XFS_DFORK_APTR(dip) \
192 ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip)) 135 (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
193#define XFS_DFORK_PTR(dip,w) \ 136#define XFS_DFORK_PTR(dip,w) \
194 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) 137 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
138
195#define XFS_DFORK_FORMAT(dip,w) \ 139#define XFS_DFORK_FORMAT(dip,w) \
196 ((w) == XFS_DATA_FORK ? \ 140 ((w) == XFS_DATA_FORK ? \
197 (dip)->di_core.di_format : \ 141 (dip)->di_format : \
198 (dip)->di_core.di_aformat) 142 (dip)->di_aformat)
199#define XFS_DFORK_NEXTENTS(dip,w) \ 143#define XFS_DFORK_NEXTENTS(dip,w) \
200 ((w) == XFS_DATA_FORK ? \ 144 ((w) == XFS_DATA_FORK ? \
201 be32_to_cpu((dip)->di_core.di_nextents) : \ 145 be32_to_cpu((dip)->di_nextents) : \
202 be16_to_cpu((dip)->di_core.di_anextents)) 146 be16_to_cpu((dip)->di_anextents))
203 147
204#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) 148#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp))
205 149
206/* 150/*
151 * For block and character special files the 32bit dev_t is stored at the
152 * beginning of the data fork.
153 */
154static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
155{
156 return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
157}
158
159static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
160{
161 *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
162}
163
164/*
207 * Values for di_flags 165 * Values for di_flags
208 * There should be a one-to-one correspondence between these flags and the 166 * There should be a one-to-one correspondence between these flags and the
209 * XFS_XFLAG_s. 167 * XFS_XFLAG_s.
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
index deecc9d238f8..6ac44b550d39 100644
--- a/fs/xfs/xfs_dir2_sf.h
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -34,13 +34,6 @@ struct xfs_mount;
34struct xfs_trans; 34struct xfs_trans;
35 35
36/* 36/*
37 * Maximum size of a shortform directory.
38 */
39#define XFS_DIR2_SF_MAX_SIZE \
40 (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
41 (uint)sizeof(xfs_agino_t))
42
43/*
44 * Inode number stored as 8 8-bit values. 37 * Inode number stored as 8 8-bit values.
45 */ 38 */
46typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; 39typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
index a1e55fb9d5dd..e71e2581c0c3 100644
--- a/fs/xfs/xfs_dmops.c
+++ b/fs/xfs/xfs_dmops.c
@@ -25,7 +25,6 @@
25#include "xfs_inum.h" 25#include "xfs_inum.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_clnt.h"
29 28
30 29
31static struct xfs_dmops xfs_dmcore_stub = { 30static struct xfs_dmops xfs_dmcore_stub = {
@@ -38,9 +37,9 @@ static struct xfs_dmops xfs_dmcore_stub = {
38}; 37};
39 38
40int 39int
41xfs_dmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) 40xfs_dmops_get(struct xfs_mount *mp)
42{ 41{
43 if (args->flags & XFSMNT_DMAPI) { 42 if (mp->m_flags & XFS_MOUNT_DMAPI) {
44 cmn_err(CE_WARN, 43 cmn_err(CE_WARN,
45 "XFS: dmapi support not available in this kernel."); 44 "XFS: dmapi support not available in this kernel.");
46 return EINVAL; 45 return EINVAL;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index f227ecd1a294..92d5cd5bf4f2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -153,21 +153,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
153} 153}
154#endif /* DEBUG */ 154#endif /* DEBUG */
155 155
156static void
157xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
158{
159 if (mp != NULL) {
160 char *newfmt;
161 int len = 16 + mp->m_fsname_len + strlen(fmt);
162
163 newfmt = kmem_alloc(len, KM_SLEEP);
164 sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
165 icmn_err(level, newfmt, ap);
166 kmem_free(newfmt);
167 } else {
168 icmn_err(level, fmt, ap);
169 }
170}
171 156
172void 157void
173xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) 158xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 11543f10b0c6..0c93051c4651 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -159,11 +159,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
160 160
161struct xfs_mount; 161struct xfs_mount;
162/* PRINTFLIKE4 */ 162
163extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
164 char *fmt, va_list ap)
165 __attribute__ ((format (printf, 3, 0)));
163extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, 166extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
164 char *fmt, ...); 167 char *fmt, ...)
165/* PRINTFLIKE3 */ 168 __attribute__ ((format (printf, 4, 5)));
166extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); 169extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
170 __attribute__ ((format (printf, 3, 4)));
167 171
168extern void xfs_hex_dump(void *p, int length); 172extern void xfs_hex_dump(void *p, int length);
169 173
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 8aa28f751b2a..05a4bdd4be39 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -108,19 +108,16 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
110{ 110{
111 xfs_mount_t *mp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
113 mp = efip->efi_item.li_mountp; 113 spin_lock(&ailp->xa_lock);
114 spin_lock(&mp->m_ail_lock);
115 if (efip->efi_flags & XFS_EFI_CANCELED) { 114 if (efip->efi_flags & XFS_EFI_CANCELED) {
116 /* 115 /* xfs_trans_ail_delete() drops the AIL lock. */
117 * xfs_trans_delete_ail() drops the AIL lock. 116 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
118 */
119 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
120 xfs_efi_item_free(efip); 117 xfs_efi_item_free(efip);
121 } else { 118 } else {
122 efip->efi_flags |= XFS_EFI_COMMITTED; 119 efip->efi_flags |= XFS_EFI_COMMITTED;
123 spin_unlock(&mp->m_ail_lock); 120 spin_unlock(&ailp->xa_lock);
124 } 121 }
125} 122}
126 123
@@ -134,26 +131,23 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
134STATIC void 131STATIC void
135xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) 132xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
136{ 133{
137 xfs_mount_t *mp; 134 struct xfs_ail *ailp = efip->efi_item.li_ailp;
138 xfs_log_item_desc_t *lidp; 135 xfs_log_item_desc_t *lidp;
139 136
140 mp = efip->efi_item.li_mountp; 137 spin_lock(&ailp->xa_lock);
141 spin_lock(&mp->m_ail_lock);
142 if (efip->efi_flags & XFS_EFI_CANCELED) { 138 if (efip->efi_flags & XFS_EFI_CANCELED) {
143 /* 139 /*
144 * free the xaction descriptor pointing to this item 140 * free the xaction descriptor pointing to this item
145 */ 141 */
146 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip); 142 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
147 xfs_trans_free_item(tp, lidp); 143 xfs_trans_free_item(tp, lidp);
148 /* 144
149 * pull the item off the AIL. 145 /* xfs_trans_ail_delete() drops the AIL lock. */
150 * xfs_trans_delete_ail() drops the AIL lock. 146 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
151 */
152 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
153 xfs_efi_item_free(efip); 147 xfs_efi_item_free(efip);
154 } else { 148 } else {
155 efip->efi_flags |= XFS_EFI_COMMITTED; 149 efip->efi_flags |= XFS_EFI_COMMITTED;
156 spin_unlock(&mp->m_ail_lock); 150 spin_unlock(&ailp->xa_lock);
157 } 151 }
158} 152}
159 153
@@ -268,6 +262,7 @@ xfs_efi_init(xfs_mount_t *mp,
268 efip->efi_item.li_type = XFS_LI_EFI; 262 efip->efi_item.li_type = XFS_LI_EFI;
269 efip->efi_item.li_ops = &xfs_efi_item_ops; 263 efip->efi_item.li_ops = &xfs_efi_item_ops;
270 efip->efi_item.li_mountp = mp; 264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
271 efip->efi_format.efi_nextents = nextents; 266 efip->efi_format.efi_nextents = nextents;
272 efip->efi_format.efi_id = (__psint_t)(void*)efip; 267 efip->efi_format.efi_id = (__psint_t)(void*)efip;
273 268
@@ -345,25 +340,22 @@ void
345xfs_efi_release(xfs_efi_log_item_t *efip, 340xfs_efi_release(xfs_efi_log_item_t *efip,
346 uint nextents) 341 uint nextents)
347{ 342{
348 xfs_mount_t *mp; 343 struct xfs_ail *ailp = efip->efi_item.li_ailp;
349 int extents_left; 344 int extents_left;
350 345
351 mp = efip->efi_item.li_mountp;
352 ASSERT(efip->efi_next_extent > 0); 346 ASSERT(efip->efi_next_extent > 0);
353 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); 347 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
354 348
355 spin_lock(&mp->m_ail_lock); 349 spin_lock(&ailp->xa_lock);
356 ASSERT(efip->efi_next_extent >= nextents); 350 ASSERT(efip->efi_next_extent >= nextents);
357 efip->efi_next_extent -= nextents; 351 efip->efi_next_extent -= nextents;
358 extents_left = efip->efi_next_extent; 352 extents_left = efip->efi_next_extent;
359 if (extents_left == 0) { 353 if (extents_left == 0) {
360 /* 354 /* xfs_trans_ail_delete() drops the AIL lock. */
361 * xfs_trans_delete_ail() drops the AIL lock. 355 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
362 */
363 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
364 xfs_efi_item_free(efip); 356 xfs_efi_item_free(efip);
365 } else { 357 } else {
366 spin_unlock(&mp->m_ail_lock); 358 spin_unlock(&ailp->xa_lock);
367 } 359 }
368} 360}
369 361
@@ -565,6 +557,7 @@ xfs_efd_init(xfs_mount_t *mp,
565 efdp->efd_item.li_type = XFS_LI_EFD; 557 efdp->efd_item.li_type = XFS_LI_EFD;
566 efdp->efd_item.li_ops = &xfs_efd_item_ops; 558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
567 efdp->efd_item.li_mountp = mp; 559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
568 efdp->efd_efip = efip; 561 efdp->efd_efip = efip;
569 efdp->efd_format.efd_nextents = nextents; 562 efdp->efd_format.efd_nextents = nextents;
570 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 01c0cc88d3f3..589c41c38446 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -113,22 +113,14 @@ struct getbmapx {
113#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ 113#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ 114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ 115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
116#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC) 116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
117#ifdef __KERNEL__ 117#define BMV_IF_VALID \
118#define BMV_IF_EXTENDED 0x40000000 /* getpmapx if set */ 118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
119#endif
120 119
121/* bmv_oflags values - returned for for each non-header segment */ 120/* bmv_oflags values - returned for for each non-header segment */
122#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
123 122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
124/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */ 123#define BMV_OF_LAST 0x4 /* segment is the last in the file */
125#define GETBMAP_CONVERT(p1,p2) { \
126 p2.bmv_offset = p1.bmv_offset; \
127 p2.bmv_block = p1.bmv_block; \
128 p2.bmv_length = p1.bmv_length; \
129 p2.bmv_count = p1.bmv_count; \
130 p2.bmv_entries = p1.bmv_entries; }
131
132 124
133/* 125/*
134 * Structure for XFS_IOC_FSSETDM. 126 * Structure for XFS_IOC_FSSETDM.
@@ -426,10 +418,6 @@ typedef struct xfs_handle {
426#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS 418#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
427#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS 419#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
428#define XFS_IOC_GETVERSION FS_IOC_GETVERSION 420#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
429/* 32-bit compat counterparts */
430#define XFS_IOC32_GETXFLAGS FS_IOC32_GETFLAGS
431#define XFS_IOC32_SETXFLAGS FS_IOC32_SETFLAGS
432#define XFS_IOC32_GETVERSION FS_IOC32_GETVERSION
433 421
434/* 422/*
435 * ioctl commands that replace IRIX fcntl()'s 423 * ioctl commands that replace IRIX fcntl()'s
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 84583cf73db3..852b6d32e8d0 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -126,7 +126,7 @@ xfs_growfs_data_private(
126 xfs_extlen_t agsize; 126 xfs_extlen_t agsize;
127 xfs_extlen_t tmpsize; 127 xfs_extlen_t tmpsize;
128 xfs_alloc_rec_t *arec; 128 xfs_alloc_rec_t *arec;
129 xfs_btree_sblock_t *block; 129 struct xfs_btree_block *block;
130 xfs_buf_t *bp; 130 xfs_buf_t *bp;
131 int bucket; 131 int bucket;
132 int dpct; 132 int dpct;
@@ -251,14 +251,14 @@ xfs_growfs_data_private(
251 bp = xfs_buf_get(mp->m_ddev_targp, 251 bp = xfs_buf_get(mp->m_ddev_targp,
252 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 252 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
253 BTOBB(mp->m_sb.sb_blocksize), 0); 253 BTOBB(mp->m_sb.sb_blocksize), 0);
254 block = XFS_BUF_TO_SBLOCK(bp); 254 block = XFS_BUF_TO_BLOCK(bp);
255 memset(block, 0, mp->m_sb.sb_blocksize); 255 memset(block, 0, mp->m_sb.sb_blocksize);
256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
257 block->bb_level = 0; 257 block->bb_level = 0;
258 block->bb_numrecs = cpu_to_be16(1); 258 block->bb_numrecs = cpu_to_be16(1);
259 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 259 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
260 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 260 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
261 arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); 261 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
263 arec->ar_blockcount = cpu_to_be32( 263 arec->ar_blockcount = cpu_to_be32(
264 agsize - be32_to_cpu(arec->ar_startblock)); 264 agsize - be32_to_cpu(arec->ar_startblock));
@@ -272,14 +272,14 @@ xfs_growfs_data_private(
272 bp = xfs_buf_get(mp->m_ddev_targp, 272 bp = xfs_buf_get(mp->m_ddev_targp,
273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
274 BTOBB(mp->m_sb.sb_blocksize), 0); 274 BTOBB(mp->m_sb.sb_blocksize), 0);
275 block = XFS_BUF_TO_SBLOCK(bp); 275 block = XFS_BUF_TO_BLOCK(bp);
276 memset(block, 0, mp->m_sb.sb_blocksize); 276 memset(block, 0, mp->m_sb.sb_blocksize);
277 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 277 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
278 block->bb_level = 0; 278 block->bb_level = 0;
279 block->bb_numrecs = cpu_to_be16(1); 279 block->bb_numrecs = cpu_to_be16(1);
280 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 280 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
281 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 281 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
282 arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); 282 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
283 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 283 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
284 arec->ar_blockcount = cpu_to_be32( 284 arec->ar_blockcount = cpu_to_be32(
285 agsize - be32_to_cpu(arec->ar_startblock)); 285 agsize - be32_to_cpu(arec->ar_startblock));
@@ -294,13 +294,13 @@ xfs_growfs_data_private(
294 bp = xfs_buf_get(mp->m_ddev_targp, 294 bp = xfs_buf_get(mp->m_ddev_targp,
295 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 295 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
296 BTOBB(mp->m_sb.sb_blocksize), 0); 296 BTOBB(mp->m_sb.sb_blocksize), 0);
297 block = XFS_BUF_TO_SBLOCK(bp); 297 block = XFS_BUF_TO_BLOCK(bp);
298 memset(block, 0, mp->m_sb.sb_blocksize); 298 memset(block, 0, mp->m_sb.sb_blocksize);
299 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 299 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
300 block->bb_level = 0; 300 block->bb_level = 0;
301 block->bb_numrecs = 0; 301 block->bb_numrecs = 0;
302 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 302 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
303 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 303 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
304 error = xfs_bwrite(mp, bp); 304 error = xfs_bwrite(mp, bp);
305 if (error) { 305 if (error) {
306 goto error0; 306 goto error0;
@@ -435,6 +435,9 @@ xfs_growfs_data(
435 xfs_growfs_data_t *in) 435 xfs_growfs_data_t *in)
436{ 436{
437 int error; 437 int error;
438
439 if (!capable(CAP_SYS_ADMIN))
440 return XFS_ERROR(EPERM);
438 if (!mutex_trylock(&mp->m_growlock)) 441 if (!mutex_trylock(&mp->m_growlock))
439 return XFS_ERROR(EWOULDBLOCK); 442 return XFS_ERROR(EWOULDBLOCK);
440 error = xfs_growfs_data_private(mp, in); 443 error = xfs_growfs_data_private(mp, in);
@@ -448,6 +451,9 @@ xfs_growfs_log(
448 xfs_growfs_log_t *in) 451 xfs_growfs_log_t *in)
449{ 452{
450 int error; 453 int error;
454
455 if (!capable(CAP_SYS_ADMIN))
456 return XFS_ERROR(EPERM);
451 if (!mutex_trylock(&mp->m_growlock)) 457 if (!mutex_trylock(&mp->m_growlock))
452 return XFS_ERROR(EWOULDBLOCK); 458 return XFS_ERROR(EWOULDBLOCK);
453 error = xfs_growfs_log_private(mp, in); 459 error = xfs_growfs_log_private(mp, in);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index aad8c5da38af..e6ebbaeb4dc6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -41,68 +41,6 @@
41#include "xfs_error.h" 41#include "xfs_error.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43 43
44/*
45 * Log specified fields for the inode given by bp and off.
46 */
47STATIC void
48xfs_ialloc_log_di(
49 xfs_trans_t *tp, /* transaction pointer */
50 xfs_buf_t *bp, /* inode buffer */
51 int off, /* index of inode in buffer */
52 int fields) /* bitmask of fields to log */
53{
54 int first; /* first byte number */
55 int ioffset; /* off in bytes */
56 int last; /* last byte number */
57 xfs_mount_t *mp; /* mount point structure */
58 static const short offsets[] = { /* field offsets */
59 /* keep in sync with bits */
60 offsetof(xfs_dinode_core_t, di_magic),
61 offsetof(xfs_dinode_core_t, di_mode),
62 offsetof(xfs_dinode_core_t, di_version),
63 offsetof(xfs_dinode_core_t, di_format),
64 offsetof(xfs_dinode_core_t, di_onlink),
65 offsetof(xfs_dinode_core_t, di_uid),
66 offsetof(xfs_dinode_core_t, di_gid),
67 offsetof(xfs_dinode_core_t, di_nlink),
68 offsetof(xfs_dinode_core_t, di_projid),
69 offsetof(xfs_dinode_core_t, di_pad),
70 offsetof(xfs_dinode_core_t, di_atime),
71 offsetof(xfs_dinode_core_t, di_mtime),
72 offsetof(xfs_dinode_core_t, di_ctime),
73 offsetof(xfs_dinode_core_t, di_size),
74 offsetof(xfs_dinode_core_t, di_nblocks),
75 offsetof(xfs_dinode_core_t, di_extsize),
76 offsetof(xfs_dinode_core_t, di_nextents),
77 offsetof(xfs_dinode_core_t, di_anextents),
78 offsetof(xfs_dinode_core_t, di_forkoff),
79 offsetof(xfs_dinode_core_t, di_aformat),
80 offsetof(xfs_dinode_core_t, di_dmevmask),
81 offsetof(xfs_dinode_core_t, di_dmstate),
82 offsetof(xfs_dinode_core_t, di_flags),
83 offsetof(xfs_dinode_core_t, di_gen),
84 offsetof(xfs_dinode_t, di_next_unlinked),
85 offsetof(xfs_dinode_t, di_u),
86 offsetof(xfs_dinode_t, di_a),
87 sizeof(xfs_dinode_t)
88 };
89
90
91 ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
92 ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
93 mp = tp->t_mountp;
94 /*
95 * Get the inode-relative first and last bytes for these fields
96 */
97 xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
98 /*
99 * Convert to buffer offsets and log it.
100 */
101 ioffset = off << mp->m_sb.sb_inodelog;
102 first += ioffset;
103 last += ioffset;
104 xfs_trans_log_buf(tp, bp, first, last);
105}
106 44
107/* 45/*
108 * Allocation group level functions. 46 * Allocation group level functions.
@@ -119,6 +57,102 @@ xfs_ialloc_cluster_alignment(
119} 57}
120 58
121/* 59/*
60 * Lookup the record equal to ino in the btree given by cur.
61 */
62STATIC int /* error */
63xfs_inobt_lookup_eq(
64 struct xfs_btree_cur *cur, /* btree cursor */
65 xfs_agino_t ino, /* starting inode of chunk */
66 __int32_t fcnt, /* free inode count */
67 xfs_inofree_t free, /* free inode mask */
68 int *stat) /* success/failure */
69{
70 cur->bc_rec.i.ir_startino = ino;
71 cur->bc_rec.i.ir_freecount = fcnt;
72 cur->bc_rec.i.ir_free = free;
73 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
74}
75
76/*
77 * Lookup the first record greater than or equal to ino
78 * in the btree given by cur.
79 */
80int /* error */
81xfs_inobt_lookup_ge(
82 struct xfs_btree_cur *cur, /* btree cursor */
83 xfs_agino_t ino, /* starting inode of chunk */
84 __int32_t fcnt, /* free inode count */
85 xfs_inofree_t free, /* free inode mask */
86 int *stat) /* success/failure */
87{
88 cur->bc_rec.i.ir_startino = ino;
89 cur->bc_rec.i.ir_freecount = fcnt;
90 cur->bc_rec.i.ir_free = free;
91 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
92}
93
94/*
95 * Lookup the first record less than or equal to ino
96 * in the btree given by cur.
97 */
98int /* error */
99xfs_inobt_lookup_le(
100 struct xfs_btree_cur *cur, /* btree cursor */
101 xfs_agino_t ino, /* starting inode of chunk */
102 __int32_t fcnt, /* free inode count */
103 xfs_inofree_t free, /* free inode mask */
104 int *stat) /* success/failure */
105{
106 cur->bc_rec.i.ir_startino = ino;
107 cur->bc_rec.i.ir_freecount = fcnt;
108 cur->bc_rec.i.ir_free = free;
109 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
110}
111
112/*
113 * Update the record referred to by cur to the value given
114 * by [ino, fcnt, free].
115 * This either works (return 0) or gets an EFSCORRUPTED error.
116 */
117STATIC int /* error */
118xfs_inobt_update(
119 struct xfs_btree_cur *cur, /* btree cursor */
120 xfs_agino_t ino, /* starting inode of chunk */
121 __int32_t fcnt, /* free inode count */
122 xfs_inofree_t free) /* free inode mask */
123{
124 union xfs_btree_rec rec;
125
126 rec.inobt.ir_startino = cpu_to_be32(ino);
127 rec.inobt.ir_freecount = cpu_to_be32(fcnt);
128 rec.inobt.ir_free = cpu_to_be64(free);
129 return xfs_btree_update(cur, &rec);
130}
131
132/*
133 * Get the data from the pointed-to record.
134 */
135int /* error */
136xfs_inobt_get_rec(
137 struct xfs_btree_cur *cur, /* btree cursor */
138 xfs_agino_t *ino, /* output: starting inode of chunk */
139 __int32_t *fcnt, /* output: number of free inodes */
140 xfs_inofree_t *free, /* output: free inode mask */
141 int *stat) /* output: success/failure */
142{
143 union xfs_btree_rec *rec;
144 int error;
145
146 error = xfs_btree_get_rec(cur, &rec, stat);
147 if (!error && *stat == 1) {
148 *ino = be32_to_cpu(rec->inobt.ir_startino);
149 *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
150 *free = be64_to_cpu(rec->inobt.ir_free);
151 }
152 return error;
153}
154
155/*
122 * Allocate new inodes in the allocation group specified by agbp. 156 * Allocate new inodes in the allocation group specified by agbp.
123 * Return 0 for success, else error code. 157 * Return 0 for success, else error code.
124 */ 158 */
@@ -287,9 +321,9 @@ xfs_ialloc_ag_alloc(
287 * able to use the file system. 321 * able to use the file system.
288 */ 322 */
289 if (xfs_sb_version_hasnlink(&args.mp->m_sb)) 323 if (xfs_sb_version_hasnlink(&args.mp->m_sb))
290 version = XFS_DINODE_VERSION_2; 324 version = 2;
291 else 325 else
292 version = XFS_DINODE_VERSION_1; 326 version = 1;
293 327
294 /* 328 /*
295 * Seed the new inode cluster with a random generation number. This 329 * Seed the new inode cluster with a random generation number. This
@@ -310,18 +344,25 @@ xfs_ialloc_ag_alloc(
310 XFS_BUF_LOCK); 344 XFS_BUF_LOCK);
311 ASSERT(fbuf); 345 ASSERT(fbuf);
312 ASSERT(!XFS_BUF_GETERROR(fbuf)); 346 ASSERT(!XFS_BUF_GETERROR(fbuf));
347
313 /* 348 /*
314 * Set initial values for the inodes in this buffer. 349 * Initialize all inodes in this buffer and then log them.
350 *
351 * XXX: It would be much better if we had just one transaction to
352 * log a whole cluster of inodes instead of all the indivdual
353 * transactions causing a lot of log traffic.
315 */ 354 */
316 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); 355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
317 for (i = 0; i < ninodes; i++) { 356 for (i = 0; i < ninodes; i++) {
357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode);
359
318 free = XFS_MAKE_IPTR(args.mp, fbuf, i); 360 free = XFS_MAKE_IPTR(args.mp, fbuf, i);
319 free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
320 free->di_core.di_version = version; 362 free->di_version = version;
321 free->di_core.di_gen = cpu_to_be32(gen); 363 free->di_gen = cpu_to_be32(gen);
322 free->di_next_unlinked = cpu_to_be32(NULLAGINO); 364 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
323 xfs_ialloc_log_di(tp, fbuf, i, 365 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
324 XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
325 } 366 }
326 xfs_trans_inode_alloc_buf(tp, fbuf); 367 xfs_trans_inode_alloc_buf(tp, fbuf);
327 } 368 }
@@ -335,8 +376,7 @@ xfs_ialloc_ag_alloc(
335 /* 376 /*
336 * Insert records describing the new inode chunk into the btree. 377 * Insert records describing the new inode chunk into the btree.
337 */ 378 */
338 cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno, 379 cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
339 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
340 for (thisino = newino; 380 for (thisino = newino;
341 thisino < newino + newlen; 381 thisino < newino + newlen;
342 thisino += XFS_INODES_PER_CHUNK) { 382 thisino += XFS_INODES_PER_CHUNK) {
@@ -346,7 +386,7 @@ xfs_ialloc_ag_alloc(
346 return error; 386 return error;
347 } 387 }
348 ASSERT(i == 0); 388 ASSERT(i == 0);
349 if ((error = xfs_inobt_insert(cur, &i))) { 389 if ((error = xfs_btree_insert(cur, &i))) {
350 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 390 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
351 return error; 391 return error;
352 } 392 }
@@ -676,8 +716,7 @@ nextag:
676 */ 716 */
677 agno = tagno; 717 agno = tagno;
678 *IO_agbp = NULL; 718 *IO_agbp = NULL;
679 cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno), 719 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
680 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
681 /* 720 /*
682 * If pagino is 0 (this is the root inode allocation) use newino. 721 * If pagino is 0 (this is the root inode allocation) use newino.
683 * This must work because we've just allocated some. 722 * This must work because we've just allocated some.
@@ -697,7 +736,7 @@ nextag:
697 goto error0; 736 goto error0;
698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 737 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
699 freecount += rec.ir_freecount; 738 freecount += rec.ir_freecount;
700 if ((error = xfs_inobt_increment(cur, 0, &i))) 739 if ((error = xfs_btree_increment(cur, 0, &i)))
701 goto error0; 740 goto error0;
702 } while (i == 1); 741 } while (i == 1);
703 742
@@ -741,7 +780,7 @@ nextag:
741 /* 780 /*
742 * Search left with tcur, back up 1 record. 781 * Search left with tcur, back up 1 record.
743 */ 782 */
744 if ((error = xfs_inobt_decrement(tcur, 0, &i))) 783 if ((error = xfs_btree_decrement(tcur, 0, &i)))
745 goto error1; 784 goto error1;
746 doneleft = !i; 785 doneleft = !i;
747 if (!doneleft) { 786 if (!doneleft) {
@@ -755,7 +794,7 @@ nextag:
755 /* 794 /*
756 * Search right with cur, go forward 1 record. 795 * Search right with cur, go forward 1 record.
757 */ 796 */
758 if ((error = xfs_inobt_increment(cur, 0, &i))) 797 if ((error = xfs_btree_increment(cur, 0, &i)))
759 goto error1; 798 goto error1;
760 doneright = !i; 799 doneright = !i;
761 if (!doneright) { 800 if (!doneright) {
@@ -817,7 +856,7 @@ nextag:
817 * further left. 856 * further left.
818 */ 857 */
819 if (useleft) { 858 if (useleft) {
820 if ((error = xfs_inobt_decrement(tcur, 0, 859 if ((error = xfs_btree_decrement(tcur, 0,
821 &i))) 860 &i)))
822 goto error1; 861 goto error1;
823 doneleft = !i; 862 doneleft = !i;
@@ -837,7 +876,7 @@ nextag:
837 * further right. 876 * further right.
838 */ 877 */
839 else { 878 else {
840 if ((error = xfs_inobt_increment(cur, 0, 879 if ((error = xfs_btree_increment(cur, 0,
841 &i))) 880 &i)))
842 goto error1; 881 goto error1;
843 doneright = !i; 882 doneright = !i;
@@ -892,7 +931,7 @@ nextag:
892 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 931 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
893 if (rec.ir_freecount > 0) 932 if (rec.ir_freecount > 0)
894 break; 933 break;
895 if ((error = xfs_inobt_increment(cur, 0, &i))) 934 if ((error = xfs_btree_increment(cur, 0, &i)))
896 goto error0; 935 goto error0;
897 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
898 } 937 }
@@ -926,7 +965,7 @@ nextag:
926 goto error0; 965 goto error0;
927 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 966 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
928 freecount += rec.ir_freecount; 967 freecount += rec.ir_freecount;
929 if ((error = xfs_inobt_increment(cur, 0, &i))) 968 if ((error = xfs_btree_increment(cur, 0, &i)))
930 goto error0; 969 goto error0;
931 } while (i == 1); 970 } while (i == 1);
932 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || 971 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
@@ -1022,8 +1061,7 @@ xfs_difree(
1022 /* 1061 /*
1023 * Initialize the cursor. 1062 * Initialize the cursor.
1024 */ 1063 */
1025 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, 1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1026 (xfs_inode_t *)0, 0);
1027#ifdef DEBUG 1065#ifdef DEBUG
1028 if (cur->bc_nlevels == 1) { 1066 if (cur->bc_nlevels == 1) {
1029 int freecount = 0; 1067 int freecount = 0;
@@ -1036,7 +1074,7 @@ xfs_difree(
1036 goto error0; 1074 goto error0;
1037 if (i) { 1075 if (i) {
1038 freecount += rec.ir_freecount; 1076 freecount += rec.ir_freecount;
1039 if ((error = xfs_inobt_increment(cur, 0, &i))) 1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1040 goto error0; 1078 goto error0;
1041 } 1079 }
1042 } while (i == 1); 1080 } while (i == 1);
@@ -1098,8 +1136,8 @@ xfs_difree(
1098 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1136 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1099 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1137 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1100 1138
1101 if ((error = xfs_inobt_delete(cur, &i))) { 1139 if ((error = xfs_btree_delete(cur, &i))) {
1102 cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n", 1140 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n",
1103 error, mp->m_fsname); 1141 error, mp->m_fsname);
1104 goto error0; 1142 goto error0;
1105 } 1143 }
@@ -1141,7 +1179,7 @@ xfs_difree(
1141 goto error0; 1179 goto error0;
1142 if (i) { 1180 if (i) {
1143 freecount += rec.ir_freecount; 1181 freecount += rec.ir_freecount;
1144 if ((error = xfs_inobt_increment(cur, 0, &i))) 1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1145 goto error0; 1183 goto error0;
1146 } 1184 }
1147 } while (i == 1); 1185 } while (i == 1);
@@ -1158,36 +1196,28 @@ error0:
1158} 1196}
1159 1197
1160/* 1198/*
1161 * Return the location of the inode in bno/off, for mapping it into a buffer. 1199 * Return the location of the inode in imap, for mapping it into a buffer.
1162 */ 1200 */
1163/*ARGSUSED*/
1164int 1201int
1165xfs_dilocate( 1202xfs_imap(
1166 xfs_mount_t *mp, /* file system mount structure */ 1203 xfs_mount_t *mp, /* file system mount structure */
1167 xfs_trans_t *tp, /* transaction pointer */ 1204 xfs_trans_t *tp, /* transaction pointer */
1168 xfs_ino_t ino, /* inode to locate */ 1205 xfs_ino_t ino, /* inode to locate */
1169 xfs_fsblock_t *bno, /* output: block containing inode */ 1206 struct xfs_imap *imap, /* location map structure */
1170 int *len, /* output: num blocks in inode cluster */ 1207 uint flags) /* flags for inode btree lookup */
1171 int *off, /* output: index in block of inode */
1172 uint flags) /* flags concerning inode lookup */
1173{ 1208{
1174 xfs_agblock_t agbno; /* block number of inode in the alloc group */ 1209 xfs_agblock_t agbno; /* block number of inode in the alloc group */
1175 xfs_buf_t *agbp; /* agi buffer */
1176 xfs_agino_t agino; /* inode number within alloc group */ 1210 xfs_agino_t agino; /* inode number within alloc group */
1177 xfs_agnumber_t agno; /* allocation group number */ 1211 xfs_agnumber_t agno; /* allocation group number */
1178 int blks_per_cluster; /* num blocks per inode cluster */ 1212 int blks_per_cluster; /* num blocks per inode cluster */
1179 xfs_agblock_t chunk_agbno; /* first block in inode chunk */ 1213 xfs_agblock_t chunk_agbno; /* first block in inode chunk */
1180 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1181 __int32_t chunk_cnt; /* count of free inodes in chunk */
1182 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1183 xfs_agblock_t cluster_agbno; /* first block in inode cluster */ 1214 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1184 xfs_btree_cur_t *cur; /* inode btree cursor */
1185 int error; /* error code */ 1215 int error; /* error code */
1186 int i; /* temp state */
1187 int offset; /* index of inode in its buffer */ 1216 int offset; /* index of inode in its buffer */
1188 int offset_agbno; /* blks from chunk start to inode */ 1217 int offset_agbno; /* blks from chunk start to inode */
1189 1218
1190 ASSERT(ino != NULLFSINO); 1219 ASSERT(ino != NULLFSINO);
1220
1191 /* 1221 /*
1192 * Split up the inode number into its parts. 1222 * Split up the inode number into its parts.
1193 */ 1223 */
@@ -1198,24 +1228,24 @@ xfs_dilocate(
1198 ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1228 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1199#ifdef DEBUG 1229#ifdef DEBUG
1200 /* no diagnostics for bulkstat, ino comes from userspace */ 1230 /* no diagnostics for bulkstat, ino comes from userspace */
1201 if (flags & XFS_IMAP_BULKSTAT) 1231 if (flags & XFS_IGET_BULKSTAT)
1202 return XFS_ERROR(EINVAL); 1232 return XFS_ERROR(EINVAL);
1203 if (agno >= mp->m_sb.sb_agcount) { 1233 if (agno >= mp->m_sb.sb_agcount) {
1204 xfs_fs_cmn_err(CE_ALERT, mp, 1234 xfs_fs_cmn_err(CE_ALERT, mp,
1205 "xfs_dilocate: agno (%d) >= " 1235 "xfs_imap: agno (%d) >= "
1206 "mp->m_sb.sb_agcount (%d)", 1236 "mp->m_sb.sb_agcount (%d)",
1207 agno, mp->m_sb.sb_agcount); 1237 agno, mp->m_sb.sb_agcount);
1208 } 1238 }
1209 if (agbno >= mp->m_sb.sb_agblocks) { 1239 if (agbno >= mp->m_sb.sb_agblocks) {
1210 xfs_fs_cmn_err(CE_ALERT, mp, 1240 xfs_fs_cmn_err(CE_ALERT, mp,
1211 "xfs_dilocate: agbno (0x%llx) >= " 1241 "xfs_imap: agbno (0x%llx) >= "
1212 "mp->m_sb.sb_agblocks (0x%lx)", 1242 "mp->m_sb.sb_agblocks (0x%lx)",
1213 (unsigned long long) agbno, 1243 (unsigned long long) agbno,
1214 (unsigned long) mp->m_sb.sb_agblocks); 1244 (unsigned long) mp->m_sb.sb_agblocks);
1215 } 1245 }
1216 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1246 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1217 xfs_fs_cmn_err(CE_ALERT, mp, 1247 xfs_fs_cmn_err(CE_ALERT, mp,
1218 "xfs_dilocate: ino (0x%llx) != " 1248 "xfs_imap: ino (0x%llx) != "
1219 "XFS_AGINO_TO_INO(mp, agno, agino) " 1249 "XFS_AGINO_TO_INO(mp, agno, agino) "
1220 "(0x%llx)", 1250 "(0x%llx)",
1221 ino, XFS_AGINO_TO_INO(mp, agno, agino)); 1251 ino, XFS_AGINO_TO_INO(mp, agno, agino));
@@ -1224,65 +1254,89 @@ xfs_dilocate(
1224#endif /* DEBUG */ 1254#endif /* DEBUG */
1225 return XFS_ERROR(EINVAL); 1255 return XFS_ERROR(EINVAL);
1226 } 1256 }
1227 if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) || 1257
1228 !(flags & XFS_IMAP_LOOKUP)) { 1258 /*
1259 * If the inode cluster size is the same as the blocksize or
1260 * smaller we get to the buffer by simple arithmetics.
1261 */
1262 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
1229 offset = XFS_INO_TO_OFFSET(mp, ino); 1263 offset = XFS_INO_TO_OFFSET(mp, ino);
1230 ASSERT(offset < mp->m_sb.sb_inopblock); 1264 ASSERT(offset < mp->m_sb.sb_inopblock);
1231 *bno = XFS_AGB_TO_FSB(mp, agno, agbno); 1265
1232 *off = offset; 1266 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
1233 *len = 1; 1267 imap->im_len = XFS_FSB_TO_BB(mp, 1);
1268 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1234 return 0; 1269 return 0;
1235 } 1270 }
1271
1236 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; 1272 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
1237 if (*bno != NULLFSBLOCK) { 1273
1274 /*
1275 * If we get a block number passed from bulkstat we can use it to
1276 * find the buffer easily.
1277 */
1278 if (imap->im_blkno) {
1238 offset = XFS_INO_TO_OFFSET(mp, ino); 1279 offset = XFS_INO_TO_OFFSET(mp, ino);
1239 ASSERT(offset < mp->m_sb.sb_inopblock); 1280 ASSERT(offset < mp->m_sb.sb_inopblock);
1240 cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno); 1281
1241 *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + 1282 cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
1242 offset; 1283 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
1243 *len = blks_per_cluster; 1284
1285 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1286 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1244 return 0; 1287 return 0;
1245 } 1288 }
1289
1290 /*
1291 * If the inode chunks are aligned then use simple maths to
1292 * find the location. Otherwise we have to do a btree
1293 * lookup to find the location.
1294 */
1246 if (mp->m_inoalign_mask) { 1295 if (mp->m_inoalign_mask) {
1247 offset_agbno = agbno & mp->m_inoalign_mask; 1296 offset_agbno = agbno & mp->m_inoalign_mask;
1248 chunk_agbno = agbno - offset_agbno; 1297 chunk_agbno = agbno - offset_agbno;
1249 } else { 1298 } else {
1299 xfs_btree_cur_t *cur; /* inode btree cursor */
1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1303 xfs_buf_t *agbp; /* agi buffer */
1304 int i; /* temp state */
1305
1250 down_read(&mp->m_peraglock); 1306 down_read(&mp->m_peraglock);
1251 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1307 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1252 up_read(&mp->m_peraglock); 1308 up_read(&mp->m_peraglock);
1253 if (error) { 1309 if (error) {
1254#ifdef DEBUG 1310 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1255 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1256 "xfs_ialloc_read_agi() returned " 1311 "xfs_ialloc_read_agi() returned "
1257 "error %d, agno %d", 1312 "error %d, agno %d",
1258 error, agno); 1313 error, agno);
1259#endif /* DEBUG */
1260 return error; 1314 return error;
1261 } 1315 }
1262 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, 1316
1263 (xfs_inode_t *)0, 0); 1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1264 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
1265#ifdef DEBUG 1319 if (error) {
1266 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1267 "xfs_inobt_lookup_le() failed"); 1321 "xfs_inobt_lookup_le() failed");
1268#endif /* DEBUG */
1269 goto error0; 1322 goto error0;
1270 } 1323 }
1271 if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1324
1272 &chunk_free, &i))) { 1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
1273#ifdef DEBUG 1326 &chunk_free, &i);
1274 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1327 if (error) {
1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1275 "xfs_inobt_get_rec() failed"); 1329 "xfs_inobt_get_rec() failed");
1276#endif /* DEBUG */
1277 goto error0; 1330 goto error0;
1278 } 1331 }
1279 if (i == 0) { 1332 if (i == 0) {
1280#ifdef DEBUG 1333#ifdef DEBUG
1281 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1334 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1282 "xfs_inobt_get_rec() failed"); 1335 "xfs_inobt_get_rec() failed");
1283#endif /* DEBUG */ 1336#endif /* DEBUG */
1284 error = XFS_ERROR(EINVAL); 1337 error = XFS_ERROR(EINVAL);
1285 } 1338 }
1339 error0:
1286 xfs_trans_brelse(tp, agbp); 1340 xfs_trans_brelse(tp, agbp);
1287 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1288 if (error) 1342 if (error)
@@ -1290,19 +1344,35 @@ xfs_dilocate(
1290 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
1291 offset_agbno = agbno - chunk_agbno; 1345 offset_agbno = agbno - chunk_agbno;
1292 } 1346 }
1347
1293 ASSERT(agbno >= chunk_agbno); 1348 ASSERT(agbno >= chunk_agbno);
1294 cluster_agbno = chunk_agbno + 1349 cluster_agbno = chunk_agbno +
1295 ((offset_agbno / blks_per_cluster) * blks_per_cluster); 1350 ((offset_agbno / blks_per_cluster) * blks_per_cluster);
1296 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + 1351 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1297 XFS_INO_TO_OFFSET(mp, ino); 1352 XFS_INO_TO_OFFSET(mp, ino);
1298 *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno); 1353
1299 *off = offset; 1354 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
1300 *len = blks_per_cluster; 1355 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1356 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1357
1358 /*
1359 * If the inode number maps to a block outside the bounds
1360 * of the file system then return NULL rather than calling
1361 * read_buf and panicing when we get an error from the
1362 * driver.
1363 */
1364 if ((imap->im_blkno + imap->im_len) >
1365 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1366 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1367 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
1368 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
1369 (unsigned long long) imap->im_blkno,
1370 (unsigned long long) imap->im_len,
1371 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1372 return XFS_ERROR(EINVAL);
1373 }
1374
1301 return 0; 1375 return 0;
1302error0:
1303 xfs_trans_brelse(tp, agbp);
1304 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1305 return error;
1306} 1376}
1307 1377
1308/* 1378/*
@@ -1370,70 +1440,95 @@ xfs_ialloc_log_agi(
1370 xfs_trans_log_buf(tp, bp, first, last); 1440 xfs_trans_log_buf(tp, bp, first, last);
1371} 1441}
1372 1442
1443#ifdef DEBUG
1444STATIC void
1445xfs_check_agi_unlinked(
1446 struct xfs_agi *agi)
1447{
1448 int i;
1449
1450 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1451 ASSERT(agi->agi_unlinked[i]);
1452}
1453#else
1454#define xfs_check_agi_unlinked(agi)
1455#endif
1456
1373/* 1457/*
1374 * Read in the allocation group header (inode allocation section) 1458 * Read in the allocation group header (inode allocation section)
1375 */ 1459 */
1376int 1460int
1377xfs_ialloc_read_agi( 1461xfs_read_agi(
1378 xfs_mount_t *mp, /* file system mount structure */ 1462 struct xfs_mount *mp, /* file system mount structure */
1379 xfs_trans_t *tp, /* transaction pointer */ 1463 struct xfs_trans *tp, /* transaction pointer */
1380 xfs_agnumber_t agno, /* allocation group number */ 1464 xfs_agnumber_t agno, /* allocation group number */
1381 xfs_buf_t **bpp) /* allocation group hdr buf */ 1465 struct xfs_buf **bpp) /* allocation group hdr buf */
1382{ 1466{
1383 xfs_agi_t *agi; /* allocation group header */ 1467 struct xfs_agi *agi; /* allocation group header */
1384 int agi_ok; /* agi is consistent */ 1468 int agi_ok; /* agi is consistent */
1385 xfs_buf_t *bp; /* allocation group hdr buf */ 1469 int error;
1386 xfs_perag_t *pag; /* per allocation group data */
1387 int error;
1388 1470
1389 ASSERT(agno != NULLAGNUMBER); 1471 ASSERT(agno != NULLAGNUMBER);
1390 error = xfs_trans_read_buf( 1472
1391 mp, tp, mp->m_ddev_targp, 1473 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
1392 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 1474 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1393 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1475 XFS_FSS_TO_BB(mp, 1), 0, bpp);
1394 if (error) 1476 if (error)
1395 return error; 1477 return error;
1396 ASSERT(bp && !XFS_BUF_GETERROR(bp)); 1478
1479 ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp));
1480 agi = XFS_BUF_TO_AGI(*bpp);
1397 1481
1398 /* 1482 /*
1399 * Validate the magic number of the agi block. 1483 * Validate the magic number of the agi block.
1400 */ 1484 */
1401 agi = XFS_BUF_TO_AGI(bp); 1485 agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1402 agi_ok = 1486 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
1403 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && 1487 be32_to_cpu(agi->agi_seqno) == agno;
1404 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1405 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, 1488 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1406 XFS_RANDOM_IALLOC_READ_AGI))) { 1489 XFS_RANDOM_IALLOC_READ_AGI))) {
1407 XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW, 1490 XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
1408 mp, agi); 1491 mp, agi);
1409 xfs_trans_brelse(tp, bp); 1492 xfs_trans_brelse(tp, *bpp);
1410 return XFS_ERROR(EFSCORRUPTED); 1493 return XFS_ERROR(EFSCORRUPTED);
1411 } 1494 }
1495
1496 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
1497
1498 xfs_check_agi_unlinked(agi);
1499 return 0;
1500}
1501
1502int
1503xfs_ialloc_read_agi(
1504 struct xfs_mount *mp, /* file system mount structure */
1505 struct xfs_trans *tp, /* transaction pointer */
1506 xfs_agnumber_t agno, /* allocation group number */
1507 struct xfs_buf **bpp) /* allocation group hdr buf */
1508{
1509 struct xfs_agi *agi; /* allocation group header */
1510 struct xfs_perag *pag; /* per allocation group data */
1511 int error;
1512
1513 error = xfs_read_agi(mp, tp, agno, bpp);
1514 if (error)
1515 return error;
1516
1517 agi = XFS_BUF_TO_AGI(*bpp);
1412 pag = &mp->m_perag[agno]; 1518 pag = &mp->m_perag[agno];
1519
1413 if (!pag->pagi_init) { 1520 if (!pag->pagi_init) {
1414 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1521 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1415 pag->pagi_count = be32_to_cpu(agi->agi_count); 1522 pag->pagi_count = be32_to_cpu(agi->agi_count);
1416 pag->pagi_init = 1; 1523 pag->pagi_init = 1;
1417 } else {
1418 /*
1419 * It's possible for these to be out of sync if
1420 * we are in the middle of a forced shutdown.
1421 */
1422 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1423 XFS_FORCED_SHUTDOWN(mp));
1424 } 1524 }
1425 1525
1426#ifdef DEBUG 1526 /*
1427 { 1527 * It's possible for these to be out of sync if
1428 int i; 1528 * we are in the middle of a forced shutdown.
1429 1529 */
1430 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) 1530 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1431 ASSERT(agi->agi_unlinked[i]); 1531 XFS_FORCED_SHUTDOWN(mp));
1432 }
1433#endif
1434
1435 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
1436 *bpp = bp;
1437 return 0; 1532 return 0;
1438} 1533}
1439 1534
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 4e30ec1d13bc..50f558a4e0a8 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -20,6 +20,7 @@
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_dinode; 22struct xfs_dinode;
23struct xfs_imap;
23struct xfs_mount; 24struct xfs_mount;
24struct xfs_trans; 25struct xfs_trans;
25 26
@@ -56,7 +57,6 @@ static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
56} 57}
57 58
58 59
59#ifdef __KERNEL__
60/* 60/*
61 * Allocate an inode on disk. 61 * Allocate an inode on disk.
62 * Mode is used to tell whether the new inode will need space, and whether 62 * Mode is used to tell whether the new inode will need space, and whether
@@ -105,17 +105,14 @@ xfs_difree(
105 xfs_ino_t *first_ino); /* first inode in deleted cluster */ 105 xfs_ino_t *first_ino); /* first inode in deleted cluster */
106 106
107/* 107/*
108 * Return the location of the inode in bno/len/off, 108 * Return the location of the inode in imap, for mapping it into a buffer.
109 * for mapping it into a buffer.
110 */ 109 */
111int 110int
112xfs_dilocate( 111xfs_imap(
113 struct xfs_mount *mp, /* file system mount structure */ 112 struct xfs_mount *mp, /* file system mount structure */
114 struct xfs_trans *tp, /* transaction pointer */ 113 struct xfs_trans *tp, /* transaction pointer */
115 xfs_ino_t ino, /* inode to locate */ 114 xfs_ino_t ino, /* inode to locate */
116 xfs_fsblock_t *bno, /* output: block containing inode */ 115 struct xfs_imap *imap, /* location map structure */
117 int *len, /* output: num blocks in cluster*/
118 int *off, /* output: index in block of inode */
119 uint flags); /* flags for inode btree lookup */ 116 uint flags); /* flags for inode btree lookup */
120 117
121/* 118/*
@@ -154,6 +151,24 @@ xfs_ialloc_pagi_init(
154 struct xfs_trans *tp, /* transaction pointer */ 151 struct xfs_trans *tp, /* transaction pointer */
155 xfs_agnumber_t agno); /* allocation group number */ 152 xfs_agnumber_t agno); /* allocation group number */
156 153
157#endif /* __KERNEL__ */ 154/*
155 * Lookup the first record greater than or equal to ino
156 * in the btree given by cur.
157 */
158int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
159 __int32_t fcnt, xfs_inofree_t free, int *stat);
160
161/*
162 * Lookup the first record less than or equal to ino
163 * in the btree given by cur.
164 */
165int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
166 __int32_t fcnt, xfs_inofree_t free, int *stat);
167
168/*
169 * Get the data from the pointed-to record.
170 */
171extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
172 __int32_t *fcnt, xfs_inofree_t *free, int *stat);
158 173
159#endif /* __XFS_IALLOC_H__ */ 174#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 83502f3edef0..99f2408e8d8e 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -35,2044 +35,349 @@
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 40#include "xfs_alloc.h"
40#include "xfs_error.h" 41#include "xfs_error.h"
41 42
42STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
43STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
44STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
45STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
46STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
47STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
48STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
49STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
50 xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
51STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
52 43
53/* 44STATIC int
54 * Single level of the xfs_inobt_delete record deletion routine. 45xfs_inobt_get_minrecs(
55 * Delete record pointed to by cur/level. 46 struct xfs_btree_cur *cur,
56 * Remove the record from its block then rebalance the tree. 47 int level)
57 * Return 0 for error, 1 for done, 2 to go on to the next level.
58 */
59STATIC int /* error */
60xfs_inobt_delrec(
61 xfs_btree_cur_t *cur, /* btree cursor */
62 int level, /* level removing record from */
63 int *stat) /* fail/done/go-on */
64{ 48{
65 xfs_buf_t *agbp; /* buffer for a.g. inode header */ 49 return cur->bc_mp->m_inobt_mnr[level != 0];
66 xfs_mount_t *mp; /* mount structure */ 50}
67 xfs_agi_t *agi; /* allocation group inode header */
68 xfs_inobt_block_t *block; /* btree block record/key lives in */
69 xfs_agblock_t bno; /* btree block number */
70 xfs_buf_t *bp; /* buffer for block */
71 int error; /* error return value */
72 int i; /* loop index */
73 xfs_inobt_key_t key; /* kp points here if block is level 0 */
74 xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */
75 xfs_agblock_t lbno; /* left block's block number */
76 xfs_buf_t *lbp; /* left block's buffer pointer */
77 xfs_inobt_block_t *left; /* left btree block */
78 xfs_inobt_key_t *lkp; /* left block key pointer */
79 xfs_inobt_ptr_t *lpp; /* left block address pointer */
80 int lrecs = 0; /* number of records in left block */
81 xfs_inobt_rec_t *lrp; /* left block record pointer */
82 xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */
83 int ptr; /* index in btree block for this rec */
84 xfs_agblock_t rbno; /* right block's block number */
85 xfs_buf_t *rbp; /* right block's buffer pointer */
86 xfs_inobt_block_t *right; /* right btree block */
87 xfs_inobt_key_t *rkp; /* right block key pointer */
88 xfs_inobt_rec_t *rp; /* pointer to btree records */
89 xfs_inobt_ptr_t *rpp; /* right block address pointer */
90 int rrecs = 0; /* number of records in right block */
91 int numrecs;
92 xfs_inobt_rec_t *rrp; /* right block record pointer */
93 xfs_btree_cur_t *tcur; /* temporary btree cursor */
94
95 mp = cur->bc_mp;
96
97 /*
98 * Get the index of the entry being deleted, check for nothing there.
99 */
100 ptr = cur->bc_ptrs[level];
101 if (ptr == 0) {
102 *stat = 0;
103 return 0;
104 }
105
106 /*
107 * Get the buffer & block containing the record or key/ptr.
108 */
109 bp = cur->bc_bufs[level];
110 block = XFS_BUF_TO_INOBT_BLOCK(bp);
111#ifdef DEBUG
112 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
113 return error;
114#endif
115 /*
116 * Fail if we're off the end of the block.
117 */
118 51
119 numrecs = be16_to_cpu(block->bb_numrecs); 52STATIC struct xfs_btree_cur *
120 if (ptr > numrecs) { 53xfs_inobt_dup_cursor(
121 *stat = 0; 54 struct xfs_btree_cur *cur)
122 return 0; 55{
123 } 56 return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
124 /* 57 cur->bc_private.a.agbp, cur->bc_private.a.agno);
125 * It's a nonleaf. Excise the key and ptr being deleted, by 58}
126 * sliding the entries past them down one.
127 * Log the changed areas of the block.
128 */
129 if (level > 0) {
130 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
131 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
132#ifdef DEBUG
133 for (i = ptr; i < numrecs; i++) {
134 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level)))
135 return error;
136 }
137#endif
138 if (ptr < numrecs) {
139 memmove(&kp[ptr - 1], &kp[ptr],
140 (numrecs - ptr) * sizeof(*kp));
141 memmove(&pp[ptr - 1], &pp[ptr],
142 (numrecs - ptr) * sizeof(*kp));
143 xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
144 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
145 }
146 }
147 /*
148 * It's a leaf. Excise the record being deleted, by sliding the
149 * entries past it down one. Log the changed areas of the block.
150 */
151 else {
152 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
153 if (ptr < numrecs) {
154 memmove(&rp[ptr - 1], &rp[ptr],
155 (numrecs - ptr) * sizeof(*rp));
156 xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
157 }
158 /*
159 * If it's the first record in the block, we'll need a key
160 * structure to pass up to the next level (updkey).
161 */
162 if (ptr == 1) {
163 key.ir_startino = rp->ir_startino;
164 kp = &key;
165 }
166 }
167 /*
168 * Decrement and log the number of entries in the block.
169 */
170 numrecs--;
171 block->bb_numrecs = cpu_to_be16(numrecs);
172 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
173 /*
174 * Is this the root level? If so, we're almost done.
175 */
176 if (level == cur->bc_nlevels - 1) {
177 /*
178 * If this is the root level,
179 * and there's only one entry left,
180 * and it's NOT the leaf level,
181 * then we can get rid of this level.
182 */
183 if (numrecs == 1 && level > 0) {
184 agbp = cur->bc_private.a.agbp;
185 agi = XFS_BUF_TO_AGI(agbp);
186 /*
187 * pp is still set to the first pointer in the block.
188 * Make it the new root of the btree.
189 */
190 bno = be32_to_cpu(agi->agi_root);
191 agi->agi_root = *pp;
192 be32_add_cpu(&agi->agi_level, -1);
193 /*
194 * Free the block.
195 */
196 if ((error = xfs_free_extent(cur->bc_tp,
197 XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
198 return error;
199 xfs_trans_binval(cur->bc_tp, bp);
200 xfs_ialloc_log_agi(cur->bc_tp, agbp,
201 XFS_AGI_ROOT | XFS_AGI_LEVEL);
202 /*
203 * Update the cursor so there's one fewer level.
204 */
205 cur->bc_bufs[level] = NULL;
206 cur->bc_nlevels--;
207 } else if (level > 0 &&
208 (error = xfs_inobt_decrement(cur, level, &i)))
209 return error;
210 *stat = 1;
211 return 0;
212 }
213 /*
214 * If we deleted the leftmost entry in the block, update the
215 * key values above us in the tree.
216 */
217 if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
218 return error;
219 /*
220 * If the number of records remaining in the block is at least
221 * the minimum, we're done.
222 */
223 if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
224 if (level > 0 &&
225 (error = xfs_inobt_decrement(cur, level, &i)))
226 return error;
227 *stat = 1;
228 return 0;
229 }
230 /*
231 * Otherwise, we have to move some records around to keep the
232 * tree balanced. Look at the left and right sibling blocks to
233 * see if we can re-balance by moving only one record.
234 */
235 rbno = be32_to_cpu(block->bb_rightsib);
236 lbno = be32_to_cpu(block->bb_leftsib);
237 bno = NULLAGBLOCK;
238 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
239 /*
240 * Duplicate the cursor so our btree manipulations here won't
241 * disrupt the next level up.
242 */
243 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
244 return error;
245 /*
246 * If there's a right sibling, see if it's ok to shift an entry
247 * out of it.
248 */
249 if (rbno != NULLAGBLOCK) {
250 /*
251 * Move the temp cursor to the last entry in the next block.
252 * Actually any entry but the first would suffice.
253 */
254 i = xfs_btree_lastrec(tcur, level);
255 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
256 if ((error = xfs_inobt_increment(tcur, level, &i)))
257 goto error0;
258 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
259 i = xfs_btree_lastrec(tcur, level);
260 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
261 /*
262 * Grab a pointer to the block.
263 */
264 rbp = tcur->bc_bufs[level];
265 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
266#ifdef DEBUG
267 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
268 goto error0;
269#endif
270 /*
271 * Grab the current block number, for future use.
272 */
273 bno = be32_to_cpu(right->bb_leftsib);
274 /*
275 * If right block is full enough so that removing one entry
276 * won't make it too empty, and left-shifting an entry out
277 * of right to us works, we're done.
278 */
279 if (be16_to_cpu(right->bb_numrecs) - 1 >=
280 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
281 if ((error = xfs_inobt_lshift(tcur, level, &i)))
282 goto error0;
283 if (i) {
284 ASSERT(be16_to_cpu(block->bb_numrecs) >=
285 XFS_INOBT_BLOCK_MINRECS(level, cur));
286 xfs_btree_del_cursor(tcur,
287 XFS_BTREE_NOERROR);
288 if (level > 0 &&
289 (error = xfs_inobt_decrement(cur, level,
290 &i)))
291 return error;
292 *stat = 1;
293 return 0;
294 }
295 }
296 /*
297 * Otherwise, grab the number of records in right for
298 * future reference, and fix up the temp cursor to point
299 * to our block again (last record).
300 */
301 rrecs = be16_to_cpu(right->bb_numrecs);
302 if (lbno != NULLAGBLOCK) {
303 xfs_btree_firstrec(tcur, level);
304 if ((error = xfs_inobt_decrement(tcur, level, &i)))
305 goto error0;
306 }
307 }
308 /*
309 * If there's a left sibling, see if it's ok to shift an entry
310 * out of it.
311 */
312 if (lbno != NULLAGBLOCK) {
313 /*
314 * Move the temp cursor to the first entry in the
315 * previous block.
316 */
317 xfs_btree_firstrec(tcur, level);
318 if ((error = xfs_inobt_decrement(tcur, level, &i)))
319 goto error0;
320 xfs_btree_firstrec(tcur, level);
321 /*
322 * Grab a pointer to the block.
323 */
324 lbp = tcur->bc_bufs[level];
325 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
326#ifdef DEBUG
327 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
328 goto error0;
329#endif
330 /*
331 * Grab the current block number, for future use.
332 */
333 bno = be32_to_cpu(left->bb_rightsib);
334 /*
335 * If left block is full enough so that removing one entry
336 * won't make it too empty, and right-shifting an entry out
337 * of left to us works, we're done.
338 */
339 if (be16_to_cpu(left->bb_numrecs) - 1 >=
340 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
341 if ((error = xfs_inobt_rshift(tcur, level, &i)))
342 goto error0;
343 if (i) {
344 ASSERT(be16_to_cpu(block->bb_numrecs) >=
345 XFS_INOBT_BLOCK_MINRECS(level, cur));
346 xfs_btree_del_cursor(tcur,
347 XFS_BTREE_NOERROR);
348 if (level == 0)
349 cur->bc_ptrs[0]++;
350 *stat = 1;
351 return 0;
352 }
353 }
354 /*
355 * Otherwise, grab the number of records in right for
356 * future reference.
357 */
358 lrecs = be16_to_cpu(left->bb_numrecs);
359 }
360 /*
361 * Delete the temp cursor, we're done with it.
362 */
363 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
364 /*
365 * If here, we need to do a join to keep the tree balanced.
366 */
367 ASSERT(bno != NULLAGBLOCK);
368 /*
369 * See if we can join with the left neighbor block.
370 */
371 if (lbno != NULLAGBLOCK &&
372 lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
373 /*
374 * Set "right" to be the starting block,
375 * "left" to be the left neighbor.
376 */
377 rbno = bno;
378 right = block;
379 rrecs = be16_to_cpu(right->bb_numrecs);
380 rbp = bp;
381 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
382 cur->bc_private.a.agno, lbno, 0, &lbp,
383 XFS_INO_BTREE_REF)))
384 return error;
385 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
386 lrecs = be16_to_cpu(left->bb_numrecs);
387 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
388 return error;
389 }
390 /*
391 * If that won't work, see if we can join with the right neighbor block.
392 */
393 else if (rbno != NULLAGBLOCK &&
394 rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
395 /*
396 * Set "left" to be the starting block,
397 * "right" to be the right neighbor.
398 */
399 lbno = bno;
400 left = block;
401 lrecs = be16_to_cpu(left->bb_numrecs);
402 lbp = bp;
403 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
404 cur->bc_private.a.agno, rbno, 0, &rbp,
405 XFS_INO_BTREE_REF)))
406 return error;
407 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
408 rrecs = be16_to_cpu(right->bb_numrecs);
409 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
410 return error;
411 }
412 /*
413 * Otherwise, we can't fix the imbalance.
414 * Just return. This is probably a logic error, but it's not fatal.
415 */
416 else {
417 if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
418 return error;
419 *stat = 1;
420 return 0;
421 }
422 /*
423 * We're now going to join "left" and "right" by moving all the stuff
424 * in "right" to "left" and deleting "right".
425 */
426 if (level > 0) {
427 /*
428 * It's a non-leaf. Move keys and pointers.
429 */
430 lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
431 lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
432 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
433 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
434#ifdef DEBUG
435 for (i = 0; i < rrecs; i++) {
436 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
437 return error;
438 }
439#endif
440 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
441 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
442 xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
443 xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
444 } else {
445 /*
446 * It's a leaf. Move records.
447 */
448 lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
449 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
450 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
451 xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
452 }
453 /*
454 * If we joined with the left neighbor, set the buffer in the
455 * cursor to the left block, and fix up the index.
456 */
457 if (bp != lbp) {
458 xfs_btree_setbuf(cur, level, lbp);
459 cur->bc_ptrs[level] += lrecs;
460 }
461 /*
462 * If we joined with the right neighbor and there's a level above
463 * us, increment the cursor at that level.
464 */
465 else if (level + 1 < cur->bc_nlevels &&
466 (error = xfs_alloc_increment(cur, level + 1, &i)))
467 return error;
468 /*
469 * Fix up the number of records in the surviving block.
470 */
471 lrecs += rrecs;
472 left->bb_numrecs = cpu_to_be16(lrecs);
473 /*
474 * Fix up the right block pointer in the surviving block, and log it.
475 */
476 left->bb_rightsib = right->bb_rightsib;
477 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
478 /*
479 * If there is a right sibling now, make it point to the
480 * remaining block.
481 */
482 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
483 xfs_inobt_block_t *rrblock;
484 xfs_buf_t *rrbp;
485 59
486 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 60STATIC void
487 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, 61xfs_inobt_set_root(
488 &rrbp, XFS_INO_BTREE_REF))) 62 struct xfs_btree_cur *cur,
489 return error; 63 union xfs_btree_ptr *nptr,
490 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); 64 int inc) /* level change */
491 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) 65{
492 return error; 66 struct xfs_buf *agbp = cur->bc_private.a.agbp;
493 rrblock->bb_leftsib = cpu_to_be32(lbno); 67 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
494 xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
495 }
496 /*
497 * Free the deleting block.
498 */
499 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
500 cur->bc_private.a.agno, rbno), 1)))
501 return error;
502 xfs_trans_binval(cur->bc_tp, rbp);
503 /*
504 * Readjust the ptr at this level if it's not a leaf, since it's
505 * still pointing at the deletion point, which makes the cursor
506 * inconsistent. If this makes the ptr 0, the caller fixes it up.
507 * We can't use decrement because it would change the next level up.
508 */
509 if (level > 0)
510 cur->bc_ptrs[level]--;
511 /*
512 * Return value means the next level up has something to do.
513 */
514 *stat = 2;
515 return 0;
516 68
517error0: 69 agi->agi_root = nptr->s;
518 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 70 be32_add_cpu(&agi->agi_level, inc);
519 return error; 71 xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
520} 72}
521 73
522/* 74STATIC int
523 * Insert one record/level. Return information to the caller 75xfs_inobt_alloc_block(
524 * allowing the next level up to proceed if necessary. 76 struct xfs_btree_cur *cur,
525 */ 77 union xfs_btree_ptr *start,
526STATIC int /* error */ 78 union xfs_btree_ptr *new,
527xfs_inobt_insrec( 79 int length,
528 xfs_btree_cur_t *cur, /* btree cursor */ 80 int *stat)
529 int level, /* level to insert record at */
530 xfs_agblock_t *bnop, /* i/o: block number inserted */
531 xfs_inobt_rec_t *recp, /* i/o: record data inserted */
532 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
533 int *stat) /* success/failure */
534{ 81{
535 xfs_inobt_block_t *block; /* btree block record/key lives in */ 82 xfs_alloc_arg_t args; /* block allocation args */
536 xfs_buf_t *bp; /* buffer for block */ 83 int error; /* error return value */
537 int error; /* error return value */ 84 xfs_agblock_t sbno = be32_to_cpu(start->s);
538 int i; /* loop index */
539 xfs_inobt_key_t key; /* key value being inserted */
540 xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */
541 xfs_agblock_t nbno; /* block number of allocated block */
542 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
543 xfs_inobt_key_t nkey; /* new key value, from split */
544 xfs_inobt_rec_t nrec; /* new record value, for caller */
545 int numrecs;
546 int optr; /* old ptr value */
547 xfs_inobt_ptr_t *pp; /* pointer to btree addresses */
548 int ptr; /* index in btree block for this rec */
549 xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */
550 85
551 /* 86 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
552 * GCC doesn't understand the (arguably complex) control flow in
553 * this function and complains about uninitialized structure fields
554 * without this.
555 */
556 memset(&nrec, 0, sizeof(nrec));
557 87
558 /* 88 memset(&args, 0, sizeof(args));
559 * If we made it to the root level, allocate a new root block 89 args.tp = cur->bc_tp;
560 * and we're done. 90 args.mp = cur->bc_mp;
561 */ 91 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
562 if (level >= cur->bc_nlevels) { 92 args.minlen = 1;
563 error = xfs_inobt_newroot(cur, &i); 93 args.maxlen = 1;
564 *bnop = NULLAGBLOCK; 94 args.prod = 1;
565 *stat = i; 95 args.type = XFS_ALLOCTYPE_NEAR_BNO;
96
97 error = xfs_alloc_vextent(&args);
98 if (error) {
99 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
566 return error; 100 return error;
567 } 101 }
568 /* 102 if (args.fsbno == NULLFSBLOCK) {
569 * Make a key out of the record data to be inserted, and save it. 103 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
570 */
571 key.ir_startino = recp->ir_startino;
572 optr = ptr = cur->bc_ptrs[level];
573 /*
574 * If we're off the left edge, return failure.
575 */
576 if (ptr == 0) {
577 *stat = 0; 104 *stat = 0;
578 return 0; 105 return 0;
579 } 106 }
580 /* 107 ASSERT(args.len == 1);
581 * Get pointers to the btree buffer and block. 108 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
582 */ 109
583 bp = cur->bc_bufs[level]; 110 new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
584 block = XFS_BUF_TO_INOBT_BLOCK(bp);
585 numrecs = be16_to_cpu(block->bb_numrecs);
586#ifdef DEBUG
587 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
588 return error;
589 /*
590 * Check that the new entry is being inserted in the right place.
591 */
592 if (ptr <= numrecs) {
593 if (level == 0) {
594 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
595 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
596 } else {
597 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
598 xfs_btree_check_key(cur->bc_btnum, &key, kp);
599 }
600 }
601#endif
602 nbno = NULLAGBLOCK;
603 ncur = NULL;
604 /*
605 * If the block is full, we can't insert the new entry until we
606 * make the block un-full.
607 */
608 if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
609 /*
610 * First, try shifting an entry to the right neighbor.
611 */
612 if ((error = xfs_inobt_rshift(cur, level, &i)))
613 return error;
614 if (i) {
615 /* nothing */
616 }
617 /*
618 * Next, try shifting an entry to the left neighbor.
619 */
620 else {
621 if ((error = xfs_inobt_lshift(cur, level, &i)))
622 return error;
623 if (i) {
624 optr = ptr = cur->bc_ptrs[level];
625 } else {
626 /*
627 * Next, try splitting the current block
628 * in half. If this works we have to
629 * re-set our variables because
630 * we could be in a different block now.
631 */
632 if ((error = xfs_inobt_split(cur, level, &nbno,
633 &nkey, &ncur, &i)))
634 return error;
635 if (i) {
636 bp = cur->bc_bufs[level];
637 block = XFS_BUF_TO_INOBT_BLOCK(bp);
638#ifdef DEBUG
639 if ((error = xfs_btree_check_sblock(cur,
640 block, level, bp)))
641 return error;
642#endif
643 ptr = cur->bc_ptrs[level];
644 nrec.ir_startino = nkey.ir_startino;
645 } else {
646 /*
647 * Otherwise the insert fails.
648 */
649 *stat = 0;
650 return 0;
651 }
652 }
653 }
654 }
655 /*
656 * At this point we know there's room for our new entry in the block
657 * we're pointing at.
658 */
659 numrecs = be16_to_cpu(block->bb_numrecs);
660 if (level > 0) {
661 /*
662 * It's a non-leaf entry. Make a hole for the new data
663 * in the key and ptr regions of the block.
664 */
665 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
666 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
667#ifdef DEBUG
668 for (i = numrecs; i >= ptr; i--) {
669 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
670 return error;
671 }
672#endif
673 memmove(&kp[ptr], &kp[ptr - 1],
674 (numrecs - ptr + 1) * sizeof(*kp));
675 memmove(&pp[ptr], &pp[ptr - 1],
676 (numrecs - ptr + 1) * sizeof(*pp));
677 /*
678 * Now stuff the new data in, bump numrecs and log the new data.
679 */
680#ifdef DEBUG
681 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
682 return error;
683#endif
684 kp[ptr - 1] = key;
685 pp[ptr - 1] = cpu_to_be32(*bnop);
686 numrecs++;
687 block->bb_numrecs = cpu_to_be16(numrecs);
688 xfs_inobt_log_keys(cur, bp, ptr, numrecs);
689 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
690 } else {
691 /*
692 * It's a leaf entry. Make a hole for the new record.
693 */
694 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
695 memmove(&rp[ptr], &rp[ptr - 1],
696 (numrecs - ptr + 1) * sizeof(*rp));
697 /*
698 * Now stuff the new record in, bump numrecs
699 * and log the new data.
700 */
701 rp[ptr - 1] = *recp;
702 numrecs++;
703 block->bb_numrecs = cpu_to_be16(numrecs);
704 xfs_inobt_log_recs(cur, bp, ptr, numrecs);
705 }
706 /*
707 * Log the new number of records in the btree header.
708 */
709 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
710#ifdef DEBUG
711 /*
712 * Check that the key/record is in the right place, now.
713 */
714 if (ptr < numrecs) {
715 if (level == 0)
716 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
717 rp + ptr);
718 else
719 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
720 kp + ptr);
721 }
722#endif
723 /*
724 * If we inserted at the start of a block, update the parents' keys.
725 */
726 if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
727 return error;
728 /*
729 * Return the new block number, if any.
730 * If there is one, give back a record value and a cursor too.
731 */
732 *bnop = nbno;
733 if (nbno != NULLAGBLOCK) {
734 *recp = nrec;
735 *curp = ncur;
736 }
737 *stat = 1; 111 *stat = 1;
738 return 0; 112 return 0;
739} 113}
740 114
741/* 115STATIC int
742 * Log header fields from a btree block. 116xfs_inobt_free_block(
743 */ 117 struct xfs_btree_cur *cur,
744STATIC void 118 struct xfs_buf *bp)
745xfs_inobt_log_block(
746 xfs_trans_t *tp, /* transaction pointer */
747 xfs_buf_t *bp, /* buffer containing btree block */
748 int fields) /* mask of fields: XFS_BB_... */
749{ 119{
750 int first; /* first byte offset logged */ 120 xfs_fsblock_t fsbno;
751 int last; /* last byte offset logged */ 121 int error;
752 static const short offsets[] = { /* table of offsets */
753 offsetof(xfs_inobt_block_t, bb_magic),
754 offsetof(xfs_inobt_block_t, bb_level),
755 offsetof(xfs_inobt_block_t, bb_numrecs),
756 offsetof(xfs_inobt_block_t, bb_leftsib),
757 offsetof(xfs_inobt_block_t, bb_rightsib),
758 sizeof(xfs_inobt_block_t)
759 };
760 122
761 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); 123 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
762 xfs_trans_log_buf(tp, bp, first, last); 124 error = xfs_free_extent(cur->bc_tp, fsbno, 1);
125 if (error)
126 return error;
127
128 xfs_trans_binval(cur->bc_tp, bp);
129 return error;
763} 130}
764 131
765/* 132STATIC int
766 * Log keys from a btree block (nonleaf). 133xfs_inobt_get_maxrecs(
767 */ 134 struct xfs_btree_cur *cur,
768STATIC void 135 int level)
769xfs_inobt_log_keys(
770 xfs_btree_cur_t *cur, /* btree cursor */
771 xfs_buf_t *bp, /* buffer containing btree block */
772 int kfirst, /* index of first key to log */
773 int klast) /* index of last key to log */
774{ 136{
775 xfs_inobt_block_t *block; /* btree block to log from */ 137 return cur->bc_mp->m_inobt_mxr[level != 0];
776 int first; /* first byte offset logged */
777 xfs_inobt_key_t *kp; /* key pointer in btree block */
778 int last; /* last byte offset logged */
779
780 block = XFS_BUF_TO_INOBT_BLOCK(bp);
781 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
782 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
783 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
784 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
785} 138}
786 139
787/*
788 * Log block pointer fields from a btree block (nonleaf).
789 */
790STATIC void 140STATIC void
791xfs_inobt_log_ptrs( 141xfs_inobt_init_key_from_rec(
792 xfs_btree_cur_t *cur, /* btree cursor */ 142 union xfs_btree_key *key,
793 xfs_buf_t *bp, /* buffer containing btree block */ 143 union xfs_btree_rec *rec)
794 int pfirst, /* index of first pointer to log */
795 int plast) /* index of last pointer to log */
796{ 144{
797 xfs_inobt_block_t *block; /* btree block to log from */ 145 key->inobt.ir_startino = rec->inobt.ir_startino;
798 int first; /* first byte offset logged */
799 int last; /* last byte offset logged */
800 xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */
801
802 block = XFS_BUF_TO_INOBT_BLOCK(bp);
803 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
804 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
805 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
806 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
807} 146}
808 147
809/*
810 * Log records from a btree block (leaf).
811 */
812STATIC void 148STATIC void
813xfs_inobt_log_recs( 149xfs_inobt_init_rec_from_key(
814 xfs_btree_cur_t *cur, /* btree cursor */ 150 union xfs_btree_key *key,
815 xfs_buf_t *bp, /* buffer containing btree block */ 151 union xfs_btree_rec *rec)
816 int rfirst, /* index of first record to log */
817 int rlast) /* index of last record to log */
818{ 152{
819 xfs_inobt_block_t *block; /* btree block to log from */ 153 rec->inobt.ir_startino = key->inobt.ir_startino;
820 int first; /* first byte offset logged */ 154}
821 int last; /* last byte offset logged */
822 xfs_inobt_rec_t *rp; /* record pointer for btree block */
823 155
824 block = XFS_BUF_TO_INOBT_BLOCK(bp); 156STATIC void
825 rp = XFS_INOBT_REC_ADDR(block, 1, cur); 157xfs_inobt_init_rec_from_cur(
826 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); 158 struct xfs_btree_cur *cur,
827 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); 159 union xfs_btree_rec *rec)
828 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 160{
161 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
162 rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
163 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
829} 164}
830 165
831/* 166/*
832 * Lookup the record. The cursor is made to point to it, based on dir. 167 * intial value of ptr for lookup
833 * Return 0 if can't find any such record, 1 for success.
834 */ 168 */
835STATIC int /* error */ 169STATIC void
836xfs_inobt_lookup( 170xfs_inobt_init_ptr_from_cur(
837 xfs_btree_cur_t *cur, /* btree cursor */ 171 struct xfs_btree_cur *cur,
838 xfs_lookup_t dir, /* <=, ==, or >= */ 172 union xfs_btree_ptr *ptr)
839 int *stat) /* success/failure */
840{ 173{
841 xfs_agblock_t agbno; /* a.g. relative btree block number */ 174 struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
842 xfs_agnumber_t agno; /* allocation group number */
843 xfs_inobt_block_t *block=NULL; /* current btree block */
844 __int64_t diff; /* difference for the current key */
845 int error; /* error return value */
846 int keyno=0; /* current key number */
847 int level; /* level in the btree */
848 xfs_mount_t *mp; /* file system mount point */
849
850 /*
851 * Get the allocation group header, and the root block number.
852 */
853 mp = cur->bc_mp;
854 {
855 xfs_agi_t *agi; /* a.g. inode header */
856
857 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
858 agno = be32_to_cpu(agi->agi_seqno);
859 agbno = be32_to_cpu(agi->agi_root);
860 }
861 /*
862 * Iterate over each level in the btree, starting at the root.
863 * For each level above the leaves, find the key we need, based
864 * on the lookup record, then follow the corresponding block
865 * pointer down to the next level.
866 */
867 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
868 xfs_buf_t *bp; /* buffer pointer for btree block */
869 xfs_daddr_t d; /* disk address of btree block */
870
871 /*
872 * Get the disk address we're looking for.
873 */
874 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
875 /*
876 * If the old buffer at this level is for a different block,
877 * throw it away, otherwise just use it.
878 */
879 bp = cur->bc_bufs[level];
880 if (bp && XFS_BUF_ADDR(bp) != d)
881 bp = NULL;
882 if (!bp) {
883 /*
884 * Need to get a new buffer. Read it, then
885 * set it in the cursor, releasing the old one.
886 */
887 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
888 agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
889 return error;
890 xfs_btree_setbuf(cur, level, bp);
891 /*
892 * Point to the btree block, now that we have the buffer
893 */
894 block = XFS_BUF_TO_INOBT_BLOCK(bp);
895 if ((error = xfs_btree_check_sblock(cur, block, level,
896 bp)))
897 return error;
898 } else
899 block = XFS_BUF_TO_INOBT_BLOCK(bp);
900 /*
901 * If we already had a key match at a higher level, we know
902 * we need to use the first entry in this block.
903 */
904 if (diff == 0)
905 keyno = 1;
906 /*
907 * Otherwise we need to search this block. Do a binary search.
908 */
909 else {
910 int high; /* high entry number */
911 xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
912 xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
913 int low; /* low entry number */
914 175
915 /* 176 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
916 * Get a pointer to keys or records.
917 */
918 if (level > 0)
919 kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
920 else
921 krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
922 /*
923 * Set low and high entry numbers, 1-based.
924 */
925 low = 1;
926 if (!(high = be16_to_cpu(block->bb_numrecs))) {
927 /*
928 * If the block is empty, the tree must
929 * be an empty leaf.
930 */
931 ASSERT(level == 0 && cur->bc_nlevels == 1);
932 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
933 *stat = 0;
934 return 0;
935 }
936 /*
937 * Binary search the block.
938 */
939 while (low <= high) {
940 xfs_agino_t startino; /* key value */
941
942 /*
943 * keyno is average of low and high.
944 */
945 keyno = (low + high) >> 1;
946 /*
947 * Get startino.
948 */
949 if (level > 0) {
950 xfs_inobt_key_t *kkp;
951
952 kkp = kkbase + keyno - 1;
953 startino = be32_to_cpu(kkp->ir_startino);
954 } else {
955 xfs_inobt_rec_t *krp;
956
957 krp = krbase + keyno - 1;
958 startino = be32_to_cpu(krp->ir_startino);
959 }
960 /*
961 * Compute difference to get next direction.
962 */
963 diff = (__int64_t)
964 startino - cur->bc_rec.i.ir_startino;
965 /*
966 * Less than, move right.
967 */
968 if (diff < 0)
969 low = keyno + 1;
970 /*
971 * Greater than, move left.
972 */
973 else if (diff > 0)
974 high = keyno - 1;
975 /*
976 * Equal, we're done.
977 */
978 else
979 break;
980 }
981 }
982 /*
983 * If there are more levels, set up for the next level
984 * by getting the block number and filling in the cursor.
985 */
986 if (level > 0) {
987 /*
988 * If we moved left, need the previous key number,
989 * unless there isn't one.
990 */
991 if (diff > 0 && --keyno < 1)
992 keyno = 1;
993 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur));
994#ifdef DEBUG
995 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
996 return error;
997#endif
998 cur->bc_ptrs[level] = keyno;
999 }
1000 }
1001 /*
1002 * Done with the search.
1003 * See if we need to adjust the results.
1004 */
1005 if (dir != XFS_LOOKUP_LE && diff < 0) {
1006 keyno++;
1007 /*
1008 * If ge search and we went off the end of the block, but it's
1009 * not the last block, we're in the wrong block.
1010 */
1011 if (dir == XFS_LOOKUP_GE &&
1012 keyno > be16_to_cpu(block->bb_numrecs) &&
1013 be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1014 int i;
1015 177
1016 cur->bc_ptrs[0] = keyno; 178 ptr->s = agi->agi_root;
1017 if ((error = xfs_inobt_increment(cur, 0, &i)))
1018 return error;
1019 ASSERT(i == 1);
1020 *stat = 1;
1021 return 0;
1022 }
1023 }
1024 else if (dir == XFS_LOOKUP_LE && diff > 0)
1025 keyno--;
1026 cur->bc_ptrs[0] = keyno;
1027 /*
1028 * Return if we succeeded or not.
1029 */
1030 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
1031 *stat = 0;
1032 else
1033 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1034 return 0;
1035} 179}
1036 180
1037/* 181STATIC __int64_t
1038 * Move 1 record left from cur/level if possible. 182xfs_inobt_key_diff(
1039 * Update cur to reflect the new path. 183 struct xfs_btree_cur *cur,
1040 */ 184 union xfs_btree_key *key)
1041STATIC int /* error */
1042xfs_inobt_lshift(
1043 xfs_btree_cur_t *cur, /* btree cursor */
1044 int level, /* level to shift record on */
1045 int *stat) /* success/failure */
1046{ 185{
1047 int error; /* error return value */ 186 return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
1048#ifdef DEBUG 187 cur->bc_rec.i.ir_startino;
1049 int i; /* loop index */
1050#endif
1051 xfs_inobt_key_t key; /* key value for leaf level upward */
1052 xfs_buf_t *lbp; /* buffer for left neighbor block */
1053 xfs_inobt_block_t *left; /* left neighbor btree block */
1054 xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */
1055 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1056 xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */
1057 int nrec; /* new number of left block entries */
1058 xfs_buf_t *rbp; /* buffer for right (current) block */
1059 xfs_inobt_block_t *right; /* right (current) btree block */
1060 xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */
1061 xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */
1062 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1063
1064 /*
1065 * Set up variables for this block as "right".
1066 */
1067 rbp = cur->bc_bufs[level];
1068 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1069#ifdef DEBUG
1070 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1071 return error;
1072#endif
1073 /*
1074 * If we've got no left sibling then we can't shift an entry left.
1075 */
1076 if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
1077 *stat = 0;
1078 return 0;
1079 }
1080 /*
1081 * If the cursor entry is the one that would be moved, don't
1082 * do it... it's too complicated.
1083 */
1084 if (cur->bc_ptrs[level] <= 1) {
1085 *stat = 0;
1086 return 0;
1087 }
1088 /*
1089 * Set up the left neighbor as "left".
1090 */
1091 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1092 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1093 0, &lbp, XFS_INO_BTREE_REF)))
1094 return error;
1095 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1096 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1097 return error;
1098 /*
1099 * If it's full, it can't take another entry.
1100 */
1101 if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1102 *stat = 0;
1103 return 0;
1104 }
1105 nrec = be16_to_cpu(left->bb_numrecs) + 1;
1106 /*
1107 * If non-leaf, copy a key and a ptr to the left block.
1108 */
1109 if (level > 0) {
1110 lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
1111 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1112 *lkp = *rkp;
1113 xfs_inobt_log_keys(cur, lbp, nrec, nrec);
1114 lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
1115 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1116#ifdef DEBUG
1117 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1118 return error;
1119#endif
1120 *lpp = *rpp;
1121 xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
1122 }
1123 /*
1124 * If leaf, copy a record to the left block.
1125 */
1126 else {
1127 lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
1128 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1129 *lrp = *rrp;
1130 xfs_inobt_log_recs(cur, lbp, nrec, nrec);
1131 }
1132 /*
1133 * Bump and log left's numrecs, decrement and log right's numrecs.
1134 */
1135 be16_add_cpu(&left->bb_numrecs, 1);
1136 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1137#ifdef DEBUG
1138 if (level > 0)
1139 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1140 else
1141 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1142#endif
1143 be16_add_cpu(&right->bb_numrecs, -1);
1144 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1145 /*
1146 * Slide the contents of right down one entry.
1147 */
1148 if (level > 0) {
1149#ifdef DEBUG
1150 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1151 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
1152 level)))
1153 return error;
1154 }
1155#endif
1156 memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1157 memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1158 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1159 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1160 } else {
1161 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1162 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1163 key.ir_startino = rrp->ir_startino;
1164 rkp = &key;
1165 }
1166 /*
1167 * Update the parent key values of right.
1168 */
1169 if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
1170 return error;
1171 /*
1172 * Slide the cursor value left one.
1173 */
1174 cur->bc_ptrs[level]--;
1175 *stat = 1;
1176 return 0;
1177} 188}
1178 189
1179/* 190STATIC int
1180 * Allocate a new root block, fill it in. 191xfs_inobt_kill_root(
1181 */ 192 struct xfs_btree_cur *cur,
1182STATIC int /* error */ 193 struct xfs_buf *bp,
1183xfs_inobt_newroot( 194 int level,
1184 xfs_btree_cur_t *cur, /* btree cursor */ 195 union xfs_btree_ptr *newroot)
1185 int *stat) /* success/failure */
1186{ 196{
1187 xfs_agi_t *agi; /* a.g. inode header */ 197 int error;
1188 xfs_alloc_arg_t args; /* allocation argument structure */
1189 xfs_inobt_block_t *block; /* one half of the old root block */
1190 xfs_buf_t *bp; /* buffer containing block */
1191 int error; /* error return value */
1192 xfs_inobt_key_t *kp; /* btree key pointer */
1193 xfs_agblock_t lbno; /* left block number */
1194 xfs_buf_t *lbp; /* left buffer pointer */
1195 xfs_inobt_block_t *left; /* left btree block */
1196 xfs_buf_t *nbp; /* new (root) buffer */
1197 xfs_inobt_block_t *new; /* new (root) btree block */
1198 int nptr; /* new value for key index, 1 or 2 */
1199 xfs_inobt_ptr_t *pp; /* btree address pointer */
1200 xfs_agblock_t rbno; /* right block number */
1201 xfs_buf_t *rbp; /* right buffer pointer */
1202 xfs_inobt_block_t *right; /* right btree block */
1203 xfs_inobt_rec_t *rp; /* btree record pointer */
1204 198
1205 ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp)); 199 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
200 XFS_BTREE_STATS_INC(cur, killroot);
1206 201
1207 /* 202 /*
1208 * Get a block & a buffer. 203 * Update the root pointer, decreasing the level by 1 and then
204 * free the old root.
1209 */ 205 */
1210 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); 206 xfs_inobt_set_root(cur, newroot, -1);
1211 args.tp = cur->bc_tp; 207 error = xfs_inobt_free_block(cur, bp);
1212 args.mp = cur->bc_mp; 208 if (error) {
1213 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, 209 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1214 be32_to_cpu(agi->agi_root));
1215 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1216 args.isfl = args.userdata = args.minalignslop = 0;
1217 args.minlen = args.maxlen = args.prod = 1;
1218 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1219 if ((error = xfs_alloc_vextent(&args)))
1220 return error; 210 return error;
1221 /*
1222 * None available, we fail.
1223 */
1224 if (args.fsbno == NULLFSBLOCK) {
1225 *stat = 0;
1226 return 0;
1227 }
1228 ASSERT(args.len == 1);
1229 nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1230 new = XFS_BUF_TO_INOBT_BLOCK(nbp);
1231 /*
1232 * Set the root data in the a.g. inode structure.
1233 */
1234 agi->agi_root = cpu_to_be32(args.agbno);
1235 be32_add_cpu(&agi->agi_level, 1);
1236 xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
1237 XFS_AGI_ROOT | XFS_AGI_LEVEL);
1238 /*
1239 * At the previous root level there are now two blocks: the old
1240 * root, and the new block generated when it was split.
1241 * We don't know which one the cursor is pointing at, so we
1242 * set up variables "left" and "right" for each case.
1243 */
1244 bp = cur->bc_bufs[cur->bc_nlevels - 1];
1245 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1246#ifdef DEBUG
1247 if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
1248 return error;
1249#endif
1250 if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1251 /*
1252 * Our block is left, pick up the right block.
1253 */
1254 lbp = bp;
1255 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1256 left = block;
1257 rbno = be32_to_cpu(left->bb_rightsib);
1258 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1259 rbno, 0, &rbp, XFS_INO_BTREE_REF)))
1260 return error;
1261 bp = rbp;
1262 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1263 if ((error = xfs_btree_check_sblock(cur, right,
1264 cur->bc_nlevels - 1, rbp)))
1265 return error;
1266 nptr = 1;
1267 } else {
1268 /*
1269 * Our block is right, pick up the left block.
1270 */
1271 rbp = bp;
1272 rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
1273 right = block;
1274 lbno = be32_to_cpu(right->bb_leftsib);
1275 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1276 lbno, 0, &lbp, XFS_INO_BTREE_REF)))
1277 return error;
1278 bp = lbp;
1279 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1280 if ((error = xfs_btree_check_sblock(cur, left,
1281 cur->bc_nlevels - 1, lbp)))
1282 return error;
1283 nptr = 2;
1284 }
1285 /*
1286 * Fill in the new block's btree header and log it.
1287 */
1288 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1289 new->bb_level = cpu_to_be16(cur->bc_nlevels);
1290 new->bb_numrecs = cpu_to_be16(2);
1291 new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1292 new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1293 xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
1294 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1295 /*
1296 * Fill in the key data in the new root.
1297 */
1298 kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
1299 if (be16_to_cpu(left->bb_level) > 0) {
1300 kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur);
1301 kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur);
1302 } else {
1303 rp = XFS_INOBT_REC_ADDR(left, 1, cur);
1304 kp[0].ir_startino = rp->ir_startino;
1305 rp = XFS_INOBT_REC_ADDR(right, 1, cur);
1306 kp[1].ir_startino = rp->ir_startino;
1307 } 211 }
1308 xfs_inobt_log_keys(cur, nbp, 1, 2);
1309 /*
1310 * Fill in the pointer data in the new root.
1311 */
1312 pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
1313 pp[0] = cpu_to_be32(lbno);
1314 pp[1] = cpu_to_be32(rbno);
1315 xfs_inobt_log_ptrs(cur, nbp, 1, 2);
1316 /*
1317 * Fix up the cursor.
1318 */
1319 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1320 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1321 cur->bc_nlevels++;
1322 *stat = 1;
1323 return 0;
1324}
1325 212
1326/* 213 XFS_BTREE_STATS_INC(cur, free);
1327 * Move 1 record right from cur/level if possible.
1328 * Update cur to reflect the new path.
1329 */
1330STATIC int /* error */
1331xfs_inobt_rshift(
1332 xfs_btree_cur_t *cur, /* btree cursor */
1333 int level, /* level to shift record on */
1334 int *stat) /* success/failure */
1335{
1336 int error; /* error return value */
1337 int i; /* loop index */
1338 xfs_inobt_key_t key; /* key value for leaf level upward */
1339 xfs_buf_t *lbp; /* buffer for left (current) block */
1340 xfs_inobt_block_t *left; /* left (current) btree block */
1341 xfs_inobt_key_t *lkp; /* key pointer for left block */
1342 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1343 xfs_inobt_rec_t *lrp; /* record pointer for left block */
1344 xfs_buf_t *rbp; /* buffer for right neighbor block */
1345 xfs_inobt_block_t *right; /* right neighbor btree block */
1346 xfs_inobt_key_t *rkp; /* key pointer for right block */
1347 xfs_inobt_ptr_t *rpp; /* address pointer for right block */
1348 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1349 xfs_btree_cur_t *tcur; /* temporary cursor */
1350 214
1351 /* 215 cur->bc_bufs[level] = NULL;
1352 * Set up variables for this block as "left". 216 cur->bc_nlevels--;
1353 */ 217
1354 lbp = cur->bc_bufs[level]; 218 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1355 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1356#ifdef DEBUG
1357 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1358 return error;
1359#endif
1360 /*
1361 * If we've got no right sibling then we can't shift an entry right.
1362 */
1363 if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
1364 *stat = 0;
1365 return 0;
1366 }
1367 /*
1368 * If the cursor entry is the one that would be moved, don't
1369 * do it... it's too complicated.
1370 */
1371 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1372 *stat = 0;
1373 return 0;
1374 }
1375 /*
1376 * Set up the right neighbor as "right".
1377 */
1378 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1379 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1380 0, &rbp, XFS_INO_BTREE_REF)))
1381 return error;
1382 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1383 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1384 return error;
1385 /*
1386 * If it's full, it can't take another entry.
1387 */
1388 if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1389 *stat = 0;
1390 return 0;
1391 }
1392 /*
1393 * Make a hole at the start of the right neighbor block, then
1394 * copy the last left block entry to the hole.
1395 */
1396 if (level > 0) {
1397 lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1398 lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1399 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1400 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1401#ifdef DEBUG
1402 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
1403 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
1404 return error;
1405 }
1406#endif
1407 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1408 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1409#ifdef DEBUG
1410 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
1411 return error;
1412#endif
1413 *rkp = *lkp;
1414 *rpp = *lpp;
1415 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1416 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1417 } else {
1418 lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1419 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1420 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1421 *rrp = *lrp;
1422 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1423 key.ir_startino = rrp->ir_startino;
1424 rkp = &key;
1425 }
1426 /*
1427 * Decrement and log left's numrecs, bump and log right's numrecs.
1428 */
1429 be16_add_cpu(&left->bb_numrecs, -1);
1430 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1431 be16_add_cpu(&right->bb_numrecs, 1);
1432#ifdef DEBUG
1433 if (level > 0)
1434 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1435 else
1436 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1437#endif
1438 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1439 /*
1440 * Using a temporary cursor, update the parent key values of the
1441 * block on the right.
1442 */
1443 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1444 return error;
1445 xfs_btree_lastrec(tcur, level);
1446 if ((error = xfs_inobt_increment(tcur, level, &i)) ||
1447 (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
1448 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1449 return error;
1450 }
1451 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1452 *stat = 1;
1453 return 0; 219 return 0;
1454} 220}
1455 221
1456/*
1457 * Split cur/level block in half.
1458 * Return new block number and its first record (to be inserted into parent).
1459 */
1460STATIC int /* error */
1461xfs_inobt_split(
1462 xfs_btree_cur_t *cur, /* btree cursor */
1463 int level, /* level to split */
1464 xfs_agblock_t *bnop, /* output: block number allocated */
1465 xfs_inobt_key_t *keyp, /* output: first key of new block */
1466 xfs_btree_cur_t **curp, /* output: new cursor */
1467 int *stat) /* success/failure */
1468{
1469 xfs_alloc_arg_t args; /* allocation argument structure */
1470 int error; /* error return value */
1471 int i; /* loop index/record number */
1472 xfs_agblock_t lbno; /* left (current) block number */
1473 xfs_buf_t *lbp; /* buffer for left block */
1474 xfs_inobt_block_t *left; /* left (current) btree block */
1475 xfs_inobt_key_t *lkp; /* left btree key pointer */
1476 xfs_inobt_ptr_t *lpp; /* left btree address pointer */
1477 xfs_inobt_rec_t *lrp; /* left btree record pointer */
1478 xfs_buf_t *rbp; /* buffer for right block */
1479 xfs_inobt_block_t *right; /* right (new) btree block */
1480 xfs_inobt_key_t *rkp; /* right btree key pointer */
1481 xfs_inobt_ptr_t *rpp; /* right btree address pointer */
1482 xfs_inobt_rec_t *rrp; /* right btree record pointer */
1483
1484 /*
1485 * Set up left block (current one).
1486 */
1487 lbp = cur->bc_bufs[level];
1488 args.tp = cur->bc_tp;
1489 args.mp = cur->bc_mp;
1490 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1491 /*
1492 * Allocate the new block.
1493 * If we can't do it, we're toast. Give up.
1494 */
1495 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
1496 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1497 args.isfl = args.userdata = args.minalignslop = 0;
1498 args.minlen = args.maxlen = args.prod = 1;
1499 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1500 if ((error = xfs_alloc_vextent(&args)))
1501 return error;
1502 if (args.fsbno == NULLFSBLOCK) {
1503 *stat = 0;
1504 return 0;
1505 }
1506 ASSERT(args.len == 1);
1507 rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1508 /*
1509 * Set up the new block as "right".
1510 */
1511 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1512 /*
1513 * "Left" is the current (according to the cursor) block.
1514 */
1515 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1516#ifdef DEBUG 222#ifdef DEBUG
1517 if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) 223STATIC int
1518 return error; 224xfs_inobt_keys_inorder(
1519#endif 225 struct xfs_btree_cur *cur,
1520 /* 226 union xfs_btree_key *k1,
1521 * Fill in the btree header for the new block. 227 union xfs_btree_key *k2)
1522 */ 228{
1523 right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); 229 return be32_to_cpu(k1->inobt.ir_startino) <
1524 right->bb_level = left->bb_level; 230 be32_to_cpu(k2->inobt.ir_startino);
1525 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1526 /*
1527 * Make sure that if there's an odd number of entries now, that
1528 * each new block will have the same number of entries.
1529 */
1530 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1531 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1532 be16_add_cpu(&right->bb_numrecs, 1);
1533 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1534 /*
1535 * For non-leaf blocks, copy keys and addresses over to the new block.
1536 */
1537 if (level > 0) {
1538 lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
1539 lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
1540 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1541 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1542#ifdef DEBUG
1543 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1544 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
1545 return error;
1546 }
1547#endif
1548 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1549 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1550 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1551 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1552 *keyp = *rkp;
1553 }
1554 /*
1555 * For leaf blocks, copy records over to the new block.
1556 */
1557 else {
1558 lrp = XFS_INOBT_REC_ADDR(left, i, cur);
1559 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1560 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1561 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1562 keyp->ir_startino = rrp->ir_startino;
1563 }
1564 /*
1565 * Find the left block number by looking in the buffer.
1566 * Adjust numrecs, sibling pointers.
1567 */
1568 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1569 right->bb_rightsib = left->bb_rightsib;
1570 left->bb_rightsib = cpu_to_be32(args.agbno);
1571 right->bb_leftsib = cpu_to_be32(lbno);
1572 xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
1573 xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1574 /*
1575 * If there's a block to the new block's right, make that block
1576 * point back to right instead of to left.
1577 */
1578 if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
1579 xfs_inobt_block_t *rrblock; /* rr btree block */
1580 xfs_buf_t *rrbp; /* buffer for rrblock */
1581
1582 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1583 be32_to_cpu(right->bb_rightsib), 0, &rrbp,
1584 XFS_INO_BTREE_REF)))
1585 return error;
1586 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
1587 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1588 return error;
1589 rrblock->bb_leftsib = cpu_to_be32(args.agbno);
1590 xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
1591 }
1592 /*
1593 * If the cursor is really in the right block, move it there.
1594 * If it's just pointing past the last entry in left, then we'll
1595 * insert there, so don't change anything in that case.
1596 */
1597 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
1598 xfs_btree_setbuf(cur, level, rbp);
1599 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
1600 }
1601 /*
1602 * If there are more levels, we'll need another cursor which refers
1603 * the right block, no matter where this cursor was.
1604 */
1605 if (level + 1 < cur->bc_nlevels) {
1606 if ((error = xfs_btree_dup_cursor(cur, curp)))
1607 return error;
1608 (*curp)->bc_ptrs[level + 1]++;
1609 }
1610 *bnop = args.agbno;
1611 *stat = 1;
1612 return 0;
1613} 231}
1614 232
1615/* 233STATIC int
1616 * Update keys at all levels from here to the root along the cursor's path. 234xfs_inobt_recs_inorder(
1617 */ 235 struct xfs_btree_cur *cur,
1618STATIC int /* error */ 236 union xfs_btree_rec *r1,
1619xfs_inobt_updkey( 237 union xfs_btree_rec *r2)
1620 xfs_btree_cur_t *cur, /* btree cursor */
1621 xfs_inobt_key_t *keyp, /* new key value to update to */
1622 int level) /* starting level for update */
1623{ 238{
1624 int ptr; /* index of key in block */ 239 return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
1625 240 be32_to_cpu(r2->inobt.ir_startino);
1626 /*
1627 * Go up the tree from this level toward the root.
1628 * At each level, update the key value to the value input.
1629 * Stop when we reach a level where the cursor isn't pointing
1630 * at the first entry in the block.
1631 */
1632 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1633 xfs_buf_t *bp; /* buffer for block */
1634 xfs_inobt_block_t *block; /* btree block */
1635#ifdef DEBUG
1636 int error; /* error return value */
1637#endif
1638 xfs_inobt_key_t *kp; /* ptr to btree block keys */
1639
1640 bp = cur->bc_bufs[level];
1641 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1642#ifdef DEBUG
1643 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1644 return error;
1645#endif
1646 ptr = cur->bc_ptrs[level];
1647 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
1648 *kp = *keyp;
1649 xfs_inobt_log_keys(cur, bp, ptr, ptr);
1650 }
1651 return 0;
1652} 241}
242#endif /* DEBUG */
1653 243
1654/* 244#ifdef XFS_BTREE_TRACE
1655 * Externally visible routines. 245ktrace_t *xfs_inobt_trace_buf;
1656 */
1657 246
1658/* 247STATIC void
1659 * Decrement cursor by one record at the level. 248xfs_inobt_trace_enter(
1660 * For nonzero levels the leaf-ward information is untouched. 249 struct xfs_btree_cur *cur,
1661 */ 250 const char *func,
1662int /* error */ 251 char *s,
1663xfs_inobt_decrement( 252 int type,
1664 xfs_btree_cur_t *cur, /* btree cursor */ 253 int line,
1665 int level, /* level in btree, 0 is leaf */ 254 __psunsigned_t a0,
1666 int *stat) /* success/failure */ 255 __psunsigned_t a1,
256 __psunsigned_t a2,
257 __psunsigned_t a3,
258 __psunsigned_t a4,
259 __psunsigned_t a5,
260 __psunsigned_t a6,
261 __psunsigned_t a7,
262 __psunsigned_t a8,
263 __psunsigned_t a9,
264 __psunsigned_t a10)
1667{ 265{
1668 xfs_inobt_block_t *block; /* btree block */ 266 ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type,
1669 int error; 267 (void *)func, (void *)s, NULL, (void *)cur,
1670 int lev; /* btree level */ 268 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
1671 269 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
1672 ASSERT(level < cur->bc_nlevels); 270 (void *)a8, (void *)a9, (void *)a10);
1673 /*
1674 * Read-ahead to the left at this level.
1675 */
1676 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1677 /*
1678 * Decrement the ptr at this level. If we're still in the block
1679 * then we're done.
1680 */
1681 if (--cur->bc_ptrs[level] > 0) {
1682 *stat = 1;
1683 return 0;
1684 }
1685 /*
1686 * Get a pointer to the btree block.
1687 */
1688 block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
1689#ifdef DEBUG
1690 if ((error = xfs_btree_check_sblock(cur, block, level,
1691 cur->bc_bufs[level])))
1692 return error;
1693#endif
1694 /*
1695 * If we just went off the left edge of the tree, return failure.
1696 */
1697 if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
1698 *stat = 0;
1699 return 0;
1700 }
1701 /*
1702 * March up the tree decrementing pointers.
1703 * Stop when we don't go off the left edge of a block.
1704 */
1705 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1706 if (--cur->bc_ptrs[lev] > 0)
1707 break;
1708 /*
1709 * Read-ahead the left block, we're going to read it
1710 * in the next loop.
1711 */
1712 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1713 }
1714 /*
1715 * If we went off the root then we are seriously confused.
1716 */
1717 ASSERT(lev < cur->bc_nlevels);
1718 /*
1719 * Now walk back down the tree, fixing up the cursor's buffer
1720 * pointers and key numbers.
1721 */
1722 for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1723 xfs_agblock_t agbno; /* block number of btree block */
1724 xfs_buf_t *bp; /* buffer containing btree block */
1725
1726 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1727 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1728 cur->bc_private.a.agno, agbno, 0, &bp,
1729 XFS_INO_BTREE_REF)))
1730 return error;
1731 lev--;
1732 xfs_btree_setbuf(cur, lev, bp);
1733 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1734 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1735 return error;
1736 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1737 }
1738 *stat = 1;
1739 return 0;
1740} 271}
1741 272
1742/* 273STATIC void
1743 * Delete the record pointed to by cur. 274xfs_inobt_trace_cursor(
1744 * The cursor refers to the place where the record was (could be inserted) 275 struct xfs_btree_cur *cur,
1745 * when the operation returns. 276 __uint32_t *s0,
1746 */ 277 __uint64_t *l0,
1747int /* error */ 278 __uint64_t *l1)
1748xfs_inobt_delete(
1749 xfs_btree_cur_t *cur, /* btree cursor */
1750 int *stat) /* success/failure */
1751{ 279{
1752 int error; 280 *s0 = cur->bc_private.a.agno;
1753 int i; /* result code */ 281 *l0 = cur->bc_rec.i.ir_startino;
1754 int level; /* btree level */ 282 *l1 = cur->bc_rec.i.ir_free;
1755
1756 /*
1757 * Go up the tree, starting at leaf level.
1758 * If 2 is returned then a join was done; go to the next level.
1759 * Otherwise we are done.
1760 */
1761 for (level = 0, i = 2; i == 2; level++) {
1762 if ((error = xfs_inobt_delrec(cur, level, &i)))
1763 return error;
1764 }
1765 if (i == 0) {
1766 for (level = 1; level < cur->bc_nlevels; level++) {
1767 if (cur->bc_ptrs[level] == 0) {
1768 if ((error = xfs_inobt_decrement(cur, level, &i)))
1769 return error;
1770 break;
1771 }
1772 }
1773 }
1774 *stat = i;
1775 return 0;
1776} 283}
1777 284
1778 285STATIC void
1779/* 286xfs_inobt_trace_key(
1780 * Get the data from the pointed-to record. 287 struct xfs_btree_cur *cur,
1781 */ 288 union xfs_btree_key *key,
1782int /* error */ 289 __uint64_t *l0,
1783xfs_inobt_get_rec( 290 __uint64_t *l1)
1784 xfs_btree_cur_t *cur, /* btree cursor */
1785 xfs_agino_t *ino, /* output: starting inode of chunk */
1786 __int32_t *fcnt, /* output: number of free inodes */
1787 xfs_inofree_t *free, /* output: free inode mask */
1788 int *stat) /* output: success/failure */
1789{ 291{
1790 xfs_inobt_block_t *block; /* btree block */ 292 *l0 = be32_to_cpu(key->inobt.ir_startino);
1791 xfs_buf_t *bp; /* buffer containing btree block */ 293 *l1 = 0;
1792#ifdef DEBUG
1793 int error; /* error return value */
1794#endif
1795 int ptr; /* record number */
1796 xfs_inobt_rec_t *rec; /* record data */
1797
1798 bp = cur->bc_bufs[0];
1799 ptr = cur->bc_ptrs[0];
1800 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1801#ifdef DEBUG
1802 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
1803 return error;
1804#endif
1805 /*
1806 * Off the right end or left end, return failure.
1807 */
1808 if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
1809 *stat = 0;
1810 return 0;
1811 }
1812 /*
1813 * Point to the record and extract its data.
1814 */
1815 rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
1816 *ino = be32_to_cpu(rec->ir_startino);
1817 *fcnt = be32_to_cpu(rec->ir_freecount);
1818 *free = be64_to_cpu(rec->ir_free);
1819 *stat = 1;
1820 return 0;
1821} 294}
1822 295
1823/* 296STATIC void
1824 * Increment cursor by one record at the level. 297xfs_inobt_trace_record(
1825 * For nonzero levels the leaf-ward information is untouched. 298 struct xfs_btree_cur *cur,
1826 */ 299 union xfs_btree_rec *rec,
1827int /* error */ 300 __uint64_t *l0,
1828xfs_inobt_increment( 301 __uint64_t *l1,
1829 xfs_btree_cur_t *cur, /* btree cursor */ 302 __uint64_t *l2)
1830 int level, /* level in btree, 0 is leaf */
1831 int *stat) /* success/failure */
1832{ 303{
1833 xfs_inobt_block_t *block; /* btree block */ 304 *l0 = be32_to_cpu(rec->inobt.ir_startino);
1834 xfs_buf_t *bp; /* buffer containing btree block */ 305 *l1 = be32_to_cpu(rec->inobt.ir_freecount);
1835 int error; /* error return value */ 306 *l2 = be64_to_cpu(rec->inobt.ir_free);
1836 int lev; /* btree level */ 307}
308#endif /* XFS_BTREE_TRACE */
309
310static const struct xfs_btree_ops xfs_inobt_ops = {
311 .rec_len = sizeof(xfs_inobt_rec_t),
312 .key_len = sizeof(xfs_inobt_key_t),
313
314 .dup_cursor = xfs_inobt_dup_cursor,
315 .set_root = xfs_inobt_set_root,
316 .kill_root = xfs_inobt_kill_root,
317 .alloc_block = xfs_inobt_alloc_block,
318 .free_block = xfs_inobt_free_block,
319 .get_minrecs = xfs_inobt_get_minrecs,
320 .get_maxrecs = xfs_inobt_get_maxrecs,
321 .init_key_from_rec = xfs_inobt_init_key_from_rec,
322 .init_rec_from_key = xfs_inobt_init_rec_from_key,
323 .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
324 .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
325 .key_diff = xfs_inobt_key_diff,
1837 326
1838 ASSERT(level < cur->bc_nlevels);
1839 /*
1840 * Read-ahead to the right at this level.
1841 */
1842 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1843 /*
1844 * Get a pointer to the btree block.
1845 */
1846 bp = cur->bc_bufs[level];
1847 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1848#ifdef DEBUG
1849 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1850 return error;
1851#endif
1852 /*
1853 * Increment the ptr at this level. If we're still in the block
1854 * then we're done.
1855 */
1856 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
1857 *stat = 1;
1858 return 0;
1859 }
1860 /*
1861 * If we just went off the right edge of the tree, return failure.
1862 */
1863 if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
1864 *stat = 0;
1865 return 0;
1866 }
1867 /*
1868 * March up the tree incrementing pointers.
1869 * Stop when we don't go off the right edge of a block.
1870 */
1871 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1872 bp = cur->bc_bufs[lev];
1873 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1874#ifdef DEBUG 327#ifdef DEBUG
1875 if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) 328 .keys_inorder = xfs_inobt_keys_inorder,
1876 return error; 329 .recs_inorder = xfs_inobt_recs_inorder,
1877#endif 330#endif
1878 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
1879 break;
1880 /*
1881 * Read-ahead the right block, we're going to read it
1882 * in the next loop.
1883 */
1884 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1885 }
1886 /*
1887 * If we went off the root then we are seriously confused.
1888 */
1889 ASSERT(lev < cur->bc_nlevels);
1890 /*
1891 * Now walk back down the tree, fixing up the cursor's buffer
1892 * pointers and key numbers.
1893 */
1894 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
1895 lev > level; ) {
1896 xfs_agblock_t agbno; /* block number of btree block */
1897 331
1898 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 332#ifdef XFS_BTREE_TRACE
1899 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 333 .trace_enter = xfs_inobt_trace_enter,
1900 cur->bc_private.a.agno, agbno, 0, &bp, 334 .trace_cursor = xfs_inobt_trace_cursor,
1901 XFS_INO_BTREE_REF))) 335 .trace_key = xfs_inobt_trace_key,
1902 return error; 336 .trace_record = xfs_inobt_trace_record,
1903 lev--; 337#endif
1904 xfs_btree_setbuf(cur, lev, bp); 338};
1905 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1906 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1907 return error;
1908 cur->bc_ptrs[lev] = 1;
1909 }
1910 *stat = 1;
1911 return 0;
1912}
1913 339
1914/* 340/*
1915 * Insert the current record at the point referenced by cur. 341 * Allocate a new inode btree cursor.
1916 * The cursor may be inconsistent on return if splits have been done.
1917 */ 342 */
1918int /* error */ 343struct xfs_btree_cur * /* new inode btree cursor */
1919xfs_inobt_insert( 344xfs_inobt_init_cursor(
1920 xfs_btree_cur_t *cur, /* btree cursor */ 345 struct xfs_mount *mp, /* file system mount point */
1921 int *stat) /* success/failure */ 346 struct xfs_trans *tp, /* transaction pointer */
347 struct xfs_buf *agbp, /* buffer for agi structure */
348 xfs_agnumber_t agno) /* allocation group number */
1922{ 349{
1923 int error; /* error return value */ 350 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
1924 int i; /* result value, 0 for failure */ 351 struct xfs_btree_cur *cur;
1925 int level; /* current level number in btree */
1926 xfs_agblock_t nbno; /* new block number (split result) */
1927 xfs_btree_cur_t *ncur; /* new cursor (split result) */
1928 xfs_inobt_rec_t nrec; /* record being inserted this level */
1929 xfs_btree_cur_t *pcur; /* previous level's cursor */
1930 352
1931 level = 0; 353 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
1932 nbno = NULLAGBLOCK;
1933 nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
1934 nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
1935 nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
1936 ncur = NULL;
1937 pcur = cur;
1938 /*
1939 * Loop going up the tree, starting at the leaf level.
1940 * Stop when we don't get a split block, that must mean that
1941 * the insert is finished with this level.
1942 */
1943 do {
1944 /*
1945 * Insert nrec/nbno into this level of the tree.
1946 * Note if we fail, nbno will be null.
1947 */
1948 if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
1949 &i))) {
1950 if (pcur != cur)
1951 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
1952 return error;
1953 }
1954 /*
1955 * See if the cursor we just used is trash.
1956 * Can't trash the caller's cursor, but otherwise we should
1957 * if ncur is a new cursor or we're about to be done.
1958 */
1959 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
1960 cur->bc_nlevels = pcur->bc_nlevels;
1961 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
1962 }
1963 /*
1964 * If we got a new cursor, switch to it.
1965 */
1966 if (ncur) {
1967 pcur = ncur;
1968 ncur = NULL;
1969 }
1970 } while (nbno != NULLAGBLOCK);
1971 *stat = i;
1972 return 0;
1973}
1974 354
1975/* 355 cur->bc_tp = tp;
1976 * Lookup the record equal to ino in the btree given by cur. 356 cur->bc_mp = mp;
1977 */ 357 cur->bc_nlevels = be32_to_cpu(agi->agi_level);
1978int /* error */ 358 cur->bc_btnum = XFS_BTNUM_INO;
1979xfs_inobt_lookup_eq( 359 cur->bc_blocklog = mp->m_sb.sb_blocklog;
1980 xfs_btree_cur_t *cur, /* btree cursor */
1981 xfs_agino_t ino, /* starting inode of chunk */
1982 __int32_t fcnt, /* free inode count */
1983 xfs_inofree_t free, /* free inode mask */
1984 int *stat) /* success/failure */
1985{
1986 cur->bc_rec.i.ir_startino = ino;
1987 cur->bc_rec.i.ir_freecount = fcnt;
1988 cur->bc_rec.i.ir_free = free;
1989 return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
1990}
1991 360
1992/* 361 cur->bc_ops = &xfs_inobt_ops;
1993 * Lookup the first record greater than or equal to ino
1994 * in the btree given by cur.
1995 */
1996int /* error */
1997xfs_inobt_lookup_ge(
1998 xfs_btree_cur_t *cur, /* btree cursor */
1999 xfs_agino_t ino, /* starting inode of chunk */
2000 __int32_t fcnt, /* free inode count */
2001 xfs_inofree_t free, /* free inode mask */
2002 int *stat) /* success/failure */
2003{
2004 cur->bc_rec.i.ir_startino = ino;
2005 cur->bc_rec.i.ir_freecount = fcnt;
2006 cur->bc_rec.i.ir_free = free;
2007 return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
2008}
2009 362
2010/* 363 cur->bc_private.a.agbp = agbp;
2011 * Lookup the first record less than or equal to ino 364 cur->bc_private.a.agno = agno;
2012 * in the btree given by cur. 365
2013 */ 366 return cur;
2014int /* error */
2015xfs_inobt_lookup_le(
2016 xfs_btree_cur_t *cur, /* btree cursor */
2017 xfs_agino_t ino, /* starting inode of chunk */
2018 __int32_t fcnt, /* free inode count */
2019 xfs_inofree_t free, /* free inode mask */
2020 int *stat) /* success/failure */
2021{
2022 cur->bc_rec.i.ir_startino = ino;
2023 cur->bc_rec.i.ir_freecount = fcnt;
2024 cur->bc_rec.i.ir_free = free;
2025 return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
2026} 367}
2027 368
2028/* 369/*
2029 * Update the record referred to by cur, to the value given 370 * Calculate number of records in an inobt btree block.
2030 * by [ino, fcnt, free].
2031 * This either works (return 0) or gets an EFSCORRUPTED error.
2032 */ 371 */
2033int /* error */ 372int
2034xfs_inobt_update( 373xfs_inobt_maxrecs(
2035 xfs_btree_cur_t *cur, /* btree cursor */ 374 struct xfs_mount *mp,
2036 xfs_agino_t ino, /* starting inode of chunk */ 375 int blocklen,
2037 __int32_t fcnt, /* free inode count */ 376 int leaf)
2038 xfs_inofree_t free) /* free inode mask */
2039{ 377{
2040 xfs_inobt_block_t *block; /* btree block to update */ 378 blocklen -= XFS_INOBT_BLOCK_LEN(mp);
2041 xfs_buf_t *bp; /* buffer containing btree block */
2042 int error; /* error return value */
2043 int ptr; /* current record number (updating) */
2044 xfs_inobt_rec_t *rp; /* pointer to updated record */
2045 379
2046 /* 380 if (leaf)
2047 * Pick up the current block. 381 return blocklen / sizeof(xfs_inobt_rec_t);
2048 */ 382 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
2049 bp = cur->bc_bufs[0];
2050 block = XFS_BUF_TO_INOBT_BLOCK(bp);
2051#ifdef DEBUG
2052 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
2053 return error;
2054#endif
2055 /*
2056 * Get the address of the rec to be updated.
2057 */
2058 ptr = cur->bc_ptrs[0];
2059 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
2060 /*
2061 * Fill in the new contents and log them.
2062 */
2063 rp->ir_startino = cpu_to_be32(ino);
2064 rp->ir_freecount = cpu_to_be32(fcnt);
2065 rp->ir_free = cpu_to_be64(free);
2066 xfs_inobt_log_recs(cur, bp, ptr, ptr);
2067 /*
2068 * Updating first record in leaf. Pass new key value up to our parent.
2069 */
2070 if (ptr == 1) {
2071 xfs_inobt_key_t key; /* key containing [ino] */
2072
2073 key.ir_startino = cpu_to_be32(ino);
2074 if ((error = xfs_inobt_updkey(cur, &key, 1)))
2075 return error;
2076 }
2077 return 0;
2078} 383}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 8efc4a5b8b92..37e5dd01a577 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -24,7 +24,6 @@
24 24
25struct xfs_buf; 25struct xfs_buf;
26struct xfs_btree_cur; 26struct xfs_btree_cur;
27struct xfs_btree_sblock;
28struct xfs_mount; 27struct xfs_mount;
29 28
30/* 29/*
@@ -70,11 +69,6 @@ typedef struct xfs_inobt_key {
70/* btree pointer type */ 69/* btree pointer type */
71typedef __be32 xfs_inobt_ptr_t; 70typedef __be32 xfs_inobt_ptr_t;
72 71
73/* btree block header type */
74typedef struct xfs_btree_sblock xfs_inobt_block_t;
75
76#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)XFS_BUF_PTR(bp))
77
78/* 72/*
79 * Bit manipulations for ir_free. 73 * Bit manipulations for ir_free.
80 */ 74 */
@@ -85,14 +79,6 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
85#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) 79#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
86 80
87/* 81/*
88 * Real block structures have a size equal to the disk block size.
89 */
90#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0])
91#define XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0])
92#define XFS_INOBT_IS_LAST_REC(cur) \
93 ((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs))
94
95/*
96 * Maximum number of inode btree levels. 82 * Maximum number of inode btree levels.
97 */ 83 */
98#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels) 84#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
@@ -104,75 +90,38 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
104#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) 90#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
105 91
106/* 92/*
107 * Record, key, and pointer address macros for btree blocks. 93 * Btree block header size depends on a superblock flag.
108 */ 94 *
109#define XFS_INOBT_REC_ADDR(bb,i,cur) \ 95 * (not quite yet, but soon)
110 (XFS_BTREE_REC_ADDR(xfs_inobt, bb, i))
111
112#define XFS_INOBT_KEY_ADDR(bb,i,cur) \
113 (XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i))
114
115#define XFS_INOBT_PTR_ADDR(bb,i,cur) \
116 (XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \
117 i, XFS_INOBT_BLOCK_MAXRECS(1, cur)))
118
119/*
120 * Decrement cursor by one record at the level.
121 * For nonzero levels the leaf-ward information is untouched.
122 */
123extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat);
124
125/*
126 * Delete the record pointed to by cur.
127 * The cursor refers to the place where the record was (could be inserted)
128 * when the operation returns.
129 */
130extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat);
131
132/*
133 * Get the data from the pointed-to record.
134 */
135extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
136 __int32_t *fcnt, xfs_inofree_t *free, int *stat);
137
138/*
139 * Increment cursor by one record at the level.
140 * For nonzero levels the leaf-ward information is untouched.
141 */
142extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat);
143
144/*
145 * Insert the current record at the point referenced by cur.
146 * The cursor may be inconsistent on return if splits have been done.
147 */
148extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat);
149
150/*
151 * Lookup the record equal to ino in the btree given by cur.
152 */
153extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino,
154 __int32_t fcnt, xfs_inofree_t free, int *stat);
155
156/*
157 * Lookup the first record greater than or equal to ino
158 * in the btree given by cur.
159 */
160extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
161 __int32_t fcnt, xfs_inofree_t free, int *stat);
162
163/*
164 * Lookup the first record less than or equal to ino
165 * in the btree given by cur.
166 */ 96 */
167extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino, 97#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
168 __int32_t fcnt, xfs_inofree_t free, int *stat);
169 98
170/* 99/*
171 * Update the record referred to by cur, to the value given 100 * Record, key, and pointer address macros for btree blocks.
172 * by [ino, fcnt, free]. 101 *
173 * This either works (return 0) or gets an EFSCORRUPTED error. 102 * (note that some of these may appear unused, but they are used in userspace)
174 */ 103 */
175extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino, 104#define XFS_INOBT_REC_ADDR(mp, block, index) \
176 __int32_t fcnt, xfs_inofree_t free); 105 ((xfs_inobt_rec_t *) \
106 ((char *)(block) + \
107 XFS_INOBT_BLOCK_LEN(mp) + \
108 (((index) - 1) * sizeof(xfs_inobt_rec_t))))
109
110#define XFS_INOBT_KEY_ADDR(mp, block, index) \
111 ((xfs_inobt_key_t *) \
112 ((char *)(block) + \
113 XFS_INOBT_BLOCK_LEN(mp) + \
114 ((index) - 1) * sizeof(xfs_inobt_key_t)))
115
116#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
117 ((xfs_inobt_ptr_t *) \
118 ((char *)(block) + \
119 XFS_INOBT_BLOCK_LEN(mp) + \
120 (maxrecs) * sizeof(xfs_inobt_key_t) + \
121 ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
122
123extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
124 struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
125extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
177 126
178#endif /* __XFS_IALLOC_BTREE_H__ */ 127#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e229e9e001c2..e2fb6210d4c5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -38,281 +38,283 @@
38#include "xfs_ialloc.h" 38#include "xfs_ialloc.h"
39#include "xfs_quota.h" 39#include "xfs_quota.h"
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_trans_priv.h"
42#include "xfs_inode_item.h"
43#include "xfs_bmap.h"
44#include "xfs_btree_trace.h"
45#include "xfs_dir2_trace.h"
46
41 47
42/* 48/*
43 * Look up an inode by number in the given file system. 49 * Allocate and initialise an xfs_inode.
44 * The inode is looked up in the cache held in each AG.
45 * If the inode is found in the cache, attach it to the provided
46 * vnode.
47 *
48 * If it is not in core, read it in from the file system's device,
49 * add it to the cache and attach the provided vnode.
50 *
51 * The inode is locked according to the value of the lock_flags parameter.
52 * This flag parameter indicates how and if the inode's IO lock and inode lock
53 * should be taken.
54 *
55 * mp -- the mount point structure for the current file system. It points
56 * to the inode hash table.
57 * tp -- a pointer to the current transaction if there is one. This is
58 * simply passed through to the xfs_iread() call.
59 * ino -- the number of the inode desired. This is the unique identifier
60 * within the file system for the inode being requested.
61 * lock_flags -- flags indicating how to lock the inode. See the comment
62 * for xfs_ilock() for a list of valid values.
63 * bno -- the block number starting the buffer containing the inode,
64 * if known (as by bulkstat), else 0.
65 */ 50 */
66STATIC int 51STATIC struct xfs_inode *
67xfs_iget_core( 52xfs_inode_alloc(
68 struct inode *inode, 53 struct xfs_mount *mp,
69 xfs_mount_t *mp, 54 xfs_ino_t ino)
70 xfs_trans_t *tp,
71 xfs_ino_t ino,
72 uint flags,
73 uint lock_flags,
74 xfs_inode_t **ipp,
75 xfs_daddr_t bno)
76{ 55{
77 struct inode *old_inode; 56 struct xfs_inode *ip;
78 xfs_inode_t *ip;
79 xfs_inode_t *iq;
80 int error;
81 unsigned long first_index, mask;
82 xfs_perag_t *pag;
83 xfs_agino_t agino;
84 57
85 /* the radix tree exists only in inode capable AGs */ 58 /*
86 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) 59 * if this didn't occur in transactions, we could use
87 return EINVAL; 60 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
61 * code up to do this anyway.
62 */
63 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
64 if (!ip)
65 return NULL;
88 66
89 /* get the perag structure and ensure that it's inode capable */ 67 ASSERT(atomic_read(&ip->i_iocount) == 0);
90 pag = xfs_get_perag(mp, ino); 68 ASSERT(atomic_read(&ip->i_pincount) == 0);
91 if (!pag->pagi_inodeok) 69 ASSERT(!spin_is_locked(&ip->i_flags_lock));
92 return EINVAL; 70 ASSERT(completion_done(&ip->i_flush));
93 ASSERT(pag->pag_ici_init);
94 agino = XFS_INO_TO_AGINO(mp, ino);
95 71
96again: 72 /*
97 read_lock(&pag->pag_ici_lock); 73 * initialise the VFS inode here to get failures
98 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 74 * out of the way early.
75 */
76 if (!inode_init_always(mp->m_super, VFS_I(ip))) {
77 kmem_zone_free(xfs_inode_zone, ip);
78 return NULL;
79 }
80
81 /* initialise the xfs inode */
82 ip->i_ino = ino;
83 ip->i_mount = mp;
84 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
85 ip->i_afp = NULL;
86 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
87 ip->i_flags = 0;
88 ip->i_update_core = 0;
89 ip->i_update_size = 0;
90 ip->i_delayed_blks = 0;
91 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
92 ip->i_size = 0;
93 ip->i_new_size = 0;
94
95 /*
96 * Initialize inode's trace buffers.
97 */
98#ifdef XFS_INODE_TRACE
99 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
100#endif
101#ifdef XFS_BMAP_TRACE
102 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
103#endif
104#ifdef XFS_BTREE_TRACE
105 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
106#endif
107#ifdef XFS_RW_TRACE
108 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
109#endif
110#ifdef XFS_ILOCK_TRACE
111 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
112#endif
113#ifdef XFS_DIR2_TRACE
114 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
115#endif
116
117 return ip;
118}
119
120/*
121 * Check the validity of the inode we just found it the cache
122 */
123static int
124xfs_iget_cache_hit(
125 struct xfs_perag *pag,
126 struct xfs_inode *ip,
127 int flags,
128 int lock_flags) __releases(pag->pag_ici_lock)
129{
130 struct xfs_mount *mp = ip->i_mount;
131 int error = EAGAIN;
132
133 /*
134 * If INEW is set this inode is being set up
135 * If IRECLAIM is set this inode is being torn down
136 * Pause and try again.
137 */
138 if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
139 XFS_STATS_INC(xs_ig_frecycle);
140 goto out_error;
141 }
142
143 /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
144 if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
99 145
100 if (ip != NULL) {
101 /* 146 /*
102 * If INEW is set this inode is being set up 147 * If lookup is racing with unlink, then we should return an
103 * we need to pause and try again. 148 * error immediately so we don't remove it from the reclaim
149 * list and potentially leak the inode.
104 */ 150 */
105 if (xfs_iflags_test(ip, XFS_INEW)) { 151 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
106 read_unlock(&pag->pag_ici_lock); 152 error = ENOENT;
107 delay(1); 153 goto out_error;
108 XFS_STATS_INC(xs_ig_frecycle);
109
110 goto again;
111 } 154 }
112 155
113 old_inode = ip->i_vnode; 156 xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
114 if (old_inode == NULL) {
115 /*
116 * If IRECLAIM is set this inode is
117 * on its way out of the system,
118 * we need to pause and try again.
119 */
120 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
121 read_unlock(&pag->pag_ici_lock);
122 delay(1);
123 XFS_STATS_INC(xs_ig_frecycle);
124
125 goto again;
126 }
127 ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
128
129 /*
130 * If lookup is racing with unlink, then we
131 * should return an error immediately so we
132 * don't remove it from the reclaim list and
133 * potentially leak the inode.
134 */
135 if ((ip->i_d.di_mode == 0) &&
136 !(flags & XFS_IGET_CREATE)) {
137 read_unlock(&pag->pag_ici_lock);
138 xfs_put_perag(mp, pag);
139 return ENOENT;
140 }
141
142 xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
143
144 XFS_STATS_INC(xs_ig_found);
145 xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
146 read_unlock(&pag->pag_ici_lock);
147
148 XFS_MOUNT_ILOCK(mp);
149 list_del_init(&ip->i_reclaim);
150 XFS_MOUNT_IUNLOCK(mp);
151
152 goto finish_inode;
153
154 } else if (inode != old_inode) {
155 /* The inode is being torn down, pause and
156 * try again.
157 */
158 if (old_inode->i_state & (I_FREEING | I_CLEAR)) {
159 read_unlock(&pag->pag_ici_lock);
160 delay(1);
161 XFS_STATS_INC(xs_ig_frecycle);
162
163 goto again;
164 }
165/* Chances are the other vnode (the one in the inode) is being torn
166* down right now, and we landed on top of it. Question is, what do
167* we do? Unhook the old inode and hook up the new one?
168*/
169 cmn_err(CE_PANIC,
170 "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
171 old_inode, inode);
172 }
173 157
174 /* 158 /*
175 * Inode cache hit 159 * We need to re-initialise the VFS inode as it has been
160 * 'freed' by the VFS. Do this here so we can deal with
161 * errors cleanly, then tag it so it can be set up correctly
162 * later.
176 */ 163 */
177 read_unlock(&pag->pag_ici_lock); 164 if (!inode_init_always(mp->m_super, VFS_I(ip))) {
178 XFS_STATS_INC(xs_ig_found); 165 error = ENOMEM;
179 166 goto out_error;
180finish_inode:
181 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
182 xfs_put_perag(mp, pag);
183 return ENOENT;
184 } 167 }
185 168
186 if (lock_flags != 0) 169 /*
187 xfs_ilock(ip, lock_flags); 170 * We must set the XFS_INEW flag before clearing the
171 * XFS_IRECLAIMABLE flag so that if a racing lookup does
172 * not find the XFS_IRECLAIMABLE above but has the igrab()
173 * below succeed we can safely check XFS_INEW to detect
174 * that this inode is still being initialised.
175 */
176 xfs_iflags_set(ip, XFS_INEW);
177 xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
178
179 /* clear the radix tree reclaim flag as well. */
180 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
181 } else if (!igrab(VFS_I(ip))) {
182 /* If the VFS inode is being torn down, pause and try again. */
183 XFS_STATS_INC(xs_ig_frecycle);
184 goto out_error;
185 } else if (xfs_iflags_test(ip, XFS_INEW)) {
186 /*
187 * We are racing with another cache hit that is
188 * currently recycling this inode out of the XFS_IRECLAIMABLE
189 * state. Wait for the initialisation to complete before
190 * continuing.
191 */
192 wait_on_inode(VFS_I(ip));
193 }
188 194
189 xfs_iflags_clear(ip, XFS_ISTALE); 195 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
190 xfs_itrace_exit_tag(ip, "xfs_iget.found"); 196 error = ENOENT;
191 goto return_ip; 197 iput(VFS_I(ip));
198 goto out_error;
192 } 199 }
193 200
194 /* 201 /* We've got a live one. */
195 * Inode cache miss
196 */
197 read_unlock(&pag->pag_ici_lock); 202 read_unlock(&pag->pag_ici_lock);
198 XFS_STATS_INC(xs_ig_missed);
199 203
200 /* 204 if (lock_flags != 0)
201 * Read the disk inode attributes into a new inode structure and get 205 xfs_ilock(ip, lock_flags);
202 * a new vnode for it. This should also initialize i_ino and i_mount.
203 */
204 error = xfs_iread(mp, tp, ino, &ip, bno,
205 (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
206 if (error) {
207 xfs_put_perag(mp, pag);
208 return error;
209 }
210 206
211 xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); 207 xfs_iflags_clear(ip, XFS_ISTALE);
208 xfs_itrace_exit_tag(ip, "xfs_iget.found");
209 XFS_STATS_INC(xs_ig_found);
210 return 0;
211
212out_error:
213 read_unlock(&pag->pag_ici_lock);
214 return error;
215}
212 216
213 217
214 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 218static int
215 "xfsino", ip->i_ino); 219xfs_iget_cache_miss(
216 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 220 struct xfs_mount *mp,
217 init_waitqueue_head(&ip->i_ipin_wait); 221 struct xfs_perag *pag,
218 atomic_set(&ip->i_pincount, 0); 222 xfs_trans_t *tp,
223 xfs_ino_t ino,
224 struct xfs_inode **ipp,
225 xfs_daddr_t bno,
226 int flags,
227 int lock_flags) __releases(pag->pag_ici_lock)
228{
229 struct xfs_inode *ip;
230 int error;
231 unsigned long first_index, mask;
232 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
219 233
220 /* 234 ip = xfs_inode_alloc(mp, ino);
221 * Because we want to use a counting completion, complete 235 if (!ip)
222 * the flush completion once to allow a single access to 236 return ENOMEM;
223 * the flush completion without blocking.
224 */
225 init_completion(&ip->i_flush);
226 complete(&ip->i_flush);
227 237
228 if (lock_flags) 238 error = xfs_iread(mp, tp, ip, bno, flags);
229 xfs_ilock(ip, lock_flags); 239 if (error)
240 goto out_destroy;
241
242 xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
230 243
231 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 244 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
232 xfs_idestroy(ip); 245 error = ENOENT;
233 xfs_put_perag(mp, pag); 246 goto out_destroy;
234 return ENOENT;
235 } 247 }
236 248
249 if (lock_flags)
250 xfs_ilock(ip, lock_flags);
251
237 /* 252 /*
238 * Preload the radix tree so we can insert safely under the 253 * Preload the radix tree so we can insert safely under the
239 * write spinlock. 254 * write spinlock. Note that we cannot sleep inside the preload
255 * region.
240 */ 256 */
241 if (radix_tree_preload(GFP_KERNEL)) { 257 if (radix_tree_preload(GFP_KERNEL)) {
242 xfs_idestroy(ip); 258 error = EAGAIN;
243 delay(1); 259 goto out_unlock;
244 goto again;
245 } 260 }
261
246 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 262 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
247 first_index = agino & mask; 263 first_index = agino & mask;
248 write_lock(&pag->pag_ici_lock); 264 write_lock(&pag->pag_ici_lock);
249 /* 265
250 * insert the new inode 266 /* insert the new inode */
251 */
252 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 267 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
253 if (unlikely(error)) { 268 if (unlikely(error)) {
254 BUG_ON(error != -EEXIST); 269 WARN_ON(error != -EEXIST);
255 write_unlock(&pag->pag_ici_lock);
256 radix_tree_preload_end();
257 xfs_idestroy(ip);
258 XFS_STATS_INC(xs_ig_dup); 270 XFS_STATS_INC(xs_ig_dup);
259 goto again; 271 error = EAGAIN;
272 goto out_preload_end;
260 } 273 }
261 274
262 /* 275 /* These values _must_ be set before releasing the radix tree lock! */
263 * These values _must_ be set before releasing the radix tree lock!
264 */
265 ip->i_udquot = ip->i_gdquot = NULL; 276 ip->i_udquot = ip->i_gdquot = NULL;
266 xfs_iflags_set(ip, XFS_INEW); 277 xfs_iflags_set(ip, XFS_INEW);
267 278
268 write_unlock(&pag->pag_ici_lock); 279 write_unlock(&pag->pag_ici_lock);
269 radix_tree_preload_end(); 280 radix_tree_preload_end();
270
271 /*
272 * Link ip to its mount and thread it on the mount's inode list.
273 */
274 XFS_MOUNT_ILOCK(mp);
275 if ((iq = mp->m_inodes)) {
276 ASSERT(iq->i_mprev->i_mnext == iq);
277 ip->i_mprev = iq->i_mprev;
278 iq->i_mprev->i_mnext = ip;
279 iq->i_mprev = ip;
280 ip->i_mnext = iq;
281 } else {
282 ip->i_mnext = ip;
283 ip->i_mprev = ip;
284 }
285 mp->m_inodes = ip;
286
287 XFS_MOUNT_IUNLOCK(mp);
288 xfs_put_perag(mp, pag);
289
290 return_ip:
291 ASSERT(ip->i_df.if_ext_max ==
292 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
293
294 xfs_iflags_set(ip, XFS_IMODIFIED);
295 *ipp = ip; 281 *ipp = ip;
296
297 /*
298 * Set up the Linux with the Linux inode.
299 */
300 ip->i_vnode = inode;
301 inode->i_private = ip;
302
303 /*
304 * If we have a real type for an on-disk inode, we can set ops(&unlock)
305 * now. If it's a new inode being created, xfs_ialloc will handle it.
306 */
307 if (ip->i_d.di_mode != 0)
308 xfs_setup_inode(ip);
309 return 0; 282 return 0;
310}
311 283
284out_preload_end:
285 write_unlock(&pag->pag_ici_lock);
286 radix_tree_preload_end();
287out_unlock:
288 if (lock_flags)
289 xfs_iunlock(ip, lock_flags);
290out_destroy:
291 xfs_destroy_inode(ip);
292 return error;
293}
312 294
313/* 295/*
314 * The 'normal' internal xfs_iget, if needed it will 296 * Look up an inode by number in the given file system.
315 * 'allocate', or 'get', the vnode. 297 * The inode is looked up in the cache held in each AG.
298 * If the inode is found in the cache, initialise the vfs inode
299 * if necessary.
300 *
301 * If it is not in core, read it in from the file system's device,
302 * add it to the cache and initialise the vfs inode.
303 *
304 * The inode is locked according to the value of the lock_flags parameter.
305 * This flag parameter indicates how and if the inode's IO lock and inode lock
306 * should be taken.
307 *
308 * mp -- the mount point structure for the current file system. It points
309 * to the inode hash table.
310 * tp -- a pointer to the current transaction if there is one. This is
311 * simply passed through to the xfs_iread() call.
312 * ino -- the number of the inode desired. This is the unique identifier
313 * within the file system for the inode being requested.
314 * lock_flags -- flags indicating how to lock the inode. See the comment
315 * for xfs_ilock() for a list of valid values.
316 * bno -- the block number starting the buffer containing the inode,
317 * if known (as by bulkstat), else 0.
316 */ 318 */
317int 319int
318xfs_iget( 320xfs_iget(
@@ -324,61 +326,64 @@ xfs_iget(
324 xfs_inode_t **ipp, 326 xfs_inode_t **ipp,
325 xfs_daddr_t bno) 327 xfs_daddr_t bno)
326{ 328{
327 struct inode *inode;
328 xfs_inode_t *ip; 329 xfs_inode_t *ip;
329 int error; 330 int error;
331 xfs_perag_t *pag;
332 xfs_agino_t agino;
330 333
331 XFS_STATS_INC(xs_ig_attempts); 334 /* the radix tree exists only in inode capable AGs */
335 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
336 return EINVAL;
332 337
333retry: 338 /* get the perag structure and ensure that it's inode capable */
334 inode = iget_locked(mp->m_super, ino); 339 pag = xfs_get_perag(mp, ino);
335 if (!inode) 340 if (!pag->pagi_inodeok)
336 /* If we got no inode we are out of memory */ 341 return EINVAL;
337 return ENOMEM; 342 ASSERT(pag->pag_ici_init);
343 agino = XFS_INO_TO_AGINO(mp, ino);
338 344
339 if (inode->i_state & I_NEW) { 345again:
340 XFS_STATS_INC(vn_active); 346 error = 0;
341 XFS_STATS_INC(vn_alloc); 347 read_lock(&pag->pag_ici_lock);
342 348 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
343 error = xfs_iget_core(inode, mp, tp, ino, flags, 349
344 lock_flags, ipp, bno); 350 if (ip) {
345 if (error) { 351 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
346 make_bad_inode(inode); 352 if (error)
347 if (inode->i_state & I_NEW) 353 goto out_error_or_again;
348 unlock_new_inode(inode); 354 } else {
349 iput(inode); 355 read_unlock(&pag->pag_ici_lock);
350 } 356 XFS_STATS_INC(xs_ig_missed);
351 return error; 357
358 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno,
359 flags, lock_flags);
360 if (error)
361 goto out_error_or_again;
352 } 362 }
363 xfs_put_perag(mp, pag);
353 364
365 *ipp = ip;
366
367 ASSERT(ip->i_df.if_ext_max ==
368 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
354 /* 369 /*
355 * If the inode is not fully constructed due to 370 * If we have a real type for an on-disk inode, we can set ops(&unlock)
356 * filehandle mismatches wait for the inode to go 371 * now. If it's a new inode being created, xfs_ialloc will handle it.
357 * away and try again.
358 *
359 * iget_locked will call __wait_on_freeing_inode
360 * to wait for the inode to go away.
361 */ 372 */
362 if (is_bad_inode(inode)) { 373 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
363 iput(inode); 374 xfs_setup_inode(ip);
364 delay(1); 375 return 0;
365 goto retry;
366 }
367 376
368 ip = XFS_I(inode); 377out_error_or_again:
369 if (!ip) { 378 if (error == EAGAIN) {
370 iput(inode);
371 delay(1); 379 delay(1);
372 goto retry; 380 goto again;
373 } 381 }
374 382 xfs_put_perag(mp, pag);
375 if (lock_flags != 0) 383 return error;
376 xfs_ilock(ip, lock_flags);
377 XFS_STATS_INC(xs_ig_found);
378 *ipp = ip;
379 return 0;
380} 384}
381 385
386
382/* 387/*
383 * Look for the inode corresponding to the given ino in the hash table. 388 * Look for the inode corresponding to the given ino in the hash table.
384 * If it is there and its i_transp pointer matches tp, return it. 389 * If it is there and its i_transp pointer matches tp, return it.
@@ -444,99 +449,109 @@ xfs_iput_new(
444 IRELE(ip); 449 IRELE(ip);
445} 450}
446 451
447
448/* 452/*
449 * This routine embodies the part of the reclaim code that pulls 453 * This is called free all the memory associated with an inode.
450 * the inode from the inode hash table and the mount structure's 454 * It must free the inode itself and any buffers allocated for
451 * inode list. 455 * if_extents/if_data and if_broot. It must also free the lock
452 * This should only be called from xfs_reclaim(). 456 * associated with the inode.
457 *
458 * Note: because we don't initialise everything on reallocation out
459 * of the zone, we must ensure we nullify everything correctly before
460 * freeing the structure.
453 */ 461 */
454void 462void
455xfs_ireclaim(xfs_inode_t *ip) 463xfs_ireclaim(
464 struct xfs_inode *ip)
456{ 465{
457 /* 466 struct xfs_mount *mp = ip->i_mount;
458 * Remove from old hash list and mount list. 467 struct xfs_perag *pag;
459 */
460 XFS_STATS_INC(xs_ig_reclaims);
461 468
462 xfs_iextract(ip); 469 XFS_STATS_INC(xs_ig_reclaims);
463
464 /*
465 * Here we do a spurious inode lock in order to coordinate with
466 * xfs_sync(). This is because xfs_sync() references the inodes
467 * in the mount list without taking references on the corresponding
468 * vnodes. We make that OK here by ensuring that we wait until
469 * the inode is unlocked in xfs_sync() before we go ahead and
470 * free it. We get both the regular lock and the io lock because
471 * the xfs_sync() code may need to drop the regular one but will
472 * still hold the io lock.
473 */
474 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
475
476 /*
477 * Release dquots (and their references) if any. An inode may escape
478 * xfs_inactive and get here via vn_alloc->vn_reclaim path.
479 */
480 XFS_QM_DQDETACH(ip->i_mount, ip);
481
482 /*
483 * Pull our behavior descriptor from the vnode chain.
484 */
485 if (ip->i_vnode) {
486 ip->i_vnode->i_private = NULL;
487 ip->i_vnode = NULL;
488 }
489 470
490 /* 471 /*
491 * Free all memory associated with the inode. 472 * Remove the inode from the per-AG radix tree. It doesn't matter
473 * if it was never added to it because radix_tree_delete can deal
474 * with that case just fine.
492 */ 475 */
493 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 476 pag = xfs_get_perag(mp, ip->i_ino);
494 xfs_idestroy(ip);
495}
496
497/*
498 * This routine removes an about-to-be-destroyed inode from
499 * all of the lists in which it is located with the exception
500 * of the behavior chain.
501 */
502void
503xfs_iextract(
504 xfs_inode_t *ip)
505{
506 xfs_mount_t *mp = ip->i_mount;
507 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
508 xfs_inode_t *iq;
509
510 write_lock(&pag->pag_ici_lock); 477 write_lock(&pag->pag_ici_lock);
511 radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); 478 radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
512 write_unlock(&pag->pag_ici_lock); 479 write_unlock(&pag->pag_ici_lock);
513 xfs_put_perag(mp, pag); 480 xfs_put_perag(mp, pag);
514 481
515 /* 482 /*
516 * Remove from mount's inode list. 483 * Here we do an (almost) spurious inode lock in order to coordinate
484 * with inode cache radix tree lookups. This is because the lookup
485 * can reference the inodes in the cache without taking references.
486 *
487 * We make that OK here by ensuring that we wait until the inode is
488 * unlocked after the lookup before we go ahead and free it. We get
489 * both the ilock and the iolock because the code may need to drop the
490 * ilock one but will still hold the iolock.
517 */ 491 */
518 XFS_MOUNT_ILOCK(mp); 492 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
519 ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
520 iq = ip->i_mnext;
521 iq->i_mprev = ip->i_mprev;
522 ip->i_mprev->i_mnext = iq;
523
524 /* 493 /*
525 * Fix up the head pointer if it points to the inode being deleted. 494 * Release dquots (and their references) if any.
526 */ 495 */
527 if (mp->m_inodes == ip) { 496 XFS_QM_DQDETACH(ip->i_mount, ip);
528 if (ip == iq) { 497 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
529 mp->m_inodes = NULL; 498
530 } else { 499 switch (ip->i_d.di_mode & S_IFMT) {
531 mp->m_inodes = iq; 500 case S_IFREG:
532 } 501 case S_IFDIR:
502 case S_IFLNK:
503 xfs_idestroy_fork(ip, XFS_DATA_FORK);
504 break;
533 } 505 }
534 506
535 /* Deal with the deleted inodes list */ 507 if (ip->i_afp)
536 list_del_init(&ip->i_reclaim); 508 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
537 509
538 mp->m_ireclaims++; 510#ifdef XFS_INODE_TRACE
539 XFS_MOUNT_IUNLOCK(mp); 511 ktrace_free(ip->i_trace);
512#endif
513#ifdef XFS_BMAP_TRACE
514 ktrace_free(ip->i_xtrace);
515#endif
516#ifdef XFS_BTREE_TRACE
517 ktrace_free(ip->i_btrace);
518#endif
519#ifdef XFS_RW_TRACE
520 ktrace_free(ip->i_rwtrace);
521#endif
522#ifdef XFS_ILOCK_TRACE
523 ktrace_free(ip->i_lock_trace);
524#endif
525#ifdef XFS_DIR2_TRACE
526 ktrace_free(ip->i_dir_trace);
527#endif
528 if (ip->i_itemp) {
529 /*
530 * Only if we are shutting down the fs will we see an
531 * inode still in the AIL. If it is there, we should remove
532 * it to prevent a use-after-free from occurring.
533 */
534 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
535 struct xfs_ail *ailp = lip->li_ailp;
536
537 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
538 XFS_FORCED_SHUTDOWN(ip->i_mount));
539 if (lip->li_flags & XFS_LI_IN_AIL) {
540 spin_lock(&ailp->xa_lock);
541 if (lip->li_flags & XFS_LI_IN_AIL)
542 xfs_trans_ail_delete(ailp, lip);
543 else
544 spin_unlock(&ailp->xa_lock);
545 }
546 xfs_inode_item_destroy(ip);
547 ip->i_itemp = NULL;
548 }
549 /* asserts to verify all state is correct here */
550 ASSERT(atomic_read(&ip->i_iocount) == 0);
551 ASSERT(atomic_read(&ip->i_pincount) == 0);
552 ASSERT(!spin_is_locked(&ip->i_flags_lock));
553 ASSERT(completion_done(&ip->i_flush));
554 kmem_zone_free(xfs_inode_zone, ip);
540} 555}
541 556
542/* 557/*
@@ -737,7 +752,7 @@ xfs_iunlock(
737 * it is in the AIL and anyone is waiting on it. Don't do 752 * it is in the AIL and anyone is waiting on it. Don't do
738 * this if the caller has asked us not to. 753 * this if the caller has asked us not to.
739 */ 754 */
740 xfs_trans_unlocked_item(ip->i_mount, 755 xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
741 (xfs_log_item_t*)(ip->i_itemp)); 756 (xfs_log_item_t*)(ip->i_itemp));
742 } 757 }
743 xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); 758 xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
@@ -790,3 +805,51 @@ xfs_isilocked(
790} 805}
791#endif 806#endif
792 807
808#ifdef XFS_INODE_TRACE
809
810#define KTRACE_ENTER(ip, vk, s, line, ra) \
811 ktrace_enter((ip)->i_trace, \
812/* 0 */ (void *)(__psint_t)(vk), \
813/* 1 */ (void *)(s), \
814/* 2 */ (void *)(__psint_t) line, \
815/* 3 */ (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
816/* 4 */ (void *)(ra), \
817/* 5 */ NULL, \
818/* 6 */ (void *)(__psint_t)current_cpu(), \
819/* 7 */ (void *)(__psint_t)current_pid(), \
820/* 8 */ (void *)__return_address, \
821/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
822
823/*
824 * Vnode tracing code.
825 */
826void
827_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
828{
829 KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
830}
831
832void
833_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
834{
835 KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
836}
837
838void
839xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
840{
841 KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
842}
843
844void
845_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
846{
847 KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
848}
849
850void
851xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
852{
853 KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
854}
855#endif /* XFS_INODE_TRACE */
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
deleted file mode 100644
index d36450003983..000000000000
--- a/fs/xfs/xfs_imap.h
+++ /dev/null
@@ -1,40 +0,0 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_IMAP_H__
19#define __XFS_IMAP_H__
20
21/*
22 * This is the structure passed to xfs_imap() to map
23 * an inode number to its on disk location.
24 */
25typedef struct xfs_imap {
26 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
27 uint im_len; /* length in BBs of inode chunk */
28 xfs_agblock_t im_agblkno; /* logical block of inode chunk in ag */
29 ushort im_ioffset; /* inode offset in block in "inodes" */
30 ushort im_boffset; /* inode offset in block in bytes */
31} xfs_imap_t;
32
33#ifdef __KERNEL__
34struct xfs_mount;
35struct xfs_trans;
36int xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
37 xfs_imap_t *, uint);
38#endif
39
40#endif /* __XFS_IMAP_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a391b955df01..5a5e035e5d38 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -23,7 +23,6 @@
23#include "xfs_bit.h" 23#include "xfs_bit.h"
24#include "xfs_log.h" 24#include "xfs_log.h"
25#include "xfs_inum.h" 25#include "xfs_inum.h"
26#include "xfs_imap.h"
27#include "xfs_trans.h" 26#include "xfs_trans.h"
28#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
29#include "xfs_sb.h" 28#include "xfs_sb.h"
@@ -41,6 +40,7 @@
41#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
42#include "xfs_inode_item.h" 41#include "xfs_inode_item.h"
43#include "xfs_btree.h" 42#include "xfs_btree.h"
43#include "xfs_btree_trace.h"
44#include "xfs_alloc.h" 44#include "xfs_alloc.h"
45#include "xfs_ialloc.h" 45#include "xfs_ialloc.h"
46#include "xfs_bmap.h" 46#include "xfs_bmap.h"
@@ -133,10 +133,10 @@ STATIC int
133xfs_imap_to_bp( 133xfs_imap_to_bp(
134 xfs_mount_t *mp, 134 xfs_mount_t *mp,
135 xfs_trans_t *tp, 135 xfs_trans_t *tp,
136 xfs_imap_t *imap, 136 struct xfs_imap *imap,
137 xfs_buf_t **bpp, 137 xfs_buf_t **bpp,
138 uint buf_flags, 138 uint buf_flags,
139 uint imap_flags) 139 uint iget_flags)
140{ 140{
141 int error; 141 int error;
142 int i; 142 int i;
@@ -173,12 +173,12 @@ xfs_imap_to_bp(
173 173
174 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 174 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
175 (i << mp->m_sb.sb_inodelog)); 175 (i << mp->m_sb.sb_inodelog));
176 di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && 176 di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC &&
177 XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); 177 XFS_DINODE_GOOD_VERSION(dip->di_version);
178 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 178 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
179 XFS_ERRTAG_ITOBP_INOTOBP, 179 XFS_ERRTAG_ITOBP_INOTOBP,
180 XFS_RANDOM_ITOBP_INOTOBP))) { 180 XFS_RANDOM_ITOBP_INOTOBP))) {
181 if (imap_flags & XFS_IMAP_BULKSTAT) { 181 if (iget_flags & XFS_IGET_BULKSTAT) {
182 xfs_trans_brelse(tp, bp); 182 xfs_trans_brelse(tp, bp);
183 return XFS_ERROR(EINVAL); 183 return XFS_ERROR(EINVAL);
184 } 184 }
@@ -190,7 +190,7 @@ xfs_imap_to_bp(
190 "daddr %lld #%d (magic=%x)", 190 "daddr %lld #%d (magic=%x)",
191 XFS_BUFTARG_NAME(mp->m_ddev_targp), 191 XFS_BUFTARG_NAME(mp->m_ddev_targp),
192 (unsigned long long)imap->im_blkno, i, 192 (unsigned long long)imap->im_blkno, i,
193 be16_to_cpu(dip->di_core.di_magic)); 193 be16_to_cpu(dip->di_magic));
194#endif 194#endif
195 xfs_trans_brelse(tp, bp); 195 xfs_trans_brelse(tp, bp);
196 return XFS_ERROR(EFSCORRUPTED); 196 return XFS_ERROR(EFSCORRUPTED);
@@ -221,25 +221,26 @@ xfs_imap_to_bp(
221 * Use xfs_imap() to determine the size and location of the 221 * Use xfs_imap() to determine the size and location of the
222 * buffer to read from disk. 222 * buffer to read from disk.
223 */ 223 */
224STATIC int 224int
225xfs_inotobp( 225xfs_inotobp(
226 xfs_mount_t *mp, 226 xfs_mount_t *mp,
227 xfs_trans_t *tp, 227 xfs_trans_t *tp,
228 xfs_ino_t ino, 228 xfs_ino_t ino,
229 xfs_dinode_t **dipp, 229 xfs_dinode_t **dipp,
230 xfs_buf_t **bpp, 230 xfs_buf_t **bpp,
231 int *offset) 231 int *offset,
232 uint imap_flags)
232{ 233{
233 xfs_imap_t imap; 234 struct xfs_imap imap;
234 xfs_buf_t *bp; 235 xfs_buf_t *bp;
235 int error; 236 int error;
236 237
237 imap.im_blkno = 0; 238 imap.im_blkno = 0;
238 error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); 239 error = xfs_imap(mp, tp, ino, &imap, imap_flags);
239 if (error) 240 if (error)
240 return error; 241 return error;
241 242
242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0); 243 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
243 if (error) 244 if (error)
244 return error; 245 return error;
245 246
@@ -260,15 +261,11 @@ xfs_inotobp(
260 * If a non-zero error is returned, then the contents of bpp and 261 * If a non-zero error is returned, then the contents of bpp and
261 * dipp are undefined. 262 * dipp are undefined.
262 * 263 *
263 * If the inode is new and has not yet been initialized, use xfs_imap() 264 * The inode is expected to already been mapped to its buffer and read
264 * to determine the size and location of the buffer to read from disk. 265 * in once, thus we can use the mapping information stored in the inode
265 * If the inode has already been mapped to its buffer and read in once, 266 * rather than calling xfs_imap(). This allows us to avoid the overhead
266 * then use the mapping information stored in the inode rather than 267 * of looking at the inode btree for small block file systems
267 * calling xfs_imap(). This allows us to avoid the overhead of looking 268 * (see xfs_imap()).
268 * at the inode btree for small block file systems (see xfs_dilocate()).
269 * We can tell whether the inode has been mapped in before by comparing
270 * its disk block address to 0. Only uninitialized inodes will have
271 * 0 for the disk block address.
272 */ 269 */
273int 270int
274xfs_itobp( 271xfs_itobp(
@@ -277,40 +274,14 @@ xfs_itobp(
277 xfs_inode_t *ip, 274 xfs_inode_t *ip,
278 xfs_dinode_t **dipp, 275 xfs_dinode_t **dipp,
279 xfs_buf_t **bpp, 276 xfs_buf_t **bpp,
280 xfs_daddr_t bno,
281 uint imap_flags,
282 uint buf_flags) 277 uint buf_flags)
283{ 278{
284 xfs_imap_t imap;
285 xfs_buf_t *bp; 279 xfs_buf_t *bp;
286 int error; 280 int error;
287 281
288 if (ip->i_blkno == (xfs_daddr_t)0) { 282 ASSERT(ip->i_imap.im_blkno != 0);
289 imap.im_blkno = bno;
290 error = xfs_imap(mp, tp, ip->i_ino, &imap,
291 XFS_IMAP_LOOKUP | imap_flags);
292 if (error)
293 return error;
294 283
295 /* 284 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
296 * Fill in the fields in the inode that will be used to
297 * map the inode to its buffer from now on.
298 */
299 ip->i_blkno = imap.im_blkno;
300 ip->i_len = imap.im_len;
301 ip->i_boffset = imap.im_boffset;
302 } else {
303 /*
304 * We've already mapped the inode once, so just use the
305 * mapping that we saved the first time.
306 */
307 imap.im_blkno = ip->i_blkno;
308 imap.im_len = ip->i_len;
309 imap.im_boffset = ip->i_boffset;
310 }
311 ASSERT(bno == 0 || bno == imap.im_blkno);
312
313 error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
314 if (error) 285 if (error)
315 return error; 286 return error;
316 287
@@ -321,7 +292,7 @@ xfs_itobp(
321 return EAGAIN; 292 return EAGAIN;
322 } 293 }
323 294
324 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 295 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
325 *bpp = bp; 296 *bpp = bp;
326 return 0; 297 return 0;
327} 298}
@@ -348,26 +319,26 @@ xfs_iformat(
348 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 319 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
349 error = 0; 320 error = 0;
350 321
351 if (unlikely(be32_to_cpu(dip->di_core.di_nextents) + 322 if (unlikely(be32_to_cpu(dip->di_nextents) +
352 be16_to_cpu(dip->di_core.di_anextents) > 323 be16_to_cpu(dip->di_anextents) >
353 be64_to_cpu(dip->di_core.di_nblocks))) { 324 be64_to_cpu(dip->di_nblocks))) {
354 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 325 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
355 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 326 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
356 (unsigned long long)ip->i_ino, 327 (unsigned long long)ip->i_ino,
357 (int)(be32_to_cpu(dip->di_core.di_nextents) + 328 (int)(be32_to_cpu(dip->di_nextents) +
358 be16_to_cpu(dip->di_core.di_anextents)), 329 be16_to_cpu(dip->di_anextents)),
359 (unsigned long long) 330 (unsigned long long)
360 be64_to_cpu(dip->di_core.di_nblocks)); 331 be64_to_cpu(dip->di_nblocks));
361 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, 332 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
362 ip->i_mount, dip); 333 ip->i_mount, dip);
363 return XFS_ERROR(EFSCORRUPTED); 334 return XFS_ERROR(EFSCORRUPTED);
364 } 335 }
365 336
366 if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 337 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
367 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 338 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
368 "corrupt dinode %Lu, forkoff = 0x%x.", 339 "corrupt dinode %Lu, forkoff = 0x%x.",
369 (unsigned long long)ip->i_ino, 340 (unsigned long long)ip->i_ino,
370 dip->di_core.di_forkoff); 341 dip->di_forkoff);
371 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 342 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
372 ip->i_mount, dip); 343 ip->i_mount, dip);
373 return XFS_ERROR(EFSCORRUPTED); 344 return XFS_ERROR(EFSCORRUPTED);
@@ -378,25 +349,25 @@ xfs_iformat(
378 case S_IFCHR: 349 case S_IFCHR:
379 case S_IFBLK: 350 case S_IFBLK:
380 case S_IFSOCK: 351 case S_IFSOCK:
381 if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) { 352 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
382 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, 353 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
383 ip->i_mount, dip); 354 ip->i_mount, dip);
384 return XFS_ERROR(EFSCORRUPTED); 355 return XFS_ERROR(EFSCORRUPTED);
385 } 356 }
386 ip->i_d.di_size = 0; 357 ip->i_d.di_size = 0;
387 ip->i_size = 0; 358 ip->i_size = 0;
388 ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev); 359 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
389 break; 360 break;
390 361
391 case S_IFREG: 362 case S_IFREG:
392 case S_IFLNK: 363 case S_IFLNK:
393 case S_IFDIR: 364 case S_IFDIR:
394 switch (dip->di_core.di_format) { 365 switch (dip->di_format) {
395 case XFS_DINODE_FMT_LOCAL: 366 case XFS_DINODE_FMT_LOCAL:
396 /* 367 /*
397 * no local regular files yet 368 * no local regular files yet
398 */ 369 */
399 if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) { 370 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
400 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 371 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
401 "corrupt inode %Lu " 372 "corrupt inode %Lu "
402 "(local format for regular file).", 373 "(local format for regular file).",
@@ -407,7 +378,7 @@ xfs_iformat(
407 return XFS_ERROR(EFSCORRUPTED); 378 return XFS_ERROR(EFSCORRUPTED);
408 } 379 }
409 380
410 di_size = be64_to_cpu(dip->di_core.di_size); 381 di_size = be64_to_cpu(dip->di_size);
411 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 382 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
412 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 383 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
413 "corrupt inode %Lu " 384 "corrupt inode %Lu "
@@ -449,7 +420,7 @@ xfs_iformat(
449 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 420 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
450 ip->i_afp->if_ext_max = 421 ip->i_afp->if_ext_max =
451 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 422 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
452 switch (dip->di_core.di_aformat) { 423 switch (dip->di_aformat) {
453 case XFS_DINODE_FMT_LOCAL: 424 case XFS_DINODE_FMT_LOCAL:
454 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 425 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
455 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
@@ -621,7 +592,7 @@ xfs_iformat_btree(
621 ifp = XFS_IFORK_PTR(ip, whichfork); 592 ifp = XFS_IFORK_PTR(ip, whichfork);
622 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); 593 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
623 size = XFS_BMAP_BROOT_SPACE(dfp); 594 size = XFS_BMAP_BROOT_SPACE(dfp);
624 nrecs = XFS_BMAP_BROOT_NUMRECS(dfp); 595 nrecs = be16_to_cpu(dfp->bb_numrecs);
625 596
626 /* 597 /*
627 * blow out if -- fork has less extents than can fit in 598 * blow out if -- fork has less extents than can fit in
@@ -649,8 +620,9 @@ xfs_iformat_btree(
649 * Copy and convert from the on-disk structure 620 * Copy and convert from the on-disk structure
650 * to the in-memory structure. 621 * to the in-memory structure.
651 */ 622 */
652 xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), 623 xfs_bmdr_to_bmbt(ip->i_mount, dfp,
653 ifp->if_broot, size); 624 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
625 ifp->if_broot, size);
654 ifp->if_flags &= ~XFS_IFEXTENTS; 626 ifp->if_flags &= ~XFS_IFEXTENTS;
655 ifp->if_flags |= XFS_IFBROOT; 627 ifp->if_flags |= XFS_IFBROOT;
656 628
@@ -660,7 +632,7 @@ xfs_iformat_btree(
660void 632void
661xfs_dinode_from_disk( 633xfs_dinode_from_disk(
662 xfs_icdinode_t *to, 634 xfs_icdinode_t *to,
663 xfs_dinode_core_t *from) 635 xfs_dinode_t *from)
664{ 636{
665 to->di_magic = be16_to_cpu(from->di_magic); 637 to->di_magic = be16_to_cpu(from->di_magic);
666 to->di_mode = be16_to_cpu(from->di_mode); 638 to->di_mode = be16_to_cpu(from->di_mode);
@@ -694,7 +666,7 @@ xfs_dinode_from_disk(
694 666
695void 667void
696xfs_dinode_to_disk( 668xfs_dinode_to_disk(
697 xfs_dinode_core_t *to, 669 xfs_dinode_t *to,
698 xfs_icdinode_t *from) 670 xfs_icdinode_t *from)
699{ 671{
700 to->di_magic = cpu_to_be16(from->di_magic); 672 to->di_magic = cpu_to_be16(from->di_magic);
@@ -781,93 +753,57 @@ uint
781xfs_dic2xflags( 753xfs_dic2xflags(
782 xfs_dinode_t *dip) 754 xfs_dinode_t *dip)
783{ 755{
784 xfs_dinode_core_t *dic = &dip->di_core; 756 return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
785
786 return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) |
787 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); 757 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
788} 758}
789 759
790/* 760/*
791 * Given a mount structure and an inode number, return a pointer 761 * Read the disk inode attributes into the in-core inode structure.
792 * to a newly allocated in-core inode corresponding to the given
793 * inode number.
794 *
795 * Initialize the inode's attributes and extent pointers if it
796 * already has them (it will not if the inode has no links).
797 */ 762 */
798int 763int
799xfs_iread( 764xfs_iread(
800 xfs_mount_t *mp, 765 xfs_mount_t *mp,
801 xfs_trans_t *tp, 766 xfs_trans_t *tp,
802 xfs_ino_t ino, 767 xfs_inode_t *ip,
803 xfs_inode_t **ipp,
804 xfs_daddr_t bno, 768 xfs_daddr_t bno,
805 uint imap_flags) 769 uint iget_flags)
806{ 770{
807 xfs_buf_t *bp; 771 xfs_buf_t *bp;
808 xfs_dinode_t *dip; 772 xfs_dinode_t *dip;
809 xfs_inode_t *ip;
810 int error; 773 int error;
811 774
812 ASSERT(xfs_inode_zone != NULL);
813
814 ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
815 ip->i_ino = ino;
816 ip->i_mount = mp;
817 atomic_set(&ip->i_iocount, 0);
818 spin_lock_init(&ip->i_flags_lock);
819
820 /* 775 /*
821 * Get pointer's to the on-disk inode and the buffer containing it. 776 * Fill in the location information in the in-core inode.
822 * If the inode number refers to a block outside the file system
823 * then xfs_itobp() will return NULL. In this case we should
824 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
825 * know that this is a new incore inode.
826 */ 777 */
827 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK); 778 ip->i_imap.im_blkno = bno;
828 if (error) { 779 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
829 kmem_zone_free(xfs_inode_zone, ip); 780 if (error)
830 return error; 781 return error;
831 } 782 ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
832 783
833 /* 784 /*
834 * Initialize inode's trace buffers. 785 * Get pointers to the on-disk inode and the buffer containing it.
835 * Do this before xfs_iformat in case it adds entries.
836 */ 786 */
837#ifdef XFS_INODE_TRACE 787 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
838 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS); 788 XFS_BUF_LOCK, iget_flags);
839#endif 789 if (error)
840#ifdef XFS_BMAP_TRACE 790 return error;
841 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS); 791 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
842#endif
843#ifdef XFS_BMBT_TRACE
844 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
845#endif
846#ifdef XFS_RW_TRACE
847 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
848#endif
849#ifdef XFS_ILOCK_TRACE
850 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
851#endif
852#ifdef XFS_DIR2_TRACE
853 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
854#endif
855 792
856 /* 793 /*
857 * If we got something that isn't an inode it means someone 794 * If we got something that isn't an inode it means someone
858 * (nfs or dmi) has a stale handle. 795 * (nfs or dmi) has a stale handle.
859 */ 796 */
860 if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) { 797 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
861 kmem_zone_free(xfs_inode_zone, ip);
862 xfs_trans_brelse(tp, bp);
863#ifdef DEBUG 798#ifdef DEBUG
864 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 799 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
865 "dip->di_core.di_magic (0x%x) != " 800 "dip->di_magic (0x%x) != "
866 "XFS_DINODE_MAGIC (0x%x)", 801 "XFS_DINODE_MAGIC (0x%x)",
867 be16_to_cpu(dip->di_core.di_magic), 802 be16_to_cpu(dip->di_magic),
868 XFS_DINODE_MAGIC); 803 XFS_DINODE_MAGIC);
869#endif /* DEBUG */ 804#endif /* DEBUG */
870 return XFS_ERROR(EINVAL); 805 error = XFS_ERROR(EINVAL);
806 goto out_brelse;
871 } 807 }
872 808
873 /* 809 /*
@@ -877,24 +813,22 @@ xfs_iread(
877 * specific information. 813 * specific information.
878 * Otherwise, just get the truly permanent information. 814 * Otherwise, just get the truly permanent information.
879 */ 815 */
880 if (dip->di_core.di_mode) { 816 if (dip->di_mode) {
881 xfs_dinode_from_disk(&ip->i_d, &dip->di_core); 817 xfs_dinode_from_disk(&ip->i_d, dip);
882 error = xfs_iformat(ip, dip); 818 error = xfs_iformat(ip, dip);
883 if (error) { 819 if (error) {
884 kmem_zone_free(xfs_inode_zone, ip);
885 xfs_trans_brelse(tp, bp);
886#ifdef DEBUG 820#ifdef DEBUG
887 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 821 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
888 "xfs_iformat() returned error %d", 822 "xfs_iformat() returned error %d",
889 error); 823 error);
890#endif /* DEBUG */ 824#endif /* DEBUG */
891 return error; 825 goto out_brelse;
892 } 826 }
893 } else { 827 } else {
894 ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic); 828 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
895 ip->i_d.di_version = dip->di_core.di_version; 829 ip->i_d.di_version = dip->di_version;
896 ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen); 830 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
897 ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter); 831 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
898 /* 832 /*
899 * Make sure to pull in the mode here as well in 833 * Make sure to pull in the mode here as well in
900 * case the inode is released without being used. 834 * case the inode is released without being used.
@@ -911,8 +845,6 @@ xfs_iread(
911 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 845 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
912 } 846 }
913 847
914 INIT_LIST_HEAD(&ip->i_reclaim);
915
916 /* 848 /*
917 * The inode format changed when we moved the link count and 849 * The inode format changed when we moved the link count and
918 * made it 32 bits long. If this is an old format inode, 850 * made it 32 bits long. If this is an old format inode,
@@ -924,7 +856,7 @@ xfs_iread(
924 * the new format. We don't change the version number so that we 856 * the new format. We don't change the version number so that we
925 * can distinguish this from a real new format inode. 857 * can distinguish this from a real new format inode.
926 */ 858 */
927 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 859 if (ip->i_d.di_version == 1) {
928 ip->i_d.di_nlink = ip->i_d.di_onlink; 860 ip->i_d.di_nlink = ip->i_d.di_onlink;
929 ip->i_d.di_onlink = 0; 861 ip->i_d.di_onlink = 0;
930 ip->i_d.di_projid = 0; 862 ip->i_d.di_projid = 0;
@@ -938,7 +870,7 @@ xfs_iread(
938 * around for a while. This helps to keep recently accessed 870 * around for a while. This helps to keep recently accessed
939 * meta-data in-core longer. 871 * meta-data in-core longer.
940 */ 872 */
941 XFS_BUF_SET_REF(bp, XFS_INO_REF); 873 XFS_BUF_SET_REF(bp, XFS_INO_REF);
942 874
943 /* 875 /*
944 * Use xfs_trans_brelse() to release the buffer containing the 876 * Use xfs_trans_brelse() to release the buffer containing the
@@ -953,9 +885,9 @@ xfs_iread(
953 * to worry about the inode being changed just because we released 885 * to worry about the inode being changed just because we released
954 * the buffer. 886 * the buffer.
955 */ 887 */
888 out_brelse:
956 xfs_trans_brelse(tp, bp); 889 xfs_trans_brelse(tp, bp);
957 *ipp = ip; 890 return error;
958 return 0;
959} 891}
960 892
961/* 893/*
@@ -1049,6 +981,7 @@ xfs_ialloc(
1049 uint flags; 981 uint flags;
1050 int error; 982 int error;
1051 timespec_t tv; 983 timespec_t tv;
984 int filestreams = 0;
1052 985
1053 /* 986 /*
1054 * Call the space management code to pick 987 * Call the space management code to pick
@@ -1056,9 +989,8 @@ xfs_ialloc(
1056 */ 989 */
1057 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, 990 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
1058 ialloc_context, call_again, &ino); 991 ialloc_context, call_again, &ino);
1059 if (error != 0) { 992 if (error)
1060 return error; 993 return error;
1061 }
1062 if (*call_again || ino == NULLFSINO) { 994 if (*call_again || ino == NULLFSINO) {
1063 *ipp = NULL; 995 *ipp = NULL;
1064 return 0; 996 return 0;
@@ -1072,9 +1004,8 @@ xfs_ialloc(
1072 */ 1004 */
1073 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1005 error = xfs_trans_iget(tp->t_mountp, tp, ino,
1074 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1006 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
1075 if (error != 0) { 1007 if (error)
1076 return error; 1008 return error;
1077 }
1078 ASSERT(ip != NULL); 1009 ASSERT(ip != NULL);
1079 1010
1080 ip->i_d.di_mode = (__uint16_t)mode; 1011 ip->i_d.di_mode = (__uint16_t)mode;
@@ -1093,8 +1024,8 @@ xfs_ialloc(
1093 * here rather than here and in the flush/logging code. 1024 * here rather than here and in the flush/logging code.
1094 */ 1025 */
1095 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && 1026 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
1096 ip->i_d.di_version == XFS_DINODE_VERSION_1) { 1027 ip->i_d.di_version == 1) {
1097 ip->i_d.di_version = XFS_DINODE_VERSION_2; 1028 ip->i_d.di_version = 2;
1098 /* 1029 /*
1099 * We've already zeroed the old link count, the projid field, 1030 * We've already zeroed the old link count, the projid field,
1100 * and the pad field. 1031 * and the pad field.
@@ -1104,7 +1035,7 @@ xfs_ialloc(
1104 /* 1035 /*
1105 * Project ids won't be stored on disk if we are using a version 1 inode. 1036 * Project ids won't be stored on disk if we are using a version 1 inode.
1106 */ 1037 */
1107 if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) 1038 if ((prid != 0) && (ip->i_d.di_version == 1))
1108 xfs_bump_ino_vers2(tp, ip); 1039 xfs_bump_ino_vers2(tp, ip);
1109 1040
1110 if (pip && XFS_INHERIT_GID(pip)) { 1041 if (pip && XFS_INHERIT_GID(pip)) {
@@ -1155,13 +1086,12 @@ xfs_ialloc(
1155 flags |= XFS_ILOG_DEV; 1086 flags |= XFS_ILOG_DEV;
1156 break; 1087 break;
1157 case S_IFREG: 1088 case S_IFREG:
1158 if (pip && xfs_inode_is_filestream(pip)) { 1089 /*
1159 error = xfs_filestream_associate(pip, ip); 1090 * we can't set up filestreams until after the VFS inode
1160 if (error < 0) 1091 * is set up properly.
1161 return -error; 1092 */
1162 if (!error) 1093 if (pip && xfs_inode_is_filestream(pip))
1163 xfs_iflags_set(ip, XFS_IFILESTREAM); 1094 filestreams = 1;
1164 }
1165 /* fall through */ 1095 /* fall through */
1166 case S_IFDIR: 1096 case S_IFDIR:
1167 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1097 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
@@ -1227,6 +1157,15 @@ xfs_ialloc(
1227 /* now that we have an i_mode we can setup inode ops and unlock */ 1157 /* now that we have an i_mode we can setup inode ops and unlock */
1228 xfs_setup_inode(ip); 1158 xfs_setup_inode(ip);
1229 1159
1160 /* now we have set up the vfs inode we can associate the filestream */
1161 if (filestreams) {
1162 error = xfs_filestream_associate(pip, ip);
1163 if (error < 0)
1164 return -error;
1165 if (!error)
1166 xfs_iflags_set(ip, XFS_IFILESTREAM);
1167 }
1168
1230 *ipp = ip; 1169 *ipp = ip;
1231 return 0; 1170 return 0;
1232} 1171}
@@ -1383,8 +1322,8 @@ xfs_itrunc_trace(
1383 * direct I/O with the truncate operation. Also, because we hold 1322 * direct I/O with the truncate operation. Also, because we hold
1384 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being 1323 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
1385 * started until the truncate completes and drops the lock. Essentially, 1324 * started until the truncate completes and drops the lock. Essentially,
1386 * the vn_iowait() call forms an I/O barrier that provides strict ordering 1325 * the xfs_ioend_wait() call forms an I/O barrier that provides strict
1387 * between direct I/Os and the truncate operation. 1326 * ordering between direct I/Os and the truncate operation.
1388 * 1327 *
1389 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE 1328 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1390 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used 1329 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
@@ -1415,7 +1354,7 @@ xfs_itruncate_start(
1415 1354
1416 /* wait for the completion of any pending DIOs */ 1355 /* wait for the completion of any pending DIOs */
1417 if (new_size == 0 || new_size < ip->i_size) 1356 if (new_size == 0 || new_size < ip->i_size)
1418 vn_iowait(ip); 1357 xfs_ioend_wait(ip);
1419 1358
1420 /* 1359 /*
1421 * Call toss_pages or flushinval_pages to get rid of pages 1360 * Call toss_pages or flushinval_pages to get rid of pages
@@ -1726,8 +1665,14 @@ xfs_itruncate_finish(
1726 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1665 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1727 xfs_trans_ihold(ntp, ip); 1666 xfs_trans_ihold(ntp, ip);
1728 1667
1729 if (!error) 1668 if (error)
1730 error = xfs_trans_reserve(ntp, 0, 1669 return error;
1670 /*
1671 * transaction commit worked ok so we can drop the extra ticket
1672 * reference that we gained in xfs_trans_dup()
1673 */
1674 xfs_log_ticket_put(ntp->t_ticket);
1675 error = xfs_trans_reserve(ntp, 0,
1731 XFS_ITRUNCATE_LOG_RES(mp), 0, 1676 XFS_ITRUNCATE_LOG_RES(mp), 0,
1732 XFS_TRANS_PERM_LOG_RES, 1677 XFS_TRANS_PERM_LOG_RES,
1733 XFS_ITRUNCATE_LOG_COUNT); 1678 XFS_ITRUNCATE_LOG_COUNT);
@@ -1781,13 +1726,10 @@ xfs_iunlink(
1781 xfs_dinode_t *dip; 1726 xfs_dinode_t *dip;
1782 xfs_buf_t *agibp; 1727 xfs_buf_t *agibp;
1783 xfs_buf_t *ibp; 1728 xfs_buf_t *ibp;
1784 xfs_agnumber_t agno;
1785 xfs_daddr_t agdaddr;
1786 xfs_agino_t agino; 1729 xfs_agino_t agino;
1787 short bucket_index; 1730 short bucket_index;
1788 int offset; 1731 int offset;
1789 int error; 1732 int error;
1790 int agi_ok;
1791 1733
1792 ASSERT(ip->i_d.di_nlink == 0); 1734 ASSERT(ip->i_d.di_nlink == 0);
1793 ASSERT(ip->i_d.di_mode != 0); 1735 ASSERT(ip->i_d.di_mode != 0);
@@ -1795,31 +1737,15 @@ xfs_iunlink(
1795 1737
1796 mp = tp->t_mountp; 1738 mp = tp->t_mountp;
1797 1739
1798 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1799 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1800
1801 /* 1740 /*
1802 * Get the agi buffer first. It ensures lock ordering 1741 * Get the agi buffer first. It ensures lock ordering
1803 * on the list. 1742 * on the list.
1804 */ 1743 */
1805 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1744 error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1806 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1807 if (error) 1745 if (error)
1808 return error; 1746 return error;
1809
1810 /*
1811 * Validate the magic number of the agi block.
1812 */
1813 agi = XFS_BUF_TO_AGI(agibp); 1747 agi = XFS_BUF_TO_AGI(agibp);
1814 agi_ok = 1748
1815 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1816 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1817 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
1818 XFS_RANDOM_IUNLINK))) {
1819 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
1820 xfs_trans_brelse(tp, agibp);
1821 return XFS_ERROR(EFSCORRUPTED);
1822 }
1823 /* 1749 /*
1824 * Get the index into the agi hash table for the 1750 * Get the index into the agi hash table for the
1825 * list this inode will go on. 1751 * list this inode will go on.
@@ -1837,14 +1763,14 @@ xfs_iunlink(
1837 * Here we put the head pointer into our next pointer, 1763 * Here we put the head pointer into our next pointer,
1838 * and then we fall through to point the head at us. 1764 * and then we fall through to point the head at us.
1839 */ 1765 */
1840 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1766 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
1841 if (error) 1767 if (error)
1842 return error; 1768 return error;
1843 1769
1844 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); 1770 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
1845 /* both on-disk, don't endian flip twice */ 1771 /* both on-disk, don't endian flip twice */
1846 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1772 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1847 offset = ip->i_boffset + 1773 offset = ip->i_imap.im_boffset +
1848 offsetof(xfs_dinode_t, di_next_unlinked); 1774 offsetof(xfs_dinode_t, di_next_unlinked);
1849 xfs_trans_inode_buf(tp, ibp); 1775 xfs_trans_inode_buf(tp, ibp);
1850 xfs_trans_log_buf(tp, ibp, offset, 1776 xfs_trans_log_buf(tp, ibp, offset,
@@ -1879,7 +1805,6 @@ xfs_iunlink_remove(
1879 xfs_buf_t *agibp; 1805 xfs_buf_t *agibp;
1880 xfs_buf_t *ibp; 1806 xfs_buf_t *ibp;
1881 xfs_agnumber_t agno; 1807 xfs_agnumber_t agno;
1882 xfs_daddr_t agdaddr;
1883 xfs_agino_t agino; 1808 xfs_agino_t agino;
1884 xfs_agino_t next_agino; 1809 xfs_agino_t next_agino;
1885 xfs_buf_t *last_ibp; 1810 xfs_buf_t *last_ibp;
@@ -1887,45 +1812,20 @@ xfs_iunlink_remove(
1887 short bucket_index; 1812 short bucket_index;
1888 int offset, last_offset = 0; 1813 int offset, last_offset = 0;
1889 int error; 1814 int error;
1890 int agi_ok;
1891 1815
1892 /*
1893 * First pull the on-disk inode from the AGI unlinked list.
1894 */
1895 mp = tp->t_mountp; 1816 mp = tp->t_mountp;
1896
1897 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1817 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1898 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1899 1818
1900 /* 1819 /*
1901 * Get the agi buffer first. It ensures lock ordering 1820 * Get the agi buffer first. It ensures lock ordering
1902 * on the list. 1821 * on the list.
1903 */ 1822 */
1904 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1823 error = xfs_read_agi(mp, tp, agno, &agibp);
1905 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1824 if (error)
1906 if (error) {
1907 cmn_err(CE_WARN,
1908 "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.",
1909 error, mp->m_fsname);
1910 return error; 1825 return error;
1911 } 1826
1912 /*
1913 * Validate the magic number of the agi block.
1914 */
1915 agi = XFS_BUF_TO_AGI(agibp); 1827 agi = XFS_BUF_TO_AGI(agibp);
1916 agi_ok = 1828
1917 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1918 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1919 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
1920 XFS_RANDOM_IUNLINK_REMOVE))) {
1921 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
1922 mp, agi);
1923 xfs_trans_brelse(tp, agibp);
1924 cmn_err(CE_WARN,
1925 "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.",
1926 mp->m_fsname);
1927 return XFS_ERROR(EFSCORRUPTED);
1928 }
1929 /* 1829 /*
1930 * Get the index into the agi hash table for the 1830 * Get the index into the agi hash table for the
1931 * list this inode will go on. 1831 * list this inode will go on.
@@ -1945,7 +1845,7 @@ xfs_iunlink_remove(
1945 * of dealing with the buffer when there is no need to 1845 * of dealing with the buffer when there is no need to
1946 * change it. 1846 * change it.
1947 */ 1847 */
1948 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1848 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
1949 if (error) { 1849 if (error) {
1950 cmn_err(CE_WARN, 1850 cmn_err(CE_WARN,
1951 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1851 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1956,7 +1856,7 @@ xfs_iunlink_remove(
1956 ASSERT(next_agino != 0); 1856 ASSERT(next_agino != 0);
1957 if (next_agino != NULLAGINO) { 1857 if (next_agino != NULLAGINO) {
1958 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1858 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1959 offset = ip->i_boffset + 1859 offset = ip->i_imap.im_boffset +
1960 offsetof(xfs_dinode_t, di_next_unlinked); 1860 offsetof(xfs_dinode_t, di_next_unlinked);
1961 xfs_trans_inode_buf(tp, ibp); 1861 xfs_trans_inode_buf(tp, ibp);
1962 xfs_trans_log_buf(tp, ibp, offset, 1862 xfs_trans_log_buf(tp, ibp, offset,
@@ -1992,7 +1892,7 @@ xfs_iunlink_remove(
1992 } 1892 }
1993 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 1893 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
1994 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1894 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1995 &last_ibp, &last_offset); 1895 &last_ibp, &last_offset, 0);
1996 if (error) { 1896 if (error) {
1997 cmn_err(CE_WARN, 1897 cmn_err(CE_WARN,
1998 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1898 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.",
@@ -2007,7 +1907,7 @@ xfs_iunlink_remove(
2007 * Now last_ibp points to the buffer previous to us on 1907 * Now last_ibp points to the buffer previous to us on
2008 * the unlinked list. Pull us from the list. 1908 * the unlinked list. Pull us from the list.
2009 */ 1909 */
2010 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1910 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
2011 if (error) { 1911 if (error) {
2012 cmn_err(CE_WARN, 1912 cmn_err(CE_WARN,
2013 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1913 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2019,7 +1919,7 @@ xfs_iunlink_remove(
2019 ASSERT(next_agino != agino); 1919 ASSERT(next_agino != agino);
2020 if (next_agino != NULLAGINO) { 1920 if (next_agino != NULLAGINO) {
2021 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1921 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2022 offset = ip->i_boffset + 1922 offset = ip->i_imap.im_boffset +
2023 offsetof(xfs_dinode_t, di_next_unlinked); 1923 offsetof(xfs_dinode_t, di_next_unlinked);
2024 xfs_trans_inode_buf(tp, ibp); 1924 xfs_trans_inode_buf(tp, ibp);
2025 xfs_trans_log_buf(tp, ibp, offset, 1925 xfs_trans_log_buf(tp, ibp, offset,
@@ -2160,9 +2060,9 @@ xfs_ifree_cluster(
2160 iip = (xfs_inode_log_item_t *)lip; 2060 iip = (xfs_inode_log_item_t *)lip;
2161 ASSERT(iip->ili_logged == 1); 2061 ASSERT(iip->ili_logged == 1);
2162 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; 2062 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2163 spin_lock(&mp->m_ail_lock); 2063 xfs_trans_ail_copy_lsn(mp->m_ail,
2164 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2064 &iip->ili_flush_lsn,
2165 spin_unlock(&mp->m_ail_lock); 2065 &iip->ili_item.li_lsn);
2166 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 2066 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2167 pre_flushed++; 2067 pre_flushed++;
2168 } 2068 }
@@ -2183,9 +2083,8 @@ xfs_ifree_cluster(
2183 iip->ili_last_fields = iip->ili_format.ilf_fields; 2083 iip->ili_last_fields = iip->ili_format.ilf_fields;
2184 iip->ili_format.ilf_fields = 0; 2084 iip->ili_format.ilf_fields = 0;
2185 iip->ili_logged = 1; 2085 iip->ili_logged = 1;
2186 spin_lock(&mp->m_ail_lock); 2086 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2187 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2087 &iip->ili_item.li_lsn);
2188 spin_unlock(&mp->m_ail_lock);
2189 2088
2190 xfs_buf_attach_iodone(bp, 2089 xfs_buf_attach_iodone(bp,
2191 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2090 (void(*)(xfs_buf_t*,xfs_log_item_t*))
@@ -2263,7 +2162,7 @@ xfs_ifree(
2263 2162
2264 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2163 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2265 2164
2266 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 2165 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
2267 if (error) 2166 if (error)
2268 return error; 2167 return error;
2269 2168
@@ -2279,7 +2178,7 @@ xfs_ifree(
2279 * This is a temporary hack that would require a proper fix 2178 * This is a temporary hack that would require a proper fix
2280 * in the future. 2179 * in the future.
2281 */ 2180 */
2282 dip->di_core.di_mode = 0; 2181 dip->di_mode = 0;
2283 2182
2284 if (delete) { 2183 if (delete) {
2285 xfs_ifree_cluster(ip, tp, first_ino); 2184 xfs_ifree_cluster(ip, tp, first_ino);
@@ -2312,9 +2211,10 @@ xfs_iroot_realloc(
2312 int rec_diff, 2211 int rec_diff,
2313 int whichfork) 2212 int whichfork)
2314{ 2213{
2214 struct xfs_mount *mp = ip->i_mount;
2315 int cur_max; 2215 int cur_max;
2316 xfs_ifork_t *ifp; 2216 xfs_ifork_t *ifp;
2317 xfs_bmbt_block_t *new_broot; 2217 struct xfs_btree_block *new_broot;
2318 int new_max; 2218 int new_max;
2319 size_t new_size; 2219 size_t new_size;
2320 char *np; 2220 char *np;
@@ -2335,8 +2235,7 @@ xfs_iroot_realloc(
2335 */ 2235 */
2336 if (ifp->if_broot_bytes == 0) { 2236 if (ifp->if_broot_bytes == 0) {
2337 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); 2237 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2338 ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size, 2238 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
2339 KM_SLEEP);
2340 ifp->if_broot_bytes = (int)new_size; 2239 ifp->if_broot_bytes = (int)new_size;
2341 return; 2240 return;
2342 } 2241 }
@@ -2347,18 +2246,16 @@ xfs_iroot_realloc(
2347 * location. The records don't change location because 2246 * location. The records don't change location because
2348 * they are kept butted up against the btree block header. 2247 * they are kept butted up against the btree block header.
2349 */ 2248 */
2350 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2249 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2351 new_max = cur_max + rec_diff; 2250 new_max = cur_max + rec_diff;
2352 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2251 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2353 ifp->if_broot = (xfs_bmbt_block_t *) 2252 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2354 kmem_realloc(ifp->if_broot,
2355 new_size,
2356 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2253 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2357 KM_SLEEP); 2254 KM_SLEEP);
2358 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2255 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2359 ifp->if_broot_bytes); 2256 ifp->if_broot_bytes);
2360 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2257 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2361 (int)new_size); 2258 (int)new_size);
2362 ifp->if_broot_bytes = (int)new_size; 2259 ifp->if_broot_bytes = (int)new_size;
2363 ASSERT(ifp->if_broot_bytes <= 2260 ASSERT(ifp->if_broot_bytes <=
2364 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2261 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
@@ -2372,7 +2269,7 @@ xfs_iroot_realloc(
2372 * records, just get rid of the root and clear the status bit. 2269 * records, just get rid of the root and clear the status bit.
2373 */ 2270 */
2374 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 2271 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2375 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2272 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2376 new_max = cur_max + rec_diff; 2273 new_max = cur_max + rec_diff;
2377 ASSERT(new_max >= 0); 2274 ASSERT(new_max >= 0);
2378 if (new_max > 0) 2275 if (new_max > 0)
@@ -2380,11 +2277,11 @@ xfs_iroot_realloc(
2380 else 2277 else
2381 new_size = 0; 2278 new_size = 0;
2382 if (new_size > 0) { 2279 if (new_size > 0) {
2383 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); 2280 new_broot = kmem_alloc(new_size, KM_SLEEP);
2384 /* 2281 /*
2385 * First copy over the btree block header. 2282 * First copy over the btree block header.
2386 */ 2283 */
2387 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); 2284 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
2388 } else { 2285 } else {
2389 new_broot = NULL; 2286 new_broot = NULL;
2390 ifp->if_flags &= ~XFS_IFBROOT; 2287 ifp->if_flags &= ~XFS_IFBROOT;
@@ -2397,18 +2294,16 @@ xfs_iroot_realloc(
2397 /* 2294 /*
2398 * First copy the records. 2295 * First copy the records.
2399 */ 2296 */
2400 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, 2297 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
2401 ifp->if_broot_bytes); 2298 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
2402 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
2403 (int)new_size);
2404 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2299 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2405 2300
2406 /* 2301 /*
2407 * Then copy the pointers. 2302 * Then copy the pointers.
2408 */ 2303 */
2409 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2304 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2410 ifp->if_broot_bytes); 2305 ifp->if_broot_bytes);
2411 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, 2306 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2412 (int)new_size); 2307 (int)new_size);
2413 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2308 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2414 } 2309 }
@@ -2511,64 +2406,6 @@ xfs_idata_realloc(
2511 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2406 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2512} 2407}
2513 2408
2514
2515
2516
2517/*
2518 * Map inode to disk block and offset.
2519 *
2520 * mp -- the mount point structure for the current file system
2521 * tp -- the current transaction
2522 * ino -- the inode number of the inode to be located
2523 * imap -- this structure is filled in with the information necessary
2524 * to retrieve the given inode from disk
2525 * flags -- flags to pass to xfs_dilocate indicating whether or not
2526 * lookups in the inode btree were OK or not
2527 */
2528int
2529xfs_imap(
2530 xfs_mount_t *mp,
2531 xfs_trans_t *tp,
2532 xfs_ino_t ino,
2533 xfs_imap_t *imap,
2534 uint flags)
2535{
2536 xfs_fsblock_t fsbno;
2537 int len;
2538 int off;
2539 int error;
2540
2541 fsbno = imap->im_blkno ?
2542 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2543 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2544 if (error)
2545 return error;
2546
2547 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2548 imap->im_len = XFS_FSB_TO_BB(mp, len);
2549 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2550 imap->im_ioffset = (ushort)off;
2551 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2552
2553 /*
2554 * If the inode number maps to a block outside the bounds
2555 * of the file system then return NULL rather than calling
2556 * read_buf and panicing when we get an error from the
2557 * driver.
2558 */
2559 if ((imap->im_blkno + imap->im_len) >
2560 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2561 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
2562 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
2563 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
2564 (unsigned long long) imap->im_blkno,
2565 (unsigned long long) imap->im_len,
2566 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2567 return EINVAL;
2568 }
2569 return 0;
2570}
2571
2572void 2409void
2573xfs_idestroy_fork( 2410xfs_idestroy_fork(
2574 xfs_inode_t *ip, 2411 xfs_inode_t *ip,
@@ -2613,70 +2450,6 @@ xfs_idestroy_fork(
2613} 2450}
2614 2451
2615/* 2452/*
2616 * This is called free all the memory associated with an inode.
2617 * It must free the inode itself and any buffers allocated for
2618 * if_extents/if_data and if_broot. It must also free the lock
2619 * associated with the inode.
2620 */
2621void
2622xfs_idestroy(
2623 xfs_inode_t *ip)
2624{
2625 switch (ip->i_d.di_mode & S_IFMT) {
2626 case S_IFREG:
2627 case S_IFDIR:
2628 case S_IFLNK:
2629 xfs_idestroy_fork(ip, XFS_DATA_FORK);
2630 break;
2631 }
2632 if (ip->i_afp)
2633 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2634 mrfree(&ip->i_lock);
2635 mrfree(&ip->i_iolock);
2636
2637#ifdef XFS_INODE_TRACE
2638 ktrace_free(ip->i_trace);
2639#endif
2640#ifdef XFS_BMAP_TRACE
2641 ktrace_free(ip->i_xtrace);
2642#endif
2643#ifdef XFS_BMBT_TRACE
2644 ktrace_free(ip->i_btrace);
2645#endif
2646#ifdef XFS_RW_TRACE
2647 ktrace_free(ip->i_rwtrace);
2648#endif
2649#ifdef XFS_ILOCK_TRACE
2650 ktrace_free(ip->i_lock_trace);
2651#endif
2652#ifdef XFS_DIR2_TRACE
2653 ktrace_free(ip->i_dir_trace);
2654#endif
2655 if (ip->i_itemp) {
2656 /*
2657 * Only if we are shutting down the fs will we see an
2658 * inode still in the AIL. If it is there, we should remove
2659 * it to prevent a use-after-free from occurring.
2660 */
2661 xfs_mount_t *mp = ip->i_mount;
2662 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
2663
2664 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
2665 XFS_FORCED_SHUTDOWN(ip->i_mount));
2666 if (lip->li_flags & XFS_LI_IN_AIL) {
2667 spin_lock(&mp->m_ail_lock);
2668 if (lip->li_flags & XFS_LI_IN_AIL)
2669 xfs_trans_delete_ail(mp, lip);
2670 else
2671 spin_unlock(&mp->m_ail_lock);
2672 }
2673 xfs_inode_item_destroy(ip);
2674 }
2675 kmem_zone_free(xfs_inode_zone, ip);
2676}
2677
2678
2679/*
2680 * Increment the pin count of the given buffer. 2453 * Increment the pin count of the given buffer.
2681 * This value is protected by ipinlock spinlock in the mount structure. 2454 * This value is protected by ipinlock spinlock in the mount structure.
2682 */ 2455 */
@@ -2880,7 +2653,7 @@ xfs_iflush_fork(
2880 ASSERT(ifp->if_broot_bytes <= 2653 ASSERT(ifp->if_broot_bytes <=
2881 (XFS_IFORK_SIZE(ip, whichfork) + 2654 (XFS_IFORK_SIZE(ip, whichfork) +
2882 XFS_BROOT_SIZE_ADJ)); 2655 XFS_BROOT_SIZE_ADJ));
2883 xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, 2656 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2884 (xfs_bmdr_block_t *)cp, 2657 (xfs_bmdr_block_t *)cp,
2885 XFS_DFORK_SIZE(dip, mp, whichfork)); 2658 XFS_DFORK_SIZE(dip, mp, whichfork));
2886 } 2659 }
@@ -2889,15 +2662,16 @@ xfs_iflush_fork(
2889 case XFS_DINODE_FMT_DEV: 2662 case XFS_DINODE_FMT_DEV:
2890 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2663 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
2891 ASSERT(whichfork == XFS_DATA_FORK); 2664 ASSERT(whichfork == XFS_DATA_FORK);
2892 dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev); 2665 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2893 } 2666 }
2894 break; 2667 break;
2895 2668
2896 case XFS_DINODE_FMT_UUID: 2669 case XFS_DINODE_FMT_UUID:
2897 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2670 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
2898 ASSERT(whichfork == XFS_DATA_FORK); 2671 ASSERT(whichfork == XFS_DATA_FORK);
2899 memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, 2672 memcpy(XFS_DFORK_DPTR(dip),
2900 sizeof(uuid_t)); 2673 &ip->i_df.if_u2.if_uuid,
2674 sizeof(uuid_t));
2901 } 2675 }
2902 break; 2676 break;
2903 2677
@@ -3030,7 +2804,6 @@ cluster_corrupt_out:
3030 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 2804 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3031 XFS_BUF_UNDONE(bp); 2805 XFS_BUF_UNDONE(bp);
3032 XFS_BUF_STALE(bp); 2806 XFS_BUF_STALE(bp);
3033 XFS_BUF_SHUT(bp);
3034 XFS_BUF_ERROR(bp,EIO); 2807 XFS_BUF_ERROR(bp,EIO);
3035 xfs_biodone(bp); 2808 xfs_biodone(bp);
3036 } else { 2809 } else {
@@ -3172,7 +2945,7 @@ xfs_iflush(
3172 /* 2945 /*
3173 * Get the buffer containing the on-disk inode. 2946 * Get the buffer containing the on-disk inode.
3174 */ 2947 */
3175 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, 2948 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
3176 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 2949 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
3177 if (error || !bp) { 2950 if (error || !bp) {
3178 xfs_ifunlock(ip); 2951 xfs_ifunlock(ip);
@@ -3253,7 +3026,7 @@ xfs_iflush_int(
3253 } 3026 }
3254 3027
3255 /* set *dip = inode's place in the buffer */ 3028 /* set *dip = inode's place in the buffer */
3256 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); 3029 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3257 3030
3258 /* 3031 /*
3259 * Clear i_update_core before copying out the data. 3032 * Clear i_update_core before copying out the data.
@@ -3275,11 +3048,11 @@ xfs_iflush_int(
3275 */ 3048 */
3276 xfs_synchronize_atime(ip); 3049 xfs_synchronize_atime(ip);
3277 3050
3278 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC, 3051 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
3279 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3052 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3280 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3053 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3281 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 3054 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3282 ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip); 3055 ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3283 goto corrupt_out; 3056 goto corrupt_out;
3284 } 3057 }
3285 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 3058 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
@@ -3342,7 +3115,7 @@ xfs_iflush_int(
3342 * because if the inode is dirty at all the core must 3115 * because if the inode is dirty at all the core must
3343 * be. 3116 * be.
3344 */ 3117 */
3345 xfs_dinode_to_disk(&dip->di_core, &ip->i_d); 3118 xfs_dinode_to_disk(dip, &ip->i_d);
3346 3119
3347 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3120 /* Wrap, we never let the log put out DI_MAX_FLUSH */
3348 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 3121 if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3354,28 +3127,27 @@ xfs_iflush_int(
3354 * convert back to the old inode format. If the superblock version 3127 * convert back to the old inode format. If the superblock version
3355 * has been updated, then make the conversion permanent. 3128 * has been updated, then make the conversion permanent.
3356 */ 3129 */
3357 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 3130 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
3358 xfs_sb_version_hasnlink(&mp->m_sb)); 3131 if (ip->i_d.di_version == 1) {
3359 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
3360 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 3132 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
3361 /* 3133 /*
3362 * Convert it back. 3134 * Convert it back.
3363 */ 3135 */
3364 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3136 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3365 dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink); 3137 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
3366 } else { 3138 } else {
3367 /* 3139 /*
3368 * The superblock version has already been bumped, 3140 * The superblock version has already been bumped,
3369 * so just make the conversion to the new inode 3141 * so just make the conversion to the new inode
3370 * format permanent. 3142 * format permanent.
3371 */ 3143 */
3372 ip->i_d.di_version = XFS_DINODE_VERSION_2; 3144 ip->i_d.di_version = 2;
3373 dip->di_core.di_version = XFS_DINODE_VERSION_2; 3145 dip->di_version = 2;
3374 ip->i_d.di_onlink = 0; 3146 ip->i_d.di_onlink = 0;
3375 dip->di_core.di_onlink = 0; 3147 dip->di_onlink = 0;
3376 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3148 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3377 memset(&(dip->di_core.di_pad[0]), 0, 3149 memset(&(dip->di_pad[0]), 0,
3378 sizeof(dip->di_core.di_pad)); 3150 sizeof(dip->di_pad));
3379 ASSERT(ip->i_d.di_projid == 0); 3151 ASSERT(ip->i_d.di_projid == 0);
3380 } 3152 }
3381 } 3153 }
@@ -3418,10 +3190,8 @@ xfs_iflush_int(
3418 iip->ili_format.ilf_fields = 0; 3190 iip->ili_format.ilf_fields = 0;
3419 iip->ili_logged = 1; 3191 iip->ili_logged = 1;
3420 3192
3421 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ 3193 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3422 spin_lock(&mp->m_ail_lock); 3194 &iip->ili_item.li_lsn);
3423 iip->ili_flush_lsn = iip->ili_item.li_lsn;
3424 spin_unlock(&mp->m_ail_lock);
3425 3195
3426 /* 3196 /*
3427 * Attach the function xfs_iflush_done to the inode's 3197 * Attach the function xfs_iflush_done to the inode's
@@ -3459,45 +3229,8 @@ corrupt_out:
3459} 3229}
3460 3230
3461 3231
3462/*
3463 * Flush all inactive inodes in mp.
3464 */
3465void
3466xfs_iflush_all(
3467 xfs_mount_t *mp)
3468{
3469 xfs_inode_t *ip;
3470
3471 again:
3472 XFS_MOUNT_ILOCK(mp);
3473 ip = mp->m_inodes;
3474 if (ip == NULL)
3475 goto out;
3476
3477 do {
3478 /* Make sure we skip markers inserted by sync */
3479 if (ip->i_mount == NULL) {
3480 ip = ip->i_mnext;
3481 continue;
3482 }
3483
3484 if (!VFS_I(ip)) {
3485 XFS_MOUNT_IUNLOCK(mp);
3486 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3487 goto again;
3488 }
3489
3490 ASSERT(vn_count(VFS_I(ip)) == 0);
3491
3492 ip = ip->i_mnext;
3493 } while (ip != mp->m_inodes);
3494 out:
3495 XFS_MOUNT_IUNLOCK(mp);
3496}
3497 3232
3498#ifdef XFS_ILOCK_TRACE 3233#ifdef XFS_ILOCK_TRACE
3499ktrace_t *xfs_ilock_trace_buf;
3500
3501void 3234void
3502xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) 3235xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
3503{ 3236{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6be310d41daf..1f175fa34b22 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -19,8 +19,7 @@
19#define __XFS_INODE_H__ 19#define __XFS_INODE_H__
20 20
21struct xfs_dinode; 21struct xfs_dinode;
22struct xfs_dinode_core; 22struct xfs_inode;
23
24 23
25/* 24/*
26 * Fork identifiers. 25 * Fork identifiers.
@@ -63,7 +62,7 @@ typedef struct xfs_ext_irec {
63typedef struct xfs_ifork { 62typedef struct xfs_ifork {
64 int if_bytes; /* bytes in if_u1 */ 63 int if_bytes; /* bytes in if_u1 */
65 int if_real_bytes; /* bytes allocated in if_u1 */ 64 int if_real_bytes; /* bytes allocated in if_u1 */
66 xfs_bmbt_block_t *if_broot; /* file's incore btree root */ 65 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */ 66 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 67 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */ 68 unsigned char if_ext_max; /* max # of extent records */
@@ -84,52 +83,14 @@ typedef struct xfs_ifork {
84} xfs_ifork_t; 83} xfs_ifork_t;
85 84
86/* 85/*
87 * Flags for xfs_ichgtime(). 86 * Inode location information. Stored in the inode and passed to
87 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
88 */ 88 */
89#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ 89struct xfs_imap {
90#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ 90 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
91 91 ushort im_len; /* length in BBs of inode chunk */
92/* 92 ushort im_boffset; /* inode offset in block in bytes */
93 * Per-fork incore inode flags. 93};
94 */
95#define XFS_IFINLINE 0x01 /* Inline data is read in */
96#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
97#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
98#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
99
100/*
101 * Flags for xfs_itobp(), xfs_imap() and xfs_dilocate().
102 */
103#define XFS_IMAP_LOOKUP 0x1
104#define XFS_IMAP_BULKSTAT 0x2
105
106#ifdef __KERNEL__
107struct bhv_desc;
108struct cred;
109struct ktrace;
110struct xfs_buf;
111struct xfs_bmap_free;
112struct xfs_bmbt_irec;
113struct xfs_bmbt_block;
114struct xfs_inode;
115struct xfs_inode_log_item;
116struct xfs_mount;
117struct xfs_trans;
118struct xfs_dquot;
119
120#if defined(XFS_ILOCK_TRACE)
121#define XFS_ILOCK_KTRACE_SIZE 32
122extern ktrace_t *xfs_ilock_trace_buf;
123extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
124#else
125#define xfs_ilock_trace(i,n,f,ra)
126#endif
127
128typedef struct dm_attrs_s {
129 __uint32_t da_dmevmask; /* DMIG event mask */
130 __uint16_t da_dmstate; /* DMIG state info */
131 __uint16_t da_pad; /* DMIG extra padding */
132} dm_attrs_t;
133 94
134/* 95/*
135 * This is the xfs in-core inode structure. 96 * This is the xfs in-core inode structure.
@@ -160,7 +121,7 @@ typedef struct xfs_ictimestamp {
160} xfs_ictimestamp_t; 121} xfs_ictimestamp_t;
161 122
162/* 123/*
163 * NOTE: This structure must be kept identical to struct xfs_dinode_core 124 * NOTE: This structure must be kept identical to struct xfs_dinode
164 * in xfs_dinode.h except for the endianess annotations. 125 * in xfs_dinode.h except for the endianess annotations.
165 */ 126 */
166typedef struct xfs_icdinode { 127typedef struct xfs_icdinode {
@@ -191,27 +152,97 @@ typedef struct xfs_icdinode {
191 __uint32_t di_gen; /* generation number */ 152 __uint32_t di_gen; /* generation number */
192} xfs_icdinode_t; 153} xfs_icdinode_t;
193 154
194typedef struct { 155/*
195 struct xfs_inode *ip_mnext; /* next inode in mount list */ 156 * Flags for xfs_ichgtime().
196 struct xfs_inode *ip_mprev; /* ptr to prev inode */ 157 */
197 struct xfs_mount *ip_mount; /* fs mount struct ptr */ 158#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
198} xfs_iptr_t; 159#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
160
161/*
162 * Per-fork incore inode flags.
163 */
164#define XFS_IFINLINE 0x01 /* Inline data is read in */
165#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
166#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
167#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
168
169/*
170 * Fork handling.
171 */
172
173#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
174#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
175
176#define XFS_IFORK_PTR(ip,w) \
177 ((w) == XFS_DATA_FORK ? \
178 &(ip)->i_df : \
179 (ip)->i_afp)
180#define XFS_IFORK_DSIZE(ip) \
181 (XFS_IFORK_Q(ip) ? \
182 XFS_IFORK_BOFF(ip) : \
183 XFS_LITINO((ip)->i_mount))
184#define XFS_IFORK_ASIZE(ip) \
185 (XFS_IFORK_Q(ip) ? \
186 XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
187 0)
188#define XFS_IFORK_SIZE(ip,w) \
189 ((w) == XFS_DATA_FORK ? \
190 XFS_IFORK_DSIZE(ip) : \
191 XFS_IFORK_ASIZE(ip))
192#define XFS_IFORK_FORMAT(ip,w) \
193 ((w) == XFS_DATA_FORK ? \
194 (ip)->i_d.di_format : \
195 (ip)->i_d.di_aformat)
196#define XFS_IFORK_FMT_SET(ip,w,n) \
197 ((w) == XFS_DATA_FORK ? \
198 ((ip)->i_d.di_format = (n)) : \
199 ((ip)->i_d.di_aformat = (n)))
200#define XFS_IFORK_NEXTENTS(ip,w) \
201 ((w) == XFS_DATA_FORK ? \
202 (ip)->i_d.di_nextents : \
203 (ip)->i_d.di_anextents)
204#define XFS_IFORK_NEXT_SET(ip,w,n) \
205 ((w) == XFS_DATA_FORK ? \
206 ((ip)->i_d.di_nextents = (n)) : \
207 ((ip)->i_d.di_anextents = (n)))
208
209
210
211#ifdef __KERNEL__
212
213struct bhv_desc;
214struct cred;
215struct ktrace;
216struct xfs_buf;
217struct xfs_bmap_free;
218struct xfs_bmbt_irec;
219struct xfs_inode_log_item;
220struct xfs_mount;
221struct xfs_trans;
222struct xfs_dquot;
223
224#if defined(XFS_ILOCK_TRACE)
225#define XFS_ILOCK_KTRACE_SIZE 32
226extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
227#else
228#define xfs_ilock_trace(i,n,f,ra)
229#endif
230
231typedef struct dm_attrs_s {
232 __uint32_t da_dmevmask; /* DMIG event mask */
233 __uint16_t da_dmstate; /* DMIG state info */
234 __uint16_t da_pad; /* DMIG extra padding */
235} dm_attrs_t;
199 236
200typedef struct xfs_inode { 237typedef struct xfs_inode {
201 /* Inode linking and identification information. */ 238 /* Inode linking and identification information. */
202 struct xfs_inode *i_mnext; /* next inode in mount list */
203 struct xfs_inode *i_mprev; /* ptr to prev inode */
204 struct xfs_mount *i_mount; /* fs mount struct ptr */ 239 struct xfs_mount *i_mount; /* fs mount struct ptr */
205 struct list_head i_reclaim; /* reclaim list */
206 struct inode *i_vnode; /* vnode backpointer */
207 struct xfs_dquot *i_udquot; /* user dquot */ 240 struct xfs_dquot *i_udquot; /* user dquot */
208 struct xfs_dquot *i_gdquot; /* group dquot */ 241 struct xfs_dquot *i_gdquot; /* group dquot */
209 242
210 /* Inode location stuff */ 243 /* Inode location stuff */
211 xfs_ino_t i_ino; /* inode number (agno/agino)*/ 244 xfs_ino_t i_ino; /* inode number (agno/agino)*/
212 xfs_daddr_t i_blkno; /* blkno of inode buffer */ 245 struct xfs_imap i_imap; /* location for xfs_imap() */
213 ushort i_len; /* len of inode buffer */
214 ushort i_boffset; /* off of inode in buffer */
215 246
216 /* Extent information. */ 247 /* Extent information. */
217 xfs_ifork_t *i_afp; /* attribute fork pointer */ 248 xfs_ifork_t *i_afp; /* attribute fork pointer */
@@ -230,7 +261,6 @@ typedef struct xfs_inode {
230 unsigned short i_flags; /* see defined flags below */ 261 unsigned short i_flags; /* see defined flags below */
231 unsigned char i_update_core; /* timestamps/size is dirty */ 262 unsigned char i_update_core; /* timestamps/size is dirty */
232 unsigned char i_update_size; /* di_size field is dirty */ 263 unsigned char i_update_size; /* di_size field is dirty */
233 unsigned int i_gen; /* generation count */
234 unsigned int i_delayed_blks; /* count of delay alloc blks */ 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
235 265
236 xfs_icdinode_t i_d; /* most of ondisk inode */ 266 xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -238,6 +268,10 @@ typedef struct xfs_inode {
238 xfs_fsize_t i_size; /* in-memory size */ 268 xfs_fsize_t i_size; /* in-memory size */
239 xfs_fsize_t i_new_size; /* size when write completes */ 269 xfs_fsize_t i_new_size; /* size when write completes */
240 atomic_t i_iocount; /* outstanding I/O count */ 270 atomic_t i_iocount; /* outstanding I/O count */
271
272 /* VFS inode */
273 struct inode i_vnode; /* embedded VFS inode */
274
241 /* Trace buffers per inode. */ 275 /* Trace buffers per inode. */
242#ifdef XFS_INODE_TRACE 276#ifdef XFS_INODE_TRACE
243 struct ktrace *i_trace; /* general inode trace */ 277 struct ktrace *i_trace; /* general inode trace */
@@ -245,7 +279,7 @@ typedef struct xfs_inode {
245#ifdef XFS_BMAP_TRACE 279#ifdef XFS_BMAP_TRACE
246 struct ktrace *i_xtrace; /* inode extent list trace */ 280 struct ktrace *i_xtrace; /* inode extent list trace */
247#endif 281#endif
248#ifdef XFS_BMBT_TRACE 282#ifdef XFS_BTREE_TRACE
249 struct ktrace *i_btrace; /* inode bmap btree trace */ 283 struct ktrace *i_btrace; /* inode bmap btree trace */
250#endif 284#endif
251#ifdef XFS_RW_TRACE 285#ifdef XFS_RW_TRACE
@@ -265,13 +299,30 @@ typedef struct xfs_inode {
265/* Convert from vfs inode to xfs inode */ 299/* Convert from vfs inode to xfs inode */
266static inline struct xfs_inode *XFS_I(struct inode *inode) 300static inline struct xfs_inode *XFS_I(struct inode *inode)
267{ 301{
268 return (struct xfs_inode *)inode->i_private; 302 return container_of(inode, struct xfs_inode, i_vnode);
269} 303}
270 304
271/* convert from xfs inode to vfs inode */ 305/* convert from xfs inode to vfs inode */
272static inline struct inode *VFS_I(struct xfs_inode *ip) 306static inline struct inode *VFS_I(struct xfs_inode *ip)
273{ 307{
274 return (struct inode *)ip->i_vnode; 308 return &ip->i_vnode;
309}
310
311/*
312 * Get rid of a partially initialized inode.
313 *
314 * We have to go through destroy_inode to make sure allocations
315 * from init_inode_always like the security data are undone.
316 *
317 * We mark the inode bad so that it takes the short cut in
318 * the reclaim path instead of going through the flush path
319 * which doesn't make sense for an inode that has never seen the
320 * light of day.
321 */
322static inline void xfs_destroy_inode(struct xfs_inode *ip)
323{
324 make_bad_inode(VFS_I(ip));
325 return destroy_inode(VFS_I(ip));
275} 326}
276 327
277/* 328/*
@@ -327,65 +378,36 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
327 spin_unlock(&ip->i_flags_lock); 378 spin_unlock(&ip->i_flags_lock);
328 return ret; 379 return ret;
329} 380}
330#endif /* __KERNEL__ */
331
332 381
333/* 382/*
334 * Fork handling. 383 * Manage the i_flush queue embedded in the inode. This completion
384 * queue synchronizes processes attempting to flush the in-core
385 * inode back to disk.
335 */ 386 */
387static inline void xfs_iflock(xfs_inode_t *ip)
388{
389 wait_for_completion(&ip->i_flush);
390}
336 391
337#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) 392static inline int xfs_iflock_nowait(xfs_inode_t *ip)
338#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) 393{
339 394 return try_wait_for_completion(&ip->i_flush);
340#define XFS_IFORK_PTR(ip,w) \ 395}
341 ((w) == XFS_DATA_FORK ? \
342 &(ip)->i_df : \
343 (ip)->i_afp)
344#define XFS_IFORK_DSIZE(ip) \
345 (XFS_IFORK_Q(ip) ? \
346 XFS_IFORK_BOFF(ip) : \
347 XFS_LITINO((ip)->i_mount))
348#define XFS_IFORK_ASIZE(ip) \
349 (XFS_IFORK_Q(ip) ? \
350 XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
351 0)
352#define XFS_IFORK_SIZE(ip,w) \
353 ((w) == XFS_DATA_FORK ? \
354 XFS_IFORK_DSIZE(ip) : \
355 XFS_IFORK_ASIZE(ip))
356#define XFS_IFORK_FORMAT(ip,w) \
357 ((w) == XFS_DATA_FORK ? \
358 (ip)->i_d.di_format : \
359 (ip)->i_d.di_aformat)
360#define XFS_IFORK_FMT_SET(ip,w,n) \
361 ((w) == XFS_DATA_FORK ? \
362 ((ip)->i_d.di_format = (n)) : \
363 ((ip)->i_d.di_aformat = (n)))
364#define XFS_IFORK_NEXTENTS(ip,w) \
365 ((w) == XFS_DATA_FORK ? \
366 (ip)->i_d.di_nextents : \
367 (ip)->i_d.di_anextents)
368#define XFS_IFORK_NEXT_SET(ip,w,n) \
369 ((w) == XFS_DATA_FORK ? \
370 ((ip)->i_d.di_nextents = (n)) : \
371 ((ip)->i_d.di_anextents = (n)))
372 396
373#ifdef __KERNEL__ 397static inline void xfs_ifunlock(xfs_inode_t *ip)
398{
399 complete(&ip->i_flush);
400}
374 401
375/* 402/*
376 * In-core inode flags. 403 * In-core inode flags.
377 */ 404 */
378#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */ 405#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */
379#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */ 406#define XFS_ISTALE 0x0002 /* inode has been staled */
380#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */ 407#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
381#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */ 408#define XFS_INEW 0x0008 /* inode has just been allocated */
382#define XFS_ISTALE 0x0010 /* inode has been staled */ 409#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
383#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ 410#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
384#define XFS_INEW 0x0040
385#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */
386#define XFS_IMODIFIED 0x0100 /* XFS inode state possibly differs */
387 /* to the Linux inode state. */
388#define XFS_ITRUNCATED 0x0200 /* truncated down so flush-on-close */
389 411
390/* 412/*
391 * Flags for inode locking. 413 * Flags for inode locking.
@@ -460,16 +482,8 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
460 ((pip)->i_d.di_mode & S_ISGID)) 482 ((pip)->i_d.di_mode & S_ISGID))
461 483
462/* 484/*
463 * Flags for xfs_iget()
464 */
465#define XFS_IGET_CREATE 0x1
466#define XFS_IGET_BULKSTAT 0x2
467
468/*
469 * xfs_iget.c prototypes. 485 * xfs_iget.c prototypes.
470 */ 486 */
471void xfs_ihash_init(struct xfs_mount *);
472void xfs_ihash_free(struct xfs_mount *);
473xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, 487xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
474 struct xfs_trans *); 488 struct xfs_trans *);
475int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 489int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
@@ -484,25 +498,13 @@ int xfs_isilocked(xfs_inode_t *, uint);
484uint xfs_ilock_map_shared(xfs_inode_t *); 498uint xfs_ilock_map_shared(xfs_inode_t *);
485void xfs_iunlock_map_shared(xfs_inode_t *, uint); 499void xfs_iunlock_map_shared(xfs_inode_t *, uint);
486void xfs_ireclaim(xfs_inode_t *); 500void xfs_ireclaim(xfs_inode_t *);
487int xfs_finish_reclaim(xfs_inode_t *, int, int);
488int xfs_finish_reclaim_all(struct xfs_mount *, int);
489 501
490/* 502/*
491 * xfs_inode.c prototypes. 503 * xfs_inode.c prototypes.
492 */ 504 */
493int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
494 xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
495 xfs_daddr_t, uint, uint);
496int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
497 xfs_inode_t **, xfs_daddr_t, uint);
498int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
499int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 505int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
500 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, 506 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
501 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 507 int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
502void xfs_dinode_from_disk(struct xfs_icdinode *,
503 struct xfs_dinode_core *);
504void xfs_dinode_to_disk(struct xfs_dinode_core *,
505 struct xfs_icdinode *);
506 508
507uint xfs_ip2xflags(struct xfs_inode *); 509uint xfs_ip2xflags(struct xfs_inode *);
508uint xfs_dic2xflags(struct xfs_dinode *); 510uint xfs_dic2xflags(struct xfs_dinode *);
@@ -513,17 +515,10 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
513 xfs_fsize_t, int, int); 515 xfs_fsize_t, int, int);
514int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 516int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
515 517
516void xfs_idestroy_fork(xfs_inode_t *, int);
517void xfs_idestroy(xfs_inode_t *);
518void xfs_idata_realloc(xfs_inode_t *, int, int);
519void xfs_iextract(xfs_inode_t *);
520void xfs_iext_realloc(xfs_inode_t *, int, int); 518void xfs_iext_realloc(xfs_inode_t *, int, int);
521void xfs_iroot_realloc(xfs_inode_t *, int, int);
522void xfs_ipin(xfs_inode_t *); 519void xfs_ipin(xfs_inode_t *);
523void xfs_iunpin(xfs_inode_t *); 520void xfs_iunpin(xfs_inode_t *);
524int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int);
525int xfs_iflush(xfs_inode_t *, uint); 521int xfs_iflush(xfs_inode_t *, uint);
526void xfs_iflush_all(struct xfs_mount *);
527void xfs_ichgtime(xfs_inode_t *, int); 522void xfs_ichgtime(xfs_inode_t *, int);
528xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); 523xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
529void xfs_lock_inodes(xfs_inode_t **, int, uint); 524void xfs_lock_inodes(xfs_inode_t **, int, uint);
@@ -532,6 +527,77 @@ void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
532void xfs_synchronize_atime(xfs_inode_t *); 527void xfs_synchronize_atime(xfs_inode_t *);
533void xfs_mark_inode_dirty_sync(xfs_inode_t *); 528void xfs_mark_inode_dirty_sync(xfs_inode_t *);
534 529
530#if defined(XFS_INODE_TRACE)
531
532#define INODE_TRACE_SIZE 16 /* number of trace entries */
533#define INODE_KTRACE_ENTRY 1
534#define INODE_KTRACE_EXIT 2
535#define INODE_KTRACE_HOLD 3
536#define INODE_KTRACE_REF 4
537#define INODE_KTRACE_RELE 5
538
539extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
540extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
541extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
542extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
543extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
544#define xfs_itrace_entry(ip) \
545 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
546#define xfs_itrace_exit(ip) \
547 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
548#define xfs_itrace_exit_tag(ip, tag) \
549 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
550#define xfs_itrace_ref(ip) \
551 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
552
553#else
554#define xfs_itrace_entry(a)
555#define xfs_itrace_exit(a)
556#define xfs_itrace_exit_tag(a, b)
557#define xfs_itrace_hold(a, b, c, d)
558#define xfs_itrace_ref(a)
559#define xfs_itrace_rele(a, b, c, d)
560#endif
561
562#define IHOLD(ip) \
563do { \
564 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
565 atomic_inc(&(VFS_I(ip)->i_count)); \
566 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
567} while (0)
568
569#define IRELE(ip) \
570do { \
571 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
572 iput(VFS_I(ip)); \
573} while (0)
574
575#endif /* __KERNEL__ */
576
577/*
578 * Flags for xfs_iget()
579 */
580#define XFS_IGET_CREATE 0x1
581#define XFS_IGET_BULKSTAT 0x2
582
583int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
584 xfs_ino_t, struct xfs_dinode **,
585 struct xfs_buf **, int *, uint);
586int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
587 struct xfs_inode *, struct xfs_dinode **,
588 struct xfs_buf **, uint);
589int xfs_iread(struct xfs_mount *, struct xfs_trans *,
590 struct xfs_inode *, xfs_daddr_t, uint);
591void xfs_dinode_from_disk(struct xfs_icdinode *,
592 struct xfs_dinode *);
593void xfs_dinode_to_disk(struct xfs_dinode *,
594 struct xfs_icdinode *);
595void xfs_idestroy_fork(struct xfs_inode *, int);
596void xfs_idata_realloc(struct xfs_inode *, int, int);
597void xfs_iroot_realloc(struct xfs_inode *, int, int);
598int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
599int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
600
535xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); 601xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
536void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, 602void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t,
537 xfs_bmbt_irec_t *); 603 xfs_bmbt_irec_t *);
@@ -561,7 +627,8 @@ void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
561#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 627#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
562 628
563#ifdef DEBUG 629#ifdef DEBUG
564void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t); 630void xfs_isize_check(struct xfs_mount *, struct xfs_inode *,
631 xfs_fsize_t);
565#else /* DEBUG */ 632#else /* DEBUG */
566#define xfs_isize_check(mp, ip, isize) 633#define xfs_isize_check(mp, ip, isize)
567#endif /* DEBUG */ 634#endif /* DEBUG */
@@ -576,26 +643,4 @@ extern struct kmem_zone *xfs_ifork_zone;
576extern struct kmem_zone *xfs_inode_zone; 643extern struct kmem_zone *xfs_inode_zone;
577extern struct kmem_zone *xfs_ili_zone; 644extern struct kmem_zone *xfs_ili_zone;
578 645
579/*
580 * Manage the i_flush queue embedded in the inode. This completion
581 * queue synchronizes processes attempting to flush the in-core
582 * inode back to disk.
583 */
584static inline void xfs_iflock(xfs_inode_t *ip)
585{
586 wait_for_completion(&ip->i_flush);
587}
588
589static inline int xfs_iflock_nowait(xfs_inode_t *ip)
590{
591 return try_wait_for_completion(&ip->i_flush);
592}
593
594static inline void xfs_ifunlock(xfs_inode_t *ip)
595{
596 complete(&ip->i_flush);
597}
598
599#endif /* __KERNEL__ */
600
601#endif /* __XFS_INODE_H__ */ 646#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 97c7452e2620..977c4aec587e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -281,7 +281,7 @@ xfs_inode_item_format(
281 xfs_mark_inode_dirty_sync(ip); 281 xfs_mark_inode_dirty_sync(ip);
282 282
283 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 283 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
284 vecp->i_len = sizeof(xfs_dinode_core_t); 284 vecp->i_len = sizeof(struct xfs_icdinode);
285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
286 vecp++; 286 vecp++;
287 nvecs++; 287 nvecs++;
@@ -296,9 +296,8 @@ xfs_inode_item_format(
296 * has a new version number, then we don't bother converting back. 296 * has a new version number, then we don't bother converting back.
297 */ 297 */
298 mp = ip->i_mount; 298 mp = ip->i_mount;
299 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 299 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
300 xfs_sb_version_hasnlink(&mp->m_sb)); 300 if (ip->i_d.di_version == 1) {
301 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
302 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 301 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
303 /* 302 /*
304 * Convert it back. 303 * Convert it back.
@@ -311,7 +310,7 @@ xfs_inode_item_format(
311 * so just make the conversion to the new inode 310 * so just make the conversion to the new inode
312 * format permanent. 311 * format permanent.
313 */ 312 */
314 ip->i_d.di_version = XFS_DINODE_VERSION_2; 313 ip->i_d.di_version = 2;
315 ip->i_d.di_onlink = 0; 314 ip->i_d.di_onlink = 0;
316 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 315 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
317 } 316 }
@@ -932,6 +931,7 @@ xfs_inode_item_init(
932 iip->ili_item.li_type = XFS_LI_INODE; 931 iip->ili_item.li_type = XFS_LI_INODE;
933 iip->ili_item.li_ops = &xfs_inode_item_ops; 932 iip->ili_item.li_ops = &xfs_inode_item_ops;
934 iip->ili_item.li_mountp = mp; 933 iip->ili_item.li_mountp = mp;
934 iip->ili_item.li_ailp = mp->m_ail;
935 iip->ili_inode = ip; 935 iip->ili_inode = ip;
936 936
937 /* 937 /*
@@ -942,9 +942,9 @@ xfs_inode_item_init(
942 942
943 iip->ili_format.ilf_type = XFS_LI_INODE; 943 iip->ili_format.ilf_type = XFS_LI_INODE;
944 iip->ili_format.ilf_ino = ip->i_ino; 944 iip->ili_format.ilf_ino = ip->i_ino;
945 iip->ili_format.ilf_blkno = ip->i_blkno; 945 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
946 iip->ili_format.ilf_len = ip->i_len; 946 iip->ili_format.ilf_len = ip->i_imap.im_len;
947 iip->ili_format.ilf_boffset = ip->i_boffset; 947 iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
948} 948}
949 949
950/* 950/*
@@ -976,9 +976,8 @@ xfs_iflush_done(
976 xfs_buf_t *bp, 976 xfs_buf_t *bp,
977 xfs_inode_log_item_t *iip) 977 xfs_inode_log_item_t *iip)
978{ 978{
979 xfs_inode_t *ip; 979 xfs_inode_t *ip = iip->ili_inode;
980 980 struct xfs_ail *ailp = iip->ili_item.li_ailp;
981 ip = iip->ili_inode;
982 981
983 /* 982 /*
984 * We only want to pull the item from the AIL if it is 983 * We only want to pull the item from the AIL if it is
@@ -991,15 +990,12 @@ xfs_iflush_done(
991 */ 990 */
992 if (iip->ili_logged && 991 if (iip->ili_logged &&
993 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { 992 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
994 spin_lock(&ip->i_mount->m_ail_lock); 993 spin_lock(&ailp->xa_lock);
995 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { 994 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
996 /* 995 /* xfs_trans_ail_delete() drops the AIL lock. */
997 * xfs_trans_delete_ail() drops the AIL lock. 996 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
998 */
999 xfs_trans_delete_ail(ip->i_mount,
1000 (xfs_log_item_t*)iip);
1001 } else { 997 } else {
1002 spin_unlock(&ip->i_mount->m_ail_lock); 998 spin_unlock(&ailp->xa_lock);
1003 } 999 }
1004 } 1000 }
1005 1001
@@ -1031,21 +1027,20 @@ void
1031xfs_iflush_abort( 1027xfs_iflush_abort(
1032 xfs_inode_t *ip) 1028 xfs_inode_t *ip)
1033{ 1029{
1034 xfs_inode_log_item_t *iip; 1030 xfs_inode_log_item_t *iip = ip->i_itemp;
1035 xfs_mount_t *mp; 1031 xfs_mount_t *mp;
1036 1032
1037 iip = ip->i_itemp; 1033 iip = ip->i_itemp;
1038 mp = ip->i_mount; 1034 mp = ip->i_mount;
1039 if (iip) { 1035 if (iip) {
1036 struct xfs_ail *ailp = iip->ili_item.li_ailp;
1040 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1037 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1041 spin_lock(&mp->m_ail_lock); 1038 spin_lock(&ailp->xa_lock);
1042 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1039 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1043 /* 1040 /* xfs_trans_ail_delete() drops the AIL lock. */
1044 * xfs_trans_delete_ail() drops the AIL lock. 1041 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
1045 */
1046 xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip);
1047 } else 1042 } else
1048 spin_unlock(&mp->m_ail_lock); 1043 spin_unlock(&ailp->xa_lock);
1049 } 1044 }
1050 iip->ili_logged = 0; 1045 iip->ili_logged = 0;
1051 /* 1046 /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 40513077ab36..1ff04cc323ad 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -112,6 +112,24 @@ typedef struct xfs_inode_log_format_64 {
112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) 112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
113 113
114 114
115#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
116static inline int xfs_ilog_fbroot(int w)
117{
118 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
119}
120
121#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
122static inline int xfs_ilog_fext(int w)
123{
124 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
125}
126
127#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
128static inline int xfs_ilog_fdata(int w)
129{
130 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
131}
132
115#ifdef __KERNEL__ 133#ifdef __KERNEL__
116 134
117struct xfs_buf; 135struct xfs_buf;
@@ -148,26 +166,6 @@ typedef struct xfs_inode_log_item {
148} xfs_inode_log_item_t; 166} xfs_inode_log_item_t;
149 167
150 168
151#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
152static inline int xfs_ilog_fdata(int w)
153{
154 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
155}
156
157#endif /* __KERNEL__ */
158
159#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
160static inline int xfs_ilog_fbroot(int w)
161{
162 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
163}
164
165#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
166static inline int xfs_ilog_fext(int w)
167{
168 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
169}
170
171static inline int xfs_inode_clean(xfs_inode_t *ip) 169static inline int xfs_inode_clean(xfs_inode_t *ip)
172{ 170{
173 return (!ip->i_itemp || 171 return (!ip->i_itemp ||
@@ -175,9 +173,6 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
175 !ip->i_update_core; 173 !ip->i_update_core;
176} 174}
177 175
178
179#ifdef __KERNEL__
180
181extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); 176extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
182extern void xfs_inode_item_destroy(struct xfs_inode *); 177extern void xfs_inode_item_destroy(struct xfs_inode *);
183extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); 178extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67f22b2b44b3..911062cf73a6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -290,7 +290,6 @@ STATIC int
290xfs_iomap_eof_align_last_fsb( 290xfs_iomap_eof_align_last_fsb(
291 xfs_mount_t *mp, 291 xfs_mount_t *mp,
292 xfs_inode_t *ip, 292 xfs_inode_t *ip,
293 xfs_fsize_t isize,
294 xfs_extlen_t extsize, 293 xfs_extlen_t extsize,
295 xfs_fileoff_t *last_fsb) 294 xfs_fileoff_t *last_fsb)
296{ 295{
@@ -306,14 +305,14 @@ xfs_iomap_eof_align_last_fsb(
306 * stripe width and we are allocating past the allocation eof. 305 * stripe width and we are allocating past the allocation eof.
307 */ 306 */
308 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && 307 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
309 (isize >= XFS_FSB_TO_B(mp, mp->m_swidth))) 308 (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
310 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); 309 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
311 /* 310 /*
312 * Roundup the allocation request to a stripe unit (m_dalign) boundary 311 * Roundup the allocation request to a stripe unit (m_dalign) boundary
313 * if the file size is >= stripe unit size, and we are allocating past 312 * if the file size is >= stripe unit size, and we are allocating past
314 * the allocation eof. 313 * the allocation eof.
315 */ 314 */
316 else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign))) 315 else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
317 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); 316 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
318 317
319 /* 318 /*
@@ -403,7 +402,6 @@ xfs_iomap_write_direct(
403 xfs_filblks_t count_fsb, resaligned; 402 xfs_filblks_t count_fsb, resaligned;
404 xfs_fsblock_t firstfsb; 403 xfs_fsblock_t firstfsb;
405 xfs_extlen_t extsz, temp; 404 xfs_extlen_t extsz, temp;
406 xfs_fsize_t isize;
407 int nimaps; 405 int nimaps;
408 int bmapi_flag; 406 int bmapi_flag;
409 int quota_flag; 407 int quota_flag;
@@ -426,15 +424,10 @@ xfs_iomap_write_direct(
426 rt = XFS_IS_REALTIME_INODE(ip); 424 rt = XFS_IS_REALTIME_INODE(ip);
427 extsz = xfs_get_extsz_hint(ip); 425 extsz = xfs_get_extsz_hint(ip);
428 426
429 isize = ip->i_size;
430 if (ip->i_new_size > isize)
431 isize = ip->i_new_size;
432
433 offset_fsb = XFS_B_TO_FSBT(mp, offset); 427 offset_fsb = XFS_B_TO_FSBT(mp, offset);
434 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 428 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
435 if ((offset + count) > isize) { 429 if ((offset + count) > ip->i_size) {
436 error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, 430 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
437 &last_fsb);
438 if (error) 431 if (error)
439 goto error_out; 432 goto error_out;
440 } else { 433 } else {
@@ -559,7 +552,6 @@ STATIC int
559xfs_iomap_eof_want_preallocate( 552xfs_iomap_eof_want_preallocate(
560 xfs_mount_t *mp, 553 xfs_mount_t *mp,
561 xfs_inode_t *ip, 554 xfs_inode_t *ip,
562 xfs_fsize_t isize,
563 xfs_off_t offset, 555 xfs_off_t offset,
564 size_t count, 556 size_t count,
565 int ioflag, 557 int ioflag,
@@ -573,7 +565,7 @@ xfs_iomap_eof_want_preallocate(
573 int n, error, imaps; 565 int n, error, imaps;
574 566
575 *prealloc = 0; 567 *prealloc = 0;
576 if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize) 568 if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
577 return 0; 569 return 0;
578 570
579 /* 571 /*
@@ -617,7 +609,6 @@ xfs_iomap_write_delay(
617 xfs_fileoff_t ioalign; 609 xfs_fileoff_t ioalign;
618 xfs_fsblock_t firstblock; 610 xfs_fsblock_t firstblock;
619 xfs_extlen_t extsz; 611 xfs_extlen_t extsz;
620 xfs_fsize_t isize;
621 int nimaps; 612 int nimaps;
622 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 613 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
623 int prealloc, fsynced = 0; 614 int prealloc, fsynced = 0;
@@ -637,11 +628,7 @@ xfs_iomap_write_delay(
637 offset_fsb = XFS_B_TO_FSBT(mp, offset); 628 offset_fsb = XFS_B_TO_FSBT(mp, offset);
638 629
639retry: 630retry:
640 isize = ip->i_size; 631 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
641 if (ip->i_new_size > isize)
642 isize = ip->i_new_size;
643
644 error = xfs_iomap_eof_want_preallocate(mp, ip, isize, offset, count,
645 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 632 ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
646 if (error) 633 if (error)
647 return error; 634 return error;
@@ -655,8 +642,7 @@ retry:
655 } 642 }
656 643
657 if (prealloc || extsz) { 644 if (prealloc || extsz) {
658 error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, 645 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
659 &last_fsb);
660 if (error) 646 if (error)
661 return error; 647 return error;
662 } 648 }
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf6754a3c5b3..e19d0a8d5618 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -69,7 +69,7 @@ xfs_bulkstat_one_iget(
69 } 69 }
70 70
71 ASSERT(ip != NULL); 71 ASSERT(ip != NULL);
72 ASSERT(ip->i_blkno != (xfs_daddr_t)0); 72 ASSERT(ip->i_imap.im_blkno != 0);
73 73
74 dic = &ip->i_d; 74 dic = &ip->i_d;
75 75
@@ -125,13 +125,9 @@ STATIC void
125xfs_bulkstat_one_dinode( 125xfs_bulkstat_one_dinode(
126 xfs_mount_t *mp, /* mount point for filesystem */ 126 xfs_mount_t *mp, /* mount point for filesystem */
127 xfs_ino_t ino, /* inode number to get data for */ 127 xfs_ino_t ino, /* inode number to get data for */
128 xfs_dinode_t *dip, /* dinode inode pointer */ 128 xfs_dinode_t *dic, /* dinode inode pointer */
129 xfs_bstat_t *buf) /* return buffer */ 129 xfs_bstat_t *buf) /* return buffer */
130{ 130{
131 xfs_dinode_core_t *dic; /* dinode core info pointer */
132
133 dic = &dip->di_core;
134
135 /* 131 /*
136 * The inode format changed when we moved the link count and 132 * The inode format changed when we moved the link count and
137 * made it 32 bits long. If this is an old format inode, 133 * made it 32 bits long. If this is an old format inode,
@@ -143,7 +139,7 @@ xfs_bulkstat_one_dinode(
143 * the new format. We don't change the version number so that we 139 * the new format. We don't change the version number so that we
144 * can distinguish this from a real new format inode. 140 * can distinguish this from a real new format inode.
145 */ 141 */
146 if (dic->di_version == XFS_DINODE_VERSION_1) { 142 if (dic->di_version == 1) {
147 buf->bs_nlink = be16_to_cpu(dic->di_onlink); 143 buf->bs_nlink = be16_to_cpu(dic->di_onlink);
148 buf->bs_projid = 0; 144 buf->bs_projid = 0;
149 } else { 145 } else {
@@ -162,7 +158,7 @@ xfs_bulkstat_one_dinode(
162 buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec); 158 buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
163 buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec); 159 buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
164 buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec); 160 buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
165 buf->bs_xflags = xfs_dic2xflags(dip); 161 buf->bs_xflags = xfs_dic2xflags(dic);
166 buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog; 162 buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
167 buf->bs_extents = be32_to_cpu(dic->di_nextents); 163 buf->bs_extents = be32_to_cpu(dic->di_nextents);
168 buf->bs_gen = be32_to_cpu(dic->di_gen); 164 buf->bs_gen = be32_to_cpu(dic->di_gen);
@@ -173,7 +169,7 @@ xfs_bulkstat_one_dinode(
173 169
174 switch (dic->di_format) { 170 switch (dic->di_format) {
175 case XFS_DINODE_FMT_DEV: 171 case XFS_DINODE_FMT_DEV:
176 buf->bs_rdev = be32_to_cpu(dip->di_u.di_dev); 172 buf->bs_rdev = xfs_dinode_get_rdev(dic);
177 buf->bs_blksize = BLKDEV_IOSIZE; 173 buf->bs_blksize = BLKDEV_IOSIZE;
178 buf->bs_blocks = 0; 174 buf->bs_blocks = 0;
179 break; 175 break;
@@ -192,27 +188,34 @@ xfs_bulkstat_one_dinode(
192 } 188 }
193} 189}
194 190
191/* Return 0 on success or positive error */
195STATIC int 192STATIC int
196xfs_bulkstat_one_fmt( 193xfs_bulkstat_one_fmt(
197 void __user *ubuffer, 194 void __user *ubuffer,
195 int ubsize,
196 int *ubused,
198 const xfs_bstat_t *buffer) 197 const xfs_bstat_t *buffer)
199{ 198{
199 if (ubsize < sizeof(*buffer))
200 return XFS_ERROR(ENOMEM);
200 if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) 201 if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
201 return -EFAULT; 202 return XFS_ERROR(EFAULT);
202 return sizeof(*buffer); 203 if (ubused)
204 *ubused = sizeof(*buffer);
205 return 0;
203} 206}
204 207
205/* 208/*
206 * Return stat information for one inode. 209 * Return stat information for one inode.
207 * Return 0 if ok, else errno. 210 * Return 0 if ok, else errno.
208 */ 211 */
209int /* error status */ 212int /* error status */
210xfs_bulkstat_one( 213xfs_bulkstat_one_int(
211 xfs_mount_t *mp, /* mount point for filesystem */ 214 xfs_mount_t *mp, /* mount point for filesystem */
212 xfs_ino_t ino, /* inode number to get data for */ 215 xfs_ino_t ino, /* inode number to get data for */
213 void __user *buffer, /* buffer to place output in */ 216 void __user *buffer, /* buffer to place output in */
214 int ubsize, /* size of buffer */ 217 int ubsize, /* size of buffer */
215 void *private_data, /* my private data */ 218 bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
216 xfs_daddr_t bno, /* starting bno of inode cluster */ 219 xfs_daddr_t bno, /* starting bno of inode cluster */
217 int *ubused, /* bytes used by me */ 220 int *ubused, /* bytes used by me */
218 void *dibuff, /* on-disk inode buffer */ 221 void *dibuff, /* on-disk inode buffer */
@@ -221,15 +224,12 @@ xfs_bulkstat_one(
221 xfs_bstat_t *buf; /* return buffer */ 224 xfs_bstat_t *buf; /* return buffer */
222 int error = 0; /* error value */ 225 int error = 0; /* error value */
223 xfs_dinode_t *dip; /* dinode inode pointer */ 226 xfs_dinode_t *dip; /* dinode inode pointer */
224 bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt;
225 227
226 dip = (xfs_dinode_t *)dibuff; 228 dip = (xfs_dinode_t *)dibuff;
227 *stat = BULKSTAT_RV_NOTHING; 229 *stat = BULKSTAT_RV_NOTHING;
228 230
229 if (!buffer || xfs_internal_inum(mp, ino)) 231 if (!buffer || xfs_internal_inum(mp, ino))
230 return XFS_ERROR(EINVAL); 232 return XFS_ERROR(EINVAL);
231 if (ubsize < sizeof(*buf))
232 return XFS_ERROR(ENOMEM);
233 233
234 buf = kmem_alloc(sizeof(*buf), KM_SLEEP); 234 buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
235 235
@@ -244,21 +244,34 @@ xfs_bulkstat_one(
244 xfs_bulkstat_one_dinode(mp, ino, dip, buf); 244 xfs_bulkstat_one_dinode(mp, ino, dip, buf);
245 } 245 }
246 246
247 error = formatter(buffer, buf); 247 error = formatter(buffer, ubsize, ubused, buf);
248 if (error < 0) { 248 if (error)
249 error = EFAULT;
250 goto out_free; 249 goto out_free;
251 }
252 250
253 *stat = BULKSTAT_RV_DIDONE; 251 *stat = BULKSTAT_RV_DIDONE;
254 if (ubused)
255 *ubused = error;
256 252
257 out_free: 253 out_free:
258 kmem_free(buf); 254 kmem_free(buf);
259 return error; 255 return error;
260} 256}
261 257
258int
259xfs_bulkstat_one(
260 xfs_mount_t *mp, /* mount point for filesystem */
261 xfs_ino_t ino, /* inode number to get data for */
262 void __user *buffer, /* buffer to place output in */
263 int ubsize, /* size of buffer */
264 void *private_data, /* my private data */
265 xfs_daddr_t bno, /* starting bno of inode cluster */
266 int *ubused, /* bytes used by me */
267 void *dibuff, /* on-disk inode buffer */
268 int *stat) /* BULKSTAT_RV_... */
269{
270 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
271 xfs_bulkstat_one_fmt, bno,
272 ubused, dibuff, stat);
273}
274
262/* 275/*
263 * Test to see whether we can use the ondisk inode directly, based 276 * Test to see whether we can use the ondisk inode directly, based
264 * on the given bulkstat flags, filling in dipp accordingly. 277 * on the given bulkstat flags, filling in dipp accordingly.
@@ -287,19 +300,19 @@ xfs_bulkstat_use_dinode(
287 * to disk yet. This is a temporary hack that would require a proper 300 * to disk yet. This is a temporary hack that would require a proper
288 * fix in the future. 301 * fix in the future.
289 */ 302 */
290 if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC || 303 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
291 !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) || 304 !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
292 !dip->di_core.di_mode) 305 !dip->di_mode)
293 return 0; 306 return 0;
294 if (flags & BULKSTAT_FG_QUICK) { 307 if (flags & BULKSTAT_FG_QUICK) {
295 *dipp = dip; 308 *dipp = dip;
296 return 1; 309 return 1;
297 } 310 }
298 /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */ 311 /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
299 aformat = dip->di_core.di_aformat; 312 aformat = dip->di_aformat;
300 if ((XFS_DFORK_Q(dip) == 0) || 313 if ((XFS_DFORK_Q(dip) == 0) ||
301 (aformat == XFS_DINODE_FMT_LOCAL) || 314 (aformat == XFS_DINODE_FMT_LOCAL) ||
302 (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) { 315 (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
303 *dipp = dip; 316 *dipp = dip;
304 return 1; 317 return 1;
305 } 318 }
@@ -359,7 +372,6 @@ xfs_bulkstat(
359 int ubused; /* bytes used by formatter */ 372 int ubused; /* bytes used by formatter */
360 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ 373 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
361 xfs_dinode_t *dip; /* ptr into bp for specific inode */ 374 xfs_dinode_t *dip; /* ptr into bp for specific inode */
362 xfs_inode_t *ip; /* ptr to in-core inode struct */
363 375
364 /* 376 /*
365 * Get the last inode value, see if there's nothing to do. 377 * Get the last inode value, see if there's nothing to do.
@@ -416,8 +428,7 @@ xfs_bulkstat(
416 /* 428 /*
417 * Allocate and initialize a btree cursor for ialloc btree. 429 * Allocate and initialize a btree cursor for ialloc btree.
418 */ 430 */
419 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, 431 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
420 (xfs_inode_t *)0, 0);
421 irbp = irbuf; 432 irbp = irbuf;
422 irbufend = irbuf + nirbuf; 433 irbufend = irbuf + nirbuf;
423 end_of_ag = 0; 434 end_of_ag = 0;
@@ -472,7 +483,7 @@ xfs_bulkstat(
472 * In any case, increment to the next record. 483 * In any case, increment to the next record.
473 */ 484 */
474 if (!error) 485 if (!error)
475 error = xfs_inobt_increment(cur, 0, &tmp); 486 error = xfs_btree_increment(cur, 0, &tmp);
476 } else { 487 } else {
477 /* 488 /*
478 * Start of ag. Lookup the first inode chunk. 489 * Start of ag. Lookup the first inode chunk.
@@ -539,7 +550,7 @@ xfs_bulkstat(
539 * Set agino to after this chunk and bump the cursor. 550 * Set agino to after this chunk and bump the cursor.
540 */ 551 */
541 agino = gino + XFS_INODES_PER_CHUNK; 552 agino = gino + XFS_INODES_PER_CHUNK;
542 error = xfs_inobt_increment(cur, 0, &tmp); 553 error = xfs_btree_increment(cur, 0, &tmp);
543 cond_resched(); 554 cond_resched();
544 } 555 }
545 /* 556 /*
@@ -586,6 +597,8 @@ xfs_bulkstat(
586 597
587 if (flags & (BULKSTAT_FG_QUICK | 598 if (flags & (BULKSTAT_FG_QUICK |
588 BULKSTAT_FG_INLINE)) { 599 BULKSTAT_FG_INLINE)) {
600 int offset;
601
589 ino = XFS_AGINO_TO_INO(mp, agno, 602 ino = XFS_AGINO_TO_INO(mp, agno,
590 agino); 603 agino);
591 bno = XFS_AGB_TO_DADDR(mp, agno, 604 bno = XFS_AGB_TO_DADDR(mp, agno,
@@ -594,21 +607,15 @@ xfs_bulkstat(
594 /* 607 /*
595 * Get the inode cluster buffer 608 * Get the inode cluster buffer
596 */ 609 */
597 ASSERT(xfs_inode_zone != NULL);
598 ip = kmem_zone_zalloc(xfs_inode_zone,
599 KM_SLEEP);
600 ip->i_ino = ino;
601 ip->i_mount = mp;
602 spin_lock_init(&ip->i_flags_lock);
603 if (bp) 610 if (bp)
604 xfs_buf_relse(bp); 611 xfs_buf_relse(bp);
605 error = xfs_itobp(mp, NULL, ip, 612
606 &dip, &bp, bno, 613 error = xfs_inotobp(mp, NULL, ino, &dip,
607 XFS_IMAP_BULKSTAT, 614 &bp, &offset,
608 XFS_BUF_LOCK); 615 XFS_IGET_BULKSTAT);
616
609 if (!error) 617 if (!error)
610 clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; 618 clustidx = offset / mp->m_sb.sb_inodesize;
611 kmem_zone_free(xfs_inode_zone, ip);
612 if (XFS_TEST_ERROR(error != 0, 619 if (XFS_TEST_ERROR(error != 0,
613 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK, 620 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
614 XFS_RANDOM_BULKSTAT_READ_CHUNK)) { 621 XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
@@ -842,8 +849,7 @@ xfs_inumbers(
842 agino = 0; 849 agino = 0;
843 continue; 850 continue;
844 } 851 }
845 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, 852 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
846 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
847 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); 853 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
848 if (error) { 854 if (error) {
849 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 855 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -887,7 +893,7 @@ xfs_inumbers(
887 bufidx = 0; 893 bufidx = 0;
888 } 894 }
889 if (left) { 895 if (left) {
890 error = xfs_inobt_increment(cur, 0, &tmp); 896 error = xfs_btree_increment(cur, 0, &tmp);
891 if (error) { 897 if (error) {
892 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 898 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
893 cur = NULL; 899 cur = NULL;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index a1f18fce9b70..1fb04e7deb61 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -71,9 +71,23 @@ xfs_bulkstat_single(
71 71
72typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ 72typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
73 void __user *ubuffer, /* buffer to write to */ 73 void __user *ubuffer, /* buffer to write to */
74 int ubsize, /* remaining user buffer sz */
75 int *ubused, /* bytes used by formatter */
74 const xfs_bstat_t *buffer); /* buffer to read from */ 76 const xfs_bstat_t *buffer); /* buffer to read from */
75 77
76int 78int
79xfs_bulkstat_one_int(
80 xfs_mount_t *mp,
81 xfs_ino_t ino,
82 void __user *buffer,
83 int ubsize,
84 bulkstat_one_fmt_pf formatter,
85 xfs_daddr_t bno,
86 int *ubused,
87 void *dibuff,
88 int *stat);
89
90int
77xfs_bulkstat_one( 91xfs_bulkstat_one(
78 xfs_mount_t *mp, 92 xfs_mount_t *mp,
79 xfs_ino_t ino, 93 xfs_ino_t ino,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3608a0f0a5f6..f4726f702a9e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -100,12 +100,11 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
100 100
101 101
102/* local ticket functions */ 102/* local ticket functions */
103STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, 103STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
104 int unit_bytes, 104 int unit_bytes,
105 int count, 105 int count,
106 char clientid, 106 char clientid,
107 uint flags); 107 uint flags);
108STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
109 108
110#if defined(DEBUG) 109#if defined(DEBUG)
111STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 110STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
@@ -360,7 +359,7 @@ xfs_log_done(xfs_mount_t *mp,
360 */ 359 */
361 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); 360 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
362 xlog_ungrant_log_space(log, ticket); 361 xlog_ungrant_log_space(log, ticket);
363 xlog_ticket_put(log, ticket); 362 xfs_log_ticket_put(ticket);
364 } else { 363 } else {
365 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); 364 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
366 xlog_regrant_reserve_log_space(log, ticket); 365 xlog_regrant_reserve_log_space(log, ticket);
@@ -514,7 +513,7 @@ xfs_log_reserve(xfs_mount_t *mp,
514 retval = xlog_regrant_write_log_space(log, internal_ticket); 513 retval = xlog_regrant_write_log_space(log, internal_ticket);
515 } else { 514 } else {
516 /* may sleep if need to allocate more tickets */ 515 /* may sleep if need to allocate more tickets */
517 internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, 516 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
518 client, flags); 517 client, flags);
519 if (!internal_ticket) 518 if (!internal_ticket)
520 return XFS_ERROR(ENOMEM); 519 return XFS_ERROR(ENOMEM);
@@ -572,12 +571,12 @@ xfs_log_mount(
572 /* 571 /*
573 * Initialize the AIL now we have a log. 572 * Initialize the AIL now we have a log.
574 */ 573 */
575 spin_lock_init(&mp->m_ail_lock);
576 error = xfs_trans_ail_init(mp); 574 error = xfs_trans_ail_init(mp);
577 if (error) { 575 if (error) {
578 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
579 goto error; 577 goto error;
580 } 578 }
579 mp->m_log->l_ailp = mp->m_ail;
581 580
582 /* 581 /*
583 * skip log recovery on a norecovery mount. pretend it all 582 * skip log recovery on a norecovery mount. pretend it all
@@ -730,8 +729,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
730 spin_lock(&log->l_icloglock); 729 spin_lock(&log->l_icloglock);
731 iclog = log->l_iclog; 730 iclog = log->l_iclog;
732 atomic_inc(&iclog->ic_refcnt); 731 atomic_inc(&iclog->ic_refcnt);
733 spin_unlock(&log->l_icloglock);
734 xlog_state_want_sync(log, iclog); 732 xlog_state_want_sync(log, iclog);
733 spin_unlock(&log->l_icloglock);
735 error = xlog_state_release_iclog(log, iclog); 734 error = xlog_state_release_iclog(log, iclog);
736 735
737 spin_lock(&log->l_icloglock); 736 spin_lock(&log->l_icloglock);
@@ -749,7 +748,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
749 if (tic) { 748 if (tic) {
750 xlog_trace_loggrant(log, tic, "unmount rec"); 749 xlog_trace_loggrant(log, tic, "unmount rec");
751 xlog_ungrant_log_space(log, tic); 750 xlog_ungrant_log_space(log, tic);
752 xlog_ticket_put(log, tic); 751 xfs_log_ticket_put(tic);
753 } 752 }
754 } else { 753 } else {
755 /* 754 /*
@@ -768,9 +767,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
768 spin_lock(&log->l_icloglock); 767 spin_lock(&log->l_icloglock);
769 iclog = log->l_iclog; 768 iclog = log->l_iclog;
770 atomic_inc(&iclog->ic_refcnt); 769 atomic_inc(&iclog->ic_refcnt);
771 spin_unlock(&log->l_icloglock);
772 770
773 xlog_state_want_sync(log, iclog); 771 xlog_state_want_sync(log, iclog);
772 spin_unlock(&log->l_icloglock);
774 error = xlog_state_release_iclog(log, iclog); 773 error = xlog_state_release_iclog(log, iclog);
775 774
776 spin_lock(&log->l_icloglock); 775 spin_lock(&log->l_icloglock);
@@ -906,7 +905,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
906int 905int
907xfs_log_need_covered(xfs_mount_t *mp) 906xfs_log_need_covered(xfs_mount_t *mp)
908{ 907{
909 int needed = 0, gen; 908 int needed = 0;
910 xlog_t *log = mp->m_log; 909 xlog_t *log = mp->m_log;
911 910
912 if (!xfs_fs_writable(mp)) 911 if (!xfs_fs_writable(mp))
@@ -915,7 +914,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
915 spin_lock(&log->l_icloglock); 914 spin_lock(&log->l_icloglock);
916 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || 915 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
917 (log->l_covered_state == XLOG_STATE_COVER_NEED2)) 916 (log->l_covered_state == XLOG_STATE_COVER_NEED2))
918 && !xfs_trans_first_ail(mp, &gen) 917 && !xfs_trans_ail_tail(log->l_ailp)
919 && xlog_iclogs_empty(log)) { 918 && xlog_iclogs_empty(log)) {
920 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 919 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
921 log->l_covered_state = XLOG_STATE_COVER_DONE; 920 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -952,7 +951,7 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
952 xfs_lsn_t tail_lsn; 951 xfs_lsn_t tail_lsn;
953 xlog_t *log = mp->m_log; 952 xlog_t *log = mp->m_log;
954 953
955 tail_lsn = xfs_trans_tail_ail(mp); 954 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
956 spin_lock(&log->l_grant_lock); 955 spin_lock(&log->l_grant_lock);
957 if (tail_lsn != 0) { 956 if (tail_lsn != 0) {
958 log->l_tail_lsn = tail_lsn; 957 log->l_tail_lsn = tail_lsn;
@@ -1030,12 +1029,6 @@ xlog_iodone(xfs_buf_t *bp)
1030 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); 1029 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
1031 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1030 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1032 aborted = 0; 1031 aborted = 0;
1033
1034 /*
1035 * Some versions of cpp barf on the recursive definition of
1036 * ic_log -> hic_fields.ic_log and expand ic_log twice when
1037 * it is passed through two macros. Workaround broken cpp.
1038 */
1039 l = iclog->ic_log; 1032 l = iclog->ic_log;
1040 1033
1041 /* 1034 /*
@@ -1302,7 +1295,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1302 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); 1295 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1303 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1296 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1304 iclog->ic_bp = bp; 1297 iclog->ic_bp = bp;
1305 iclog->hic_data = bp->b_addr; 1298 iclog->ic_data = bp->b_addr;
1306#ifdef DEBUG 1299#ifdef DEBUG
1307 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); 1300 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
1308#endif 1301#endif
@@ -1322,7 +1315,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1322 atomic_set(&iclog->ic_refcnt, 0); 1315 atomic_set(&iclog->ic_refcnt, 0);
1323 spin_lock_init(&iclog->ic_callback_lock); 1316 spin_lock_init(&iclog->ic_callback_lock);
1324 iclog->ic_callback_tail = &(iclog->ic_callback); 1317 iclog->ic_callback_tail = &(iclog->ic_callback);
1325 iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; 1318 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1326 1319
1327 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1320 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1328 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1321 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
@@ -1446,7 +1439,7 @@ xlog_grant_push_ail(xfs_mount_t *mp,
1446 */ 1439 */
1447 if (threshold_lsn && 1440 if (threshold_lsn &&
1448 !XLOG_FORCED_SHUTDOWN(log)) 1441 !XLOG_FORCED_SHUTDOWN(log))
1449 xfs_trans_push_ail(mp, threshold_lsn); 1442 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1450} /* xlog_grant_push_ail */ 1443} /* xlog_grant_push_ail */
1451 1444
1452 1445
@@ -1991,7 +1984,9 @@ xlog_write(xfs_mount_t * mp,
1991 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { 1984 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1992 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 1985 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1993 record_cnt = data_cnt = 0; 1986 record_cnt = data_cnt = 0;
1987 spin_lock(&log->l_icloglock);
1994 xlog_state_want_sync(log, iclog); 1988 xlog_state_want_sync(log, iclog);
1989 spin_unlock(&log->l_icloglock);
1995 if (commit_iclog) { 1990 if (commit_iclog) {
1996 ASSERT(flags & XLOG_COMMIT_TRANS); 1991 ASSERT(flags & XLOG_COMMIT_TRANS);
1997 *commit_iclog = iclog; 1992 *commit_iclog = iclog;
@@ -3200,7 +3195,7 @@ try_again:
3200STATIC void 3195STATIC void
3201xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) 3196xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3202{ 3197{
3203 spin_lock(&log->l_icloglock); 3198 ASSERT(spin_is_locked(&log->l_icloglock));
3204 3199
3205 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3200 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3206 xlog_state_switch_iclogs(log, iclog, 0); 3201 xlog_state_switch_iclogs(log, iclog, 0);
@@ -3208,10 +3203,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3208 ASSERT(iclog->ic_state & 3203 ASSERT(iclog->ic_state &
3209 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); 3204 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
3210 } 3205 }
3211 3206}
3212 spin_unlock(&log->l_icloglock);
3213} /* xlog_state_want_sync */
3214
3215 3207
3216 3208
3217/***************************************************************************** 3209/*****************************************************************************
@@ -3222,22 +3214,33 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3222 */ 3214 */
3223 3215
3224/* 3216/*
3225 * Free a used ticket. 3217 * Free a used ticket when it's refcount falls to zero.
3226 */ 3218 */
3227STATIC void 3219void
3228xlog_ticket_put(xlog_t *log, 3220xfs_log_ticket_put(
3229 xlog_ticket_t *ticket) 3221 xlog_ticket_t *ticket)
3230{ 3222{
3231 sv_destroy(&ticket->t_wait); 3223 ASSERT(atomic_read(&ticket->t_ref) > 0);
3232 kmem_zone_free(xfs_log_ticket_zone, ticket); 3224 if (atomic_dec_and_test(&ticket->t_ref)) {
3233} /* xlog_ticket_put */ 3225 sv_destroy(&ticket->t_wait);
3226 kmem_zone_free(xfs_log_ticket_zone, ticket);
3227 }
3228}
3234 3229
3230xlog_ticket_t *
3231xfs_log_ticket_get(
3232 xlog_ticket_t *ticket)
3233{
3234 ASSERT(atomic_read(&ticket->t_ref) > 0);
3235 atomic_inc(&ticket->t_ref);
3236 return ticket;
3237}
3235 3238
3236/* 3239/*
3237 * Allocate and initialise a new log ticket. 3240 * Allocate and initialise a new log ticket.
3238 */ 3241 */
3239STATIC xlog_ticket_t * 3242STATIC xlog_ticket_t *
3240xlog_ticket_get(xlog_t *log, 3243xlog_ticket_alloc(xlog_t *log,
3241 int unit_bytes, 3244 int unit_bytes,
3242 int cnt, 3245 int cnt,
3243 char client, 3246 char client,
@@ -3308,6 +3311,7 @@ xlog_ticket_get(xlog_t *log,
3308 unit_bytes += 2*BBSIZE; 3311 unit_bytes += 2*BBSIZE;
3309 } 3312 }
3310 3313
3314 atomic_set(&tic->t_ref, 1);
3311 tic->t_unit_res = unit_bytes; 3315 tic->t_unit_res = unit_bytes;
3312 tic->t_curr_res = unit_bytes; 3316 tic->t_curr_res = unit_bytes;
3313 tic->t_cnt = cnt; 3317 tic->t_cnt = cnt;
@@ -3323,7 +3327,7 @@ xlog_ticket_get(xlog_t *log,
3323 xlog_tic_reset_res(tic); 3327 xlog_tic_reset_res(tic);
3324 3328
3325 return tic; 3329 return tic;
3326} /* xlog_ticket_get */ 3330}
3327 3331
3328 3332
3329/****************************************************************************** 3333/******************************************************************************
@@ -3452,7 +3456,7 @@ xlog_verify_iclog(xlog_t *log,
3452 ptr = iclog->ic_datap; 3456 ptr = iclog->ic_datap;
3453 base_ptr = ptr; 3457 base_ptr = ptr;
3454 ophead = (xlog_op_header_t *)ptr; 3458 ophead = (xlog_op_header_t *)ptr;
3455 xhdr = (xlog_in_core_2_t *)&iclog->ic_header; 3459 xhdr = iclog->ic_data;
3456 for (i = 0; i < len; i++) { 3460 for (i = 0; i < len; i++) {
3457 ophead = (xlog_op_header_t *)ptr; 3461 ophead = (xlog_op_header_t *)ptr;
3458 3462
@@ -3558,7 +3562,8 @@ xfs_log_force_umount(
3558 if (!log || 3562 if (!log ||
3559 log->l_flags & XLOG_ACTIVE_RECOVERY) { 3563 log->l_flags & XLOG_ACTIVE_RECOVERY) {
3560 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3564 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3561 XFS_BUF_DONE(mp->m_sb_bp); 3565 if (mp->m_sb_bp)
3566 XFS_BUF_DONE(mp->m_sb_bp);
3562 return 0; 3567 return 0;
3563 } 3568 }
3564 3569
@@ -3579,7 +3584,9 @@ xfs_log_force_umount(
3579 spin_lock(&log->l_icloglock); 3584 spin_lock(&log->l_icloglock);
3580 spin_lock(&log->l_grant_lock); 3585 spin_lock(&log->l_grant_lock);
3581 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3586 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3582 XFS_BUF_DONE(mp->m_sb_bp); 3587 if (mp->m_sb_bp)
3588 XFS_BUF_DONE(mp->m_sb_bp);
3589
3583 /* 3590 /*
3584 * This flag is sort of redundant because of the mount flag, but 3591 * This flag is sort of redundant because of the mount flag, but
3585 * it's good to maintain the separation between the log and the rest 3592 * it's good to maintain the separation between the log and the rest
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d47b91f10822..8a3e84e900a3 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -134,6 +134,7 @@ typedef struct xfs_log_callback {
134#ifdef __KERNEL__ 134#ifdef __KERNEL__
135/* Log manager interfaces */ 135/* Log manager interfaces */
136struct xfs_mount; 136struct xfs_mount;
137struct xlog_ticket;
137xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 138xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
138 xfs_log_ticket_t ticket, 139 xfs_log_ticket_t ticket,
139 void **iclog, 140 void **iclog,
@@ -177,6 +178,9 @@ int xfs_log_need_covered(struct xfs_mount *mp);
177 178
178void xlog_iodone(struct xfs_buf *); 179void xlog_iodone(struct xfs_buf *);
179 180
181struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
182void xfs_log_ticket_put(struct xlog_ticket *ticket);
183
180#endif 184#endif
181 185
182 186
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index e7d8f84443fa..654167be0efb 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -245,6 +245,7 @@ typedef struct xlog_ticket {
245 struct xlog_ticket *t_next; /* :4|8 */ 245 struct xlog_ticket *t_next; /* :4|8 */
246 struct xlog_ticket *t_prev; /* :4|8 */ 246 struct xlog_ticket *t_prev; /* :4|8 */
247 xlog_tid_t t_tid; /* transaction identifier : 4 */ 247 xlog_tid_t t_tid; /* transaction identifier : 4 */
248 atomic_t t_ref; /* ticket reference count : 4 */
248 int t_curr_res; /* current reservation in bytes : 4 */ 249 int t_curr_res; /* current reservation in bytes : 4 */
249 int t_unit_res; /* unit reservation in bytes : 4 */ 250 int t_unit_res; /* unit reservation in bytes : 4 */
250 char t_ocnt; /* original count : 1 */ 251 char t_ocnt; /* original count : 1 */
@@ -309,6 +310,16 @@ typedef struct xlog_rec_ext_header {
309} xlog_rec_ext_header_t; 310} xlog_rec_ext_header_t;
310 311
311#ifdef __KERNEL__ 312#ifdef __KERNEL__
313
314/*
315 * Quite misnamed, because this union lays out the actual on-disk log buffer.
316 */
317typedef union xlog_in_core2 {
318 xlog_rec_header_t hic_header;
319 xlog_rec_ext_header_t hic_xheader;
320 char hic_sector[XLOG_HEADER_SIZE];
321} xlog_in_core_2_t;
322
312/* 323/*
313 * - A log record header is 512 bytes. There is plenty of room to grow the 324 * - A log record header is 512 bytes. There is plenty of room to grow the
314 * xlog_rec_header_t into the reserved space. 325 * xlog_rec_header_t into the reserved space.
@@ -338,7 +349,7 @@ typedef struct xlog_rec_ext_header {
338 * We'll put all the read-only and l_icloglock fields in the first cacheline, 349 * We'll put all the read-only and l_icloglock fields in the first cacheline,
339 * and move everything else out to subsequent cachelines. 350 * and move everything else out to subsequent cachelines.
340 */ 351 */
341typedef struct xlog_iclog_fields { 352typedef struct xlog_in_core {
342 sv_t ic_force_wait; 353 sv_t ic_force_wait;
343 sv_t ic_write_wait; 354 sv_t ic_write_wait;
344 struct xlog_in_core *ic_next; 355 struct xlog_in_core *ic_next;
@@ -361,41 +372,11 @@ typedef struct xlog_iclog_fields {
361 372
362 /* reference counts need their own cacheline */ 373 /* reference counts need their own cacheline */
363 atomic_t ic_refcnt ____cacheline_aligned_in_smp; 374 atomic_t ic_refcnt ____cacheline_aligned_in_smp;
364} xlog_iclog_fields_t; 375 xlog_in_core_2_t *ic_data;
365 376#define ic_header ic_data->hic_header
366typedef union xlog_in_core2 {
367 xlog_rec_header_t hic_header;
368 xlog_rec_ext_header_t hic_xheader;
369 char hic_sector[XLOG_HEADER_SIZE];
370} xlog_in_core_2_t;
371
372typedef struct xlog_in_core {
373 xlog_iclog_fields_t hic_fields;
374 xlog_in_core_2_t *hic_data;
375} xlog_in_core_t; 377} xlog_in_core_t;
376 378
377/* 379/*
378 * Defines to save our code from this glop.
379 */
380#define ic_force_wait hic_fields.ic_force_wait
381#define ic_write_wait hic_fields.ic_write_wait
382#define ic_next hic_fields.ic_next
383#define ic_prev hic_fields.ic_prev
384#define ic_bp hic_fields.ic_bp
385#define ic_log hic_fields.ic_log
386#define ic_callback hic_fields.ic_callback
387#define ic_callback_lock hic_fields.ic_callback_lock
388#define ic_callback_tail hic_fields.ic_callback_tail
389#define ic_trace hic_fields.ic_trace
390#define ic_size hic_fields.ic_size
391#define ic_offset hic_fields.ic_offset
392#define ic_refcnt hic_fields.ic_refcnt
393#define ic_bwritecnt hic_fields.ic_bwritecnt
394#define ic_state hic_fields.ic_state
395#define ic_datap hic_fields.ic_datap
396#define ic_header hic_data->hic_header
397
398/*
399 * The reservation head lsn is not made up of a cycle number and block number. 380 * The reservation head lsn is not made up of a cycle number and block number.
400 * Instead, it uses a cycle number and byte number. Logs don't expect to 381 * Instead, it uses a cycle number and byte number. Logs don't expect to
401 * overflow 31 bits worth of byte offset, so using a byte number will mean 382 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -404,6 +385,7 @@ typedef struct xlog_in_core {
404typedef struct log { 385typedef struct log {
405 /* The following fields don't need locking */ 386 /* The following fields don't need locking */
406 struct xfs_mount *l_mp; /* mount point */ 387 struct xfs_mount *l_mp; /* mount point */
388 struct xfs_ail *l_ailp; /* AIL log is working with */
407 struct xfs_buf *l_xbuf; /* extra buffer for log 389 struct xfs_buf *l_xbuf; /* extra buffer for log
408 * wrapping */ 390 * wrapping */
409 struct xfs_buftarg *l_targ; /* buftarg of log */ 391 struct xfs_buftarg *l_targ; /* buftarg of log */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 70e3ba32e6be..35cca98bd94c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -36,7 +36,6 @@
36#include "xfs_dinode.h" 36#include "xfs_dinode.h"
37#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h" 38#include "xfs_inode_item.h"
39#include "xfs_imap.h"
40#include "xfs_alloc.h" 39#include "xfs_alloc.h"
41#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
42#include "xfs_log_priv.h" 41#include "xfs_log_priv.h"
@@ -54,10 +53,8 @@ STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
54 xlog_recover_item_t *item); 53 xlog_recover_item_t *item);
55#if defined(DEBUG) 54#if defined(DEBUG)
56STATIC void xlog_recover_check_summary(xlog_t *); 55STATIC void xlog_recover_check_summary(xlog_t *);
57STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
58#else 56#else
59#define xlog_recover_check_summary(log) 57#define xlog_recover_check_summary(log)
60#define xlog_recover_check_ail(mp, lip, gen)
61#endif 58#endif
62 59
63 60
@@ -270,21 +267,16 @@ STATIC void
270xlog_recover_iodone( 267xlog_recover_iodone(
271 struct xfs_buf *bp) 268 struct xfs_buf *bp)
272{ 269{
273 xfs_mount_t *mp;
274
275 ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
276
277 if (XFS_BUF_GETERROR(bp)) { 270 if (XFS_BUF_GETERROR(bp)) {
278 /* 271 /*
279 * We're not going to bother about retrying 272 * We're not going to bother about retrying
280 * this during recovery. One strike! 273 * this during recovery. One strike!
281 */ 274 */
282 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
283 xfs_ioerror_alert("xlog_recover_iodone", 275 xfs_ioerror_alert("xlog_recover_iodone",
284 mp, bp, XFS_BUF_ADDR(bp)); 276 bp->b_mount, bp, XFS_BUF_ADDR(bp));
285 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 277 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
286 } 278 }
287 XFS_BUF_SET_FSPRIVATE(bp, NULL); 279 bp->b_mount = NULL;
288 XFS_BUF_CLR_IODONE_FUNC(bp); 280 XFS_BUF_CLR_IODONE_FUNC(bp);
289 xfs_biodone(bp); 281 xfs_biodone(bp);
290} 282}
@@ -2228,9 +2220,8 @@ xlog_recover_do_buffer_trans(
2228 XFS_BUF_STALE(bp); 2220 XFS_BUF_STALE(bp);
2229 error = xfs_bwrite(mp, bp); 2221 error = xfs_bwrite(mp, bp);
2230 } else { 2222 } else {
2231 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2223 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2232 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2224 bp->b_mount = mp;
2233 XFS_BUF_SET_FSPRIVATE(bp, mp);
2234 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2225 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2235 xfs_bdwrite(mp, bp); 2226 xfs_bdwrite(mp, bp);
2236 } 2227 }
@@ -2247,7 +2238,6 @@ xlog_recover_do_inode_trans(
2247 xfs_inode_log_format_t *in_f; 2238 xfs_inode_log_format_t *in_f;
2248 xfs_mount_t *mp; 2239 xfs_mount_t *mp;
2249 xfs_buf_t *bp; 2240 xfs_buf_t *bp;
2250 xfs_imap_t imap;
2251 xfs_dinode_t *dip; 2241 xfs_dinode_t *dip;
2252 xfs_ino_t ino; 2242 xfs_ino_t ino;
2253 int len; 2243 int len;
@@ -2275,54 +2265,35 @@ xlog_recover_do_inode_trans(
2275 } 2265 }
2276 ino = in_f->ilf_ino; 2266 ino = in_f->ilf_ino;
2277 mp = log->l_mp; 2267 mp = log->l_mp;
2278 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2279 imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
2280 imap.im_len = in_f->ilf_len;
2281 imap.im_boffset = in_f->ilf_boffset;
2282 } else {
2283 /*
2284 * It's an old inode format record. We don't know where
2285 * its cluster is located on disk, and we can't allow
2286 * xfs_imap() to figure it out because the inode btrees
2287 * are not ready to be used. Therefore do not pass the
2288 * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give
2289 * us only the single block in which the inode lives
2290 * rather than its cluster, so we must make sure to
2291 * invalidate the buffer when we write it out below.
2292 */
2293 imap.im_blkno = 0;
2294 error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2295 if (error)
2296 goto error;
2297 }
2298 2268
2299 /* 2269 /*
2300 * Inode buffers can be freed, look out for it, 2270 * Inode buffers can be freed, look out for it,
2301 * and do not replay the inode. 2271 * and do not replay the inode.
2302 */ 2272 */
2303 if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) { 2273 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2274 in_f->ilf_len, 0)) {
2304 error = 0; 2275 error = 0;
2305 goto error; 2276 goto error;
2306 } 2277 }
2307 2278
2308 bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len, 2279 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
2309 XFS_BUF_LOCK); 2280 in_f->ilf_len, XFS_BUF_LOCK);
2310 if (XFS_BUF_ISERROR(bp)) { 2281 if (XFS_BUF_ISERROR(bp)) {
2311 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2282 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2312 bp, imap.im_blkno); 2283 bp, in_f->ilf_blkno);
2313 error = XFS_BUF_GETERROR(bp); 2284 error = XFS_BUF_GETERROR(bp);
2314 xfs_buf_relse(bp); 2285 xfs_buf_relse(bp);
2315 goto error; 2286 goto error;
2316 } 2287 }
2317 error = 0; 2288 error = 0;
2318 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2289 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2319 dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 2290 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2320 2291
2321 /* 2292 /*
2322 * Make sure the place we're flushing out to really looks 2293 * Make sure the place we're flushing out to really looks
2323 * like an inode! 2294 * like an inode!
2324 */ 2295 */
2325 if (unlikely(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC)) { 2296 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2326 xfs_buf_relse(bp); 2297 xfs_buf_relse(bp);
2327 xfs_fs_cmn_err(CE_ALERT, mp, 2298 xfs_fs_cmn_err(CE_ALERT, mp,
2328 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2299 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
@@ -2345,12 +2316,12 @@ xlog_recover_do_inode_trans(
2345 } 2316 }
2346 2317
2347 /* Skip replay when the on disk inode is newer than the log one */ 2318 /* Skip replay when the on disk inode is newer than the log one */
2348 if (dicp->di_flushiter < be16_to_cpu(dip->di_core.di_flushiter)) { 2319 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2349 /* 2320 /*
2350 * Deal with the wrap case, DI_MAX_FLUSH is less 2321 * Deal with the wrap case, DI_MAX_FLUSH is less
2351 * than smaller numbers 2322 * than smaller numbers
2352 */ 2323 */
2353 if (be16_to_cpu(dip->di_core.di_flushiter) == DI_MAX_FLUSH && 2324 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2354 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2325 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2355 /* do nothing */ 2326 /* do nothing */
2356 } else { 2327 } else {
@@ -2410,7 +2381,7 @@ xlog_recover_do_inode_trans(
2410 error = EFSCORRUPTED; 2381 error = EFSCORRUPTED;
2411 goto error; 2382 goto error;
2412 } 2383 }
2413 if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) { 2384 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2414 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2385 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2415 XFS_ERRLEVEL_LOW, mp, dicp); 2386 XFS_ERRLEVEL_LOW, mp, dicp);
2416 xfs_buf_relse(bp); 2387 xfs_buf_relse(bp);
@@ -2422,23 +2393,24 @@ xlog_recover_do_inode_trans(
2422 } 2393 }
2423 2394
2424 /* The core is in in-core format */ 2395 /* The core is in in-core format */
2425 xfs_dinode_to_disk(&dip->di_core, 2396 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2426 (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2427 2397
2428 /* the rest is in on-disk format */ 2398 /* the rest is in on-disk format */
2429 if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) { 2399 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2430 memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t), 2400 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2431 item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t), 2401 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2432 item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t)); 2402 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
2433 } 2403 }
2434 2404
2435 fields = in_f->ilf_fields; 2405 fields = in_f->ilf_fields;
2436 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { 2406 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2437 case XFS_ILOG_DEV: 2407 case XFS_ILOG_DEV:
2438 dip->di_u.di_dev = cpu_to_be32(in_f->ilf_u.ilfu_rdev); 2408 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2439 break; 2409 break;
2440 case XFS_ILOG_UUID: 2410 case XFS_ILOG_UUID:
2441 dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid; 2411 memcpy(XFS_DFORK_DPTR(dip),
2412 &in_f->ilf_u.ilfu_uuid,
2413 sizeof(uuid_t));
2442 break; 2414 break;
2443 } 2415 }
2444 2416
@@ -2454,12 +2426,12 @@ xlog_recover_do_inode_trans(
2454 switch (fields & XFS_ILOG_DFORK) { 2426 switch (fields & XFS_ILOG_DFORK) {
2455 case XFS_ILOG_DDATA: 2427 case XFS_ILOG_DDATA:
2456 case XFS_ILOG_DEXT: 2428 case XFS_ILOG_DEXT:
2457 memcpy(&dip->di_u, src, len); 2429 memcpy(XFS_DFORK_DPTR(dip), src, len);
2458 break; 2430 break;
2459 2431
2460 case XFS_ILOG_DBROOT: 2432 case XFS_ILOG_DBROOT:
2461 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, 2433 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2462 &(dip->di_u.di_bmbt), 2434 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2463 XFS_DFORK_DSIZE(dip, mp)); 2435 XFS_DFORK_DSIZE(dip, mp));
2464 break; 2436 break;
2465 2437
@@ -2496,8 +2468,8 @@ xlog_recover_do_inode_trans(
2496 2468
2497 case XFS_ILOG_ABROOT: 2469 case XFS_ILOG_ABROOT:
2498 dest = XFS_DFORK_APTR(dip); 2470 dest = XFS_DFORK_APTR(dip);
2499 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, 2471 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2500 (xfs_bmdr_block_t*)dest, 2472 len, (xfs_bmdr_block_t*)dest,
2501 XFS_DFORK_ASIZE(dip, mp)); 2473 XFS_DFORK_ASIZE(dip, mp));
2502 break; 2474 break;
2503 2475
@@ -2512,9 +2484,8 @@ xlog_recover_do_inode_trans(
2512 2484
2513write_inode_buffer: 2485write_inode_buffer:
2514 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2486 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2515 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2487 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2516 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2488 bp->b_mount = mp;
2517 XFS_BUF_SET_FSPRIVATE(bp, mp);
2518 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2489 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2519 xfs_bdwrite(mp, bp); 2490 xfs_bdwrite(mp, bp);
2520 } else { 2491 } else {
@@ -2645,9 +2616,8 @@ xlog_recover_do_dquot_trans(
2645 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2616 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2646 2617
2647 ASSERT(dq_f->qlf_size == 2); 2618 ASSERT(dq_f->qlf_size == 2);
2648 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2619 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2649 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2620 bp->b_mount = mp;
2650 XFS_BUF_SET_FSPRIVATE(bp, mp);
2651 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2621 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2652 xfs_bdwrite(mp, bp); 2622 xfs_bdwrite(mp, bp);
2653 2623
@@ -2689,11 +2659,11 @@ xlog_recover_do_efi_trans(
2689 efip->efi_next_extent = efi_formatp->efi_nextents; 2659 efip->efi_next_extent = efi_formatp->efi_nextents;
2690 efip->efi_flags |= XFS_EFI_COMMITTED; 2660 efip->efi_flags |= XFS_EFI_COMMITTED;
2691 2661
2692 spin_lock(&mp->m_ail_lock); 2662 spin_lock(&log->l_ailp->xa_lock);
2693 /* 2663 /*
2694 * xfs_trans_update_ail() drops the AIL lock. 2664 * xfs_trans_ail_update() drops the AIL lock.
2695 */ 2665 */
2696 xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn); 2666 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
2697 return 0; 2667 return 0;
2698} 2668}
2699 2669
@@ -2712,12 +2682,12 @@ xlog_recover_do_efd_trans(
2712 xlog_recover_item_t *item, 2682 xlog_recover_item_t *item,
2713 int pass) 2683 int pass)
2714{ 2684{
2715 xfs_mount_t *mp;
2716 xfs_efd_log_format_t *efd_formatp; 2685 xfs_efd_log_format_t *efd_formatp;
2717 xfs_efi_log_item_t *efip = NULL; 2686 xfs_efi_log_item_t *efip = NULL;
2718 xfs_log_item_t *lip; 2687 xfs_log_item_t *lip;
2719 int gen;
2720 __uint64_t efi_id; 2688 __uint64_t efi_id;
2689 struct xfs_ail_cursor cur;
2690 struct xfs_ail *ailp = log->l_ailp;
2721 2691
2722 if (pass == XLOG_RECOVER_PASS1) { 2692 if (pass == XLOG_RECOVER_PASS1) {
2723 return; 2693 return;
@@ -2734,25 +2704,26 @@ xlog_recover_do_efd_trans(
2734 * Search for the efi with the id in the efd format structure 2704 * Search for the efi with the id in the efd format structure
2735 * in the AIL. 2705 * in the AIL.
2736 */ 2706 */
2737 mp = log->l_mp; 2707 spin_lock(&ailp->xa_lock);
2738 spin_lock(&mp->m_ail_lock); 2708 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2739 lip = xfs_trans_first_ail(mp, &gen);
2740 while (lip != NULL) { 2709 while (lip != NULL) {
2741 if (lip->li_type == XFS_LI_EFI) { 2710 if (lip->li_type == XFS_LI_EFI) {
2742 efip = (xfs_efi_log_item_t *)lip; 2711 efip = (xfs_efi_log_item_t *)lip;
2743 if (efip->efi_format.efi_id == efi_id) { 2712 if (efip->efi_format.efi_id == efi_id) {
2744 /* 2713 /*
2745 * xfs_trans_delete_ail() drops the 2714 * xfs_trans_ail_delete() drops the
2746 * AIL lock. 2715 * AIL lock.
2747 */ 2716 */
2748 xfs_trans_delete_ail(mp, lip); 2717 xfs_trans_ail_delete(ailp, lip);
2749 xfs_efi_item_free(efip); 2718 xfs_efi_item_free(efip);
2750 return; 2719 spin_lock(&ailp->xa_lock);
2720 break;
2751 } 2721 }
2752 } 2722 }
2753 lip = xfs_trans_next_ail(mp, lip, &gen, NULL); 2723 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2754 } 2724 }
2755 spin_unlock(&mp->m_ail_lock); 2725 xfs_trans_ail_cursor_done(ailp, &cur);
2726 spin_unlock(&ailp->xa_lock);
2756} 2727}
2757 2728
2758/* 2729/*
@@ -3036,33 +3007,6 @@ abort_error:
3036} 3007}
3037 3008
3038/* 3009/*
3039 * Verify that once we've encountered something other than an EFI
3040 * in the AIL that there are no more EFIs in the AIL.
3041 */
3042#if defined(DEBUG)
3043STATIC void
3044xlog_recover_check_ail(
3045 xfs_mount_t *mp,
3046 xfs_log_item_t *lip,
3047 int gen)
3048{
3049 int orig_gen = gen;
3050
3051 do {
3052 ASSERT(lip->li_type != XFS_LI_EFI);
3053 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3054 /*
3055 * The check will be bogus if we restart from the
3056 * beginning of the AIL, so ASSERT that we don't.
3057 * We never should since we're holding the AIL lock
3058 * the entire time.
3059 */
3060 ASSERT(gen == orig_gen);
3061 } while (lip != NULL);
3062}
3063#endif /* DEBUG */
3064
3065/*
3066 * When this is called, all of the EFIs which did not have 3010 * When this is called, all of the EFIs which did not have
3067 * corresponding EFDs should be in the AIL. What we do now 3011 * corresponding EFDs should be in the AIL. What we do now
3068 * is free the extents associated with each one. 3012 * is free the extents associated with each one.
@@ -3086,20 +3030,23 @@ xlog_recover_process_efis(
3086{ 3030{
3087 xfs_log_item_t *lip; 3031 xfs_log_item_t *lip;
3088 xfs_efi_log_item_t *efip; 3032 xfs_efi_log_item_t *efip;
3089 int gen;
3090 xfs_mount_t *mp;
3091 int error = 0; 3033 int error = 0;
3034 struct xfs_ail_cursor cur;
3035 struct xfs_ail *ailp;
3092 3036
3093 mp = log->l_mp; 3037 ailp = log->l_ailp;
3094 spin_lock(&mp->m_ail_lock); 3038 spin_lock(&ailp->xa_lock);
3095 3039 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3096 lip = xfs_trans_first_ail(mp, &gen);
3097 while (lip != NULL) { 3040 while (lip != NULL) {
3098 /* 3041 /*
3099 * We're done when we see something other than an EFI. 3042 * We're done when we see something other than an EFI.
3043 * There should be no EFIs left in the AIL now.
3100 */ 3044 */
3101 if (lip->li_type != XFS_LI_EFI) { 3045 if (lip->li_type != XFS_LI_EFI) {
3102 xlog_recover_check_ail(mp, lip, gen); 3046#ifdef DEBUG
3047 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3048 ASSERT(lip->li_type != XFS_LI_EFI);
3049#endif
3103 break; 3050 break;
3104 } 3051 }
3105 3052
@@ -3108,18 +3055,20 @@ xlog_recover_process_efis(
3108 */ 3055 */
3109 efip = (xfs_efi_log_item_t *)lip; 3056 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 3057 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3111 lip = xfs_trans_next_ail(mp, lip, &gen, NULL); 3058 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 3059 continue;
3113 } 3060 }
3114 3061
3115 spin_unlock(&mp->m_ail_lock); 3062 spin_unlock(&ailp->xa_lock);
3116 error = xlog_recover_process_efi(mp, efip); 3063 error = xlog_recover_process_efi(log->l_mp, efip);
3064 spin_lock(&ailp->xa_lock);
3117 if (error) 3065 if (error)
3118 return error; 3066 goto out;
3119 spin_lock(&mp->m_ail_lock); 3067 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3120 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3121 } 3068 }
3122 spin_unlock(&mp->m_ail_lock); 3069out:
3070 xfs_trans_ail_cursor_done(ailp, &cur);
3071 spin_unlock(&ailp->xa_lock);
3123 return error; 3072 return error;
3124} 3073}
3125 3074
@@ -3140,19 +3089,16 @@ xlog_recover_clear_agi_bucket(
3140 int error; 3089 int error;
3141 3090
3142 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3091 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3143 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); 3092 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3144 if (!error) 3093 0, 0, 0);
3145 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3146 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3147 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3148 if (error) 3094 if (error)
3149 goto out_abort; 3095 goto out_abort;
3150 3096
3151 error = EINVAL; 3097 error = xfs_read_agi(mp, tp, agno, &agibp);
3152 agi = XFS_BUF_TO_AGI(agibp); 3098 if (error)
3153 if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
3154 goto out_abort; 3099 goto out_abort;
3155 3100
3101 agi = XFS_BUF_TO_AGI(agibp);
3156 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 3102 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3157 offset = offsetof(xfs_agi_t, agi_unlinked) + 3103 offset = offsetof(xfs_agi_t, agi_unlinked) +
3158 (sizeof(xfs_agino_t) * bucket); 3104 (sizeof(xfs_agino_t) * bucket);
@@ -3172,6 +3118,62 @@ out_error:
3172 return; 3118 return;
3173} 3119}
3174 3120
3121STATIC xfs_agino_t
3122xlog_recover_process_one_iunlink(
3123 struct xfs_mount *mp,
3124 xfs_agnumber_t agno,
3125 xfs_agino_t agino,
3126 int bucket)
3127{
3128 struct xfs_buf *ibp;
3129 struct xfs_dinode *dip;
3130 struct xfs_inode *ip;
3131 xfs_ino_t ino;
3132 int error;
3133
3134 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3135 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3136 if (error)
3137 goto fail;
3138
3139 /*
3140 * Get the on disk inode to find the next inode in the bucket.
3141 */
3142 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
3143 if (error)
3144 goto fail_iput;
3145
3146 ASSERT(ip->i_d.di_nlink == 0);
3147 ASSERT(ip->i_d.di_mode != 0);
3148
3149 /* setup for the next pass */
3150 agino = be32_to_cpu(dip->di_next_unlinked);
3151 xfs_buf_relse(ibp);
3152
3153 /*
3154 * Prevent any DMAPI event from being sent when the reference on
3155 * the inode is dropped.
3156 */
3157 ip->i_d.di_dmevmask = 0;
3158
3159 IRELE(ip);
3160 return agino;
3161
3162 fail_iput:
3163 IRELE(ip);
3164 fail:
3165 /*
3166 * We can't read in the inode this bucket points to, or this inode
3167 * is messed up. Just ditch this bucket of inodes. We will lose
3168 * some inodes and space, but at least we won't hang.
3169 *
3170 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3171 * clear the inode pointer in the bucket.
3172 */
3173 xlog_recover_clear_agi_bucket(mp, agno, bucket);
3174 return NULLAGINO;
3175}
3176
3175/* 3177/*
3176 * xlog_iunlink_recover 3178 * xlog_iunlink_recover
3177 * 3179 *
@@ -3192,11 +3194,7 @@ xlog_recover_process_iunlinks(
3192 xfs_agnumber_t agno; 3194 xfs_agnumber_t agno;
3193 xfs_agi_t *agi; 3195 xfs_agi_t *agi;
3194 xfs_buf_t *agibp; 3196 xfs_buf_t *agibp;
3195 xfs_buf_t *ibp;
3196 xfs_dinode_t *dip;
3197 xfs_inode_t *ip;
3198 xfs_agino_t agino; 3197 xfs_agino_t agino;
3199 xfs_ino_t ino;
3200 int bucket; 3198 int bucket;
3201 int error; 3199 int error;
3202 uint mp_dmevmask; 3200 uint mp_dmevmask;
@@ -3213,22 +3211,21 @@ xlog_recover_process_iunlinks(
3213 /* 3211 /*
3214 * Find the agi for this ag. 3212 * Find the agi for this ag.
3215 */ 3213 */
3216 agibp = xfs_buf_read(mp->m_ddev_targp, 3214 error = xfs_read_agi(mp, NULL, agno, &agibp);
3217 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 3215 if (error) {
3218 XFS_FSS_TO_BB(mp, 1), 0); 3216 /*
3219 if (XFS_BUF_ISERROR(agibp)) { 3217 * AGI is b0rked. Don't process it.
3220 xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)", 3218 *
3221 log->l_mp, agibp, 3219 * We should probably mark the filesystem as corrupt
3222 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp))); 3220 * after we've recovered all the ag's we can....
3221 */
3222 continue;
3223 } 3223 }
3224 agi = XFS_BUF_TO_AGI(agibp); 3224 agi = XFS_BUF_TO_AGI(agibp);
3225 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum));
3226 3225
3227 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 3226 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3228
3229 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 3227 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3230 while (agino != NULLAGINO) { 3228 while (agino != NULLAGINO) {
3231
3232 /* 3229 /*
3233 * Release the agi buffer so that it can 3230 * Release the agi buffer so that it can
3234 * be acquired in the normal course of the 3231 * be acquired in the normal course of the
@@ -3236,87 +3233,17 @@ xlog_recover_process_iunlinks(
3236 */ 3233 */
3237 xfs_buf_relse(agibp); 3234 xfs_buf_relse(agibp);
3238 3235
3239 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3236 agino = xlog_recover_process_one_iunlink(mp,
3240 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); 3237 agno, agino, bucket);
3241 ASSERT(error || (ip != NULL));
3242
3243 if (!error) {
3244 /*
3245 * Get the on disk inode to find the
3246 * next inode in the bucket.
3247 */
3248 error = xfs_itobp(mp, NULL, ip, &dip,
3249 &ibp, 0, 0,
3250 XFS_BUF_LOCK);
3251 ASSERT(error || (dip != NULL));
3252 }
3253
3254 if (!error) {
3255 ASSERT(ip->i_d.di_nlink == 0);
3256
3257 /* setup for the next pass */
3258 agino = be32_to_cpu(
3259 dip->di_next_unlinked);
3260 xfs_buf_relse(ibp);
3261 /*
3262 * Prevent any DMAPI event from
3263 * being sent when the
3264 * reference on the inode is
3265 * dropped.
3266 */
3267 ip->i_d.di_dmevmask = 0;
3268
3269 /*
3270 * If this is a new inode, handle
3271 * it specially. Otherwise,
3272 * just drop our reference to the
3273 * inode. If there are no
3274 * other references, this will
3275 * send the inode to
3276 * xfs_inactive() which will
3277 * truncate the file and free
3278 * the inode.
3279 */
3280 if (ip->i_d.di_mode == 0)
3281 xfs_iput_new(ip, 0);
3282 else
3283 IRELE(ip);
3284 } else {
3285 /*
3286 * We can't read in the inode
3287 * this bucket points to, or
3288 * this inode is messed up. Just
3289 * ditch this bucket of inodes. We
3290 * will lose some inodes and space,
3291 * but at least we won't hang. Call
3292 * xlog_recover_clear_agi_bucket()
3293 * to perform a transaction to clear
3294 * the inode pointer in the bucket.
3295 */
3296 xlog_recover_clear_agi_bucket(mp, agno,
3297 bucket);
3298
3299 agino = NULLAGINO;
3300 }
3301 3238
3302 /* 3239 /*
3303 * Reacquire the agibuffer and continue around 3240 * Reacquire the agibuffer and continue around
3304 * the loop. 3241 * the loop. This should never fail as we know
3242 * the buffer was good earlier on.
3305 */ 3243 */
3306 agibp = xfs_buf_read(mp->m_ddev_targp, 3244 error = xfs_read_agi(mp, NULL, agno, &agibp);
3307 XFS_AG_DADDR(mp, agno, 3245 ASSERT(error == 0);
3308 XFS_AGI_DADDR(mp)),
3309 XFS_FSS_TO_BB(mp, 1), 0);
3310 if (XFS_BUF_ISERROR(agibp)) {
3311 xfs_ioerror_alert(
3312 "xlog_recover_process_iunlinks(#2)",
3313 log->l_mp, agibp,
3314 XFS_AG_DADDR(mp, agno,
3315 XFS_AGI_DADDR(mp)));
3316 }
3317 agi = XFS_BUF_TO_AGI(agibp); 3246 agi = XFS_BUF_TO_AGI(agibp);
3318 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(
3319 agi->agi_magicnum));
3320 } 3247 }
3321 } 3248 }
3322 3249
@@ -3367,7 +3294,6 @@ xlog_pack_data(
3367 int size = iclog->ic_offset + roundoff; 3294 int size = iclog->ic_offset + roundoff;
3368 __be32 cycle_lsn; 3295 __be32 cycle_lsn;
3369 xfs_caddr_t dp; 3296 xfs_caddr_t dp;
3370 xlog_in_core_2_t *xhdr;
3371 3297
3372 xlog_pack_data_checksum(log, iclog, size); 3298 xlog_pack_data_checksum(log, iclog, size);
3373 3299
@@ -3382,7 +3308,8 @@ xlog_pack_data(
3382 } 3308 }
3383 3309
3384 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3310 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3385 xhdr = (xlog_in_core_2_t *)&iclog->ic_header; 3311 xlog_in_core_2_t *xhdr = iclog->ic_data;
3312
3386 for ( ; i < BTOBB(size); i++) { 3313 for ( ; i < BTOBB(size); i++) {
3387 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3314 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3388 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3315 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3440,7 +3367,6 @@ xlog_unpack_data(
3440 xlog_t *log) 3367 xlog_t *log)
3441{ 3368{
3442 int i, j, k; 3369 int i, j, k;
3443 xlog_in_core_2_t *xhdr;
3444 3370
3445 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3371 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3446 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3372 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3449,7 +3375,7 @@ xlog_unpack_data(
3449 } 3375 }
3450 3376
3451 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3377 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3452 xhdr = (xlog_in_core_2_t *)rhead; 3378 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3453 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 3379 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3454 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3380 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3455 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3381 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -4003,11 +3929,8 @@ xlog_recover_check_summary(
4003{ 3929{
4004 xfs_mount_t *mp; 3930 xfs_mount_t *mp;
4005 xfs_agf_t *agfp; 3931 xfs_agf_t *agfp;
4006 xfs_agi_t *agip;
4007 xfs_buf_t *agfbp; 3932 xfs_buf_t *agfbp;
4008 xfs_buf_t *agibp; 3933 xfs_buf_t *agibp;
4009 xfs_daddr_t agfdaddr;
4010 xfs_daddr_t agidaddr;
4011 xfs_buf_t *sbbp; 3934 xfs_buf_t *sbbp;
4012#ifdef XFS_LOUD_RECOVERY 3935#ifdef XFS_LOUD_RECOVERY
4013 xfs_sb_t *sbp; 3936 xfs_sb_t *sbp;
@@ -4016,6 +3939,7 @@ xlog_recover_check_summary(
4016 __uint64_t freeblks; 3939 __uint64_t freeblks;
4017 __uint64_t itotal; 3940 __uint64_t itotal;
4018 __uint64_t ifree; 3941 __uint64_t ifree;
3942 int error;
4019 3943
4020 mp = log->l_mp; 3944 mp = log->l_mp;
4021 3945
@@ -4023,37 +3947,27 @@ xlog_recover_check_summary(
4023 itotal = 0LL; 3947 itotal = 0LL;
4024 ifree = 0LL; 3948 ifree = 0LL;
4025 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3949 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4026 agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)); 3950 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
4027 agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr, 3951 if (error) {
4028 XFS_FSS_TO_BB(mp, 1), 0); 3952 xfs_fs_cmn_err(CE_ALERT, mp,
4029 if (XFS_BUF_ISERROR(agfbp)) { 3953 "xlog_recover_check_summary(agf)"
4030 xfs_ioerror_alert("xlog_recover_check_summary(agf)", 3954 "agf read failed agno %d error %d",
4031 mp, agfbp, agfdaddr); 3955 agno, error);
4032 } 3956 } else {
4033 agfp = XFS_BUF_TO_AGF(agfbp); 3957 agfp = XFS_BUF_TO_AGF(agfbp);
4034 ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum)); 3958 freeblks += be32_to_cpu(agfp->agf_freeblks) +
4035 ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum))); 3959 be32_to_cpu(agfp->agf_flcount);
4036 ASSERT(be32_to_cpu(agfp->agf_seqno) == agno); 3960 xfs_buf_relse(agfbp);
4037
4038 freeblks += be32_to_cpu(agfp->agf_freeblks) +
4039 be32_to_cpu(agfp->agf_flcount);
4040 xfs_buf_relse(agfbp);
4041
4042 agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
4043 agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
4044 XFS_FSS_TO_BB(mp, 1), 0);
4045 if (XFS_BUF_ISERROR(agibp)) {
4046 xfs_ioerror_alert("xlog_recover_check_summary(agi)",
4047 mp, agibp, agidaddr);
4048 } 3961 }
4049 agip = XFS_BUF_TO_AGI(agibp);
4050 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum));
4051 ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum)));
4052 ASSERT(be32_to_cpu(agip->agi_seqno) == agno);
4053 3962
4054 itotal += be32_to_cpu(agip->agi_count); 3963 error = xfs_read_agi(mp, NULL, agno, &agibp);
4055 ifree += be32_to_cpu(agip->agi_freecount); 3964 if (!error) {
4056 xfs_buf_relse(agibp); 3965 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3966
3967 itotal += be32_to_cpu(agi->agi_count);
3968 ifree += be32_to_cpu(agi->agi_freecount);
3969 xfs_buf_relse(agibp);
3970 }
4057 } 3971 }
4058 3972
4059 sbbp = xfs_getsb(mp, 0); 3973 sbbp = xfs_getsb(mp, 0);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 15f5dd22fbb2..3c97c6463a4e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -567,8 +567,6 @@ xfs_readsb(xfs_mount_t *mp, int flags)
567STATIC void 567STATIC void
568xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) 568xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
569{ 569{
570 int i;
571
572 mp->m_agfrotor = mp->m_agirotor = 0; 570 mp->m_agfrotor = mp->m_agirotor = 0;
573 spin_lock_init(&mp->m_agirotor_lock); 571 spin_lock_init(&mp->m_agirotor_lock);
574 mp->m_maxagi = mp->m_sb.sb_agcount; 572 mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -577,12 +575,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
577 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; 575 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
578 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; 576 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
579 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; 577 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
580 mp->m_litino = sbp->sb_inodesize - 578 mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
581 ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
582 mp->m_blockmask = sbp->sb_blocksize - 1; 579 mp->m_blockmask = sbp->sb_blocksize - 1;
583 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; 580 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
584 mp->m_blockwmask = mp->m_blockwsize - 1; 581 mp->m_blockwmask = mp->m_blockwsize - 1;
585 INIT_LIST_HEAD(&mp->m_del_inodes);
586 582
587 /* 583 /*
588 * Setup for attributes, in case they get created. 584 * Setup for attributes, in case they get created.
@@ -605,24 +601,20 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
605 } 601 }
606 ASSERT(mp->m_attroffset < XFS_LITINO(mp)); 602 ASSERT(mp->m_attroffset < XFS_LITINO(mp));
607 603
608 for (i = 0; i < 2; i++) { 604 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
609 mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 605 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
610 xfs_alloc, i == 0); 606 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
611 mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, 607 mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
612 xfs_alloc, i == 0); 608
613 } 609 mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
614 for (i = 0; i < 2; i++) { 610 mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
615 mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 611 mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
616 xfs_bmbt, i == 0); 612 mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
617 mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, 613
618 xfs_bmbt, i == 0); 614 mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
619 } 615 mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
620 for (i = 0; i < 2; i++) { 616 mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
621 mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 617 mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
622 xfs_inobt, i == 0);
623 mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
624 xfs_inobt, i == 0);
625 }
626 618
627 mp->m_bsize = XFS_FSB_TO_BB(mp, 1); 619 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
628 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, 620 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
@@ -1228,6 +1220,16 @@ xfs_unmountfs(
1228 __uint64_t resblks; 1220 __uint64_t resblks;
1229 int error; 1221 int error;
1230 1222
1223 /*
1224 * Release dquot that rootinode, rbmino and rsumino might be holding,
1225 * and release the quota inodes.
1226 */
1227 XFS_QM_UNMOUNT(mp);
1228
1229 if (mp->m_rbmip)
1230 IRELE(mp->m_rbmip);
1231 if (mp->m_rsumip)
1232 IRELE(mp->m_rsumip);
1231 IRELE(mp->m_rootip); 1233 IRELE(mp->m_rootip);
1232 1234
1233 /* 1235 /*
@@ -1241,7 +1243,7 @@ xfs_unmountfs(
1241 * need to force the log first. 1243 * need to force the log first.
1242 */ 1244 */
1243 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1245 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1244 xfs_iflush_all(mp); 1246 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
1245 1247
1246 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 1248 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
1247 1249
@@ -1288,11 +1290,6 @@ xfs_unmountfs(
1288 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1290 xfs_unmountfs_wait(mp); /* wait for async bufs */
1289 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1291 xfs_log_unmount(mp); /* Done! No more fs ops. */
1290 1292
1291 /*
1292 * All inodes from this mount point should be freed.
1293 */
1294 ASSERT(mp->m_inodes == NULL);
1295
1296 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1293 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
1297 uuid_table_remove(&mp->m_sb.sb_uuid); 1294 uuid_table_remove(&mp->m_sb.sb_uuid);
1298 1295
@@ -1365,24 +1362,6 @@ xfs_log_sbcount(
1365 return error; 1362 return error;
1366} 1363}
1367 1364
1368STATIC void
1369xfs_mark_shared_ro(
1370 xfs_mount_t *mp,
1371 xfs_buf_t *bp)
1372{
1373 xfs_dsb_t *sb = XFS_BUF_TO_SBP(bp);
1374 __uint16_t version;
1375
1376 if (!(sb->sb_flags & XFS_SBF_READONLY))
1377 sb->sb_flags |= XFS_SBF_READONLY;
1378
1379 version = be16_to_cpu(sb->sb_versionnum);
1380 if ((version & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4 ||
1381 !(version & XFS_SB_VERSION_SHAREDBIT))
1382 version |= XFS_SB_VERSION_SHAREDBIT;
1383 sb->sb_versionnum = cpu_to_be16(version);
1384}
1385
1386int 1365int
1387xfs_unmountfs_writesb(xfs_mount_t *mp) 1366xfs_unmountfs_writesb(xfs_mount_t *mp)
1388{ 1367{
@@ -1398,12 +1377,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1398 1377
1399 sbp = xfs_getsb(mp, 0); 1378 sbp = xfs_getsb(mp, 0);
1400 1379
1401 /*
1402 * mark shared-readonly if desired
1403 */
1404 if (mp->m_mk_sharedro)
1405 xfs_mark_shared_ro(mp, sbp);
1406
1407 XFS_BUF_UNDONE(sbp); 1380 XFS_BUF_UNDONE(sbp);
1408 XFS_BUF_UNREAD(sbp); 1381 XFS_BUF_UNREAD(sbp);
1409 XFS_BUF_UNDELAYWRITE(sbp); 1382 XFS_BUF_UNDELAYWRITE(sbp);
@@ -1415,8 +1388,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1415 if (error) 1388 if (error)
1416 xfs_ioerror_alert("xfs_unmountfs_writesb", 1389 xfs_ioerror_alert("xfs_unmountfs_writesb",
1417 mp, sbp, XFS_BUF_ADDR(sbp)); 1390 mp, sbp, XFS_BUF_ADDR(sbp));
1418 if (error && mp->m_mk_sharedro)
1419 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
1420 xfs_buf_relse(sbp); 1391 xfs_buf_relse(sbp);
1421 } 1392 }
1422 return error; 1393 return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f3c1024b1241..c1e028467327 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_MOUNT_H__ 18#ifndef __XFS_MOUNT_H__
19#define __XFS_MOUNT_H__ 19#define __XFS_MOUNT_H__
20 20
21
22typedef struct xfs_trans_reservations { 21typedef struct xfs_trans_reservations {
23 uint tr_write; /* extent alloc trans */ 22 uint tr_write; /* extent alloc trans */
24 uint tr_itruncate; /* truncate trans */ 23 uint tr_itruncate; /* truncate trans */
@@ -44,14 +43,16 @@ typedef struct xfs_trans_reservations {
44} xfs_trans_reservations_t; 43} xfs_trans_reservations_t;
45 44
46#ifndef __KERNEL__ 45#ifndef __KERNEL__
47/* 46
48 * Moved here from xfs_ag.h to avoid reordering header files
49 */
50#define XFS_DADDR_TO_AGNO(mp,d) \ 47#define XFS_DADDR_TO_AGNO(mp,d) \
51 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) 48 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
52#define XFS_DADDR_TO_AGBNO(mp,d) \ 49#define XFS_DADDR_TO_AGBNO(mp,d) \
53 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) 50 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
54#else 51
52#else /* __KERNEL__ */
53
54#include "xfs_sync.h"
55
55struct cred; 56struct cred;
56struct log; 57struct log;
57struct xfs_mount_args; 58struct xfs_mount_args;
@@ -62,6 +63,7 @@ struct xfs_extdelta;
62struct xfs_swapext; 63struct xfs_swapext;
63struct xfs_mru_cache; 64struct xfs_mru_cache;
64struct xfs_nameops; 65struct xfs_nameops;
66struct xfs_ail;
65 67
66/* 68/*
67 * Prototypes and functions for the Data Migration subsystem. 69 * Prototypes and functions for the Data Migration subsystem.
@@ -115,7 +117,7 @@ struct xfs_quotainfo;
115 117
116typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); 118typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
117typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint); 119typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
118typedef int (*xfs_qmunmount_t)(struct xfs_mount *); 120typedef void (*xfs_qmunmount_t)(struct xfs_mount *);
119typedef void (*xfs_qmdone_t)(struct xfs_mount *); 121typedef void (*xfs_qmdone_t)(struct xfs_mount *);
120typedef void (*xfs_dqrele_t)(struct xfs_dquot *); 122typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
121typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint); 123typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint);
@@ -132,7 +134,7 @@ typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
132 struct xfs_dquot **, struct xfs_dquot *); 134 struct xfs_dquot **, struct xfs_dquot *);
133typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, 135typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
134 struct xfs_dquot *, struct xfs_dquot *, uint); 136 struct xfs_dquot *, struct xfs_dquot *, uint);
135typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, bhv_statvfs_t *); 137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
136typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); 138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags);
137typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t); 139typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
138 140
@@ -223,18 +225,10 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
223#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 225#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
224#endif 226#endif
225 227
226typedef struct xfs_ail {
227 struct list_head xa_ail;
228 uint xa_gen;
229 struct task_struct *xa_task;
230 xfs_lsn_t xa_target;
231} xfs_ail_t;
232
233typedef struct xfs_mount { 228typedef struct xfs_mount {
234 struct super_block *m_super; 229 struct super_block *m_super;
235 xfs_tid_t m_tid; /* next unused tid for fs */ 230 xfs_tid_t m_tid; /* next unused tid for fs */
236 spinlock_t m_ail_lock; /* fs AIL mutex */ 231 struct xfs_ail *m_ail; /* fs active log item list */
237 xfs_ail_t m_ail; /* fs active log item list */
238 xfs_sb_t m_sb; /* copy of fs superblock */ 232 xfs_sb_t m_sb; /* copy of fs superblock */
239 spinlock_t m_sb_lock; /* sb counter lock */ 233 spinlock_t m_sb_lock; /* sb counter lock */
240 struct xfs_buf *m_sb_bp; /* buffer for superblock */ 234 struct xfs_buf *m_sb_bp; /* buffer for superblock */
@@ -247,10 +241,6 @@ typedef struct xfs_mount {
247 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ 241 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
248 spinlock_t m_agirotor_lock;/* .. and lock protecting it */ 242 spinlock_t m_agirotor_lock;/* .. and lock protecting it */
249 xfs_agnumber_t m_maxagi; /* highest inode alloc group */ 243 xfs_agnumber_t m_maxagi; /* highest inode alloc group */
250 struct xfs_inode *m_inodes; /* active inode list */
251 struct list_head m_del_inodes; /* inodes to reclaim */
252 mutex_t m_ilock; /* inode list mutex */
253 uint m_ireclaims; /* count of calls to reclaim*/
254 uint m_readio_log; /* min read size log bytes */ 244 uint m_readio_log; /* min read size log bytes */
255 uint m_readio_blocks; /* min read size blocks */ 245 uint m_readio_blocks; /* min read size blocks */
256 uint m_writeio_log; /* min write size log bytes */ 246 uint m_writeio_log; /* min write size log bytes */
@@ -267,7 +257,6 @@ typedef struct xfs_mount {
267 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ 257 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
268 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ 258 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
269 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ 259 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
270 __uint8_t m_dircook_elog; /* log d-cookie entry bits */
271 __uint8_t m_blkbit_log; /* blocklog + NBBY */ 260 __uint8_t m_blkbit_log; /* blocklog + NBBY */
272 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ 261 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
273 __uint8_t m_agno_log; /* log #ag's */ 262 __uint8_t m_agno_log; /* log #ag's */
@@ -276,12 +265,12 @@ typedef struct xfs_mount {
276 uint m_blockmask; /* sb_blocksize-1 */ 265 uint m_blockmask; /* sb_blocksize-1 */
277 uint m_blockwsize; /* sb_blocksize in words */ 266 uint m_blockwsize; /* sb_blocksize in words */
278 uint m_blockwmask; /* blockwsize-1 */ 267 uint m_blockwmask; /* blockwsize-1 */
279 uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */ 268 uint m_alloc_mxr[2]; /* max alloc btree records */
280 uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */ 269 uint m_alloc_mnr[2]; /* min alloc btree records */
281 uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */ 270 uint m_bmap_dmxr[2]; /* max bmap btree records */
282 uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */ 271 uint m_bmap_dmnr[2]; /* min bmap btree records */
283 uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */ 272 uint m_inobt_mxr[2]; /* max inobt btree records */
284 uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */ 273 uint m_inobt_mnr[2]; /* min inobt btree records */
285 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 274 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
286 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 275 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
287 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ 276 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */
@@ -312,9 +301,6 @@ typedef struct xfs_mount {
312 int m_sinoalign; /* stripe unit inode alignment */ 301 int m_sinoalign; /* stripe unit inode alignment */
313 int m_attr_magicpct;/* 37% of the blocksize */ 302 int m_attr_magicpct;/* 37% of the blocksize */
314 int m_dir_magicpct; /* 37% of the dir blocksize */ 303 int m_dir_magicpct; /* 37% of the dir blocksize */
315 __uint8_t m_mk_sharedro; /* mark shared ro on unmount */
316 __uint8_t m_inode_quiesce;/* call quiesce on new inodes.
317 field governed by m_ilock */
318 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ 304 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
319 const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ 305 const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
320 int m_dirblksize; /* directory block sz--bytes */ 306 int m_dirblksize; /* directory block sz--bytes */
@@ -362,7 +348,6 @@ typedef struct xfs_mount {
362#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ 348#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
363#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ 349#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
364#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ 350#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
365#define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */
366#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ 351#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
367#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */ 352#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */
368 /* osyncisdsync is now default*/ 353 /* osyncisdsync is now default*/
@@ -439,6 +424,16 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
439#define xfs_force_shutdown(m,f) \ 424#define xfs_force_shutdown(m,f) \
440 xfs_do_force_shutdown(m, f, __FILE__, __LINE__) 425 xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
441 426
427#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
428#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
429#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
430#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
431#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
432#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
433
434#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
435#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
436
442/* 437/*
443 * Flags for xfs_mountfs 438 * Flags for xfs_mountfs
444 */ 439 */
@@ -508,14 +503,12 @@ typedef struct xfs_mod_sb {
508#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) 503#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
509#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) 504#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
510 505
511extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
512extern int xfs_log_sbcount(xfs_mount_t *, uint); 506extern int xfs_log_sbcount(xfs_mount_t *, uint);
513extern int xfs_mountfs(xfs_mount_t *mp); 507extern int xfs_mountfs(xfs_mount_t *mp);
514extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 508extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
515 509
516extern void xfs_unmountfs(xfs_mount_t *); 510extern void xfs_unmountfs(xfs_mount_t *);
517extern int xfs_unmountfs_writesb(xfs_mount_t *); 511extern int xfs_unmountfs_writesb(xfs_mount_t *);
518extern int xfs_unmount_flush(xfs_mount_t *, int);
519extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 512extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
520extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, 513extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
521 int64_t, int); 514 int64_t, int);
@@ -525,20 +518,20 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
525extern int xfs_readsb(xfs_mount_t *, int); 518extern int xfs_readsb(xfs_mount_t *, int);
526extern void xfs_freesb(xfs_mount_t *); 519extern void xfs_freesb(xfs_mount_t *);
527extern int xfs_fs_writable(xfs_mount_t *); 520extern int xfs_fs_writable(xfs_mount_t *);
528extern int xfs_syncsub(xfs_mount_t *, int, int *);
529extern int xfs_sync_inodes(xfs_mount_t *, int, int *);
530extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t);
531extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
532extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
533extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); 521extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
534 522
535extern int xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *); 523extern int xfs_dmops_get(struct xfs_mount *);
536extern void xfs_dmops_put(struct xfs_mount *); 524extern void xfs_dmops_put(struct xfs_mount *);
537extern int xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *); 525extern int xfs_qmops_get(struct xfs_mount *);
538extern void xfs_qmops_put(struct xfs_mount *); 526extern void xfs_qmops_put(struct xfs_mount *);
539 527
540extern struct xfs_dmops xfs_dmcore_xfs; 528extern struct xfs_dmops xfs_dmcore_xfs;
541 529
542#endif /* __KERNEL__ */ 530#endif /* __KERNEL__ */
543 531
532extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
533extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
534extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
535extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
536
544#endif /* __XFS_MOUNT_H__ */ 537#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index a294e58db8dd..27f80581520a 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -28,7 +28,6 @@
28#include "xfs_mount.h" 28#include "xfs_mount.h"
29#include "xfs_quota.h" 29#include "xfs_quota.h"
30#include "xfs_error.h" 30#include "xfs_error.h"
31#include "xfs_clnt.h"
32 31
33 32
34STATIC struct xfs_dquot * 33STATIC struct xfs_dquot *
@@ -131,9 +130,9 @@ static struct xfs_qmops xfs_qmcore_stub = {
131}; 130};
132 131
133int 132int
134xfs_qmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) 133xfs_qmops_get(struct xfs_mount *mp)
135{ 134{
136 if (args->flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)) { 135 if (XFS_IS_QUOTA_RUNNING(mp)) {
137#ifdef CONFIG_XFS_QUOTA 136#ifdef CONFIG_XFS_QUOTA
138 mp->m_qm_ops = &xfs_qmcore_xfs; 137 mp->m_qm_ops = &xfs_qmcore_xfs;
139#else 138#else
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 12c4ec775af8..48965ecaa155 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -84,11 +84,9 @@ typedef struct xfs_dqblk {
84#define XFS_DQ_USER 0x0001 /* a user quota */ 84#define XFS_DQ_USER 0x0001 /* a user quota */
85#define XFS_DQ_PROJ 0x0002 /* project quota */ 85#define XFS_DQ_PROJ 0x0002 /* project quota */
86#define XFS_DQ_GROUP 0x0004 /* a group quota */ 86#define XFS_DQ_GROUP 0x0004 /* a group quota */
87#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */ 87#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
88#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */ 88#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */
89#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */ 89#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */
90#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */
91#define XFS_DQ_MARKER 0x0080 /* sentinel */
92 90
93#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) 91#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
94 92
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index c903130be7fd..86471bb40fd4 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -42,31 +42,6 @@
42 42
43 43
44/* 44/*
45 * Given an array of up to 4 inode pointers, unlock the pointed to inodes.
46 * If there are fewer than 4 entries in the array, the empty entries will
47 * be at the end and will have NULL pointers in them.
48 */
49STATIC void
50xfs_rename_unlock4(
51 xfs_inode_t **i_tab,
52 uint lock_mode)
53{
54 int i;
55
56 xfs_iunlock(i_tab[0], lock_mode);
57 for (i = 1; i < 4; i++) {
58 if (i_tab[i] == NULL)
59 break;
60
61 /*
62 * Watch out for duplicate entries in the table.
63 */
64 if (i_tab[i] != i_tab[i-1])
65 xfs_iunlock(i_tab[i], lock_mode);
66 }
67}
68
69/*
70 * Enter all inodes for a rename transaction into a sorted array. 45 * Enter all inodes for a rename transaction into a sorted array.
71 */ 46 */
72STATIC void 47STATIC void
@@ -205,19 +180,6 @@ xfs_rename(
205 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 180 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
206 181
207 /* 182 /*
208 * If we are using project inheritance, we only allow renames
209 * into our tree when the project IDs are the same; else the
210 * tree quota mechanism would be circumvented.
211 */
212 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
213 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
214 error = XFS_ERROR(EXDEV);
215 xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
216 xfs_trans_cancel(tp, cancel_flags);
217 goto std_return;
218 }
219
220 /*
221 * Join all the inodes to the transaction. From this point on, 183 * Join all the inodes to the transaction. From this point on,
222 * we can rely on either trans_commit or trans_cancel to unlock 184 * we can rely on either trans_commit or trans_cancel to unlock
223 * them. Note that we need to add a vnode reference to the 185 * them. Note that we need to add a vnode reference to the
@@ -242,6 +204,17 @@ xfs_rename(
242 } 204 }
243 205
244 /* 206 /*
207 * If we are using project inheritance, we only allow renames
208 * into our tree when the project IDs are the same; else the
209 * tree quota mechanism would be circumvented.
210 */
211 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
212 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
213 error = XFS_ERROR(EXDEV);
214 goto error_return;
215 }
216
217 /*
245 * Set up the target. 218 * Set up the target.
246 */ 219 */
247 if (target_ip == NULL) { 220 if (target_ip == NULL) {
@@ -367,19 +340,11 @@ xfs_rename(
367 &first_block, &free_list, spaceres); 340 &first_block, &free_list, spaceres);
368 if (error) 341 if (error)
369 goto abort_return; 342 goto abort_return;
370 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
371 343
372 /* 344 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
373 * Update the generation counts on all the directory inodes
374 * that we're modifying.
375 */
376 src_dp->i_gen++;
377 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 345 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
378 346 if (new_parent)
379 if (new_parent) {
380 target_dp->i_gen++;
381 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 347 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
382 }
383 348
384 /* 349 /*
385 * If this is a synchronous mount, make sure that the 350 * If this is a synchronous mount, make sure that the
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e2f68de16159..edf12c7b834c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -85,7 +85,6 @@ xfs_growfs_rt_alloc(
85{ 85{
86 xfs_fileoff_t bno; /* block number in file */ 86 xfs_fileoff_t bno; /* block number in file */
87 xfs_buf_t *bp; /* temporary buffer for zeroing */ 87 xfs_buf_t *bp; /* temporary buffer for zeroing */
88 int cancelflags; /* flags for xfs_trans_cancel */
89 int committed; /* transaction committed flag */ 88 int committed; /* transaction committed flag */
90 xfs_daddr_t d; /* disk block address */ 89 xfs_daddr_t d; /* disk block address */
91 int error; /* error return value */ 90 int error; /* error return value */
@@ -96,15 +95,16 @@ xfs_growfs_rt_alloc(
96 xfs_bmbt_irec_t map; /* block map output */ 95 xfs_bmbt_irec_t map; /* block map output */
97 int nmap; /* number of block maps */ 96 int nmap; /* number of block maps */
98 int resblks; /* space reservation */ 97 int resblks; /* space reservation */
99 xfs_trans_t *tp; /* transaction pointer */
100 98
101 /* 99 /*
102 * Allocate space to the file, as necessary. 100 * Allocate space to the file, as necessary.
103 */ 101 */
104 while (oblocks < nblocks) { 102 while (oblocks < nblocks) {
103 int cancelflags = 0;
104 xfs_trans_t *tp;
105
105 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); 106 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
106 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); 107 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
107 cancelflags = 0;
108 /* 108 /*
109 * Reserve space & log for one extent added to the file. 109 * Reserve space & log for one extent added to the file.
110 */ 110 */
@@ -171,7 +171,9 @@ xfs_growfs_rt_alloc(
171 mp->m_bsize, 0); 171 mp->m_bsize, 0);
172 if (bp == NULL) { 172 if (bp == NULL) {
173 error = XFS_ERROR(EIO); 173 error = XFS_ERROR(EIO);
174 goto error_cancel; 174error_cancel:
175 xfs_trans_cancel(tp, cancelflags);
176 goto error;
175 } 177 }
176 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); 178 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
177 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); 179 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
@@ -188,8 +190,6 @@ xfs_growfs_rt_alloc(
188 oblocks = map.br_startoff + map.br_blockcount; 190 oblocks = map.br_startoff + map.br_blockcount;
189 } 191 }
190 return 0; 192 return 0;
191error_cancel:
192 xfs_trans_cancel(tp, cancelflags);
193error: 193error:
194 return error; 194 return error;
195} 195}
@@ -1856,7 +1856,6 @@ xfs_growfs_rt(
1856{ 1856{
1857 xfs_rtblock_t bmbno; /* bitmap block number */ 1857 xfs_rtblock_t bmbno; /* bitmap block number */
1858 xfs_buf_t *bp; /* temporary buffer */ 1858 xfs_buf_t *bp; /* temporary buffer */
1859 int cancelflags; /* flags for xfs_trans_cancel */
1860 int error; /* error return value */ 1859 int error; /* error return value */
1861 xfs_inode_t *ip; /* bitmap inode, used as lock */ 1860 xfs_inode_t *ip; /* bitmap inode, used as lock */
1862 xfs_mount_t *nmp; /* new (fake) mount structure */ 1861 xfs_mount_t *nmp; /* new (fake) mount structure */
@@ -1872,13 +1871,13 @@ xfs_growfs_rt(
1872 xfs_extlen_t rsumblocks; /* current number of rt summary blks */ 1871 xfs_extlen_t rsumblocks; /* current number of rt summary blks */
1873 xfs_sb_t *sbp; /* old superblock */ 1872 xfs_sb_t *sbp; /* old superblock */
1874 xfs_fsblock_t sumbno; /* summary block number */ 1873 xfs_fsblock_t sumbno; /* summary block number */
1875 xfs_trans_t *tp; /* transaction pointer */
1876 1874
1877 sbp = &mp->m_sb; 1875 sbp = &mp->m_sb;
1878 cancelflags = 0;
1879 /* 1876 /*
1880 * Initial error checking. 1877 * Initial error checking.
1881 */ 1878 */
1879 if (!capable(CAP_SYS_ADMIN))
1880 return XFS_ERROR(EPERM);
1882 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || 1881 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
1883 (nrblocks = in->newblocks) <= sbp->sb_rblocks || 1882 (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
1884 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) 1883 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
@@ -1942,6 +1941,9 @@ xfs_growfs_rt(
1942 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); 1941 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
1943 bmbno < nrbmblocks; 1942 bmbno < nrbmblocks;
1944 bmbno++) { 1943 bmbno++) {
1944 xfs_trans_t *tp;
1945 int cancelflags = 0;
1946
1945 *nmp = *mp; 1947 *nmp = *mp;
1946 nsbp = &nmp->m_sb; 1948 nsbp = &nmp->m_sb;
1947 /* 1949 /*
@@ -1967,16 +1969,15 @@ xfs_growfs_rt(
1967 * Start a transaction, get the log reservation. 1969 * Start a transaction, get the log reservation.
1968 */ 1970 */
1969 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); 1971 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
1970 cancelflags = 0;
1971 if ((error = xfs_trans_reserve(tp, 0, 1972 if ((error = xfs_trans_reserve(tp, 0,
1972 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) 1973 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
1973 break; 1974 goto error_cancel;
1974 /* 1975 /*
1975 * Lock out other callers by grabbing the bitmap inode lock. 1976 * Lock out other callers by grabbing the bitmap inode lock.
1976 */ 1977 */
1977 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1978 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
1978 XFS_ILOCK_EXCL, &ip))) 1979 XFS_ILOCK_EXCL, &ip)))
1979 break; 1980 goto error_cancel;
1980 ASSERT(ip == mp->m_rbmip); 1981 ASSERT(ip == mp->m_rbmip);
1981 /* 1982 /*
1982 * Update the bitmap inode's size. 1983 * Update the bitmap inode's size.
@@ -1990,7 +1991,7 @@ xfs_growfs_rt(
1990 */ 1991 */
1991 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1992 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
1992 XFS_ILOCK_EXCL, &ip))) 1993 XFS_ILOCK_EXCL, &ip)))
1993 break; 1994 goto error_cancel;
1994 ASSERT(ip == mp->m_rsumip); 1995 ASSERT(ip == mp->m_rsumip);
1995 /* 1996 /*
1996 * Update the summary inode's size. 1997 * Update the summary inode's size.
@@ -2005,7 +2006,7 @@ xfs_growfs_rt(
2005 mp->m_rsumlevels != nmp->m_rsumlevels) { 2006 mp->m_rsumlevels != nmp->m_rsumlevels) {
2006 error = xfs_rtcopy_summary(mp, nmp, tp); 2007 error = xfs_rtcopy_summary(mp, nmp, tp);
2007 if (error) 2008 if (error)
2008 break; 2009 goto error_cancel;
2009 } 2010 }
2010 /* 2011 /*
2011 * Update superblock fields. 2012 * Update superblock fields.
@@ -2031,8 +2032,11 @@ xfs_growfs_rt(
2031 bp = NULL; 2032 bp = NULL;
2032 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, 2033 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
2033 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); 2034 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
2034 if (error) 2035 if (error) {
2036error_cancel:
2037 xfs_trans_cancel(tp, cancelflags);
2035 break; 2038 break;
2039 }
2036 /* 2040 /*
2037 * Mark more blocks free in the superblock. 2041 * Mark more blocks free in the superblock.
2038 */ 2042 */
@@ -2045,15 +2049,10 @@ xfs_growfs_rt(
2045 mp->m_rsumsize = nrsumsize; 2049 mp->m_rsumsize = nrsumsize;
2046 2050
2047 error = xfs_trans_commit(tp, 0); 2051 error = xfs_trans_commit(tp, 0);
2048 if (error) { 2052 if (error)
2049 tp = NULL;
2050 break; 2053 break;
2051 }
2052 } 2054 }
2053 2055
2054 if (error && tp)
2055 xfs_trans_cancel(tp, cancelflags);
2056
2057 /* 2056 /*
2058 * Free the fake mp structure. 2057 * Free the fake mp structure.
2059 */ 2058 */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 3a82576dde9a..36f3a21c54d2 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -406,7 +406,7 @@ xfs_bwrite(
406 * XXXsup how does this work for quotas. 406 * XXXsup how does this work for quotas.
407 */ 407 */
408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
409 XFS_BUF_SET_FSPRIVATE3(bp, mp); 409 bp->b_mount = mp;
410 XFS_BUF_WRITE(bp); 410 XFS_BUF_WRITE(bp);
411 411
412 if ((error = XFS_bwrite(bp))) { 412 if ((error = XFS_bwrite(bp))) {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 3f8cf1587f4c..1ed71916e4c9 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -79,6 +79,7 @@ struct xfs_mount;
79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ 79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
82 83
83#define XFS_SB_VERSION2_OKREALFBITS \ 84#define XFS_SB_VERSION2_OKREALFBITS \
84 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -296,30 +297,34 @@ typedef enum {
296 297
297#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) 298#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
298 299
299#ifdef __KERNEL__
300static inline int xfs_sb_good_version(xfs_sb_t *sbp) 300static inline int xfs_sb_good_version(xfs_sb_t *sbp)
301{ 301{
302 return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ 302 /* We always support version 1-3 */
303 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ 303 if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
304 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 304 sbp->sb_versionnum <= XFS_SB_VERSION_3)
305 !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ 305 return 1;
306 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ 306
307 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ 307 /* We support version 4 if all feature bits are supported */
308 (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))); 308 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
309} 309 if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
310 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
311 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
312 return 0;
313
314#ifdef __KERNEL__
315 if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
316 return 0;
310#else 317#else
311static inline int xfs_sb_good_version(xfs_sb_t *sbp) 318 if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
312{ 319 sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
313 return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ 320 return 0;
314 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ 321#endif
315 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 322
316 !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ 323 return 1;
317 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ 324 }
318 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ 325
319 (!(sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \ 326 return 0;
320 (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))));
321} 327}
322#endif /* __KERNEL__ */
323 328
324/* 329/*
325 * Detect a mismatched features2 field. Older kernels read/wrote 330 * Detect a mismatched features2 field. Older kernels read/wrote
@@ -332,123 +337,127 @@ static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
332 337
333static inline unsigned xfs_sb_version_tonew(unsigned v) 338static inline unsigned xfs_sb_version_tonew(unsigned v)
334{ 339{
335 return ((((v) == XFS_SB_VERSION_1) ? \ 340 if (v == XFS_SB_VERSION_1)
336 0 : \ 341 return XFS_SB_VERSION_4;
337 (((v) == XFS_SB_VERSION_2) ? \ 342
338 XFS_SB_VERSION_ATTRBIT : \ 343 if (v == XFS_SB_VERSION_2)
339 (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \ 344 return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
340 XFS_SB_VERSION_4); 345
346 return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
347 XFS_SB_VERSION_NLINKBIT;
341} 348}
342 349
343static inline unsigned xfs_sb_version_toold(unsigned v) 350static inline unsigned xfs_sb_version_toold(unsigned v)
344{ 351{
345 return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \ 352 if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
346 0 : \ 353 return 0;
347 (((v) & XFS_SB_VERSION_NLINKBIT) ? \ 354 if (v & XFS_SB_VERSION_NLINKBIT)
348 XFS_SB_VERSION_3 : \ 355 return XFS_SB_VERSION_3;
349 (((v) & XFS_SB_VERSION_ATTRBIT) ? \ 356 if (v & XFS_SB_VERSION_ATTRBIT)
350 XFS_SB_VERSION_2 : \ 357 return XFS_SB_VERSION_2;
351 XFS_SB_VERSION_1))); 358 return XFS_SB_VERSION_1;
352} 359}
353 360
354static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) 361static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
355{ 362{
356 return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \ 363 return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
357 ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ 364 sbp->sb_versionnum == XFS_SB_VERSION_3 ||
358 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 365 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
359 ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); 366 (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
360} 367}
361 368
362static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) 369static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
363{ 370{
364 (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \ 371 if (sbp->sb_versionnum == XFS_SB_VERSION_1)
365 XFS_SB_VERSION_2 : \ 372 sbp->sb_versionnum = XFS_SB_VERSION_2;
366 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \ 373 else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
367 ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \ 374 sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
368 (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))); 375 else
376 sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
369} 377}
370 378
371static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) 379static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
372{ 380{
373 return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ 381 return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
374 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 382 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
375 ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); 383 (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
376} 384}
377 385
378static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) 386static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
379{ 387{
380 (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \ 388 if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
381 XFS_SB_VERSION_3 : \ 389 sbp->sb_versionnum = XFS_SB_VERSION_3;
382 ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)); 390 else
391 sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
383} 392}
384 393
385static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) 394static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
386{ 395{
387 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 396 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
388 ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT); 397 (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
389} 398}
390 399
391static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) 400static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
392{ 401{
393 (sbp)->sb_versionnum = \ 402 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
394 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \ 403 sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
395 ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \ 404 else
396 (xfs_sb_version_tonew((sbp)->sb_versionnum) | \ 405 sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
397 XFS_SB_VERSION_QUOTABIT)); 406 XFS_SB_VERSION_QUOTABIT;
398} 407}
399 408
400static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) 409static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
401{ 410{
402 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 411 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
403 ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); 412 (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
404} 413}
405 414
406static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) 415static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
407{ 416{
408 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 417 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
409 ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); 418 (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
410} 419}
411 420
412static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) 421static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
413{ 422{
414 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 423 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
415 ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); 424 (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
416} 425}
417 426
418static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) 427static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
419{ 428{
420 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 429 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
421 ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); 430 (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
422} 431}
423 432
424static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) 433static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
425{ 434{
426 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 435 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
427 ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); 436 (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
428} 437}
429 438
430static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) 439static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
431{ 440{
432 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 441 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
433 ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); 442 (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
434} 443}
435 444
436static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) 445static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
437{ 446{
438 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 447 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
439 ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); 448 (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
440} 449}
441 450
442static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) 451static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
443{ 452{
444 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 453 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
445 (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); 454 (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
446} 455}
447 456
448static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) 457static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
449{ 458{
450 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 459 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
451 ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); 460 (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
452} 461}
453 462
454/* 463/*
@@ -463,22 +472,20 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
463 472
464static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) 473static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
465{ 474{
466 return (xfs_sb_version_hasmorebits(sbp) && \ 475 return xfs_sb_version_hasmorebits(sbp) &&
467 ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); 476 (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
468} 477}
469 478
470static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) 479static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
471{ 480{
472 return (xfs_sb_version_hasmorebits(sbp)) && \ 481 return xfs_sb_version_hasmorebits(sbp) &&
473 ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); 482 (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
474} 483}
475 484
476static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) 485static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
477{ 486{
478 ((sbp)->sb_versionnum = \ 487 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
479 ((sbp)->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT), \ 488 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
480 ((sbp)->sb_features2 = \
481 ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT)));
482} 489}
483 490
484static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) 491static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 4e1c22a23be5..8570b826fedd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -290,7 +290,7 @@ xfs_trans_dup(
290 ASSERT(tp->t_ticket != NULL); 290 ASSERT(tp->t_ticket != NULL);
291 291
292 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); 292 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
293 ntp->t_ticket = tp->t_ticket; 293 ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
294 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; 294 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
295 tp->t_blk_res = tp->t_blk_res_used; 295 tp->t_blk_res = tp->t_blk_res_used;
296 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; 296 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
@@ -1260,6 +1260,13 @@ xfs_trans_roll(
1260 trans = *tpp; 1260 trans = *tpp;
1261 1261
1262 /* 1262 /*
1263 * transaction commit worked ok so we can drop the extra ticket
1264 * reference that we gained in xfs_trans_dup()
1265 */
1266 xfs_log_ticket_put(trans->t_ticket);
1267
1268
1269 /*
1263 * Reserve space in the log for th next transaction. 1270 * Reserve space in the log for th next transaction.
1264 * This also pushes items in the "AIL", the list of logged items, 1271 * This also pushes items in the "AIL", the list of logged items,
1265 * out to disk if they are taking up space at the tail of the log 1272 * out to disk if they are taking up space at the tail of the log
@@ -1383,11 +1390,12 @@ xfs_trans_chunk_committed(
1383 xfs_log_item_desc_t *lidp; 1390 xfs_log_item_desc_t *lidp;
1384 xfs_log_item_t *lip; 1391 xfs_log_item_t *lip;
1385 xfs_lsn_t item_lsn; 1392 xfs_lsn_t item_lsn;
1386 struct xfs_mount *mp;
1387 int i; 1393 int i;
1388 1394
1389 lidp = licp->lic_descs; 1395 lidp = licp->lic_descs;
1390 for (i = 0; i < licp->lic_unused; i++, lidp++) { 1396 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1397 struct xfs_ail *ailp;
1398
1391 if (xfs_lic_isfree(licp, i)) { 1399 if (xfs_lic_isfree(licp, i)) {
1392 continue; 1400 continue;
1393 } 1401 }
@@ -1424,19 +1432,19 @@ xfs_trans_chunk_committed(
1424 * This would cause the earlier transaction to fail 1432 * This would cause the earlier transaction to fail
1425 * the test below. 1433 * the test below.
1426 */ 1434 */
1427 mp = lip->li_mountp; 1435 ailp = lip->li_ailp;
1428 spin_lock(&mp->m_ail_lock); 1436 spin_lock(&ailp->xa_lock);
1429 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { 1437 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1430 /* 1438 /*
1431 * This will set the item's lsn to item_lsn 1439 * This will set the item's lsn to item_lsn
1432 * and update the position of the item in 1440 * and update the position of the item in
1433 * the AIL. 1441 * the AIL.
1434 * 1442 *
1435 * xfs_trans_update_ail() drops the AIL lock. 1443 * xfs_trans_ail_update() drops the AIL lock.
1436 */ 1444 */
1437 xfs_trans_update_ail(mp, lip, item_lsn); 1445 xfs_trans_ail_update(ailp, lip, item_lsn);
1438 } else { 1446 } else {
1439 spin_unlock(&mp->m_ail_lock); 1447 spin_unlock(&ailp->xa_lock);
1440 } 1448 }
1441 1449
1442 /* 1450 /*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 74c80bd2b0ec..d6fe4a88d79f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_TRANS_H__ 18#ifndef __XFS_TRANS_H__
19#define __XFS_TRANS_H__ 19#define __XFS_TRANS_H__
20 20
21struct xfs_log_item;
22
21/* 23/*
22 * This is the structure written in the log at the head of 24 * This is the structure written in the log at the head of
23 * every transaction. It identifies the type and id of the 25 * every transaction. It identifies the type and id of the
@@ -98,76 +100,6 @@ typedef struct xfs_trans_header {
98#define XFS_TRANS_TYPE_MAX 41 100#define XFS_TRANS_TYPE_MAX 41
99/* new transaction types need to be reflected in xfs_logprint(8) */ 101/* new transaction types need to be reflected in xfs_logprint(8) */
100 102
101
102#ifdef __KERNEL__
103struct xfs_buf;
104struct xfs_buftarg;
105struct xfs_efd_log_item;
106struct xfs_efi_log_item;
107struct xfs_inode;
108struct xfs_item_ops;
109struct xfs_log_iovec;
110struct xfs_log_item;
111struct xfs_log_item_desc;
112struct xfs_mount;
113struct xfs_trans;
114struct xfs_dquot_acct;
115
116typedef struct xfs_log_item {
117 struct list_head li_ail; /* AIL pointers */
118 xfs_lsn_t li_lsn; /* last on-disk lsn */
119 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
120 struct xfs_mount *li_mountp; /* ptr to fs mount */
121 uint li_type; /* item type */
122 uint li_flags; /* misc flags */
123 struct xfs_log_item *li_bio_list; /* buffer item list */
124 void (*li_cb)(struct xfs_buf *,
125 struct xfs_log_item *);
126 /* buffer item iodone */
127 /* callback func */
128 struct xfs_item_ops *li_ops; /* function list */
129} xfs_log_item_t;
130
131#define XFS_LI_IN_AIL 0x1
132#define XFS_LI_ABORTED 0x2
133
134typedef struct xfs_item_ops {
135 uint (*iop_size)(xfs_log_item_t *);
136 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
137 void (*iop_pin)(xfs_log_item_t *);
138 void (*iop_unpin)(xfs_log_item_t *, int);
139 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
140 uint (*iop_trylock)(xfs_log_item_t *);
141 void (*iop_unlock)(xfs_log_item_t *);
142 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
143 void (*iop_push)(xfs_log_item_t *);
144 void (*iop_pushbuf)(xfs_log_item_t *);
145 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
146} xfs_item_ops_t;
147
148#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
149#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
150#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
151#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
152#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
153#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
154#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
155#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
156#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
157#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
158#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
159
160/*
161 * Return values for the IOP_TRYLOCK() routines.
162 */
163#define XFS_ITEM_SUCCESS 0
164#define XFS_ITEM_PINNED 1
165#define XFS_ITEM_LOCKED 2
166#define XFS_ITEM_FLUSHING 3
167#define XFS_ITEM_PUSHBUF 4
168
169#endif /* __KERNEL__ */
170
171/* 103/*
172 * This structure is used to track log items associated with 104 * This structure is used to track log items associated with
173 * a transaction. It points to the log item and keeps some 105 * a transaction. It points to the log item and keeps some
@@ -176,7 +108,7 @@ typedef struct xfs_item_ops {
176 * once we get to commit processing (see xfs_trans_commit()). 108 * once we get to commit processing (see xfs_trans_commit()).
177 */ 109 */
178typedef struct xfs_log_item_desc { 110typedef struct xfs_log_item_desc {
179 xfs_log_item_t *lid_item; 111 struct xfs_log_item *lid_item;
180 ushort lid_size; 112 ushort lid_size;
181 unsigned char lid_flags; 113 unsigned char lid_flags;
182 unsigned char lid_index; 114 unsigned char lid_index;
@@ -276,94 +208,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
276 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs)); 208 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
277} 209}
278 210
279#ifdef __KERNEL__
280/*
281 * This structure is used to maintain a list of block ranges that have been
282 * freed in the transaction. The ranges are listed in the perag[] busy list
283 * between when they're freed and the transaction is committed to disk.
284 */
285
286typedef struct xfs_log_busy_slot {
287 xfs_agnumber_t lbc_ag;
288 ushort lbc_idx; /* index in perag.busy[] */
289} xfs_log_busy_slot_t;
290
291#define XFS_LBC_NUM_SLOTS 31
292typedef struct xfs_log_busy_chunk {
293 struct xfs_log_busy_chunk *lbc_next;
294 uint lbc_free; /* free slots bitmask */
295 ushort lbc_unused; /* first unused */
296 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
297} xfs_log_busy_chunk_t;
298
299#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
300#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
301
302#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
303#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
304#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
305#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
306#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
307
308/*
309 * This is the type of function which can be given to xfs_trans_callback()
310 * to be called upon the transaction's commit to disk.
311 */
312typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
313
314/*
315 * This is the structure maintained for every active transaction.
316 */
317typedef struct xfs_trans {
318 unsigned int t_magic; /* magic number */
319 xfs_log_callback_t t_logcb; /* log callback struct */
320 unsigned int t_type; /* transaction type */
321 unsigned int t_log_res; /* amt of log space resvd */
322 unsigned int t_log_count; /* count for perm log res */
323 unsigned int t_blk_res; /* # of blocks resvd */
324 unsigned int t_blk_res_used; /* # of resvd blocks used */
325 unsigned int t_rtx_res; /* # of rt extents resvd */
326 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
327 xfs_log_ticket_t t_ticket; /* log mgr ticket */
328 xfs_lsn_t t_lsn; /* log seq num of start of
329 * transaction. */
330 xfs_lsn_t t_commit_lsn; /* log seq num of end of
331 * transaction. */
332 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
333 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
334 xfs_trans_callback_t t_callback; /* transaction callback */
335 void *t_callarg; /* callback arg */
336 unsigned int t_flags; /* misc flags */
337 int64_t t_icount_delta; /* superblock icount change */
338 int64_t t_ifree_delta; /* superblock ifree change */
339 int64_t t_fdblocks_delta; /* superblock fdblocks chg */
340 int64_t t_res_fdblocks_delta; /* on-disk only chg */
341 int64_t t_frextents_delta;/* superblock freextents chg*/
342 int64_t t_res_frextents_delta; /* on-disk only chg */
343#ifdef DEBUG
344 int64_t t_ag_freeblks_delta; /* debugging counter */
345 int64_t t_ag_flist_delta; /* debugging counter */
346 int64_t t_ag_btree_delta; /* debugging counter */
347#endif
348 int64_t t_dblocks_delta;/* superblock dblocks change */
349 int64_t t_agcount_delta;/* superblock agcount change */
350 int64_t t_imaxpct_delta;/* superblock imaxpct change */
351 int64_t t_rextsize_delta;/* superblock rextsize chg */
352 int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
353 int64_t t_rblocks_delta;/* superblock rblocks change */
354 int64_t t_rextents_delta;/* superblocks rextents chg */
355 int64_t t_rextslog_delta;/* superblocks rextslog chg */
356 unsigned int t_items_free; /* log item descs free */
357 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
358 xfs_trans_header_t t_header; /* header for in-log trans */
359 unsigned int t_busy_free; /* busy descs free */
360 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
361 unsigned long t_pflags; /* saved process flags state */
362} xfs_trans_t;
363
364#endif /* __KERNEL__ */
365
366
367#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ 211#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
368/* 212/*
369 * Values for t_flags. 213 * Values for t_flags.
@@ -906,6 +750,157 @@ typedef struct xfs_trans {
906#define XFS_DQUOT_REF 1 750#define XFS_DQUOT_REF 1
907 751
908#ifdef __KERNEL__ 752#ifdef __KERNEL__
753
754struct xfs_buf;
755struct xfs_buftarg;
756struct xfs_efd_log_item;
757struct xfs_efi_log_item;
758struct xfs_inode;
759struct xfs_item_ops;
760struct xfs_log_iovec;
761struct xfs_log_item_desc;
762struct xfs_mount;
763struct xfs_trans;
764struct xfs_dquot_acct;
765
766typedef struct xfs_log_item {
767 struct list_head li_ail; /* AIL pointers */
768 xfs_lsn_t li_lsn; /* last on-disk lsn */
769 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
770 struct xfs_mount *li_mountp; /* ptr to fs mount */
771 struct xfs_ail *li_ailp; /* ptr to AIL */
772 uint li_type; /* item type */
773 uint li_flags; /* misc flags */
774 struct xfs_log_item *li_bio_list; /* buffer item list */
775 void (*li_cb)(struct xfs_buf *,
776 struct xfs_log_item *);
777 /* buffer item iodone */
778 /* callback func */
779 struct xfs_item_ops *li_ops; /* function list */
780} xfs_log_item_t;
781
782#define XFS_LI_IN_AIL 0x1
783#define XFS_LI_ABORTED 0x2
784
785typedef struct xfs_item_ops {
786 uint (*iop_size)(xfs_log_item_t *);
787 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
788 void (*iop_pin)(xfs_log_item_t *);
789 void (*iop_unpin)(xfs_log_item_t *, int);
790 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
791 uint (*iop_trylock)(xfs_log_item_t *);
792 void (*iop_unlock)(xfs_log_item_t *);
793 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
794 void (*iop_push)(xfs_log_item_t *);
795 void (*iop_pushbuf)(xfs_log_item_t *);
796 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
797} xfs_item_ops_t;
798
799#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
800#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
801#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
802#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
803#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
804#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
805#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
806#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
807#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
808#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
809#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
810
811/*
812 * Return values for the IOP_TRYLOCK() routines.
813 */
814#define XFS_ITEM_SUCCESS 0
815#define XFS_ITEM_PINNED 1
816#define XFS_ITEM_LOCKED 2
817#define XFS_ITEM_FLUSHING 3
818#define XFS_ITEM_PUSHBUF 4
819
820/*
821 * This structure is used to maintain a list of block ranges that have been
822 * freed in the transaction. The ranges are listed in the perag[] busy list
823 * between when they're freed and the transaction is committed to disk.
824 */
825
826typedef struct xfs_log_busy_slot {
827 xfs_agnumber_t lbc_ag;
828 ushort lbc_idx; /* index in perag.busy[] */
829} xfs_log_busy_slot_t;
830
831#define XFS_LBC_NUM_SLOTS 31
832typedef struct xfs_log_busy_chunk {
833 struct xfs_log_busy_chunk *lbc_next;
834 uint lbc_free; /* free slots bitmask */
835 ushort lbc_unused; /* first unused */
836 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
837} xfs_log_busy_chunk_t;
838
839#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
840#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
841
842#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
843#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
844#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
845#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
846#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
847
848/*
849 * This is the type of function which can be given to xfs_trans_callback()
850 * to be called upon the transaction's commit to disk.
851 */
852typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
853
854/*
855 * This is the structure maintained for every active transaction.
856 */
857typedef struct xfs_trans {
858 unsigned int t_magic; /* magic number */
859 xfs_log_callback_t t_logcb; /* log callback struct */
860 unsigned int t_type; /* transaction type */
861 unsigned int t_log_res; /* amt of log space resvd */
862 unsigned int t_log_count; /* count for perm log res */
863 unsigned int t_blk_res; /* # of blocks resvd */
864 unsigned int t_blk_res_used; /* # of resvd blocks used */
865 unsigned int t_rtx_res; /* # of rt extents resvd */
866 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
867 xfs_log_ticket_t t_ticket; /* log mgr ticket */
868 xfs_lsn_t t_lsn; /* log seq num of start of
869 * transaction. */
870 xfs_lsn_t t_commit_lsn; /* log seq num of end of
871 * transaction. */
872 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
873 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
874 xfs_trans_callback_t t_callback; /* transaction callback */
875 void *t_callarg; /* callback arg */
876 unsigned int t_flags; /* misc flags */
877 int64_t t_icount_delta; /* superblock icount change */
878 int64_t t_ifree_delta; /* superblock ifree change */
879 int64_t t_fdblocks_delta; /* superblock fdblocks chg */
880 int64_t t_res_fdblocks_delta; /* on-disk only chg */
881 int64_t t_frextents_delta;/* superblock freextents chg*/
882 int64_t t_res_frextents_delta; /* on-disk only chg */
883#ifdef DEBUG
884 int64_t t_ag_freeblks_delta; /* debugging counter */
885 int64_t t_ag_flist_delta; /* debugging counter */
886 int64_t t_ag_btree_delta; /* debugging counter */
887#endif
888 int64_t t_dblocks_delta;/* superblock dblocks change */
889 int64_t t_agcount_delta;/* superblock agcount change */
890 int64_t t_imaxpct_delta;/* superblock imaxpct change */
891 int64_t t_rextsize_delta;/* superblock rextsize chg */
892 int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
893 int64_t t_rblocks_delta;/* superblock rblocks change */
894 int64_t t_rextents_delta;/* superblocks rextents chg */
895 int64_t t_rextslog_delta;/* superblocks rextslog chg */
896 unsigned int t_items_free; /* log item descs free */
897 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
898 xfs_trans_header_t t_header; /* header for in-log trans */
899 unsigned int t_busy_free; /* busy descs free */
900 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
901 unsigned long t_pflags; /* saved process flags state */
902} xfs_trans_t;
903
909/* 904/*
910 * XFS transaction mechanism exported interfaces that are 905 * XFS transaction mechanism exported interfaces that are
911 * actually macros. 906 * actually macros.
@@ -928,7 +923,6 @@ typedef struct xfs_trans {
928/* 923/*
929 * XFS transaction mechanism exported interfaces. 924 * XFS transaction mechanism exported interfaces.
930 */ 925 */
931void xfs_trans_init(struct xfs_mount *);
932xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); 926xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
933xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); 927xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
934xfs_trans_t *xfs_trans_dup(xfs_trans_t *); 928xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
@@ -975,13 +969,8 @@ int _xfs_trans_commit(xfs_trans_t *,
975 int *); 969 int *);
976#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) 970#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
977void xfs_trans_cancel(xfs_trans_t *, int); 971void xfs_trans_cancel(xfs_trans_t *, int);
978int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
979int xfs_trans_ail_init(struct xfs_mount *); 972int xfs_trans_ail_init(struct xfs_mount *);
980void xfs_trans_ail_destroy(struct xfs_mount *); 973void xfs_trans_ail_destroy(struct xfs_mount *);
981void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
982xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *);
983void xfs_trans_unlocked_item(struct xfs_mount *,
984 xfs_log_item_t *);
985xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, 974xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
986 xfs_agnumber_t ag, 975 xfs_agnumber_t ag,
987 xfs_extlen_t idx); 976 xfs_extlen_t idx);
@@ -990,4 +979,7 @@ extern kmem_zone_t *xfs_trans_zone;
990 979
991#endif /* __KERNEL__ */ 980#endif /* __KERNEL__ */
992 981
982void xfs_trans_init(struct xfs_mount *);
983int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
984
993#endif /* __XFS_TRANS_H__ */ 985#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1f77c00af566..2d47f10f8bed 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2008 Dave Chinner
3 * All Rights Reserved. 4 * All Rights Reserved.
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -28,13 +29,13 @@
28#include "xfs_trans_priv.h" 29#include "xfs_trans_priv.h"
29#include "xfs_error.h" 30#include "xfs_error.h"
30 31
31STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *); 32STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
32STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *); 33STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *); 34STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *); 35STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 36
36#ifdef DEBUG 37#ifdef DEBUG
37STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *); 38STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *);
38#else 39#else
39#define xfs_ail_check(a,l) 40#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 41#endif /* DEBUG */
@@ -50,20 +51,20 @@ STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
50 * lsn of the last item in the AIL. 51 * lsn of the last item in the AIL.
51 */ 52 */
52xfs_lsn_t 53xfs_lsn_t
53xfs_trans_tail_ail( 54xfs_trans_ail_tail(
54 xfs_mount_t *mp) 55 struct xfs_ail *ailp)
55{ 56{
56 xfs_lsn_t lsn; 57 xfs_lsn_t lsn;
57 xfs_log_item_t *lip; 58 xfs_log_item_t *lip;
58 59
59 spin_lock(&mp->m_ail_lock); 60 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(&mp->m_ail); 61 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 62 if (lip == NULL) {
62 lsn = (xfs_lsn_t)0; 63 lsn = (xfs_lsn_t)0;
63 } else { 64 } else {
64 lsn = lip->li_lsn; 65 lsn = lip->li_lsn;
65 } 66 }
66 spin_unlock(&mp->m_ail_lock); 67 spin_unlock(&ailp->xa_lock);
67 68
68 return lsn; 69 return lsn;
69} 70}
@@ -85,16 +86,125 @@ xfs_trans_tail_ail(
85 * any of the objects, so the lock is not needed. 86 * any of the objects, so the lock is not needed.
86 */ 87 */
87void 88void
88xfs_trans_push_ail( 89xfs_trans_ail_push(
89 xfs_mount_t *mp, 90 struct xfs_ail *ailp,
90 xfs_lsn_t threshold_lsn) 91 xfs_lsn_t threshold_lsn)
91{ 92{
92 xfs_log_item_t *lip; 93 xfs_log_item_t *lip;
94
95 lip = xfs_ail_min(ailp);
96 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
97 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0)
98 xfsaild_wakeup(ailp, threshold_lsn);
99 }
100}
101
102/*
103 * AIL traversal cursor initialisation.
104 *
105 * The cursor keeps track of where our current traversal is up
 106 * to by tracking the next item in the list for us. However, for
107 * this to be safe, removing an object from the AIL needs to invalidate
 108 * any cursor that points to it. Hence the traversal cursor needs to
109 * be linked to the struct xfs_ail so that deletion can search all the
110 * active cursors for invalidation.
111 *
112 * We don't link the push cursor because it is embedded in the struct
113 * xfs_ail and hence easily findable.
114 */
115STATIC void
116xfs_trans_ail_cursor_init(
117 struct xfs_ail *ailp,
118 struct xfs_ail_cursor *cur)
119{
120 cur->item = NULL;
121 if (cur == &ailp->xa_cursors)
122 return;
123
124 cur->next = ailp->xa_cursors.next;
125 ailp->xa_cursors.next = cur;
126}
127
128/*
129 * Set the cursor to the next item, because when we look
130 * up the cursor the current item may have been freed.
131 */
132STATIC void
133xfs_trans_ail_cursor_set(
134 struct xfs_ail *ailp,
135 struct xfs_ail_cursor *cur,
136 struct xfs_log_item *lip)
137{
138 if (lip)
139 cur->item = xfs_ail_next(ailp, lip);
140}
141
142/*
143 * Get the next item in the traversal and advance the cursor.
 144 * If the cursor was invalidated (indicated by a lip of 1),
145 * restart the traversal.
146 */
147struct xfs_log_item *
148xfs_trans_ail_cursor_next(
149 struct xfs_ail *ailp,
150 struct xfs_ail_cursor *cur)
151{
152 struct xfs_log_item *lip = cur->item;
153
154 if ((__psint_t)lip & 1)
155 lip = xfs_ail_min(ailp);
156 xfs_trans_ail_cursor_set(ailp, cur, lip);
157 return lip;
158}
159
160/*
161 * Now that the traversal is complete, we need to remove the cursor
162 * from the list of traversing cursors. Avoid removing the embedded
 163 * push cursor, but use the fact it is always present to make the
164 * list deletion simple.
165 */
166void
167xfs_trans_ail_cursor_done(
168 struct xfs_ail *ailp,
169 struct xfs_ail_cursor *done)
170{
171 struct xfs_ail_cursor *prev = NULL;
172 struct xfs_ail_cursor *cur;
173
174 done->item = NULL;
175 if (done == &ailp->xa_cursors)
176 return;
177 prev = &ailp->xa_cursors;
178 for (cur = prev->next; cur; prev = cur, cur = prev->next) {
179 if (cur == done) {
180 prev->next = cur->next;
181 break;
182 }
183 }
184 ASSERT(cur);
185}
186
187/*
188 * Invalidate any cursor that is pointing to this item. This is
189 * called when an item is removed from the AIL. Any cursor pointing
190 * to this object is now invalid and the traversal needs to be
191 * terminated so it doesn't reference a freed object. We set the
192 * cursor item to a value of 1 so we can distinguish between an
193 * invalidation and the end of the list when getting the next item
194 * from the cursor.
195 */
196STATIC void
197xfs_trans_ail_cursor_clear(
198 struct xfs_ail *ailp,
199 struct xfs_log_item *lip)
200{
201 struct xfs_ail_cursor *cur;
93 202
94 lip = xfs_ail_min(&mp->m_ail); 203 /* need to search all cursors */
95 if (lip && !XFS_FORCED_SHUTDOWN(mp)) { 204 for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
96 if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) 205 if (cur->item == lip)
97 xfsaild_wakeup(mp, threshold_lsn); 206 cur->item = (struct xfs_log_item *)
207 ((__psint_t)cur->item | 1);
98 } 208 }
99} 209}
100 210
@@ -103,25 +213,27 @@ xfs_trans_push_ail(
103 * Return the current tree generation number for use 213 * Return the current tree generation number for use
104 * in calls to xfs_trans_next_ail(). 214 * in calls to xfs_trans_next_ail().
105 */ 215 */
106STATIC xfs_log_item_t * 216xfs_log_item_t *
107xfs_trans_first_push_ail( 217xfs_trans_ail_cursor_first(
108 xfs_mount_t *mp, 218 struct xfs_ail *ailp,
109 int *gen, 219 struct xfs_ail_cursor *cur,
110 xfs_lsn_t lsn) 220 xfs_lsn_t lsn)
111{ 221{
112 xfs_log_item_t *lip; 222 xfs_log_item_t *lip;
113 223
114 lip = xfs_ail_min(&mp->m_ail); 224 xfs_trans_ail_cursor_init(ailp, cur);
115 *gen = (int)mp->m_ail.xa_gen; 225 lip = xfs_ail_min(ailp);
116 if (lsn == 0) 226 if (lsn == 0)
117 return lip; 227 goto out;
118 228
119 list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) { 229 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
120 if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) 230 if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
121 return lip; 231 goto out;
122 } 232 }
123 233 lip = NULL;
124 return NULL; 234out:
235 xfs_trans_ail_cursor_set(ailp, cur, lip);
236 return lip;
125} 237}
126 238
127/* 239/*
@@ -129,29 +241,29 @@ xfs_trans_first_push_ail(
129 */ 241 */
130long 242long
131xfsaild_push( 243xfsaild_push(
132 xfs_mount_t *mp, 244 struct xfs_ail *ailp,
133 xfs_lsn_t *last_lsn) 245 xfs_lsn_t *last_lsn)
134{ 246{
135 long tout = 1000; /* milliseconds */ 247 long tout = 1000; /* milliseconds */
136 xfs_lsn_t last_pushed_lsn = *last_lsn; 248 xfs_lsn_t last_pushed_lsn = *last_lsn;
137 xfs_lsn_t target = mp->m_ail.xa_target; 249 xfs_lsn_t target = ailp->xa_target;
138 xfs_lsn_t lsn; 250 xfs_lsn_t lsn;
139 xfs_log_item_t *lip; 251 xfs_log_item_t *lip;
140 int gen;
141 int restarts;
142 int flush_log, count, stuck; 252 int flush_log, count, stuck;
253 xfs_mount_t *mp = ailp->xa_mount;
254 struct xfs_ail_cursor *cur = &ailp->xa_cursors;
143 255
144#define XFS_TRANS_PUSH_AIL_RESTARTS 10 256 spin_lock(&ailp->xa_lock);
145 257 xfs_trans_ail_cursor_init(ailp, cur);
146 spin_lock(&mp->m_ail_lock); 258 lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn);
147 lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn);
148 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 259 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
149 /* 260 /*
150 * AIL is empty or our push has reached the end. 261 * AIL is empty or our push has reached the end.
151 */ 262 */
152 spin_unlock(&mp->m_ail_lock); 263 xfs_trans_ail_cursor_done(ailp, cur);
264 spin_unlock(&ailp->xa_lock);
153 last_pushed_lsn = 0; 265 last_pushed_lsn = 0;
154 goto out; 266 return tout;
155 } 267 }
156 268
157 XFS_STATS_INC(xs_push_ail); 269 XFS_STATS_INC(xs_push_ail);
@@ -169,7 +281,7 @@ xfsaild_push(
169 */ 281 */
170 tout = 10; 282 tout = 10;
171 lsn = lip->li_lsn; 283 lsn = lip->li_lsn;
172 flush_log = stuck = count = restarts = 0; 284 flush_log = stuck = count = 0;
173 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { 285 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
174 int lock_result; 286 int lock_result;
175 /* 287 /*
@@ -184,7 +296,7 @@ xfsaild_push(
184 * skip to the next item in the list. 296 * skip to the next item in the list.
185 */ 297 */
186 lock_result = IOP_TRYLOCK(lip); 298 lock_result = IOP_TRYLOCK(lip);
187 spin_unlock(&mp->m_ail_lock); 299 spin_unlock(&ailp->xa_lock);
188 switch (lock_result) { 300 switch (lock_result) {
189 case XFS_ITEM_SUCCESS: 301 case XFS_ITEM_SUCCESS:
190 XFS_STATS_INC(xs_push_ail_success); 302 XFS_STATS_INC(xs_push_ail_success);
@@ -221,7 +333,7 @@ xfsaild_push(
221 break; 333 break;
222 } 334 }
223 335
224 spin_lock(&mp->m_ail_lock); 336 spin_lock(&ailp->xa_lock);
225 /* should we bother continuing? */ 337 /* should we bother continuing? */
226 if (XFS_FORCED_SHUTDOWN(mp)) 338 if (XFS_FORCED_SHUTDOWN(mp))
227 break; 339 break;
@@ -244,14 +356,13 @@ xfsaild_push(
244 if (stuck > 100) 356 if (stuck > 100)
245 break; 357 break;
246 358
247 lip = xfs_trans_next_ail(mp, lip, &gen, &restarts); 359 lip = xfs_trans_ail_cursor_next(ailp, cur);
248 if (lip == NULL) 360 if (lip == NULL)
249 break; 361 break;
250 if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS)
251 break;
252 lsn = lip->li_lsn; 362 lsn = lip->li_lsn;
253 } 363 }
254 spin_unlock(&mp->m_ail_lock); 364 xfs_trans_ail_cursor_done(ailp, cur);
365 spin_unlock(&ailp->xa_lock);
255 366
256 if (flush_log) { 367 if (flush_log) {
257 /* 368 /*
@@ -274,8 +385,7 @@ xfsaild_push(
274 */ 385 */
275 tout += 20; 386 tout += 20;
276 last_pushed_lsn = 0; 387 last_pushed_lsn = 0;
277 } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) || 388 } else if ((stuck * 100) / count > 90) {
278 ((stuck * 100) / count > 90)) {
279 /* 389 /*
280 * Either there is a lot of contention on the AIL or we 390 * Either there is a lot of contention on the AIL or we
281 * are stuck due to operations in progress. "Stuck" in this 391 * are stuck due to operations in progress. "Stuck" in this
@@ -287,7 +397,6 @@ xfsaild_push(
287 */ 397 */
288 tout += 10; 398 tout += 10;
289 } 399 }
290out:
291 *last_lsn = last_pushed_lsn; 400 *last_lsn = last_pushed_lsn;
292 return tout; 401 return tout;
293} /* xfsaild_push */ 402} /* xfsaild_push */
@@ -303,7 +412,7 @@ out:
303 */ 412 */
304void 413void
305xfs_trans_unlocked_item( 414xfs_trans_unlocked_item(
306 xfs_mount_t *mp, 415 struct xfs_ail *ailp,
307 xfs_log_item_t *lip) 416 xfs_log_item_t *lip)
308{ 417{
309 xfs_log_item_t *min_lip; 418 xfs_log_item_t *min_lip;
@@ -315,7 +424,7 @@ xfs_trans_unlocked_item(
315 * over some potentially valid data. 424 * over some potentially valid data.
316 */ 425 */
317 if (!(lip->li_flags & XFS_LI_IN_AIL) || 426 if (!(lip->li_flags & XFS_LI_IN_AIL) ||
318 XFS_FORCED_SHUTDOWN(mp)) { 427 XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
319 return; 428 return;
320 } 429 }
321 430
@@ -331,10 +440,10 @@ xfs_trans_unlocked_item(
331 * the call to xfs_log_move_tail() doesn't do anything if there's 440 * the call to xfs_log_move_tail() doesn't do anything if there's
332 * not enough free space to wake people up so we're safe calling it. 441 * not enough free space to wake people up so we're safe calling it.
333 */ 442 */
334 min_lip = xfs_ail_min(&mp->m_ail); 443 min_lip = xfs_ail_min(ailp);
335 444
336 if (min_lip == lip) 445 if (min_lip == lip)
337 xfs_log_move_tail(mp, 1); 446 xfs_log_move_tail(ailp->xa_mount, 1);
338} /* xfs_trans_unlocked_item */ 447} /* xfs_trans_unlocked_item */
339 448
340 449
@@ -347,41 +456,37 @@ xfs_trans_unlocked_item(
347 * we move in the AIL is the minimum one, update the tail lsn in the 456 * we move in the AIL is the minimum one, update the tail lsn in the
348 * log manager. 457 * log manager.
349 * 458 *
350 * Increment the AIL's generation count to indicate that the tree
351 * has changed.
352 *
353 * This function must be called with the AIL lock held. The lock 459 * This function must be called with the AIL lock held. The lock
354 * is dropped before returning. 460 * is dropped before returning.
355 */ 461 */
356void 462void
357xfs_trans_update_ail( 463xfs_trans_ail_update(
358 xfs_mount_t *mp, 464 struct xfs_ail *ailp,
359 xfs_log_item_t *lip, 465 xfs_log_item_t *lip,
360 xfs_lsn_t lsn) __releases(mp->m_ail_lock) 466 xfs_lsn_t lsn) __releases(ailp->xa_lock)
361{ 467{
362 xfs_log_item_t *dlip=NULL; 468 xfs_log_item_t *dlip = NULL;
363 xfs_log_item_t *mlip; /* ptr to minimum lip */ 469 xfs_log_item_t *mlip; /* ptr to minimum lip */
364 470
365 mlip = xfs_ail_min(&mp->m_ail); 471 mlip = xfs_ail_min(ailp);
366 472
367 if (lip->li_flags & XFS_LI_IN_AIL) { 473 if (lip->li_flags & XFS_LI_IN_AIL) {
368 dlip = xfs_ail_delete(&mp->m_ail, lip); 474 dlip = xfs_ail_delete(ailp, lip);
369 ASSERT(dlip == lip); 475 ASSERT(dlip == lip);
476 xfs_trans_ail_cursor_clear(ailp, dlip);
370 } else { 477 } else {
371 lip->li_flags |= XFS_LI_IN_AIL; 478 lip->li_flags |= XFS_LI_IN_AIL;
372 } 479 }
373 480
374 lip->li_lsn = lsn; 481 lip->li_lsn = lsn;
375 482 xfs_ail_insert(ailp, lip);
376 xfs_ail_insert(&mp->m_ail, lip);
377 mp->m_ail.xa_gen++;
378 483
379 if (mlip == dlip) { 484 if (mlip == dlip) {
380 mlip = xfs_ail_min(&mp->m_ail); 485 mlip = xfs_ail_min(ailp);
381 spin_unlock(&mp->m_ail_lock); 486 spin_unlock(&ailp->xa_lock);
382 xfs_log_move_tail(mp, mlip->li_lsn); 487 xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
383 } else { 488 } else {
384 spin_unlock(&mp->m_ail_lock); 489 spin_unlock(&ailp->xa_lock);
385 } 490 }
386 491
387 492
@@ -403,29 +508,30 @@ xfs_trans_update_ail(
403 * is dropped before returning. 508 * is dropped before returning.
404 */ 509 */
405void 510void
406xfs_trans_delete_ail( 511xfs_trans_ail_delete(
407 xfs_mount_t *mp, 512 struct xfs_ail *ailp,
408 xfs_log_item_t *lip) __releases(mp->m_ail_lock) 513 xfs_log_item_t *lip) __releases(ailp->xa_lock)
409{ 514{
410 xfs_log_item_t *dlip; 515 xfs_log_item_t *dlip;
411 xfs_log_item_t *mlip; 516 xfs_log_item_t *mlip;
412 517
413 if (lip->li_flags & XFS_LI_IN_AIL) { 518 if (lip->li_flags & XFS_LI_IN_AIL) {
414 mlip = xfs_ail_min(&mp->m_ail); 519 mlip = xfs_ail_min(ailp);
415 dlip = xfs_ail_delete(&mp->m_ail, lip); 520 dlip = xfs_ail_delete(ailp, lip);
416 ASSERT(dlip == lip); 521 ASSERT(dlip == lip);
522 xfs_trans_ail_cursor_clear(ailp, dlip);
417 523
418 524
419 lip->li_flags &= ~XFS_LI_IN_AIL; 525 lip->li_flags &= ~XFS_LI_IN_AIL;
420 lip->li_lsn = 0; 526 lip->li_lsn = 0;
421 mp->m_ail.xa_gen++;
422 527
423 if (mlip == dlip) { 528 if (mlip == dlip) {
424 mlip = xfs_ail_min(&mp->m_ail); 529 mlip = xfs_ail_min(ailp);
425 spin_unlock(&mp->m_ail_lock); 530 spin_unlock(&ailp->xa_lock);
426 xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); 531 xfs_log_move_tail(ailp->xa_mount,
532 (mlip ? mlip->li_lsn : 0));
427 } else { 533 } else {
428 spin_unlock(&mp->m_ail_lock); 534 spin_unlock(&ailp->xa_lock);
429 } 535 }
430 } 536 }
431 else { 537 else {
@@ -433,13 +539,13 @@ xfs_trans_delete_ail(
433 * If the file system is not being shutdown, we are in 539 * If the file system is not being shutdown, we are in
434 * serious trouble if we get to this stage. 540 * serious trouble if we get to this stage.
435 */ 541 */
436 if (XFS_FORCED_SHUTDOWN(mp)) 542 struct xfs_mount *mp = ailp->xa_mount;
437 spin_unlock(&mp->m_ail_lock); 543
438 else { 544 spin_unlock(&ailp->xa_lock);
545 if (!XFS_FORCED_SHUTDOWN(mp)) {
439 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, 546 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
440 "%s: attempting to delete a log item that is not in the AIL", 547 "%s: attempting to delete a log item that is not in the AIL",
441 __func__); 548 __func__);
442 spin_unlock(&mp->m_ail_lock);
443 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 549 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
444 } 550 }
445 } 551 }
@@ -448,56 +554,6 @@ xfs_trans_delete_ail(
448 554
449 555
450/* 556/*
451 * Return the item in the AIL with the smallest lsn.
452 * Return the current tree generation number for use
453 * in calls to xfs_trans_next_ail().
454 */
455xfs_log_item_t *
456xfs_trans_first_ail(
457 xfs_mount_t *mp,
458 int *gen)
459{
460 xfs_log_item_t *lip;
461
462 lip = xfs_ail_min(&mp->m_ail);
463 *gen = (int)mp->m_ail.xa_gen;
464
465 return lip;
466}
467
468/*
469 * If the generation count of the tree has not changed since the
470 * caller last took something from the AIL, then return the elmt
471 * in the tree which follows the one given. If the count has changed,
472 * then return the minimum elmt of the AIL and bump the restarts counter
473 * if one is given.
474 */
475xfs_log_item_t *
476xfs_trans_next_ail(
477 xfs_mount_t *mp,
478 xfs_log_item_t *lip,
479 int *gen,
480 int *restarts)
481{
482 xfs_log_item_t *nlip;
483
484 ASSERT(mp && lip && gen);
485 if (mp->m_ail.xa_gen == *gen) {
486 nlip = xfs_ail_next(&mp->m_ail, lip);
487 } else {
488 nlip = xfs_ail_min(&mp->m_ail);
489 *gen = (int)mp->m_ail.xa_gen;
490 if (restarts != NULL) {
491 XFS_STATS_INC(xs_push_ail_restarts);
492 (*restarts)++;
493 }
494 }
495
496 return (nlip);
497}
498
499
500/*
501 * The active item list (AIL) is a doubly linked list of log 557 * The active item list (AIL) is a doubly linked list of log
502 * items sorted by ascending lsn. The base of the list is 558 * items sorted by ascending lsn. The base of the list is
503 * a forw/back pointer pair embedded in the xfs mount structure. 559 * a forw/back pointer pair embedded in the xfs mount structure.
@@ -515,15 +571,35 @@ int
515xfs_trans_ail_init( 571xfs_trans_ail_init(
516 xfs_mount_t *mp) 572 xfs_mount_t *mp)
517{ 573{
518 INIT_LIST_HEAD(&mp->m_ail.xa_ail); 574 struct xfs_ail *ailp;
519 return xfsaild_start(mp); 575 int error;
576
577 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
578 if (!ailp)
579 return ENOMEM;
580
581 ailp->xa_mount = mp;
582 INIT_LIST_HEAD(&ailp->xa_ail);
583 spin_lock_init(&ailp->xa_lock);
584 error = xfsaild_start(ailp);
585 if (error)
586 goto out_free_ailp;
587 mp->m_ail = ailp;
588 return 0;
589
590out_free_ailp:
591 kmem_free(ailp);
592 return error;
520} 593}
521 594
522void 595void
523xfs_trans_ail_destroy( 596xfs_trans_ail_destroy(
524 xfs_mount_t *mp) 597 xfs_mount_t *mp)
525{ 598{
526 xfsaild_stop(mp); 599 struct xfs_ail *ailp = mp->m_ail;
600
601 xfsaild_stop(ailp);
602 kmem_free(ailp);
527} 603}
528 604
529/* 605/*
@@ -534,7 +610,7 @@ xfs_trans_ail_destroy(
534 */ 610 */
535STATIC void 611STATIC void
536xfs_ail_insert( 612xfs_ail_insert(
537 xfs_ail_t *ailp, 613 struct xfs_ail *ailp,
538 xfs_log_item_t *lip) 614 xfs_log_item_t *lip)
539/* ARGSUSED */ 615/* ARGSUSED */
540{ 616{
@@ -568,7 +644,7 @@ xfs_ail_insert(
568/*ARGSUSED*/ 644/*ARGSUSED*/
569STATIC xfs_log_item_t * 645STATIC xfs_log_item_t *
570xfs_ail_delete( 646xfs_ail_delete(
571 xfs_ail_t *ailp, 647 struct xfs_ail *ailp,
572 xfs_log_item_t *lip) 648 xfs_log_item_t *lip)
573/* ARGSUSED */ 649/* ARGSUSED */
574{ 650{
@@ -585,7 +661,7 @@ xfs_ail_delete(
585 */ 661 */
586STATIC xfs_log_item_t * 662STATIC xfs_log_item_t *
587xfs_ail_min( 663xfs_ail_min(
588 xfs_ail_t *ailp) 664 struct xfs_ail *ailp)
589/* ARGSUSED */ 665/* ARGSUSED */
590{ 666{
591 if (list_empty(&ailp->xa_ail)) 667 if (list_empty(&ailp->xa_ail))
@@ -601,7 +677,7 @@ xfs_ail_min(
601 */ 677 */
602STATIC xfs_log_item_t * 678STATIC xfs_log_item_t *
603xfs_ail_next( 679xfs_ail_next(
604 xfs_ail_t *ailp, 680 struct xfs_ail *ailp,
605 xfs_log_item_t *lip) 681 xfs_log_item_t *lip)
606/* ARGSUSED */ 682/* ARGSUSED */
607{ 683{
@@ -617,7 +693,7 @@ xfs_ail_next(
617 */ 693 */
618STATIC void 694STATIC void
619xfs_ail_check( 695xfs_ail_check(
620 xfs_ail_t *ailp, 696 struct xfs_ail *ailp,
621 xfs_log_item_t *lip) 697 xfs_log_item_t *lip)
622{ 698{
623 xfs_log_item_t *prev_lip; 699 xfs_log_item_t *prev_lip;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 4e855b5ced66..8ee2f8c8b0a6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -527,9 +527,8 @@ xfs_trans_brelse(xfs_trans_t *tp,
527 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 527 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
528 if (lip->li_type == XFS_LI_BUF) { 528 if (lip->li_type == XFS_LI_BUF) {
529 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); 529 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
530 xfs_trans_unlocked_item( 530 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
531 bip->bli_item.li_mountp, 531 lip);
532 lip);
533 } 532 }
534 } 533 }
535 xfs_buf_relse(bp); 534 xfs_buf_relse(bp);
@@ -626,7 +625,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
626 * tell the AIL that the buffer is being unlocked. 625 * tell the AIL that the buffer is being unlocked.
627 */ 626 */
628 if (bip != NULL) { 627 if (bip != NULL) {
629 xfs_trans_unlocked_item(bip->bli_item.li_mountp, 628 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
630 (xfs_log_item_t*)bip); 629 (xfs_log_item_t*)bip);
631 } 630 }
632 631
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 2a1c0f071f91..23d276af2e0c 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -85,7 +85,6 @@ xfs_trans_iget(
85{ 85{
86 int error; 86 int error;
87 xfs_inode_t *ip; 87 xfs_inode_t *ip;
88 xfs_inode_log_item_t *iip;
89 88
90 /* 89 /*
91 * If the transaction pointer is NULL, just call the normal 90 * If the transaction pointer is NULL, just call the normal
@@ -138,34 +137,7 @@ xfs_trans_iget(
138 } 137 }
139 ASSERT(ip != NULL); 138 ASSERT(ip != NULL);
140 139
141 /* 140 xfs_trans_ijoin(tp, ip, lock_flags);
142 * Get a log_item_desc to point at the new item.
143 */
144 if (ip->i_itemp == NULL)
145 xfs_inode_item_init(ip, mp);
146 iip = ip->i_itemp;
147 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
148
149 xfs_trans_inode_broot_debug(ip);
150
151 /*
152 * If the IO lock has been acquired, mark that in
153 * the inode log item so we'll know to unlock it
154 * when the transaction commits.
155 */
156 ASSERT(iip->ili_flags == 0);
157 if (lock_flags & XFS_IOLOCK_EXCL) {
158 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
159 } else if (lock_flags & XFS_IOLOCK_SHARED) {
160 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
161 }
162
163 /*
164 * Initialize i_transp so we can find it with xfs_inode_incore()
165 * above.
166 */
167 ip->i_transp = tp;
168
169 *ipp = ip; 141 *ipp = ip;
170 return 0; 142 return 0;
171} 143}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 3c666e8317f8..e110bf57d7f4 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,6 +22,14 @@
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 24#include "xfs_trans_priv.h"
25/* XXX: from here down needed until struct xfs_trans has it's own ailp */
26#include "xfs_bit.h"
27#include "xfs_buf_item.h"
28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_dir2.h"
31#include "xfs_dmapi.h"
32#include "xfs_mount.h"
25 33
26STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, 34STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
27 int, int, xfs_lsn_t); 35 int, int, xfs_lsn_t);
@@ -79,6 +87,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
79 lidp->lid_size = 0; 87 lidp->lid_size = 0;
80 lip->li_desc = lidp; 88 lip->li_desc = lidp;
81 lip->li_mountp = tp->t_mountp; 89 lip->li_mountp = tp->t_mountp;
90 lip->li_ailp = tp->t_mountp->m_ail;
82 return lidp; 91 return lidp;
83 } 92 }
84 93
@@ -120,6 +129,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
120 lidp->lid_size = 0; 129 lidp->lid_size = 0;
121 lip->li_desc = lidp; 130 lip->li_desc = lidp;
122 lip->li_mountp = tp->t_mountp; 131 lip->li_mountp = tp->t_mountp;
132 lip->li_ailp = tp->t_mountp->m_ail;
123 return lidp; 133 return lidp;
124} 134}
125 135
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 3c748c456ed4..73e2ad397432 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -44,25 +44,93 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
44 xfs_extlen_t idx); 44 xfs_extlen_t idx);
45 45
46/* 46/*
47 * From xfs_trans_ail.c 47 * AIL traversal cursor.
48 *
49 * Rather than using a generation number for detecting changes in the ail, use
50 * a cursor that is protected by the ail lock. The aild cursor exists in the
51 * struct xfs_ail, but other traversals can declare it on the stack and link it
52 * to the ail list.
53 *
54 * When an object is deleted from or moved int the AIL, the cursor list is
55 * searched to see if the object is a designated cursor item. If it is, it is
56 * deleted from the cursor so that the next time the cursor is used traversal
57 * will return to the start.
58 *
59 * This means a traversal colliding with a removal will cause a restart of the
60 * list scan, rather than any insertion or deletion anywhere in the list. The
61 * low bit of the item pointer is set if the cursor has been invalidated so
62 * that we can tell the difference between invalidation and reaching the end
63 * of the list to trigger traversal restarts.
48 */ 64 */
49void xfs_trans_update_ail(struct xfs_mount *mp, 65struct xfs_ail_cursor {
50 struct xfs_log_item *lip, xfs_lsn_t lsn) 66 struct xfs_ail_cursor *next;
51 __releases(mp->m_ail_lock); 67 struct xfs_log_item *item;
52void xfs_trans_delete_ail(struct xfs_mount *mp, 68};
53 struct xfs_log_item *lip)
54 __releases(mp->m_ail_lock);
55struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *);
56struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *,
57 struct xfs_log_item *, int *, int *);
58 69
70/*
71 * Private AIL structures.
72 *
73 * Eventually we need to drive the locking in here as well.
74 */
75struct xfs_ail {
76 struct xfs_mount *xa_mount;
77 struct list_head xa_ail;
78 uint xa_gen;
79 struct task_struct *xa_task;
80 xfs_lsn_t xa_target;
81 struct xfs_ail_cursor xa_cursors;
82 spinlock_t xa_lock;
83};
59 84
60/* 85/*
61 * AIL push thread support 86 * From xfs_trans_ail.c
62 */ 87 */
63long xfsaild_push(struct xfs_mount *, xfs_lsn_t *); 88void xfs_trans_ail_update(struct xfs_ail *ailp,
64void xfsaild_wakeup(struct xfs_mount *, xfs_lsn_t); 89 struct xfs_log_item *lip, xfs_lsn_t lsn)
65int xfsaild_start(struct xfs_mount *); 90 __releases(ailp->xa_lock);
66void xfsaild_stop(struct xfs_mount *); 91void xfs_trans_ail_delete(struct xfs_ail *ailp,
92 struct xfs_log_item *lip)
93 __releases(ailp->xa_lock);
94void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
95void xfs_trans_unlocked_item(struct xfs_ail *,
96 xfs_log_item_t *);
97
98xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
99
100struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
101 struct xfs_ail_cursor *cur,
102 xfs_lsn_t lsn);
103struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
104 struct xfs_ail_cursor *cur);
105void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
106 struct xfs_ail_cursor *cur);
107
108long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
109void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
110int xfsaild_start(struct xfs_ail *);
111void xfsaild_stop(struct xfs_ail *);
67 112
113#if BITS_PER_LONG != 64
114static inline void
115xfs_trans_ail_copy_lsn(
116 struct xfs_ail *ailp,
117 xfs_lsn_t *dst,
118 xfs_lsn_t *src)
119{
120 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
121 spin_lock(&ailp->xa_lock);
122 *dst = *src;
123 spin_unlock(&ailp->xa_lock);
124}
125#else
126static inline void
127xfs_trans_ail_copy_lsn(
128 struct xfs_ail *ailp,
129 xfs_lsn_t *dst,
130 xfs_lsn_t *src)
131{
132 ASSERT(sizeof(xfs_lsn_t) == 8);
133 *dst = *src;
134}
135#endif
68#endif /* __XFS_TRANS_PRIV_H__ */ 136#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 35d4d414bcc2..fcc2285d03ed 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -172,6 +172,12 @@ xfs_dir_ialloc(
172 *ipp = NULL; 172 *ipp = NULL;
173 return code; 173 return code;
174 } 174 }
175
176 /*
177 * transaction commit worked ok so we can drop the extra ticket
178 * reference that we gained in xfs_trans_dup()
179 */
180 xfs_log_ticket_put(tp->t_ticket);
175 code = xfs_trans_reserve(tp, 0, log_res, 0, 181 code = xfs_trans_reserve(tp, 0, log_res, 0,
176 XFS_TRANS_PERM_LOG_RES, log_count); 182 XFS_TRANS_PERM_LOG_RES, log_count);
177 /* 183 /*
@@ -268,9 +274,9 @@ xfs_bump_ino_vers2(
268 xfs_mount_t *mp; 274 xfs_mount_t *mp;
269 275
270 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 276 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
271 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); 277 ASSERT(ip->i_d.di_version == 1);
272 278
273 ip->i_d.di_version = XFS_DINODE_VERSION_2; 279 ip->i_d.di_version = 2;
274 ip->i_d.di_onlink = 0; 280 ip->i_d.di_onlink = 0;
275 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 281 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
276 mp = tp->t_mountp; 282 mp = tp->t_mountp;
@@ -302,7 +308,7 @@ xfs_bumplink(
302 ASSERT(ip->i_d.di_nlink > 0); 308 ASSERT(ip->i_d.di_nlink > 0);
303 ip->i_d.di_nlink++; 309 ip->i_d.di_nlink++;
304 inc_nlink(VFS_I(ip)); 310 inc_nlink(VFS_I(ip));
305 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && 311 if ((ip->i_d.di_version == 1) &&
306 (ip->i_d.di_nlink > XFS_MAXLINK_1)) { 312 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
307 /* 313 /*
308 * The inode has increased its number of links beyond 314 * The inode has increased its number of links beyond
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
deleted file mode 100644
index 439dd3939dda..000000000000
--- a/fs/xfs/xfs_vfsops.c
+++ /dev/null
@@ -1,757 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_btree.h"
40#include "xfs_alloc.h"
41#include "xfs_ialloc.h"
42#include "xfs_quota.h"
43#include "xfs_error.h"
44#include "xfs_bmap.h"
45#include "xfs_rw.h"
46#include "xfs_buf_item.h"
47#include "xfs_log_priv.h"
48#include "xfs_dir2_trace.h"
49#include "xfs_extfree_item.h"
50#include "xfs_acl.h"
51#include "xfs_attr.h"
52#include "xfs_clnt.h"
53#include "xfs_mru_cache.h"
54#include "xfs_filestream.h"
55#include "xfs_fsops.h"
56#include "xfs_vnodeops.h"
57#include "xfs_vfsops.h"
58#include "xfs_utils.h"
59
60
61STATIC void
62xfs_quiesce_fs(
63 xfs_mount_t *mp)
64{
65 int count = 0, pincount;
66
67 xfs_flush_buftarg(mp->m_ddev_targp, 0);
68 xfs_finish_reclaim_all(mp, 0);
69
70 /* This loop must run at least twice.
71 * The first instance of the loop will flush
72 * most meta data but that will generate more
73 * meta data (typically directory updates).
74 * Which then must be flushed and logged before
75 * we can write the unmount record.
76 */
77 do {
78 xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL);
79 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
80 if (!pincount) {
81 delay(50);
82 count++;
83 }
84 } while (count < 2);
85}
86
87/*
88 * Second stage of a quiesce. The data is already synced, now we have to take
89 * care of the metadata. New transactions are already blocked, so we need to
90 * wait for any remaining transactions to drain out before proceding.
91 */
92void
93xfs_attr_quiesce(
94 xfs_mount_t *mp)
95{
96 int error = 0;
97
98 /* wait for all modifications to complete */
99 while (atomic_read(&mp->m_active_trans) > 0)
100 delay(100);
101
102 /* flush inodes and push all remaining buffers out to disk */
103 xfs_quiesce_fs(mp);
104
105 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
106
107 /* Push the superblock and write an unmount record */
108 error = xfs_log_sbcount(mp, 1);
109 if (error)
110 xfs_fs_cmn_err(CE_WARN, mp,
111 "xfs_attr_quiesce: failed to log sb changes. "
112 "Frozen image may not be consistent.");
113 xfs_log_unmount_write(mp);
114 xfs_unmountfs_writesb(mp);
115}
116
117/*
118 * xfs_unmount_flush implements a set of flush operation on special
119 * inodes, which are needed as a separate set of operations so that
120 * they can be called as part of relocation process.
121 */
122int
123xfs_unmount_flush(
124 xfs_mount_t *mp, /* Mount structure we are getting
125 rid of. */
126 int relocation) /* Called from vfs relocation. */
127{
128 xfs_inode_t *rip = mp->m_rootip;
129 xfs_inode_t *rbmip;
130 xfs_inode_t *rsumip = NULL;
131 int error;
132
133 xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
134 xfs_iflock(rip);
135
136 /*
137 * Flush out the real time inodes.
138 */
139 if ((rbmip = mp->m_rbmip) != NULL) {
140 xfs_ilock(rbmip, XFS_ILOCK_EXCL);
141 xfs_iflock(rbmip);
142 error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
143 xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
144
145 if (error == EFSCORRUPTED)
146 goto fscorrupt_out;
147
148 ASSERT(vn_count(VFS_I(rbmip)) == 1);
149
150 rsumip = mp->m_rsumip;
151 xfs_ilock(rsumip, XFS_ILOCK_EXCL);
152 xfs_iflock(rsumip);
153 error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
154 xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
155
156 if (error == EFSCORRUPTED)
157 goto fscorrupt_out;
158
159 ASSERT(vn_count(VFS_I(rsumip)) == 1);
160 }
161
162 /*
163 * Synchronously flush root inode to disk
164 */
165 error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
166 if (error == EFSCORRUPTED)
167 goto fscorrupt_out2;
168
169 if (vn_count(VFS_I(rip)) != 1 && !relocation) {
170 xfs_iunlock(rip, XFS_ILOCK_EXCL);
171 return XFS_ERROR(EBUSY);
172 }
173
174 /*
175 * Release dquot that rootinode, rbmino and rsumino might be holding,
176 * flush and purge the quota inodes.
177 */
178 error = XFS_QM_UNMOUNT(mp);
179 if (error == EFSCORRUPTED)
180 goto fscorrupt_out2;
181
182 if (rbmip) {
183 IRELE(rbmip);
184 IRELE(rsumip);
185 }
186
187 xfs_iunlock(rip, XFS_ILOCK_EXCL);
188 return 0;
189
190fscorrupt_out:
191 xfs_ifunlock(rip);
192
193fscorrupt_out2:
194 xfs_iunlock(rip, XFS_ILOCK_EXCL);
195
196 return XFS_ERROR(EFSCORRUPTED);
197}
198
199/*
200 * xfs_sync flushes any pending I/O to file system vfsp.
201 *
202 * This routine is called by vfs_sync() to make sure that things make it
203 * out to disk eventually, on sync() system calls to flush out everything,
204 * and when the file system is unmounted. For the vfs_sync() case, all
205 * we really need to do is sync out the log to make all of our meta-data
206 * updates permanent (except for timestamps). For calls from pflushd(),
207 * dirty pages are kept moving by calling pdflush() on the inodes
208 * containing them. We also flush the inodes that we can lock without
209 * sleeping and the superblock if we can lock it without sleeping from
210 * vfs_sync() so that items at the tail of the log are always moving out.
211 *
212 * Flags:
213 * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
214 * to sleep if we can help it. All we really need
215 * to do is ensure that the log is synced at least
216 * periodically. We also push the inodes and
217 * superblock if we can lock them without sleeping
218 * and they are not pinned.
219 * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
220 * set, then we really want to lock each inode and flush
221 * it.
222 * SYNC_WAIT - All the flushes that take place in this call should
223 * be synchronous.
224 * SYNC_DELWRI - This tells us to push dirty pages associated with
225 * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
226 * determine if they should be flushed sync, async, or
227 * delwri.
228 * SYNC_CLOSE - This flag is passed when the system is being
229 * unmounted. We should sync and invalidate everything.
230 * SYNC_FSDATA - This indicates that the caller would like to make
231 * sure the superblock is safe on disk. We can ensure
232 * this by simply making sure the log gets flushed
233 * if SYNC_BDFLUSH is set, and by actually writing it
234 * out otherwise.
235 * SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete
236 * before we return (including direct I/O). Forms the drain
237 * side of the write barrier needed to safely quiesce the
238 * filesystem.
239 *
240 */
241int
242xfs_sync(
243 xfs_mount_t *mp,
244 int flags)
245{
246 int error;
247
248 /*
249 * Get the Quota Manager to flush the dquots.
250 *
251 * If XFS quota support is not enabled or this filesystem
252 * instance does not use quotas XFS_QM_DQSYNC will always
253 * return zero.
254 */
255 error = XFS_QM_DQSYNC(mp, flags);
256 if (error) {
257 /*
258 * If we got an IO error, we will be shutting down.
259 * So, there's nothing more for us to do here.
260 */
261 ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
262 if (XFS_FORCED_SHUTDOWN(mp))
263 return XFS_ERROR(error);
264 }
265
266 if (flags & SYNC_IOWAIT)
267 xfs_filestream_flush(mp);
268
269 return xfs_syncsub(mp, flags, NULL);
270}
271
272/*
273 * xfs sync routine for internal use
274 *
275 * This routine supports all of the flags defined for the generic vfs_sync
276 * interface as explained above under xfs_sync.
277 *
278 */
279int
280xfs_sync_inodes(
281 xfs_mount_t *mp,
282 int flags,
283 int *bypassed)
284{
285 xfs_inode_t *ip = NULL;
286 struct inode *vp = NULL;
287 int error;
288 int last_error;
289 uint64_t fflag;
290 uint lock_flags;
291 uint base_lock_flags;
292 boolean_t mount_locked;
293 boolean_t vnode_refed;
294 int preempt;
295 xfs_iptr_t *ipointer;
296#ifdef DEBUG
297 boolean_t ipointer_in = B_FALSE;
298
299#define IPOINTER_SET ipointer_in = B_TRUE
300#define IPOINTER_CLR ipointer_in = B_FALSE
301#else
302#define IPOINTER_SET
303#define IPOINTER_CLR
304#endif
305
306
307/* Insert a marker record into the inode list after inode ip. The list
308 * must be locked when this is called. After the call the list will no
309 * longer be locked.
310 */
311#define IPOINTER_INSERT(ip, mp) { \
312 ASSERT(ipointer_in == B_FALSE); \
313 ipointer->ip_mnext = ip->i_mnext; \
314 ipointer->ip_mprev = ip; \
315 ip->i_mnext = (xfs_inode_t *)ipointer; \
316 ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
317 preempt = 0; \
318 XFS_MOUNT_IUNLOCK(mp); \
319 mount_locked = B_FALSE; \
320 IPOINTER_SET; \
321 }
322
323/* Remove the marker from the inode list. If the marker was the only item
324 * in the list then there are no remaining inodes and we should zero out
325 * the whole list. If we are the current head of the list then move the head
326 * past us.
327 */
328#define IPOINTER_REMOVE(ip, mp) { \
329 ASSERT(ipointer_in == B_TRUE); \
330 if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
331 ip = ipointer->ip_mnext; \
332 ip->i_mprev = ipointer->ip_mprev; \
333 ipointer->ip_mprev->i_mnext = ip; \
334 if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
335 mp->m_inodes = ip; \
336 } \
337 } else { \
338 ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
339 mp->m_inodes = NULL; \
340 ip = NULL; \
341 } \
342 IPOINTER_CLR; \
343 }
344
345#define XFS_PREEMPT_MASK 0x7f
346
347 ASSERT(!(flags & SYNC_BDFLUSH));
348
349 if (bypassed)
350 *bypassed = 0;
351 if (mp->m_flags & XFS_MOUNT_RDONLY)
352 return 0;
353 error = 0;
354 last_error = 0;
355 preempt = 0;
356
357 /* Allocate a reference marker */
358 ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
359
360 fflag = XFS_B_ASYNC; /* default is don't wait */
361 if (flags & SYNC_DELWRI)
362 fflag = XFS_B_DELWRI;
363 if (flags & SYNC_WAIT)
364 fflag = 0; /* synchronous overrides all */
365
366 base_lock_flags = XFS_ILOCK_SHARED;
367 if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
368 /*
369 * We need the I/O lock if we're going to call any of
370 * the flush/inval routines.
371 */
372 base_lock_flags |= XFS_IOLOCK_SHARED;
373 }
374
375 XFS_MOUNT_ILOCK(mp);
376
377 ip = mp->m_inodes;
378
379 mount_locked = B_TRUE;
380 vnode_refed = B_FALSE;
381
382 IPOINTER_CLR;
383
384 do {
385 ASSERT(ipointer_in == B_FALSE);
386 ASSERT(vnode_refed == B_FALSE);
387
388 lock_flags = base_lock_flags;
389
390 /*
391 * There were no inodes in the list, just break out
392 * of the loop.
393 */
394 if (ip == NULL) {
395 break;
396 }
397
398 /*
399 * We found another sync thread marker - skip it
400 */
401 if (ip->i_mount == NULL) {
402 ip = ip->i_mnext;
403 continue;
404 }
405
406 vp = VFS_I(ip);
407
408 /*
409 * If the vnode is gone then this is being torn down,
410 * call reclaim if it is flushed, else let regular flush
411 * code deal with it later in the loop.
412 */
413
414 if (vp == NULL) {
415 /* Skip ones already in reclaim */
416 if (ip->i_flags & XFS_IRECLAIM) {
417 ip = ip->i_mnext;
418 continue;
419 }
420 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
421 ip = ip->i_mnext;
422 } else if ((xfs_ipincount(ip) == 0) &&
423 xfs_iflock_nowait(ip)) {
424 IPOINTER_INSERT(ip, mp);
425
426 xfs_finish_reclaim(ip, 1,
427 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
428
429 XFS_MOUNT_ILOCK(mp);
430 mount_locked = B_TRUE;
431 IPOINTER_REMOVE(ip, mp);
432 } else {
433 xfs_iunlock(ip, XFS_ILOCK_EXCL);
434 ip = ip->i_mnext;
435 }
436 continue;
437 }
438
439 if (VN_BAD(vp)) {
440 ip = ip->i_mnext;
441 continue;
442 }
443
444 if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
445 XFS_MOUNT_IUNLOCK(mp);
446 kmem_free(ipointer);
447 return 0;
448 }
449
450 /*
451 * Try to lock without sleeping. We're out of order with
452 * the inode list lock here, so if we fail we need to drop
453 * the mount lock and try again. If we're called from
454 * bdflush() here, then don't bother.
455 *
456 * The inode lock here actually coordinates with the
457 * almost spurious inode lock in xfs_ireclaim() to prevent
458 * the vnode we handle here without a reference from
459 * being freed while we reference it. If we lock the inode
460 * while it's on the mount list here, then the spurious inode
461 * lock in xfs_ireclaim() after the inode is pulled from
462 * the mount list will sleep until we release it here.
463 * This keeps the vnode from being freed while we reference
464 * it.
465 */
466 if (xfs_ilock_nowait(ip, lock_flags) == 0) {
467 if (vp == NULL) {
468 ip = ip->i_mnext;
469 continue;
470 }
471
472 vp = vn_grab(vp);
473 if (vp == NULL) {
474 ip = ip->i_mnext;
475 continue;
476 }
477
478 IPOINTER_INSERT(ip, mp);
479 xfs_ilock(ip, lock_flags);
480
481 ASSERT(vp == VFS_I(ip));
482 ASSERT(ip->i_mount == mp);
483
484 vnode_refed = B_TRUE;
485 }
486
487 /* From here on in the loop we may have a marker record
488 * in the inode list.
489 */
490
491 /*
492 * If we have to flush data or wait for I/O completion
493 * we need to drop the ilock that we currently hold.
494 * If we need to drop the lock, insert a marker if we
495 * have not already done so.
496 */
497 if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
498 ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
499 if (mount_locked) {
500 IPOINTER_INSERT(ip, mp);
501 }
502 xfs_iunlock(ip, XFS_ILOCK_SHARED);
503
504 if (flags & SYNC_CLOSE) {
505 /* Shutdown case. Flush and invalidate. */
506 if (XFS_FORCED_SHUTDOWN(mp))
507 xfs_tosspages(ip, 0, -1,
508 FI_REMAPF);
509 else
510 error = xfs_flushinval_pages(ip,
511 0, -1, FI_REMAPF);
512 } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
513 error = xfs_flush_pages(ip, 0,
514 -1, fflag, FI_NONE);
515 }
516
517 /*
518 * When freezing, we need to wait ensure all I/O (including direct
519 * I/O) is complete to ensure no further data modification can take
520 * place after this point
521 */
522 if (flags & SYNC_IOWAIT)
523 vn_iowait(ip);
524
525 xfs_ilock(ip, XFS_ILOCK_SHARED);
526 }
527
528 if ((flags & SYNC_ATTR) &&
529 (ip->i_update_core ||
530 (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) {
531 if (mount_locked)
532 IPOINTER_INSERT(ip, mp);
533
534 if (flags & SYNC_WAIT) {
535 xfs_iflock(ip);
536 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
537
538 /*
539 * If we can't acquire the flush lock, then the inode
540 * is already being flushed so don't bother waiting.
541 *
542 * If we can lock it then do a delwri flush so we can
543 * combine multiple inode flushes in each disk write.
544 */
545 } else if (xfs_iflock_nowait(ip)) {
546 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
547 } else if (bypassed) {
548 (*bypassed)++;
549 }
550 }
551
552 if (lock_flags != 0) {
553 xfs_iunlock(ip, lock_flags);
554 }
555
556 if (vnode_refed) {
557 /*
558 * If we had to take a reference on the vnode
559 * above, then wait until after we've unlocked
560 * the inode to release the reference. This is
561 * because we can be already holding the inode
562 * lock when IRELE() calls xfs_inactive().
563 *
564 * Make sure to drop the mount lock before calling
565 * IRELE() so that we don't trip over ourselves if
566 * we have to go for the mount lock again in the
567 * inactive code.
568 */
569 if (mount_locked) {
570 IPOINTER_INSERT(ip, mp);
571 }
572
573 IRELE(ip);
574
575 vnode_refed = B_FALSE;
576 }
577
578 if (error) {
579 last_error = error;
580 }
581
582 /*
583 * bail out if the filesystem is corrupted.
584 */
585 if (error == EFSCORRUPTED) {
586 if (!mount_locked) {
587 XFS_MOUNT_ILOCK(mp);
588 IPOINTER_REMOVE(ip, mp);
589 }
590 XFS_MOUNT_IUNLOCK(mp);
591 ASSERT(ipointer_in == B_FALSE);
592 kmem_free(ipointer);
593 return XFS_ERROR(error);
594 }
595
596 /* Let other threads have a chance at the mount lock
597 * if we have looped many times without dropping the
598 * lock.
599 */
600 if ((++preempt & XFS_PREEMPT_MASK) == 0) {
601 if (mount_locked) {
602 IPOINTER_INSERT(ip, mp);
603 }
604 }
605
606 if (mount_locked == B_FALSE) {
607 XFS_MOUNT_ILOCK(mp);
608 mount_locked = B_TRUE;
609 IPOINTER_REMOVE(ip, mp);
610 continue;
611 }
612
613 ASSERT(ipointer_in == B_FALSE);
614 ip = ip->i_mnext;
615
616 } while (ip != mp->m_inodes);
617
618 XFS_MOUNT_IUNLOCK(mp);
619
620 ASSERT(ipointer_in == B_FALSE);
621
622 kmem_free(ipointer);
623 return XFS_ERROR(last_error);
624}
625
626/*
627 * xfs sync routine for internal use
628 *
629 * This routine supports all of the flags defined for the generic vfs_sync
630 * interface as explained above under xfs_sync.
631 *
632 */
633int
634xfs_syncsub(
635 xfs_mount_t *mp,
636 int flags,
637 int *bypassed)
638{
639 int error = 0;
640 int last_error = 0;
641 uint log_flags = XFS_LOG_FORCE;
642 xfs_buf_t *bp;
643 xfs_buf_log_item_t *bip;
644
645 /*
646 * Sync out the log. This ensures that the log is periodically
647 * flushed even if there is not enough activity to fill it up.
648 */
649 if (flags & SYNC_WAIT)
650 log_flags |= XFS_LOG_SYNC;
651
652 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
653
654 if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
655 if (flags & SYNC_BDFLUSH)
656 xfs_finish_reclaim_all(mp, 1);
657 else
658 error = xfs_sync_inodes(mp, flags, bypassed);
659 }
660
661 /*
662 * Flushing out dirty data above probably generated more
663 * log activity, so if this isn't vfs_sync() then flush
664 * the log again.
665 */
666 if (flags & SYNC_DELWRI) {
667 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
668 }
669
670 if (flags & SYNC_FSDATA) {
671 /*
672 * If this is vfs_sync() then only sync the superblock
673 * if we can lock it without sleeping and it is not pinned.
674 */
675 if (flags & SYNC_BDFLUSH) {
676 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
677 if (bp != NULL) {
678 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
679 if ((bip != NULL) &&
680 xfs_buf_item_dirty(bip)) {
681 if (!(XFS_BUF_ISPINNED(bp))) {
682 XFS_BUF_ASYNC(bp);
683 error = xfs_bwrite(mp, bp);
684 } else {
685 xfs_buf_relse(bp);
686 }
687 } else {
688 xfs_buf_relse(bp);
689 }
690 }
691 } else {
692 bp = xfs_getsb(mp, 0);
693 /*
694 * If the buffer is pinned then push on the log so
695 * we won't get stuck waiting in the write for
696 * someone, maybe ourselves, to flush the log.
697 * Even though we just pushed the log above, we
698 * did not have the superblock buffer locked at
699 * that point so it can become pinned in between
700 * there and here.
701 */
702 if (XFS_BUF_ISPINNED(bp))
703 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
704 if (flags & SYNC_WAIT)
705 XFS_BUF_UNASYNC(bp);
706 else
707 XFS_BUF_ASYNC(bp);
708 error = xfs_bwrite(mp, bp);
709 }
710 if (error) {
711 last_error = error;
712 }
713 }
714
715 /*
716 * Now check to see if the log needs a "dummy" transaction.
717 */
718 if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
719 xfs_trans_t *tp;
720 xfs_inode_t *ip;
721
722 /*
723 * Put a dummy transaction in the log to tell
724 * recovery that all others are OK.
725 */
726 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
727 if ((error = xfs_trans_reserve(tp, 0,
728 XFS_ICHANGE_LOG_RES(mp),
729 0, 0, 0))) {
730 xfs_trans_cancel(tp, 0);
731 return error;
732 }
733
734 ip = mp->m_rootip;
735 xfs_ilock(ip, XFS_ILOCK_EXCL);
736
737 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
738 xfs_trans_ihold(tp, ip);
739 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
740 error = xfs_trans_commit(tp, 0);
741 xfs_iunlock(ip, XFS_ILOCK_EXCL);
742 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
743 }
744
745 /*
746 * When shutting down, we need to insure that the AIL is pushed
747 * to disk or the filesystem can appear corrupt from the PROM.
748 */
749 if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
750 XFS_bflush(mp->m_ddev_targp);
751 if (mp->m_rtdev_targp) {
752 XFS_bflush(mp->m_rtdev_targp);
753 }
754 }
755
756 return XFS_ERROR(last_error);
757}
diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h
deleted file mode 100644
index a74b05087da4..000000000000
--- a/fs/xfs/xfs_vfsops.h
+++ /dev/null
@@ -1,16 +0,0 @@
1#ifndef _XFS_VFSOPS_H
2#define _XFS_VFSOPS_H 1
3
4struct cred;
5struct xfs_fid;
6struct inode;
7struct kstatfs;
8struct xfs_mount;
9struct xfs_mount_args;
10
11int xfs_sync(struct xfs_mount *mp, int flags);
12void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
13 int lnnum);
14void xfs_attr_quiesce(struct xfs_mount *mp);
15
16#endif /* _XFS_VFSOPS_H */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8b6812f66a15..f07bf8768c3a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -54,33 +54,10 @@
54#include "xfs_vnodeops.h" 54#include "xfs_vnodeops.h"
55 55
56int 56int
57xfs_open(
58 xfs_inode_t *ip)
59{
60 int mode;
61
62 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
63 return XFS_ERROR(EIO);
64
65 /*
66 * If it's a directory with any blocks, read-ahead block 0
67 * as we're almost certain to have the next operation be a read there.
68 */
69 if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
70 mode = xfs_ilock_map_shared(ip);
71 if (ip->i_d.di_nextents > 0)
72 (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
73 xfs_iunlock(ip, mode);
74 }
75 return 0;
76}
77
78int
79xfs_setattr( 57xfs_setattr(
80 struct xfs_inode *ip, 58 struct xfs_inode *ip,
81 struct iattr *iattr, 59 struct iattr *iattr,
82 int flags, 60 int flags)
83 cred_t *credp)
84{ 61{
85 xfs_mount_t *mp = ip->i_mount; 62 xfs_mount_t *mp = ip->i_mount;
86 struct inode *inode = VFS_I(ip); 63 struct inode *inode = VFS_I(ip);
@@ -93,7 +70,6 @@ xfs_setattr(
93 gid_t gid=0, igid=0; 70 gid_t gid=0, igid=0;
94 int timeflags = 0; 71 int timeflags = 0;
95 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; 72 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
96 int file_owner;
97 int need_iolock = 1; 73 int need_iolock = 1;
98 74
99 xfs_itrace_entry(ip); 75 xfs_itrace_entry(ip);
@@ -104,6 +80,10 @@ xfs_setattr(
104 if (XFS_FORCED_SHUTDOWN(mp)) 80 if (XFS_FORCED_SHUTDOWN(mp))
105 return XFS_ERROR(EIO); 81 return XFS_ERROR(EIO);
106 82
83 code = -inode_change_ok(inode, iattr);
84 if (code)
85 return code;
86
107 olddquot1 = olddquot2 = NULL; 87 olddquot1 = olddquot2 = NULL;
108 udqp = gdqp = NULL; 88 udqp = gdqp = NULL;
109 89
@@ -181,62 +161,8 @@ xfs_setattr(
181 161
182 xfs_ilock(ip, lock_flags); 162 xfs_ilock(ip, lock_flags);
183 163
184 /* boolean: are we the file owner? */
185 file_owner = (current_fsuid() == ip->i_d.di_uid);
186
187 /*
188 * Change various properties of a file.
189 * Only the owner or users with CAP_FOWNER
190 * capability may do these things.
191 */
192 if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
193 /*
194 * CAP_FOWNER overrides the following restrictions:
195 *
196 * The user ID of the calling process must be equal
197 * to the file owner ID, except in cases where the
198 * CAP_FSETID capability is applicable.
199 */
200 if (!file_owner && !capable(CAP_FOWNER)) {
201 code = XFS_ERROR(EPERM);
202 goto error_return;
203 }
204
205 /*
206 * CAP_FSETID overrides the following restrictions:
207 *
208 * The effective user ID of the calling process shall match
209 * the file owner when setting the set-user-ID and
210 * set-group-ID bits on that file.
211 *
212 * The effective group ID or one of the supplementary group
213 * IDs of the calling process shall match the group owner of
214 * the file when setting the set-group-ID bit on that file
215 */
216 if (mask & ATTR_MODE) {
217 mode_t m = 0;
218
219 if ((iattr->ia_mode & S_ISUID) && !file_owner)
220 m |= S_ISUID;
221 if ((iattr->ia_mode & S_ISGID) &&
222 !in_group_p((gid_t)ip->i_d.di_gid))
223 m |= S_ISGID;
224#if 0
225 /* Linux allows this, Irix doesn't. */
226 if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
227 m |= S_ISVTX;
228#endif
229 if (m && !capable(CAP_FSETID))
230 iattr->ia_mode &= ~m;
231 }
232 }
233
234 /* 164 /*
235 * Change file ownership. Must be the owner or privileged. 165 * Change file ownership. Must be the owner or privileged.
236 * If the system was configured with the "restricted_chown"
237 * option, the owner is not permitted to give away the file,
238 * and can change the group id only to a group of which he
239 * or she is a member.
240 */ 166 */
241 if (mask & (ATTR_UID|ATTR_GID)) { 167 if (mask & (ATTR_UID|ATTR_GID)) {
242 /* 168 /*
@@ -251,23 +177,6 @@ xfs_setattr(
251 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; 177 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
252 178
253 /* 179 /*
254 * CAP_CHOWN overrides the following restrictions:
255 *
256 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
257 * shall override the restriction that a process cannot
258 * change the user ID of a file it owns and the restriction
259 * that the group ID supplied to the chown() function
260 * shall be equal to either the group ID or one of the
261 * supplementary group IDs of the calling process.
262 */
263 if (restricted_chown &&
264 (iuid != uid || (igid != gid &&
265 !in_group_p((gid_t)gid))) &&
266 !capable(CAP_CHOWN)) {
267 code = XFS_ERROR(EPERM);
268 goto error_return;
269 }
270 /*
271 * Do a quota reservation only if uid/gid is actually 180 * Do a quota reservation only if uid/gid is actually
272 * going to change. 181 * going to change.
273 */ 182 */
@@ -304,36 +213,22 @@ xfs_setattr(
304 code = XFS_ERROR(EINVAL); 213 code = XFS_ERROR(EINVAL);
305 goto error_return; 214 goto error_return;
306 } 215 }
216
307 /* 217 /*
308 * Make sure that the dquots are attached to the inode. 218 * Make sure that the dquots are attached to the inode.
309 */ 219 */
310 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED))) 220 code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
221 if (code)
311 goto error_return; 222 goto error_return;
312 }
313
314 /*
315 * Change file access or modified times.
316 */
317 if (mask & (ATTR_ATIME|ATTR_MTIME)) {
318 if (!file_owner) {
319 if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) &&
320 !capable(CAP_FOWNER)) {
321 code = XFS_ERROR(EPERM);
322 goto error_return;
323 }
324 }
325 }
326 223
327 /* 224 /*
328 * Now we can make the changes. Before we join the inode 225 * Now we can make the changes. Before we join the inode
329 * to the transaction, if ATTR_SIZE is set then take care of 226 * to the transaction, if ATTR_SIZE is set then take care of
330 * the part of the truncation that must be done without the 227 * the part of the truncation that must be done without the
331 * inode lock. This needs to be done before joining the inode 228 * inode lock. This needs to be done before joining the inode
332 * to the transaction, because the inode cannot be unlocked 229 * to the transaction, because the inode cannot be unlocked
333 * once it is a part of the transaction. 230 * once it is a part of the transaction.
334 */ 231 */
335 if (mask & ATTR_SIZE) {
336 code = 0;
337 if (iattr->ia_size > ip->i_size) { 232 if (iattr->ia_size > ip->i_size) {
338 /* 233 /*
339 * Do the first part of growing a file: zero any data 234 * Do the first part of growing a file: zero any data
@@ -366,7 +261,7 @@ xfs_setattr(
366 } 261 }
367 262
368 /* wait for all I/O to complete */ 263 /* wait for all I/O to complete */
369 vn_iowait(ip); 264 xfs_ioend_wait(ip);
370 265
371 if (!code) 266 if (!code)
372 code = xfs_itruncate_data(ip, iattr->ia_size); 267 code = xfs_itruncate_data(ip, iattr->ia_size);
@@ -388,17 +283,10 @@ xfs_setattr(
388 } 283 }
389 commit_flags = XFS_TRANS_RELEASE_LOG_RES; 284 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
390 xfs_ilock(ip, XFS_ILOCK_EXCL); 285 xfs_ilock(ip, XFS_ILOCK_EXCL);
391 }
392 286
393 if (tp) {
394 xfs_trans_ijoin(tp, ip, lock_flags); 287 xfs_trans_ijoin(tp, ip, lock_flags);
395 xfs_trans_ihold(tp, ip); 288 xfs_trans_ihold(tp, ip);
396 }
397 289
398 /*
399 * Truncate file. Must have write permission and not be a directory.
400 */
401 if (mask & ATTR_SIZE) {
402 /* 290 /*
403 * Only change the c/mtime if we are changing the size 291 * Only change the c/mtime if we are changing the size
404 * or we are explicitly asked to change it. This handles 292 * or we are explicitly asked to change it. This handles
@@ -438,28 +326,13 @@ xfs_setattr(
438 */ 326 */
439 xfs_iflags_set(ip, XFS_ITRUNCATED); 327 xfs_iflags_set(ip, XFS_ITRUNCATED);
440 } 328 }
441 } 329 } else if (tp) {
442 330 xfs_trans_ijoin(tp, ip, lock_flags);
443 /* 331 xfs_trans_ihold(tp, ip);
444 * Change file access modes.
445 */
446 if (mask & ATTR_MODE) {
447 ip->i_d.di_mode &= S_IFMT;
448 ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT;
449
450 inode->i_mode &= S_IFMT;
451 inode->i_mode |= iattr->ia_mode & ~S_IFMT;
452
453 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
454 timeflags |= XFS_ICHGTIME_CHG;
455 } 332 }
456 333
457 /* 334 /*
458 * Change file ownership. Must be the owner or privileged. 335 * Change file ownership. Must be the owner or privileged.
459 * If the system was configured with the "restricted_chown"
460 * option, the owner is not permitted to give away the file,
461 * and can change the group id only to a group of which he
462 * or she is a member.
463 */ 336 */
464 if (mask & (ATTR_UID|ATTR_GID)) { 337 if (mask & (ATTR_UID|ATTR_GID)) {
465 /* 338 /*
@@ -503,6 +376,24 @@ xfs_setattr(
503 timeflags |= XFS_ICHGTIME_CHG; 376 timeflags |= XFS_ICHGTIME_CHG;
504 } 377 }
505 378
379 /*
380 * Change file access modes.
381 */
382 if (mask & ATTR_MODE) {
383 umode_t mode = iattr->ia_mode;
384
385 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
386 mode &= ~S_ISGID;
387
388 ip->i_d.di_mode &= S_IFMT;
389 ip->i_d.di_mode |= mode & ~S_IFMT;
390
391 inode->i_mode &= S_IFMT;
392 inode->i_mode |= mode & ~S_IFMT;
393
394 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
395 timeflags |= XFS_ICHGTIME_CHG;
396 }
506 397
507 /* 398 /*
508 * Change file access or modified times. 399 * Change file access or modified times.
@@ -713,7 +604,7 @@ xfs_fsync(
713 return XFS_ERROR(EIO); 604 return XFS_ERROR(EIO);
714 605
715 /* capture size updates in I/O completion before writing the inode. */ 606 /* capture size updates in I/O completion before writing the inode. */
716 error = filemap_fdatawait(VFS_I(ip)->i_mapping); 607 error = xfs_wait_on_pages(ip, 0, -1);
717 if (error) 608 if (error)
718 return XFS_ERROR(error); 609 return XFS_ERROR(error);
719 610
@@ -1029,6 +920,12 @@ xfs_inactive_symlink_rmt(
1029 goto error0; 920 goto error0;
1030 } 921 }
1031 /* 922 /*
923 * transaction commit worked ok so we can drop the extra ticket
924 * reference that we gained in xfs_trans_dup()
925 */
926 xfs_log_ticket_put(tp->t_ticket);
927
928 /*
1032 * Remove the memory for extent descriptions (just bookkeeping). 929 * Remove the memory for extent descriptions (just bookkeeping).
1033 */ 930 */
1034 if (ip->i_df.if_bytes) 931 if (ip->i_df.if_bytes)
@@ -1625,8 +1522,6 @@ xfs_create(
1625 xfs_trans_set_sync(tp); 1522 xfs_trans_set_sync(tp);
1626 } 1523 }
1627 1524
1628 dp->i_gen++;
1629
1630 /* 1525 /*
1631 * Attach the dquot(s) to the inodes and modify them incore. 1526 * Attach the dquot(s) to the inodes and modify them incore.
1632 * These ids of the inode couldn't have changed since the new 1527 * These ids of the inode couldn't have changed since the new
@@ -1993,13 +1888,6 @@ xfs_remove(
1993 } 1888 }
1994 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1889 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1995 1890
1996 /*
1997 * Bump the in memory generation count on the parent
1998 * directory so that other can know that it has changed.
1999 */
2000 dp->i_gen++;
2001 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2002
2003 if (is_dir) { 1891 if (is_dir) {
2004 /* 1892 /*
2005 * Drop the link from ip's "..". 1893 * Drop the link from ip's "..".
@@ -2009,7 +1897,7 @@ xfs_remove(
2009 goto out_bmap_cancel; 1897 goto out_bmap_cancel;
2010 1898
2011 /* 1899 /*
2012 * Drop the link from dp to ip. 1900 * Drop the "." link from ip to self.
2013 */ 1901 */
2014 error = xfs_droplink(tp, ip); 1902 error = xfs_droplink(tp, ip);
2015 if (error) 1903 if (error)
@@ -2017,14 +1905,14 @@ xfs_remove(
2017 } else { 1905 } else {
2018 /* 1906 /*
2019 * When removing a non-directory we need to log the parent 1907 * When removing a non-directory we need to log the parent
2020 * inode here for the i_gen update. For a directory this is 1908 * inode here. For a directory this is done implicitly
2021 * done implicitly by the xfs_droplink call for the ".." entry. 1909 * by the xfs_droplink call for the ".." entry.
2022 */ 1910 */
2023 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1911 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2024 } 1912 }
2025 1913
2026 /* 1914 /*
2027 * Drop the "." link from ip to self. 1915 * Drop the link from dp to ip.
2028 */ 1916 */
2029 error = xfs_droplink(tp, ip); 1917 error = xfs_droplink(tp, ip);
2030 if (error) 1918 if (error)
@@ -2178,7 +2066,6 @@ xfs_link(
2178 if (error) 2066 if (error)
2179 goto abort_return; 2067 goto abort_return;
2180 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2068 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2181 tdp->i_gen++;
2182 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); 2069 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2183 2070
2184 error = xfs_bumplink(tp, sip); 2071 error = xfs_bumplink(tp, sip);
@@ -2355,18 +2242,10 @@ xfs_mkdir(
2355 } 2242 }
2356 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2243 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2357 2244
2358 /*
2359 * Bump the in memory version number of the parent directory
2360 * so that other processes accessing it will recognize that
2361 * the directory has changed.
2362 */
2363 dp->i_gen++;
2364
2365 error = xfs_dir_init(tp, cdp, dp); 2245 error = xfs_dir_init(tp, cdp, dp);
2366 if (error) 2246 if (error)
2367 goto error2; 2247 goto error2;
2368 2248
2369 cdp->i_gen = 1;
2370 error = xfs_bumplink(tp, dp); 2249 error = xfs_bumplink(tp, dp);
2371 if (error) 2250 if (error)
2372 goto error2; 2251 goto error2;
@@ -2653,13 +2532,6 @@ xfs_symlink(
2653 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2532 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2654 2533
2655 /* 2534 /*
2656 * Bump the in memory version number of the parent directory
2657 * so that other processes accessing it will recognize that
2658 * the directory has changed.
2659 */
2660 dp->i_gen++;
2661
2662 /*
2663 * If this is a synchronous mount, make sure that the 2535 * If this is a synchronous mount, make sure that the
2664 * symlink transaction goes to disk before returning to 2536 * symlink transaction goes to disk before returning to
2665 * the user. 2537 * the user.
@@ -2809,7 +2681,7 @@ xfs_reclaim(
2809 return 0; 2681 return 0;
2810 } 2682 }
2811 2683
2812 vn_iowait(ip); 2684 xfs_ioend_wait(ip);
2813 2685
2814 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); 2686 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
2815 2687
@@ -2833,122 +2705,10 @@ xfs_reclaim(
2833 if (!ip->i_update_core && (ip->i_itemp == NULL)) { 2705 if (!ip->i_update_core && (ip->i_itemp == NULL)) {
2834 xfs_ilock(ip, XFS_ILOCK_EXCL); 2706 xfs_ilock(ip, XFS_ILOCK_EXCL);
2835 xfs_iflock(ip); 2707 xfs_iflock(ip);
2836 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); 2708 xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2837 } else { 2709 return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
2838 xfs_mount_t *mp = ip->i_mount;
2839
2840 /* Protect sync and unpin from us */
2841 XFS_MOUNT_ILOCK(mp);
2842 spin_lock(&ip->i_flags_lock);
2843 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2844 VFS_I(ip)->i_private = NULL;
2845 ip->i_vnode = NULL;
2846 spin_unlock(&ip->i_flags_lock);
2847 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
2848 XFS_MOUNT_IUNLOCK(mp);
2849 }
2850 return 0;
2851}
2852
2853int
2854xfs_finish_reclaim(
2855 xfs_inode_t *ip,
2856 int locked,
2857 int sync_mode)
2858{
2859 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
2860 struct inode *vp = VFS_I(ip);
2861
2862 if (vp && VN_BAD(vp))
2863 goto reclaim;
2864
2865 /* The hash lock here protects a thread in xfs_iget_core from
2866 * racing with us on linking the inode back with a vnode.
2867 * Once we have the XFS_IRECLAIM flag set it will not touch
2868 * us.
2869 */
2870 write_lock(&pag->pag_ici_lock);
2871 spin_lock(&ip->i_flags_lock);
2872 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
2873 (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
2874 spin_unlock(&ip->i_flags_lock);
2875 write_unlock(&pag->pag_ici_lock);
2876 if (locked) {
2877 xfs_ifunlock(ip);
2878 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2879 }
2880 return 1;
2881 }
2882 __xfs_iflags_set(ip, XFS_IRECLAIM);
2883 spin_unlock(&ip->i_flags_lock);
2884 write_unlock(&pag->pag_ici_lock);
2885 xfs_put_perag(ip->i_mount, pag);
2886
2887 /*
2888 * If the inode is still dirty, then flush it out. If the inode
2889 * is not in the AIL, then it will be OK to flush it delwri as
2890 * long as xfs_iflush() does not keep any references to the inode.
2891 * We leave that decision up to xfs_iflush() since it has the
2892 * knowledge of whether it's OK to simply do a delwri flush of
2893 * the inode or whether we need to wait until the inode is
2894 * pulled from the AIL.
2895 * We get the flush lock regardless, though, just to make sure
2896 * we don't free it while it is being flushed.
2897 */
2898 if (!locked) {
2899 xfs_ilock(ip, XFS_ILOCK_EXCL);
2900 xfs_iflock(ip);
2901 } 2710 }
2902 2711 xfs_inode_set_reclaim_tag(ip);
2903 /*
2904 * In the case of a forced shutdown we rely on xfs_iflush() to
2905 * wait for the inode to be unpinned before returning an error.
2906 */
2907 if (xfs_iflush(ip, sync_mode) == 0) {
2908 /* synchronize with xfs_iflush_done */
2909 xfs_iflock(ip);
2910 xfs_ifunlock(ip);
2911 }
2912
2913 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2914
2915 reclaim:
2916 xfs_ireclaim(ip);
2917 return 0;
2918}
2919
2920int
2921xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
2922{
2923 int purged;
2924 xfs_inode_t *ip, *n;
2925 int done = 0;
2926
2927 while (!done) {
2928 purged = 0;
2929 XFS_MOUNT_ILOCK(mp);
2930 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
2931 if (noblock) {
2932 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
2933 continue;
2934 if (xfs_ipincount(ip) ||
2935 !xfs_iflock_nowait(ip)) {
2936 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2937 continue;
2938 }
2939 }
2940 XFS_MOUNT_IUNLOCK(mp);
2941 if (xfs_finish_reclaim(ip, noblock,
2942 XFS_IFLUSH_DELWRI_ELSE_ASYNC))
2943 delay(1);
2944 purged = 1;
2945 break;
2946 }
2947
2948 done = !purged;
2949 }
2950
2951 XFS_MOUNT_IUNLOCK(mp);
2952 return 0; 2712 return 0;
2953} 2713}
2954 2714
@@ -3197,6 +2957,8 @@ xfs_zero_remaining_bytes(
3197 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 2957 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3198 XFS_IS_REALTIME_INODE(ip) ? 2958 XFS_IS_REALTIME_INODE(ip) ?
3199 mp->m_rtdev_targp : mp->m_ddev_targp); 2959 mp->m_rtdev_targp : mp->m_ddev_targp);
2960 if (!bp)
2961 return XFS_ERROR(ENOMEM);
3200 2962
3201 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 2963 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3202 offset_fsb = XFS_B_TO_FSBT(mp, offset); 2964 offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -3312,7 +3074,8 @@ xfs_free_file_space(
3312 need_iolock = 0; 3074 need_iolock = 0;
3313 if (need_iolock) { 3075 if (need_iolock) {
3314 xfs_ilock(ip, XFS_IOLOCK_EXCL); 3076 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3315 vn_iowait(ip); /* wait for the completion of any pending DIOs */ 3077 /* wait for the completion of any pending DIOs */
3078 xfs_ioend_wait(ip);
3316 } 3079 }
3317 3080
3318 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 3081 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -3474,7 +3237,6 @@ xfs_change_file_space(
3474 int cmd, 3237 int cmd,
3475 xfs_flock64_t *bf, 3238 xfs_flock64_t *bf,
3476 xfs_off_t offset, 3239 xfs_off_t offset,
3477 cred_t *credp,
3478 int attr_flags) 3240 int attr_flags)
3479{ 3241{
3480 xfs_mount_t *mp = ip->i_mount; 3242 xfs_mount_t *mp = ip->i_mount;
@@ -3562,7 +3324,7 @@ xfs_change_file_space(
3562 iattr.ia_valid = ATTR_SIZE; 3324 iattr.ia_valid = ATTR_SIZE;
3563 iattr.ia_size = startoffset; 3325 iattr.ia_size = startoffset;
3564 3326
3565 error = xfs_setattr(ip, &iattr, attr_flags, credp); 3327 error = xfs_setattr(ip, &iattr, attr_flags);
3566 3328
3567 if (error) 3329 if (error)
3568 return error; 3330 return error;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 7b0c2ab88333..76df328c61b4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -14,9 +14,7 @@ struct xfs_inode;
14struct xfs_iomap; 14struct xfs_iomap;
15 15
16 16
17int xfs_open(struct xfs_inode *ip); 17int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
18int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
19 cred_t *credp);
20#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ 18#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
21#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 19#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
22#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 20#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
@@ -44,8 +42,7 @@ int xfs_inode_flush(struct xfs_inode *ip, int flags);
44int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 42int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
45int xfs_reclaim(struct xfs_inode *ip); 43int xfs_reclaim(struct xfs_inode *ip);
46int xfs_change_file_space(struct xfs_inode *ip, int cmd, 44int xfs_change_file_space(struct xfs_inode *ip, int cmd,
47 xfs_flock64_t *bf, xfs_off_t offset, 45 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
48 cred_t *credp, int attr_flags);
49int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 46int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
50 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 47 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
51 struct xfs_name *target_name, struct xfs_inode *target_ip); 48 struct xfs_name *target_name, struct xfs_inode *target_ip);
@@ -56,8 +53,6 @@ int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
56int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); 53int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
57int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 54int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
58 int flags, struct attrlist_cursor_kern *cursor); 55 int flags, struct attrlist_cursor_kern *cursor);
59int xfs_ioctl(struct xfs_inode *ip, struct file *filp,
60 int ioflags, unsigned int cmd, void __user *arg);
61ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, 56ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
62 const struct iovec *iovp, unsigned int segs, 57 const struct iovec *iovp, unsigned int segs,
63 loff_t *offset, int ioflags); 58 loff_t *offset, int ioflags);
@@ -78,5 +73,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
78 xfs_off_t last, int fiopt); 73 xfs_off_t last, int fiopt);
79int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, 74int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
80 xfs_off_t last, uint64_t flags, int fiopt); 75 xfs_off_t last, uint64_t flags, int fiopt);
76int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
81 77
82#endif /* _XFS_VNODEOPS_H */ 78#endif /* _XFS_VNODEOPS_H */