author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /fs/xfs
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'fs/xfs')
103 files changed, 6926 insertions, 6521 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f09..6100ec0fa1d4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
 config XFS_QUOTA
 	bool "XFS Quota support"
 	depends on XFS_FS
+	select QUOTACTL
 	help
 	  If you say Y here, you will be able to set limits for disk usage on
 	  a per user and/or a per group basis under XFS.  XFS considers quota
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..284a7c89697e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,14 +16,11 @@
 # Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #
 
-EXTRA_CFLAGS +=	 -I$(src) -I$(src)/linux-2.6
+ccflags-y := -I$(src) -I$(src)/linux-2.6
+ccflags-$(CONFIG_XFS_DEBUG) += -g
 
 XFS_LINUX := linux-2.6
 
-ifeq ($(CONFIG_XFS_DEBUG),y)
-	EXTRA_CFLAGS += -g
-endif
-
 obj-$(CONFIG_XFS_FS)		+= xfs.o
 
 xfs-y				+= linux-2.6/xfs_trace.o
@@ -98,17 +95,17 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
 				   kmem.o \
 				   xfs_aops.o \
 				   xfs_buf.o \
+				   xfs_discard.o \
 				   xfs_export.o \
 				   xfs_file.o \
 				   xfs_fs_subr.o \
 				   xfs_globals.o \
 				   xfs_ioctl.o \
 				   xfs_iops.o \
+				   xfs_message.o \
 				   xfs_super.o \
 				   xfs_sync.o \
 				   xfs_xattr.o)
 
 # Objects in support/
-xfs-y				+= $(addprefix support/, \
-				   debug.o \
-				   uuid.o)
+xfs-y				+= support/uuid.o
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb6..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
 #include <linux/backing-dev.h>
 #include "time.h"
 #include "kmem.h"
+#include "xfs_message.h"
 
 /*
  * Greedy allocation. May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
 			return ptr;
 		if (!(++retries % 100))
-			printk(KERN_ERR "XFS: possible memory allocation "
-					"deadlock in %s (mode:0x%x)\n",
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
 					__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
 			return ptr;
 		if (!(++retries % 100))
-			printk(KERN_ERR "XFS: possible memory allocation "
-					"deadlock in %s (mode:0x%x)\n",
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
 					__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
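
Note on the pattern above: kmem_alloc() and kmem_zone_alloc() loop forever for callers that are not allowed to fail, backing off with congestion_wait() and warning every 100 attempts; the patch only swaps the raw printk for the xfs_err() helper. A minimal user-space sketch of the same retry-with-backoff shape (alloc_nofail and the usleep() stand-in for congestion_wait() are illustrative, not part of the patch):

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	/* Retry an allocation that is not allowed to fail, warning
	 * periodically so a stuck caller shows up in the logs. Mirrors
	 * the loop shape in kmem_alloc() above. */
	static void *alloc_nofail(size_t size)
	{
		unsigned int retries = 0;
		void *ptr;

		do {
			ptr = malloc(size);
			if (ptr)
				return ptr;
			if (!(++retries % 100))
				fprintf(stderr,
					"possible memory allocation deadlock in %s\n",
					__func__);
			/* rough stand-in for congestion_wait(BLK_RW_ASYNC, HZ/50) */
			usleep(20000);
		} while (1);
	}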
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-
-typedef struct sv_s {
-	wait_queue_head_t waiters;
-} sv_t;
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue_exclusive(&sv->waiters, &wait);
-	__set_current_state(TASK_UNINTERRUPTIBLE);
-	spin_unlock(lock);
-
-	schedule();
-
-	remove_wait_queue(&sv->waiters, &wait);
-}
-
-#define sv_init(sv,flag,name) \
-	init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
-	/*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
-	_sv_wait(sv, lock)
-#define sv_signal(sv) \
-	wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
-	wake_up_all(&(sv)->waiters)
-
-#endif /* __XFS_SUPPORT_SV_H__ */
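
The header removed above wrapped a bare wait_queue_head_t in a condition-variable style API: _sv_wait() queues the caller exclusively, drops the spinlock it was handed, and sleeps until sv_signal() or sv_broadcast() wakes it. A rough user-space analogue using pthreads (hedged: pthread_cond_wait() re-acquires the mutex before returning, which _sv_wait() deliberately does not do for its spinlock):

	#include <pthread.h>

	/* Approximate pthreads rendering of the removed sv_t API. */
	typedef struct {
		pthread_cond_t	waiters;
	} sv_t;

	static void sv_init(sv_t *sv)
	{
		pthread_cond_init(&sv->waiters, NULL);
	}

	/* Releases *lock and sleeps; unlike _sv_wait(), the mutex is
	 * re-taken before this returns. */
	static void sv_wait(sv_t *sv, pthread_mutex_t *lock)
	{
		pthread_cond_wait(&sv->waiters, lock);
	}

	static void sv_signal(sv_t *sv)
	{
		pthread_cond_signal(&sv->waiters);
	}

	static void sv_broadcast(sv_t *sv)
	{
		pthread_cond_broadcast(&sv->waiters);
	}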
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3d..39f4f809bb68 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 
 int
-xfs_check_acl(struct inode *inode, int mask)
+xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_inode	*ip;
 	struct posix_acl	*acl;
 	int			error = -EAGAIN;
 
+	ip = XFS_I(inode);
 	trace_xfs_check_acl(ip);
 
 	/*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
 	if (!XFS_IFORK_Q(ip))
 		return -EAGAIN;
 
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			return -ECHILD;
+		return -EAGAIN;
+	}
+
 	acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..79ce38be15a1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
-/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
-	IO_READ,	/* mapping for a read */
-	IO_DELAY,	/* mapping covers delalloc region */
-	IO_UNWRITTEN,	/* mapping covers allocated but uninitialized data */
-	IO_NEW		/* just allocated */
-};
 
 /*
  * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
 	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
 	xfs_fsize_t		isize;
 
-	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-	ASSERT(ioend->io_type != IO_READ);
-
 	if (unlikely(ioend->io_error))
 		return 0;
 
@@ -244,10 +232,8 @@ xfs_end_io(
 	 * We might have to update the on-disk file size after extending
 	 * writes.
 	 */
-	if (ioend->io_type != IO_READ) {
-		error = xfs_setfilesize(ioend);
-		ASSERT(!error || error == EAGAIN);
-	}
+	error = xfs_setfilesize(ioend);
+	ASSERT(!error || error == EAGAIN);
 
 	/*
 	 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
 xfs_map_blocks(
 	struct inode		*inode,
 	loff_t			offset,
-	ssize_t			count,
 	struct xfs_bmbt_irec	*imap,
-	int			flags)
+	int			type,
+	int			nonblocking)
 {
-	int			nmaps = 1;
-	int			new = 0;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			count = 1 << inode->i_blkbits;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			bmapi_flags = XFS_BMAPI_ENTIRE;
+	int			nimaps = 1;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -XFS_ERROR(EIO);
+
+	if (type == IO_UNWRITTEN)
+		bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+		if (nonblocking)
+			return -XFS_ERROR(EAGAIN);
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+	}
+
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       (ip->i_df.if_flags & XFS_IFEXTENTS));
+	ASSERT(offset <= mp->m_maxioffset);
+
+	if (offset + count > mp->m_maxioffset)
+		count = mp->m_maxioffset - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+			  bmapi_flags,  NULL, 0, imap, &nimaps, NULL);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-	return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
+	if (error)
+		return -XFS_ERROR(error);
+
+	if (type == IO_DELALLOC &&
+	    (!nimaps || isnullstartblock(imap->br_startblock))) {
+		error = xfs_iomap_write_allocate(ip, offset, count, imap);
+		if (!error)
+			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+		return -XFS_ERROR(error);
+	}
+
+#ifdef DEBUG
+	if (type == IO_UNWRITTEN) {
+		ASSERT(nimaps);
+		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+	}
+#endif
+	if (nimaps)
+		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+	return 0;
 }
 
 STATIC int
@@ -378,28 +413,19 @@ xfs_submit_ioend_bio(
 	if (xfs_ioend_new_eof(ioend))
 		xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
 
-	submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
-		   WRITE_SYNC_PLUG : WRITE, bio);
-	ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
-	bio_put(bio);
+	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
 }
 
 STATIC struct bio *
 xfs_alloc_ioend_bio(
 	struct buffer_head	*bh)
 {
-	struct bio		*bio;
 	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
-
-	do {
-		bio = bio_alloc(GFP_NOIO, nvecs);
-		nvecs >>= 1;
-	} while (!bio);
+	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);
 
 	ASSERT(bio->bi_private == NULL);
 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
-	bio_get(bio);
 	return bio;
 }
 
@@ -470,9 +496,8 @@ xfs_submit_ioend(
 	/* Pass 1 - start writeback */
 	do {
 		next = ioend->io_list;
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
 			xfs_start_buffer_writeback(bh);
-		}
 	} while ((ioend = next) != NULL);
 
 	/* Pass 2 - submit I/O */
@@ -600,117 +625,13 @@ xfs_map_at_offset(
 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 
-	lock_buffer(bh);
 	xfs_map_buffer(inode, bh, imap, offset);
-	bh->b_bdev = xfs_find_bdev_for_inode(inode);
 	set_buffer_mapped(bh);
 	clear_buffer_delay(bh);
 	clear_buffer_unwritten(bh);
 }
 
 /*
- * Look for a page at index that is suitable for clustering.
- */
-STATIC unsigned int
-xfs_probe_page(
-	struct page		*page,
-	unsigned int		pg_offset)
-{
-	struct buffer_head	*bh, *head;
-	int			ret = 0;
-
-	if (PageWriteback(page))
-		return 0;
-	if (!PageDirty(page))
-		return 0;
-	if (!page->mapping)
-		return 0;
-	if (!page_has_buffers(page))
-		return 0;
-
-	bh = head = page_buffers(page);
-	do {
-		if (!buffer_uptodate(bh))
-			break;
-		if (!buffer_mapped(bh))
-			break;
-		ret += bh->b_size;
-		if (ret >= pg_offset)
-			break;
-	} while ((bh = bh->b_this_page) != head);
-
-	return ret;
-}
-
-STATIC size_t
-xfs_probe_cluster(
-	struct inode		*inode,
-	struct page		*startpage,
-	struct buffer_head	*bh,
-	struct buffer_head	*head)
-{
-	struct pagevec		pvec;
-	pgoff_t			tindex, tlast, tloff;
-	size_t			total = 0;
-	int			done = 0, i;
-
-	/* First sum forwards in this page */
-	do {
-		if (!buffer_uptodate(bh) || !buffer_mapped(bh))
-			return total;
-		total += bh->b_size;
-	} while ((bh = bh->b_this_page) != head);
-
-	/* if we reached the end of the page, sum forwards in following pages */
-	tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-	tindex = startpage->index + 1;
-
-	/* Prune this back to avoid pathological behavior */
-	tloff = min(tlast, startpage->index + 64);
-
-	pagevec_init(&pvec, 0);
-	while (!done && tindex <= tloff) {
-		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-			break;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			size_t pg_offset, pg_len = 0;
-
-			if (tindex == tlast) {
-				pg_offset =
-				    i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
-				if (!pg_offset) {
-					done = 1;
-					break;
-				}
-			} else
-				pg_offset = PAGE_CACHE_SIZE;
-
-			if (page->index == tindex && trylock_page(page)) {
-				pg_len = xfs_probe_page(page, pg_offset);
-				unlock_page(page);
-			}
-
-			if (!pg_len) {
-				done = 1;
-				break;
-			}
-
-			total += pg_len;
-			tindex++;
-		}
-
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-
-	return total;
-}
-
-/*
  * Test if a given page is suitable for writing as part of an unwritten
  * or delayed allocate extent.
  */
@@ -731,9 +652,9 @@ xfs_is_delayed_page(
 		if (buffer_unwritten(bh))
 			acceptable = (type == IO_UNWRITTEN);
 		else if (buffer_delay(bh))
-			acceptable = (type == IO_DELAY);
+			acceptable = (type == IO_DELALLOC);
 		else if (buffer_dirty(bh) && buffer_mapped(bh))
-			acceptable = (type == IO_NEW);
+			acceptable = (type == IO_OVERWRITE);
 		else
 			break;
 	} while ((bh = bh->b_this_page) != head);
@@ -758,8 +679,7 @@ xfs_convert_page(
 	loff_t			tindex,
 	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
-	struct writeback_control *wbc,
-	int			all_bh)
+	struct writeback_control *wbc)
 {
 	struct buffer_head	*bh, *head;
 	xfs_off_t		end_offset;
@@ -814,37 +734,30 @@ xfs_convert_page(
 			continue;
 		}
 
-		if (buffer_unwritten(bh) || buffer_delay(bh)) {
+		if (buffer_unwritten(bh) || buffer_delay(bh) ||
+		    buffer_mapped(bh)) {
 			if (buffer_unwritten(bh))
 				type = IO_UNWRITTEN;
+			else if (buffer_delay(bh))
+				type = IO_DELALLOC;
 			else
-				type = IO_DELAY;
+				type = IO_OVERWRITE;
 
 			if (!xfs_imap_valid(inode, imap, offset)) {
 				done = 1;
 				continue;
 			}
 
-			ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-			ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
-			xfs_map_at_offset(inode, bh, imap, offset);
+			lock_buffer(bh);
+			if (type != IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, imap, offset);
 			xfs_add_to_ioend(inode, bh, offset, type,
 					 ioendp, done);
 
 			page_dirty--;
 			count++;
 		} else {
-			type = IO_NEW;
-			if (buffer_mapped(bh) && all_bh) {
-				lock_buffer(bh);
-				xfs_add_to_ioend(inode, bh, offset,
-						type, ioendp, done);
-				count++;
-				page_dirty--;
-			} else {
-				done = 1;
-			}
+			done = 1;
 		}
 	} while (offset += len, (bh = bh->b_this_page) != head);
 
@@ -876,7 +789,6 @@ xfs_cluster_write(
 	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
-	int			all_bh,
 	pgoff_t			tlast)
 {
 	struct pagevec		pvec;
@@ -891,7 +803,7 @@
 
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-					imap, ioendp, wbc, all_bh);
+					imap, ioendp, wbc);
 			if (done)
 				break;
 		}
@@ -934,83 +846,38 @@ xfs_aops_discard_page(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct buffer_head	*bh, *head;
 	loff_t			offset = page_offset(page);
-	ssize_t			len = 1 << inode->i_blkbits;
 
-	if (!xfs_is_delayed_page(page, IO_DELAY))
+	if (!xfs_is_delayed_page(page, IO_DELALLOC))
 		goto out_invalidate;
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		goto out_invalidate;
 
-	xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+	xfs_alert(ip->i_mount,
 		"page discard on page %p, inode 0x%llx, offset %llu.",
 			page, ip->i_ino, offset);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	bh = head = page_buffers(page);
 	do {
-		int		done;
-		xfs_fileoff_t	offset_fsb;
-		xfs_bmbt_irec_t	imap;
-		int		nimaps = 1;
 		int		error;
-		xfs_fsblock_t	firstblock;
-		xfs_bmap_free_t flist;
+		xfs_fileoff_t	start_fsb;
 
 		if (!buffer_delay(bh))
 			goto next_buffer;
 
-		offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
-
-		/*
-		 * Map the range first and check that it is a delalloc extent
-		 * before trying to unmap the range. Otherwise we will be
-		 * trying to remove a real extent (which requires a
-		 * transaction) or a hole, which is probably a bad idea...
-		 */
-		error = xfs_bmapi(NULL, ip, offset_fsb, 1,
-				XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
-				&nimaps, NULL);
-
-		if (error) {
-			/* something screwed, just bail */
-			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
-			"page discard failed delalloc mapping lookup.");
-			}
-			break;
-		}
-		if (!nimaps) {
-			/* nothing there */
-			goto next_buffer;
-		}
-		if (imap.br_startblock != DELAYSTARTBLOCK) {
-			/* been converted, ignore */
-			goto next_buffer;
-		}
-		WARN_ON(imap.br_blockcount == 0);
-
-		/*
-		 * Note: while we initialise the firstblock/flist pair, they
-		 * should never be used because blocks should never be
-		 * allocated or freed for a delalloc extent and hence we need
-		 * don't cancel or finish them after the xfs_bunmapi() call.
-		 */
-		xfs_bmap_init(&flist, &firstblock);
-		error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
-					&flist, &done);
-
-		ASSERT(!flist.xbf_count && !flist.xbf_first);
+		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
 		if (error) {
 			/* something screwed, just bail */
 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+				xfs_alert(ip->i_mount,
 			"page discard unable to remove delalloc mapping.");
 			}
 			break;
 		}
 next_buffer:
-		offset += len;
+		offset += 1 << inode->i_blkbits;
 
 	} while ((bh = bh->b_this_page) != head);
 
@@ -1047,10 +914,10 @@ xfs_vm_writepage(
 	unsigned int		type;
 	__uint64_t              end_offset;
 	pgoff_t                 end_index, last_index;
-	ssize_t			size, len;
-	int			flags, err, imap_valid = 0, uptodate = 1;
+	ssize_t			len;
+	int			err, imap_valid = 0, uptodate = 1;
 	int			count = 0;
-	int			all_bh = 0;
+	int			nonblocking = 0;
 
 	trace_xfs_writepage(inode, page, 0);
 
@@ -1101,110 +968,78 @@
 
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
-	flags = BMAPI_READ;
-	type = IO_NEW;
+	type = IO_OVERWRITE;
+
+	if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+		nonblocking = 1;
 
 	do {
+		int new_ioend = 0;
+
 		if (offset >= end_offset)
 			break;
 		if (!buffer_uptodate(bh))
 			uptodate = 0;
 
 		/*
-		 * A hole may still be marked uptodate because discard_buffer
-		 * leaves the flag set.
+		 * set_page_dirty dirties all buffers in a page, independent
+		 * of their state.  The dirty state however is entirely
+		 * meaningless for holes (!mapped && uptodate), so skip
+		 * buffers covering holes here.
 		 */
 		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-			ASSERT(!buffer_dirty(bh));
 			imap_valid = 0;
 			continue;
 		}
 
-		if (imap_valid)
-			imap_valid = xfs_imap_valid(inode, &imap, offset);
-
-		if (buffer_unwritten(bh) || buffer_delay(bh)) {
-			int new_ioend = 0;
-
-			/*
-			 * Make sure we don't use a read-only iomap
-			 */
-			if (flags == BMAPI_READ)
-				imap_valid = 0;
-
-			if (buffer_unwritten(bh)) {
+		if (buffer_unwritten(bh)) {
+			if (type != IO_UNWRITTEN) {
 				type = IO_UNWRITTEN;
-				flags = BMAPI_WRITE | BMAPI_IGNSTATE;
-			} else if (buffer_delay(bh)) {
-				type = IO_DELAY;
-				flags = BMAPI_ALLOCATE;
-
-				if (wbc->sync_mode == WB_SYNC_NONE &&
-				    wbc->nonblocking)
-					flags |= BMAPI_TRYLOCK;
-			}
-
-			if (!imap_valid) {
-				/*
-				 * If we didn't have a valid mapping then we
-				 * need to ensure that we put the new mapping
-				 * in a new ioend structure. This needs to be
-				 * done to ensure that the ioends correctly
-				 * reflect the block mappings at io completion
-				 * for unwritten extent conversion.
-				 */
-				new_ioend = 1;
-				err = xfs_map_blocks(inode, offset, len,
-						&imap, flags);
-				if (err)
-					goto error;
-				imap_valid = xfs_imap_valid(inode, &imap,
-							offset);
+				imap_valid = 0;
 			}
-			if (imap_valid) {
-				xfs_map_at_offset(inode, bh, &imap, offset);
-				xfs_add_to_ioend(inode, bh, offset, type,
-						 &ioend, new_ioend);
-				count++;
+		} else if (buffer_delay(bh)) {
+			if (type != IO_DELALLOC) {
+				type = IO_DELALLOC;
+				imap_valid = 0;
 			}
 		} else if (buffer_uptodate(bh)) {
-			/*
-			 * we got here because the buffer is already mapped.
-			 * That means it must already have extents allocated
-			 * underneath it. Map the extent by reading it.
-			 */
-			if (!imap_valid || flags != BMAPI_READ) {
-				flags = BMAPI_READ;
-				size = xfs_probe_cluster(inode, page, bh, head);
-				err = xfs_map_blocks(inode, offset, size,
-						&imap, flags);
-				if (err)
-					goto error;
-				imap_valid = xfs_imap_valid(inode, &imap,
-							offset);
+			if (type != IO_OVERWRITE) {
+				type = IO_OVERWRITE;
+				imap_valid = 0;
+			}
+		} else {
+			if (PageUptodate(page)) {
+				ASSERT(buffer_mapped(bh));
+				imap_valid = 0;
 			}
+			continue;
+		}
 
+		if (imap_valid)
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		if (!imap_valid) {
 			/*
-			 * We set the type to IO_NEW in case we are doing a
-			 * small write at EOF that is extending the file but
-			 * without needing an allocation. We need to update the
-			 * file size on I/O completion in this case so it is
-			 * the same case as having just allocated a new extent
-			 * that we are writing into for the first time.
+			 * If we didn't have a valid mapping then we need to
+			 * put the new mapping into a separate ioend structure.
+			 * This ensures non-contiguous extents always have
+			 * separate ioends, which is particularly important
+			 * for unwritten extent conversion at I/O completion
+			 * time.
 			 */
-			type = IO_NEW;
-			if (trylock_buffer(bh)) {
-				if (imap_valid)
-					all_bh = 1;
-				xfs_add_to_ioend(inode, bh, offset, type,
-						&ioend, !imap_valid);
-				count++;
-			} else {
-				imap_valid = 0;
-			}
-		} else if (PageUptodate(page)) {
-			ASSERT(buffer_mapped(bh));
-			imap_valid = 0;
+			new_ioend = 1;
+			err = xfs_map_blocks(inode, offset, &imap, type,
+					     nonblocking);
+			if (err)
+				goto error;
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		}
+		if (imap_valid) {
+			lock_buffer(bh);
+			if (type != IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, &imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+					 new_ioend);
+			count++;
 		}
 
 		if (!iohead)
@@ -1233,7 +1068,7 @@ xfs_vm_writepage(
 			end_index = last_index;
 
 		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-					wbc, all_bh, end_index);
+					wbc, end_index);
 	}
 
 	if (iohead)
@@ -1302,13 +1137,19 @@ __xfs_get_blocks(
 	int			create,
 	int			direct)
 {
-	int			flags = create ? BMAPI_WRITE : BMAPI_READ;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			lockmode = 0;
 	struct xfs_bmbt_irec	imap;
+	int			nimaps = 1;
 	xfs_off_t		offset;
 	ssize_t			size;
-	int			nimap = 1;
 	int			new = 0;
-	int			error;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -XFS_ERROR(EIO);
 
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1317,15 +1158,45 @@
 	if (!create && direct && offset >= i_size_read(inode))
 		return 0;
 
-	if (direct && create)
-		flags |= BMAPI_DIRECT;
+	if (create) {
+		lockmode = XFS_ILOCK_EXCL;
+		xfs_ilock(ip, lockmode);
+	} else {
+		lockmode = xfs_ilock_map_shared(ip);
+	}
 
-	error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
-			  &new);
+	ASSERT(offset <= mp->m_maxioffset);
+	if (offset + size > mp->m_maxioffset)
+		size = mp->m_maxioffset - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+	error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+			  XFS_BMAPI_ENTIRE,  NULL, 0, &imap, &nimaps, NULL);
 	if (error)
-		return -error;
-	if (nimap == 0)
-		return 0;
+		goto out_unlock;
+
+	if (create &&
+	    (!nimaps ||
+	     (imap.br_startblock == HOLESTARTBLOCK ||
+	      imap.br_startblock == DELAYSTARTBLOCK))) {
+		if (direct) {
+			error = xfs_iomap_write_direct(ip, offset, size,
+						       &imap, nimaps);
+		} else {
+			error = xfs_iomap_write_delay(ip, offset, size, &imap);
+		}
+		if (error)
+			goto out_unlock;
+
+		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+	} else if (nimaps) {
+		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+	} else {
+		trace_xfs_get_blocks_notfound(ip, offset, size);
+		goto out_unlock;
+	}
+	xfs_iunlock(ip, lockmode);
 
 	if (imap.br_startblock != HOLESTARTBLOCK &&
 	    imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1392,6 +1263,10 @@ __xfs_get_blocks(
 	}
 
 	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, lockmode);
+	return -error;
 }
 
 int
@@ -1420,7 +1295,7 @@ xfs_get_blocks_direct(
  * If the private argument is non-NULL __xfs_get_blocks signals us that we
  * need to issue a transaction to convert the range from unwritten to written
  * extents.  In case this is regular synchronous I/O we just call xfs_end_io
- * to do this and we are done.  But in case this was a successfull AIO
+ * to do this and we are done.  But in case this was a successful AIO
  * request this handler is called from interrupt context, from which we
  * can't start transactions.  In that case offload the I/O completion to
  * the workqueues we also use for buffered I/O completion.
@@ -1479,7 +1354,7 @@ xfs_vm_direct_IO(
 	ssize_t			ret;
 
 	if (rw & WRITE) {
-		iocb->private = xfs_alloc_ioend(inode, IO_NEW);
+		iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
 
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					   offset, nr_segs,
@@ -1505,11 +1380,42 @@ xfs_vm_write_failed(
 	struct inode	*inode = mapping->host;
 
 	if (to > inode->i_size) {
-		struct iattr ia = {
-			.ia_valid	= ATTR_SIZE | ATTR_FORCE,
-			.ia_size	= inode->i_size,
-		};
-		xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
+		/*
+		 * punch out the delalloc blocks we have already allocated. We
+		 * don't call xfs_setattr() to do this as we may be in the
+		 * middle of a multi-iovec write and so the vfs inode->i_size
+		 * will not match the xfs ip->i_size and so it will zero too
+		 * much. Hence we just truncate the page cache to zero what is
+		 * necessary and punch the delalloc blocks directly.
+		 */
+		struct xfs_inode	*ip = XFS_I(inode);
+		xfs_fileoff_t		start_fsb;
+		xfs_fileoff_t		end_fsb;
+		int			error;
+
+		truncate_pagecache(inode, to, inode->i_size);
+
+		/*
+		 * Check if there are any blocks that are outside of i_size
+		 * that need to be trimmed back.
+		 */
+		start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+		end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+		if (end_fsb <= start_fsb)
+			return;
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+							end_fsb - start_fsb);
+		if (error) {
+			/* something screwed, just bail */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_alert(ip->i_mount,
+			"xfs_vm_write_failed: unable to clean up ino %lld",
+						ip->i_ino);
+			}
+		}
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 }
 
@@ -1588,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
-	.sync_page		= block_sync_page,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
 /*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+	IO_DIRECT = 0,	/* special case for direct I/O ioends */
+	IO_DELALLOC,	/* mapping covers delalloc region */
+	IO_UNWRITTEN,	/* mapping covers allocated but uninitialized data */
+	IO_OVERWRITE,	/* mapping covers already allocated extent */
+};
+
+#define XFS_IO_TYPES \
+	{ 0,			"" }, \
+	{ IO_DELALLOC,		"delalloc" }, \
+	{ IO_UNWRITTEN,		"unwritten" }, \
+	{ IO_OVERWRITE,		"overwrite" }
+
+/*
  * xfs_ioend struct manages large extent writes for XFS.
  * It can manage several multi-page bio's at once.
  */
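
The XFS_IO_TYPES list added above pairs each ioend type with a printable name; in the kernel it feeds the tracepoint formatter (__print_symbolic()). A self-contained sketch of the same value-to-name lookup (the io_type_name() helper is illustrative only, not something the patch adds):

	#include <stdio.h>

	enum { IO_DIRECT = 0, IO_DELALLOC, IO_UNWRITTEN, IO_OVERWRITE };

	/* Table mirroring the XFS_IO_TYPES pairs above. */
	static const struct { int type; const char *name; } xfs_io_types[] = {
		{ IO_DIRECT,	"" },
		{ IO_DELALLOC,	"delalloc" },
		{ IO_UNWRITTEN,	"unwritten" },
		{ IO_OVERWRITE,	"overwrite" },
	};

	static const char *io_type_name(int type)
	{
		size_t i;

		for (i = 0; i < sizeof(xfs_io_types) / sizeof(xfs_io_types[0]); i++)
			if (xfs_io_types[i].type == type)
				return xfs_io_types[i].name;
		return "?";
	}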
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..5e68099db2a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,7 +33,6 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
-#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
@@ -44,12 +43,7 @@
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
-static struct shrinker xfs_buf_shake = {
-	.shrink = xfsbufd_wakeup,
-	.seeks = DEFAULT_SEEKS,
-};
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -99,77 +93,79 @@ xfs_buf_vmap_len(
 }
 
 /*
- * Page Region interfaces.
+ * xfs_buf_lru_add - add a buffer to the LRU.
  *
- * For pages in filesystems where the blocksize is smaller than the
- * pagesize, we use the page->private field (long) to hold a bitmap
- * of uptodate regions within the page.
- *
- * Each such region is "bytes per page / bits per long" bytes long.
- *
- * NBPPR == number-of-bytes-per-page-region
- * BTOPR == bytes-to-page-region (rounded up)
- * BTOPRT == bytes-to-page-region-truncated (rounded down)
+ * The LRU takes a new reference to the buffer so that it will only be freed
+ * once the shrinker takes the buffer off the LRU.
  */
-#if (BITS_PER_LONG == 32)
-#define PRSHIFT		(PAGE_CACHE_SHIFT - 5)	/* (32 == 1<<5) */
-#elif (BITS_PER_LONG == 64)
-#define PRSHIFT		(PAGE_CACHE_SHIFT - 6)	/* (64 == 1<<6) */
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-#define NBPPR		(PAGE_CACHE_SIZE/BITS_PER_LONG)
-#define BTOPR(b)	(((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
-#define BTOPRT(b)	(((unsigned int)(b) >> PRSHIFT))
-
-STATIC unsigned long
-page_region_mask(
-	size_t		offset,
-	size_t		length)
+STATIC void
+xfs_buf_lru_add(
+	struct xfs_buf	*bp)
 {
-	unsigned long	mask;
-	int		first, final;
-
-	first = BTOPR(offset);
-	final = BTOPRT(offset + length - 1);
-	first = min(first, final);
-
-	mask = ~0UL;
-	mask <<= BITS_PER_LONG - (final - first);
-	mask >>= BITS_PER_LONG - (final);
-
-	ASSERT(offset + length <= PAGE_CACHE_SIZE);
-	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
+	struct xfs_buftarg *btp = bp->b_target;
 
-	return mask;
+	spin_lock(&btp->bt_lru_lock);
+	if (list_empty(&bp->b_lru)) {
+		atomic_inc(&bp->b_hold);
+		list_add_tail(&bp->b_lru, &btp->bt_lru);
+		btp->bt_lru_nr++;
+	}
+	spin_unlock(&btp->bt_lru_lock);
 }
 
+/*
+ * xfs_buf_lru_del - remove a buffer from the LRU
+ *
+ * The unlocked check is safe here because it only occurs when there are not
+ * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
+ * to optimise the shrinker removing the buffer from the LRU and calling
+ * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
+ * bt_lru_lock.
+ */
 STATIC void
-set_page_region(
-	struct page	*page,
-	size_t		offset,
-	size_t		length)
+xfs_buf_lru_del(
+	struct xfs_buf	*bp)
 {
-	set_page_private(page,
-		page_private(page) | page_region_mask(offset, length));
-	if (page_private(page) == ~0UL)
-		SetPageUptodate(page);
-}
+	struct xfs_buftarg *btp = bp->b_target;
 
-STATIC int
-test_page_region(
-	struct page	*page,
-	size_t		offset,
-	size_t		length)
-{
-	unsigned long	mask = page_region_mask(offset, length);
+	if (list_empty(&bp->b_lru))
+		return;
 
-	return (mask && (page_private(page) & mask) == mask);
+	spin_lock(&btp->bt_lru_lock);
+	if (!list_empty(&bp->b_lru)) {
+		list_del_init(&bp->b_lru);
+		btp->bt_lru_nr--;
+	}
+	spin_unlock(&btp->bt_lru_lock);
 }
 
 /*
- * Internal xfs_buf_t object manipulation
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
  */
+void
+xfs_buf_stale(
+	struct xfs_buf	*bp)
+{
+	bp->b_flags |= XBF_STALE;
+	atomic_set(&(bp)->b_lru_ref, 0);
+	if (!list_empty(&bp->b_lru)) {
+		struct xfs_buftarg *btp = bp->b_target;
+
+		spin_lock(&btp->bt_lru_lock);
+		if (!list_empty(&bp->b_lru)) {
+			list_del_init(&bp->b_lru);
+			btp->bt_lru_nr--;
+			atomic_dec(&bp->b_hold);
+		}
+		spin_unlock(&btp->bt_lru_lock);
+	}
+	ASSERT(atomic_read(&bp->b_hold) >= 1);
+}
 
 STATIC void
 _xfs_buf_initialize(
@@ -186,10 +182,12 @@ _xfs_buf_initialize(
 
 	memset(bp, 0, sizeof(xfs_buf_t));
 	atomic_set(&bp->b_hold, 1);
+	atomic_set(&bp->b_lru_ref, 1);
 	init_completion(&bp->b_iowait);
+	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_list);
-	INIT_LIST_HEAD(&bp->b_hash_list);
-	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
+	RB_CLEAR_NODE(&bp->b_rbnode);
+	sema_init(&bp->b_sema, 0); /* held, no waiters */
 	XB_SET_OWNER(bp);
 	bp->b_target = target;
 	bp->b_file_offset = range_base;
@@ -262,9 +260,9 @@ xfs_buf_free(
 {
 	trace_xfs_buf_free(bp, _RET_IP_);
 
-	ASSERT(list_empty(&bp->b_hash_list));
+	ASSERT(list_empty(&bp->b_lru));
 
-	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
+	if (bp->b_flags & _XBF_PAGES) {
 		uint		i;
 
 		if (xfs_buf_is_vmapped(bp))
@@ -274,56 +272,77 @@ xfs_buf_free(
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
 
-			if (bp->b_flags & _XBF_PAGE_CACHE)
-				ASSERT(!PagePrivate(page));
-			page_cache_release(page);
+			__free_page(page);
 		}
-	}
+	} else if (bp->b_flags & _XBF_KMEM)
+		kmem_free(bp->b_addr);
 	_xfs_buf_free_pages(bp);
 	xfs_buf_deallocate(bp);
 }
 
 /*
- * Finds all pages for buffer in question and builds it's page list.
+ * Allocates all the pages for buffer in question and builds it's page list.
  */
 STATIC int
-_xfs_buf_lookup_pages(
+xfs_buf_allocate_memory(
 	xfs_buf_t		*bp,
 	uint			flags)
 {
-	struct address_space	*mapping = bp->b_target->bt_mapping;
-	size_t			blocksize = bp->b_target->bt_bsize;
 	size_t			size = bp->b_count_desired;
 	size_t			nbytes, offset;
 	gfp_t			gfp_mask = xb_to_gfp(flags);
 	unsigned short		page_count, i;
-	pgoff_t			first;
 	xfs_off_t		end;
 	int			error;
 
+	/*
+	 * for buffers that are contained within a single page, just allocate
+	 * the memory from the heap - there's no need for the complexity of
+	 * page arrays to keep allocation down to order 0.
+	 */
+	if (bp->b_buffer_length < PAGE_SIZE) {
+		bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
+		if (!bp->b_addr) {
+			/* low memory - use alloc_page loop instead */
+			goto use_alloc_page;
+		}
+
+		if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
+								PAGE_MASK) !=
+		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
+			/* b_addr spans two pages - use alloc_page instead */
+			kmem_free(bp->b_addr);
+			bp->b_addr = NULL;
+			goto use_alloc_page;
+		}
+		bp->b_offset = offset_in_page(bp->b_addr);
+		bp->b_pages = bp->b_page_array;
+		bp->b_pages[0] = virt_to_page(bp->b_addr);
+		bp->b_page_count = 1;
+		bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
+		return 0;
+	}
+
+use_alloc_page:
 	end = bp->b_file_offset + bp->b_buffer_length;
 	page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
-
 	error = _xfs_buf_get_pages(bp, page_count, flags);
 	if (unlikely(error))
 		return error;
-	bp->b_flags |= _XBF_PAGE_CACHE;
 
 	offset = bp->b_offset;
-	first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
+	bp->b_flags |= _XBF_PAGES;
 
 	for (i = 0; i < bp->b_page_count; i++) {
 		struct page	*page;
 		uint		retries = 0;
-
-	retry:
-		page = find_or_create_page(mapping, first + i, gfp_mask);
+retry:
+		page = alloc_page(gfp_mask);
 		if (unlikely(page == NULL)) {
 			if (flags & XBF_READ_AHEAD) {
 				bp->b_page_count = i;
-				for (i = 0; i < bp->b_page_count; i++)
-					unlock_page(bp->b_pages[i]);
-				return -ENOMEM;
+				error = ENOMEM;
+				goto out_free_pages;
 			}
 
 			/*
@@ -333,65 +352,55 @@ _xfs_buf_lookup_pages( | |||
333 | * handle buffer allocation failures we can't do much. | 352 | * handle buffer allocation failures we can't do much. |
334 | */ | 353 | */ |
335 | if (!(++retries % 100)) | 354 | if (!(++retries % 100)) |
336 | printk(KERN_ERR | 355 | xfs_err(NULL, |
337 | "XFS: possible memory allocation " | 356 | "possible memory allocation deadlock in %s (mode:0x%x)", |
338 | "deadlock in %s (mode:0x%x)\n", | ||
339 | __func__, gfp_mask); | 357 | __func__, gfp_mask); |
340 | 358 | ||
341 | XFS_STATS_INC(xb_page_retries); | 359 | XFS_STATS_INC(xb_page_retries); |
342 | xfsbufd_wakeup(NULL, 0, gfp_mask); | ||
343 | congestion_wait(BLK_RW_ASYNC, HZ/50); | 360 | congestion_wait(BLK_RW_ASYNC, HZ/50); |
344 | goto retry; | 361 | goto retry; |
345 | } | 362 | } |
346 | 363 | ||
347 | XFS_STATS_INC(xb_page_found); | 364 | XFS_STATS_INC(xb_page_found); |
348 | 365 | ||
349 | nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); | 366 | nbytes = min_t(size_t, size, PAGE_SIZE - offset); |
350 | size -= nbytes; | 367 | size -= nbytes; |
351 | |||
352 | ASSERT(!PagePrivate(page)); | ||
353 | if (!PageUptodate(page)) { | ||
354 | page_count--; | ||
355 | if (blocksize >= PAGE_CACHE_SIZE) { | ||
356 | if (flags & XBF_READ) | ||
357 | bp->b_flags |= _XBF_PAGE_LOCKED; | ||
358 | } else if (!PagePrivate(page)) { | ||
359 | if (test_page_region(page, offset, nbytes)) | ||
360 | page_count++; | ||
361 | } | ||
362 | } | ||
363 | |||
364 | bp->b_pages[i] = page; | 368 | bp->b_pages[i] = page; |
365 | offset = 0; | 369 | offset = 0; |
366 | } | 370 | } |
371 | return 0; | ||
367 | 372 | ||
368 | if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { | 373 | out_free_pages: |
369 | for (i = 0; i < bp->b_page_count; i++) | 374 | for (i = 0; i < bp->b_page_count; i++) |
370 | unlock_page(bp->b_pages[i]); | 375 | __free_page(bp->b_pages[i]); |
371 | } | ||
372 | |||
373 | if (page_count == bp->b_page_count) | ||
374 | bp->b_flags |= XBF_DONE; | ||
375 | |||
376 | return error; | 376 | return error; |
377 | } | 377 | } |
378 | 378 | ||
379 | /* | 379 | /* |
380 | * Map buffer into kernel address-space if nessecary. | 380 | * Map buffer into kernel address-space if necessary. |
381 | */ | 381 | */ |
382 | STATIC int | 382 | STATIC int |
383 | _xfs_buf_map_pages( | 383 | _xfs_buf_map_pages( |
384 | xfs_buf_t *bp, | 384 | xfs_buf_t *bp, |
385 | uint flags) | 385 | uint flags) |
386 | { | 386 | { |
387 | /* A single page buffer is always mappable */ | 387 | ASSERT(bp->b_flags & _XBF_PAGES); |
388 | if (bp->b_page_count == 1) { | 388 | if (bp->b_page_count == 1) { |
389 | /* A single page buffer is always mappable */ | ||
389 | bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; | 390 | bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; |
390 | bp->b_flags |= XBF_MAPPED; | 391 | bp->b_flags |= XBF_MAPPED; |
391 | } else if (flags & XBF_MAPPED) { | 392 | } else if (flags & XBF_MAPPED) { |
392 | bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, | 393 | int retried = 0; |
393 | -1, PAGE_KERNEL); | 394 | |
394 | if (unlikely(bp->b_addr == NULL)) | 395 | do { |
396 | bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, | ||
397 | -1, PAGE_KERNEL); | ||
398 | if (bp->b_addr) | ||
399 | break; | ||
400 | vm_unmap_aliases(); | ||
401 | } while (retried++ <= 1); | ||
402 | |||
403 | if (!bp->b_addr) | ||
395 | return -ENOMEM; | 404 | return -ENOMEM; |
396 | bp->b_addr += bp->b_offset; | 405 | bp->b_addr += bp->b_offset; |
397 | bp->b_flags |= XBF_MAPPED; | 406 | bp->b_flags |= XBF_MAPPED; |
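The retry around vm_map_ram() exists because the call can fail transiently: lazily-freed vmap areas still hold virtual address space until they are purged. vm_unmap_aliases() forces that purge, so one retry after it usually succeeds. The idiom in isolation, as a sketch (helper name illustrative; assumes <linux/vmalloc.h>):

	static void *
	map_pages_retry(struct page **pages, int page_count)
	{
		void	*addr;
		int	retried = 0;

		do {
			/* -1 == allocate vmap space from any NUMA node */
			addr = vm_map_ram(pages, page_count, -1, PAGE_KERNEL);
			if (addr)
				return addr;
			/* purge lazily-freed vmap areas, then try once more */
			vm_unmap_aliases();
		} while (retried++ <= 1);
		return NULL;
	}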
@@ -422,8 +431,10 @@ _xfs_buf_find( | |||
422 | { | 431 | { |
423 | xfs_off_t range_base; | 432 | xfs_off_t range_base; |
424 | size_t range_length; | 433 | size_t range_length; |
425 | xfs_bufhash_t *hash; | 434 | struct xfs_perag *pag; |
426 | xfs_buf_t *bp, *n; | 435 | struct rb_node **rbp; |
436 | struct rb_node *parent; | ||
437 | xfs_buf_t *bp; | ||
427 | 438 | ||
428 | range_base = (ioff << BBSHIFT); | 439 | range_base = (ioff << BBSHIFT); |
429 | range_length = (isize << BBSHIFT); | 440 | range_length = (isize << BBSHIFT); |
@@ -432,14 +443,37 @@ _xfs_buf_find( | |||
432 | ASSERT(!(range_length < (1 << btp->bt_sshift))); | 443 | ASSERT(!(range_length < (1 << btp->bt_sshift))); |
433 | ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); | 444 | ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); |
434 | 445 | ||
435 | hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; | 446 | /* get tree root */ |
436 | 447 | pag = xfs_perag_get(btp->bt_mount, | |
437 | spin_lock(&hash->bh_lock); | 448 | xfs_daddr_to_agno(btp->bt_mount, ioff)); |
438 | 449 | ||
439 | list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { | 450 | /* walk tree */ |
440 | ASSERT(btp == bp->b_target); | 451 | spin_lock(&pag->pag_buf_lock); |
441 | if (bp->b_file_offset == range_base && | 452 | rbp = &pag->pag_buf_tree.rb_node; |
442 | bp->b_buffer_length == range_length) { | 453 | parent = NULL; |
454 | bp = NULL; | ||
455 | while (*rbp) { | ||
456 | parent = *rbp; | ||
457 | bp = rb_entry(parent, struct xfs_buf, b_rbnode); | ||
458 | |||
459 | if (range_base < bp->b_file_offset) | ||
460 | rbp = &(*rbp)->rb_left; | ||
461 | else if (range_base > bp->b_file_offset) | ||
462 | rbp = &(*rbp)->rb_right; | ||
463 | else { | ||
464 | /* | ||
465 | * found a block offset match. If the range doesn't | ||
466 | * match, the only way this is allowed is if the buffer | ||
467 | * in the cache is stale and the transaction that made | ||
468 | * it stale has not yet committed. i.e. we are | ||
469 | * reallocating a busy extent. Skip this buffer and | ||
470 | * continue searching to the right for an exact match. | ||
471 | */ | ||
472 | if (bp->b_buffer_length != range_length) { | ||
473 | ASSERT(bp->b_flags & XBF_STALE); | ||
474 | rbp = &(*rbp)->rb_right; | ||
475 | continue; | ||
476 | } | ||
443 | atomic_inc(&bp->b_hold); | 477 | atomic_inc(&bp->b_hold); |
444 | goto found; | 478 | goto found; |
445 | } | 479 | } |
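The walk above is the canonical rbtree search-and-insert idiom: remember the parent and the child slot while descending, so a miss can link the new node at exactly the point where the search fell off the tree, without a second traversal. Stripped of the stale-buffer special case, the shape is roughly:

	struct rb_root	*root = &pag->pag_buf_tree;
	struct rb_node	**rbp = &root->rb_node;
	struct rb_node	*parent = NULL;
	struct xfs_buf	*bp;

	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
		if (range_base < bp->b_file_offset)
			rbp = &(*rbp)->rb_left;
		else if (range_base > bp->b_file_offset)
			rbp = &(*rbp)->rb_right;
		else
			return bp;			/* cache hit */
	}
	/* cache miss: link the new node into the empty slot found */
	rb_link_node(&new_bp->b_rbnode, parent, rbp);
	rb_insert_color(&new_bp->b_rbnode, root);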
@@ -449,46 +483,42 @@ _xfs_buf_find( | |||
449 | if (new_bp) { | 483 | if (new_bp) { |
450 | _xfs_buf_initialize(new_bp, btp, range_base, | 484 | _xfs_buf_initialize(new_bp, btp, range_base, |
451 | range_length, flags); | 485 | range_length, flags); |
452 | new_bp->b_hash = hash; | 486 | rb_link_node(&new_bp->b_rbnode, parent, rbp); |
453 | list_add(&new_bp->b_hash_list, &hash->bh_list); | 487 | rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); |
488 | /* the buffer keeps the perag reference until it is freed */ | ||
489 | new_bp->b_pag = pag; | ||
490 | spin_unlock(&pag->pag_buf_lock); | ||
454 | } else { | 491 | } else { |
455 | XFS_STATS_INC(xb_miss_locked); | 492 | XFS_STATS_INC(xb_miss_locked); |
493 | spin_unlock(&pag->pag_buf_lock); | ||
494 | xfs_perag_put(pag); | ||
456 | } | 495 | } |
457 | |||
458 | spin_unlock(&hash->bh_lock); | ||
459 | return new_bp; | 496 | return new_bp; |
460 | 497 | ||
461 | found: | 498 | found: |
462 | spin_unlock(&hash->bh_lock); | 499 | spin_unlock(&pag->pag_buf_lock); |
500 | xfs_perag_put(pag); | ||
463 | 501 | ||
464 | /* Attempt to get the semaphore without sleeping, | 502 | if (xfs_buf_cond_lock(bp)) { |
465 | * if this does not work then we need to drop the | 503 | /* failed, so wait for the lock if requested. */ |
466 | * spinlock and do a hard attempt on the semaphore. | ||
467 | */ | ||
468 | if (down_trylock(&bp->b_sema)) { | ||
469 | if (!(flags & XBF_TRYLOCK)) { | 504 | if (!(flags & XBF_TRYLOCK)) { |
470 | /* wait for buffer ownership */ | ||
471 | xfs_buf_lock(bp); | 505 | xfs_buf_lock(bp); |
472 | XFS_STATS_INC(xb_get_locked_waited); | 506 | XFS_STATS_INC(xb_get_locked_waited); |
473 | } else { | 507 | } else { |
474 | /* We asked for a trylock and failed, no need | ||
475 | * to look at file offset and length here, we | ||
476 | * know that this buffer at least overlaps our | ||
477 | * buffer and is locked, therefore our buffer | ||
478 | * either does not exist, or is this buffer. | ||
479 | */ | ||
480 | xfs_buf_rele(bp); | 508 | xfs_buf_rele(bp); |
481 | XFS_STATS_INC(xb_busy_locked); | 509 | XFS_STATS_INC(xb_busy_locked); |
482 | return NULL; | 510 | return NULL; |
483 | } | 511 | } |
484 | } else { | ||
485 | /* trylock worked */ | ||
486 | XB_SET_OWNER(bp); | ||
487 | } | 512 | } |
488 | 513 | ||
514 | /* | ||
515 | * if the buffer is stale, clear all the external state associated with | ||
516 | * it. We need to keep flags such as how we allocated the buffer memory | ||
517 | * intact here. | ||
518 | */ | ||
489 | if (bp->b_flags & XBF_STALE) { | 519 | if (bp->b_flags & XBF_STALE) { |
490 | ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); | 520 | ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); |
491 | bp->b_flags &= XBF_MAPPED; | 521 | bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; |
492 | } | 522 | } |
493 | 523 | ||
494 | trace_xfs_buf_find(bp, flags, _RET_IP_); | 524 | trace_xfs_buf_find(bp, flags, _RET_IP_); |
@@ -509,7 +539,7 @@ xfs_buf_get( | |||
509 | xfs_buf_flags_t flags) | 539 | xfs_buf_flags_t flags) |
510 | { | 540 | { |
511 | xfs_buf_t *bp, *new_bp; | 541 | xfs_buf_t *bp, *new_bp; |
512 | int error = 0, i; | 542 | int error = 0; |
513 | 543 | ||
514 | new_bp = xfs_buf_allocate(flags); | 544 | new_bp = xfs_buf_allocate(flags); |
515 | if (unlikely(!new_bp)) | 545 | if (unlikely(!new_bp)) |
@@ -517,7 +547,7 @@ xfs_buf_get( | |||
517 | 547 | ||
518 | bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); | 548 | bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); |
519 | if (bp == new_bp) { | 549 | if (bp == new_bp) { |
520 | error = _xfs_buf_lookup_pages(bp, flags); | 550 | error = xfs_buf_allocate_memory(bp, flags); |
521 | if (error) | 551 | if (error) |
522 | goto no_buffer; | 552 | goto no_buffer; |
523 | } else { | 553 | } else { |
@@ -526,14 +556,11 @@ xfs_buf_get( | |||
526 | return NULL; | 556 | return NULL; |
527 | } | 557 | } |
528 | 558 | ||
529 | for (i = 0; i < bp->b_page_count; i++) | ||
530 | mark_page_accessed(bp->b_pages[i]); | ||
531 | |||
532 | if (!(bp->b_flags & XBF_MAPPED)) { | 559 | if (!(bp->b_flags & XBF_MAPPED)) { |
533 | error = _xfs_buf_map_pages(bp, flags); | 560 | error = _xfs_buf_map_pages(bp, flags); |
534 | if (unlikely(error)) { | 561 | if (unlikely(error)) { |
535 | printk(KERN_WARNING "%s: failed to map pages\n", | 562 | xfs_warn(target->bt_mount, |
536 | __func__); | 563 | "%s: failed to map pages\n", __func__); |
537 | goto no_buffer; | 564 | goto no_buffer; |
538 | } | 565 | } |
539 | } | 566 | } |
@@ -625,17 +652,47 @@ void | |||
625 | xfs_buf_readahead( | 652 | xfs_buf_readahead( |
626 | xfs_buftarg_t *target, | 653 | xfs_buftarg_t *target, |
627 | xfs_off_t ioff, | 654 | xfs_off_t ioff, |
628 | size_t isize, | 655 | size_t isize) |
629 | xfs_buf_flags_t flags) | ||
630 | { | 656 | { |
631 | struct backing_dev_info *bdi; | 657 | if (bdi_read_congested(target->bt_bdi)) |
632 | |||
633 | bdi = target->bt_mapping->backing_dev_info; | ||
634 | if (bdi_read_congested(bdi)) | ||
635 | return; | 658 | return; |
636 | 659 | ||
637 | flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); | 660 | xfs_buf_read(target, ioff, isize, |
638 | xfs_buf_read(target, ioff, isize, flags); | 661 | XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); |
662 | } | ||
663 | |||
664 | /* | ||
665 | * Read an uncached buffer from disk. Allocates and returns a locked | ||
666 | * buffer containing the disk contents or nothing. | ||
667 | */ | ||
668 | struct xfs_buf * | ||
669 | xfs_buf_read_uncached( | ||
670 | struct xfs_mount *mp, | ||
671 | struct xfs_buftarg *target, | ||
672 | xfs_daddr_t daddr, | ||
673 | size_t length, | ||
674 | int flags) | ||
675 | { | ||
676 | xfs_buf_t *bp; | ||
677 | int error; | ||
678 | |||
679 | bp = xfs_buf_get_uncached(target, length, flags); | ||
680 | if (!bp) | ||
681 | return NULL; | ||
682 | |||
683 | /* set up the buffer for a read IO */ | ||
684 | xfs_buf_lock(bp); | ||
685 | XFS_BUF_SET_ADDR(bp, daddr); | ||
686 | XFS_BUF_READ(bp); | ||
687 | XFS_BUF_BUSY(bp); | ||
688 | |||
689 | xfsbdstrat(mp, bp); | ||
690 | error = xfs_buf_iowait(bp); | ||
691 | if (error || bp->b_error) { | ||
692 | xfs_buf_relse(bp); | ||
693 | return NULL; | ||
694 | } | ||
695 | return bp; | ||
639 | } | 696 | } |
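A hypothetical call site, reading a single filesystem sector from the start of the data device. XFS_SB_DADDR, BBTOB() and XFS_FSS_TO_BB() are existing XFS helpers, but this exact usage is illustrative rather than taken from the commit:

	struct xfs_buf	*bp;

	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, XFS_SB_DADDR,
				   BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
	if (!bp)
		return EIO;
	/* ... inspect bp->b_addr ... */
	xfs_buf_relse(bp);		/* unlocks and drops the reference */

Because the buffer is uncached it never enters the per-AG rbtree (b_pag stays NULL), so the caller owns its whole lifetime and the final release frees it immediately.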
640 | 697 | ||
641 | xfs_buf_t * | 698 | xfs_buf_t * |
@@ -651,6 +708,27 @@ xfs_buf_get_empty( | |||
651 | return bp; | 708 | return bp; |
652 | } | 709 | } |
653 | 710 | ||
711 | /* | ||
712 | * Return a buffer allocated as an empty buffer and associated with external | ||
713 | * memory via xfs_buf_associate_memory() back to its empty state. | ||
714 | */ | ||
715 | void | ||
716 | xfs_buf_set_empty( | ||
717 | struct xfs_buf *bp, | ||
718 | size_t len) | ||
719 | { | ||
720 | if (bp->b_pages) | ||
721 | _xfs_buf_free_pages(bp); | ||
722 | |||
723 | bp->b_pages = NULL; | ||
724 | bp->b_page_count = 0; | ||
725 | bp->b_addr = NULL; | ||
726 | bp->b_file_offset = 0; | ||
727 | bp->b_buffer_length = bp->b_count_desired = len; | ||
728 | bp->b_bn = XFS_BUF_DADDR_NULL; | ||
729 | bp->b_flags &= ~XBF_MAPPED; | ||
730 | } | ||
731 | |||
654 | static inline struct page * | 732 | static inline struct page * |
655 | mem_to_page( | 733 | mem_to_page( |
656 | void *addr) | 734 | void *addr) |
@@ -675,10 +753,10 @@ xfs_buf_associate_memory( | |||
675 | size_t buflen; | 753 | size_t buflen; |
676 | int page_count; | 754 | int page_count; |
677 | 755 | ||
678 | pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; | 756 | pageaddr = (unsigned long)mem & PAGE_MASK; |
679 | offset = (unsigned long)mem - pageaddr; | 757 | offset = (unsigned long)mem - pageaddr; |
680 | buflen = PAGE_CACHE_ALIGN(len + offset); | 758 | buflen = PAGE_ALIGN(len + offset); |
681 | page_count = buflen >> PAGE_CACHE_SHIFT; | 759 | page_count = buflen >> PAGE_SHIFT; |
682 | 760 | ||
683 | /* Free any previous set of page pointers */ | 761 | /* Free any previous set of page pointers */ |
684 | if (bp->b_pages) | 762 | if (bp->b_pages) |
@@ -695,21 +773,21 @@ xfs_buf_associate_memory( | |||
695 | 773 | ||
696 | for (i = 0; i < bp->b_page_count; i++) { | 774 | for (i = 0; i < bp->b_page_count; i++) { |
697 | bp->b_pages[i] = mem_to_page((void *)pageaddr); | 775 | bp->b_pages[i] = mem_to_page((void *)pageaddr); |
698 | pageaddr += PAGE_CACHE_SIZE; | 776 | pageaddr += PAGE_SIZE; |
699 | } | 777 | } |
700 | 778 | ||
701 | bp->b_count_desired = len; | 779 | bp->b_count_desired = len; |
702 | bp->b_buffer_length = buflen; | 780 | bp->b_buffer_length = buflen; |
703 | bp->b_flags |= XBF_MAPPED; | 781 | bp->b_flags |= XBF_MAPPED; |
704 | bp->b_flags &= ~_XBF_PAGE_LOCKED; | ||
705 | 782 | ||
706 | return 0; | 783 | return 0; |
707 | } | 784 | } |
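A quick worked example of the rounding above, assuming PAGE_SIZE is 4096 (the values are made up for illustration):

	/*
	 * mem = 0x10f80, len = 9000:
	 *   pageaddr   = mem & PAGE_MASK          = 0x10000
	 *   offset     = mem - pageaddr           = 0xf80 (3968)
	 *   buflen     = PAGE_ALIGN(9000 + 3968)  = 16384
	 *   page_count = buflen >> PAGE_SHIFT     = 4
	 */

So a 9000-byte region that starts 3968 bytes into a page spans four pages, and b_pages[] gets one mem_to_page() entry per page.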
708 | 785 | ||
709 | xfs_buf_t * | 786 | xfs_buf_t * |
710 | xfs_buf_get_noaddr( | 787 | xfs_buf_get_uncached( |
788 | struct xfs_buftarg *target, | ||
711 | size_t len, | 789 | size_t len, |
712 | xfs_buftarg_t *target) | 790 | int flags) |
713 | { | 791 | { |
714 | unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; | 792 | unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; |
715 | int error, i; | 793 | int error, i; |
@@ -725,7 +803,7 @@ xfs_buf_get_noaddr( | |||
725 | goto fail_free_buf; | 803 | goto fail_free_buf; |
726 | 804 | ||
727 | for (i = 0; i < page_count; i++) { | 805 | for (i = 0; i < page_count; i++) { |
728 | bp->b_pages[i] = alloc_page(GFP_KERNEL); | 806 | bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); |
729 | if (!bp->b_pages[i]) | 807 | if (!bp->b_pages[i]) |
730 | goto fail_free_mem; | 808 | goto fail_free_mem; |
731 | } | 809 | } |
@@ -733,14 +811,14 @@ xfs_buf_get_noaddr( | |||
733 | 811 | ||
734 | error = _xfs_buf_map_pages(bp, XBF_MAPPED); | 812 | error = _xfs_buf_map_pages(bp, XBF_MAPPED); |
735 | if (unlikely(error)) { | 813 | if (unlikely(error)) { |
736 | printk(KERN_WARNING "%s: failed to map pages\n", | 814 | xfs_warn(target->bt_mount, |
737 | __func__); | 815 | "%s: failed to map pages\n", __func__); |
738 | goto fail_free_mem; | 816 | goto fail_free_mem; |
739 | } | 817 | } |
740 | 818 | ||
741 | xfs_buf_unlock(bp); | 819 | xfs_buf_unlock(bp); |
742 | 820 | ||
743 | trace_xfs_buf_get_noaddr(bp, _RET_IP_); | 821 | trace_xfs_buf_get_uncached(bp, _RET_IP_); |
744 | return bp; | 822 | return bp; |
745 | 823 | ||
746 | fail_free_mem: | 824 | fail_free_mem: |
@@ -774,29 +852,32 @@ void | |||
774 | xfs_buf_rele( | 852 | xfs_buf_rele( |
775 | xfs_buf_t *bp) | 853 | xfs_buf_t *bp) |
776 | { | 854 | { |
777 | xfs_bufhash_t *hash = bp->b_hash; | 855 | struct xfs_perag *pag = bp->b_pag; |
778 | 856 | ||
779 | trace_xfs_buf_rele(bp, _RET_IP_); | 857 | trace_xfs_buf_rele(bp, _RET_IP_); |
780 | 858 | ||
781 | if (unlikely(!hash)) { | 859 | if (!pag) { |
782 | ASSERT(!bp->b_relse); | 860 | ASSERT(list_empty(&bp->b_lru)); |
861 | ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); | ||
783 | if (atomic_dec_and_test(&bp->b_hold)) | 862 | if (atomic_dec_and_test(&bp->b_hold)) |
784 | xfs_buf_free(bp); | 863 | xfs_buf_free(bp); |
785 | return; | 864 | return; |
786 | } | 865 | } |
787 | 866 | ||
867 | ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); | ||
868 | |||
788 | ASSERT(atomic_read(&bp->b_hold) > 0); | 869 | ASSERT(atomic_read(&bp->b_hold) > 0); |
789 | if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { | 870 | if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { |
790 | if (bp->b_relse) { | 871 | if (!(bp->b_flags & XBF_STALE) && |
791 | atomic_inc(&bp->b_hold); | 872 | atomic_read(&bp->b_lru_ref)) { |
792 | spin_unlock(&hash->bh_lock); | 873 | xfs_buf_lru_add(bp); |
793 | (*(bp->b_relse)) (bp); | 874 | spin_unlock(&pag->pag_buf_lock); |
794 | } else if (bp->b_flags & XBF_FS_MANAGED) { | ||
795 | spin_unlock(&hash->bh_lock); | ||
796 | } else { | 875 | } else { |
876 | xfs_buf_lru_del(bp); | ||
797 | ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); | 877 | ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); |
798 | list_del_init(&bp->b_hash_list); | 878 | rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); |
799 | spin_unlock(&hash->bh_lock); | 879 | spin_unlock(&pag->pag_buf_lock); |
880 | xfs_perag_put(pag); | ||
800 | xfs_buf_free(bp); | 881 | xfs_buf_free(bp); |
801 | } | 882 | } |
802 | } | 883 | } |
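The release path pivots on atomic_dec_and_lock(), which takes pag_buf_lock only when the decrement hits zero, so dropping one of several references stays lock-free. Condensed (the real code above also handles LRU removal and the perag reference), the last-reference paths look like:

	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
		/* last reference gone; pag_buf_lock is now held */
		if (!(bp->b_flags & XBF_STALE) &&
		    atomic_read(&bp->b_lru_ref)) {
			xfs_buf_lru_add(bp);	/* keep cached for reuse */
			spin_unlock(&pag->pag_buf_lock);
		} else {
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_buf_free(bp);	/* free outside the lock */
		}
	}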
@@ -804,20 +885,15 @@ xfs_buf_rele( | |||
804 | 885 | ||
805 | 886 | ||
806 | /* | 887 | /* |
807 | * Mutual exclusion on buffers. Locking model: | 888 | * Lock a buffer object, if it is not already locked. |
808 | * | 889 | * |
809 | * Buffers associated with inodes for which buffer locking | 890 | * If we come across a stale, pinned, locked buffer, we know that we are |
810 | * is not enabled are not protected by semaphores, and are | 891 | * being asked to lock a buffer that has been reallocated. Because it is |
811 | * assumed to be exclusively owned by the caller. There is a | 892 | * pinned, we know that the log has not been pushed to disk and hence it |
812 | * spinlock in the buffer, used by the caller when concurrent | 893 | * will still be locked. Rather than continuing to have trylock attempts |
813 | * access is possible. | 894 | * fail until someone else pushes the log, push it ourselves before |
814 | */ | 895 | * returning. This means that the xfsaild will not get stuck trying |
815 | 896 | * to push on stale inode buffers. | |
816 | /* | ||
817 | * Locks a buffer object, if it is not already locked. | ||
818 | * Note that this in no way locks the underlying pages, so it is only | ||
819 | * useful for synchronizing concurrent use of buffer objects, not for | ||
820 | * synchronizing independent access to the underlying pages. | ||
821 | */ | 897 | */ |
822 | int | 898 | int |
823 | xfs_buf_cond_lock( | 899 | xfs_buf_cond_lock( |
@@ -828,6 +904,8 @@ xfs_buf_cond_lock( | |||
828 | locked = down_trylock(&bp->b_sema) == 0; | 904 | locked = down_trylock(&bp->b_sema) == 0; |
829 | if (locked) | 905 | if (locked) |
830 | XB_SET_OWNER(bp); | 906 | XB_SET_OWNER(bp); |
907 | else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) | ||
908 | xfs_log_force(bp->b_target->bt_mount, 0); | ||
831 | 909 | ||
832 | trace_xfs_buf_cond_lock(bp, _RET_IP_); | 910 | trace_xfs_buf_cond_lock(bp, _RET_IP_); |
833 | return locked ? 0 : -EBUSY; | 911 | return locked ? 0 : -EBUSY; |
@@ -841,10 +919,7 @@ xfs_buf_lock_value( | |||
841 | } | 919 | } |
842 | 920 | ||
843 | /* | 921 | /* |
844 | * Locks a buffer object. | 922 | * Lock a buffer object. |
845 | * Note that this in no way locks the underlying pages, so it is only | ||
846 | * useful for synchronizing concurrent use of buffer objects, not for | ||
847 | * synchronizing independent access to the underlying pages. | ||
848 | * | 923 | * |
849 | * If we come across a stale, pinned, locked buffer, we know that we | 924 | * If we come across a stale, pinned, locked buffer, we know that we |
850 | * are being asked to lock a buffer that has been reallocated. Because | 925 | * are being asked to lock a buffer that has been reallocated. Because |
@@ -859,9 +934,7 @@ xfs_buf_lock( | |||
859 | trace_xfs_buf_lock(bp, _RET_IP_); | 934 | trace_xfs_buf_lock(bp, _RET_IP_); |
860 | 935 | ||
861 | if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) | 936 | if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) |
862 | xfs_log_force(bp->b_mount, 0); | 937 | xfs_log_force(bp->b_target->bt_mount, 0); |
863 | if (atomic_read(&bp->b_io_remaining)) | ||
864 | blk_run_address_space(bp->b_target->bt_mapping); | ||
865 | down(&bp->b_sema); | 938 | down(&bp->b_sema); |
866 | XB_SET_OWNER(bp); | 939 | XB_SET_OWNER(bp); |
867 | 940 | ||
@@ -905,9 +978,7 @@ xfs_buf_wait_unpin( | |||
905 | set_current_state(TASK_UNINTERRUPTIBLE); | 978 | set_current_state(TASK_UNINTERRUPTIBLE); |
906 | if (atomic_read(&bp->b_pin_count) == 0) | 979 | if (atomic_read(&bp->b_pin_count) == 0) |
907 | break; | 980 | break; |
908 | if (atomic_read(&bp->b_io_remaining)) | 981 | io_schedule(); |
909 | blk_run_address_space(bp->b_target->bt_mapping); | ||
910 | schedule(); | ||
911 | } | 982 | } |
912 | remove_wait_queue(&bp->b_waiters, &wait); | 983 | remove_wait_queue(&bp->b_waiters, &wait); |
913 | set_current_state(TASK_RUNNING); | 984 | set_current_state(TASK_RUNNING); |
@@ -924,19 +995,7 @@ xfs_buf_iodone_work( | |||
924 | xfs_buf_t *bp = | 995 | xfs_buf_t *bp = |
925 | container_of(work, xfs_buf_t, b_iodone_work); | 996 | container_of(work, xfs_buf_t, b_iodone_work); |
926 | 997 | ||
927 | /* | 998 | if (bp->b_iodone) |
928 | * We can get an EOPNOTSUPP to ordered writes. Here we clear the | ||
929 | * ordered flag and reissue them. Because we can't tell the higher | ||
930 | * layers directly that they should not issue ordered I/O anymore, they | ||
931 | * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion. | ||
932 | */ | ||
933 | if ((bp->b_error == EOPNOTSUPP) && | ||
934 | (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { | ||
935 | trace_xfs_buf_ordered_retry(bp, _RET_IP_); | ||
936 | bp->b_flags &= ~XBF_ORDERED; | ||
937 | bp->b_flags |= _XFS_BARRIER_FAILED; | ||
938 | xfs_buf_iorequest(bp); | ||
939 | } else if (bp->b_iodone) | ||
940 | (*(bp->b_iodone))(bp); | 999 | (*(bp->b_iodone))(bp); |
941 | else if (bp->b_flags & XBF_ASYNC) | 1000 | else if (bp->b_flags & XBF_ASYNC) |
942 | xfs_buf_relse(bp); | 1001 | xfs_buf_relse(bp); |
@@ -982,7 +1041,6 @@ xfs_bwrite( | |||
982 | { | 1041 | { |
983 | int error; | 1042 | int error; |
984 | 1043 | ||
985 | bp->b_mount = mp; | ||
986 | bp->b_flags |= XBF_WRITE; | 1044 | bp->b_flags |= XBF_WRITE; |
987 | bp->b_flags &= ~(XBF_ASYNC | XBF_READ); | 1045 | bp->b_flags &= ~(XBF_ASYNC | XBF_READ); |
988 | 1046 | ||
@@ -1003,8 +1061,6 @@ xfs_bdwrite( | |||
1003 | { | 1061 | { |
1004 | trace_xfs_buf_bdwrite(bp, _RET_IP_); | 1062 | trace_xfs_buf_bdwrite(bp, _RET_IP_); |
1005 | 1063 | ||
1006 | bp->b_mount = mp; | ||
1007 | |||
1008 | bp->b_flags &= ~XBF_READ; | 1064 | bp->b_flags &= ~XBF_READ; |
1009 | bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); | 1065 | bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); |
1010 | 1066 | ||
@@ -1013,7 +1069,7 @@ xfs_bdwrite( | |||
1013 | 1069 | ||
1014 | /* | 1070 | /* |
1015 | * Called when we want to stop a buffer from getting written or read. | 1071 | * Called when we want to stop a buffer from getting written or read. |
1016 | * We attach the EIO error, muck with its flags, and call biodone | 1072 | * We attach the EIO error, muck with its flags, and call xfs_buf_ioend |
1017 | * so that the proper iodone callbacks get called. | 1073 | * so that the proper iodone callbacks get called. |
1018 | */ | 1074 | */ |
1019 | STATIC int | 1075 | STATIC int |
@@ -1030,21 +1086,21 @@ xfs_bioerror( | |||
1030 | XFS_BUF_ERROR(bp, EIO); | 1086 | XFS_BUF_ERROR(bp, EIO); |
1031 | 1087 | ||
1032 | /* | 1088 | /* |
1033 | * We're calling biodone, so delete XBF_DONE flag. | 1089 | * We're calling xfs_buf_ioend, so delete XBF_DONE flag. |
1034 | */ | 1090 | */ |
1035 | XFS_BUF_UNREAD(bp); | 1091 | XFS_BUF_UNREAD(bp); |
1036 | XFS_BUF_UNDELAYWRITE(bp); | 1092 | XFS_BUF_UNDELAYWRITE(bp); |
1037 | XFS_BUF_UNDONE(bp); | 1093 | XFS_BUF_UNDONE(bp); |
1038 | XFS_BUF_STALE(bp); | 1094 | XFS_BUF_STALE(bp); |
1039 | 1095 | ||
1040 | xfs_biodone(bp); | 1096 | xfs_buf_ioend(bp, 0); |
1041 | 1097 | ||
1042 | return EIO; | 1098 | return EIO; |
1043 | } | 1099 | } |
1044 | 1100 | ||
1045 | /* | 1101 | /* |
1046 | * Same as xfs_bioerror, except that we are releasing the buffer | 1102 | * Same as xfs_bioerror, except that we are releasing the buffer |
1047 | * here ourselves, and avoiding the biodone call. | 1103 | * here ourselves, and avoiding the xfs_buf_ioend call. |
1048 | * This is meant for userdata errors; metadata bufs come with | 1104 | * This is meant for userdata errors; metadata bufs come with |
1049 | * iodone functions attached, so that we can track down errors. | 1105 | * iodone functions attached, so that we can track down errors. |
1050 | */ | 1106 | */ |
@@ -1093,7 +1149,7 @@ int | |||
1093 | xfs_bdstrat_cb( | 1149 | xfs_bdstrat_cb( |
1094 | struct xfs_buf *bp) | 1150 | struct xfs_buf *bp) |
1095 | { | 1151 | { |
1096 | if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { | 1152 | if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { |
1097 | trace_xfs_bdstrat_shut(bp, _RET_IP_); | 1153 | trace_xfs_bdstrat_shut(bp, _RET_IP_); |
1098 | /* | 1154 | /* |
1099 | * Metadata write that didn't get logged but | 1155 | * Metadata write that didn't get logged but |
@@ -1134,10 +1190,8 @@ _xfs_buf_ioend( | |||
1134 | xfs_buf_t *bp, | 1190 | xfs_buf_t *bp, |
1135 | int schedule) | 1191 | int schedule) |
1136 | { | 1192 | { |
1137 | if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { | 1193 | if (atomic_dec_and_test(&bp->b_io_remaining) == 1) |
1138 | bp->b_flags &= ~_XBF_PAGE_LOCKED; | ||
1139 | xfs_buf_ioend(bp, schedule); | 1194 | xfs_buf_ioend(bp, schedule); |
1140 | } | ||
1141 | } | 1195 | } |
1142 | 1196 | ||
1143 | STATIC void | 1197 | STATIC void |
@@ -1146,35 +1200,12 @@ xfs_buf_bio_end_io( | |||
1146 | int error) | 1200 | int error) |
1147 | { | 1201 | { |
1148 | xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; | 1202 | xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; |
1149 | unsigned int blocksize = bp->b_target->bt_bsize; | ||
1150 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | ||
1151 | 1203 | ||
1152 | xfs_buf_ioerror(bp, -error); | 1204 | xfs_buf_ioerror(bp, -error); |
1153 | 1205 | ||
1154 | if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) | 1206 | if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) |
1155 | invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); | 1207 | invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); |
1156 | 1208 | ||
1157 | do { | ||
1158 | struct page *page = bvec->bv_page; | ||
1159 | |||
1160 | ASSERT(!PagePrivate(page)); | ||
1161 | if (unlikely(bp->b_error)) { | ||
1162 | if (bp->b_flags & XBF_READ) | ||
1163 | ClearPageUptodate(page); | ||
1164 | } else if (blocksize >= PAGE_CACHE_SIZE) { | ||
1165 | SetPageUptodate(page); | ||
1166 | } else if (!PagePrivate(page) && | ||
1167 | (bp->b_flags & _XBF_PAGE_CACHE)) { | ||
1168 | set_page_region(page, bvec->bv_offset, bvec->bv_len); | ||
1169 | } | ||
1170 | |||
1171 | if (--bvec >= bio->bi_io_vec) | ||
1172 | prefetchw(&bvec->bv_page->flags); | ||
1173 | |||
1174 | if (bp->b_flags & _XBF_PAGE_LOCKED) | ||
1175 | unlock_page(page); | ||
1176 | } while (bvec >= bio->bi_io_vec); | ||
1177 | |||
1178 | _xfs_buf_ioend(bp, 1); | 1209 | _xfs_buf_ioend(bp, 1); |
1179 | bio_put(bio); | 1210 | bio_put(bio); |
1180 | } | 1211 | } |
@@ -1188,14 +1219,13 @@ _xfs_buf_ioapply( | |||
1188 | int offset = bp->b_offset; | 1219 | int offset = bp->b_offset; |
1189 | int size = bp->b_count_desired; | 1220 | int size = bp->b_count_desired; |
1190 | sector_t sector = bp->b_bn; | 1221 | sector_t sector = bp->b_bn; |
1191 | unsigned int blocksize = bp->b_target->bt_bsize; | ||
1192 | 1222 | ||
1193 | total_nr_pages = bp->b_page_count; | 1223 | total_nr_pages = bp->b_page_count; |
1194 | map_i = 0; | 1224 | map_i = 0; |
1195 | 1225 | ||
1196 | if (bp->b_flags & XBF_ORDERED) { | 1226 | if (bp->b_flags & XBF_ORDERED) { |
1197 | ASSERT(!(bp->b_flags & XBF_READ)); | 1227 | ASSERT(!(bp->b_flags & XBF_READ)); |
1198 | rw = WRITE_BARRIER; | 1228 | rw = WRITE_FLUSH_FUA; |
1199 | } else if (bp->b_flags & XBF_LOG_BUFFER) { | 1229 | } else if (bp->b_flags & XBF_LOG_BUFFER) { |
1200 | ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); | 1230 | ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); |
1201 | bp->b_flags &= ~_XBF_RUN_QUEUES; | 1231 | bp->b_flags &= ~_XBF_RUN_QUEUES; |
@@ -1209,29 +1239,6 @@ _xfs_buf_ioapply( | |||
1209 | (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; | 1239 | (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; |
1210 | } | 1240 | } |
1211 | 1241 | ||
1212 | /* Special code path for reading a sub page size buffer in -- | ||
1213 | * we populate up the whole page, and hence the other metadata | ||
1214 | * in the same page. This optimization is only valid when the | ||
1215 | * filesystem block size is not smaller than the page size. | ||
1216 | */ | ||
1217 | if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && | ||
1218 | ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) == | ||
1219 | (XBF_READ|_XBF_PAGE_LOCKED)) && | ||
1220 | (blocksize >= PAGE_CACHE_SIZE)) { | ||
1221 | bio = bio_alloc(GFP_NOIO, 1); | ||
1222 | |||
1223 | bio->bi_bdev = bp->b_target->bt_bdev; | ||
1224 | bio->bi_sector = sector - (offset >> BBSHIFT); | ||
1225 | bio->bi_end_io = xfs_buf_bio_end_io; | ||
1226 | bio->bi_private = bp; | ||
1227 | |||
1228 | bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); | ||
1229 | size = 0; | ||
1230 | |||
1231 | atomic_inc(&bp->b_io_remaining); | ||
1232 | |||
1233 | goto submit_io; | ||
1234 | } | ||
1235 | 1242 | ||
1236 | next_chunk: | 1243 | next_chunk: |
1237 | atomic_inc(&bp->b_io_remaining); | 1244 | atomic_inc(&bp->b_io_remaining); |
@@ -1245,8 +1252,9 @@ next_chunk: | |||
1245 | bio->bi_end_io = xfs_buf_bio_end_io; | 1252 | bio->bi_end_io = xfs_buf_bio_end_io; |
1246 | bio->bi_private = bp; | 1253 | bio->bi_private = bp; |
1247 | 1254 | ||
1255 | |||
1248 | for (; size && nr_pages; nr_pages--, map_i++) { | 1256 | for (; size && nr_pages; nr_pages--, map_i++) { |
1249 | int rbytes, nbytes = PAGE_CACHE_SIZE - offset; | 1257 | int rbytes, nbytes = PAGE_SIZE - offset; |
1250 | 1258 | ||
1251 | if (nbytes > size) | 1259 | if (nbytes > size) |
1252 | nbytes = size; | 1260 | nbytes = size; |
@@ -1261,7 +1269,6 @@ next_chunk: | |||
1261 | total_nr_pages--; | 1269 | total_nr_pages--; |
1262 | } | 1270 | } |
1263 | 1271 | ||
1264 | submit_io: | ||
1265 | if (likely(bio->bi_size)) { | 1272 | if (likely(bio->bi_size)) { |
1266 | if (xfs_buf_is_vmapped(bp)) { | 1273 | if (xfs_buf_is_vmapped(bp)) { |
1267 | flush_kernel_vmap_range(bp->b_addr, | 1274 | flush_kernel_vmap_range(bp->b_addr, |
@@ -1271,18 +1278,7 @@ submit_io: | |||
1271 | if (size) | 1278 | if (size) |
1272 | goto next_chunk; | 1279 | goto next_chunk; |
1273 | } else { | 1280 | } else { |
1274 | /* | ||
1275 | * if we get here, no pages were added to the bio. However, | ||
1276 | * we can't just error out here - if the pages are locked then | ||
1277 | * we have to unlock them otherwise we can hang on a later | ||
1278 | * access to the page. | ||
1279 | */ | ||
1280 | xfs_buf_ioerror(bp, EIO); | 1281 | xfs_buf_ioerror(bp, EIO); |
1281 | if (bp->b_flags & _XBF_PAGE_LOCKED) { | ||
1282 | int i; | ||
1283 | for (i = 0; i < bp->b_page_count; i++) | ||
1284 | unlock_page(bp->b_pages[i]); | ||
1285 | } | ||
1286 | bio_put(bio); | 1282 | bio_put(bio); |
1287 | } | 1283 | } |
1288 | } | 1284 | } |
@@ -1327,8 +1323,6 @@ xfs_buf_iowait( | |||
1327 | { | 1323 | { |
1328 | trace_xfs_buf_iowait(bp, _RET_IP_); | 1324 | trace_xfs_buf_iowait(bp, _RET_IP_); |
1329 | 1325 | ||
1330 | if (atomic_read(&bp->b_io_remaining)) | ||
1331 | blk_run_address_space(bp->b_target->bt_mapping); | ||
1332 | wait_for_completion(&bp->b_iowait); | 1326 | wait_for_completion(&bp->b_iowait); |
1333 | 1327 | ||
1334 | trace_xfs_buf_iowait_done(bp, _RET_IP_); | 1328 | trace_xfs_buf_iowait_done(bp, _RET_IP_); |
@@ -1346,8 +1340,8 @@ xfs_buf_offset( | |||
1346 | return XFS_BUF_PTR(bp) + offset; | 1340 | return XFS_BUF_PTR(bp) + offset; |
1347 | 1341 | ||
1348 | offset += bp->b_offset; | 1342 | offset += bp->b_offset; |
1349 | page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; | 1343 | page = bp->b_pages[offset >> PAGE_SHIFT]; |
1350 | return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); | 1344 | return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); |
1351 | } | 1345 | } |
1352 | 1346 | ||
1353 | /* | 1347 | /* |
@@ -1369,9 +1363,9 @@ xfs_buf_iomove( | |||
1369 | page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; | 1363 | page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; |
1370 | cpoff = xfs_buf_poff(boff + bp->b_offset); | 1364 | cpoff = xfs_buf_poff(boff + bp->b_offset); |
1371 | csize = min_t(size_t, | 1365 | csize = min_t(size_t, |
1372 | PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); | 1366 | PAGE_SIZE-cpoff, bp->b_count_desired-boff); |
1373 | 1367 | ||
1374 | ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); | 1368 | ASSERT(((csize + cpoff) <= PAGE_SIZE)); |
1375 | 1369 | ||
1376 | switch (mode) { | 1370 | switch (mode) { |
1377 | case XBRW_ZERO: | 1371 | case XBRW_ZERO: |
@@ -1394,89 +1388,84 @@ xfs_buf_iomove( | |||
1394 | */ | 1388 | */ |
1395 | 1389 | ||
1396 | /* | 1390 | /* |
1397 | * Wait for any bufs with callbacks that have been submitted but | 1391 | * Wait for any bufs with callbacks that have been submitted but have not yet |
1398 | * have not yet returned... walk the hash list for the target. | 1392 | * returned. These buffers will have an elevated hold count, so wait on those |
1393 | * while freeing all the buffers only held by the LRU. | ||
1399 | */ | 1394 | */ |
1400 | void | 1395 | void |
1401 | xfs_wait_buftarg( | 1396 | xfs_wait_buftarg( |
1402 | xfs_buftarg_t *btp) | 1397 | struct xfs_buftarg *btp) |
1403 | { | 1398 | { |
1404 | xfs_buf_t *bp, *n; | 1399 | struct xfs_buf *bp; |
1405 | xfs_bufhash_t *hash; | 1400 | |
1406 | uint i; | 1401 | restart: |
1407 | 1402 | spin_lock(&btp->bt_lru_lock); | |
1408 | for (i = 0; i < (1 << btp->bt_hashshift); i++) { | 1403 | while (!list_empty(&btp->bt_lru)) { |
1409 | hash = &btp->bt_hash[i]; | 1404 | bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); |
1410 | again: | 1405 | if (atomic_read(&bp->b_hold) > 1) { |
1411 | spin_lock(&hash->bh_lock); | 1406 | spin_unlock(&btp->bt_lru_lock); |
1412 | list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { | 1407 | delay(100); |
1413 | ASSERT(btp == bp->b_target); | 1408 | goto restart; |
1414 | if (!(bp->b_flags & XBF_FS_MANAGED)) { | ||
1415 | spin_unlock(&hash->bh_lock); | ||
1416 | /* | ||
1417 | * Catch superblock reference count leaks | ||
1418 | * immediately | ||
1419 | */ | ||
1420 | BUG_ON(bp->b_bn == 0); | ||
1421 | delay(100); | ||
1422 | goto again; | ||
1423 | } | ||
1424 | } | 1409 | } |
1425 | spin_unlock(&hash->bh_lock); | 1410 | /* |
1411 | * clear the LRU reference count so the buffer doesn't get | ||
1412 | * ignored in xfs_buf_rele(). | ||
1413 | */ | ||
1414 | atomic_set(&bp->b_lru_ref, 0); | ||
1415 | spin_unlock(&btp->bt_lru_lock); | ||
1416 | xfs_buf_rele(bp); | ||
1417 | spin_lock(&btp->bt_lru_lock); | ||
1426 | } | 1418 | } |
1419 | spin_unlock(&btp->bt_lru_lock); | ||
1427 | } | 1420 | } |
1428 | 1421 | ||
1429 | /* | 1422 | int |
1430 | * Allocate buffer hash table for a given target. | 1423 | xfs_buftarg_shrink( |
1431 | * For devices containing metadata (i.e. not the log/realtime devices) | 1424 | struct shrinker *shrink, |
1432 | * we need to allocate a much larger hash table. | 1425 | struct shrink_control *sc) |
1433 | */ | ||
1434 | STATIC void | ||
1435 | xfs_alloc_bufhash( | ||
1436 | xfs_buftarg_t *btp, | ||
1437 | int external) | ||
1438 | { | 1426 | { |
1439 | unsigned int i; | 1427 | struct xfs_buftarg *btp = container_of(shrink, |
1428 | struct xfs_buftarg, bt_shrinker); | ||
1429 | struct xfs_buf *bp; | ||
1430 | int nr_to_scan = sc->nr_to_scan; | ||
1431 | LIST_HEAD(dispose); | ||
1440 | 1432 | ||
1441 | btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ | 1433 | if (!nr_to_scan) |
1442 | btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * | 1434 | return btp->bt_lru_nr; |
1443 | sizeof(xfs_bufhash_t)); | ||
1444 | for (i = 0; i < (1 << btp->bt_hashshift); i++) { | ||
1445 | spin_lock_init(&btp->bt_hash[i].bh_lock); | ||
1446 | INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); | ||
1447 | } | ||
1448 | } | ||
1449 | 1435 | ||
1450 | STATIC void | 1436 | spin_lock(&btp->bt_lru_lock); |
1451 | xfs_free_bufhash( | 1437 | while (!list_empty(&btp->bt_lru)) { |
1452 | xfs_buftarg_t *btp) | 1438 | if (nr_to_scan-- <= 0) |
1453 | { | 1439 | break; |
1454 | kmem_free_large(btp->bt_hash); | ||
1455 | btp->bt_hash = NULL; | ||
1456 | } | ||
1457 | 1440 | ||
1458 | /* | 1441 | bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); |
1459 | * buftarg list for delwrite queue processing | ||
1460 | */ | ||
1461 | static LIST_HEAD(xfs_buftarg_list); | ||
1462 | static DEFINE_SPINLOCK(xfs_buftarg_lock); | ||
1463 | 1442 | ||
1464 | STATIC void | 1443 | /* |
1465 | xfs_register_buftarg( | 1444 | * Decrement the b_lru_ref count unless the value is already |
1466 | xfs_buftarg_t *btp) | 1445 | * zero. If the value is already zero, we need to reclaim the |
1467 | { | 1446 | * buffer, otherwise it gets another trip through the LRU. |
1468 | spin_lock(&xfs_buftarg_lock); | 1447 | */ |
1469 | list_add(&btp->bt_list, &xfs_buftarg_list); | 1448 | if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { |
1470 | spin_unlock(&xfs_buftarg_lock); | 1449 | list_move_tail(&bp->b_lru, &btp->bt_lru); |
1471 | } | 1450 | continue; |
1451 | } | ||
1472 | 1452 | ||
1473 | STATIC void | 1453 | /* |
1474 | xfs_unregister_buftarg( | 1454 | * remove the buffer from the LRU now to avoid needing another |
1475 | xfs_buftarg_t *btp) | 1455 | * lock round trip inside xfs_buf_rele(). |
1476 | { | 1456 | */ |
1477 | spin_lock(&xfs_buftarg_lock); | 1457 | list_move(&bp->b_lru, &dispose); |
1478 | list_del(&btp->bt_list); | 1458 | btp->bt_lru_nr--; |
1479 | spin_unlock(&xfs_buftarg_lock); | 1459 | } |
1460 | spin_unlock(&btp->bt_lru_lock); | ||
1461 | |||
1462 | while (!list_empty(&dispose)) { | ||
1463 | bp = list_first_entry(&dispose, struct xfs_buf, b_lru); | ||
1464 | list_del_init(&bp->b_lru); | ||
1465 | xfs_buf_rele(bp); | ||
1466 | } | ||
1467 | |||
1468 | return btp->bt_lru_nr; | ||
1480 | } | 1469 | } |
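xfs_buftarg_shrink() is a textbook dispose-list shrinker: victims are unhooked from the LRU under bt_lru_lock and parked on a private list, and the release work, which may sleep or take other locks, happens only after the lock is dropped. The reusable shape, with still_referenced() and release() as hypothetical stand-ins for the b_lru_ref check and xfs_buf_rele():

	LIST_HEAD(dispose);

	spin_lock(&lru_lock);
	while (!list_empty(&lru) && nr_to_scan-- > 0) {
		item = list_first_entry(&lru, struct item, lru);
		if (still_referenced(item)) {
			/* second chance: rotate to the LRU tail */
			list_move_tail(&item->lru, &lru);
			continue;
		}
		list_move(&item->lru, &dispose);
	}
	spin_unlock(&lru_lock);

	/* dispose outside the lock; this may sleep */
	while (!list_empty(&dispose)) {
		item = list_first_entry(&dispose, struct item, lru);
		list_del_init(&item->lru);
		release(item);
	}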
1481 | 1470 | ||
1482 | void | 1471 | void |
@@ -1484,18 +1473,13 @@ xfs_free_buftarg( | |||
1484 | struct xfs_mount *mp, | 1473 | struct xfs_mount *mp, |
1485 | struct xfs_buftarg *btp) | 1474 | struct xfs_buftarg *btp) |
1486 | { | 1475 | { |
1476 | unregister_shrinker(&btp->bt_shrinker); | ||
1477 | |||
1487 | xfs_flush_buftarg(btp, 1); | 1478 | xfs_flush_buftarg(btp, 1); |
1488 | if (mp->m_flags & XFS_MOUNT_BARRIER) | 1479 | if (mp->m_flags & XFS_MOUNT_BARRIER) |
1489 | xfs_blkdev_issue_flush(btp); | 1480 | xfs_blkdev_issue_flush(btp); |
1490 | xfs_free_bufhash(btp); | ||
1491 | iput(btp->bt_mapping->host); | ||
1492 | 1481 | ||
1493 | /* Unregister the buftarg first so that we don't get a | ||
1494 | * wakeup finding a non-existent task | ||
1495 | */ | ||
1496 | xfs_unregister_buftarg(btp); | ||
1497 | kthread_stop(btp->bt_task); | 1482 | kthread_stop(btp->bt_task); |
1498 | |||
1499 | kmem_free(btp); | 1483 | kmem_free(btp); |
1500 | } | 1484 | } |
1501 | 1485 | ||
@@ -1511,21 +1495,12 @@ xfs_setsize_buftarg_flags( | |||
1511 | btp->bt_smask = sectorsize - 1; | 1495 | btp->bt_smask = sectorsize - 1; |
1512 | 1496 | ||
1513 | if (set_blocksize(btp->bt_bdev, sectorsize)) { | 1497 | if (set_blocksize(btp->bt_bdev, sectorsize)) { |
1514 | printk(KERN_WARNING | 1498 | xfs_warn(btp->bt_mount, |
1515 | "XFS: Cannot set_blocksize to %u on device %s\n", | 1499 | "Cannot set_blocksize to %u on device %s\n", |
1516 | sectorsize, XFS_BUFTARG_NAME(btp)); | 1500 | sectorsize, XFS_BUFTARG_NAME(btp)); |
1517 | return EINVAL; | 1501 | return EINVAL; |
1518 | } | 1502 | } |
1519 | 1503 | ||
1520 | if (verbose && | ||
1521 | (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { | ||
1522 | printk(KERN_WARNING | ||
1523 | "XFS: %u byte sectors in use on device %s. " | ||
1524 | "This is suboptimal; %u or greater is ideal.\n", | ||
1525 | sectorsize, XFS_BUFTARG_NAME(btp), | ||
1526 | (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); | ||
1527 | } | ||
1528 | |||
1529 | return 0; | 1504 | return 0; |
1530 | } | 1505 | } |
1531 | 1506 | ||
@@ -1540,7 +1515,7 @@ xfs_setsize_buftarg_early( | |||
1540 | struct block_device *bdev) | 1515 | struct block_device *bdev) |
1541 | { | 1516 | { |
1542 | return xfs_setsize_buftarg_flags(btp, | 1517 | return xfs_setsize_buftarg_flags(btp, |
1543 | PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); | 1518 | PAGE_SIZE, bdev_logical_block_size(bdev), 0); |
1544 | } | 1519 | } |
1545 | 1520 | ||
1546 | int | 1521 | int |
@@ -1553,62 +1528,22 @@ xfs_setsize_buftarg( | |||
1553 | } | 1528 | } |
1554 | 1529 | ||
1555 | STATIC int | 1530 | STATIC int |
1556 | xfs_mapping_buftarg( | ||
1557 | xfs_buftarg_t *btp, | ||
1558 | struct block_device *bdev) | ||
1559 | { | ||
1560 | struct backing_dev_info *bdi; | ||
1561 | struct inode *inode; | ||
1562 | struct address_space *mapping; | ||
1563 | static const struct address_space_operations mapping_aops = { | ||
1564 | .sync_page = block_sync_page, | ||
1565 | .migratepage = fail_migrate_page, | ||
1566 | }; | ||
1567 | |||
1568 | inode = new_inode(bdev->bd_inode->i_sb); | ||
1569 | if (!inode) { | ||
1570 | printk(KERN_WARNING | ||
1571 | "XFS: Cannot allocate mapping inode for device %s\n", | ||
1572 | XFS_BUFTARG_NAME(btp)); | ||
1573 | return ENOMEM; | ||
1574 | } | ||
1575 | inode->i_mode = S_IFBLK; | ||
1576 | inode->i_bdev = bdev; | ||
1577 | inode->i_rdev = bdev->bd_dev; | ||
1578 | bdi = blk_get_backing_dev_info(bdev); | ||
1579 | if (!bdi) | ||
1580 | bdi = &default_backing_dev_info; | ||
1581 | mapping = &inode->i_data; | ||
1582 | mapping->a_ops = &mapping_aops; | ||
1583 | mapping->backing_dev_info = bdi; | ||
1584 | mapping_set_gfp_mask(mapping, GFP_NOFS); | ||
1585 | btp->bt_mapping = mapping; | ||
1586 | return 0; | ||
1587 | } | ||
1588 | |||
1589 | STATIC int | ||
1590 | xfs_alloc_delwrite_queue( | 1531 | xfs_alloc_delwrite_queue( |
1591 | xfs_buftarg_t *btp, | 1532 | xfs_buftarg_t *btp, |
1592 | const char *fsname) | 1533 | const char *fsname) |
1593 | { | 1534 | { |
1594 | int error = 0; | ||
1595 | |||
1596 | INIT_LIST_HEAD(&btp->bt_list); | ||
1597 | INIT_LIST_HEAD(&btp->bt_delwrite_queue); | 1535 | INIT_LIST_HEAD(&btp->bt_delwrite_queue); |
1598 | spin_lock_init(&btp->bt_delwrite_lock); | 1536 | spin_lock_init(&btp->bt_delwrite_lock); |
1599 | btp->bt_flags = 0; | 1537 | btp->bt_flags = 0; |
1600 | btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); | 1538 | btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); |
1601 | if (IS_ERR(btp->bt_task)) { | 1539 | if (IS_ERR(btp->bt_task)) |
1602 | error = PTR_ERR(btp->bt_task); | 1540 | return PTR_ERR(btp->bt_task); |
1603 | goto out_error; | 1541 | return 0; |
1604 | } | ||
1605 | xfs_register_buftarg(btp); | ||
1606 | out_error: | ||
1607 | return error; | ||
1608 | } | 1542 | } |
1609 | 1543 | ||
1610 | xfs_buftarg_t * | 1544 | xfs_buftarg_t * |
1611 | xfs_alloc_buftarg( | 1545 | xfs_alloc_buftarg( |
1546 | struct xfs_mount *mp, | ||
1612 | struct block_device *bdev, | 1547 | struct block_device *bdev, |
1613 | int external, | 1548 | int external, |
1614 | const char *fsname) | 1549 | const char *fsname) |
@@ -1617,15 +1552,22 @@ xfs_alloc_buftarg( | |||
1617 | 1552 | ||
1618 | btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); | 1553 | btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); |
1619 | 1554 | ||
1555 | btp->bt_mount = mp; | ||
1620 | btp->bt_dev = bdev->bd_dev; | 1556 | btp->bt_dev = bdev->bd_dev; |
1621 | btp->bt_bdev = bdev; | 1557 | btp->bt_bdev = bdev; |
1622 | if (xfs_setsize_buftarg_early(btp, bdev)) | 1558 | btp->bt_bdi = blk_get_backing_dev_info(bdev); |
1559 | if (!btp->bt_bdi) | ||
1623 | goto error; | 1560 | goto error; |
1624 | if (xfs_mapping_buftarg(btp, bdev)) | 1561 | |
1562 | INIT_LIST_HEAD(&btp->bt_lru); | ||
1563 | spin_lock_init(&btp->bt_lru_lock); | ||
1564 | if (xfs_setsize_buftarg_early(btp, bdev)) | ||
1625 | goto error; | 1565 | goto error; |
1626 | if (xfs_alloc_delwrite_queue(btp, fsname)) | 1566 | if (xfs_alloc_delwrite_queue(btp, fsname)) |
1627 | goto error; | 1567 | goto error; |
1628 | xfs_alloc_bufhash(btp, external); | 1568 | btp->bt_shrinker.shrink = xfs_buftarg_shrink; |
1569 | btp->bt_shrinker.seeks = DEFAULT_SEEKS; | ||
1570 | register_shrinker(&btp->bt_shrinker); | ||
1629 | return btp; | 1571 | return btp; |
1630 | 1572 | ||
1631 | error: | 1573 | error: |
@@ -1730,27 +1672,6 @@ xfs_buf_runall_queues( | |||
1730 | flush_workqueue(queue); | 1672 | flush_workqueue(queue); |
1731 | } | 1673 | } |
1732 | 1674 | ||
1733 | STATIC int | ||
1734 | xfsbufd_wakeup( | ||
1735 | struct shrinker *shrink, | ||
1736 | int priority, | ||
1737 | gfp_t mask) | ||
1738 | { | ||
1739 | xfs_buftarg_t *btp; | ||
1740 | |||
1741 | spin_lock(&xfs_buftarg_lock); | ||
1742 | list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { | ||
1743 | if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) | ||
1744 | continue; | ||
1745 | if (list_empty(&btp->bt_delwrite_queue)) | ||
1746 | continue; | ||
1747 | set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); | ||
1748 | wake_up_process(btp->bt_task); | ||
1749 | } | ||
1750 | spin_unlock(&xfs_buftarg_lock); | ||
1751 | return 0; | ||
1752 | } | ||
1753 | |||
1754 | /* | 1675 | /* |
1755 | * Move as many buffers as specified to the supplied list | 1676 | * Move as many buffers as specified to the supplied list |
1756 | * indicating if we skipped any buffers to prevent deadlocks. | 1677 | * indicating if we skipped any buffers to prevent deadlocks. |
@@ -1771,7 +1692,6 @@ xfs_buf_delwri_split( | |||
1771 | INIT_LIST_HEAD(list); | 1692 | INIT_LIST_HEAD(list); |
1772 | spin_lock(dwlk); | 1693 | spin_lock(dwlk); |
1773 | list_for_each_entry_safe(bp, n, dwq, b_list) { | 1694 | list_for_each_entry_safe(bp, n, dwq, b_list) { |
1774 | trace_xfs_buf_delwri_split(bp, _RET_IP_); | ||
1775 | ASSERT(bp->b_flags & XBF_DELWRI); | 1695 | ASSERT(bp->b_flags & XBF_DELWRI); |
1776 | 1696 | ||
1777 | if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { | 1697 | if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { |
@@ -1785,6 +1705,7 @@ xfs_buf_delwri_split( | |||
1785 | _XBF_RUN_QUEUES); | 1705 | _XBF_RUN_QUEUES); |
1786 | bp->b_flags |= XBF_WRITE; | 1706 | bp->b_flags |= XBF_WRITE; |
1787 | list_move_tail(&bp->b_list, list); | 1707 | list_move_tail(&bp->b_list, list); |
1708 | trace_xfs_buf_delwri_split(bp, _RET_IP_); | ||
1788 | } else | 1709 | } else |
1789 | skipped++; | 1710 | skipped++; |
1790 | } | 1711 | } |
@@ -1838,8 +1759,8 @@ xfsbufd( | |||
1838 | do { | 1759 | do { |
1839 | long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); | 1760 | long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); |
1840 | long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); | 1761 | long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); |
1841 | int count = 0; | ||
1842 | struct list_head tmp; | 1762 | struct list_head tmp; |
1763 | struct blk_plug plug; | ||
1843 | 1764 | ||
1844 | if (unlikely(freezing(current))) { | 1765 | if (unlikely(freezing(current))) { |
1845 | set_bit(XBT_FORCE_SLEEP, &target->bt_flags); | 1766 | set_bit(XBT_FORCE_SLEEP, &target->bt_flags); |
@@ -1855,16 +1776,15 @@ xfsbufd( | |||
1855 | 1776 | ||
1856 | xfs_buf_delwri_split(target, &tmp, age); | 1777 | xfs_buf_delwri_split(target, &tmp, age); |
1857 | list_sort(NULL, &tmp, xfs_buf_cmp); | 1778 | list_sort(NULL, &tmp, xfs_buf_cmp); |
1779 | |||
1780 | blk_start_plug(&plug); | ||
1858 | while (!list_empty(&tmp)) { | 1781 | while (!list_empty(&tmp)) { |
1859 | struct xfs_buf *bp; | 1782 | struct xfs_buf *bp; |
1860 | bp = list_first_entry(&tmp, struct xfs_buf, b_list); | 1783 | bp = list_first_entry(&tmp, struct xfs_buf, b_list); |
1861 | list_del_init(&bp->b_list); | 1784 | list_del_init(&bp->b_list); |
1862 | xfs_bdstrat_cb(bp); | 1785 | xfs_bdstrat_cb(bp); |
1863 | count++; | ||
1864 | } | 1786 | } |
1865 | if (count) | 1787 | blk_finish_plug(&plug); |
1866 | blk_run_address_space(target->bt_mapping); | ||
1867 | |||
1868 | } while (!kthread_should_stop()); | 1788 | } while (!kthread_should_stop()); |
1869 | 1789 | ||
1870 | return 0; | 1790 | return 0; |
@@ -1884,6 +1804,7 @@ xfs_flush_buftarg( | |||
1884 | int pincount = 0; | 1804 | int pincount = 0; |
1885 | LIST_HEAD(tmp_list); | 1805 | LIST_HEAD(tmp_list); |
1886 | LIST_HEAD(wait_list); | 1806 | LIST_HEAD(wait_list); |
1807 | struct blk_plug plug; | ||
1887 | 1808 | ||
1888 | xfs_buf_runall_queues(xfsconvertd_workqueue); | 1809 | xfs_buf_runall_queues(xfsconvertd_workqueue); |
1889 | xfs_buf_runall_queues(xfsdatad_workqueue); | 1810 | xfs_buf_runall_queues(xfsdatad_workqueue); |
@@ -1898,6 +1819,8 @@ xfs_flush_buftarg( | |||
1898 | * we do that after issuing all the IO. | 1819 | * we do that after issuing all the IO. |
1899 | */ | 1820 | */ |
1900 | list_sort(NULL, &tmp_list, xfs_buf_cmp); | 1821 | list_sort(NULL, &tmp_list, xfs_buf_cmp); |
1822 | |||
1823 | blk_start_plug(&plug); | ||
1901 | while (!list_empty(&tmp_list)) { | 1824 | while (!list_empty(&tmp_list)) { |
1902 | bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); | 1825 | bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); |
1903 | ASSERT(target == bp->b_target); | 1826 | ASSERT(target == bp->b_target); |
@@ -1908,15 +1831,15 @@ xfs_flush_buftarg( | |||
1908 | } | 1831 | } |
1909 | xfs_bdstrat_cb(bp); | 1832 | xfs_bdstrat_cb(bp); |
1910 | } | 1833 | } |
1834 | blk_finish_plug(&plug); | ||
1911 | 1835 | ||
1912 | if (wait) { | 1836 | if (wait) { |
1913 | /* Expedite and wait for IO to complete. */ | 1837 | /* Wait for IO to complete. */ |
1914 | blk_run_address_space(target->bt_mapping); | ||
1915 | while (!list_empty(&wait_list)) { | 1838 | while (!list_empty(&wait_list)) { |
1916 | bp = list_first_entry(&wait_list, struct xfs_buf, b_list); | 1839 | bp = list_first_entry(&wait_list, struct xfs_buf, b_list); |
1917 | 1840 | ||
1918 | list_del_init(&bp->b_list); | 1841 | list_del_init(&bp->b_list); |
1919 | xfs_iowait(bp); | 1842 | xfs_buf_iowait(bp); |
1920 | xfs_buf_relse(bp); | 1843 | xfs_buf_relse(bp); |
1921 | } | 1844 | } |
1922 | } | 1845 | } |
@@ -1933,19 +1856,19 @@ xfs_buf_init(void) | |||
1933 | goto out; | 1856 | goto out; |
1934 | 1857 | ||
1935 | xfslogd_workqueue = alloc_workqueue("xfslogd", | 1858 | xfslogd_workqueue = alloc_workqueue("xfslogd", |
1936 | WQ_RESCUER | WQ_HIGHPRI, 1); | 1859 | WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); |
1937 | if (!xfslogd_workqueue) | 1860 | if (!xfslogd_workqueue) |
1938 | goto out_free_buf_zone; | 1861 | goto out_free_buf_zone; |
1939 | 1862 | ||
1940 | xfsdatad_workqueue = create_workqueue("xfsdatad"); | 1863 | xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1); |
1941 | if (!xfsdatad_workqueue) | 1864 | if (!xfsdatad_workqueue) |
1942 | goto out_destroy_xfslogd_workqueue; | 1865 | goto out_destroy_xfslogd_workqueue; |
1943 | 1866 | ||
1944 | xfsconvertd_workqueue = create_workqueue("xfsconvertd"); | 1867 | xfsconvertd_workqueue = alloc_workqueue("xfsconvertd", |
1868 | WQ_MEM_RECLAIM, 1); | ||
1945 | if (!xfsconvertd_workqueue) | 1869 | if (!xfsconvertd_workqueue) |
1946 | goto out_destroy_xfsdatad_workqueue; | 1870 | goto out_destroy_xfsdatad_workqueue; |
1947 | 1871 | ||
1948 | register_shrinker(&xfs_buf_shake); | ||
1949 | return 0; | 1872 | return 0; |
1950 | 1873 | ||
1951 | out_destroy_xfsdatad_workqueue: | 1874 | out_destroy_xfsdatad_workqueue: |
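The switch from create_workqueue() to alloc_workqueue() with WQ_MEM_RECLAIM matters for these queues: the flag guarantees a rescuer thread, so I/O-completion work can still make progress when the allocator is starved. The calling convention, as a generic sketch (names are placeholders, not from the commit):

	struct workqueue_struct	*wq;

	/* args: name, flags, max_active */
	wq = alloc_workqueue("example", WQ_MEM_RECLAIM, 1);
	if (!wq)
		return -ENOMEM;
	queue_work(wq, &my_work);	/* my_work set up via INIT_WORK() */
	flush_workqueue(wq);
	destroy_workqueue(wq);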
@@ -1961,7 +1884,6 @@ xfs_buf_init(void) | |||
1961 | void | 1884 | void |
1962 | xfs_buf_terminate(void) | 1885 | xfs_buf_terminate(void) |
1963 | { | 1886 | { |
1964 | unregister_shrinker(&xfs_buf_shake); | ||
1965 | destroy_workqueue(xfsconvertd_workqueue); | 1887 | destroy_workqueue(xfsconvertd_workqueue); |
1966 | destroy_workqueue(xfsdatad_workqueue); | 1888 | destroy_workqueue(xfsdatad_workqueue); |
1967 | destroy_workqueue(xfslogd_workqueue); | 1889 | destroy_workqueue(xfslogd_workqueue); |
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 2a05614f0b92..50a7d5fb3b73 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h | |||
@@ -51,7 +51,6 @@ typedef enum { | |||
51 | #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ | 51 | #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ |
52 | #define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ | 52 | #define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ |
53 | #define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ | 53 | #define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ |
54 | #define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */ | ||
55 | #define XBF_ORDERED (1 << 11)/* use ordered writes */ | 54 | #define XBF_ORDERED (1 << 11)/* use ordered writes */ |
56 | #define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ | 55 | #define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ |
57 | #define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ | 56 | #define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ |
@@ -62,38 +61,11 @@ typedef enum { | |||
62 | #define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ | 61 | #define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ |
63 | 62 | ||
64 | /* flags used only internally */ | 63 | /* flags used only internally */ |
65 | #define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */ | ||
66 | #define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ | 64 | #define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ |
67 | #define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ | 65 | #define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ |
66 | #define _XBF_KMEM (1 << 20)/* backed by heap memory */ | ||
68 | #define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ | 67 | #define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ |
69 | 68 | ||
70 | /* | ||
71 | * Special flag for supporting metadata blocks smaller than a FSB. | ||
72 | * | ||
73 | * In this case we can have multiple xfs_buf_t on a single page and | ||
74 | * need to lock out concurrent xfs_buf_t readers as they only | ||
75 | * serialise access to the buffer. | ||
76 | * | ||
77 | * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation | ||
78 | * between reads of the page. Hence we can have one thread read the | ||
79 | * page and modify it, but then race with another thread that thinks | ||
80 | * the page is not up-to-date and hence reads it again. | ||
81 | * | ||
82 | * The result is that the first modifcation to the page is lost. | ||
83 | * This sort of AGF/AGI reading race can happen when unlinking inodes | ||
84 | * that require truncation and results in the AGI unlinked list | ||
85 | * modifications being lost. | ||
86 | */ | ||
87 | #define _XBF_PAGE_LOCKED (1 << 22) | ||
88 | |||
89 | /* | ||
90 | * If we try a barrier write, but it fails we have to communicate | ||
91 | * this to the upper layers. Unfortunately b_error gets overwritten | ||
92 | * when the buffer is re-issued so we have to add another flag to | ||
93 | * keep this information. | ||
94 | */ | ||
95 | #define _XFS_BARRIER_FAILED (1 << 23) | ||
96 | |||
97 | typedef unsigned int xfs_buf_flags_t; | 69 | typedef unsigned int xfs_buf_flags_t; |
98 | 70 | ||
99 | #define XFS_BUF_FLAGS \ | 71 | #define XFS_BUF_FLAGS \ |
@@ -104,19 +76,15 @@ typedef unsigned int xfs_buf_flags_t; | |||
104 | { XBF_DONE, "DONE" }, \ | 76 | { XBF_DONE, "DONE" }, \ |
105 | { XBF_DELWRI, "DELWRI" }, \ | 77 | { XBF_DELWRI, "DELWRI" }, \ |
106 | { XBF_STALE, "STALE" }, \ | 78 | { XBF_STALE, "STALE" }, \ |
107 | { XBF_FS_MANAGED, "FS_MANAGED" }, \ | ||
108 | { XBF_ORDERED, "ORDERED" }, \ | 79 | { XBF_ORDERED, "ORDERED" }, \ |
109 | { XBF_READ_AHEAD, "READ_AHEAD" }, \ | 80 | { XBF_READ_AHEAD, "READ_AHEAD" }, \ |
110 | { XBF_LOCK, "LOCK" }, /* should never be set */\ | 81 | { XBF_LOCK, "LOCK" }, /* should never be set */\ |
111 | { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ | 82 | { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ |
112 | { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ | 83 | { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ |
113 | { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \ | ||
114 | { _XBF_PAGES, "PAGES" }, \ | 84 | { _XBF_PAGES, "PAGES" }, \ |
115 | { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ | 85 | { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ |
116 | { _XBF_DELWRI_Q, "DELWRI_Q" }, \ | 86 | { _XBF_KMEM, "KMEM" }, \ |
117 | { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ | 87 | { _XBF_DELWRI_Q, "DELWRI_Q" } |
118 | { _XFS_BARRIER_FAILED, "BARRIER_FAILED" } | ||
119 | |||
120 | 88 | ||
121 | typedef enum { | 89 | typedef enum { |
122 | XBT_FORCE_SLEEP = 0, | 90 | XBT_FORCE_SLEEP = 0, |
@@ -131,70 +99,67 @@ typedef struct xfs_bufhash { | |||
131 | typedef struct xfs_buftarg { | 99 | typedef struct xfs_buftarg { |
132 | dev_t bt_dev; | 100 | dev_t bt_dev; |
133 | struct block_device *bt_bdev; | 101 | struct block_device *bt_bdev; |
134 | struct address_space *bt_mapping; | 102 | struct backing_dev_info *bt_bdi; |
103 | struct xfs_mount *bt_mount; | ||
135 | unsigned int bt_bsize; | 104 | unsigned int bt_bsize; |
136 | unsigned int bt_sshift; | 105 | unsigned int bt_sshift; |
137 | size_t bt_smask; | 106 | size_t bt_smask; |
138 | 107 | ||
139 | /* per device buffer hash table */ | ||
140 | uint bt_hashshift; | ||
141 | xfs_bufhash_t *bt_hash; | ||
142 | |||
143 | /* per device delwri queue */ | 108 | /* per device delwri queue */ |
144 | struct task_struct *bt_task; | 109 | struct task_struct *bt_task; |
145 | struct list_head bt_list; | ||
146 | struct list_head bt_delwrite_queue; | 110 | struct list_head bt_delwrite_queue; |
147 | spinlock_t bt_delwrite_lock; | 111 | spinlock_t bt_delwrite_lock; |
148 | unsigned long bt_flags; | 112 | unsigned long bt_flags; |
149 | } xfs_buftarg_t; | ||
150 | 113 | ||
151 | /* | 114 | /* LRU control structures */ |
152 | * xfs_buf_t: Buffer structure for pagecache-based buffers | 115 | struct shrinker bt_shrinker; |
153 | * | 116 | struct list_head bt_lru; |
154 | * This buffer structure is used by the pagecache buffer management routines | 117 | spinlock_t bt_lru_lock; |
155 | * to refer to an assembly of pages forming a logical buffer. | 118 | unsigned int bt_lru_nr; |
156 | * | 119 | } xfs_buftarg_t; |
157 | * The buffer structure is used on a temporary basis only, and discarded when | ||
158 | * released. The real data storage is recorded in the pagecache. Buffers are | ||
159 | * hashed to the block device on which the file system resides. | ||
160 | */ | ||
161 | 120 | ||
162 | struct xfs_buf; | 121 | struct xfs_buf; |
163 | typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); | 122 | typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); |
164 | typedef void (*xfs_buf_relse_t)(struct xfs_buf *); | ||
165 | typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); | ||
166 | 123 | ||
167 | #define XB_PAGES 2 | 124 | #define XB_PAGES 2 |
168 | 125 | ||
169 | typedef struct xfs_buf { | 126 | typedef struct xfs_buf { |
127 | /* | ||
128 | * first cacheline holds all the fields needed for an uncontended cache | ||
129 | * hit to be fully processed. The semaphore straddles the cacheline | ||
130 | * boundary, but the counter and lock sit on the first cacheline, | ||
131 | * which is the only bit that is touched if we hit the semaphore | ||
132 | * fast-path on locking. | ||
133 | */ | ||
134 | struct rb_node b_rbnode; /* rbtree node */ | ||
135 | xfs_off_t b_file_offset; /* offset in file */ | ||
136 | size_t b_buffer_length;/* size of buffer in bytes */ | ||
137 | atomic_t b_hold; /* reference count */ | ||
138 | atomic_t b_lru_ref; /* lru reclaim ref count */ | ||
139 | xfs_buf_flags_t b_flags; /* status flags */ | ||
170 | struct semaphore b_sema; /* semaphore for lockables */ | 140 | struct semaphore b_sema; /* semaphore for lockables */ |
171 | unsigned long b_queuetime; /* time buffer was queued */ | 141 | |
172 | atomic_t b_pin_count; /* pin count */ | 142 | struct list_head b_lru; /* lru list */ |
173 | wait_queue_head_t b_waiters; /* unpin waiters */ | 143 | wait_queue_head_t b_waiters; /* unpin waiters */ |
174 | struct list_head b_list; | 144 | struct list_head b_list; |
175 | xfs_buf_flags_t b_flags; /* status flags */ | 145 | struct xfs_perag *b_pag; /* contains rbtree root */ |
176 | struct list_head b_hash_list; /* hash table list */ | ||
177 | xfs_bufhash_t *b_hash; /* hash table list start */ | ||
178 | xfs_buftarg_t *b_target; /* buffer target (device) */ | 146 | xfs_buftarg_t *b_target; /* buffer target (device) */ |
179 | atomic_t b_hold; /* reference count */ | ||
180 | xfs_daddr_t b_bn; /* block number for I/O */ | 147 | xfs_daddr_t b_bn; /* block number for I/O */ |
181 | xfs_off_t b_file_offset; /* offset in file */ | ||
182 | size_t b_buffer_length;/* size of buffer in bytes */ | ||
183 | size_t b_count_desired;/* desired transfer size */ | 148 | size_t b_count_desired;/* desired transfer size */ |
184 | void *b_addr; /* virtual address of buffer */ | 149 | void *b_addr; /* virtual address of buffer */ |
185 | struct work_struct b_iodone_work; | 150 | struct work_struct b_iodone_work; |
186 | atomic_t b_io_remaining; /* #outstanding I/O requests */ | ||
187 | xfs_buf_iodone_t b_iodone; /* I/O completion function */ | 151 | xfs_buf_iodone_t b_iodone; /* I/O completion function */ |
188 | xfs_buf_relse_t b_relse; /* releasing function */ | ||
189 | struct completion b_iowait; /* queue for I/O waiters */ | 152 | struct completion b_iowait; /* queue for I/O waiters */ |
190 | void *b_fspriv; | 153 | void *b_fspriv; |
191 | void *b_fspriv2; | 154 | void *b_fspriv2; |
192 | struct xfs_mount *b_mount; | ||
193 | unsigned short b_error; /* error code on I/O */ | ||
194 | unsigned int b_page_count; /* size of page array */ | ||
195 | unsigned int b_offset; /* page offset in first page */ | ||
196 | struct page **b_pages; /* array of page pointers */ | 155 | struct page **b_pages; /* array of page pointers */ |
197 | struct page *b_page_array[XB_PAGES]; /* inline pages */ | 156 | struct page *b_page_array[XB_PAGES]; /* inline pages */ |
157 | unsigned long b_queuetime; /* time buffer was queued */ | ||
158 | atomic_t b_pin_count; /* pin count */ | ||
159 | atomic_t b_io_remaining; /* #outstanding I/O requests */ | ||
160 | unsigned int b_page_count; /* size of page array */ | ||
161 | unsigned int b_offset; /* page offset in first page */ | ||
162 | unsigned short b_error; /* error code on I/O */ | ||
198 | #ifdef XFS_BUF_LOCK_TRACKING | 163 | #ifdef XFS_BUF_LOCK_TRACKING |
199 | int b_last_holder; | 164 | int b_last_holder; |
200 | #endif | 165 | #endif |
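The field ordering above is meant to keep the uncontended-lookup fields within the first cacheline. A compile-time assertion along these lines (an assumption, not in the patch) would document the intent:

	/* Assumed sanity check: everything up to b_flags fits in cacheline 0 */
	BUILD_BUG_ON(offsetof(struct xfs_buf, b_flags) >= L1_CACHE_BYTES);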
@@ -213,11 +178,14 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, | |||
213 | xfs_buf_flags_t); | 178 | xfs_buf_flags_t); |
214 | 179 | ||
215 | extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); | 180 | extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); |
216 | extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); | 181 | extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); |
182 | extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); | ||
217 | extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); | 183 | extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); |
218 | extern void xfs_buf_hold(xfs_buf_t *); | 184 | extern void xfs_buf_hold(xfs_buf_t *); |
219 | extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, | 185 | extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); |
220 | xfs_buf_flags_t); | 186 | struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, |
187 | struct xfs_buftarg *target, | ||
188 | xfs_daddr_t daddr, size_t length, int flags); | ||
221 | 189 | ||
222 | /* Releasing Buffers */ | 190 | /* Releasing Buffers */ |
223 | extern void xfs_buf_free(xfs_buf_t *); | 191 | extern void xfs_buf_free(xfs_buf_t *); |
@@ -242,6 +210,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *); | |||
242 | extern int xfs_buf_iowait(xfs_buf_t *); | 210 | extern int xfs_buf_iowait(xfs_buf_t *); |
243 | extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, | 211 | extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, |
244 | xfs_buf_rw_t); | 212 | xfs_buf_rw_t); |
213 | #define xfs_buf_zero(bp, off, len) \ | ||
214 | xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) | ||
245 | 215 | ||
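The new xfs_buf_zero() wrapper reads naturally at call sites; for example (the offset is illustrative):

	/* Zero the buffer from 'boff' to its end */
	xfs_buf_zero(bp, boff, XFS_BUF_SIZE(bp) - boff);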
246 | static inline int xfs_buf_geterror(xfs_buf_t *bp) | 216 | static inline int xfs_buf_geterror(xfs_buf_t *bp) |
247 | { | 217 | { |
@@ -267,7 +237,8 @@ extern void xfs_buf_terminate(void); | |||
267 | #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ | 237 | #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ |
268 | ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) | 238 | ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) |
269 | 239 | ||
270 | #define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) | 240 | void xfs_buf_stale(struct xfs_buf *bp); |
241 | #define XFS_BUF_STALE(bp) xfs_buf_stale(bp)
271 | #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) | 242 | #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) |
272 | #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) | 243 | #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) |
273 | #define XFS_BUF_SUPER_STALE(bp) do { \ | 244 | #define XFS_BUF_SUPER_STALE(bp) do { \ |
@@ -276,8 +247,6 @@ extern void xfs_buf_terminate(void); | |||
276 | XFS_BUF_DONE(bp); \ | 247 | XFS_BUF_DONE(bp); \ |
277 | } while (0) | 248 | } while (0) |
278 | 249 | ||
279 | #define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) | ||
280 | |||
281 | #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) | 250 | #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) |
282 | #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) | 251 | #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) |
283 | #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) | 252 | #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) |
@@ -320,7 +289,6 @@ extern void xfs_buf_terminate(void); | |||
320 | #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) | 289 | #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) |
321 | #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) | 290 | #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) |
322 | #define XFS_BUF_SET_START(bp) do { } while (0) | 291 | #define XFS_BUF_SET_START(bp) do { } while (0) |
323 | #define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) | ||
324 | 292 | ||
325 | #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) | 293 | #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) |
326 | #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) | 294 | #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) |
@@ -333,9 +301,15 @@ extern void xfs_buf_terminate(void); | |||
333 | #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) | 301 | #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) |
334 | #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) | 302 | #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) |
335 | 303 | ||
336 | #define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) | 304 | static inline void |
305 | xfs_buf_set_ref( | ||
306 | struct xfs_buf *bp, | ||
307 | int lru_ref) | ||
308 | { | ||
309 | atomic_set(&bp->b_lru_ref, lru_ref); | ||
310 | } | ||
311 | #define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) | ||
337 | #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) | 312 | #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) |
338 | #define XFS_BUF_SET_REF(bp, ref) do { } while (0) | ||
339 | 313 | ||
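Callers can now bias buffer reclaim by seeding b_lru_ref; for instance (the call site is illustrative, the reference constant is the existing btree hint):

	/* Let inode btree buffers survive more LRU passes than the default */
	xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);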
340 | #define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) | 314 | #define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) |
341 | 315 | ||
@@ -351,30 +325,15 @@ extern void xfs_buf_terminate(void); | |||
351 | 325 | ||
352 | static inline void xfs_buf_relse(xfs_buf_t *bp) | 326 | static inline void xfs_buf_relse(xfs_buf_t *bp) |
353 | { | 327 | { |
354 | if (!bp->b_relse) | 328 | xfs_buf_unlock(bp); |
355 | xfs_buf_unlock(bp); | ||
356 | xfs_buf_rele(bp); | 329 | xfs_buf_rele(bp); |
357 | } | 330 | } |
358 | 331 | ||
359 | #define xfs_biodone(bp) xfs_buf_ioend(bp, 0) | ||
360 | |||
361 | #define xfs_biomove(bp, off, len, data, rw) \ | ||
362 | xfs_buf_iomove((bp), (off), (len), (data), \ | ||
363 | ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ) | ||
364 | |||
365 | #define xfs_biozero(bp, off, len) \ | ||
366 | xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) | ||
367 | |||
368 | #define xfs_iowait(bp) xfs_buf_iowait(bp) | ||
369 | |||
370 | #define xfs_baread(target, rablkno, ralen) \ | ||
371 | xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK) | ||
372 | |||
373 | |||
374 | /* | 332 | /* |
375 | * Handling of buftargs. | 333 | * Handling of buftargs. |
376 | */ | 334 | */ |
377 | extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); | 335 | extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, |
336 | struct block_device *, int, const char *); | ||
378 | extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); | 337 | extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); |
379 | extern void xfs_wait_buftarg(xfs_buftarg_t *); | 338 | extern void xfs_wait_buftarg(xfs_buftarg_t *); |
380 | extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); | 339 | extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); |
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h deleted file mode 100644 index 55bddf3b6091..000000000000 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ /dev/null | |||
@@ -1,28 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #ifndef __XFS_CRED_H__ | ||
19 | #define __XFS_CRED_H__ | ||
20 | |||
21 | #include <linux/capability.h> | ||
22 | |||
23 | /* | ||
24 | * Credentials | ||
25 | */ | ||
26 | typedef const struct cred cred_t; | ||
27 | |||
28 | #endif /* __XFS_CRED_H__ */ | ||
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c new file mode 100644 index 000000000000..244e797dae32 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.c | |||
@@ -0,0 +1,222 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 Red Hat, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #include "xfs.h" | ||
19 | #include "xfs_sb.h" | ||
20 | #include "xfs_inum.h" | ||
21 | #include "xfs_log.h" | ||
22 | #include "xfs_ag.h" | ||
23 | #include "xfs_mount.h" | ||
24 | #include "xfs_quota.h" | ||
25 | #include "xfs_trans.h" | ||
26 | #include "xfs_alloc_btree.h" | ||
27 | #include "xfs_bmap_btree.h" | ||
28 | #include "xfs_ialloc_btree.h" | ||
29 | #include "xfs_btree.h" | ||
30 | #include "xfs_inode.h" | ||
31 | #include "xfs_alloc.h" | ||
32 | #include "xfs_error.h" | ||
33 | #include "xfs_discard.h" | ||
34 | #include "xfs_trace.h" | ||
35 | |||
36 | STATIC int | ||
37 | xfs_trim_extents( | ||
38 | struct xfs_mount *mp, | ||
39 | xfs_agnumber_t agno, | ||
40 | xfs_fsblock_t start, | ||
41 | xfs_fsblock_t len, | ||
42 | xfs_fsblock_t minlen, | ||
43 | __uint64_t *blocks_trimmed) | ||
44 | { | ||
45 | struct block_device *bdev = mp->m_ddev_targp->bt_bdev; | ||
46 | struct xfs_btree_cur *cur; | ||
47 | struct xfs_buf *agbp; | ||
48 | struct xfs_perag *pag; | ||
49 | int error; | ||
50 | int i; | ||
51 | |||
52 | pag = xfs_perag_get(mp, agno); | ||
53 | |||
54 | error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); | ||
55 | if (error || !agbp) | ||
56 | goto out_put_perag; | ||
57 | |||
58 | cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); | ||
59 | |||
60 | /* | ||
61 | * Force out the log. This means any transactions that might have freed | ||
62 | * space before we took the AGF buffer lock are now on disk, and the | ||
63 | * volatile disk cache is flushed. | ||
64 | */ | ||
65 | xfs_log_force(mp, XFS_LOG_SYNC); | ||
66 | |||
67 | /* | ||
68 | * Look up the longest btree in the AGF and start with it. | ||
69 | */ | ||
70 | error = xfs_alloc_lookup_le(cur, 0, | ||
71 | XFS_BUF_TO_AGF(agbp)->agf_longest, &i); | ||
72 | if (error) | ||
73 | goto out_del_cursor; | ||
74 | |||
75 | /* | ||
76 | * Loop until we are done with all extents that are large | ||
77 | * enough to be worth discarding. | ||
78 | */ | ||
79 | while (i) { | ||
80 | xfs_agblock_t fbno; | ||
81 | xfs_extlen_t flen; | ||
82 | |||
83 | error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); | ||
84 | if (error) | ||
85 | goto out_del_cursor; | ||
86 | XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); | ||
87 | ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); | ||
88 | |||
89 | /* | ||
90 | * Too small? Give up. | ||
91 | */ | ||
92 | if (flen < minlen) { | ||
93 | trace_xfs_discard_toosmall(mp, agno, fbno, flen); | ||
94 | goto out_del_cursor; | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * If the extent is entirely outside of the range we are | ||
99 | * supposed to discard, skip it. Do not bother to trim | ||
100 | * down partially overlapping ranges for now. | ||
101 | */ | ||
102 | if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || | ||
103 | XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { | ||
104 | trace_xfs_discard_exclude(mp, agno, fbno, flen); | ||
105 | goto next_extent; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * If any blocks in the range are still busy, skip the | ||
110 | * discard and try again the next time. | ||
111 | */ | ||
112 | if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { | ||
113 | trace_xfs_discard_busy(mp, agno, fbno, flen); | ||
114 | goto next_extent; | ||
115 | } | ||
116 | |||
117 | trace_xfs_discard_extent(mp, agno, fbno, flen); | ||
118 | error = -blkdev_issue_discard(bdev, | ||
119 | XFS_AGB_TO_DADDR(mp, agno, fbno), | ||
120 | XFS_FSB_TO_BB(mp, flen), | ||
121 | GFP_NOFS, 0); | ||
122 | if (error) | ||
123 | goto out_del_cursor; | ||
124 | *blocks_trimmed += flen; | ||
125 | |||
126 | next_extent: | ||
127 | error = xfs_btree_decrement(cur, 0, &i); | ||
128 | if (error) | ||
129 | goto out_del_cursor; | ||
130 | } | ||
131 | |||
132 | out_del_cursor: | ||
133 | xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); | ||
134 | xfs_buf_relse(agbp); | ||
135 | out_put_perag: | ||
136 | xfs_perag_put(pag); | ||
137 | return error; | ||
138 | } | ||
139 | |||
140 | int | ||
141 | xfs_ioc_trim( | ||
142 | struct xfs_mount *mp, | ||
143 | struct fstrim_range __user *urange) | ||
144 | { | ||
145 | struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; | ||
146 | unsigned int granularity = q->limits.discard_granularity; | ||
147 | struct fstrim_range range; | ||
148 | xfs_fsblock_t start, len, minlen; | ||
149 | xfs_agnumber_t start_agno, end_agno, agno; | ||
150 | __uint64_t blocks_trimmed = 0; | ||
151 | int error, last_error = 0; | ||
152 | |||
153 | if (!capable(CAP_SYS_ADMIN)) | ||
154 | return -XFS_ERROR(EPERM); | ||
155 | if (!blk_queue_discard(q)) | ||
156 | return -XFS_ERROR(EOPNOTSUPP); | ||
157 | if (copy_from_user(&range, urange, sizeof(range))) | ||
158 | return -XFS_ERROR(EFAULT); | ||
159 | |||
160 | /* | ||
161 | * Truncating down the len isn't actually quite correct, but using | ||
162 | * XFS_B_TO_FSB would mean we trivially get overflows for values | ||
163 | * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default | ||
164 | * used by the fstrim application. In the end it really doesn't | ||
165 | * matter as trimming blocks is an advisory interface. | ||
166 | */ | ||
167 | start = XFS_B_TO_FSBT(mp, range.start); | ||
168 | len = XFS_B_TO_FSBT(mp, range.len); | ||
169 | minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); | ||
170 | |||
171 | start_agno = XFS_FSB_TO_AGNO(mp, start); | ||
172 | if (start_agno >= mp->m_sb.sb_agcount) | ||
173 | return -XFS_ERROR(EINVAL); | ||
174 | |||
175 | end_agno = XFS_FSB_TO_AGNO(mp, start + len); | ||
176 | if (end_agno >= mp->m_sb.sb_agcount) | ||
177 | end_agno = mp->m_sb.sb_agcount - 1; | ||
178 | |||
179 | for (agno = start_agno; agno <= end_agno; agno++) { | ||
180 | error = -xfs_trim_extents(mp, agno, start, len, minlen, | ||
181 | &blocks_trimmed); | ||
182 | if (error) | ||
183 | last_error = error; | ||
184 | } | ||
185 | |||
186 | if (last_error) | ||
187 | return last_error; | ||
188 | |||
189 | range.len = XFS_FSB_TO_B(mp, blocks_trimmed); | ||
190 | if (copy_to_user(urange, &range, sizeof(range))) | ||
191 | return -XFS_ERROR(EFAULT); | ||
192 | return 0; | ||
193 | } | ||
194 | |||
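From userspace, xfs_ioc_trim() is reached through the generic FITRIM ioctl; a minimal caller might look like this (mount point and error handling are illustrative):

	#include <fcntl.h>
	#include <limits.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

	int main(void)
	{
		struct fstrim_range r = { .start = 0, .len = ULLONG_MAX, .minlen = 0 };
		int fd = open("/mnt/xfs", O_RDONLY);

		if (fd >= 0 && ioctl(fd, FITRIM, &r) == 0)
			printf("trimmed %llu bytes\n", (unsigned long long)r.len);
		return 0;
	}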
195 | int | ||
196 | xfs_discard_extents( | ||
197 | struct xfs_mount *mp, | ||
198 | struct list_head *list) | ||
199 | { | ||
200 | struct xfs_busy_extent *busyp; | ||
201 | int error = 0; | ||
202 | |||
203 | list_for_each_entry(busyp, list, list) { | ||
204 | trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, | ||
205 | busyp->length); | ||
206 | |||
207 | error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, | ||
208 | XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), | ||
209 | XFS_FSB_TO_BB(mp, busyp->length), | ||
210 | GFP_NOFS, 0); | ||
211 | if (error && error != EOPNOTSUPP) { | ||
212 | xfs_info(mp, | ||
213 | "discard failed for extent [0x%llu,%u], error %d", | ||
214 | (unsigned long long)busyp->bno, | ||
215 | busyp->length, | ||
216 | error); | ||
217 | return error; | ||
218 | } | ||
219 | } | ||
220 | |||
221 | return 0; | ||
222 | } | ||
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h new file mode 100644 index 000000000000..344879aea646 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_discard.h | |||
@@ -0,0 +1,10 @@ | |||
1 | #ifndef XFS_DISCARD_H | ||
2 | #define XFS_DISCARD_H 1 | ||
3 | |||
4 | struct fstrim_range; | ||
5 | struct list_head; | ||
6 | |||
7 | extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); | ||
8 | extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); | ||
9 | |||
10 | #endif /* XFS_DISCARD_H */ | ||
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 3764d74790ec..f4f878fc0083 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c | |||
@@ -70,8 +70,16 @@ xfs_fs_encode_fh( | |||
70 | else | 70 | else |
71 | fileid_type = FILEID_INO32_GEN_PARENT; | 71 | fileid_type = FILEID_INO32_GEN_PARENT; |
72 | 72 | ||
73 | /* filesystem may contain 64bit inode numbers */ | 73 | /* |
74 | if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) | 74 | * If the filesystem may contain 64bit inode numbers, we need
75 | * to use larger file handles that can represent them. | ||
76 | * | ||
77 | * While we only allocate inodes that do not fit into 32 bits any | ||
78 | * large enough filesystem may contain them, thus the slightly | ||
79 | * confusing looking conditional below. | ||
80 | */ | ||
81 | if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || | ||
82 | (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) | ||
75 | fileid_type |= XFS_FILEID_TYPE_64FLAG; | 83 | fileid_type |= XFS_FILEID_TYPE_64FLAG; |
76 | 84 | ||
77 | /* | 85 | /* |
@@ -81,8 +89,10 @@ xfs_fs_encode_fh( | |||
81 | * seven combinations work. The real answer is "don't use v2". | 89 | * seven combinations work. The real answer is "don't use v2". |
82 | */ | 90 | */ |
83 | len = xfs_fileid_length(fileid_type); | 91 | len = xfs_fileid_length(fileid_type); |
84 | if (*max_len < len) | 92 | if (*max_len < len) { |
93 | *max_len = len; | ||
85 | return 255; | 94 | return 255; |
95 | } | ||
86 | *max_len = len; | 96 | *max_len = len; |
87 | 97 | ||
88 | switch (fileid_type) { | 98 | switch (fileid_type) { |
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ba8ad422a165..7f782af286bf 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c | |||
@@ -37,10 +37,45 @@ | |||
37 | #include "xfs_trace.h" | 37 | #include "xfs_trace.h" |
38 | 38 | ||
39 | #include <linux/dcache.h> | 39 | #include <linux/dcache.h> |
40 | #include <linux/falloc.h> | ||
40 | 41 | ||
41 | static const struct vm_operations_struct xfs_file_vm_ops; | 42 | static const struct vm_operations_struct xfs_file_vm_ops; |
42 | 43 | ||
43 | /* | 44 | /* |
45 | * Locking primitives for read and write IO paths to ensure we consistently use | ||
46 | * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. | ||
47 | */ | ||
48 | static inline void | ||
49 | xfs_rw_ilock( | ||
50 | struct xfs_inode *ip, | ||
51 | int type) | ||
52 | { | ||
53 | if (type & XFS_IOLOCK_EXCL) | ||
54 | mutex_lock(&VFS_I(ip)->i_mutex); | ||
55 | xfs_ilock(ip, type); | ||
56 | } | ||
57 | |||
58 | static inline void | ||
59 | xfs_rw_iunlock( | ||
60 | struct xfs_inode *ip, | ||
61 | int type) | ||
62 | { | ||
63 | xfs_iunlock(ip, type); | ||
64 | if (type & XFS_IOLOCK_EXCL) | ||
65 | mutex_unlock(&VFS_I(ip)->i_mutex); | ||
66 | } | ||
67 | |||
68 | static inline void | ||
69 | xfs_rw_ilock_demote( | ||
70 | struct xfs_inode *ip, | ||
71 | int type) | ||
72 | { | ||
73 | xfs_ilock_demote(ip, type); | ||
74 | if (type & XFS_IOLOCK_EXCL) | ||
75 | mutex_unlock(&VFS_I(ip)->i_mutex); | ||
76 | } | ||
77 | |||
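These helpers pin the ordering i_mutex -> i_iolock -> i_lock. A typical write-path sequence (as the later hunks use it) looks like:

	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);	/* i_mutex first */
	/* ... pre-write checks and EOF zeroing ... */
	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);	/* keep i_mutex and the iolock */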
78 | /* | ||
44 | * xfs_iozero | 79 | * xfs_iozero |
45 | * | 80 | * |
46 | * xfs_iozero clears the specified range of buffer supplied, | 81 | * xfs_iozero clears the specified range of buffer supplied, |
@@ -96,19 +131,34 @@ xfs_file_fsync( | |||
96 | { | 131 | { |
97 | struct inode *inode = file->f_mapping->host; | 132 | struct inode *inode = file->f_mapping->host; |
98 | struct xfs_inode *ip = XFS_I(inode); | 133 | struct xfs_inode *ip = XFS_I(inode); |
134 | struct xfs_mount *mp = ip->i_mount; | ||
99 | struct xfs_trans *tp; | 135 | struct xfs_trans *tp; |
100 | int error = 0; | 136 | int error = 0; |
101 | int log_flushed = 0; | 137 | int log_flushed = 0; |
102 | 138 | ||
103 | trace_xfs_file_fsync(ip); | 139 | trace_xfs_file_fsync(ip); |
104 | 140 | ||
105 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | 141 | if (XFS_FORCED_SHUTDOWN(mp)) |
106 | return -XFS_ERROR(EIO); | 142 | return -XFS_ERROR(EIO); |
107 | 143 | ||
108 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | 144 | xfs_iflags_clear(ip, XFS_ITRUNCATED); |
109 | 145 | ||
110 | xfs_ioend_wait(ip); | 146 | xfs_ioend_wait(ip); |
111 | 147 | ||
148 | if (mp->m_flags & XFS_MOUNT_BARRIER) { | ||
149 | /* | ||
150 | * If we have an RT and/or log subvolume we need to make sure | ||
151 | * to flush the write cache the device used for file data | ||
152 | * first. This is to ensure newly written file data make | ||
153 | * it to disk before logging the new inode size in case of | ||
154 | * an extending write. | ||
155 | */ | ||
156 | if (XFS_IS_REALTIME_INODE(ip)) | ||
157 | xfs_blkdev_issue_flush(mp->m_rtdev_targp); | ||
158 | else if (mp->m_logdev_targp != mp->m_ddev_targp) | ||
159 | xfs_blkdev_issue_flush(mp->m_ddev_targp); | ||
160 | } | ||
161 | |||
112 | /* | 162 | /* |
113 | * We always need to make sure that the required inode state is safe on | 163 | * We always need to make sure that the required inode state is safe on |
114 | * disk. The inode might be clean but we still might need to force the | 164 | * disk. The inode might be clean but we still might need to force the |
@@ -140,9 +190,9 @@ xfs_file_fsync( | |||
140 | * updates. The sync transaction will also force the log. | 190 | * updates. The sync transaction will also force the log. |
141 | */ | 191 | */ |
142 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 192 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
143 | tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); | 193 | tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); |
144 | error = xfs_trans_reserve(tp, 0, | 194 | error = xfs_trans_reserve(tp, 0, |
145 | XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); | 195 | XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); |
146 | if (error) { | 196 | if (error) { |
147 | xfs_trans_cancel(tp, 0); | 197 | xfs_trans_cancel(tp, 0); |
148 | return -error; | 198 | return -error; |
@@ -174,28 +224,25 @@ xfs_file_fsync( | |||
174 | * force the log. | 224 | * force the log. |
175 | */ | 225 | */ |
176 | if (xfs_ipincount(ip)) { | 226 | if (xfs_ipincount(ip)) { |
177 | error = _xfs_log_force_lsn(ip->i_mount, | 227 | error = _xfs_log_force_lsn(mp, |
178 | ip->i_itemp->ili_last_lsn, | 228 | ip->i_itemp->ili_last_lsn, |
179 | XFS_LOG_SYNC, &log_flushed); | 229 | XFS_LOG_SYNC, &log_flushed); |
180 | } | 230 | } |
181 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 231 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
182 | } | 232 | } |
183 | 233 | ||
184 | if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { | 234 | /* |
185 | /* | 235 | * If we only have a single device, and the log force above was
186 | * If the log write didn't issue an ordered tag we need | 236 | * a no-op, we might have to flush the data device cache here.
187 | * to flush the disk cache for the data device now. | 237 | * This can only happen for fdatasync/O_DSYNC if we were overwriting |
188 | */ | 238 | * an already allocated file and thus do not have any metadata to |
189 | if (!log_flushed) | 239 | * commit. |
190 | xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); | 240 | */ |
191 | 241 | if ((mp->m_flags & XFS_MOUNT_BARRIER) && | |
192 | /* | 242 | mp->m_logdev_targp == mp->m_ddev_targp && |
193 | * If this inode is on the RT dev we need to flush that | 243 | !XFS_IS_REALTIME_INODE(ip) && |
194 | * cache as well. | 244 | !log_flushed) |
195 | */ | 245 | xfs_blkdev_issue_flush(mp->m_ddev_targp); |
196 | if (XFS_IS_REALTIME_INODE(ip)) | ||
197 | xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); | ||
198 | } | ||
199 | 246 | ||
200 | return -error; | 247 | return -error; |
201 | } | 248 | } |
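The !log_flushed case matters for pure overwrites: the log force is a no-op, so only the explicit flush makes the data durable. From userspace the pattern is simply (buffer and size are illustrative):

	/* Overwrite of already-allocated blocks: no metadata to log */
	pwrite(fd, buf, 4096, 0);
	fdatasync(fd);		/* ends up in the explicit cache flush above */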
@@ -262,22 +309,21 @@ xfs_file_aio_read( | |||
262 | if (XFS_FORCED_SHUTDOWN(mp)) | 309 | if (XFS_FORCED_SHUTDOWN(mp)) |
263 | return -EIO; | 310 | return -EIO; |
264 | 311 | ||
265 | if (unlikely(ioflags & IO_ISDIRECT)) | ||
266 | mutex_lock(&inode->i_mutex); | ||
267 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | ||
268 | |||
269 | if (unlikely(ioflags & IO_ISDIRECT)) { | 312 | if (unlikely(ioflags & IO_ISDIRECT)) { |
313 | xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); | ||
314 | |||
270 | if (inode->i_mapping->nrpages) { | 315 | if (inode->i_mapping->nrpages) { |
271 | ret = -xfs_flushinval_pages(ip, | 316 | ret = -xfs_flushinval_pages(ip, |
272 | (iocb->ki_pos & PAGE_CACHE_MASK), | 317 | (iocb->ki_pos & PAGE_CACHE_MASK), |
273 | -1, FI_REMAPF_LOCKED); | 318 | -1, FI_REMAPF_LOCKED); |
319 | if (ret) { | ||
320 | xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); | ||
321 | return ret; | ||
322 | } | ||
274 | } | 323 | } |
275 | mutex_unlock(&inode->i_mutex); | 324 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); |
276 | if (ret) { | 325 | } else |
277 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 326 | xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); |
278 | return ret; | ||
279 | } | ||
280 | } | ||
281 | 327 | ||
282 | trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); | 328 | trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); |
283 | 329 | ||
@@ -285,7 +331,7 @@ xfs_file_aio_read( | |||
285 | if (ret > 0) | 331 | if (ret > 0) |
286 | XFS_STATS_ADD(xs_read_bytes, ret); | 332 | XFS_STATS_ADD(xs_read_bytes, ret); |
287 | 333 | ||
288 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 334 | xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); |
289 | return ret; | 335 | return ret; |
290 | } | 336 | } |
291 | 337 | ||
@@ -309,7 +355,7 @@ xfs_file_splice_read( | |||
309 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | 355 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
310 | return -EIO; | 356 | return -EIO; |
311 | 357 | ||
312 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | 358 | xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); |
313 | 359 | ||
314 | trace_xfs_file_splice_read(ip, count, *ppos, ioflags); | 360 | trace_xfs_file_splice_read(ip, count, *ppos, ioflags); |
315 | 361 | ||
@@ -317,10 +363,61 @@ xfs_file_splice_read( | |||
317 | if (ret > 0) | 363 | if (ret > 0) |
318 | XFS_STATS_ADD(xs_read_bytes, ret); | 364 | XFS_STATS_ADD(xs_read_bytes, ret); |
319 | 365 | ||
320 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 366 | xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); |
321 | return ret; | 367 | return ret; |
322 | } | 368 | } |
323 | 369 | ||
370 | STATIC void | ||
371 | xfs_aio_write_isize_update( | ||
372 | struct inode *inode, | ||
373 | loff_t *ppos, | ||
374 | ssize_t bytes_written) | ||
375 | { | ||
376 | struct xfs_inode *ip = XFS_I(inode); | ||
377 | xfs_fsize_t isize = i_size_read(inode); | ||
378 | |||
379 | if (bytes_written > 0) | ||
380 | XFS_STATS_ADD(xs_write_bytes, bytes_written); | ||
381 | |||
382 | if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && | ||
383 | *ppos > isize)) | ||
384 | *ppos = isize; | ||
385 | |||
386 | if (*ppos > ip->i_size) { | ||
387 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL); | ||
388 | if (*ppos > ip->i_size) | ||
389 | ip->i_size = *ppos; | ||
390 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); | ||
391 | } | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * If this was a direct or synchronous I/O that failed (such as ENOSPC) then | ||
396 | * part of the I/O may have been written to disk before the error occurred. In | ||
397 | * this case the on-disk file size may have been adjusted beyond the in-memory | ||
398 | * file size and now needs to be truncated back. | ||
399 | */ | ||
400 | STATIC void | ||
401 | xfs_aio_write_newsize_update( | ||
402 | struct xfs_inode *ip) | ||
403 | { | ||
404 | if (ip->i_new_size) { | ||
405 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL); | ||
406 | ip->i_new_size = 0; | ||
407 | if (ip->i_d.di_size > ip->i_size) | ||
408 | ip->i_d.di_size = ip->i_size; | ||
409 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); | ||
410 | } | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * xfs_file_splice_write() does not use xfs_rw_ilock() because | ||
415 | * generic_file_splice_write() takes the i_mutex itself. This, in theory, | ||
416 | could cause lock inversions between the aio_write path and the splice path | ||
417 | * if someone is doing concurrent splice(2) based writes and write(2) based | ||
418 | * writes to the same inode. The only real way to fix this is to re-implement | ||
419 | * the generic code here with correct locking orders. | ||
420 | */ | ||
324 | STATIC ssize_t | 421 | STATIC ssize_t |
325 | xfs_file_splice_write( | 422 | xfs_file_splice_write( |
326 | struct pipe_inode_info *pipe, | 423 | struct pipe_inode_info *pipe, |
@@ -331,7 +428,7 @@ xfs_file_splice_write( | |||
331 | { | 428 | { |
332 | struct inode *inode = outfilp->f_mapping->host; | 429 | struct inode *inode = outfilp->f_mapping->host; |
333 | struct xfs_inode *ip = XFS_I(inode); | 430 | struct xfs_inode *ip = XFS_I(inode); |
334 | xfs_fsize_t isize, new_size; | 431 | xfs_fsize_t new_size; |
335 | int ioflags = 0; | 432 | int ioflags = 0; |
336 | ssize_t ret; | 433 | ssize_t ret; |
337 | 434 | ||
@@ -355,27 +452,9 @@ xfs_file_splice_write( | |||
355 | trace_xfs_file_splice_write(ip, count, *ppos, ioflags); | 452 | trace_xfs_file_splice_write(ip, count, *ppos, ioflags); |
356 | 453 | ||
357 | ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); | 454 | ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); |
358 | if (ret > 0) | ||
359 | XFS_STATS_ADD(xs_write_bytes, ret); | ||
360 | 455 | ||
361 | isize = i_size_read(inode); | 456 | xfs_aio_write_isize_update(inode, ppos, ret); |
362 | if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) | 457 | xfs_aio_write_newsize_update(ip); |
363 | *ppos = isize; | ||
364 | |||
365 | if (*ppos > ip->i_size) { | ||
366 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
367 | if (*ppos > ip->i_size) | ||
368 | ip->i_size = *ppos; | ||
369 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
370 | } | ||
371 | |||
372 | if (ip->i_new_size) { | ||
373 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
374 | ip->i_new_size = 0; | ||
375 | if (ip->i_d.di_size > ip->i_size) | ||
376 | ip->i_d.di_size = ip->i_size; | ||
377 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
378 | } | ||
379 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | 458 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
380 | return ret; | 459 | return ret; |
381 | } | 460 | } |
@@ -562,247 +641,318 @@ out_lock: | |||
562 | return error; | 641 | return error; |
563 | } | 642 | } |
564 | 643 | ||
644 | /* | ||
645 | * Common pre-write limit and setup checks. | ||
646 | * | ||
647 | * Returns with iolock held according to @iolock. | ||
648 | */ | ||
565 | STATIC ssize_t | 649 | STATIC ssize_t |
566 | xfs_file_aio_write( | 650 | xfs_file_aio_write_checks( |
567 | struct kiocb *iocb, | 651 | struct file *file, |
568 | const struct iovec *iovp, | 652 | loff_t *pos, |
569 | unsigned long nr_segs, | 653 | size_t *count, |
570 | loff_t pos) | 654 | int *iolock) |
571 | { | 655 | { |
572 | struct file *file = iocb->ki_filp; | 656 | struct inode *inode = file->f_mapping->host; |
573 | struct address_space *mapping = file->f_mapping; | ||
574 | struct inode *inode = mapping->host; | ||
575 | struct xfs_inode *ip = XFS_I(inode); | 657 | struct xfs_inode *ip = XFS_I(inode); |
576 | struct xfs_mount *mp = ip->i_mount; | 658 | xfs_fsize_t new_size; |
577 | ssize_t ret = 0, error = 0; | 659 | int error = 0; |
578 | int ioflags = 0; | ||
579 | xfs_fsize_t isize, new_size; | ||
580 | int iolock; | ||
581 | size_t ocount = 0, count; | ||
582 | int need_i_mutex; | ||
583 | 660 | ||
584 | XFS_STATS_INC(xs_write_calls); | 661 | error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); |
662 | if (error) { | ||
663 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); | ||
664 | *iolock = 0; | ||
665 | return error; | ||
666 | } | ||
585 | 667 | ||
586 | BUG_ON(iocb->ki_pos != pos); | 668 | new_size = *pos + *count; |
669 | if (new_size > ip->i_size) | ||
670 | ip->i_new_size = new_size; | ||
587 | 671 | ||
588 | if (unlikely(file->f_flags & O_DIRECT)) | 672 | if (likely(!(file->f_mode & FMODE_NOCMTIME))) |
589 | ioflags |= IO_ISDIRECT; | 673 | file_update_time(file); |
590 | if (file->f_mode & FMODE_NOCMTIME) | ||
591 | ioflags |= IO_INVIS; | ||
592 | 674 | ||
593 | error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); | 675 | /* |
676 | * If the offset is beyond the size of the file, we need to zero any | ||
677 | * blocks that fall between the existing EOF and the start of this | ||
678 | * write. | ||
679 | */ | ||
680 | if (*pos > ip->i_size) | ||
681 | error = -xfs_zero_eof(ip, *pos, ip->i_size); | ||
682 | |||
683 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); | ||
594 | if (error) | 684 | if (error) |
595 | return error; | 685 | return error; |
596 | 686 | ||
597 | count = ocount; | 687 | /* |
598 | if (count == 0) | 688 | * If we're writing the file then make sure to clear the setuid and |
599 | return 0; | 689 | * setgid bits if the process is not being run by root. This keeps |
600 | 690 | * people from modifying setuid and setgid binaries. | |
601 | xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); | 691 | */ |
692 | return file_remove_suid(file); | ||
602 | 693 | ||
603 | if (XFS_FORCED_SHUTDOWN(mp)) | 694 | } |
604 | return -EIO; | ||
605 | 695 | ||
606 | relock: | 696 | /* |
607 | if (ioflags & IO_ISDIRECT) { | 697 | * xfs_file_dio_aio_write - handle direct IO writes |
608 | iolock = XFS_IOLOCK_SHARED; | 698 | * |
609 | need_i_mutex = 0; | 700 | * By separating it from the buffered write path we remove all the tricky-to-
610 | } else { | 700 | * By separating it from the buffered write path we remove all the tricky to |
611 | iolock = XFS_IOLOCK_EXCL; | 701 | * follow locking changes and looping. |
612 | need_i_mutex = 1; | 702 | * |
613 | mutex_lock(&inode->i_mutex); | 703 | * If there are cached pages or we're extending the file, we need IOLOCK_EXCL |
704 | * until we're sure the bytes at the new EOF have been zeroed and/or the cached | ||
705 | * pages are flushed out. | ||
706 | * | ||
707 | * In most cases the direct IO writes will be done holding IOLOCK_SHARED | ||
708 | * allowing them to be done in parallel with reads and other direct IO writes. | ||
709 | * However, if the IO is not aligned to filesystem blocks, the direct IO layer | ||
710 | * needs to do sub-block zeroing and that requires serialisation against other | ||
711 | * direct IOs to the same block. In this case we need to serialise the | ||
712 | * submission of the unaligned IOs so that we don't get racing block zeroing in | ||
713 | * the dio layer. To avoid the problem with aio, we also need to wait for | ||
714 | * outstanding IOs to complete so that unwritten extent conversion is completed | ||
715 | * before we try to map the overlapping block. This is currently implemented by | ||
716 | * hitting it with a big hammer (i.e. xfs_ioend_wait()). | ||
717 | * | ||
718 | * Returns with locks held indicated by @iolock and errors indicated by | ||
719 | * negative return values. | ||
720 | */ | ||
721 | STATIC ssize_t | ||
722 | xfs_file_dio_aio_write( | ||
723 | struct kiocb *iocb, | ||
724 | const struct iovec *iovp, | ||
725 | unsigned long nr_segs, | ||
726 | loff_t pos, | ||
727 | size_t ocount, | ||
728 | int *iolock) | ||
729 | { | ||
730 | struct file *file = iocb->ki_filp; | ||
731 | struct address_space *mapping = file->f_mapping; | ||
732 | struct inode *inode = mapping->host; | ||
733 | struct xfs_inode *ip = XFS_I(inode); | ||
734 | struct xfs_mount *mp = ip->i_mount; | ||
735 | ssize_t ret = 0; | ||
736 | size_t count = ocount; | ||
737 | int unaligned_io = 0; | ||
738 | struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? | ||
739 | mp->m_rtdev_targp : mp->m_ddev_targp; | ||
740 | |||
741 | *iolock = 0; | ||
742 | if ((pos & target->bt_smask) || (count & target->bt_smask)) | ||
743 | return -XFS_ERROR(EINVAL); | ||
744 | |||
745 | if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) | ||
746 | unaligned_io = 1; | ||
747 | |||
748 | if (unaligned_io || mapping->nrpages || pos > ip->i_size) | ||
749 | *iolock = XFS_IOLOCK_EXCL; | ||
750 | else | ||
751 | *iolock = XFS_IOLOCK_SHARED; | ||
752 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); | ||
753 | |||
754 | ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); | ||
755 | if (ret) | ||
756 | return ret; | ||
757 | |||
758 | if (mapping->nrpages) { | ||
759 | WARN_ON(*iolock != XFS_IOLOCK_EXCL); | ||
760 | ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, | ||
761 | FI_REMAPF_LOCKED); | ||
762 | if (ret) | ||
763 | return ret; | ||
614 | } | 764 | } |
615 | 765 | ||
616 | xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); | 766 | /* |
617 | 767 | * If we are doing unaligned IO, wait for all other IO to drain, | |
618 | start: | 768 | * otherwise demote the lock if we had to flush cached pages |
619 | error = -generic_write_checks(file, &pos, &count, | 769 | */ |
620 | S_ISBLK(inode->i_mode)); | 770 | if (unaligned_io) |
621 | if (error) { | 771 | xfs_ioend_wait(ip); |
622 | xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); | 772 | else if (*iolock == XFS_IOLOCK_EXCL) { |
623 | goto out_unlock_mutex; | 773 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); |
774 | *iolock = XFS_IOLOCK_SHARED; | ||
624 | } | 775 | } |
625 | 776 | ||
626 | if (ioflags & IO_ISDIRECT) { | 777 | trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); |
627 | xfs_buftarg_t *target = | 778 | ret = generic_file_direct_write(iocb, iovp, |
628 | XFS_IS_REALTIME_INODE(ip) ? | 779 | &nr_segs, pos, &iocb->ki_pos, count, ocount); |
629 | mp->m_rtdev_targp : mp->m_ddev_targp; | ||
630 | 780 | ||
631 | if ((pos & target->bt_smask) || (count & target->bt_smask)) { | 781 | /* No fallback to buffered IO on errors for XFS. */ |
632 | xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); | 782 | ASSERT(ret < 0 || ret == count); |
633 | return XFS_ERROR(-EINVAL); | 783 | return ret; |
634 | } | 784 | } |
635 | 785 | ||
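As an illustration of the unaligned_io path: on a filesystem with 4k blocks, a sector-sized direct write passes the bt_smask check but is sub-block, so it takes XFS_IOLOCK_EXCL and waits for in-flight I/O (sizes below are illustrative):

	/* 512-byte O_DIRECT write on a 4k-block fs: sector-aligned, but
	 * (pos + count) & m_blockmask != 0, hence unaligned_io = 1 */
	void *buf;
	int fd = open("file", O_WRONLY | O_DIRECT);

	posix_memalign(&buf, 512, 512);
	pwrite(fd, buf, 512, 0);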
636 | if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { | 786 | STATIC ssize_t |
637 | xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); | 787 | xfs_file_buffered_aio_write( |
638 | iolock = XFS_IOLOCK_EXCL; | 788 | struct kiocb *iocb, |
639 | need_i_mutex = 1; | 789 | const struct iovec *iovp, |
640 | mutex_lock(&inode->i_mutex); | 790 | unsigned long nr_segs, |
641 | xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); | 791 | loff_t pos, |
642 | goto start; | 792 | size_t ocount, |
643 | } | 793 | int *iolock) |
644 | } | 794 | { |
795 | struct file *file = iocb->ki_filp; | ||
796 | struct address_space *mapping = file->f_mapping; | ||
797 | struct inode *inode = mapping->host; | ||
798 | struct xfs_inode *ip = XFS_I(inode); | ||
799 | ssize_t ret; | ||
800 | int enospc = 0; | ||
801 | size_t count = ocount; | ||
645 | 802 | ||
646 | new_size = pos + count; | 803 | *iolock = XFS_IOLOCK_EXCL; |
647 | if (new_size > ip->i_size) | 804 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); |
648 | ip->i_new_size = new_size; | ||
649 | 805 | ||
650 | if (likely(!(ioflags & IO_INVIS))) | 806 | ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); |
651 | file_update_time(file); | 807 | if (ret) |
808 | return ret; | ||
809 | |||
810 | /* We can write back this queue in page reclaim */ | ||
811 | current->backing_dev_info = mapping->backing_dev_info; | ||
652 | 812 | ||
813 | write_retry: | ||
814 | trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); | ||
815 | ret = generic_file_buffered_write(iocb, iovp, nr_segs, | ||
816 | pos, &iocb->ki_pos, count, ret); | ||
653 | /* | 817 | /* |
654 | * If the offset is beyond the size of the file, we have a couple | 818 | * if we just got an ENOSPC, flush the inode now we aren't holding any |
655 | * of things to do. First, if there is already space allocated | 819 | * page locks and retry *once* |
656 | * we need to either create holes or zero the disk or ... | ||
657 | * | ||
658 | * If there is a page where the previous size lands, we need | ||
659 | * to zero it out up to the new size. | ||
660 | */ | 820 | */ |
661 | 821 | if (ret == -ENOSPC && !enospc) { | |
662 | if (pos > ip->i_size) { | 822 | ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); |
663 | error = xfs_zero_eof(ip, pos, ip->i_size); | 823 | if (ret) |
664 | if (error) { | 824 | return ret; |
665 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 825 | enospc = 1; |
666 | goto out_unlock_internal; | 826 | goto write_retry; |
667 | } | ||
668 | } | 827 | } |
669 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 828 | current->backing_dev_info = NULL; |
829 | return ret; | ||
830 | } | ||
670 | 831 | ||
671 | /* | 832 | STATIC ssize_t |
672 | * If we're writing the file then make sure to clear the | 833 | xfs_file_aio_write( |
673 | * setuid and setgid bits if the process is not being run | 834 | struct kiocb *iocb, |
674 | * by root. This keeps people from modifying setuid and | 835 | const struct iovec *iovp, |
675 | * setgid binaries. | 836 | unsigned long nr_segs, |
676 | */ | 837 | loff_t pos) |
677 | error = -file_remove_suid(file); | 838 | { |
678 | if (unlikely(error)) | 839 | struct file *file = iocb->ki_filp; |
679 | goto out_unlock_internal; | 840 | struct address_space *mapping = file->f_mapping; |
841 | struct inode *inode = mapping->host; | ||
842 | struct xfs_inode *ip = XFS_I(inode); | ||
843 | ssize_t ret; | ||
844 | int iolock; | ||
845 | size_t ocount = 0; | ||
680 | 846 | ||
681 | /* We can write back this queue in page reclaim */ | 847 | XFS_STATS_INC(xs_write_calls); |
682 | current->backing_dev_info = mapping->backing_dev_info; | ||
683 | 848 | ||
684 | if ((ioflags & IO_ISDIRECT)) { | 849 | BUG_ON(iocb->ki_pos != pos); |
685 | if (mapping->nrpages) { | ||
686 | WARN_ON(need_i_mutex == 0); | ||
687 | error = xfs_flushinval_pages(ip, | ||
688 | (pos & PAGE_CACHE_MASK), | ||
689 | -1, FI_REMAPF_LOCKED); | ||
690 | if (error) | ||
691 | goto out_unlock_internal; | ||
692 | } | ||
693 | 850 | ||
694 | if (need_i_mutex) { | 851 | ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); |
695 | /* demote the lock now the cached pages are gone */ | 852 | if (ret) |
696 | xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); | 853 | return ret; |
697 | mutex_unlock(&inode->i_mutex); | ||
698 | 854 | ||
699 | iolock = XFS_IOLOCK_SHARED; | 855 | if (ocount == 0) |
700 | need_i_mutex = 0; | 856 | return 0; |
701 | } | ||
702 | 857 | ||
703 | trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); | 858 | xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); |
704 | ret = generic_file_direct_write(iocb, iovp, | ||
705 | &nr_segs, pos, &iocb->ki_pos, count, ocount); | ||
706 | 859 | ||
707 | /* | 860 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
708 | * direct-io write to a hole: fall through to buffered I/O | 861 | return -EIO; |
709 | * for completing the rest of the request. | ||
710 | */ | ||
711 | if (ret >= 0 && ret != count) { | ||
712 | XFS_STATS_ADD(xs_write_bytes, ret); | ||
713 | 862 | ||
714 | pos += ret; | 863 | if (unlikely(file->f_flags & O_DIRECT)) |
715 | count -= ret; | 864 | ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, |
865 | ocount, &iolock); | ||
866 | else | ||
867 | ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, | ||
868 | ocount, &iolock); | ||
716 | 869 | ||
717 | ioflags &= ~IO_ISDIRECT; | 870 | xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); |
718 | xfs_iunlock(ip, iolock); | ||
719 | goto relock; | ||
720 | } | ||
721 | } else { | ||
722 | int enospc = 0; | ||
723 | ssize_t ret2 = 0; | ||
724 | 871 | ||
725 | write_retry: | 872 | if (ret <= 0) |
726 | trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); | 873 | goto out_unlock; |
727 | ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, | ||
728 | pos, &iocb->ki_pos, count, ret); | ||
729 | /* | ||
730 | * if we just got an ENOSPC, flush the inode now we | ||
731 | * aren't holding any page locks and retry *once* | ||
732 | */ | ||
733 | if (ret2 == -ENOSPC && !enospc) { | ||
734 | error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); | ||
735 | if (error) | ||
736 | goto out_unlock_internal; | ||
737 | enospc = 1; | ||
738 | goto write_retry; | ||
739 | } | ||
740 | ret = ret2; | ||
741 | } | ||
742 | 874 | ||
743 | current->backing_dev_info = NULL; | 875 | /* Handle various SYNC-type writes */ |
876 | if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { | ||
877 | loff_t end = pos + ret - 1; | ||
878 | int error, error2; | ||
744 | 879 | ||
745 | isize = i_size_read(inode); | 880 | xfs_rw_iunlock(ip, iolock); |
746 | if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) | 881 | error = filemap_write_and_wait_range(mapping, pos, end); |
747 | iocb->ki_pos = isize; | 882 | xfs_rw_ilock(ip, iolock); |
748 | 883 | ||
749 | if (iocb->ki_pos > ip->i_size) { | 884 | error2 = -xfs_file_fsync(file, |
750 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 885 | (file->f_flags & __O_SYNC) ? 0 : 1); |
751 | if (iocb->ki_pos > ip->i_size) | 886 | if (error) |
752 | ip->i_size = iocb->ki_pos; | 887 | ret = error; |
753 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 888 | else if (error2) |
889 | ret = error2; | ||
754 | } | 890 | } |
755 | 891 | ||
756 | error = -ret; | 892 | out_unlock: |
757 | if (ret <= 0) | 893 | xfs_aio_write_newsize_update(ip); |
758 | goto out_unlock_internal; | 894 | xfs_rw_iunlock(ip, iolock); |
895 | return ret; | ||
896 | } | ||
759 | 897 | ||
760 | XFS_STATS_ADD(xs_write_bytes, ret); | 898 | STATIC long |
899 | xfs_file_fallocate( | ||
900 | struct file *file, | ||
901 | int mode, | ||
902 | loff_t offset, | ||
903 | loff_t len) | ||
904 | { | ||
905 | struct inode *inode = file->f_path.dentry->d_inode; | ||
906 | long error; | ||
907 | loff_t new_size = 0; | ||
908 | xfs_flock64_t bf; | ||
909 | xfs_inode_t *ip = XFS_I(inode); | ||
910 | int cmd = XFS_IOC_RESVSP; | ||
911 | int attr_flags = XFS_ATTR_NOLOCK; | ||
761 | 912 | ||
762 | /* Handle various SYNC-type writes */ | 913 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) |
763 | if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { | 914 | return -EOPNOTSUPP; |
764 | loff_t end = pos + ret - 1; | ||
765 | int error2; | ||
766 | 915 | ||
767 | xfs_iunlock(ip, iolock); | 916 | bf.l_whence = 0; |
768 | if (need_i_mutex) | 917 | bf.l_start = offset; |
769 | mutex_unlock(&inode->i_mutex); | 918 | bf.l_len = len; |
770 | 919 | ||
771 | error2 = filemap_write_and_wait_range(mapping, pos, end); | 920 | xfs_ilock(ip, XFS_IOLOCK_EXCL); |
772 | if (!error) | ||
773 | error = error2; | ||
774 | if (need_i_mutex) | ||
775 | mutex_lock(&inode->i_mutex); | ||
776 | xfs_ilock(ip, iolock); | ||
777 | 921 | ||
778 | error2 = -xfs_file_fsync(file, | 922 | if (mode & FALLOC_FL_PUNCH_HOLE) |
779 | (file->f_flags & __O_SYNC) ? 0 : 1); | 923 | cmd = XFS_IOC_UNRESVSP; |
780 | if (!error) | 924 | |
781 | error = error2; | 925 | /* check the new inode size is valid before allocating */ |
926 | if (!(mode & FALLOC_FL_KEEP_SIZE) && | ||
927 | offset + len > i_size_read(inode)) { | ||
928 | new_size = offset + len; | ||
929 | error = inode_newsize_ok(inode, new_size); | ||
930 | if (error) | ||
931 | goto out_unlock; | ||
782 | } | 932 | } |
783 | 933 | ||
784 | out_unlock_internal: | 934 | if (file->f_flags & O_DSYNC) |
785 | if (ip->i_new_size) { | 935 | attr_flags |= XFS_ATTR_SYNC; |
786 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 936 | |
787 | ip->i_new_size = 0; | 937 | error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags); |
788 | /* | 938 | if (error) |
789 | * If this was a direct or synchronous I/O that failed (such | 939 | goto out_unlock; |
790 | * as ENOSPC) then part of the I/O may have been written to | 940 | |
791 | * disk before the error occured. In this case the on-disk | 941 | /* Change file size if needed */ |
792 | * file size may have been adjusted beyond the in-memory file | 942 | if (new_size) { |
793 | * size and now needs to be truncated back. | 943 | struct iattr iattr; |
794 | */ | 944 | |
795 | if (ip->i_d.di_size > ip->i_size) | 945 | iattr.ia_valid = ATTR_SIZE; |
796 | ip->i_d.di_size = ip->i_size; | 946 | iattr.ia_size = new_size; |
797 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 947 | error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); |
798 | } | 948 | } |
799 | xfs_iunlock(ip, iolock); | 949 | |
800 | out_unlock_mutex: | 950 | out_unlock: |
801 | if (need_i_mutex) | 951 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
802 | mutex_unlock(&inode->i_mutex); | 952 | return error; |
803 | return -error; | ||
804 | } | 953 | } |
805 | 954 | ||
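The new .fallocate handler maps FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE onto the existing XFS_IOC_RESVSP/XFS_IOC_UNRESVSP machinery. A userspace sketch (offset and length are illustrative):

	#include <fcntl.h>
	#include <linux/falloc.h>

	/* Preallocate 1MiB without growing i_size, then punch it out again */
	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 1 << 20);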
955 | |||
806 | STATIC int | 956 | STATIC int |
807 | xfs_file_open( | 957 | xfs_file_open( |
808 | struct inode *inode, | 958 | struct inode *inode, |
@@ -921,6 +1071,7 @@ const struct file_operations xfs_file_operations = { | |||
921 | .open = xfs_file_open, | 1071 | .open = xfs_file_open, |
922 | .release = xfs_file_release, | 1072 | .release = xfs_file_release, |
923 | .fsync = xfs_file_fsync, | 1073 | .fsync = xfs_file_fsync, |
1074 | .fallocate = xfs_file_fallocate, | ||
924 | }; | 1075 | }; |
925 | 1076 | ||
926 | const struct file_operations xfs_dir_file_operations = { | 1077 | const struct file_operations xfs_dir_file_operations = { |
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 1f279b012f94..ed88ed16811c 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c | |||
@@ -32,10 +32,9 @@ xfs_tosspages( | |||
32 | xfs_off_t last, | 32 | xfs_off_t last, |
33 | int fiopt) | 33 | int fiopt) |
34 | { | 34 | { |
35 | struct address_space *mapping = VFS_I(ip)->i_mapping; | 35 | /* can't toss partial tail pages, so mask them out */ |
36 | 36 | last &= ~(PAGE_SIZE - 1); | |
37 | if (mapping->nrpages) | 37 | truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); |
38 | truncate_inode_pages(mapping, first); | ||
39 | } | 38 | } |
40 | 39 | ||
41 | int | 40 | int |
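A quick worked example of the tail-page masking, assuming 4k pages: for last = 10000 the mask yields 8192, so the truncated range is [first, 8191] and the partial page holding byte 10000 is left untouched:

	last = 10000;
	last &= ~(PAGE_SIZE - 1);	/* 10000 & ~4095 == 8192 */
	truncate_inode_pages_range(mapping, first, last - 1);	/* up to 8191 */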
@@ -50,12 +49,11 @@ xfs_flushinval_pages( | |||
50 | 49 | ||
51 | trace_xfs_pagecache_inval(ip, first, last); | 50 | trace_xfs_pagecache_inval(ip, first, last); |
52 | 51 | ||
53 | if (mapping->nrpages) { | 52 | xfs_iflags_clear(ip, XFS_ITRUNCATED); |
54 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | 53 | ret = filemap_write_and_wait_range(mapping, first, |
55 | ret = filemap_write_and_wait(mapping); | 54 | last == -1 ? LLONG_MAX : last); |
56 | if (!ret) | 55 | if (!ret) |
57 | truncate_inode_pages(mapping, first); | 56 | truncate_inode_pages_range(mapping, first, last); |
58 | } | ||
59 | return -ret; | 57 | return -ret; |
60 | } | 58 | } |
61 | 59 | ||
@@ -71,10 +69,9 @@ xfs_flush_pages( | |||
71 | int ret = 0; | 69 | int ret = 0; |
72 | int ret2; | 70 | int ret2; |
73 | 71 | ||
74 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | 72 | xfs_iflags_clear(ip, XFS_ITRUNCATED); |
75 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | 73 | ret = -filemap_fdatawrite_range(mapping, first, |
76 | ret = -filemap_fdatawrite(mapping); | 74 | last == -1 ? LLONG_MAX : last); |
77 | } | ||
78 | if (flags & XBF_ASYNC) | 75 | if (flags & XBF_ASYNC) |
79 | return ret; | 76 | return ret; |
80 | ret2 = xfs_wait_on_pages(ip, first, last); | 77 | ret2 = xfs_wait_on_pages(ip, first, last); |
@@ -91,7 +88,9 @@ xfs_wait_on_pages( | |||
91 | { | 88 | { |
92 | struct address_space *mapping = VFS_I(ip)->i_mapping; | 89 | struct address_space *mapping = VFS_I(ip)->i_mapping; |
93 | 90 | ||
94 | if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) | 91 | if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { |
95 | return -filemap_fdatawait(mapping); | 92 | return -filemap_fdatawait_range(mapping, first, |
93 | last == -1 ? ip->i_size - 1 : last); | ||
94 | } | ||
96 | return 0; | 95 | return 0; |
97 | } | 96 | } |
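The reworked xfs_tosspages() masks `last` down to a page boundary because partial tail pages cannot be tossed, and the ranged helpers take inclusive end offsets. A standalone sketch of that arithmetic, assuming 4096-byte pages:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        unsigned long long first = 8192, last = 20000;

        /* Partial tail pages can't be tossed, so round 'last' down
         * to a page boundary, then convert to an inclusive offset. */
        last &= ~(PAGE_SIZE - 1);           /* 20000 -> 16384 */
        printf("truncate range: [%llu, %llu]\n", first, last - 1);
        return 0;
    }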
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c index 2ae8b1ccb02e..76e81cff70b9 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/linux-2.6/xfs_globals.c | |||
@@ -16,7 +16,6 @@ | |||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ | 17 | */ |
18 | #include "xfs.h" | 18 | #include "xfs.h" |
19 | #include "xfs_cred.h" | ||
20 | #include "xfs_sysctl.h" | 19 | #include "xfs_sysctl.h" |
21 | 20 | ||
22 | /* | 21 | /* |
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h deleted file mode 100644 index 69f71caf061c..000000000000 --- a/fs/xfs/linux-2.6/xfs_globals.h +++ /dev/null | |||
@@ -1,23 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #ifndef __XFS_GLOBALS_H__ | ||
19 | #define __XFS_GLOBALS_H__ | ||
20 | |||
21 | extern uint64_t xfs_panic_mask; /* set to cause more panics */ | ||
22 | |||
23 | #endif /* __XFS_GLOBALS_H__ */ | ||
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 3b9e626f7cd1..acca2c5ca3fa 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include "xfs_dfrag.h" | 39 | #include "xfs_dfrag.h" |
40 | #include "xfs_fsops.h" | 40 | #include "xfs_fsops.h" |
41 | #include "xfs_vnodeops.h" | 41 | #include "xfs_vnodeops.h" |
42 | #include "xfs_discard.h" | ||
42 | #include "xfs_quota.h" | 43 | #include "xfs_quota.h" |
43 | #include "xfs_inode_item.h" | 44 | #include "xfs_inode_item.h" |
44 | #include "xfs_export.h" | 45 | #include "xfs_export.h" |
@@ -416,7 +417,7 @@ xfs_attrlist_by_handle( | |||
416 | if (IS_ERR(dentry)) | 417 | if (IS_ERR(dentry)) |
417 | return PTR_ERR(dentry); | 418 | return PTR_ERR(dentry); |
418 | 419 | ||
419 | kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); | 420 | kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); |
420 | if (!kbuf) | 421 | if (!kbuf) |
421 | goto out_dput; | 422 | goto out_dput; |
422 | 423 | ||
@@ -623,6 +624,10 @@ xfs_ioc_space( | |||
623 | 624 | ||
624 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) | 625 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) |
625 | attr_flags |= XFS_ATTR_NONBLOCK; | 626 | attr_flags |= XFS_ATTR_NONBLOCK; |
627 | |||
628 | if (filp->f_flags & O_DSYNC) | ||
629 | attr_flags |= XFS_ATTR_SYNC; | ||
630 | |||
626 | if (ioflags & IO_INVIS) | 631 | if (ioflags & IO_INVIS) |
627 | attr_flags |= XFS_ATTR_DMI; | 632 | attr_flags |= XFS_ATTR_DMI; |
628 | 633 | ||
@@ -694,14 +699,19 @@ xfs_ioc_fsgeometry_v1( | |||
694 | xfs_mount_t *mp, | 699 | xfs_mount_t *mp, |
695 | void __user *arg) | 700 | void __user *arg) |
696 | { | 701 | { |
697 | xfs_fsop_geom_v1_t fsgeo; | 702 | xfs_fsop_geom_t fsgeo; |
698 | int error; | 703 | int error; |
699 | 704 | ||
700 | error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); | 705 | error = xfs_fs_geometry(mp, &fsgeo, 3); |
701 | if (error) | 706 | if (error) |
702 | return -error; | 707 | return -error; |
703 | 708 | ||
704 | if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) | 709 | /* |
710 | * Caller should have passed an argument of type | ||
711 | * xfs_fsop_geom_v1_t. This is a proper subset of the | ||
712 | * xfs_fsop_geom_t that xfs_fs_geometry() fills in. | ||
713 | */ | ||
714 | if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) | ||
705 | return -XFS_ERROR(EFAULT); | 715 | return -XFS_ERROR(EFAULT); |
706 | return 0; | 716 | return 0; |
707 | } | 717 | } |
@@ -790,7 +800,7 @@ xfs_ioc_fsgetxattr( | |||
790 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 800 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
791 | fa.fsx_xflags = xfs_ip2xflags(ip); | 801 | fa.fsx_xflags = xfs_ip2xflags(ip); |
792 | fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; | 802 | fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; |
793 | fa.fsx_projid = ip->i_d.di_projid; | 803 | fa.fsx_projid = xfs_get_projid(ip); |
794 | 804 | ||
795 | if (attr) { | 805 | if (attr) { |
796 | if (ip->i_afp) { | 806 | if (ip->i_afp) { |
@@ -909,10 +919,10 @@ xfs_ioctl_setattr( | |||
909 | return XFS_ERROR(EIO); | 919 | return XFS_ERROR(EIO); |
910 | 920 | ||
911 | /* | 921 | /* |
912 | * Disallow 32bit project ids because on-disk structure | 922 | * Disallow 32bit project ids when projid32bit feature is not enabled. |
913 | * is 16bit only. | ||
914 | */ | 923 | */ |
915 | if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) | 924 | if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && |
925 | !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) | ||
916 | return XFS_ERROR(EINVAL); | 926 | return XFS_ERROR(EINVAL); |
917 | 927 | ||
918 | /* | 928 | /* |
@@ -961,7 +971,7 @@ xfs_ioctl_setattr( | |||
961 | if (mask & FSX_PROJID) { | 971 | if (mask & FSX_PROJID) { |
962 | if (XFS_IS_QUOTA_RUNNING(mp) && | 972 | if (XFS_IS_QUOTA_RUNNING(mp) && |
963 | XFS_IS_PQUOTA_ON(mp) && | 973 | XFS_IS_PQUOTA_ON(mp) && |
964 | ip->i_d.di_projid != fa->fsx_projid) { | 974 | xfs_get_projid(ip) != fa->fsx_projid) { |
965 | ASSERT(tp); | 975 | ASSERT(tp); |
966 | code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, | 976 | code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, |
967 | capable(CAP_FOWNER) ? | 977 | capable(CAP_FOWNER) ? |
@@ -984,10 +994,22 @@ xfs_ioctl_setattr( | |||
984 | 994 | ||
985 | /* | 995 | /* |
986 | * Extent size must be a multiple of the appropriate block | 996 | * Extent size must be a multiple of the appropriate block |
987 | * size, if set at all. | 997 | * size, if set at all. It must also be smaller than the |
998 | * maximum extent size supported by the filesystem. | ||
999 | * | ||
1000 | * Also, for non-realtime files, limit the extent size hint to | ||
1001 | * half the size of the AGs in the filesystem so alignment | ||
1002 | * doesn't result in extents larger than an AG. | ||
988 | */ | 1003 | */ |
989 | if (fa->fsx_extsize != 0) { | 1004 | if (fa->fsx_extsize != 0) { |
990 | xfs_extlen_t size; | 1005 | xfs_extlen_t size; |
1006 | xfs_fsblock_t extsize_fsb; | ||
1007 | |||
1008 | extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); | ||
1009 | if (extsize_fsb > MAXEXTLEN) { | ||
1010 | code = XFS_ERROR(EINVAL); | ||
1011 | goto error_return; | ||
1012 | } | ||
991 | 1013 | ||
992 | if (XFS_IS_REALTIME_INODE(ip) || | 1014 | if (XFS_IS_REALTIME_INODE(ip) || |
993 | ((mask & FSX_XFLAGS) && | 1015 | ((mask & FSX_XFLAGS) && |
@@ -996,6 +1018,10 @@ xfs_ioctl_setattr( | |||
996 | mp->m_sb.sb_blocklog; | 1018 | mp->m_sb.sb_blocklog; |
997 | } else { | 1019 | } else { |
998 | size = mp->m_sb.sb_blocksize; | 1020 | size = mp->m_sb.sb_blocksize; |
1021 | if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { | ||
1022 | code = XFS_ERROR(EINVAL); | ||
1023 | goto error_return; | ||
1024 | } | ||
999 | } | 1025 | } |
1000 | 1026 | ||
1001 | if (fa->fsx_extsize % size) { | 1027 | if (fa->fsx_extsize % size) { |
@@ -1063,12 +1089,12 @@ xfs_ioctl_setattr( | |||
1063 | * Change the ownerships and register quota modifications | 1089 | * Change the ownerships and register quota modifications |
1064 | * in the transaction. | 1090 | * in the transaction. |
1065 | */ | 1091 | */ |
1066 | if (ip->i_d.di_projid != fa->fsx_projid) { | 1092 | if (xfs_get_projid(ip) != fa->fsx_projid) { |
1067 | if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { | 1093 | if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { |
1068 | olddquot = xfs_qm_vop_chown(tp, ip, | 1094 | olddquot = xfs_qm_vop_chown(tp, ip, |
1069 | &ip->i_gdquot, gdqp); | 1095 | &ip->i_gdquot, gdqp); |
1070 | } | 1096 | } |
1071 | ip->i_d.di_projid = fa->fsx_projid; | 1097 | xfs_set_projid(ip, fa->fsx_projid); |
1072 | 1098 | ||
1073 | /* | 1099 | /* |
1074 | * We may have to rev the inode as well as | 1100 | * We may have to rev the inode as well as |
@@ -1088,8 +1114,8 @@ xfs_ioctl_setattr( | |||
1088 | xfs_diflags_to_linux(ip); | 1114 | xfs_diflags_to_linux(ip); |
1089 | } | 1115 | } |
1090 | 1116 | ||
1117 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); | ||
1091 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 1118 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
1092 | xfs_ichgtime(ip, XFS_ICHGTIME_CHG); | ||
1093 | 1119 | ||
1094 | XFS_STATS_INC(xs_ig_attrchg); | 1120 | XFS_STATS_INC(xs_ig_attrchg); |
1095 | 1121 | ||
@@ -1294,6 +1320,8 @@ xfs_file_ioctl( | |||
1294 | trace_xfs_file_ioctl(ip); | 1320 | trace_xfs_file_ioctl(ip); |
1295 | 1321 | ||
1296 | switch (cmd) { | 1322 | switch (cmd) { |
1323 | case FITRIM: | ||
1324 | return xfs_ioc_trim(mp, arg); | ||
1297 | case XFS_IOC_ALLOCSP: | 1325 | case XFS_IOC_ALLOCSP: |
1298 | case XFS_IOC_FREESP: | 1326 | case XFS_IOC_FREESP: |
1299 | case XFS_IOC_RESVSP: | 1327 | case XFS_IOC_RESVSP: |
@@ -1301,7 +1329,8 @@ xfs_file_ioctl( | |||
1301 | case XFS_IOC_ALLOCSP64: | 1329 | case XFS_IOC_ALLOCSP64: |
1302 | case XFS_IOC_FREESP64: | 1330 | case XFS_IOC_FREESP64: |
1303 | case XFS_IOC_RESVSP64: | 1331 | case XFS_IOC_RESVSP64: |
1304 | case XFS_IOC_UNRESVSP64: { | 1332 | case XFS_IOC_UNRESVSP64: |
1333 | case XFS_IOC_ZERO_RANGE: { | ||
1305 | xfs_flock64_t bf; | 1334 | xfs_flock64_t bf; |
1306 | 1335 | ||
1307 | if (copy_from_user(&bf, arg, sizeof(bf))) | 1336 | if (copy_from_user(&bf, arg, sizeof(bf))) |
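Among the changes above, xfs_ioc_fsgeometry_v1() now fills a full xfs_fsop_geom_t but copies out only sizeof(xfs_fsop_geom_v1_t) bytes, which is safe because the v1 layout is a strict prefix of the current structure. A toy illustration of that prefix-copy idiom, with invented field names:

    #include <stdio.h>
    #include <string.h>

    /* Legacy ABI: a strict prefix of the current structure. */
    struct geom_v1 { unsigned blocksize, agcount; };
    struct geom    { unsigned blocksize, agcount, logsectsize; };

    int main(void)
    {
        struct geom cur = { 4096, 16, 512 };
        struct geom_v1 old;

        /* Copy only the v1-sized prefix, as the ioctl now does with
         * copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)). */
        memcpy(&old, &cur, sizeof(old));
        printf("v1 view: bsize=%u agcount=%u\n", old.blocksize, old.agcount);
        return 0;
    }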
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 6c83f7f62dc9..54e623bfbb85 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c | |||
@@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin( | |||
164 | get_user(bstat->bs_extsize, &bstat32->bs_extsize) || | 164 | get_user(bstat->bs_extsize, &bstat32->bs_extsize) || |
165 | get_user(bstat->bs_extents, &bstat32->bs_extents) || | 165 | get_user(bstat->bs_extents, &bstat32->bs_extents) || |
166 | get_user(bstat->bs_gen, &bstat32->bs_gen) || | 166 | get_user(bstat->bs_gen, &bstat32->bs_gen) || |
167 | get_user(bstat->bs_projid, &bstat32->bs_projid) || | 167 | get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || |
168 | get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || | ||
168 | get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || | 169 | get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || |
169 | get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || | 170 | get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || |
170 | get_user(bstat->bs_aextents, &bstat32->bs_aextents)) | 171 | get_user(bstat->bs_aextents, &bstat32->bs_aextents)) |
@@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat( | |||
218 | put_user(buffer->bs_extents, &p32->bs_extents) || | 219 | put_user(buffer->bs_extents, &p32->bs_extents) || |
219 | put_user(buffer->bs_gen, &p32->bs_gen) || | 220 | put_user(buffer->bs_gen, &p32->bs_gen) || |
220 | put_user(buffer->bs_projid, &p32->bs_projid) || | 221 | put_user(buffer->bs_projid, &p32->bs_projid) || |
222 | put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || | ||
221 | put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || | 223 | put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || |
222 | put_user(buffer->bs_dmstate, &p32->bs_dmstate) || | 224 | put_user(buffer->bs_dmstate, &p32->bs_dmstate) || |
223 | put_user(buffer->bs_aextents, &p32->bs_aextents)) | 225 | put_user(buffer->bs_aextents, &p32->bs_aextents)) |
@@ -574,6 +576,7 @@ xfs_file_compat_ioctl( | |||
574 | case XFS_IOC_FSGEOMETRY_V1: | 576 | case XFS_IOC_FSGEOMETRY_V1: |
575 | case XFS_IOC_FSGROWFSDATA: | 577 | case XFS_IOC_FSGROWFSDATA: |
576 | case XFS_IOC_FSGROWFSRT: | 578 | case XFS_IOC_FSGROWFSRT: |
579 | case XFS_IOC_ZERO_RANGE: | ||
577 | return xfs_file_ioctl(filp, cmd, p); | 580 | return xfs_file_ioctl(filp, cmd, p); |
578 | #else | 581 | #else |
579 | case XFS_IOC_ALLOCSP_32: | 582 | case XFS_IOC_ALLOCSP_32: |
@@ -583,7 +586,8 @@ xfs_file_compat_ioctl( | |||
583 | case XFS_IOC_RESVSP_32: | 586 | case XFS_IOC_RESVSP_32: |
584 | case XFS_IOC_UNRESVSP_32: | 587 | case XFS_IOC_UNRESVSP_32: |
585 | case XFS_IOC_RESVSP64_32: | 588 | case XFS_IOC_RESVSP64_32: |
586 | case XFS_IOC_UNRESVSP64_32: { | 589 | case XFS_IOC_UNRESVSP64_32: |
590 | case XFS_IOC_ZERO_RANGE_32: { | ||
587 | struct xfs_flock64 bf; | 591 | struct xfs_flock64 bf; |
588 | 592 | ||
589 | if (xfs_compat_flock64_copyin(&bf, arg)) | 593 | if (xfs_compat_flock64_copyin(&bf, arg)) |
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h index 1024c4f8ba0d..80f4060e8970 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h | |||
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat { | |||
65 | __s32 bs_extsize; /* extent size */ | 65 | __s32 bs_extsize; /* extent size */ |
66 | __s32 bs_extents; /* number of extents */ | 66 | __s32 bs_extents; /* number of extents */ |
67 | __u32 bs_gen; /* generation count */ | 67 | __u32 bs_gen; /* generation count */ |
68 | __u16 bs_projid; /* project id */ | 68 | __u16 bs_projid_lo; /* lower part of project id */ |
69 | unsigned char bs_pad[14]; /* pad space, unused */ | 69 | #define bs_projid bs_projid_lo /* (previously just bs_projid) */ |
70 | __u16 bs_projid_hi; /* high part of project id */ | ||
71 | unsigned char bs_pad[12]; /* pad space, unused */ | ||
70 | __u32 bs_dmevmask; /* DMIG event mask */ | 72 | __u32 bs_dmevmask; /* DMIG event mask */ |
71 | __u16 bs_dmstate; /* DMIG state info */ | 73 | __u16 bs_dmstate; /* DMIG state info */ |
72 | __u16 bs_aextents; /* attribute number of extents */ | 74 | __u16 bs_aextents; /* attribute number of extents */ |
@@ -182,6 +184,7 @@ typedef struct compat_xfs_flock64 { | |||
182 | #define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) | 184 | #define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) |
183 | #define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) | 185 | #define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) |
184 | #define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) | 186 | #define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) |
187 | #define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) | ||
185 | 188 | ||
186 | typedef struct compat_xfs_fsop_geom_v1 { | 189 | typedef struct compat_xfs_fsop_geom_v1 { |
187 | __u32 blocksize; /* filesystem (data) block size */ | 190 | __u32 blocksize; /* filesystem (data) block size */ |
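The compat_xfs_bstat change widens the project id to 32 bits without breaking the ABI: bs_projid becomes bs_projid_lo, and two bytes of pad become bs_projid_hi, so the structure size and every later offset stay fixed. A standalone sketch of the split-and-reassemble arithmetic, with simplified structures:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    struct bstat_old { uint16_t projid;    unsigned char pad[14]; };
    struct bstat_new { uint16_t projid_lo; uint16_t projid_hi;
                       unsigned char pad[12]; };

    int main(void)
    {
        /* Layout is ABI: the total size must not change. */
        assert(sizeof(struct bstat_old) == sizeof(struct bstat_new));

        uint32_t projid = 0x00A1B2C3;
        struct bstat_new bs = {
            .projid_lo = projid & 0xFFFF,   /* the old 16-bit slot */
            .projid_hi = projid >> 16,      /* carved out of the pad */
        };

        /* Consumers reassemble the full 32-bit id. */
        uint32_t got = ((uint32_t)bs.projid_hi << 16) | bs.projid_lo;
        printf("projid roundtrip: %#x -> %#x\n", projid, got);
        return 0;
    }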
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index b1fc2a6bfe83..d44d92cd12b1 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c | |||
@@ -46,7 +46,6 @@ | |||
46 | #include <linux/namei.h> | 46 | #include <linux/namei.h> |
47 | #include <linux/posix_acl.h> | 47 | #include <linux/posix_acl.h> |
48 | #include <linux/security.h> | 48 | #include <linux/security.h> |
49 | #include <linux/falloc.h> | ||
50 | #include <linux/fiemap.h> | 49 | #include <linux/fiemap.h> |
51 | #include <linux/slab.h> | 50 | #include <linux/slab.h> |
52 | 51 | ||
@@ -71,7 +70,7 @@ xfs_synchronize_times( | |||
71 | 70 | ||
72 | /* | 71 | /* |
73 | * If the linux inode is valid, mark it dirty. | 72 | * If the linux inode is valid, mark it dirty. |
74 | * Used when commiting a dirty inode into a transaction so that | 73 | * Used when committing a dirty inode into a transaction so that |
75 | * the inode will get written back by the linux code | 74 | * the inode will get written back by the linux code |
76 | */ | 75 | */ |
77 | void | 76 | void |
@@ -95,41 +94,6 @@ xfs_mark_inode_dirty( | |||
95 | } | 94 | } |
96 | 95 | ||
97 | /* | 96 | /* |
98 | * Change the requested timestamp in the given inode. | ||
99 | * We don't lock across timestamp updates, and we don't log them but | ||
100 | * we do record the fact that there is dirty information in core. | ||
101 | */ | ||
102 | void | ||
103 | xfs_ichgtime( | ||
104 | xfs_inode_t *ip, | ||
105 | int flags) | ||
106 | { | ||
107 | struct inode *inode = VFS_I(ip); | ||
108 | timespec_t tv; | ||
109 | int sync_it = 0; | ||
110 | |||
111 | tv = current_fs_time(inode->i_sb); | ||
112 | |||
113 | if ((flags & XFS_ICHGTIME_MOD) && | ||
114 | !timespec_equal(&inode->i_mtime, &tv)) { | ||
115 | inode->i_mtime = tv; | ||
116 | sync_it = 1; | ||
117 | } | ||
118 | if ((flags & XFS_ICHGTIME_CHG) && | ||
119 | !timespec_equal(&inode->i_ctime, &tv)) { | ||
120 | inode->i_ctime = tv; | ||
121 | sync_it = 1; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Update complete - now make sure everyone knows that the inode | ||
126 | * is dirty. | ||
127 | */ | ||
128 | if (sync_it) | ||
129 | xfs_mark_inode_dirty_sync(ip); | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Hook in SELinux. This is not quite correct yet, what we really need | 97 | * Hook in SELinux. This is not quite correct yet, what we really need |
134 | * here (as we do for default ACLs) is a mechanism by which creation of | 98 | * here (as we do for default ACLs) is a mechanism by which creation of |
135 | * these attrs can be journalled at inode creation time (along with the | 99 | * these attrs can be journalled at inode creation time (along with the |
@@ -138,7 +102,8 @@ xfs_ichgtime( | |||
138 | STATIC int | 102 | STATIC int |
139 | xfs_init_security( | 103 | xfs_init_security( |
140 | struct inode *inode, | 104 | struct inode *inode, |
141 | struct inode *dir) | 105 | struct inode *dir, |
106 | const struct qstr *qstr) | ||
142 | { | 107 | { |
143 | struct xfs_inode *ip = XFS_I(inode); | 108 | struct xfs_inode *ip = XFS_I(inode); |
144 | size_t length; | 109 | size_t length; |
@@ -146,7 +111,7 @@ xfs_init_security( | |||
146 | unsigned char *name; | 111 | unsigned char *name; |
147 | int error; | 112 | int error; |
148 | 113 | ||
149 | error = security_inode_init_security(inode, dir, (char **)&name, | 114 | error = security_inode_init_security(inode, dir, qstr, (char **)&name, |
150 | &value, &length); | 115 | &value, &length); |
151 | if (error) { | 116 | if (error) { |
152 | if (error == -EOPNOTSUPP) | 117 | if (error == -EOPNOTSUPP) |
@@ -217,20 +182,20 @@ xfs_vn_mknod( | |||
217 | if (IS_POSIXACL(dir)) { | 182 | if (IS_POSIXACL(dir)) { |
218 | default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); | 183 | default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); |
219 | if (IS_ERR(default_acl)) | 184 | if (IS_ERR(default_acl)) |
220 | return -PTR_ERR(default_acl); | 185 | return PTR_ERR(default_acl); |
221 | 186 | ||
222 | if (!default_acl) | 187 | if (!default_acl) |
223 | mode &= ~current_umask(); | 188 | mode &= ~current_umask(); |
224 | } | 189 | } |
225 | 190 | ||
226 | xfs_dentry_to_name(&name, dentry); | 191 | xfs_dentry_to_name(&name, dentry); |
227 | error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); | 192 | error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); |
228 | if (unlikely(error)) | 193 | if (unlikely(error)) |
229 | goto out_free_acl; | 194 | goto out_free_acl; |
230 | 195 | ||
231 | inode = VFS_I(ip); | 196 | inode = VFS_I(ip); |
232 | 197 | ||
233 | error = xfs_init_security(inode, dir); | 198 | error = xfs_init_security(inode, dir, &dentry->d_name); |
234 | if (unlikely(error)) | 199 | if (unlikely(error)) |
235 | goto out_cleanup_inode; | 200 | goto out_cleanup_inode; |
236 | 201 | ||
@@ -352,7 +317,7 @@ xfs_vn_link( | |||
352 | if (unlikely(error)) | 317 | if (unlikely(error)) |
353 | return -error; | 318 | return -error; |
354 | 319 | ||
355 | atomic_inc(&inode->i_count); | 320 | ihold(inode); |
356 | d_instantiate(dentry, inode); | 321 | d_instantiate(dentry, inode); |
357 | return 0; | 322 | return 0; |
358 | } | 323 | } |
@@ -397,13 +362,13 @@ xfs_vn_symlink( | |||
397 | (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); | 362 | (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); |
398 | xfs_dentry_to_name(&name, dentry); | 363 | xfs_dentry_to_name(&name, dentry); |
399 | 364 | ||
400 | error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); | 365 | error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); |
401 | if (unlikely(error)) | 366 | if (unlikely(error)) |
402 | goto out; | 367 | goto out; |
403 | 368 | ||
404 | inode = VFS_I(cip); | 369 | inode = VFS_I(cip); |
405 | 370 | ||
406 | error = xfs_init_security(inode, dir); | 371 | error = xfs_init_security(inode, dir, &dentry->d_name); |
407 | if (unlikely(error)) | 372 | if (unlikely(error)) |
408 | goto out_cleanup_inode; | 373 | goto out_cleanup_inode; |
409 | 374 | ||
@@ -540,58 +505,6 @@ xfs_vn_setattr( | |||
540 | return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); | 505 | return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); |
541 | } | 506 | } |
542 | 507 | ||
543 | STATIC long | ||
544 | xfs_vn_fallocate( | ||
545 | struct inode *inode, | ||
546 | int mode, | ||
547 | loff_t offset, | ||
548 | loff_t len) | ||
549 | { | ||
550 | long error; | ||
551 | loff_t new_size = 0; | ||
552 | xfs_flock64_t bf; | ||
553 | xfs_inode_t *ip = XFS_I(inode); | ||
554 | |||
555 | /* preallocation on directories not yet supported */ | ||
556 | error = -ENODEV; | ||
557 | if (S_ISDIR(inode->i_mode)) | ||
558 | goto out_error; | ||
559 | |||
560 | bf.l_whence = 0; | ||
561 | bf.l_start = offset; | ||
562 | bf.l_len = len; | ||
563 | |||
564 | xfs_ilock(ip, XFS_IOLOCK_EXCL); | ||
565 | |||
566 | /* check the new inode size is valid before allocating */ | ||
567 | if (!(mode & FALLOC_FL_KEEP_SIZE) && | ||
568 | offset + len > i_size_read(inode)) { | ||
569 | new_size = offset + len; | ||
570 | error = inode_newsize_ok(inode, new_size); | ||
571 | if (error) | ||
572 | goto out_unlock; | ||
573 | } | ||
574 | |||
575 | error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, | ||
576 | 0, XFS_ATTR_NOLOCK); | ||
577 | if (error) | ||
578 | goto out_unlock; | ||
579 | |||
580 | /* Change file size if needed */ | ||
581 | if (new_size) { | ||
582 | struct iattr iattr; | ||
583 | |||
584 | iattr.ia_valid = ATTR_SIZE; | ||
585 | iattr.ia_size = new_size; | ||
586 | error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); | ||
587 | } | ||
588 | |||
589 | out_unlock: | ||
590 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | ||
591 | out_error: | ||
592 | return error; | ||
593 | } | ||
594 | |||
595 | #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) | 508 | #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) |
596 | 509 | ||
597 | /* | 510 | /* |
@@ -685,7 +598,6 @@ static const struct inode_operations xfs_inode_operations = { | |||
685 | .getxattr = generic_getxattr, | 598 | .getxattr = generic_getxattr, |
686 | .removexattr = generic_removexattr, | 599 | .removexattr = generic_removexattr, |
687 | .listxattr = xfs_vn_listxattr, | 600 | .listxattr = xfs_vn_listxattr, |
688 | .fallocate = xfs_vn_fallocate, | ||
689 | .fiemap = xfs_vn_fiemap, | 601 | .fiemap = xfs_vn_fiemap, |
690 | }; | 602 | }; |
691 | 603 | ||
@@ -795,7 +707,10 @@ xfs_setup_inode( | |||
795 | 707 | ||
796 | inode->i_ino = ip->i_ino; | 708 | inode->i_ino = ip->i_ino; |
797 | inode->i_state = I_NEW; | 709 | inode->i_state = I_NEW; |
798 | inode_add_to_lists(ip->i_mount->m_super, inode); | 710 | |
711 | inode_sb_list_add(inode); | ||
712 | /* make the inode look hashed for the writeback code */ | ||
713 | hlist_add_fake(&inode->i_hash); | ||
799 | 714 | ||
800 | inode->i_mode = ip->i_d.di_mode; | 715 | inode->i_mode = ip->i_d.di_mode; |
801 | inode->i_nlink = ip->i_d.di_nlink; | 716 | inode->i_nlink = ip->i_d.di_nlink; |
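In the xfs_setup_inode() hunk above the inode is no longer added to a real hash bucket; hlist_add_fake() merely makes the node look hashed so the writeback code treats it as live. A userspace re-creation of that trick (the two helpers mirror the kernel's list primitives):

    #include <stdio.h>

    struct hlist_node { struct hlist_node *next, **pprev; };

    /* A node is "unhashed" when pprev is NULL; faking a hash means
     * pointing pprev back at our own next field without touching
     * any real hash bucket. */
    static void hlist_add_fake(struct hlist_node *n)
    {
        n->pprev = &n->next;
    }

    static int hlist_unhashed(const struct hlist_node *n)
    {
        return !n->pprev;
    }

    int main(void)
    {
        struct hlist_node n = { 0 };

        printf("before: unhashed=%d\n", hlist_unhashed(&n)); /* 1 */
        hlist_add_fake(&n);
        printf("after:  unhashed=%d\n", hlist_unhashed(&n)); /* 0 */
        return 0;
    }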
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 2fa0bd9ebc7f..8633521b3b2e 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h | |||
@@ -37,10 +37,8 @@ | |||
37 | 37 | ||
38 | #include <kmem.h> | 38 | #include <kmem.h> |
39 | #include <mrlock.h> | 39 | #include <mrlock.h> |
40 | #include <sv.h> | ||
41 | #include <time.h> | 40 | #include <time.h> |
42 | 41 | ||
43 | #include <support/debug.h> | ||
44 | #include <support/uuid.h> | 42 | #include <support/uuid.h> |
45 | 43 | ||
46 | #include <linux/semaphore.h> | 44 | #include <linux/semaphore.h> |
@@ -71,6 +69,8 @@ | |||
71 | #include <linux/random.h> | 69 | #include <linux/random.h> |
72 | #include <linux/ctype.h> | 70 | #include <linux/ctype.h> |
73 | #include <linux/writeback.h> | 71 | #include <linux/writeback.h> |
72 | #include <linux/capability.h> | ||
73 | #include <linux/list_sort.h> | ||
74 | 74 | ||
75 | #include <asm/page.h> | 75 | #include <asm/page.h> |
76 | #include <asm/div64.h> | 76 | #include <asm/div64.h> |
@@ -79,15 +79,14 @@ | |||
79 | #include <asm/byteorder.h> | 79 | #include <asm/byteorder.h> |
80 | #include <asm/unaligned.h> | 80 | #include <asm/unaligned.h> |
81 | 81 | ||
82 | #include <xfs_cred.h> | ||
83 | #include <xfs_vnode.h> | 82 | #include <xfs_vnode.h> |
84 | #include <xfs_stats.h> | 83 | #include <xfs_stats.h> |
85 | #include <xfs_sysctl.h> | 84 | #include <xfs_sysctl.h> |
86 | #include <xfs_iops.h> | 85 | #include <xfs_iops.h> |
87 | #include <xfs_aops.h> | 86 | #include <xfs_aops.h> |
88 | #include <xfs_super.h> | 87 | #include <xfs_super.h> |
89 | #include <xfs_globals.h> | ||
90 | #include <xfs_buf.h> | 88 | #include <xfs_buf.h> |
89 | #include <xfs_message.h> | ||
91 | 90 | ||
92 | /* | 91 | /* |
93 | * Feature macros (disable/enable) | 92 | * Feature macros (disable/enable) |
@@ -144,7 +143,7 @@ | |||
144 | #define SYNCHRONIZE() barrier() | 143 | #define SYNCHRONIZE() barrier() |
145 | #define __return_address __builtin_return_address(0) | 144 | #define __return_address __builtin_return_address(0) |
146 | 145 | ||
147 | #define dfltprid 0 | 146 | #define XFS_PROJID_DEFAULT 0 |
148 | #define MAXPATHLEN 1024 | 147 | #define MAXPATHLEN 1024 |
149 | 148 | ||
150 | #define MIN(a,b) (min(a,b)) | 149 | #define MIN(a,b) (min(a,b)) |
@@ -282,4 +281,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) | |||
282 | #define __arch_pack | 281 | #define __arch_pack |
283 | #endif | 282 | #endif |
284 | 283 | ||
284 | #define ASSERT_ALWAYS(expr) \ | ||
285 | (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) | ||
286 | |||
287 | #ifndef DEBUG | ||
288 | #define ASSERT(expr) ((void)0) | ||
289 | |||
290 | #ifndef STATIC | ||
291 | # define STATIC static noinline | ||
292 | #endif | ||
293 | |||
294 | #else /* DEBUG */ | ||
295 | |||
296 | #define ASSERT(expr) \ | ||
297 | (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) | ||
298 | |||
299 | #ifndef STATIC | ||
300 | # define STATIC noinline | ||
301 | #endif | ||
302 | |||
303 | #endif /* DEBUG */ | ||
304 | |||
285 | #endif /* __XFS_LINUX__ */ | 305 | #endif /* __XFS_LINUX__ */ |
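The assertion and STATIC macros migrate into xfs_linux.h from the deleted support/debug.h: ASSERT compiles away entirely in non-DEBUG builds, while ASSERT_ALWAYS always fires. A self-contained analogue of that compile-time switch, with assfail() reduced to a stderr-plus-abort stand-in:

    #include <stdio.h>
    #include <stdlib.h>

    static void assfail(const char *expr, const char *file, int line)
    {
        fprintf(stderr, "Assertion failed: %s, file: %s, line: %d\n",
                expr, file, line);
        abort();
    }

    #define ASSERT_ALWAYS(expr) \
        ((expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))

    #ifdef DEBUG
    # define ASSERT(expr) ASSERT_ALWAYS(expr)
    #else
    # define ASSERT(expr) ((void)0)   /* vanishes in production builds */
    #endif

    int main(void)
    {
        ASSERT(1 + 1 == 2);        /* checked only when built with -DDEBUG */
        ASSERT_ALWAYS(2 + 2 == 4); /* always checked */
        puts("ok");
        return 0;
    }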
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c new file mode 100644 index 000000000000..bd672def95ac --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_message.c | |||
@@ -0,0 +1,108 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it would be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write the Free Software Foundation, | ||
15 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
16 | */ | ||
17 | |||
18 | #include "xfs.h" | ||
19 | #include "xfs_fs.h" | ||
20 | #include "xfs_types.h" | ||
21 | #include "xfs_log.h" | ||
22 | #include "xfs_inum.h" | ||
23 | #include "xfs_trans.h" | ||
24 | #include "xfs_sb.h" | ||
25 | #include "xfs_ag.h" | ||
26 | #include "xfs_mount.h" | ||
27 | |||
28 | /* | ||
29 | * XFS logging functions | ||
30 | */ | ||
31 | static void | ||
32 | __xfs_printk( | ||
33 | const char *level, | ||
34 | const struct xfs_mount *mp, | ||
35 | struct va_format *vaf) | ||
36 | { | ||
37 | if (mp && mp->m_fsname) { | ||
38 | printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); | ||
39 | return; | ||
40 | } | ||
41 | printk("%sXFS: %pV\n", level, vaf); | ||
42 | } | ||
43 | |||
44 | #define define_xfs_printk_level(func, kern_level) \ | ||
45 | void func(const struct xfs_mount *mp, const char *fmt, ...) \ | ||
46 | { \ | ||
47 | struct va_format vaf; \ | ||
48 | va_list args; \ | ||
49 | \ | ||
50 | va_start(args, fmt); \ | ||
51 | \ | ||
52 | vaf.fmt = fmt; \ | ||
53 | vaf.va = &args; \ | ||
54 | \ | ||
55 | __xfs_printk(kern_level, mp, &vaf); \ | ||
56 | va_end(args); \ | ||
57 | } \ | ||
58 | |||
59 | define_xfs_printk_level(xfs_emerg, KERN_EMERG); | ||
60 | define_xfs_printk_level(xfs_alert, KERN_ALERT); | ||
61 | define_xfs_printk_level(xfs_crit, KERN_CRIT); | ||
62 | define_xfs_printk_level(xfs_err, KERN_ERR); | ||
63 | define_xfs_printk_level(xfs_warn, KERN_WARNING); | ||
64 | define_xfs_printk_level(xfs_notice, KERN_NOTICE); | ||
65 | define_xfs_printk_level(xfs_info, KERN_INFO); | ||
66 | #ifdef DEBUG | ||
67 | define_xfs_printk_level(xfs_debug, KERN_DEBUG); | ||
68 | #endif | ||
69 | |||
70 | void | ||
71 | xfs_alert_tag( | ||
72 | const struct xfs_mount *mp, | ||
73 | int panic_tag, | ||
74 | const char *fmt, ...) | ||
75 | { | ||
76 | struct va_format vaf; | ||
77 | va_list args; | ||
78 | int do_panic = 0; | ||
79 | |||
80 | if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { | ||
81 | xfs_alert(mp, "Transforming an alert into a BUG."); | ||
82 | do_panic = 1; | ||
83 | } | ||
84 | |||
85 | va_start(args, fmt); | ||
86 | |||
87 | vaf.fmt = fmt; | ||
88 | vaf.va = &args; | ||
89 | |||
90 | __xfs_printk(KERN_ALERT, mp, &vaf); | ||
91 | va_end(args); | ||
92 | |||
93 | BUG_ON(do_panic); | ||
94 | } | ||
95 | |||
96 | void | ||
97 | assfail(char *expr, char *file, int line) | ||
98 | { | ||
99 | xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", | ||
100 | expr, file, line); | ||
101 | BUG(); | ||
102 | } | ||
103 | |||
104 | void | ||
105 | xfs_hex_dump(void *p, int length) | ||
106 | { | ||
107 | print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); | ||
108 | } | ||
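The new __xfs_printk() relies on the kernel-only %pV extension, which prints a pre-packaged struct va_format, so a single helper serves every severity level stamped out by define_xfs_printk_level(). A userspace analogue of the same pattern — forwarding a va_list instead of using %pV, with invented level prefixes:

    #include <stdarg.h>
    #include <stdio.h>

    /* Userspace analogue of __xfs_printk(): the kernel packages the
     * caller's format and va_list in a struct va_format and prints it
     * via %pV; here we simply forward the va_list to vfprintf(). */
    static void xfs_vprintk(const char *level, const char *fsname,
                            const char *fmt, va_list args)
    {
        if (fsname)
            fprintf(stderr, "%sXFS (%s): ", level, fsname);
        else
            fprintf(stderr, "%sXFS: ", level);
        vfprintf(stderr, fmt, args);
        fputc('\n', stderr);
    }

    /* Same trick as define_xfs_printk_level(): stamp out one varargs
     * wrapper per severity from a single macro. */
    #define define_level(func, prefix)                              \
        static void func(const char *fsname, const char *fmt, ...)  \
        {                                                           \
            va_list args;                                           \
            va_start(args, fmt);                                    \
            xfs_vprintk(prefix, fsname, fmt, args);                 \
            va_end(args);                                           \
        }

    define_level(xfs_warn, "<4>")
    define_level(xfs_info, "<6>")

    int main(void)
    {
        xfs_warn("sda1", "%s option requires an argument", "logbufs");
        xfs_info(NULL, "module loaded");
        return 0;
    }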
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h new file mode 100644 index 000000000000..7fb7ea007672 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_message.h | |||
@@ -0,0 +1,39 @@ | |||
1 | #ifndef __XFS_MESSAGE_H | ||
2 | #define __XFS_MESSAGE_H 1 | ||
3 | |||
4 | struct xfs_mount; | ||
5 | |||
6 | extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) | ||
7 | __attribute__ ((format (printf, 2, 3))); | ||
8 | extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) | ||
9 | __attribute__ ((format (printf, 2, 3))); | ||
10 | extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, | ||
11 | const char *fmt, ...) | ||
12 | __attribute__ ((format (printf, 3, 4))); | ||
13 | extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) | ||
14 | __attribute__ ((format (printf, 2, 3))); | ||
15 | extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) | ||
16 | __attribute__ ((format (printf, 2, 3))); | ||
17 | extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) | ||
18 | __attribute__ ((format (printf, 2, 3))); | ||
19 | extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) | ||
20 | __attribute__ ((format (printf, 2, 3))); | ||
21 | extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) | ||
22 | __attribute__ ((format (printf, 2, 3))); | ||
23 | |||
24 | #ifdef DEBUG | ||
25 | extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) | ||
26 | __attribute__ ((format (printf, 2, 3))); | ||
27 | #else | ||
28 | static inline void | ||
29 | __attribute__ ((format (printf, 2, 3))) | ||
30 | xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) | ||
31 | { | ||
32 | } | ||
33 | #endif | ||
34 | |||
35 | extern void assfail(char *expr, char *f, int l); | ||
36 | |||
37 | extern void xfs_hex_dump(void *p, int length); | ||
38 | |||
39 | #endif /* __XFS_MESSAGE_H */ | ||
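Each declaration above carries __attribute__((format(printf, N, M))), telling GCC and Clang which argument holds the format string so mismatched varargs are flagged at compile time under -Wformat. A minimal demonstration with a hypothetical log_msg() helper:

    #include <stdarg.h>
    #include <stdio.h>

    /* Argument 1 is the format string; varargs start at argument 2. */
    static void log_msg(const char *fmt, ...)
        __attribute__ ((format (printf, 1, 2)));

    static void log_msg(const char *fmt, ...)
    {
        va_list args;
        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
    }

    int main(void)
    {
        log_msg("mounted %s with %d AGs\n", "sda1", 16);   /* ok */
        /* log_msg("%d\n", "oops");  <- -Wformat warning at build time */
        return 0;
    }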
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a4e07974955b..a1a881e68a9a 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
@@ -44,7 +44,6 @@ | |||
44 | #include "xfs_buf_item.h" | 44 | #include "xfs_buf_item.h" |
45 | #include "xfs_utils.h" | 45 | #include "xfs_utils.h" |
46 | #include "xfs_vnodeops.h" | 46 | #include "xfs_vnodeops.h" |
47 | #include "xfs_version.h" | ||
48 | #include "xfs_log_priv.h" | 47 | #include "xfs_log_priv.h" |
49 | #include "xfs_trans_priv.h" | 48 | #include "xfs_trans_priv.h" |
50 | #include "xfs_filestream.h" | 49 | #include "xfs_filestream.h" |
@@ -111,8 +110,10 @@ mempool_t *xfs_ioend_pool; | |||
111 | #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ | 110 | #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ |
112 | #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ | 111 | #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ |
113 | #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ | 112 | #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ |
114 | #define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ | 113 | #define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ |
115 | #define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ | 114 | #define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ |
115 | #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ | ||
116 | #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ | ||
116 | 117 | ||
117 | /* | 118 | /* |
118 | * Table driven mount option parser. | 119 | * Table driven mount option parser. |
@@ -174,6 +175,15 @@ xfs_parseargs( | |||
174 | __uint8_t iosizelog = 0; | 175 | __uint8_t iosizelog = 0; |
175 | 176 | ||
176 | /* | 177 | /* |
178 | * set up the mount name first so all the errors will refer to the | ||
179 | * correct device. | ||
180 | */ | ||
181 | mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); | ||
182 | if (!mp->m_fsname) | ||
183 | return ENOMEM; | ||
184 | mp->m_fsname_len = strlen(mp->m_fsname) + 1; | ||
185 | |||
186 | /* | ||
177 | * Copy binary VFS mount flags we are interested in. | 187 | * Copy binary VFS mount flags we are interested in. |
178 | */ | 188 | */ |
179 | if (sb->s_flags & MS_RDONLY) | 189 | if (sb->s_flags & MS_RDONLY) |
@@ -190,6 +200,7 @@ xfs_parseargs( | |||
190 | mp->m_flags |= XFS_MOUNT_BARRIER; | 200 | mp->m_flags |= XFS_MOUNT_BARRIER; |
191 | mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; | 201 | mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; |
192 | mp->m_flags |= XFS_MOUNT_SMALL_INUMS; | 202 | mp->m_flags |= XFS_MOUNT_SMALL_INUMS; |
203 | mp->m_flags |= XFS_MOUNT_DELAYLOG; | ||
193 | 204 | ||
194 | /* | 205 | /* |
195 | * These can be overridden by the mount option parsing. | 206 | * These can be overridden by the mount option parsing. |
@@ -208,24 +219,21 @@ xfs_parseargs( | |||
208 | 219 | ||
209 | if (!strcmp(this_char, MNTOPT_LOGBUFS)) { | 220 | if (!strcmp(this_char, MNTOPT_LOGBUFS)) { |
210 | if (!value || !*value) { | 221 | if (!value || !*value) { |
211 | cmn_err(CE_WARN, | 222 | xfs_warn(mp, "%s option requires an argument", |
212 | "XFS: %s option requires an argument", | ||
213 | this_char); | 223 | this_char); |
214 | return EINVAL; | 224 | return EINVAL; |
215 | } | 225 | } |
216 | mp->m_logbufs = simple_strtoul(value, &eov, 10); | 226 | mp->m_logbufs = simple_strtoul(value, &eov, 10); |
217 | } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { | 227 | } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { |
218 | if (!value || !*value) { | 228 | if (!value || !*value) { |
219 | cmn_err(CE_WARN, | 229 | xfs_warn(mp, "%s option requires an argument", |
220 | "XFS: %s option requires an argument", | ||
221 | this_char); | 230 | this_char); |
222 | return EINVAL; | 231 | return EINVAL; |
223 | } | 232 | } |
224 | mp->m_logbsize = suffix_strtoul(value, &eov, 10); | 233 | mp->m_logbsize = suffix_strtoul(value, &eov, 10); |
225 | } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { | 234 | } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { |
226 | if (!value || !*value) { | 235 | if (!value || !*value) { |
227 | cmn_err(CE_WARN, | 236 | xfs_warn(mp, "%s option requires an argument", |
228 | "XFS: %s option requires an argument", | ||
229 | this_char); | 237 | this_char); |
230 | return EINVAL; | 238 | return EINVAL; |
231 | } | 239 | } |
@@ -233,14 +241,12 @@ xfs_parseargs( | |||
233 | if (!mp->m_logname) | 241 | if (!mp->m_logname) |
234 | return ENOMEM; | 242 | return ENOMEM; |
235 | } else if (!strcmp(this_char, MNTOPT_MTPT)) { | 243 | } else if (!strcmp(this_char, MNTOPT_MTPT)) { |
236 | cmn_err(CE_WARN, | 244 | xfs_warn(mp, "%s option not allowed on this system", |
237 | "XFS: %s option not allowed on this system", | ||
238 | this_char); | 245 | this_char); |
239 | return EINVAL; | 246 | return EINVAL; |
240 | } else if (!strcmp(this_char, MNTOPT_RTDEV)) { | 247 | } else if (!strcmp(this_char, MNTOPT_RTDEV)) { |
241 | if (!value || !*value) { | 248 | if (!value || !*value) { |
242 | cmn_err(CE_WARN, | 249 | xfs_warn(mp, "%s option requires an argument", |
243 | "XFS: %s option requires an argument", | ||
244 | this_char); | 250 | this_char); |
245 | return EINVAL; | 251 | return EINVAL; |
246 | } | 252 | } |
@@ -249,8 +255,7 @@ xfs_parseargs( | |||
249 | return ENOMEM; | 255 | return ENOMEM; |
250 | } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { | 256 | } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { |
251 | if (!value || !*value) { | 257 | if (!value || !*value) { |
252 | cmn_err(CE_WARN, | 258 | xfs_warn(mp, "%s option requires an argument", |
253 | "XFS: %s option requires an argument", | ||
254 | this_char); | 259 | this_char); |
255 | return EINVAL; | 260 | return EINVAL; |
256 | } | 261 | } |
@@ -258,8 +263,7 @@ xfs_parseargs( | |||
258 | iosizelog = ffs(iosize) - 1; | 263 | iosizelog = ffs(iosize) - 1; |
259 | } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { | 264 | } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { |
260 | if (!value || !*value) { | 265 | if (!value || !*value) { |
261 | cmn_err(CE_WARN, | 266 | xfs_warn(mp, "%s option requires an argument", |
262 | "XFS: %s option requires an argument", | ||
263 | this_char); | 267 | this_char); |
264 | return EINVAL; | 268 | return EINVAL; |
265 | } | 269 | } |
@@ -281,16 +285,14 @@ xfs_parseargs( | |||
281 | mp->m_flags |= XFS_MOUNT_SWALLOC; | 285 | mp->m_flags |= XFS_MOUNT_SWALLOC; |
282 | } else if (!strcmp(this_char, MNTOPT_SUNIT)) { | 286 | } else if (!strcmp(this_char, MNTOPT_SUNIT)) { |
283 | if (!value || !*value) { | 287 | if (!value || !*value) { |
284 | cmn_err(CE_WARN, | 288 | xfs_warn(mp, "%s option requires an argument", |
285 | "XFS: %s option requires an argument", | ||
286 | this_char); | 289 | this_char); |
287 | return EINVAL; | 290 | return EINVAL; |
288 | } | 291 | } |
289 | dsunit = simple_strtoul(value, &eov, 10); | 292 | dsunit = simple_strtoul(value, &eov, 10); |
290 | } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { | 293 | } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { |
291 | if (!value || !*value) { | 294 | if (!value || !*value) { |
292 | cmn_err(CE_WARN, | 295 | xfs_warn(mp, "%s option requires an argument", |
293 | "XFS: %s option requires an argument", | ||
294 | this_char); | 296 | this_char); |
295 | return EINVAL; | 297 | return EINVAL; |
296 | } | 298 | } |
@@ -298,8 +300,7 @@ xfs_parseargs( | |||
298 | } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { | 300 | } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { |
299 | mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; | 301 | mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; |
300 | #if !XFS_BIG_INUMS | 302 | #if !XFS_BIG_INUMS |
301 | cmn_err(CE_WARN, | 303 | xfs_warn(mp, "%s option not allowed on this system", |
302 | "XFS: %s option not allowed on this system", | ||
303 | this_char); | 304 | this_char); |
304 | return EINVAL; | 305 | return EINVAL; |
305 | #endif | 306 | #endif |
@@ -354,26 +355,26 @@ xfs_parseargs( | |||
354 | mp->m_qflags &= ~XFS_OQUOTA_ENFD; | 355 | mp->m_qflags &= ~XFS_OQUOTA_ENFD; |
355 | } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { | 356 | } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { |
356 | mp->m_flags |= XFS_MOUNT_DELAYLOG; | 357 | mp->m_flags |= XFS_MOUNT_DELAYLOG; |
357 | cmn_err(CE_WARN, | ||
358 | "Enabling EXPERIMENTAL delayed logging feature " | ||
359 | "- use at your own risk.\n"); | ||
360 | } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { | 358 | } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { |
361 | mp->m_flags &= ~XFS_MOUNT_DELAYLOG; | 359 | mp->m_flags &= ~XFS_MOUNT_DELAYLOG; |
360 | } else if (!strcmp(this_char, MNTOPT_DISCARD)) { | ||
361 | mp->m_flags |= XFS_MOUNT_DISCARD; | ||
362 | } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { | ||
363 | mp->m_flags &= ~XFS_MOUNT_DISCARD; | ||
362 | } else if (!strcmp(this_char, "ihashsize")) { | 364 | } else if (!strcmp(this_char, "ihashsize")) { |
363 | cmn_err(CE_WARN, | 365 | xfs_warn(mp, |
364 | "XFS: ihashsize no longer used, option is deprecated."); | 366 | "ihashsize no longer used, option is deprecated."); |
365 | } else if (!strcmp(this_char, "osyncisdsync")) { | 367 | } else if (!strcmp(this_char, "osyncisdsync")) { |
366 | cmn_err(CE_WARN, | 368 | xfs_warn(mp, |
367 | "XFS: osyncisdsync has no effect, option is deprecated."); | 369 | "osyncisdsync has no effect, option is deprecated."); |
368 | } else if (!strcmp(this_char, "osyncisosync")) { | 370 | } else if (!strcmp(this_char, "osyncisosync")) { |
369 | cmn_err(CE_WARN, | 371 | xfs_warn(mp, |
370 | "XFS: osyncisosync has no effect, option is deprecated."); | 372 | "osyncisosync has no effect, option is deprecated."); |
371 | } else if (!strcmp(this_char, "irixsgid")) { | 373 | } else if (!strcmp(this_char, "irixsgid")) { |
372 | cmn_err(CE_WARN, | 374 | xfs_warn(mp, |
373 | "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); | 375 | "irixsgid is now a sysctl(2) variable, option is deprecated."); |
374 | } else { | 376 | } else { |
375 | cmn_err(CE_WARN, | 377 | xfs_warn(mp, "unknown mount option [%s].", this_char); |
376 | "XFS: unknown mount option [%s].", this_char); | ||
377 | return EINVAL; | 378 | return EINVAL; |
378 | } | 379 | } |
379 | } | 380 | } |
@@ -383,40 +384,44 @@ xfs_parseargs( | |||
383 | */ | 384 | */ |
384 | if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && | 385 | if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && |
385 | !(mp->m_flags & XFS_MOUNT_RDONLY)) { | 386 | !(mp->m_flags & XFS_MOUNT_RDONLY)) { |
386 | cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); | 387 | xfs_warn(mp, "no-recovery mounts must be read-only."); |
387 | return EINVAL; | 388 | return EINVAL; |
388 | } | 389 | } |
389 | 390 | ||
390 | if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { | 391 | if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { |
391 | cmn_err(CE_WARN, | 392 | xfs_warn(mp, |
392 | "XFS: sunit and swidth options incompatible with the noalign option"); | 393 | "sunit and swidth options incompatible with the noalign option"); |
394 | return EINVAL; | ||
395 | } | ||
396 | |||
397 | if ((mp->m_flags & XFS_MOUNT_DISCARD) && | ||
398 | !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { | ||
399 | xfs_warn(mp, | ||
400 | "the discard option is incompatible with the nodelaylog option"); | ||
393 | return EINVAL; | 401 | return EINVAL; |
394 | } | 402 | } |
395 | 403 | ||
396 | #ifndef CONFIG_XFS_QUOTA | 404 | #ifndef CONFIG_XFS_QUOTA |
397 | if (XFS_IS_QUOTA_RUNNING(mp)) { | 405 | if (XFS_IS_QUOTA_RUNNING(mp)) { |
398 | cmn_err(CE_WARN, | 406 | xfs_warn(mp, "quota support not available in this kernel."); |
399 | "XFS: quota support not available in this kernel."); | ||
400 | return EINVAL; | 407 | return EINVAL; |
401 | } | 408 | } |
402 | #endif | 409 | #endif |
403 | 410 | ||
404 | if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && | 411 | if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && |
405 | (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { | 412 | (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { |
406 | cmn_err(CE_WARN, | 413 | xfs_warn(mp, "cannot mount with both project and group quota"); |
407 | "XFS: cannot mount with both project and group quota"); | ||
408 | return EINVAL; | 414 | return EINVAL; |
409 | } | 415 | } |
410 | 416 | ||
411 | if ((dsunit && !dswidth) || (!dsunit && dswidth)) { | 417 | if ((dsunit && !dswidth) || (!dsunit && dswidth)) { |
412 | cmn_err(CE_WARN, | 418 | xfs_warn(mp, "sunit and swidth must be specified together"); |
413 | "XFS: sunit and swidth must be specified together"); | ||
414 | return EINVAL; | 419 | return EINVAL; |
415 | } | 420 | } |
416 | 421 | ||
417 | if (dsunit && (dswidth % dsunit != 0)) { | 422 | if (dsunit && (dswidth % dsunit != 0)) { |
418 | cmn_err(CE_WARN, | 423 | xfs_warn(mp, |
419 | "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", | 424 | "stripe width (%d) must be a multiple of the stripe unit (%d)", |
420 | dswidth, dsunit); | 425 | dswidth, dsunit); |
421 | return EINVAL; | 426 | return EINVAL; |
422 | } | 427 | } |
@@ -442,8 +447,7 @@ done: | |||
442 | mp->m_logbufs != 0 && | 447 | mp->m_logbufs != 0 && |
443 | (mp->m_logbufs < XLOG_MIN_ICLOGS || | 448 | (mp->m_logbufs < XLOG_MIN_ICLOGS || |
444 | mp->m_logbufs > XLOG_MAX_ICLOGS)) { | 449 | mp->m_logbufs > XLOG_MAX_ICLOGS)) { |
445 | cmn_err(CE_WARN, | 450 | xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", |
446 | "XFS: invalid logbufs value: %d [not %d-%d]", | ||
447 | mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); | 451 | mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); |
448 | return XFS_ERROR(EINVAL); | 452 | return XFS_ERROR(EINVAL); |
449 | } | 453 | } |
@@ -452,22 +456,16 @@ done: | |||
452 | (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || | 456 | (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || |
453 | mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || | 457 | mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || |
454 | !is_power_of_2(mp->m_logbsize))) { | 458 | !is_power_of_2(mp->m_logbsize))) { |
455 | cmn_err(CE_WARN, | 459 | xfs_warn(mp, |
456 | "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", | 460 | "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", |
457 | mp->m_logbsize); | 461 | mp->m_logbsize); |
458 | return XFS_ERROR(EINVAL); | 462 | return XFS_ERROR(EINVAL); |
459 | } | 463 | } |
460 | 464 | ||
461 | mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); | ||
462 | if (!mp->m_fsname) | ||
463 | return ENOMEM; | ||
464 | mp->m_fsname_len = strlen(mp->m_fsname) + 1; | ||
465 | |||
466 | if (iosizelog) { | 465 | if (iosizelog) { |
467 | if (iosizelog > XFS_MAX_IO_LOG || | 466 | if (iosizelog > XFS_MAX_IO_LOG || |
468 | iosizelog < XFS_MIN_IO_LOG) { | 467 | iosizelog < XFS_MIN_IO_LOG) { |
469 | cmn_err(CE_WARN, | 468 | xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", |
470 | "XFS: invalid log iosize: %d [not %d-%d]", | ||
471 | iosizelog, XFS_MIN_IO_LOG, | 469 | iosizelog, XFS_MIN_IO_LOG, |
472 | XFS_MAX_IO_LOG); | 470 | XFS_MAX_IO_LOG); |
473 | return XFS_ERROR(EINVAL); | 471 | return XFS_ERROR(EINVAL); |
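The stripe-geometry validation in the hunks above enforces that sunit and swidth are given together and that the width is a whole multiple of the unit. A standalone mirror of those two checks:

    #include <stdio.h>

    /* Mirror of the checks above: sunit and swidth must be specified
     * together, and swidth must be a multiple of sunit. */
    static int check_stripe(int dsunit, int dswidth)
    {
        if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
            fprintf(stderr, "sunit and swidth must be specified together\n");
            return -1;
        }
        if (dsunit && (dswidth % dsunit != 0)) {
            fprintf(stderr,
                "stripe width (%d) must be a multiple of the stripe unit (%d)\n",
                dswidth, dsunit);
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        printf("%d\n", check_stripe(64, 256));  /*  0: valid geometry */
        printf("%d\n", check_stripe(64, 0));    /* -1: missing swidth */
        printf("%d\n", check_stripe(64, 100));  /* -1: not a multiple */
        return 0;
    }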
@@ -503,6 +501,7 @@ xfs_showargs( | |||
503 | { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, | 501 | { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, |
504 | { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, | 502 | { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, |
505 | { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, | 503 | { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, |
504 | { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, | ||
506 | { 0, NULL } | 505 | { 0, NULL } |
507 | }; | 506 | }; |
508 | static struct proc_xfs_info xfs_info_unset[] = { | 507 | static struct proc_xfs_info xfs_info_unset[] = { |
@@ -577,7 +576,7 @@ xfs_max_file_offset( | |||
577 | 576 | ||
578 | /* Figure out maximum filesize, on Linux this can depend on | 577 | /* Figure out maximum filesize, on Linux this can depend on |
579 | * the filesystem blocksize (on 32 bit platforms). | 578 | * the filesystem blocksize (on 32 bit platforms). |
580 | * __block_prepare_write does this in an [unsigned] long... | 579 | * __block_write_begin does this in an [unsigned] long... |
581 | * page->index << (PAGE_CACHE_SHIFT - bbits) | 580 | * page->index << (PAGE_CACHE_SHIFT - bbits) |
582 | * So, for page sized blocks (4K on 32 bit platforms), | 581 | * So, for page sized blocks (4K on 32 bit platforms), |
583 | * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is | 582 | * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is |
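The corrected comment (now naming __block_write_begin) refers to the 32-bit limit: page->index is an unsigned long, so page->index << (PAGE_CACHE_SHIFT - bbits) wraps, and MAX_LFS_FILESIZE caps files at one page size times 2^31. Worked numbers for 4 KiB pages, assuming the era's definition of MAX_LFS_FILESIZE:

    #include <stdio.h>

    int main(void)
    {
        /* On 32-bit, MAX_LFS_FILESIZE = (page_size << 31) - 1: a
         * signed count of 2^31 pages of 4 KiB each is 8 TiB. */
        unsigned long long page_size = 4096;
        unsigned long long max_bytes = (page_size << 31) - 1;

        printf("32-bit max file size: %llu bytes (~%llu TiB)\n",
               max_bytes, (max_bytes + 1) >> 40);
        return 0;
    }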
@@ -610,10 +609,11 @@ xfs_blkdev_get( | |||
610 | { | 609 | { |
611 | int error = 0; | 610 | int error = 0; |
612 | 611 | ||
613 | *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); | 612 | *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, |
613 | mp); | ||
614 | if (IS_ERR(*bdevp)) { | 614 | if (IS_ERR(*bdevp)) { |
615 | error = PTR_ERR(*bdevp); | 615 | error = PTR_ERR(*bdevp); |
616 | printk("XFS: Invalid device [%s], error=%d\n", name, error); | 616 | xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); |
617 | } | 617 | } |
618 | 618 | ||
619 | return -error; | 619 | return -error; |
@@ -624,77 +624,14 @@ xfs_blkdev_put( | |||
624 | struct block_device *bdev) | 624 | struct block_device *bdev) |
625 | { | 625 | { |
626 | if (bdev) | 626 | if (bdev) |
627 | close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); | 627 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); |
628 | } | ||
629 | |||
630 | /* | ||
631 | * Try to write out the superblock using barriers. | ||
632 | */ | ||
633 | STATIC int | ||
634 | xfs_barrier_test( | ||
635 | xfs_mount_t *mp) | ||
636 | { | ||
637 | xfs_buf_t *sbp = xfs_getsb(mp, 0); | ||
638 | int error; | ||
639 | |||
640 | XFS_BUF_UNDONE(sbp); | ||
641 | XFS_BUF_UNREAD(sbp); | ||
642 | XFS_BUF_UNDELAYWRITE(sbp); | ||
643 | XFS_BUF_WRITE(sbp); | ||
644 | XFS_BUF_UNASYNC(sbp); | ||
645 | XFS_BUF_ORDERED(sbp); | ||
646 | |||
647 | xfsbdstrat(mp, sbp); | ||
648 | error = xfs_iowait(sbp); | ||
649 | |||
650 | /* | ||
651 | * Clear all the flags we set and possible error state in the | ||
652 | * buffer. We only did the write to try out whether barriers | ||
653 | * worked and shouldn't leave any traces in the superblock | ||
654 | * buffer. | ||
655 | */ | ||
656 | XFS_BUF_DONE(sbp); | ||
657 | XFS_BUF_ERROR(sbp, 0); | ||
658 | XFS_BUF_UNORDERED(sbp); | ||
659 | |||
660 | xfs_buf_relse(sbp); | ||
661 | return error; | ||
662 | } | ||
663 | |||
664 | STATIC void | ||
665 | xfs_mountfs_check_barriers(xfs_mount_t *mp) | ||
666 | { | ||
667 | int error; | ||
668 | |||
669 | if (mp->m_logdev_targp != mp->m_ddev_targp) { | ||
670 | xfs_fs_cmn_err(CE_NOTE, mp, | ||
671 | "Disabling barriers, not supported with external log device"); | ||
672 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | ||
673 | return; | ||
674 | } | ||
675 | |||
676 | if (xfs_readonly_buftarg(mp->m_ddev_targp)) { | ||
677 | xfs_fs_cmn_err(CE_NOTE, mp, | ||
678 | "Disabling barriers, underlying device is readonly"); | ||
679 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | ||
680 | return; | ||
681 | } | ||
682 | |||
683 | error = xfs_barrier_test(mp); | ||
684 | if (error) { | ||
685 | xfs_fs_cmn_err(CE_NOTE, mp, | ||
686 | "Disabling barriers, trial barrier write failed"); | ||
687 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | ||
688 | return; | ||
689 | } | ||
690 | } | 628 | } |
691 | 629 | ||
692 | void | 630 | void |
693 | xfs_blkdev_issue_flush( | 631 | xfs_blkdev_issue_flush( |
694 | xfs_buftarg_t *buftarg) | 632 | xfs_buftarg_t *buftarg) |
695 | { | 633 | { |
696 | blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, | 634 | blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); |
697 | BLKDEV_IFL_WAIT); | ||
698 | } | 635 | } |
699 | 636 | ||
700 | STATIC void | 637 | STATIC void |
@@ -747,8 +684,8 @@ xfs_open_devices( | |||
747 | goto out_close_logdev; | 684 | goto out_close_logdev; |
748 | 685 | ||
749 | if (rtdev == ddev || rtdev == logdev) { | 686 | if (rtdev == ddev || rtdev == logdev) { |
750 | cmn_err(CE_WARN, | 687 | xfs_warn(mp, |
751 | "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); | 688 | "Cannot mount filesystem with identical rtdev and ddev/logdev."); |
752 | error = EINVAL; | 689 | error = EINVAL; |
753 | goto out_close_rtdev; | 690 | goto out_close_rtdev; |
754 | } | 691 | } |
@@ -758,18 +695,20 @@ xfs_open_devices( | |||
758 | * Setup xfs_mount buffer target pointers | 695 | * Setup xfs_mount buffer target pointers |
759 | */ | 696 | */ |
760 | error = ENOMEM; | 697 | error = ENOMEM; |
761 | mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); | 698 | mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname); |
762 | if (!mp->m_ddev_targp) | 699 | if (!mp->m_ddev_targp) |
763 | goto out_close_rtdev; | 700 | goto out_close_rtdev; |
764 | 701 | ||
765 | if (rtdev) { | 702 | if (rtdev) { |
766 | mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); | 703 | mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, |
704 | mp->m_fsname); | ||
767 | if (!mp->m_rtdev_targp) | 705 | if (!mp->m_rtdev_targp) |
768 | goto out_free_ddev_targ; | 706 | goto out_free_ddev_targ; |
769 | } | 707 | } |
770 | 708 | ||
771 | if (logdev && logdev != ddev) { | 709 | if (logdev && logdev != ddev) { |
772 | mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); | 710 | mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, |
711 | mp->m_fsname); | ||
773 | if (!mp->m_logdev_targp) | 712 | if (!mp->m_logdev_targp) |
774 | goto out_free_rtdev_targ; | 713 | goto out_free_rtdev_targ; |
775 | } else { | 714 | } else { |
@@ -829,63 +768,6 @@ xfs_setup_devices( | |||
829 | return 0; | 768 | return 0; |
830 | } | 769 | } |
831 | 770 | ||
832 | /* | ||
833 | * XFS AIL push thread support | ||
834 | */ | ||
835 | void | ||
836 | xfsaild_wakeup( | ||
837 | struct xfs_ail *ailp, | ||
838 | xfs_lsn_t threshold_lsn) | ||
839 | { | ||
840 | ailp->xa_target = threshold_lsn; | ||
841 | wake_up_process(ailp->xa_task); | ||
842 | } | ||
843 | |||
844 | STATIC int | ||
845 | xfsaild( | ||
846 | void *data) | ||
847 | { | ||
848 | struct xfs_ail *ailp = data; | ||
849 | xfs_lsn_t last_pushed_lsn = 0; | ||
850 | long tout = 0; /* milliseconds */ | ||
851 | |||
852 | while (!kthread_should_stop()) { | ||
853 | schedule_timeout_interruptible(tout ? | ||
854 | msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); | ||
855 | |||
856 | /* swsusp */ | ||
857 | try_to_freeze(); | ||
858 | |||
859 | ASSERT(ailp->xa_mount->m_log); | ||
860 | if (XFS_FORCED_SHUTDOWN(ailp->xa_mount)) | ||
861 | continue; | ||
862 | |||
863 | tout = xfsaild_push(ailp, &last_pushed_lsn); | ||
864 | } | ||
865 | |||
866 | return 0; | ||
867 | } /* xfsaild */ | ||
868 | |||
869 | int | ||
870 | xfsaild_start( | ||
871 | struct xfs_ail *ailp) | ||
872 | { | ||
873 | ailp->xa_target = 0; | ||
874 | ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", | ||
875 | ailp->xa_mount->m_fsname); | ||
876 | if (IS_ERR(ailp->xa_task)) | ||
877 | return -PTR_ERR(ailp->xa_task); | ||
878 | return 0; | ||
879 | } | ||
880 | |||
881 | void | ||
882 | xfsaild_stop( | ||
883 | struct xfs_ail *ailp) | ||
884 | { | ||
885 | kthread_stop(ailp->xa_task); | ||
886 | } | ||
887 | |||
888 | |||
889 | /* Catch misguided souls that try to use this interface on XFS */ | 771 | /* Catch misguided souls that try to use this interface on XFS */ |
890 | STATIC struct inode * | 772 | STATIC struct inode * |
891 | xfs_fs_alloc_inode( | 773 | xfs_fs_alloc_inode( |
@@ -938,7 +820,7 @@ out_reclaim: | |||
938 | * Slab object creation initialisation for the XFS inode. | 820 | * Slab object creation initialisation for the XFS inode. |
939 | * This covers only the idempotent fields in the XFS inode; | 821 | * This covers only the idempotent fields in the XFS inode; |
940 | * all other fields need to be initialised on allocation | 822 | * all other fields need to be initialised on allocation |
941 | * from the slab. This avoids the need to repeatedly intialise | 823 | * from the slab. This avoids the need to repeatedly initialise |
942 | * fields in the xfs inode that are left in the initialised state | 824 | * fields in the xfs inode that are left in the initialised state |
943 | * when freeing the inode. | 825 | * when freeing the inode. |
944 | */ | 826 | */ |
@@ -972,12 +854,7 @@ xfs_fs_inode_init_once( | |||
972 | 854 | ||
973 | /* | 855 | /* |
974 | * Dirty the XFS inode when mark_inode_dirty_sync() is called so that | 856 | * Dirty the XFS inode when mark_inode_dirty_sync() is called so that |
975 | * we catch unlogged VFS level updates to the inode. Care must be taken | 857 | * we catch unlogged VFS level updates to the inode. |
976 | * here - the transaction code calls mark_inode_dirty_sync() to mark the | ||
977 | * VFS inode dirty in a transaction and clears the i_update_core field; | ||
978 | * it must clear the field after calling mark_inode_dirty_sync() to | ||
979 | * correctly indicate that the dirty state has been propagated into the | ||
980 | * inode log item. | ||
981 | * | 858 | * |
982 | * We need the barrier() to maintain correct ordering between unlogged | 859 | * We need the barrier() to maintain correct ordering between unlogged |
983 | * updates and the transaction commit code that clears the i_update_core | 860 | * updates and the transaction commit code that clears the i_update_core |
@@ -986,7 +863,8 @@ xfs_fs_inode_init_once( | |||
986 | */ | 863 | */ |
987 | STATIC void | 864 | STATIC void |
988 | xfs_fs_dirty_inode( | 865 | xfs_fs_dirty_inode( |
989 | struct inode *inode) | 866 | struct inode *inode, |
867 | int flags) | ||
990 | { | 868 | { |
991 | barrier(); | 869 | barrier(); |
992 | XFS_I(inode)->i_update_core = 1; | 870 | XFS_I(inode)->i_update_core = 1; |
@@ -1084,7 +962,7 @@ xfs_fs_write_inode( | |||
1084 | error = 0; | 962 | error = 0; |
1085 | goto out_unlock; | 963 | goto out_unlock; |
1086 | } | 964 | } |
1087 | error = xfs_iflush(ip, 0); | 965 | error = xfs_iflush(ip, SYNC_TRYLOCK); |
1088 | } | 966 | } |
1089 | 967 | ||
1090 | out_unlock: | 968 | out_unlock: |
@@ -1126,6 +1004,8 @@ xfs_fs_evict_inode( | |||
1126 | */ | 1004 | */ |
1127 | ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); | 1005 | ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); |
1128 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); | 1006 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); |
1007 | lockdep_set_class_and_name(&ip->i_iolock.mr_lock, | ||
1008 | &xfs_iolock_reclaimable, "xfs_iolock_reclaimable"); | ||
1129 | 1009 | ||
1130 | xfs_inactive(ip); | 1010 | xfs_inactive(ip); |
1131 | } | 1011 | } |
@@ -1195,22 +1075,12 @@ xfs_fs_sync_fs( | |||
1195 | return -error; | 1075 | return -error; |
1196 | 1076 | ||
1197 | if (laptop_mode) { | 1077 | if (laptop_mode) { |
1198 | int prev_sync_seq = mp->m_sync_seq; | ||
1199 | |||
1200 | /* | 1078 | /* |
1201 | * The disk must be active because we're syncing. | 1079 | * The disk must be active because we're syncing. |
1202 | * We schedule xfssyncd now (now that the disk is | 1080 | * We schedule xfssyncd now (now that the disk is |
1203 | * active) instead of later (when it might not be). | 1081 | * active) instead of later (when it might not be). |
1204 | */ | 1082 | */ |
1205 | wake_up_process(mp->m_sync_task); | 1083 | flush_delayed_work_sync(&mp->m_sync_work); |
1206 | /* | ||
1207 | * We have to wait for the sync iteration to complete. | ||
1208 | * If we don't, the disk activity caused by the sync | ||
1209 | * will come after the sync is completed, and that | ||
1210 | * triggers another sync from laptop mode. | ||
1211 | */ | ||
1212 | wait_event(mp->m_wait_single_sync_task, | ||
1213 | mp->m_sync_seq != prev_sync_seq); | ||
1214 | } | 1084 | } |
1215 | 1085 | ||
1216 | return 0; | 1086 | return 0; |
@@ -1308,14 +1178,6 @@ xfs_fs_remount( | |||
1308 | switch (token) { | 1178 | switch (token) { |
1309 | case Opt_barrier: | 1179 | case Opt_barrier: |
1310 | mp->m_flags |= XFS_MOUNT_BARRIER; | 1180 | mp->m_flags |= XFS_MOUNT_BARRIER; |
1311 | |||
1312 | /* | ||
1313 | * Test if barriers are actually working if we can, | ||
1314 | * else delay this check until the filesystem is | ||
1315 | * marked writeable. | ||
1316 | */ | ||
1317 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) | ||
1318 | xfs_mountfs_check_barriers(mp); | ||
1319 | break; | 1181 | break; |
1320 | case Opt_nobarrier: | 1182 | case Opt_nobarrier: |
1321 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | 1183 | mp->m_flags &= ~XFS_MOUNT_BARRIER; |
@@ -1338,8 +1200,8 @@ xfs_fs_remount( | |||
1338 | * options that we can't actually change. | 1200 | * options that we can't actually change. |
1339 | */ | 1201 | */ |
1340 | #if 0 | 1202 | #if 0 |
1341 | printk(KERN_INFO | 1203 | xfs_info(mp, |
1342 | "XFS: mount option \"%s\" not supported for remount\n", p); | 1204 | "mount option \"%s\" not supported for remount\n", p); |
1343 | return -EINVAL; | 1205 | return -EINVAL; |
1344 | #else | 1206 | #else |
1345 | break; | 1207 | break; |
@@ -1350,8 +1212,6 @@ xfs_fs_remount( | |||
1350 | /* ro -> rw */ | 1212 | /* ro -> rw */ |
1351 | if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { | 1213 | if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { |
1352 | mp->m_flags &= ~XFS_MOUNT_RDONLY; | 1214 | mp->m_flags &= ~XFS_MOUNT_RDONLY; |
1353 | if (mp->m_flags & XFS_MOUNT_BARRIER) | ||
1354 | xfs_mountfs_check_barriers(mp); | ||
1355 | 1215 | ||
1356 | /* | 1216 | /* |
1357 | * If this is the first remount to writeable state we | 1217 | * If this is the first remount to writeable state we |
@@ -1360,8 +1220,7 @@ xfs_fs_remount( | |||
1360 | if (mp->m_update_flags) { | 1220 | if (mp->m_update_flags) { |
1361 | error = xfs_mount_log_sb(mp, mp->m_update_flags); | 1221 | error = xfs_mount_log_sb(mp, mp->m_update_flags); |
1362 | if (error) { | 1222 | if (error) { |
1363 | cmn_err(CE_WARN, | 1223 | xfs_warn(mp, "failed to write sb changes"); |
1364 | "XFS: failed to write sb changes"); | ||
1365 | return error; | 1224 | return error; |
1366 | } | 1225 | } |
1367 | mp->m_update_flags = 0; | 1226 | mp->m_update_flags = 0; |
@@ -1407,7 +1266,7 @@ xfs_fs_freeze( | |||
1407 | 1266 | ||
1408 | xfs_save_resvblks(mp); | 1267 | xfs_save_resvblks(mp); |
1409 | xfs_quiesce_attr(mp); | 1268 | xfs_quiesce_attr(mp); |
1410 | return -xfs_fs_log_dummy(mp, SYNC_WAIT); | 1269 | return -xfs_fs_log_dummy(mp); |
1411 | } | 1270 | } |
1412 | 1271 | ||
1413 | STATIC int | 1272 | STATIC int |
@@ -1445,15 +1304,15 @@ xfs_finish_flags( | |||
1445 | mp->m_logbsize = mp->m_sb.sb_logsunit; | 1304 | mp->m_logbsize = mp->m_sb.sb_logsunit; |
1446 | } else if (mp->m_logbsize > 0 && | 1305 | } else if (mp->m_logbsize > 0 && |
1447 | mp->m_logbsize < mp->m_sb.sb_logsunit) { | 1306 | mp->m_logbsize < mp->m_sb.sb_logsunit) { |
1448 | cmn_err(CE_WARN, | 1307 | xfs_warn(mp, |
1449 | "XFS: logbuf size must be greater than or equal to log stripe size"); | 1308 | "logbuf size must be greater than or equal to log stripe size"); |
1450 | return XFS_ERROR(EINVAL); | 1309 | return XFS_ERROR(EINVAL); |
1451 | } | 1310 | } |
1452 | } else { | 1311 | } else { |
1453 | /* Fail a mount if the logbuf is larger than 32K */ | 1312 | /* Fail a mount if the logbuf is larger than 32K */ |
1454 | if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { | 1313 | if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { |
1455 | cmn_err(CE_WARN, | 1314 | xfs_warn(mp, |
1456 | "XFS: logbuf size for version 1 logs must be 16K or 32K"); | 1315 | "logbuf size for version 1 logs must be 16K or 32K"); |
1457 | return XFS_ERROR(EINVAL); | 1316 | return XFS_ERROR(EINVAL); |
1458 | } | 1317 | } |
1459 | } | 1318 | } |
@@ -1470,8 +1329,8 @@ xfs_finish_flags( | |||
1470 | * prohibit r/w mounts of read-only filesystems | 1329 | * prohibit r/w mounts of read-only filesystems |
1471 | */ | 1330 | */ |
1472 | if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { | 1331 | if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { |
1473 | cmn_err(CE_WARN, | 1332 | xfs_warn(mp, |
1474 | "XFS: cannot mount a read-only filesystem as read-write"); | 1333 | "cannot mount a read-only filesystem as read-write"); |
1475 | return XFS_ERROR(EROFS); | 1334 | return XFS_ERROR(EROFS); |
1476 | } | 1335 | } |
1477 | 1336 | ||
@@ -1495,9 +1354,6 @@ xfs_fs_fill_super( | |||
1495 | spin_lock_init(&mp->m_sb_lock); | 1354 | spin_lock_init(&mp->m_sb_lock); |
1496 | mutex_init(&mp->m_growlock); | 1355 | mutex_init(&mp->m_growlock); |
1497 | atomic_set(&mp->m_active_trans, 0); | 1356 | atomic_set(&mp->m_active_trans, 0); |
1498 | INIT_LIST_HEAD(&mp->m_sync_list); | ||
1499 | spin_lock_init(&mp->m_sync_lock); | ||
1500 | init_waitqueue_head(&mp->m_wait_single_sync_task); | ||
1501 | 1357 | ||
1502 | mp->m_super = sb; | 1358 | mp->m_super = sb; |
1503 | sb->s_fs_info = mp; | 1359 | sb->s_fs_info = mp; |
@@ -1521,8 +1377,9 @@ xfs_fs_fill_super( | |||
1521 | if (error) | 1377 | if (error) |
1522 | goto out_free_fsname; | 1378 | goto out_free_fsname; |
1523 | 1379 | ||
1524 | if (xfs_icsb_init_counters(mp)) | 1380 | error = xfs_icsb_init_counters(mp); |
1525 | mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; | 1381 | if (error) |
1382 | goto out_close_devices; | ||
1526 | 1383 | ||
1527 | error = xfs_readsb(mp, flags); | 1384 | error = xfs_readsb(mp, flags); |
1528 | if (error) | 1385 | if (error) |
@@ -1536,17 +1393,18 @@ xfs_fs_fill_super( | |||
1536 | if (error) | 1393 | if (error) |
1537 | goto out_free_sb; | 1394 | goto out_free_sb; |
1538 | 1395 | ||
1539 | if (mp->m_flags & XFS_MOUNT_BARRIER) | ||
1540 | xfs_mountfs_check_barriers(mp); | ||
1541 | |||
1542 | error = xfs_filestream_mount(mp); | 1396 | error = xfs_filestream_mount(mp); |
1543 | if (error) | 1397 | if (error) |
1544 | goto out_free_sb; | 1398 | goto out_free_sb; |
1545 | 1399 | ||
1546 | error = xfs_mountfs(mp); | 1400 | /* |
1547 | if (error) | 1401 | * we must configure the block size in the superblock before we run the |
1548 | goto out_filestream_unmount; | 1402 | * full mount process as the mount process can lookup and cache inodes. |
1549 | 1403 | * For the same reason we must also initialise the syncd and register | |
1404 | * the inode cache shrinker so that inodes can be reclaimed during | ||
1405 | * operations like a quotacheck that iterate all inodes in the | ||
1406 | * filesystem. | ||
1407 | */ | ||
1550 | sb->s_magic = XFS_SB_MAGIC; | 1408 | sb->s_magic = XFS_SB_MAGIC; |
1551 | sb->s_blocksize = mp->m_sb.sb_blocksize; | 1409 | sb->s_blocksize = mp->m_sb.sb_blocksize; |
1552 | sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; | 1410 | sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; |
@@ -1554,6 +1412,16 @@ xfs_fs_fill_super( | |||
1554 | sb->s_time_gran = 1; | 1412 | sb->s_time_gran = 1; |
1555 | set_posix_acl_flag(sb); | 1413 | set_posix_acl_flag(sb); |
1556 | 1414 | ||
1415 | error = xfs_syncd_init(mp); | ||
1416 | if (error) | ||
1417 | goto out_filestream_unmount; | ||
1418 | |||
1419 | xfs_inode_shrinker_register(mp); | ||
1420 | |||
1421 | error = xfs_mountfs(mp); | ||
1422 | if (error) | ||
1423 | goto out_syncd_stop; | ||
1424 | |||
1557 | root = igrab(VFS_I(mp->m_rootip)); | 1425 | root = igrab(VFS_I(mp->m_rootip)); |
1558 | if (!root) { | 1426 | if (!root) { |
1559 | error = ENOENT; | 1427 | error = ENOENT; |
@@ -1569,20 +1437,18 @@ xfs_fs_fill_super( | |||
1569 | goto fail_vnrele; | 1437 | goto fail_vnrele; |
1570 | } | 1438 | } |
1571 | 1439 | ||
1572 | error = xfs_syncd_init(mp); | ||
1573 | if (error) | ||
1574 | goto fail_vnrele; | ||
1575 | |||
1576 | xfs_inode_shrinker_register(mp); | ||
1577 | |||
1578 | return 0; | 1440 | return 0; |
1579 | 1441 | ||
1442 | out_syncd_stop: | ||
1443 | xfs_inode_shrinker_unregister(mp); | ||
1444 | xfs_syncd_stop(mp); | ||
1580 | out_filestream_unmount: | 1445 | out_filestream_unmount: |
1581 | xfs_filestream_unmount(mp); | 1446 | xfs_filestream_unmount(mp); |
1582 | out_free_sb: | 1447 | out_free_sb: |
1583 | xfs_freesb(mp); | 1448 | xfs_freesb(mp); |
1584 | out_destroy_counters: | 1449 | out_destroy_counters: |
1585 | xfs_icsb_destroy_counters(mp); | 1450 | xfs_icsb_destroy_counters(mp); |
1451 | out_close_devices: | ||
1586 | xfs_close_devices(mp); | 1452 | xfs_close_devices(mp); |
1587 | out_free_fsname: | 1453 | out_free_fsname: |
1588 | xfs_free_fsname(mp); | 1454 | xfs_free_fsname(mp); |
@@ -1599,6 +1465,9 @@ xfs_fs_fill_super( | |||
1599 | } | 1465 | } |
1600 | 1466 | ||
1601 | fail_unmount: | 1467 | fail_unmount: |
1468 | xfs_inode_shrinker_unregister(mp); | ||
1469 | xfs_syncd_stop(mp); | ||
1470 | |||
1602 | /* | 1471 | /* |
1603 | * Blow away any referenced inode in the filestreams cache. | 1472 | * Blow away any referenced inode in the filestreams cache. |
1604 | * This can and will cause log traffic as inodes go inactive | 1473 | * This can and will cause log traffic as inodes go inactive |
@@ -1612,16 +1481,14 @@ xfs_fs_fill_super( | |||
1612 | goto out_free_sb; | 1481 | goto out_free_sb; |
1613 | } | 1482 | } |
1614 | 1483 | ||
1615 | STATIC int | 1484 | STATIC struct dentry * |
1616 | xfs_fs_get_sb( | 1485 | xfs_fs_mount( |
1617 | struct file_system_type *fs_type, | 1486 | struct file_system_type *fs_type, |
1618 | int flags, | 1487 | int flags, |
1619 | const char *dev_name, | 1488 | const char *dev_name, |
1620 | void *data, | 1489 | void *data) |
1621 | struct vfsmount *mnt) | ||
1622 | { | 1490 | { |
1623 | return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, | 1491 | return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); |
1624 | mnt); | ||
1625 | } | 1492 | } |
1626 | 1493 | ||
1627 | static const struct super_operations xfs_super_operations = { | 1494 | static const struct super_operations xfs_super_operations = { |
@@ -1642,7 +1509,7 @@ static const struct super_operations xfs_super_operations = { | |||
1642 | static struct file_system_type xfs_fs_type = { | 1509 | static struct file_system_type xfs_fs_type = { |
1643 | .owner = THIS_MODULE, | 1510 | .owner = THIS_MODULE, |
1644 | .name = "xfs", | 1511 | .name = "xfs", |
1645 | .get_sb = xfs_fs_get_sb, | 1512 | .mount = xfs_fs_mount, |
1646 | .kill_sb = kill_block_super, | 1513 | .kill_sb = kill_block_super, |
1647 | .fs_flags = FS_REQUIRES_DEV, | 1514 | .fs_flags = FS_REQUIRES_DEV, |
1648 | }; | 1515 | }; |
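For context (not part of this commit): the hunk above is XFS's side of the VFS-wide conversion from .get_sb to .mount, where the entry point now returns the root dentry directly instead of filling in a vfsmount out-parameter. A minimal sketch of the same shape for a hypothetical "foofs" (all foofs_* names are invented for illustration):

#include <linux/fs.h>
#include <linux/module.h>

static int foofs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* parse options from 'data', read the on-disk super, set sb->s_op
	 * and sb->s_root here; this callback is unchanged by the switch */
	return 0;
}

static struct dentry *
foofs_mount(struct file_system_type *fs_type, int flags,
	    const char *dev_name, void *data)
{
	/* mount_bdev() opens the block device, runs fill_super and hands
	 * back the root dentry (or an ERR_PTR) */
	return mount_bdev(fs_type, flags, dev_name, data, foofs_fill_super);
}

static struct file_system_type foofs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "foofs",
	.mount		= foofs_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};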
@@ -1790,6 +1657,38 @@ xfs_destroy_zones(void) | |||
1790 | } | 1657 | } |
1791 | 1658 | ||
1792 | STATIC int __init | 1659 | STATIC int __init |
1660 | xfs_init_workqueues(void) | ||
1661 | { | ||
1662 | /* | ||
1663 | * max_active is set to 8 to give enough concurrency to allow ||
1664 | * multiple work operations on each CPU to run. This allows multiple | ||
1665 | * filesystems to be running sync work concurrently, and scales with | ||
1666 | * the number of CPUs in the system. | ||
1667 | */ | ||
1668 | xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); | ||
1669 | if (!xfs_syncd_wq) | ||
1670 | goto out; | ||
1671 | |||
1672 | xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8); | ||
1673 | if (!xfs_ail_wq) | ||
1674 | goto out_destroy_syncd; | ||
1675 | |||
1676 | return 0; | ||
1677 | |||
1678 | out_destroy_syncd: | ||
1679 | destroy_workqueue(xfs_syncd_wq); | ||
1680 | out: | ||
1681 | return -ENOMEM; | ||
1682 | } | ||
1683 | |||
1684 | STATIC void | ||
1685 | xfs_destroy_workqueues(void) | ||
1686 | { | ||
1687 | destroy_workqueue(xfs_ail_wq); | ||
1688 | destroy_workqueue(xfs_syncd_wq); | ||
1689 | } | ||
1690 | |||
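To make the intent of these module-level workqueues concrete, here is a rough consumer-side sketch (hypothetical foo_* names; the real per-mount queuing appears in the xfs_sync.c hunks below). Many mounts share the one queue, and max_active = 8 bounds how many of their work items may run concurrently per CPU:

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *foo_syncd_wq;	/* from alloc_workqueue() */

struct foo_mount {
	struct delayed_work	sync_work;	/* INIT_DELAYED_WORK() at mount */
};

static void foo_queue_sync(struct foo_mount *m)
{
	/* one shared queue, many mounts: the workqueue core multiplexes
	 * all the per-mount items onto the "foosyncd" worker pool */
	queue_delayed_work(foo_syncd_wq, &m->sync_work,
			   msecs_to_jiffies(30 * 1000));
}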
1691 | STATIC int __init | ||
1793 | init_xfs_fs(void) | 1692 | init_xfs_fs(void) |
1794 | { | 1693 | { |
1795 | int error; | 1694 | int error; |
@@ -1804,10 +1703,14 @@ init_xfs_fs(void) | |||
1804 | if (error) | 1703 | if (error) |
1805 | goto out; | 1704 | goto out; |
1806 | 1705 | ||
1807 | error = xfs_mru_cache_init(); | 1706 | error = xfs_init_workqueues(); |
1808 | if (error) | 1707 | if (error) |
1809 | goto out_destroy_zones; | 1708 | goto out_destroy_zones; |
1810 | 1709 | ||
1710 | error = xfs_mru_cache_init(); | ||
1711 | if (error) | ||
1712 | goto out_destroy_wq; | ||
1713 | |||
1811 | error = xfs_filestream_init(); | 1714 | error = xfs_filestream_init(); |
1812 | if (error) | 1715 | if (error) |
1813 | goto out_mru_cache_uninit; | 1716 | goto out_mru_cache_uninit; |
@@ -1841,6 +1744,8 @@ init_xfs_fs(void) | |||
1841 | xfs_filestream_uninit(); | 1744 | xfs_filestream_uninit(); |
1842 | out_mru_cache_uninit: | 1745 | out_mru_cache_uninit: |
1843 | xfs_mru_cache_uninit(); | 1746 | xfs_mru_cache_uninit(); |
1747 | out_destroy_wq: | ||
1748 | xfs_destroy_workqueues(); | ||
1844 | out_destroy_zones: | 1749 | out_destroy_zones: |
1845 | xfs_destroy_zones(); | 1750 | xfs_destroy_zones(); |
1846 | out: | 1751 | out: |
@@ -1857,6 +1762,7 @@ exit_xfs_fs(void) | |||
1857 | xfs_buf_terminate(); | 1762 | xfs_buf_terminate(); |
1858 | xfs_filestream_uninit(); | 1763 | xfs_filestream_uninit(); |
1859 | xfs_mru_cache_uninit(); | 1764 | xfs_mru_cache_uninit(); |
1765 | xfs_destroy_workqueues(); | ||
1860 | xfs_destroy_zones(); | 1766 | xfs_destroy_zones(); |
1861 | } | 1767 | } |
1862 | 1768 | ||
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 1ef4a4d2d997..50a3266c999e 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h | |||
@@ -62,6 +62,7 @@ extern void xfs_qm_exit(void); | |||
62 | # define XFS_DBG_STRING "no debug" | 62 | # define XFS_DBG_STRING "no debug" |
63 | #endif | 63 | #endif |
64 | 64 | ||
65 | #define XFS_VERSION_STRING "SGI XFS" | ||
65 | #define XFS_BUILD_OPTIONS XFS_ACL_STRING \ | 66 | #define XFS_BUILD_OPTIONS XFS_ACL_STRING \ |
66 | XFS_SECURITY_STRING \ | 67 | XFS_SECURITY_STRING \ |
67 | XFS_REALTIME_STRING \ | 68 | XFS_REALTIME_STRING \ |
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 81976ffed7d6..8ecad5ff9f9b 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include "xfs_log.h" | 22 | #include "xfs_log.h" |
23 | #include "xfs_inum.h" | 23 | #include "xfs_inum.h" |
24 | #include "xfs_trans.h" | 24 | #include "xfs_trans.h" |
25 | #include "xfs_trans_priv.h" | ||
25 | #include "xfs_sb.h" | 26 | #include "xfs_sb.h" |
26 | #include "xfs_ag.h" | 27 | #include "xfs_ag.h" |
27 | #include "xfs_mount.h" | 28 | #include "xfs_mount.h" |
@@ -39,42 +40,61 @@ | |||
39 | #include <linux/kthread.h> | 40 | #include <linux/kthread.h> |
40 | #include <linux/freezer.h> | 41 | #include <linux/freezer.h> |
41 | 42 | ||
43 | struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ | ||
42 | 44 | ||
43 | STATIC xfs_inode_t * | 45 | /* |
44 | xfs_inode_ag_lookup( | 46 | * The inode lookup is done in batches to keep the amount of lock traffic and |
45 | struct xfs_mount *mp, | 47 | * radix tree lookups to a minimum. The batch size is a trade off between |
46 | struct xfs_perag *pag, | 48 | * lookup reduction and stack usage. This is in the reclaim path, so we can't |
47 | uint32_t *first_index, | 49 | * be too greedy. |
48 | int tag) | 50 | */ |
51 | #define XFS_LOOKUP_BATCH 32 | ||
52 | |||
53 | STATIC int | ||
54 | xfs_inode_ag_walk_grab( | ||
55 | struct xfs_inode *ip) | ||
49 | { | 56 | { |
50 | int nr_found; | 57 | struct inode *inode = VFS_I(ip); |
51 | struct xfs_inode *ip; | 58 | |
59 | ASSERT(rcu_read_lock_held()); | ||
52 | 60 | ||
53 | /* | 61 | /* |
54 | * use a gang lookup to find the next inode in the tree | 62 | * check for stale RCU freed inode |
55 | * as the tree is sparse and a gang lookup walks to find | 63 | * |
56 | * the number of objects requested. | 64 | * If the inode has been reallocated, it doesn't matter if it's not in |
65 | * the AG we are walking - we are walking for writeback, so if it | ||
66 | * passes all the "valid inode" checks and is dirty, then we'll write | ||
67 | * it back anyway. If it has been reallocated and is still being ||
68 | * initialised, the XFS_INEW check below will catch it. | ||
57 | */ | 69 | */ |
58 | if (tag == XFS_ICI_NO_TAG) { | 70 | spin_lock(&ip->i_flags_lock); |
59 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, | 71 | if (!ip->i_ino) |
60 | (void **)&ip, *first_index, 1); | 72 | goto out_unlock_noent; |
61 | } else { | 73 | |
62 | nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, | 74 | /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ |
63 | (void **)&ip, *first_index, 1, tag); | 75 | if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) |
76 | goto out_unlock_noent; | ||
77 | spin_unlock(&ip->i_flags_lock); | ||
78 | |||
79 | /* nothing to sync during shutdown */ | ||
80 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | ||
81 | return EFSCORRUPTED; | ||
82 | |||
83 | /* If we can't grab the inode, it must be on its way to reclaim. */ ||
84 | if (!igrab(inode)) | ||
85 | return ENOENT; | ||
86 | |||
87 | if (is_bad_inode(inode)) { | ||
88 | IRELE(ip); | ||
89 | return ENOENT; | ||
64 | } | 90 | } |
65 | if (!nr_found) | ||
66 | return NULL; | ||
67 | 91 | ||
68 | /* | 92 | /* inode is valid */ |
69 | * Update the index for the next lookup. Catch overflows | 93 | return 0; |
70 | * into the next AG range which can occur if we have inodes | 94 | |
71 | * in the last block of the AG and we are currently | 95 | out_unlock_noent: |
72 | * pointing to the last inode. | 96 | spin_unlock(&ip->i_flags_lock); |
73 | */ | 97 | return ENOENT; |
74 | *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); | ||
75 | if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) | ||
76 | return NULL; | ||
77 | return ip; | ||
78 | } | 98 | } |
79 | 99 | ||
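The grab helper above encodes the general RCU-walk discipline: an object found under rcu_read_lock() may already have been freed, or freed and reallocated, so its identity and state must be re-checked under the object's own lock before any reference is taken. A generic sketch of the same pattern (the foo_* type and flags are hypothetical stand-ins for the XFS inode details):

#include <linux/spinlock.h>
#include <linux/atomic.h>

#define FOO_NEW		0x01	/* still being initialised */
#define FOO_RECLAIM	0x02	/* being torn down */

struct foo_obj {
	spinlock_t	lock;
	unsigned long	id;		/* zeroed when the object is freed */
	unsigned int	flags;
	atomic_t	refcount;
};

/* caller holds rcu_read_lock() */
static int foo_walk_grab(struct foo_obj *obj)
{
	spin_lock(&obj->lock);
	if (!obj->id || (obj->flags & (FOO_NEW | FOO_RECLAIM))) {
		/* stale RCU-freed object, or unsafe to touch: skip it */
		spin_unlock(&obj->lock);
		return -ENOENT;
	}
	spin_unlock(&obj->lock);

	/* only now take a reference that outlives the RCU section */
	if (!atomic_inc_not_zero(&obj->refcount))
		return -ENOENT;
	return 0;
}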
80 | STATIC int | 100 | STATIC int |
@@ -83,49 +103,83 @@ xfs_inode_ag_walk( | |||
83 | struct xfs_perag *pag, | 103 | struct xfs_perag *pag, |
84 | int (*execute)(struct xfs_inode *ip, | 104 | int (*execute)(struct xfs_inode *ip, |
85 | struct xfs_perag *pag, int flags), | 105 | struct xfs_perag *pag, int flags), |
86 | int flags, | 106 | int flags) |
87 | int tag, | ||
88 | int exclusive, | ||
89 | int *nr_to_scan) | ||
90 | { | 107 | { |
91 | uint32_t first_index; | 108 | uint32_t first_index; |
92 | int last_error = 0; | 109 | int last_error = 0; |
93 | int skipped; | 110 | int skipped; |
111 | int done; | ||
112 | int nr_found; | ||
94 | 113 | ||
95 | restart: | 114 | restart: |
115 | done = 0; | ||
96 | skipped = 0; | 116 | skipped = 0; |
97 | first_index = 0; | 117 | first_index = 0; |
118 | nr_found = 0; | ||
98 | do { | 119 | do { |
120 | struct xfs_inode *batch[XFS_LOOKUP_BATCH]; | ||
99 | int error = 0; | 121 | int error = 0; |
100 | xfs_inode_t *ip; | 122 | int i; |
101 | 123 | ||
102 | if (exclusive) | 124 | rcu_read_lock(); |
103 | write_lock(&pag->pag_ici_lock); | 125 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, |
104 | else | 126 | (void **)batch, first_index, |
105 | read_lock(&pag->pag_ici_lock); | 127 | XFS_LOOKUP_BATCH); |
106 | ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); | 128 | if (!nr_found) { |
107 | if (!ip) { | 129 | rcu_read_unlock(); |
108 | if (exclusive) | ||
109 | write_unlock(&pag->pag_ici_lock); | ||
110 | else | ||
111 | read_unlock(&pag->pag_ici_lock); | ||
112 | break; | 130 | break; |
113 | } | 131 | } |
114 | 132 | ||
115 | /* execute releases pag->pag_ici_lock */ | 133 | /* |
116 | error = execute(ip, pag, flags); | 134 | * Grab the inodes before we drop the lock. If we found |
117 | if (error == EAGAIN) { | 135 | * nothing, nr == 0 and the loop will be skipped. |
118 | skipped++; | 136 | */ |
119 | continue; | 137 | for (i = 0; i < nr_found; i++) { |
138 | struct xfs_inode *ip = batch[i]; | ||
139 | |||
140 | if (done || xfs_inode_ag_walk_grab(ip)) | ||
141 | batch[i] = NULL; | ||
142 | |||
143 | /* | ||
144 | * Update the index for the next lookup. Catch | ||
145 | * overflows into the next AG range which can occur if | ||
146 | * we have inodes in the last block of the AG and we | ||
147 | * are currently pointing to the last inode. | ||
148 | * | ||
149 | * Because we may see inodes that are from the wrong AG | ||
150 | * due to RCU freeing and reallocation, only update the | ||
151 | * index if it lies in this AG. It was a race that led ||
152 | * us to see this inode, so another lookup from the | ||
153 | * same index will not find it again. | ||
154 | */ | ||
155 | if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) | ||
156 | continue; | ||
157 | first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); | ||
158 | if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) | ||
159 | done = 1; | ||
160 | } | ||
161 | |||
162 | /* unlock now we've grabbed the inodes. */ | ||
163 | rcu_read_unlock(); | ||
164 | |||
165 | for (i = 0; i < nr_found; i++) { | ||
166 | if (!batch[i]) | ||
167 | continue; | ||
168 | error = execute(batch[i], pag, flags); | ||
169 | IRELE(batch[i]); | ||
170 | if (error == EAGAIN) { | ||
171 | skipped++; | ||
172 | continue; | ||
173 | } | ||
174 | if (error && last_error != EFSCORRUPTED) | ||
175 | last_error = error; | ||
120 | } | 176 | } |
121 | if (error) | ||
122 | last_error = error; | ||
123 | 177 | ||
124 | /* bail out if the filesystem is corrupted. */ | 178 | /* bail out if the filesystem is corrupted. */ |
125 | if (error == EFSCORRUPTED) | 179 | if (error == EFSCORRUPTED) |
126 | break; | 180 | break; |
127 | 181 | ||
128 | } while ((*nr_to_scan)--); | 182 | } while (nr_found && !done); |
129 | 183 | ||
130 | if (skipped) { | 184 | if (skipped) { |
131 | delay(1); | 185 | delay(1); |
@@ -134,110 +188,32 @@ restart: | |||
134 | return last_error; | 188 | return last_error; |
135 | } | 189 | } |
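Condensed, the rewritten walk above is the batched RCU gang-lookup pattern: fetch up to XFS_LOOKUP_BATCH pointers in one radix tree call under rcu_read_lock(), grab each object, drop the RCU lock, then run the possibly-blocking callback over the batch. A simplified sketch reusing the hypothetical foo_obj from the previous note (foo_put() is an invented release helper):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

#define FOO_LOOKUP_BATCH 32

static void foo_put(struct foo_obj *obj)
{
	atomic_dec(&obj->refcount);	/* freeing elided in this sketch */
}

static int foo_walk(struct radix_tree_root *root,
		    int (*execute)(struct foo_obj *obj))
{
	struct foo_obj *batch[FOO_LOOKUP_BATCH];
	unsigned long first_index = 0;
	int last_error = 0;
	unsigned int nr_found, i;

	do {
		rcu_read_lock();
		nr_found = radix_tree_gang_lookup(root, (void **)batch,
						  first_index,
						  FOO_LOOKUP_BATCH);
		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/* grab (or NULL out) each entry while still under RCU */
		for (i = 0; i < nr_found; i++) {
			struct foo_obj *obj = batch[i];

			if (foo_walk_grab(obj))
				batch[i] = NULL;
			/* advance the lookup cursor past this entry */
			first_index = obj->id + 1;
		}
		rcu_read_unlock();	/* references held; safe to block */

		for (i = 0; i < nr_found; i++) {
			int error;

			if (!batch[i])
				continue;
			error = execute(batch[i]);
			foo_put(batch[i]);
			if (error)
				last_error = error;
		}
	} while (nr_found);

	return last_error;
}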
136 | 190 | ||
137 | /* | ||
138 | * Select the next per-ag structure to iterate during the walk. The reclaim | ||
139 | * walk is optimised only to walk AGs with reclaimable inodes in them. | ||
140 | */ | ||
141 | static struct xfs_perag * | ||
142 | xfs_inode_ag_iter_next_pag( | ||
143 | struct xfs_mount *mp, | ||
144 | xfs_agnumber_t *first, | ||
145 | int tag) | ||
146 | { | ||
147 | struct xfs_perag *pag = NULL; | ||
148 | |||
149 | if (tag == XFS_ICI_RECLAIM_TAG) { | ||
150 | int found; | ||
151 | int ref; | ||
152 | |||
153 | spin_lock(&mp->m_perag_lock); | ||
154 | found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, | ||
155 | (void **)&pag, *first, 1, tag); | ||
156 | if (found <= 0) { | ||
157 | spin_unlock(&mp->m_perag_lock); | ||
158 | return NULL; | ||
159 | } | ||
160 | *first = pag->pag_agno + 1; | ||
161 | /* open coded pag reference increment */ | ||
162 | ref = atomic_inc_return(&pag->pag_ref); | ||
163 | spin_unlock(&mp->m_perag_lock); | ||
164 | trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_); | ||
165 | } else { | ||
166 | pag = xfs_perag_get(mp, *first); | ||
167 | (*first)++; | ||
168 | } | ||
169 | return pag; | ||
170 | } | ||
171 | |||
172 | int | 191 | int |
173 | xfs_inode_ag_iterator( | 192 | xfs_inode_ag_iterator( |
174 | struct xfs_mount *mp, | 193 | struct xfs_mount *mp, |
175 | int (*execute)(struct xfs_inode *ip, | 194 | int (*execute)(struct xfs_inode *ip, |
176 | struct xfs_perag *pag, int flags), | 195 | struct xfs_perag *pag, int flags), |
177 | int flags, | 196 | int flags) |
178 | int tag, | ||
179 | int exclusive, | ||
180 | int *nr_to_scan) | ||
181 | { | 197 | { |
182 | struct xfs_perag *pag; | 198 | struct xfs_perag *pag; |
183 | int error = 0; | 199 | int error = 0; |
184 | int last_error = 0; | 200 | int last_error = 0; |
185 | xfs_agnumber_t ag; | 201 | xfs_agnumber_t ag; |
186 | int nr; | ||
187 | 202 | ||
188 | nr = nr_to_scan ? *nr_to_scan : INT_MAX; | ||
189 | ag = 0; | 203 | ag = 0; |
190 | while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { | 204 | while ((pag = xfs_perag_get(mp, ag))) { |
191 | error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, | 205 | ag = pag->pag_agno + 1; |
192 | exclusive, &nr); | 206 | error = xfs_inode_ag_walk(mp, pag, execute, flags); |
193 | xfs_perag_put(pag); | 207 | xfs_perag_put(pag); |
194 | if (error) { | 208 | if (error) { |
195 | last_error = error; | 209 | last_error = error; |
196 | if (error == EFSCORRUPTED) | 210 | if (error == EFSCORRUPTED) |
197 | break; | 211 | break; |
198 | } | 212 | } |
199 | if (nr <= 0) | ||
200 | break; | ||
201 | } | 213 | } |
202 | if (nr_to_scan) | ||
203 | *nr_to_scan = nr; | ||
204 | return XFS_ERROR(last_error); | 214 | return XFS_ERROR(last_error); |
205 | } | 215 | } |
206 | 216 | ||
207 | /* must be called with pag_ici_lock held and releases it */ | ||
208 | int | ||
209 | xfs_sync_inode_valid( | ||
210 | struct xfs_inode *ip, | ||
211 | struct xfs_perag *pag) | ||
212 | { | ||
213 | struct inode *inode = VFS_I(ip); | ||
214 | int error = EFSCORRUPTED; | ||
215 | |||
216 | /* nothing to sync during shutdown */ | ||
217 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | ||
218 | goto out_unlock; | ||
219 | |||
220 | /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ | ||
221 | error = ENOENT; | ||
222 | if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) | ||
223 | goto out_unlock; | ||
224 | |||
225 | /* If we can't grab the inode, it must be on its way to reclaim. */ ||
226 | if (!igrab(inode)) | ||
227 | goto out_unlock; | ||
228 | |||
229 | if (is_bad_inode(inode)) { | ||
230 | IRELE(ip); | ||
231 | goto out_unlock; | ||
232 | } | ||
233 | |||
234 | /* inode is valid */ | ||
235 | error = 0; | ||
236 | out_unlock: | ||
237 | read_unlock(&pag->pag_ici_lock); | ||
238 | return error; | ||
239 | } | ||
240 | |||
241 | STATIC int | 217 | STATIC int |
242 | xfs_sync_inode_data( | 218 | xfs_sync_inode_data( |
243 | struct xfs_inode *ip, | 219 | struct xfs_inode *ip, |
@@ -248,10 +224,6 @@ xfs_sync_inode_data( | |||
248 | struct address_space *mapping = inode->i_mapping; | 224 | struct address_space *mapping = inode->i_mapping; |
249 | int error = 0; | 225 | int error = 0; |
250 | 226 | ||
251 | error = xfs_sync_inode_valid(ip, pag); | ||
252 | if (error) | ||
253 | return error; | ||
254 | |||
255 | if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | 227 | if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
256 | goto out_wait; | 228 | goto out_wait; |
257 | 229 | ||
@@ -268,7 +240,6 @@ xfs_sync_inode_data( | |||
268 | out_wait: | 240 | out_wait: |
269 | if (flags & SYNC_WAIT) | 241 | if (flags & SYNC_WAIT) |
270 | xfs_ioend_wait(ip); | 242 | xfs_ioend_wait(ip); |
271 | IRELE(ip); | ||
272 | return error; | 243 | return error; |
273 | } | 244 | } |
274 | 245 | ||
@@ -280,10 +251,6 @@ xfs_sync_inode_attr( | |||
280 | { | 251 | { |
281 | int error = 0; | 252 | int error = 0; |
282 | 253 | ||
283 | error = xfs_sync_inode_valid(ip, pag); | ||
284 | if (error) | ||
285 | return error; | ||
286 | |||
287 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 254 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
288 | if (xfs_inode_clean(ip)) | 255 | if (xfs_inode_clean(ip)) |
289 | goto out_unlock; | 256 | goto out_unlock; |
@@ -300,9 +267,18 @@ xfs_sync_inode_attr( | |||
300 | 267 | ||
301 | error = xfs_iflush(ip, flags); | 268 | error = xfs_iflush(ip, flags); |
302 | 269 | ||
270 | /* | ||
271 | * We don't want to try again on non-blocking flushes that can't run | ||
272 | * again immediately. If an inode really must be written, then that's | ||
273 | * what the SYNC_WAIT flag is for. | ||
274 | */ | ||
275 | if (error == EAGAIN) { | ||
276 | ASSERT(!(flags & SYNC_WAIT)); | ||
277 | error = 0; | ||
278 | } | ||
279 | |||
303 | out_unlock: | 280 | out_unlock: |
304 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 281 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
305 | IRELE(ip); | ||
306 | return error; | 282 | return error; |
307 | } | 283 | } |
308 | 284 | ||
@@ -318,8 +294,7 @@ xfs_sync_data( | |||
318 | 294 | ||
319 | ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); | 295 | ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); |
320 | 296 | ||
321 | error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, | 297 | error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); |
322 | XFS_ICI_NO_TAG, 0, NULL); | ||
323 | if (error) | 298 | if (error) |
324 | return XFS_ERROR(error); | 299 | return XFS_ERROR(error); |
325 | 300 | ||
@@ -337,8 +312,7 @@ xfs_sync_attr( | |||
337 | { | 312 | { |
338 | ASSERT((flags & ~SYNC_WAIT) == 0); | 313 | ASSERT((flags & ~SYNC_WAIT) == 0); |
339 | 314 | ||
340 | return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, | 315 | return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); |
341 | XFS_ICI_NO_TAG, 0, NULL); | ||
342 | } | 316 | } |
343 | 317 | ||
344 | STATIC int | 318 | STATIC int |
@@ -401,7 +375,7 @@ xfs_quiesce_data( | |||
401 | 375 | ||
402 | /* mark the log as covered if needed */ | 376 | /* mark the log as covered if needed */ |
403 | if (xfs_log_need_covered(mp)) | 377 | if (xfs_log_need_covered(mp)) |
404 | error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); | 378 | error2 = xfs_fs_log_dummy(mp); |
405 | 379 | ||
406 | /* flush data-only devices */ | 380 | /* flush data-only devices */ |
407 | if (mp->m_rtdev_targp) | 381 | if (mp->m_rtdev_targp) |
@@ -440,7 +414,7 @@ xfs_quiesce_fs( | |||
440 | /* | 414 | /* |
441 | * Second stage of a quiesce. The data is already synced, now we have to take | 415 | * Second stage of a quiesce. The data is already synced, now we have to take |
442 | * care of the metadata. New transactions are already blocked, so we need to | 416 | * care of the metadata. New transactions are already blocked, so we need to |
443 | * wait for any remaining transactions to drain out before proceding. | 417 | * wait for any remaining transactions to drain out before proceeding. |
444 | */ | 418 | */ |
445 | void | 419 | void |
446 | xfs_quiesce_attr( | 420 | xfs_quiesce_attr( |
@@ -464,69 +438,18 @@ xfs_quiesce_attr( | |||
464 | /* Push the superblock and write an unmount record */ | 438 | /* Push the superblock and write an unmount record */ |
465 | error = xfs_log_sbcount(mp, 1); | 439 | error = xfs_log_sbcount(mp, 1); |
466 | if (error) | 440 | if (error) |
467 | xfs_fs_cmn_err(CE_WARN, mp, | 441 | xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " |
468 | "xfs_attr_quiesce: failed to log sb changes. " | ||
469 | "Frozen image may not be consistent."); | 442 | "Frozen image may not be consistent."); |
470 | xfs_log_unmount_write(mp); | 443 | xfs_log_unmount_write(mp); |
471 | xfs_unmountfs_writesb(mp); | 444 | xfs_unmountfs_writesb(mp); |
472 | } | 445 | } |
473 | 446 | ||
474 | /* | 447 | static void |
475 | * Enqueue a work item to be picked up by the vfs xfssyncd thread. | 448 | xfs_syncd_queue_sync( |
476 | * Doing this has two advantages: | 449 | struct xfs_mount *mp) |
477 | * - It saves on stack space, which is tight in certain situations | ||
478 | * - It can be used (with care) as a mechanism to avoid deadlocks. | ||
479 | * Flushing while allocating in a full filesystem requires both. | ||
480 | */ | ||
481 | STATIC void | ||
482 | xfs_syncd_queue_work( | ||
483 | struct xfs_mount *mp, | ||
484 | void *data, | ||
485 | void (*syncer)(struct xfs_mount *, void *), | ||
486 | struct completion *completion) | ||
487 | { | ||
488 | struct xfs_sync_work *work; | ||
489 | |||
490 | work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP); | ||
491 | INIT_LIST_HEAD(&work->w_list); | ||
492 | work->w_syncer = syncer; | ||
493 | work->w_data = data; | ||
494 | work->w_mount = mp; | ||
495 | work->w_completion = completion; | ||
496 | spin_lock(&mp->m_sync_lock); | ||
497 | list_add_tail(&work->w_list, &mp->m_sync_list); | ||
498 | spin_unlock(&mp->m_sync_lock); | ||
499 | wake_up_process(mp->m_sync_task); | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Flush delayed allocate data, attempting to free up reserved space | ||
504 | * from existing allocations. At this point a new allocation attempt | ||
505 | * has failed with ENOSPC and we are in the process of scratching our | ||
506 | * heads, looking about for more room... | ||
507 | */ | ||
508 | STATIC void | ||
509 | xfs_flush_inodes_work( | ||
510 | struct xfs_mount *mp, | ||
511 | void *arg) | ||
512 | { | ||
513 | struct inode *inode = arg; | ||
514 | xfs_sync_data(mp, SYNC_TRYLOCK); | ||
515 | xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); | ||
516 | iput(inode); | ||
517 | } | ||
518 | |||
519 | void | ||
520 | xfs_flush_inodes( | ||
521 | xfs_inode_t *ip) | ||
522 | { | 450 | { |
523 | struct inode *inode = VFS_I(ip); | 451 | queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, |
524 | DECLARE_COMPLETION_ONSTACK(completion); | 452 | msecs_to_jiffies(xfs_syncd_centisecs * 10)); |
525 | |||
526 | igrab(inode); | ||
527 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); | ||
528 | wait_for_completion(&completion); | ||
529 | xfs_log_force(ip->i_mount, XFS_LOG_SYNC); | ||
530 | } | 453 | } |
531 | 454 | ||
532 | /* | 455 | /* |
@@ -536,84 +459,119 @@ xfs_flush_inodes( | |||
536 | */ | 459 | */ |
537 | STATIC void | 460 | STATIC void |
538 | xfs_sync_worker( | 461 | xfs_sync_worker( |
539 | struct xfs_mount *mp, | 462 | struct work_struct *work) |
540 | void *unused) | ||
541 | { | 463 | { |
464 | struct xfs_mount *mp = container_of(to_delayed_work(work), | ||
465 | struct xfs_mount, m_sync_work); | ||
542 | int error; | 466 | int error; |
543 | 467 | ||
544 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { | 468 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { |
545 | xfs_log_force(mp, 0); | ||
546 | xfs_reclaim_inodes(mp, 0); | ||
547 | /* dgc: errors ignored here */ | 469 | /* dgc: errors ignored here */ |
548 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); | ||
549 | if (mp->m_super->s_frozen == SB_UNFROZEN && | 470 | if (mp->m_super->s_frozen == SB_UNFROZEN && |
550 | xfs_log_need_covered(mp)) | 471 | xfs_log_need_covered(mp)) |
551 | error = xfs_fs_log_dummy(mp, 0); | 472 | error = xfs_fs_log_dummy(mp); |
473 | else | ||
474 | xfs_log_force(mp, 0); | ||
475 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); | ||
476 | |||
477 | /* start pushing all the metadata that is currently dirty */ | ||
478 | xfs_ail_push_all(mp->m_ail); | ||
552 | } | 479 | } |
553 | mp->m_sync_seq++; | 480 | |
554 | wake_up(&mp->m_wait_single_sync_task); | 481 | /* queue us up again */ |
482 | xfs_syncd_queue_sync(mp); | ||
555 | } | 483 | } |
556 | 484 | ||
557 | STATIC int | 485 | /* |
558 | xfssyncd( | 486 | * Queue a new inode reclaim pass if there are reclaimable inodes and there |
559 | void *arg) | 487 | * isn't a reclaim pass already in progress. By default it runs every 5s based |
488 | * on the xfs syncd work default of 30s. Perhaps this should have its own ||
489 | * tunable, but that can be done if this method proves to be ineffective or too | ||
490 | * aggressive. | ||
491 | */ | ||
492 | static void | ||
493 | xfs_syncd_queue_reclaim( | ||
494 | struct xfs_mount *mp) | ||
560 | { | 495 | { |
561 | struct xfs_mount *mp = arg; | ||
562 | long timeleft; | ||
563 | xfs_sync_work_t *work, *n; | ||
564 | LIST_HEAD (tmp); | ||
565 | |||
566 | set_freezable(); | ||
567 | timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); | ||
568 | for (;;) { | ||
569 | if (list_empty(&mp->m_sync_list)) | ||
570 | timeleft = schedule_timeout_interruptible(timeleft); | ||
571 | /* swsusp */ | ||
572 | try_to_freeze(); | ||
573 | if (kthread_should_stop() && list_empty(&mp->m_sync_list)) | ||
574 | break; | ||
575 | 496 | ||
576 | spin_lock(&mp->m_sync_lock); | 497 | /* |
577 | /* | 498 | * We can have inodes enter reclaim after we've shut down the syncd |
578 | * We can get woken by laptop mode, to do a sync - | 499 | * workqueue during unmount, so don't allow reclaim work to be queued |
579 | * that's the (only!) case where the list would be | 500 | * during unmount. |
580 | * empty with time remaining. | 501 | */ |
581 | */ | 502 | if (!(mp->m_super->s_flags & MS_ACTIVE)) |
582 | if (!timeleft || list_empty(&mp->m_sync_list)) { | 503 | return; |
583 | if (!timeleft) | ||
584 | timeleft = xfs_syncd_centisecs * | ||
585 | msecs_to_jiffies(10); | ||
586 | INIT_LIST_HEAD(&mp->m_sync_work.w_list); | ||
587 | list_add_tail(&mp->m_sync_work.w_list, | ||
588 | &mp->m_sync_list); | ||
589 | } | ||
590 | list_splice_init(&mp->m_sync_list, &tmp); | ||
591 | spin_unlock(&mp->m_sync_lock); | ||
592 | 504 | ||
593 | list_for_each_entry_safe(work, n, &tmp, w_list) { | 505 | rcu_read_lock(); |
594 | (*work->w_syncer)(mp, work->w_data); | 506 | if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { |
595 | list_del(&work->w_list); | 507 | queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, |
596 | if (work == &mp->m_sync_work) | 508 | msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); |
597 | continue; | ||
598 | if (work->w_completion) | ||
599 | complete(work->w_completion); | ||
600 | kmem_free(work); | ||
601 | } | ||
602 | } | 509 | } |
510 | rcu_read_unlock(); | ||
511 | } | ||
603 | 512 | ||
604 | return 0; | 513 | /* |
514 | * This is a fast pass over the inode cache to try to get reclaim moving on as | ||
515 | * many inodes as possible in a short period of time. It kicks itself every few | ||
516 | * seconds, as well as being kicked by the inode cache shrinker when memory | ||
517 | * goes low. It scans as quickly as possible avoiding locked inodes or those | ||
518 | * already being flushed, and once done schedules a future pass. | ||
519 | */ | ||
520 | STATIC void | ||
521 | xfs_reclaim_worker( | ||
522 | struct work_struct *work) | ||
523 | { | ||
524 | struct xfs_mount *mp = container_of(to_delayed_work(work), | ||
525 | struct xfs_mount, m_reclaim_work); | ||
526 | |||
527 | xfs_reclaim_inodes(mp, SYNC_TRYLOCK); | ||
528 | xfs_syncd_queue_reclaim(mp); | ||
529 | } | ||
530 | |||
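The two workers above replace the old xfssyncd kthread loop with self-rearming delayed work: each invocation does one bounded pass and queues itself again, and unmount breaks the cycle with cancel_delayed_work_sync() (see xfs_syncd_stop() below). The skeleton, continuing the hypothetical foo_mount sketch from earlier:

static void foo_sync_worker(struct work_struct *work)
{
	struct foo_mount *m = container_of(to_delayed_work(work),
					   struct foo_mount, sync_work);

	/* ... one bounded background pass over 'm' goes here ... */

	/* rearm; cancel_delayed_work_sync() at teardown stops the cycle */
	queue_delayed_work(foo_syncd_wq, &m->sync_work,
			   msecs_to_jiffies(30 * 1000));
}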
531 | /* | ||
532 | * Flush delayed allocate data, attempting to free up reserved space | ||
533 | * from existing allocations. At this point a new allocation attempt | ||
534 | * has failed with ENOSPC and we are in the process of scratching our | ||
535 | * heads, looking about for more room. | ||
536 | * | ||
537 | * Queue a new data flush if there isn't one already in progress and | ||
538 | * wait for completion of the flush. This means that we only ever have one | ||
539 | * inode flush in progress no matter how many ENOSPC events are occurring and | ||
540 | * so will prevent the system from bogging down due to every concurrent | ||
541 | * ENOSPC event scanning all the active inodes in the system for writeback. | ||
542 | */ | ||
543 | void | ||
544 | xfs_flush_inodes( | ||
545 | struct xfs_inode *ip) | ||
546 | { | ||
547 | struct xfs_mount *mp = ip->i_mount; | ||
548 | |||
549 | queue_work(xfs_syncd_wq, &mp->m_flush_work); | ||
550 | flush_work_sync(&mp->m_flush_work); | ||
551 | } | ||
552 | |||
553 | STATIC void | ||
554 | xfs_flush_worker( | ||
555 | struct work_struct *work) | ||
556 | { | ||
557 | struct xfs_mount *mp = container_of(work, | ||
558 | struct xfs_mount, m_flush_work); | ||
559 | |||
560 | xfs_sync_data(mp, SYNC_TRYLOCK); | ||
561 | xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); | ||
605 | } | 562 | } |
606 | 563 | ||
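The flush path above gets its single-flight behaviour from the workqueue API itself: queue_work() is a no-op while the item is still pending, so concurrent ENOSPC hitters collapse into one queued flush, and flush_work_sync() makes every caller wait for it to finish. Sketched with the hypothetical foo_mount, here assumed to also carry a plain struct work_struct flush_work member:

void foo_flush_inodes(struct foo_mount *m)
{
	/* a no-op if flush_work is already queued, so any number of
	 * concurrent ENOSPC events share a single flush pass */
	queue_work(foo_syncd_wq, &m->flush_work);

	/* block until that pass completes before retrying the allocation */
	flush_work_sync(&m->flush_work);
}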
607 | int | 564 | int |
608 | xfs_syncd_init( | 565 | xfs_syncd_init( |
609 | struct xfs_mount *mp) | 566 | struct xfs_mount *mp) |
610 | { | 567 | { |
611 | mp->m_sync_work.w_syncer = xfs_sync_worker; | 568 | INIT_WORK(&mp->m_flush_work, xfs_flush_worker); |
612 | mp->m_sync_work.w_mount = mp; | 569 | INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); |
613 | mp->m_sync_work.w_completion = NULL; | 570 | INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); |
614 | mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); | 571 | |
615 | if (IS_ERR(mp->m_sync_task)) | 572 | xfs_syncd_queue_sync(mp); |
616 | return -PTR_ERR(mp->m_sync_task); | 573 | xfs_syncd_queue_reclaim(mp); |
574 | |||
617 | return 0; | 575 | return 0; |
618 | } | 576 | } |
619 | 577 | ||
@@ -621,7 +579,9 @@ void | |||
621 | xfs_syncd_stop( | 579 | xfs_syncd_stop( |
622 | struct xfs_mount *mp) | 580 | struct xfs_mount *mp) |
623 | { | 581 | { |
624 | kthread_stop(mp->m_sync_task); | 582 | cancel_delayed_work_sync(&mp->m_sync_work); |
583 | cancel_delayed_work_sync(&mp->m_reclaim_work); | ||
584 | cancel_work_sync(&mp->m_flush_work); | ||
625 | } | 585 | } |
626 | 586 | ||
627 | void | 587 | void |
@@ -640,6 +600,10 @@ __xfs_inode_set_reclaim_tag( | |||
640 | XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), | 600 | XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), |
641 | XFS_ICI_RECLAIM_TAG); | 601 | XFS_ICI_RECLAIM_TAG); |
642 | spin_unlock(&ip->i_mount->m_perag_lock); | 602 | spin_unlock(&ip->i_mount->m_perag_lock); |
603 | |||
604 | /* schedule periodic background inode reclaim */ | ||
605 | xfs_syncd_queue_reclaim(ip->i_mount); | ||
606 | |||
643 | trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, | 607 | trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, |
644 | -1, _RET_IP_); | 608 | -1, _RET_IP_); |
645 | } | 609 | } |
@@ -659,12 +623,12 @@ xfs_inode_set_reclaim_tag( | |||
659 | struct xfs_perag *pag; | 623 | struct xfs_perag *pag; |
660 | 624 | ||
661 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); | 625 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); |
662 | write_lock(&pag->pag_ici_lock); | 626 | spin_lock(&pag->pag_ici_lock); |
663 | spin_lock(&ip->i_flags_lock); | 627 | spin_lock(&ip->i_flags_lock); |
664 | __xfs_inode_set_reclaim_tag(pag, ip); | 628 | __xfs_inode_set_reclaim_tag(pag, ip); |
665 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); | 629 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); |
666 | spin_unlock(&ip->i_flags_lock); | 630 | spin_unlock(&ip->i_flags_lock); |
667 | write_unlock(&pag->pag_ici_lock); | 631 | spin_unlock(&pag->pag_ici_lock); |
668 | xfs_perag_put(pag); | 632 | xfs_perag_put(pag); |
669 | } | 633 | } |
670 | 634 | ||
@@ -698,6 +662,53 @@ __xfs_inode_clear_reclaim_tag( | |||
698 | } | 662 | } |
699 | 663 | ||
700 | /* | 664 | /* |
665 | * Grab the inode for reclaim exclusively. | ||
666 | * Return 0 if we grabbed it, non-zero otherwise. | ||
667 | */ | ||
668 | STATIC int | ||
669 | xfs_reclaim_inode_grab( | ||
670 | struct xfs_inode *ip, | ||
671 | int flags) | ||
672 | { | ||
673 | ASSERT(rcu_read_lock_held()); | ||
674 | |||
675 | /* quick check for stale RCU freed inode */ | ||
676 | if (!ip->i_ino) | ||
677 | return 1; | ||
678 | |||
679 | /* | ||
680 | * do some unlocked checks first to avoid unnecessary lock traffic. | ||
681 | * The first is a flush lock check, the second is an already-in-reclaim ||
682 | * check. Only do these checks if we are not going to block on locks. | ||
683 | */ | ||
684 | if ((flags & SYNC_TRYLOCK) && | ||
685 | (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { | ||
686 | return 1; | ||
687 | } | ||
688 | |||
689 | /* | ||
690 | * The radix tree lock here protects a thread in xfs_iget from racing | ||
691 | * with us starting reclaim on the inode. Once we have the | ||
692 | * XFS_IRECLAIM flag set it will not touch us. | ||
693 | * | ||
694 | * Due to RCU lookup, we may find inodes that have been freed and only | ||
695 | * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that | ||
696 | * aren't candidates for reclaim at all, so we must check the | ||
697 | * XFS_IRECLAIMABLE is set first before proceeding to reclaim. | ||
698 | */ | ||
699 | spin_lock(&ip->i_flags_lock); | ||
700 | if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || | ||
701 | __xfs_iflags_test(ip, XFS_IRECLAIM)) { | ||
702 | /* not a reclaim candidate. */ | ||
703 | spin_unlock(&ip->i_flags_lock); | ||
704 | return 1; | ||
705 | } | ||
706 | __xfs_iflags_set(ip, XFS_IRECLAIM); | ||
707 | spin_unlock(&ip->i_flags_lock); | ||
708 | return 0; | ||
709 | } | ||
710 | |||
711 | /* | ||
701 | * Inodes in different states need to be treated differently, and the return | 712 | * Inodes in different states need to be treated differently, and the return |
702 | * value of xfs_iflush is not sufficient to get this right. The following table | 713 | * value of xfs_iflush is not sufficient to get this right. The following table |
703 | * lists the inode states and the reclaim actions necessary for non-blocking | 714 | * lists the inode states and the reclaim actions necessary for non-blocking |
@@ -753,25 +764,10 @@ xfs_reclaim_inode( | |||
753 | struct xfs_perag *pag, | 764 | struct xfs_perag *pag, |
754 | int sync_mode) | 765 | int sync_mode) |
755 | { | 766 | { |
756 | int error = 0; | 767 | int error; |
757 | |||
758 | /* | ||
759 | * The radix tree lock here protects a thread in xfs_iget from racing | ||
760 | * with us starting reclaim on the inode. Once we have the | ||
761 | * XFS_IRECLAIM flag set it will not touch us. | ||
762 | */ | ||
763 | spin_lock(&ip->i_flags_lock); | ||
764 | ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); | ||
765 | if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { | ||
766 | /* ignore as it is already under reclaim */ | ||
767 | spin_unlock(&ip->i_flags_lock); | ||
768 | write_unlock(&pag->pag_ici_lock); | ||
769 | return 0; | ||
770 | } | ||
771 | __xfs_iflags_set(ip, XFS_IRECLAIM); | ||
772 | spin_unlock(&ip->i_flags_lock); | ||
773 | write_unlock(&pag->pag_ici_lock); | ||
774 | 768 | ||
769 | restart: | ||
770 | error = 0; | ||
775 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 771 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
776 | if (!xfs_iflock_nowait(ip)) { | 772 | if (!xfs_iflock_nowait(ip)) { |
777 | if (!(sync_mode & SYNC_WAIT)) | 773 | if (!(sync_mode & SYNC_WAIT)) |
@@ -797,9 +793,31 @@ xfs_reclaim_inode( | |||
797 | if (xfs_inode_clean(ip)) | 793 | if (xfs_inode_clean(ip)) |
798 | goto reclaim; | 794 | goto reclaim; |
799 | 795 | ||
800 | /* Now we have an inode that needs flushing */ | 796 | /* |
801 | error = xfs_iflush(ip, sync_mode); | 797 | * Now we have an inode that needs flushing. |
798 | * | ||
799 | * We do a nonblocking flush here even if we are doing a SYNC_WAIT | ||
800 | * reclaim as we can deadlock with inode cluster removal. | ||
801 | * xfs_ifree_cluster() can lock the inode buffer before it locks the | ||
802 | * ip->i_lock, and we are doing the exact opposite here. As a result, | ||
803 | * doing a blocking xfs_itobp() to get the cluster buffer will result | ||
804 | * in an ABBA deadlock with xfs_ifree_cluster(). | ||
805 | * | ||
806 | * As xfs_ifree_cluster() must gather all inodes that are active in the ||
807 | * cache to mark them stale, if we hit this case we don't actually want | ||
808 | * to do IO here - we want the inode marked stale so we can simply | ||
809 | * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, | ||
810 | * just unlock the inode, back off and try again. Hopefully the next | ||
811 | * pass through will see the stale flag set on the inode. | ||
812 | */ | ||
813 | error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); | ||
802 | if (sync_mode & SYNC_WAIT) { | 814 | if (sync_mode & SYNC_WAIT) { |
815 | if (error == EAGAIN) { | ||
816 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
817 | /* backoff longer than in xfs_ifree_cluster */ | ||
818 | delay(2); | ||
819 | goto restart; | ||
820 | } | ||
803 | xfs_iflock(ip); | 821 | xfs_iflock(ip); |
804 | goto reclaim; | 822 | goto reclaim; |
805 | } | 823 | } |
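The long comment above describes an ABBA lock inversion that is resolved by retrying rather than blocking. Reduced to its shape (foo_try_flush() is a hypothetical trylock-style flush, not a real kernel API): attempt the flush without blocking on the buffer lock, and in blocking mode back off briefly and restart instead of sleeping with the locks held in the wrong order:

#include <linux/delay.h>

static int foo_reclaim_flush(struct foo_obj *obj, bool wait)
{
	int error;

restart:
	/* trylock variant: never sleeps on the cluster buffer lock */
	error = foo_try_flush(obj);
	if (error == -EAGAIN && wait) {
		msleep(20);	/* back off longer than the lock holder */
		goto restart;
	}
	return error;
}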
@@ -814,7 +832,7 @@ xfs_reclaim_inode( | |||
814 | * pass on the error. | 832 | * pass on the error. |
815 | */ | 833 | */ |
816 | if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 834 | if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
817 | xfs_fs_cmn_err(CE_WARN, ip->i_mount, | 835 | xfs_warn(ip->i_mount, |
818 | "inode 0x%llx background reclaim flush failed with %d", | 836 | "inode 0x%llx background reclaim flush failed with %d", |
819 | (long long)ip->i_ino, error); | 837 | (long long)ip->i_ino, error); |
820 | } | 838 | } |
@@ -842,12 +860,12 @@ reclaim: | |||
842 | * added to the tree assert that it's been there before to catch | 860 | * added to the tree assert that it's been there before to catch |
843 | * problems with the inode life time early on. | 861 | * problems with the inode life time early on. |
844 | */ | 862 | */ |
845 | write_lock(&pag->pag_ici_lock); | 863 | spin_lock(&pag->pag_ici_lock); |
846 | if (!radix_tree_delete(&pag->pag_ici_root, | 864 | if (!radix_tree_delete(&pag->pag_ici_root, |
847 | XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) | 865 | XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) |
848 | ASSERT(0); | 866 | ASSERT(0); |
849 | __xfs_inode_clear_reclaim(pag, ip); | 867 | __xfs_inode_clear_reclaim(pag, ip); |
850 | write_unlock(&pag->pag_ici_lock); | 868 | spin_unlock(&pag->pag_ici_lock); |
851 | 869 | ||
852 | /* | 870 | /* |
853 | * Here we do an (almost) spurious inode lock in order to coordinate | 871 | * Here we do an (almost) spurious inode lock in order to coordinate |
@@ -868,45 +886,181 @@ reclaim: | |||
868 | 886 | ||
869 | } | 887 | } |
870 | 888 | ||
889 | /* | ||
890 | * Walk the AGs and reclaim the inodes in them. Even if the filesystem is | ||
891 | * corrupted, we still want to try to reclaim all the inodes. If we don't, | ||
892 | * then a shutdown during the filesystem unmount reclaim walk will leak all the ||
893 | * unreclaimed inodes. | ||
894 | */ | ||
895 | int | ||
896 | xfs_reclaim_inodes_ag( | ||
897 | struct xfs_mount *mp, | ||
898 | int flags, | ||
899 | int *nr_to_scan) | ||
900 | { | ||
901 | struct xfs_perag *pag; | ||
902 | int error = 0; | ||
903 | int last_error = 0; | ||
904 | xfs_agnumber_t ag; | ||
905 | int trylock = flags & SYNC_TRYLOCK; | ||
906 | int skipped; | ||
907 | |||
908 | restart: | ||
909 | ag = 0; | ||
910 | skipped = 0; | ||
911 | while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { | ||
912 | unsigned long first_index = 0; | ||
913 | int done = 0; | ||
914 | int nr_found = 0; | ||
915 | |||
916 | ag = pag->pag_agno + 1; | ||
917 | |||
918 | if (trylock) { | ||
919 | if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { | ||
920 | skipped++; | ||
921 | xfs_perag_put(pag); | ||
922 | continue; | ||
923 | } | ||
924 | first_index = pag->pag_ici_reclaim_cursor; | ||
925 | } else | ||
926 | mutex_lock(&pag->pag_ici_reclaim_lock); | ||
927 | |||
928 | do { | ||
929 | struct xfs_inode *batch[XFS_LOOKUP_BATCH]; | ||
930 | int i; | ||
931 | |||
932 | rcu_read_lock(); | ||
933 | nr_found = radix_tree_gang_lookup_tag( | ||
934 | &pag->pag_ici_root, | ||
935 | (void **)batch, first_index, | ||
936 | XFS_LOOKUP_BATCH, | ||
937 | XFS_ICI_RECLAIM_TAG); | ||
938 | if (!nr_found) { | ||
939 | done = 1; | ||
940 | rcu_read_unlock(); | ||
941 | break; | ||
942 | } | ||
943 | |||
944 | /* | ||
945 | * Grab the inodes before we drop the lock. If we found ||
946 | * nothing, nr == 0 and the loop will be skipped. | ||
947 | */ | ||
948 | for (i = 0; i < nr_found; i++) { | ||
949 | struct xfs_inode *ip = batch[i]; | ||
950 | |||
951 | if (done || xfs_reclaim_inode_grab(ip, flags)) | ||
952 | batch[i] = NULL; | ||
953 | |||
954 | /* | ||
955 | * Update the index for the next lookup. Catch | ||
956 | * overflows into the next AG range which can | ||
957 | * occur if we have inodes in the last block of | ||
958 | * the AG and we are currently pointing to the | ||
959 | * last inode. | ||
960 | * | ||
961 | * Because we may see inodes that are from the | ||
962 | * wrong AG due to RCU freeing and | ||
963 | * reallocation, only update the index if it | ||
964 | * lies in this AG. It was a race that lead us | ||
965 | * to see this inode, so another lookup from | ||
966 | * the same index will not find it again. | ||
967 | */ | ||
968 | if (XFS_INO_TO_AGNO(mp, ip->i_ino) != | ||
969 | pag->pag_agno) | ||
970 | continue; | ||
971 | first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); | ||
972 | if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) | ||
973 | done = 1; | ||
974 | } | ||
975 | |||
976 | /* unlock now we've grabbed the inodes. */ | ||
977 | rcu_read_unlock(); | ||
978 | |||
979 | for (i = 0; i < nr_found; i++) { | ||
980 | if (!batch[i]) | ||
981 | continue; | ||
982 | error = xfs_reclaim_inode(batch[i], pag, flags); | ||
983 | if (error && last_error != EFSCORRUPTED) | ||
984 | last_error = error; | ||
985 | } | ||
986 | |||
987 | *nr_to_scan -= XFS_LOOKUP_BATCH; | ||
988 | |||
989 | } while (nr_found && !done && *nr_to_scan > 0); | ||
990 | |||
991 | if (trylock && !done) | ||
992 | pag->pag_ici_reclaim_cursor = first_index; | ||
993 | else | ||
994 | pag->pag_ici_reclaim_cursor = 0; | ||
995 | mutex_unlock(&pag->pag_ici_reclaim_lock); | ||
996 | xfs_perag_put(pag); | ||
997 | } | ||
998 | |||
999 | /* | ||
1000 | * if we skipped any AG, and we still have scan count remaining, do | ||
1001 | * another pass this time using blocking reclaim semantics (i.e. ||
1002 | * waiting on the reclaim locks and ignoring the reclaim cursors). This ||
1003 | * ensures that when we get more reclaimers than AGs we block rather ||
1004 | * than spin trying to execute reclaim. | ||
1005 | */ | ||
1006 | if (trylock && skipped && *nr_to_scan > 0) { | ||
1007 | trylock = 0; | ||
1008 | goto restart; | ||
1009 | } | ||
1010 | return XFS_ERROR(last_error); | ||
1011 | } | ||
1012 | |||
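The cursor update in the loop above guards against wrapping past the end of an AG: after recording agino(ip->i_ino) + 1 as the next lookup index, a wrap to a smaller value means the walk has consumed the AG's inode range. A minimal user-space sketch of that arithmetic (the AGINO_BITS width and the masking macro are illustrative stand-ins, not XFS's real geometry):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for XFS_INO_TO_AGINO(): keep the low
     * AG-relative bits of an inode number, dropping the AG number
     * held in the high bits. (20 bits is an arbitrary choice.) */
    #define AGINO_BITS 20
    #define INO_TO_AGINO(ino) ((uint32_t)((ino) & ((1u << AGINO_BITS) - 1)))

    int main(void)
    {
        uint64_t last_ino = (1u << AGINO_BITS) - 1; /* last inode of the AG */
        uint32_t first_index = INO_TO_AGINO(last_ino + 1);
        int done = 0;

        /* Mirrors the loop body above: if the next lookup index wrapped
         * to a smaller agino, we have walked off the end of this AG. */
        if (first_index < INO_TO_AGINO(last_ino))
            done = 1;

        printf("first_index=%u done=%d\n", first_index, done);
        return 0;
    }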
871 | int | 1013 | int |
872 | xfs_reclaim_inodes( | 1014 | xfs_reclaim_inodes( |
873 | xfs_mount_t *mp, | 1015 | xfs_mount_t *mp, |
874 | int mode) | 1016 | int mode) |
875 | { | 1017 | { |
876 | return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, | 1018 | int nr_to_scan = INT_MAX; |
877 | XFS_ICI_RECLAIM_TAG, 1, NULL); | 1019 | |
1020 | return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); | ||
878 | } | 1021 | } |
879 | 1022 | ||
880 | /* | 1023 | /* |
881 | * Shrinker infrastructure. | 1024 | * Inode cache shrinker. |
1025 | * | ||
1026 | * When called, we make sure that there is a background (fast) inode reclaim in | ||
1027 | * progress, while we throttle the speed of reclaim by doing synchronous | ||
1028 | * reclaim of inodes. That means if we come across dirty inodes, we wait for | ||
1029 | * them to be cleaned, which we hope will not be very long due to the | ||
1030 | * background walker having already kicked the IO off on those dirty inodes. | ||
882 | */ | 1031 | */ |
883 | static int | 1032 | static int |
884 | xfs_reclaim_inode_shrink( | 1033 | xfs_reclaim_inode_shrink( |
885 | struct shrinker *shrink, | 1034 | struct shrinker *shrink, |
886 | int nr_to_scan, | 1035 | struct shrink_control *sc) |
887 | gfp_t gfp_mask) | ||
888 | { | 1036 | { |
889 | struct xfs_mount *mp; | 1037 | struct xfs_mount *mp; |
890 | struct xfs_perag *pag; | 1038 | struct xfs_perag *pag; |
891 | xfs_agnumber_t ag; | 1039 | xfs_agnumber_t ag; |
892 | int reclaimable; | 1040 | int reclaimable; |
1041 | int nr_to_scan = sc->nr_to_scan; | ||
1042 | gfp_t gfp_mask = sc->gfp_mask; | ||
893 | 1043 | ||
894 | mp = container_of(shrink, struct xfs_mount, m_inode_shrink); | 1044 | mp = container_of(shrink, struct xfs_mount, m_inode_shrink); |
895 | if (nr_to_scan) { | 1045 | if (nr_to_scan) { |
1046 | /* kick background reclaimer and push the AIL */ | ||
1047 | xfs_syncd_queue_reclaim(mp); | ||
1048 | xfs_ail_push_all(mp->m_ail); | ||
1049 | |||
896 | if (!(gfp_mask & __GFP_FS)) | 1050 | if (!(gfp_mask & __GFP_FS)) |
897 | return -1; | 1051 | return -1; |
898 | 1052 | ||
899 | xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, | 1053 | xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, |
900 | XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); | 1054 | &nr_to_scan); |
901 | /* if we don't exhaust the scan, don't bother coming back */ | 1055 | /* terminate if we don't exhaust the scan */ |
902 | if (nr_to_scan > 0) | 1056 | if (nr_to_scan > 0) |
903 | return -1; | 1057 | return -1; |
904 | } | 1058 | } |
905 | 1059 | ||
906 | reclaimable = 0; | 1060 | reclaimable = 0; |
907 | ag = 0; | 1061 | ag = 0; |
908 | while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, | 1062 | while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { |
909 | XFS_ICI_RECLAIM_TAG))) { | 1063 | ag = pag->pag_agno + 1; |
910 | reclaimable += pag->pag_ici_reclaimable; | 1064 | reclaimable += pag->pag_ici_reclaimable; |
911 | xfs_perag_put(pag); | 1065 | xfs_perag_put(pag); |
912 | } | 1066 | } |
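The shrinker hunk above also tracks a kernel API change: the callback now receives a single struct shrink_control carrying nr_to_scan and gfp_mask instead of separate arguments. A compilable user-space mock of that contract (the struct layouts and the __GFP_FS bit here are simplified stand-ins, not the kernel's definitions):

    #include <stdio.h>

    typedef unsigned int gfp_t;
    #define __GFP_FS 0x80u                  /* illustrative bit only */

    struct shrink_control {
        gfp_t         gfp_mask;
        unsigned long nr_to_scan;
    };

    struct shrinker {
        int (*shrink)(struct shrinker *, struct shrink_control *);
        int seeks;
    };

    /* Mirrors xfs_reclaim_inode_shrink()'s contract: refuse to do
     * filesystem work when the caller cannot re-enter the fs. */
    static int demo_shrink(struct shrinker *s, struct shrink_control *sc)
    {
        if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
            return -1;
        return 42;                          /* pretend reclaimable count */
    }

    int main(void)
    {
        struct shrinker s = { .shrink = demo_shrink, .seeks = 2 };
        struct shrink_control sc = { .gfp_mask = 0, .nr_to_scan = 10 };

        printf("shrink() -> %d\n", s.shrink(&s, &sc));
        return 0;
    }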
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index fe78726196f8..e3a6ad27415f 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h | |||
@@ -32,6 +32,8 @@ typedef struct xfs_sync_work { | |||
32 | #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ | 32 | #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ |
33 | #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ | 33 | #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ |
34 | 34 | ||
35 | extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ | ||
36 | |||
35 | int xfs_syncd_init(struct xfs_mount *mp); | 37 | int xfs_syncd_init(struct xfs_mount *mp); |
36 | void xfs_syncd_stop(struct xfs_mount *mp); | 38 | void xfs_syncd_stop(struct xfs_mount *mp); |
37 | 39 | ||
@@ -47,10 +49,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); | |||
47 | void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, | 49 | void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, |
48 | struct xfs_inode *ip); | 50 | struct xfs_inode *ip); |
49 | 51 | ||
50 | int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); | 52 | int xfs_sync_inode_grab(struct xfs_inode *ip); |
51 | int xfs_inode_ag_iterator(struct xfs_mount *mp, | 53 | int xfs_inode_ag_iterator(struct xfs_mount *mp, |
52 | int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), | 54 | int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), |
53 | int flags, int tag, int write_lock, int *nr_to_scan); | 55 | int flags); |
54 | 56 | ||
55 | void xfs_inode_shrinker_register(struct xfs_mount *mp); | 57 | void xfs_inode_shrinker_register(struct xfs_mount *mp); |
56 | void xfs_inode_shrinker_unregister(struct xfs_mount *mp); | 58 | void xfs_inode_shrinker_unregister(struct xfs_mount *mp); |
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 7bb5092d6ae4..ee2d2adaa438 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include "xfs.h" | 18 | #include "xfs.h" |
19 | #include <linux/sysctl.h> | 19 | #include <linux/sysctl.h> |
20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
21 | #include "xfs_error.h" | ||
21 | 22 | ||
22 | static struct ctl_table_header *xfs_table_header; | 23 | static struct ctl_table_header *xfs_table_header; |
23 | 24 | ||
@@ -36,7 +37,7 @@ xfs_stats_clear_proc_handler( | |||
36 | ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); | 37 | ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); |
37 | 38 | ||
38 | if (!ret && write && *valp) { | 39 | if (!ret && write && *valp) { |
39 | printk("XFS Clearing xfsstats\n"); | 40 | xfs_notice(NULL, "Clearing xfsstats"); |
40 | for_each_possible_cpu(c) { | 41 | for_each_possible_cpu(c) { |
41 | preempt_disable(); | 42 | preempt_disable(); |
42 | /* save vn_active, it's a universal truth! */ | 43 | /* save vn_active, it's a universal truth! */ |
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler( | |||
51 | 52 | ||
52 | return ret; | 53 | return ret; |
53 | } | 54 | } |
55 | |||
56 | STATIC int | ||
57 | xfs_panic_mask_proc_handler( | ||
58 | ctl_table *ctl, | ||
59 | int write, | ||
60 | void __user *buffer, | ||
61 | size_t *lenp, | ||
62 | loff_t *ppos) | ||
63 | { | ||
64 | int ret, *valp = ctl->data; | ||
65 | |||
66 | ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); | ||
67 | if (!ret && write) { | ||
68 | xfs_panic_mask = *valp; | ||
69 | #ifdef DEBUG | ||
70 | xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); | ||
71 | #endif | ||
72 | } | ||
73 | return ret; | ||
74 | } | ||
54 | #endif /* CONFIG_PROC_FS */ | 75 | #endif /* CONFIG_PROC_FS */ |
55 | 76 | ||
56 | static ctl_table xfs_table[] = { | 77 | static ctl_table xfs_table[] = { |
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = { | |||
77 | .data = &xfs_params.panic_mask.val, | 98 | .data = &xfs_params.panic_mask.val, |
78 | .maxlen = sizeof(int), | 99 | .maxlen = sizeof(int), |
79 | .mode = 0644, | 100 | .mode = 0644, |
80 | .proc_handler = proc_dointvec_minmax, | 101 | .proc_handler = xfs_panic_mask_proc_handler, |
81 | .extra1 = &xfs_params.panic_mask.min, | 102 | .extra1 = &xfs_params.panic_mask.min, |
82 | .extra2 = &xfs_params.panic_mask.max | 103 | .extra2 = &xfs_params.panic_mask.max |
83 | }, | 104 | }, |
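The new xfs_panic_mask_proc_handler shows a common sysctl pattern: let proc_dointvec_minmax() do the range-checked store, then post-process the accepted value (here, forcing extra panic tags on DEBUG builds). A user-space analog of that wrap-then-adjust flow, with made-up tag values:

    #include <stdio.h>

    #define PTAG_SHUTDOWN_CORRUPT 0x01      /* illustrative values only */
    #define PTAG_LOGRES           0x02

    static int panic_mask;

    /* Stand-in for proc_dointvec_minmax(): range-check and store. */
    static int store_int_minmax(int *slot, int val, int min, int max)
    {
        if (val < min || val > max)
            return -1;
        *slot = val;
        return 0;
    }

    static int panic_mask_handler(int val)
    {
        int stored = 0;
        int ret = store_int_minmax(&stored, val, 0, 0xff);

        if (!ret) {
            panic_mask = stored;
    #ifdef DEBUG
            /* DEBUG builds always panic on these classes of error. */
            panic_mask |= PTAG_SHUTDOWN_CORRUPT | PTAG_LOGRES;
    #endif
        }
        return ret;
    }

    int main(void)
    {
        panic_mask_handler(0x10);
        printf("panic_mask=0x%x\n", panic_mask);
        return 0;
    }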
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index be5dffd282a1..d48b7a579ae1 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h | |||
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \ | |||
124 | unsigned long caller_ip), \ | 124 | unsigned long caller_ip), \ |
125 | TP_ARGS(mp, agno, refcount, caller_ip)) | 125 | TP_ARGS(mp, agno, refcount, caller_ip)) |
126 | DEFINE_PERAG_REF_EVENT(xfs_perag_get); | 126 | DEFINE_PERAG_REF_EVENT(xfs_perag_get); |
127 | DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); | 127 | DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); |
128 | DEFINE_PERAG_REF_EVENT(xfs_perag_put); | 128 | DEFINE_PERAG_REF_EVENT(xfs_perag_put); |
129 | DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); | 129 | DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); |
130 | DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); | 130 | DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); |
@@ -325,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock); | |||
325 | DEFINE_BUF_EVENT(xfs_buf_lock_done); | 325 | DEFINE_BUF_EVENT(xfs_buf_lock_done); |
326 | DEFINE_BUF_EVENT(xfs_buf_cond_lock); | 326 | DEFINE_BUF_EVENT(xfs_buf_cond_lock); |
327 | DEFINE_BUF_EVENT(xfs_buf_unlock); | 327 | DEFINE_BUF_EVENT(xfs_buf_unlock); |
328 | DEFINE_BUF_EVENT(xfs_buf_ordered_retry); | ||
329 | DEFINE_BUF_EVENT(xfs_buf_iowait); | 328 | DEFINE_BUF_EVENT(xfs_buf_iowait); |
330 | DEFINE_BUF_EVENT(xfs_buf_iowait_done); | 329 | DEFINE_BUF_EVENT(xfs_buf_iowait_done); |
331 | DEFINE_BUF_EVENT(xfs_buf_delwri_queue); | 330 | DEFINE_BUF_EVENT(xfs_buf_delwri_queue); |
332 | DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); | 331 | DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); |
333 | DEFINE_BUF_EVENT(xfs_buf_delwri_split); | 332 | DEFINE_BUF_EVENT(xfs_buf_delwri_split); |
334 | DEFINE_BUF_EVENT(xfs_buf_get_noaddr); | 333 | DEFINE_BUF_EVENT(xfs_buf_get_uncached); |
335 | DEFINE_BUF_EVENT(xfs_bdstrat_shut); | 334 | DEFINE_BUF_EVENT(xfs_bdstrat_shut); |
336 | DEFINE_BUF_EVENT(xfs_buf_item_relse); | 335 | DEFINE_BUF_EVENT(xfs_buf_item_relse); |
337 | DEFINE_BUF_EVENT(xfs_buf_item_iodone); | 336 | DEFINE_BUF_EVENT(xfs_buf_item_iodone); |
@@ -767,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, | |||
767 | __field(int, curr_res) | 766 | __field(int, curr_res) |
768 | __field(int, unit_res) | 767 | __field(int, unit_res) |
769 | __field(unsigned int, flags) | 768 | __field(unsigned int, flags) |
770 | __field(void *, reserve_headq) | 769 | __field(int, reserveq) |
771 | __field(void *, write_headq) | 770 | __field(int, writeq) |
772 | __field(int, grant_reserve_cycle) | 771 | __field(int, grant_reserve_cycle) |
773 | __field(int, grant_reserve_bytes) | 772 | __field(int, grant_reserve_bytes) |
774 | __field(int, grant_write_cycle) | 773 | __field(int, grant_write_cycle) |
@@ -785,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, | |||
785 | __entry->curr_res = tic->t_curr_res; | 784 | __entry->curr_res = tic->t_curr_res; |
786 | __entry->unit_res = tic->t_unit_res; | 785 | __entry->unit_res = tic->t_unit_res; |
787 | __entry->flags = tic->t_flags; | 786 | __entry->flags = tic->t_flags; |
788 | __entry->reserve_headq = log->l_reserve_headq; | 787 | __entry->reserveq = list_empty(&log->l_reserveq); |
789 | __entry->write_headq = log->l_write_headq; | 788 | __entry->writeq = list_empty(&log->l_writeq); |
790 | __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; | 789 | xlog_crack_grant_head(&log->l_grant_reserve_head, |
791 | __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; | 790 | &__entry->grant_reserve_cycle, |
792 | __entry->grant_write_cycle = log->l_grant_write_cycle; | 791 | &__entry->grant_reserve_bytes); |
793 | __entry->grant_write_bytes = log->l_grant_write_bytes; | 792 | xlog_crack_grant_head(&log->l_grant_write_head, |
793 | &__entry->grant_write_cycle, | ||
794 | &__entry->grant_write_bytes); | ||
794 | __entry->curr_cycle = log->l_curr_cycle; | 795 | __entry->curr_cycle = log->l_curr_cycle; |
795 | __entry->curr_block = log->l_curr_block; | 796 | __entry->curr_block = log->l_curr_block; |
796 | __entry->tail_lsn = log->l_tail_lsn; | 797 | __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); |
797 | ), | 798 | ), |
798 | TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " | 799 | TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " |
799 | "t_unit_res %u t_flags %s reserve_headq 0x%p " | 800 | "t_unit_res %u t_flags %s reserveq %s " |
800 | "write_headq 0x%p grant_reserve_cycle %d " | 801 | "writeq %s grant_reserve_cycle %d " |
801 | "grant_reserve_bytes %d grant_write_cycle %d " | 802 | "grant_reserve_bytes %d grant_write_cycle %d " |
802 | "grant_write_bytes %d curr_cycle %d curr_block %d " | 803 | "grant_write_bytes %d curr_cycle %d curr_block %d " |
803 | "tail_cycle %d tail_block %d", | 804 | "tail_cycle %d tail_block %d", |
@@ -808,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, | |||
808 | __entry->curr_res, | 809 | __entry->curr_res, |
809 | __entry->unit_res, | 810 | __entry->unit_res, |
810 | __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), | 811 | __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), |
811 | __entry->reserve_headq, | 812 | __entry->reserveq ? "empty" : "active", |
812 | __entry->write_headq, | 813 | __entry->writeq ? "empty" : "active", |
813 | __entry->grant_reserve_cycle, | 814 | __entry->grant_reserve_cycle, |
814 | __entry->grant_reserve_bytes, | 815 | __entry->grant_reserve_bytes, |
815 | __entry->grant_write_cycle, | 816 | __entry->grant_write_cycle, |
@@ -836,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); | |||
836 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); | 837 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); |
837 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); | 838 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); |
838 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); | 839 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); |
840 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); | ||
839 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); | 841 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); |
840 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); | 842 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); |
841 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); | 843 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); |
@@ -843,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); | |||
843 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); | 845 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); |
844 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); | 846 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); |
845 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); | 847 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); |
848 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); | ||
846 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); | 849 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); |
847 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); | 850 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); |
848 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); | 851 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); |
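These trace hunks follow the log-grant rework in this merge: the separate (cycle, bytes) grant head fields become single atomic64_t values that xlog_crack_grant_head() splits back apart for the tracepoint. A plausible sketch of that pack/unpack, assuming the cycle lives in the high 32 bits and the byte count in the low 32:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed encoding: cycle in bits 63..32, space in bits 31..0. */
    static int64_t grant_head_combine(int cycle, int space)
    {
        return ((int64_t)cycle << 32) | (uint32_t)space;
    }

    static void grant_head_crack(int64_t head, int *cycle, int *space)
    {
        *cycle = (int)(head >> 32);
        *space = (int)(head & 0xffffffff);
    }

    int main(void)
    {
        int cycle, space;
        int64_t head = grant_head_combine(7, 123456);

        grant_head_crack(head, &cycle, &space);
        printf("cycle=%d space=%d\n", cycle, space);
        return 0;
    }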
@@ -936,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage); | |||
936 | DEFINE_PAGE_EVENT(xfs_releasepage); | 939 | DEFINE_PAGE_EVENT(xfs_releasepage); |
937 | DEFINE_PAGE_EVENT(xfs_invalidatepage); | 940 | DEFINE_PAGE_EVENT(xfs_invalidatepage); |
938 | 941 | ||
939 | DECLARE_EVENT_CLASS(xfs_iomap_class, | 942 | DECLARE_EVENT_CLASS(xfs_imap_class, |
940 | TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, | 943 | TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, |
941 | int flags, struct xfs_bmbt_irec *irec), | 944 | int type, struct xfs_bmbt_irec *irec), |
942 | TP_ARGS(ip, offset, count, flags, irec), | 945 | TP_ARGS(ip, offset, count, type, irec), |
943 | TP_STRUCT__entry( | 946 | TP_STRUCT__entry( |
944 | __field(dev_t, dev) | 947 | __field(dev_t, dev) |
945 | __field(xfs_ino_t, ino) | 948 | __field(xfs_ino_t, ino) |
@@ -947,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, | |||
947 | __field(loff_t, new_size) | 950 | __field(loff_t, new_size) |
948 | __field(loff_t, offset) | 951 | __field(loff_t, offset) |
949 | __field(size_t, count) | 952 | __field(size_t, count) |
950 | __field(int, flags) | 953 | __field(int, type) |
951 | __field(xfs_fileoff_t, startoff) | 954 | __field(xfs_fileoff_t, startoff) |
952 | __field(xfs_fsblock_t, startblock) | 955 | __field(xfs_fsblock_t, startblock) |
953 | __field(xfs_filblks_t, blockcount) | 956 | __field(xfs_filblks_t, blockcount) |
@@ -959,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, | |||
959 | __entry->new_size = ip->i_new_size; | 962 | __entry->new_size = ip->i_new_size; |
960 | __entry->offset = offset; | 963 | __entry->offset = offset; |
961 | __entry->count = count; | 964 | __entry->count = count; |
962 | __entry->flags = flags; | 965 | __entry->type = type; |
963 | __entry->startoff = irec ? irec->br_startoff : 0; | 966 | __entry->startoff = irec ? irec->br_startoff : 0; |
964 | __entry->startblock = irec ? irec->br_startblock : 0; | 967 | __entry->startblock = irec ? irec->br_startblock : 0; |
965 | __entry->blockcount = irec ? irec->br_blockcount : 0; | 968 | __entry->blockcount = irec ? irec->br_blockcount : 0; |
966 | ), | 969 | ), |
967 | TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " | 970 | TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " |
968 | "offset 0x%llx count %zd flags %s " | 971 | "offset 0x%llx count %zd type %s " |
969 | "startoff 0x%llx startblock %lld blockcount 0x%llx", | 972 | "startoff 0x%llx startblock %lld blockcount 0x%llx", |
970 | MAJOR(__entry->dev), MINOR(__entry->dev), | 973 | MAJOR(__entry->dev), MINOR(__entry->dev), |
971 | __entry->ino, | 974 | __entry->ino, |
@@ -973,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, | |||
973 | __entry->new_size, | 976 | __entry->new_size, |
974 | __entry->offset, | 977 | __entry->offset, |
975 | __entry->count, | 978 | __entry->count, |
976 | __print_flags(__entry->flags, "|", BMAPI_FLAGS), | 979 | __print_symbolic(__entry->type, XFS_IO_TYPES), |
977 | __entry->startoff, | 980 | __entry->startoff, |
978 | (__int64_t)__entry->startblock, | 981 | (__int64_t)__entry->startblock, |
979 | __entry->blockcount) | 982 | __entry->blockcount) |
980 | ) | 983 | ) |
981 | 984 | ||
982 | #define DEFINE_IOMAP_EVENT(name) \ | 985 | #define DEFINE_IOMAP_EVENT(name) \ |
983 | DEFINE_EVENT(xfs_iomap_class, name, \ | 986 | DEFINE_EVENT(xfs_imap_class, name, \ |
984 | TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ | 987 | TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ |
985 | int flags, struct xfs_bmbt_irec *irec), \ | 988 | int type, struct xfs_bmbt_irec *irec), \ |
986 | TP_ARGS(ip, offset, count, flags, irec)) | 989 | TP_ARGS(ip, offset, count, type, irec)) |
987 | DEFINE_IOMAP_EVENT(xfs_iomap_enter); | 990 | DEFINE_IOMAP_EVENT(xfs_map_blocks_found); |
988 | DEFINE_IOMAP_EVENT(xfs_iomap_found); | 991 | DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); |
989 | DEFINE_IOMAP_EVENT(xfs_iomap_alloc); | 992 | DEFINE_IOMAP_EVENT(xfs_get_blocks_found); |
993 | DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); | ||
990 | 994 | ||
991 | DECLARE_EVENT_CLASS(xfs_simple_io_class, | 995 | DECLARE_EVENT_CLASS(xfs_simple_io_class, |
992 | TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), | 996 | TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), |
@@ -1023,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \ | |||
1023 | TP_ARGS(ip, offset, count)) | 1027 | TP_ARGS(ip, offset, count)) |
1024 | DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); | 1028 | DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); |
1025 | DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); | 1029 | DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); |
1030 | DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); | ||
1026 | 1031 | ||
1027 | 1032 | ||
1028 | TRACE_EVENT(xfs_itruncate_start, | 1033 | TRACE_EVENT(xfs_itruncate_start, |
@@ -1146,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap, | |||
1146 | 1151 | ||
1147 | ); | 1152 | ); |
1148 | 1153 | ||
1149 | #define XFS_BUSY_SYNC \ | 1154 | DECLARE_EVENT_CLASS(xfs_busy_class, |
1150 | { 0, "async" }, \ | ||
1151 | { 1, "sync" } | ||
1152 | |||
1153 | TRACE_EVENT(xfs_alloc_busy, | ||
1154 | TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, | ||
1155 | xfs_agblock_t agbno, xfs_extlen_t len, int sync), | ||
1156 | TP_ARGS(trans, agno, agbno, len, sync), | ||
1157 | TP_STRUCT__entry( | ||
1158 | __field(dev_t, dev) | ||
1159 | __field(struct xfs_trans *, tp) | ||
1160 | __field(int, tid) | ||
1161 | __field(xfs_agnumber_t, agno) | ||
1162 | __field(xfs_agblock_t, agbno) | ||
1163 | __field(xfs_extlen_t, len) | ||
1164 | __field(int, sync) | ||
1165 | ), | ||
1166 | TP_fast_assign( | ||
1167 | __entry->dev = trans->t_mountp->m_super->s_dev; | ||
1168 | __entry->tp = trans; | ||
1169 | __entry->tid = trans->t_ticket->t_tid; | ||
1170 | __entry->agno = agno; | ||
1171 | __entry->agbno = agbno; | ||
1172 | __entry->len = len; | ||
1173 | __entry->sync = sync; | ||
1174 | ), | ||
1175 | TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", | ||
1176 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
1177 | __entry->tp, | ||
1178 | __entry->tid, | ||
1179 | __entry->agno, | ||
1180 | __entry->agbno, | ||
1181 | __entry->len, | ||
1182 | __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) | ||
1183 | |||
1184 | ); | ||
1185 | |||
1186 | TRACE_EVENT(xfs_alloc_unbusy, | ||
1187 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, | 1155 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, |
1188 | xfs_agblock_t agbno, xfs_extlen_t len), | 1156 | xfs_agblock_t agbno, xfs_extlen_t len), |
1189 | TP_ARGS(mp, agno, agbno, len), | 1157 | TP_ARGS(mp, agno, agbno, len), |
@@ -1205,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy, | |||
1205 | __entry->agbno, | 1173 | __entry->agbno, |
1206 | __entry->len) | 1174 | __entry->len) |
1207 | ); | 1175 | ); |
1208 | 1176 | #define DEFINE_BUSY_EVENT(name) \ | |
1209 | #define XFS_BUSY_STATES \ | 1177 | DEFINE_EVENT(xfs_busy_class, name, \ |
1210 | { 0, "missing" }, \ | 1178 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ |
1211 | { 1, "found" } | 1179 | xfs_agblock_t agbno, xfs_extlen_t len), \ |
1212 | 1180 | TP_ARGS(mp, agno, agbno, len)) | |
1213 | TRACE_EVENT(xfs_alloc_busysearch, | 1181 | DEFINE_BUSY_EVENT(xfs_alloc_busy); |
1182 | DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); | ||
1183 | DEFINE_BUSY_EVENT(xfs_alloc_busy_force); | ||
1184 | DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); | ||
1185 | DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); | ||
1186 | |||
1187 | TRACE_EVENT(xfs_alloc_busy_trim, | ||
1214 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, | 1188 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, |
1215 | xfs_agblock_t agbno, xfs_extlen_t len, int found), | 1189 | xfs_agblock_t agbno, xfs_extlen_t len, |
1216 | TP_ARGS(mp, agno, agbno, len, found), | 1190 | xfs_agblock_t tbno, xfs_extlen_t tlen), |
1191 | TP_ARGS(mp, agno, agbno, len, tbno, tlen), | ||
1217 | TP_STRUCT__entry( | 1192 | TP_STRUCT__entry( |
1218 | __field(dev_t, dev) | 1193 | __field(dev_t, dev) |
1219 | __field(xfs_agnumber_t, agno) | 1194 | __field(xfs_agnumber_t, agno) |
1220 | __field(xfs_agblock_t, agbno) | 1195 | __field(xfs_agblock_t, agbno) |
1221 | __field(xfs_extlen_t, len) | 1196 | __field(xfs_extlen_t, len) |
1222 | __field(int, found) | 1197 | __field(xfs_agblock_t, tbno) |
1198 | __field(xfs_extlen_t, tlen) | ||
1223 | ), | 1199 | ), |
1224 | TP_fast_assign( | 1200 | TP_fast_assign( |
1225 | __entry->dev = mp->m_super->s_dev; | 1201 | __entry->dev = mp->m_super->s_dev; |
1226 | __entry->agno = agno; | 1202 | __entry->agno = agno; |
1227 | __entry->agbno = agbno; | 1203 | __entry->agbno = agbno; |
1228 | __entry->len = len; | 1204 | __entry->len = len; |
1229 | __entry->found = found; | 1205 | __entry->tbno = tbno; |
1206 | __entry->tlen = tlen; | ||
1230 | ), | 1207 | ), |
1231 | TP_printk("dev %d:%d agno %u agbno %u len %u %s", | 1208 | TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", |
1232 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1209 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1233 | __entry->agno, | 1210 | __entry->agno, |
1234 | __entry->agbno, | 1211 | __entry->agbno, |
1235 | __entry->len, | 1212 | __entry->len, |
1236 | __print_symbolic(__entry->found, XFS_BUSY_STATES)) | 1213 | __entry->tbno, |
1214 | __entry->tlen) | ||
1237 | ); | 1215 | ); |
1238 | 1216 | ||
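The new xfs_alloc_busy_trim event logs the extent both before (agbno, len) and after (tbno, tlen) it has been trimmed against busy ranges. A hypothetical sketch of one trim step, clipping a candidate extent against a single busy interval and keeping the larger surviving side (the real code iterates a busy-extent tree and handles more cases):

    #include <stdio.h>

    typedef unsigned int agblock_t;
    typedef unsigned int extlen_t;

    /* Clip [bno, bno+len) against the busy range [bbno, bbno+blen),
     * keeping the larger leftover side. */
    static void trim_against_busy(agblock_t bno, extlen_t len,
                                  agblock_t bbno, extlen_t blen,
                                  agblock_t *tbno, extlen_t *tlen)
    {
        agblock_t end = bno + len, bend = bbno + blen;
        extlen_t left = bbno > bno ? bbno - bno : 0;    /* before busy */
        extlen_t right = end > bend ? end - bend : 0;   /* after busy */

        if (bbno >= end || bend <= bno) {               /* no overlap */
            *tbno = bno; *tlen = len;
        } else if (left >= right) {
            *tbno = bno; *tlen = left;
        } else {
            *tbno = bend; *tlen = right;
        }
    }

    int main(void)
    {
        agblock_t tbno; extlen_t tlen;

        trim_against_busy(100, 50, 105, 10, &tbno, &tlen);
        printf("tbno=%u tlen=%u\n", tbno, tlen);        /* 115, 35 */
        return 0;
    }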
1239 | TRACE_EVENT(xfs_trans_commit_lsn, | 1217 | TRACE_EVENT(xfs_trans_commit_lsn, |
@@ -1413,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, | |||
1413 | __entry->wasfromfl, | 1391 | __entry->wasfromfl, |
1414 | __entry->isfl, | 1392 | __entry->isfl, |
1415 | __entry->userdata, | 1393 | __entry->userdata, |
1416 | __entry->firstblock) | 1394 | (unsigned long long)__entry->firstblock) |
1417 | ) | 1395 | ) |
1418 | 1396 | ||
1419 | #define DEFINE_ALLOC_EVENT(name) \ | 1397 | #define DEFINE_ALLOC_EVENT(name) \ |
@@ -1421,17 +1399,21 @@ DEFINE_EVENT(xfs_alloc_class, name, \ | |||
1421 | TP_PROTO(struct xfs_alloc_arg *args), \ | 1399 | TP_PROTO(struct xfs_alloc_arg *args), \ |
1422 | TP_ARGS(args)) | 1400 | TP_ARGS(args)) |
1423 | DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); | 1401 | DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); |
1402 | DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); | ||
1424 | DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); | 1403 | DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); |
1425 | DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); | 1404 | DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); |
1426 | DEFINE_ALLOC_EVENT(xfs_alloc_near_first); | 1405 | DEFINE_ALLOC_EVENT(xfs_alloc_near_first); |
1427 | DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); | 1406 | DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); |
1428 | DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); | 1407 | DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); |
1429 | DEFINE_ALLOC_EVENT(xfs_alloc_near_error); | 1408 | DEFINE_ALLOC_EVENT(xfs_alloc_near_error); |
1409 | DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); | ||
1410 | DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); | ||
1430 | DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); | 1411 | DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); |
1431 | DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); | 1412 | DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); |
1432 | DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); | 1413 | DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); |
1433 | DEFINE_ALLOC_EVENT(xfs_alloc_size_done); | 1414 | DEFINE_ALLOC_EVENT(xfs_alloc_size_done); |
1434 | DEFINE_ALLOC_EVENT(xfs_alloc_size_error); | 1415 | DEFINE_ALLOC_EVENT(xfs_alloc_size_error); |
1416 | DEFINE_ALLOC_EVENT(xfs_alloc_size_busy); | ||
1435 | DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); | 1417 | DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); |
1436 | DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); | 1418 | DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); |
1437 | DEFINE_ALLOC_EVENT(xfs_alloc_small_done); | 1419 | DEFINE_ALLOC_EVENT(xfs_alloc_small_done); |
@@ -1753,6 +1735,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); | |||
1753 | DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); | 1735 | DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); |
1754 | DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); | 1736 | DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); |
1755 | 1737 | ||
1738 | DECLARE_EVENT_CLASS(xfs_discard_class, | ||
1739 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, | ||
1740 | xfs_agblock_t agbno, xfs_extlen_t len), | ||
1741 | TP_ARGS(mp, agno, agbno, len), | ||
1742 | TP_STRUCT__entry( | ||
1743 | __field(dev_t, dev) | ||
1744 | __field(xfs_agnumber_t, agno) | ||
1745 | __field(xfs_agblock_t, agbno) | ||
1746 | __field(xfs_extlen_t, len) | ||
1747 | ), | ||
1748 | TP_fast_assign( | ||
1749 | __entry->dev = mp->m_super->s_dev; | ||
1750 | __entry->agno = agno; | ||
1751 | __entry->agbno = agbno; | ||
1752 | __entry->len = len; | ||
1753 | ), | ||
1754 | TP_printk("dev %d:%d agno %u agbno %u len %u", | ||
1755 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
1756 | __entry->agno, | ||
1757 | __entry->agbno, | ||
1758 | __entry->len) | ||
1759 | ) | ||
1760 | |||
1761 | #define DEFINE_DISCARD_EVENT(name) \ | ||
1762 | DEFINE_EVENT(xfs_discard_class, name, \ | ||
1763 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ | ||
1764 | xfs_agblock_t agbno, xfs_extlen_t len), \ | ||
1765 | TP_ARGS(mp, agno, agbno, len)) | ||
1766 | DEFINE_DISCARD_EVENT(xfs_discard_extent); | ||
1767 | DEFINE_DISCARD_EVENT(xfs_discard_toosmall); | ||
1768 | DEFINE_DISCARD_EVENT(xfs_discard_exclude); | ||
1769 | DEFINE_DISCARD_EVENT(xfs_discard_busy); | ||
1770 | |||
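DECLARE_EVENT_CLASS/DEFINE_EVENT keeps one assign/print body shared by several named events; the discard events above reuse a single class with only the event name differing. A loose user-space analog of that sharing, a single formatter plus thin named wrappers (the kernel macros generate far more, including the trace_xfs_discard_extent() call points used elsewhere in the series):

    #include <stdio.h>

    /* Shared "event class" body: one formatter for all events. */
    static void discard_class_print(const char *name, unsigned agno,
                                    unsigned agbno, unsigned len)
    {
        printf("%s: agno %u agbno %u len %u\n", name, agno, agbno, len);
    }

    /* Each DEFINE_EVENT-style wrapper only supplies the event name. */
    #define DEFINE_DISCARD_EVENT(fn)                                    \
        static void fn(unsigned agno, unsigned agbno, unsigned len)     \
        {                                                               \
            discard_class_print(#fn, agno, agbno, len);                 \
        }

    DEFINE_DISCARD_EVENT(trace_discard_extent)
    DEFINE_DISCARD_EVENT(trace_discard_toosmall)

    int main(void)
    {
        trace_discard_extent(3, 1024, 16);
        trace_discard_toosmall(3, 4096, 1);
        return 0;
    }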
1756 | #endif /* _TRACE_XFS_H */ | 1771 | #endif /* _TRACE_XFS_H */ |
1757 | 1772 | ||
1758 | #undef TRACE_INCLUDE_PATH | 1773 | #undef TRACE_INCLUDE_PATH |
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h deleted file mode 100644 index f8d279d7563a..000000000000 --- a/fs/xfs/linux-2.6/xfs_version.h +++ /dev/null | |||
@@ -1,29 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #ifndef __XFS_VERSION_H__ | ||
19 | #define __XFS_VERSION_H__ | ||
20 | |||
21 | /* | ||
22 | * Dummy file that can contain a timestamp to put into the | ||
23 | * XFS init string, to help users keep track of what they're | ||
24 | * running | ||
25 | */ | ||
26 | |||
27 | #define XFS_VERSION_STRING "SGI XFS" | ||
28 | |||
29 | #endif /* __XFS_VERSION_H__ */ | ||
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index e1a2f6800e01..6fa214603819 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c | |||
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy( | |||
149 | ASSERT(list_empty(&dqp->q_freelist)); | 149 | ASSERT(list_empty(&dqp->q_freelist)); |
150 | 150 | ||
151 | mutex_destroy(&dqp->q_qlock); | 151 | mutex_destroy(&dqp->q_qlock); |
152 | sv_destroy(&dqp->q_pinwait); | ||
153 | kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); | 152 | kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); |
154 | 153 | ||
155 | atomic_dec(&xfs_Gqm->qm_totaldquots); | 154 | atomic_dec(&xfs_Gqm->qm_totaldquots); |
@@ -463,87 +462,68 @@ xfs_qm_dqtobp( | |||
463 | uint flags) | 462 | uint flags) |
464 | { | 463 | { |
465 | xfs_bmbt_irec_t map; | 464 | xfs_bmbt_irec_t map; |
466 | int nmaps, error; | 465 | int nmaps = 1, error; |
467 | xfs_buf_t *bp; | 466 | xfs_buf_t *bp; |
468 | xfs_inode_t *quotip; | 467 | xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); |
469 | xfs_mount_t *mp; | 468 | xfs_mount_t *mp = dqp->q_mount; |
470 | xfs_disk_dquot_t *ddq; | 469 | xfs_disk_dquot_t *ddq; |
471 | xfs_dqid_t id; | 470 | xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); |
472 | boolean_t newdquot; | ||
473 | xfs_trans_t *tp = (tpp ? *tpp : NULL); | 471 | xfs_trans_t *tp = (tpp ? *tpp : NULL); |
474 | 472 | ||
475 | mp = dqp->q_mount; | 473 | dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; |
476 | id = be32_to_cpu(dqp->q_core.d_id); | ||
477 | nmaps = 1; | ||
478 | newdquot = B_FALSE; | ||
479 | 474 | ||
480 | /* | 475 | xfs_ilock(quotip, XFS_ILOCK_SHARED); |
481 | * If we don't know where the dquot lives, find out. | 476 | if (XFS_IS_THIS_QUOTA_OFF(dqp)) { |
482 | */ | ||
483 | if (dqp->q_blkno == (xfs_daddr_t) 0) { | ||
484 | /* We use the id as an index */ | ||
485 | dqp->q_fileoffset = (xfs_fileoff_t)id / | ||
486 | mp->m_quotainfo->qi_dqperchunk; | ||
487 | nmaps = 1; | ||
488 | quotip = XFS_DQ_TO_QIP(dqp); | ||
489 | xfs_ilock(quotip, XFS_ILOCK_SHARED); | ||
490 | /* | 477 | /* |
491 | * Return if this type of quotas is turned off while we didn't | 478 | * Return if this type of quotas is turned off while we |
492 | * have an inode lock | 479 | * didn't have the quota inode lock. |
493 | */ | 480 | */ |
494 | if (XFS_IS_THIS_QUOTA_OFF(dqp)) { | 481 | xfs_iunlock(quotip, XFS_ILOCK_SHARED); |
495 | xfs_iunlock(quotip, XFS_ILOCK_SHARED); | 482 | return ESRCH; |
496 | return (ESRCH); | 483 | } |
497 | } | 484 | |
485 | /* | ||
486 | * Find the block map; no allocations yet | ||
487 | */ | ||
488 | error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, | ||
489 | XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, | ||
490 | NULL, 0, &map, &nmaps, NULL); | ||
491 | |||
492 | xfs_iunlock(quotip, XFS_ILOCK_SHARED); | ||
493 | if (error) | ||
494 | return error; | ||
495 | |||
496 | ASSERT(nmaps == 1); | ||
497 | ASSERT(map.br_blockcount == 1); | ||
498 | |||
499 | /* | ||
500 | * Offset of dquot in the (fixed sized) dquot chunk. | ||
501 | */ | ||
502 | dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * | ||
503 | sizeof(xfs_dqblk_t); | ||
504 | |||
505 | ASSERT(map.br_startblock != DELAYSTARTBLOCK); | ||
506 | if (map.br_startblock == HOLESTARTBLOCK) { | ||
498 | /* | 507 | /* |
499 | * Find the block map; no allocations yet | 508 | * We don't allocate unless we're asked to |
500 | */ | 509 | */ |
501 | error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, | 510 | if (!(flags & XFS_QMOPT_DQALLOC)) |
502 | XFS_DQUOT_CLUSTER_SIZE_FSB, | 511 | return ENOENT; |
503 | XFS_BMAPI_METADATA, | ||
504 | NULL, 0, &map, &nmaps, NULL); | ||
505 | 512 | ||
506 | xfs_iunlock(quotip, XFS_ILOCK_SHARED); | 513 | ASSERT(tp); |
514 | error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, | ||
515 | dqp->q_fileoffset, &bp); | ||
507 | if (error) | 516 | if (error) |
508 | return (error); | 517 | return error; |
509 | ASSERT(nmaps == 1); | 518 | tp = *tpp; |
510 | ASSERT(map.br_blockcount == 1); | 519 | } else { |
520 | trace_xfs_dqtobp_read(dqp); | ||
511 | 521 | ||
512 | /* | 522 | /* |
513 | * offset of dquot in the (fixed sized) dquot chunk. | 523 | * store the blkno etc so that we don't have to do the |
524 | * mapping all the time | ||
514 | */ | 525 | */ |
515 | dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * | 526 | dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); |
516 | sizeof(xfs_dqblk_t); | ||
517 | if (map.br_startblock == HOLESTARTBLOCK) { | ||
518 | /* | ||
519 | * We don't allocate unless we're asked to | ||
520 | */ | ||
521 | if (!(flags & XFS_QMOPT_DQALLOC)) | ||
522 | return (ENOENT); | ||
523 | |||
524 | ASSERT(tp); | ||
525 | if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, | ||
526 | dqp->q_fileoffset, &bp))) | ||
527 | return (error); | ||
528 | tp = *tpp; | ||
529 | newdquot = B_TRUE; | ||
530 | } else { | ||
531 | /* | ||
532 | * store the blkno etc so that we don't have to do the | ||
533 | * mapping all the time | ||
534 | */ | ||
535 | dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); | ||
536 | } | ||
537 | } | ||
538 | ASSERT(dqp->q_blkno != DELAYSTARTBLOCK); | ||
539 | ASSERT(dqp->q_blkno != HOLESTARTBLOCK); | ||
540 | |||
541 | /* | ||
542 | * Read in the buffer, unless we've just done the allocation | ||
543 | * (in which case we already have the buf). | ||
544 | */ | ||
545 | if (!newdquot) { | ||
546 | trace_xfs_dqtobp_read(dqp); | ||
547 | 527 | ||
548 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, | 528 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, |
549 | dqp->q_blkno, | 529 | dqp->q_blkno, |
@@ -552,20 +532,22 @@ xfs_qm_dqtobp( | |||
552 | if (error || !bp) | 532 | if (error || !bp) |
553 | return XFS_ERROR(error); | 533 | return XFS_ERROR(error); |
554 | } | 534 | } |
535 | |||
555 | ASSERT(XFS_BUF_ISBUSY(bp)); | 536 | ASSERT(XFS_BUF_ISBUSY(bp)); |
556 | ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); | 537 | ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); |
557 | 538 | ||
558 | /* | 539 | /* |
559 | * calculate the location of the dquot inside the buffer. | 540 | * calculate the location of the dquot inside the buffer. |
560 | */ | 541 | */ |
561 | ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); | 542 | ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); |
562 | 543 | ||
563 | /* | 544 | /* |
564 | * A simple sanity check in case we got a corrupted dquot... | 545 | * A simple sanity check in case we got a corrupted dquot... |
565 | */ | 546 | */ |
566 | if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, | 547 | error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, |
567 | flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), | 548 | flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), |
568 | "dqtobp")) { | 549 | "dqtobp"); |
550 | if (error) { | ||
569 | if (!(flags & XFS_QMOPT_DQREPAIR)) { | 551 | if (!(flags & XFS_QMOPT_DQREPAIR)) { |
570 | xfs_trans_brelse(tp, bp); | 552 | xfs_trans_brelse(tp, bp); |
571 | return XFS_ERROR(EIO); | 553 | return XFS_ERROR(EIO); |
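The simplified xfs_qm_dqtobp() computes where a dquot lives purely from its id: the quota-file offset is id divided by dquots-per-chunk, and the byte offset within the chunk's buffer is (id % dquots-per-chunk) * sizeof(xfs_dqblk_t). A small sketch of that arithmetic, with an assumed 4096-byte chunk and an illustrative dqblk size (the real sizes come from the mount's quotainfo):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Illustrative geometry: 4096-byte chunk, 136-byte dqblk,
         * giving 30 dquots per chunk. */
        const uint32_t dqblk_size = 136;
        const uint32_t dqperchunk = 4096 / dqblk_size;
        uint32_t id = 12345;

        uint64_t fileoffset = id / dqperchunk;               /* chunk index */
        uint32_t bufoffset = (id % dqperchunk) * dqblk_size; /* byte offset */

        printf("id %u -> chunk %llu, offset %u bytes\n",
               id, (unsigned long long)fileoffset, bufoffset);
        return 0;
    }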
@@ -618,7 +600,7 @@ xfs_qm_dqread( | |||
618 | 600 | ||
619 | /* | 601 | /* |
620 | * Reservation counters are defined as reservation plus current usage | 602 | * Reservation counters are defined as reservation plus current usage |
621 | * to avoid having to add everytime. | 603 | * to avoid having to add every time. |
622 | */ | 604 | */ |
623 | dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); | 605 | dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); |
624 | dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); | 606 | dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); |
@@ -846,7 +828,7 @@ xfs_qm_dqget( | |||
846 | if (xfs_do_dqerror) { | 828 | if (xfs_do_dqerror) { |
847 | if ((xfs_dqerror_target == mp->m_ddev_targp) && | 829 | if ((xfs_dqerror_target == mp->m_ddev_targp) && |
848 | (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { | 830 | (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { |
849 | cmn_err(CE_DEBUG, "Returning error in dqget"); | 831 | xfs_debug(mp, "Returning error in dqget"); |
850 | return (EIO); | 832 | return (EIO); |
851 | } | 833 | } |
852 | } | 834 | } |
@@ -1176,18 +1158,18 @@ xfs_qm_dqflush( | |||
1176 | xfs_dquot_t *dqp, | 1158 | xfs_dquot_t *dqp, |
1177 | uint flags) | 1159 | uint flags) |
1178 | { | 1160 | { |
1179 | xfs_mount_t *mp; | 1161 | struct xfs_mount *mp = dqp->q_mount; |
1180 | xfs_buf_t *bp; | 1162 | struct xfs_buf *bp; |
1181 | xfs_disk_dquot_t *ddqp; | 1163 | struct xfs_disk_dquot *ddqp; |
1182 | int error; | 1164 | int error; |
1183 | 1165 | ||
1184 | ASSERT(XFS_DQ_IS_LOCKED(dqp)); | 1166 | ASSERT(XFS_DQ_IS_LOCKED(dqp)); |
1185 | ASSERT(!completion_done(&dqp->q_flush)); | 1167 | ASSERT(!completion_done(&dqp->q_flush)); |
1168 | |||
1186 | trace_xfs_dqflush(dqp); | 1169 | trace_xfs_dqflush(dqp); |
1187 | 1170 | ||
1188 | /* | 1171 | /* |
1189 | * If not dirty, or it's pinned and we are not supposed to | 1172 | * If not dirty, or it's pinned and we are not supposed to block, nada. |
1190 | * block, nada. | ||
1191 | */ | 1173 | */ |
1192 | if (!XFS_DQ_IS_DIRTY(dqp) || | 1174 | if (!XFS_DQ_IS_DIRTY(dqp) || |
1193 | (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { | 1175 | (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { |
@@ -1201,40 +1183,47 @@ xfs_qm_dqflush( | |||
1201 | * down forcibly. If that's the case we must not write this dquot | 1183 | * down forcibly. If that's the case we must not write this dquot |
1202 | * to disk, because the log record didn't make it to disk! | 1184 | * to disk, because the log record didn't make it to disk! |
1203 | */ | 1185 | */ |
1204 | if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { | 1186 | if (XFS_FORCED_SHUTDOWN(mp)) { |
1205 | dqp->dq_flags &= ~(XFS_DQ_DIRTY); | 1187 | dqp->dq_flags &= ~XFS_DQ_DIRTY; |
1206 | xfs_dqfunlock(dqp); | 1188 | xfs_dqfunlock(dqp); |
1207 | return XFS_ERROR(EIO); | 1189 | return XFS_ERROR(EIO); |
1208 | } | 1190 | } |
1209 | 1191 | ||
1210 | /* | 1192 | /* |
1211 | * Get the buffer containing the on-disk dquot | 1193 | * Get the buffer containing the on-disk dquot |
1212 | * We don't need a transaction envelope because we know that the | ||
1213 | * the ondisk-dquot has already been allocated for. | ||
1214 | */ | 1194 | */ |
1215 | if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { | 1195 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, |
1196 | mp->m_quotainfo->qi_dqchunklen, 0, &bp); | ||
1197 | if (error) { | ||
1216 | ASSERT(error != ENOENT); | 1198 | ASSERT(error != ENOENT); |
1217 | /* | ||
1218 | * Quotas could have gotten turned off (ESRCH) | ||
1219 | */ | ||
1220 | xfs_dqfunlock(dqp); | 1199 | xfs_dqfunlock(dqp); |
1221 | return (error); | 1200 | return error; |
1222 | } | 1201 | } |
1223 | 1202 | ||
1224 | if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), | 1203 | /* |
1225 | 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { | 1204 | * Calculate the location of the dquot inside the buffer. |
1226 | xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE); | 1205 | */ |
1206 | ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); | ||
1207 | |||
1208 | /* | ||
1209 | * A simple sanity check in case we got a corrupted dquot... | ||
1210 | */ | ||
1211 | error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, | ||
1212 | XFS_QMOPT_DOWARN, "dqflush (incore copy)"); | ||
1213 | if (error) { | ||
1214 | xfs_buf_relse(bp); | ||
1215 | xfs_dqfunlock(dqp); | ||
1216 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | ||
1227 | return XFS_ERROR(EIO); | 1217 | return XFS_ERROR(EIO); |
1228 | } | 1218 | } |
1229 | 1219 | ||
1230 | /* This is the only portion of data that needs to persist */ | 1220 | /* This is the only portion of data that needs to persist */ |
1231 | memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); | 1221 | memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); |
1232 | 1222 | ||
1233 | /* | 1223 | /* |
1234 | * Clear the dirty field and remember the flush lsn for later use. | 1224 | * Clear the dirty field and remember the flush lsn for later use. |
1235 | */ | 1225 | */ |
1236 | dqp->dq_flags &= ~(XFS_DQ_DIRTY); | 1226 | dqp->dq_flags &= ~XFS_DQ_DIRTY; |
1237 | mp = dqp->q_mount; | ||
1238 | 1227 | ||
1239 | xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, | 1228 | xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, |
1240 | &dqp->q_logitem.qli_item.li_lsn); | 1229 | &dqp->q_logitem.qli_item.li_lsn); |
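After this rework, xfs_qm_dqflush() reads the buffer directly from the cached q_blkno, sanity-checks the incore copy, then copies only the persistent xfs_disk_dquot portion into the buffer before clearing the dirty flag. A stripped-down sketch of that validate-then-copy flush step (the types and the check are stand-ins, not the on-disk format):

    #include <string.h>
    #include <stdio.h>

    struct disk_dquot { unsigned id; unsigned long long bcount; };
    struct dquot {
        struct disk_dquot core;     /* the only part that persists */
        int dirty;                  /* incore-only state */
    };

    /* Stand-in sanity check: reject an obviously bad incore copy. */
    static int dqcheck(const struct disk_dquot *d, unsigned expect_id)
    {
        return d->id == expect_id ? 0 : -1;
    }

    static int dqflush(struct dquot *dqp, char *buf, size_t bufoffset)
    {
        struct disk_dquot *ddqp = (struct disk_dquot *)(buf + bufoffset);

        if (dqcheck(&dqp->core, ddqp->id))
            return -1;              /* corrupt: caller shuts down */

        memcpy(ddqp, &dqp->core, sizeof(*ddqp));    /* persist core */
        dqp->dirty = 0;
        return 0;
    }

    int main(void)
    {
        char buf[256] = {0};
        struct dquot dq = { .core = { .id = 0, .bcount = 9 }, .dirty = 1 };

        printf("flush -> %d, dirty=%d\n", dqflush(&dq, buf, 64), dq.dirty);
        return 0;
    }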
@@ -1404,8 +1393,8 @@ xfs_qm_dqpurge( | |||
1404 | */ | 1393 | */ |
1405 | error = xfs_qm_dqflush(dqp, SYNC_WAIT); | 1394 | error = xfs_qm_dqflush(dqp, SYNC_WAIT); |
1406 | if (error) | 1395 | if (error) |
1407 | xfs_fs_cmn_err(CE_WARN, mp, | 1396 | xfs_warn(mp, "%s: dquot %p flush failed", |
1408 | "xfs_qm_dqpurge: dquot %p flush failed", dqp); | 1397 | __func__, dqp); |
1409 | xfs_dqflock(dqp); | 1398 | xfs_dqflock(dqp); |
1410 | } | 1399 | } |
1411 | ASSERT(atomic_read(&dqp->q_pincount) == 0); | 1400 | ASSERT(atomic_read(&dqp->q_pincount) == 0); |
@@ -1438,36 +1427,38 @@ xfs_qm_dqpurge( | |||
1438 | void | 1427 | void |
1439 | xfs_qm_dqprint(xfs_dquot_t *dqp) | 1428 | xfs_qm_dqprint(xfs_dquot_t *dqp) |
1440 | { | 1429 | { |
1441 | cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); | 1430 | struct xfs_mount *mp = dqp->q_mount; |
1442 | cmn_err(CE_DEBUG, "---- dquotID = %d", | 1431 | |
1432 | xfs_debug(mp, "-----------KERNEL DQUOT----------------"); | ||
1433 | xfs_debug(mp, "---- dquotID = %d", | ||
1443 | (int)be32_to_cpu(dqp->q_core.d_id)); | 1434 | (int)be32_to_cpu(dqp->q_core.d_id)); |
1444 | cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); | 1435 | xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); |
1445 | cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); | 1436 | xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount); |
1446 | cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); | 1437 | xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno); |
1447 | cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); | 1438 | xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset); |
1448 | cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", | 1439 | xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)", |
1449 | be64_to_cpu(dqp->q_core.d_blk_hardlimit), | 1440 | be64_to_cpu(dqp->q_core.d_blk_hardlimit), |
1450 | (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); | 1441 | (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); |
1451 | cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", | 1442 | xfs_debug(mp, "---- blkslimit = %Lu (0x%x)", |
1452 | be64_to_cpu(dqp->q_core.d_blk_softlimit), | 1443 | be64_to_cpu(dqp->q_core.d_blk_softlimit), |
1453 | (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); | 1444 | (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); |
1454 | cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", | 1445 | xfs_debug(mp, "---- inohlimit = %Lu (0x%x)", |
1455 | be64_to_cpu(dqp->q_core.d_ino_hardlimit), | 1446 | be64_to_cpu(dqp->q_core.d_ino_hardlimit), |
1456 | (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); | 1447 | (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); |
1457 | cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", | 1448 | xfs_debug(mp, "---- inoslimit = %Lu (0x%x)", |
1458 | be64_to_cpu(dqp->q_core.d_ino_softlimit), | 1449 | be64_to_cpu(dqp->q_core.d_ino_softlimit), |
1459 | (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); | 1450 | (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); |
1460 | cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", | 1451 | xfs_debug(mp, "---- bcount = %Lu (0x%x)", |
1461 | be64_to_cpu(dqp->q_core.d_bcount), | 1452 | be64_to_cpu(dqp->q_core.d_bcount), |
1462 | (int)be64_to_cpu(dqp->q_core.d_bcount)); | 1453 | (int)be64_to_cpu(dqp->q_core.d_bcount)); |
1463 | cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", | 1454 | xfs_debug(mp, "---- icount = %Lu (0x%x)", |
1464 | be64_to_cpu(dqp->q_core.d_icount), | 1455 | be64_to_cpu(dqp->q_core.d_icount), |
1465 | (int)be64_to_cpu(dqp->q_core.d_icount)); | 1456 | (int)be64_to_cpu(dqp->q_core.d_icount)); |
1466 | cmn_err(CE_DEBUG, "---- btimer = %d", | 1457 | xfs_debug(mp, "---- btimer = %d", |
1467 | (int)be32_to_cpu(dqp->q_core.d_btimer)); | 1458 | (int)be32_to_cpu(dqp->q_core.d_btimer)); |
1468 | cmn_err(CE_DEBUG, "---- itimer = %d", | 1459 | xfs_debug(mp, "---- itimer = %d", |
1469 | (int)be32_to_cpu(dqp->q_core.d_itimer)); | 1460 | (int)be32_to_cpu(dqp->q_core.d_itimer)); |
1470 | cmn_err(CE_DEBUG, "---------------------------"); | 1461 | xfs_debug(mp, "---------------------------"); |
1471 | } | 1462 | } |
1472 | #endif | 1463 | #endif |
1473 | 1464 | ||
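The cmn_err()-to-xfs_debug()/xfs_warn() conversions throughout this diff route messages through mount-aware helpers so every line is prefixed with the filesystem it concerns. A user-space sketch of that wrapper shape (the real helpers live in the new xfs_message.c and take a struct xfs_mount; these names and levels are illustrative):

    #include <stdarg.h>
    #include <stdio.h>

    struct mount { const char *fsname; };

    /* Mount-aware logger: prefix every message with the fs name,
     * mirroring how xfs_warn()/xfs_debug() prefix the device. */
    static void fs_log(const struct mount *mp, const char *level,
                       const char *fmt, ...)
    {
        va_list ap;

        fprintf(stderr, "%s (%s): ", level, mp ? mp->fsname : "?");
        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
        fputc('\n', stderr);
    }

    #define fs_warn(mp, ...)  fs_log(mp, "WARNING", __VA_ARGS__)
    #define fs_debug(mp, ...) fs_log(mp, "DEBUG", __VA_ARGS__)

    int main(void)
    {
        struct mount m = { .fsname = "sda1" };

        fs_warn(&m, "dquot %p flush failed", (void *)0x1234);
        fs_debug(&m, "Clearing xfsstats");
        return 0;
    }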
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 2a1f3dc10a02..9e0e2fa3f2c8 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c | |||
@@ -136,9 +136,8 @@ xfs_qm_dquot_logitem_push( | |||
136 | */ | 136 | */ |
137 | error = xfs_qm_dqflush(dqp, 0); | 137 | error = xfs_qm_dqflush(dqp, 0); |
138 | if (error) | 138 | if (error) |
139 | xfs_fs_cmn_err(CE_WARN, dqp->q_mount, | 139 | xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", |
140 | "xfs_qm_dquot_logitem_push: push error %d on dqp %p", | 140 | __func__, error, dqp); |
141 | error, dqp); | ||
142 | xfs_dqunlock(dqp); | 141 | xfs_dqunlock(dqp); |
143 | } | 142 | } |
144 | 143 | ||
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 9a92407109a1..b94dace4e785 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c | |||
@@ -55,14 +55,12 @@ uint ndquot; | |||
55 | kmem_zone_t *qm_dqzone; | 55 | kmem_zone_t *qm_dqzone; |
56 | kmem_zone_t *qm_dqtrxzone; | 56 | kmem_zone_t *qm_dqtrxzone; |
57 | 57 | ||
58 | static cred_t xfs_zerocr; | ||
59 | |||
60 | STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); | 58 | STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); |
61 | STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); | 59 | STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); |
62 | 60 | ||
63 | STATIC int xfs_qm_init_quotainos(xfs_mount_t *); | 61 | STATIC int xfs_qm_init_quotainos(xfs_mount_t *); |
64 | STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); | 62 | STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); |
65 | STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); | 63 | STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); |
66 | 64 | ||
67 | static struct shrinker xfs_qm_shaker = { | 65 | static struct shrinker xfs_qm_shaker = { |
68 | .shrink = xfs_qm_shake, | 66 | .shrink = xfs_qm_shake, |
@@ -82,7 +80,7 @@ xfs_qm_dquot_list_print( | |||
82 | int i = 0; | 80 | int i = 0; |
83 | 81 | ||
84 | list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { | 82 | list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { |
85 | cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " | 83 | xfs_debug(mp, " %d. \"%d (%s)\" " |
86 | "bcnt = %lld, icnt = %lld, refs = %d", | 84 | "bcnt = %lld, icnt = %lld, refs = %d", |
87 | i++, be32_to_cpu(dqp->q_core.d_id), | 85 | i++, be32_to_cpu(dqp->q_core.d_id), |
88 | DQFLAGTO_TYPESTR(dqp), | 86 | DQFLAGTO_TYPESTR(dqp), |
@@ -207,7 +205,7 @@ xfs_qm_destroy( | |||
207 | list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { | 205 | list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { |
208 | xfs_dqlock(dqp); | 206 | xfs_dqlock(dqp); |
209 | #ifdef QUOTADEBUG | 207 | #ifdef QUOTADEBUG |
210 | cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); | 208 | xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp); |
211 | #endif | 209 | #endif |
212 | list_del_init(&dqp->q_freelist); | 210 | list_del_init(&dqp->q_freelist); |
213 | xfs_Gqm->qm_dqfrlist_cnt--; | 211 | xfs_Gqm->qm_dqfrlist_cnt--; |
@@ -343,9 +341,7 @@ xfs_qm_mount_quotas( | |||
343 | * quotas immediately. | 341 | * quotas immediately. |
344 | */ | 342 | */ |
345 | if (mp->m_sb.sb_rextents) { | 343 | if (mp->m_sb.sb_rextents) { |
346 | cmn_err(CE_NOTE, | 344 | xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); |
347 | "Cannot turn on quotas for realtime filesystem %s", | ||
348 | mp->m_fsname); | ||
349 | mp->m_qflags = 0; | 345 | mp->m_qflags = 0; |
350 | goto write_changes; | 346 | goto write_changes; |
351 | } | 347 | } |
@@ -404,14 +400,13 @@ xfs_qm_mount_quotas( | |||
404 | * off, but the on disk superblock doesn't know that ! | 400 | * off, but the on disk superblock doesn't know that ! |
405 | */ | 401 | */ |
406 | ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); | 402 | ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); |
407 | xfs_fs_cmn_err(CE_ALERT, mp, | 403 | xfs_alert(mp, "%s: Superblock update failed!", |
408 | "XFS mount_quotas: Superblock update failed!"); | 404 | __func__); |
409 | } | 405 | } |
410 | } | 406 | } |
411 | 407 | ||
412 | if (error) { | 408 | if (error) { |
413 | xfs_fs_cmn_err(CE_WARN, mp, | 409 | xfs_warn(mp, "Failed to initialize disk quotas."); |
414 | "Failed to initialize disk quotas."); | ||
415 | return; | 410 | return; |
416 | } | 411 | } |
417 | 412 | ||
@@ -466,12 +461,10 @@ xfs_qm_dqflush_all( | |||
466 | struct xfs_quotainfo *q = mp->m_quotainfo; | 461 | struct xfs_quotainfo *q = mp->m_quotainfo; |
467 | int recl; | 462 | int recl; |
468 | struct xfs_dquot *dqp; | 463 | struct xfs_dquot *dqp; |
469 | int niters; | ||
470 | int error; | 464 | int error; |
471 | 465 | ||
472 | if (!q) | 466 | if (!q) |
473 | return 0; | 467 | return 0; |
474 | niters = 0; | ||
475 | again: | 468 | again: |
476 | mutex_lock(&q->qi_dqlist_lock); | 469 | mutex_lock(&q->qi_dqlist_lock); |
477 | list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { | 470 | list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { |
@@ -837,7 +830,7 @@ xfs_qm_dqattach_locked( | |||
837 | xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, | 830 | xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, |
838 | flags & XFS_QMOPT_DQALLOC, | 831 | flags & XFS_QMOPT_DQALLOC, |
839 | ip->i_udquot, &ip->i_gdquot) : | 832 | ip->i_udquot, &ip->i_gdquot) : |
840 | xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, | 833 | xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, |
841 | flags & XFS_QMOPT_DQALLOC, | 834 | flags & XFS_QMOPT_DQALLOC, |
842 | ip->i_udquot, &ip->i_gdquot); | 835 | ip->i_udquot, &ip->i_gdquot); |
843 | /* | 836 | /* |
@@ -1199,87 +1192,6 @@ xfs_qm_list_destroy( | |||
1199 | mutex_destroy(&(list->qh_lock)); | 1192 | mutex_destroy(&(list->qh_lock)); |
1200 | } | 1193 | } |
1201 | 1194 | ||
1202 | |||
1203 | /* | ||
1204 | * Stripped down version of dqattach. This doesn't attach, or even look at the | ||
1205 | * dquots attached to the inode. The rationale is that there won't be any | ||
1206 | * attached at the time this is called from quotacheck. | ||
1207 | */ | ||
1208 | STATIC int | ||
1209 | xfs_qm_dqget_noattach( | ||
1210 | xfs_inode_t *ip, | ||
1211 | xfs_dquot_t **O_udqpp, | ||
1212 | xfs_dquot_t **O_gdqpp) | ||
1213 | { | ||
1214 | int error; | ||
1215 | xfs_mount_t *mp; | ||
1216 | xfs_dquot_t *udqp, *gdqp; | ||
1217 | |||
1218 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | ||
1219 | mp = ip->i_mount; | ||
1220 | udqp = NULL; | ||
1221 | gdqp = NULL; | ||
1222 | |||
1223 | if (XFS_IS_UQUOTA_ON(mp)) { | ||
1224 | ASSERT(ip->i_udquot == NULL); | ||
1225 | /* | ||
1226 | * We want the dquot allocated if it doesn't exist. | ||
1227 | */ | ||
1228 | if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER, | ||
1229 | XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, | ||
1230 | &udqp))) { | ||
1231 | /* | ||
1232 | * Shouldn't be able to turn off quotas here. | ||
1233 | */ | ||
1234 | ASSERT(error != ESRCH); | ||
1235 | ASSERT(error != ENOENT); | ||
1236 | return error; | ||
1237 | } | ||
1238 | ASSERT(udqp); | ||
1239 | } | ||
1240 | |||
1241 | if (XFS_IS_OQUOTA_ON(mp)) { | ||
1242 | ASSERT(ip->i_gdquot == NULL); | ||
1243 | if (udqp) | ||
1244 | xfs_dqunlock(udqp); | ||
1245 | error = XFS_IS_GQUOTA_ON(mp) ? | ||
1246 | xfs_qm_dqget(mp, ip, | ||
1247 | ip->i_d.di_gid, XFS_DQ_GROUP, | ||
1248 | XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, | ||
1249 | &gdqp) : | ||
1250 | xfs_qm_dqget(mp, ip, | ||
1251 | ip->i_d.di_projid, XFS_DQ_PROJ, | ||
1252 | XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, | ||
1253 | &gdqp); | ||
1254 | if (error) { | ||
1255 | if (udqp) | ||
1256 | xfs_qm_dqrele(udqp); | ||
1257 | ASSERT(error != ESRCH); | ||
1258 | ASSERT(error != ENOENT); | ||
1259 | return error; | ||
1260 | } | ||
1261 | ASSERT(gdqp); | ||
1262 | |||
1263 | /* Reacquire the locks in the right order */ | ||
1264 | if (udqp) { | ||
1265 | if (! xfs_qm_dqlock_nowait(udqp)) { | ||
1266 | xfs_dqunlock(gdqp); | ||
1267 | xfs_dqlock(udqp); | ||
1268 | xfs_dqlock(gdqp); | ||
1269 | } | ||
1270 | } | ||
1271 | } | ||
1272 | |||
1273 | *O_udqpp = udqp; | ||
1274 | *O_gdqpp = gdqp; | ||
1275 | |||
1276 | #ifdef QUOTADEBUG | ||
1277 | if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp)); | ||
1278 | if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp)); | ||
1279 | #endif | ||
1280 | return 0; | ||
1281 | } | ||
1282 | |||
1283 | /* | 1195 | /* |
1284 | * Create an inode and return with a reference already taken, but unlocked | 1196 | * Create an inode and return with a reference already taken, but unlocked |
1285 | * This is how we create quota inodes | 1197 | * This is how we create quota inodes |
@@ -1305,21 +1217,14 @@ xfs_qm_qino_alloc( | |||
1305 | return error; | 1217 | return error; |
1306 | } | 1218 | } |
1307 | 1219 | ||
1308 | if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, | 1220 | error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); |
1309 | &xfs_zerocr, 0, 1, ip, &committed))) { | 1221 | if (error) { |
1310 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | | 1222 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | |
1311 | XFS_TRANS_ABORT); | 1223 | XFS_TRANS_ABORT); |
1312 | return error; | 1224 | return error; |
1313 | } | 1225 | } |
1314 | 1226 | ||
1315 | /* | 1227 | /* |
1316 | * Keep an extra reference to this quota inode. This inode is | ||
1317 | * locked exclusively and joined to the transaction already. | ||
1318 | */ | ||
1319 | ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL)); | ||
1320 | IHOLD(*ip); | ||
1321 | |||
1322 | /* | ||
1323 | * Make the changes in the superblock, and log those too. | 1228 | * Make the changes in the superblock, and log those too. |
1324 | * sbfields arg may contain fields other than *QUOTINO; | 1229 | * sbfields arg may contain fields other than *QUOTINO; |
1325 | * VERSIONNUM for example. | 1230 | * VERSIONNUM for example. |
@@ -1347,7 +1252,7 @@ xfs_qm_qino_alloc( | |||
1347 | xfs_mod_sb(tp, sbfields); | 1252 | xfs_mod_sb(tp, sbfields); |
1348 | 1253 | ||
1349 | if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { | 1254 | if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { |
1350 | xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); | 1255 | xfs_alert(mp, "%s failed (error %d)!", __func__, error); |
1351 | return error; | 1256 | return error; |
1352 | } | 1257 | } |
1353 | return 0; | 1258 | return 0; |
@@ -1382,7 +1287,7 @@ xfs_qm_reset_dqcounts( | |||
1382 | * output any warnings because it's perfectly possible to | 1287 | * output any warnings because it's perfectly possible to |
1383 | * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. | 1288 | * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. |
1384 | */ | 1289 | */ |
1385 | (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, | 1290 | (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, |
1386 | "xfs_quotacheck"); | 1291 | "xfs_quotacheck"); |
1387 | ddq->d_bcount = 0; | 1292 | ddq->d_bcount = 0; |
1388 | ddq->d_icount = 0; | 1293 | ddq->d_icount = 0; |
@@ -1407,14 +1312,9 @@ xfs_qm_dqiter_bufs( | |||
1407 | { | 1312 | { |
1408 | xfs_buf_t *bp; | 1313 | xfs_buf_t *bp; |
1409 | int error; | 1314 | int error; |
1410 | int notcommitted; | ||
1411 | int incr; | ||
1412 | int type; | 1315 | int type; |
1413 | 1316 | ||
1414 | ASSERT(blkcnt > 0); | 1317 | ASSERT(blkcnt > 0); |
1415 | notcommitted = 0; | ||
1416 | incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ? | ||
1417 | XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt; | ||
1418 | type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : | 1318 | type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : |
1419 | (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); | 1319 | (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); |
1420 | error = 0; | 1320 | error = 0; |
@@ -1516,7 +1416,7 @@ xfs_qm_dqiterate( | |||
1516 | rablkcnt = map[i+1].br_blockcount; | 1416 | rablkcnt = map[i+1].br_blockcount; |
1517 | rablkno = map[i+1].br_startblock; | 1417 | rablkno = map[i+1].br_startblock; |
1518 | while (rablkcnt--) { | 1418 | while (rablkcnt--) { |
1519 | xfs_baread(mp->m_ddev_targp, | 1419 | xfs_buf_readahead(mp->m_ddev_targp, |
1520 | XFS_FSB_TO_DADDR(mp, rablkno), | 1420 | XFS_FSB_TO_DADDR(mp, rablkno), |
1521 | mp->m_quotainfo->qi_dqchunklen); | 1421 | mp->m_quotainfo->qi_dqchunklen); |
1522 | rablkno++; | 1422 | rablkno++; |
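The xfs_baread → xfs_buf_readahead rename keeps the same intent: kick off non-blocking reads for the dquot clusters the iteration will need next. The user-space analogue of this cache-warming step, as a hedged sketch (posix_fadvise is the standard hint; the helper name is made up):

    #include <fcntl.h>

    /*
     * Hint that [off, off + len) of fd will be read soon, mirroring
     * the readahead loop over quota buffer clusters above.
     * Returns 0 on success or an errno value.
     */
    static int prefetch_range(int fd, off_t off, off_t len)
    {
            return posix_fadvise(fd, off, len, POSIX_FADV_WILLNEED);
    }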
@@ -1546,18 +1446,34 @@ xfs_qm_dqiterate( | |||
1546 | 1446 | ||
1547 | /* | 1447 | /* |
1548 | * Called by dqusage_adjust in doing a quotacheck. | 1448 | * Called by dqusage_adjust in doing a quotacheck. |
1549 | * Given the inode, and a dquot (either USR or GRP, doesn't matter), | 1449 | * |
1550 | * this updates its incore copy as well as the buffer copy. This is | 1450 | * Given the inode, and a dquot id this updates both the incore dqout as well |
1551 | * so that once the quotacheck is done, we can just log all the buffers, | 1451 | * as the buffer copy. This is so that once the quotacheck is done, we can |
1552 | * as opposed to logging numerous updates to individual dquots. | 1452 | * just log all the buffers, as opposed to logging numerous updates to |
1453 | * individual dquots. | ||
1553 | */ | 1454 | */ |
1554 | STATIC void | 1455 | STATIC int |
1555 | xfs_qm_quotacheck_dqadjust( | 1456 | xfs_qm_quotacheck_dqadjust( |
1556 | xfs_dquot_t *dqp, | 1457 | struct xfs_inode *ip, |
1458 | xfs_dqid_t id, | ||
1459 | uint type, | ||
1557 | xfs_qcnt_t nblks, | 1460 | xfs_qcnt_t nblks, |
1558 | xfs_qcnt_t rtblks) | 1461 | xfs_qcnt_t rtblks) |
1559 | { | 1462 | { |
1560 | ASSERT(XFS_DQ_IS_LOCKED(dqp)); | 1463 | struct xfs_mount *mp = ip->i_mount; |
1464 | struct xfs_dquot *dqp; | ||
1465 | int error; | ||
1466 | |||
1467 | error = xfs_qm_dqget(mp, ip, id, type, | ||
1468 | XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); | ||
1469 | if (error) { | ||
1470 | /* | ||
1471 | * Shouldn't be able to turn off quotas here. | ||
1472 | */ | ||
1473 | ASSERT(error != ESRCH); | ||
1474 | ASSERT(error != ENOENT); | ||
1475 | return error; | ||
1476 | } | ||
1561 | 1477 | ||
1562 | trace_xfs_dqadjust(dqp); | 1478 | trace_xfs_dqadjust(dqp); |
1563 | 1479 | ||
@@ -1582,11 +1498,13 @@ xfs_qm_quotacheck_dqadjust( | |||
1582 | * There are no timers for the default values set in the root dquot. | 1498 | * There are no timers for the default values set in the root dquot. |
1583 | */ | 1499 | */ |
1584 | if (dqp->q_core.d_id) { | 1500 | if (dqp->q_core.d_id) { |
1585 | xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); | 1501 | xfs_qm_adjust_dqlimits(mp, &dqp->q_core); |
1586 | xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); | 1502 | xfs_qm_adjust_dqtimers(mp, &dqp->q_core); |
1587 | } | 1503 | } |
1588 | 1504 | ||
1589 | dqp->dq_flags |= XFS_DQ_DIRTY; | 1505 | dqp->dq_flags |= XFS_DQ_DIRTY; |
1506 | xfs_qm_dqput(dqp); | ||
1507 | return 0; | ||
1590 | } | 1508 | } |
1591 | 1509 | ||
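The refactored xfs_qm_quotacheck_dqadjust now takes (inode, id, type), looks up and locks the dquot itself, and drops it before returning, so callers no longer pass pre-locked dquots around. The shape of that refactor in a self-contained toy (hypothetical names, nothing here is XFS code):

    /* Toy dquot table keyed by id; purely illustrative. */
    struct dquot { int id; long bcount; long icount; };
    static struct dquot table[16];

    static struct dquot *dq_get(int id)      /* lookup-or-create */
    {
            struct dquot *dqp = &table[id & 15];
            dqp->id = id;
            return dqp;
    }

    static void dq_put(struct dquot *dqp) { (void)dqp; /* release */ }

    /*
     * Mirrors the new helper: resolve the dquot from the id, adjust
     * the counts, release before returning.
     */
    static int dq_adjust(int id, long nblks)
    {
            struct dquot *dqp = dq_get(id);

            if (!dqp)                /* the real lookup can fail */
                    return -1;
            dqp->bcount += nblks;
            dqp->icount += 1;
            dq_put(dqp);
            return 0;
    }

Centralising the get/put pair inside the helper is what lets the caller below shed its per-quota-type cleanup branches.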
1592 | STATIC int | 1510 | STATIC int |
@@ -1629,8 +1547,7 @@ xfs_qm_dqusage_adjust( | |||
1629 | int *res) /* result code value */ | 1547 | int *res) /* result code value */ |
1630 | { | 1548 | { |
1631 | xfs_inode_t *ip; | 1549 | xfs_inode_t *ip; |
1632 | xfs_dquot_t *udqp, *gdqp; | 1550 | xfs_qcnt_t nblks, rtblks = 0; |
1633 | xfs_qcnt_t nblks, rtblks; | ||
1634 | int error; | 1551 | int error; |
1635 | 1552 | ||
1636 | ASSERT(XFS_IS_QUOTA_RUNNING(mp)); | 1553 | ASSERT(XFS_IS_QUOTA_RUNNING(mp)); |
@@ -1650,51 +1567,24 @@ xfs_qm_dqusage_adjust( | |||
1650 | * the case in all other instances. It's OK that we do this because | 1567 | * the case in all other instances. It's OK that we do this because |
1651 | * quotacheck is done only at mount time. | 1568 | * quotacheck is done only at mount time. |
1652 | */ | 1569 | */ |
1653 | if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { | 1570 | error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); |
1571 | if (error) { | ||
1654 | *res = BULKSTAT_RV_NOTHING; | 1572 | *res = BULKSTAT_RV_NOTHING; |
1655 | return error; | 1573 | return error; |
1656 | } | 1574 | } |
1657 | 1575 | ||
1658 | /* | 1576 | ASSERT(ip->i_delayed_blks == 0); |
1659 | * Obtain the locked dquots. In case of an error (eg. allocation | ||
1660 | * fails for ENOSPC), we return the negative of the error number | ||
1661 | * to bulkstat, so that it can get propagated to quotacheck() and | ||
1662 | * making us disable quotas for the file system. | ||
1663 | */ | ||
1664 | if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) { | ||
1665 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
1666 | IRELE(ip); | ||
1667 | *res = BULKSTAT_RV_GIVEUP; | ||
1668 | return error; | ||
1669 | } | ||
1670 | 1577 | ||
1671 | rtblks = 0; | 1578 | if (XFS_IS_REALTIME_INODE(ip)) { |
1672 | if (! XFS_IS_REALTIME_INODE(ip)) { | ||
1673 | nblks = (xfs_qcnt_t)ip->i_d.di_nblocks; | ||
1674 | } else { | ||
1675 | /* | 1579 | /* |
1676 | * Walk thru the extent list and count the realtime blocks. | 1580 | * Walk thru the extent list and count the realtime blocks. |
1677 | */ | 1581 | */ |
1678 | if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { | 1582 | error = xfs_qm_get_rtblks(ip, &rtblks); |
1679 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 1583 | if (error) |
1680 | IRELE(ip); | 1584 | goto error0; |
1681 | if (udqp) | ||
1682 | xfs_qm_dqput(udqp); | ||
1683 | if (gdqp) | ||
1684 | xfs_qm_dqput(gdqp); | ||
1685 | *res = BULKSTAT_RV_GIVEUP; | ||
1686 | return error; | ||
1687 | } | ||
1688 | nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; | ||
1689 | } | 1585 | } |
1690 | ASSERT(ip->i_delayed_blks == 0); | ||
1691 | 1586 | ||
1692 | /* | 1587 | nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; |
1693 | * We can't release the inode while holding its dquot locks. | ||
1694 | * The inode can go into inactive and might try to acquire the dquotlocks. | ||
1695 | * So, just unlock here and do a vn_rele at the end. | ||
1696 | */ | ||
1697 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
1698 | 1588 | ||
1699 | /* | 1589 | /* |
1700 | * Add the (disk blocks and inode) resources occupied by this | 1590 | * Add the (disk blocks and inode) resources occupied by this |
@@ -1709,26 +1599,36 @@ xfs_qm_dqusage_adjust( | |||
1709 | * and quotaoffs don't race. (Quotachecks happen at mount time only). | 1599 | * and quotaoffs don't race. (Quotachecks happen at mount time only). |
1710 | */ | 1600 | */ |
1711 | if (XFS_IS_UQUOTA_ON(mp)) { | 1601 | if (XFS_IS_UQUOTA_ON(mp)) { |
1712 | ASSERT(udqp); | 1602 | error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, |
1713 | xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); | 1603 | XFS_DQ_USER, nblks, rtblks); |
1714 | xfs_qm_dqput(udqp); | 1604 | if (error) |
1605 | goto error0; | ||
1715 | } | 1606 | } |
1716 | if (XFS_IS_OQUOTA_ON(mp)) { | 1607 | |
1717 | ASSERT(gdqp); | 1608 | if (XFS_IS_GQUOTA_ON(mp)) { |
1718 | xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); | 1609 | error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, |
1719 | xfs_qm_dqput(gdqp); | 1610 | XFS_DQ_GROUP, nblks, rtblks); |
1611 | if (error) | ||
1612 | goto error0; | ||
1720 | } | 1613 | } |
1721 | /* | ||
1722 | * Now release the inode. This will send it to 'inactive', and | ||
1723 | * possibly even free blocks. | ||
1724 | */ | ||
1725 | IRELE(ip); | ||
1726 | 1614 | ||
1727 | /* | 1615 | if (XFS_IS_PQUOTA_ON(mp)) { |
1728 | * Goto next inode. | 1616 | error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), |
1729 | */ | 1617 | XFS_DQ_PROJ, nblks, rtblks); |
1618 | if (error) | ||
1619 | goto error0; | ||
1620 | } | ||
1621 | |||
1622 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
1623 | IRELE(ip); | ||
1730 | *res = BULKSTAT_RV_DIDONE; | 1624 | *res = BULKSTAT_RV_DIDONE; |
1731 | return 0; | 1625 | return 0; |
1626 | |||
1627 | error0: | ||
1628 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
1629 | IRELE(ip); | ||
1630 | *res = BULKSTAT_RV_GIVEUP; | ||
1631 | return error; | ||
1732 | } | 1632 | } |
1733 | 1633 | ||
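The rewritten xfs_qm_dqusage_adjust funnels every failure through one error0 label that unlocks and releases the inode, replacing four duplicated cleanup sequences. The idiom, reduced to a minimal compilable sketch (names hypothetical):

    struct resource { int locked; };

    static int step(int n) { return n == 2 ? -1 : 0; }  /* stand-in work */

    static int do_work(struct resource *r)
    {
            int error;

            r->locked = 1;                  /* "xfs_ilock" analogue */

            error = step(1);
            if (error)
                    goto error0;
            error = step(2);
            if (error)
                    goto error0;

            r->locked = 0;                  /* success path */
            return 0;

    error0:
            r->locked = 0;                  /* single cleanup site */
            return error;
    }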
1734 | /* | 1634 | /* |
@@ -1759,7 +1659,7 @@ xfs_qm_quotacheck( | |||
1759 | */ | 1659 | */ |
1760 | ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); | 1660 | ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); |
1761 | 1661 | ||
1762 | cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); | 1662 | xfs_notice(mp, "Quotacheck needed: Please wait."); |
1763 | 1663 | ||
1764 | /* | 1664 | /* |
1765 | * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset | 1665 | * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset |
@@ -1837,9 +1737,9 @@ xfs_qm_quotacheck( | |||
1837 | 1737 | ||
1838 | error_return: | 1738 | error_return: |
1839 | if (error) { | 1739 | if (error) { |
1840 | cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " | 1740 | xfs_warn(mp, |
1841 | "Disabling quotas.", | 1741 | "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", |
1842 | mp->m_fsname, error); | 1742 | error); |
1843 | /* | 1743 | /* |
1844 | * We must turn off quotas. | 1744 | * We must turn off quotas. |
1845 | */ | 1745 | */ |
@@ -1847,12 +1747,11 @@ xfs_qm_quotacheck( | |||
1847 | ASSERT(xfs_Gqm != NULL); | 1747 | ASSERT(xfs_Gqm != NULL); |
1848 | xfs_qm_destroy_quotainfo(mp); | 1748 | xfs_qm_destroy_quotainfo(mp); |
1849 | if (xfs_mount_reset_sbqflags(mp)) { | 1749 | if (xfs_mount_reset_sbqflags(mp)) { |
1850 | cmn_err(CE_WARN, "XFS quotacheck %s: " | 1750 | xfs_warn(mp, |
1851 | "Failed to reset quota flags.", mp->m_fsname); | 1751 | "Quotacheck: Failed to reset quota flags."); |
1852 | } | 1752 | } |
1853 | } else { | 1753 | } else |
1854 | cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); | 1754 | xfs_notice(mp, "Quotacheck: Done."); |
1855 | } | ||
1856 | return (error); | 1755 | return (error); |
1857 | } | 1756 | } |
1858 | 1757 | ||
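The cmn_err conversions in this hunk all follow one rule: xfs_notice()/xfs_warn()/xfs_alert() take the mount and prefix the filesystem name themselves, so the hand-written "XFS ... %s" + mp->m_fsname boilerplate disappears. A hedged user-space sketch of that wrapper style (the real helpers live in the new xfs_message.c; this only shows the shape):

    #include <stdarg.h>
    #include <stdio.h>

    struct mount { const char *fsname; };

    /* Prefix every message with the filesystem identity, once. */
    static void fs_notice(const struct mount *mp, const char *fmt, ...)
    {
            va_list ap;

            if (mp)
                    fprintf(stderr, "XFS (%s): ", mp->fsname);
            va_start(ap, fmt);
            vfprintf(stderr, fmt, ap);
            va_end(ap);
            fputc('\n', stderr);
    }

Usage then matches the new call sites, e.g. fs_notice(&m, "Quotacheck needed: Please wait.").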
@@ -1946,12 +1845,14 @@ xfs_qm_dqreclaim_one(void) | |||
1946 | xfs_dquot_t *dqpout; | 1845 | xfs_dquot_t *dqpout; |
1947 | xfs_dquot_t *dqp; | 1846 | xfs_dquot_t *dqp; |
1948 | int restarts; | 1847 | int restarts; |
1848 | int startagain; | ||
1949 | 1849 | ||
1950 | restarts = 0; | 1850 | restarts = 0; |
1951 | dqpout = NULL; | 1851 | dqpout = NULL; |
1952 | 1852 | ||
1953 | /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ | 1853 | /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ |
1954 | startagain: | 1854 | again: |
1855 | startagain = 0; | ||
1955 | mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); | 1856 | mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); |
1956 | 1857 | ||
1957 | list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { | 1858 | list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { |
@@ -1968,13 +1869,10 @@ startagain: | |||
1968 | ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); | 1869 | ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); |
1969 | 1870 | ||
1970 | trace_xfs_dqreclaim_want(dqp); | 1871 | trace_xfs_dqreclaim_want(dqp); |
1971 | |||
1972 | xfs_dqunlock(dqp); | ||
1973 | mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); | ||
1974 | if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) | ||
1975 | return NULL; | ||
1976 | XQM_STATS_INC(xqmstats.xs_qm_dqwants); | 1872 | XQM_STATS_INC(xqmstats.xs_qm_dqwants); |
1977 | goto startagain; | 1873 | restarts++; |
1874 | startagain = 1; | ||
1875 | goto dqunlock; | ||
1978 | } | 1876 | } |
1979 | 1877 | ||
1980 | /* | 1878 | /* |
@@ -1989,23 +1887,20 @@ startagain: | |||
1989 | ASSERT(list_empty(&dqp->q_mplist)); | 1887 | ASSERT(list_empty(&dqp->q_mplist)); |
1990 | list_del_init(&dqp->q_freelist); | 1888 | list_del_init(&dqp->q_freelist); |
1991 | xfs_Gqm->qm_dqfrlist_cnt--; | 1889 | xfs_Gqm->qm_dqfrlist_cnt--; |
1992 | xfs_dqunlock(dqp); | ||
1993 | dqpout = dqp; | 1890 | dqpout = dqp; |
1994 | XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); | 1891 | XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); |
1995 | break; | 1892 | goto dqunlock; |
1996 | } | 1893 | } |
1997 | 1894 | ||
1998 | ASSERT(dqp->q_hash); | 1895 | ASSERT(dqp->q_hash); |
1999 | ASSERT(!list_empty(&dqp->q_mplist)); | 1896 | ASSERT(!list_empty(&dqp->q_mplist)); |
2000 | 1897 | ||
2001 | /* | 1898 | /* |
2002 | * Try to grab the flush lock. If this dquot is in the process of | 1899 | * Try to grab the flush lock. If this dquot is in the process |
2003 | * getting flushed to disk, we don't want to reclaim it. | 1900 | * of getting flushed to disk, we don't want to reclaim it. |
2004 | */ | 1901 | */ |
2005 | if (!xfs_dqflock_nowait(dqp)) { | 1902 | if (!xfs_dqflock_nowait(dqp)) |
2006 | xfs_dqunlock(dqp); | 1903 | goto dqunlock; |
2007 | continue; | ||
2008 | } | ||
2009 | 1904 | ||
2010 | /* | 1905 | /* |
2011 | * We have the flush lock so we know that this is not in the | 1906 | * We have the flush lock so we know that this is not in the |
@@ -2024,11 +1919,10 @@ startagain: | |||
2024 | */ | 1919 | */ |
2025 | error = xfs_qm_dqflush(dqp, 0); | 1920 | error = xfs_qm_dqflush(dqp, 0); |
2026 | if (error) { | 1921 | if (error) { |
2027 | xfs_fs_cmn_err(CE_WARN, mp, | 1922 | xfs_warn(mp, "%s: dquot %p flush failed", |
2028 | "xfs_qm_dqreclaim: dquot %p flush failed", dqp); | 1923 | __func__, dqp); |
2029 | } | 1924 | } |
2030 | xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ | 1925 | goto dqunlock; |
2031 | continue; | ||
2032 | } | 1926 | } |
2033 | 1927 | ||
2034 | /* | 1928 | /* |
@@ -2050,13 +1944,8 @@ startagain: | |||
2050 | */ | 1944 | */ |
2051 | if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { | 1945 | if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { |
2052 | restarts++; | 1946 | restarts++; |
2053 | mutex_unlock(&dqp->q_hash->qh_lock); | 1947 | startagain = 1; |
2054 | xfs_dqfunlock(dqp); | 1948 | goto qhunlock; |
2055 | xfs_dqunlock(dqp); | ||
2056 | mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); | ||
2057 | if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS) | ||
2058 | return NULL; | ||
2059 | goto startagain; | ||
2060 | } | 1949 | } |
2061 | 1950 | ||
2062 | ASSERT(dqp->q_nrefs == 0); | 1951 | ASSERT(dqp->q_nrefs == 0); |
@@ -2069,14 +1958,20 @@ startagain: | |||
2069 | xfs_Gqm->qm_dqfrlist_cnt--; | 1958 | xfs_Gqm->qm_dqfrlist_cnt--; |
2070 | dqpout = dqp; | 1959 | dqpout = dqp; |
2071 | mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); | 1960 | mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); |
1961 | qhunlock: | ||
2072 | mutex_unlock(&dqp->q_hash->qh_lock); | 1962 | mutex_unlock(&dqp->q_hash->qh_lock); |
2073 | dqfunlock: | 1963 | dqfunlock: |
2074 | xfs_dqfunlock(dqp); | 1964 | xfs_dqfunlock(dqp); |
1965 | dqunlock: | ||
2075 | xfs_dqunlock(dqp); | 1966 | xfs_dqunlock(dqp); |
2076 | if (dqpout) | 1967 | if (dqpout) |
2077 | break; | 1968 | break; |
2078 | if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) | 1969 | if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) |
2079 | return NULL; | 1970 | break; |
1971 | if (startagain) { | ||
1972 | mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); | ||
1973 | goto again; | ||
1974 | } | ||
2080 | } | 1975 | } |
2081 | mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); | 1976 | mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); |
2082 | return dqpout; | 1977 | return dqpout; |
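The reclaim loop above was restructured around two ideas: a startagain flag records where the old code jumped straight back to the top, and stacked exit labels (qhunlock → dqfunlock → dqunlock) drop each lock exactly once in reverse acquisition order. The stacked-label part in a minimal pthreads sketch (illustrative names):

    #include <pthread.h>

    static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;   /* taken first */
    static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t c = PTHREAD_MUTEX_INITIALIZER;   /* taken last */

    static int try_one(int fail_at)
    {
            int error = 0;

            pthread_mutex_lock(&a);
            if (fail_at == 1) { error = -1; goto out_a; }

            pthread_mutex_lock(&b);
            if (fail_at == 2) { error = -1; goto out_b; }

            pthread_mutex_lock(&c);
            /* ... work with all three locks held ... */
            pthread_mutex_unlock(&c);
    out_b:
            pthread_mutex_unlock(&b);
    out_a:
            pthread_mutex_unlock(&a);
            return error;
    }

Each early exit jumps to the label matching the deepest lock it actually holds, so no unlock is missed or doubled.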
@@ -2114,10 +2009,10 @@ xfs_qm_shake_freelist( | |||
2114 | STATIC int | 2009 | STATIC int |
2115 | xfs_qm_shake( | 2010 | xfs_qm_shake( |
2116 | struct shrinker *shrink, | 2011 | struct shrinker *shrink, |
2117 | int nr_to_scan, | 2012 | struct shrink_control *sc) |
2118 | gfp_t gfp_mask) | ||
2119 | { | 2013 | { |
2120 | int ndqused, nfree, n; | 2014 | int ndqused, nfree, n; |
2015 | gfp_t gfp_mask = sc->gfp_mask; | ||
2121 | 2016 | ||
2122 | if (!kmem_shake_allow(gfp_mask)) | 2017 | if (!kmem_shake_allow(gfp_mask)) |
2123 | return 0; | 2018 | return 0; |
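The xfs_qm_shake change tracks a VFS API change: the shrinker callback now receives a single struct shrink_control instead of separate nr_to_scan and gfp_mask arguments. A portable analogue of that design choice (the struct name and fields below mimic, but are not, the kernel's):

    /* Bundle the scan parameters so the API can grow without churn. */
    struct shrink_control_like {
            unsigned long nr_to_scan;
            unsigned int  gfp_mask;
    };

    static int shake(const struct shrink_control_like *sc)
    {
            unsigned long n = sc->nr_to_scan;
            /* ... reclaim up to n cached objects, honouring gfp_mask ... */
            return (int)n;
    }

Passing one extensible struct means future fields reach every shrinker without another signature change.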
@@ -2202,7 +2097,7 @@ xfs_qm_write_sb_changes( | |||
2202 | int error; | 2097 | int error; |
2203 | 2098 | ||
2204 | #ifdef QUOTADEBUG | 2099 | #ifdef QUOTADEBUG |
2205 | cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); | 2100 | xfs_notice(mp, "Writing superblock quota changes"); |
2206 | #endif | 2101 | #endif |
2207 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); | 2102 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); |
2208 | if ((error = xfs_trans_reserve(tp, 0, | 2103 | if ((error = xfs_trans_reserve(tp, 0, |
@@ -2224,7 +2119,7 @@ xfs_qm_write_sb_changes( | |||
2224 | 2119 | ||
2225 | 2120 | ||
2226 | /* | 2121 | /* |
2227 | * Given an inode, a uid and gid (from cred_t) make sure that we have | 2122 | * Given an inode, a uid, gid and prid, make sure that we have |
2228 | * allocated relevant dquot(s) on disk, and that we won't exceed inode | 2123 | * allocated relevant dquot(s) on disk, and that we won't exceed inode |
2229 | * quotas by creating this file. | 2124 | * quotas by creating this file. |
2230 | * This also attaches dquot(s) to the given inode after locking it, | 2125 | * This also attaches dquot(s) to the given inode after locking it, |
@@ -2332,7 +2227,7 @@ xfs_qm_vop_dqalloc( | |||
2332 | xfs_dqunlock(gq); | 2227 | xfs_dqunlock(gq); |
2333 | } | 2228 | } |
2334 | } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { | 2229 | } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { |
2335 | if (ip->i_d.di_projid != prid) { | 2230 | if (xfs_get_projid(ip) != prid) { |
2336 | xfs_iunlock(ip, lockflags); | 2231 | xfs_iunlock(ip, lockflags); |
2337 | if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, | 2232 | if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, |
2338 | XFS_DQ_PROJ, | 2233 | XFS_DQ_PROJ, |
@@ -2454,7 +2349,7 @@ xfs_qm_vop_chown_reserve( | |||
2454 | } | 2349 | } |
2455 | if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { | 2350 | if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { |
2456 | if (XFS_IS_PQUOTA_ON(ip->i_mount) && | 2351 | if (XFS_IS_PQUOTA_ON(ip->i_mount) && |
2457 | ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) | 2352 | xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) |
2458 | prjflags = XFS_QMOPT_ENOSPC; | 2353 | prjflags = XFS_QMOPT_ENOSPC; |
2459 | 2354 | ||
2460 | if (prjflags || | 2355 | if (prjflags || |
@@ -2558,7 +2453,7 @@ xfs_qm_vop_create_dqattach( | |||
2558 | ip->i_gdquot = gdqp; | 2453 | ip->i_gdquot = gdqp; |
2559 | ASSERT(XFS_IS_OQUOTA_ON(mp)); | 2454 | ASSERT(XFS_IS_OQUOTA_ON(mp)); |
2560 | ASSERT((XFS_IS_GQUOTA_ON(mp) ? | 2455 | ASSERT((XFS_IS_GQUOTA_ON(mp) ? |
2561 | ip->i_d.di_gid : ip->i_d.di_projid) == | 2456 | ip->i_d.di_gid : xfs_get_projid(ip)) == |
2562 | be32_to_cpu(gdqp->q_core.d_id)); | 2457 | be32_to_cpu(gdqp->q_core.d_id)); |
2563 | xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); | 2458 | xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); |
2564 | } | 2459 | } |
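The recurring ip->i_d.di_projid → xfs_get_projid(ip) substitution reflects the project ID being split into 16-bit high and low halves on disk elsewhere in this merge; the accessor recombines them. A sketch of the presumed recombination (hedged — check xfs_inode.h in this tree for the actual helper):

    /* Recombine a split 32-bit id from its assumed hi/lo halves. */
    static inline unsigned int get_projid(unsigned short hi, unsigned short lo)
    {
            return ((unsigned int)hi << 16) | lo;
    }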
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index c9446f1c726d..567b29b9f1b3 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h | |||
@@ -65,11 +65,6 @@ extern kmem_zone_t *qm_dqtrxzone; | |||
65 | * block in the dquot/xqm code. | 65 | * block in the dquot/xqm code. |
66 | */ | 66 | */ |
67 | #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 | 67 | #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 |
68 | /* | ||
69 | * When doing a quotacheck, we log dquot clusters of this many FSBs at most | ||
70 | * in a single transaction. We don't want to ask for too huge a log reservation. | ||
71 | */ | ||
72 | #define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 | ||
73 | 68 | ||
74 | typedef xfs_dqhash_t xfs_dqlist_t; | 69 | typedef xfs_dqhash_t xfs_dqlist_t; |
75 | 70 | ||
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index bea02d786c5d..a0a829addca9 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c | |||
@@ -81,7 +81,7 @@ xfs_qm_statvfs( | |||
81 | xfs_mount_t *mp = ip->i_mount; | 81 | xfs_mount_t *mp = ip->i_mount; |
82 | xfs_dquot_t *dqp; | 82 | xfs_dquot_t *dqp; |
83 | 83 | ||
84 | if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { | 84 | if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { |
85 | xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); | 85 | xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); |
86 | xfs_qm_dqput(dqp); | 86 | xfs_qm_dqput(dqp); |
87 | } | 87 | } |
@@ -119,8 +119,7 @@ xfs_qm_newmount( | |||
119 | (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || | 119 | (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || |
120 | (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && | 120 | (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && |
121 | xfs_dev_is_read_only(mp, "changing quota state")) { | 121 | xfs_dev_is_read_only(mp, "changing quota state")) { |
122 | cmn_err(CE_WARN, | 122 | xfs_warn(mp, "please mount with%s%s%s%s.", |
123 | "XFS: please mount with%s%s%s%s.", | ||
124 | (!quotaondisk ? "out quota" : ""), | 123 | (!quotaondisk ? "out quota" : ""), |
125 | (uquotaondisk ? " usrquota" : ""), | 124 | (uquotaondisk ? " usrquota" : ""), |
126 | (pquotaondisk ? " prjquota" : ""), | 125 | (pquotaondisk ? " prjquota" : ""), |
@@ -135,7 +134,7 @@ xfs_qm_newmount( | |||
135 | */ | 134 | */ |
136 | if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { | 135 | if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { |
137 | /* | 136 | /* |
138 | * If an error occured, qm_mount_quotas code | 137 | * If an error occurred, qm_mount_quotas code |
139 | * has already disabled quotas. So, just finish | 138 | * has already disabled quotas. So, just finish |
140 | * mounting, and get on with the boring life | 139 | * mounting, and get on with the boring life |
141 | * without disk quotas. | 140 | * without disk quotas. |
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 45e5849df238..2dadb15d5ca9 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c | |||
@@ -41,12 +41,6 @@ | |||
41 | #include "xfs_qm.h" | 41 | #include "xfs_qm.h" |
42 | #include "xfs_trace.h" | 42 | #include "xfs_trace.h" |
43 | 43 | ||
44 | #ifdef DEBUG | ||
45 | # define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args) | ||
46 | #else | ||
47 | # define qdprintk(s, args...) do { } while (0) | ||
48 | #endif | ||
49 | |||
50 | STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); | 44 | STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); |
51 | STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, | 45 | STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, |
52 | uint); | 46 | uint); |
@@ -178,7 +172,7 @@ xfs_qm_scall_quotaoff( | |||
178 | /* | 172 | /* |
179 | * Next we make the changes in the quota flag in the mount struct. | 173 | * Next we make the changes in the quota flag in the mount struct. |
180 | * This isn't protected by a particular lock directly, because we | 174 | * This isn't protected by a particular lock directly, because we |
181 | * don't want to take a mrlock everytime we depend on quotas being on. | 175 | * don't want to take a mrlock every time we depend on quotas being on. |
182 | */ | 176 | */ |
183 | mp->m_qflags &= ~(flags); | 177 | mp->m_qflags &= ~(flags); |
184 | 178 | ||
@@ -276,7 +270,7 @@ xfs_qm_scall_trunc_qfile( | |||
276 | goto out_unlock; | 270 | goto out_unlock; |
277 | } | 271 | } |
278 | 272 | ||
279 | xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 273 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
280 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 274 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); |
281 | 275 | ||
282 | out_unlock: | 276 | out_unlock: |
@@ -294,7 +288,8 @@ xfs_qm_scall_trunc_qfiles( | |||
294 | int error = 0, error2 = 0; | 288 | int error = 0, error2 = 0; |
295 | 289 | ||
296 | if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { | 290 | if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { |
297 | qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); | 291 | xfs_debug(mp, "%s: flags=%x m_qflags=%x\n", |
292 | __func__, flags, mp->m_qflags); | ||
298 | return XFS_ERROR(EINVAL); | 293 | return XFS_ERROR(EINVAL); |
299 | } | 294 | } |
300 | 295 | ||
@@ -318,20 +313,19 @@ xfs_qm_scall_quotaon( | |||
318 | { | 313 | { |
319 | int error; | 314 | int error; |
320 | uint qf; | 315 | uint qf; |
321 | uint accflags; | ||
322 | __int64_t sbflags; | 316 | __int64_t sbflags; |
323 | 317 | ||
324 | flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); | 318 | flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); |
325 | /* | 319 | /* |
326 | * Switching on quota accounting must be done at mount time. | 320 | * Switching on quota accounting must be done at mount time. |
327 | */ | 321 | */ |
328 | accflags = flags & XFS_ALL_QUOTA_ACCT; | ||
329 | flags &= ~(XFS_ALL_QUOTA_ACCT); | 322 | flags &= ~(XFS_ALL_QUOTA_ACCT); |
330 | 323 | ||
331 | sbflags = 0; | 324 | sbflags = 0; |
332 | 325 | ||
333 | if (flags == 0) { | 326 | if (flags == 0) { |
334 | qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); | 327 | xfs_debug(mp, "%s: zero flags, m_qflags=%x\n", |
328 | __func__, mp->m_qflags); | ||
335 | return XFS_ERROR(EINVAL); | 329 | return XFS_ERROR(EINVAL); |
336 | } | 330 | } |
337 | 331 | ||
@@ -352,12 +346,13 @@ xfs_qm_scall_quotaon( | |||
352 | (flags & XFS_GQUOTA_ACCT) == 0 && | 346 | (flags & XFS_GQUOTA_ACCT) == 0 && |
353 | (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && | 347 | (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && |
354 | (flags & XFS_OQUOTA_ENFD))) { | 348 | (flags & XFS_OQUOTA_ENFD))) { |
355 | qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", | 349 | xfs_debug(mp, |
356 | flags, mp->m_sb.sb_qflags); | 350 | "%s: Can't enforce without acct, flags=%x sbflags=%x\n", |
351 | __func__, flags, mp->m_sb.sb_qflags); | ||
357 | return XFS_ERROR(EINVAL); | 352 | return XFS_ERROR(EINVAL); |
358 | } | 353 | } |
359 | /* | 354 | /* |
360 | * If everything's upto-date incore, then don't waste time. | 355 | * If everything's up-to-date incore, then don't waste time. |
361 | */ | 356 | */ |
362 | if ((mp->m_qflags & flags) == flags) | 357 | if ((mp->m_qflags & flags) == flags) |
363 | return XFS_ERROR(EEXIST); | 358 | return XFS_ERROR(EEXIST); |
@@ -541,7 +536,7 @@ xfs_qm_scall_setqlim( | |||
541 | q->qi_bsoftlimit = soft; | 536 | q->qi_bsoftlimit = soft; |
542 | } | 537 | } |
543 | } else { | 538 | } else { |
544 | qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); | 539 | xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft); |
545 | } | 540 | } |
546 | hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? | 541 | hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? |
547 | (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : | 542 | (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : |
@@ -557,7 +552,7 @@ xfs_qm_scall_setqlim( | |||
557 | q->qi_rtbsoftlimit = soft; | 552 | q->qi_rtbsoftlimit = soft; |
558 | } | 553 | } |
559 | } else { | 554 | } else { |
560 | qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); | 555 | xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft); |
561 | } | 556 | } |
562 | 557 | ||
563 | hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? | 558 | hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? |
@@ -574,7 +569,7 @@ xfs_qm_scall_setqlim( | |||
574 | q->qi_isoftlimit = soft; | 569 | q->qi_isoftlimit = soft; |
575 | } | 570 | } |
576 | } else { | 571 | } else { |
577 | qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); | 572 | xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft); |
578 | } | 573 | } |
579 | 574 | ||
580 | /* | 575 | /* |
@@ -875,21 +870,14 @@ xfs_dqrele_inode( | |||
875 | struct xfs_perag *pag, | 870 | struct xfs_perag *pag, |
876 | int flags) | 871 | int flags) |
877 | { | 872 | { |
878 | int error; | ||
879 | |||
880 | /* skip quota inodes */ | 873 | /* skip quota inodes */ |
881 | if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || | 874 | if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || |
882 | ip == ip->i_mount->m_quotainfo->qi_gquotaip) { | 875 | ip == ip->i_mount->m_quotainfo->qi_gquotaip) { |
883 | ASSERT(ip->i_udquot == NULL); | 876 | ASSERT(ip->i_udquot == NULL); |
884 | ASSERT(ip->i_gdquot == NULL); | 877 | ASSERT(ip->i_gdquot == NULL); |
885 | read_unlock(&pag->pag_ici_lock); | ||
886 | return 0; | 878 | return 0; |
887 | } | 879 | } |
888 | 880 | ||
889 | error = xfs_sync_inode_valid(ip, pag); | ||
890 | if (error) | ||
891 | return error; | ||
892 | |||
893 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 881 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
894 | if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { | 882 | if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { |
895 | xfs_qm_dqrele(ip->i_udquot); | 883 | xfs_qm_dqrele(ip->i_udquot); |
@@ -900,8 +888,6 @@ xfs_dqrele_inode( | |||
900 | ip->i_gdquot = NULL; | 888 | ip->i_gdquot = NULL; |
901 | } | 889 | } |
902 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 890 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
903 | |||
904 | IRELE(ip); | ||
905 | return 0; | 891 | return 0; |
906 | } | 892 | } |
907 | 893 | ||
@@ -918,8 +904,7 @@ xfs_qm_dqrele_all_inodes( | |||
918 | uint flags) | 904 | uint flags) |
919 | { | 905 | { |
920 | ASSERT(mp->m_quotainfo); | 906 | ASSERT(mp->m_quotainfo); |
921 | xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, | 907 | xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); |
922 | XFS_ICI_NO_TAG, 0, NULL); | ||
923 | } | 908 | } |
924 | 909 | ||
925 | /*------------------------------------------------------------------------*/ | 910 | /*------------------------------------------------------------------------*/ |
@@ -949,10 +934,11 @@ struct mutex qcheck_lock; | |||
949 | #define DQTEST_LIST_PRINT(l, NXT, title) \ | 934 | #define DQTEST_LIST_PRINT(l, NXT, title) \ |
950 | { \ | 935 | { \ |
951 | xfs_dqtest_t *dqp; int i = 0;\ | 936 | xfs_dqtest_t *dqp; int i = 0;\ |
952 | cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ | 937 | xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \ |
953 | for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ | 938 | for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ |
954 | dqp = (xfs_dqtest_t *)dqp->NXT) { \ | 939 | dqp = (xfs_dqtest_t *)dqp->NXT) { \ |
955 | cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ | 940 | xfs_debug(dqp->q_mount, \ |
941 | " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ | ||
956 | ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ | 942 | ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ |
957 | dqp->d_bcount, dqp->d_icount); } \ | 943 | dqp->d_bcount, dqp->d_icount); } \ |
958 | } | 944 | } |
@@ -976,16 +962,17 @@ xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) | |||
976 | } | 962 | } |
977 | STATIC void | 963 | STATIC void |
978 | xfs_qm_dqtest_print( | 964 | xfs_qm_dqtest_print( |
979 | xfs_dqtest_t *d) | 965 | struct xfs_mount *mp, |
966 | struct dqtest *d) | ||
980 | { | 967 | { |
981 | cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); | 968 | xfs_debug(mp, "-----------DQTEST DQUOT----------------"); |
982 | cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); | 969 | xfs_debug(mp, "---- dquot ID = %d", d->d_id); |
983 | cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); | 970 | xfs_debug(mp, "---- fs = 0x%p", d->q_mount); |
984 | cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", | 971 | xfs_debug(mp, "---- bcount = %Lu (0x%x)", |
985 | d->d_bcount, (int)d->d_bcount); | 972 | d->d_bcount, (int)d->d_bcount); |
986 | cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", | 973 | xfs_debug(mp, "---- icount = %Lu (0x%x)", |
987 | d->d_icount, (int)d->d_icount); | 974 | d->d_icount, (int)d->d_icount); |
988 | cmn_err(CE_DEBUG, "---------------------------"); | 975 | xfs_debug(mp, "---------------------------"); |
989 | } | 976 | } |
990 | 977 | ||
991 | STATIC void | 978 | STATIC void |
@@ -999,12 +986,14 @@ xfs_qm_dqtest_failed( | |||
999 | { | 986 | { |
1000 | qmtest_nfails++; | 987 | qmtest_nfails++; |
1001 | if (error) | 988 | if (error) |
1002 | cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", | 989 | xfs_debug(dqp->q_mount, |
1003 | d->d_id, error, reason); | 990 | "quotacheck failed id=%d, err=%d\nreason: %s", |
991 | d->d_id, error, reason); | ||
1004 | else | 992 | else |
1005 | cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", | 993 | xfs_debug(dqp->q_mount, |
1006 | d->d_id, reason, (int)a, (int)b); | 994 | "quotacheck failed id=%d (%s) [%d != %d]", |
1007 | xfs_qm_dqtest_print(d); | 995 | d->d_id, reason, (int)a, (int)b); |
996 | xfs_qm_dqtest_print(dqp->q_mount, d); | ||
1008 | if (dqp) | 997 | if (dqp) |
1009 | xfs_qm_dqprint(dqp); | 998 | xfs_qm_dqprint(dqp); |
1010 | } | 999 | } |
@@ -1031,9 +1020,9 @@ xfs_dqtest_cmp2( | |||
1031 | be64_to_cpu(dqp->q_core.d_bcount) >= | 1020 | be64_to_cpu(dqp->q_core.d_bcount) >= |
1032 | be64_to_cpu(dqp->q_core.d_blk_softlimit)) { | 1021 | be64_to_cpu(dqp->q_core.d_blk_softlimit)) { |
1033 | if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { | 1022 | if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { |
1034 | cmn_err(CE_DEBUG, | 1023 | xfs_debug(dqp->q_mount, |
1035 | "%d [%s] [0x%p] BLK TIMER NOT STARTED", | 1024 | "%d [%s] BLK TIMER NOT STARTED", |
1036 | d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); | 1025 | d->d_id, DQFLAGTO_TYPESTR(d)); |
1037 | err++; | 1026 | err++; |
1038 | } | 1027 | } |
1039 | } | 1028 | } |
@@ -1041,16 +1030,16 @@ xfs_dqtest_cmp2( | |||
1041 | be64_to_cpu(dqp->q_core.d_icount) >= | 1030 | be64_to_cpu(dqp->q_core.d_icount) >= |
1042 | be64_to_cpu(dqp->q_core.d_ino_softlimit)) { | 1031 | be64_to_cpu(dqp->q_core.d_ino_softlimit)) { |
1043 | if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { | 1032 | if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { |
1044 | cmn_err(CE_DEBUG, | 1033 | xfs_debug(dqp->q_mount, |
1045 | "%d [%s] [0x%p] INO TIMER NOT STARTED", | 1034 | "%d [%s] INO TIMER NOT STARTED", |
1046 | d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); | 1035 | d->d_id, DQFLAGTO_TYPESTR(d)); |
1047 | err++; | 1036 | err++; |
1048 | } | 1037 | } |
1049 | } | 1038 | } |
1050 | #ifdef QUOTADEBUG | 1039 | #ifdef QUOTADEBUG |
1051 | if (!err) { | 1040 | if (!err) { |
1052 | cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", | 1041 | xfs_debug(dqp->q_mount, "%d [%s] qchecked", |
1053 | d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); | 1042 | d->d_id, DQFLAGTO_TYPESTR(d)); |
1054 | } | 1043 | } |
1055 | #endif | 1044 | #endif |
1056 | return (err); | 1045 | return (err); |
@@ -1147,8 +1136,8 @@ xfs_qm_internalqcheck_adjust( | |||
1147 | 1136 | ||
1148 | if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { | 1137 | if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { |
1149 | *res = BULKSTAT_RV_NOTHING; | 1138 | *res = BULKSTAT_RV_NOTHING; |
1150 | qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", | 1139 | xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n", |
1151 | (unsigned long long) ino, | 1140 | __func__, (unsigned long long) ino, |
1152 | (unsigned long long) mp->m_sb.sb_uquotino, | 1141 | (unsigned long long) mp->m_sb.sb_uquotino, |
1153 | (unsigned long long) mp->m_sb.sb_gquotino); | 1142 | (unsigned long long) mp->m_sb.sb_gquotino); |
1154 | return XFS_ERROR(EINVAL); | 1143 | return XFS_ERROR(EINVAL); |
@@ -1175,7 +1164,7 @@ xfs_qm_internalqcheck_adjust( | |||
1175 | } | 1164 | } |
1176 | xfs_qm_internalqcheck_get_dquots(mp, | 1165 | xfs_qm_internalqcheck_get_dquots(mp, |
1177 | (xfs_dqid_t) ip->i_d.di_uid, | 1166 | (xfs_dqid_t) ip->i_d.di_uid, |
1178 | (xfs_dqid_t) ip->i_d.di_projid, | 1167 | (xfs_dqid_t) xfs_get_projid(ip), |
1179 | (xfs_dqid_t) ip->i_d.di_gid, | 1168 | (xfs_dqid_t) ip->i_d.di_gid, |
1180 | &ud, &gd); | 1169 | &ud, &gd); |
1181 | if (XFS_IS_UQUOTA_ON(mp)) { | 1170 | if (XFS_IS_UQUOTA_ON(mp)) { |
@@ -1233,12 +1222,12 @@ xfs_qm_internalqcheck( | |||
1233 | xfs_qm_internalqcheck_adjust, | 1222 | xfs_qm_internalqcheck_adjust, |
1234 | 0, NULL, &done); | 1223 | 0, NULL, &done); |
1235 | if (error) { | 1224 | if (error) { |
1236 | cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); | 1225 | xfs_debug(mp, "Bulkstat returned error 0x%x", error); |
1237 | break; | 1226 | break; |
1238 | } | 1227 | } |
1239 | } while (!done); | 1228 | } while (!done); |
1240 | 1229 | ||
1241 | cmn_err(CE_DEBUG, "Checking results against system dquots"); | 1230 | xfs_debug(mp, "Checking results against system dquots"); |
1242 | for (i = 0; i < qmtest_hashmask; i++) { | 1231 | for (i = 0; i < qmtest_hashmask; i++) { |
1243 | xfs_dqtest_t *d, *n; | 1232 | xfs_dqtest_t *d, *n; |
1244 | xfs_dqhash_t *h; | 1233 | xfs_dqhash_t *h; |
@@ -1256,10 +1245,10 @@ xfs_qm_internalqcheck( | |||
1256 | } | 1245 | } |
1257 | 1246 | ||
1258 | if (qmtest_nfails) { | 1247 | if (qmtest_nfails) { |
1259 | cmn_err(CE_DEBUG, "******** quotacheck failed ********"); | 1248 | xfs_debug(mp, "******** quotacheck failed ********"); |
1260 | cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); | 1249 | xfs_debug(mp, "failures = %d", qmtest_nfails); |
1261 | } else { | 1250 | } else { |
1262 | cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); | 1251 | xfs_debug(mp, "******** quotacheck successful! ********"); |
1263 | } | 1252 | } |
1264 | kmem_free(qmtest_udqtab); | 1253 | kmem_free(qmtest_udqtab); |
1265 | kmem_free(qmtest_gdqtab); | 1254 | kmem_free(qmtest_gdqtab); |
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 7de91d1b75c0..2a3648731331 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c | |||
@@ -643,8 +643,9 @@ xfs_trans_dqresv( | |||
643 | (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && | 643 | (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && |
644 | (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { | 644 | (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { |
645 | #ifdef QUOTADEBUG | 645 | #ifdef QUOTADEBUG |
646 | cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" | 646 | xfs_debug(mp, |
647 | " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); | 647 | "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?", |
648 | nblks, *resbcountp, hardlimit); | ||
648 | #endif | 649 | #endif |
649 | if (nblks > 0) { | 650 | if (nblks > 0) { |
650 | /* | 651 | /* |
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c deleted file mode 100644 index 975aa10e1a47..000000000000 --- a/fs/xfs/support/debug.c +++ /dev/null | |||
@@ -1,115 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #include <xfs.h> | ||
19 | #include "debug.h" | ||
20 | |||
21 | /* xfs_mount.h drags a lot of crap in, sorry.. */ | ||
22 | #include "xfs_sb.h" | ||
23 | #include "xfs_inum.h" | ||
24 | #include "xfs_ag.h" | ||
25 | #include "xfs_mount.h" | ||
26 | #include "xfs_error.h" | ||
27 | |||
28 | static char message[1024]; /* keep it off the stack */ | ||
29 | static DEFINE_SPINLOCK(xfs_err_lock); | ||
30 | |||
31 | /* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ | ||
32 | #define XFS_MAX_ERR_LEVEL 7 | ||
33 | #define XFS_ERR_MASK ((1 << 3) - 1) | ||
34 | static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = | ||
35 | {KERN_EMERG, KERN_ALERT, KERN_CRIT, | ||
36 | KERN_ERR, KERN_WARNING, KERN_NOTICE, | ||
37 | KERN_INFO, KERN_DEBUG}; | ||
38 | |||
39 | void | ||
40 | cmn_err(register int level, char *fmt, ...) | ||
41 | { | ||
42 | char *fp = fmt; | ||
43 | int len; | ||
44 | ulong flags; | ||
45 | va_list ap; | ||
46 | |||
47 | level &= XFS_ERR_MASK; | ||
48 | if (level > XFS_MAX_ERR_LEVEL) | ||
49 | level = XFS_MAX_ERR_LEVEL; | ||
50 | spin_lock_irqsave(&xfs_err_lock,flags); | ||
51 | va_start(ap, fmt); | ||
52 | if (*fmt == '!') fp++; | ||
53 | len = vsnprintf(message, sizeof(message), fp, ap); | ||
54 | if (len >= sizeof(message)) | ||
55 | len = sizeof(message) - 1; | ||
56 | if (message[len-1] == '\n') | ||
57 | message[len-1] = 0; | ||
58 | printk("%s%s\n", err_level[level], message); | ||
59 | va_end(ap); | ||
60 | spin_unlock_irqrestore(&xfs_err_lock,flags); | ||
61 | BUG_ON(level == CE_PANIC); | ||
62 | } | ||
63 | |||
64 | void | ||
65 | xfs_fs_vcmn_err( | ||
66 | int level, | ||
67 | struct xfs_mount *mp, | ||
68 | char *fmt, | ||
69 | va_list ap) | ||
70 | { | ||
71 | unsigned long flags; | ||
72 | int len = 0; | ||
73 | |||
74 | level &= XFS_ERR_MASK; | ||
75 | if (level > XFS_MAX_ERR_LEVEL) | ||
76 | level = XFS_MAX_ERR_LEVEL; | ||
77 | |||
78 | spin_lock_irqsave(&xfs_err_lock,flags); | ||
79 | |||
80 | if (mp) { | ||
81 | len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); | ||
82 | |||
83 | /* | ||
84 | * Skip the printk if we can't print anything useful | ||
85 | * due to an over-long device name. | ||
86 | */ | ||
87 | if (len >= sizeof(message)) | ||
88 | goto out; | ||
89 | } | ||
90 | |||
91 | len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); | ||
92 | if (len >= sizeof(message)) | ||
93 | len = sizeof(message) - 1; | ||
94 | if (message[len-1] == '\n') | ||
95 | message[len-1] = 0; | ||
96 | |||
97 | printk("%s%s\n", err_level[level], message); | ||
98 | out: | ||
99 | spin_unlock_irqrestore(&xfs_err_lock,flags); | ||
100 | |||
101 | BUG_ON(level == CE_PANIC); | ||
102 | } | ||
103 | |||
104 | void | ||
105 | assfail(char *expr, char *file, int line) | ||
106 | { | ||
107 | printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); | ||
108 | BUG(); | ||
109 | } | ||
110 | |||
111 | void | ||
112 | xfs_hex_dump(void *p, int length) | ||
113 | { | ||
114 | print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); | ||
115 | } | ||
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h deleted file mode 100644 index d2d20462fd4f..000000000000 --- a/fs/xfs/support/debug.h +++ /dev/null | |||
@@ -1,54 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2000-2005 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #ifndef __XFS_SUPPORT_DEBUG_H__ | ||
19 | #define __XFS_SUPPORT_DEBUG_H__ | ||
20 | |||
21 | #include <stdarg.h> | ||
22 | |||
23 | #define CE_DEBUG 7 /* debug */ | ||
24 | #define CE_CONT 6 /* continuation */ | ||
25 | #define CE_NOTE 5 /* notice */ | ||
26 | #define CE_WARN 4 /* warning */ | ||
27 | #define CE_ALERT 1 /* alert */ | ||
28 | #define CE_PANIC 0 /* panic */ | ||
29 | |||
30 | extern void cmn_err(int, char *, ...) | ||
31 | __attribute__ ((format (printf, 2, 3))); | ||
32 | extern void assfail(char *expr, char *f, int l); | ||
33 | |||
34 | #define ASSERT_ALWAYS(expr) \ | ||
35 | (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) | ||
36 | |||
37 | #ifndef DEBUG | ||
38 | #define ASSERT(expr) ((void)0) | ||
39 | |||
40 | #ifndef STATIC | ||
41 | # define STATIC static noinline | ||
42 | #endif | ||
43 | |||
44 | #else /* DEBUG */ | ||
45 | |||
46 | #define ASSERT(expr) \ | ||
47 | (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) | ||
48 | |||
49 | #ifndef STATIC | ||
50 | # define STATIC noinline | ||
51 | #endif | ||
52 | |||
53 | #endif /* DEBUG */ | ||
54 | #endif /* __XFS_SUPPORT_DEBUG_H__ */ | ||
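The deleted debug.h carried the classic compile-out assertion: in DEBUG builds ASSERT() expands to a check that calls assfail(), otherwise to ((void)0). The same pattern in plain C, with unlikely() dropped for portability:

    #include <stdio.h>
    #include <stdlib.h>

    static void assfail(const char *expr, const char *file, int line)
    {
            fprintf(stderr, "Assertion failed: %s, file: %s, line: %d\n",
                    expr, file, line);
            abort();
    }

    #ifdef DEBUG
    #define ASSERT(expr) \
            ((expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
    #else
    #define ASSERT(expr) ((void)0)   /* compiles to nothing in release */
    #endif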
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 0135e2a669d7..11dd72070cbb 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h | |||
@@ -42,7 +42,7 @@ struct xfs_acl { | |||
42 | #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) | 42 | #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) |
43 | 43 | ||
44 | #ifdef CONFIG_XFS_POSIX_ACL | 44 | #ifdef CONFIG_XFS_POSIX_ACL |
45 | extern int xfs_check_acl(struct inode *inode, int mask); | 45 | extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags); |
46 | extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); | 46 | extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); |
47 | extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); | 47 | extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); |
48 | extern int xfs_acl_chmod(struct inode *inode); | 48 | extern int xfs_acl_chmod(struct inode *inode); |
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 4917d4eed4ed..6530769a999b 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h | |||
@@ -187,7 +187,9 @@ struct xfs_busy_extent { | |||
187 | xfs_agnumber_t agno; | 187 | xfs_agnumber_t agno; |
188 | xfs_agblock_t bno; | 188 | xfs_agblock_t bno; |
189 | xfs_extlen_t length; | 189 | xfs_extlen_t length; |
190 | xlog_tid_t tid; /* transaction that created this */ | 190 | unsigned int flags; |
191 | #define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ | ||
192 | #define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ | ||
191 | }; | 193 | }; |
192 | 194 | ||
193 | /* | 195 | /* |
@@ -227,9 +229,18 @@ typedef struct xfs_perag { | |||
227 | 229 | ||
228 | atomic_t pagf_fstrms; /* # of filestreams active in this AG */ | 230 | atomic_t pagf_fstrms; /* # of filestreams active in this AG */ |
229 | 231 | ||
230 | rwlock_t pag_ici_lock; /* incore inode lock */ | 232 | spinlock_t pag_ici_lock; /* incore inode cache lock */ |
231 | struct radix_tree_root pag_ici_root; /* incore inode cache root */ | 233 | struct radix_tree_root pag_ici_root; /* incore inode cache root */ |
232 | int pag_ici_reclaimable; /* reclaimable inodes */ | 234 | int pag_ici_reclaimable; /* reclaimable inodes */ |
235 | struct mutex pag_ici_reclaim_lock; /* serialisation point */ | ||
236 | unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ | ||
237 | |||
238 | /* buffer cache index */ | ||
239 | spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ | ||
240 | struct rb_root pag_buf_tree; /* ordered tree of active buffers */ | ||
241 | |||
242 | /* for rcu-safe freeing */ | ||
243 | struct rcu_head rcu_head; | ||
233 | #endif | 244 | #endif |
234 | int pagb_count; /* pagb slots in use */ | 245 | int pagb_count; /* pagb slots in use */ |
235 | } xfs_perag_t; | 246 | } xfs_perag_t; |
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index af168faccc7a..95862bbff56b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c | |||
@@ -41,23 +41,13 @@ | |||
41 | #define XFSA_FIXUP_BNO_OK 1 | 41 | #define XFSA_FIXUP_BNO_OK 1 |
42 | #define XFSA_FIXUP_CNT_OK 2 | 42 | #define XFSA_FIXUP_CNT_OK 2 |
43 | 43 | ||
44 | static int | ||
45 | xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, | ||
46 | xfs_agblock_t bno, xfs_extlen_t len); | ||
47 | |||
48 | /* | ||
49 | * Prototypes for per-ag allocation routines | ||
50 | */ | ||
51 | |||
52 | STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); | 44 | STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); |
53 | STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); | 45 | STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); |
54 | STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); | 46 | STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); |
55 | STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, | 47 | STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, |
56 | xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); | 48 | xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); |
57 | 49 | STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, | |
58 | /* | 50 | xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); |
59 | * Internal functions. | ||
60 | */ | ||
61 | 51 | ||
62 | /* | 52 | /* |
63 | * Lookup the record equal to [bno, len] in the btree given by cur. | 53 | * Lookup the record equal to [bno, len] in the btree given by cur. |
@@ -94,7 +84,7 @@ xfs_alloc_lookup_ge( | |||
94 | * Lookup the first record less than or equal to [bno, len] | 84 | * Lookup the first record less than or equal to [bno, len] |
95 | * in the btree given by cur. | 85 | * in the btree given by cur. |
96 | */ | 86 | */ |
97 | STATIC int /* error */ | 87 | int /* error */ |
98 | xfs_alloc_lookup_le( | 88 | xfs_alloc_lookup_le( |
99 | struct xfs_btree_cur *cur, /* btree cursor */ | 89 | struct xfs_btree_cur *cur, /* btree cursor */ |
100 | xfs_agblock_t bno, /* starting block of extent */ | 90 | xfs_agblock_t bno, /* starting block of extent */ |
@@ -127,7 +117,7 @@ xfs_alloc_update( | |||
127 | /* | 117 | /* |
128 | * Get the data from the pointed-to record. | 118 | * Get the data from the pointed-to record. |
129 | */ | 119 | */ |
130 | STATIC int /* error */ | 120 | int /* error */ |
131 | xfs_alloc_get_rec( | 121 | xfs_alloc_get_rec( |
132 | struct xfs_btree_cur *cur, /* btree cursor */ | 122 | struct xfs_btree_cur *cur, /* btree cursor */ |
133 | xfs_agblock_t *bno, /* output: starting block of extent */ | 123 | xfs_agblock_t *bno, /* output: starting block of extent */ |
@@ -151,27 +141,28 @@ xfs_alloc_get_rec( | |||
151 | */ | 141 | */ |
152 | STATIC void | 142 | STATIC void |
153 | xfs_alloc_compute_aligned( | 143 | xfs_alloc_compute_aligned( |
144 | xfs_alloc_arg_t *args, /* allocation argument structure */ | ||
154 | xfs_agblock_t foundbno, /* starting block in found extent */ | 145 | xfs_agblock_t foundbno, /* starting block in found extent */ |
155 | xfs_extlen_t foundlen, /* length in found extent */ | 146 | xfs_extlen_t foundlen, /* length in found extent */ |
156 | xfs_extlen_t alignment, /* alignment for allocation */ | ||
157 | xfs_extlen_t minlen, /* minimum length for allocation */ | ||
158 | xfs_agblock_t *resbno, /* result block number */ | 147 | xfs_agblock_t *resbno, /* result block number */ |
159 | xfs_extlen_t *reslen) /* result length */ | 148 | xfs_extlen_t *reslen) /* result length */ |
160 | { | 149 | { |
161 | xfs_agblock_t bno; | 150 | xfs_agblock_t bno; |
162 | xfs_extlen_t diff; | ||
163 | xfs_extlen_t len; | 151 | xfs_extlen_t len; |
164 | 152 | ||
165 | if (alignment > 1 && foundlen >= minlen) { | 153 | /* Trim busy sections out of found extent */ |
166 | bno = roundup(foundbno, alignment); | 154 | xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); |
167 | diff = bno - foundbno; | 155 | |
168 | len = diff >= foundlen ? 0 : foundlen - diff; | 156 | if (args->alignment > 1 && len >= args->minlen) { |
157 | xfs_agblock_t aligned_bno = roundup(bno, args->alignment); | ||
158 | xfs_extlen_t diff = aligned_bno - bno; | ||
159 | |||
160 | *resbno = aligned_bno; | ||
161 | *reslen = diff >= len ? 0 : len - diff; | ||
169 | } else { | 162 | } else { |
170 | bno = foundbno; | 163 | *resbno = bno; |
171 | len = foundlen; | 164 | *reslen = len; |
172 | } | 165 | } |
173 | *resbno = bno; | ||
174 | *reslen = len; | ||
175 | } | 166 | } |
176 | 167 | ||
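The new xfs_alloc_compute_aligned first trims busy blocks out of the found extent, then rounds the start up to the requested alignment and shrinks the usable length by the same amount. The rounding arithmetic in isolation, as a runnable check (standalone helper, not XFS code):

    #include <assert.h>

    /* Round bno up to a multiple of align; return the usable length. */
    static unsigned long align_extent(unsigned long bno, unsigned long len,
                                      unsigned long align,
                                      unsigned long *out_bno)
    {
            unsigned long aligned = (bno + align - 1) / align * align;
            unsigned long diff = aligned - bno;

            *out_bno = aligned;
            return diff >= len ? 0 : len - diff;
    }

    int main(void)
    {
            unsigned long bno;

            /* extent [7, 17) with alignment 4 trims to [8, 17): 9 blocks */
            assert(align_extent(7, 10, 4, &bno) == 9 && bno == 8);
            return 0;
    }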
177 | /* | 168 | /* |
@@ -285,7 +276,6 @@ xfs_alloc_fix_minleft( | |||
285 | return 1; | 276 | return 1; |
286 | agf = XFS_BUF_TO_AGF(args->agbp); | 277 | agf = XFS_BUF_TO_AGF(args->agbp); |
287 | diff = be32_to_cpu(agf->agf_freeblks) | 278 | diff = be32_to_cpu(agf->agf_freeblks) |
288 | + be32_to_cpu(agf->agf_flcount) | ||
289 | - args->len - args->minleft; | 279 | - args->len - args->minleft; |
290 | if (diff >= 0) | 280 | if (diff >= 0) |
291 | return 1; | 281 | return 1; |
@@ -468,6 +458,27 @@ xfs_alloc_read_agfl( | |||
468 | return 0; | 458 | return 0; |
469 | } | 459 | } |
470 | 460 | ||
461 | STATIC int | ||
462 | xfs_alloc_update_counters( | ||
463 | struct xfs_trans *tp, | ||
464 | struct xfs_perag *pag, | ||
465 | struct xfs_buf *agbp, | ||
466 | long len) | ||
467 | { | ||
468 | struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); | ||
469 | |||
470 | pag->pagf_freeblks += len; | ||
471 | be32_add_cpu(&agf->agf_freeblks, len); | ||
472 | |||
473 | xfs_trans_agblocks_delta(tp, len); | ||
474 | if (unlikely(be32_to_cpu(agf->agf_freeblks) > | ||
475 | be32_to_cpu(agf->agf_length))) | ||
476 | return EFSCORRUPTED; | ||
477 | |||
478 | xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); | ||
479 | return 0; | ||
480 | } | ||
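xfs_alloc_update_counters centralises the free-block accounting: it applies one signed delta to both the in-memory per-AG count and the on-disk AGF, and reports corruption if the free count ever exceeds the AG length. The invariant in toy form (hedged sketch; EIO stands in for EFSCORRUPTED):

    #include <errno.h>

    struct ag { long freeblks; long length; };

    /* Apply a signed delta and verify the basic sanity invariant. */
    static int update_counters(struct ag *ag, long len)
    {
            ag->freeblks += len;
            if (ag->freeblks < 0 || ag->freeblks > ag->length)
                    return EIO;      /* stand-in for EFSCORRUPTED */
            return 0;
    }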
481 | |||
471 | /* | 482 | /* |
472 | * Allocation group level functions. | 483 | * Allocation group level functions. |
473 | */ | 484 | */ |
@@ -509,49 +520,36 @@ xfs_alloc_ag_vextent( | |||
509 | ASSERT(0); | 520 | ASSERT(0); |
510 | /* NOTREACHED */ | 521 | /* NOTREACHED */ |
511 | } | 522 | } |
512 | if (error) | 523 | |
524 | if (error || args->agbno == NULLAGBLOCK) | ||
513 | return error; | 525 | return error; |
514 | /* | ||
515 | * If the allocation worked, need to change the agf structure | ||
516 | * (and log it), and the superblock. | ||
517 | */ | ||
518 | if (args->agbno != NULLAGBLOCK) { | ||
519 | xfs_agf_t *agf; /* allocation group freelist header */ | ||
520 | long slen = (long)args->len; | ||
521 | 526 | ||
522 | ASSERT(args->len >= args->minlen && args->len <= args->maxlen); | 527 | ASSERT(args->len >= args->minlen); |
523 | ASSERT(!(args->wasfromfl) || !args->isfl); | 528 | ASSERT(args->len <= args->maxlen); |
524 | ASSERT(args->agbno % args->alignment == 0); | 529 | ASSERT(!args->wasfromfl || !args->isfl); |
525 | if (!(args->wasfromfl)) { | 530 | ASSERT(args->agbno % args->alignment == 0); |
526 | 531 | ||
527 | agf = XFS_BUF_TO_AGF(args->agbp); | 532 | if (!args->wasfromfl) { |
528 | be32_add_cpu(&agf->agf_freeblks, -(args->len)); | 533 | error = xfs_alloc_update_counters(args->tp, args->pag, |
529 | xfs_trans_agblocks_delta(args->tp, | 534 | args->agbp, |
530 | -((long)(args->len))); | 535 | -((long)(args->len))); |
531 | args->pag->pagf_freeblks -= args->len; | 536 | if (error) |
532 | ASSERT(be32_to_cpu(agf->agf_freeblks) <= | 537 | return error; |
533 | be32_to_cpu(agf->agf_length)); | 538 | |
534 | xfs_alloc_log_agf(args->tp, args->agbp, | 539 | ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, |
535 | XFS_AGF_FREEBLKS); | 540 | args->agbno, args->len)); |
536 | /* | ||
537 | * Search the busylist for these blocks and mark the | ||
538 | * transaction as synchronous if blocks are found. This | ||
539 | * avoids the need to block due to a synchronous log | ||
540 | * force to ensure correct ordering as the synchronous | ||
541 | * transaction will guarantee that for us. | ||
542 | */ | ||
543 | if (xfs_alloc_busy_search(args->mp, args->agno, | ||
544 | args->agbno, args->len)) | ||
545 | xfs_trans_set_sync(args->tp); | ||
546 | } | ||
547 | if (!args->isfl) | ||
548 | xfs_trans_mod_sb(args->tp, | ||
549 | args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : | ||
550 | XFS_TRANS_SB_FDBLOCKS, -slen); | ||
551 | XFS_STATS_INC(xs_allocx); | ||
552 | XFS_STATS_ADD(xs_allocb, args->len); | ||
553 | } | 541 | } |
554 | return 0; | 542 | |
543 | if (!args->isfl) { | ||
544 | xfs_trans_mod_sb(args->tp, args->wasdel ? | ||
545 | XFS_TRANS_SB_RES_FDBLOCKS : | ||
546 | XFS_TRANS_SB_FDBLOCKS, | ||
547 | -((long)(args->len))); | ||
548 | } | ||
549 | |||
550 | XFS_STATS_INC(xs_allocx); | ||
551 | XFS_STATS_ADD(xs_allocb, args->len); | ||
552 | return error; | ||
555 | } | 553 | } |
556 | 554 | ||
557 | /* | 555 | /* |
@@ -566,72 +564,77 @@ xfs_alloc_ag_vextent_exact( | |||
566 | { | 564 | { |
567 | xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ | 565 | xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ |
568 | xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ | 566 | xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ |
569 | xfs_agblock_t end; /* end of allocated extent */ | ||
570 | int error; | 567 | int error; |
571 | xfs_agblock_t fbno; /* start block of found extent */ | 568 | xfs_agblock_t fbno; /* start block of found extent */ |
572 | xfs_agblock_t fend; /* end block of found extent */ | ||
573 | xfs_extlen_t flen; /* length of found extent */ | 569 | xfs_extlen_t flen; /* length of found extent */ |
570 | xfs_agblock_t tbno; /* start block of trimmed extent */ | ||
571 | xfs_extlen_t tlen; /* length of trimmed extent */ | ||
572 | xfs_agblock_t tend; /* end block of trimmed extent */ | ||
573 | xfs_agblock_t end; /* end of allocated extent */ | ||
574 | int i; /* success/failure of operation */ | 574 | int i; /* success/failure of operation */ |
575 | xfs_agblock_t maxend; /* end of maximal extent */ | ||
576 | xfs_agblock_t minend; /* end of minimal extent */ | ||
577 | xfs_extlen_t rlen; /* length of returned extent */ | 575 | xfs_extlen_t rlen; /* length of returned extent */ |
578 | 576 | ||
579 | ASSERT(args->alignment == 1); | 577 | ASSERT(args->alignment == 1); |
578 | |||
580 | /* | 579 | /* |
581 | * Allocate/initialize a cursor for the by-number freespace btree. | 580 | * Allocate/initialize a cursor for the by-number freespace btree. |
582 | */ | 581 | */ |
583 | bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, | 582 | bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, |
584 | args->agno, XFS_BTNUM_BNO); | 583 | args->agno, XFS_BTNUM_BNO); |
584 | |||
585 | /* | 585 | /* |
586 | * Lookup bno and minlen in the btree (minlen is irrelevant, really). | 586 | * Lookup bno and minlen in the btree (minlen is irrelevant, really). |
587 | * Look for the closest free block <= bno, it must contain bno | 587 | * Look for the closest free block <= bno, it must contain bno |
588 | * if any free block does. | 588 | * if any free block does. |
589 | */ | 589 | */ |
590 | if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) | 590 | error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); |
591 | if (error) | ||
591 | goto error0; | 592 | goto error0; |
592 | if (!i) { | 593 | if (!i) |
593 | /* | 594 | goto not_found; |
594 | * Didn't find it, return null. | 595 | |
595 | */ | ||
596 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); | ||
597 | args->agbno = NULLAGBLOCK; | ||
598 | return 0; | ||
599 | } | ||
600 | /* | 596 | /* |
601 | * Grab the freespace record. | 597 | * Grab the freespace record. |
602 | */ | 598 | */ |
603 | if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) | 599 | error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); |
600 | if (error) | ||
604 | goto error0; | 601 | goto error0; |
605 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 602 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
606 | ASSERT(fbno <= args->agbno); | 603 | ASSERT(fbno <= args->agbno); |
607 | minend = args->agbno + args->minlen; | 604 | |
608 | maxend = args->agbno + args->maxlen; | ||
609 | fend = fbno + flen; | ||
610 | /* | 605 | /* |
611 | * Give up if the freespace isn't long enough for the minimum request. | 606 | * Check for overlapping busy extents. |
612 | */ | 607 | */ |
613 | if (fend < minend) { | 608 | xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); |
614 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); | 609 | |
615 | args->agbno = NULLAGBLOCK; | ||
616 | return 0; | ||
617 | } | ||
618 | /* | 610 | /* |
619 | * End of extent will be smaller of the freespace end and the | 611 | * Give up if the start of the extent is busy, or the freespace isn't |
620 | * maximal requested end. | 612 | * long enough for the minimum request. |
621 | */ | 613 | */ |
622 | end = XFS_AGBLOCK_MIN(fend, maxend); | 614 | if (tbno > args->agbno) |
615 | goto not_found; | ||
616 | if (tlen < args->minlen) | ||
617 | goto not_found; | ||
618 | tend = tbno + tlen; | ||
619 | if (tend < args->agbno + args->minlen) | ||
620 | goto not_found; | ||
621 | |||
623 | /* | 622 | /* |
623 | * End of extent will be smaller of the freespace end and the | ||
624 | * maximal requested end. | ||
625 | * | ||
624 | * Fix the length according to mod and prod if given. | 626 | * Fix the length according to mod and prod if given. |
625 | */ | 627 | */ |
628 | end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen); | ||
626 | args->len = end - args->agbno; | 629 | args->len = end - args->agbno; |
627 | xfs_alloc_fix_len(args); | 630 | xfs_alloc_fix_len(args); |
628 | if (!xfs_alloc_fix_minleft(args)) { | 631 | if (!xfs_alloc_fix_minleft(args)) |
629 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); | 632 | goto not_found; |
630 | return 0; | 633 | |
631 | } | ||
632 | rlen = args->len; | 634 | rlen = args->len; |
633 | ASSERT(args->agbno + rlen <= fend); | 635 | ASSERT(args->agbno + rlen <= tend); |
634 | end = args->agbno + rlen; | 636 | end = args->agbno + rlen; |
637 | |||
635 | /* | 638 | /* |
636 | * We are allocating agbno for rlen [agbno .. end] | 639 | * We are allocating agbno for rlen [agbno .. end] |
637 | * Allocate/initialize a cursor for the by-size btree. | 640 | * Allocate/initialize a cursor for the by-size btree. |
@@ -640,16 +643,25 @@ xfs_alloc_ag_vextent_exact( | |||
640 | args->agno, XFS_BTNUM_CNT); | 643 | args->agno, XFS_BTNUM_CNT); |
641 | ASSERT(args->agbno + args->len <= | 644 | ASSERT(args->agbno + args->len <= |
642 | be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); | 645 | be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); |
643 | if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, | 646 | error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, |
644 | args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { | 647 | args->len, XFSA_FIXUP_BNO_OK); |
648 | if (error) { | ||
645 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); | 649 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); |
646 | goto error0; | 650 | goto error0; |
647 | } | 651 | } |
652 | |||
648 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); | 653 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); |
649 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | 654 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); |
650 | 655 | ||
651 | trace_xfs_alloc_exact_done(args); | ||
652 | args->wasfromfl = 0; | 656 | args->wasfromfl = 0; |
657 | trace_xfs_alloc_exact_done(args); | ||
658 | return 0; | ||
659 | |||
660 | not_found: | ||
661 | /* Didn't find it, return null. */ | ||
662 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); | ||
663 | args->agbno = NULLAGBLOCK; | ||
664 | trace_xfs_alloc_exact_notfound(args); | ||
653 | return 0; | 665 | return 0; |
654 | 666 | ||
655 | error0: | 667 | error0: |
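The rewritten exact allocator now trims the found freespace against busy extents before deciding whether the request fits. A self-contained sketch of the three not_found checks above, under the assumption that [tbno, tbno+tlen) is the trimmed unbusy range (local names only):

    #include <stdbool.h>
    #include <stdint.h>

    static bool exact_request_fits(uint32_t agbno, uint32_t minlen,
                                   uint32_t tbno, uint32_t tlen)
    {
        if (tbno > agbno)                   /* the start of the request is busy */
            return false;
        if (tlen < minlen)                  /* trimmed range too short overall */
            return false;
        if (tbno + tlen < agbno + minlen)   /* the end of the request is busy */
            return false;
        return true;
    }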
@@ -659,6 +671,94 @@ error0: | |||
659 | } | 671 | } |
660 | 672 | ||
661 | /* | 673 | /* |
674 | * Search the btree in a given direction via the search cursor and compare | ||
675 | * the records found against the good extent we've already found. | ||
676 | */ | ||
677 | STATIC int | ||
678 | xfs_alloc_find_best_extent( | ||
679 | struct xfs_alloc_arg *args, /* allocation argument structure */ | ||
680 | struct xfs_btree_cur **gcur, /* good cursor */ | ||
681 | struct xfs_btree_cur **scur, /* searching cursor */ | ||
682 | xfs_agblock_t gdiff, /* difference for search comparison */ | ||
683 | xfs_agblock_t *sbno, /* extent found by search */ | ||
684 | xfs_extlen_t *slen, /* extent length */ | ||
685 | xfs_agblock_t *sbnoa, /* aligned extent found by search */ | ||
686 | xfs_extlen_t *slena, /* aligned extent length */ | ||
687 | int dir) /* 0 = search right, 1 = search left */ | ||
688 | { | ||
689 | xfs_agblock_t new; | ||
690 | xfs_agblock_t sdiff; | ||
691 | int error; | ||
692 | int i; | ||
693 | |||
694 | /* The good extent is perfect, no need to search. */ | ||
695 | if (!gdiff) | ||
696 | goto out_use_good; | ||
697 | |||
698 | /* | ||
699 | * Look until we find a better one, run out of space or run off the end. | ||
700 | */ | ||
701 | do { | ||
702 | error = xfs_alloc_get_rec(*scur, sbno, slen, &i); | ||
703 | if (error) | ||
704 | goto error0; | ||
705 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
706 | xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); | ||
707 | |||
708 | /* | ||
709 | * The good extent is closer than this one. | ||
710 | */ | ||
711 | if (!dir) { | ||
712 | if (*sbnoa >= args->agbno + gdiff) | ||
713 | goto out_use_good; | ||
714 | } else { | ||
715 | if (*sbnoa <= args->agbno - gdiff) | ||
716 | goto out_use_good; | ||
717 | } | ||
718 | |||
719 | /* | ||
720 | * Same distance, compare length and pick the best. | ||
721 | */ | ||
722 | if (*slena >= args->minlen) { | ||
723 | args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); | ||
724 | xfs_alloc_fix_len(args); | ||
725 | |||
726 | sdiff = xfs_alloc_compute_diff(args->agbno, args->len, | ||
727 | args->alignment, *sbnoa, | ||
728 | *slena, &new); | ||
729 | |||
730 | /* | ||
731 | * Choose closer size and invalidate other cursor. | ||
732 | */ | ||
733 | if (sdiff < gdiff) | ||
734 | goto out_use_search; | ||
735 | goto out_use_good; | ||
736 | } | ||
737 | |||
738 | if (!dir) | ||
739 | error = xfs_btree_increment(*scur, 0, &i); | ||
740 | else | ||
741 | error = xfs_btree_decrement(*scur, 0, &i); | ||
742 | if (error) | ||
743 | goto error0; | ||
744 | } while (i); | ||
745 | |||
746 | out_use_good: | ||
747 | xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); | ||
748 | *scur = NULL; | ||
749 | return 0; | ||
750 | |||
751 | out_use_search: | ||
752 | xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); | ||
753 | *gcur = NULL; | ||
754 | return 0; | ||
755 | |||
756 | error0: | ||
757 | /* caller invalidates cursors */ | ||
758 | return error; | ||
759 | } | ||
760 | |||
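xfs_alloc_find_best_extent() replaces the two near-duplicate search loops that follow with a single direction-parameterized walk. A rough standalone model of its stopping rule and comparison, with hypothetical names:

    #include <stdbool.h>

    /* 1 = switch to the candidate, 0 = keep the good extent,
     * -1 = candidate too small to judge, keep walking. */
    static int compare_candidate(unsigned target, unsigned good_diff,
                                 unsigned cand_bno, unsigned cand_len,
                                 unsigned minlen, bool search_right)
    {
        /* Candidate start provably farther away than the good extent: stop. */
        if (search_right ? cand_bno >= target + good_diff
                         : cand_bno <= target - good_diff)
            return 0;
        if (cand_len >= minlen) {
            unsigned cand_diff = cand_bno > target ? cand_bno - target
                                                   : target - cand_bno;
            return cand_diff < good_diff ? 1 : 0;
        }
        return -1;
    }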
761 | /* | ||
662 | * Allocate a variable extent near bno in the allocation group agno. | 762 | * Allocate a variable extent near bno in the allocation group agno. |
663 | * Extent's length (returned in len) will be between minlen and maxlen, | 763 | * Extent's length (returned in len) will be between minlen and maxlen, |
664 | * and of the form k * prod + mod unless there's nothing that large. | 764 | * and of the form k * prod + mod unless there's nothing that large. |
@@ -687,6 +787,7 @@ xfs_alloc_ag_vextent_near( | |||
687 | xfs_extlen_t ltlena; /* aligned ... */ | 787 | xfs_extlen_t ltlena; /* aligned ... */ |
688 | xfs_agblock_t ltnew; /* useful start bno of left side */ | 788 | xfs_agblock_t ltnew; /* useful start bno of left side */ |
689 | xfs_extlen_t rlen; /* length of returned extent */ | 789 | xfs_extlen_t rlen; /* length of returned extent */ |
790 | int forced = 0; | ||
690 | #if defined(DEBUG) && defined(__KERNEL__) | 791 | #if defined(DEBUG) && defined(__KERNEL__) |
691 | /* | 792 | /* |
692 | * Randomly don't execute the first algorithm. | 793 | * Randomly don't execute the first algorithm. |
@@ -695,13 +796,20 @@ xfs_alloc_ag_vextent_near( | |||
695 | 796 | ||
696 | dofirst = random32() & 1; | 797 | dofirst = random32() & 1; |
697 | #endif | 798 | #endif |
799 | |||
800 | restart: | ||
801 | bno_cur_lt = NULL; | ||
802 | bno_cur_gt = NULL; | ||
803 | ltlen = 0; | ||
804 | gtlena = 0; | ||
805 | ltlena = 0; | ||
806 | |||
698 | /* | 807 | /* |
699 | * Get a cursor for the by-size btree. | 808 | * Get a cursor for the by-size btree. |
700 | */ | 809 | */ |
701 | cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, | 810 | cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, |
702 | args->agno, XFS_BTNUM_CNT); | 811 | args->agno, XFS_BTNUM_CNT); |
703 | ltlen = 0; | 812 | |
704 | bno_cur_lt = bno_cur_gt = NULL; | ||
705 | /* | 813 | /* |
706 | * See if there are any free extents as big as maxlen. | 814 | * See if there are any free extents as big as maxlen. |
707 | */ | 815 | */ |
@@ -717,11 +825,13 @@ xfs_alloc_ag_vextent_near( | |||
717 | goto error0; | 825 | goto error0; |
718 | if (i == 0 || ltlen == 0) { | 826 | if (i == 0 || ltlen == 0) { |
719 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | 827 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); |
828 | trace_xfs_alloc_near_noentry(args); | ||
720 | return 0; | 829 | return 0; |
721 | } | 830 | } |
722 | ASSERT(i == 1); | 831 | ASSERT(i == 1); |
723 | } | 832 | } |
724 | args->wasfromfl = 0; | 833 | args->wasfromfl = 0; |
834 | |||
725 | /* | 835 | /* |
726 | * First algorithm. | 836 | * First algorithm. |
727 | * If the requested extent is large wrt the freespaces available | 837 | * If the requested extent is large wrt the freespaces available |
@@ -775,8 +885,8 @@ xfs_alloc_ag_vextent_near( | |||
775 | if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) | 885 | if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) |
776 | goto error0; | 886 | goto error0; |
777 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 887 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
778 | xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, | 888 | xfs_alloc_compute_aligned(args, ltbno, ltlen, |
779 | args->minlen, &ltbnoa, &ltlena); | 889 | &ltbnoa, &ltlena); |
780 | if (ltlena < args->minlen) | 890 | if (ltlena < args->minlen) |
781 | continue; | 891 | continue; |
782 | args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); | 892 | args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); |
@@ -785,7 +895,7 @@ xfs_alloc_ag_vextent_near( | |||
785 | if (args->len < blen) | 895 | if (args->len < blen) |
786 | continue; | 896 | continue; |
787 | ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, | 897 | ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, |
788 | args->alignment, ltbno, ltlen, &ltnew); | 898 | args->alignment, ltbnoa, ltlena, &ltnew); |
789 | if (ltnew != NULLAGBLOCK && | 899 | if (ltnew != NULLAGBLOCK && |
790 | (args->len > blen || ltdiff < bdiff)) { | 900 | (args->len > blen || ltdiff < bdiff)) { |
791 | bdiff = ltdiff; | 901 | bdiff = ltdiff; |
@@ -896,8 +1006,8 @@ xfs_alloc_ag_vextent_near( | |||
896 | if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) | 1006 | if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) |
897 | goto error0; | 1007 | goto error0; |
898 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 1008 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
899 | xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, | 1009 | xfs_alloc_compute_aligned(args, ltbno, ltlen, |
900 | args->minlen, &ltbnoa, &ltlena); | 1010 | &ltbnoa, &ltlena); |
901 | if (ltlena >= args->minlen) | 1011 | if (ltlena >= args->minlen) |
902 | break; | 1012 | break; |
903 | if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) | 1013 | if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) |
@@ -912,8 +1022,8 @@ xfs_alloc_ag_vextent_near( | |||
912 | if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) | 1022 | if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) |
913 | goto error0; | 1023 | goto error0; |
914 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 1024 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
915 | xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, | 1025 | xfs_alloc_compute_aligned(args, gtbno, gtlen, |
916 | args->minlen, &gtbnoa, &gtlena); | 1026 | &gtbnoa, &gtlena); |
917 | if (gtlena >= args->minlen) | 1027 | if (gtlena >= args->minlen) |
918 | break; | 1028 | break; |
919 | if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) | 1029 | if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) |
@@ -925,211 +1035,62 @@ xfs_alloc_ag_vextent_near( | |||
925 | } | 1035 | } |
926 | } | 1036 | } |
927 | } while (bno_cur_lt || bno_cur_gt); | 1037 | } while (bno_cur_lt || bno_cur_gt); |
1038 | |||
928 | /* | 1039 | /* |
929 | * Got both cursors still active, need to find better entry. | 1040 | * Got both cursors still active, need to find better entry. |
930 | */ | 1041 | */ |
931 | if (bno_cur_lt && bno_cur_gt) { | 1042 | if (bno_cur_lt && bno_cur_gt) { |
932 | /* | ||
933 | * Left side is long enough, look for a right side entry. | ||
934 | */ | ||
935 | if (ltlena >= args->minlen) { | 1043 | if (ltlena >= args->minlen) { |
936 | /* | 1044 | /* |
937 | * Fix up the length. | 1045 | * Left side is good, look for a right side entry. |
938 | */ | 1046 | */ |
939 | args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); | 1047 | args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); |
940 | xfs_alloc_fix_len(args); | 1048 | xfs_alloc_fix_len(args); |
941 | rlen = args->len; | 1049 | ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, |
942 | ltdiff = xfs_alloc_compute_diff(args->agbno, rlen, | 1050 | args->alignment, ltbnoa, ltlena, &ltnew); |
943 | args->alignment, ltbno, ltlen, &ltnew); | 1051 | |
944 | /* | 1052 | error = xfs_alloc_find_best_extent(args, |
945 | * Not perfect. | 1053 | &bno_cur_lt, &bno_cur_gt, |
946 | */ | 1054 | ltdiff, &gtbno, &gtlen, |
947 | if (ltdiff) { | 1055 | &gtbnoa, &gtlena, |
948 | /* | 1056 | 0 /* search right */); |
949 | * Look until we find a better one, run out of | 1057 | } else { |
950 | * space, or run off the end. | 1058 | ASSERT(gtlena >= args->minlen); |
951 | */ | 1059 | |
952 | while (bno_cur_lt && bno_cur_gt) { | ||
953 | if ((error = xfs_alloc_get_rec( | ||
954 | bno_cur_gt, &gtbno, | ||
955 | &gtlen, &i))) | ||
956 | goto error0; | ||
957 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
958 | xfs_alloc_compute_aligned(gtbno, gtlen, | ||
959 | args->alignment, args->minlen, | ||
960 | &gtbnoa, &gtlena); | ||
961 | /* | ||
962 | * The left one is clearly better. | ||
963 | */ | ||
964 | if (gtbnoa >= args->agbno + ltdiff) { | ||
965 | xfs_btree_del_cursor( | ||
966 | bno_cur_gt, | ||
967 | XFS_BTREE_NOERROR); | ||
968 | bno_cur_gt = NULL; | ||
969 | break; | ||
970 | } | ||
971 | /* | ||
972 | * If we reach a big enough entry, | ||
973 | * compare the two and pick the best. | ||
974 | */ | ||
975 | if (gtlena >= args->minlen) { | ||
976 | args->len = | ||
977 | XFS_EXTLEN_MIN(gtlena, | ||
978 | args->maxlen); | ||
979 | xfs_alloc_fix_len(args); | ||
980 | rlen = args->len; | ||
981 | gtdiff = xfs_alloc_compute_diff( | ||
982 | args->agbno, rlen, | ||
983 | args->alignment, | ||
984 | gtbno, gtlen, &gtnew); | ||
985 | /* | ||
986 | * Right side is better. | ||
987 | */ | ||
988 | if (gtdiff < ltdiff) { | ||
989 | xfs_btree_del_cursor( | ||
990 | bno_cur_lt, | ||
991 | XFS_BTREE_NOERROR); | ||
992 | bno_cur_lt = NULL; | ||
993 | } | ||
994 | /* | ||
995 | * Left side is better. | ||
996 | */ | ||
997 | else { | ||
998 | xfs_btree_del_cursor( | ||
999 | bno_cur_gt, | ||
1000 | XFS_BTREE_NOERROR); | ||
1001 | bno_cur_gt = NULL; | ||
1002 | } | ||
1003 | break; | ||
1004 | } | ||
1005 | /* | ||
1006 | * Fell off the right end. | ||
1007 | */ | ||
1008 | if ((error = xfs_btree_increment( | ||
1009 | bno_cur_gt, 0, &i))) | ||
1010 | goto error0; | ||
1011 | if (!i) { | ||
1012 | xfs_btree_del_cursor( | ||
1013 | bno_cur_gt, | ||
1014 | XFS_BTREE_NOERROR); | ||
1015 | bno_cur_gt = NULL; | ||
1016 | break; | ||
1017 | } | ||
1018 | } | ||
1019 | } | ||
1020 | /* | ||
1021 | * The left side is perfect, trash the right side. | ||
1022 | */ | ||
1023 | else { | ||
1024 | xfs_btree_del_cursor(bno_cur_gt, | ||
1025 | XFS_BTREE_NOERROR); | ||
1026 | bno_cur_gt = NULL; | ||
1027 | } | ||
1028 | } | ||
1029 | /* | ||
1030 | * It's the right side that was found first, look left. | ||
1031 | */ | ||
1032 | else { | ||
1033 | /* | 1060 | /* |
1034 | * Fix up the length. | 1061 | * Right side is good, look for a left side entry. |
1035 | */ | 1062 | */ |
1036 | args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); | 1063 | args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); |
1037 | xfs_alloc_fix_len(args); | 1064 | xfs_alloc_fix_len(args); |
1038 | rlen = args->len; | 1065 | gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, |
1039 | gtdiff = xfs_alloc_compute_diff(args->agbno, rlen, | 1066 | args->alignment, gtbnoa, gtlena, &gtnew); |
1040 | args->alignment, gtbno, gtlen, &gtnew); | 1067 | |
1041 | /* | 1068 | error = xfs_alloc_find_best_extent(args, |
1042 | * Right side entry isn't perfect. | 1069 | &bno_cur_gt, &bno_cur_lt, |
1043 | */ | 1070 | gtdiff, &ltbno, &ltlen, |
1044 | if (gtdiff) { | 1071 | &ltbnoa, &ltlena, |
1045 | /* | 1072 | 1 /* search left */); |
1046 | * Look until we find a better one, run out of | ||
1047 | * space, or run off the end. | ||
1048 | */ | ||
1049 | while (bno_cur_lt && bno_cur_gt) { | ||
1050 | if ((error = xfs_alloc_get_rec( | ||
1051 | bno_cur_lt, &ltbno, | ||
1052 | &ltlen, &i))) | ||
1053 | goto error0; | ||
1054 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
1055 | xfs_alloc_compute_aligned(ltbno, ltlen, | ||
1056 | args->alignment, args->minlen, | ||
1057 | &ltbnoa, &ltlena); | ||
1058 | /* | ||
1059 | * The right one is clearly better. | ||
1060 | */ | ||
1061 | if (ltbnoa <= args->agbno - gtdiff) { | ||
1062 | xfs_btree_del_cursor( | ||
1063 | bno_cur_lt, | ||
1064 | XFS_BTREE_NOERROR); | ||
1065 | bno_cur_lt = NULL; | ||
1066 | break; | ||
1067 | } | ||
1068 | /* | ||
1069 | * If we reach a big enough entry, | ||
1070 | * compare the two and pick the best. | ||
1071 | */ | ||
1072 | if (ltlena >= args->minlen) { | ||
1073 | args->len = XFS_EXTLEN_MIN( | ||
1074 | ltlena, args->maxlen); | ||
1075 | xfs_alloc_fix_len(args); | ||
1076 | rlen = args->len; | ||
1077 | ltdiff = xfs_alloc_compute_diff( | ||
1078 | args->agbno, rlen, | ||
1079 | args->alignment, | ||
1080 | ltbno, ltlen, &ltnew); | ||
1081 | /* | ||
1082 | * Left side is better. | ||
1083 | */ | ||
1084 | if (ltdiff < gtdiff) { | ||
1085 | xfs_btree_del_cursor( | ||
1086 | bno_cur_gt, | ||
1087 | XFS_BTREE_NOERROR); | ||
1088 | bno_cur_gt = NULL; | ||
1089 | } | ||
1090 | /* | ||
1091 | * Right side is better. | ||
1092 | */ | ||
1093 | else { | ||
1094 | xfs_btree_del_cursor( | ||
1095 | bno_cur_lt, | ||
1096 | XFS_BTREE_NOERROR); | ||
1097 | bno_cur_lt = NULL; | ||
1098 | } | ||
1099 | break; | ||
1100 | } | ||
1101 | /* | ||
1102 | * Fell off the left end. | ||
1103 | */ | ||
1104 | if ((error = xfs_btree_decrement( | ||
1105 | bno_cur_lt, 0, &i))) | ||
1106 | goto error0; | ||
1107 | if (!i) { | ||
1108 | xfs_btree_del_cursor(bno_cur_lt, | ||
1109 | XFS_BTREE_NOERROR); | ||
1110 | bno_cur_lt = NULL; | ||
1111 | break; | ||
1112 | } | ||
1113 | } | ||
1114 | } | ||
1115 | /* | ||
1116 | * The right side is perfect, trash the left side. | ||
1117 | */ | ||
1118 | else { | ||
1119 | xfs_btree_del_cursor(bno_cur_lt, | ||
1120 | XFS_BTREE_NOERROR); | ||
1121 | bno_cur_lt = NULL; | ||
1122 | } | ||
1123 | } | 1073 | } |
1074 | |||
1075 | if (error) | ||
1076 | goto error0; | ||
1124 | } | 1077 | } |
1078 | |||
1125 | /* | 1079 | /* |
1126 | * If we couldn't get anything, give up. | 1080 | * If we couldn't get anything, give up. |
1127 | */ | 1081 | */ |
1128 | if (bno_cur_lt == NULL && bno_cur_gt == NULL) { | 1082 | if (bno_cur_lt == NULL && bno_cur_gt == NULL) { |
1083 | if (!forced++) { | ||
1084 | trace_xfs_alloc_near_busy(args); | ||
1085 | xfs_log_force(args->mp, XFS_LOG_SYNC); | ||
1086 | goto restart; | ||
1087 | } | ||
1088 | |||
1129 | trace_xfs_alloc_size_neither(args); | 1089 | trace_xfs_alloc_size_neither(args); |
1130 | args->agbno = NULLAGBLOCK; | 1090 | args->agbno = NULLAGBLOCK; |
1131 | return 0; | 1091 | return 0; |
1132 | } | 1092 | } |
1093 | |||
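The new forced flag gives the near allocator exactly one synchronous log force before reporting failure: busy extents only become allocatable once the transaction that freed them reaches the log. An illustrative reduction of that pattern, with stub helpers standing in for the real kernel calls:

    #define NULLAGBLOCK ((unsigned)-1)

    struct toy_args { unsigned agbno; };

    int  toy_try_alloc(struct toy_args *a);  /* hypothetical: one allocation pass */
    void toy_log_force_sync(void);           /* stands in for xfs_log_force(mp, XFS_LOG_SYNC) */

    int toy_alloc_with_retry(struct toy_args *a)
    {
        int forced = 0;
    restart:
        if (toy_try_alloc(a) == 0 && a->agbno != NULLAGBLOCK)
            return 0;                   /* got an extent */
        if (!forced++) {
            toy_log_force_sync();       /* unbusy extents whose free is now on disk */
            goto restart;               /* one retry only */
        }
        a->agbno = NULLAGBLOCK;         /* genuinely nothing available */
        return 0;
    }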
1133 | /* | 1094 | /* |
1134 | * At this point we have selected a freespace entry, either to the | 1095 | * At this point we have selected a freespace entry, either to the |
1135 | * left or to the right. If it's on the right, copy all the | 1096 | * left or to the right. If it's on the right, copy all the |
@@ -1146,6 +1107,7 @@ xfs_alloc_ag_vextent_near( | |||
1146 | j = 1; | 1107 | j = 1; |
1147 | } else | 1108 | } else |
1148 | j = 0; | 1109 | j = 0; |
1110 | |||
1149 | /* | 1111 | /* |
1150 | * Fix up the length and compute the useful address. | 1112 | * Fix up the length and compute the useful address. |
1151 | */ | 1113 | */ |
@@ -1158,12 +1120,13 @@ xfs_alloc_ag_vextent_near( | |||
1158 | return 0; | 1120 | return 0; |
1159 | } | 1121 | } |
1160 | rlen = args->len; | 1122 | rlen = args->len; |
1161 | (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, | 1123 | (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, |
1162 | ltlen, &ltnew); | 1124 | ltbnoa, ltlena, &ltnew); |
1163 | ASSERT(ltnew >= ltbno); | 1125 | ASSERT(ltnew >= ltbno); |
1164 | ASSERT(ltnew + rlen <= ltbno + ltlen); | 1126 | ASSERT(ltnew + rlen <= ltbnoa + ltlena); |
1165 | ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); | 1127 | ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); |
1166 | args->agbno = ltnew; | 1128 | args->agbno = ltnew; |
1129 | |||
1167 | if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, | 1130 | if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, |
1168 | ltnew, rlen, XFSA_FIXUP_BNO_OK))) | 1131 | ltnew, rlen, XFSA_FIXUP_BNO_OK))) |
1169 | goto error0; | 1132 | goto error0; |
@@ -1206,26 +1169,35 @@ xfs_alloc_ag_vextent_size( | |||
1206 | int i; /* temp status variable */ | 1169 | int i; /* temp status variable */ |
1207 | xfs_agblock_t rbno; /* returned block number */ | 1170 | xfs_agblock_t rbno; /* returned block number */ |
1208 | xfs_extlen_t rlen; /* length of returned extent */ | 1171 | xfs_extlen_t rlen; /* length of returned extent */ |
1172 | int forced = 0; | ||
1209 | 1173 | ||
1174 | restart: | ||
1210 | /* | 1175 | /* |
1211 | * Allocate and initialize a cursor for the by-size btree. | 1176 | * Allocate and initialize a cursor for the by-size btree. |
1212 | */ | 1177 | */ |
1213 | cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, | 1178 | cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, |
1214 | args->agno, XFS_BTNUM_CNT); | 1179 | args->agno, XFS_BTNUM_CNT); |
1215 | bno_cur = NULL; | 1180 | bno_cur = NULL; |
1181 | |||
1216 | /* | 1182 | /* |
1217 | * Look for an entry >= maxlen+alignment-1 blocks. | 1183 | * Look for an entry >= maxlen+alignment-1 blocks. |
1218 | */ | 1184 | */ |
1219 | if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, | 1185 | if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, |
1220 | args->maxlen + args->alignment - 1, &i))) | 1186 | args->maxlen + args->alignment - 1, &i))) |
1221 | goto error0; | 1187 | goto error0; |
1188 | |||
1222 | /* | 1189 | /* |
1223 | * If none, then pick up the last entry in the tree unless the | 1190 | * If none or we have busy extents that we cannot allocate from, then |
1224 | * tree is empty. | 1191 | * we have to settle for a smaller extent. In the case that there are |
1192 | * no large extents, this will return the last entry in the tree unless | ||
1193 | * the tree is empty. In the case that there are only busy large | ||
1194 | * extents, this will return the largest small extent unless there | ||
1195 | * are no smaller extents available. | ||
1225 | */ | 1196 | */ |
1226 | if (!i) { | 1197 | if (!i || forced > 1) { |
1227 | if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, | 1198 | error = xfs_alloc_ag_vextent_small(args, cnt_cur, |
1228 | &flen, &i))) | 1199 | &fbno, &flen, &i); |
1200 | if (error) | ||
1229 | goto error0; | 1201 | goto error0; |
1230 | if (i == 0 || flen == 0) { | 1202 | if (i == 0 || flen == 0) { |
1231 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | 1203 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); |
@@ -1233,23 +1205,56 @@ xfs_alloc_ag_vextent_size( | |||
1233 | return 0; | 1205 | return 0; |
1234 | } | 1206 | } |
1235 | ASSERT(i == 1); | 1207 | ASSERT(i == 1); |
1208 | xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); | ||
1209 | } else { | ||
1210 | /* | ||
1211 | * Search for a non-busy extent that is large enough. | ||
1212 | * If we are at low space, don't check, or if we fall off | ||
1213 | * the end of the btree, turn off the busy check and | ||
1214 | * restart. | ||
1215 | */ | ||
1216 | for (;;) { | ||
1217 | error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); | ||
1218 | if (error) | ||
1219 | goto error0; | ||
1220 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
1221 | |||
1222 | xfs_alloc_compute_aligned(args, fbno, flen, | ||
1223 | &rbno, &rlen); | ||
1224 | |||
1225 | if (rlen >= args->maxlen) | ||
1226 | break; | ||
1227 | |||
1228 | error = xfs_btree_increment(cnt_cur, 0, &i); | ||
1229 | if (error) | ||
1230 | goto error0; | ||
1231 | if (i == 0) { | ||
1232 | /* | ||
1233 | * Our only valid extents must have been busy. | ||
1234 | * Make it unbusy by forcing the log out and | ||
1235 | * retrying. If we've been here before, forcing | ||
1236 | * the log isn't making the extents available, | ||
1237 | * which means they have probably been freed in | ||
1238 | * this transaction. In that case, we have to | ||
1239 | * give up on them and we'll attempt a minlen | ||
1240 | * allocation the next time around. | ||
1241 | */ | ||
1242 | xfs_btree_del_cursor(cnt_cur, | ||
1243 | XFS_BTREE_NOERROR); | ||
1244 | trace_xfs_alloc_size_busy(args); | ||
1245 | if (!forced++) | ||
1246 | xfs_log_force(args->mp, XFS_LOG_SYNC); | ||
1247 | goto restart; | ||
1248 | } | ||
1249 | } | ||
1236 | } | 1250 | } |
1237 | /* | 1251 | |
1238 | * There's a freespace as big as maxlen+alignment-1, get it. | ||
1239 | */ | ||
1240 | else { | ||
1241 | if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) | ||
1242 | goto error0; | ||
1243 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
1244 | } | ||
1245 | /* | 1252 | /* |
1246 | * In the first case above, we got the last entry in the | 1253 | * In the first case above, we got the last entry in the |
1247 | * by-size btree. Now we check to see if the space hits maxlen | 1254 | * by-size btree. Now we check to see if the space hits maxlen |
1248 | * once aligned; if not, we search left for something better. | 1255 | * once aligned; if not, we search left for something better. |
1249 | * This can't happen in the second case above. | 1256 | * This can't happen in the second case above. |
1250 | */ | 1257 | */ |
1251 | xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, | ||
1252 | &rbno, &rlen); | ||
1253 | rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); | 1258 | rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); |
1254 | XFS_WANT_CORRUPTED_GOTO(rlen == 0 || | 1259 | XFS_WANT_CORRUPTED_GOTO(rlen == 0 || |
1255 | (rlen <= flen && rbno + rlen <= fbno + flen), error0); | 1260 | (rlen <= flen && rbno + rlen <= fbno + flen), error0); |
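Taken together, the size allocator's retry logic escalates in three steps: the first failure forces the log and restarts, and a second failure (forced > 1) routes the next restart through the small-extent fallback. A compact sketch under those assumptions, again with stub helpers:

    struct toy_args2 { unsigned agbno; };

    int  toy_find_unbusy_maxlen(struct toy_args2 *a); /* hypothetical: 0 on success */
    int  toy_alloc_small(struct toy_args2 *a);        /* hypothetical minlen fallback */
    void toy_log_force(void);                         /* stands in for xfs_log_force() */

    int toy_size_alloc(struct toy_args2 *a)
    {
        int forced = 0;
    restart:
        if (forced > 1)
            return toy_alloc_small(a);  /* give up on busy extents entirely */
        if (toy_find_unbusy_maxlen(a) == 0)
            return 0;                   /* clean maxlen extent found */
        if (!forced++)
            toy_log_force();            /* push the pending frees to disk once */
        goto restart;
    }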
@@ -1274,8 +1279,8 @@ xfs_alloc_ag_vextent_size( | |||
1274 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 1279 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
1275 | if (flen < bestrlen) | 1280 | if (flen < bestrlen) |
1276 | break; | 1281 | break; |
1277 | xfs_alloc_compute_aligned(fbno, flen, args->alignment, | 1282 | xfs_alloc_compute_aligned(args, fbno, flen, |
1278 | args->minlen, &rbno, &rlen); | 1283 | &rbno, &rlen); |
1279 | rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); | 1284 | rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); |
1280 | XFS_WANT_CORRUPTED_GOTO(rlen == 0 || | 1285 | XFS_WANT_CORRUPTED_GOTO(rlen == 0 || |
1281 | (rlen <= flen && rbno + rlen <= fbno + flen), | 1286 | (rlen <= flen && rbno + rlen <= fbno + flen), |
@@ -1303,13 +1308,19 @@ xfs_alloc_ag_vextent_size( | |||
1303 | * Fix up the length. | 1308 | * Fix up the length. |
1304 | */ | 1309 | */ |
1305 | args->len = rlen; | 1310 | args->len = rlen; |
1306 | xfs_alloc_fix_len(args); | 1311 | if (rlen < args->minlen) { |
1307 | if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { | 1312 | if (!forced++) { |
1308 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | 1313 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); |
1309 | trace_xfs_alloc_size_nominleft(args); | 1314 | trace_xfs_alloc_size_busy(args); |
1310 | args->agbno = NULLAGBLOCK; | 1315 | xfs_log_force(args->mp, XFS_LOG_SYNC); |
1311 | return 0; | 1316 | goto restart; |
1317 | } | ||
1318 | goto out_nominleft; | ||
1312 | } | 1319 | } |
1320 | xfs_alloc_fix_len(args); | ||
1321 | |||
1322 | if (!xfs_alloc_fix_minleft(args)) | ||
1323 | goto out_nominleft; | ||
1313 | rlen = args->len; | 1324 | rlen = args->len; |
1314 | XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); | 1325 | XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); |
1315 | /* | 1326 | /* |
@@ -1339,6 +1350,12 @@ error0: | |||
1339 | if (bno_cur) | 1350 | if (bno_cur) |
1340 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); | 1351 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); |
1341 | return error; | 1352 | return error; |
1353 | |||
1354 | out_nominleft: | ||
1355 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | ||
1356 | trace_xfs_alloc_size_nominleft(args); | ||
1357 | args->agbno = NULLAGBLOCK; | ||
1358 | return 0; | ||
1342 | } | 1359 | } |
1343 | 1360 | ||
1344 | /* | 1361 | /* |
@@ -1378,6 +1395,9 @@ xfs_alloc_ag_vextent_small( | |||
1378 | if (error) | 1395 | if (error) |
1379 | goto error0; | 1396 | goto error0; |
1380 | if (fbno != NULLAGBLOCK) { | 1397 | if (fbno != NULLAGBLOCK) { |
1398 | xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, | ||
1399 | args->userdata); | ||
1400 | |||
1381 | if (args->userdata) { | 1401 | if (args->userdata) { |
1382 | xfs_buf_t *bp; | 1402 | xfs_buf_t *bp; |
1383 | 1403 | ||
@@ -1453,6 +1473,7 @@ xfs_free_ag_extent( | |||
1453 | xfs_mount_t *mp; /* mount point struct for filesystem */ | 1473 | xfs_mount_t *mp; /* mount point struct for filesystem */ |
1454 | xfs_agblock_t nbno; /* new starting block of freespace */ | 1474 | xfs_agblock_t nbno; /* new starting block of freespace */ |
1455 | xfs_extlen_t nlen; /* new length of freespace */ | 1475 | xfs_extlen_t nlen; /* new length of freespace */ |
1476 | xfs_perag_t *pag; /* per allocation group data */ | ||
1456 | 1477 | ||
1457 | mp = tp->t_mountp; | 1478 | mp = tp->t_mountp; |
1458 | /* | 1479 | /* |
@@ -1651,45 +1672,23 @@ xfs_free_ag_extent( | |||
1651 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 1672 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
1652 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | 1673 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); |
1653 | cnt_cur = NULL; | 1674 | cnt_cur = NULL; |
1675 | |||
1654 | /* | 1676 | /* |
1655 | * Update the freespace totals in the ag and superblock. | 1677 | * Update the freespace totals in the ag and superblock. |
1656 | */ | 1678 | */ |
1657 | { | 1679 | pag = xfs_perag_get(mp, agno); |
1658 | xfs_agf_t *agf; | 1680 | error = xfs_alloc_update_counters(tp, pag, agbp, len); |
1659 | xfs_perag_t *pag; /* per allocation group data */ | 1681 | xfs_perag_put(pag); |
1660 | 1682 | if (error) | |
1661 | pag = xfs_perag_get(mp, agno); | 1683 | goto error0; |
1662 | pag->pagf_freeblks += len; | ||
1663 | xfs_perag_put(pag); | ||
1664 | 1684 | ||
1665 | agf = XFS_BUF_TO_AGF(agbp); | 1685 | if (!isfl) |
1666 | be32_add_cpu(&agf->agf_freeblks, len); | 1686 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); |
1667 | xfs_trans_agblocks_delta(tp, len); | 1687 | XFS_STATS_INC(xs_freex); |
1668 | XFS_WANT_CORRUPTED_GOTO( | 1688 | XFS_STATS_ADD(xs_freeb, len); |
1669 | be32_to_cpu(agf->agf_freeblks) <= | ||
1670 | be32_to_cpu(agf->agf_length), | ||
1671 | error0); | ||
1672 | xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); | ||
1673 | if (!isfl) | ||
1674 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); | ||
1675 | XFS_STATS_INC(xs_freex); | ||
1676 | XFS_STATS_ADD(xs_freeb, len); | ||
1677 | } | ||
1678 | 1689 | ||
1679 | trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); | 1690 | trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); |
1680 | 1691 | ||
1681 | /* | ||
1682 | * Since blocks move to the free list without the coordination | ||
1683 | * used in xfs_bmap_finish, we can't allow block to be available | ||
1684 | * for reallocation and non-transaction writing (user data) | ||
1685 | * until we know that the transaction that moved it to the free | ||
1686 | * list is permanently on disk. We track the blocks by declaring | ||
1687 | * these blocks as "busy"; the busy list is maintained on a per-ag | ||
1688 | * basis and each transaction records which entries should be removed | ||
1689 | * when the iclog commits to disk. If a busy block is allocated, | ||
1690 | * the iclog is pushed up to the LSN that freed the block. | ||
1691 | */ | ||
1692 | xfs_alloc_busy_insert(tp, agno, bno, len); | ||
1693 | return 0; | 1692 | return 0; |
1694 | 1693 | ||
1695 | error0: | 1694 | error0: |
@@ -1984,21 +1983,6 @@ xfs_alloc_get_freelist( | |||
1984 | xfs_alloc_log_agf(tp, agbp, logflags); | 1983 | xfs_alloc_log_agf(tp, agbp, logflags); |
1985 | *bnop = bno; | 1984 | *bnop = bno; |
1986 | 1985 | ||
1987 | /* | ||
1988 | * As blocks are freed, they are added to the per-ag busy list and | ||
1989 | * remain there until the freeing transaction is committed to disk. | ||
1990 | * Now that we have allocated blocks, this list must be searched to see | ||
1991 | * if a block is being reused. If one is, then the freeing transaction | ||
1992 | * must be pushed to disk before this transaction. | ||
1993 | * | ||
1994 | * We do this by setting the current transaction to a sync transaction | ||
1995 | * which guarantees that the freeing transaction is on disk before this | ||
1996 | * transaction. This is done instead of a synchronous log force here so | ||
1997 | * that we don't sit and wait with the AGF locked in the transaction | ||
1998 | * during the log force. | ||
1999 | */ | ||
2000 | if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) | ||
2001 | xfs_trans_set_sync(tp); | ||
2002 | return 0; | 1986 | return 0; |
2003 | } | 1987 | } |
2004 | 1988 | ||
@@ -2456,131 +2440,54 @@ xfs_free_extent( | |||
2456 | memset(&args, 0, sizeof(xfs_alloc_arg_t)); | 2440 | memset(&args, 0, sizeof(xfs_alloc_arg_t)); |
2457 | args.tp = tp; | 2441 | args.tp = tp; |
2458 | args.mp = tp->t_mountp; | 2442 | args.mp = tp->t_mountp; |
2443 | |||
2444 | /* | ||
2445 | * validate that the block number is legal - this enables us to detect | ||
2446 | * and handle a silent filesystem corruption rather than crashing. | ||
2447 | */ | ||
2459 | args.agno = XFS_FSB_TO_AGNO(args.mp, bno); | 2448 | args.agno = XFS_FSB_TO_AGNO(args.mp, bno); |
2460 | ASSERT(args.agno < args.mp->m_sb.sb_agcount); | 2449 | if (args.agno >= args.mp->m_sb.sb_agcount) |
2450 | return EFSCORRUPTED; | ||
2451 | |||
2461 | args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); | 2452 | args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); |
2453 | if (args.agbno >= args.mp->m_sb.sb_agblocks) | ||
2454 | return EFSCORRUPTED; | ||
2455 | |||
2462 | args.pag = xfs_perag_get(args.mp, args.agno); | 2456 | args.pag = xfs_perag_get(args.mp, args.agno); |
2463 | if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) | 2457 | ASSERT(args.pag); |
2458 | |||
2459 | error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); | ||
2460 | if (error) | ||
2464 | goto error0; | 2461 | goto error0; |
2465 | #ifdef DEBUG | 2462 | |
2466 | ASSERT(args.agbp != NULL); | 2463 | /* validate the extent size is legal now that we have the agf locked */ |
2467 | ASSERT((args.agbno + len) <= | 2464 | if (args.agbno + len > |
2468 | be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); | 2465 | be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { |
2469 | #endif | 2466 | error = EFSCORRUPTED; |
2467 | goto error0; | ||
2468 | } | ||
2469 | |||
2470 | error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); | 2470 | error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); |
2471 | if (!error) | ||
2472 | xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); | ||
2471 | error0: | 2473 | error0: |
2472 | xfs_perag_put(args.pag); | 2474 | xfs_perag_put(args.pag); |
2473 | return error; | 2475 | return error; |
2474 | } | 2476 | } |
2475 | 2477 | ||
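The hunk above replaces DEBUG-only ASSERTs with unconditional range checks that return EFSCORRUPTED, so a corrupt block number degrades gracefully on production kernels instead of crashing. The same checks as a standalone sketch:

    #include <stdint.h>

    #define EFSCORRUPTED 990                /* stand-in error code */

    struct toy_sb { uint32_t agcount, agblocks; };

    static int toy_validate_block(const struct toy_sb *sb,
                                  uint32_t agno, uint32_t agbno)
    {
        if (agno >= sb->agcount)            /* AG number out of range */
            return EFSCORRUPTED;
        if (agbno >= sb->agblocks)          /* block beyond the AG size */
            return EFSCORRUPTED;
        return 0;
    }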
2476 | |||
2477 | /* | ||
2478 | * AG Busy list management | ||
2479 | * The busy list contains block ranges that have been freed but whose | ||
2480 | * transactions have not yet hit disk. If any block listed in a busy | ||
2481 | * list is reused, the transaction that freed it must be forced to disk | ||
2482 | * before continuing to use the block. | ||
2483 | * | ||
2484 | * xfs_alloc_busy_insert - add to the per-ag busy list | ||
2485 | * xfs_alloc_busy_clear - remove an item from the per-ag busy list | ||
2486 | * xfs_alloc_busy_search - search for a busy extent | ||
2487 | */ | ||
2488 | |||
2489 | /* | ||
2490 | * Insert a new extent into the busy tree. | ||
2491 | * | ||
2492 | * The busy extent tree is indexed by the start block of the busy extent. | ||
2493 | * there can be multiple overlapping ranges in the busy extent tree but only | ||
2494 | * ever one entry at a given start block. The reason for this is that | ||
2495 | * multi-block extents can be freed, then smaller chunks of that extent | ||
2496 | * allocated and freed again before the first transaction commit is on disk. | ||
2497 | * If the exact same start block is freed a second time, we have to wait for | ||
2498 | * that busy extent to pass out of the tree before the new extent is inserted. | ||
2499 | * There are two main cases we have to handle here. | ||
2500 | * | ||
2501 | * The first case is a transaction that triggers a "free - allocate - free" | ||
2502 | * cycle. This can occur during btree manipulations as a btree block is freed | ||
2503 | * to the freelist, then allocated from the free list, then freed again. In | ||
2504 | * this case, the second extent free is what triggers the duplicate and as | ||
2505 | * such the transaction IDs should match. Because the extent was allocated in | ||
2506 | * this transaction, the transaction must be marked as synchronous. This is | ||
2507 | * true for all cases where the free/alloc/free occurs in the one transaction, | ||
2508 | * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. | ||
2509 | * This serves to catch violations of the second case quite effectively. | ||
2510 | * | ||
2511 | * The second case is where the free/alloc/free occur in different | ||
2512 | * transactions. In this case, the thread freeing the extent the second time | ||
2513 | * can't mark the extent busy immediately because it is already tracked in a | ||
2514 | * transaction that may be committing. When the log commit for the existing | ||
2515 | * busy extent completes, the busy extent will be removed from the tree. If we | ||
2516 | * allow the second busy insert to continue using that busy extent structure, | ||
2517 | * it can be freed before this transaction is safely in the log. Hence our | ||
2518 | * only option in this case is to force the log to remove the existing busy | ||
2519 | * extent from the list before we insert the new one with the current | ||
2520 | * transaction ID. | ||
2521 | * | ||
2522 | * The problem we are trying to avoid in the free-alloc-free in separate | ||
2523 | * transactions is most easily described with a timeline: | ||
2524 | * | ||
2525 | * Thread 1        Thread 2        Thread 3        xfslogd | ||
2526 | *   xact alloc | ||
2527 | *   free X | ||
2528 | *   mark busy | ||
2529 | *   commit xact | ||
2530 | *   free xact | ||
2531 | *                  xact alloc | ||
2532 | *                  alloc X | ||
2533 | *                  busy search | ||
2534 | *                  mark xact sync | ||
2535 | *                  commit xact | ||
2536 | *                  free xact | ||
2537 | *                  force log | ||
2538 | *                  checkpoint starts | ||
2539 | *                  .... | ||
2540 | *                                  xact alloc | ||
2541 | *                                  free X | ||
2542 | *                                  mark busy | ||
2543 | *                                  finds match | ||
2544 | *                                  *** KABOOM! *** | ||
2545 | *                                  .... | ||
2546 | *                                                  log IO completes | ||
2547 | *                                                  unbusy X | ||
2548 | *                  checkpoint completes | ||
2549 | * | ||
2550 | * By issuing a log force in thread 3 @ "KABOOM", the thread will block until | ||
2551 | * the checkpoint completes, and the busy extent it matched will have been | ||
2552 | * removed from the tree when it is woken. Hence it can then continue safely. | ||
2553 | * | ||
2554 | * However, to ensure this matching process is robust, we need to use the | ||
2555 | * transaction ID for identifying the transaction, as delayed logging results in | ||
2556 | * the busy extent and transaction lifecycles being different. i.e. the busy | ||
2557 | * extent is active for a lot longer than the transaction. Hence the | ||
2558 | * transaction structure can be freed and reallocated, and can then mark the same | ||
2559 | * extent busy again in the new transaction. In this case the new transaction | ||
2560 | * will have a different tid but can have the same address, and hence we need | ||
2561 | * to check against the tid. | ||
2562 | * | ||
2563 | * Future: for delayed logging, we could avoid the log force if the extent was | ||
2564 | * first freed in the current checkpoint sequence. This, however, requires the | ||
2565 | * ability to pin the current checkpoint in memory until this transaction | ||
2566 | * commits to ensure that both the original free and the current one combine | ||
2567 | * logically into the one checkpoint. If the checkpoint sequences are | ||
2568 | * different, however, we still need to wait on a log force. | ||
2569 | */ | ||
2570 | void | 2478 | void |
2571 | xfs_alloc_busy_insert( | 2479 | xfs_alloc_busy_insert( |
2572 | struct xfs_trans *tp, | 2480 | struct xfs_trans *tp, |
2573 | xfs_agnumber_t agno, | 2481 | xfs_agnumber_t agno, |
2574 | xfs_agblock_t bno, | 2482 | xfs_agblock_t bno, |
2575 | xfs_extlen_t len) | 2483 | xfs_extlen_t len, |
2484 | unsigned int flags) | ||
2576 | { | 2485 | { |
2577 | struct xfs_busy_extent *new; | 2486 | struct xfs_busy_extent *new; |
2578 | struct xfs_busy_extent *busyp; | 2487 | struct xfs_busy_extent *busyp; |
2579 | struct xfs_perag *pag; | 2488 | struct xfs_perag *pag; |
2580 | struct rb_node **rbp; | 2489 | struct rb_node **rbp; |
2581 | struct rb_node *parent; | 2490 | struct rb_node *parent = NULL; |
2582 | int match; | ||
2583 | |||
2584 | 2491 | ||
2585 | new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); | 2492 | new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); |
2586 | if (!new) { | 2493 | if (!new) { |
@@ -2589,7 +2496,7 @@ xfs_alloc_busy_insert( | |||
2589 | * block, make this a synchronous transaction to ensure that | 2496 | * block, make this a synchronous transaction to ensure that |
2590 | * the block is not reused before this transaction commits. | 2497 | * the block is not reused before this transaction commits. |
2591 | */ | 2498 | */ |
2592 | trace_xfs_alloc_busy(tp, agno, bno, len, 1); | 2499 | trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); |
2593 | xfs_trans_set_sync(tp); | 2500 | xfs_trans_set_sync(tp); |
2594 | return; | 2501 | return; |
2595 | } | 2502 | } |
@@ -2597,66 +2504,29 @@ xfs_alloc_busy_insert( | |||
2597 | new->agno = agno; | 2504 | new->agno = agno; |
2598 | new->bno = bno; | 2505 | new->bno = bno; |
2599 | new->length = len; | 2506 | new->length = len; |
2600 | new->tid = xfs_log_get_trans_ident(tp); | ||
2601 | |||
2602 | INIT_LIST_HEAD(&new->list); | 2507 | INIT_LIST_HEAD(&new->list); |
2508 | new->flags = flags; | ||
2603 | 2509 | ||
2604 | /* trace before insert to be able to see failed inserts */ | 2510 | /* trace before insert to be able to see failed inserts */ |
2605 | trace_xfs_alloc_busy(tp, agno, bno, len, 0); | 2511 | trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); |
2606 | 2512 | ||
2607 | pag = xfs_perag_get(tp->t_mountp, new->agno); | 2513 | pag = xfs_perag_get(tp->t_mountp, new->agno); |
2608 | restart: | ||
2609 | spin_lock(&pag->pagb_lock); | 2514 | spin_lock(&pag->pagb_lock); |
2610 | rbp = &pag->pagb_tree.rb_node; | 2515 | rbp = &pag->pagb_tree.rb_node; |
2611 | parent = NULL; | 2516 | while (*rbp) { |
2612 | busyp = NULL; | ||
2613 | match = 0; | ||
2614 | while (*rbp && match >= 0) { | ||
2615 | parent = *rbp; | 2517 | parent = *rbp; |
2616 | busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); | 2518 | busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); |
2617 | 2519 | ||
2618 | if (new->bno < busyp->bno) { | 2520 | if (new->bno < busyp->bno) { |
2619 | /* may overlap, but exact start block is lower */ | ||
2620 | rbp = &(*rbp)->rb_left; | 2521 | rbp = &(*rbp)->rb_left; |
2621 | if (new->bno + new->length > busyp->bno) | 2522 | ASSERT(new->bno + new->length <= busyp->bno); |
2622 | match = busyp->tid == new->tid ? 1 : -1; | ||
2623 | } else if (new->bno > busyp->bno) { | 2523 | } else if (new->bno > busyp->bno) { |
2624 | /* may overlap, but exact start block is higher */ | ||
2625 | rbp = &(*rbp)->rb_right; | 2524 | rbp = &(*rbp)->rb_right; |
2626 | if (bno < busyp->bno + busyp->length) | 2525 | ASSERT(bno >= busyp->bno + busyp->length); |
2627 | match = busyp->tid == new->tid ? 1 : -1; | ||
2628 | } else { | 2526 | } else { |
2629 | match = busyp->tid == new->tid ? 1 : -1; | 2527 | ASSERT(0); |
2630 | break; | ||
2631 | } | 2528 | } |
2632 | } | 2529 | } |
2633 | if (match < 0) { | ||
2634 | /* overlap marked busy in different transaction */ | ||
2635 | spin_unlock(&pag->pagb_lock); | ||
2636 | xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); | ||
2637 | goto restart; | ||
2638 | } | ||
2639 | if (match > 0) { | ||
2640 | /* | ||
2641 | * overlap marked busy in same transaction. Update if exact | ||
2642 | * start block match, otherwise combine the busy extents into | ||
2643 | * a single range. | ||
2644 | */ | ||
2645 | if (busyp->bno == new->bno) { | ||
2646 | busyp->length = max(busyp->length, new->length); | ||
2647 | spin_unlock(&pag->pagb_lock); | ||
2648 | ASSERT(tp->t_flags & XFS_TRANS_SYNC); | ||
2649 | xfs_perag_put(pag); | ||
2650 | kmem_free(new); | ||
2651 | return; | ||
2652 | } | ||
2653 | rb_erase(&busyp->rb_node, &pag->pagb_tree); | ||
2654 | new->length = max(busyp->bno + busyp->length, | ||
2655 | new->bno + new->length) - | ||
2656 | min(busyp->bno, new->bno); | ||
2657 | new->bno = min(busyp->bno, new->bno); | ||
2658 | } else | ||
2659 | busyp = NULL; | ||
2660 | 2530 | ||
2661 | rb_link_node(&new->rb_node, parent, rbp); | 2531 | rb_link_node(&new->rb_node, parent, rbp); |
2662 | rb_insert_color(&new->rb_node, &pag->pagb_tree); | 2532 | rb_insert_color(&new->rb_node, &pag->pagb_tree); |
@@ -2664,7 +2534,6 @@ restart: | |||
2664 | list_add(&new->list, &tp->t_busy); | 2534 | list_add(&new->list, &tp->t_busy); |
2665 | spin_unlock(&pag->pagb_lock); | 2535 | spin_unlock(&pag->pagb_lock); |
2666 | xfs_perag_put(pag); | 2536 | xfs_perag_put(pag); |
2667 | kmem_free(busyp); | ||
2668 | } | 2537 | } |
2669 | 2538 | ||
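Because allocation now trims around busy ranges (xfs_alloc_busy_trim/reuse below), overlapping busy extents can no longer be inserted, and the insert reduces to a plain ordered-tree descent with ASSERTs replacing the old overlap handling. The shape of that walk on a toy binary tree, links assumed NULL-initialized by the caller:

    struct toy_node {
        unsigned bno, len;
        struct toy_node *left, *right;
    };

    static void toy_busy_insert_node(struct toy_node **rbp, struct toy_node *new)
    {
        while (*rbp) {
            struct toy_node *busyp = *rbp;

            if (new->bno < busyp->bno)
                rbp = &busyp->left;     /* new must end before busyp starts */
            else
                rbp = &busyp->right;    /* new must start after busyp ends;
                                         * equal starts cannot occur */
        }
        *rbp = new;                     /* rb_link_node()/rb_insert_color() in the kernel */
    }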
2670 | /* | 2539 | /* |
@@ -2676,7 +2545,7 @@ restart: | |||
2676 | * will require a synchronous transaction, but it can still be | 2545 | * will require a synchronous transaction, but it can still be |
2677 | * used to distinguish between a partial or exact match. | 2546 | * used to distinguish between a partial or exact match. |
2678 | */ | 2547 | */ |
2679 | static int | 2548 | int |
2680 | xfs_alloc_busy_search( | 2549 | xfs_alloc_busy_search( |
2681 | struct xfs_mount *mp, | 2550 | struct xfs_mount *mp, |
2682 | xfs_agnumber_t agno, | 2551 | xfs_agnumber_t agno, |
@@ -2713,31 +2582,466 @@ xfs_alloc_busy_search( | |||
2713 | } | 2582 | } |
2714 | } | 2583 | } |
2715 | spin_unlock(&pag->pagb_lock); | 2584 | spin_unlock(&pag->pagb_lock); |
2716 | trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); | ||
2717 | xfs_perag_put(pag); | 2585 | xfs_perag_put(pag); |
2718 | return match; | 2586 | return match; |
2719 | } | 2587 | } |
2720 | 2588 | ||
2589 | /* | ||
2590 | * The found free extent [fbno, fend] overlaps part or all of the given busy | ||
2591 | * extent. If the overlap covers the beginning, the end, or all of the busy | ||
2592 | * extent, the overlapping portion can be made unbusy and used for the | ||
2593 | * allocation. We can't split a busy extent because we can't modify a | ||
2594 | * transaction/CIL context busy list, but we can update an entry's block | ||
2595 | * number or length. | ||
2596 | * | ||
2597 | * Returns true if the extent can safely be reused, or false if the search | ||
2598 | * needs to be restarted. | ||
2599 | */ | ||
2600 | STATIC bool | ||
2601 | xfs_alloc_busy_update_extent( | ||
2602 | struct xfs_mount *mp, | ||
2603 | struct xfs_perag *pag, | ||
2604 | struct xfs_busy_extent *busyp, | ||
2605 | xfs_agblock_t fbno, | ||
2606 | xfs_extlen_t flen, | ||
2607 | bool userdata) | ||
2608 | { | ||
2609 | xfs_agblock_t fend = fbno + flen; | ||
2610 | xfs_agblock_t bbno = busyp->bno; | ||
2611 | xfs_agblock_t bend = bbno + busyp->length; | ||
2612 | |||
2613 | /* | ||
2614 | * This extent is currently being discarded. Give the thread | ||
2615 | * performing the discard a chance to mark the extent unbusy | ||
2616 | * and retry. | ||
2617 | */ | ||
2618 | if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { | ||
2619 | spin_unlock(&pag->pagb_lock); | ||
2620 | delay(1); | ||
2621 | spin_lock(&pag->pagb_lock); | ||
2622 | return false; | ||
2623 | } | ||
2624 | |||
2625 | /* | ||
2626 | * If there is a busy extent overlapping a user allocation, we have | ||
2627 | * no choice but to force the log and retry the search. | ||
2628 | * | ||
2629 | * Fortunately this does not happen during normal operation, but | ||
2630 | * only if the filesystem is very low on space and has to dip into | ||
2631 | * the AGFL for normal allocations. | ||
2632 | */ | ||
2633 | if (userdata) | ||
2634 | goto out_force_log; | ||
2635 | |||
2636 | if (bbno < fbno && bend > fend) { | ||
2637 | /* | ||
2638 | * Case 1: | ||
2639 | *    bbno           bend | ||
2640 | *    +BBBBBBBBBBBBBBBBB+ | ||
2641 | *        +---------+ | ||
2642 | *        fbno   fend | ||
2643 | */ | ||
2644 | |||
2645 | /* | ||
2646 | * We would have to split the busy extent to be able to track | ||
2647 | * it correctly, which we cannot do because we would have to | ||
2648 | * modify the list of busy extents attached to the transaction | ||
2649 | * or CIL context, which is immutable. | ||
2650 | * | ||
2651 | * Force out the log to clear the busy extent and retry the | ||
2652 | * search. | ||
2653 | */ | ||
2654 | goto out_force_log; | ||
2655 | } else if (bbno >= fbno && bend <= fend) { | ||
2656 | /* | ||
2657 | * Case 2: | ||
2658 | *    bbno           bend | ||
2659 | *    +BBBBBBBBBBBBBBBBB+ | ||
2660 | *    +-----------------+ | ||
2661 | *    fbno           fend | ||
2662 | * | ||
2663 | * Case 3: | ||
2664 | *    bbno           bend | ||
2665 | *    +BBBBBBBBBBBBBBBBB+ | ||
2666 | * +--------------------------+ | ||
2667 | * fbno                    fend | ||
2668 | * | ||
2669 | * Case 4: | ||
2670 | *    bbno           bend | ||
2671 | *    +BBBBBBBBBBBBBBBBB+ | ||
2672 | *    +--------------------------+ | ||
2673 | *    fbno                    fend | ||
2674 | * | ||
2675 | * Case 5: | ||
2676 | *    bbno           bend | ||
2677 | *    +BBBBBBBBBBBBBBBBB+ | ||
2678 | * +-----------------------------------+ | ||
2679 | * fbno                             fend | ||
2680 | * | ||
2681 | */ | ||
2682 | |||
2683 | /* | ||
2684 | * The busy extent is fully covered by the extent we are | ||
2685 | * allocating, and can simply be removed from the rbtree. | ||
2686 | * However we cannot remove it from the immutable list | ||
2687 | * tracking busy extents in the transaction or CIL context, | ||
2688 | * so set the length to zero to mark it invalid. | ||
2689 | * | ||
2690 | * We also need to restart the busy extent search from the | ||
2691 | * tree root, because erasing the node can rearrange the | ||
2692 | * tree topology. | ||
2693 | */ | ||
2694 | rb_erase(&busyp->rb_node, &pag->pagb_tree); | ||
2695 | busyp->length = 0; | ||
2696 | return false; | ||
2697 | } else if (fend < bend) { | ||
2698 | /* | ||
2699 | * Case 6: | ||
2700 | *              bbno           bend | ||
2701 | *              +BBBBBBBBBBBBBBBBB+ | ||
2702 | *              +---------+ | ||
2703 | *              fbno   fend | ||
2704 | * | ||
2705 | * Case 7: | ||
2706 | *             bbno           bend | ||
2707 | *             +BBBBBBBBBBBBBBBBB+ | ||
2708 | *    +------------------+ | ||
2709 | *    fbno            fend | ||
2710 | * | ||
2711 | */ | ||
2712 | busyp->bno = fend; | ||
2713 | } else if (bbno < fbno) { | ||
2714 | /* | ||
2715 | * Case 8: | ||
2716 | *    bbno           bend | ||
2717 | *    +BBBBBBBBBBBBBBBBB+ | ||
2718 | *        +-------------+ | ||
2719 | *        fbno       fend | ||
2720 | * | ||
2721 | * Case 9: | ||
2722 | *    bbno           bend | ||
2723 | *    +BBBBBBBBBBBBBBBBB+ | ||
2724 | *        +----------------------+ | ||
2725 | *        fbno                fend | ||
2726 | */ | ||
2727 | busyp->length = fbno - busyp->bno; | ||
2728 | } else { | ||
2729 | ASSERT(0); | ||
2730 | } | ||
2731 | |||
2732 | trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); | ||
2733 | return true; | ||
2734 | |||
2735 | out_force_log: | ||
2736 | spin_unlock(&pag->pagb_lock); | ||
2737 | xfs_log_force(mp, XFS_LOG_SYNC); | ||
2738 | trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); | ||
2739 | spin_lock(&pag->pagb_lock); | ||
2740 | return false; | ||
2741 | } | ||
2742 | |||
2743 | |||
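
The nine diagrams in xfs_alloc_busy_update_extent() collapse into four outcomes driven purely by endpoint comparisons. A minimal userspace C sketch of that classification (names and types are illustrative, not the kernel API; the userdata early exit is omitted):

#include <stdio.h>

enum busy_action {
	BUSY_FORCE_LOG,		/* case 1: a split would be needed */
	BUSY_REMOVE,		/* cases 2-5: busy extent fully covered */
	BUSY_TRIM_FRONT,	/* cases 6-7: busy extent start moves to fend */
	BUSY_TRIM_TAIL,		/* cases 8-9: busy length becomes fbno - bbno */
};

/* [fbno, fend) is being allocated, [bbno, bend) is busy; they overlap. */
static enum busy_action
classify(unsigned fbno, unsigned fend, unsigned bbno, unsigned bend)
{
	if (bbno < fbno && bend > fend)
		return BUSY_FORCE_LOG;
	if (bbno >= fbno && bend <= fend)
		return BUSY_REMOVE;
	if (fend < bend)
		return BUSY_TRIM_FRONT;
	return BUSY_TRIM_TAIL;
}

int main(void)
{
	printf("%d\n", classify(10, 20, 5, 30));	/* 0: force log */
	printf("%d\n", classify(5, 30, 10, 20));	/* 1: remove */
	return 0;
}
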
2744 | /* | ||
2745 | * For a given extent [fbno, flen], make sure we can reuse it safely. | ||
2746 | */ | ||
2721 | void | 2747 | void |
2722 | xfs_alloc_busy_clear( | 2748 | xfs_alloc_busy_reuse( |
2723 | struct xfs_mount *mp, | 2749 | struct xfs_mount *mp, |
2724 | struct xfs_busy_extent *busyp) | 2750 | xfs_agnumber_t agno, |
2751 | xfs_agblock_t fbno, | ||
2752 | xfs_extlen_t flen, | ||
2753 | bool userdata) | ||
2725 | { | 2754 | { |
2726 | struct xfs_perag *pag; | 2755 | struct xfs_perag *pag; |
2756 | struct rb_node *rbp; | ||
2727 | 2757 | ||
2728 | trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, | 2758 | ASSERT(flen > 0); |
2729 | busyp->length); | ||
2730 | 2759 | ||
2731 | ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, | 2760 | pag = xfs_perag_get(mp, agno); |
2732 | busyp->length) == 1); | 2761 | spin_lock(&pag->pagb_lock); |
2762 | restart: | ||
2763 | rbp = pag->pagb_tree.rb_node; | ||
2764 | while (rbp) { | ||
2765 | struct xfs_busy_extent *busyp = | ||
2766 | rb_entry(rbp, struct xfs_busy_extent, rb_node); | ||
2767 | xfs_agblock_t bbno = busyp->bno; | ||
2768 | xfs_agblock_t bend = bbno + busyp->length; | ||
2733 | 2769 | ||
2734 | list_del_init(&busyp->list); | 2770 | if (fbno + flen <= bbno) { |
2771 | rbp = rbp->rb_left; | ||
2772 | continue; | ||
2773 | } else if (fbno >= bend) { | ||
2774 | rbp = rbp->rb_right; | ||
2775 | continue; | ||
2776 | } | ||
2735 | 2777 | ||
2736 | pag = xfs_perag_get(mp, busyp->agno); | 2778 | if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, |
2737 | spin_lock(&pag->pagb_lock); | 2779 | userdata)) |
2738 | rb_erase(&busyp->rb_node, &pag->pagb_tree); | 2780 | goto restart; |
2781 | } | ||
2739 | spin_unlock(&pag->pagb_lock); | 2782 | spin_unlock(&pag->pagb_lock); |
2740 | xfs_perag_put(pag); | 2783 | xfs_perag_put(pag); |
2784 | } | ||
2785 | |||
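
Because busy extents never overlap each other, the tree walk in xfs_alloc_busy_reuse() can pick a subtree from a single range comparison. A userspace sketch with a plain binary search tree standing in for the kernel rbtree (all names illustrative):

#include <stdio.h>
#include <stddef.h>

struct node {
	unsigned bno, len;
	struct node *left, *right;
};

static struct node *
find_overlap(struct node *n, unsigned fbno, unsigned flen)
{
	while (n) {
		if (fbno + flen <= n->bno)
			n = n->left;	/* query entirely below this node */
		else if (fbno >= n->bno + n->len)
			n = n->right;	/* query entirely above this node */
		else
			return n;	/* the ranges intersect */
	}
	return NULL;
}

int main(void)
{
	struct node busy = { 100, 50, NULL, NULL };

	printf("%s\n", find_overlap(&busy, 120, 10) ? "busy" : "free");
	printf("%s\n", find_overlap(&busy, 10, 20) ? "busy" : "free");
	return 0;
}
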
2786 | /* | ||
2787 | * For a given extent [fbno, flen], search the busy extent list to find a | ||
2788 | * subset of the extent that is not busy. If *rlen is smaller than | ||
2789 | * args->minlen no suitable extent could be found, and the higher level | ||
2790 | * code needs to force out the log and retry the allocation. | ||
2791 | */ | ||
2792 | STATIC void | ||
2793 | xfs_alloc_busy_trim( | ||
2794 | struct xfs_alloc_arg *args, | ||
2795 | xfs_agblock_t bno, | ||
2796 | xfs_extlen_t len, | ||
2797 | xfs_agblock_t *rbno, | ||
2798 | xfs_extlen_t *rlen) | ||
2799 | { | ||
2800 | xfs_agblock_t fbno; | ||
2801 | xfs_extlen_t flen; | ||
2802 | struct rb_node *rbp; | ||
2803 | |||
2804 | ASSERT(len > 0); | ||
2805 | |||
2806 | spin_lock(&args->pag->pagb_lock); | ||
2807 | restart: | ||
2808 | fbno = bno; | ||
2809 | flen = len; | ||
2810 | rbp = args->pag->pagb_tree.rb_node; | ||
2811 | while (rbp && flen >= args->minlen) { | ||
2812 | struct xfs_busy_extent *busyp = | ||
2813 | rb_entry(rbp, struct xfs_busy_extent, rb_node); | ||
2814 | xfs_agblock_t fend = fbno + flen; | ||
2815 | xfs_agblock_t bbno = busyp->bno; | ||
2816 | xfs_agblock_t bend = bbno + busyp->length; | ||
2817 | |||
2818 | if (fend <= bbno) { | ||
2819 | rbp = rbp->rb_left; | ||
2820 | continue; | ||
2821 | } else if (fbno >= bend) { | ||
2822 | rbp = rbp->rb_right; | ||
2823 | continue; | ||
2824 | } | ||
2825 | |||
2826 | /* | ||
2827 | * If this is a metadata allocation, try to reuse the busy | ||
2828 | * extent instead of trimming the allocation. | ||
2829 | */ | ||
2830 | if (!args->userdata && | ||
2831 | !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { | ||
2832 | if (!xfs_alloc_busy_update_extent(args->mp, args->pag, | ||
2833 | busyp, fbno, flen, | ||
2834 | false)) | ||
2835 | goto restart; | ||
2836 | continue; | ||
2837 | } | ||
2838 | |||
2839 | if (bbno <= fbno) { | ||
2840 | /* start overlap */ | ||
2741 | 2841 | ||
2842 | /* | ||
2843 | * Case 1: | ||
2844 | * bbno bend | ||
2845 | * +BBBBBBBBBBBBBBBBB+ | ||
2846 | * +---------+ | ||
2847 | * fbno fend | ||
2848 | * | ||
2849 | * Case 2: | ||
2850 | * bbno bend | ||
2851 | * +BBBBBBBBBBBBBBBBB+ | ||
2852 | * +-------------+ | ||
2853 | * fbno fend | ||
2854 | * | ||
2855 | * Case 3: | ||
2856 | * bbno bend | ||
2857 | * +BBBBBBBBBBBBBBBBB+ | ||
2858 | * +-------------+ | ||
2859 | * fbno fend | ||
2860 | * | ||
2861 | * Case 4: | ||
2862 | * bbno bend | ||
2863 | * +BBBBBBBBBBBBBBBBB+ | ||
2864 | * +-----------------+ | ||
2865 | * fbno fend | ||
2866 | * | ||
2867 | * No unbusy region in extent, return failure. | ||
2868 | */ | ||
2869 | if (fend <= bend) | ||
2870 | goto fail; | ||
2871 | |||
2872 | /* | ||
2873 | * Case 5: | ||
2874 | * bbno bend | ||
2875 | * +BBBBBBBBBBBBBBBBB+ | ||
2876 | * +----------------------+ | ||
2877 | * fbno fend | ||
2878 | * | ||
2879 | * Case 6: | ||
2880 | * bbno bend | ||
2881 | * +BBBBBBBBBBBBBBBBB+ | ||
2882 | * +--------------------------+ | ||
2883 | * fbno fend | ||
2884 | * | ||
2885 | * Needs to be trimmed to: | ||
2886 | * +-------+ | ||
2887 | * fbno fend | ||
2888 | */ | ||
2889 | fbno = bend; | ||
2890 | } else if (bend >= fend) { | ||
2891 | /* end overlap */ | ||
2892 | |||
2893 | /* | ||
2894 | * Case 7: | ||
2895 | * bbno bend | ||
2896 | * +BBBBBBBBBBBBBBBBB+ | ||
2897 | * +------------------+ | ||
2898 | * fbno fend | ||
2899 | * | ||
2900 | * Case 8: | ||
2901 | * bbno bend | ||
2902 | * +BBBBBBBBBBBBBBBBB+ | ||
2903 | * +--------------------------+ | ||
2904 | * fbno fend | ||
2905 | * | ||
2906 | * Needs to be trimmed to: | ||
2907 | * +-------+ | ||
2908 | * fbno fend | ||
2909 | */ | ||
2910 | fend = bbno; | ||
2911 | } else { | ||
2912 | /* middle overlap */ | ||
2913 | |||
2914 | /* | ||
2915 | * Case 9: | ||
2916 | * bbno bend | ||
2917 | * +BBBBBBBBBBBBBBBBB+ | ||
2918 | * +-----------------------------------+ | ||
2919 | * fbno fend | ||
2920 | * | ||
2921 | * Can be trimmed to: | ||
2922 | * +-------+ OR +-------+ | ||
2923 | * fbno fend fbno fend | ||
2924 | * | ||
2925 | * Backward allocation leads to significant | ||
2926 | * fragmentation of directories, which degrades | ||
2927 | * directory performance; we therefore always want to	| ||
2928 | * choose the option that produces forward allocation | ||
2929 | * patterns. | ||
2930 | * Preferring the lower bno extent will make the next | ||
2931 | * request use "fend" as the start of the next | ||
2932 | * allocation; if the segment is no longer busy at | ||
2933 | * that point, we'll get a contiguous allocation, but | ||
2934 | * even if it is still busy, we will get a forward | ||
2935 | * allocation. | ||
2936 | * We try to avoid choosing the segment at "bend", | ||
2937 | * because that can lead to the next allocation | ||
2938 | * taking the segment at "fbno", which would be a | ||
2939 | * backward allocation. We only use the segment at | ||
2940 | * "fbno" if it is much larger than the current | ||
2941 | * requested size, because in that case there's a | ||
2942 | * good chance subsequent allocations will be | ||
2943 | * contiguous. | ||
2944 | */ | ||
2945 | if (bbno - fbno >= args->maxlen) { | ||
2946 | /* left candidate fits perfect */ | ||
2947 | fend = bbno; | ||
2948 | } else if (fend - bend >= args->maxlen * 4) { | ||
2949 | /* right candidate has enough free space */ | ||
2950 | fbno = bend; | ||
2951 | } else if (bbno - fbno >= args->minlen) { | ||
2952 | /* left candidate fits minimum requirement */ | ||
2953 | fend = bbno; | ||
2954 | } else { | ||
2955 | goto fail; | ||
2956 | } | ||
2957 | } | ||
2958 | |||
2959 | flen = fend - fbno; | ||
2960 | } | ||
2961 | spin_unlock(&args->pag->pagb_lock); | ||
2962 | |||
2963 | if (fbno != bno || flen != len) { | ||
2964 | trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, | ||
2965 | fbno, flen); | ||
2966 | } | ||
2967 | *rbno = fbno; | ||
2968 | *rlen = flen; | ||
2969 | return; | ||
2970 | fail: | ||
2971 | /* | ||
2972 | * Return a zero extent length as a failure indication. All callers	| ||
2973 | * re-check if the trimmed extent satisfies the minlen requirement. | ||
2974 | */ | ||
2975 | spin_unlock(&args->pag->pagb_lock); | ||
2976 | trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); | ||
2977 | *rbno = fbno; | ||
2978 | *rlen = 0; | ||
2979 | } | ||
2980 | |||
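
The middle-overlap heuristic above (case 9) splits the free extent [fbno, fend) around the busy extent [bbno, bend) into a left candidate [fbno, bbno) and a right candidate [bend, fend), preferring forward allocation. A userspace sketch of the selection (names and the simplified return convention are illustrative only):

#include <stdio.h>

/* Returns 1 and sets *rbno/*rlen on success, 0 when neither side fits. */
static int
pick_candidate(unsigned fbno, unsigned fend, unsigned bbno, unsigned bend,
	       unsigned minlen, unsigned maxlen,
	       unsigned *rbno, unsigned *rlen)
{
	if (bbno - fbno >= maxlen) {
		*rbno = fbno;			/* left side fits the request */
		*rlen = bbno - fbno;
	} else if (fend - bend >= maxlen * 4) {
		*rbno = bend;			/* right side is much larger */
		*rlen = fend - bend;
	} else if (bbno - fbno >= minlen) {
		*rbno = fbno;			/* left side meets minlen */
		*rlen = bbno - fbno;
	} else {
		return 0;			/* force the log and retry */
	}
	return 1;
}

int main(void)
{
	unsigned rbno, rlen;

	if (pick_candidate(0, 100, 10, 90, 4, 16, &rbno, &rlen))
		printf("use [%u, %u)\n", rbno, rbno + rlen);
	return 0;
}
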
2981 | static void | ||
2982 | xfs_alloc_busy_clear_one( | ||
2983 | struct xfs_mount *mp, | ||
2984 | struct xfs_perag *pag, | ||
2985 | struct xfs_busy_extent *busyp) | ||
2986 | { | ||
2987 | if (busyp->length) { | ||
2988 | trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, | ||
2989 | busyp->length); | ||
2990 | rb_erase(&busyp->rb_node, &pag->pagb_tree); | ||
2991 | } | ||
2992 | |||
2993 | list_del_init(&busyp->list); | ||
2742 | kmem_free(busyp); | 2994 | kmem_free(busyp); |
2743 | } | 2995 | } |
2996 | |||
2997 | /* | ||
2998 | * Remove all extents on the passed-in list from the busy extents tree.	| ||
2999 | * If do_discard is set, skip extents that need to be discarded, and mark	| ||
3000 | * these as undergoing a discard operation instead. | ||
3001 | */ | ||
3002 | void | ||
3003 | xfs_alloc_busy_clear( | ||
3004 | struct xfs_mount *mp, | ||
3005 | struct list_head *list, | ||
3006 | bool do_discard) | ||
3007 | { | ||
3008 | struct xfs_busy_extent *busyp, *n; | ||
3009 | struct xfs_perag *pag = NULL; | ||
3010 | xfs_agnumber_t agno = NULLAGNUMBER; | ||
3011 | |||
3012 | list_for_each_entry_safe(busyp, n, list, list) { | ||
3013 | if (busyp->agno != agno) { | ||
3014 | if (pag) { | ||
3015 | spin_unlock(&pag->pagb_lock); | ||
3016 | xfs_perag_put(pag); | ||
3017 | } | ||
3018 | pag = xfs_perag_get(mp, busyp->agno); | ||
3019 | spin_lock(&pag->pagb_lock); | ||
3020 | agno = busyp->agno; | ||
3021 | } | ||
3022 | |||
3023 | if (do_discard && busyp->length && | ||
3024 | !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) | ||
3025 | busyp->flags = XFS_ALLOC_BUSY_DISCARDED; | ||
3026 | else | ||
3027 | xfs_alloc_busy_clear_one(mp, pag, busyp); | ||
3028 | } | ||
3029 | |||
3030 | if (pag) { | ||
3031 | spin_unlock(&pag->pagb_lock); | ||
3032 | xfs_perag_put(pag); | ||
3033 | } | ||
3034 | } | ||
3035 | |||
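
Because the caller sorts the list by AG first (see xfs_alloc_busy_sort() in the header below), the loop in xfs_alloc_busy_clear() only changes hands on the per-AG lock at AG boundaries, not once per extent. A sketch of that batching pattern with plain arrays and a stub lock (all names illustrative):

#include <stdio.h>

#define NULLAG	(~0u)

struct ext { unsigned agno, bno; };

static void lock_ag(unsigned agno)   { printf("lock AG %u\n", agno); }
static void unlock_ag(unsigned agno) { printf("unlock AG %u\n", agno); }

static void clear_all(struct ext *list, int n)
{
	unsigned agno = NULLAG;
	int i;

	for (i = 0; i < n; i++) {
		if (list[i].agno != agno) {
			if (agno != NULLAG)
				unlock_ag(agno);	/* drop previous AG */
			agno = list[i].agno;
			lock_ag(agno);			/* take new AG */
		}
		printf("  clear extent at %u\n", list[i].bno);
	}
	if (agno != NULLAG)
		unlock_ag(agno);
}

int main(void)
{
	struct ext list[] = { {0, 5}, {0, 9}, {2, 1} };	/* sorted by agno */

	clear_all(list, 3);
	return 0;
}
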
3036 | /* | ||
3037 | * Callback for list_sort to sort busy extents by the AG they reside in. | ||
3038 | */ | ||
3039 | int | ||
3040 | xfs_busy_extent_ag_cmp( | ||
3041 | void *priv, | ||
3042 | struct list_head *a, | ||
3043 | struct list_head *b) | ||
3044 | { | ||
3045 | return container_of(a, struct xfs_busy_extent, list)->agno - | ||
3046 | container_of(b, struct xfs_busy_extent, list)->agno; | ||
3047 | } | ||
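
The comparator only needs to group extents by AG so that xfs_alloc_busy_clear() switches locks at AG boundaries; ordering within an AG is irrelevant. A userspace equivalent using qsort(3), assuming agno stays well below INT_MAX so the subtraction cannot overflow (illustrative struct, not the kernel one):

#include <stdio.h>
#include <stdlib.h>

struct busy { unsigned agno; };

static int ag_cmp(const void *a, const void *b)
{
	/* safe while agno values stay well below INT_MAX */
	return (int)((const struct busy *)a)->agno -
	       (int)((const struct busy *)b)->agno;
}

int main(void)
{
	struct busy v[] = { {3}, {0}, {2}, {0} };
	int i;

	qsort(v, 4, sizeof(v[0]), ag_cmp);
	for (i = 0; i < 4; i++)
		printf("%u ", v[i].agno);	/* 0 0 2 3 */
	printf("\n");
	return 0;
}
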
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 895009a97271..2f52b924be79 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h | |||
@@ -19,6 +19,7 @@ | |||
19 | #define __XFS_ALLOC_H__ | 19 | #define __XFS_ALLOC_H__ |
20 | 20 | ||
21 | struct xfs_buf; | 21 | struct xfs_buf; |
22 | struct xfs_btree_cur; | ||
22 | struct xfs_mount; | 23 | struct xfs_mount; |
23 | struct xfs_perag; | 24 | struct xfs_perag; |
24 | struct xfs_trans; | 25 | struct xfs_trans; |
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t; | |||
74 | #define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) | 75 | #define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) |
75 | 76 | ||
76 | /* | 77 | /* |
78 | * When deciding how much space to allocate out of an AG, we limit the | ||
79 | * maximum allocation size to the size of the AG. However, we cannot use all the	| ||
80 | * blocks in the AG - some are permanently used by metadata. These | ||
81 | * blocks are generally: | ||
82 | * - the AG superblock, AGF, AGI and AGFL | ||
83 | * - the AGF (bno and cnt) and AGI btree root blocks | ||
84 | * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits | ||
85 | * | ||
86 | * The AG headers are sector sized, so the amount of space they take up is | ||
87 | * dependent on filesystem geometry. The others are all single blocks. | ||
88 | */ | ||
89 | #define XFS_ALLOC_AG_MAX_USABLE(mp) \ | ||
90 | ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) | ||
91 | |||
92 | |||
93 | /* | ||
77 | * Argument structure for xfs_alloc routines. | 94 | * Argument structure for xfs_alloc routines. |
78 | * This is turned into a structure to avoid having 20 arguments passed | 95 | * This is turned into a structure to avoid having 20 arguments passed |
79 | * down several levels of the stack. | 96 | * down several levels of the stack. |
@@ -118,15 +135,29 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, | |||
118 | struct xfs_perag *pag); | 135 | struct xfs_perag *pag); |
119 | 136 | ||
120 | #ifdef __KERNEL__ | 137 | #ifdef __KERNEL__ |
138 | void | ||
139 | xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, | ||
140 | xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); | ||
121 | 141 | ||
122 | void | 142 | void |
123 | xfs_alloc_busy_insert(xfs_trans_t *tp, | 143 | xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, |
124 | xfs_agnumber_t agno, | 144 | bool do_discard); |
125 | xfs_agblock_t bno, | 145 | |
126 | xfs_extlen_t len); | 146 | int |
147 | xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, | ||
148 | xfs_agblock_t bno, xfs_extlen_t len); | ||
127 | 149 | ||
128 | void | 150 | void |
129 | xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); | 151 | xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, |
152 | xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); | ||
153 | |||
154 | int | ||
155 | xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b); | ||
156 | |||
157 | static inline void xfs_alloc_busy_sort(struct list_head *list) | ||
158 | { | ||
159 | list_sort(NULL, list, xfs_busy_extent_ag_cmp); | ||
160 | } | ||
130 | 161 | ||
131 | #endif /* __KERNEL__ */ | 162 | #endif /* __KERNEL__ */ |
132 | 163 | ||
@@ -205,4 +236,18 @@ xfs_free_extent( | |||
205 | xfs_fsblock_t bno, /* starting block number of extent */ | 236 | xfs_fsblock_t bno, /* starting block number of extent */ |
206 | xfs_extlen_t len); /* length of extent */ | 237 | xfs_extlen_t len); /* length of extent */ |
207 | 238 | ||
239 | int /* error */ | ||
240 | xfs_alloc_lookup_le( | ||
241 | struct xfs_btree_cur *cur, /* btree cursor */ | ||
242 | xfs_agblock_t bno, /* starting block of extent */ | ||
243 | xfs_extlen_t len, /* length of extent */ | ||
244 | int *stat); /* success/failure */ | ||
245 | |||
246 | int /* error */ | ||
247 | xfs_alloc_get_rec( | ||
248 | struct xfs_btree_cur *cur, /* btree cursor */ | ||
249 | xfs_agblock_t *bno, /* output: starting block of extent */ | ||
250 | xfs_extlen_t *len, /* output: length of extent */ | ||
251 | int *stat); /* output: success/failure */ | ||
252 | |||
208 | #endif /* __XFS_ALLOC_H__ */ | 253 | #endif /* __XFS_ALLOC_H__ */ |
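
A worked example of the XFS_ALLOC_AG_MAX_USABLE() arithmetic, with assumed geometry (4096-byte blocks, 512-byte sectors). The four sector-sized headers round up to whole filesystem blocks, and the constant 7 covers the two free-space btree roots, the inode btree root, and the 4 AGFL blocks set aside by XFS_ALLOC_SET_ASIDE():

#include <stdio.h>

int main(void)
{
	unsigned agblocks = 1u << 20;		/* blocks per AG (assumed) */
	unsigned blocksize = 4096, sectsize = 512;
	unsigned hdr_bytes = 4 * sectsize;	/* SB + AGF + AGI + AGFL */
	unsigned hdr_blocks = (hdr_bytes + blocksize - 1) / blocksize;

	printf("max usable: %u of %u blocks\n",
	       agblocks - hdr_blocks - 7, agblocks);
	return 0;
}
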
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 97f7328967fd..2b3518826a69 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c | |||
@@ -95,6 +95,8 @@ xfs_allocbt_alloc_block( | |||
95 | return 0; | 95 | return 0; |
96 | } | 96 | } |
97 | 97 | ||
98 | xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); | ||
99 | |||
98 | xfs_trans_agbtree_delta(cur->bc_tp, 1); | 100 | xfs_trans_agbtree_delta(cur->bc_tp, 1); |
99 | new->s = cpu_to_be32(bno); | 101 | new->s = cpu_to_be32(bno); |
100 | 102 | ||
@@ -118,18 +120,8 @@ xfs_allocbt_free_block( | |||
118 | if (error) | 120 | if (error) |
119 | return error; | 121 | return error; |
120 | 122 | ||
121 | /* | 123 | xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, |
122 | * Since blocks move to the free list without the coordination used in | 124 | XFS_ALLOC_BUSY_SKIP_DISCARD); |
123 | * xfs_bmap_finish, we can't allow block to be available for | ||
124 | * reallocation and non-transaction writing (user data) until we know | ||
125 | * that the transaction that moved it to the free list is permanently | ||
126 | * on disk. We track the blocks by declaring these blocks as "busy"; | ||
127 | * the busy list is maintained on a per-ag basis and each transaction | ||
128 | * records which entries should be removed when the iclog commits to | ||
129 | * disk. If a busy block is allocated, the iclog is pushed up to the | ||
130 | * LSN that freed the block. | ||
131 | */ | ||
132 | xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); | ||
133 | xfs_trans_agbtree_delta(cur->bc_tp, -1); | 125 | xfs_trans_agbtree_delta(cur->bc_tp, -1); |
134 | return 0; | 126 | return 0; |
135 | } | 127 | } |
@@ -280,38 +272,6 @@ xfs_allocbt_key_diff( | |||
280 | return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; | 272 | return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; |
281 | } | 273 | } |
282 | 274 | ||
283 | STATIC int | ||
284 | xfs_allocbt_kill_root( | ||
285 | struct xfs_btree_cur *cur, | ||
286 | struct xfs_buf *bp, | ||
287 | int level, | ||
288 | union xfs_btree_ptr *newroot) | ||
289 | { | ||
290 | int error; | ||
291 | |||
292 | XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); | ||
293 | XFS_BTREE_STATS_INC(cur, killroot); | ||
294 | |||
295 | /* | ||
296 | * Update the root pointer, decreasing the level by 1 and then | ||
297 | * free the old root. | ||
298 | */ | ||
299 | xfs_allocbt_set_root(cur, newroot, -1); | ||
300 | error = xfs_allocbt_free_block(cur, bp); | ||
301 | if (error) { | ||
302 | XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); | ||
303 | return error; | ||
304 | } | ||
305 | |||
306 | XFS_BTREE_STATS_INC(cur, free); | ||
307 | |||
308 | xfs_btree_setbuf(cur, level, NULL); | ||
309 | cur->bc_nlevels--; | ||
310 | |||
311 | XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); | ||
312 | return 0; | ||
313 | } | ||
314 | |||
315 | #ifdef DEBUG | 275 | #ifdef DEBUG |
316 | STATIC int | 276 | STATIC int |
317 | xfs_allocbt_keys_inorder( | 277 | xfs_allocbt_keys_inorder( |
@@ -423,7 +383,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { | |||
423 | 383 | ||
424 | .dup_cursor = xfs_allocbt_dup_cursor, | 384 | .dup_cursor = xfs_allocbt_dup_cursor, |
425 | .set_root = xfs_allocbt_set_root, | 385 | .set_root = xfs_allocbt_set_root, |
426 | .kill_root = xfs_allocbt_kill_root, | ||
427 | .alloc_block = xfs_allocbt_alloc_block, | 386 | .alloc_block = xfs_allocbt_alloc_block, |
428 | .free_block = xfs_allocbt_free_block, | 387 | .free_block = xfs_allocbt_free_block, |
429 | .update_lastrec = xfs_allocbt_update_lastrec, | 388 | .update_lastrec = xfs_allocbt_update_lastrec, |
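
The comment deleted above still describes the rule the new xfs_alloc_busy_insert() call enforces: a block freed to the free list may not be reused until the transaction that freed it is on disk, and allocating a busy block forces the log up to the freeing LSN first. The XFS_ALLOC_BUSY_SKIP_DISCARD flag likely reflects that freed btree blocks tend to be reallocated quickly, so discarding them would be wasted effort. A sketch of the LSN gate (all types and names are illustrative, not the kernel's):

#include <stdio.h>

struct busy { unsigned long freed_lsn; };

static unsigned long log_tail_lsn;	/* highest LSN known on disk */

static void reuse_block(struct busy *b)
{
	if (log_tail_lsn < b->freed_lsn) {
		/* force the log up to the freeing transaction */
		printf("forcing log to lsn %lu\n", b->freed_lsn);
		log_tail_lsn = b->freed_lsn;
	}
	printf("block safe to reuse\n");
}

int main(void)
{
	struct busy b = { .freed_lsn = 42 };

	log_tail_lsn = 10;
	reuse_block(&b);
	return 0;
}
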
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index c2568242a901..01d2072fb6d4 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c | |||
@@ -355,16 +355,15 @@ xfs_attr_set_int( | |||
355 | if (mp->m_flags & XFS_MOUNT_WSYNC) { | 355 | if (mp->m_flags & XFS_MOUNT_WSYNC) { |
356 | xfs_trans_set_sync(args.trans); | 356 | xfs_trans_set_sync(args.trans); |
357 | } | 357 | } |
358 | |||
359 | if (!error && (flags & ATTR_KERNOTIME) == 0) { | ||
360 | xfs_trans_ichgtime(args.trans, dp, | ||
361 | XFS_ICHGTIME_CHG); | ||
362 | } | ||
358 | err2 = xfs_trans_commit(args.trans, | 363 | err2 = xfs_trans_commit(args.trans, |
359 | XFS_TRANS_RELEASE_LOG_RES); | 364 | XFS_TRANS_RELEASE_LOG_RES); |
360 | xfs_iunlock(dp, XFS_ILOCK_EXCL); | 365 | xfs_iunlock(dp, XFS_ILOCK_EXCL); |
361 | 366 | ||
362 | /* | ||
363 | * Hit the inode change time. | ||
364 | */ | ||
365 | if (!error && (flags & ATTR_KERNOTIME) == 0) { | ||
366 | xfs_ichgtime(dp, XFS_ICHGTIME_CHG); | ||
367 | } | ||
368 | return(error == 0 ? err2 : error); | 367 | return(error == 0 ? err2 : error); |
369 | } | 368 | } |
370 | 369 | ||
@@ -420,6 +419,9 @@ xfs_attr_set_int( | |||
420 | xfs_trans_set_sync(args.trans); | 419 | xfs_trans_set_sync(args.trans); |
421 | } | 420 | } |
422 | 421 | ||
422 | if ((flags & ATTR_KERNOTIME) == 0) | ||
423 | xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); | ||
424 | |||
423 | /* | 425 | /* |
424 | * Commit the last in the sequence of transactions. | 426 | * Commit the last in the sequence of transactions. |
425 | */ | 427 | */ |
@@ -427,13 +429,6 @@ xfs_attr_set_int( | |||
427 | error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); | 429 | error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); |
428 | xfs_iunlock(dp, XFS_ILOCK_EXCL); | 430 | xfs_iunlock(dp, XFS_ILOCK_EXCL); |
429 | 431 | ||
430 | /* | ||
431 | * Hit the inode change time. | ||
432 | */ | ||
433 | if (!error && (flags & ATTR_KERNOTIME) == 0) { | ||
434 | xfs_ichgtime(dp, XFS_ICHGTIME_CHG); | ||
435 | } | ||
436 | |||
437 | return(error); | 432 | return(error); |
438 | 433 | ||
439 | out: | 434 | out: |
@@ -495,6 +490,13 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) | |||
495 | args.whichfork = XFS_ATTR_FORK; | 490 | args.whichfork = XFS_ATTR_FORK; |
496 | 491 | ||
497 | /* | 492 | /* |
493 | * we have no control over the attribute names that userspace passes us | ||
494 | * to remove, so we have to allow the name lookup prior to attribute | ||
495 | * removal to fail. | ||
496 | */ | ||
497 | args.op_flags = XFS_DA_OP_OKNOENT; | ||
498 | |||
499 | /* | ||
498 | * Attach the dquots to the inode. | 500 | * Attach the dquots to the inode. |
499 | */ | 501 | */ |
500 | error = xfs_qm_dqattach(dp, 0); | 502 | error = xfs_qm_dqattach(dp, 0); |
@@ -567,6 +569,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) | |||
567 | xfs_trans_set_sync(args.trans); | 569 | xfs_trans_set_sync(args.trans); |
568 | } | 570 | } |
569 | 571 | ||
572 | if ((flags & ATTR_KERNOTIME) == 0) | ||
573 | xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); | ||
574 | |||
570 | /* | 575 | /* |
571 | * Commit the last in the sequence of transactions. | 576 | * Commit the last in the sequence of transactions. |
572 | */ | 577 | */ |
@@ -574,13 +579,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) | |||
574 | error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); | 579 | error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); |
575 | xfs_iunlock(dp, XFS_ILOCK_EXCL); | 580 | xfs_iunlock(dp, XFS_ILOCK_EXCL); |
576 | 581 | ||
577 | /* | ||
578 | * Hit the inode change time. | ||
579 | */ | ||
580 | if (!error && (flags & ATTR_KERNOTIME) == 0) { | ||
581 | xfs_ichgtime(dp, XFS_ICHGTIME_CHG); | ||
582 | } | ||
583 | |||
584 | return(error); | 582 | return(error); |
585 | 583 | ||
586 | out: | 584 | out: |
@@ -1995,7 +1993,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) | |||
1995 | 1993 | ||
1996 | tmp = (valuelen < XFS_BUF_SIZE(bp)) | 1994 | tmp = (valuelen < XFS_BUF_SIZE(bp)) |
1997 | ? valuelen : XFS_BUF_SIZE(bp); | 1995 | ? valuelen : XFS_BUF_SIZE(bp); |
1998 | xfs_biomove(bp, 0, tmp, dst, XBF_READ); | 1996 | xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); |
1999 | xfs_buf_relse(bp); | 1997 | xfs_buf_relse(bp); |
2000 | dst += tmp; | 1998 | dst += tmp; |
2001 | valuelen -= tmp; | 1999 | valuelen -= tmp; |
@@ -2125,9 +2123,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) | |||
2125 | 2123 | ||
2126 | tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : | 2124 | tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : |
2127 | XFS_BUF_SIZE(bp); | 2125 | XFS_BUF_SIZE(bp); |
2128 | xfs_biomove(bp, 0, tmp, src, XBF_WRITE); | 2126 | xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); |
2129 | if (tmp < XFS_BUF_SIZE(bp)) | 2127 | if (tmp < XFS_BUF_SIZE(bp)) |
2130 | xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); | 2128 | xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); |
2131 | if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ | 2129 | if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ |
2132 | return (error); | 2130 | return (error); |
2133 | } | 2131 | } |
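
The xfs_attr.c hunks above move the ctime bump from xfs_ichgtime() after xfs_trans_commit() to xfs_trans_ichgtime() while the transaction is still open, so the timestamp is logged atomically with the attribute change. A schematic of the two orderings (stub functions, illustrative only):

#include <stdio.h>

static void trans_ichgtime(void) { printf("  log ctime in transaction\n"); }
static void trans_commit(void)   { printf("  commit\n"); }

int main(void)
{
	printf("old ordering (ctime change is not logged):\n");
	trans_commit();
	printf("  update ctime after commit\n");

	printf("new ordering (ctime logged with the attr change):\n");
	trans_ichgtime();
	trans_commit();
	return 0;
}
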
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index a6cff8edcdb6..71e90dc2aeb1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c | |||
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) | |||
637 | * It didn't all fit, so we have to sort everything on hashval. | 637 | * It didn't all fit, so we have to sort everything on hashval. |
638 | */ | 638 | */ |
639 | sbsize = sf->hdr.count * sizeof(*sbuf); | 639 | sbsize = sf->hdr.count * sizeof(*sbuf); |
640 | sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); | 640 | sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); |
641 | 641 | ||
642 | /* | 642 | /* |
643 | * Scan the attribute list for the rest of the entries, storing | 643 | * Scan the attribute list for the rest of the entries, storing |
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) | |||
2386 | args.dp = context->dp; | 2386 | args.dp = context->dp; |
2387 | args.whichfork = XFS_ATTR_FORK; | 2387 | args.whichfork = XFS_ATTR_FORK; |
2388 | args.valuelen = valuelen; | 2388 | args.valuelen = valuelen; |
2389 | args.value = kmem_alloc(valuelen, KM_SLEEP); | 2389 | args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); |
2390 | args.rmtblkno = be32_to_cpu(name_rmt->valueblk); | 2390 | args.rmtblkno = be32_to_cpu(name_rmt->valueblk); |
2391 | args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); | 2391 | args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); |
2392 | retval = xfs_attr_rmtval_get(&args); | 2392 | retval = xfs_attr_rmtval_get(&args); |
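
Both hunks above add KM_NOFS, so that if these allocations have to reclaim memory, reclaim cannot recurse back into the filesystem while attr buffers and locks are held. A schematic of the flag masking (flag values are made up for the demo; only the pattern matters):

#include <stdio.h>

#define KM_SLEEP	0x0001u		/* may block */
#define KM_NOFS		0x0004u		/* no fs recursion from reclaim */

static void *demo_alloc(unsigned flags)
{
	if (flags & KM_NOFS)
		printf("reclaim restricted: will not re-enter the fs\n");
	return (void *)0;	/* actual allocation elided in this sketch */
}

int main(void)
{
	demo_alloc(KM_SLEEP | KM_NOFS);
	return 0;
}
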
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index f90dadd5a968..e546a33214c9 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
@@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local( | |||
89 | int *flags); /* inode logging flags */ | 89 | int *flags); /* inode logging flags */ |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * Called by xfs_bmapi to update file extent records and the btree | ||
93 | * after allocating space (or doing a delayed allocation). | ||
94 | */ | ||
95 | STATIC int /* error */ | ||
96 | xfs_bmap_add_extent( | ||
97 | xfs_inode_t *ip, /* incore inode pointer */ | ||
98 | xfs_extnum_t idx, /* extent number to update/insert */ | ||
99 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | ||
100 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | ||
101 | xfs_fsblock_t *first, /* pointer to firstblock variable */ | ||
102 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | ||
103 | int *logflagsp, /* inode logging flags */ | ||
104 | int whichfork, /* data or attr fork */ | ||
105 | int rsvd); /* OK to allocate reserved blocks */ | ||
106 | |||
107 | /* | ||
108 | * Called by xfs_bmap_add_extent to handle cases converting a delayed | 92 | * Called by xfs_bmap_add_extent to handle cases converting a delayed |
109 | * allocation to a real allocation. | 93 | * allocation to a real allocation. |
110 | */ | 94 | */ |
111 | STATIC int /* error */ | 95 | STATIC int /* error */ |
112 | xfs_bmap_add_extent_delay_real( | 96 | xfs_bmap_add_extent_delay_real( |
113 | xfs_inode_t *ip, /* incore inode pointer */ | 97 | xfs_inode_t *ip, /* incore inode pointer */ |
114 | xfs_extnum_t idx, /* extent number to update/insert */ | 98 | xfs_extnum_t *idx, /* extent number to update/insert */ |
115 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | 99 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ |
116 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 100 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
117 | xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ | 101 | xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ |
118 | xfs_fsblock_t *first, /* pointer to firstblock variable */ | 102 | xfs_fsblock_t *first, /* pointer to firstblock variable */ |
119 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | 103 | xfs_bmap_free_t *flist, /* list of extents to be freed */ |
120 | int *logflagsp, /* inode logging flags */ | 104 | int *logflagsp); /* inode logging flags */ |
121 | int rsvd); /* OK to allocate reserved blocks */ | ||
122 | 105 | ||
123 | /* | 106 | /* |
124 | * Called by xfs_bmap_add_extent to handle cases converting a hole | 107 | * Called by xfs_bmap_add_extent to handle cases converting a hole |
@@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real( | |||
127 | STATIC int /* error */ | 110 | STATIC int /* error */ |
128 | xfs_bmap_add_extent_hole_delay( | 111 | xfs_bmap_add_extent_hole_delay( |
129 | xfs_inode_t *ip, /* incore inode pointer */ | 112 | xfs_inode_t *ip, /* incore inode pointer */ |
130 | xfs_extnum_t idx, /* extent number to update/insert */ | 113 | xfs_extnum_t *idx, /* extent number to update/insert */ |
131 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 114 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
132 | int *logflagsp,/* inode logging flags */ | 115 | int *logflagsp); /* inode logging flags */ |
133 | int rsvd); /* OK to allocate reserved blocks */ | ||
134 | 116 | ||
135 | /* | 117 | /* |
136 | * Called by xfs_bmap_add_extent to handle cases converting a hole | 118 | * Called by xfs_bmap_add_extent to handle cases converting a hole |
@@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay( | |||
139 | STATIC int /* error */ | 121 | STATIC int /* error */ |
140 | xfs_bmap_add_extent_hole_real( | 122 | xfs_bmap_add_extent_hole_real( |
141 | xfs_inode_t *ip, /* incore inode pointer */ | 123 | xfs_inode_t *ip, /* incore inode pointer */ |
142 | xfs_extnum_t idx, /* extent number to update/insert */ | 124 | xfs_extnum_t *idx, /* extent number to update/insert */ |
143 | xfs_btree_cur_t *cur, /* if null, not a btree */ | 125 | xfs_btree_cur_t *cur, /* if null, not a btree */ |
144 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 126 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
145 | int *logflagsp, /* inode logging flags */ | 127 | int *logflagsp, /* inode logging flags */ |
@@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real( | |||
152 | STATIC int /* error */ | 134 | STATIC int /* error */ |
153 | xfs_bmap_add_extent_unwritten_real( | 135 | xfs_bmap_add_extent_unwritten_real( |
154 | xfs_inode_t *ip, /* incore inode pointer */ | 136 | xfs_inode_t *ip, /* incore inode pointer */ |
155 | xfs_extnum_t idx, /* extent number to update/insert */ | 137 | xfs_extnum_t *idx, /* extent number to update/insert */ |
156 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | 138 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ |
157 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 139 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
158 | int *logflagsp); /* inode logging flags */ | 140 | int *logflagsp); /* inode logging flags */ |
@@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents( | |||
180 | int whichfork); /* data or attr fork */ | 162 | int whichfork); /* data or attr fork */ |
181 | 163 | ||
182 | /* | 164 | /* |
183 | * Called by xfs_bmapi to update file extent records and the btree | ||
184 | * after removing space (or undoing a delayed allocation). | ||
185 | */ | ||
186 | STATIC int /* error */ | ||
187 | xfs_bmap_del_extent( | ||
188 | xfs_inode_t *ip, /* incore inode pointer */ | ||
189 | xfs_trans_t *tp, /* current trans pointer */ | ||
190 | xfs_extnum_t idx, /* extent number to update/insert */ | ||
191 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | ||
192 | xfs_btree_cur_t *cur, /* if null, not a btree */ | ||
193 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | ||
194 | int *logflagsp,/* inode logging flags */ | ||
195 | int whichfork, /* data or attr fork */ | ||
196 | int rsvd); /* OK to allocate reserved blocks */ | ||
197 | |||
198 | /* | ||
199 | * Remove the entry "free" from the free item list. Prev points to the | 165 | * Remove the entry "free" from the free item list. Prev points to the |
200 | * previous entry, unless "free" is the head of the list. | 166 | * previous entry, unless "free" is the head of the list. |
201 | */ | 167 | */ |
@@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local( | |||
474 | STATIC int /* error */ | 440 | STATIC int /* error */ |
475 | xfs_bmap_add_extent( | 441 | xfs_bmap_add_extent( |
476 | xfs_inode_t *ip, /* incore inode pointer */ | 442 | xfs_inode_t *ip, /* incore inode pointer */ |
477 | xfs_extnum_t idx, /* extent number to update/insert */ | 443 | xfs_extnum_t *idx, /* extent number to update/insert */ |
478 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | 444 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ |
479 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 445 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
480 | xfs_fsblock_t *first, /* pointer to firstblock variable */ | 446 | xfs_fsblock_t *first, /* pointer to firstblock variable */ |
481 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | 447 | xfs_bmap_free_t *flist, /* list of extents to be freed */ |
482 | int *logflagsp, /* inode logging flags */ | 448 | int *logflagsp, /* inode logging flags */ |
483 | int whichfork, /* data or attr fork */ | 449 | int whichfork) /* data or attr fork */ |
484 | int rsvd) /* OK to use reserved data blocks */ | ||
485 | { | 450 | { |
486 | xfs_btree_cur_t *cur; /* btree cursor or null */ | 451 | xfs_btree_cur_t *cur; /* btree cursor or null */ |
487 | xfs_filblks_t da_new; /* new count del alloc blocks used */ | 452 | xfs_filblks_t da_new; /* new count del alloc blocks used */ |
@@ -492,23 +457,27 @@ xfs_bmap_add_extent( | |||
492 | xfs_extnum_t nextents; /* number of extents in file now */ | 457 | xfs_extnum_t nextents; /* number of extents in file now */ |
493 | 458 | ||
494 | XFS_STATS_INC(xs_add_exlist); | 459 | XFS_STATS_INC(xs_add_exlist); |
460 | |||
495 | cur = *curp; | 461 | cur = *curp; |
496 | ifp = XFS_IFORK_PTR(ip, whichfork); | 462 | ifp = XFS_IFORK_PTR(ip, whichfork); |
497 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | 463 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); |
498 | ASSERT(idx <= nextents); | ||
499 | da_old = da_new = 0; | 464 | da_old = da_new = 0; |
500 | error = 0; | 465 | error = 0; |
466 | |||
467 | ASSERT(*idx >= 0); | ||
468 | ASSERT(*idx <= nextents); | ||
469 | |||
501 | /* | 470 | /* |
502 | * This is the first extent added to a new/empty file. | 471 | * This is the first extent added to a new/empty file. |
503 | * Special case this one, so other routines get to assume there are | 472 | * Special case this one, so other routines get to assume there are |
504 | * already extents in the list. | 473 | * already extents in the list. |
505 | */ | 474 | */ |
506 | if (nextents == 0) { | 475 | if (nextents == 0) { |
507 | xfs_iext_insert(ip, 0, 1, new, | 476 | xfs_iext_insert(ip, *idx, 1, new, |
508 | whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); | 477 | whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); |
509 | 478 | ||
510 | ASSERT(cur == NULL); | 479 | ASSERT(cur == NULL); |
511 | ifp->if_lastex = 0; | 480 | |
512 | if (!isnullstartblock(new->br_startblock)) { | 481 | if (!isnullstartblock(new->br_startblock)) { |
513 | XFS_IFORK_NEXT_SET(ip, whichfork, 1); | 482 | XFS_IFORK_NEXT_SET(ip, whichfork, 1); |
514 | logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); | 483 | logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); |
@@ -522,27 +491,25 @@ xfs_bmap_add_extent( | |||
522 | if (cur) | 491 | if (cur) |
523 | ASSERT((cur->bc_private.b.flags & | 492 | ASSERT((cur->bc_private.b.flags & |
524 | XFS_BTCUR_BPRV_WASDEL) == 0); | 493 | XFS_BTCUR_BPRV_WASDEL) == 0); |
525 | if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, | 494 | error = xfs_bmap_add_extent_hole_delay(ip, idx, new, |
526 | &logflags, rsvd))) | 495 | &logflags); |
527 | goto done; | ||
528 | } | 496 | } |
529 | /* | 497 | /* |
530 | * Real allocation off the end of the file. | 498 | * Real allocation off the end of the file. |
531 | */ | 499 | */ |
532 | else if (idx == nextents) { | 500 | else if (*idx == nextents) { |
533 | if (cur) | 501 | if (cur) |
534 | ASSERT((cur->bc_private.b.flags & | 502 | ASSERT((cur->bc_private.b.flags & |
535 | XFS_BTCUR_BPRV_WASDEL) == 0); | 503 | XFS_BTCUR_BPRV_WASDEL) == 0); |
536 | if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, | 504 | error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, |
537 | &logflags, whichfork))) | 505 | &logflags, whichfork); |
538 | goto done; | ||
539 | } else { | 506 | } else { |
540 | xfs_bmbt_irec_t prev; /* old extent at offset idx */ | 507 | xfs_bmbt_irec_t prev; /* old extent at offset idx */ |
541 | 508 | ||
542 | /* | 509 | /* |
543 | * Get the record referred to by idx. | 510 | * Get the record referred to by idx. |
544 | */ | 511 | */ |
545 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev); | 512 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); |
546 | /* | 513 | /* |
547 | * If it's a real allocation record, and the new allocation ends | 514 | * If it's a real allocation record, and the new allocation ends |
548 | * after the start of the referred to record, then we're filling | 515 | * after the start of the referred to record, then we're filling |
@@ -557,22 +524,18 @@ xfs_bmap_add_extent( | |||
557 | if (cur) | 524 | if (cur) |
558 | ASSERT(cur->bc_private.b.flags & | 525 | ASSERT(cur->bc_private.b.flags & |
559 | XFS_BTCUR_BPRV_WASDEL); | 526 | XFS_BTCUR_BPRV_WASDEL); |
560 | if ((error = xfs_bmap_add_extent_delay_real(ip, | 527 | error = xfs_bmap_add_extent_delay_real(ip, |
561 | idx, &cur, new, &da_new, first, flist, | 528 | idx, &cur, new, &da_new, |
562 | &logflags, rsvd))) | 529 | first, flist, &logflags); |
563 | goto done; | ||
564 | } else if (new->br_state == XFS_EXT_NORM) { | ||
565 | ASSERT(new->br_state == XFS_EXT_NORM); | ||
566 | if ((error = xfs_bmap_add_extent_unwritten_real( | ||
567 | ip, idx, &cur, new, &logflags))) | ||
568 | goto done; | ||
569 | } else { | 530 | } else { |
570 | ASSERT(new->br_state == XFS_EXT_UNWRITTEN); | 531 | ASSERT(new->br_state == XFS_EXT_NORM || |
571 | if ((error = xfs_bmap_add_extent_unwritten_real( | 532 | new->br_state == XFS_EXT_UNWRITTEN); |
572 | ip, idx, &cur, new, &logflags))) | 533 | |
534 | error = xfs_bmap_add_extent_unwritten_real(ip, | ||
535 | idx, &cur, new, &logflags); | ||
536 | if (error) | ||
573 | goto done; | 537 | goto done; |
574 | } | 538 | } |
575 | ASSERT(*curp == cur || *curp == NULL); | ||
576 | } | 539 | } |
577 | /* | 540 | /* |
578 | * Otherwise we're filling in a hole with an allocation. | 541 | * Otherwise we're filling in a hole with an allocation. |
@@ -581,13 +544,15 @@ xfs_bmap_add_extent( | |||
581 | if (cur) | 544 | if (cur) |
582 | ASSERT((cur->bc_private.b.flags & | 545 | ASSERT((cur->bc_private.b.flags & |
583 | XFS_BTCUR_BPRV_WASDEL) == 0); | 546 | XFS_BTCUR_BPRV_WASDEL) == 0); |
584 | if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, | 547 | error = xfs_bmap_add_extent_hole_real(ip, idx, cur, |
585 | new, &logflags, whichfork))) | 548 | new, &logflags, whichfork); |
586 | goto done; | ||
587 | } | 549 | } |
588 | } | 550 | } |
589 | 551 | ||
552 | if (error) | ||
553 | goto done; | ||
590 | ASSERT(*curp == cur || *curp == NULL); | 554 | ASSERT(*curp == cur || *curp == NULL); |
555 | |||
591 | /* | 556 | /* |
592 | * Convert to a btree if necessary. | 557 | * Convert to a btree if necessary. |
593 | */ | 558 | */ |
@@ -614,8 +579,8 @@ xfs_bmap_add_extent( | |||
614 | nblks += cur->bc_private.b.allocated; | 579 | nblks += cur->bc_private.b.allocated; |
615 | ASSERT(nblks <= da_old); | 580 | ASSERT(nblks <= da_old); |
616 | if (nblks < da_old) | 581 | if (nblks < da_old) |
617 | xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, | 582 | xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, |
618 | (int64_t)(da_old - nblks), rsvd); | 583 | (int64_t)(da_old - nblks), 0); |
619 | } | 584 | } |
620 | /* | 585 | /* |
621 | * Clear out the allocated field, done with it now in any case. | 586 | * Clear out the allocated field, done with it now in any case. |
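
The hunk above swaps xfs_mod_incore_sb() for xfs_icsb_modify_counters() when handing back the part of a delayed-allocation reservation that the real allocation did not consume. An accounting sketch with a plain integer standing in for the per-cpu free-block counter:

#include <stdio.h>
#include <assert.h>

static long fdblocks = 1000;	/* free-block counter */

static void finish_delalloc(long da_old, long nblks)
{
	assert(nblks <= da_old);	/* never consume more than reserved */
	if (nblks < da_old)
		fdblocks += da_old - nblks;	/* return the excess */
}

int main(void)
{
	finish_delalloc(12, 9);			/* reserved 12, used 9 */
	printf("fdblocks = %ld\n", fdblocks);	/* 1003 */
	return 0;
}
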
@@ -640,14 +605,13 @@ done: | |||
640 | STATIC int /* error */ | 605 | STATIC int /* error */ |
641 | xfs_bmap_add_extent_delay_real( | 606 | xfs_bmap_add_extent_delay_real( |
642 | xfs_inode_t *ip, /* incore inode pointer */ | 607 | xfs_inode_t *ip, /* incore inode pointer */ |
643 | xfs_extnum_t idx, /* extent number to update/insert */ | 608 | xfs_extnum_t *idx, /* extent number to update/insert */ |
644 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | 609 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ |
645 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 610 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
646 | xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ | 611 | xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ |
647 | xfs_fsblock_t *first, /* pointer to firstblock variable */ | 612 | xfs_fsblock_t *first, /* pointer to firstblock variable */ |
648 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | 613 | xfs_bmap_free_t *flist, /* list of extents to be freed */ |
649 | int *logflagsp, /* inode logging flags */ | 614 | int *logflagsp) /* inode logging flags */ |
650 | int rsvd) /* OK to use reserved data block allocation */ | ||
651 | { | 615 | { |
652 | xfs_btree_cur_t *cur; /* btree cursor */ | 616 | xfs_btree_cur_t *cur; /* btree cursor */ |
653 | int diff; /* temp value */ | 617 | int diff; /* temp value */ |
@@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real( | |||
673 | */ | 637 | */ |
674 | cur = *curp; | 638 | cur = *curp; |
675 | ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); | 639 | ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); |
676 | ep = xfs_iext_get_ext(ifp, idx); | 640 | ep = xfs_iext_get_ext(ifp, *idx); |
677 | xfs_bmbt_get_all(ep, &PREV); | 641 | xfs_bmbt_get_all(ep, &PREV); |
678 | new_endoff = new->br_startoff + new->br_blockcount; | 642 | new_endoff = new->br_startoff + new->br_blockcount; |
679 | ASSERT(PREV.br_startoff <= new->br_startoff); | 643 | ASSERT(PREV.br_startoff <= new->br_startoff); |
@@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real( | |||
692 | * Check and set flags if this segment has a left neighbor. | 656 | * Check and set flags if this segment has a left neighbor. |
693 | * Don't set contiguous if the combined extent would be too large. | 657 | * Don't set contiguous if the combined extent would be too large. |
694 | */ | 658 | */ |
695 | if (idx > 0) { | 659 | if (*idx > 0) { |
696 | state |= BMAP_LEFT_VALID; | 660 | state |= BMAP_LEFT_VALID; |
697 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); | 661 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); |
698 | 662 | ||
699 | if (isnullstartblock(LEFT.br_startblock)) | 663 | if (isnullstartblock(LEFT.br_startblock)) |
700 | state |= BMAP_LEFT_DELAY; | 664 | state |= BMAP_LEFT_DELAY; |
@@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real( | |||
712 | * Don't set contiguous if the combined extent would be too large. | 676 | * Don't set contiguous if the combined extent would be too large. |
713 | * Also check for all-three-contiguous being too large. | 677 | * Also check for all-three-contiguous being too large. |
714 | */ | 678 | */ |
715 | if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { | 679 | if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { |
716 | state |= BMAP_RIGHT_VALID; | 680 | state |= BMAP_RIGHT_VALID; |
717 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); | 681 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); |
718 | 682 | ||
719 | if (isnullstartblock(RIGHT.br_startblock)) | 683 | if (isnullstartblock(RIGHT.br_startblock)) |
720 | state |= BMAP_RIGHT_DELAY; | 684 | state |= BMAP_RIGHT_DELAY; |
@@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real( | |||
745 | * Filling in all of a previously delayed allocation extent. | 709 | * Filling in all of a previously delayed allocation extent. |
746 | * The left and right neighbors are both contiguous with new. | 710 | * The left and right neighbors are both contiguous with new. |
747 | */ | 711 | */ |
748 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 712 | --*idx; |
749 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), | 713 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
714 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | ||
750 | LEFT.br_blockcount + PREV.br_blockcount + | 715 | LEFT.br_blockcount + PREV.br_blockcount + |
751 | RIGHT.br_blockcount); | 716 | RIGHT.br_blockcount); |
752 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 717 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
753 | 718 | ||
754 | xfs_iext_remove(ip, idx, 2, state); | 719 | xfs_iext_remove(ip, *idx + 1, 2, state); |
755 | ip->i_df.if_lastex = idx - 1; | ||
756 | ip->i_d.di_nextents--; | 720 | ip->i_d.di_nextents--; |
757 | if (cur == NULL) | 721 | if (cur == NULL) |
758 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 722 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
@@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real( | |||
784 | * Filling in all of a previously delayed allocation extent. | 748 | * Filling in all of a previously delayed allocation extent. |
785 | * The left neighbor is contiguous, the right is not. | 749 | * The left neighbor is contiguous, the right is not. |
786 | */ | 750 | */ |
787 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 751 | --*idx; |
788 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), | 752 | |
753 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | ||
754 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | ||
789 | LEFT.br_blockcount + PREV.br_blockcount); | 755 | LEFT.br_blockcount + PREV.br_blockcount); |
790 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 756 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
791 | 757 | ||
792 | ip->i_df.if_lastex = idx - 1; | 758 | xfs_iext_remove(ip, *idx + 1, 1, state); |
793 | xfs_iext_remove(ip, idx, 1, state); | ||
794 | if (cur == NULL) | 759 | if (cur == NULL) |
795 | rval = XFS_ILOG_DEXT; | 760 | rval = XFS_ILOG_DEXT; |
796 | else { | 761 | else { |
@@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real( | |||
814 | * Filling in all of a previously delayed allocation extent. | 779 | * Filling in all of a previously delayed allocation extent. |
815 | * The right neighbor is contiguous, the left is not. | 780 | * The right neighbor is contiguous, the left is not. |
816 | */ | 781 | */ |
817 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 782 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
818 | xfs_bmbt_set_startblock(ep, new->br_startblock); | 783 | xfs_bmbt_set_startblock(ep, new->br_startblock); |
819 | xfs_bmbt_set_blockcount(ep, | 784 | xfs_bmbt_set_blockcount(ep, |
820 | PREV.br_blockcount + RIGHT.br_blockcount); | 785 | PREV.br_blockcount + RIGHT.br_blockcount); |
821 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 786 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
822 | 787 | ||
823 | ip->i_df.if_lastex = idx; | 788 | xfs_iext_remove(ip, *idx + 1, 1, state); |
824 | xfs_iext_remove(ip, idx + 1, 1, state); | ||
825 | if (cur == NULL) | 789 | if (cur == NULL) |
826 | rval = XFS_ILOG_DEXT; | 790 | rval = XFS_ILOG_DEXT; |
827 | else { | 791 | else { |
@@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real( | |||
837 | RIGHT.br_blockcount, PREV.br_state))) | 801 | RIGHT.br_blockcount, PREV.br_state))) |
838 | goto done; | 802 | goto done; |
839 | } | 803 | } |
804 | |||
840 | *dnew = 0; | 805 | *dnew = 0; |
841 | break; | 806 | break; |
842 | 807 | ||
@@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real( | |||
846 | * Neither the left nor right neighbors are contiguous with | 811 | * Neither the left nor right neighbors are contiguous with |
847 | * the new one. | 812 | * the new one. |
848 | */ | 813 | */ |
849 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 814 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
850 | xfs_bmbt_set_startblock(ep, new->br_startblock); | 815 | xfs_bmbt_set_startblock(ep, new->br_startblock); |
851 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 816 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
852 | 817 | ||
853 | ip->i_df.if_lastex = idx; | ||
854 | ip->i_d.di_nextents++; | 818 | ip->i_d.di_nextents++; |
855 | if (cur == NULL) | 819 | if (cur == NULL) |
856 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 820 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
@@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real( | |||
866 | goto done; | 830 | goto done; |
867 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 831 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
868 | } | 832 | } |
833 | |||
869 | *dnew = 0; | 834 | *dnew = 0; |
870 | break; | 835 | break; |
871 | 836 | ||
@@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real( | |||
874 | * Filling in the first part of a previous delayed allocation. | 839 | * Filling in the first part of a previous delayed allocation. |
875 | * The left neighbor is contiguous. | 840 | * The left neighbor is contiguous. |
876 | */ | 841 | */ |
877 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 842 | trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); |
878 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), | 843 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), |
879 | LEFT.br_blockcount + new->br_blockcount); | 844 | LEFT.br_blockcount + new->br_blockcount); |
880 | xfs_bmbt_set_startoff(ep, | 845 | xfs_bmbt_set_startoff(ep, |
881 | PREV.br_startoff + new->br_blockcount); | 846 | PREV.br_startoff + new->br_blockcount); |
882 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 847 | trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); |
883 | 848 | ||
884 | temp = PREV.br_blockcount - new->br_blockcount; | 849 | temp = PREV.br_blockcount - new->br_blockcount; |
885 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 850 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
886 | xfs_bmbt_set_blockcount(ep, temp); | 851 | xfs_bmbt_set_blockcount(ep, temp); |
887 | ip->i_df.if_lastex = idx - 1; | ||
888 | if (cur == NULL) | 852 | if (cur == NULL) |
889 | rval = XFS_ILOG_DEXT; | 853 | rval = XFS_ILOG_DEXT; |
890 | else { | 854 | else { |
@@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real( | |||
904 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 868 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), |
905 | startblockval(PREV.br_startblock)); | 869 | startblockval(PREV.br_startblock)); |
906 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 870 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); |
907 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 871 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
872 | |||
873 | --*idx; | ||
908 | *dnew = temp; | 874 | *dnew = temp; |
909 | break; | 875 | break; |
910 | 876 | ||
@@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real( | |||
913 | * Filling in the first part of a previous delayed allocation. | 879 | * Filling in the first part of a previous delayed allocation. |
914 | * The left neighbor is not contiguous. | 880 | * The left neighbor is not contiguous. |
915 | */ | 881 | */ |
916 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 882 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
917 | xfs_bmbt_set_startoff(ep, new_endoff); | 883 | xfs_bmbt_set_startoff(ep, new_endoff); |
918 | temp = PREV.br_blockcount - new->br_blockcount; | 884 | temp = PREV.br_blockcount - new->br_blockcount; |
919 | xfs_bmbt_set_blockcount(ep, temp); | 885 | xfs_bmbt_set_blockcount(ep, temp); |
920 | xfs_iext_insert(ip, idx, 1, new, state); | 886 | xfs_iext_insert(ip, *idx, 1, new, state); |
921 | ip->i_df.if_lastex = idx; | ||
922 | ip->i_d.di_nextents++; | 887 | ip->i_d.di_nextents++; |
923 | if (cur == NULL) | 888 | if (cur == NULL) |
924 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 889 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
@@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real( | |||
946 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 911 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), |
947 | startblockval(PREV.br_startblock) - | 912 | startblockval(PREV.br_startblock) - |
948 | (cur ? cur->bc_private.b.allocated : 0)); | 913 | (cur ? cur->bc_private.b.allocated : 0)); |
949 | ep = xfs_iext_get_ext(ifp, idx + 1); | 914 | ep = xfs_iext_get_ext(ifp, *idx + 1); |
950 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 915 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); |
951 | trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); | 916 | trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); |
917 | |||
952 | *dnew = temp; | 918 | *dnew = temp; |
953 | break; | 919 | break; |
954 | 920 | ||
@@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real( | |||
958 | * The right neighbor is contiguous with the new allocation. | 924 | * The right neighbor is contiguous with the new allocation. |
959 | */ | 925 | */ |
960 | temp = PREV.br_blockcount - new->br_blockcount; | 926 | temp = PREV.br_blockcount - new->br_blockcount; |
961 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 927 | trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); |
962 | trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); | ||
963 | xfs_bmbt_set_blockcount(ep, temp); | 928 | xfs_bmbt_set_blockcount(ep, temp); |
964 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), | 929 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), |
965 | new->br_startoff, new->br_startblock, | 930 | new->br_startoff, new->br_startblock, |
966 | new->br_blockcount + RIGHT.br_blockcount, | 931 | new->br_blockcount + RIGHT.br_blockcount, |
967 | RIGHT.br_state); | 932 | RIGHT.br_state); |
968 | trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); | 933 | trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); |
969 | ip->i_df.if_lastex = idx + 1; | ||
970 | if (cur == NULL) | 934 | if (cur == NULL) |
971 | rval = XFS_ILOG_DEXT; | 935 | rval = XFS_ILOG_DEXT; |
972 | else { | 936 | else { |
@@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real( | |||
983 | RIGHT.br_state))) | 947 | RIGHT.br_state))) |
984 | goto done; | 948 | goto done; |
985 | } | 949 | } |
950 | |||
986 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 951 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), |
987 | startblockval(PREV.br_startblock)); | 952 | startblockval(PREV.br_startblock)); |
953 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | ||
988 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 954 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); |
989 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 955 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
956 | |||
957 | ++*idx; | ||
990 | *dnew = temp; | 958 | *dnew = temp; |
991 | break; | 959 | break; |
992 | 960 | ||
@@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real( | |||
996 | * The right neighbor is not contiguous. | 964 | * The right neighbor is not contiguous. |
997 | */ | 965 | */ |
998 | temp = PREV.br_blockcount - new->br_blockcount; | 966 | temp = PREV.br_blockcount - new->br_blockcount; |
999 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 967 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1000 | xfs_bmbt_set_blockcount(ep, temp); | 968 | xfs_bmbt_set_blockcount(ep, temp); |
1001 | xfs_iext_insert(ip, idx + 1, 1, new, state); | 969 | xfs_iext_insert(ip, *idx + 1, 1, new, state); |
1002 | ip->i_df.if_lastex = idx + 1; | ||
1003 | ip->i_d.di_nextents++; | 970 | ip->i_d.di_nextents++; |
1004 | if (cur == NULL) | 971 | if (cur == NULL) |
1005 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 972 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
@@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real( | |||
1027 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 994 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), |
1028 | startblockval(PREV.br_startblock) - | 995 | startblockval(PREV.br_startblock) - |
1029 | (cur ? cur->bc_private.b.allocated : 0)); | 996 | (cur ? cur->bc_private.b.allocated : 0)); |
1030 | ep = xfs_iext_get_ext(ifp, idx); | 997 | ep = xfs_iext_get_ext(ifp, *idx); |
1031 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 998 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); |
1032 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 999 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1000 | |||
1001 | ++*idx; | ||
1033 | *dnew = temp; | 1002 | *dnew = temp; |
1034 | break; | 1003 | break; |
1035 | 1004 | ||
@@ -1038,18 +1007,34 @@ xfs_bmap_add_extent_delay_real( | |||
1038 | * Filling in the middle part of a previous delayed allocation. | 1007 | * Filling in the middle part of a previous delayed allocation. |
1039 | * Contiguity is impossible here. | 1008 | * Contiguity is impossible here. |
1040 | * This case is avoided almost all the time. | 1009 | * This case is avoided almost all the time. |
1010 | * | ||
1011 | * We start with a delayed allocation: | ||
1012 | * | ||
1013 | * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+ | ||
1014 | * PREV @ idx | ||
1015 | * | ||
1016 | * and we are allocating: | ||
1017 | * +rrrrrrrrrrrrrrrrr+ | ||
1018 | * new | ||
1019 | * | ||
1020 | * and we set it up for insertion as: | ||
1021 | * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+ | ||
1022 | * new | ||
1023 | * PREV @ idx LEFT RIGHT | ||
1024 | * inserted at idx + 1 | ||
1041 | */ | 1025 | */ |
1042 | temp = new->br_startoff - PREV.br_startoff; | 1026 | temp = new->br_startoff - PREV.br_startoff; |
1043 | trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); | ||
1044 | xfs_bmbt_set_blockcount(ep, temp); | ||
1045 | r[0] = *new; | ||
1046 | r[1].br_state = PREV.br_state; | ||
1047 | r[1].br_startblock = 0; | ||
1048 | r[1].br_startoff = new_endoff; | ||
1049 | temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; | 1027 | temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; |
1050 | r[1].br_blockcount = temp2; | 1028 | trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); |
1051 | xfs_iext_insert(ip, idx + 1, 2, &r[0], state); | 1029 | xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ |
1052 | ip->i_df.if_lastex = idx + 1; | 1030 | LEFT = *new; |
1031 | RIGHT.br_state = PREV.br_state; | ||
1032 | RIGHT.br_startblock = nullstartblock( | ||
1033 | (int)xfs_bmap_worst_indlen(ip, temp2)); | ||
1034 | RIGHT.br_startoff = new_endoff; | ||
1035 | RIGHT.br_blockcount = temp2; | ||
1036 | /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ | ||
1037 | xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); | ||
1053 | ip->i_d.di_nextents++; | 1038 | ip->i_d.di_nextents++; |
1054 | if (cur == NULL) | 1039 | if (cur == NULL) |
1055 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 1040 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
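
The diagram and the rewritten setup above split one delayed extent into three records. A self-contained model of the offset arithmetic, with a plain struct standing in for xfs_bmbt_irec_t (an assumption for illustration, not the kernel type):

    #include <assert.h>
    #include <stdio.h>

    /* simplified stand-in for xfs_bmbt_irec_t: a file-offset range */
    struct irec { unsigned long long startoff, blockcount; };

    int main(void)
    {
        struct irec PREV = { .startoff = 100, .blockcount = 50 };  /* delayed */
        struct irec new  = { .startoff = 120, .blockcount = 10 };  /* real    */
        unsigned long long new_endoff = new.startoff + new.blockcount;

        /* temp: how much of PREV survives to the left of new */
        unsigned long long temp  = new.startoff - PREV.startoff;
        /* temp2: how much of PREV survives to the right of new */
        unsigned long long temp2 = PREV.startoff + PREV.blockcount - new_endoff;

        assert(temp + new.blockcount + temp2 == PREV.blockcount);
        printf("left %llu + new %llu + right %llu = prev %llu\n",
               temp, new.blockcount, temp2, PREV.blockcount);
        return 0;
    }
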
@@ -1079,7 +1064,8 @@ xfs_bmap_add_extent_delay_real( | |||
1079 | diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - | 1064 | diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - |
1080 | (cur ? cur->bc_private.b.allocated : 0)); | 1065 | (cur ? cur->bc_private.b.allocated : 0)); |
1081 | if (diff > 0 && | 1066 | if (diff > 0 && |
1082 | xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { | 1067 | xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, |
1068 | -((int64_t)diff), 0)) { | ||
1083 | /* | 1069 | /* |
1084 | * Ick gross gag me with a spoon. | 1070 | * Ick gross gag me with a spoon. |
1085 | */ | 1071 | */ |
@@ -1089,27 +1075,31 @@ xfs_bmap_add_extent_delay_real( | |||
1089 | temp--; | 1075 | temp--; |
1090 | diff--; | 1076 | diff--; |
1091 | if (!diff || | 1077 | if (!diff || |
1092 | !xfs_mod_incore_sb(ip->i_mount, | 1078 | !xfs_icsb_modify_counters(ip->i_mount, |
1093 | XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) | 1079 | XFS_SBS_FDBLOCKS, |
1080 | -((int64_t)diff), 0)) | ||
1094 | break; | 1081 | break; |
1095 | } | 1082 | } |
1096 | if (temp2) { | 1083 | if (temp2) { |
1097 | temp2--; | 1084 | temp2--; |
1098 | diff--; | 1085 | diff--; |
1099 | if (!diff || | 1086 | if (!diff || |
1100 | !xfs_mod_incore_sb(ip->i_mount, | 1087 | !xfs_icsb_modify_counters(ip->i_mount, |
1101 | XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) | 1088 | XFS_SBS_FDBLOCKS, |
1089 | -((int64_t)diff), 0)) | ||
1102 | break; | 1090 | break; |
1103 | } | 1091 | } |
1104 | } | 1092 | } |
1105 | } | 1093 | } |
1106 | ep = xfs_iext_get_ext(ifp, idx); | 1094 | ep = xfs_iext_get_ext(ifp, *idx); |
1107 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 1095 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); |
1108 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1096 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1109 | trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_); | 1097 | trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); |
1110 | xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), | 1098 | xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), |
1111 | nullstartblock((int)temp2)); | 1099 | nullstartblock((int)temp2)); |
1112 | trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); | 1100 | trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); |
1101 | |||
1102 | ++*idx; | ||
1113 | *dnew = temp + temp2; | 1103 | *dnew = temp + temp2; |
1114 | break; | 1104 | break; |
1115 | 1105 | ||
@@ -1141,7 +1131,7 @@ done: | |||
1141 | STATIC int /* error */ | 1131 | STATIC int /* error */ |
1142 | xfs_bmap_add_extent_unwritten_real( | 1132 | xfs_bmap_add_extent_unwritten_real( |
1143 | xfs_inode_t *ip, /* incore inode pointer */ | 1133 | xfs_inode_t *ip, /* incore inode pointer */ |
1144 | xfs_extnum_t idx, /* extent number to update/insert */ | 1134 | xfs_extnum_t *idx, /* extent number to update/insert */ |
1145 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | 1135 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ |
1146 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 1136 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
1147 | int *logflagsp) /* inode logging flags */ | 1137 | int *logflagsp) /* inode logging flags */ |
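
This signature change is the pattern the whole commit applies: the extent index becomes an in/out pointer, replacing the old convention of returning the position through ifp->if_lastex. A tiny sketch of the calling convention (names invented for the example):

    #include <stdio.h>

    /* the callee updates the caller's position directly instead of
     * stashing it in a per-fork field for the caller to re-read */
    static void merge_with_left(int *idx)
    {
        /* ...modify the record at *idx - 1... */
        --*idx;  /* caller now points at the merged record */
    }

    int main(void)
    {
        int lastx = 5;
        merge_with_left(&lastx);
        printf("caller's index is now %d\n", lastx);  /* prints 4 */
        return 0;
    }
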
@@ -1168,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real( | |||
1168 | error = 0; | 1158 | error = 0; |
1169 | cur = *curp; | 1159 | cur = *curp; |
1170 | ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); | 1160 | ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); |
1171 | ep = xfs_iext_get_ext(ifp, idx); | 1161 | ep = xfs_iext_get_ext(ifp, *idx); |
1172 | xfs_bmbt_get_all(ep, &PREV); | 1162 | xfs_bmbt_get_all(ep, &PREV); |
1173 | newext = new->br_state; | 1163 | newext = new->br_state; |
1174 | oldext = (newext == XFS_EXT_UNWRITTEN) ? | 1164 | oldext = (newext == XFS_EXT_UNWRITTEN) ? |
@@ -1191,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real( | |||
1191 | * Check and set flags if this segment has a left neighbor. | 1181 | * Check and set flags if this segment has a left neighbor. |
1192 | * Don't set contiguous if the combined extent would be too large. | 1182 | * Don't set contiguous if the combined extent would be too large. |
1193 | */ | 1183 | */ |
1194 | if (idx > 0) { | 1184 | if (*idx > 0) { |
1195 | state |= BMAP_LEFT_VALID; | 1185 | state |= BMAP_LEFT_VALID; |
1196 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); | 1186 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); |
1197 | 1187 | ||
1198 | if (isnullstartblock(LEFT.br_startblock)) | 1188 | if (isnullstartblock(LEFT.br_startblock)) |
1199 | state |= BMAP_LEFT_DELAY; | 1189 | state |= BMAP_LEFT_DELAY; |
@@ -1211,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real( | |||
1211 | * Don't set contiguous if the combined extent would be too large. | 1201 | * Don't set contiguous if the combined extent would be too large. |
1212 | * Also check for all-three-contiguous being too large. | 1202 | * Also check for all-three-contiguous being too large. |
1213 | */ | 1203 | */ |
1214 | if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { | 1204 | if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { |
1215 | state |= BMAP_RIGHT_VALID; | 1205 | state |= BMAP_RIGHT_VALID; |
1216 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); | 1206 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); |
1217 | if (isnullstartblock(RIGHT.br_startblock)) | 1207 | if (isnullstartblock(RIGHT.br_startblock)) |
1218 | state |= BMAP_RIGHT_DELAY; | 1208 | state |= BMAP_RIGHT_DELAY; |
1219 | } | 1209 | } |
@@ -1242,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real( | |||
1242 | * Setting all of a previous oldext extent to newext. | 1232 | * Setting all of a previous oldext extent to newext. |
1243 | * The left and right neighbors are both contiguous with new. | 1233 | * The left and right neighbors are both contiguous with new. |
1244 | */ | 1234 | */ |
1245 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 1235 | --*idx; |
1246 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), | 1236 | |
1237 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | ||
1238 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | ||
1247 | LEFT.br_blockcount + PREV.br_blockcount + | 1239 | LEFT.br_blockcount + PREV.br_blockcount + |
1248 | RIGHT.br_blockcount); | 1240 | RIGHT.br_blockcount); |
1249 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 1241 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1250 | 1242 | ||
1251 | xfs_iext_remove(ip, idx, 2, state); | 1243 | xfs_iext_remove(ip, *idx + 1, 2, state); |
1252 | ip->i_df.if_lastex = idx - 1; | ||
1253 | ip->i_d.di_nextents -= 2; | 1244 | ip->i_d.di_nextents -= 2; |
1254 | if (cur == NULL) | 1245 | if (cur == NULL) |
1255 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 1246 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
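
In the both-neighbors-contiguous case above, the code now backs the index up to LEFT, widens it to cover all three records, and removes the two records after it. A toy array model of those steps (plain structs, not the incore extent list):

    #include <stdio.h>
    #include <string.h>

    struct irec { unsigned long long startoff, blockcount; };

    static void iext_remove(struct irec *recs, int *nrecs, int at, int count)
    {
        memmove(&recs[at], &recs[at + count],
                (*nrecs - at - count) * sizeof(recs[0]));
        *nrecs -= count;
    }

    int main(void)
    {
        /* LEFT, PREV, RIGHT are contiguous */
        struct irec recs[] = { {0, 10}, {10, 20}, {30, 5} };
        int nrecs = 3, idx = 1;  /* idx points at PREV */

        --idx;                   /* step back to LEFT */
        recs[idx].blockcount = 10 + 20 + 5;
        iext_remove(recs, &nrecs, idx + 1, 2);  /* drop old PREV and RIGHT */

        printf("%d record(s): [%llu, +%llu)\n",
               nrecs, recs[0].startoff, recs[0].blockcount);
        return 0;
    }
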
@@ -1285,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real( | |||
1285 | * Setting all of a previous oldext extent to newext. | 1276 | * Setting all of a previous oldext extent to newext. |
1286 | * The left neighbor is contiguous, the right is not. | 1277 | * The left neighbor is contiguous, the right is not. |
1287 | */ | 1278 | */ |
1288 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 1279 | --*idx; |
1289 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), | 1280 | |
1281 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | ||
1282 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | ||
1290 | LEFT.br_blockcount + PREV.br_blockcount); | 1283 | LEFT.br_blockcount + PREV.br_blockcount); |
1291 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 1284 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1292 | 1285 | ||
1293 | ip->i_df.if_lastex = idx - 1; | 1286 | xfs_iext_remove(ip, *idx + 1, 1, state); |
1294 | xfs_iext_remove(ip, idx, 1, state); | ||
1295 | ip->i_d.di_nextents--; | 1287 | ip->i_d.di_nextents--; |
1296 | if (cur == NULL) | 1288 | if (cur == NULL) |
1297 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 1289 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
@@ -1321,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real( | |||
1321 | * Setting all of a previous oldext extent to newext. | 1313 | * Setting all of a previous oldext extent to newext. |
1322 | * The right neighbor is contiguous, the left is not. | 1314 | * The right neighbor is contiguous, the left is not. |
1323 | */ | 1315 | */ |
1324 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 1316 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1325 | xfs_bmbt_set_blockcount(ep, | 1317 | xfs_bmbt_set_blockcount(ep, |
1326 | PREV.br_blockcount + RIGHT.br_blockcount); | 1318 | PREV.br_blockcount + RIGHT.br_blockcount); |
1327 | xfs_bmbt_set_state(ep, newext); | 1319 | xfs_bmbt_set_state(ep, newext); |
1328 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1320 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1329 | ip->i_df.if_lastex = idx; | 1321 | xfs_iext_remove(ip, *idx + 1, 1, state); |
1330 | xfs_iext_remove(ip, idx + 1, 1, state); | ||
1331 | ip->i_d.di_nextents--; | 1322 | ip->i_d.di_nextents--; |
1332 | if (cur == NULL) | 1323 | if (cur == NULL) |
1333 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 1324 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
@@ -1358,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real( | |||
1358 | * Neither the left nor right neighbors are contiguous with | 1349 | * Neither the left nor right neighbors are contiguous with |
1359 | * the new one. | 1350 | * the new one. |
1360 | */ | 1351 | */ |
1361 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 1352 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1362 | xfs_bmbt_set_state(ep, newext); | 1353 | xfs_bmbt_set_state(ep, newext); |
1363 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1354 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1364 | 1355 | ||
1365 | ip->i_df.if_lastex = idx; | ||
1366 | if (cur == NULL) | 1356 | if (cur == NULL) |
1367 | rval = XFS_ILOG_DEXT; | 1357 | rval = XFS_ILOG_DEXT; |
1368 | else { | 1358 | else { |
@@ -1384,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real( | |||
1384 | * Setting the first part of a previous oldext extent to newext. | 1374 | * Setting the first part of a previous oldext extent to newext. |
1385 | * The left neighbor is contiguous. | 1375 | * The left neighbor is contiguous. |
1386 | */ | 1376 | */ |
1387 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 1377 | trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); |
1388 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), | 1378 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), |
1389 | LEFT.br_blockcount + new->br_blockcount); | 1379 | LEFT.br_blockcount + new->br_blockcount); |
1390 | xfs_bmbt_set_startoff(ep, | 1380 | xfs_bmbt_set_startoff(ep, |
1391 | PREV.br_startoff + new->br_blockcount); | 1381 | PREV.br_startoff + new->br_blockcount); |
1392 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 1382 | trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); |
1393 | 1383 | ||
1394 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 1384 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1395 | xfs_bmbt_set_startblock(ep, | 1385 | xfs_bmbt_set_startblock(ep, |
1396 | new->br_startblock + new->br_blockcount); | 1386 | new->br_startblock + new->br_blockcount); |
1397 | xfs_bmbt_set_blockcount(ep, | 1387 | xfs_bmbt_set_blockcount(ep, |
1398 | PREV.br_blockcount - new->br_blockcount); | 1388 | PREV.br_blockcount - new->br_blockcount); |
1399 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1389 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1390 | |||
1391 | --*idx; | ||
1400 | 1392 | ||
1401 | ip->i_df.if_lastex = idx - 1; | ||
1402 | if (cur == NULL) | 1393 | if (cur == NULL) |
1403 | rval = XFS_ILOG_DEXT; | 1394 | rval = XFS_ILOG_DEXT; |
1404 | else { | 1395 | else { |
@@ -1429,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real( | |||
1429 | * Setting the first part of a previous oldext extent to newext. | 1420 | * Setting the first part of a previous oldext extent to newext. |
1430 | * The left neighbor is not contiguous. | 1421 | * The left neighbor is not contiguous. |
1431 | */ | 1422 | */ |
1432 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 1423 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1433 | ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); | 1424 | ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); |
1434 | xfs_bmbt_set_startoff(ep, new_endoff); | 1425 | xfs_bmbt_set_startoff(ep, new_endoff); |
1435 | xfs_bmbt_set_blockcount(ep, | 1426 | xfs_bmbt_set_blockcount(ep, |
1436 | PREV.br_blockcount - new->br_blockcount); | 1427 | PREV.br_blockcount - new->br_blockcount); |
1437 | xfs_bmbt_set_startblock(ep, | 1428 | xfs_bmbt_set_startblock(ep, |
1438 | new->br_startblock + new->br_blockcount); | 1429 | new->br_startblock + new->br_blockcount); |
1439 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1430 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1440 | 1431 | ||
1441 | xfs_iext_insert(ip, idx, 1, new, state); | 1432 | xfs_iext_insert(ip, *idx, 1, new, state); |
1442 | ip->i_df.if_lastex = idx; | ||
1443 | ip->i_d.di_nextents++; | 1433 | ip->i_d.di_nextents++; |
1444 | if (cur == NULL) | 1434 | if (cur == NULL) |
1445 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 1435 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
@@ -1468,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real( | |||
1468 | * Setting the last part of a previous oldext extent to newext. | 1458 | * Setting the last part of a previous oldext extent to newext. |
1469 | * The right neighbor is contiguous with the new allocation. | 1459 | * The right neighbor is contiguous with the new allocation. |
1470 | */ | 1460 | */ |
1471 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 1461 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1472 | trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); | ||
1473 | xfs_bmbt_set_blockcount(ep, | 1462 | xfs_bmbt_set_blockcount(ep, |
1474 | PREV.br_blockcount - new->br_blockcount); | 1463 | PREV.br_blockcount - new->br_blockcount); |
1475 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1464 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1476 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), | 1465 | |
1466 | ++*idx; | ||
1467 | |||
1468 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | ||
1469 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), | ||
1477 | new->br_startoff, new->br_startblock, | 1470 | new->br_startoff, new->br_startblock, |
1478 | new->br_blockcount + RIGHT.br_blockcount, newext); | 1471 | new->br_blockcount + RIGHT.br_blockcount, newext); |
1479 | trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); | 1472 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1480 | 1473 | ||
1481 | ip->i_df.if_lastex = idx + 1; | ||
1482 | if (cur == NULL) | 1474 | if (cur == NULL) |
1483 | rval = XFS_ILOG_DEXT; | 1475 | rval = XFS_ILOG_DEXT; |
1484 | else { | 1476 | else { |
@@ -1508,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real( | |||
1508 | * Setting the last part of a previous oldext extent to newext. | 1500 | * Setting the last part of a previous oldext extent to newext. |
1509 | * The right neighbor is not contiguous. | 1501 | * The right neighbor is not contiguous. |
1510 | */ | 1502 | */ |
1511 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 1503 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1512 | xfs_bmbt_set_blockcount(ep, | 1504 | xfs_bmbt_set_blockcount(ep, |
1513 | PREV.br_blockcount - new->br_blockcount); | 1505 | PREV.br_blockcount - new->br_blockcount); |
1514 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1506 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1507 | |||
1508 | ++*idx; | ||
1509 | xfs_iext_insert(ip, *idx, 1, new, state); | ||
1515 | 1510 | ||
1516 | xfs_iext_insert(ip, idx + 1, 1, new, state); | ||
1517 | ip->i_df.if_lastex = idx + 1; | ||
1518 | ip->i_d.di_nextents++; | 1511 | ip->i_d.di_nextents++; |
1519 | if (cur == NULL) | 1512 | if (cur == NULL) |
1520 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 1513 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
@@ -1548,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real( | |||
1548 | * newext. Contiguity is impossible here. | 1541 | * newext. Contiguity is impossible here. |
1549 | * One extent becomes three extents. | 1542 | * One extent becomes three extents. |
1550 | */ | 1543 | */ |
1551 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 1544 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1552 | xfs_bmbt_set_blockcount(ep, | 1545 | xfs_bmbt_set_blockcount(ep, |
1553 | new->br_startoff - PREV.br_startoff); | 1546 | new->br_startoff - PREV.br_startoff); |
1554 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1547 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1555 | 1548 | ||
1556 | r[0] = *new; | 1549 | r[0] = *new; |
1557 | r[1].br_startoff = new_endoff; | 1550 | r[1].br_startoff = new_endoff; |
@@ -1559,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real( | |||
1559 | PREV.br_startoff + PREV.br_blockcount - new_endoff; | 1552 | PREV.br_startoff + PREV.br_blockcount - new_endoff; |
1560 | r[1].br_startblock = new->br_startblock + new->br_blockcount; | 1553 | r[1].br_startblock = new->br_startblock + new->br_blockcount; |
1561 | r[1].br_state = oldext; | 1554 | r[1].br_state = oldext; |
1562 | xfs_iext_insert(ip, idx + 1, 2, &r[0], state); | 1555 | |
1563 | ip->i_df.if_lastex = idx + 1; | 1556 | ++*idx; |
1557 | xfs_iext_insert(ip, *idx, 2, &r[0], state); | ||
1558 | |||
1564 | ip->i_d.di_nextents += 2; | 1559 | ip->i_d.di_nextents += 2; |
1565 | if (cur == NULL) | 1560 | if (cur == NULL) |
1566 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 1561 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
@@ -1630,12 +1625,10 @@ done: | |||
1630 | STATIC int /* error */ | 1625 | STATIC int /* error */ |
1631 | xfs_bmap_add_extent_hole_delay( | 1626 | xfs_bmap_add_extent_hole_delay( |
1632 | xfs_inode_t *ip, /* incore inode pointer */ | 1627 | xfs_inode_t *ip, /* incore inode pointer */ |
1633 | xfs_extnum_t idx, /* extent number to update/insert */ | 1628 | xfs_extnum_t *idx, /* extent number to update/insert */ |
1634 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 1629 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
1635 | int *logflagsp, /* inode logging flags */ | 1630 | int *logflagsp) /* inode logging flags */ |
1636 | int rsvd) /* OK to allocate reserved blocks */ | ||
1637 | { | 1631 | { |
1638 | xfs_bmbt_rec_host_t *ep; /* extent record for idx */ | ||
1639 | xfs_ifork_t *ifp; /* inode fork pointer */ | 1632 | xfs_ifork_t *ifp; /* inode fork pointer */ |
1640 | xfs_bmbt_irec_t left; /* left neighbor extent entry */ | 1633 | xfs_bmbt_irec_t left; /* left neighbor extent entry */ |
1641 | xfs_filblks_t newlen=0; /* new indirect size */ | 1634 | xfs_filblks_t newlen=0; /* new indirect size */ |
@@ -1645,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay( | |||
1645 | xfs_filblks_t temp=0; /* temp for indirect calculations */ | 1638 | xfs_filblks_t temp=0; /* temp for indirect calculations */ |
1646 | 1639 | ||
1647 | ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); | 1640 | ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); |
1648 | ep = xfs_iext_get_ext(ifp, idx); | ||
1649 | state = 0; | 1641 | state = 0; |
1650 | ASSERT(isnullstartblock(new->br_startblock)); | 1642 | ASSERT(isnullstartblock(new->br_startblock)); |
1651 | 1643 | ||
1652 | /* | 1644 | /* |
1653 | * Check and set flags if this segment has a left neighbor | 1645 | * Check and set flags if this segment has a left neighbor |
1654 | */ | 1646 | */ |
1655 | if (idx > 0) { | 1647 | if (*idx > 0) { |
1656 | state |= BMAP_LEFT_VALID; | 1648 | state |= BMAP_LEFT_VALID; |
1657 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); | 1649 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); |
1658 | 1650 | ||
1659 | if (isnullstartblock(left.br_startblock)) | 1651 | if (isnullstartblock(left.br_startblock)) |
1660 | state |= BMAP_LEFT_DELAY; | 1652 | state |= BMAP_LEFT_DELAY; |
@@ -1664,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay( | |||
1664 | * Check and set flags if the current (right) segment exists. | 1656 | * Check and set flags if the current (right) segment exists. |
1665 | * If it doesn't exist, we're converting the hole at end-of-file. | 1657 | * If it doesn't exist, we're converting the hole at end-of-file. |
1666 | */ | 1658 | */ |
1667 | if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { | 1659 | if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { |
1668 | state |= BMAP_RIGHT_VALID; | 1660 | state |= BMAP_RIGHT_VALID; |
1669 | xfs_bmbt_get_all(ep, &right); | 1661 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); |
1670 | 1662 | ||
1671 | if (isnullstartblock(right.br_startblock)) | 1663 | if (isnullstartblock(right.br_startblock)) |
1672 | state |= BMAP_RIGHT_DELAY; | 1664 | state |= BMAP_RIGHT_DELAY; |
@@ -1699,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay( | |||
1699 | * on the left and on the right. | 1691 | * on the left and on the right. |
1700 | * Merge all three into a single extent record. | 1692 | * Merge all three into a single extent record. |
1701 | */ | 1693 | */ |
1694 | --*idx; | ||
1702 | temp = left.br_blockcount + new->br_blockcount + | 1695 | temp = left.br_blockcount + new->br_blockcount + |
1703 | right.br_blockcount; | 1696 | right.br_blockcount; |
1704 | 1697 | ||
1705 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 1698 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1706 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); | 1699 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); |
1707 | oldlen = startblockval(left.br_startblock) + | 1700 | oldlen = startblockval(left.br_startblock) + |
1708 | startblockval(new->br_startblock) + | 1701 | startblockval(new->br_startblock) + |
1709 | startblockval(right.br_startblock); | 1702 | startblockval(right.br_startblock); |
1710 | newlen = xfs_bmap_worst_indlen(ip, temp); | 1703 | newlen = xfs_bmap_worst_indlen(ip, temp); |
1711 | xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), | 1704 | xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), |
1712 | nullstartblock((int)newlen)); | 1705 | nullstartblock((int)newlen)); |
1713 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 1706 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1714 | 1707 | ||
1715 | xfs_iext_remove(ip, idx, 1, state); | 1708 | xfs_iext_remove(ip, *idx + 1, 1, state); |
1716 | ip->i_df.if_lastex = idx - 1; | ||
1717 | break; | 1709 | break; |
1718 | 1710 | ||
1719 | case BMAP_LEFT_CONTIG: | 1711 | case BMAP_LEFT_CONTIG: |
@@ -1722,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay( | |||
1722 | * on the left. | 1714 | * on the left. |
1723 | * Merge the new allocation with the left neighbor. | 1715 | * Merge the new allocation with the left neighbor. |
1724 | */ | 1716 | */ |
1717 | --*idx; | ||
1725 | temp = left.br_blockcount + new->br_blockcount; | 1718 | temp = left.br_blockcount + new->br_blockcount; |
1726 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 1719 | |
1727 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); | 1720 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1721 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); | ||
1728 | oldlen = startblockval(left.br_startblock) + | 1722 | oldlen = startblockval(left.br_startblock) + |
1729 | startblockval(new->br_startblock); | 1723 | startblockval(new->br_startblock); |
1730 | newlen = xfs_bmap_worst_indlen(ip, temp); | 1724 | newlen = xfs_bmap_worst_indlen(ip, temp); |
1731 | xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), | 1725 | xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), |
1732 | nullstartblock((int)newlen)); | 1726 | nullstartblock((int)newlen)); |
1733 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 1727 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1734 | |||
1735 | ip->i_df.if_lastex = idx - 1; | ||
1736 | break; | 1728 | break; |
1737 | 1729 | ||
1738 | case BMAP_RIGHT_CONTIG: | 1730 | case BMAP_RIGHT_CONTIG: |
@@ -1741,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay( | |||
1741 | * on the right. | 1733 | * on the right. |
1742 | * Merge the new allocation with the right neighbor. | 1734 | * Merge the new allocation with the right neighbor. |
1743 | */ | 1735 | */ |
1744 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 1736 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1745 | temp = new->br_blockcount + right.br_blockcount; | 1737 | temp = new->br_blockcount + right.br_blockcount; |
1746 | oldlen = startblockval(new->br_startblock) + | 1738 | oldlen = startblockval(new->br_startblock) + |
1747 | startblockval(right.br_startblock); | 1739 | startblockval(right.br_startblock); |
1748 | newlen = xfs_bmap_worst_indlen(ip, temp); | 1740 | newlen = xfs_bmap_worst_indlen(ip, temp); |
1749 | xfs_bmbt_set_allf(ep, new->br_startoff, | 1741 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), |
1742 | new->br_startoff, | ||
1750 | nullstartblock((int)newlen), temp, right.br_state); | 1743 | nullstartblock((int)newlen), temp, right.br_state); |
1751 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1744 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1752 | |||
1753 | ip->i_df.if_lastex = idx; | ||
1754 | break; | 1745 | break; |
1755 | 1746 | ||
1756 | case 0: | 1747 | case 0: |
@@ -1760,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay( | |||
1760 | * Insert a new entry. | 1751 | * Insert a new entry. |
1761 | */ | 1752 | */ |
1762 | oldlen = newlen = 0; | 1753 | oldlen = newlen = 0; |
1763 | xfs_iext_insert(ip, idx, 1, new, state); | 1754 | xfs_iext_insert(ip, *idx, 1, new, state); |
1764 | ip->i_df.if_lastex = idx; | ||
1765 | break; | 1755 | break; |
1766 | } | 1756 | } |
1767 | if (oldlen != newlen) { | 1757 | if (oldlen != newlen) { |
1768 | ASSERT(oldlen > newlen); | 1758 | ASSERT(oldlen > newlen); |
1769 | xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, | 1759 | xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, |
1770 | (int64_t)(oldlen - newlen), rsvd); | 1760 | (int64_t)(oldlen - newlen), 0); |
1771 | /* | 1761 | /* |
1772 | * Nothing to do for disk quota accounting here. | 1762 | * Nothing to do for disk quota accounting here. |
1773 | */ | 1763 | */ |
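
Merging delayed extents can only shrink the combined worst-case indirect reservation, so the surplus is returned to the free-block counter. A minimal model of that accounting (fdblocks is a toy counter standing in for XFS_SBS_FDBLOCKS):

    #include <assert.h>
    #include <stdio.h>

    static long long fdblocks = 100;

    int main(void)
    {
        long long oldlen = 7;  /* summed reservations before the merge */
        long long newlen = 5;  /* worst-case need of the merged extent  */

        if (oldlen != newlen) {
            assert(oldlen > newlen);       /* merging never needs more */
            fdblocks += oldlen - newlen;   /* give the excess back     */
        }
        printf("fdblocks now %lld\n", fdblocks);
        return 0;
    }
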
@@ -1783,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay( | |||
1783 | STATIC int /* error */ | 1773 | STATIC int /* error */ |
1784 | xfs_bmap_add_extent_hole_real( | 1774 | xfs_bmap_add_extent_hole_real( |
1785 | xfs_inode_t *ip, /* incore inode pointer */ | 1775 | xfs_inode_t *ip, /* incore inode pointer */ |
1786 | xfs_extnum_t idx, /* extent number to update/insert */ | 1776 | xfs_extnum_t *idx, /* extent number to update/insert */ |
1787 | xfs_btree_cur_t *cur, /* if null, not a btree */ | 1777 | xfs_btree_cur_t *cur, /* if null, not a btree */ |
1788 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 1778 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
1789 | int *logflagsp, /* inode logging flags */ | 1779 | int *logflagsp, /* inode logging flags */ |
1790 | int whichfork) /* data or attr fork */ | 1780 | int whichfork) /* data or attr fork */ |
1791 | { | 1781 | { |
1792 | xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */ | ||
1793 | int error; /* error return value */ | 1782 | int error; /* error return value */ |
1794 | int i; /* temp state */ | 1783 | int i; /* temp state */ |
1795 | xfs_ifork_t *ifp; /* inode fork pointer */ | 1784 | xfs_ifork_t *ifp; /* inode fork pointer */ |
@@ -1799,8 +1788,7 @@ xfs_bmap_add_extent_hole_real( | |||
1799 | int state; /* state bits, accessed thru macros */ | 1788 | int state; /* state bits, accessed thru macros */ |
1800 | 1789 | ||
1801 | ifp = XFS_IFORK_PTR(ip, whichfork); | 1790 | ifp = XFS_IFORK_PTR(ip, whichfork); |
1802 | ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); | 1791 | ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); |
1803 | ep = xfs_iext_get_ext(ifp, idx); | ||
1804 | state = 0; | 1792 | state = 0; |
1805 | 1793 | ||
1806 | if (whichfork == XFS_ATTR_FORK) | 1794 | if (whichfork == XFS_ATTR_FORK) |
@@ -1809,9 +1797,9 @@ xfs_bmap_add_extent_hole_real( | |||
1809 | /* | 1797 | /* |
1810 | * Check and set flags if this segment has a left neighbor. | 1798 | * Check and set flags if this segment has a left neighbor. |
1811 | */ | 1799 | */ |
1812 | if (idx > 0) { | 1800 | if (*idx > 0) { |
1813 | state |= BMAP_LEFT_VALID; | 1801 | state |= BMAP_LEFT_VALID; |
1814 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); | 1802 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); |
1815 | if (isnullstartblock(left.br_startblock)) | 1803 | if (isnullstartblock(left.br_startblock)) |
1816 | state |= BMAP_LEFT_DELAY; | 1804 | state |= BMAP_LEFT_DELAY; |
1817 | } | 1805 | } |
@@ -1820,9 +1808,9 @@ xfs_bmap_add_extent_hole_real( | |||
1820 | * Check and set flags if this segment has a current value. | 1808 | * Check and set flags if this segment has a current value. |
1821 | * Not true if we're inserting into the "hole" at eof. | 1809 | * Not true if we're inserting into the "hole" at eof. |
1822 | */ | 1810 | */ |
1823 | if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { | 1811 | if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { |
1824 | state |= BMAP_RIGHT_VALID; | 1812 | state |= BMAP_RIGHT_VALID; |
1825 | xfs_bmbt_get_all(ep, &right); | 1813 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); |
1826 | if (isnullstartblock(right.br_startblock)) | 1814 | if (isnullstartblock(right.br_startblock)) |
1827 | state |= BMAP_RIGHT_DELAY; | 1815 | state |= BMAP_RIGHT_DELAY; |
1828 | } | 1816 | } |
@@ -1859,14 +1847,15 @@ xfs_bmap_add_extent_hole_real( | |||
1859 | * left and on the right. | 1847 | * left and on the right. |
1860 | * Merge all three into a single extent record. | 1848 | * Merge all three into a single extent record. |
1861 | */ | 1849 | */ |
1862 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 1850 | --*idx; |
1863 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), | 1851 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1852 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | ||
1864 | left.br_blockcount + new->br_blockcount + | 1853 | left.br_blockcount + new->br_blockcount + |
1865 | right.br_blockcount); | 1854 | right.br_blockcount); |
1866 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 1855 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1856 | |||
1857 | xfs_iext_remove(ip, *idx + 1, 1, state); | ||
1867 | 1858 | ||
1868 | xfs_iext_remove(ip, idx, 1, state); | ||
1869 | ifp->if_lastex = idx - 1; | ||
1870 | XFS_IFORK_NEXT_SET(ip, whichfork, | 1859 | XFS_IFORK_NEXT_SET(ip, whichfork, |
1871 | XFS_IFORK_NEXTENTS(ip, whichfork) - 1); | 1860 | XFS_IFORK_NEXTENTS(ip, whichfork) - 1); |
1872 | if (cur == NULL) { | 1861 | if (cur == NULL) { |
@@ -1901,12 +1890,12 @@ xfs_bmap_add_extent_hole_real( | |||
1901 | * on the left. | 1890 | * on the left. |
1902 | * Merge the new allocation with the left neighbor. | 1891 | * Merge the new allocation with the left neighbor. |
1903 | */ | 1892 | */ |
1904 | trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); | 1893 | --*idx; |
1905 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), | 1894 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1895 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | ||
1906 | left.br_blockcount + new->br_blockcount); | 1896 | left.br_blockcount + new->br_blockcount); |
1907 | trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); | 1897 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1908 | 1898 | ||
1909 | ifp->if_lastex = idx - 1; | ||
1910 | if (cur == NULL) { | 1899 | if (cur == NULL) { |
1911 | rval = xfs_ilog_fext(whichfork); | 1900 | rval = xfs_ilog_fext(whichfork); |
1912 | } else { | 1901 | } else { |
@@ -1932,13 +1921,13 @@ xfs_bmap_add_extent_hole_real( | |||
1932 | * on the right. | 1921 | * on the right. |
1933 | * Merge the new allocation with the right neighbor. | 1922 | * Merge the new allocation with the right neighbor. |
1934 | */ | 1923 | */ |
1935 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 1924 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
1936 | xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, | 1925 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), |
1926 | new->br_startoff, new->br_startblock, | ||
1937 | new->br_blockcount + right.br_blockcount, | 1927 | new->br_blockcount + right.br_blockcount, |
1938 | right.br_state); | 1928 | right.br_state); |
1939 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 1929 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
1940 | 1930 | ||
1941 | ifp->if_lastex = idx; | ||
1942 | if (cur == NULL) { | 1931 | if (cur == NULL) { |
1943 | rval = xfs_ilog_fext(whichfork); | 1932 | rval = xfs_ilog_fext(whichfork); |
1944 | } else { | 1933 | } else { |
@@ -1964,8 +1953,7 @@ xfs_bmap_add_extent_hole_real( | |||
1964 | * real allocation. | 1953 | * real allocation. |
1965 | * Insert a new entry. | 1954 | * Insert a new entry. |
1966 | */ | 1955 | */ |
1967 | xfs_iext_insert(ip, idx, 1, new, state); | 1956 | xfs_iext_insert(ip, *idx, 1, new, state); |
1968 | ifp->if_lastex = idx; | ||
1969 | XFS_IFORK_NEXT_SET(ip, whichfork, | 1957 | XFS_IFORK_NEXT_SET(ip, whichfork, |
1970 | XFS_IFORK_NEXTENTS(ip, whichfork) + 1); | 1958 | XFS_IFORK_NEXTENTS(ip, whichfork) + 1); |
1971 | if (cur == NULL) { | 1959 | if (cur == NULL) { |
@@ -2345,6 +2333,13 @@ xfs_bmap_rtalloc( | |||
2345 | */ | 2333 | */ |
2346 | if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) | 2334 | if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) |
2347 | ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; | 2335 | ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; |
2336 | |||
2337 | /* | ||
2338 | * Lock out other modifications to the RT bitmap inode. | ||
2339 | */ | ||
2340 | xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); | ||
2341 | xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); | ||
2342 | |||
2348 | /* | 2343 | /* |
2349 | * If it's an allocation to an empty file at offset 0, | 2344 | * If it's an allocation to an empty file at offset 0, |
2350 | * pick an extent that will space things out in the rt area. | 2345 | * pick an extent that will space things out in the rt area. |
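
Besides taking the realtime bitmap inode lock, this hunk sits next to the existing clamp that keeps a realtime allocation under MAXEXTLEN once scaled by the rt extent size. A runnable model of that clamp (toy values for MAXEXTLEN and sb_rextsize):

    #include <stdio.h>

    int main(void)
    {
        const unsigned long long MAXEXTLEN = 1ull << 21;  /* toy value */
        unsigned long long rextsize = 16;    /* blocks per rt extent */
        unsigned long long ralen = 200000;   /* requested rt extents */

        if (ralen * rextsize >= MAXEXTLEN)
            ralen = MAXEXTLEN / rextsize;    /* largest legal count */

        printf("ralen=%llu (%llu blocks)\n", ralen, ralen * rextsize);
        return 0;
    }
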
@@ -2427,7 +2422,7 @@ xfs_bmap_btalloc_nullfb( | |||
2427 | startag = ag = 0; | 2422 | startag = ag = 0; |
2428 | 2423 | ||
2429 | pag = xfs_perag_get(mp, ag); | 2424 | pag = xfs_perag_get(mp, ag); |
2430 | while (*blen < ap->alen) { | 2425 | while (*blen < args->maxlen) { |
2431 | if (!pag->pagf_init) { | 2426 | if (!pag->pagf_init) { |
2432 | error = xfs_alloc_pagf_init(mp, args->tp, ag, | 2427 | error = xfs_alloc_pagf_init(mp, args->tp, ag, |
2433 | XFS_ALLOC_FLAG_TRYLOCK); | 2428 | XFS_ALLOC_FLAG_TRYLOCK); |
@@ -2449,7 +2444,7 @@ xfs_bmap_btalloc_nullfb( | |||
2449 | notinit = 1; | 2444 | notinit = 1; |
2450 | 2445 | ||
2451 | if (xfs_inode_is_filestream(ap->ip)) { | 2446 | if (xfs_inode_is_filestream(ap->ip)) { |
2452 | if (*blen >= ap->alen) | 2447 | if (*blen >= args->maxlen) |
2453 | break; | 2448 | break; |
2454 | 2449 | ||
2455 | if (ap->userdata) { | 2450 | if (ap->userdata) { |
@@ -2495,14 +2490,14 @@ xfs_bmap_btalloc_nullfb( | |||
2495 | * If the best seen length is less than the request | 2490 | * If the best seen length is less than the request |
2496 | * length, use the best as the minimum. | 2491 | * length, use the best as the minimum. |
2497 | */ | 2492 | */ |
2498 | else if (*blen < ap->alen) | 2493 | else if (*blen < args->maxlen) |
2499 | args->minlen = *blen; | 2494 | args->minlen = *blen; |
2500 | /* | 2495 | /* |
2501 | * Otherwise we've seen an extent as big as alen, | 2496 | * Otherwise we've seen an extent as big as maxlen, |
2502 | * use that as the minimum. | 2497 | * use that as the minimum. |
2503 | */ | 2498 | */ |
2504 | else | 2499 | else |
2505 | args->minlen = ap->alen; | 2500 | args->minlen = args->maxlen; |
2506 | 2501 | ||
2507 | /* | 2502 | /* |
2508 | * set the failure fallback case to look in the selected | 2503 | * set the failure fallback case to look in the selected |
@@ -2570,7 +2565,9 @@ xfs_bmap_btalloc( | |||
2570 | args.tp = ap->tp; | 2565 | args.tp = ap->tp; |
2571 | args.mp = mp; | 2566 | args.mp = mp; |
2572 | args.fsbno = ap->rval; | 2567 | args.fsbno = ap->rval; |
2573 | args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); | 2568 | |
2569 | /* Trim the allocation back to the maximum an AG can fit. */ | ||
2570 | args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); | ||
2574 | args.firstblock = ap->firstblock; | 2571 | args.firstblock = ap->firstblock; |
2575 | blen = 0; | 2572 | blen = 0; |
2576 | if (nullfb) { | 2573 | if (nullfb) { |
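
Trimming args.maxlen to what a single AG can actually supply avoids requesting more contiguous space than any allocation group holds; the later hunks then compare blen against args->maxlen instead of ap->alen. A sketch of the trim, treating XFS_ALLOC_AG_MAX_USABLE as an assumed per-AG usable-block figure:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        unsigned long long alen = 1000000;          /* caller's request      */
        unsigned long long ag_max_usable = 262000;  /* toy per-AG usable max */

        /* trim the allocation back to the maximum an AG can fit */
        unsigned long long maxlen = MIN(alen, ag_max_usable);
        printf("args.maxlen = %llu\n", maxlen);
        return 0;
    }
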
@@ -2618,7 +2615,7 @@ xfs_bmap_btalloc( | |||
2618 | /* | 2615 | /* |
2619 | * Adjust for alignment | 2616 | * Adjust for alignment |
2620 | */ | 2617 | */ |
2621 | if (blen > args.alignment && blen <= ap->alen) | 2618 | if (blen > args.alignment && blen <= args.maxlen) |
2622 | args.minlen = blen - args.alignment; | 2619 | args.minlen = blen - args.alignment; |
2623 | args.minalignslop = 0; | 2620 | args.minalignslop = 0; |
2624 | } else { | 2621 | } else { |
@@ -2637,7 +2634,7 @@ xfs_bmap_btalloc( | |||
2637 | * of minlen+alignment+slop doesn't go up | 2634 | * of minlen+alignment+slop doesn't go up |
2638 | * between the calls. | 2635 | * between the calls. |
2639 | */ | 2636 | */ |
2640 | if (blen > mp->m_dalign && blen <= ap->alen) | 2637 | if (blen > mp->m_dalign && blen <= args.maxlen) |
2641 | nextminlen = blen - mp->m_dalign; | 2638 | nextminlen = blen - mp->m_dalign; |
2642 | else | 2639 | else |
2643 | nextminlen = args.minlen; | 2640 | nextminlen = args.minlen; |
@@ -2804,13 +2801,12 @@ STATIC int /* error */ | |||
2804 | xfs_bmap_del_extent( | 2801 | xfs_bmap_del_extent( |
2805 | xfs_inode_t *ip, /* incore inode pointer */ | 2802 | xfs_inode_t *ip, /* incore inode pointer */ |
2806 | xfs_trans_t *tp, /* current transaction pointer */ | 2803 | xfs_trans_t *tp, /* current transaction pointer */ |
2807 | xfs_extnum_t idx, /* extent number to update/delete */ | 2804 | xfs_extnum_t *idx, /* extent number to update/delete */ |
2808 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | 2805 | xfs_bmap_free_t *flist, /* list of extents to be freed */ |
2809 | xfs_btree_cur_t *cur, /* if null, not a btree */ | 2806 | xfs_btree_cur_t *cur, /* if null, not a btree */ |
2810 | xfs_bmbt_irec_t *del, /* data to remove from extents */ | 2807 | xfs_bmbt_irec_t *del, /* data to remove from extents */ |
2811 | int *logflagsp, /* inode logging flags */ | 2808 | int *logflagsp, /* inode logging flags */ |
2812 | int whichfork, /* data or attr fork */ | 2809 | int whichfork) /* data or attr fork */ |
2813 | int rsvd) /* OK to allocate reserved blocks */ | ||
2814 | { | 2810 | { |
2815 | xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ | 2811 | xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ |
2816 | xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ | 2812 | xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ |
@@ -2841,10 +2837,10 @@ xfs_bmap_del_extent( | |||
2841 | 2837 | ||
2842 | mp = ip->i_mount; | 2838 | mp = ip->i_mount; |
2843 | ifp = XFS_IFORK_PTR(ip, whichfork); | 2839 | ifp = XFS_IFORK_PTR(ip, whichfork); |
2844 | ASSERT((idx >= 0) && (idx < ifp->if_bytes / | 2840 | ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / |
2845 | (uint)sizeof(xfs_bmbt_rec_t))); | 2841 | (uint)sizeof(xfs_bmbt_rec_t))); |
2846 | ASSERT(del->br_blockcount > 0); | 2842 | ASSERT(del->br_blockcount > 0); |
2847 | ep = xfs_iext_get_ext(ifp, idx); | 2843 | ep = xfs_iext_get_ext(ifp, *idx); |
2848 | xfs_bmbt_get_all(ep, &got); | 2844 | xfs_bmbt_get_all(ep, &got); |
2849 | ASSERT(got.br_startoff <= del->br_startoff); | 2845 | ASSERT(got.br_startoff <= del->br_startoff); |
2850 | del_endoff = del->br_startoff + del->br_blockcount; | 2846 | del_endoff = del->br_startoff + del->br_blockcount; |
@@ -2918,11 +2914,12 @@ xfs_bmap_del_extent( | |||
2918 | /* | 2914 | /* |
2919 | * Matches the whole extent. Delete the entry. | 2915 | * Matches the whole extent. Delete the entry. |
2920 | */ | 2916 | */ |
2921 | xfs_iext_remove(ip, idx, 1, | 2917 | xfs_iext_remove(ip, *idx, 1, |
2922 | whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); | 2918 | whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); |
2923 | ifp->if_lastex = idx; | 2919 | --*idx; |
2924 | if (delay) | 2920 | if (delay) |
2925 | break; | 2921 | break; |
2922 | |||
2926 | XFS_IFORK_NEXT_SET(ip, whichfork, | 2923 | XFS_IFORK_NEXT_SET(ip, whichfork, |
2927 | XFS_IFORK_NEXTENTS(ip, whichfork) - 1); | 2924 | XFS_IFORK_NEXTENTS(ip, whichfork) - 1); |
2928 | flags |= XFS_ILOG_CORE; | 2925 | flags |= XFS_ILOG_CORE; |
@@ -2939,21 +2936,20 @@ xfs_bmap_del_extent( | |||
2939 | /* | 2936 | /* |
2940 | * Deleting the first part of the extent. | 2937 | * Deleting the first part of the extent. |
2941 | */ | 2938 | */ |
2942 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 2939 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
2943 | xfs_bmbt_set_startoff(ep, del_endoff); | 2940 | xfs_bmbt_set_startoff(ep, del_endoff); |
2944 | temp = got.br_blockcount - del->br_blockcount; | 2941 | temp = got.br_blockcount - del->br_blockcount; |
2945 | xfs_bmbt_set_blockcount(ep, temp); | 2942 | xfs_bmbt_set_blockcount(ep, temp); |
2946 | ifp->if_lastex = idx; | ||
2947 | if (delay) { | 2943 | if (delay) { |
2948 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 2944 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), |
2949 | da_old); | 2945 | da_old); |
2950 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 2946 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); |
2951 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 2947 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
2952 | da_new = temp; | 2948 | da_new = temp; |
2953 | break; | 2949 | break; |
2954 | } | 2950 | } |
2955 | xfs_bmbt_set_startblock(ep, del_endblock); | 2951 | xfs_bmbt_set_startblock(ep, del_endblock); |
2956 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 2952 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
2957 | if (!cur) { | 2953 | if (!cur) { |
2958 | flags |= xfs_ilog_fext(whichfork); | 2954 | flags |= xfs_ilog_fext(whichfork); |
2959 | break; | 2955 | break; |
@@ -2969,18 +2965,17 @@ xfs_bmap_del_extent( | |||
2969 | * Deleting the last part of the extent. | 2965 | * Deleting the last part of the extent. |
2970 | */ | 2966 | */ |
2971 | temp = got.br_blockcount - del->br_blockcount; | 2967 | temp = got.br_blockcount - del->br_blockcount; |
2972 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 2968 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
2973 | xfs_bmbt_set_blockcount(ep, temp); | 2969 | xfs_bmbt_set_blockcount(ep, temp); |
2974 | ifp->if_lastex = idx; | ||
2975 | if (delay) { | 2970 | if (delay) { |
2976 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 2971 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), |
2977 | da_old); | 2972 | da_old); |
2978 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 2973 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); |
2979 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 2974 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
2980 | da_new = temp; | 2975 | da_new = temp; |
2981 | break; | 2976 | break; |
2982 | } | 2977 | } |
2983 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 2978 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
2984 | if (!cur) { | 2979 | if (!cur) { |
2985 | flags |= xfs_ilog_fext(whichfork); | 2980 | flags |= xfs_ilog_fext(whichfork); |
2986 | break; | 2981 | break; |
@@ -2997,7 +2992,7 @@ xfs_bmap_del_extent( | |||
2997 | * Deleting the middle of the extent. | 2992 | * Deleting the middle of the extent. |
2998 | */ | 2993 | */ |
2999 | temp = del->br_startoff - got.br_startoff; | 2994 | temp = del->br_startoff - got.br_startoff; |
3000 | trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); | 2995 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); |
3001 | xfs_bmbt_set_blockcount(ep, temp); | 2996 | xfs_bmbt_set_blockcount(ep, temp); |
3002 | new.br_startoff = del_endoff; | 2997 | new.br_startoff = del_endoff; |
3003 | temp2 = got_endoff - del_endoff; | 2998 | temp2 = got_endoff - del_endoff; |
@@ -3084,9 +3079,9 @@ xfs_bmap_del_extent( | |||
3084 | } | 3079 | } |
3085 | } | 3080 | } |
3086 | } | 3081 | } |
3087 | trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); | 3082 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); |
3088 | xfs_iext_insert(ip, idx + 1, 1, &new, state); | 3083 | xfs_iext_insert(ip, *idx + 1, 1, &new, state); |
3089 | ifp->if_lastex = idx + 1; | 3084 | ++*idx; |
3090 | break; | 3085 | break; |
3091 | } | 3086 | } |
3092 | /* | 3087 | /* |
@@ -3111,9 +3106,10 @@ xfs_bmap_del_extent( | |||
3111 | * Nothing to do for disk quota accounting here. | 3106 | * Nothing to do for disk quota accounting here. |
3112 | */ | 3107 | */ |
3113 | ASSERT(da_old >= da_new); | 3108 | ASSERT(da_old >= da_new); |
3114 | if (da_old > da_new) | 3109 | if (da_old > da_new) { |
3115 | xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), | 3110 | xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, |
3116 | rsvd); | 3111 | (int64_t)(da_old - da_new), 0); |
3112 | } | ||
3117 | done: | 3113 | done: |
3118 | *logflagsp = flags; | 3114 | *logflagsp = flags; |
3119 | return error; | 3115 | return error; |
@@ -3496,7 +3492,7 @@ xfs_bmap_search_extents( | |||
3496 | 3492 | ||
3497 | if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && | 3493 | if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && |
3498 | !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { | 3494 | !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { |
3499 | xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, | 3495 | xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, |
3500 | "Access to block zero in inode %llu " | 3496 | "Access to block zero in inode %llu " |
3501 | "start_block: %llx start_off: %llx " | 3497 | "start_block: %llx start_off: %llx " |
3502 | "blkcnt: %llx extent-state: %x lastx: %x\n", | 3498 | "blkcnt: %llx extent-state: %x lastx: %x\n", |
@@ -4170,12 +4166,11 @@ xfs_bmap_read_extents( | |||
4170 | num_recs = xfs_btree_get_numrecs(block); | 4166 | num_recs = xfs_btree_get_numrecs(block); |
4171 | if (unlikely(i + num_recs > room)) { | 4167 | if (unlikely(i + num_recs > room)) { |
4172 | ASSERT(i + num_recs <= room); | 4168 | ASSERT(i + num_recs <= room); |
4173 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 4169 | xfs_warn(ip->i_mount, |
4174 | "corrupt dinode %Lu, (btree extents).", | 4170 | "corrupt dinode %Lu, (btree extents).", |
4175 | (unsigned long long) ip->i_ino); | 4171 | (unsigned long long) ip->i_ino); |
4176 | XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", | 4172 | XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", |
4177 | XFS_ERRLEVEL_LOW, | 4173 | XFS_ERRLEVEL_LOW, ip->i_mount, block); |
4178 | ip->i_mount); | ||
4179 | goto error0; | 4174 | goto error0; |
4180 | } | 4175 | } |
4181 | XFS_WANT_CORRUPTED_GOTO( | 4176 | XFS_WANT_CORRUPTED_GOTO( |
@@ -4481,6 +4476,16 @@ xfs_bmapi( | |||
4481 | /* Figure out the extent size, adjust alen */ | 4476 | /* Figure out the extent size, adjust alen */ |
4482 | extsz = xfs_get_extsz_hint(ip); | 4477 | extsz = xfs_get_extsz_hint(ip); |
4483 | if (extsz) { | 4478 | if (extsz) { |
4479 | /* | ||
4480 | * make sure we don't exceed a single | ||
4481 | * extent length when we align the | ||
4482 | * extent by reducing the length we are | ||
4483 | * going to allocate by the maximum | ||
4484 | * amount extent size alignment may | ||
4485 | * require. | ||
4486 | */ | ||
4487 | alen = XFS_FILBLKS_MIN(len, | ||
4488 | MAXEXTLEN - (2 * extsz - 1)); | ||
4484 | error = xfs_bmap_extsize_align(mp, | 4489 | error = xfs_bmap_extsize_align(mp, |
4485 | &got, &prev, extsz, | 4490 | &got, &prev, extsz, |
4486 | rt, eof, | 4491 | rt, eof, |
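
Why 2 * extsz - 1: rounding the extent's start down to an extsz boundary and its end up can each add up to extsz - 1 blocks, so capping the pre-alignment length at MAXEXTLEN - (2 * extsz - 1) guarantees the aligned extent still fits. A runnable check of the bound (the MAXEXTLEN value here is assumed):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        const unsigned long long MAXEXTLEN = (1ull << 21) - 1;  /* assumed */
        unsigned long long extsz = 64, len = MAXEXTLEN;  /* worst case */

        unsigned long long alen = len < MAXEXTLEN - (2 * extsz - 1)
                                ? len : MAXEXTLEN - (2 * extsz - 1);

        /* aligning can extend the range by at most extsz - 1 per side */
        unsigned long long aligned_max = alen + 2 * (extsz - 1);
        assert(aligned_max <= MAXEXTLEN);
        printf("alen=%llu, aligned worst case=%llu, limit=%llu\n",
               alen, aligned_max, MAXEXTLEN);
        return 0;
    }
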
@@ -4523,29 +4528,24 @@ xfs_bmapi( | |||
4523 | if (rt) { | 4528 | if (rt) { |
4524 | error = xfs_mod_incore_sb(mp, | 4529 | error = xfs_mod_incore_sb(mp, |
4525 | XFS_SBS_FREXTENTS, | 4530 | XFS_SBS_FREXTENTS, |
4526 | -((int64_t)extsz), (flags & | 4531 | -((int64_t)extsz), 0); |
4527 | XFS_BMAPI_RSVBLOCKS)); | ||
4528 | } else { | 4532 | } else { |
4529 | error = xfs_mod_incore_sb(mp, | 4533 | error = xfs_icsb_modify_counters(mp, |
4530 | XFS_SBS_FDBLOCKS, | 4534 | XFS_SBS_FDBLOCKS, |
4531 | -((int64_t)alen), (flags & | 4535 | -((int64_t)alen), 0); |
4532 | XFS_BMAPI_RSVBLOCKS)); | ||
4533 | } | 4536 | } |
4534 | if (!error) { | 4537 | if (!error) { |
4535 | error = xfs_mod_incore_sb(mp, | 4538 | error = xfs_icsb_modify_counters(mp, |
4536 | XFS_SBS_FDBLOCKS, | 4539 | XFS_SBS_FDBLOCKS, |
4537 | -((int64_t)indlen), (flags & | 4540 | -((int64_t)indlen), 0); |
4538 | XFS_BMAPI_RSVBLOCKS)); | ||
4539 | if (error && rt) | 4541 | if (error && rt) |
4540 | xfs_mod_incore_sb(mp, | 4542 | xfs_mod_incore_sb(mp, |
4541 | XFS_SBS_FREXTENTS, | 4543 | XFS_SBS_FREXTENTS, |
4542 | (int64_t)extsz, (flags & | 4544 | (int64_t)extsz, 0); |
4543 | XFS_BMAPI_RSVBLOCKS)); | ||
4544 | else if (error) | 4545 | else if (error) |
4545 | xfs_mod_incore_sb(mp, | 4546 | xfs_icsb_modify_counters(mp, |
4546 | XFS_SBS_FDBLOCKS, | 4547 | XFS_SBS_FDBLOCKS, |
4547 | (int64_t)alen, (flags & | 4548 | (int64_t)alen, 0); |
4548 | XFS_BMAPI_RSVBLOCKS)); | ||
4549 | } | 4549 | } |
4550 | 4550 | ||
4551 | if (error) { | 4551 | if (error) { |
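
The restructured accounting follows a reserve-then-unwind shape: take the extent (or block) reservation, then the indirect-block reservation, and return the first if the second fails. A toy model with fake counters and an invented mod() helper:

    #include <stdio.h>

    static long long frextents = 10, fdblocks = 2;

    static int mod(long long *ctr, long long delta)
    {
        if (*ctr + delta < 0)
            return -1;   /* ENOSPC, counter untouched */
        *ctr += delta;
        return 0;
    }

    int main(void)
    {
        long long extsz = 4, indlen = 5;
        int rt = 1, error;

        error = mod(rt ? &frextents : &fdblocks, -extsz);
        if (!error) {
            error = mod(&fdblocks, -indlen);
            if (error)   /* second step failed: undo the first */
                mod(rt ? &frextents : &fdblocks, extsz);
        }
        printf("error=%d frextents=%lld fdblocks=%lld\n",
               error, frextents, fdblocks);
        return 0;
    }
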
@@ -4662,13 +4662,12 @@ xfs_bmapi( | |||
4662 | if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) | 4662 | if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) |
4663 | got.br_state = XFS_EXT_UNWRITTEN; | 4663 | got.br_state = XFS_EXT_UNWRITTEN; |
4664 | } | 4664 | } |
4665 | error = xfs_bmap_add_extent(ip, lastx, &cur, &got, | 4665 | error = xfs_bmap_add_extent(ip, &lastx, &cur, &got, |
4666 | firstblock, flist, &tmp_logflags, | 4666 | firstblock, flist, &tmp_logflags, |
4667 | whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); | 4667 | whichfork); |
4668 | logflags |= tmp_logflags; | 4668 | logflags |= tmp_logflags; |
4669 | if (error) | 4669 | if (error) |
4670 | goto error0; | 4670 | goto error0; |
4671 | lastx = ifp->if_lastex; | ||
4672 | ep = xfs_iext_get_ext(ifp, lastx); | 4671 | ep = xfs_iext_get_ext(ifp, lastx); |
4673 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | 4672 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); |
4674 | xfs_bmbt_get_all(ep, &got); | 4673 | xfs_bmbt_get_all(ep, &got); |
@@ -4744,8 +4743,12 @@ xfs_bmapi( | |||
4744 | * Check if writing previously allocated but | 4743 | * Check if writing previously allocated but |
4745 | * unwritten extents. | 4744 | * unwritten extents. |
4746 | */ | 4745 | */ |
4747 | if (wr && mval->br_state == XFS_EXT_UNWRITTEN && | 4746 | if (wr && |
4748 | ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { | 4747 | ((mval->br_state == XFS_EXT_UNWRITTEN && |
4748 | ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || | ||
4749 | (mval->br_state == XFS_EXT_NORM && | ||
4750 | ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == | ||
4751 | (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { | ||
4749 | /* | 4752 | /* |
4750 | * Modify (by adding) the state flag, if writing. | 4753 | * Modify (by adding) the state flag, if writing. |
4751 | */ | 4754 | */ |
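
The widened test now also converts a written extent back to unwritten when both XFS_BMAPI_PREALLOC and XFS_BMAPI_CONVERT are set, alongside the old unwritten-to-written path. A small predicate mirroring the logic (flag values invented for the sketch):

    #include <stdio.h>

    enum { EXT_NORM, EXT_UNWRITTEN };
    #define BMAPI_PREALLOC 0x1  /* toy flag values */
    #define BMAPI_DELAY    0x2
    #define BMAPI_CONVERT  0x4

    static int should_convert(int wr, int state, int flags)
    {
        if (!wr)
            return 0;
        if (state == EXT_UNWRITTEN &&
            !(flags & (BMAPI_PREALLOC | BMAPI_DELAY)))
            return 1;  /* unwritten -> written on overwrite */
        if (state == EXT_NORM &&
            (flags & (BMAPI_PREALLOC | BMAPI_CONVERT)) ==
            (BMAPI_PREALLOC | BMAPI_CONVERT))
            return 1;  /* written -> unwritten on request */
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n",
               should_convert(1, EXT_UNWRITTEN, 0),
               should_convert(1, EXT_NORM, BMAPI_PREALLOC | BMAPI_CONVERT));
        return 0;
    }
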
@@ -4757,14 +4760,15 @@ xfs_bmapi( | |||
4757 | *firstblock; | 4760 | *firstblock; |
4758 | cur->bc_private.b.flist = flist; | 4761 | cur->bc_private.b.flist = flist; |
4759 | } | 4762 | } |
4760 | mval->br_state = XFS_EXT_NORM; | 4763 | mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) |
4761 | error = xfs_bmap_add_extent(ip, lastx, &cur, mval, | 4764 | ? XFS_EXT_NORM |
4765 | : XFS_EXT_UNWRITTEN; | ||
4766 | error = xfs_bmap_add_extent(ip, &lastx, &cur, mval, | ||
4762 | firstblock, flist, &tmp_logflags, | 4767 | firstblock, flist, &tmp_logflags, |
4763 | whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); | 4768 | whichfork); |
4764 | logflags |= tmp_logflags; | 4769 | logflags |= tmp_logflags; |
4765 | if (error) | 4770 | if (error) |
4766 | goto error0; | 4771 | goto error0; |
4767 | lastx = ifp->if_lastex; | ||
4768 | ep = xfs_iext_get_ext(ifp, lastx); | 4772 | ep = xfs_iext_get_ext(ifp, lastx); |
4769 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | 4773 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); |
4770 | xfs_bmbt_get_all(ep, &got); | 4774 | xfs_bmbt_get_all(ep, &got); |
@@ -4823,14 +4827,14 @@ xfs_bmapi( | |||
4823 | /* | 4827 | /* |
4824 | * Else go on to the next record. | 4828 | * Else go on to the next record. |
4825 | */ | 4829 | */ |
4826 | ep = xfs_iext_get_ext(ifp, ++lastx); | ||
4827 | prev = got; | 4830 | prev = got; |
4828 | if (lastx >= nextents) | 4831 | if (++lastx < nextents) { |
4829 | eof = 1; | 4832 | ep = xfs_iext_get_ext(ifp, lastx); |
4830 | else | ||
4831 | xfs_bmbt_get_all(ep, &got); | 4833 | xfs_bmbt_get_all(ep, &got); |
4834 | } else { | ||
4835 | eof = 1; | ||
4836 | } | ||
4832 | } | 4837 | } |
4833 | ifp->if_lastex = lastx; | ||
4834 | *nmap = n; | 4838 | *nmap = n; |
4835 | /* | 4839 | /* |
4836 | * Transform from btree to extents, give it cur. | 4840 | * Transform from btree to extents, give it cur. |
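
The rewritten advance in the hunk above increments the index and bounds-checks it before reading the next record, rather than fetching first and testing afterwards. A minimal array-walk model:

    #include <stdio.h>

    int main(void)
    {
        int recs[] = { 10, 20, 30 };
        int nextents = 3, lastx = 2, eof = 0, got = recs[lastx];

        /* advance: only dereference when the new index is in range */
        if (++lastx < nextents)
            got = recs[lastx];
        else
            eof = 1;

        printf("lastx=%d eof=%d got=%d\n", lastx, eof, got);
        return 0;
    }
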
@@ -4939,7 +4943,6 @@ xfs_bmapi_single( | |||
4939 | ASSERT(!isnullstartblock(got.br_startblock)); | 4943 | ASSERT(!isnullstartblock(got.br_startblock)); |
4940 | ASSERT(bno < got.br_startoff + got.br_blockcount); | 4944 | ASSERT(bno < got.br_startoff + got.br_blockcount); |
4941 | *fsb = got.br_startblock + (bno - got.br_startoff); | 4945 | *fsb = got.br_startblock + (bno - got.br_startoff); |
4942 | ifp->if_lastex = lastx; | ||
4943 | return 0; | 4946 | return 0; |
4944 | } | 4947 | } |
4945 | 4948 | ||
@@ -4981,7 +4984,6 @@ xfs_bunmapi( | |||
4981 | int tmp_logflags; /* partial logging flags */ | 4984 | int tmp_logflags; /* partial logging flags */ |
4982 | int wasdel; /* was a delayed alloc extent */ | 4985 | int wasdel; /* was a delayed alloc extent */ |
4983 | int whichfork; /* data or attribute fork */ | 4986 | int whichfork; /* data or attribute fork */ |
4984 | int rsvd; /* OK to allocate reserved blocks */ | ||
4985 | xfs_fsblock_t sum; | 4987 | xfs_fsblock_t sum; |
4986 | 4988 | ||
4987 | trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); | 4989 | trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); |
@@ -4999,7 +5001,7 @@ xfs_bunmapi( | |||
4999 | mp = ip->i_mount; | 5001 | mp = ip->i_mount; |
5000 | if (XFS_FORCED_SHUTDOWN(mp)) | 5002 | if (XFS_FORCED_SHUTDOWN(mp)) |
5001 | return XFS_ERROR(EIO); | 5003 | return XFS_ERROR(EIO); |
5002 | rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; | 5004 | |
5003 | ASSERT(len > 0); | 5005 | ASSERT(len > 0); |
5004 | ASSERT(nexts >= 0); | 5006 | ASSERT(nexts >= 0); |
5005 | ASSERT(ifp->if_ext_max == | 5007 | ASSERT(ifp->if_ext_max == |
@@ -5115,9 +5117,9 @@ xfs_bunmapi( | |||
5115 | del.br_blockcount = mod; | 5117 | del.br_blockcount = mod; |
5116 | } | 5118 | } |
5117 | del.br_state = XFS_EXT_UNWRITTEN; | 5119 | del.br_state = XFS_EXT_UNWRITTEN; |
5118 | error = xfs_bmap_add_extent(ip, lastx, &cur, &del, | 5120 | error = xfs_bmap_add_extent(ip, &lastx, &cur, &del, |
5119 | firstblock, flist, &logflags, | 5121 | firstblock, flist, &logflags, |
5120 | XFS_DATA_FORK, 0); | 5122 | XFS_DATA_FORK); |
5121 | if (error) | 5123 | if (error) |
5122 | goto error0; | 5124 | goto error0; |
5123 | goto nodelete; | 5125 | goto nodelete; |
@@ -5143,9 +5145,12 @@ xfs_bunmapi( | |||
5143 | */ | 5145 | */ |
5144 | ASSERT(bno >= del.br_blockcount); | 5146 | ASSERT(bno >= del.br_blockcount); |
5145 | bno -= del.br_blockcount; | 5147 | bno -= del.br_blockcount; |
5146 | if (bno < got.br_startoff) { | 5148 | if (got.br_startoff > bno) { |
5147 | if (--lastx >= 0) | 5149 | if (--lastx >= 0) { |
5148 | xfs_bmbt_get_all(--ep, &got); | 5150 | ep = xfs_iext_get_ext(ifp, |
5151 | lastx); | ||
5152 | xfs_bmbt_get_all(ep, &got); | ||
5153 | } | ||
5149 | } | 5154 | } |
5150 | continue; | 5155 | continue; |
5151 | } else if (del.br_state == XFS_EXT_UNWRITTEN) { | 5156 | } else if (del.br_state == XFS_EXT_UNWRITTEN) { |
@@ -5169,18 +5174,19 @@ xfs_bunmapi( | |||
5169 | prev.br_startoff = start; | 5174 | prev.br_startoff = start; |
5170 | } | 5175 | } |
5171 | prev.br_state = XFS_EXT_UNWRITTEN; | 5176 | prev.br_state = XFS_EXT_UNWRITTEN; |
5172 | error = xfs_bmap_add_extent(ip, lastx - 1, &cur, | 5177 | lastx--; |
5178 | error = xfs_bmap_add_extent(ip, &lastx, &cur, | ||
5173 | &prev, firstblock, flist, &logflags, | 5179 | &prev, firstblock, flist, &logflags, |
5174 | XFS_DATA_FORK, 0); | 5180 | XFS_DATA_FORK); |
5175 | if (error) | 5181 | if (error) |
5176 | goto error0; | 5182 | goto error0; |
5177 | goto nodelete; | 5183 | goto nodelete; |
5178 | } else { | 5184 | } else { |
5179 | ASSERT(del.br_state == XFS_EXT_NORM); | 5185 | ASSERT(del.br_state == XFS_EXT_NORM); |
5180 | del.br_state = XFS_EXT_UNWRITTEN; | 5186 | del.br_state = XFS_EXT_UNWRITTEN; |
5181 | error = xfs_bmap_add_extent(ip, lastx, &cur, | 5187 | error = xfs_bmap_add_extent(ip, &lastx, &cur, |
5182 | &del, firstblock, flist, &logflags, | 5188 | &del, firstblock, flist, &logflags, |
5183 | XFS_DATA_FORK, 0); | 5189 | XFS_DATA_FORK); |
5184 | if (error) | 5190 | if (error) |
5185 | goto error0; | 5191 | goto error0; |
5186 | goto nodelete; | 5192 | goto nodelete; |
@@ -5195,13 +5201,13 @@ xfs_bunmapi( | |||
5195 | rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); | 5201 | rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); |
5196 | do_div(rtexts, mp->m_sb.sb_rextsize); | 5202 | do_div(rtexts, mp->m_sb.sb_rextsize); |
5197 | xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, | 5203 | xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, |
5198 | (int64_t)rtexts, rsvd); | 5204 | (int64_t)rtexts, 0); |
5199 | (void)xfs_trans_reserve_quota_nblks(NULL, | 5205 | (void)xfs_trans_reserve_quota_nblks(NULL, |
5200 | ip, -((long)del.br_blockcount), 0, | 5206 | ip, -((long)del.br_blockcount), 0, |
5201 | XFS_QMOPT_RES_RTBLKS); | 5207 | XFS_QMOPT_RES_RTBLKS); |
5202 | } else { | 5208 | } else { |
5203 | xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, | 5209 | xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, |
5204 | (int64_t)del.br_blockcount, rsvd); | 5210 | (int64_t)del.br_blockcount, 0); |
5205 | (void)xfs_trans_reserve_quota_nblks(NULL, | 5211 | (void)xfs_trans_reserve_quota_nblks(NULL, |
5206 | ip, -((long)del.br_blockcount), 0, | 5212 | ip, -((long)del.br_blockcount), 0, |
5207 | XFS_QMOPT_RES_REGBLKS); | 5213 | XFS_QMOPT_RES_REGBLKS); |
@@ -5232,31 +5238,29 @@ xfs_bunmapi( | |||
5232 | error = XFS_ERROR(ENOSPC); | 5238 | error = XFS_ERROR(ENOSPC); |
5233 | goto error0; | 5239 | goto error0; |
5234 | } | 5240 | } |
5235 | error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, | 5241 | error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, |
5236 | &tmp_logflags, whichfork, rsvd); | 5242 | &tmp_logflags, whichfork); |
5237 | logflags |= tmp_logflags; | 5243 | logflags |= tmp_logflags; |
5238 | if (error) | 5244 | if (error) |
5239 | goto error0; | 5245 | goto error0; |
5240 | bno = del.br_startoff - 1; | 5246 | bno = del.br_startoff - 1; |
5241 | nodelete: | 5247 | nodelete: |
5242 | lastx = ifp->if_lastex; | ||
5243 | /* | 5248 | /* |
5244 | * If not done go on to the next (previous) record. | 5249 | * If not done go on to the next (previous) record. |
5245 | * Reset ep in case the extents array was re-alloced. | ||
5246 | */ | 5250 | */ |
5247 | ep = xfs_iext_get_ext(ifp, lastx); | ||
5248 | if (bno != (xfs_fileoff_t)-1 && bno >= start) { | 5251 | if (bno != (xfs_fileoff_t)-1 && bno >= start) { |
5249 | if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || | 5252 | if (lastx >= 0) { |
5250 | xfs_bmbt_get_startoff(ep) > bno) { | 5253 | ep = xfs_iext_get_ext(ifp, lastx); |
5251 | if (--lastx >= 0) | 5254 | if (xfs_bmbt_get_startoff(ep) > bno) { |
5252 | ep = xfs_iext_get_ext(ifp, lastx); | 5255 | if (--lastx >= 0) |
5253 | } | 5256 | ep = xfs_iext_get_ext(ifp, |
5254 | if (lastx >= 0) | 5257 | lastx); |
5258 | } | ||
5255 | xfs_bmbt_get_all(ep, &got); | 5259 | xfs_bmbt_get_all(ep, &got); |
5260 | } | ||
5256 | extno++; | 5261 | extno++; |
5257 | } | 5262 | } |
5258 | } | 5263 | } |
5259 | ifp->if_lastex = lastx; | ||
5260 | *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; | 5264 | *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; |
5261 | ASSERT(ifp->if_ext_max == | 5265 | ASSERT(ifp->if_ext_max == |
5262 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | 5266 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); |
@@ -5461,8 +5465,13 @@ xfs_getbmap( | |||
5461 | if (error) | 5465 | if (error) |
5462 | goto out_unlock_iolock; | 5466 | goto out_unlock_iolock; |
5463 | } | 5467 | } |
5464 | 5468 | /* | |
5465 | ASSERT(ip->i_delayed_blks == 0); | 5469 | * even after flushing the inode, there can still be delalloc |
5470 | * blocks on the inode beyond EOF due to speculative | ||
5471 | * preallocation. These are not removed until the release | ||
5472 | * function is called or the inode is inactivated. Hence we | ||
5473 | * cannot assert here that ip->i_delayed_blks == 0. | ||
5474 | */ | ||
5466 | } | 5475 | } |
5467 | 5476 | ||
5468 | lock = xfs_ilock_map_shared(ip); | 5477 | lock = xfs_ilock_map_shared(ip); |
@@ -5728,7 +5737,7 @@ xfs_check_block( | |||
5728 | else | 5737 | else |
5729 | thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); | 5738 | thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); |
5730 | if (*thispa == *pp) { | 5739 | if (*thispa == *pp) { |
5731 | cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", | 5740 | xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", |
5732 | __func__, j, i, | 5741 | __func__, j, i, |
5733 | (unsigned long long)be64_to_cpu(*thispa)); | 5742 | (unsigned long long)be64_to_cpu(*thispa)); |
5734 | panic("%s: ptrs are equal in node\n", | 5743 | panic("%s: ptrs are equal in node\n", |
@@ -5893,11 +5902,11 @@ xfs_bmap_check_leaf_extents( | |||
5893 | return; | 5902 | return; |
5894 | 5903 | ||
5895 | error0: | 5904 | error0: |
5896 | cmn_err(CE_WARN, "%s: at error0", __func__); | 5905 | xfs_warn(mp, "%s: at error0", __func__); |
5897 | if (bp_release) | 5906 | if (bp_release) |
5898 | xfs_trans_brelse(NULL, bp); | 5907 | xfs_trans_brelse(NULL, bp); |
5899 | error_norelse: | 5908 | error_norelse: |
5900 | cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", | 5909 | xfs_warn(mp, "%s: BAD after btree leaves for %d extents", |
5901 | __func__, i); | 5910 | __func__, i); |
5902 | panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); | 5911 | panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); |
5903 | return; | 5912 | return; |
@@ -6060,3 +6069,79 @@ xfs_bmap_disk_count_leaves( | |||
6060 | *count += xfs_bmbt_disk_get_blockcount(frp); | 6069 | *count += xfs_bmbt_disk_get_blockcount(frp); |
6061 | } | 6070 | } |
6062 | } | 6071 | } |
6072 | |||
6073 | /* | ||
6074 | * dead simple method of punching delayed allocation blocks from a range in | ||
6075 | * the inode. Walks a block at a time so will be slow, but is only executed in | ||
6076 | * rare error cases so the overhead is not critical. This will always punch out | ||
6077 | * both the start and end blocks, even if the ranges only partially overlap | ||
6078 | * them, so it is up to the caller to ensure that partial blocks are not | ||
6079 | * passed in. | ||
6080 | */ | ||
6081 | int | ||
6082 | xfs_bmap_punch_delalloc_range( | ||
6083 | struct xfs_inode *ip, | ||
6084 | xfs_fileoff_t start_fsb, | ||
6085 | xfs_fileoff_t length) | ||
6086 | { | ||
6087 | xfs_fileoff_t remaining = length; | ||
6088 | int error = 0; | ||
6089 | |||
6090 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | ||
6091 | |||
6092 | do { | ||
6093 | int done; | ||
6094 | xfs_bmbt_irec_t imap; | ||
6095 | int nimaps = 1; | ||
6096 | xfs_fsblock_t firstblock; | ||
6097 | xfs_bmap_free_t flist; | ||
6098 | |||
6099 | /* | ||
6100 | * Map the range first and check that it is a delalloc extent | ||
6101 | * before trying to unmap the range. Otherwise we will be | ||
6102 | * trying to remove a real extent (which requires a | ||
6103 | * transaction) or a hole, which is probably a bad idea... | ||
6104 | */ | ||
6105 | error = xfs_bmapi(NULL, ip, start_fsb, 1, | ||
6106 | XFS_BMAPI_ENTIRE, NULL, 0, &imap, | ||
6107 | &nimaps, NULL); | ||
6108 | |||
6109 | if (error) { | ||
6110 | /* something screwed, just bail */ | ||
6111 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
6112 | xfs_alert(ip->i_mount, | ||
6113 | "Failed delalloc mapping lookup ino %lld fsb %lld.", | ||
6114 | ip->i_ino, start_fsb); | ||
6115 | } | ||
6116 | break; | ||
6117 | } | ||
6118 | if (!nimaps) { | ||
6119 | /* nothing there */ | ||
6120 | goto next_block; | ||
6121 | } | ||
6122 | if (imap.br_startblock != DELAYSTARTBLOCK) { | ||
6123 | /* been converted, ignore */ | ||
6124 | goto next_block; | ||
6125 | } | ||
6126 | WARN_ON(imap.br_blockcount == 0); | ||
6127 | |||
6128 | /* | ||
6129 | * Note: while we initialise the firstblock/flist pair, they | ||
6130 | * should never be used because blocks should never be | ||
6131 | * allocated or freed for a delalloc extent and hence we don't | ||
6132 | * need to cancel or finish them after the xfs_bunmapi() call. | ||
6133 | */ | ||
6134 | xfs_bmap_init(&flist, &firstblock); | ||
6135 | error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, | ||
6136 | &flist, &done); | ||
6137 | if (error) | ||
6138 | break; | ||
6139 | |||
6140 | ASSERT(!flist.xbf_count && !flist.xbf_first); | ||
6141 | next_block: | ||
6142 | start_fsb++; | ||
6143 | remaining--; | ||
6144 | } while (remaining > 0); | ||
6145 | |||
6146 | return error; | ||
6147 | } | ||
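
xfs_bmap_punch_delalloc_range() above deliberately walks one filesystem block per iteration: map the block first, skip holes and extents that have already been converted, and only then unmap. A minimal user-space sketch of that loop shape, with hypothetical map_block()/unmap_block() stand-ins for the xfs_bmapi()/xfs_bunmapi() calls:

#include <stdio.h>

enum blk_state { BLK_HOLE, BLK_DELALLOC, BLK_REAL };

/* hypothetical stand-in for xfs_bmapi(): what backs this one block? */
static enum blk_state map_block(long long fsb)
{
    return (fsb % 3 == 1) ? BLK_DELALLOC : BLK_REAL;    /* fake layout */
}

/* hypothetical stand-in for xfs_bunmapi(): punch out one block */
static int unmap_block(long long fsb)
{
    printf("punched delalloc block %lld\n", fsb);
    return 0;
}

/* same shape as xfs_bmap_punch_delalloc_range(): one block per pass */
static int punch_delalloc_range(long long start_fsb, long long length)
{
    long long remaining = length;
    int error = 0;

    do {
        /* map first so we never unmap a hole or a real extent */
        if (map_block(start_fsb) == BLK_DELALLOC) {
            error = unmap_block(start_fsb);
            if (error)
                break;
        }
        start_fsb++;
        remaining--;
    } while (remaining > 0);

    return error;
}

int main(void)
{
    return punch_delalloc_range(100, 6);
}

Mapping before unmapping is the safety property the in-function comment stresses: punching a real extent would need a transaction, and punching a hole is meaningless.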
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index b13569a6179b..c62234bde053 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h | |||
@@ -69,14 +69,16 @@ typedef struct xfs_bmap_free | |||
69 | #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ | 69 | #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ |
70 | #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ | 70 | #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ |
71 | #define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ | 71 | #define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ |
72 | #define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */ | ||
73 | #define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ | 72 | #define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ |
74 | #define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ | 73 | #define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ |
75 | /* combine contig. space */ | 74 | /* combine contig. space */ |
76 | #define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ | 75 | #define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ |
77 | #define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */ | 76 | /* |
78 | /* need write cache flushing and no */ | 77 | * unwritten extent conversion - this needs write cache flushing and no additional |
79 | /* additional allocation alignments */ | 78 | * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts |
79 | * from written to unwritten, otherwise it converts from unwritten to written. | ||
80 | */ | ||
81 | #define XFS_BMAPI_CONVERT 0x200 | ||
80 | 82 | ||
81 | #define XFS_BMAPI_FLAGS \ | 83 | #define XFS_BMAPI_FLAGS \ |
82 | { XFS_BMAPI_WRITE, "WRITE" }, \ | 84 | { XFS_BMAPI_WRITE, "WRITE" }, \ |
@@ -84,7 +86,6 @@ typedef struct xfs_bmap_free | |||
84 | { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ | 86 | { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ |
85 | { XFS_BMAPI_METADATA, "METADATA" }, \ | 87 | { XFS_BMAPI_METADATA, "METADATA" }, \ |
86 | { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ | 88 | { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ |
87 | { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ | ||
88 | { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ | 89 | { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ |
89 | { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ | 90 | { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ |
90 | { XFS_BMAPI_CONTIG, "CONTIG" }, \ | 91 | { XFS_BMAPI_CONTIG, "CONTIG" }, \ |
@@ -391,6 +392,11 @@ xfs_bmap_count_blocks( | |||
391 | int whichfork, | 392 | int whichfork, |
392 | int *count); | 393 | int *count); |
393 | 394 | ||
395 | int | ||
396 | xfs_bmap_punch_delalloc_range( | ||
397 | struct xfs_inode *ip, | ||
398 | xfs_fileoff_t start_fsb, | ||
399 | xfs_fileoff_t length); | ||
394 | #endif /* __KERNEL__ */ | 400 | #endif /* __KERNEL__ */ |
395 | 401 | ||
396 | #endif /* __XFS_BMAP_H__ */ | 402 | #endif /* __XFS_BMAP_H__ */ |
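
The retired XFS_BMAPI_RSVBLOCKS aside, the subtle flag here is XFS_BMAPI_CONVERT: alone it requests unwritten-to-written conversion, and together with XFS_BMAPI_PREALLOC it requests the reverse, which is what the updated condition in xfs_bmapi() tests. An illustrative sketch of that direction check (flag values copied from this header; the helper is hypothetical and simplified, e.g. it omits the XFS_BMAPI_DELAY test the kernel also makes):

#include <stdio.h>

#define BMAPI_PREALLOC 0x040    /* values as in xfs_bmap.h */
#define BMAPI_CONVERT  0x200

enum ext_state { EXT_NORM, EXT_UNWRITTEN };

/* hypothetical helper: does this flags/state pair ask for conversion? */
static int wants_conversion(int flags, enum ext_state state)
{
    if (state == EXT_UNWRITTEN && !(flags & BMAPI_PREALLOC))
        return 1;    /* unwritten -> written */
    if (state == EXT_NORM &&
        (flags & (BMAPI_PREALLOC | BMAPI_CONVERT)) ==
                 (BMAPI_PREALLOC | BMAPI_CONVERT))
        return 1;    /* written -> unwritten */
    return 0;
}

int main(void)
{
    printf("%d\n", wants_conversion(BMAPI_CONVERT, EXT_UNWRITTEN));
    printf("%d\n",
           wants_conversion(BMAPI_PREALLOC | BMAPI_CONVERT, EXT_NORM));
    return 0;
}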
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 829af92f0fba..2f9e97c128a0 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c | |||
@@ -217,7 +217,7 @@ xfs_btree_del_cursor( | |||
217 | */ | 217 | */ |
218 | for (i = 0; i < cur->bc_nlevels; i++) { | 218 | for (i = 0; i < cur->bc_nlevels; i++) { |
219 | if (cur->bc_bufs[i]) | 219 | if (cur->bc_bufs[i]) |
220 | xfs_btree_setbuf(cur, i, NULL); | 220 | xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); |
221 | else if (!error) | 221 | else if (!error) |
222 | break; | 222 | break; |
223 | } | 223 | } |
@@ -634,9 +634,8 @@ xfs_btree_read_bufl( | |||
634 | return error; | 634 | return error; |
635 | } | 635 | } |
636 | ASSERT(!bp || !XFS_BUF_GETERROR(bp)); | 636 | ASSERT(!bp || !XFS_BUF_GETERROR(bp)); |
637 | if (bp != NULL) { | 637 | if (bp) |
638 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); | 638 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); |
639 | } | ||
640 | *bpp = bp; | 639 | *bpp = bp; |
641 | return 0; | 640 | return 0; |
642 | } | 641 | } |
@@ -656,7 +655,7 @@ xfs_btree_reada_bufl( | |||
656 | 655 | ||
657 | ASSERT(fsbno != NULLFSBLOCK); | 656 | ASSERT(fsbno != NULLFSBLOCK); |
658 | d = XFS_FSB_TO_DADDR(mp, fsbno); | 657 | d = XFS_FSB_TO_DADDR(mp, fsbno); |
659 | xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); | 658 | xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); |
660 | } | 659 | } |
661 | 660 | ||
662 | /* | 661 | /* |
@@ -676,7 +675,7 @@ xfs_btree_reada_bufs( | |||
676 | ASSERT(agno != NULLAGNUMBER); | 675 | ASSERT(agno != NULLAGNUMBER); |
677 | ASSERT(agbno != NULLAGBLOCK); | 676 | ASSERT(agbno != NULLAGBLOCK); |
678 | d = XFS_AGB_TO_DADDR(mp, agno, agbno); | 677 | d = XFS_AGB_TO_DADDR(mp, agno, agbno); |
679 | xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); | 678 | xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); |
680 | } | 679 | } |
681 | 680 | ||
682 | STATIC int | 681 | STATIC int |
@@ -763,22 +762,19 @@ xfs_btree_readahead( | |||
763 | * Set the buffer for level "lev" in the cursor to bp, releasing | 762 | * Set the buffer for level "lev" in the cursor to bp, releasing |
764 | * any previous buffer. | 763 | * any previous buffer. |
765 | */ | 764 | */ |
766 | void | 765 | STATIC void |
767 | xfs_btree_setbuf( | 766 | xfs_btree_setbuf( |
768 | xfs_btree_cur_t *cur, /* btree cursor */ | 767 | xfs_btree_cur_t *cur, /* btree cursor */ |
769 | int lev, /* level in btree */ | 768 | int lev, /* level in btree */ |
770 | xfs_buf_t *bp) /* new buffer to set */ | 769 | xfs_buf_t *bp) /* new buffer to set */ |
771 | { | 770 | { |
772 | struct xfs_btree_block *b; /* btree block */ | 771 | struct xfs_btree_block *b; /* btree block */ |
773 | xfs_buf_t *obp; /* old buffer pointer */ | ||
774 | 772 | ||
775 | obp = cur->bc_bufs[lev]; | 773 | if (cur->bc_bufs[lev]) |
776 | if (obp) | 774 | xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); |
777 | xfs_trans_brelse(cur->bc_tp, obp); | ||
778 | cur->bc_bufs[lev] = bp; | 775 | cur->bc_bufs[lev] = bp; |
779 | cur->bc_ra[lev] = 0; | 776 | cur->bc_ra[lev] = 0; |
780 | if (!bp) | 777 | |
781 | return; | ||
782 | b = XFS_BUF_TO_BLOCK(bp); | 778 | b = XFS_BUF_TO_BLOCK(bp); |
783 | if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { | 779 | if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { |
784 | if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) | 780 | if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) |
@@ -947,13 +943,13 @@ xfs_btree_set_refs( | |||
947 | switch (cur->bc_btnum) { | 943 | switch (cur->bc_btnum) { |
948 | case XFS_BTNUM_BNO: | 944 | case XFS_BTNUM_BNO: |
949 | case XFS_BTNUM_CNT: | 945 | case XFS_BTNUM_CNT: |
950 | XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); | 946 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); |
951 | break; | 947 | break; |
952 | case XFS_BTNUM_INO: | 948 | case XFS_BTNUM_INO: |
953 | XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); | 949 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); |
954 | break; | 950 | break; |
955 | case XFS_BTNUM_BMAP: | 951 | case XFS_BTNUM_BMAP: |
956 | XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); | 952 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); |
957 | break; | 953 | break; |
958 | default: | 954 | default: |
959 | ASSERT(0); | 955 | ASSERT(0); |
@@ -3011,6 +3007,43 @@ out0: | |||
3011 | return 0; | 3007 | return 0; |
3012 | } | 3008 | } |
3013 | 3009 | ||
3010 | /* | ||
3011 | * Kill the current root node and replace it with its only child node. | ||
3012 | */ | ||
3013 | STATIC int | ||
3014 | xfs_btree_kill_root( | ||
3015 | struct xfs_btree_cur *cur, | ||
3016 | struct xfs_buf *bp, | ||
3017 | int level, | ||
3018 | union xfs_btree_ptr *newroot) | ||
3019 | { | ||
3020 | int error; | ||
3021 | |||
3022 | XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); | ||
3023 | XFS_BTREE_STATS_INC(cur, killroot); | ||
3024 | |||
3025 | /* | ||
3026 | * Update the root pointer, decreasing the level by 1 and then | ||
3027 | * free the old root. | ||
3028 | */ | ||
3029 | cur->bc_ops->set_root(cur, newroot, -1); | ||
3030 | |||
3031 | error = cur->bc_ops->free_block(cur, bp); | ||
3032 | if (error) { | ||
3033 | XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); | ||
3034 | return error; | ||
3035 | } | ||
3036 | |||
3037 | XFS_BTREE_STATS_INC(cur, free); | ||
3038 | |||
3039 | cur->bc_bufs[level] = NULL; | ||
3040 | cur->bc_ra[level] = 0; | ||
3041 | cur->bc_nlevels--; | ||
3042 | |||
3043 | XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); | ||
3044 | return 0; | ||
3045 | } | ||
3046 | |||
3014 | STATIC int | 3047 | STATIC int |
3015 | xfs_btree_dec_cursor( | 3048 | xfs_btree_dec_cursor( |
3016 | struct xfs_btree_cur *cur, | 3049 | struct xfs_btree_cur *cur, |
@@ -3195,7 +3228,7 @@ xfs_btree_delrec( | |||
3195 | * Make it the new root of the btree. | 3228 | * Make it the new root of the btree. |
3196 | */ | 3229 | */ |
3197 | pp = xfs_btree_ptr_addr(cur, 1, block); | 3230 | pp = xfs_btree_ptr_addr(cur, 1, block); |
3198 | error = cur->bc_ops->kill_root(cur, bp, level, pp); | 3231 | error = xfs_btree_kill_root(cur, bp, level, pp); |
3199 | if (error) | 3232 | if (error) |
3200 | goto error0; | 3233 | goto error0; |
3201 | } else if (level > 0) { | 3234 | } else if (level > 0) { |
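
Dropping the per-btree kill_root method works because, as the new common xfs_btree_kill_root() shows, collapsing the root into its only child needs nothing btree-specific beyond the existing set_root and free_block operations. A hedged sketch of that ops-table reduction, with illustrative types:

#include <stdio.h>

struct cursor;

/* reduced ops table: no per-btree kill_root method needed any more */
struct btree_ops {
    void (*set_root)(struct cursor *cur, int new_root, int level_change);
    int  (*free_block)(struct cursor *cur, int block);
};

struct cursor {
    const struct btree_ops *ops;
    int root;
    int nlevels;
};

/* generic root collapse, built only from set_root + free_block */
static int kill_root(struct cursor *cur, int old_root_block, int new_root)
{
    int error;

    cur->ops->set_root(cur, new_root, -1);  /* point at child, level-- */
    error = cur->ops->free_block(cur, old_root_block);
    if (error)
        return error;
    cur->nlevels--;
    return 0;
}

static void demo_set_root(struct cursor *cur, int new_root, int level_change)
{
    cur->root = new_root;
    (void)level_change;
}

static int demo_free_block(struct cursor *cur, int block)
{
    (void)cur;
    printf("freed block %d\n", block);
    return 0;
}

static const struct btree_ops demo_ops = { demo_set_root, demo_free_block };

int main(void)
{
    struct cursor cur = { &demo_ops, 7, 2 };

    kill_root(&cur, 7, 3);
    printf("root=%d nlevels=%d\n", cur.root, cur.nlevels);
    return 0;
}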
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 7fa07062bdda..82fafc66bd1f 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h | |||
@@ -152,9 +152,7 @@ struct xfs_btree_ops { | |||
152 | 152 | ||
153 | /* update btree root pointer */ | 153 | /* update btree root pointer */ |
154 | void (*set_root)(struct xfs_btree_cur *cur, | 154 | void (*set_root)(struct xfs_btree_cur *cur, |
155 | union xfs_btree_ptr *nptr, int level_change); | 155 | union xfs_btree_ptr *nptr, int level_change); |
156 | int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp, | ||
157 | int level, union xfs_btree_ptr *newroot); | ||
158 | 156 | ||
159 | /* block allocation / freeing */ | 157 | /* block allocation / freeing */ |
160 | int (*alloc_block)(struct xfs_btree_cur *cur, | 158 | int (*alloc_block)(struct xfs_btree_cur *cur, |
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs( | |||
399 | xfs_agblock_t agbno, /* allocation group block number */ | 397 | xfs_agblock_t agbno, /* allocation group block number */ |
400 | xfs_extlen_t count); /* count of filesystem blocks */ | 398 | xfs_extlen_t count); /* count of filesystem blocks */ |
401 | 399 | ||
402 | /* | ||
403 | * Set the buffer for level "lev" in the cursor to bp, releasing | ||
404 | * any previous buffer. | ||
405 | */ | ||
406 | void | ||
407 | xfs_btree_setbuf( | ||
408 | xfs_btree_cur_t *cur, /* btree cursor */ | ||
409 | int lev, /* level in btree */ | ||
410 | struct xfs_buf *bp); /* new buffer to set */ | ||
411 | |||
412 | 400 | ||
413 | /* | 401 | /* |
414 | * Common btree core entry points. | 402 | * Common btree core entry points. |
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1b09d7a280df..7b7e005e3dcc 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c | |||
@@ -130,10 +130,12 @@ xfs_buf_item_log_check( | |||
130 | orig = bip->bli_orig; | 130 | orig = bip->bli_orig; |
131 | buffer = XFS_BUF_PTR(bp); | 131 | buffer = XFS_BUF_PTR(bp); |
132 | for (x = 0; x < XFS_BUF_COUNT(bp); x++) { | 132 | for (x = 0; x < XFS_BUF_COUNT(bp); x++) { |
133 | if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) | 133 | if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { |
134 | cmn_err(CE_PANIC, | 134 | xfs_emerg(bp->b_mount, |
135 | "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", | 135 | "%s: bip %x buffer %x orig %x index %d", |
136 | bip, bp, orig, x); | 136 | __func__, bip, bp, orig, x); |
137 | ASSERT(0); | ||
138 | } | ||
137 | } | 139 | } |
138 | } | 140 | } |
139 | #else | 141 | #else |
@@ -141,8 +143,7 @@ xfs_buf_item_log_check( | |||
141 | #define xfs_buf_item_log_check(x) | 143 | #define xfs_buf_item_log_check(x) |
142 | #endif | 144 | #endif |
143 | 145 | ||
144 | STATIC void xfs_buf_error_relse(xfs_buf_t *bp); | 146 | STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); |
145 | STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); | ||
146 | 147 | ||
147 | /* | 148 | /* |
148 | * This returns the number of log iovecs needed to log the | 149 | * This returns the number of log iovecs needed to log the |
@@ -428,13 +429,15 @@ xfs_buf_item_unpin( | |||
428 | 429 | ||
429 | if (remove) { | 430 | if (remove) { |
430 | /* | 431 | /* |
431 | * We have to remove the log item from the transaction | 432 | * If we are in a transaction context, we have to |
432 | * as we are about to release our reference to the | 433 | * remove the log item from the transaction as we are |
433 | * buffer. If we don't, the unlock that occurs later | 434 | * about to release our reference to the buffer. If we |
434 | * in xfs_trans_uncommit() will ry to reference the | 435 | * don't, the unlock that occurs later in |
436 | * xfs_trans_uncommit() will try to reference the | ||
435 | * buffer which we no longer have a hold on. | 437 | * buffer which we no longer have a hold on. |
436 | */ | 438 | */ |
437 | xfs_trans_del_item(lip); | 439 | if (lip->li_desc) |
440 | xfs_trans_del_item(lip); | ||
438 | 441 | ||
439 | /* | 442 | /* |
440 | * Since the transaction no longer refers to the buffer, | 443 | * Since the transaction no longer refers to the buffer, |
@@ -450,7 +453,7 @@ xfs_buf_item_unpin( | |||
450 | * xfs_trans_ail_delete() drops the AIL lock. | 453 | * xfs_trans_ail_delete() drops the AIL lock. |
451 | */ | 454 | */ |
452 | if (bip->bli_flags & XFS_BLI_STALE_INODE) { | 455 | if (bip->bli_flags & XFS_BLI_STALE_INODE) { |
453 | xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); | 456 | xfs_buf_do_callbacks(bp); |
454 | XFS_BUF_SET_FSPRIVATE(bp, NULL); | 457 | XFS_BUF_SET_FSPRIVATE(bp, NULL); |
455 | XFS_BUF_CLR_IODONE_FUNC(bp); | 458 | XFS_BUF_CLR_IODONE_FUNC(bp); |
456 | } else { | 459 | } else { |
@@ -692,8 +695,7 @@ xfs_buf_item_init( | |||
692 | * the first. If we do already have one, there is | 695 | * the first. If we do already have one, there is |
693 | * nothing to do here so return. | 696 | * nothing to do here so return. |
694 | */ | 697 | */ |
695 | if (bp->b_mount != mp) | 698 | ASSERT(bp->b_target->bt_mount == mp); |
696 | bp->b_mount = mp; | ||
697 | if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { | 699 | if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { |
698 | lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); | 700 | lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); |
699 | if (lip->li_type == XFS_LI_BUF) { | 701 | if (lip->li_type == XFS_LI_BUF) { |
@@ -919,15 +921,26 @@ xfs_buf_attach_iodone( | |||
919 | XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); | 921 | XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); |
920 | } | 922 | } |
921 | 923 | ||
924 | /* | ||
925 | * We can have many callbacks on a buffer. Running the callbacks individually | ||
926 | * can cause a lot of contention on the AIL lock, so we allow for a single | ||
927 | * callback to scan the remaining lip->li_bio_list for other items that share | ||
928 | * the same type and callback, and to process them all in the first call. | ||
929 | * | ||
930 | * As a result, the loop walking the callback list below will also modify the | ||
931 | * list. It removes the first item from the list and then runs the callback. | ||
932 | * The loop then restarts from the new head of the list. This allows the | ||
933 | * callback to scan and modify the list attached to the buffer and we don't | ||
934 | * have to care about maintaining a next item pointer. | ||
935 | */ | ||
922 | STATIC void | 936 | STATIC void |
923 | xfs_buf_do_callbacks( | 937 | xfs_buf_do_callbacks( |
924 | xfs_buf_t *bp, | 938 | struct xfs_buf *bp) |
925 | xfs_log_item_t *lip) | ||
926 | { | 939 | { |
927 | xfs_log_item_t *nlip; | 940 | struct xfs_log_item *lip; |
928 | 941 | ||
929 | while (lip != NULL) { | 942 | while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) { |
930 | nlip = lip->li_bio_list; | 943 | XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list); |
931 | ASSERT(lip->li_cb != NULL); | 944 | ASSERT(lip->li_cb != NULL); |
932 | /* | 945 | /* |
933 | * Clear the next pointer so we don't have any | 946 | * Clear the next pointer so we don't have any |
@@ -937,7 +950,6 @@ xfs_buf_do_callbacks( | |||
937 | */ | 950 | */ |
938 | lip->li_bio_list = NULL; | 951 | lip->li_bio_list = NULL; |
939 | lip->li_cb(bp, lip); | 952 | lip->li_cb(bp, lip); |
940 | lip = nlip; | ||
941 | } | 953 | } |
942 | } | 954 | } |
943 | 955 | ||
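
The rewritten xfs_buf_do_callbacks() re-reads the list head on every iteration rather than caching a next pointer, because a callback is now allowed to unhook further items behind the loop's back. The same pop-head pattern, as a small self-contained sketch:

#include <stdio.h>
#include <stddef.h>

struct item {
    struct item *next;
    void (*cb)(struct item *ip);
};

static void greet(struct item *ip)
{
    printf("callback on %p\n", (void *)ip);
}

/* pop the head, clear its link, run it, then re-read the new head:
 * safe even if a callback removed more items while it ran */
static void do_callbacks(struct item **headp)
{
    struct item *ip;

    while ((ip = *headp) != NULL) {
        *headp = ip->next;
        ip->next = NULL;        /* item must not see stale links */
        ip->cb(ip);
    }
}

int main(void)
{
    struct item b = { NULL, greet };
    struct item a = { &b, greet };
    struct item *head = &a;

    do_callbacks(&head);
    return 0;
}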
@@ -950,128 +962,75 @@ xfs_buf_do_callbacks( | |||
950 | */ | 962 | */ |
951 | void | 963 | void |
952 | xfs_buf_iodone_callbacks( | 964 | xfs_buf_iodone_callbacks( |
953 | xfs_buf_t *bp) | 965 | struct xfs_buf *bp) |
954 | { | 966 | { |
955 | xfs_log_item_t *lip; | 967 | struct xfs_log_item *lip = bp->b_fspriv; |
956 | static ulong lasttime; | 968 | struct xfs_mount *mp = lip->li_mountp; |
957 | static xfs_buftarg_t *lasttarg; | 969 | static ulong lasttime; |
958 | xfs_mount_t *mp; | 970 | static xfs_buftarg_t *lasttarg; |
959 | 971 | ||
960 | ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); | 972 | if (likely(!XFS_BUF_GETERROR(bp))) |
961 | lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); | 973 | goto do_callbacks; |
962 | 974 | ||
963 | if (XFS_BUF_GETERROR(bp) != 0) { | 975 | /* |
964 | /* | 976 | * If we've already decided to shut down the filesystem because of |
965 | * If we've already decided to shutdown the filesystem | 977 | * I/O errors, there's no point in giving this a retry. |
966 | * because of IO errors, there's no point in giving this | 978 | */ |
967 | * a retry. | 979 | if (XFS_FORCED_SHUTDOWN(mp)) { |
968 | */ | 980 | XFS_BUF_SUPER_STALE(bp); |
969 | mp = lip->li_mountp; | 981 | trace_xfs_buf_item_iodone(bp, _RET_IP_); |
970 | if (XFS_FORCED_SHUTDOWN(mp)) { | 982 | goto do_callbacks; |
971 | ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); | 983 | } |
972 | XFS_BUF_SUPER_STALE(bp); | ||
973 | trace_xfs_buf_item_iodone(bp, _RET_IP_); | ||
974 | xfs_buf_do_callbacks(bp, lip); | ||
975 | XFS_BUF_SET_FSPRIVATE(bp, NULL); | ||
976 | XFS_BUF_CLR_IODONE_FUNC(bp); | ||
977 | xfs_biodone(bp); | ||
978 | return; | ||
979 | } | ||
980 | 984 | ||
981 | if ((XFS_BUF_TARGET(bp) != lasttarg) || | 985 | if (XFS_BUF_TARGET(bp) != lasttarg || |
982 | (time_after(jiffies, (lasttime + 5*HZ)))) { | 986 | time_after(jiffies, (lasttime + 5*HZ))) { |
983 | lasttime = jiffies; | 987 | lasttime = jiffies; |
984 | cmn_err(CE_ALERT, "Device %s, XFS metadata write error" | 988 | xfs_alert(mp, "Device %s: metadata write error block 0x%llx", |
985 | " block 0x%llx in %s", | 989 | XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), |
986 | XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), | 990 | (__uint64_t)XFS_BUF_ADDR(bp)); |
987 | (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); | 991 | } |
988 | } | 992 | lasttarg = XFS_BUF_TARGET(bp); |
989 | lasttarg = XFS_BUF_TARGET(bp); | ||
990 | 993 | ||
991 | if (XFS_BUF_ISASYNC(bp)) { | 994 | /* |
992 | /* | 995 | * If the write was asynchronous then no one will be looking for the |
993 | * If the write was asynchronous then noone will be | 996 | * error. Clear the error state and write the buffer out again. |
994 | * looking for the error. Clear the error state | 997 | * |
995 | * and write the buffer out again delayed write. | 998 | * During sync or umount we'll write all pending buffers again |
996 | * | 999 | * synchronous, which will catch these errors if they keep hanging |
997 | * XXXsup This is OK, so long as we catch these | 1000 | * around. |
998 | * before we start the umount; we don't want these | 1001 | */ |
999 | * DELWRI metadata bufs to be hanging around. | 1002 | if (XFS_BUF_ISASYNC(bp)) { |
1000 | */ | 1003 | XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ |
1001 | XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ | 1004 | |
1002 | 1005 | if (!XFS_BUF_ISSTALE(bp)) { | |
1003 | if (!(XFS_BUF_ISSTALE(bp))) { | 1006 | XFS_BUF_DELAYWRITE(bp); |
1004 | XFS_BUF_DELAYWRITE(bp); | ||
1005 | XFS_BUF_DONE(bp); | ||
1006 | XFS_BUF_SET_START(bp); | ||
1007 | } | ||
1008 | ASSERT(XFS_BUF_IODONE_FUNC(bp)); | ||
1009 | trace_xfs_buf_item_iodone_async(bp, _RET_IP_); | ||
1010 | xfs_buf_relse(bp); | ||
1011 | } else { | ||
1012 | /* | ||
1013 | * If the write of the buffer was not asynchronous, | ||
1014 | * then we want to make sure to return the error | ||
1015 | * to the caller of bwrite(). Because of this we | ||
1016 | * cannot clear the B_ERROR state at this point. | ||
1017 | * Instead we install a callback function that | ||
1018 | * will be called when the buffer is released, and | ||
1019 | * that routine will clear the error state and | ||
1020 | * set the buffer to be written out again after | ||
1021 | * some delay. | ||
1022 | */ | ||
1023 | /* We actually overwrite the existing b-relse | ||
1024 | function at times, but we're gonna be shutting down | ||
1025 | anyway. */ | ||
1026 | XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); | ||
1027 | XFS_BUF_DONE(bp); | 1007 | XFS_BUF_DONE(bp); |
1028 | XFS_BUF_FINISH_IOWAIT(bp); | 1008 | XFS_BUF_SET_START(bp); |
1029 | } | 1009 | } |
1010 | ASSERT(XFS_BUF_IODONE_FUNC(bp)); | ||
1011 | trace_xfs_buf_item_iodone_async(bp, _RET_IP_); | ||
1012 | xfs_buf_relse(bp); | ||
1030 | return; | 1013 | return; |
1031 | } | 1014 | } |
1032 | 1015 | ||
1033 | xfs_buf_do_callbacks(bp, lip); | 1016 | /* |
1034 | XFS_BUF_SET_FSPRIVATE(bp, NULL); | 1017 | * If the write of the buffer was synchronous, we want to make |
1035 | XFS_BUF_CLR_IODONE_FUNC(bp); | 1018 | * sure to return the error to the caller of xfs_bwrite(). |
1036 | xfs_biodone(bp); | 1019 | */ |
1037 | } | ||
1038 | |||
1039 | /* | ||
1040 | * This is a callback routine attached to a buffer which gets an error | ||
1041 | * when being written out synchronously. | ||
1042 | */ | ||
1043 | STATIC void | ||
1044 | xfs_buf_error_relse( | ||
1045 | xfs_buf_t *bp) | ||
1046 | { | ||
1047 | xfs_log_item_t *lip; | ||
1048 | xfs_mount_t *mp; | ||
1049 | |||
1050 | lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); | ||
1051 | mp = (xfs_mount_t *)lip->li_mountp; | ||
1052 | ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); | ||
1053 | |||
1054 | XFS_BUF_STALE(bp); | 1020 | XFS_BUF_STALE(bp); |
1055 | XFS_BUF_DONE(bp); | 1021 | XFS_BUF_DONE(bp); |
1056 | XFS_BUF_UNDELAYWRITE(bp); | 1022 | XFS_BUF_UNDELAYWRITE(bp); |
1057 | XFS_BUF_ERROR(bp,0); | ||
1058 | 1023 | ||
1059 | trace_xfs_buf_error_relse(bp, _RET_IP_); | 1024 | trace_xfs_buf_error_relse(bp, _RET_IP_); |
1025 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); | ||
1060 | 1026 | ||
1061 | if (! XFS_FORCED_SHUTDOWN(mp)) | 1027 | do_callbacks: |
1062 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); | 1028 | xfs_buf_do_callbacks(bp); |
1063 | /* | ||
1064 | * We have to unpin the pinned buffers so do the | ||
1065 | * callbacks. | ||
1066 | */ | ||
1067 | xfs_buf_do_callbacks(bp, lip); | ||
1068 | XFS_BUF_SET_FSPRIVATE(bp, NULL); | 1029 | XFS_BUF_SET_FSPRIVATE(bp, NULL); |
1069 | XFS_BUF_CLR_IODONE_FUNC(bp); | 1030 | XFS_BUF_CLR_IODONE_FUNC(bp); |
1070 | XFS_BUF_SET_BRELSE_FUNC(bp,NULL); | 1031 | xfs_buf_ioend(bp, 0); |
1071 | xfs_buf_relse(bp); | ||
1072 | } | 1032 | } |
1073 | 1033 | ||
1074 | |||
1075 | /* | 1034 | /* |
1076 | * This is the iodone() function for buffers which have been | 1035 | * This is the iodone() function for buffers which have been |
1077 | * logged. It is called when they are eventually flushed out. | 1036 | * logged. It is called when they are eventually flushed out. |
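
The consolidated error path above behaves differently for the two write modes: an async write error is cleared and the buffer re-queued for delayed write-out, while a sync write error marks the buffer stale and forces a shutdown so the caller of xfs_bwrite() sees the failure. A compact sketch of that decision, with hypothetical requeue/shutdown helpers:

#include <stdio.h>

struct buf {
    int error;
    int async;
    int stale;
};

static void requeue_delwri(struct buf *bp)
{
    (void)bp;
    printf("requeued buffer for delayed write\n");
}

static void shutdown_fs(void)
{
    printf("forcing filesystem shutdown\n");
}

/* mirrors the shape of the new xfs_buf_iodone_callbacks() error path */
static int handle_write_error(struct buf *bp)
{
    if (!bp->error)
        return 0;                       /* fast path: no error */

    if (bp->async) {
        bp->error = 0;                  /* nobody is waiting on it */
        if (!bp->stale)
            requeue_delwri(bp);         /* try again later */
        return 0;
    }

    bp->stale = 1;                      /* sync: caller sees the error */
    shutdown_fs();
    return -1;
}

int main(void)
{
    struct buf async_bp = { 5, 1, 0 };
    struct buf sync_bp  = { 5, 0, 0 };

    handle_write_error(&async_bp);
    return handle_write_error(&sync_bp) ? 1 : 0;
}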
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 0e2ed43f16c7..b6ecd2061e7c 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h | |||
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item { | |||
105 | xfs_buf_log_format_t bli_format; /* in-log header */ | 105 | xfs_buf_log_format_t bli_format; /* in-log header */ |
106 | } xfs_buf_log_item_t; | 106 | } xfs_buf_log_item_t; |
107 | 107 | ||
108 | /* | ||
109 | * This structure is used during recovery to record the buf log | ||
110 | * items which have been canceled and should not be replayed. | ||
111 | */ | ||
112 | typedef struct xfs_buf_cancel { | ||
113 | xfs_daddr_t bc_blkno; | ||
114 | uint bc_len; | ||
115 | int bc_refcount; | ||
116 | struct xfs_buf_cancel *bc_next; | ||
117 | } xfs_buf_cancel_t; | ||
118 | |||
119 | void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); | 108 | void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); |
120 | void xfs_buf_item_relse(struct xfs_buf *); | 109 | void xfs_buf_item_relse(struct xfs_buf *); |
121 | void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); | 110 | void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); |
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 30fa0e206fba..6102ac6d1dff 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c | |||
@@ -1995,13 +1995,12 @@ xfs_da_do_buf( | |||
1995 | error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); | 1995 | error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); |
1996 | if (unlikely(error == EFSCORRUPTED)) { | 1996 | if (unlikely(error == EFSCORRUPTED)) { |
1997 | if (xfs_error_level >= XFS_ERRLEVEL_LOW) { | 1997 | if (xfs_error_level >= XFS_ERRLEVEL_LOW) { |
1998 | cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", | 1998 | xfs_alert(mp, "%s: bno %lld dir: inode %lld", |
1999 | (long long)bno); | 1999 | __func__, (long long)bno, |
2000 | cmn_err(CE_ALERT, "dir: inode %lld\n", | ||
2001 | (long long)dp->i_ino); | 2000 | (long long)dp->i_ino); |
2002 | for (i = 0; i < nmap; i++) { | 2001 | for (i = 0; i < nmap; i++) { |
2003 | cmn_err(CE_ALERT, | 2002 | xfs_alert(mp, |
2004 | "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", | 2003 | "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", |
2005 | i, | 2004 | i, |
2006 | (long long)mapp[i].br_startoff, | 2005 | (long long)mapp[i].br_startoff, |
2007 | (long long)mapp[i].br_startblock, | 2006 | (long long)mapp[i].br_startblock, |
@@ -2042,7 +2041,7 @@ xfs_da_do_buf( | |||
2042 | mappedbno, nmapped, 0, &bp); | 2041 | mappedbno, nmapped, 0, &bp); |
2043 | break; | 2042 | break; |
2044 | case 3: | 2043 | case 3: |
2045 | xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); | 2044 | xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped); |
2046 | error = 0; | 2045 | error = 0; |
2047 | bp = NULL; | 2046 | bp = NULL; |
2048 | break; | 2047 | break; |
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 3b9582c60a22..9a84a85c03b1 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c | |||
@@ -202,7 +202,7 @@ xfs_swap_extents( | |||
202 | xfs_inode_t *tip, /* tmp inode */ | 202 | xfs_inode_t *tip, /* tmp inode */ |
203 | xfs_swapext_t *sxp) | 203 | xfs_swapext_t *sxp) |
204 | { | 204 | { |
205 | xfs_mount_t *mp; | 205 | xfs_mount_t *mp = ip->i_mount; |
206 | xfs_trans_t *tp; | 206 | xfs_trans_t *tp; |
207 | xfs_bstat_t *sbp = &sxp->sx_stat; | 207 | xfs_bstat_t *sbp = &sxp->sx_stat; |
208 | xfs_ifork_t *tempifp, *ifp, *tifp; | 208 | xfs_ifork_t *tempifp, *ifp, *tifp; |
@@ -212,16 +212,12 @@ xfs_swap_extents( | |||
212 | int taforkblks = 0; | 212 | int taforkblks = 0; |
213 | __uint64_t tmp; | 213 | __uint64_t tmp; |
214 | 214 | ||
215 | mp = ip->i_mount; | ||
216 | |||
217 | tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); | 215 | tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); |
218 | if (!tempifp) { | 216 | if (!tempifp) { |
219 | error = XFS_ERROR(ENOMEM); | 217 | error = XFS_ERROR(ENOMEM); |
220 | goto out; | 218 | goto out; |
221 | } | 219 | } |
222 | 220 | ||
223 | sbp = &sxp->sx_stat; | ||
224 | |||
225 | /* | 221 | /* |
226 | * we have to do two separate lock calls here to keep lockdep | 222 | * we have to do two separate lock calls here to keep lockdep |
227 | * happy. If we try to get all the locks in one call, lock will | 223 | * happy. If we try to get all the locks in one call, lock will |
@@ -270,9 +266,9 @@ xfs_swap_extents( | |||
270 | /* check inode formats now that data is flushed */ | 266 | /* check inode formats now that data is flushed */ |
271 | error = xfs_swap_extents_check_format(ip, tip); | 267 | error = xfs_swap_extents_check_format(ip, tip); |
272 | if (error) { | 268 | if (error) { |
273 | xfs_fs_cmn_err(CE_NOTE, mp, | 269 | xfs_notice(mp, |
274 | "%s: inode 0x%llx format is incompatible for exchanging.", | 270 | "%s: inode 0x%llx format is incompatible for exchanging.", |
275 | __FILE__, ip->i_ino); | 271 | __func__, ip->i_ino); |
276 | goto out_unlock; | 272 | goto out_unlock; |
277 | } | 273 | } |
278 | 274 | ||
@@ -377,6 +373,19 @@ xfs_swap_extents( | |||
377 | ip->i_d.di_format = tip->i_d.di_format; | 373 | ip->i_d.di_format = tip->i_d.di_format; |
378 | tip->i_d.di_format = tmp; | 374 | tip->i_d.di_format = tmp; |
379 | 375 | ||
376 | /* | ||
377 | * The extents in the source inode could still contain speculative | ||
378 | * preallocation beyond EOF (e.g. the file is open but not modified | ||
379 | * while defrag is in progress). In that case, we need to copy over the | ||
380 | * number of delalloc blocks the data fork in the source inode is | ||
381 | * tracking beyond EOF so that when the fork is truncated away when the | ||
382 | * temporary inode is unlinked we don't underrun the i_delayed_blks | ||
383 | * counter on that inode. | ||
384 | */ | ||
385 | ASSERT(tip->i_delayed_blks == 0); | ||
386 | tip->i_delayed_blks = ip->i_delayed_blks; | ||
387 | ip->i_delayed_blks = 0; | ||
388 | |||
380 | ilf_fields = XFS_ILOG_CORE; | 389 | ilf_fields = XFS_ILOG_CORE; |
381 | 390 | ||
382 | switch(ip->i_d.di_format) { | 391 | switch(ip->i_d.di_format) { |
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index e5b153b2e6a3..dffba9ba0db6 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h | |||
@@ -49,8 +49,9 @@ typedef struct xfs_dinode { | |||
49 | __be32 di_uid; /* owner's user id */ | 49 | __be32 di_uid; /* owner's user id */ |
50 | __be32 di_gid; /* owner's group id */ | 50 | __be32 di_gid; /* owner's group id */ |
51 | __be32 di_nlink; /* number of links to file */ | 51 | __be32 di_nlink; /* number of links to file */ |
52 | __be16 di_projid; /* owner's project id */ | 52 | __be16 di_projid_lo; /* lower part of owner's project id */ |
53 | __u8 di_pad[8]; /* unused, zeroed space */ | 53 | __be16 di_projid_hi; /* higher part of owner's project id */ |
54 | __u8 di_pad[6]; /* unused, zeroed space */ | ||
54 | __be16 di_flushiter; /* incremented on flush */ | 55 | __be16 di_flushiter; /* incremented on flush */ |
55 | xfs_timestamp_t di_atime; /* time last accessed */ | 56 | xfs_timestamp_t di_atime; /* time last accessed */ |
56 | xfs_timestamp_t di_mtime; /* time last modified */ | 57 | xfs_timestamp_t di_mtime; /* time last modified */ |
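
Splitting di_projid into two __be16 halves grows project IDs to 32 bits without changing the on-disk layout, since the high half occupies bytes that were previously padding. A sketch of how the halves combine and split (helper names are illustrative, not the kernel's):

#include <stdio.h>
#include <stdint.h>

/* hypothetical helpers: combine/split the on-disk 16-bit halves */
static uint32_t projid_combine(uint16_t lo, uint16_t hi)
{
    return ((uint32_t)hi << 16) | lo;
}

static void projid_split(uint32_t projid, uint16_t *lo, uint16_t *hi)
{
    *lo = (uint16_t)(projid & 0xffff);
    *hi = (uint16_t)(projid >> 16);
}

int main(void)
{
    uint16_t lo, hi;
    uint32_t id = projid_combine(0x1234, 0x0002);   /* 0x00021234 */

    projid_split(id, &lo, &hi);
    printf("id=0x%08x lo=0x%04x hi=0x%04x\n", id, lo, hi);
    return 0;
}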
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index a1321bc7f192..dba7a71cedf3 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c | |||
@@ -159,7 +159,7 @@ xfs_dir_ino_validate( | |||
159 | XFS_AGINO_TO_INO(mp, agno, agino) == ino; | 159 | XFS_AGINO_TO_INO(mp, agno, agino) == ino; |
160 | if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, | 160 | if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, |
161 | XFS_RANDOM_DIR_INO_VALIDATE))) { | 161 | XFS_RANDOM_DIR_INO_VALIDATE))) { |
162 | xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", | 162 | xfs_warn(mp, "Invalid inode number 0x%Lx", |
163 | (unsigned long long) ino); | 163 | (unsigned long long) ino); |
164 | XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); | 164 | XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); |
165 | return XFS_ERROR(EFSCORRUPTED); | 165 | return XFS_ERROR(EFSCORRUPTED); |
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 504be8640e91..ae891223be90 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c | |||
@@ -961,7 +961,7 @@ xfs_dir2_leaf_getdents( | |||
961 | if (i > ra_current && | 961 | if (i > ra_current && |
962 | map[ra_index].br_blockcount >= | 962 | map[ra_index].br_blockcount >= |
963 | mp->m_dirblkfsbs) { | 963 | mp->m_dirblkfsbs) { |
964 | xfs_baread(mp->m_ddev_targp, | 964 | xfs_buf_readahead(mp->m_ddev_targp, |
965 | XFS_FSB_TO_DADDR(mp, | 965 | XFS_FSB_TO_DADDR(mp, |
966 | map[ra_index].br_startblock + | 966 | map[ra_index].br_startblock + |
967 | ra_offset), | 967 | ra_offset), |
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index f9a0864b696a..a0aab7d3294f 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c | |||
@@ -899,10 +899,9 @@ xfs_dir2_leafn_rebalance( | |||
899 | if(blk2->index < 0) { | 899 | if(blk2->index < 0) { |
900 | state->inleaf = 1; | 900 | state->inleaf = 1; |
901 | blk2->index = 0; | 901 | blk2->index = 0; |
902 | cmn_err(CE_ALERT, | 902 | xfs_alert(args->dp->i_mount, |
903 | "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " | 903 | "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n", |
904 | "blk1->index %d\n", | 904 | __func__, blk1->index); |
905 | blk1->index); | ||
906 | } | 905 | } |
907 | } | 906 | } |
908 | 907 | ||
@@ -1641,26 +1640,22 @@ xfs_dir2_node_addname_int( | |||
1641 | } | 1640 | } |
1642 | 1641 | ||
1643 | if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { | 1642 | if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { |
1644 | cmn_err(CE_ALERT, | 1643 | xfs_alert(mp, |
1645 | "xfs_dir2_node_addname_int: dir ino " | 1644 | "%s: dir ino " "%llu needed freesp block %lld for\n" |
1646 | "%llu needed freesp block %lld for\n" | 1645 | " data block %lld, got %lld ifbno %llu lastfbno %d", |
1647 | " data block %lld, got %lld\n" | 1646 | __func__, (unsigned long long)dp->i_ino, |
1648 | " ifbno %llu lastfbno %d\n", | ||
1649 | (unsigned long long)dp->i_ino, | ||
1650 | (long long)xfs_dir2_db_to_fdb(mp, dbno), | 1647 | (long long)xfs_dir2_db_to_fdb(mp, dbno), |
1651 | (long long)dbno, (long long)fbno, | 1648 | (long long)dbno, (long long)fbno, |
1652 | (unsigned long long)ifbno, lastfbno); | 1649 | (unsigned long long)ifbno, lastfbno); |
1653 | if (fblk) { | 1650 | if (fblk) { |
1654 | cmn_err(CE_ALERT, | 1651 | xfs_alert(mp, |
1655 | " fblk 0x%p blkno %llu " | 1652 | " fblk 0x%p blkno %llu index %d magic 0x%x", |
1656 | "index %d magic 0x%x\n", | ||
1657 | fblk, | 1653 | fblk, |
1658 | (unsigned long long)fblk->blkno, | 1654 | (unsigned long long)fblk->blkno, |
1659 | fblk->index, | 1655 | fblk->index, |
1660 | fblk->magic); | 1656 | fblk->magic); |
1661 | } else { | 1657 | } else { |
1662 | cmn_err(CE_ALERT, | 1658 | xfs_alert(mp, " ... fblk is NULL"); |
1663 | " ... fblk is NULL\n"); | ||
1664 | } | 1659 | } |
1665 | XFS_ERROR_REPORT("xfs_dir2_node_addname_int", | 1660 | XFS_ERROR_REPORT("xfs_dir2_node_addname_int", |
1666 | XFS_ERRLEVEL_LOW, mp); | 1661 | XFS_ERRLEVEL_LOW, mp); |
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ed9990267661..39f06336b99d 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c | |||
@@ -48,7 +48,7 @@ xfs_error_trap(int e) | |||
48 | break; | 48 | break; |
49 | if (e != xfs_etrap[i]) | 49 | if (e != xfs_etrap[i]) |
50 | continue; | 50 | continue; |
51 | cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); | 51 | xfs_notice(NULL, "%s: error %d", __func__, e); |
52 | BUG(); | 52 | BUG(); |
53 | break; | 53 | break; |
54 | } | 54 | } |
@@ -58,6 +58,7 @@ xfs_error_trap(int e) | |||
58 | int xfs_etest[XFS_NUM_INJECT_ERROR]; | 58 | int xfs_etest[XFS_NUM_INJECT_ERROR]; |
59 | int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; | 59 | int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; |
60 | char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; | 60 | char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; |
61 | int xfs_error_test_active; | ||
61 | 62 | ||
62 | int | 63 | int |
63 | xfs_error_test(int error_tag, int *fsidp, char *expression, | 64 | xfs_error_test(int error_tag, int *fsidp, char *expression, |
@@ -73,7 +74,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, | |||
73 | 74 | ||
74 | for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { | 75 | for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { |
75 | if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { | 76 | if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { |
76 | cmn_err(CE_WARN, | 77 | xfs_warn(NULL, |
77 | "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", | 78 | "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", |
78 | expression, file, line, xfs_etest_fsname[i]); | 79 | expression, file, line, xfs_etest_fsname[i]); |
79 | return 1; | 80 | return 1; |
@@ -94,25 +95,26 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) | |||
94 | 95 | ||
95 | for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { | 96 | for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { |
96 | if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { | 97 | if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { |
97 | cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); | 98 | xfs_warn(mp, "error tag #%d on", error_tag); |
98 | return 0; | 99 | return 0; |
99 | } | 100 | } |
100 | } | 101 | } |
101 | 102 | ||
102 | for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { | 103 | for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { |
103 | if (xfs_etest[i] == 0) { | 104 | if (xfs_etest[i] == 0) { |
104 | cmn_err(CE_WARN, "Turned on XFS error tag #%d", | 105 | xfs_warn(mp, "Turned on XFS error tag #%d", |
105 | error_tag); | 106 | error_tag); |
106 | xfs_etest[i] = error_tag; | 107 | xfs_etest[i] = error_tag; |
107 | xfs_etest_fsid[i] = fsid; | 108 | xfs_etest_fsid[i] = fsid; |
108 | len = strlen(mp->m_fsname); | 109 | len = strlen(mp->m_fsname); |
109 | xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); | 110 | xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); |
110 | strcpy(xfs_etest_fsname[i], mp->m_fsname); | 111 | strcpy(xfs_etest_fsname[i], mp->m_fsname); |
112 | xfs_error_test_active++; | ||
111 | return 0; | 113 | return 0; |
112 | } | 114 | } |
113 | } | 115 | } |
114 | 116 | ||
115 | cmn_err(CE_WARN, "error tag overflow, too many turned on"); | 117 | xfs_warn(mp, "error tag overflow, too many turned on"); |
116 | 118 | ||
117 | return 1; | 119 | return 1; |
118 | } | 120 | } |
@@ -131,55 +133,23 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) | |||
131 | if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && | 133 | if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && |
132 | xfs_etest[i] != 0) { | 134 | xfs_etest[i] != 0) { |
133 | cleared = 1; | 135 | cleared = 1; |
134 | cmn_err(CE_WARN, "Clearing XFS error tag #%d", | 136 | xfs_warn(mp, "Clearing XFS error tag #%d", |
135 | xfs_etest[i]); | 137 | xfs_etest[i]); |
136 | xfs_etest[i] = 0; | 138 | xfs_etest[i] = 0; |
137 | xfs_etest_fsid[i] = 0LL; | 139 | xfs_etest_fsid[i] = 0LL; |
138 | kmem_free(xfs_etest_fsname[i]); | 140 | kmem_free(xfs_etest_fsname[i]); |
139 | xfs_etest_fsname[i] = NULL; | 141 | xfs_etest_fsname[i] = NULL; |
142 | xfs_error_test_active--; | ||
140 | } | 143 | } |
141 | } | 144 | } |
142 | 145 | ||
143 | if (loud || cleared) | 146 | if (loud || cleared) |
144 | cmn_err(CE_WARN, | 147 | xfs_warn(mp, "Cleared all XFS error tags for filesystem"); |
145 | "Cleared all XFS error tags for filesystem \"%s\"", | ||
146 | mp->m_fsname); | ||
147 | 148 | ||
148 | return 0; | 149 | return 0; |
149 | } | 150 | } |
150 | #endif /* DEBUG */ | 151 | #endif /* DEBUG */ |
151 | 152 | ||
152 | |||
153 | void | ||
154 | xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) | ||
155 | { | ||
156 | va_list ap; | ||
157 | |||
158 | va_start(ap, fmt); | ||
159 | xfs_fs_vcmn_err(level, mp, fmt, ap); | ||
160 | va_end(ap); | ||
161 | } | ||
162 | |||
163 | void | ||
164 | xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) | ||
165 | { | ||
166 | va_list ap; | ||
167 | |||
168 | #ifdef DEBUG | ||
169 | xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); | ||
170 | #endif | ||
171 | |||
172 | if (xfs_panic_mask && (xfs_panic_mask & panic_tag) | ||
173 | && (level & CE_ALERT)) { | ||
174 | level &= ~CE_ALERT; | ||
175 | level |= CE_PANIC; | ||
176 | cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG."); | ||
177 | } | ||
178 | va_start(ap, fmt); | ||
179 | xfs_fs_vcmn_err(level, mp, fmt, ap); | ||
180 | va_end(ap); | ||
181 | } | ||
182 | |||
183 | void | 153 | void |
184 | xfs_error_report( | 154 | xfs_error_report( |
185 | const char *tag, | 155 | const char *tag, |
@@ -190,9 +160,8 @@ xfs_error_report( | |||
190 | inst_t *ra) | 160 | inst_t *ra) |
191 | { | 161 | { |
192 | if (level <= xfs_error_level) { | 162 | if (level <= xfs_error_level) { |
193 | xfs_cmn_err(XFS_PTAG_ERROR_REPORT, | 163 | xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, |
194 | CE_ALERT, mp, | 164 | "Internal error %s at line %d of file %s. Caller 0x%p\n", |
195 | "XFS internal error %s at line %d of file %s. Caller 0x%p\n", | ||
196 | tag, linenum, filename, ra); | 165 | tag, linenum, filename, ra); |
197 | 166 | ||
198 | xfs_stack_trace(); | 167 | xfs_stack_trace(); |
@@ -212,4 +181,5 @@ xfs_corruption_error( | |||
212 | if (level <= xfs_error_level) | 181 | if (level <= xfs_error_level) |
213 | xfs_hex_dump(p, 16); | 182 | xfs_hex_dump(p, 16); |
214 | xfs_error_report(tag, level, mp, filename, linenum, ra); | 183 | xfs_error_report(tag, level, mp, filename, linenum, ra); |
184 | xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); | ||
215 | } | 185 | } |
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c2c1a072bb82..079a367f44ee 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h | |||
@@ -127,16 +127,17 @@ extern void xfs_corruption_error(const char *tag, int level, | |||
127 | #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT | 127 | #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT |
128 | 128 | ||
129 | #ifdef DEBUG | 129 | #ifdef DEBUG |
130 | extern int xfs_error_test_active; | ||
130 | extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); | 131 | extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); |
131 | 132 | ||
132 | #define XFS_NUM_INJECT_ERROR 10 | 133 | #define XFS_NUM_INJECT_ERROR 10 |
133 | #define XFS_TEST_ERROR(expr, mp, tag, rf) \ | 134 | #define XFS_TEST_ERROR(expr, mp, tag, rf) \ |
134 | ((expr) || \ | 135 | ((expr) || (xfs_error_test_active && \ |
135 | xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ | 136 | xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ |
136 | (rf))) | 137 | (rf)))) |
137 | 138 | ||
138 | extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); | 139 | extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); |
139 | extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); | 140 | extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); |
140 | #else | 141 | #else |
141 | #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) | 142 | #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) |
142 | #define xfs_errortag_add(tag, mp) (ENOSYS) | 143 | #define xfs_errortag_add(tag, mp) (ENOSYS) |
@@ -144,10 +145,8 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); | |||
144 | #endif /* DEBUG */ | 145 | #endif /* DEBUG */ |
145 | 146 | ||
146 | /* | 147 | /* |
147 | * XFS panic tags -- allow a call to xfs_cmn_err() be turned into | 148 | * XFS panic tags -- allow a call to xfs_alert_tag() be turned into |
148 | * a panic by setting xfs_panic_mask in a | 149 | * a panic by setting xfs_panic_mask in a sysctl. |
149 | * sysctl. update xfs_max[XFS_PARAM] if | ||
150 | * more are added. | ||
151 | */ | 150 | */ |
152 | #define XFS_NO_PTAG 0 | 151 | #define XFS_NO_PTAG 0 |
153 | #define XFS_PTAG_IFLUSH 0x00000001 | 152 | #define XFS_PTAG_IFLUSH 0x00000001 |
@@ -159,23 +158,4 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); | |||
159 | #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 | 158 | #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 |
160 | #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 | 159 | #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 |
161 | 160 | ||
162 | struct xfs_mount; | ||
163 | |||
164 | extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp, | ||
165 | char *fmt, va_list ap) | ||
166 | __attribute__ ((format (printf, 3, 0))); | ||
167 | extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, | ||
168 | char *fmt, ...) | ||
169 | __attribute__ ((format (printf, 4, 5))); | ||
170 | extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...) | ||
171 | __attribute__ ((format (printf, 3, 4))); | ||
172 | |||
173 | extern void xfs_hex_dump(void *p, int length); | ||
174 | |||
175 | #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ | ||
176 | xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) | ||
177 | |||
178 | #define xfs_fs_mount_cmn_err(f, fmt, args...) \ | ||
179 | ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) | ||
180 | |||
181 | #endif /* __XFS_ERROR_H__ */ | 161 | #endif /* __XFS_ERROR_H__ */ |
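
The reworked XFS_TEST_ERROR macro adds a cheap gate: xfs_error_test() is only called while at least one error tag is armed, which the xfs_error_test_active counter maintained by xfs_errortag_add()/xfs_errortag_clearall() tracks. A minimal sketch of that counter-gated macro pattern:

#include <stdio.h>

static int error_test_active;   /* bumped whenever a tag is armed */

/* the expensive path: only reached when a tag is actually armed */
static int error_test(int tag)
{
    printf("checking injection tag %d\n", tag);
    return tag == 42;
}

/* gate the call the way the new XFS_TEST_ERROR macro does */
#define TEST_ERROR(expr, tag) \
    ((expr) || (error_test_active && error_test(tag)))

int main(void)
{
    /* no tags armed: error_test() is never called */
    printf("%d\n", TEST_ERROR(0, 42));

    error_test_active++;    /* as xfs_errortag_add() would do */
    printf("%d\n", TEST_ERROR(0, 42));
    return 0;
}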
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a55e687bf562..d22e62623437 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c | |||
@@ -48,6 +48,28 @@ xfs_efi_item_free( | |||
48 | } | 48 | } |
49 | 49 | ||
50 | /* | 50 | /* |
51 | * Freeing the efi requires that we remove it from the AIL if it has already | ||
52 | * been placed there. However, the EFI may not yet have been placed in the AIL | ||
53 | * when called by xfs_efi_release() from EFD processing due to the ordering of | ||
54 | * committed vs unpin operations in bulk insert operations. Hence the | ||
55 | * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees | ||
56 | * the EFI. | ||
57 | */ | ||
58 | STATIC void | ||
59 | __xfs_efi_release( | ||
60 | struct xfs_efi_log_item *efip) | ||
61 | { | ||
62 | struct xfs_ail *ailp = efip->efi_item.li_ailp; | ||
63 | |||
64 | if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { | ||
65 | spin_lock(&ailp->xa_lock); | ||
66 | /* xfs_trans_ail_delete() drops the AIL lock. */ | ||
67 | xfs_trans_ail_delete(ailp, &efip->efi_item); | ||
68 | xfs_efi_item_free(efip); | ||
69 | } | ||
70 | } | ||
71 | |||
72 | /* | ||
51 | * This returns the number of iovecs needed to log the given efi item. | 73 | * This returns the number of iovecs needed to log the given efi item. |
52 | * We only need 1 iovec for an efi item. It just logs the efi_log_format | 74 | * We only need 1 iovec for an efi item. It just logs the efi_log_format |
53 | * structure. | 75 | * structure. |
@@ -74,7 +96,8 @@ xfs_efi_item_format( | |||
74 | struct xfs_efi_log_item *efip = EFI_ITEM(lip); | 96 | struct xfs_efi_log_item *efip = EFI_ITEM(lip); |
75 | uint size; | 97 | uint size; |
76 | 98 | ||
77 | ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); | 99 | ASSERT(atomic_read(&efip->efi_next_extent) == |
100 | efip->efi_format.efi_nextents); | ||
78 | 101 | ||
79 | efip->efi_format.efi_type = XFS_LI_EFI; | 102 | efip->efi_format.efi_type = XFS_LI_EFI; |
80 | 103 | ||
@@ -99,10 +122,12 @@ xfs_efi_item_pin( | |||
99 | } | 122 | } |
100 | 123 | ||
101 | /* | 124 | /* |
102 | * While EFIs cannot really be pinned, the unpin operation is the | 125 | * While EFIs cannot really be pinned, the unpin operation is the last place at |
103 | * last place at which the EFI is manipulated during a transaction. | 126 | * which the EFI is manipulated during a transaction. If we are being asked to |
104 | * Here we coordinate with xfs_efi_cancel() to determine who gets to | 127 | * remove the EFI, it's because the transaction has been cancelled and by |
105 | * free the EFI. | 128 | * definition that means the EFI cannot be in the AIL, so remove it from the |
129 | * transaction and free it. Otherwise coordinate with xfs_efi_release() (via | ||
130 | * XFS_EFI_COMMITTED) to determine who gets to free the EFI. | ||
106 | */ | 131 | */ |
107 | STATIC void | 132 | STATIC void |
108 | xfs_efi_item_unpin( | 133 | xfs_efi_item_unpin( |
@@ -110,20 +135,15 @@ xfs_efi_item_unpin( | |||
110 | int remove) | 135 | int remove) |
111 | { | 136 | { |
112 | struct xfs_efi_log_item *efip = EFI_ITEM(lip); | 137 | struct xfs_efi_log_item *efip = EFI_ITEM(lip); |
113 | struct xfs_ail *ailp = lip->li_ailp; | ||
114 | 138 | ||
115 | spin_lock(&ailp->xa_lock); | 139 | if (remove) { |
116 | if (efip->efi_flags & XFS_EFI_CANCELED) { | 140 | ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); |
117 | if (remove) | 141 | if (lip->li_desc) |
118 | xfs_trans_del_item(lip); | 142 | xfs_trans_del_item(lip); |
119 | |||
120 | /* xfs_trans_ail_delete() drops the AIL lock. */ | ||
121 | xfs_trans_ail_delete(ailp, lip); | ||
122 | xfs_efi_item_free(efip); | 143 | xfs_efi_item_free(efip); |
123 | } else { | 144 | return; |
124 | efip->efi_flags |= XFS_EFI_COMMITTED; | ||
125 | spin_unlock(&ailp->xa_lock); | ||
126 | } | 145 | } |
146 | __xfs_efi_release(efip); | ||
127 | } | 147 | } |
128 | 148 | ||
129 | /* | 149 | /* |
@@ -152,16 +172,20 @@ xfs_efi_item_unlock( | |||
152 | } | 172 | } |
153 | 173 | ||
154 | /* | 174 | /* |
155 | * The EFI is logged only once and cannot be moved in the log, so | 175 | * The EFI is logged only once and cannot be moved in the log, so simply return |
156 | * simply return the lsn at which it's been logged. The canceled | 176 | * the lsn at which it's been logged. For bulk transaction committed |
157 | * flag is not paid any attention here. Checking for that is delayed | 177 | * processing, the EFI may be processed but not yet unpinned prior to the EFD |
158 | * until the EFI is unpinned. | 178 | * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected |
179 | * when processing the EFD. | ||
159 | */ | 180 | */ |
160 | STATIC xfs_lsn_t | 181 | STATIC xfs_lsn_t |
161 | xfs_efi_item_committed( | 182 | xfs_efi_item_committed( |
162 | struct xfs_log_item *lip, | 183 | struct xfs_log_item *lip, |
163 | xfs_lsn_t lsn) | 184 | xfs_lsn_t lsn) |
164 | { | 185 | { |
186 | struct xfs_efi_log_item *efip = EFI_ITEM(lip); | ||
187 | |||
188 | set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); | ||
165 | return lsn; | 189 | return lsn; |
166 | } | 190 | } |
167 | 191 | ||
@@ -230,6 +254,7 @@ xfs_efi_init( | |||
230 | xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); | 254 | xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); |
231 | efip->efi_format.efi_nextents = nextents; | 255 | efip->efi_format.efi_nextents = nextents; |
232 | efip->efi_format.efi_id = (__psint_t)(void*)efip; | 256 | efip->efi_format.efi_id = (__psint_t)(void*)efip; |
257 | atomic_set(&efip->efi_next_extent, 0); | ||
233 | 258 | ||
234 | return efip; | 259 | return efip; |
235 | } | 260 | } |
@@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) | |||
289 | } | 314 | } |
290 | 315 | ||
291 | /* | 316 | /* |
292 | * This is called by the efd item code below to release references to | 317 | * This is called by the efd item code below to release references to the given |
293 | * the given efi item. Each efd calls this with the number of | 318 | * efi item. Each efd calls this with the number of extents that it has |
294 | * extents that it has logged, and when the sum of these reaches | 319 | * logged, and when the sum of these reaches the total number of extents logged |
295 | * the total number of extents logged by this efi item we can free | 320 | * by this efi item we can free the efi item. |
296 | * the efi item. | ||
297 | * | ||
298 | * Freeing the efi item requires that we remove it from the AIL. | ||
299 | * We'll use the AIL lock to protect our counters as well as | ||
300 | * the removal from the AIL. | ||
301 | */ | 321 | */ |
302 | void | 322 | void |
303 | xfs_efi_release(xfs_efi_log_item_t *efip, | 323 | xfs_efi_release(xfs_efi_log_item_t *efip, |
304 | uint nextents) | 324 | uint nextents) |
305 | { | 325 | { |
306 | struct xfs_ail *ailp = efip->efi_item.li_ailp; | 326 | ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); |
307 | int extents_left; | 327 | if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) |
308 | 328 | __xfs_efi_release(efip); | |
309 | ASSERT(efip->efi_next_extent > 0); | ||
310 | ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); | ||
311 | |||
312 | spin_lock(&ailp->xa_lock); | ||
313 | ASSERT(efip->efi_next_extent >= nextents); | ||
314 | efip->efi_next_extent -= nextents; | ||
315 | extents_left = efip->efi_next_extent; | ||
316 | if (extents_left == 0) { | ||
317 | /* xfs_trans_ail_delete() drops the AIL lock. */ | ||
318 | xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); | ||
319 | xfs_efi_item_free(efip); | ||
320 | } else { | ||
321 | spin_unlock(&ailp->xa_lock); | ||
322 | } | ||
323 | } | 329 | } |
324 | 330 | ||
325 | static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) | 331 | static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) |
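Taken together, these hunks drop the AIL lock from EFI release: the
extent count becomes an atomic_t, XFS_EFI_CANCELED goes away, and a
single COMMITTED bit arbitrates between the two possible last users
(unpin and EFD processing) -- whichever arrives second frees the item.
A hedged sketch of that handshake; the names and the free_item() helper
are illustrative, not the kernel's:

    #include <linux/atomic.h>
    #include <linux/bitops.h>

    #define ITEM_COMMITTED	0	/* bit number, not a mask */

    struct item {
    	atomic_t	remaining;	/* extents not yet released */
    	unsigned long	flags;
    };

    static void free_item(struct item *it)
    {
    	/* hypothetical teardown: remove from AIL if present, then free */
    }

    /*
     * COMMITTED is set once, at commit time.  Of the two release paths
     * that follow, the first to arrive clears the bit and backs off;
     * the second finds it already clear and knows it must free.
     */
    static void __item_release(struct item *it)
    {
    	if (!test_and_clear_bit(ITEM_COMMITTED, &it->flags))
    		free_item(it);
    }

    static void item_release(struct item *it, unsigned int nextents)
    {
    	/* only the call releasing the last logged extent proceeds */
    	if (atomic_sub_and_test(nextents, &it->remaining))
    		__item_release(it);
    }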
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0d22c56fdf64..375f68e42531 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h | |||
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 { | |||
111 | #define XFS_EFI_MAX_FAST_EXTENTS 16 | 111 | #define XFS_EFI_MAX_FAST_EXTENTS 16 |
112 | 112 | ||
113 | /* | 113 | /* |
114 | * Define EFI flags. | 114 | * Define EFI flag bits. Manipulated by set/clear/test_bit operators. |
115 | */ | 115 | */ |
116 | #define XFS_EFI_RECOVERED 0x1 | 116 | #define XFS_EFI_RECOVERED 1 |
117 | #define XFS_EFI_COMMITTED 0x2 | 117 | #define XFS_EFI_COMMITTED 2 |
118 | #define XFS_EFI_CANCELED 0x4 | ||
119 | 118 | ||
120 | /* | 119 | /* |
121 | * This is the "extent free intention" log item. It is used | 120 | * This is the "extent free intention" log item. It is used |
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 { | |||
125 | */ | 124 | */ |
126 | typedef struct xfs_efi_log_item { | 125 | typedef struct xfs_efi_log_item { |
127 | xfs_log_item_t efi_item; | 126 | xfs_log_item_t efi_item; |
128 | uint efi_flags; /* misc flags */ | 127 | atomic_t efi_next_extent; |
129 | uint efi_next_extent; | 128 | unsigned long efi_flags; /* misc flags */ |
130 | xfs_efi_log_format_t efi_format; | 129 | xfs_efi_log_format_t efi_format; |
131 | } xfs_efi_log_item_t; | 130 | } xfs_efi_log_item_t; |
132 | 131 | ||
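Note the quiet change of convention above: 0x1/0x2/0x4 were masks for
"flags |= ..." updates made under the AIL lock, while 1 and 2 are bit
numbers for set_bit()/test_bit(), which address bits by index within an
unsigned long. Mixing the two styles is an easy bug to write; a small
demonstration:

    #include <linux/bitops.h>
    #include <linux/bug.h>

    static void flag_style_demo(void)
    {
    	unsigned long flags = 0;

    	set_bit(2, &flags);	/* bit number 2, i.e. the value 0x4 */
    	/* flags |= 2;		   mask style would have set 0x2 */
    	BUG_ON(!test_bit(2, &flags));
    	BUG_ON(flags != 0x4);
    }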
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9b715dce5699..9124425b7f2f 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c | |||
@@ -744,9 +744,15 @@ xfs_filestream_new_ag( | |||
744 | * If the file's parent directory is known, take its iolock in exclusive | 744 | * If the file's parent directory is known, take its iolock in exclusive |
745 | * mode to prevent two sibling files from racing each other to migrate | 745 | * mode to prevent two sibling files from racing each other to migrate |
746 | * themselves and their parent to different AGs. | 746 | * themselves and their parent to different AGs. |
747 | * | ||
748 | * Note that we lock the parent directory iolock inside the child | ||
749 | * iolock here. That's fine as we never hold both parent and child | ||
750 | * iolock in any other place. This is different from the ilock, | ||
751 | * which requires locking of the child after the parent for namespace | ||
752 | * operations. | ||
747 | */ | 753 | */ |
748 | if (pip) | 754 | if (pip) |
749 | xfs_ilock(pip, XFS_IOLOCK_EXCL); | 755 | xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); |
750 | 756 | ||
751 | /* | 757 | /* |
752 | * A new AG needs to be found for the file. If the file's parent | 758 | * A new AG needs to be found for the file. If the file's parent |
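The XFS_IOLOCK_PARENT flag added above is there for lockdep's benefit:
parent and child iolocks share one lock class, so taking the parent's
while already holding the child's would otherwise be reported as
recursive locking. The usual kernel idiom is a nesting subclass; a
sketch on a plain rwsem (the subclass value is illustrative):

    #include <linux/rwsem.h>

    static void lock_child_then_parent(struct rw_semaphore *child,
    				   struct rw_semaphore *parent)
    {
    	down_write(child);		/* default subclass 0 */
    	down_write_nested(parent, 1);	/* tell lockdep this nesting is intended */
    }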
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 87c2e9d02288..8f6fc1a96386 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h | |||
@@ -293,9 +293,11 @@ typedef struct xfs_bstat { | |||
293 | __s32 bs_extsize; /* extent size */ | 293 | __s32 bs_extsize; /* extent size */ |
294 | __s32 bs_extents; /* number of extents */ | 294 | __s32 bs_extents; /* number of extents */ |
295 | __u32 bs_gen; /* generation count */ | 295 | __u32 bs_gen; /* generation count */ |
296 | __u16 bs_projid; /* project id */ | 296 | __u16 bs_projid_lo; /* lower part of project id */ |
297 | #define bs_projid bs_projid_lo /* (previously just bs_projid) */ | ||
297 | __u16 bs_forkoff; /* inode fork offset in bytes */ | 298 | __u16 bs_forkoff; /* inode fork offset in bytes */ |
298 | unsigned char bs_pad[12]; /* pad space, unused */ | 299 | __u16 bs_projid_hi; /* higher part of project id */ |
300 | unsigned char bs_pad[10]; /* pad space, unused */ | ||
299 | __u32 bs_dmevmask; /* DMIG event mask */ | 301 | __u32 bs_dmevmask; /* DMIG event mask */ |
300 | __u16 bs_dmstate; /* DMIG state info */ | 302 | __u16 bs_dmstate; /* DMIG state info */ |
301 | __u16 bs_aextents; /* attribute number of extents */ | 303 | __u16 bs_aextents; /* attribute number of extents */ |
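The bs_projid split above reclaims two pad bytes to widen project IDs
to 32 bits, while the #define keeps old callers that read bs_projid
compiling (they simply see the low 16 bits). Reassembling the full ID
is a shift and an or; a sketch with illustrative helper names:

    #include <linux/types.h>

    static inline __u32 bstat_get_projid(const struct xfs_bstat *bs)
    {
    	return ((__u32)bs->bs_projid_hi << 16) | bs->bs_projid_lo;
    }

    static inline void bstat_set_projid(struct xfs_bstat *bs, __u32 prid)
    {
    	bs->bs_projid_lo = (__u16)(prid & 0xffff);
    	bs->bs_projid_hi = (__u16)(prid >> 16);
    }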
@@ -448,6 +450,7 @@ typedef struct xfs_handle { | |||
448 | /* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ | 450 | /* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ |
449 | /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ | 451 | /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ |
450 | #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) | 452 | #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) |
453 | #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) | ||
451 | 454 | ||
452 | /* | 455 | /* |
453 | * ioctl commands that replace IRIX syssgi()'s | 456 | * ioctl commands that replace IRIX syssgi()'s |
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 43b1d5699335..9153d2c77caf 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c | |||
@@ -53,6 +53,9 @@ xfs_fs_geometry( | |||
53 | xfs_fsop_geom_t *geo, | 53 | xfs_fsop_geom_t *geo, |
54 | int new_version) | 54 | int new_version) |
55 | { | 55 | { |
56 | |||
57 | memset(geo, 0, sizeof(*geo)); | ||
58 | |||
56 | geo->blocksize = mp->m_sb.sb_blocksize; | 59 | geo->blocksize = mp->m_sb.sb_blocksize; |
57 | geo->rtextsize = mp->m_sb.sb_rextsize; | 60 | geo->rtextsize = mp->m_sb.sb_rextsize; |
58 | geo->agblocks = mp->m_sb.sb_agblocks; | 61 | geo->agblocks = mp->m_sb.sb_agblocks; |
@@ -144,12 +147,11 @@ xfs_growfs_data_private( | |||
144 | if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) | 147 | if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) |
145 | return error; | 148 | return error; |
146 | dpct = pct - mp->m_sb.sb_imax_pct; | 149 | dpct = pct - mp->m_sb.sb_imax_pct; |
147 | error = xfs_read_buf(mp, mp->m_ddev_targp, | 150 | bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, |
148 | XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), | 151 | XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), |
149 | XFS_FSS_TO_BB(mp, 1), 0, &bp); | 152 | BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); |
150 | if (error) | 153 | if (!bp) |
151 | return error; | 154 | return EIO; |
152 | ASSERT(bp); | ||
153 | xfs_buf_relse(bp); | 155 | xfs_buf_relse(bp); |
154 | 156 | ||
155 | new = nb; /* use new as a temporary here */ | 157 | new = nb; /* use new as a temporary here */ |
@@ -375,6 +377,7 @@ xfs_growfs_data_private( | |||
375 | mp->m_maxicount = icount << mp->m_sb.sb_inopblog; | 377 | mp->m_maxicount = icount << mp->m_sb.sb_inopblog; |
376 | } else | 378 | } else |
377 | mp->m_maxicount = 0; | 379 | mp->m_maxicount = 0; |
380 | xfs_set_low_space_thresholds(mp); | ||
378 | 381 | ||
379 | /* update secondary superblocks. */ | 382 | /* update secondary superblocks. */ |
380 | for (agno = 1; agno < nagcount; agno++) { | 383 | for (agno = 1; agno < nagcount; agno++) { |
@@ -382,8 +385,8 @@ xfs_growfs_data_private( | |||
382 | XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), | 385 | XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), |
383 | XFS_FSS_TO_BB(mp, 1), 0, &bp); | 386 | XFS_FSS_TO_BB(mp, 1), 0, &bp); |
384 | if (error) { | 387 | if (error) { |
385 | xfs_fs_cmn_err(CE_WARN, mp, | 388 | xfs_warn(mp, |
386 | "error %d reading secondary superblock for ag %d", | 389 | "error %d reading secondary superblock for ag %d", |
387 | error, agno); | 390 | error, agno); |
388 | break; | 391 | break; |
389 | } | 392 | } |
@@ -396,7 +399,7 @@ xfs_growfs_data_private( | |||
396 | if (!(error = xfs_bwrite(mp, bp))) { | 399 | if (!(error = xfs_bwrite(mp, bp))) { |
397 | continue; | 400 | continue; |
398 | } else { | 401 | } else { |
399 | xfs_fs_cmn_err(CE_WARN, mp, | 402 | xfs_warn(mp, |
400 | "write error %d updating secondary superblock for ag %d", | 403 | "write error %d updating secondary superblock for ag %d", |
401 | error, agno); | 404 | error, agno); |
402 | break; /* no point in continuing */ | 405 | break; /* no point in continuing */ |
@@ -597,7 +600,8 @@ out: | |||
597 | * the extra reserve blocks from the reserve..... | 600 | * the extra reserve blocks from the reserve..... |
598 | */ | 601 | */ |
599 | int error; | 602 | int error; |
600 | error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); | 603 | error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, |
604 | fdblks_delta, 0); | ||
601 | if (error == ENOSPC) | 605 | if (error == ENOSPC) |
602 | goto retry; | 606 | goto retry; |
603 | } | 607 | } |
@@ -611,12 +615,13 @@ out: | |||
611 | * | 615 | * |
612 | * We cannot use an inode here for this - that will push dirty state back up | 616 | * We cannot use an inode here for this - that will push dirty state back up |
613 | * into the VFS and then periodic inode flushing will prevent log covering from | 617 | * into the VFS and then periodic inode flushing will prevent log covering from |
614 | * making progress. Hence we log a field in the superblock instead. | 618 | * making progress. Hence we log a field in the superblock instead and use a |
619 | * synchronous transaction to ensure the superblock is immediately unpinned | ||
620 | * and can be written back. | ||
615 | */ | 621 | */ |
616 | int | 622 | int |
617 | xfs_fs_log_dummy( | 623 | xfs_fs_log_dummy( |
618 | xfs_mount_t *mp, | 624 | xfs_mount_t *mp) |
619 | int flags) | ||
620 | { | 625 | { |
621 | xfs_trans_t *tp; | 626 | xfs_trans_t *tp; |
622 | int error; | 627 | int error; |
@@ -631,8 +636,7 @@ xfs_fs_log_dummy( | |||
631 | 636 | ||
632 | /* log the UUID because it is an unchanging field */ | 637 | /* log the UUID because it is an unchanging field */ |
633 | xfs_mod_sb(tp, XFS_SB_UUID); | 638 | xfs_mod_sb(tp, XFS_SB_UUID); |
634 | if (flags & SYNC_WAIT) | 639 | xfs_trans_set_sync(tp); |
635 | xfs_trans_set_sync(tp); | ||
636 | return xfs_trans_commit(tp, 0); | 640 | return xfs_trans_commit(tp, 0); |
637 | } | 641 | } |
638 | 642 | ||
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index a786c5212c1e..1b6a98b66886 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h | |||
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); | |||
25 | extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, | 25 | extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, |
26 | xfs_fsop_resblks_t *outval); | 26 | xfs_fsop_resblks_t *outval); |
27 | extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); | 27 | extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); |
28 | extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); | 28 | extern int xfs_fs_log_dummy(struct xfs_mount *mp); |
29 | 29 | ||
30 | #endif /* __XFS_FSOPS_H__ */ | 30 | #endif /* __XFS_FSOPS_H__ */ |
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5371d2dc360e..84ebeec16642 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c | |||
@@ -212,7 +212,7 @@ xfs_ialloc_inode_init( | |||
212 | * to log a whole cluster of inodes instead of all the | 212 | * to log a whole cluster of inodes instead of all the |
213 | * individual transactions causing a lot of log traffic. | 213 | * individual transactions causing a lot of log traffic. |
214 | */ | 214 | */ |
215 | xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); | 215 | xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); |
216 | for (i = 0; i < ninodes; i++) { | 216 | for (i = 0; i < ninodes; i++) { |
217 | int ioffset = i << mp->m_sb.sb_inodelog; | 217 | int ioffset = i << mp->m_sb.sb_inodelog; |
218 | uint isize = sizeof(struct xfs_dinode); | 218 | uint isize = sizeof(struct xfs_dinode); |
@@ -1055,28 +1055,23 @@ xfs_difree( | |||
1055 | */ | 1055 | */ |
1056 | agno = XFS_INO_TO_AGNO(mp, inode); | 1056 | agno = XFS_INO_TO_AGNO(mp, inode); |
1057 | if (agno >= mp->m_sb.sb_agcount) { | 1057 | if (agno >= mp->m_sb.sb_agcount) { |
1058 | cmn_err(CE_WARN, | 1058 | xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", |
1059 | "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", | 1059 | __func__, agno, mp->m_sb.sb_agcount); |
1060 | agno, mp->m_sb.sb_agcount, mp->m_fsname); | ||
1061 | ASSERT(0); | 1060 | ASSERT(0); |
1062 | return XFS_ERROR(EINVAL); | 1061 | return XFS_ERROR(EINVAL); |
1063 | } | 1062 | } |
1064 | agino = XFS_INO_TO_AGINO(mp, inode); | 1063 | agino = XFS_INO_TO_AGINO(mp, inode); |
1065 | if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { | 1064 | if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { |
1066 | cmn_err(CE_WARN, | 1065 | xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", |
1067 | "xfs_difree: inode != XFS_AGINO_TO_INO() " | 1066 | __func__, (unsigned long long)inode, |
1068 | "(%llu != %llu) on %s. Returning EINVAL.", | 1067 | (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); |
1069 | (unsigned long long)inode, | ||
1070 | (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino), | ||
1071 | mp->m_fsname); | ||
1072 | ASSERT(0); | 1068 | ASSERT(0); |
1073 | return XFS_ERROR(EINVAL); | 1069 | return XFS_ERROR(EINVAL); |
1074 | } | 1070 | } |
1075 | agbno = XFS_AGINO_TO_AGBNO(mp, agino); | 1071 | agbno = XFS_AGINO_TO_AGBNO(mp, agino); |
1076 | if (agbno >= mp->m_sb.sb_agblocks) { | 1072 | if (agbno >= mp->m_sb.sb_agblocks) { |
1077 | cmn_err(CE_WARN, | 1073 | xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", |
1078 | "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", | 1074 | __func__, agbno, mp->m_sb.sb_agblocks); |
1079 | agbno, mp->m_sb.sb_agblocks, mp->m_fsname); | ||
1080 | ASSERT(0); | 1075 | ASSERT(0); |
1081 | return XFS_ERROR(EINVAL); | 1076 | return XFS_ERROR(EINVAL); |
1082 | } | 1077 | } |
@@ -1085,9 +1080,8 @@ xfs_difree( | |||
1085 | */ | 1080 | */ |
1086 | error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); | 1081 | error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); |
1087 | if (error) { | 1082 | if (error) { |
1088 | cmn_err(CE_WARN, | 1083 | xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", |
1089 | "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", | 1084 | __func__, error); |
1090 | error, mp->m_fsname); | ||
1091 | return error; | 1085 | return error; |
1092 | } | 1086 | } |
1093 | agi = XFS_BUF_TO_AGI(agbp); | 1087 | agi = XFS_BUF_TO_AGI(agbp); |
@@ -1106,17 +1100,15 @@ xfs_difree( | |||
1106 | * Look for the entry describing this inode. | 1100 | * Look for the entry describing this inode. |
1107 | */ | 1101 | */ |
1108 | if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { | 1102 | if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { |
1109 | cmn_err(CE_WARN, | 1103 | xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.", |
1110 | "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", | 1104 | __func__, error); |
1111 | error, mp->m_fsname); | ||
1112 | goto error0; | 1105 | goto error0; |
1113 | } | 1106 | } |
1114 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 1107 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
1115 | error = xfs_inobt_get_rec(cur, &rec, &i); | 1108 | error = xfs_inobt_get_rec(cur, &rec, &i); |
1116 | if (error) { | 1109 | if (error) { |
1117 | cmn_err(CE_WARN, | 1110 | xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", |
1118 | "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", | 1111 | __func__, error); |
1119 | error, mp->m_fsname); | ||
1120 | goto error0; | 1112 | goto error0; |
1121 | } | 1113 | } |
1122 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 1114 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
@@ -1157,8 +1149,8 @@ xfs_difree( | |||
1157 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); | 1149 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); |
1158 | 1150 | ||
1159 | if ((error = xfs_btree_delete(cur, &i))) { | 1151 | if ((error = xfs_btree_delete(cur, &i))) { |
1160 | cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", | 1152 | xfs_warn(mp, "%s: xfs_btree_delete returned error %d.", |
1161 | error, mp->m_fsname); | 1153 | __func__, error); |
1162 | goto error0; | 1154 | goto error0; |
1163 | } | 1155 | } |
1164 | 1156 | ||
@@ -1170,9 +1162,8 @@ xfs_difree( | |||
1170 | 1162 | ||
1171 | error = xfs_inobt_update(cur, &rec); | 1163 | error = xfs_inobt_update(cur, &rec); |
1172 | if (error) { | 1164 | if (error) { |
1173 | cmn_err(CE_WARN, | 1165 | xfs_warn(mp, "%s: xfs_inobt_update returned error %d.", |
1174 | "xfs_difree: xfs_inobt_update returned an error %d on %s.", | 1166 | __func__, error); |
1175 | error, mp->m_fsname); | ||
1176 | goto error0; | 1167 | goto error0; |
1177 | } | 1168 | } |
1178 | 1169 | ||
@@ -1218,10 +1209,9 @@ xfs_imap_lookup( | |||
1218 | 1209 | ||
1219 | error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); | 1210 | error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); |
1220 | if (error) { | 1211 | if (error) { |
1221 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " | 1212 | xfs_alert(mp, |
1222 | "xfs_ialloc_read_agi() returned " | 1213 | "%s: xfs_ialloc_read_agi() returned error %d, agno %d", |
1223 | "error %d, agno %d", | 1214 | __func__, error, agno); |
1224 | error, agno); | ||
1225 | return error; | 1215 | return error; |
1226 | } | 1216 | } |
1227 | 1217 | ||
@@ -1299,24 +1289,21 @@ xfs_imap( | |||
1299 | if (flags & XFS_IGET_UNTRUSTED) | 1289 | if (flags & XFS_IGET_UNTRUSTED) |
1300 | return XFS_ERROR(EINVAL); | 1290 | return XFS_ERROR(EINVAL); |
1301 | if (agno >= mp->m_sb.sb_agcount) { | 1291 | if (agno >= mp->m_sb.sb_agcount) { |
1302 | xfs_fs_cmn_err(CE_ALERT, mp, | 1292 | xfs_alert(mp, |
1303 | "xfs_imap: agno (%d) >= " | 1293 | "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", |
1304 | "mp->m_sb.sb_agcount (%d)", | 1294 | __func__, agno, mp->m_sb.sb_agcount); |
1305 | agno, mp->m_sb.sb_agcount); | ||
1306 | } | 1295 | } |
1307 | if (agbno >= mp->m_sb.sb_agblocks) { | 1296 | if (agbno >= mp->m_sb.sb_agblocks) { |
1308 | xfs_fs_cmn_err(CE_ALERT, mp, | 1297 | xfs_alert(mp, |
1309 | "xfs_imap: agbno (0x%llx) >= " | 1298 | "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", |
1310 | "mp->m_sb.sb_agblocks (0x%lx)", | 1299 | __func__, (unsigned long long)agbno, |
1311 | (unsigned long long) agbno, | 1300 | (unsigned long)mp->m_sb.sb_agblocks); |
1312 | (unsigned long) mp->m_sb.sb_agblocks); | ||
1313 | } | 1301 | } |
1314 | if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { | 1302 | if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { |
1315 | xfs_fs_cmn_err(CE_ALERT, mp, | 1303 | xfs_alert(mp, |
1316 | "xfs_imap: ino (0x%llx) != " | 1304 | "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", |
1317 | "XFS_AGINO_TO_INO(mp, agno, agino) " | 1305 | __func__, ino, |
1318 | "(0x%llx)", | 1306 | XFS_AGINO_TO_INO(mp, agno, agino)); |
1319 | ino, XFS_AGINO_TO_INO(mp, agno, agino)); | ||
1320 | } | 1307 | } |
1321 | xfs_stack_trace(); | 1308 | xfs_stack_trace(); |
1322 | #endif /* DEBUG */ | 1309 | #endif /* DEBUG */ |
@@ -1388,10 +1375,9 @@ out_map: | |||
1388 | */ | 1375 | */ |
1389 | if ((imap->im_blkno + imap->im_len) > | 1376 | if ((imap->im_blkno + imap->im_len) > |
1390 | XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { | 1377 | XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { |
1391 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " | 1378 | xfs_alert(mp, |
1392 | "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " | 1379 | "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)", |
1393 | " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", | 1380 | __func__, (unsigned long long) imap->im_blkno, |
1394 | (unsigned long long) imap->im_blkno, | ||
1395 | (unsigned long long) imap->im_len, | 1381 | (unsigned long long) imap->im_len, |
1396 | XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); | 1382 | XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); |
1397 | return XFS_ERROR(EINVAL); | 1383 | return XFS_ERROR(EINVAL); |
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index d352862cefa0..16921f55c542 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c | |||
@@ -183,38 +183,6 @@ xfs_inobt_key_diff( | |||
183 | cur->bc_rec.i.ir_startino; | 183 | cur->bc_rec.i.ir_startino; |
184 | } | 184 | } |
185 | 185 | ||
186 | STATIC int | ||
187 | xfs_inobt_kill_root( | ||
188 | struct xfs_btree_cur *cur, | ||
189 | struct xfs_buf *bp, | ||
190 | int level, | ||
191 | union xfs_btree_ptr *newroot) | ||
192 | { | ||
193 | int error; | ||
194 | |||
195 | XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); | ||
196 | XFS_BTREE_STATS_INC(cur, killroot); | ||
197 | |||
198 | /* | ||
199 | * Update the root pointer, decreasing the level by 1 and then | ||
200 | * free the old root. | ||
201 | */ | ||
202 | xfs_inobt_set_root(cur, newroot, -1); | ||
203 | error = xfs_inobt_free_block(cur, bp); | ||
204 | if (error) { | ||
205 | XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); | ||
206 | return error; | ||
207 | } | ||
208 | |||
209 | XFS_BTREE_STATS_INC(cur, free); | ||
210 | |||
211 | cur->bc_bufs[level] = NULL; | ||
212 | cur->bc_nlevels--; | ||
213 | |||
214 | XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | #ifdef DEBUG | 186 | #ifdef DEBUG |
219 | STATIC int | 187 | STATIC int |
220 | xfs_inobt_keys_inorder( | 188 | xfs_inobt_keys_inorder( |
@@ -309,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = { | |||
309 | 277 | ||
310 | .dup_cursor = xfs_inobt_dup_cursor, | 278 | .dup_cursor = xfs_inobt_dup_cursor, |
311 | .set_root = xfs_inobt_set_root, | 279 | .set_root = xfs_inobt_set_root, |
312 | .kill_root = xfs_inobt_kill_root, | ||
313 | .alloc_block = xfs_inobt_alloc_block, | 280 | .alloc_block = xfs_inobt_alloc_block, |
314 | .free_block = xfs_inobt_free_block, | 281 | .free_block = xfs_inobt_free_block, |
315 | .get_minrecs = xfs_inobt_get_minrecs, | 282 | .get_minrecs = xfs_inobt_get_minrecs, |
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index b1ecc6f97ade..3631783b2b53 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c | |||
@@ -43,6 +43,17 @@ | |||
43 | 43 | ||
44 | 44 | ||
45 | /* | 45 | /* |
46 | * Define xfs inode iolock lockdep classes. We need to ensure that all active | ||
47 | * inodes are considered the same for lockdep purposes, including inodes that | ||
48 | are recycled through the XFS_IRECLAIMABLE state. This is the only way to | ||
49 | * guarantee the locks are considered the same when there are multiple lock | ||
50 | initialisation sites. Also, define a reclaimable inode class so it is | ||
51 | * obvious in lockdep reports which class the report is against. | ||
52 | */ | ||
53 | static struct lock_class_key xfs_iolock_active; | ||
54 | struct lock_class_key xfs_iolock_reclaimable; | ||
55 | |||
56 | /* | ||
46 | * Allocate and initialise an xfs_inode. | 57 | * Allocate and initialise an xfs_inode. |
47 | */ | 58 | */ |
48 | STATIC struct xfs_inode * | 59 | STATIC struct xfs_inode * |
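The static lock_class_key above addresses a lockdep quirk: init_rwsem()
and friends key the lock class off the initialisation call site, so
re-initialising a recycled inode's iolock from a second site would
silently split one logical lock into two classes. Pinning every
initialisation to a shared key looks roughly like this (the helper is
illustrative; the key name mirrors the diff):

    #include <linux/lockdep.h>
    #include <linux/rwsem.h>

    static struct lock_class_key iolock_active_key;

    static void iolock_init(struct rw_semaphore *sem)
    {
    	init_rwsem(sem);	/* would otherwise key off this call site */
    	lockdep_set_class_and_name(sem, &iolock_active_key,
    				   "xfs_iolock_active");
    }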
@@ -69,8 +80,11 @@ xfs_inode_alloc( | |||
69 | ASSERT(atomic_read(&ip->i_pincount) == 0); | 80 | ASSERT(atomic_read(&ip->i_pincount) == 0); |
70 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | 81 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); |
71 | ASSERT(completion_done(&ip->i_flush)); | 82 | ASSERT(completion_done(&ip->i_flush)); |
83 | ASSERT(ip->i_ino == 0); | ||
72 | 84 | ||
73 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); | 85 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); |
86 | lockdep_set_class_and_name(&ip->i_iolock.mr_lock, | ||
87 | &xfs_iolock_active, "xfs_iolock_active"); | ||
74 | 88 | ||
75 | /* initialise the xfs inode */ | 89 | /* initialise the xfs inode */ |
76 | ip->i_ino = ino; | 90 | ip->i_ino = ino; |
@@ -85,12 +99,20 @@ xfs_inode_alloc( | |||
85 | ip->i_size = 0; | 99 | ip->i_size = 0; |
86 | ip->i_new_size = 0; | 100 | ip->i_new_size = 0; |
87 | 101 | ||
88 | /* prevent anyone from using this yet */ | ||
89 | VFS_I(ip)->i_state = I_NEW; | ||
90 | |||
91 | return ip; | 102 | return ip; |
92 | } | 103 | } |
93 | 104 | ||
105 | STATIC void | ||
106 | xfs_inode_free_callback( | ||
107 | struct rcu_head *head) | ||
108 | { | ||
109 | struct inode *inode = container_of(head, struct inode, i_rcu); | ||
110 | struct xfs_inode *ip = XFS_I(inode); | ||
111 | |||
112 | INIT_LIST_HEAD(&inode->i_dentry); | ||
113 | kmem_zone_free(xfs_inode_zone, ip); | ||
114 | } | ||
115 | |||
94 | void | 116 | void |
95 | xfs_inode_free( | 117 | xfs_inode_free( |
96 | struct xfs_inode *ip) | 118 | struct xfs_inode *ip) |
@@ -134,7 +156,18 @@ xfs_inode_free( | |||
134 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | 156 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); |
135 | ASSERT(completion_done(&ip->i_flush)); | 157 | ASSERT(completion_done(&ip->i_flush)); |
136 | 158 | ||
137 | kmem_zone_free(xfs_inode_zone, ip); | 159 | /* |
160 | * Because we use RCU freeing we need to ensure the inode always | ||
161 | * appears to be reclaimed with an invalid inode number when in the | ||
162 | * free state. The ip->i_flags_lock provides the barrier against lookup | ||
163 | * races. | ||
164 | */ | ||
165 | spin_lock(&ip->i_flags_lock); | ||
166 | ip->i_flags = XFS_IRECLAIM; | ||
167 | ip->i_ino = 0; | ||
168 | spin_unlock(&ip->i_flags_lock); | ||
169 | |||
170 | call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); | ||
138 | } | 171 | } |
139 | 172 | ||
140 | /* | 173 | /* |
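This freeing path pairs with the RCU-side lookups later in the diff:
the freer invalidates the inode (i_ino = 0, XFS_IRECLAIM) under
i_flags_lock before handing it to call_rcu(), and every lookup made
under rcu_read_lock() must revalidate identity under that same
spinlock, because the radix tree can still return a dying or recycled
object during the grace period. A self-contained sketch of both halves,
with illustrative types:

    #include <linux/atomic.h>
    #include <linux/radix-tree.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct obj {
    	spinlock_t	lock;
    	unsigned long	id;		/* 0 == dead */
    	atomic_t	refcount;
    	struct rcu_head	rcu;
    };

    static void obj_free_cb(struct rcu_head *head)
    {
    	kfree(container_of(head, struct obj, rcu));
    }

    static void obj_free(struct obj *o)
    {
    	spin_lock(&o->lock);
    	o->id = 0;			/* concurrent lookups now fail */
    	spin_unlock(&o->lock);
    	call_rcu(&o->rcu, obj_free_cb);	/* actual free is deferred */
    }

    static struct obj *obj_lookup(struct radix_tree_root *tree,
    			      unsigned long id)
    {
    	struct obj *found, *o = NULL;

    	rcu_read_lock();
    	found = radix_tree_lookup(tree, id);
    	if (found) {
    		spin_lock(&found->lock);
    		if (found->id == id) {	/* still the object we wanted */
    			atomic_inc(&found->refcount);
    			o = found;
    		}			/* else: freed or reused; caller retries */
    		spin_unlock(&found->lock);
    	}
    	rcu_read_unlock();
    	return o;
    }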
@@ -144,14 +177,29 @@ static int | |||
144 | xfs_iget_cache_hit( | 177 | xfs_iget_cache_hit( |
145 | struct xfs_perag *pag, | 178 | struct xfs_perag *pag, |
146 | struct xfs_inode *ip, | 179 | struct xfs_inode *ip, |
180 | xfs_ino_t ino, | ||
147 | int flags, | 181 | int flags, |
148 | int lock_flags) __releases(pag->pag_ici_lock) | 182 | int lock_flags) __releases(RCU) |
149 | { | 183 | { |
150 | struct inode *inode = VFS_I(ip); | 184 | struct inode *inode = VFS_I(ip); |
151 | struct xfs_mount *mp = ip->i_mount; | 185 | struct xfs_mount *mp = ip->i_mount; |
152 | int error; | 186 | int error; |
153 | 187 | ||
188 | /* | ||
189 | * check for re-use of an inode within an RCU grace period due to the | ||
190 | * radix tree nodes not being updated yet. We monitor for this by | ||
191 | * setting the inode number to zero before freeing the inode structure. | ||
192 | * If the inode has been reallocated and set up, then the inode number | ||
193 | * will not match, so check for that, too. | ||
194 | */ | ||
154 | spin_lock(&ip->i_flags_lock); | 195 | spin_lock(&ip->i_flags_lock); |
196 | if (ip->i_ino != ino) { | ||
197 | trace_xfs_iget_skip(ip); | ||
198 | XFS_STATS_INC(xs_ig_frecycle); | ||
199 | error = EAGAIN; | ||
200 | goto out_error; | ||
201 | } | ||
202 | |||
155 | 203 | ||
156 | /* | 204 | /* |
157 | * If we are racing with another cache hit that is currently | 205 | * If we are racing with another cache hit that is currently |
@@ -194,7 +242,7 @@ xfs_iget_cache_hit( | |||
194 | ip->i_flags |= XFS_IRECLAIM; | 242 | ip->i_flags |= XFS_IRECLAIM; |
195 | 243 | ||
196 | spin_unlock(&ip->i_flags_lock); | 244 | spin_unlock(&ip->i_flags_lock); |
197 | read_unlock(&pag->pag_ici_lock); | 245 | rcu_read_unlock(); |
198 | 246 | ||
199 | error = -inode_init_always(mp->m_super, inode); | 247 | error = -inode_init_always(mp->m_super, inode); |
200 | if (error) { | 248 | if (error) { |
@@ -202,24 +250,35 @@ xfs_iget_cache_hit( | |||
202 | * Re-initializing the inode failed, and we are in deep | 250 | * Re-initializing the inode failed, and we are in deep |
203 | * trouble. Try to re-add it to the reclaim list. | 251 | * trouble. Try to re-add it to the reclaim list. |
204 | */ | 252 | */ |
205 | read_lock(&pag->pag_ici_lock); | 253 | rcu_read_lock(); |
206 | spin_lock(&ip->i_flags_lock); | 254 | spin_lock(&ip->i_flags_lock); |
207 | 255 | ||
208 | ip->i_flags &= ~XFS_INEW; | 256 | ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); |
209 | ip->i_flags |= XFS_IRECLAIMABLE; | 257 | ASSERT(ip->i_flags & XFS_IRECLAIMABLE); |
210 | __xfs_inode_set_reclaim_tag(pag, ip); | ||
211 | trace_xfs_iget_reclaim_fail(ip); | 258 | trace_xfs_iget_reclaim_fail(ip); |
212 | goto out_error; | 259 | goto out_error; |
213 | } | 260 | } |
214 | 261 | ||
215 | write_lock(&pag->pag_ici_lock); | 262 | spin_lock(&pag->pag_ici_lock); |
216 | spin_lock(&ip->i_flags_lock); | 263 | spin_lock(&ip->i_flags_lock); |
217 | ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); | 264 | |
265 | /* | ||
266 | * Clear the per-lifetime state in the inode as we are now | ||
267 | * effectively a new inode and need to return to the initial | ||
268 | * state before reuse occurs. | ||
269 | */ | ||
270 | ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; | ||
218 | ip->i_flags |= XFS_INEW; | 271 | ip->i_flags |= XFS_INEW; |
219 | __xfs_inode_clear_reclaim_tag(mp, pag, ip); | 272 | __xfs_inode_clear_reclaim_tag(mp, pag, ip); |
220 | inode->i_state = I_NEW; | 273 | inode->i_state = I_NEW; |
274 | |||
275 | ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); | ||
276 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); | ||
277 | lockdep_set_class_and_name(&ip->i_iolock.mr_lock, | ||
278 | &xfs_iolock_active, "xfs_iolock_active"); | ||
279 | |||
221 | spin_unlock(&ip->i_flags_lock); | 280 | spin_unlock(&ip->i_flags_lock); |
222 | write_unlock(&pag->pag_ici_lock); | 281 | spin_unlock(&pag->pag_ici_lock); |
223 | } else { | 282 | } else { |
224 | /* If the VFS inode is being torn down, pause and try again. */ | 283 | /* If the VFS inode is being torn down, pause and try again. */ |
225 | if (!igrab(inode)) { | 284 | if (!igrab(inode)) { |
@@ -230,7 +289,7 @@ xfs_iget_cache_hit( | |||
230 | 289 | ||
231 | /* We've got a live one. */ | 290 | /* We've got a live one. */ |
232 | spin_unlock(&ip->i_flags_lock); | 291 | spin_unlock(&ip->i_flags_lock); |
233 | read_unlock(&pag->pag_ici_lock); | 292 | rcu_read_unlock(); |
234 | trace_xfs_iget_hit(ip); | 293 | trace_xfs_iget_hit(ip); |
235 | } | 294 | } |
236 | 295 | ||
@@ -244,7 +303,7 @@ xfs_iget_cache_hit( | |||
244 | 303 | ||
245 | out_error: | 304 | out_error: |
246 | spin_unlock(&ip->i_flags_lock); | 305 | spin_unlock(&ip->i_flags_lock); |
247 | read_unlock(&pag->pag_ici_lock); | 306 | rcu_read_unlock(); |
248 | return error; | 307 | return error; |
249 | } | 308 | } |
250 | 309 | ||
@@ -297,7 +356,7 @@ xfs_iget_cache_miss( | |||
297 | BUG(); | 356 | BUG(); |
298 | } | 357 | } |
299 | 358 | ||
300 | write_lock(&pag->pag_ici_lock); | 359 | spin_lock(&pag->pag_ici_lock); |
301 | 360 | ||
302 | /* insert the new inode */ | 361 | /* insert the new inode */ |
303 | error = radix_tree_insert(&pag->pag_ici_root, agino, ip); | 362 | error = radix_tree_insert(&pag->pag_ici_root, agino, ip); |
@@ -312,14 +371,14 @@ xfs_iget_cache_miss( | |||
312 | ip->i_udquot = ip->i_gdquot = NULL; | 371 | ip->i_udquot = ip->i_gdquot = NULL; |
313 | xfs_iflags_set(ip, XFS_INEW); | 372 | xfs_iflags_set(ip, XFS_INEW); |
314 | 373 | ||
315 | write_unlock(&pag->pag_ici_lock); | 374 | spin_unlock(&pag->pag_ici_lock); |
316 | radix_tree_preload_end(); | 375 | radix_tree_preload_end(); |
317 | 376 | ||
318 | *ipp = ip; | 377 | *ipp = ip; |
319 | return 0; | 378 | return 0; |
320 | 379 | ||
321 | out_preload_end: | 380 | out_preload_end: |
322 | write_unlock(&pag->pag_ici_lock); | 381 | spin_unlock(&pag->pag_ici_lock); |
323 | radix_tree_preload_end(); | 382 | radix_tree_preload_end(); |
324 | if (lock_flags) | 383 | if (lock_flags) |
325 | xfs_iunlock(ip, lock_flags); | 384 | xfs_iunlock(ip, lock_flags); |
@@ -365,8 +424,8 @@ xfs_iget( | |||
365 | xfs_perag_t *pag; | 424 | xfs_perag_t *pag; |
366 | xfs_agino_t agino; | 425 | xfs_agino_t agino; |
367 | 426 | ||
368 | /* the radix tree exists only in inode capable AGs */ | 427 | /* reject inode numbers outside existing AGs */ |
369 | if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) | 428 | if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) |
370 | return EINVAL; | 429 | return EINVAL; |
371 | 430 | ||
372 | /* get the perag structure and ensure that it's inode capable */ | 431 | /* get the perag structure and ensure that it's inode capable */ |
@@ -375,15 +434,15 @@ xfs_iget( | |||
375 | 434 | ||
376 | again: | 435 | again: |
377 | error = 0; | 436 | error = 0; |
378 | read_lock(&pag->pag_ici_lock); | 437 | rcu_read_lock(); |
379 | ip = radix_tree_lookup(&pag->pag_ici_root, agino); | 438 | ip = radix_tree_lookup(&pag->pag_ici_root, agino); |
380 | 439 | ||
381 | if (ip) { | 440 | if (ip) { |
382 | error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); | 441 | error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); |
383 | if (error) | 442 | if (error) |
384 | goto out_error_or_again; | 443 | goto out_error_or_again; |
385 | } else { | 444 | } else { |
386 | read_unlock(&pag->pag_ici_lock); | 445 | rcu_read_unlock(); |
387 | XFS_STATS_INC(xs_ig_missed); | 446 | XFS_STATS_INC(xs_ig_missed); |
388 | 447 | ||
389 | error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, | 448 | error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, |
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 34798f391c49..a098a20ca63e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -110,8 +110,8 @@ xfs_inobp_check( | |||
110 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, | 110 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, |
111 | i * mp->m_sb.sb_inodesize); | 111 | i * mp->m_sb.sb_inodesize); |
112 | if (!dip->di_next_unlinked) { | 112 | if (!dip->di_next_unlinked) { |
113 | xfs_fs_cmn_err(CE_ALERT, mp, | 113 | xfs_alert(mp, |
114 | "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", | 114 | "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.", |
115 | bp); | 115 | bp); |
116 | ASSERT(dip->di_next_unlinked); | 116 | ASSERT(dip->di_next_unlinked); |
117 | } | 117 | } |
@@ -142,10 +142,9 @@ xfs_imap_to_bp( | |||
142 | (int)imap->im_len, buf_flags, &bp); | 142 | (int)imap->im_len, buf_flags, &bp); |
143 | if (error) { | 143 | if (error) { |
144 | if (error != EAGAIN) { | 144 | if (error != EAGAIN) { |
145 | cmn_err(CE_WARN, | 145 | xfs_warn(mp, |
146 | "xfs_imap_to_bp: xfs_trans_read_buf()returned " | 146 | "%s: xfs_trans_read_buf() returned error %d.", |
147 | "an error %d on %s. Returning error.", | 147 | __func__, error); |
148 | error, mp->m_fsname); | ||
149 | } else { | 148 | } else { |
150 | ASSERT(buf_flags & XBF_TRYLOCK); | 149 | ASSERT(buf_flags & XBF_TRYLOCK); |
151 | } | 150 | } |
@@ -180,12 +179,11 @@ xfs_imap_to_bp( | |||
180 | XFS_CORRUPTION_ERROR("xfs_imap_to_bp", | 179 | XFS_CORRUPTION_ERROR("xfs_imap_to_bp", |
181 | XFS_ERRLEVEL_HIGH, mp, dip); | 180 | XFS_ERRLEVEL_HIGH, mp, dip); |
182 | #ifdef DEBUG | 181 | #ifdef DEBUG |
183 | cmn_err(CE_PANIC, | 182 | xfs_emerg(mp, |
184 | "Device %s - bad inode magic/vsn " | 183 | "bad inode magic/vsn daddr %lld #%d (magic=%x)", |
185 | "daddr %lld #%d (magic=%x)", | ||
186 | XFS_BUFTARG_NAME(mp->m_ddev_targp), | ||
187 | (unsigned long long)imap->im_blkno, i, | 184 | (unsigned long long)imap->im_blkno, i, |
188 | be16_to_cpu(dip->di_magic)); | 185 | be16_to_cpu(dip->di_magic)); |
186 | ASSERT(0); | ||
189 | #endif | 187 | #endif |
190 | xfs_trans_brelse(tp, bp); | 188 | xfs_trans_brelse(tp, bp); |
191 | return XFS_ERROR(EFSCORRUPTED); | 189 | return XFS_ERROR(EFSCORRUPTED); |
@@ -317,7 +315,7 @@ xfs_iformat( | |||
317 | if (unlikely(be32_to_cpu(dip->di_nextents) + | 315 | if (unlikely(be32_to_cpu(dip->di_nextents) + |
318 | be16_to_cpu(dip->di_anextents) > | 316 | be16_to_cpu(dip->di_anextents) > |
319 | be64_to_cpu(dip->di_nblocks))) { | 317 | be64_to_cpu(dip->di_nblocks))) { |
320 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 318 | xfs_warn(ip->i_mount, |
321 | "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", | 319 | "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", |
322 | (unsigned long long)ip->i_ino, | 320 | (unsigned long long)ip->i_ino, |
323 | (int)(be32_to_cpu(dip->di_nextents) + | 321 | (int)(be32_to_cpu(dip->di_nextents) + |
@@ -330,8 +328,7 @@ xfs_iformat( | |||
330 | } | 328 | } |
331 | 329 | ||
332 | if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { | 330 | if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { |
333 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 331 | xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", |
334 | "corrupt dinode %Lu, forkoff = 0x%x.", | ||
335 | (unsigned long long)ip->i_ino, | 332 | (unsigned long long)ip->i_ino, |
336 | dip->di_forkoff); | 333 | dip->di_forkoff); |
337 | XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, | 334 | XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, |
@@ -341,7 +338,7 @@ xfs_iformat( | |||
341 | 338 | ||
342 | if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && | 339 | if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && |
343 | !ip->i_mount->m_rtdev_targp)) { | 340 | !ip->i_mount->m_rtdev_targp)) { |
344 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 341 | xfs_warn(ip->i_mount, |
345 | "corrupt dinode %Lu, has realtime flag set.", | 342 | "corrupt dinode %Lu, has realtime flag set.", |
346 | ip->i_ino); | 343 | ip->i_ino); |
347 | XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", | 344 | XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", |
@@ -373,9 +370,8 @@ xfs_iformat( | |||
373 | * no local regular files yet | 370 | * no local regular files yet |
374 | */ | 371 | */ |
375 | if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { | 372 | if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { |
376 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 373 | xfs_warn(ip->i_mount, |
377 | "corrupt inode %Lu " | 374 | "corrupt inode %Lu (local format for regular file).", |
378 | "(local format for regular file).", | ||
379 | (unsigned long long) ip->i_ino); | 375 | (unsigned long long) ip->i_ino); |
380 | XFS_CORRUPTION_ERROR("xfs_iformat(4)", | 376 | XFS_CORRUPTION_ERROR("xfs_iformat(4)", |
381 | XFS_ERRLEVEL_LOW, | 377 | XFS_ERRLEVEL_LOW, |
@@ -385,9 +381,8 @@ xfs_iformat( | |||
385 | 381 | ||
386 | di_size = be64_to_cpu(dip->di_size); | 382 | di_size = be64_to_cpu(dip->di_size); |
387 | if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { | 383 | if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { |
388 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 384 | xfs_warn(ip->i_mount, |
389 | "corrupt inode %Lu " | 385 | "corrupt inode %Lu (bad size %Ld for local inode).", |
390 | "(bad size %Ld for local inode).", | ||
391 | (unsigned long long) ip->i_ino, | 386 | (unsigned long long) ip->i_ino, |
392 | (long long) di_size); | 387 | (long long) di_size); |
393 | XFS_CORRUPTION_ERROR("xfs_iformat(5)", | 388 | XFS_CORRUPTION_ERROR("xfs_iformat(5)", |
@@ -431,9 +426,8 @@ xfs_iformat( | |||
431 | size = be16_to_cpu(atp->hdr.totsize); | 426 | size = be16_to_cpu(atp->hdr.totsize); |
432 | 427 | ||
433 | if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { | 428 | if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { |
434 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 429 | xfs_warn(ip->i_mount, |
435 | "corrupt inode %Lu " | 430 | "corrupt inode %Lu (bad attr fork size %Ld).", |
436 | "(bad attr fork size %Ld).", | ||
437 | (unsigned long long) ip->i_ino, | 431 | (unsigned long long) ip->i_ino, |
438 | (long long) size); | 432 | (long long) size); |
439 | XFS_CORRUPTION_ERROR("xfs_iformat(8)", | 433 | XFS_CORRUPTION_ERROR("xfs_iformat(8)", |
@@ -488,9 +482,8 @@ xfs_iformat_local( | |||
488 | * kmem_alloc() or memcpy() below. | 482 | * kmem_alloc() or memcpy() below. |
489 | */ | 483 | */ |
490 | if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { | 484 | if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { |
491 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 485 | xfs_warn(ip->i_mount, |
492 | "corrupt inode %Lu " | 486 | "corrupt inode %Lu (bad size %d for local fork, size = %d).", |
493 | "(bad size %d for local fork, size = %d).", | ||
494 | (unsigned long long) ip->i_ino, size, | 487 | (unsigned long long) ip->i_ino, size, |
495 | XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); | 488 | XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); |
496 | XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, | 489 | XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, |
@@ -547,8 +540,7 @@ xfs_iformat_extents( | |||
547 | * kmem_alloc() or memcpy() below. | 540 | * kmem_alloc() or memcpy() below. |
548 | */ | 541 | */ |
549 | if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { | 542 | if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { |
550 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 543 | xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", |
551 | "corrupt inode %Lu ((a)extents = %d).", | ||
552 | (unsigned long long) ip->i_ino, nex); | 544 | (unsigned long long) ip->i_ino, nex); |
553 | XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, | 545 | XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, |
554 | ip->i_mount, dip); | 546 | ip->i_mount, dip); |
@@ -623,11 +615,10 @@ xfs_iformat_btree( | |||
623 | || XFS_BMDR_SPACE_CALC(nrecs) > | 615 | || XFS_BMDR_SPACE_CALC(nrecs) > |
624 | XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) | 616 | XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) |
625 | || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { | 617 | || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { |
626 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | 618 | xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", |
627 | "corrupt inode %Lu (btree).", | ||
628 | (unsigned long long) ip->i_ino); | 619 | (unsigned long long) ip->i_ino); |
629 | XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, | 620 | XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, |
630 | ip->i_mount); | 621 | ip->i_mount, dip); |
631 | return XFS_ERROR(EFSCORRUPTED); | 622 | return XFS_ERROR(EFSCORRUPTED); |
632 | } | 623 | } |
633 | 624 | ||
@@ -660,7 +651,8 @@ xfs_dinode_from_disk( | |||
660 | to->di_uid = be32_to_cpu(from->di_uid); | 651 | to->di_uid = be32_to_cpu(from->di_uid); |
661 | to->di_gid = be32_to_cpu(from->di_gid); | 652 | to->di_gid = be32_to_cpu(from->di_gid); |
662 | to->di_nlink = be32_to_cpu(from->di_nlink); | 653 | to->di_nlink = be32_to_cpu(from->di_nlink); |
663 | to->di_projid = be16_to_cpu(from->di_projid); | 654 | to->di_projid_lo = be16_to_cpu(from->di_projid_lo); |
655 | to->di_projid_hi = be16_to_cpu(from->di_projid_hi); | ||
664 | memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); | 656 | memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); |
665 | to->di_flushiter = be16_to_cpu(from->di_flushiter); | 657 | to->di_flushiter = be16_to_cpu(from->di_flushiter); |
666 | to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); | 658 | to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); |
@@ -695,7 +687,8 @@ xfs_dinode_to_disk( | |||
695 | to->di_uid = cpu_to_be32(from->di_uid); | 687 | to->di_uid = cpu_to_be32(from->di_uid); |
696 | to->di_gid = cpu_to_be32(from->di_gid); | 688 | to->di_gid = cpu_to_be32(from->di_gid); |
697 | to->di_nlink = cpu_to_be32(from->di_nlink); | 689 | to->di_nlink = cpu_to_be32(from->di_nlink); |
698 | to->di_projid = cpu_to_be16(from->di_projid); | 690 | to->di_projid_lo = cpu_to_be16(from->di_projid_lo); |
691 | to->di_projid_hi = cpu_to_be16(from->di_projid_hi); | ||
699 | memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); | 692 | memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); |
700 | to->di_flushiter = cpu_to_be16(from->di_flushiter); | 693 | to->di_flushiter = cpu_to_be16(from->di_flushiter); |
701 | to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); | 694 | to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); |
@@ -811,11 +804,9 @@ xfs_iread( | |||
811 | */ | 804 | */ |
812 | if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { | 805 | if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { |
813 | #ifdef DEBUG | 806 | #ifdef DEBUG |
814 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " | 807 | xfs_alert(mp, |
815 | "dip->di_magic (0x%x) != " | 808 | "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", |
816 | "XFS_DINODE_MAGIC (0x%x)", | 809 | __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); |
817 | be16_to_cpu(dip->di_magic), | ||
818 | XFS_DINODE_MAGIC); | ||
819 | #endif /* DEBUG */ | 810 | #endif /* DEBUG */ |
820 | error = XFS_ERROR(EINVAL); | 811 | error = XFS_ERROR(EINVAL); |
821 | goto out_brelse; | 812 | goto out_brelse; |
@@ -833,9 +824,8 @@ xfs_iread( | |||
833 | error = xfs_iformat(ip, dip); | 824 | error = xfs_iformat(ip, dip); |
834 | if (error) { | 825 | if (error) { |
835 | #ifdef DEBUG | 826 | #ifdef DEBUG |
836 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " | 827 | xfs_alert(mp, "%s: xfs_iformat() returned error %d", |
837 | "xfs_iformat() returned error %d", | 828 | __func__, error); |
838 | error); | ||
839 | #endif /* DEBUG */ | 829 | #endif /* DEBUG */ |
840 | goto out_brelse; | 830 | goto out_brelse; |
841 | } | 831 | } |
@@ -874,7 +864,7 @@ xfs_iread( | |||
874 | if (ip->i_d.di_version == 1) { | 864 | if (ip->i_d.di_version == 1) { |
875 | ip->i_d.di_nlink = ip->i_d.di_onlink; | 865 | ip->i_d.di_nlink = ip->i_d.di_onlink; |
876 | ip->i_d.di_onlink = 0; | 866 | ip->i_d.di_onlink = 0; |
877 | ip->i_d.di_projid = 0; | 867 | xfs_set_projid(ip, 0); |
878 | } | 868 | } |
879 | 869 | ||
880 | ip->i_delayed_blks = 0; | 870 | ip->i_delayed_blks = 0; |
@@ -885,7 +875,7 @@ xfs_iread( | |||
885 | * around for a while. This helps to keep recently accessed | 875 | * around for a while. This helps to keep recently accessed |
886 | * meta-data in-core longer. | 876 | * meta-data in-core longer. |
887 | */ | 877 | */ |
888 | XFS_BUF_SET_REF(bp, XFS_INO_REF); | 878 | xfs_buf_set_ref(bp, XFS_INO_REF); |
889 | 879 | ||
890 | /* | 880 | /* |
891 | * Use xfs_trans_brelse() to release the buffer containing the | 881 | * Use xfs_trans_brelse() to release the buffer containing the |
@@ -930,7 +920,6 @@ xfs_iread_extents( | |||
930 | /* | 920 | /* |
931 | * We know that the size is valid (it's checked in iformat_btree) | 921 | * We know that the size is valid (it's checked in iformat_btree) |
932 | */ | 922 | */ |
933 | ifp->if_lastex = NULLEXTNUM; | ||
934 | ifp->if_bytes = ifp->if_real_bytes = 0; | 923 | ifp->if_bytes = ifp->if_real_bytes = 0; |
935 | ifp->if_flags |= XFS_IFEXTENTS; | 924 | ifp->if_flags |= XFS_IFEXTENTS; |
936 | xfs_iext_add(ifp, 0, nextents); | 925 | xfs_iext_add(ifp, 0, nextents); |
@@ -982,8 +971,7 @@ xfs_ialloc( | |||
982 | mode_t mode, | 971 | mode_t mode, |
983 | xfs_nlink_t nlink, | 972 | xfs_nlink_t nlink, |
984 | xfs_dev_t rdev, | 973 | xfs_dev_t rdev, |
985 | cred_t *cr, | 974 | prid_t prid, |
986 | xfs_prid_t prid, | ||
987 | int okalloc, | 975 | int okalloc, |
988 | xfs_buf_t **ialloc_context, | 976 | xfs_buf_t **ialloc_context, |
989 | boolean_t *call_again, | 977 | boolean_t *call_again, |
@@ -1015,8 +1003,8 @@ xfs_ialloc( | |||
1015 | * This is because we're setting fields here we need | 1003 | * This is because we're setting fields here we need |
1016 | * to prevent others from looking at until we're done. | 1004 | * to prevent others from looking at until we're done. |
1017 | */ | 1005 | */ |
1018 | error = xfs_trans_iget(tp->t_mountp, tp, ino, | 1006 | error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, |
1019 | XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); | 1007 | XFS_ILOCK_EXCL, &ip); |
1020 | if (error) | 1008 | if (error) |
1021 | return error; | 1009 | return error; |
1022 | ASSERT(ip != NULL); | 1010 | ASSERT(ip != NULL); |
@@ -1027,7 +1015,7 @@ xfs_ialloc( | |||
1027 | ASSERT(ip->i_d.di_nlink == nlink); | 1015 | ASSERT(ip->i_d.di_nlink == nlink); |
1028 | ip->i_d.di_uid = current_fsuid(); | 1016 | ip->i_d.di_uid = current_fsuid(); |
1029 | ip->i_d.di_gid = current_fsgid(); | 1017 | ip->i_d.di_gid = current_fsgid(); |
1030 | ip->i_d.di_projid = prid; | 1018 | xfs_set_projid(ip, prid); |
1031 | memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); | 1019 | memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); |
1032 | 1020 | ||
1033 | /* | 1021 | /* |
@@ -1165,6 +1153,7 @@ xfs_ialloc( | |||
1165 | /* | 1153 | /* |
1166 | * Log the new values stuffed into the inode. | 1154 | * Log the new values stuffed into the inode. |
1167 | */ | 1155 | */ |
1156 | xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); | ||
1168 | xfs_trans_log_inode(tp, ip, flags); | 1157 | xfs_trans_log_inode(tp, ip, flags); |
1169 | 1158 | ||
1170 | /* now that we have an i_mode we can setup inode ops and unlock */ | 1159 | /* now that we have an i_mode we can setup inode ops and unlock */ |
@@ -1364,7 +1353,7 @@ xfs_itruncate_start( | |||
1364 | return 0; | 1353 | return 0; |
1365 | } | 1354 | } |
1366 | last_byte = xfs_file_last_byte(ip); | 1355 | last_byte = xfs_file_last_byte(ip); |
1367 | trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte); | 1356 | trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte); |
1368 | if (last_byte > toss_start) { | 1357 | if (last_byte > toss_start) { |
1369 | if (flags & XFS_ITRUNC_DEFINITE) { | 1358 | if (flags & XFS_ITRUNC_DEFINITE) { |
1370 | xfs_tosspages(ip, toss_start, | 1359 | xfs_tosspages(ip, toss_start, |
@@ -1480,7 +1469,7 @@ xfs_itruncate_finish( | |||
1480 | * file but the log buffers containing the free and reallocation | 1469 | * file but the log buffers containing the free and reallocation |
1481 | * don't, then we'd end up with garbage in the blocks being freed. | 1470 | * don't, then we'd end up with garbage in the blocks being freed. |
1482 | * As long as we make the new_size permanent before actually | 1471 | * As long as we make the new_size permanent before actually |
1483 | * freeing any blocks it doesn't matter if they get writtten to. | 1472 | * freeing any blocks it doesn't matter if they get written to. |
1484 | * | 1473 | * |
1485 | * The callers must signal into us whether or not the size | 1474 | * The callers must signal into us whether or not the size |
1486 | * setting here must be synchronous. There are a few cases | 1475 | * setting here must be synchronous. There are a few cases |
@@ -1819,9 +1808,8 @@ xfs_iunlink_remove( | |||
1819 | */ | 1808 | */ |
1820 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); | 1809 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); |
1821 | if (error) { | 1810 | if (error) { |
1822 | cmn_err(CE_WARN, | 1811 | xfs_warn(mp, "%s: xfs_itobp() returned error %d.", |
1823 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", | 1812 | __func__, error); |
1824 | error, mp->m_fsname); | ||
1825 | return error; | 1813 | return error; |
1826 | } | 1814 | } |
1827 | next_agino = be32_to_cpu(dip->di_next_unlinked); | 1815 | next_agino = be32_to_cpu(dip->di_next_unlinked); |
@@ -1866,9 +1854,9 @@ xfs_iunlink_remove( | |||
1866 | error = xfs_inotobp(mp, tp, next_ino, &last_dip, | 1854 | error = xfs_inotobp(mp, tp, next_ino, &last_dip, |
1867 | &last_ibp, &last_offset, 0); | 1855 | &last_ibp, &last_offset, 0); |
1868 | if (error) { | 1856 | if (error) { |
1869 | cmn_err(CE_WARN, | 1857 | xfs_warn(mp, |
1870 | "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", | 1858 | "%s: xfs_inotobp() returned error %d.", |
1871 | error, mp->m_fsname); | 1859 | __func__, error); |
1872 | return error; | 1860 | return error; |
1873 | } | 1861 | } |
1874 | next_agino = be32_to_cpu(last_dip->di_next_unlinked); | 1862 | next_agino = be32_to_cpu(last_dip->di_next_unlinked); |
@@ -1881,9 +1869,8 @@ xfs_iunlink_remove( | |||
1881 | */ | 1869 | */ |
1882 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); | 1870 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); |
1883 | if (error) { | 1871 | if (error) { |
1884 | cmn_err(CE_WARN, | 1872 | xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", |
1885 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", | 1873 | __func__, error); |
1886 | error, mp->m_fsname); | ||
1887 | return error; | 1874 | return error; |
1888 | } | 1875 | } |
1889 | next_agino = be32_to_cpu(dip->di_next_unlinked); | 1876 | next_agino = be32_to_cpu(dip->di_next_unlinked); |
@@ -1999,15 +1986,31 @@ xfs_ifree_cluster( | |||
1999 | */ | 1986 | */ |
2000 | for (i = 0; i < ninodes; i++) { | 1987 | for (i = 0; i < ninodes; i++) { |
2001 | retry: | 1988 | retry: |
2002 | read_lock(&pag->pag_ici_lock); | 1989 | rcu_read_lock(); |
2003 | ip = radix_tree_lookup(&pag->pag_ici_root, | 1990 | ip = radix_tree_lookup(&pag->pag_ici_root, |
2004 | XFS_INO_TO_AGINO(mp, (inum + i))); | 1991 | XFS_INO_TO_AGINO(mp, (inum + i))); |
2005 | 1992 | ||
2006 | /* Inode not in memory or stale, nothing to do */ | 1993 | /* Inode not in memory, nothing to do */ |
2007 | if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { | 1994 | if (!ip) { |
2008 | read_unlock(&pag->pag_ici_lock); | 1995 | rcu_read_unlock(); |
1996 | continue; | ||
1997 | } | ||
1998 | |||
1999 | /* | ||
2000 | * because this is an RCU protected lookup, we could | ||
2001 | * find a recently freed or even reallocated inode | ||
2002 | * during the lookup. We need to check under the | ||
2003 | * i_flags_lock for a valid inode here. Skip it if it | ||
2004 | * is not valid, the wrong inode or stale. | ||
2005 | */ | ||
2006 | spin_lock(&ip->i_flags_lock); | ||
2007 | if (ip->i_ino != inum + i || | ||
2008 | __xfs_iflags_test(ip, XFS_ISTALE)) { | ||
2009 | spin_unlock(&ip->i_flags_lock); | ||
2010 | rcu_read_unlock(); | ||
2009 | continue; | 2011 | continue; |
2010 | } | 2012 | } |
2013 | spin_unlock(&ip->i_flags_lock); | ||
2011 | 2014 | ||
2012 | /* | 2015 | /* |
2013 | * Don't try to lock/unlock the current inode, but we | 2016 | * Don't try to lock/unlock the current inode, but we |
@@ -2018,11 +2021,11 @@ retry: | |||
2018 | */ | 2021 | */ |
2019 | if (ip != free_ip && | 2022 | if (ip != free_ip && |
2020 | !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { | 2023 | !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { |
2021 | read_unlock(&pag->pag_ici_lock); | 2024 | rcu_read_unlock(); |
2022 | delay(1); | 2025 | delay(1); |
2023 | goto retry; | 2026 | goto retry; |
2024 | } | 2027 | } |
2025 | read_unlock(&pag->pag_ici_lock); | 2028 | rcu_read_unlock(); |
2026 | 2029 | ||
2027 | xfs_iflock(ip); | 2030 | xfs_iflock(ip); |
2028 | xfs_iflags_set(ip, XFS_ISTALE); | 2031 | xfs_iflags_set(ip, XFS_ISTALE); |
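The hunk above trades the pag_ici_lock read lock for an RCU read lock around the radix tree lookup. RCU gives no guarantee that a looked-up slot still names the same object, so every hit has to be revalidated under i_flags_lock before use. A minimal sketch of that pattern, with a hypothetical helper name (real callers must also pin the inode, e.g. via the ILOCK or flush lock, before dropping the RCU read lock; that step is elided here):

    static struct xfs_inode *
    xfs_ici_lookup_valid(
    	struct xfs_mount	*mp,
    	struct xfs_perag	*pag,
    	xfs_ino_t		ino)
    {
    	struct xfs_inode	*ip;
    	struct xfs_inode	*valid = NULL;

    	rcu_read_lock();
    	ip = radix_tree_lookup(&pag->pag_ici_root,
    			       XFS_INO_TO_AGINO(mp, ino));
    	if (ip) {
    		/* slot may have been freed and reused: recheck identity */
    		spin_lock(&ip->i_flags_lock);
    		if (ip->i_ino == ino && !__xfs_iflags_test(ip, XFS_ISTALE))
    			valid = ip;	/* still the inode we asked for */
    		spin_unlock(&ip->i_flags_lock);
    	}
    	rcu_read_unlock();
    	return valid;
    }

The same validate-under-i_flags_lock step reappears in the xfs_iflush_cluster hunk below.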
@@ -2554,12 +2557,9 @@ xfs_iflush_fork( | |||
2554 | case XFS_DINODE_FMT_EXTENTS: | 2557 | case XFS_DINODE_FMT_EXTENTS: |
2555 | ASSERT((ifp->if_flags & XFS_IFEXTENTS) || | 2558 | ASSERT((ifp->if_flags & XFS_IFEXTENTS) || |
2556 | !(iip->ili_format.ilf_fields & extflag[whichfork])); | 2559 | !(iip->ili_format.ilf_fields & extflag[whichfork])); |
2557 | ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) || | ||
2558 | (ifp->if_bytes == 0)); | ||
2559 | ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) || | ||
2560 | (ifp->if_bytes > 0)); | ||
2561 | if ((iip->ili_format.ilf_fields & extflag[whichfork]) && | 2560 | if ((iip->ili_format.ilf_fields & extflag[whichfork]) && |
2562 | (ifp->if_bytes > 0)) { | 2561 | (ifp->if_bytes > 0)) { |
2562 | ASSERT(xfs_iext_get_ext(ifp, 0)); | ||
2563 | ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); | 2563 | ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); |
2564 | (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, | 2564 | (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, |
2565 | whichfork); | 2565 | whichfork); |
@@ -2628,7 +2628,7 @@ xfs_iflush_cluster( | |||
2628 | 2628 | ||
2629 | mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); | 2629 | mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); |
2630 | first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; | 2630 | first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; |
2631 | read_lock(&pag->pag_ici_lock); | 2631 | rcu_read_lock(); |
2632 | /* really need a gang lookup range call here */ | 2632 | /* really need a gang lookup range call here */ |
2633 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, | 2633 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, |
2634 | first_index, inodes_per_cluster); | 2634 | first_index, inodes_per_cluster); |
@@ -2639,9 +2639,21 @@ xfs_iflush_cluster( | |||
2639 | iq = ilist[i]; | 2639 | iq = ilist[i]; |
2640 | if (iq == ip) | 2640 | if (iq == ip) |
2641 | continue; | 2641 | continue; |
2642 | /* if the inode lies outside this cluster, we're done. */ | 2642 | |
2643 | if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) | 2643 | /* |
2644 | break; | 2644 | * because this is an RCU protected lookup, we could find a |
2645 | * recently freed or even reallocated inode during the lookup. | ||
2646 | * We need to check under the i_flags_lock for a valid inode | ||
2647 | * here. Skip it if it is not valid or the wrong inode. | ||
2648 | */ | ||
2649 | spin_lock(&iq->i_flags_lock); | ||
2650 | if (!iq->i_ino || | ||
2651 | (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { | ||
2652 | spin_unlock(&iq->i_flags_lock); | ||
2653 | continue; | ||
2654 | } | ||
2655 | spin_unlock(&iq->i_flags_lock); | ||
2656 | |||
2645 | /* | 2657 | /* |
2646 | * Do an un-protected check to see if the inode is dirty and | 2658 | * Do an un-protected check to see if the inode is dirty and |
2647 | * is a candidate for flushing. These checks will be repeated | 2659 | * is a candidate for flushing. These checks will be repeated |
@@ -2691,7 +2703,7 @@ xfs_iflush_cluster( | |||
2691 | } | 2703 | } |
2692 | 2704 | ||
2693 | out_free: | 2705 | out_free: |
2694 | read_unlock(&pag->pag_ici_lock); | 2706 | rcu_read_unlock(); |
2695 | kmem_free(ilist); | 2707 | kmem_free(ilist); |
2696 | out_put: | 2708 | out_put: |
2697 | xfs_perag_put(pag); | 2709 | xfs_perag_put(pag); |
@@ -2703,7 +2715,7 @@ cluster_corrupt_out: | |||
2703 | * Corruption detected in the clustering loop. Invalidate the | 2715 | * Corruption detected in the clustering loop. Invalidate the |
2704 | * inode buffer and shut down the filesystem. | 2716 | * inode buffer and shut down the filesystem. |
2705 | */ | 2717 | */ |
2706 | read_unlock(&pag->pag_ici_lock); | 2718 | rcu_read_unlock(); |
2707 | /* | 2719 | /* |
2708 | * Clean up the buffer. If it was B_DELWRI, just release it -- | 2720 | * Clean up the buffer. If it was B_DELWRI, just release it -- |
2709 | * brelse can handle it with no problems. If not, shut down the | 2721 | * brelse can handle it with no problems. If not, shut down the |
@@ -2725,7 +2737,7 @@ cluster_corrupt_out: | |||
2725 | XFS_BUF_UNDONE(bp); | 2737 | XFS_BUF_UNDONE(bp); |
2726 | XFS_BUF_STALE(bp); | 2738 | XFS_BUF_STALE(bp); |
2727 | XFS_BUF_ERROR(bp,EIO); | 2739 | XFS_BUF_ERROR(bp,EIO); |
2728 | xfs_biodone(bp); | 2740 | xfs_buf_ioend(bp, 0); |
2729 | } else { | 2741 | } else { |
2730 | XFS_BUF_STALE(bp); | 2742 | XFS_BUF_STALE(bp); |
2731 | xfs_buf_relse(bp); | 2743 | xfs_buf_relse(bp); |
@@ -2773,7 +2785,7 @@ xfs_iflush( | |||
2773 | 2785 | ||
2774 | /* | 2786 | /* |
2775 | * We can't flush the inode until it is unpinned, so wait for it if we | 2787 | * We can't flush the inode until it is unpinned, so wait for it if we |
2776 | * are allowed to block. We know noone new can pin it, because we are | 2788 | * are allowed to block. We know no one new can pin it, because we are |
2777 | * holding the inode lock shared and you need to hold it exclusively to | 2789 | * holding the inode lock shared and you need to hold it exclusively to |
2778 | * pin the inode. | 2790 | * pin the inode. |
2779 | * | 2791 | * |
@@ -2819,7 +2831,7 @@ xfs_iflush( | |||
2819 | * Get the buffer containing the on-disk inode. | 2831 | * Get the buffer containing the on-disk inode. |
2820 | */ | 2832 | */ |
2821 | error = xfs_itobp(mp, NULL, ip, &dip, &bp, | 2833 | error = xfs_itobp(mp, NULL, ip, &dip, &bp, |
2822 | (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); | 2834 | (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK); |
2823 | if (error || !bp) { | 2835 | if (error || !bp) { |
2824 | xfs_ifunlock(ip); | 2836 | xfs_ifunlock(ip); |
2825 | return error; | 2837 | return error; |
@@ -2910,16 +2922,16 @@ xfs_iflush_int( | |||
2910 | 2922 | ||
2911 | if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, | 2923 | if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, |
2912 | mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { | 2924 | mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { |
2913 | xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, | 2925 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
2914 | "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", | 2926 | "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", |
2915 | ip->i_ino, be16_to_cpu(dip->di_magic), dip); | 2927 | __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); |
2916 | goto corrupt_out; | 2928 | goto corrupt_out; |
2917 | } | 2929 | } |
2918 | if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, | 2930 | if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, |
2919 | mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { | 2931 | mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { |
2920 | xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, | 2932 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
2921 | "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", | 2933 | "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", |
2922 | ip->i_ino, ip, ip->i_d.di_magic); | 2934 | __func__, ip->i_ino, ip, ip->i_d.di_magic); |
2923 | goto corrupt_out; | 2935 | goto corrupt_out; |
2924 | } | 2936 | } |
2925 | if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { | 2937 | if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { |
@@ -2927,9 +2939,9 @@ xfs_iflush_int( | |||
2927 | (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && | 2939 | (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && |
2928 | (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), | 2940 | (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), |
2929 | mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { | 2941 | mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { |
2930 | xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, | 2942 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
2931 | "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", | 2943 | "%s: Bad regular inode %Lu, ptr 0x%p", |
2932 | ip->i_ino, ip); | 2944 | __func__, ip->i_ino, ip); |
2933 | goto corrupt_out; | 2945 | goto corrupt_out; |
2934 | } | 2946 | } |
2935 | } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { | 2947 | } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { |
@@ -2938,28 +2950,28 @@ xfs_iflush_int( | |||
2938 | (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && | 2950 | (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && |
2939 | (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), | 2951 | (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), |
2940 | mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { | 2952 | mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { |
2941 | xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, | 2953 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
2942 | "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", | 2954 | "%s: Bad directory inode %Lu, ptr 0x%p", |
2943 | ip->i_ino, ip); | 2955 | __func__, ip->i_ino, ip); |
2944 | goto corrupt_out; | 2956 | goto corrupt_out; |
2945 | } | 2957 | } |
2946 | } | 2958 | } |
2947 | if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > | 2959 | if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > |
2948 | ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, | 2960 | ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, |
2949 | XFS_RANDOM_IFLUSH_5)) { | 2961 | XFS_RANDOM_IFLUSH_5)) { |
2950 | xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, | 2962 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
2951 | "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", | 2963 | "%s: detected corrupt incore inode %Lu, " |
2952 | ip->i_ino, | 2964 | "total extents = %d, nblocks = %Ld, ptr 0x%p", |
2965 | __func__, ip->i_ino, | ||
2953 | ip->i_d.di_nextents + ip->i_d.di_anextents, | 2966 | ip->i_d.di_nextents + ip->i_d.di_anextents, |
2954 | ip->i_d.di_nblocks, | 2967 | ip->i_d.di_nblocks, ip); |
2955 | ip); | ||
2956 | goto corrupt_out; | 2968 | goto corrupt_out; |
2957 | } | 2969 | } |
2958 | if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, | 2970 | if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, |
2959 | mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { | 2971 | mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { |
2960 | xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, | 2972 | xfs_alert_tag(mp, XFS_PTAG_IFLUSH, |
2961 | "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", | 2973 | "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", |
2962 | ip->i_ino, ip->i_d.di_forkoff, ip); | 2974 | __func__, ip->i_ino, ip->i_d.di_forkoff, ip); |
2963 | goto corrupt_out; | 2975 | goto corrupt_out; |
2964 | } | 2976 | } |
2965 | /* | 2977 | /* |
@@ -3008,7 +3020,7 @@ xfs_iflush_int( | |||
3008 | memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); | 3020 | memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); |
3009 | memset(&(dip->di_pad[0]), 0, | 3021 | memset(&(dip->di_pad[0]), 0, |
3010 | sizeof(dip->di_pad)); | 3022 | sizeof(dip->di_pad)); |
3011 | ASSERT(ip->i_d.di_projid == 0); | 3023 | ASSERT(xfs_get_projid(ip) == 0); |
3012 | } | 3024 | } |
3013 | } | 3025 | } |
3014 | 3026 | ||
@@ -3096,6 +3108,8 @@ xfs_iext_get_ext( | |||
3096 | xfs_extnum_t idx) /* index of target extent */ | 3108 | xfs_extnum_t idx) /* index of target extent */ |
3097 | { | 3109 | { |
3098 | ASSERT(idx >= 0); | 3110 | ASSERT(idx >= 0); |
3111 | ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); | ||
3112 | |||
3099 | if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { | 3113 | if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { |
3100 | return ifp->if_u1.if_ext_irec->er_extbuf; | 3114 | return ifp->if_u1.if_ext_irec->er_extbuf; |
3101 | } else if (ifp->if_flags & XFS_IFEXTIREC) { | 3115 | } else if (ifp->if_flags & XFS_IFEXTIREC) { |
@@ -3175,7 +3189,6 @@ xfs_iext_add( | |||
3175 | } | 3189 | } |
3176 | ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; | 3190 | ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; |
3177 | ifp->if_real_bytes = 0; | 3191 | ifp->if_real_bytes = 0; |
3178 | ifp->if_lastex = nextents + ext_diff; | ||
3179 | } | 3192 | } |
3180 | /* | 3193 | /* |
3181 | * Otherwise use a linear (direct) extent list. | 3194 | * Otherwise use a linear (direct) extent list. |
@@ -3870,8 +3883,10 @@ xfs_iext_idx_to_irec( | |||
3870 | xfs_extnum_t page_idx = *idxp; /* extent index in target list */ | 3883 | xfs_extnum_t page_idx = *idxp; /* extent index in target list */ |
3871 | 3884 | ||
3872 | ASSERT(ifp->if_flags & XFS_IFEXTIREC); | 3885 | ASSERT(ifp->if_flags & XFS_IFEXTIREC); |
3873 | ASSERT(page_idx >= 0 && page_idx <= | 3886 | ASSERT(page_idx >= 0); |
3874 | ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); | 3887 | ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); |
3888 | ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); | ||
3889 | |||
3875 | nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; | 3890 | nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; |
3876 | erp_idx = 0; | 3891 | erp_idx = 0; |
3877 | low = 0; | 3892 | low = 0; |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0898c5417d12..964cfea77686 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -67,7 +67,6 @@ typedef struct xfs_ifork { | |||
67 | short if_broot_bytes; /* bytes allocated for root */ | 67 | short if_broot_bytes; /* bytes allocated for root */ |
68 | unsigned char if_flags; /* per-fork flags */ | 68 | unsigned char if_flags; /* per-fork flags */ |
69 | unsigned char if_ext_max; /* max # of extent records */ | 69 | unsigned char if_ext_max; /* max # of extent records */ |
70 | xfs_extnum_t if_lastex; /* last if_extents used */ | ||
71 | union { | 70 | union { |
72 | xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ | 71 | xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ |
73 | xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ | 72 | xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ |
@@ -111,7 +110,7 @@ struct xfs_imap { | |||
111 | * Generally, we do not want to hold the i_rlock while holding the | 110 | * Generally, we do not want to hold the i_rlock while holding the |
112 | * i_ilock. Hierarchy is i_iolock followed by i_rlock. | 111 | * i_ilock. Hierarchy is i_iolock followed by i_rlock. |
113 | * | 112 | * |
114 | * xfs_iptr_t contains all the inode fields upto and including the | 113 | * xfs_iptr_t contains all the inode fields up to and including the |
115 | * i_mnext and i_mprev fields, it is used as a marker in the inode | 114 | * i_mnext and i_mprev fields, it is used as a marker in the inode |
116 | * chain off the mount structure by xfs_sync calls. | 115 | * chain off the mount structure by xfs_sync calls. |
117 | */ | 116 | */ |
@@ -134,8 +133,9 @@ typedef struct xfs_icdinode { | |||
134 | __uint32_t di_uid; /* owner's user id */ | 133 | __uint32_t di_uid; /* owner's user id */ |
135 | __uint32_t di_gid; /* owner's group id */ | 134 | __uint32_t di_gid; /* owner's group id */ |
136 | __uint32_t di_nlink; /* number of links to file */ | 135 | __uint32_t di_nlink; /* number of links to file */ |
137 | __uint16_t di_projid; /* owner's project id */ | 136 | __uint16_t di_projid_lo; /* lower part of owner's project id */ |
138 | __uint8_t di_pad[8]; /* unused, zeroed space */ | 137 | __uint16_t di_projid_hi; /* higher part of owner's project id */ |
138 | __uint8_t di_pad[6]; /* unused, zeroed space */ | ||
139 | __uint16_t di_flushiter; /* incremented on flush */ | 139 | __uint16_t di_flushiter; /* incremented on flush */ |
140 | xfs_ictimestamp_t di_atime; /* time last accessed */ | 140 | xfs_ictimestamp_t di_atime; /* time last accessed */ |
141 | xfs_ictimestamp_t di_mtime; /* time last modified */ | 141 | xfs_ictimestamp_t di_mtime; /* time last modified */ |
@@ -212,7 +212,6 @@ typedef struct xfs_icdinode { | |||
212 | #ifdef __KERNEL__ | 212 | #ifdef __KERNEL__ |
213 | 213 | ||
214 | struct bhv_desc; | 214 | struct bhv_desc; |
215 | struct cred; | ||
216 | struct xfs_buf; | 215 | struct xfs_buf; |
217 | struct xfs_bmap_free; | 216 | struct xfs_bmap_free; |
218 | struct xfs_bmbt_irec; | 217 | struct xfs_bmbt_irec; |
@@ -335,6 +334,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) | |||
335 | } | 334 | } |
336 | 335 | ||
337 | /* | 336 | /* |
337 | * Project quota id helpers (previously projid was 16bit only; | ||
338 | * using two 16bit values to hold the new 32bit projid was chosen | ||
339 | * to retain compatibility with "old" filesystems). | ||
340 | */ | ||
341 | static inline prid_t | ||
342 | xfs_get_projid(struct xfs_inode *ip) | ||
343 | { | ||
344 | return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; | ||
345 | } | ||
346 | |||
347 | static inline void | ||
348 | xfs_set_projid(struct xfs_inode *ip, | ||
349 | prid_t projid) | ||
350 | { | ||
351 | ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16); | ||
352 | ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); | ||
353 | } | ||
354 | |||
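As a quick illustration of the encoding (the values below are hypothetical), storing a 32-bit project id of 0x00012345 leaves 0x0001 in di_projid_hi and 0x2345 in di_projid_lo, and xfs_get_projid() reassembles it losslessly:

    /* illustrative only; not part of the patch */
    prid_t	prid = 0x00012345;

    xfs_set_projid(ip, prid);
    ASSERT(ip->i_d.di_projid_hi == 0x0001);	/* prid >> 16 */
    ASSERT(ip->i_d.di_projid_lo == 0x2345);	/* prid & 0xffff */
    ASSERT(xfs_get_projid(ip) == prid);		/* lossless round trip */

Filesystems that only ever stored 16-bit project ids have di_projid_hi == 0 (it sits in what used to be di_pad), which is how on-disk compatibility is retained.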
355 | /* | ||
338 | * Manage the i_flush queue embedded in the inode. This completion | 356 | * Manage the i_flush queue embedded in the inode. This completion |
339 | * queue synchronizes processes attempting to flush the in-core | 357 | * queue synchronizes processes attempting to flush the in-core |
340 | * inode back to disk. | 358 | * inode back to disk. |
@@ -357,12 +375,23 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) | |||
357 | /* | 375 | /* |
358 | * In-core inode flags. | 376 | * In-core inode flags. |
359 | */ | 377 | */ |
360 | #define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ | 378 | #define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ |
361 | #define XFS_ISTALE 0x0002 /* inode has been staled */ | 379 | #define XFS_ISTALE 0x0002 /* inode has been staled */ |
362 | #define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ | 380 | #define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ |
363 | #define XFS_INEW 0x0008 /* inode has just been allocated */ | 381 | #define XFS_INEW 0x0008 /* inode has just been allocated */ |
364 | #define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ | 382 | #define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ |
365 | #define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ | 383 | #define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ |
384 | #define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ | ||
385 | |||
386 | /* | ||
387 | * Per-lifetime flags need to be reset when re-using a reclaimable inode during | ||
388 | * inode lookup. This prevents unintended behaviour on the new inode from | ||
389 | * occurring. | ||
390 | */ | ||
391 | #define XFS_IRECLAIM_RESET_FLAGS \ | ||
392 | (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ | ||
393 | XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \ | ||
394 | XFS_IFILESTREAM) | ||
366 | 395 | ||
367 | /* | 396 | /* |
368 | * Flags for inode locking. | 397 | * Flags for inode locking. |
@@ -389,28 +418,35 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) | |||
389 | /* | 418 | /* |
390 | * Flags for lockdep annotations. | 419 | * Flags for lockdep annotations. |
391 | * | 420 | * |
392 | * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes | 421 | * XFS_LOCK_PARENT - for directory operations that require locking a |
393 | * (ie directory operations that require locking a directory inode and | 422 | * parent directory inode and a child entry inode. The parent gets locked |
394 | * an entry inode). The first inode gets locked with this flag so it | 423 | * with this flag so it gets a lockdep subclass of 1 and the child entry |
395 | * gets a lockdep subclass of 1 and the second lock will have a lockdep | 424 | * lock will have a lockdep subclass of 0. |
396 | * subclass of 0. | 425 | * |
426 | * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary | ||
427 | * inodes do not participate in the normal lock order, and thus have their | ||
428 | * own subclasses. | ||
397 | * | 429 | * |
398 | * XFS_LOCK_INUMORDER - for locking several inodes at the same time | 430 | * XFS_LOCK_INUMORDER - for locking several inodes at the same time |
399 | * with xfs_lock_inodes(). This flag is used as the starting subclass | 431 | * with xfs_lock_inodes(). This flag is used as the starting subclass |
400 | * and each subsequent lock acquired will increment the subclass by one. | 432 | * and each subsequent lock acquired will increment the subclass by one. |
401 | * So the first lock acquired will have a lockdep subclass of 2, the | 433 | * So the first lock acquired will have a lockdep subclass of 4, the |
402 | * second lock will have a lockdep subclass of 3, and so on. It is | 434 | * second lock will have a lockdep subclass of 5, and so on. It is |
403 | * the responsibility of the class builder to shift this to the correct | 435 | * the responsibility of the class builder to shift this to the correct |
404 | * portion of the lock_mode lockdep mask. | 436 | * portion of the lock_mode lockdep mask. |
405 | */ | 437 | */ |
406 | #define XFS_LOCK_PARENT 1 | 438 | #define XFS_LOCK_PARENT 1 |
407 | #define XFS_LOCK_INUMORDER 2 | 439 | #define XFS_LOCK_RTBITMAP 2 |
440 | #define XFS_LOCK_RTSUM 3 | ||
441 | #define XFS_LOCK_INUMORDER 4 | ||
408 | 442 | ||
409 | #define XFS_IOLOCK_SHIFT 16 | 443 | #define XFS_IOLOCK_SHIFT 16 |
410 | #define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) | 444 | #define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) |
411 | 445 | ||
412 | #define XFS_ILOCK_SHIFT 24 | 446 | #define XFS_ILOCK_SHIFT 24 |
413 | #define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) | 447 | #define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) |
448 | #define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) | ||
449 | #define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) | ||
414 | 450 | ||
415 | #define XFS_IOLOCK_DEP_MASK 0x00ff0000 | 451 | #define XFS_IOLOCK_DEP_MASK 0x00ff0000 |
416 | #define XFS_ILOCK_DEP_MASK 0xff000000 | 452 | #define XFS_ILOCK_DEP_MASK 0xff000000 |
@@ -419,6 +455,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) | |||
419 | #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) | 455 | #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) |
420 | #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) | 456 | #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) |
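Expanding the macros shows the layout concretely: the iolock subclass occupies bits 16-23 and the ilock subclass bits 24-31 of the lock_mode word, and the DEP macros recover the subclass again. Illustrative values only, not new code:

    XFS_IOLOCK_PARENT		/* 1 << 16 == 0x00010000 */
    XFS_ILOCK_PARENT		/* 1 << 24 == 0x01000000 */
    XFS_ILOCK_RTBITMAP		/* 2 << 24 == 0x02000000 */
    XFS_ILOCK_RTSUM		/* 3 << 24 == 0x03000000 */
    XFS_ILOCK_DEP(XFS_ILOCK_RTSUM)	/* == 3 == XFS_LOCK_RTSUM */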
421 | 457 | ||
458 | extern struct lock_class_key xfs_iolock_reclaimable; | ||
459 | |||
422 | /* | 460 | /* |
423 | * Flags for xfs_itruncate_start(). | 461 | * Flags for xfs_itruncate_start(). |
424 | */ | 462 | */ |
@@ -456,8 +494,8 @@ void xfs_inode_free(struct xfs_inode *ip); | |||
456 | * xfs_inode.c prototypes. | 494 | * xfs_inode.c prototypes. |
457 | */ | 495 | */ |
458 | int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, | 496 | int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, |
459 | xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, | 497 | xfs_nlink_t, xfs_dev_t, prid_t, int, |
460 | int, struct xfs_buf **, boolean_t *, xfs_inode_t **); | 498 | struct xfs_buf **, boolean_t *, xfs_inode_t **); |
461 | 499 | ||
462 | uint xfs_ip2xflags(struct xfs_inode *); | 500 | uint xfs_ip2xflags(struct xfs_inode *); |
463 | uint xfs_dic2xflags(struct xfs_dinode *); | 501 | uint xfs_dic2xflags(struct xfs_dinode *); |
@@ -471,7 +509,6 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); | |||
471 | void xfs_iext_realloc(xfs_inode_t *, int, int); | 509 | void xfs_iext_realloc(xfs_inode_t *, int, int); |
472 | void xfs_iunpin_wait(xfs_inode_t *); | 510 | void xfs_iunpin_wait(xfs_inode_t *); |
473 | int xfs_iflush(xfs_inode_t *, uint); | 511 | int xfs_iflush(xfs_inode_t *, uint); |
474 | void xfs_ichgtime(xfs_inode_t *, int); | ||
475 | void xfs_lock_inodes(xfs_inode_t **, int, uint); | 512 | void xfs_lock_inodes(xfs_inode_t **, int, uint); |
476 | void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); | 513 | void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); |
477 | 514 | ||
@@ -482,7 +519,7 @@ void xfs_mark_inode_dirty_sync(xfs_inode_t *); | |||
482 | #define IHOLD(ip) \ | 519 | #define IHOLD(ip) \ |
483 | do { \ | 520 | do { \ |
484 | ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ | 521 | ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ |
485 | atomic_inc(&(VFS_I(ip)->i_count)); \ | 522 | ihold(VFS_I(ip)); \ |
486 | trace_xfs_ihold(ip, _THIS_IP_); \ | 523 | trace_xfs_ihold(ip, _THIS_IP_); \ |
487 | } while (0) | 524 | } while (0) |
488 | 525 | ||
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index fe00777e2796..b1e88d56069c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
@@ -198,6 +198,41 @@ xfs_inode_item_size( | |||
198 | } | 198 | } |
199 | 199 | ||
200 | /* | 200 | /* |
201 | * xfs_inode_item_format_extents - convert in-core extents to on-disk form | ||
202 | * | ||
203 | * For either the data or attr fork in extent format, we need to endian convert | ||
204 | * the in-core extents as we place them into the on-disk inode. In this case, we | ||
205 | * need to do this conversion before we write the extents into the log. Because | ||
206 | * we don't have the disk inode to write into here, we allocate a buffer and | ||
207 | * format the extents into it via xfs_iextents_copy(). We free the buffer in | ||
208 | * the unlock routine after the copy for the log has been made. | ||
209 | * | ||
210 | * In the case of the data fork, the in-core and on-disk fork sizes can be | ||
211 | * different due to delayed allocation extents. We only log on-disk extents | ||
212 | * here, so always use the physical fork size to determine the size of the | ||
213 | * buffer we need to allocate. | ||
214 | */ | ||
215 | STATIC void | ||
216 | xfs_inode_item_format_extents( | ||
217 | struct xfs_inode *ip, | ||
218 | struct xfs_log_iovec *vecp, | ||
219 | int whichfork, | ||
220 | int type) | ||
221 | { | ||
222 | xfs_bmbt_rec_t *ext_buffer; | ||
223 | |||
224 | ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); | ||
225 | if (whichfork == XFS_DATA_FORK) | ||
226 | ip->i_itemp->ili_extents_buf = ext_buffer; | ||
227 | else | ||
228 | ip->i_itemp->ili_aextents_buf = ext_buffer; | ||
229 | |||
230 | vecp->i_addr = ext_buffer; | ||
231 | vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); | ||
232 | vecp->i_type = type; | ||
233 | } | ||
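Note that the helper allocates the conversion buffer with KM_SLEEP, so this path may block, and it stashes the buffer in ili_extents_buf or ili_aextents_buf so the unlock routine can free it once the copy for the log has been made, exactly as the comment above describes. The two hunks further down delete the previously duplicated open-coded versions of this logic from the data- and attr-fork branches of xfs_inode_item_format().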
234 | |||
235 | /* | ||
201 | * This is called to fill in the vector of log iovecs for the | 236 | * This is called to fill in the vector of log iovecs for the |
202 | * given inode log item. It fills the first item with an inode | 237 | * given inode log item. It fills the first item with an inode |
203 | * log format structure, the second with the on-disk inode structure, | 238 | * log format structure, the second with the on-disk inode structure, |
@@ -213,7 +248,6 @@ xfs_inode_item_format( | |||
213 | struct xfs_inode *ip = iip->ili_inode; | 248 | struct xfs_inode *ip = iip->ili_inode; |
214 | uint nvecs; | 249 | uint nvecs; |
215 | size_t data_bytes; | 250 | size_t data_bytes; |
216 | xfs_bmbt_rec_t *ext_buffer; | ||
217 | xfs_mount_t *mp; | 251 | xfs_mount_t *mp; |
218 | 252 | ||
219 | vecp->i_addr = &iip->ili_format; | 253 | vecp->i_addr = &iip->ili_format; |
@@ -223,15 +257,6 @@ xfs_inode_item_format( | |||
223 | nvecs = 1; | 257 | nvecs = 1; |
224 | 258 | ||
225 | /* | 259 | /* |
226 | * Make sure the linux inode is dirty. We do this before | ||
227 | * clearing i_update_core as the VFS will call back into | ||
228 | * XFS here and set i_update_core, so we need to dirty the | ||
229 | * inode first so that the ordering of i_update_core and | ||
230 | * unlogged modifications still works as described below. | ||
231 | */ | ||
232 | xfs_mark_inode_dirty_sync(ip); | ||
233 | |||
234 | /* | ||
235 | * Clear i_update_core if the timestamps (or any other | 260 | * Clear i_update_core if the timestamps (or any other |
236 | * non-transactional modification) need flushing/logging | 261 | * non-transactional modification) need flushing/logging |
237 | * and we're about to log them with the rest of the core. | 262 | * and we're about to log them with the rest of the core. |
@@ -329,22 +354,8 @@ xfs_inode_item_format( | |||
329 | } else | 354 | } else |
330 | #endif | 355 | #endif |
331 | { | 356 | { |
332 | /* | 357 | xfs_inode_item_format_extents(ip, vecp, |
333 | * There are delayed allocation extents | 358 | XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); |
334 | * in the inode, or we need to convert | ||
335 | * the extents to on disk format. | ||
336 | * Use xfs_iextents_copy() | ||
337 | * to copy only the real extents into | ||
338 | * a separate buffer. We'll free the | ||
339 | * buffer in the unlock routine. | ||
340 | */ | ||
341 | ext_buffer = kmem_alloc(ip->i_df.if_bytes, | ||
342 | KM_SLEEP); | ||
343 | iip->ili_extents_buf = ext_buffer; | ||
344 | vecp->i_addr = ext_buffer; | ||
345 | vecp->i_len = xfs_iextents_copy(ip, ext_buffer, | ||
346 | XFS_DATA_FORK); | ||
347 | vecp->i_type = XLOG_REG_TYPE_IEXT; | ||
348 | } | 359 | } |
349 | ASSERT(vecp->i_len <= ip->i_df.if_bytes); | 360 | ASSERT(vecp->i_len <= ip->i_df.if_bytes); |
350 | iip->ili_format.ilf_dsize = vecp->i_len; | 361 | iip->ili_format.ilf_dsize = vecp->i_len; |
@@ -454,19 +465,12 @@ xfs_inode_item_format( | |||
454 | */ | 465 | */ |
455 | vecp->i_addr = ip->i_afp->if_u1.if_extents; | 466 | vecp->i_addr = ip->i_afp->if_u1.if_extents; |
456 | vecp->i_len = ip->i_afp->if_bytes; | 467 | vecp->i_len = ip->i_afp->if_bytes; |
468 | vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; | ||
457 | #else | 469 | #else |
458 | ASSERT(iip->ili_aextents_buf == NULL); | 470 | ASSERT(iip->ili_aextents_buf == NULL); |
459 | /* | 471 | xfs_inode_item_format_extents(ip, vecp, |
460 | * Need to endian flip before logging | 472 | XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); |
461 | */ | ||
462 | ext_buffer = kmem_alloc(ip->i_afp->if_bytes, | ||
463 | KM_SLEEP); | ||
464 | iip->ili_aextents_buf = ext_buffer; | ||
465 | vecp->i_addr = ext_buffer; | ||
466 | vecp->i_len = xfs_iextents_copy(ip, ext_buffer, | ||
467 | XFS_ATTR_FORK); | ||
468 | #endif | 473 | #endif |
469 | vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; | ||
470 | iip->ili_format.ilf_asize = vecp->i_len; | 474 | iip->ili_format.ilf_asize = vecp->i_len; |
471 | vecp++; | 475 | vecp++; |
472 | nvecs++; | 476 | nvecs++; |
@@ -666,18 +670,39 @@ xfs_inode_item_unlock( | |||
666 | } | 670 | } |
667 | 671 | ||
668 | /* | 672 | /* |
669 | * This is called to find out where the oldest active copy of the | 673 | * This is called to find out where the oldest active copy of the inode log |
670 | * inode log item in the on disk log resides now that the last log | 674 | * item in the on disk log resides now that the last log write of it completed |
671 | * write of it completed at the given lsn. Since we always re-log | 675 | * at the given lsn. Since we always re-log all dirty data in an inode, the |
672 | * all dirty data in an inode, the latest copy in the on disk log | 676 | * latest copy in the on disk log is the only one that matters. Therefore, |
673 | * is the only one that matters. Therefore, simply return the | 677 | * simply return the given lsn. |
674 | * given lsn. | 678 | * |
679 | * If the inode has been marked stale because the cluster is being freed, we | ||
680 | * don't want to (re-)insert this inode into the AIL. There is a race condition | ||
681 | * where the cluster buffer may be unpinned before the inode is inserted into | ||
682 | * the AIL during transaction committed processing. If the buffer is unpinned | ||
683 | * before the inode item has been committed and inserted, then it is possible | ||
684 | * for the buffer to be written and IO completes before the inode is inserted | ||
685 | * into the AIL. In that case, we'd be inserting a clean, stale inode into the | ||
686 | * AIL which will never get removed. It will, however, get reclaimed which | ||
687 | * triggers an assert in xfs_inode_free() complaining about freeing an inode | ||
688 | * still in the AIL. | ||
689 | * | ||
690 | * To avoid this, just unpin the inode directly and return a LSN of -1 so the | ||
691 | * transaction committed code knows that it does not need to do any further | ||
692 | * processing on the item. | ||
675 | */ | 693 | */ |
676 | STATIC xfs_lsn_t | 694 | STATIC xfs_lsn_t |
677 | xfs_inode_item_committed( | 695 | xfs_inode_item_committed( |
678 | struct xfs_log_item *lip, | 696 | struct xfs_log_item *lip, |
679 | xfs_lsn_t lsn) | 697 | xfs_lsn_t lsn) |
680 | { | 698 | { |
699 | struct xfs_inode_log_item *iip = INODE_ITEM(lip); | ||
700 | struct xfs_inode *ip = iip->ili_inode; | ||
701 | |||
702 | if (xfs_iflags_test(ip, XFS_ISTALE)) { | ||
703 | xfs_inode_item_unpin(lip, 0); | ||
704 | return -1; | ||
705 | } | ||
681 | return lsn; | 706 | return lsn; |
682 | } | 707 | } |
683 | 708 | ||
@@ -750,11 +775,11 @@ xfs_inode_item_push( | |||
750 | * Push the inode to it's backing buffer. This will not remove the | 775 | * Push the inode to it's backing buffer. This will not remove the |
751 | * inode from the AIL - a further push will be required to trigger a | 776 | * inode from the AIL - a further push will be required to trigger a |
752 | * buffer push. However, this allows all the dirty inodes to be pushed | 777 | * buffer push. However, this allows all the dirty inodes to be pushed |
753 | * to the buffer before it is pushed to disk. THe buffer IO completion | 778 | * to the buffer before it is pushed to disk. The buffer IO completion |
754 | * will pull th einode from the AIL, mark it clean and unlock the flush | 779 | * will pull the inode from the AIL, mark it clean and unlock the flush |
755 | * lock. | 780 | * lock. |
756 | */ | 781 | */ |
757 | (void) xfs_iflush(ip, 0); | 782 | (void) xfs_iflush(ip, SYNC_TRYLOCK); |
758 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 783 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
759 | } | 784 | } |
760 | 785 | ||
@@ -832,15 +857,64 @@ xfs_inode_item_destroy( | |||
832 | * flushed to disk. It is responsible for removing the inode item | 857 | * flushed to disk. It is responsible for removing the inode item |
833 | * from the AIL if it has not been re-logged, and unlocking the inode's | 858 | * from the AIL if it has not been re-logged, and unlocking the inode's |
834 | * flush lock. | 859 | * flush lock. |
860 | * | ||
861 | * To reduce AIL lock traffic as much as possible, we scan the buffer log item | ||
862 | * list for other inodes that will run this function. We remove them from the | ||
863 | * buffer list so we can process all the inode IO completions in one AIL lock | ||
864 | * traversal. | ||
835 | */ | 865 | */ |
836 | void | 866 | void |
837 | xfs_iflush_done( | 867 | xfs_iflush_done( |
838 | struct xfs_buf *bp, | 868 | struct xfs_buf *bp, |
839 | struct xfs_log_item *lip) | 869 | struct xfs_log_item *lip) |
840 | { | 870 | { |
841 | struct xfs_inode_log_item *iip = INODE_ITEM(lip); | 871 | struct xfs_inode_log_item *iip; |
842 | xfs_inode_t *ip = iip->ili_inode; | 872 | struct xfs_log_item *blip; |
873 | struct xfs_log_item *next; | ||
874 | struct xfs_log_item *prev; | ||
843 | struct xfs_ail *ailp = lip->li_ailp; | 875 | struct xfs_ail *ailp = lip->li_ailp; |
876 | int need_ail = 0; | ||
877 | |||
878 | /* | ||
879 | * Scan the buffer IO completions for other inodes being completed and | ||
880 | * attach them to the current inode log item. | ||
881 | */ | ||
882 | blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); | ||
883 | prev = NULL; | ||
884 | while (blip != NULL) { | ||
885 | if (blip->li_cb != xfs_iflush_done) { | ||
886 | prev = blip; | ||
887 | blip = blip->li_bio_list; | ||
888 | continue; | ||
889 | } | ||
890 | |||
891 | /* remove from list */ | ||
892 | next = blip->li_bio_list; | ||
893 | if (!prev) { | ||
894 | XFS_BUF_SET_FSPRIVATE(bp, next); | ||
895 | } else { | ||
896 | prev->li_bio_list = next; | ||
897 | } | ||
898 | |||
899 | /* add to current list */ | ||
900 | blip->li_bio_list = lip->li_bio_list; | ||
901 | lip->li_bio_list = blip; | ||
902 | |||
903 | /* | ||
904 | * while we have the item, do the unlocked check for needing | ||
905 | * the AIL lock. | ||
906 | */ | ||
907 | iip = INODE_ITEM(blip); | ||
908 | if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) | ||
909 | need_ail++; | ||
910 | |||
911 | blip = next; | ||
912 | } | ||
913 | |||
914 | /* make sure we capture the state of the initial inode. */ | ||
915 | iip = INODE_ITEM(lip); | ||
916 | if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) | ||
917 | need_ail++; | ||
844 | 918 | ||
845 | /* | 919 | /* |
846 | * We only want to pull the item from the AIL if it is | 920 | * We only want to pull the item from the AIL if it is |
@@ -851,28 +925,37 @@ xfs_iflush_done( | |||
851 | * the lock since it's cheaper, and then we recheck while | 925 | * the lock since it's cheaper, and then we recheck while |
852 | * holding the lock before removing the inode from the AIL. | 926 | * holding the lock before removing the inode from the AIL. |
853 | */ | 927 | */ |
854 | if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { | 928 | if (need_ail) { |
929 | struct xfs_log_item *log_items[need_ail]; | ||
930 | int i = 0; | ||
855 | spin_lock(&ailp->xa_lock); | 931 | spin_lock(&ailp->xa_lock); |
856 | if (lip->li_lsn == iip->ili_flush_lsn) { | 932 | for (blip = lip; blip; blip = blip->li_bio_list) { |
857 | /* xfs_trans_ail_delete() drops the AIL lock. */ | 933 | iip = INODE_ITEM(blip); |
858 | xfs_trans_ail_delete(ailp, lip); | 934 | if (iip->ili_logged && |
859 | } else { | 935 | blip->li_lsn == iip->ili_flush_lsn) { |
860 | spin_unlock(&ailp->xa_lock); | 936 | log_items[i++] = blip; |
937 | } | ||
938 | ASSERT(i <= need_ail); | ||
861 | } | 939 | } |
940 | /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ | ||
941 | xfs_trans_ail_delete_bulk(ailp, log_items, i); | ||
862 | } | 942 | } |
863 | 943 | ||
864 | iip->ili_logged = 0; | ||
865 | 944 | ||
866 | /* | 945 | /* |
867 | * Clear the ili_last_fields bits now that we know that the | 946 | * clean up and unlock the flush lock now that we are done. We can clear the |
868 | * data corresponding to them is safely on disk. | 947 | * ili_last_fields bits now that we know that the data corresponding to |
948 | * them is safely on disk. | ||
869 | */ | 949 | */ |
870 | iip->ili_last_fields = 0; | 950 | for (blip = lip; blip; blip = next) { |
951 | next = blip->li_bio_list; | ||
952 | blip->li_bio_list = NULL; | ||
871 | 953 | ||
872 | /* | 954 | iip = INODE_ITEM(blip); |
873 | * Release the inode's flush lock since we're done with it. | 955 | iip->ili_logged = 0; |
874 | */ | 956 | iip->ili_last_fields = 0; |
875 | xfs_ifunlock(ip); | 957 | xfs_ifunlock(iip->ili_inode); |
958 | } | ||
876 | } | 959 | } |
877 | 960 | ||
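Read as a whole, the rewritten xfs_iflush_done() works in three passes: an unlocked scan that splices every item whose callback is xfs_iflush_done off the buffer's li_bio_list and counts the probable AIL removals; a single xa_lock section that rechecks each item's lsn and removes the whole batch with one xfs_trans_ail_delete_bulk() call (the candidates are gathered in a C99 variable-length array sized by that count); and a final unlocked pass that clears ili_logged and ili_last_fields and drops each inode's flush lock. One AIL lock round trip thus retires every inode carried by the buffer.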
878 | /* | 961 | /* |
@@ -889,7 +972,6 @@ xfs_iflush_abort( | |||
889 | { | 972 | { |
890 | xfs_inode_log_item_t *iip = ip->i_itemp; | 973 | xfs_inode_log_item_t *iip = ip->i_itemp; |
891 | 974 | ||
892 | iip = ip->i_itemp; | ||
893 | if (iip) { | 975 | if (iip) { |
894 | struct xfs_ail *ailp = iip->ili_item.li_ailp; | 976 | struct xfs_ail *ailp = iip->ili_item.li_ailp; |
895 | if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { | 977 | if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { |
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 20576146369f..091d82b94c4d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
@@ -47,127 +47,8 @@ | |||
47 | 47 | ||
48 | #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ | 48 | #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ |
49 | << mp->m_writeio_log) | 49 | << mp->m_writeio_log) |
50 | #define XFS_STRAT_WRITE_IMAPS 2 | ||
51 | #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP | 50 | #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP |
52 | 51 | ||
53 | STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, | ||
54 | int, struct xfs_bmbt_irec *, int *); | ||
55 | STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, | ||
56 | struct xfs_bmbt_irec *, int *); | ||
57 | STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, | ||
58 | struct xfs_bmbt_irec *, int *); | ||
59 | |||
60 | int | ||
61 | xfs_iomap( | ||
62 | struct xfs_inode *ip, | ||
63 | xfs_off_t offset, | ||
64 | ssize_t count, | ||
65 | int flags, | ||
66 | struct xfs_bmbt_irec *imap, | ||
67 | int *nimaps, | ||
68 | int *new) | ||
69 | { | ||
70 | struct xfs_mount *mp = ip->i_mount; | ||
71 | xfs_fileoff_t offset_fsb, end_fsb; | ||
72 | int error = 0; | ||
73 | int lockmode = 0; | ||
74 | int bmapi_flags = 0; | ||
75 | |||
76 | ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); | ||
77 | |||
78 | *new = 0; | ||
79 | |||
80 | if (XFS_FORCED_SHUTDOWN(mp)) | ||
81 | return XFS_ERROR(EIO); | ||
82 | |||
83 | trace_xfs_iomap_enter(ip, offset, count, flags, NULL); | ||
84 | |||
85 | switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) { | ||
86 | case BMAPI_READ: | ||
87 | lockmode = xfs_ilock_map_shared(ip); | ||
88 | bmapi_flags = XFS_BMAPI_ENTIRE; | ||
89 | break; | ||
90 | case BMAPI_WRITE: | ||
91 | lockmode = XFS_ILOCK_EXCL; | ||
92 | if (flags & BMAPI_IGNSTATE) | ||
93 | bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; | ||
94 | xfs_ilock(ip, lockmode); | ||
95 | break; | ||
96 | case BMAPI_ALLOCATE: | ||
97 | lockmode = XFS_ILOCK_SHARED; | ||
98 | bmapi_flags = XFS_BMAPI_ENTIRE; | ||
99 | |||
100 | /* Attempt non-blocking lock */ | ||
101 | if (flags & BMAPI_TRYLOCK) { | ||
102 | if (!xfs_ilock_nowait(ip, lockmode)) | ||
103 | return XFS_ERROR(EAGAIN); | ||
104 | } else { | ||
105 | xfs_ilock(ip, lockmode); | ||
106 | } | ||
107 | break; | ||
108 | default: | ||
109 | BUG(); | ||
110 | } | ||
111 | |||
112 | ASSERT(offset <= mp->m_maxioffset); | ||
113 | if ((xfs_fsize_t)offset + count > mp->m_maxioffset) | ||
114 | count = mp->m_maxioffset - offset; | ||
115 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); | ||
116 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | ||
117 | |||
118 | error = xfs_bmapi(NULL, ip, offset_fsb, | ||
119 | (xfs_filblks_t)(end_fsb - offset_fsb), | ||
120 | bmapi_flags, NULL, 0, imap, | ||
121 | nimaps, NULL); | ||
122 | |||
123 | if (error) | ||
124 | goto out; | ||
125 | |||
126 | switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { | ||
127 | case BMAPI_WRITE: | ||
128 | /* If we found an extent, return it */ | ||
129 | if (*nimaps && | ||
130 | (imap->br_startblock != HOLESTARTBLOCK) && | ||
131 | (imap->br_startblock != DELAYSTARTBLOCK)) { | ||
132 | trace_xfs_iomap_found(ip, offset, count, flags, imap); | ||
133 | break; | ||
134 | } | ||
135 | |||
136 | if (flags & BMAPI_DIRECT) { | ||
137 | error = xfs_iomap_write_direct(ip, offset, count, flags, | ||
138 | imap, nimaps); | ||
139 | } else { | ||
140 | error = xfs_iomap_write_delay(ip, offset, count, flags, | ||
141 | imap, nimaps); | ||
142 | } | ||
143 | if (!error) { | ||
144 | trace_xfs_iomap_alloc(ip, offset, count, flags, imap); | ||
145 | } | ||
146 | *new = 1; | ||
147 | break; | ||
148 | case BMAPI_ALLOCATE: | ||
149 | /* If we found an extent, return it */ | ||
150 | xfs_iunlock(ip, lockmode); | ||
151 | lockmode = 0; | ||
152 | |||
153 | if (*nimaps && !isnullstartblock(imap->br_startblock)) { | ||
154 | trace_xfs_iomap_found(ip, offset, count, flags, imap); | ||
155 | break; | ||
156 | } | ||
157 | |||
158 | error = xfs_iomap_write_allocate(ip, offset, count, | ||
159 | imap, nimaps); | ||
160 | break; | ||
161 | } | ||
162 | |||
163 | ASSERT(*nimaps <= 1); | ||
164 | |||
165 | out: | ||
166 | if (lockmode) | ||
167 | xfs_iunlock(ip, lockmode); | ||
168 | return XFS_ERROR(error); | ||
169 | } | ||
170 | |||
171 | STATIC int | 52 | STATIC int |
172 | xfs_iomap_eof_align_last_fsb( | 53 | xfs_iomap_eof_align_last_fsb( |
173 | xfs_mount_t *mp, | 54 | xfs_mount_t *mp, |
@@ -220,11 +101,11 @@ xfs_iomap_eof_align_last_fsb( | |||
220 | } | 101 | } |
221 | 102 | ||
222 | STATIC int | 103 | STATIC int |
223 | xfs_cmn_err_fsblock_zero( | 104 | xfs_alert_fsblock_zero( |
224 | xfs_inode_t *ip, | 105 | xfs_inode_t *ip, |
225 | xfs_bmbt_irec_t *imap) | 106 | xfs_bmbt_irec_t *imap) |
226 | { | 107 | { |
227 | xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, | 108 | xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, |
228 | "Access to block zero in inode %llu " | 109 | "Access to block zero in inode %llu " |
229 | "start_block: %llx start_off: %llx " | 110 | "start_block: %llx start_off: %llx " |
230 | "blkcnt: %llx extent-state: %x\n", | 111 | "blkcnt: %llx extent-state: %x\n", |
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero( | |||
236 | return EFSCORRUPTED; | 117 | return EFSCORRUPTED; |
237 | } | 118 | } |
238 | 119 | ||
239 | STATIC int | 120 | int |
240 | xfs_iomap_write_direct( | 121 | xfs_iomap_write_direct( |
241 | xfs_inode_t *ip, | 122 | xfs_inode_t *ip, |
242 | xfs_off_t offset, | 123 | xfs_off_t offset, |
243 | size_t count, | 124 | size_t count, |
244 | int flags, | ||
245 | xfs_bmbt_irec_t *imap, | 125 | xfs_bmbt_irec_t *imap, |
246 | int *nmaps) | 126 | int nmaps) |
247 | { | 127 | { |
248 | xfs_mount_t *mp = ip->i_mount; | 128 | xfs_mount_t *mp = ip->i_mount; |
249 | xfs_fileoff_t offset_fsb; | 129 | xfs_fileoff_t offset_fsb; |
@@ -279,7 +159,7 @@ xfs_iomap_write_direct( | |||
279 | if (error) | 159 | if (error) |
280 | goto error_out; | 160 | goto error_out; |
281 | } else { | 161 | } else { |
282 | if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) | 162 | if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) |
283 | last_fsb = MIN(last_fsb, (xfs_fileoff_t) | 163 | last_fsb = MIN(last_fsb, (xfs_fileoff_t) |
284 | imap->br_blockcount + | 164 | imap->br_blockcount + |
285 | imap->br_startoff); | 165 | imap->br_startoff); |
@@ -331,7 +211,7 @@ xfs_iomap_write_direct( | |||
331 | xfs_trans_ijoin(tp, ip); | 211 | xfs_trans_ijoin(tp, ip); |
332 | 212 | ||
333 | bmapi_flag = XFS_BMAPI_WRITE; | 213 | bmapi_flag = XFS_BMAPI_WRITE; |
334 | if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) | 214 | if (offset < ip->i_size || extsz) |
335 | bmapi_flag |= XFS_BMAPI_PREALLOC; | 215 | bmapi_flag |= XFS_BMAPI_PREALLOC; |
336 | 216 | ||
337 | /* | 217 | /* |
@@ -366,11 +246,10 @@ xfs_iomap_write_direct( | |||
366 | } | 246 | } |
367 | 247 | ||
368 | if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { | 248 | if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { |
369 | error = xfs_cmn_err_fsblock_zero(ip, imap); | 249 | error = xfs_alert_fsblock_zero(ip, imap); |
370 | goto error_out; | 250 | goto error_out; |
371 | } | 251 | } |
372 | 252 | ||
373 | *nmaps = 1; | ||
374 | return 0; | 253 | return 0; |
375 | 254 | ||
376 | error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ | 255 | error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ |
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ | |||
379 | 258 | ||
380 | error1: /* Just cancel transaction */ | 259 | error1: /* Just cancel transaction */ |
381 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); | 260 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); |
382 | *nmaps = 0; /* nothing set-up here */ | ||
383 | 261 | ||
384 | error_out: | 262 | error_out: |
385 | return XFS_ERROR(error); | 263 | return XFS_ERROR(error); |
@@ -389,6 +267,9 @@ error_out: | |||
389 | * If the caller is doing a write at the end of the file, then extend the | 267 | * If the caller is doing a write at the end of the file, then extend the |
390 | * allocation out to the file system's write iosize. We clean up any extra | 268 | * allocation out to the file system's write iosize. We clean up any extra |
391 | * space left over when the file is closed in xfs_inactive(). | 269 | * space left over when the file is closed in xfs_inactive(). |
270 | * | ||
271 | * If we find we already have delalloc preallocation beyond EOF, don't do more | ||
272 | * preallocation as it is not needed. | ||
392 | */ | 273 | */ |
393 | STATIC int | 274 | STATIC int |
394 | xfs_iomap_eof_want_preallocate( | 275 | xfs_iomap_eof_want_preallocate( |
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate( | |||
396 | xfs_inode_t *ip, | 277 | xfs_inode_t *ip, |
397 | xfs_off_t offset, | 278 | xfs_off_t offset, |
398 | size_t count, | 279 | size_t count, |
399 | int ioflag, | ||
400 | xfs_bmbt_irec_t *imap, | 280 | xfs_bmbt_irec_t *imap, |
401 | int nimaps, | 281 | int nimaps, |
402 | int *prealloc) | 282 | int *prealloc) |
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate( | |||
405 | xfs_filblks_t count_fsb; | 285 | xfs_filblks_t count_fsb; |
406 | xfs_fsblock_t firstblock; | 286 | xfs_fsblock_t firstblock; |
407 | int n, error, imaps; | 287 | int n, error, imaps; |
288 | int found_delalloc = 0; | ||
408 | 289 | ||
409 | *prealloc = 0; | 290 | *prealloc = 0; |
410 | if ((offset + count) <= ip->i_size) | 291 | if ((offset + count) <= ip->i_size) |
@@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate( | |||
429 | return 0; | 310 | return 0; |
430 | start_fsb += imap[n].br_blockcount; | 311 | start_fsb += imap[n].br_blockcount; |
431 | count_fsb -= imap[n].br_blockcount; | 312 | count_fsb -= imap[n].br_blockcount; |
313 | |||
314 | if (imap[n].br_startblock == DELAYSTARTBLOCK) | ||
315 | found_delalloc = 1; | ||
432 | } | 316 | } |
433 | } | 317 | } |
434 | *prealloc = 1; | 318 | if (!found_delalloc) |
319 | *prealloc = 1; | ||
435 | return 0; | 320 | return 0; |
436 | } | 321 | } |
437 | 322 | ||
438 | STATIC int | 323 | /* |
324 | * If we don't have a user specified preallocation size, dynamically increase | ||
325 | * the preallocation size as the size of the file grows. Cap the maximum size | ||
326 | * at a single extent or less if the filesystem is near full. The closer the | ||
327 | * filesystem is to full, the smaller the maximum preallocation. | ||
328 | */ | ||
329 | STATIC xfs_fsblock_t | ||
330 | xfs_iomap_prealloc_size( | ||
331 | struct xfs_mount *mp, | ||
332 | struct xfs_inode *ip) | ||
333 | { | ||
334 | xfs_fsblock_t alloc_blocks = 0; | ||
335 | |||
336 | if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { | ||
337 | int shift = 0; | ||
338 | int64_t freesp; | ||
339 | |||
340 | /* | ||
341 | * rounddown_pow_of_two() returns an undefined result | ||
342 | * if we pass in alloc_blocks = 0. Hence the "+ 1" to | ||
343 | * ensure we always pass in a non-zero value. | ||
344 | */ | ||
345 | alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; | ||
346 | alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, | ||
347 | rounddown_pow_of_two(alloc_blocks)); | ||
348 | |||
349 | xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); | ||
350 | freesp = mp->m_sb.sb_fdblocks; | ||
351 | if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { | ||
352 | shift = 2; | ||
353 | if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) | ||
354 | shift++; | ||
355 | if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) | ||
356 | shift++; | ||
357 | if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) | ||
358 | shift++; | ||
359 | if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) | ||
360 | shift++; | ||
361 | } | ||
362 | if (shift) | ||
363 | alloc_blocks >>= shift; | ||
364 | } | ||
365 | |||
366 | if (alloc_blocks < mp->m_writeio_blocks) | ||
367 | alloc_blocks = mp->m_writeio_blocks; | ||
368 | |||
369 | return alloc_blocks; | ||
370 | } | ||
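To make the throttling concrete, here is a worked trace under assumed numbers (4k blocks, i_size of 100MB, no user-specified XFS_MOUNT_DFLT_IOSIZE); the figures are illustrative only:

    alloc_blocks = 25600 + 1;                   /* XFS_B_TO_FSB(100MB) + 1 */
    alloc_blocks = rounddown_pow_of_two(25601); /* -> 16384 blocks (64MB) */
    /* with free space just under 3% of the fs, the 5%, 4% and 3%
     * thresholds fire, leaving shift == 4: */
    alloc_blocks >>= 4;                         /* -> 1024 blocks (4MB) */
    /* finally clamped so it never drops below mp->m_writeio_blocks */

The XFS_FILEOFF_MIN() against MAXEXTLEN only bites for very large files, capping the pre-shift preallocation at a single extent's worth of blocks.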
371 | |||
372 | int | ||
439 | xfs_iomap_write_delay( | 373 | xfs_iomap_write_delay( |
440 | xfs_inode_t *ip, | 374 | xfs_inode_t *ip, |
441 | xfs_off_t offset, | 375 | xfs_off_t offset, |
442 | size_t count, | 376 | size_t count, |
443 | int ioflag, | 377 | xfs_bmbt_irec_t *ret_imap) |
444 | xfs_bmbt_irec_t *ret_imap, | ||
445 | int *nmaps) | ||
446 | { | 378 | { |
447 | xfs_mount_t *mp = ip->i_mount; | 379 | xfs_mount_t *mp = ip->i_mount; |
448 | xfs_fileoff_t offset_fsb; | 380 | xfs_fileoff_t offset_fsb; |
@@ -469,16 +401,19 @@ xfs_iomap_write_delay( | |||
469 | extsz = xfs_get_extsz_hint(ip); | 401 | extsz = xfs_get_extsz_hint(ip); |
470 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | 402 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
471 | 403 | ||
404 | |||
472 | error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, | 405 | error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, |
473 | ioflag, imap, XFS_WRITE_IMAPS, &prealloc); | 406 | imap, XFS_WRITE_IMAPS, &prealloc); |
474 | if (error) | 407 | if (error) |
475 | return error; | 408 | return error; |
476 | 409 | ||
477 | retry: | 410 | retry: |
478 | if (prealloc) { | 411 | if (prealloc) { |
412 | xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); | ||
413 | |||
479 | aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); | 414 | aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); |
480 | ioalign = XFS_B_TO_FSBT(mp, aligned_offset); | 415 | ioalign = XFS_B_TO_FSBT(mp, aligned_offset); |
481 | last_fsb = ioalign + mp->m_writeio_blocks; | 416 | last_fsb = ioalign + alloc_blocks; |
482 | } else { | 417 | } else { |
483 | last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); | 418 | last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); |
484 | } | 419 | } |
@@ -496,22 +431,31 @@ retry: | |||
496 | XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | | 431 | XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | |
497 | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, | 432 | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, |
498 | &nimaps, NULL); | 433 | &nimaps, NULL); |
499 | if (error && (error != ENOSPC)) | 434 | switch (error) { |
435 | case 0: | ||
436 | case ENOSPC: | ||
437 | case EDQUOT: | ||
438 | break; | ||
439 | default: | ||
500 | return XFS_ERROR(error); | 440 | return XFS_ERROR(error); |
441 | } | ||
501 | 442 | ||
502 | /* | 443 | /* |
503 | * If bmapi returned us nothing, and if we didn't get back EDQUOT, | 444 | * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For |
504 | * then we must have run out of space - flush all other inodes with | 445 | * ENOSPC, * flush all other inodes with delalloc blocks to free up |
505 | * delalloc blocks and retry without EOF preallocation. | 446 | * some of the excess reserved metadata space. For both cases, retry |
447 | * without EOF preallocation. | ||
506 | */ | 448 | */ |
507 | if (nimaps == 0) { | 449 | if (nimaps == 0) { |
508 | trace_xfs_delalloc_enospc(ip, offset, count); | 450 | trace_xfs_delalloc_enospc(ip, offset, count); |
509 | if (flushed) | 451 | if (flushed) |
510 | return XFS_ERROR(ENOSPC); | 452 | return XFS_ERROR(error ? error : ENOSPC); |
511 | 453 | ||
512 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 454 | if (error == ENOSPC) { |
513 | xfs_flush_inodes(ip); | 455 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
514 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 456 | xfs_flush_inodes(ip); |
457 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
458 | } | ||
515 | 459 | ||
516 | flushed = 1; | 460 | flushed = 1; |
517 | error = 0; | 461 | error = 0; |
@@ -520,11 +464,9 @@ retry: | |||
520 | } | 464 | } |
521 | 465 | ||
522 | if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) | 466 | if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) |
523 | return xfs_cmn_err_fsblock_zero(ip, &imap[0]); | 467 | return xfs_alert_fsblock_zero(ip, &imap[0]); |
524 | 468 | ||
525 | *ret_imap = imap[0]; | 469 | *ret_imap = imap[0]; |
526 | *nmaps = 1; | ||
527 | |||
528 | return 0; | 470 | return 0; |
529 | } | 471 | } |
530 | 472 | ||
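The reworked error handling above folds ENOSPC and EDQUOT into a single retry path: a failed delalloc reservation is retried exactly once without EOF preallocation, and only the ENOSPC case flushes other inodes first. A compilable control-flow sketch of that shape (the ctx struct and helpers are hypothetical stand-ins for the xfs_bmapi()/xfs_flush_inodes() machinery):

```c
#include <errno.h>

struct ctx { int free_blocks; int mapped; };

/* Hypothetical stand-ins for the reservation and flush primitives. */
static int reserve_delalloc(struct ctx *c, int want)
{
	if (c->free_blocks < want)
		return ENOSPC;
	c->free_blocks -= want;
	c->mapped = 1;
	return 0;
}
static void flush_other_delalloc(struct ctx *c) { c->free_blocks += 16; }

static int write_delay(struct ctx *c, int need, int prealloc)
{
	int flushed = 0;
	int error;

retry:
	error = reserve_delalloc(c, need + (prealloc ? need : 0));
	if (error && error != ENOSPC && error != EDQUOT)
		return error;			/* hard error: fail now */

	if (!c->mapped) {
		if (flushed)			/* second failure is final */
			return error ? error : ENOSPC;
		if (error == ENOSPC)		/* reclaim reserved metadata */
			flush_other_delalloc(c);
		flushed = 1;
		error = 0;
		prealloc = 0;			/* retry without EOF prealloc */
		goto retry;
	}
	return 0;
}
```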
@@ -538,13 +480,12 @@ retry: | |||
538 | * We no longer bother to look at the incoming map - all we have to | 480 | * We no longer bother to look at the incoming map - all we have to |
539 | * guarantee is that whatever we allocate fills the required range. | 481 | * guarantee is that whatever we allocate fills the required range. |
540 | */ | 482 | */ |
541 | STATIC int | 483 | int |
542 | xfs_iomap_write_allocate( | 484 | xfs_iomap_write_allocate( |
543 | xfs_inode_t *ip, | 485 | xfs_inode_t *ip, |
544 | xfs_off_t offset, | 486 | xfs_off_t offset, |
545 | size_t count, | 487 | size_t count, |
546 | xfs_bmbt_irec_t *imap, | 488 | xfs_bmbt_irec_t *imap) |
547 | int *retmap) | ||
548 | { | 489 | { |
549 | xfs_mount_t *mp = ip->i_mount; | 490 | xfs_mount_t *mp = ip->i_mount; |
550 | xfs_fileoff_t offset_fsb, last_block; | 491 | xfs_fileoff_t offset_fsb, last_block; |
@@ -557,8 +498,6 @@ xfs_iomap_write_allocate( | |||
557 | int error = 0; | 498 | int error = 0; |
558 | int nres; | 499 | int nres; |
559 | 500 | ||
560 | *retmap = 0; | ||
561 | |||
562 | /* | 501 | /* |
563 | * Make sure that the dquots are there. | 502 | * Make sure that the dquots are there. |
564 | */ | 503 | */ |
@@ -675,12 +614,11 @@ xfs_iomap_write_allocate( | |||
675 | * covers at least part of the callers request | 614 | * covers at least part of the callers request |
676 | */ | 615 | */ |
677 | if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) | 616 | if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) |
678 | return xfs_cmn_err_fsblock_zero(ip, imap); | 617 | return xfs_alert_fsblock_zero(ip, imap); |
679 | 618 | ||
680 | if ((offset_fsb >= imap->br_startoff) && | 619 | if ((offset_fsb >= imap->br_startoff) && |
681 | (offset_fsb < (imap->br_startoff + | 620 | (offset_fsb < (imap->br_startoff + |
682 | imap->br_blockcount))) { | 621 | imap->br_blockcount))) { |
683 | *retmap = 1; | ||
684 | XFS_STATS_INC(xs_xstrat_quick); | 622 | XFS_STATS_INC(xs_xstrat_quick); |
685 | return 0; | 623 | return 0; |
686 | } | 624 | } |
@@ -786,7 +724,7 @@ xfs_iomap_write_unwritten( | |||
786 | return XFS_ERROR(error); | 724 | return XFS_ERROR(error); |
787 | 725 | ||
788 | if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) | 726 | if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) |
789 | return xfs_cmn_err_fsblock_zero(ip, &imap); | 727 | return xfs_alert_fsblock_zero(ip, &imap); |
790 | 728 | ||
791 | if ((numblks_fsb = imap.br_blockcount) == 0) { | 729 | if ((numblks_fsb = imap.br_blockcount) == 0) { |
792 | /* | 730 | /* |
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 7748a430f50d..80615760959a 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h | |||
@@ -18,30 +18,15 @@ | |||
18 | #ifndef __XFS_IOMAP_H__ | 18 | #ifndef __XFS_IOMAP_H__ |
19 | #define __XFS_IOMAP_H__ | 19 | #define __XFS_IOMAP_H__ |
20 | 20 | ||
21 | /* base extent manipulation calls */ | ||
22 | #define BMAPI_READ (1 << 0) /* read extents */ | ||
23 | #define BMAPI_WRITE (1 << 1) /* create extents */ | ||
24 | #define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */ | ||
25 | |||
26 | /* modifiers */ | ||
27 | #define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */ | ||
28 | #define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */ | ||
29 | #define BMAPI_MMA (1 << 6) /* allocate for mmap write */ | ||
30 | #define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */ | ||
31 | |||
32 | #define BMAPI_FLAGS \ | ||
33 | { BMAPI_READ, "READ" }, \ | ||
34 | { BMAPI_WRITE, "WRITE" }, \ | ||
35 | { BMAPI_ALLOCATE, "ALLOCATE" }, \ | ||
36 | { BMAPI_IGNSTATE, "IGNSTATE" }, \ | ||
37 | { BMAPI_DIRECT, "DIRECT" }, \ | ||
38 | { BMAPI_TRYLOCK, "TRYLOCK" } | ||
39 | |||
40 | struct xfs_inode; | 21 | struct xfs_inode; |
41 | struct xfs_bmbt_irec; | 22 | struct xfs_bmbt_irec; |
42 | 23 | ||
43 | extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, | 24 | extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, |
44 | struct xfs_bmbt_irec *, int *, int *); | 25 | struct xfs_bmbt_irec *, int); |
26 | extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, | ||
27 | struct xfs_bmbt_irec *); | ||
28 | extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, | ||
29 | struct xfs_bmbt_irec *); | ||
45 | extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); | 30 | extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); |
46 | 31 | ||
47 | #endif /* __XFS_IOMAP_H__*/ | 32 | #endif /* __XFS_IOMAP_H__*/ |
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7e3626e5925c..751e94fe1f77 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c | |||
@@ -92,7 +92,8 @@ xfs_bulkstat_one_int( | |||
92 | * further change. | 92 | * further change. |
93 | */ | 93 | */ |
94 | buf->bs_nlink = dic->di_nlink; | 94 | buf->bs_nlink = dic->di_nlink; |
95 | buf->bs_projid = dic->di_projid; | 95 | buf->bs_projid_lo = dic->di_projid_lo; |
96 | buf->bs_projid_hi = dic->di_projid_hi; | ||
96 | buf->bs_ino = ino; | 97 | buf->bs_ino = ino; |
97 | buf->bs_mode = dic->di_mode; | 98 | buf->bs_mode = dic->di_mode; |
98 | buf->bs_uid = dic->di_uid; | 99 | buf->bs_uid = dic->di_uid; |
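The bulkstat change above splits the project ID into bs_projid_lo and bs_projid_hi to make room for 32-bit project IDs. A consumer that wants the full value recombines the halves; a minimal sketch (the trimmed struct is an illustration, not the real xfs_bstat layout):

```c
#include <stdint.h>

/* Trimmed illustration of the split bulkstat fields. */
struct bstat_lite {
	uint16_t bs_projid_lo;	/* lower 16 bits of the project id */
	uint16_t bs_projid_hi;	/* upper 16 bits of the project id */
};

static inline uint32_t bstat_projid(const struct bstat_lite *bs)
{
	return ((uint32_t)bs->bs_projid_hi << 16) | bs->bs_projid_lo;
}
```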
@@ -203,7 +204,6 @@ xfs_bulkstat( | |||
203 | xfs_agi_t *agi; /* agi header data */ | 204 | xfs_agi_t *agi; /* agi header data */ |
204 | xfs_agino_t agino; /* inode # in allocation group */ | 205 | xfs_agino_t agino; /* inode # in allocation group */ |
205 | xfs_agnumber_t agno; /* allocation group number */ | 206 | xfs_agnumber_t agno; /* allocation group number */ |
206 | xfs_daddr_t bno; /* inode cluster start daddr */ | ||
207 | int chunkidx; /* current index into inode chunk */ | 207 | int chunkidx; /* current index into inode chunk */ |
208 | int clustidx; /* current index into inode cluster */ | 208 | int clustidx; /* current index into inode cluster */ |
209 | xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ | 209 | xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ |
@@ -462,7 +462,6 @@ xfs_bulkstat( | |||
462 | mp->m_sb.sb_inopblog); | 462 | mp->m_sb.sb_inopblog); |
463 | } | 463 | } |
464 | ino = XFS_AGINO_TO_INO(mp, agno, agino); | 464 | ino = XFS_AGINO_TO_INO(mp, agno, agino); |
465 | bno = XFS_AGB_TO_DADDR(mp, agno, agbno); | ||
466 | /* | 465 | /* |
467 | * Skip if this inode is free. | 466 | * Skip if this inode is free. |
468 | */ | 467 | */ |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 33f718f92a48..41d5b8f2bf92 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, | |||
47 | xfs_buftarg_t *log_target, | 47 | xfs_buftarg_t *log_target, |
48 | xfs_daddr_t blk_offset, | 48 | xfs_daddr_t blk_offset, |
49 | int num_bblks); | 49 | int num_bblks); |
50 | STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); | 50 | STATIC int xlog_space_left(struct log *log, atomic64_t *head); |
51 | STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); | 51 | STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); |
52 | STATIC void xlog_dealloc_log(xlog_t *log); | 52 | STATIC void xlog_dealloc_log(xlog_t *log); |
53 | 53 | ||
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); | |||
70 | /* local functions to manipulate grant head */ | 70 | /* local functions to manipulate grant head */ |
71 | STATIC int xlog_grant_log_space(xlog_t *log, | 71 | STATIC int xlog_grant_log_space(xlog_t *log, |
72 | xlog_ticket_t *xtic); | 72 | xlog_ticket_t *xtic); |
73 | STATIC void xlog_grant_push_ail(xfs_mount_t *mp, | 73 | STATIC void xlog_grant_push_ail(struct log *log, |
74 | int need_bytes); | 74 | int need_bytes); |
75 | STATIC void xlog_regrant_reserve_log_space(xlog_t *log, | 75 | STATIC void xlog_regrant_reserve_log_space(xlog_t *log, |
76 | xlog_ticket_t *ticket); | 76 | xlog_ticket_t *ticket); |
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, | |||
81 | 81 | ||
82 | #if defined(DEBUG) | 82 | #if defined(DEBUG) |
83 | STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); | 83 | STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); |
84 | STATIC void xlog_verify_grant_head(xlog_t *log, int equals); | 84 | STATIC void xlog_verify_grant_tail(struct log *log); |
85 | STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, | 85 | STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, |
86 | int count, boolean_t syncing); | 86 | int count, boolean_t syncing); |
87 | STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, | 87 | STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, |
88 | xfs_lsn_t tail_lsn); | 88 | xfs_lsn_t tail_lsn); |
89 | #else | 89 | #else |
90 | #define xlog_verify_dest_ptr(a,b) | 90 | #define xlog_verify_dest_ptr(a,b) |
91 | #define xlog_verify_grant_head(a,b) | 91 | #define xlog_verify_grant_tail(a) |
92 | #define xlog_verify_iclog(a,b,c,d) | 92 | #define xlog_verify_iclog(a,b,c,d) |
93 | #define xlog_verify_tail_lsn(a,b,c) | 93 | #define xlog_verify_tail_lsn(a,b,c) |
94 | #endif | 94 | #endif |
95 | 95 | ||
96 | STATIC int xlog_iclogs_empty(xlog_t *log); | 96 | STATIC int xlog_iclogs_empty(xlog_t *log); |
97 | 97 | ||
98 | |||
99 | static void | 98 | static void |
100 | xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) | 99 | xlog_grant_sub_space( |
100 | struct log *log, | ||
101 | atomic64_t *head, | ||
102 | int bytes) | ||
101 | { | 103 | { |
102 | if (*qp) { | 104 | int64_t head_val = atomic64_read(head); |
103 | tic->t_next = (*qp); | 105 | int64_t new, old; |
104 | tic->t_prev = (*qp)->t_prev; | ||
105 | (*qp)->t_prev->t_next = tic; | ||
106 | (*qp)->t_prev = tic; | ||
107 | } else { | ||
108 | tic->t_prev = tic->t_next = tic; | ||
109 | *qp = tic; | ||
110 | } | ||
111 | 106 | ||
112 | tic->t_flags |= XLOG_TIC_IN_Q; | 107 | do { |
113 | } | 108 | int cycle, space; |
114 | 109 | ||
115 | static void | 110 | xlog_crack_grant_head_val(head_val, &cycle, &space); |
116 | xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) | 111 | |
117 | { | 112 | space -= bytes; |
118 | if (tic == tic->t_next) { | 113 | if (space < 0) { |
119 | *qp = NULL; | 114 | space += log->l_logsize; |
120 | } else { | 115 | cycle--; |
121 | *qp = tic->t_next; | 116 | } |
122 | tic->t_next->t_prev = tic->t_prev; | ||
123 | tic->t_prev->t_next = tic->t_next; | ||
124 | } | ||
125 | 117 | ||
126 | tic->t_next = tic->t_prev = NULL; | 118 | old = head_val; |
127 | tic->t_flags &= ~XLOG_TIC_IN_Q; | 119 | new = xlog_assign_grant_head_val(cycle, space); |
120 | head_val = atomic64_cmpxchg(head, old, new); | ||
121 | } while (head_val != old); | ||
128 | } | 122 | } |
129 | 123 | ||
130 | static void | 124 | static void |
131 | xlog_grant_sub_space(struct log *log, int bytes) | 125 | xlog_grant_add_space( |
126 | struct log *log, | ||
127 | atomic64_t *head, | ||
128 | int bytes) | ||
132 | { | 129 | { |
133 | log->l_grant_write_bytes -= bytes; | 130 | int64_t head_val = atomic64_read(head); |
134 | if (log->l_grant_write_bytes < 0) { | 131 | int64_t new, old; |
135 | log->l_grant_write_bytes += log->l_logsize; | ||
136 | log->l_grant_write_cycle--; | ||
137 | } | ||
138 | |||
139 | log->l_grant_reserve_bytes -= bytes; | ||
140 | if ((log)->l_grant_reserve_bytes < 0) { | ||
141 | log->l_grant_reserve_bytes += log->l_logsize; | ||
142 | log->l_grant_reserve_cycle--; | ||
143 | } | ||
144 | 132 | ||
145 | } | 133 | do { |
134 | int tmp; | ||
135 | int cycle, space; | ||
146 | 136 | ||
147 | static void | 137 | xlog_crack_grant_head_val(head_val, &cycle, &space); |
148 | xlog_grant_add_space_write(struct log *log, int bytes) | ||
149 | { | ||
150 | int tmp = log->l_logsize - log->l_grant_write_bytes; | ||
151 | if (tmp > bytes) | ||
152 | log->l_grant_write_bytes += bytes; | ||
153 | else { | ||
154 | log->l_grant_write_cycle++; | ||
155 | log->l_grant_write_bytes = bytes - tmp; | ||
156 | } | ||
157 | } | ||
158 | 138 | ||
159 | static void | 139 | tmp = log->l_logsize - space; |
160 | xlog_grant_add_space_reserve(struct log *log, int bytes) | 140 | if (tmp > bytes) |
161 | { | 141 | space += bytes; |
162 | int tmp = log->l_logsize - log->l_grant_reserve_bytes; | 142 | else { |
163 | if (tmp > bytes) | 143 | space = bytes - tmp; |
164 | log->l_grant_reserve_bytes += bytes; | 144 | cycle++; |
165 | else { | 145 | } |
166 | log->l_grant_reserve_cycle++; | ||
167 | log->l_grant_reserve_bytes = bytes - tmp; | ||
168 | } | ||
169 | } | ||
170 | 146 | ||
171 | static inline void | 147 | old = head_val; |
172 | xlog_grant_add_space(struct log *log, int bytes) | 148 | new = xlog_assign_grant_head_val(cycle, space); |
173 | { | 149 | head_val = atomic64_cmpxchg(head, old, new); |
174 | xlog_grant_add_space_write(log, bytes); | 150 | } while (head_val != old); |
175 | xlog_grant_add_space_reserve(log, bytes); | ||
176 | } | 151 | } |
177 | 152 | ||
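The rewritten xlog_grant_sub_space()/xlog_grant_add_space() replace the grant lock with a compare-and-swap retry loop over a single 64-bit word that packs the (cycle, space) pair. A userspace C11 sketch of the same pattern (the packing layout and LOGSIZE are illustrative, not the kernel's xlog_assign_grant_head_val() encoding):

```c
#include <stdatomic.h>
#include <stdint.h>

#define LOGSIZE	(64 * 1024 * 1024)	/* illustrative log size in bytes */

static inline int64_t pack(int32_t cycle, int32_t space)
{
	return ((int64_t)cycle << 32) | (uint32_t)space;
}

static inline void crack(int64_t v, int32_t *cycle, int32_t *space)
{
	*cycle = (int32_t)(v >> 32);
	*space = (int32_t)(uint32_t)v;
}

/*
 * Lockless head update: recompute (cycle, space) from a snapshot and
 * publish it with CAS; if another CPU raced us, the CAS fails,
 * refreshes the snapshot, and the loop retries.
 */
static void grant_sub_space(_Atomic int64_t *head, int bytes)
{
	int64_t old = atomic_load(head);
	int32_t cycle, space;

	do {
		crack(old, &cycle, &space);
		space -= bytes;
		if (space < 0) {	/* wrapped below the start of the log */
			space += LOGSIZE;
			cycle--;
		}
		/* on failure, 'old' is reloaded with the current value */
	} while (!atomic_compare_exchange_weak(head, &old, pack(cycle, space)));
}
```

Because both halves live in one word, readers always see a consistent (cycle, space) pair without ever taking l_grant_lock.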
178 | static void | 153 | static void |
@@ -355,7 +330,7 @@ xfs_log_reserve( | |||
355 | 330 | ||
356 | trace_xfs_log_reserve(log, internal_ticket); | 331 | trace_xfs_log_reserve(log, internal_ticket); |
357 | 332 | ||
358 | xlog_grant_push_ail(mp, internal_ticket->t_unit_res); | 333 | xlog_grant_push_ail(log, internal_ticket->t_unit_res); |
359 | retval = xlog_regrant_write_log_space(log, internal_ticket); | 334 | retval = xlog_regrant_write_log_space(log, internal_ticket); |
360 | } else { | 335 | } else { |
361 | /* may sleep if need to allocate more tickets */ | 336 | /* may sleep if need to allocate more tickets */ |
@@ -369,7 +344,7 @@ xfs_log_reserve( | |||
369 | 344 | ||
370 | trace_xfs_log_reserve(log, internal_ticket); | 345 | trace_xfs_log_reserve(log, internal_ticket); |
371 | 346 | ||
372 | xlog_grant_push_ail(mp, | 347 | xlog_grant_push_ail(log, |
373 | (internal_ticket->t_unit_res * | 348 | (internal_ticket->t_unit_res * |
374 | internal_ticket->t_cnt)); | 349 | internal_ticket->t_cnt)); |
375 | retval = xlog_grant_log_space(log, internal_ticket); | 350 | retval = xlog_grant_log_space(log, internal_ticket); |
@@ -399,11 +374,10 @@ xfs_log_mount( | |||
399 | int error; | 374 | int error; |
400 | 375 | ||
401 | if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) | 376 | if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) |
402 | cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); | 377 | xfs_notice(mp, "Mounting Filesystem"); |
403 | else { | 378 | else { |
404 | cmn_err(CE_NOTE, | 379 | xfs_notice(mp, |
405 | "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", | 380 | "Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); |
406 | mp->m_fsname); | ||
407 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); | 381 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); |
408 | } | 382 | } |
409 | 383 | ||
@@ -418,7 +392,7 @@ xfs_log_mount( | |||
418 | */ | 392 | */ |
419 | error = xfs_trans_ail_init(mp); | 393 | error = xfs_trans_ail_init(mp); |
420 | if (error) { | 394 | if (error) { |
421 | cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); | 395 | xfs_warn(mp, "AIL initialisation failed: error %d", error); |
422 | goto out_free_log; | 396 | goto out_free_log; |
423 | } | 397 | } |
424 | mp->m_log->l_ailp = mp->m_ail; | 398 | mp->m_log->l_ailp = mp->m_ail; |
@@ -438,7 +412,8 @@ xfs_log_mount( | |||
438 | if (readonly) | 412 | if (readonly) |
439 | mp->m_flags |= XFS_MOUNT_RDONLY; | 413 | mp->m_flags |= XFS_MOUNT_RDONLY; |
440 | if (error) { | 414 | if (error) { |
441 | cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); | 415 | xfs_warn(mp, "log mount/recovery failed: error %d", |
416 | error); | ||
442 | goto out_destroy_ail; | 417 | goto out_destroy_ail; |
443 | } | 418 | } |
444 | } | 419 | } |
@@ -567,10 +542,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
567 | */ | 542 | */ |
568 | } | 543 | } |
569 | 544 | ||
570 | if (error) { | 545 | if (error) |
571 | xfs_fs_cmn_err(CE_ALERT, mp, | 546 | xfs_alert(mp, "%s: unmount record failed", __func__); |
572 | "xfs_log_unmount: unmount record failed"); | ||
573 | } | ||
574 | 547 | ||
575 | 548 | ||
576 | spin_lock(&log->l_icloglock); | 549 | spin_lock(&log->l_icloglock); |
@@ -584,8 +557,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
584 | if (!(iclog->ic_state == XLOG_STATE_ACTIVE || | 557 | if (!(iclog->ic_state == XLOG_STATE_ACTIVE || |
585 | iclog->ic_state == XLOG_STATE_DIRTY)) { | 558 | iclog->ic_state == XLOG_STATE_DIRTY)) { |
586 | if (!XLOG_FORCED_SHUTDOWN(log)) { | 559 | if (!XLOG_FORCED_SHUTDOWN(log)) { |
587 | sv_wait(&iclog->ic_force_wait, PMEM, | 560 | xlog_wait(&iclog->ic_force_wait, |
588 | &log->l_icloglock, s); | 561 | &log->l_icloglock); |
589 | } else { | 562 | } else { |
590 | spin_unlock(&log->l_icloglock); | 563 | spin_unlock(&log->l_icloglock); |
591 | } | 564 | } |
@@ -625,8 +598,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
625 | || iclog->ic_state == XLOG_STATE_DIRTY | 598 | || iclog->ic_state == XLOG_STATE_DIRTY |
626 | || iclog->ic_state == XLOG_STATE_IOERROR) ) { | 599 | || iclog->ic_state == XLOG_STATE_IOERROR) ) { |
627 | 600 | ||
628 | sv_wait(&iclog->ic_force_wait, PMEM, | 601 | xlog_wait(&iclog->ic_force_wait, |
629 | &log->l_icloglock, s); | 602 | &log->l_icloglock); |
630 | } else { | 603 | } else { |
631 | spin_unlock(&log->l_icloglock); | 604 | spin_unlock(&log->l_icloglock); |
632 | } | 605 | } |
@@ -703,55 +676,46 @@ xfs_log_move_tail(xfs_mount_t *mp, | |||
703 | { | 676 | { |
704 | xlog_ticket_t *tic; | 677 | xlog_ticket_t *tic; |
705 | xlog_t *log = mp->m_log; | 678 | xlog_t *log = mp->m_log; |
706 | int need_bytes, free_bytes, cycle, bytes; | 679 | int need_bytes, free_bytes; |
707 | 680 | ||
708 | if (XLOG_FORCED_SHUTDOWN(log)) | 681 | if (XLOG_FORCED_SHUTDOWN(log)) |
709 | return; | 682 | return; |
710 | 683 | ||
711 | if (tail_lsn == 0) { | 684 | if (tail_lsn == 0) |
712 | /* needed since sync_lsn is 64 bits */ | 685 | tail_lsn = atomic64_read(&log->l_last_sync_lsn); |
713 | spin_lock(&log->l_icloglock); | ||
714 | tail_lsn = log->l_last_sync_lsn; | ||
715 | spin_unlock(&log->l_icloglock); | ||
716 | } | ||
717 | |||
718 | spin_lock(&log->l_grant_lock); | ||
719 | 686 | ||
720 | /* Also an invalid lsn. 1 implies that we aren't passing in a valid | 687 | /* tail_lsn == 1 implies that we weren't passed a valid value. */ |
721 | * tail_lsn. | 688 | if (tail_lsn != 1) |
722 | */ | 689 | atomic64_set(&log->l_tail_lsn, tail_lsn); |
723 | if (tail_lsn != 1) { | ||
724 | log->l_tail_lsn = tail_lsn; | ||
725 | } | ||
726 | 690 | ||
727 | if ((tic = log->l_write_headq)) { | 691 | if (!list_empty_careful(&log->l_writeq)) { |
728 | #ifdef DEBUG | 692 | #ifdef DEBUG |
729 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) | 693 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) |
730 | panic("Recovery problem"); | 694 | panic("Recovery problem"); |
731 | #endif | 695 | #endif |
732 | cycle = log->l_grant_write_cycle; | 696 | spin_lock(&log->l_grant_write_lock); |
733 | bytes = log->l_grant_write_bytes; | 697 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); |
734 | free_bytes = xlog_space_left(log, cycle, bytes); | 698 | list_for_each_entry(tic, &log->l_writeq, t_queue) { |
735 | do { | ||
736 | ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); | 699 | ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); |
737 | 700 | ||
738 | if (free_bytes < tic->t_unit_res && tail_lsn != 1) | 701 | if (free_bytes < tic->t_unit_res && tail_lsn != 1) |
739 | break; | 702 | break; |
740 | tail_lsn = 0; | 703 | tail_lsn = 0; |
741 | free_bytes -= tic->t_unit_res; | 704 | free_bytes -= tic->t_unit_res; |
742 | sv_signal(&tic->t_wait); | 705 | trace_xfs_log_regrant_write_wake_up(log, tic); |
743 | tic = tic->t_next; | 706 | wake_up(&tic->t_wait); |
744 | } while (tic != log->l_write_headq); | 707 | } |
708 | spin_unlock(&log->l_grant_write_lock); | ||
745 | } | 709 | } |
746 | if ((tic = log->l_reserve_headq)) { | 710 | |
711 | if (!list_empty_careful(&log->l_reserveq)) { | ||
747 | #ifdef DEBUG | 712 | #ifdef DEBUG |
748 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) | 713 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) |
749 | panic("Recovery problem"); | 714 | panic("Recovery problem"); |
750 | #endif | 715 | #endif |
751 | cycle = log->l_grant_reserve_cycle; | 716 | spin_lock(&log->l_grant_reserve_lock); |
752 | bytes = log->l_grant_reserve_bytes; | 717 | free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); |
753 | free_bytes = xlog_space_left(log, cycle, bytes); | 718 | list_for_each_entry(tic, &log->l_reserveq, t_queue) { |
754 | do { | ||
755 | if (tic->t_flags & XLOG_TIC_PERM_RESERV) | 719 | if (tic->t_flags & XLOG_TIC_PERM_RESERV) |
756 | need_bytes = tic->t_unit_res*tic->t_cnt; | 720 | need_bytes = tic->t_unit_res*tic->t_cnt; |
757 | else | 721 | else |
@@ -760,12 +724,12 @@ xfs_log_move_tail(xfs_mount_t *mp, | |||
760 | break; | 724 | break; |
761 | tail_lsn = 0; | 725 | tail_lsn = 0; |
762 | free_bytes -= need_bytes; | 726 | free_bytes -= need_bytes; |
763 | sv_signal(&tic->t_wait); | 727 | trace_xfs_log_grant_wake_up(log, tic); |
764 | tic = tic->t_next; | 728 | wake_up(&tic->t_wait); |
765 | } while (tic != log->l_reserve_headq); | 729 | } |
730 | spin_unlock(&log->l_grant_reserve_lock); | ||
766 | } | 731 | } |
767 | spin_unlock(&log->l_grant_lock); | 732 | } |
768 | } /* xfs_log_move_tail */ | ||
769 | 733 | ||
770 | /* | 734 | /* |
771 | * Determine if we have a transaction that has gone to disk | 735 | * Determine if we have a transaction that has gone to disk |
@@ -797,7 +761,7 @@ xfs_log_need_covered(xfs_mount_t *mp) | |||
797 | break; | 761 | break; |
798 | case XLOG_STATE_COVER_NEED: | 762 | case XLOG_STATE_COVER_NEED: |
799 | case XLOG_STATE_COVER_NEED2: | 763 | case XLOG_STATE_COVER_NEED2: |
800 | if (!xfs_trans_ail_tail(log->l_ailp) && | 764 | if (!xfs_ail_min_lsn(log->l_ailp) && |
801 | xlog_iclogs_empty(log)) { | 765 | xlog_iclogs_empty(log)) { |
802 | if (log->l_covered_state == XLOG_STATE_COVER_NEED) | 766 | if (log->l_covered_state == XLOG_STATE_COVER_NEED) |
803 | log->l_covered_state = XLOG_STATE_COVER_DONE; | 767 | log->l_covered_state = XLOG_STATE_COVER_DONE; |
@@ -831,23 +795,19 @@ xfs_log_need_covered(xfs_mount_t *mp) | |||
831 | * We may be holding the log iclog lock upon entering this routine. | 795 | * We may be holding the log iclog lock upon entering this routine. |
832 | */ | 796 | */ |
833 | xfs_lsn_t | 797 | xfs_lsn_t |
834 | xlog_assign_tail_lsn(xfs_mount_t *mp) | 798 | xlog_assign_tail_lsn( |
799 | struct xfs_mount *mp) | ||
835 | { | 800 | { |
836 | xfs_lsn_t tail_lsn; | 801 | xfs_lsn_t tail_lsn; |
837 | xlog_t *log = mp->m_log; | 802 | struct log *log = mp->m_log; |
838 | 803 | ||
839 | tail_lsn = xfs_trans_ail_tail(mp->m_ail); | 804 | tail_lsn = xfs_ail_min_lsn(mp->m_ail); |
840 | spin_lock(&log->l_grant_lock); | 805 | if (!tail_lsn) |
841 | if (tail_lsn != 0) { | 806 | tail_lsn = atomic64_read(&log->l_last_sync_lsn); |
842 | log->l_tail_lsn = tail_lsn; | ||
843 | } else { | ||
844 | tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn; | ||
845 | } | ||
846 | spin_unlock(&log->l_grant_lock); | ||
847 | 807 | ||
808 | atomic64_set(&log->l_tail_lsn, tail_lsn); | ||
848 | return tail_lsn; | 809 | return tail_lsn; |
849 | } /* xlog_assign_tail_lsn */ | 810 | } |
850 | |||
851 | 811 | ||
852 | /* | 812 | /* |
853 | * Return the space in the log between the tail and the head. The head | 813 | * Return the space in the log between the tail and the head. The head |
@@ -864,37 +824,42 @@ xlog_assign_tail_lsn(xfs_mount_t *mp) | |||
864 | * result is that we return the size of the log as the amount of space left. | 824 | * result is that we return the size of the log as the amount of space left. |
865 | */ | 825 | */ |
866 | STATIC int | 826 | STATIC int |
867 | xlog_space_left(xlog_t *log, int cycle, int bytes) | 827 | xlog_space_left( |
868 | { | 828 | struct log *log, |
869 | int free_bytes; | 829 | atomic64_t *head) |
870 | int tail_bytes; | 830 | { |
871 | int tail_cycle; | 831 | int free_bytes; |
872 | 832 | int tail_bytes; | |
873 | tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); | 833 | int tail_cycle; |
874 | tail_cycle = CYCLE_LSN(log->l_tail_lsn); | 834 | int head_cycle; |
875 | if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { | 835 | int head_bytes; |
876 | free_bytes = log->l_logsize - (bytes - tail_bytes); | 836 | |
877 | } else if ((tail_cycle + 1) < cycle) { | 837 | xlog_crack_grant_head(head, &head_cycle, &head_bytes); |
838 | xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); | ||
839 | tail_bytes = BBTOB(tail_bytes); | ||
840 | if (tail_cycle == head_cycle && head_bytes >= tail_bytes) | ||
841 | free_bytes = log->l_logsize - (head_bytes - tail_bytes); | ||
842 | else if (tail_cycle + 1 < head_cycle) | ||
878 | return 0; | 843 | return 0; |
879 | } else if (tail_cycle < cycle) { | 844 | else if (tail_cycle < head_cycle) { |
880 | ASSERT(tail_cycle == (cycle - 1)); | 845 | ASSERT(tail_cycle == (head_cycle - 1)); |
881 | free_bytes = tail_bytes - bytes; | 846 | free_bytes = tail_bytes - head_bytes; |
882 | } else { | 847 | } else { |
883 | /* | 848 | /* |
884 | * The reservation head is behind the tail. | 849 | * The reservation head is behind the tail. |
885 | * In this case we just want to return the size of the | 850 | * In this case we just want to return the size of the |
886 | * log as the amount of space left. | 851 | * log as the amount of space left. |
887 | */ | 852 | */ |
888 | xfs_fs_cmn_err(CE_ALERT, log->l_mp, | 853 | xfs_alert(log->l_mp, |
889 | "xlog_space_left: head behind tail\n" | 854 | "xlog_space_left: head behind tail\n" |
890 | " tail_cycle = %d, tail_bytes = %d\n" | 855 | " tail_cycle = %d, tail_bytes = %d\n" |
891 | " GH cycle = %d, GH bytes = %d", | 856 | " GH cycle = %d, GH bytes = %d", |
892 | tail_cycle, tail_bytes, cycle, bytes); | 857 | tail_cycle, tail_bytes, head_cycle, head_bytes); |
893 | ASSERT(0); | 858 | ASSERT(0); |
894 | free_bytes = log->l_logsize; | 859 | free_bytes = log->l_logsize; |
895 | } | 860 | } |
896 | return free_bytes; | 861 | return free_bytes; |
897 | } /* xlog_space_left */ | 862 | } |
898 | 863 | ||
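xlog_space_left() above measures free space on the log's circular buffer from (cycle, bytes) coordinates. A simplified sketch of the three regular cases (the head-behind-tail corruption path that logs an alert is omitted, and logsize is a parameter rather than log->l_logsize):

```c
/*
 * Free space between a log tail and head expressed as (cycle, byte)
 * pairs on a ring of 'logsize' bytes.
 */
static int space_left(int logsize, int tail_cycle, int tail_bytes,
		      int head_cycle, int head_bytes)
{
	if (tail_cycle == head_cycle)		/* same pass over the log */
		return logsize - (head_bytes - tail_bytes);
	if (tail_cycle + 1 == head_cycle)	/* head has wrapped once */
		return tail_bytes - head_bytes;
	return 0;				/* more than a cycle apart */
}
```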
899 | 864 | ||
900 | /* | 865 | /* |
@@ -917,19 +882,6 @@ xlog_iodone(xfs_buf_t *bp) | |||
917 | l = iclog->ic_log; | 882 | l = iclog->ic_log; |
918 | 883 | ||
919 | /* | 884 | /* |
920 | * If the _XFS_BARRIER_FAILED flag was set by a lower | ||
921 | * layer, it means the underlying device no longer supports | ||
922 | * barrier I/O. Warn loudly and turn off barriers. | ||
923 | */ | ||
924 | if (bp->b_flags & _XFS_BARRIER_FAILED) { | ||
925 | bp->b_flags &= ~_XFS_BARRIER_FAILED; | ||
926 | l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; | ||
927 | xfs_fs_cmn_err(CE_WARN, l->l_mp, | ||
928 | "xlog_iodone: Barriers are no longer supported" | ||
929 | " by device. Disabling barriers\n"); | ||
930 | } | ||
931 | |||
932 | /* | ||
933 | * Race to shutdown the filesystem if we see an error. | 885 | * Race to shutdown the filesystem if we see an error. |
934 | */ | 886 | */ |
935 | if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, | 887 | if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, |
@@ -1047,7 +999,7 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1047 | 999 | ||
1048 | log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); | 1000 | log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); |
1049 | if (!log) { | 1001 | if (!log) { |
1050 | xlog_warn("XFS: Log allocation failed: No memory!"); | 1002 | xfs_warn(mp, "Log allocation failed: No memory!"); |
1051 | goto out; | 1003 | goto out; |
1052 | } | 1004 | } |
1053 | 1005 | ||
@@ -1060,35 +1012,39 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1060 | log->l_flags |= XLOG_ACTIVE_RECOVERY; | 1012 | log->l_flags |= XLOG_ACTIVE_RECOVERY; |
1061 | 1013 | ||
1062 | log->l_prev_block = -1; | 1014 | log->l_prev_block = -1; |
1063 | log->l_tail_lsn = xlog_assign_lsn(1, 0); | ||
1064 | /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ | 1015 | /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ |
1065 | log->l_last_sync_lsn = log->l_tail_lsn; | 1016 | xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); |
1017 | xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); | ||
1066 | log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ | 1018 | log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ |
1067 | log->l_grant_reserve_cycle = 1; | 1019 | xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); |
1068 | log->l_grant_write_cycle = 1; | 1020 | xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); |
1021 | INIT_LIST_HEAD(&log->l_reserveq); | ||
1022 | INIT_LIST_HEAD(&log->l_writeq); | ||
1023 | spin_lock_init(&log->l_grant_reserve_lock); | ||
1024 | spin_lock_init(&log->l_grant_write_lock); | ||
1069 | 1025 | ||
1070 | error = EFSCORRUPTED; | 1026 | error = EFSCORRUPTED; |
1071 | if (xfs_sb_version_hassector(&mp->m_sb)) { | 1027 | if (xfs_sb_version_hassector(&mp->m_sb)) { |
1072 | log2_size = mp->m_sb.sb_logsectlog; | 1028 | log2_size = mp->m_sb.sb_logsectlog; |
1073 | if (log2_size < BBSHIFT) { | 1029 | if (log2_size < BBSHIFT) { |
1074 | xlog_warn("XFS: Log sector size too small " | 1030 | xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", |
1075 | "(0x%x < 0x%x)", log2_size, BBSHIFT); | 1031 | log2_size, BBSHIFT); |
1076 | goto out_free_log; | 1032 | goto out_free_log; |
1077 | } | 1033 | } |
1078 | 1034 | ||
1079 | log2_size -= BBSHIFT; | 1035 | log2_size -= BBSHIFT; |
1080 | if (log2_size > mp->m_sectbb_log) { | 1036 | if (log2_size > mp->m_sectbb_log) { |
1081 | xlog_warn("XFS: Log sector size too large " | 1037 | xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)", |
1082 | "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); | 1038 | log2_size, mp->m_sectbb_log); |
1083 | goto out_free_log; | 1039 | goto out_free_log; |
1084 | } | 1040 | } |
1085 | 1041 | ||
1086 | /* for larger sector sizes, must have v2 or external log */ | 1042 | /* for larger sector sizes, must have v2 or external log */ |
1087 | if (log2_size && log->l_logBBstart > 0 && | 1043 | if (log2_size && log->l_logBBstart > 0 && |
1088 | !xfs_sb_version_haslogv2(&mp->m_sb)) { | 1044 | !xfs_sb_version_haslogv2(&mp->m_sb)) { |
1089 | 1045 | xfs_warn(mp, | |
1090 | xlog_warn("XFS: log sector size (0x%x) invalid " | 1046 | "log sector size (0x%x) invalid for configuration.", |
1091 | "for configuration.", log2_size); | 1047 | log2_size); |
1092 | goto out_free_log; | 1048 | goto out_free_log; |
1093 | } | 1049 | } |
1094 | } | 1050 | } |
@@ -1107,8 +1063,7 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1107 | log->l_xbuf = bp; | 1063 | log->l_xbuf = bp; |
1108 | 1064 | ||
1109 | spin_lock_init(&log->l_icloglock); | 1065 | spin_lock_init(&log->l_icloglock); |
1110 | spin_lock_init(&log->l_grant_lock); | 1066 | init_waitqueue_head(&log->l_flush_wait); |
1111 | sv_init(&log->l_flush_wait, 0, "flush_wait"); | ||
1112 | 1067 | ||
1113 | /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ | 1068 | /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ |
1114 | ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); | 1069 | ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); |
@@ -1131,7 +1086,8 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1131 | iclog->ic_prev = prev_iclog; | 1086 | iclog->ic_prev = prev_iclog; |
1132 | prev_iclog = iclog; | 1087 | prev_iclog = iclog; |
1133 | 1088 | ||
1134 | bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); | 1089 | bp = xfs_buf_get_uncached(mp->m_logdev_targp, |
1090 | log->l_iclog_size, 0); | ||
1135 | if (!bp) | 1091 | if (!bp) |
1136 | goto out_free_iclog; | 1092 | goto out_free_iclog; |
1137 | if (!XFS_BUF_CPSEMA(bp)) | 1093 | if (!XFS_BUF_CPSEMA(bp)) |
@@ -1163,8 +1119,8 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1163 | 1119 | ||
1164 | ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); | 1120 | ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); |
1165 | ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); | 1121 | ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); |
1166 | sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); | 1122 | init_waitqueue_head(&iclog->ic_force_wait); |
1167 | sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); | 1123 | init_waitqueue_head(&iclog->ic_write_wait); |
1168 | 1124 | ||
1169 | iclogp = &iclog->ic_next; | 1125 | iclogp = &iclog->ic_next; |
1170 | } | 1126 | } |
@@ -1179,15 +1135,11 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1179 | out_free_iclog: | 1135 | out_free_iclog: |
1180 | for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { | 1136 | for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { |
1181 | prev_iclog = iclog->ic_next; | 1137 | prev_iclog = iclog->ic_next; |
1182 | if (iclog->ic_bp) { | 1138 | if (iclog->ic_bp) |
1183 | sv_destroy(&iclog->ic_force_wait); | ||
1184 | sv_destroy(&iclog->ic_write_wait); | ||
1185 | xfs_buf_free(iclog->ic_bp); | 1139 | xfs_buf_free(iclog->ic_bp); |
1186 | } | ||
1187 | kmem_free(iclog); | 1140 | kmem_free(iclog); |
1188 | } | 1141 | } |
1189 | spinlock_destroy(&log->l_icloglock); | 1142 | spinlock_destroy(&log->l_icloglock); |
1190 | spinlock_destroy(&log->l_grant_lock); | ||
1191 | xfs_buf_free(log->l_xbuf); | 1143 | xfs_buf_free(log->l_xbuf); |
1192 | out_free_log: | 1144 | out_free_log: |
1193 | kmem_free(log); | 1145 | kmem_free(log); |
@@ -1235,61 +1187,60 @@ xlog_commit_record( | |||
1235 | * water mark. In this manner, we would be creating a low water mark. | 1187 | * water mark. In this manner, we would be creating a low water mark. |
1236 | */ | 1188 | */ |
1237 | STATIC void | 1189 | STATIC void |
1238 | xlog_grant_push_ail(xfs_mount_t *mp, | 1190 | xlog_grant_push_ail( |
1239 | int need_bytes) | 1191 | struct log *log, |
1192 | int need_bytes) | ||
1240 | { | 1193 | { |
1241 | xlog_t *log = mp->m_log; /* pointer to the log */ | 1194 | xfs_lsn_t threshold_lsn = 0; |
1242 | xfs_lsn_t tail_lsn; /* lsn of the log tail */ | 1195 | xfs_lsn_t last_sync_lsn; |
1243 | xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ | 1196 | int free_blocks; |
1244 | int free_blocks; /* free blocks left to write to */ | 1197 | int free_bytes; |
1245 | int free_bytes; /* free bytes left to write to */ | 1198 | int threshold_block; |
1246 | int threshold_block; /* block in lsn we'd like to be at */ | 1199 | int threshold_cycle; |
1247 | int threshold_cycle; /* lsn cycle we'd like to be at */ | 1200 | int free_threshold; |
1248 | int free_threshold; | 1201 | |
1249 | 1202 | ASSERT(BTOBB(need_bytes) < log->l_logBBsize); | |
1250 | ASSERT(BTOBB(need_bytes) < log->l_logBBsize); | 1203 | |
1251 | 1204 | free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); | |
1252 | spin_lock(&log->l_grant_lock); | 1205 | free_blocks = BTOBBT(free_bytes); |
1253 | free_bytes = xlog_space_left(log, | 1206 | |
1254 | log->l_grant_reserve_cycle, | 1207 | /* |
1255 | log->l_grant_reserve_bytes); | 1208 | * Set the threshold for the minimum number of free blocks in the |
1256 | tail_lsn = log->l_tail_lsn; | 1209 | * log to the maximum of what the caller needs, one quarter of the |
1257 | free_blocks = BTOBBT(free_bytes); | 1210 | * log, and 256 blocks. |
1258 | 1211 | */ | |
1259 | /* | 1212 | free_threshold = BTOBB(need_bytes); |
1260 | * Set the threshold for the minimum number of free blocks in the | 1213 | free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); |
1261 | * log to the maximum of what the caller needs, one quarter of the | 1214 | free_threshold = MAX(free_threshold, 256); |
1262 | * log, and 256 blocks. | 1215 | if (free_blocks >= free_threshold) |
1263 | */ | 1216 | return; |
1264 | free_threshold = BTOBB(need_bytes); | 1217 | |
1265 | free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); | 1218 | xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, |
1266 | free_threshold = MAX(free_threshold, 256); | 1219 | &threshold_block); |
1267 | if (free_blocks < free_threshold) { | 1220 | threshold_block += free_threshold; |
1268 | threshold_block = BLOCK_LSN(tail_lsn) + free_threshold; | ||
1269 | threshold_cycle = CYCLE_LSN(tail_lsn); | ||
1270 | if (threshold_block >= log->l_logBBsize) { | 1221 | if (threshold_block >= log->l_logBBsize) { |
1271 | threshold_block -= log->l_logBBsize; | 1222 | threshold_block -= log->l_logBBsize; |
1272 | threshold_cycle += 1; | 1223 | threshold_cycle += 1; |
1273 | } | 1224 | } |
1274 | threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); | 1225 | threshold_lsn = xlog_assign_lsn(threshold_cycle, |
1226 | threshold_block); | ||
1227 | /* | ||
1228 | * Don't pass in an lsn greater than the lsn of the last | ||
1229 | * log record known to be on disk. Use a snapshot of the last sync lsn | ||
1230 | * so that it doesn't change between the compare and the set. | ||
1231 | */ | ||
1232 | last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); | ||
1233 | if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) | ||
1234 | threshold_lsn = last_sync_lsn; | ||
1275 | 1235 | ||
1276 | /* Don't pass in an lsn greater than the lsn of the last | 1236 | /* |
1277 | * log record known to be on disk. | 1237 | * Get the transaction layer to kick the dirty buffers out to |
1238 | * disk asynchronously. No point in trying to do this if | ||
1239 | * the filesystem is shutting down. | ||
1278 | */ | 1240 | */ |
1279 | if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) | 1241 | if (!XLOG_FORCED_SHUTDOWN(log)) |
1280 | threshold_lsn = log->l_last_sync_lsn; | 1242 | xfs_ail_push(log->l_ailp, threshold_lsn); |
1281 | } | 1243 | } |
1282 | spin_unlock(&log->l_grant_lock); | ||
1283 | |||
1284 | /* | ||
1285 | * Get the transaction layer to kick the dirty buffers out to | ||
1286 | * disk asynchronously. No point in trying to do this if | ||
1287 | * the filesystem is shutting down. | ||
1288 | */ | ||
1289 | if (threshold_lsn && | ||
1290 | !XLOG_FORCED_SHUTDOWN(log)) | ||
1291 | xfs_trans_ail_push(log->l_ailp, threshold_lsn); | ||
1292 | } /* xlog_grant_push_ail */ | ||
1293 | 1244 | ||
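The restructured xlog_grant_push_ail() computes its target in two steps: a free-block floor of max(need, log/4, 256), then a threshold LSN that is the tail advanced by that many blocks, wrapping into the next cycle when it runs past the end of the log. A sketch of that arithmetic in block counts only (the LSN packing and the AIL push itself are elided):

```c
/* Minimum free blocks to keep: max(need, a quarter of the log, 256). */
static int push_free_threshold(int log_blocks, int need_blocks)
{
	int thresh = need_blocks;

	if (thresh < log_blocks >> 2)
		thresh = log_blocks >> 2;
	if (thresh < 256)
		thresh = 256;
	return thresh;
}

/* Advance the tail position by 'thresh' blocks, wrapping the ring. */
static void push_target(int log_blocks, int thresh,
			int tail_cycle, int tail_block,
			int *thr_cycle, int *thr_block)
{
	*thr_cycle = tail_cycle;
	*thr_block = tail_block + thresh;
	if (*thr_block >= log_blocks) {
		*thr_block -= log_blocks;
		*thr_cycle += 1;
	}
}
```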
1294 | /* | 1245 | /* |
1295 | * The bdstrat callback function for log bufs. This gives us a central | 1246 | * The bdstrat callback function for log bufs. This gives us a central |
@@ -1309,7 +1260,7 @@ xlog_bdstrat( | |||
1309 | if (iclog->ic_state & XLOG_STATE_IOERROR) { | 1260 | if (iclog->ic_state & XLOG_STATE_IOERROR) { |
1310 | XFS_BUF_ERROR(bp, EIO); | 1261 | XFS_BUF_ERROR(bp, EIO); |
1311 | XFS_BUF_STALE(bp); | 1262 | XFS_BUF_STALE(bp); |
1312 | xfs_biodone(bp); | 1263 | xfs_buf_ioend(bp, 0); |
1313 | /* | 1264 | /* |
1314 | * It would seem logical to return EIO here, but we rely on | 1265 | * It would seem logical to return EIO here, but we rely on |
1315 | * the log state machine to propagate I/O errors instead of | 1266 | * the log state machine to propagate I/O errors instead of |
@@ -1384,9 +1335,8 @@ xlog_sync(xlog_t *log, | |||
1384 | roundoff < BBTOB(1))); | 1335 | roundoff < BBTOB(1))); |
1385 | 1336 | ||
1386 | /* move grant heads by roundoff in sync */ | 1337 | /* move grant heads by roundoff in sync */ |
1387 | spin_lock(&log->l_grant_lock); | 1338 | xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); |
1388 | xlog_grant_add_space(log, roundoff); | 1339 | xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); |
1389 | spin_unlock(&log->l_grant_lock); | ||
1390 | 1340 | ||
1391 | /* put cycle number in every block */ | 1341 | /* put cycle number in every block */ |
1392 | xlog_pack_data(log, iclog, roundoff); | 1342 | xlog_pack_data(log, iclog, roundoff); |
@@ -1422,8 +1372,17 @@ xlog_sync(xlog_t *log, | |||
1422 | XFS_BUF_ASYNC(bp); | 1372 | XFS_BUF_ASYNC(bp); |
1423 | bp->b_flags |= XBF_LOG_BUFFER; | 1373 | bp->b_flags |= XBF_LOG_BUFFER; |
1424 | 1374 | ||
1425 | if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) | 1375 | if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { |
1376 | /* | ||
1377 | * If we have an external log device, flush the data device | ||
1378 | * before flushing the log to make sure all meta data | ||
1379 | * written back from the AIL actually made it to disk | ||
1380 | * before writing out the new log tail LSN in the log buffer. | ||
1381 | */ | ||
1382 | if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) | ||
1383 | xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); | ||
1426 | XFS_BUF_ORDERED(bp); | 1384 | XFS_BUF_ORDERED(bp); |
1385 | } | ||
1427 | 1386 | ||
1428 | ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); | 1387 | ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); |
1429 | ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); | 1388 | ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); |
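The new block in xlog_sync() above fixes a write-ordering hazard on external logs: metadata written back by the AIL goes to the data device, so that device's cache must be flushed before the log device records a newer tail LSN. A minimal sketch of the ordering rule (the device type and both helpers are hypothetical stand-ins for xfs_blkdev_issue_flush() and the ordered log write):

```c
struct blkdev { int id; };	/* opaque stand-in for a device handle */

static void issue_flush(struct blkdev *dev) { (void)dev; }
static void write_log_ordered(struct blkdev *dev) { (void)dev; }

static void sync_log_buf(struct blkdev *logdev, struct blkdev *datadev)
{
	if (logdev != datadev)		/* external log: order the devices */
		issue_flush(datadev);	/* AIL writeback made durable first */
	write_log_ordered(logdev);	/* then the log buffer itself */
}
```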
@@ -1499,19 +1458,22 @@ xlog_dealloc_log(xlog_t *log) | |||
1499 | 1458 | ||
1500 | xlog_cil_destroy(log); | 1459 | xlog_cil_destroy(log); |
1501 | 1460 | ||
1461 | /* | ||
1462 | * always need to ensure that the extra buffer does not point to memory | ||
1463 | * owned by another log buffer before we free it. | ||
1464 | */ | ||
1465 | xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); | ||
1466 | xfs_buf_free(log->l_xbuf); | ||
1467 | |||
1502 | iclog = log->l_iclog; | 1468 | iclog = log->l_iclog; |
1503 | for (i=0; i<log->l_iclog_bufs; i++) { | 1469 | for (i=0; i<log->l_iclog_bufs; i++) { |
1504 | sv_destroy(&iclog->ic_force_wait); | ||
1505 | sv_destroy(&iclog->ic_write_wait); | ||
1506 | xfs_buf_free(iclog->ic_bp); | 1470 | xfs_buf_free(iclog->ic_bp); |
1507 | next_iclog = iclog->ic_next; | 1471 | next_iclog = iclog->ic_next; |
1508 | kmem_free(iclog); | 1472 | kmem_free(iclog); |
1509 | iclog = next_iclog; | 1473 | iclog = next_iclog; |
1510 | } | 1474 | } |
1511 | spinlock_destroy(&log->l_icloglock); | 1475 | spinlock_destroy(&log->l_icloglock); |
1512 | spinlock_destroy(&log->l_grant_lock); | ||
1513 | 1476 | ||
1514 | xfs_buf_free(log->l_xbuf); | ||
1515 | log->l_mp->m_log = NULL; | 1477 | log->l_mp->m_log = NULL; |
1516 | kmem_free(log); | 1478 | kmem_free(log); |
1517 | } /* xlog_dealloc_log */ | 1479 | } /* xlog_dealloc_log */ |
@@ -1614,38 +1576,36 @@ xlog_print_tic_res( | |||
1614 | "SWAPEXT" | 1576 | "SWAPEXT" |
1615 | }; | 1577 | }; |
1616 | 1578 | ||
1617 | xfs_fs_cmn_err(CE_WARN, mp, | 1579 | xfs_warn(mp, |
1618 | "xfs_log_write: reservation summary:\n" | 1580 | "xfs_log_write: reservation summary:\n" |
1619 | " trans type = %s (%u)\n" | 1581 | " trans type = %s (%u)\n" |
1620 | " unit res = %d bytes\n" | 1582 | " unit res = %d bytes\n" |
1621 | " current res = %d bytes\n" | 1583 | " current res = %d bytes\n" |
1622 | " total reg = %u bytes (o/flow = %u bytes)\n" | 1584 | " total reg = %u bytes (o/flow = %u bytes)\n" |
1623 | " ophdrs = %u (ophdr space = %u bytes)\n" | 1585 | " ophdrs = %u (ophdr space = %u bytes)\n" |
1624 | " ophdr + reg = %u bytes\n" | 1586 | " ophdr + reg = %u bytes\n" |
1625 | " num regions = %u\n", | 1587 | " num regions = %u\n", |
1626 | ((ticket->t_trans_type <= 0 || | 1588 | ((ticket->t_trans_type <= 0 || |
1627 | ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? | 1589 | ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? |
1628 | "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), | 1590 | "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), |
1629 | ticket->t_trans_type, | 1591 | ticket->t_trans_type, |
1630 | ticket->t_unit_res, | 1592 | ticket->t_unit_res, |
1631 | ticket->t_curr_res, | 1593 | ticket->t_curr_res, |
1632 | ticket->t_res_arr_sum, ticket->t_res_o_flow, | 1594 | ticket->t_res_arr_sum, ticket->t_res_o_flow, |
1633 | ticket->t_res_num_ophdrs, ophdr_spc, | 1595 | ticket->t_res_num_ophdrs, ophdr_spc, |
1634 | ticket->t_res_arr_sum + | 1596 | ticket->t_res_arr_sum + |
1635 | ticket->t_res_o_flow + ophdr_spc, | 1597 | ticket->t_res_o_flow + ophdr_spc, |
1636 | ticket->t_res_num); | 1598 | ticket->t_res_num); |
1637 | 1599 | ||
1638 | for (i = 0; i < ticket->t_res_num; i++) { | 1600 | for (i = 0; i < ticket->t_res_num; i++) { |
1639 | uint r_type = ticket->t_res_arr[i].r_type; | 1601 | uint r_type = ticket->t_res_arr[i].r_type; |
1640 | cmn_err(CE_WARN, | 1602 | xfs_warn(mp, "region[%u]: %s - %u bytes\n", i, |
1641 | "region[%u]: %s - %u bytes\n", | ||
1642 | i, | ||
1643 | ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? | 1603 | ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? |
1644 | "bad-rtype" : res_type_str[r_type-1]), | 1604 | "bad-rtype" : res_type_str[r_type-1]), |
1645 | ticket->t_res_arr[i].r_len); | 1605 | ticket->t_res_arr[i].r_len); |
1646 | } | 1606 | } |
1647 | 1607 | ||
1648 | xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, | 1608 | xfs_alert_tag(mp, XFS_PTAG_LOGRES, |
1649 | "xfs_log_write: reservation ran out. Need to up reservation"); | 1609 | "xfs_log_write: reservation ran out. Need to up reservation"); |
1650 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | 1610 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); |
1651 | } | 1611 | } |
@@ -1733,7 +1693,7 @@ xlog_write_setup_ophdr( | |||
1733 | case XFS_LOG: | 1693 | case XFS_LOG: |
1734 | break; | 1694 | break; |
1735 | default: | 1695 | default: |
1736 | xfs_fs_cmn_err(CE_WARN, log->l_mp, | 1696 | xfs_warn(log->l_mp, |
1737 | "Bad XFS transaction clientid 0x%x in ticket 0x%p", | 1697 | "Bad XFS transaction clientid 0x%x in ticket 0x%p", |
1738 | ophdr->oh_clientid, ticket); | 1698 | ophdr->oh_clientid, ticket); |
1739 | return NULL; | 1699 | return NULL; |
@@ -2244,7 +2204,7 @@ xlog_state_do_callback( | |||
2244 | lowest_lsn = xlog_get_lowest_lsn(log); | 2204 | lowest_lsn = xlog_get_lowest_lsn(log); |
2245 | if (lowest_lsn && | 2205 | if (lowest_lsn && |
2246 | XFS_LSN_CMP(lowest_lsn, | 2206 | XFS_LSN_CMP(lowest_lsn, |
2247 | be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { | 2207 | be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { |
2248 | iclog = iclog->ic_next; | 2208 | iclog = iclog->ic_next; |
2249 | continue; /* Leave this iclog for | 2209 | continue; /* Leave this iclog for |
2250 | * another thread */ | 2210 | * another thread */ |
@@ -2252,23 +2212,21 @@ xlog_state_do_callback( | |||
2252 | 2212 | ||
2253 | iclog->ic_state = XLOG_STATE_CALLBACK; | 2213 | iclog->ic_state = XLOG_STATE_CALLBACK; |
2254 | 2214 | ||
2255 | spin_unlock(&log->l_icloglock); | ||
2256 | 2215 | ||
2257 | /* l_last_sync_lsn field protected by | 2216 | /* |
2258 | * l_grant_lock. Don't worry about iclog's lsn. | 2217 | * update the last_sync_lsn before we drop the |
2259 | * No one else can be here except us. | 2218 | * icloglock to ensure we are the only one that |
2219 | * can update it. | ||
2260 | */ | 2220 | */ |
2261 | spin_lock(&log->l_grant_lock); | 2221 | ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), |
2262 | ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, | 2222 | be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); |
2263 | be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); | 2223 | atomic64_set(&log->l_last_sync_lsn, |
2264 | log->l_last_sync_lsn = | 2224 | be64_to_cpu(iclog->ic_header.h_lsn)); |
2265 | be64_to_cpu(iclog->ic_header.h_lsn); | ||
2266 | spin_unlock(&log->l_grant_lock); | ||
2267 | 2225 | ||
2268 | } else { | 2226 | } else |
2269 | spin_unlock(&log->l_icloglock); | ||
2270 | ioerrors++; | 2227 | ioerrors++; |
2271 | } | 2228 | |
2229 | spin_unlock(&log->l_icloglock); | ||
2272 | 2230 | ||
2273 | /* | 2231 | /* |
2274 | * Keep processing entries in the callback list until | 2232 | * Keep processing entries in the callback list until |
@@ -2309,7 +2267,7 @@ xlog_state_do_callback( | |||
2309 | xlog_state_clean_log(log); | 2267 | xlog_state_clean_log(log); |
2310 | 2268 | ||
2311 | /* wake up threads waiting in xfs_log_force() */ | 2269 | /* wake up threads waiting in xfs_log_force() */ |
2312 | sv_broadcast(&iclog->ic_force_wait); | 2270 | wake_up_all(&iclog->ic_force_wait); |
2313 | 2271 | ||
2314 | iclog = iclog->ic_next; | 2272 | iclog = iclog->ic_next; |
2315 | } while (first_iclog != iclog); | 2273 | } while (first_iclog != iclog); |
@@ -2317,7 +2275,7 @@ xlog_state_do_callback( | |||
2317 | if (repeats > 5000) { | 2275 | if (repeats > 5000) { |
2318 | flushcnt += repeats; | 2276 | flushcnt += repeats; |
2319 | repeats = 0; | 2277 | repeats = 0; |
2320 | xfs_fs_cmn_err(CE_WARN, log->l_mp, | 2278 | xfs_warn(log->l_mp, |
2321 | "%s: possible infinite loop (%d iterations)", | 2279 | "%s: possible infinite loop (%d iterations)", |
2322 | __func__, flushcnt); | 2280 | __func__, flushcnt); |
2323 | } | 2281 | } |
@@ -2356,7 +2314,7 @@ xlog_state_do_callback( | |||
2356 | spin_unlock(&log->l_icloglock); | 2314 | spin_unlock(&log->l_icloglock); |
2357 | 2315 | ||
2358 | if (wake) | 2316 | if (wake) |
2359 | sv_broadcast(&log->l_flush_wait); | 2317 | wake_up_all(&log->l_flush_wait); |
2360 | } | 2318 | } |
2361 | 2319 | ||
2362 | 2320 | ||
@@ -2407,7 +2365,7 @@ xlog_state_done_syncing( | |||
2407 | * iclog buffer, we wake them all, one will get to do the | 2365 | * iclog buffer, we wake them all, one will get to do the |
2408 | * I/O, the others get to wait for the result. | 2366 | * I/O, the others get to wait for the result. |
2409 | */ | 2367 | */ |
2410 | sv_broadcast(&iclog->ic_write_wait); | 2368 | wake_up_all(&iclog->ic_write_wait); |
2411 | spin_unlock(&log->l_icloglock); | 2369 | spin_unlock(&log->l_icloglock); |
2412 | xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ | 2370 | xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ |
2413 | } /* xlog_state_done_syncing */ | 2371 | } /* xlog_state_done_syncing */ |
@@ -2456,7 +2414,7 @@ restart: | |||
2456 | XFS_STATS_INC(xs_log_noiclogs); | 2414 | XFS_STATS_INC(xs_log_noiclogs); |
2457 | 2415 | ||
2458 | /* Wait for log writes to have flushed */ | 2416 | /* Wait for log writes to have flushed */ |
2459 | sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); | 2417 | xlog_wait(&log->l_flush_wait, &log->l_icloglock); |
2460 | goto restart; | 2418 | goto restart; |
2461 | } | 2419 | } |
2462 | 2420 | ||
@@ -2539,6 +2497,18 @@ restart: | |||
2539 | * | 2497 | * |
2540 | * Once a ticket gets put onto the reserveq, it will only return after | 2498 | * Once a ticket gets put onto the reserveq, it will only return after |
2541 | * the needed reservation is satisfied. | 2499 | * the needed reservation is satisfied. |
2500 | * | ||
2501 | * This function is structured so that it has a lock free fast path. This is | ||
2502 | * necessary because every new transaction reservation will come through this | ||
2503 | * path. Hence any lock will be globally hot if we take it unconditionally on | ||
2504 | * every pass. | ||
2505 | * | ||
2506 | * As tickets are only ever moved on and off the reserveq under the | ||
2507 | * l_grant_reserve_lock, we only need to take that lock if we are going | ||
2508 | * to add the ticket to the queue and sleep. We can avoid taking the lock if the | ||
2509 | * ticket was never added to the reserveq because the t_queue list head will be | ||
2510 | * empty and we hold the only reference to it so it can safely be checked | ||
2511 | * unlocked. | ||
2542 | */ | 2512 | */ |
2543 | STATIC int | 2513 | STATIC int |
2544 | xlog_grant_log_space(xlog_t *log, | 2514 | xlog_grant_log_space(xlog_t *log, |
@@ -2546,24 +2516,27 @@ xlog_grant_log_space(xlog_t *log, | |||
2546 | { | 2516 | { |
2547 | int free_bytes; | 2517 | int free_bytes; |
2548 | int need_bytes; | 2518 | int need_bytes; |
2549 | #ifdef DEBUG | ||
2550 | xfs_lsn_t tail_lsn; | ||
2551 | #endif | ||
2552 | |||
2553 | 2519 | ||
2554 | #ifdef DEBUG | 2520 | #ifdef DEBUG |
2555 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) | 2521 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) |
2556 | panic("grant Recovery problem"); | 2522 | panic("grant Recovery problem"); |
2557 | #endif | 2523 | #endif |
2558 | 2524 | ||
2559 | /* Is there space or do we need to sleep? */ | ||
2560 | spin_lock(&log->l_grant_lock); | ||
2561 | |||
2562 | trace_xfs_log_grant_enter(log, tic); | 2525 | trace_xfs_log_grant_enter(log, tic); |
2563 | 2526 | ||
2527 | need_bytes = tic->t_unit_res; | ||
2528 | if (tic->t_flags & XFS_LOG_PERM_RESERV) | ||
2529 | need_bytes *= tic->t_ocnt; | ||
2530 | |||
2564 | /* something is already sleeping; insert new transaction at end */ | 2531 | /* something is already sleeping; insert new transaction at end */ |
2565 | if (log->l_reserve_headq) { | 2532 | if (!list_empty_careful(&log->l_reserveq)) { |
2566 | xlog_ins_ticketq(&log->l_reserve_headq, tic); | 2533 | spin_lock(&log->l_grant_reserve_lock); |
2534 | /* recheck the queue now we are locked */ | ||
2535 | if (list_empty(&log->l_reserveq)) { | ||
2536 | spin_unlock(&log->l_grant_reserve_lock); | ||
2537 | goto redo; | ||
2538 | } | ||
2539 | list_add_tail(&tic->t_queue, &log->l_reserveq); | ||
2567 | 2540 | ||
2568 | trace_xfs_log_grant_sleep1(log, tic); | 2541 | trace_xfs_log_grant_sleep1(log, tic); |
2569 | 2542 | ||
@@ -2575,72 +2548,57 @@ xlog_grant_log_space(xlog_t *log, | |||
2575 | goto error_return; | 2548 | goto error_return; |
2576 | 2549 | ||
2577 | XFS_STATS_INC(xs_sleep_logspace); | 2550 | XFS_STATS_INC(xs_sleep_logspace); |
2578 | sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); | 2551 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); |
2552 | |||
2579 | /* | 2553 | /* |
2580 | * If we got an error, and the filesystem is shutting down, | 2554 | * If we got an error, and the filesystem is shutting down, |
2581 | * we'll catch it down below. So just continue... | 2555 | * we'll catch it down below. So just continue... |
2582 | */ | 2556 | */ |
2583 | trace_xfs_log_grant_wake1(log, tic); | 2557 | trace_xfs_log_grant_wake1(log, tic); |
2584 | spin_lock(&log->l_grant_lock); | ||
2585 | } | 2558 | } |
2586 | if (tic->t_flags & XFS_LOG_PERM_RESERV) | ||
2587 | need_bytes = tic->t_unit_res*tic->t_ocnt; | ||
2588 | else | ||
2589 | need_bytes = tic->t_unit_res; | ||
2590 | 2559 | ||
2591 | redo: | 2560 | redo: |
2592 | if (XLOG_FORCED_SHUTDOWN(log)) | 2561 | if (XLOG_FORCED_SHUTDOWN(log)) |
2593 | goto error_return; | 2562 | goto error_return_unlocked; |
2594 | 2563 | ||
2595 | free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, | 2564 | free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); |
2596 | log->l_grant_reserve_bytes); | ||
2597 | if (free_bytes < need_bytes) { | 2565 | if (free_bytes < need_bytes) { |
2598 | if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) | 2566 | spin_lock(&log->l_grant_reserve_lock); |
2599 | xlog_ins_ticketq(&log->l_reserve_headq, tic); | 2567 | if (list_empty(&tic->t_queue)) |
2568 | list_add_tail(&tic->t_queue, &log->l_reserveq); | ||
2600 | 2569 | ||
2601 | trace_xfs_log_grant_sleep2(log, tic); | 2570 | trace_xfs_log_grant_sleep2(log, tic); |
2602 | 2571 | ||
2603 | spin_unlock(&log->l_grant_lock); | ||
2604 | xlog_grant_push_ail(log->l_mp, need_bytes); | ||
2605 | spin_lock(&log->l_grant_lock); | ||
2606 | |||
2607 | XFS_STATS_INC(xs_sleep_logspace); | ||
2608 | sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); | ||
2609 | |||
2610 | spin_lock(&log->l_grant_lock); | ||
2611 | if (XLOG_FORCED_SHUTDOWN(log)) | 2572 | if (XLOG_FORCED_SHUTDOWN(log)) |
2612 | goto error_return; | 2573 | goto error_return; |
2613 | 2574 | ||
2614 | trace_xfs_log_grant_wake2(log, tic); | 2575 | xlog_grant_push_ail(log, need_bytes); |
2576 | |||
2577 | XFS_STATS_INC(xs_sleep_logspace); | ||
2578 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
2615 | 2579 | ||
2580 | trace_xfs_log_grant_wake2(log, tic); | ||
2616 | goto redo; | 2581 | goto redo; |
2617 | } else if (tic->t_flags & XLOG_TIC_IN_Q) | 2582 | } |
2618 | xlog_del_ticketq(&log->l_reserve_headq, tic); | ||
2619 | 2583 | ||
2620 | /* we've got enough space */ | 2584 | if (!list_empty(&tic->t_queue)) { |
2621 | xlog_grant_add_space(log, need_bytes); | 2585 | spin_lock(&log->l_grant_reserve_lock); |
2622 | #ifdef DEBUG | 2586 | list_del_init(&tic->t_queue); |
2623 | tail_lsn = log->l_tail_lsn; | 2587 | spin_unlock(&log->l_grant_reserve_lock); |
2624 | /* | ||
2625 | * Check to make sure the grant write head didn't just overlap the | ||
2626 | * tail. If the cycles are the same, we can't be overlapping. | ||
2627 | * Otherwise, make sure that the cycles differ by exactly one and | ||
2628 | * check the byte count. | ||
2629 | */ | ||
2630 | if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { | ||
2631 | ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); | ||
2632 | ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); | ||
2633 | } | 2588 | } |
2634 | #endif | 2589 | |
2590 | /* we've got enough space */ | ||
2591 | xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); | ||
2592 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); | ||
2635 | trace_xfs_log_grant_exit(log, tic); | 2593 | trace_xfs_log_grant_exit(log, tic); |
2636 | xlog_verify_grant_head(log, 1); | 2594 | xlog_verify_grant_tail(log); |
2637 | spin_unlock(&log->l_grant_lock); | ||
2638 | return 0; | 2595 | return 0; |
2639 | 2596 | ||
2640 | error_return: | 2597 | error_return_unlocked: |
2641 | if (tic->t_flags & XLOG_TIC_IN_Q) | 2598 | spin_lock(&log->l_grant_reserve_lock); |
2642 | xlog_del_ticketq(&log->l_reserve_headq, tic); | 2599 | error_return: |
2643 | 2600 | list_del_init(&tic->t_queue); | |
2601 | spin_unlock(&log->l_grant_reserve_lock); | ||
2644 | trace_xfs_log_grant_error(log, tic); | 2602 | trace_xfs_log_grant_error(log, tic); |
2645 | 2603 | ||
2646 | /* | 2604 | /* |
@@ -2650,7 +2608,6 @@ redo: | |||
2650 | */ | 2608 | */ |
2651 | tic->t_curr_res = 0; | 2609 | tic->t_curr_res = 0; |
2652 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ | 2610 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ |
2653 | spin_unlock(&log->l_grant_lock); | ||
2654 | return XFS_ERROR(EIO); | 2611 | return XFS_ERROR(EIO); |
2655 | } /* xlog_grant_log_space */ | 2612 | } /* xlog_grant_log_space */ |
2656 | 2613 | ||
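Every sv_wait() call above becomes xlog_wait(), a helper this hunk uses but does not show. What the call sites require is clear: the task is queued on the wait queue and the spinlock passed in is dropped before sleeping. A minimal sketch consistent with that contract (not necessarily the exact helper added elsewhere in this series):

	static inline void
	xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
			__releases(lock)
	{
		DECLARE_WAITQUEUE(wait, current);

		/* queue exclusively so wake-ups are handed out in FIFO order */
		add_wait_queue_exclusive(wq, &wait);
		__set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock(lock);		/* sleep without holding the lock */
		schedule();
		remove_wait_queue(wq, &wait);
	}

The lock is not retaken on wake-up, which is why every sleeper above restarts from the redo label and re-tests both the queue and XLOG_FORCED_SHUTDOWN() from scratch.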
@@ -2658,17 +2615,14 @@ redo: | |||
2658 | /* | 2615 | /* |
2659 | * Replenish the byte reservation required by moving the grant write head. | 2616 | * Replenish the byte reservation required by moving the grant write head. |
2660 | * | 2617 | * |
2661 | * | 2618 | * Similar to xlog_grant_log_space, the function is structured to have a lock |
2619 | * free fast path. | ||
2662 | */ | 2620 | */ |
2663 | STATIC int | 2621 | STATIC int |
2664 | xlog_regrant_write_log_space(xlog_t *log, | 2622 | xlog_regrant_write_log_space(xlog_t *log, |
2665 | xlog_ticket_t *tic) | 2623 | xlog_ticket_t *tic) |
2666 | { | 2624 | { |
2667 | int free_bytes, need_bytes; | 2625 | int free_bytes, need_bytes; |
2668 | xlog_ticket_t *ntic; | ||
2669 | #ifdef DEBUG | ||
2670 | xfs_lsn_t tail_lsn; | ||
2671 | #endif | ||
2672 | 2626 | ||
2673 | tic->t_curr_res = tic->t_unit_res; | 2627 | tic->t_curr_res = tic->t_unit_res; |
2674 | xlog_tic_reset_res(tic); | 2628 | xlog_tic_reset_res(tic); |
@@ -2681,12 +2635,9 @@ xlog_regrant_write_log_space(xlog_t *log, | |||
2681 | panic("regrant Recovery problem"); | 2635 | panic("regrant Recovery problem"); |
2682 | #endif | 2636 | #endif |
2683 | 2637 | ||
2684 | spin_lock(&log->l_grant_lock); | ||
2685 | |||
2686 | trace_xfs_log_regrant_write_enter(log, tic); | 2638 | trace_xfs_log_regrant_write_enter(log, tic); |
2687 | |||
2688 | if (XLOG_FORCED_SHUTDOWN(log)) | 2639 | if (XLOG_FORCED_SHUTDOWN(log)) |
2689 | goto error_return; | 2640 | goto error_return_unlocked; |
2690 | 2641 | ||
2691 | /* If there are other waiters on the queue then give them a | 2642 | /* If there are other waiters on the queue then give them a |
2692 | * chance at logspace before us. Wake up the first waiters, | 2643 | * chance at logspace before us. Wake up the first waiters, |
@@ -2695,92 +2646,76 @@ xlog_regrant_write_log_space(xlog_t *log, | |||
2695 | * this transaction. | 2646 | * this transaction. |
2696 | */ | 2647 | */ |
2697 | need_bytes = tic->t_unit_res; | 2648 | need_bytes = tic->t_unit_res; |
2698 | if ((ntic = log->l_write_headq)) { | 2649 | if (!list_empty_careful(&log->l_writeq)) { |
2699 | free_bytes = xlog_space_left(log, log->l_grant_write_cycle, | 2650 | struct xlog_ticket *ntic; |
2700 | log->l_grant_write_bytes); | 2651 | |
2701 | do { | 2652 | spin_lock(&log->l_grant_write_lock); |
2653 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); | ||
2654 | list_for_each_entry(ntic, &log->l_writeq, t_queue) { | ||
2702 | ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); | 2655 | ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); |
2703 | 2656 | ||
2704 | if (free_bytes < ntic->t_unit_res) | 2657 | if (free_bytes < ntic->t_unit_res) |
2705 | break; | 2658 | break; |
2706 | free_bytes -= ntic->t_unit_res; | 2659 | free_bytes -= ntic->t_unit_res; |
2707 | sv_signal(&ntic->t_wait); | 2660 | wake_up(&ntic->t_wait); |
2708 | ntic = ntic->t_next; | 2661 | } |
2709 | } while (ntic != log->l_write_headq); | ||
2710 | |||
2711 | if (ntic != log->l_write_headq) { | ||
2712 | if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) | ||
2713 | xlog_ins_ticketq(&log->l_write_headq, tic); | ||
2714 | 2662 | ||
2663 | if (ntic != list_first_entry(&log->l_writeq, | ||
2664 | struct xlog_ticket, t_queue)) { | ||
2665 | if (list_empty(&tic->t_queue)) | ||
2666 | list_add_tail(&tic->t_queue, &log->l_writeq); | ||
2715 | trace_xfs_log_regrant_write_sleep1(log, tic); | 2667 | trace_xfs_log_regrant_write_sleep1(log, tic); |
2716 | 2668 | ||
2717 | spin_unlock(&log->l_grant_lock); | 2669 | xlog_grant_push_ail(log, need_bytes); |
2718 | xlog_grant_push_ail(log->l_mp, need_bytes); | ||
2719 | spin_lock(&log->l_grant_lock); | ||
2720 | 2670 | ||
2721 | XFS_STATS_INC(xs_sleep_logspace); | 2671 | XFS_STATS_INC(xs_sleep_logspace); |
2722 | sv_wait(&tic->t_wait, PINOD|PLTWAIT, | 2672 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); |
2723 | &log->l_grant_lock, s); | ||
2724 | |||
2725 | /* If we're shutting down, this tic is already | ||
2726 | * off the queue */ | ||
2727 | spin_lock(&log->l_grant_lock); | ||
2728 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2729 | goto error_return; | ||
2730 | |||
2731 | trace_xfs_log_regrant_write_wake1(log, tic); | 2673 | trace_xfs_log_regrant_write_wake1(log, tic); |
2732 | } | 2674 | } else |
2675 | spin_unlock(&log->l_grant_write_lock); | ||
2733 | } | 2676 | } |
2734 | 2677 | ||
2735 | redo: | 2678 | redo: |
2736 | if (XLOG_FORCED_SHUTDOWN(log)) | 2679 | if (XLOG_FORCED_SHUTDOWN(log)) |
2737 | goto error_return; | 2680 | goto error_return_unlocked; |
2738 | 2681 | ||
2739 | free_bytes = xlog_space_left(log, log->l_grant_write_cycle, | 2682 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); |
2740 | log->l_grant_write_bytes); | ||
2741 | if (free_bytes < need_bytes) { | 2683 | if (free_bytes < need_bytes) { |
2742 | if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) | 2684 | spin_lock(&log->l_grant_write_lock); |
2743 | xlog_ins_ticketq(&log->l_write_headq, tic); | 2685 | if (list_empty(&tic->t_queue)) |
2744 | spin_unlock(&log->l_grant_lock); | 2686 | list_add_tail(&tic->t_queue, &log->l_writeq); |
2745 | xlog_grant_push_ail(log->l_mp, need_bytes); | ||
2746 | spin_lock(&log->l_grant_lock); | ||
2747 | 2687 | ||
2748 | XFS_STATS_INC(xs_sleep_logspace); | ||
2749 | trace_xfs_log_regrant_write_sleep2(log, tic); | ||
2750 | |||
2751 | sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); | ||
2752 | |||
2753 | /* If we're shutting down, this tic is already off the queue */ | ||
2754 | spin_lock(&log->l_grant_lock); | ||
2755 | if (XLOG_FORCED_SHUTDOWN(log)) | 2688 | if (XLOG_FORCED_SHUTDOWN(log)) |
2756 | goto error_return; | 2689 | goto error_return; |
2757 | 2690 | ||
2691 | xlog_grant_push_ail(log, need_bytes); | ||
2692 | |||
2693 | XFS_STATS_INC(xs_sleep_logspace); | ||
2694 | trace_xfs_log_regrant_write_sleep2(log, tic); | ||
2695 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
2696 | |||
2758 | trace_xfs_log_regrant_write_wake2(log, tic); | 2697 | trace_xfs_log_regrant_write_wake2(log, tic); |
2759 | goto redo; | 2698 | goto redo; |
2760 | } else if (tic->t_flags & XLOG_TIC_IN_Q) | 2699 | } |
2761 | xlog_del_ticketq(&log->l_write_headq, tic); | ||
2762 | 2700 | ||
2763 | /* we've got enough space */ | 2701 | if (!list_empty(&tic->t_queue)) { |
2764 | xlog_grant_add_space_write(log, need_bytes); | 2702 | spin_lock(&log->l_grant_write_lock); |
2765 | #ifdef DEBUG | 2703 | list_del_init(&tic->t_queue); |
2766 | tail_lsn = log->l_tail_lsn; | 2704 | spin_unlock(&log->l_grant_write_lock); |
2767 | if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { | ||
2768 | ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); | ||
2769 | ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); | ||
2770 | } | 2705 | } |
2771 | #endif | ||
2772 | 2706 | ||
2707 | /* we've got enough space */ | ||
2708 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); | ||
2773 | trace_xfs_log_regrant_write_exit(log, tic); | 2709 | trace_xfs_log_regrant_write_exit(log, tic); |
2774 | 2710 | xlog_verify_grant_tail(log); | |
2775 | xlog_verify_grant_head(log, 1); | ||
2776 | spin_unlock(&log->l_grant_lock); | ||
2777 | return 0; | 2711 | return 0; |
2778 | 2712 | ||
2779 | 2713 | ||
2714 | error_return_unlocked: | ||
2715 | spin_lock(&log->l_grant_write_lock); | ||
2780 | error_return: | 2716 | error_return: |
2781 | if (tic->t_flags & XLOG_TIC_IN_Q) | 2717 | list_del_init(&tic->t_queue); |
2782 | xlog_del_ticketq(&log->l_reserve_headq, tic); | 2718 | spin_unlock(&log->l_grant_write_lock); |
2783 | |||
2784 | trace_xfs_log_regrant_write_error(log, tic); | 2719 | trace_xfs_log_regrant_write_error(log, tic); |
2785 | 2720 | ||
2786 | /* | 2721 | /* |
@@ -2790,7 +2725,6 @@ redo: | |||
2790 | */ | 2725 | */ |
2791 | tic->t_curr_res = 0; | 2726 | tic->t_curr_res = 0; |
2792 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ | 2727 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ |
2793 | spin_unlock(&log->l_grant_lock); | ||
2794 | return XFS_ERROR(EIO); | 2728 | return XFS_ERROR(EIO); |
2795 | } /* xlog_regrant_write_log_space */ | 2729 | } /* xlog_regrant_write_log_space */ |
2796 | 2730 | ||
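Both grant functions now share the same fast path: peek at the wait queue with list_empty_careful(), which is legal without the lock, and only take the per-queue spinlock when the ticket actually has to sleep, re-testing emptiness once locked. Reduced to its skeleton (a sketch; stats, tracing and error handling elided):

	if (!list_empty_careful(&log->l_reserveq)) {	/* unlocked peek */
		spin_lock(&log->l_grant_reserve_lock);
		if (list_empty(&log->l_reserveq)) {
			/* raced with the last waiter leaving */
			spin_unlock(&log->l_grant_reserve_lock);
			goto redo;
		}
		list_add_tail(&tic->t_queue, &log->l_reserveq);
		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
	}

list_empty_careful() may return a stale answer, so only the re-test under the lock decides whether to queue; the unlocked test merely keeps the uncontended path from touching the lock at all.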
@@ -2811,27 +2745,24 @@ xlog_regrant_reserve_log_space(xlog_t *log, | |||
2811 | if (ticket->t_cnt > 0) | 2745 | if (ticket->t_cnt > 0) |
2812 | ticket->t_cnt--; | 2746 | ticket->t_cnt--; |
2813 | 2747 | ||
2814 | spin_lock(&log->l_grant_lock); | 2748 | xlog_grant_sub_space(log, &log->l_grant_reserve_head, |
2815 | xlog_grant_sub_space(log, ticket->t_curr_res); | 2749 | ticket->t_curr_res); |
2750 | xlog_grant_sub_space(log, &log->l_grant_write_head, | ||
2751 | ticket->t_curr_res); | ||
2816 | ticket->t_curr_res = ticket->t_unit_res; | 2752 | ticket->t_curr_res = ticket->t_unit_res; |
2817 | xlog_tic_reset_res(ticket); | 2753 | xlog_tic_reset_res(ticket); |
2818 | 2754 | ||
2819 | trace_xfs_log_regrant_reserve_sub(log, ticket); | 2755 | trace_xfs_log_regrant_reserve_sub(log, ticket); |
2820 | 2756 | ||
2821 | xlog_verify_grant_head(log, 1); | ||
2822 | |||
2823 | /* just return if we still have some of the pre-reserved space */ | 2757 | /* just return if we still have some of the pre-reserved space */ |
2824 | if (ticket->t_cnt > 0) { | 2758 | if (ticket->t_cnt > 0) |
2825 | spin_unlock(&log->l_grant_lock); | ||
2826 | return; | 2759 | return; |
2827 | } | ||
2828 | 2760 | ||
2829 | xlog_grant_add_space_reserve(log, ticket->t_unit_res); | 2761 | xlog_grant_add_space(log, &log->l_grant_reserve_head, |
2762 | ticket->t_unit_res); | ||
2830 | 2763 | ||
2831 | trace_xfs_log_regrant_reserve_exit(log, ticket); | 2764 | trace_xfs_log_regrant_reserve_exit(log, ticket); |
2832 | 2765 | ||
2833 | xlog_verify_grant_head(log, 0); | ||
2834 | spin_unlock(&log->l_grant_lock); | ||
2835 | ticket->t_curr_res = ticket->t_unit_res; | 2766 | ticket->t_curr_res = ticket->t_unit_res; |
2836 | xlog_tic_reset_res(ticket); | 2767 | xlog_tic_reset_res(ticket); |
2837 | } /* xlog_regrant_reserve_log_space */ | 2768 | } /* xlog_regrant_reserve_log_space */ |
@@ -2855,28 +2786,29 @@ STATIC void | |||
2855 | xlog_ungrant_log_space(xlog_t *log, | 2786 | xlog_ungrant_log_space(xlog_t *log, |
2856 | xlog_ticket_t *ticket) | 2787 | xlog_ticket_t *ticket) |
2857 | { | 2788 | { |
2789 | int bytes; | ||
2790 | |||
2858 | if (ticket->t_cnt > 0) | 2791 | if (ticket->t_cnt > 0) |
2859 | ticket->t_cnt--; | 2792 | ticket->t_cnt--; |
2860 | 2793 | ||
2861 | spin_lock(&log->l_grant_lock); | ||
2862 | trace_xfs_log_ungrant_enter(log, ticket); | 2794 | trace_xfs_log_ungrant_enter(log, ticket); |
2863 | |||
2864 | xlog_grant_sub_space(log, ticket->t_curr_res); | ||
2865 | |||
2866 | trace_xfs_log_ungrant_sub(log, ticket); | 2795 | trace_xfs_log_ungrant_sub(log, ticket); |
2867 | 2796 | ||
2868 | /* If this is a permanent reservation ticket, we may be able to free | 2797 | /* |
2798 | * If this is a permanent reservation ticket, we may be able to free | ||
2869 | * up more space based on the remaining count. | 2799 | * up more space based on the remaining count. |
2870 | */ | 2800 | */ |
2801 | bytes = ticket->t_curr_res; | ||
2871 | if (ticket->t_cnt > 0) { | 2802 | if (ticket->t_cnt > 0) { |
2872 | ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); | 2803 | ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); |
2873 | xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); | 2804 | bytes += ticket->t_unit_res*ticket->t_cnt; |
2874 | } | 2805 | } |
2875 | 2806 | ||
2807 | xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); | ||
2808 | xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); | ||
2809 | |||
2876 | trace_xfs_log_ungrant_exit(log, ticket); | 2810 | trace_xfs_log_ungrant_exit(log, ticket); |
2877 | 2811 | ||
2878 | xlog_verify_grant_head(log, 1); | ||
2879 | spin_unlock(&log->l_grant_lock); | ||
2880 | xfs_log_move_tail(log->l_mp, 1); | 2812 | xfs_log_move_tail(log->l_mp, 1); |
2881 | } /* xlog_ungrant_log_space */ | 2813 | } /* xlog_ungrant_log_space */ |
2882 | 2814 | ||
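xlog_grant_add_space() and xlog_grant_sub_space() now take a pointer to the head they modify because each grant head has become a single atomic64_t packing a {cycle, bytes} pair, updated by compare-and-exchange instead of under l_grant_lock. A sketch of the add side under that assumption (the packing layout is inferred from the crack helpers added to xfs_log_priv.h below):

	static void
	xlog_grant_add_space(struct log *log, atomic64_t *head, int bytes)
	{
		int64_t	head_val = atomic64_read(head);
		int64_t	old, new;

		do {
			int	cycle = head_val >> 32;		/* high 32 bits */
			int	space = head_val & 0xffffffff;	/* low 32 bits */
			int	tmp = log->l_logsize - space;

			if (tmp > bytes)
				space += bytes;		/* fits in this cycle */
			else {
				space = bytes - tmp;	/* wrap to next cycle */
				cycle++;
			}

			old = head_val;
			new = ((int64_t)cycle << 32) | space;
			head_val = atomic64_cmpxchg(head, old, new);
		} while (head_val != old);	/* lost a race, recompute */
	}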
@@ -2913,11 +2845,11 @@ xlog_state_release_iclog( | |||
2913 | 2845 | ||
2914 | if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { | 2846 | if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { |
2915 | /* update tail before writing to iclog */ | 2847 | /* update tail before writing to iclog */ |
2916 | xlog_assign_tail_lsn(log->l_mp); | 2848 | xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); |
2917 | sync++; | 2849 | sync++; |
2918 | iclog->ic_state = XLOG_STATE_SYNCING; | 2850 | iclog->ic_state = XLOG_STATE_SYNCING; |
2919 | iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); | 2851 | iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); |
2920 | xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); | 2852 | xlog_verify_tail_lsn(log, iclog, tail_lsn); |
2921 | /* cycle incremented when incrementing curr_block */ | 2853 | /* cycle incremented when incrementing curr_block */ |
2922 | } | 2854 | } |
2923 | spin_unlock(&log->l_icloglock); | 2855 | spin_unlock(&log->l_icloglock); |
@@ -3100,7 +3032,7 @@ maybe_sleep: | |||
3100 | return XFS_ERROR(EIO); | 3032 | return XFS_ERROR(EIO); |
3101 | } | 3033 | } |
3102 | XFS_STATS_INC(xs_log_force_sleep); | 3034 | XFS_STATS_INC(xs_log_force_sleep); |
3103 | sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); | 3035 | xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); |
3104 | /* | 3036 | /* |
3105 | * No need to grab the log lock here since we're | 3037 | * No need to grab the log lock here since we're |
3106 | * only deciding whether or not to return EIO | 3038 | * only deciding whether or not to return EIO |
@@ -3131,10 +3063,8 @@ xfs_log_force( | |||
3131 | int error; | 3063 | int error; |
3132 | 3064 | ||
3133 | error = _xfs_log_force(mp, flags, NULL); | 3065 | error = _xfs_log_force(mp, flags, NULL); |
3134 | if (error) { | 3066 | if (error) |
3135 | xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " | 3067 | xfs_warn(mp, "%s: error %d returned.", __func__, error); |
3136 | "error %d returned.", error); | ||
3137 | } | ||
3138 | } | 3068 | } |
3139 | 3069 | ||
3140 | /* | 3070 | /* |
@@ -3218,8 +3148,8 @@ try_again: | |||
3218 | 3148 | ||
3219 | XFS_STATS_INC(xs_log_force_sleep); | 3149 | XFS_STATS_INC(xs_log_force_sleep); |
3220 | 3150 | ||
3221 | sv_wait(&iclog->ic_prev->ic_write_wait, | 3151 | xlog_wait(&iclog->ic_prev->ic_write_wait, |
3222 | PSWP, &log->l_icloglock, s); | 3152 | &log->l_icloglock); |
3223 | if (log_flushed) | 3153 | if (log_flushed) |
3224 | *log_flushed = 1; | 3154 | *log_flushed = 1; |
3225 | already_slept = 1; | 3155 | already_slept = 1; |
@@ -3247,7 +3177,7 @@ try_again: | |||
3247 | return XFS_ERROR(EIO); | 3177 | return XFS_ERROR(EIO); |
3248 | } | 3178 | } |
3249 | XFS_STATS_INC(xs_log_force_sleep); | 3179 | XFS_STATS_INC(xs_log_force_sleep); |
3250 | sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); | 3180 | xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); |
3251 | /* | 3181 | /* |
3252 | * No need to grab the log lock here since we're | 3182 | * No need to grab the log lock here since we're |
3253 | * only deciding whether or not to return EIO | 3183 | * only deciding whether or not to return EIO |
@@ -3283,10 +3213,8 @@ xfs_log_force_lsn( | |||
3283 | int error; | 3213 | int error; |
3284 | 3214 | ||
3285 | error = _xfs_log_force_lsn(mp, lsn, flags, NULL); | 3215 | error = _xfs_log_force_lsn(mp, lsn, flags, NULL); |
3286 | if (error) { | 3216 | if (error) |
3287 | xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " | 3217 | xfs_warn(mp, "%s: error %d returned.", __func__, error); |
3288 | "error %d returned.", error); | ||
3289 | } | ||
3290 | } | 3218 | } |
3291 | 3219 | ||
3292 | /* | 3220 | /* |
@@ -3322,10 +3250,8 @@ xfs_log_ticket_put( | |||
3322 | xlog_ticket_t *ticket) | 3250 | xlog_ticket_t *ticket) |
3323 | { | 3251 | { |
3324 | ASSERT(atomic_read(&ticket->t_ref) > 0); | 3252 | ASSERT(atomic_read(&ticket->t_ref) > 0); |
3325 | if (atomic_dec_and_test(&ticket->t_ref)) { | 3253 | if (atomic_dec_and_test(&ticket->t_ref)) |
3326 | sv_destroy(&ticket->t_wait); | ||
3327 | kmem_zone_free(xfs_log_ticket_zone, ticket); | 3254 | kmem_zone_free(xfs_log_ticket_zone, ticket); |
3328 | } | ||
3329 | } | 3255 | } |
3330 | 3256 | ||
3331 | xlog_ticket_t * | 3257 | xlog_ticket_t * |
@@ -3337,13 +3263,6 @@ xfs_log_ticket_get( | |||
3337 | return ticket; | 3263 | return ticket; |
3338 | } | 3264 | } |
3339 | 3265 | ||
3340 | xlog_tid_t | ||
3341 | xfs_log_get_trans_ident( | ||
3342 | struct xfs_trans *tp) | ||
3343 | { | ||
3344 | return tp->t_ticket->t_tid; | ||
3345 | } | ||
3346 | |||
3347 | /* | 3266 | /* |
3348 | * Allocate and initialise a new log ticket. | 3267 | * Allocate and initialise a new log ticket. |
3349 | */ | 3268 | */ |
@@ -3447,6 +3366,7 @@ xlog_ticket_alloc( | |||
3447 | } | 3366 | } |
3448 | 3367 | ||
3449 | atomic_set(&tic->t_ref, 1); | 3368 | atomic_set(&tic->t_ref, 1); |
3369 | INIT_LIST_HEAD(&tic->t_queue); | ||
3450 | tic->t_unit_res = unit_bytes; | 3370 | tic->t_unit_res = unit_bytes; |
3451 | tic->t_curr_res = unit_bytes; | 3371 | tic->t_curr_res = unit_bytes; |
3452 | tic->t_cnt = cnt; | 3372 | tic->t_cnt = cnt; |
@@ -3457,7 +3377,7 @@ xlog_ticket_alloc( | |||
3457 | tic->t_trans_type = 0; | 3377 | tic->t_trans_type = 0; |
3458 | if (xflags & XFS_LOG_PERM_RESERV) | 3378 | if (xflags & XFS_LOG_PERM_RESERV) |
3459 | tic->t_flags |= XLOG_TIC_PERM_RESERV; | 3379 | tic->t_flags |= XLOG_TIC_PERM_RESERV; |
3460 | sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); | 3380 | init_waitqueue_head(&tic->t_wait); |
3461 | 3381 | ||
3462 | xlog_tic_reset_res(tic); | 3382 | xlog_tic_reset_res(tic); |
3463 | 3383 | ||
@@ -3492,22 +3412,45 @@ xlog_verify_dest_ptr( | |||
3492 | } | 3412 | } |
3493 | 3413 | ||
3494 | if (!good_ptr) | 3414 | if (!good_ptr) |
3495 | xlog_panic("xlog_verify_dest_ptr: invalid ptr"); | 3415 | xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); |
3496 | } | 3416 | } |
3497 | 3417 | ||
3418 | /* | ||
3419 | * Check to make sure the grant write head didn't just overlap the tail. If | ||
3420 | * the cycles are the same, we can't be overlapping. Otherwise, make sure that | ||
3421 | * the cycles differ by exactly one and check the byte count. | ||
3422 | * | ||
3423 | * This check is run unlocked, so can give false positives. Rather than assert | ||
3424 | * on failures, use a warn-once flag and a panic tag to allow the admin to | ||
3425 | * determine if they want to panic the machine when such an error occurs. For | ||
3426 | * debug kernels this will have the same effect as using an assert but, unlike | ||
3427 | * an assert, it can be turned off at runtime. | ||
3428 | */ | ||
3498 | STATIC void | 3429 | STATIC void |
3499 | xlog_verify_grant_head(xlog_t *log, int equals) | 3430 | xlog_verify_grant_tail( |
3500 | { | 3431 | struct log *log) |
3501 | if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { | 3432 | { |
3502 | if (equals) | 3433 | int tail_cycle, tail_blocks; |
3503 | ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); | 3434 | int cycle, space; |
3504 | else | 3435 | |
3505 | ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); | 3436 | xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); |
3506 | } else { | 3437 | xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); |
3507 | ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); | 3438 | if (tail_cycle != cycle) { |
3508 | ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); | 3439 | if (cycle - 1 != tail_cycle && |
3509 | } | 3440 | !(log->l_flags & XLOG_TAIL_WARN)) { |
3510 | } /* xlog_verify_grant_head */ | 3441 | xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, |
3442 | "%s: cycle - 1 != tail_cycle", __func__); | ||
3443 | log->l_flags |= XLOG_TAIL_WARN; | ||
3444 | } | ||
3445 | |||
3446 | if (space > BBTOB(tail_blocks) && | ||
3447 | !(log->l_flags & XLOG_TAIL_WARN)) { | ||
3448 | xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, | ||
3449 | "%s: space > BBTOB(tail_blocks)", __func__); | ||
3450 | log->l_flags |= XLOG_TAIL_WARN; | ||
3451 | } | ||
3452 | } | ||
3453 | } | ||
3511 | 3454 | ||
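xlog_verify_grant_tail() deliberately samples each 64-bit value once before splitting it, so an unlocked check can at worst be stale, never torn, hence the warn-once XLOG_TAIL_WARN treatment instead of an assert. The grant-head crack helper is not shown in this diff; by analogy with xlog_crack_atomic_lsn() in xfs_log_priv.h below, it presumably reads:

	static inline void
	xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
	{
		int64_t val = atomic64_read(head);	/* single sample */

		*cycle = val >> 32;		/* cycle: high 32 bits */
		*space = val & 0xffffffff;	/* byte count: low 32 bits */
	}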
3512 | /* check if it will fit */ | 3455 | /* check if it will fit */ |
3513 | STATIC void | 3456 | STATIC void |
@@ -3521,16 +3464,16 @@ xlog_verify_tail_lsn(xlog_t *log, | |||
3521 | blocks = | 3464 | blocks = |
3522 | log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); | 3465 | log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); |
3523 | if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) | 3466 | if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) |
3524 | xlog_panic("xlog_verify_tail_lsn: ran out of log space"); | 3467 | xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); |
3525 | } else { | 3468 | } else { |
3526 | ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); | 3469 | ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); |
3527 | 3470 | ||
3528 | if (BLOCK_LSN(tail_lsn) == log->l_prev_block) | 3471 | if (BLOCK_LSN(tail_lsn) == log->l_prev_block) |
3529 | xlog_panic("xlog_verify_tail_lsn: tail wrapped"); | 3472 | xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); |
3530 | 3473 | ||
3531 | blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; | 3474 | blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; |
3532 | if (blocks < BTOBB(iclog->ic_offset) + 1) | 3475 | if (blocks < BTOBB(iclog->ic_offset) + 1) |
3533 | xlog_panic("xlog_verify_tail_lsn: ran out of log space"); | 3476 | xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); |
3534 | } | 3477 | } |
3535 | } /* xlog_verify_tail_lsn */ | 3478 | } /* xlog_verify_tail_lsn */ |
3536 | 3479 | ||
@@ -3570,22 +3513,23 @@ xlog_verify_iclog(xlog_t *log, | |||
3570 | icptr = log->l_iclog; | 3513 | icptr = log->l_iclog; |
3571 | for (i=0; i < log->l_iclog_bufs; i++) { | 3514 | for (i=0; i < log->l_iclog_bufs; i++) { |
3572 | if (icptr == NULL) | 3515 | if (icptr == NULL) |
3573 | xlog_panic("xlog_verify_iclog: invalid ptr"); | 3516 | xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); |
3574 | icptr = icptr->ic_next; | 3517 | icptr = icptr->ic_next; |
3575 | } | 3518 | } |
3576 | if (icptr != log->l_iclog) | 3519 | if (icptr != log->l_iclog) |
3577 | xlog_panic("xlog_verify_iclog: corrupt iclog ring"); | 3520 | xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); |
3578 | spin_unlock(&log->l_icloglock); | 3521 | spin_unlock(&log->l_icloglock); |
3579 | 3522 | ||
3580 | /* check log magic numbers */ | 3523 | /* check log magic numbers */ |
3581 | if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) | 3524 | if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) |
3582 | xlog_panic("xlog_verify_iclog: invalid magic num"); | 3525 | xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); |
3583 | 3526 | ||
3584 | ptr = (xfs_caddr_t) &iclog->ic_header; | 3527 | ptr = (xfs_caddr_t) &iclog->ic_header; |
3585 | for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; | 3528 | for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; |
3586 | ptr += BBSIZE) { | 3529 | ptr += BBSIZE) { |
3587 | if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) | 3530 | if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) |
3588 | xlog_panic("xlog_verify_iclog: unexpected magic num"); | 3531 | xfs_emerg(log->l_mp, "%s: unexpected magic num", |
3532 | __func__); | ||
3589 | } | 3533 | } |
3590 | 3534 | ||
3591 | /* check fields */ | 3535 | /* check fields */ |
@@ -3615,9 +3559,10 @@ xlog_verify_iclog(xlog_t *log, | |||
3615 | } | 3559 | } |
3616 | } | 3560 | } |
3617 | if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) | 3561 | if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) |
3618 | cmn_err(CE_WARN, "xlog_verify_iclog: " | 3562 | xfs_warn(log->l_mp, |
3619 | "invalid clientid %d op 0x%p offset 0x%lx", | 3563 | "%s: invalid clientid %d op 0x%p offset 0x%lx", |
3620 | clientid, ophead, (unsigned long)field_offset); | 3564 | __func__, clientid, ophead, |
3565 | (unsigned long)field_offset); | ||
3621 | 3566 | ||
3622 | /* check length */ | 3567 | /* check length */ |
3623 | field_offset = (__psint_t) | 3568 | field_offset = (__psint_t) |
@@ -3728,12 +3673,10 @@ xfs_log_force_umount( | |||
3728 | xlog_cil_force(log); | 3673 | xlog_cil_force(log); |
3729 | 3674 | ||
3730 | /* | 3675 | /* |
3731 | * We must hold both the GRANT lock and the LOG lock, | 3676 | * mark the filesystem and the log as in a shutdown state and wake |
3732 | * before we mark the filesystem SHUTDOWN and wake | 3677 | * everybody up to tell them the bad news. |
3733 | * everybody up to tell the bad news. | ||
3734 | */ | 3678 | */ |
3735 | spin_lock(&log->l_icloglock); | 3679 | spin_lock(&log->l_icloglock); |
3736 | spin_lock(&log->l_grant_lock); | ||
3737 | mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; | 3680 | mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; |
3738 | if (mp->m_sb_bp) | 3681 | if (mp->m_sb_bp) |
3739 | XFS_BUF_DONE(mp->m_sb_bp); | 3682 | XFS_BUF_DONE(mp->m_sb_bp); |
@@ -3754,27 +3697,21 @@ xfs_log_force_umount( | |||
3754 | spin_unlock(&log->l_icloglock); | 3697 | spin_unlock(&log->l_icloglock); |
3755 | 3698 | ||
3756 | /* | 3699 | /* |
3757 | * We don't want anybody waiting for log reservations | 3700 | * We don't want anybody waiting for log reservations after this. That |
3758 | * after this. That means we have to wake up everybody | 3701 | * means we have to wake up everybody queued up on reserveq as well as |
3759 | * queued up on reserve_headq as well as write_headq. | 3702 | * writeq. In addition, we make sure in xlog_{re}grant_log_space that |
3760 | * In addition, we make sure in xlog_{re}grant_log_space | 3703 | * we don't enqueue anything once the SHUTDOWN flag is set, and this |
3761 | * that we don't enqueue anything once the SHUTDOWN flag | 3704 | * action is protected by the grant locks. |
3762 | * is set, and this action is protected by the GRANTLOCK. | ||
3763 | */ | 3705 | */ |
3764 | if ((tic = log->l_reserve_headq)) { | 3706 | spin_lock(&log->l_grant_reserve_lock); |
3765 | do { | 3707 | list_for_each_entry(tic, &log->l_reserveq, t_queue) |
3766 | sv_signal(&tic->t_wait); | 3708 | wake_up(&tic->t_wait); |
3767 | tic = tic->t_next; | 3709 | spin_unlock(&log->l_grant_reserve_lock); |
3768 | } while (tic != log->l_reserve_headq); | 3710 | |
3769 | } | 3711 | spin_lock(&log->l_grant_write_lock); |
3770 | 3712 | list_for_each_entry(tic, &log->l_writeq, t_queue) | |
3771 | if ((tic = log->l_write_headq)) { | 3713 | wake_up(&tic->t_wait); |
3772 | do { | 3714 | spin_unlock(&log->l_grant_write_lock); |
3773 | sv_signal(&tic->t_wait); | ||
3774 | tic = tic->t_next; | ||
3775 | } while (tic != log->l_write_headq); | ||
3776 | } | ||
3777 | spin_unlock(&log->l_grant_lock); | ||
3778 | 3715 | ||
3779 | if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { | 3716 | if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { |
3780 | ASSERT(!logerror); | 3717 | ASSERT(!logerror); |
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 916eb7db14d9..78c9039994af 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h | |||
@@ -189,9 +189,7 @@ void xlog_iodone(struct xfs_buf *); | |||
189 | struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); | 189 | struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); |
190 | void xfs_log_ticket_put(struct xlog_ticket *ticket); | 190 | void xfs_log_ticket_put(struct xlog_ticket *ticket); |
191 | 191 | ||
192 | xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); | 192 | void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, |
193 | |||
194 | int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, | ||
195 | struct xfs_log_vec *log_vector, | 193 | struct xfs_log_vec *log_vector, |
196 | xfs_lsn_t *commit_lsn, int flags); | 194 | xfs_lsn_t *commit_lsn, int flags); |
197 | bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); | 195 | bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); |
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 7e206fc1fa36..c7755d5a5fbe 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include "xfs_mount.h" | 29 | #include "xfs_mount.h" |
30 | #include "xfs_error.h" | 30 | #include "xfs_error.h" |
31 | #include "xfs_alloc.h" | 31 | #include "xfs_alloc.h" |
32 | #include "xfs_discard.h" | ||
32 | 33 | ||
33 | /* | 34 | /* |
34 | * Perform initial CIL structure initialisation. If the CIL is not | 35 | * Perform initial CIL structure initialisation. If the CIL is not |
@@ -61,7 +62,7 @@ xlog_cil_init( | |||
61 | INIT_LIST_HEAD(&cil->xc_committing); | 62 | INIT_LIST_HEAD(&cil->xc_committing); |
62 | spin_lock_init(&cil->xc_cil_lock); | 63 | spin_lock_init(&cil->xc_cil_lock); |
63 | init_rwsem(&cil->xc_ctx_lock); | 64 | init_rwsem(&cil->xc_ctx_lock); |
64 | sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); | 65 | init_waitqueue_head(&cil->xc_commit_wait); |
65 | 66 | ||
66 | INIT_LIST_HEAD(&ctx->committing); | 67 | INIT_LIST_HEAD(&ctx->committing); |
67 | INIT_LIST_HEAD(&ctx->busy_extents); | 68 | INIT_LIST_HEAD(&ctx->busy_extents); |
@@ -146,102 +147,6 @@ xlog_cil_init_post_recovery( | |||
146 | } | 147 | } |
147 | 148 | ||
148 | /* | 149 | /* |
149 | * Insert the log item into the CIL and calculate the difference in space | ||
150 | * consumed by the item. Add the space to the checkpoint ticket and calculate | ||
151 | * if the change requires additional log metadata. If it does, take that space | ||
152 | * as well. Remove the amount of space we addded to the checkpoint ticket from | ||
153 | * as well. Remove the amount of space we added to the checkpoint ticket from | ||
154 | * | ||
155 | * If this is the first time the item is being placed into the CIL in this | ||
156 | * context, pin it so it can't be written to disk until the CIL is flushed to | ||
157 | * the iclog and the iclog written to disk. | ||
158 | */ | ||
159 | static void | ||
160 | xlog_cil_insert( | ||
161 | struct log *log, | ||
162 | struct xlog_ticket *ticket, | ||
163 | struct xfs_log_item *item, | ||
164 | struct xfs_log_vec *lv) | ||
165 | { | ||
166 | struct xfs_cil *cil = log->l_cilp; | ||
167 | struct xfs_log_vec *old = lv->lv_item->li_lv; | ||
168 | struct xfs_cil_ctx *ctx = cil->xc_ctx; | ||
169 | int len; | ||
170 | int diff_iovecs; | ||
171 | int iclog_space; | ||
172 | |||
173 | if (old) { | ||
174 | /* existing lv on log item, space used is a delta */ | ||
175 | ASSERT(!list_empty(&item->li_cil)); | ||
176 | ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); | ||
177 | |||
178 | len = lv->lv_buf_len - old->lv_buf_len; | ||
179 | diff_iovecs = lv->lv_niovecs - old->lv_niovecs; | ||
180 | kmem_free(old->lv_buf); | ||
181 | kmem_free(old); | ||
182 | } else { | ||
183 | /* new lv, must pin the log item */ | ||
184 | ASSERT(!lv->lv_item->li_lv); | ||
185 | ASSERT(list_empty(&item->li_cil)); | ||
186 | |||
187 | len = lv->lv_buf_len; | ||
188 | diff_iovecs = lv->lv_niovecs; | ||
189 | IOP_PIN(lv->lv_item); | ||
190 | |||
191 | } | ||
192 | len += diff_iovecs * sizeof(xlog_op_header_t); | ||
193 | |||
194 | /* attach new log vector to log item */ | ||
195 | lv->lv_item->li_lv = lv; | ||
196 | |||
197 | spin_lock(&cil->xc_cil_lock); | ||
198 | list_move_tail(&item->li_cil, &cil->xc_cil); | ||
199 | ctx->nvecs += diff_iovecs; | ||
200 | |||
201 | /* | ||
202 | * If this is the first time the item is being committed to the CIL, | ||
203 | * store the sequence number on the log item so we can tell | ||
204 | * in future commits whether this is the first checkpoint the item is | ||
205 | * being committed into. | ||
206 | */ | ||
207 | if (!item->li_seq) | ||
208 | item->li_seq = ctx->sequence; | ||
209 | |||
210 | /* | ||
211 | * Now transfer enough transaction reservation to the context ticket | ||
212 | * for the checkpoint. The context ticket is special - the unit | ||
213 | * reservation has to grow as well as the current reservation as we | ||
214 | * steal from tickets so we can correctly determine the space used | ||
215 | * during the transaction commit. | ||
216 | */ | ||
217 | if (ctx->ticket->t_curr_res == 0) { | ||
218 | /* first commit in checkpoint, steal the header reservation */ | ||
219 | ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); | ||
220 | ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; | ||
221 | ticket->t_curr_res -= ctx->ticket->t_unit_res; | ||
222 | } | ||
223 | |||
224 | /* do we need space for more log record headers? */ | ||
225 | iclog_space = log->l_iclog_size - log->l_iclog_hsize; | ||
226 | if (len > 0 && (ctx->space_used / iclog_space != | ||
227 | (ctx->space_used + len) / iclog_space)) { | ||
228 | int hdrs; | ||
229 | |||
230 | hdrs = (len + iclog_space - 1) / iclog_space; | ||
231 | /* need to take into account split region headers, too */ | ||
232 | hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); | ||
233 | ctx->ticket->t_unit_res += hdrs; | ||
234 | ctx->ticket->t_curr_res += hdrs; | ||
235 | ticket->t_curr_res -= hdrs; | ||
236 | ASSERT(ticket->t_curr_res >= len); | ||
237 | } | ||
238 | ticket->t_curr_res -= len; | ||
239 | ctx->space_used += len; | ||
240 | |||
241 | spin_unlock(&cil->xc_cil_lock); | ||
242 | } | ||
243 | |||
244 | /* | ||
245 | * Format log item into flat buffers | 150 | * Format log item into flat buffers |
246 | * | 151 | * |
247 | * For delayed logging, we need to hold a formatted buffer containing all the | 152 | * For delayed logging, we need to hold a formatted buffer containing all the |
@@ -286,7 +191,7 @@ xlog_cil_format_items( | |||
286 | len += lv->lv_iovecp[index].i_len; | 191 | len += lv->lv_iovecp[index].i_len; |
287 | 192 | ||
288 | lv->lv_buf_len = len; | 193 | lv->lv_buf_len = len; |
289 | lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); | 194 | lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); |
290 | ptr = lv->lv_buf; | 195 | ptr = lv->lv_buf; |
291 | 196 | ||
292 | for (index = 0; index < lv->lv_niovecs; index++) { | 197 | for (index = 0; index < lv->lv_niovecs; index++) { |
@@ -300,21 +205,136 @@ xlog_cil_format_items( | |||
300 | } | 205 | } |
301 | } | 206 | } |
302 | 207 | ||
208 | /* | ||
209 | * Prepare the log item for insertion into the CIL. Calculate the difference in | ||
210 | * log space and vectors it will consume, and if it is a new item pin it as | ||
211 | * well. | ||
212 | */ | ||
213 | STATIC void | ||
214 | xfs_cil_prepare_item( | ||
215 | struct log *log, | ||
216 | struct xfs_log_vec *lv, | ||
217 | int *len, | ||
218 | int *diff_iovecs) | ||
219 | { | ||
220 | struct xfs_log_vec *old = lv->lv_item->li_lv; | ||
221 | |||
222 | if (old) { | ||
223 | /* existing lv on log item, space used is a delta */ | ||
224 | ASSERT(!list_empty(&lv->lv_item->li_cil)); | ||
225 | ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); | ||
226 | |||
227 | *len += lv->lv_buf_len - old->lv_buf_len; | ||
228 | *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; | ||
229 | kmem_free(old->lv_buf); | ||
230 | kmem_free(old); | ||
231 | } else { | ||
232 | /* new lv, must pin the log item */ | ||
233 | ASSERT(!lv->lv_item->li_lv); | ||
234 | ASSERT(list_empty(&lv->lv_item->li_cil)); | ||
235 | |||
236 | *len += lv->lv_buf_len; | ||
237 | *diff_iovecs += lv->lv_niovecs; | ||
238 | IOP_PIN(lv->lv_item); | ||
239 | |||
240 | } | ||
241 | |||
242 | /* attach new log vector to log item */ | ||
243 | lv->lv_item->li_lv = lv; | ||
244 | |||
245 | /* | ||
246 | * If this is the first time the item is being committed to the | ||
247 | * CIL, store the sequence number on the log item so we can | ||
248 | * tell in future commits whether this is the first checkpoint | ||
249 | * the item is being committed into. | ||
250 | */ | ||
251 | if (!lv->lv_item->li_seq) | ||
252 | lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * Insert the log items into the CIL and calculate the difference in space | ||
257 | * consumed by the item. Add the space to the checkpoint ticket and calculate | ||
258 | * if the change requires additional log metadata. If it does, take that space | ||
259 | * as well. Remove the amount of space we added to the checkpoint ticket from | ||
260 | * the current transaction ticket so that the accounting works out correctly. | ||
261 | */ | ||
303 | static void | 262 | static void |
304 | xlog_cil_insert_items( | 263 | xlog_cil_insert_items( |
305 | struct log *log, | 264 | struct log *log, |
306 | struct xfs_log_vec *log_vector, | 265 | struct xfs_log_vec *log_vector, |
307 | struct xlog_ticket *ticket, | 266 | struct xlog_ticket *ticket) |
308 | xfs_lsn_t *start_lsn) | ||
309 | { | 267 | { |
310 | struct xfs_log_vec *lv; | 268 | struct xfs_cil *cil = log->l_cilp; |
311 | 269 | struct xfs_cil_ctx *ctx = cil->xc_ctx; | |
312 | if (start_lsn) | 270 | struct xfs_log_vec *lv; |
313 | *start_lsn = log->l_cilp->xc_ctx->sequence; | 271 | int len = 0; |
272 | int diff_iovecs = 0; | ||
273 | int iclog_space; | ||
314 | 274 | ||
315 | ASSERT(log_vector); | 275 | ASSERT(log_vector); |
276 | |||
277 | /* | ||
278 | * Do all the accounting aggregation and switching of log vectors | ||
279 | * around in a separate loop to the insertion of items into the CIL. | ||
280 | * Then we can do a separate loop to update the CIL within a single | ||
281 | * lock/unlock pair. This reduces the number of round trips on the CIL | ||
282 | * lock from O(nr_logvectors) to O(1) and greatly reduces the overall | ||
283 | * hold time for the transaction commit. | ||
284 | * | ||
285 | * If this is the first time the item is being placed into the CIL in | ||
286 | * this context, pin it so it can't be written to disk until the CIL is | ||
287 | * flushed to the iclog and the iclog written to disk. | ||
288 | * | ||
289 | * We can do this safely because the context can't checkpoint until we | ||
290 | * are done so it doesn't matter exactly how we update the CIL. | ||
291 | */ | ||
316 | for (lv = log_vector; lv; lv = lv->lv_next) | 292 | for (lv = log_vector; lv; lv = lv->lv_next) |
317 | xlog_cil_insert(log, ticket, lv->lv_item, lv); | 293 | xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); |
294 | |||
295 | /* account for space used by new iovec headers */ | ||
296 | len += diff_iovecs * sizeof(xlog_op_header_t); | ||
297 | |||
298 | spin_lock(&cil->xc_cil_lock); | ||
299 | |||
300 | /* move the items to the tail of the CIL */ | ||
301 | for (lv = log_vector; lv; lv = lv->lv_next) | ||
302 | list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); | ||
303 | |||
304 | ctx->nvecs += diff_iovecs; | ||
305 | |||
306 | /* | ||
307 | * Now transfer enough transaction reservation to the context ticket | ||
308 | * for the checkpoint. The context ticket is special - the unit | ||
309 | * reservation has to grow as well as the current reservation as we | ||
310 | * steal from tickets so we can correctly determine the space used | ||
311 | * during the transaction commit. | ||
312 | */ | ||
313 | if (ctx->ticket->t_curr_res == 0) { | ||
314 | /* first commit in checkpoint, steal the header reservation */ | ||
315 | ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); | ||
316 | ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; | ||
317 | ticket->t_curr_res -= ctx->ticket->t_unit_res; | ||
318 | } | ||
319 | |||
320 | /* do we need space for more log record headers? */ | ||
321 | iclog_space = log->l_iclog_size - log->l_iclog_hsize; | ||
322 | if (len > 0 && (ctx->space_used / iclog_space != | ||
323 | (ctx->space_used + len) / iclog_space)) { | ||
324 | int hdrs; | ||
325 | |||
326 | hdrs = (len + iclog_space - 1) / iclog_space; | ||
327 | /* need to take into account split region headers, too */ | ||
328 | hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); | ||
329 | ctx->ticket->t_unit_res += hdrs; | ||
330 | ctx->ticket->t_curr_res += hdrs; | ||
331 | ticket->t_curr_res -= hdrs; | ||
332 | ASSERT(ticket->t_curr_res >= len); | ||
333 | } | ||
334 | ticket->t_curr_res -= len; | ||
335 | ctx->space_used += len; | ||
336 | |||
337 | spin_unlock(&cil->xc_cil_lock); | ||
318 | } | 338 | } |
319 | 339 | ||
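The record-header top-up at the end of xlog_cil_insert_items() is easiest to follow with numbers. A worked example under assumed sizes:

	/*
	 * Assume l_iclog_size = 32768 and l_iclog_hsize = 512, so
	 * iclog_space = 32768 - 512 = 32256 usable bytes per iclog.
	 * With ctx->space_used = 30000 and len = 5000:
	 *
	 *	30000 / 32256 == 0    but    (30000 + 5000) / 32256 == 1
	 *
	 * so this insertion crosses into a new iclog, and
	 *
	 *	hdrs = (5000 + 32256 - 1) / 32256 = 1
	 *
	 * extra header is charged: hdrs * (512 + sizeof(struct
	 * xlog_op_header)) bytes move from the committing transaction's
	 * ticket onto the checkpoint context's ticket.
	 */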
320 | static void | 340 | static void |
@@ -342,24 +362,28 @@ xlog_cil_committed( | |||
342 | int abort) | 362 | int abort) |
343 | { | 363 | { |
344 | struct xfs_cil_ctx *ctx = args; | 364 | struct xfs_cil_ctx *ctx = args; |
345 | struct xfs_log_vec *lv; | 365 | struct xfs_mount *mp = ctx->cil->xc_log->l_mp; |
346 | int abortflag = abort ? XFS_LI_ABORTED : 0; | ||
347 | struct xfs_busy_extent *busyp, *n; | ||
348 | 366 | ||
349 | /* unpin all the log items */ | 367 | xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, |
350 | for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { | 368 | ctx->start_lsn, abort); |
351 | xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, | ||
352 | abortflag); | ||
353 | } | ||
354 | 369 | ||
355 | list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) | 370 | xfs_alloc_busy_sort(&ctx->busy_extents); |
356 | xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); | 371 | xfs_alloc_busy_clear(mp, &ctx->busy_extents, |
372 | (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); | ||
357 | 373 | ||
358 | spin_lock(&ctx->cil->xc_cil_lock); | 374 | spin_lock(&ctx->cil->xc_cil_lock); |
359 | list_del(&ctx->committing); | 375 | list_del(&ctx->committing); |
360 | spin_unlock(&ctx->cil->xc_cil_lock); | 376 | spin_unlock(&ctx->cil->xc_cil_lock); |
361 | 377 | ||
362 | xlog_cil_free_logvec(ctx->lv_chain); | 378 | xlog_cil_free_logvec(ctx->lv_chain); |
379 | |||
380 | if (!list_empty(&ctx->busy_extents)) { | ||
381 | ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); | ||
382 | |||
383 | xfs_discard_extents(mp, &ctx->busy_extents); | ||
384 | xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); | ||
385 | } | ||
386 | |||
363 | kmem_free(ctx); | 387 | kmem_free(ctx); |
364 | } | 388 | } |
365 | 389 | ||
@@ -529,7 +553,7 @@ xlog_cil_push( | |||
529 | 553 | ||
530 | error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); | 554 | error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); |
531 | if (error) | 555 | if (error) |
532 | goto out_abort; | 556 | goto out_abort_free_ticket; |
533 | 557 | ||
534 | /* | 558 | /* |
535 | * now that we've written the checkpoint into the log, strictly | 559 | * now that we've written the checkpoint into the log, strictly |
@@ -549,14 +573,15 @@ restart: | |||
549 | * It is still being pushed! Wait for the push to | 573 | * It is still being pushed! Wait for the push to |
550 | * complete, then start again from the beginning. | 574 | * complete, then start again from the beginning. |
551 | */ | 575 | */ |
552 | sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); | 576 | xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); |
553 | goto restart; | 577 | goto restart; |
554 | } | 578 | } |
555 | } | 579 | } |
556 | spin_unlock(&cil->xc_cil_lock); | 580 | spin_unlock(&cil->xc_cil_lock); |
557 | 581 | ||
582 | /* xfs_log_done always frees the ticket on error. */ | ||
558 | commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); | 583 | commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); |
559 | if (error || commit_lsn == -1) | 584 | if (commit_lsn == -1) |
560 | goto out_abort; | 585 | goto out_abort; |
561 | 586 | ||
562 | /* attach all the transactions w/ busy extents to iclog */ | 587 | /* attach all the transactions w/ busy extents to iclog */ |
@@ -573,7 +598,7 @@ restart: | |||
573 | */ | 598 | */ |
574 | spin_lock(&cil->xc_cil_lock); | 599 | spin_lock(&cil->xc_cil_lock); |
575 | ctx->commit_lsn = commit_lsn; | 600 | ctx->commit_lsn = commit_lsn; |
576 | sv_broadcast(&cil->xc_commit_wait); | 601 | wake_up_all(&cil->xc_commit_wait); |
577 | spin_unlock(&cil->xc_cil_lock); | 602 | spin_unlock(&cil->xc_cil_lock); |
578 | 603 | ||
579 | /* release the hounds! */ | 604 | /* release the hounds! */ |
@@ -586,6 +611,8 @@ out_free_ticket: | |||
586 | kmem_free(new_ctx); | 611 | kmem_free(new_ctx); |
587 | return 0; | 612 | return 0; |
588 | 613 | ||
614 | out_abort_free_ticket: | ||
615 | xfs_log_ticket_put(tic); | ||
589 | out_abort: | 616 | out_abort: |
590 | xlog_cil_committed(ctx, XFS_LI_ABORTED); | 617 | xlog_cil_committed(ctx, XFS_LI_ABORTED); |
591 | return XFS_ERROR(EIO); | 618 | return XFS_ERROR(EIO); |
@@ -608,7 +635,7 @@ out_abort: | |||
608 | * background commit, returns without it held once background commits are | 635 | * background commit, returns without it held once background commits are |
609 | * allowed again. | 636 | * allowed again. |
610 | */ | 637 | */ |
611 | int | 638 | void |
612 | xfs_log_commit_cil( | 639 | xfs_log_commit_cil( |
613 | struct xfs_mount *mp, | 640 | struct xfs_mount *mp, |
614 | struct xfs_trans *tp, | 641 | struct xfs_trans *tp, |
@@ -623,11 +650,6 @@ xfs_log_commit_cil( | |||
623 | if (flags & XFS_TRANS_RELEASE_LOG_RES) | 650 | if (flags & XFS_TRANS_RELEASE_LOG_RES) |
624 | log_flags = XFS_LOG_REL_PERM_RESERV; | 651 | log_flags = XFS_LOG_REL_PERM_RESERV; |
625 | 652 | ||
626 | if (XLOG_FORCED_SHUTDOWN(log)) { | ||
627 | xlog_cil_free_logvec(log_vector); | ||
628 | return XFS_ERROR(EIO); | ||
629 | } | ||
630 | |||
631 | /* | 653 | /* |
632 | * do all the hard work of formatting items (including memory | 654 | * do all the hard work of formatting items (including memory |
633 | * allocation) outside the CIL context lock. This prevents stalling CIL | 655 | * allocation) outside the CIL context lock. This prevents stalling CIL |
@@ -638,7 +660,10 @@ xfs_log_commit_cil( | |||
638 | 660 | ||
639 | /* lock out background commit */ | 661 | /* lock out background commit */ |
640 | down_read(&log->l_cilp->xc_ctx_lock); | 662 | down_read(&log->l_cilp->xc_ctx_lock); |
641 | xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); | 663 | if (commit_lsn) |
664 | *commit_lsn = log->l_cilp->xc_ctx->sequence; | ||
665 | |||
666 | xlog_cil_insert_items(log, log_vector, tp->t_ticket); | ||
642 | 667 | ||
643 | /* check we didn't blow the reservation */ | 668 | /* check we didn't blow the reservation */ |
644 | if (tp->t_ticket->t_curr_res < 0) | 669 | if (tp->t_ticket->t_curr_res < 0) |
@@ -684,7 +709,6 @@ xfs_log_commit_cil( | |||
684 | */ | 709 | */ |
685 | if (push) | 710 | if (push) |
686 | xlog_cil_push(log, 0); | 711 | xlog_cil_push(log, 0); |
687 | return 0; | ||
688 | } | 712 | } |
689 | 713 | ||
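xfs_log_commit_cil() changes from int to void: the early XLOG_FORCED_SHUTDOWN() check is gone, so inserting a transaction into the CIL can no longer fail at this point. A shutdown now surfaces later, when the checkpoint is pushed (xlog_write() fails, or xfs_log_done() returns -1 in xlog_cil_push() above). The effect on an assumed call site, for illustration only:

	/* before: the CIL commit itself could return EIO */
	error = xfs_log_commit_cil(mp, tp, log_vector, &commit_lsn, flags);
	if (error)
		return error;

	/* after: it cannot fail here; errors are caught on the push path */
	xfs_log_commit_cil(mp, tp, log_vector, &commit_lsn, flags);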
690 | /* | 714 | /* |
@@ -735,7 +759,7 @@ restart: | |||
735 | * It is still being pushed! Wait for the push to | 759 | * It is still being pushed! Wait for the push to |
736 | * complete, then start again from the beginning. | 760 | * complete, then start again from the beginning. |
737 | */ | 761 | */ |
738 | sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); | 762 | xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); |
739 | goto restart; | 763 | goto restart; |
740 | } | 764 | } |
741 | if (ctx->sequence != sequence) | 765 | if (ctx->sequence != sequence) |
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index edcdfe01617f..2d3b6a498d63 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h | |||
@@ -21,7 +21,6 @@ | |||
21 | struct xfs_buf; | 21 | struct xfs_buf; |
22 | struct log; | 22 | struct log; |
23 | struct xlog_ticket; | 23 | struct xlog_ticket; |
24 | struct xfs_buf_cancel; | ||
25 | struct xfs_mount; | 24 | struct xfs_mount; |
26 | 25 | ||
27 | /* | 26 | /* |
@@ -54,7 +53,6 @@ struct xfs_mount; | |||
54 | BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ | 53 | BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ |
55 | XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) | 54 | XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) |
56 | 55 | ||
57 | |||
58 | static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) | 56 | static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) |
59 | { | 57 | { |
60 | return ((xfs_lsn_t)cycle << 32) | block; | 58 | return ((xfs_lsn_t)cycle << 32) | block; |
@@ -89,10 +87,6 @@ static inline uint xlog_get_client_id(__be32 i) | |||
89 | return be32_to_cpu(i) >> 24; | 87 | return be32_to_cpu(i) >> 24; |
90 | } | 88 | } |
91 | 89 | ||
92 | #define xlog_panic(args...) cmn_err(CE_PANIC, ## args) | ||
93 | #define xlog_exit(args...) cmn_err(CE_PANIC, ## args) | ||
94 | #define xlog_warn(args...) cmn_err(CE_WARN, ## args) | ||
95 | |||
96 | /* | 90 | /* |
97 | * In core log state | 91 | * In core log state |
98 | */ | 92 | */ |
@@ -133,12 +127,10 @@ static inline uint xlog_get_client_id(__be32 i) | |||
133 | */ | 127 | */ |
134 | #define XLOG_TIC_INITED 0x1 /* has been initialized */ | 128 | #define XLOG_TIC_INITED 0x1 /* has been initialized */ |
135 | #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ | 129 | #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ |
136 | #define XLOG_TIC_IN_Q 0x4 | ||
137 | 130 | ||
138 | #define XLOG_TIC_FLAGS \ | 131 | #define XLOG_TIC_FLAGS \ |
139 | { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ | 132 | { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ |
140 | { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ | 133 | { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } |
141 | { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" } | ||
142 | 134 | ||
143 | #endif /* __KERNEL__ */ | 135 | #endif /* __KERNEL__ */ |
144 | 136 | ||
@@ -152,6 +144,9 @@ static inline uint xlog_get_client_id(__be32 i) | |||
152 | #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ | 144 | #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ |
153 | #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being | 145 | #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being |
154 | shutdown */ | 146 | shutdown */ |
147 | #define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ | ||
148 | |||
149 | typedef __uint32_t xlog_tid_t; | ||
155 | 150 | ||
156 | #ifdef __KERNEL__ | 151 | #ifdef __KERNEL__ |
157 | /* | 152 | /* |
@@ -244,9 +239,8 @@ typedef struct xlog_res { | |||
244 | } xlog_res_t; | 239 | } xlog_res_t; |
245 | 240 | ||
246 | typedef struct xlog_ticket { | 241 | typedef struct xlog_ticket { |
247 | sv_t t_wait; /* ticket wait queue : 20 */ | 242 | wait_queue_head_t t_wait; /* ticket wait queue */ |
248 | struct xlog_ticket *t_next; /* :4|8 */ | 243 | struct list_head t_queue; /* reserve/write queue */ |
249 | struct xlog_ticket *t_prev; /* :4|8 */ | ||
250 | xlog_tid_t t_tid; /* transaction identifier : 4 */ | 244 | xlog_tid_t t_tid; /* transaction identifier : 4 */ |
251 | atomic_t t_ref; /* ticket reference count : 4 */ | 245 | atomic_t t_ref; /* ticket reference count : 4 */ |
252 | int t_curr_res; /* current reservation in bytes : 4 */ | 246 | int t_curr_res; /* current reservation in bytes : 4 */ |
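The ticket drops its hand-rolled circular queue (t_next/t_prev plus the XLOG_TIC_IN_Q flag) for a standard list_head and wait queue. Because t_queue is initialised with INIT_LIST_HEAD() and always removed with list_del_init(), list_empty(&tic->t_queue) doubles as the "is this ticket queued" test, which is what made the flag redundant. The idiom, sketched:

	INIT_LIST_HEAD(&tic->t_queue);		/* at ticket allocation */

	if (!list_empty(&tic->t_queue)) {	/* was: XLOG_TIC_IN_Q set */
		spin_lock(&log->l_grant_reserve_lock);
		list_del_init(&tic->t_queue);	/* leaves it self-linked */
		spin_unlock(&log->l_grant_reserve_lock);
	}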
@@ -353,8 +347,8 @@ typedef union xlog_in_core2 { | |||
353 | * and move everything else out to subsequent cachelines. | 347 | * and move everything else out to subsequent cachelines. |
354 | */ | 348 | */ |
355 | typedef struct xlog_in_core { | 349 | typedef struct xlog_in_core { |
356 | sv_t ic_force_wait; | 350 | wait_queue_head_t ic_force_wait; |
357 | sv_t ic_write_wait; | 351 | wait_queue_head_t ic_write_wait; |
358 | struct xlog_in_core *ic_next; | 352 | struct xlog_in_core *ic_next; |
359 | struct xlog_in_core *ic_prev; | 353 | struct xlog_in_core *ic_prev; |
360 | struct xfs_buf *ic_bp; | 354 | struct xfs_buf *ic_bp; |
@@ -421,7 +415,7 @@ struct xfs_cil { | |||
421 | struct xfs_cil_ctx *xc_ctx; | 415 | struct xfs_cil_ctx *xc_ctx; |
422 | struct rw_semaphore xc_ctx_lock; | 416 | struct rw_semaphore xc_ctx_lock; |
423 | struct list_head xc_committing; | 417 | struct list_head xc_committing; |
424 | sv_t xc_commit_wait; | 418 | wait_queue_head_t xc_commit_wait; |
425 | xfs_lsn_t xc_current_sequence; | 419 | xfs_lsn_t xc_current_sequence; |
426 | }; | 420 | }; |
427 | 421 | ||
@@ -491,7 +485,7 @@ typedef struct log { | |||
491 | struct xfs_buftarg *l_targ; /* buftarg of log */ | 485 | struct xfs_buftarg *l_targ; /* buftarg of log */ |
492 | uint l_flags; | 486 | uint l_flags; |
493 | uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ | 487 | uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ |
494 | struct xfs_buf_cancel **l_buf_cancel_table; | 488 | struct list_head *l_buf_cancel_table; |
495 | int l_iclog_hsize; /* size of iclog header */ | 489 | int l_iclog_hsize; /* size of iclog header */ |
496 | int l_iclog_heads; /* # of iclog header sectors */ | 490 | int l_iclog_heads; /* # of iclog header sectors */ |
497 | uint l_sectBBsize; /* sector size in BBs (2^n) */ | 491 | uint l_sectBBsize; /* sector size in BBs (2^n) */ |
@@ -503,29 +497,40 @@ typedef struct log { | |||
503 | int l_logBBsize; /* size of log in BB chunks */ | 497 | int l_logBBsize; /* size of log in BB chunks */ |
504 | 498 | ||
505 | /* The following block of fields are changed while holding icloglock */ | 499 | /* The following block of fields are changed while holding icloglock */ |
506 | sv_t l_flush_wait ____cacheline_aligned_in_smp; | 500 | wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; |
507 | /* waiting for iclog flush */ | 501 | /* waiting for iclog flush */ |
508 | int l_covered_state;/* state of "covering disk | 502 | int l_covered_state;/* state of "covering disk |
509 | * log entries" */ | 503 | * log entries" */ |
510 | xlog_in_core_t *l_iclog; /* head log queue */ | 504 | xlog_in_core_t *l_iclog; /* head log queue */ |
511 | spinlock_t l_icloglock; /* grab to change iclog state */ | 505 | spinlock_t l_icloglock; /* grab to change iclog state */ |
512 | xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed | ||
513 | * buffers */ | ||
514 | xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ | ||
515 | int l_curr_cycle; /* Cycle number of log writes */ | 506 | int l_curr_cycle; /* Cycle number of log writes */ |
516 | int l_prev_cycle; /* Cycle number before last | 507 | int l_prev_cycle; /* Cycle number before last |
517 | * block increment */ | 508 | * block increment */ |
518 | int l_curr_block; /* current logical log block */ | 509 | int l_curr_block; /* current logical log block */ |
519 | int l_prev_block; /* previous logical log block */ | 510 | int l_prev_block; /* previous logical log block */ |
520 | 511 | ||
521 | /* The following block of fields are changed while holding grant_lock */ | 512 | /* |
522 | spinlock_t l_grant_lock ____cacheline_aligned_in_smp; | 513 | * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and |
523 | xlog_ticket_t *l_reserve_headq; | 514 | * read without needing to hold specific locks. To avoid operations |
524 | xlog_ticket_t *l_write_headq; | 515 | * contending with other hot objects, place each of them on a separate |
525 | int l_grant_reserve_cycle; | 516 | * cacheline. |
526 | int l_grant_reserve_bytes; | 517 | */ |
527 | int l_grant_write_cycle; | 518 | /* lsn of last LR on disk */ |
528 | int l_grant_write_bytes; | 519 | atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; |
520 | /* lsn of 1st LR with unflushed buffers */ | ||
521 | atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; | ||
522 | |||
523 | /* | ||
524 | * ticket grant locks, queues and accounting have their own cachelines | ||
525 | * as these are quite hot and can be operated on concurrently. | ||
526 | */ | ||
527 | spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp; | ||
528 | struct list_head l_reserveq; | ||
529 | atomic64_t l_grant_reserve_head; | ||
530 | |||
531 | spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp; | ||
532 | struct list_head l_writeq; | ||
533 | atomic64_t l_grant_write_head; | ||
529 | 534 | ||
530 | /* The following field are used for debugging; need to hold icloglock */ | 535 | /* The following field are used for debugging; need to hold icloglock */ |
531 | #ifdef DEBUG | 536 | #ifdef DEBUG |
@@ -534,6 +539,9 @@ typedef struct log { | |||
534 | 539 | ||
535 | } xlog_t; | 540 | } xlog_t; |
536 | 541 | ||
542 | #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ | ||
543 | ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) | ||
544 | |||
537 | #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) | 545 | #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) |
538 | 546 | ||
539 | /* common routines */ | 547 | /* common routines */ |
@@ -562,6 +570,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector, | |||
562 | xlog_in_core_t **commit_iclog, uint flags); | 570 | xlog_in_core_t **commit_iclog, uint flags); |
563 | 571 | ||
564 | /* | 572 | /* |
573 | * When we crack an atomic LSN, we sample it first so that the value will not | ||
574 | * change while we are cracking it into the component values. This means we | ||
575 | * will always get consistent component values to work from. This should always | ||
576 | * be used to sample and crack LSNs that are stored and updated in atomic | ||
577 | * variables. | ||
578 | */ | ||
579 | static inline void | ||
580 | xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) | ||
581 | { | ||
582 | xfs_lsn_t val = atomic64_read(lsn); | ||
583 | |||
584 | *cycle = CYCLE_LSN(val); | ||
585 | *block = BLOCK_LSN(val); | ||
586 | } | ||
587 | |||
588 | /* | ||
589 | * Calculate and assign a value to an atomic LSN variable from component pieces. | ||
590 | */ | ||
591 | static inline void | ||
592 | xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) | ||
593 | { | ||
594 | atomic64_set(lsn, xlog_assign_lsn(cycle, block)); | ||
595 | } | ||
596 | |||
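The intended calling pattern is to sample once through the helper and then work only on the cracked components; a minimal sketch (the tail-LSN caller context is illustrative, not part of this patch):

	uint cycle, block;

	/* the single atomic64_read() inside the helper yields a consistent pair */
	xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
	/* cycle/block are now a coherent snapshot of the tail LSN */
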
597 | /* | ||
598 | * When we crack the grant head, we sample it first so that the value will not | ||
599 | * change while we are cracking it into the component values. This means we | ||
600 | * will always get consistent component values to work from. | ||
601 | */ | ||
602 | static inline void | ||
603 | xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) | ||
604 | { | ||
605 | *cycle = val >> 32; | ||
606 | *space = val & 0xffffffff; | ||
607 | } | ||
608 | |||
609 | static inline void | ||
610 | xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) | ||
611 | { | ||
612 | xlog_crack_grant_head_val(atomic64_read(head), cycle, space); | ||
613 | } | ||
614 | |||
615 | static inline int64_t | ||
616 | xlog_assign_grant_head_val(int cycle, int space) | ||
617 | { | ||
618 | return ((int64_t)cycle << 32) | space; | ||
619 | } | ||
620 | |||
621 | static inline void | ||
622 | xlog_assign_grant_head(atomic64_t *head, int cycle, int space) | ||
623 | { | ||
624 | atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); | ||
625 | } | ||
626 | |||
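The grant head packs the cycle count into the high 32 bits and the byte count into the low 32 bits, so a single atomic64 read observes both consistently. A standalone illustration of the encoding with hypothetical values (plain userspace C, not kernel code):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		int cycle = 5, space = 4096;	/* hypothetical values */
		int64_t head = ((int64_t)cycle << 32) | space;

		/* cracking reverses the packing */
		printf("cycle=%d space=%d\n",
		       (int)(head >> 32), (int)(head & 0xffffffff));
		return 0;	/* prints: cycle=5 space=4096 */
	}
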
627 | /* | ||
565 | * Committed Item List interfaces | 628 | * Committed Item List interfaces |
566 | */ | 629 | */ |
567 | int xlog_cil_init(struct log *log); | 630 | int xlog_cil_init(struct log *log); |
@@ -585,6 +648,21 @@ xlog_cil_force(struct log *log) | |||
585 | */ | 648 | */ |
586 | #define XLOG_UNMOUNT_REC_TYPE (-1U) | 649 | #define XLOG_UNMOUNT_REC_TYPE (-1U) |
587 | 650 | ||
651 | /* | ||
652 | * Wrapper function for waiting on a wait queue serialised against wakeups | ||
653 | * by a spinlock. This matches the semantics of all the wait queues used in the | ||
654 | * log code. | ||
655 | */ | ||
656 | static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) | ||
657 | { | ||
658 | DECLARE_WAITQUEUE(wait, current); | ||
659 | |||
660 | add_wait_queue_exclusive(wq, &wait); | ||
661 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
662 | spin_unlock(lock); | ||
663 | schedule(); | ||
664 | remove_wait_queue(wq, &wait); | ||
665 | } | ||
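Because the task is queued and marked uninterruptible before the lock is dropped, a waker holding the same spinlock cannot lose a wakeup between the caller's condition check and the sleep. A sketch of the expected pairing, where iclog_still_syncing() is a hypothetical helper standing in for the real state checks:

	/* waiter */
	spin_lock(&log->l_icloglock);
	if (iclog_still_syncing(iclog))		/* hypothetical condition */
		xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
	/* xlog_wait() returns with the lock already dropped */

	/* waker */
	spin_lock(&log->l_icloglock);
	iclog->ic_state = XLOG_STATE_DONE_SYNC;	/* illustrative state change */
	wake_up(&iclog->ic_force_wait);
	spin_unlock(&log->l_icloglock);
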
588 | #endif /* __KERNEL__ */ | 666 | #endif /* __KERNEL__ */ |
589 | 667 | ||
590 | #endif /* __XFS_LOG_PRIV_H__ */ | 668 | #endif /* __XFS_LOG_PRIV_H__ */ |
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 6f3f5fa37acf..04142caedb2b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c | |||
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *); | |||
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | /* | 55 | /* |
56 | * This structure is used during recovery to record the buf log items which | ||
57 | * have been cancelled and should not be replayed. | ||
58 | */ | ||
59 | struct xfs_buf_cancel { | ||
60 | xfs_daddr_t bc_blkno; | ||
61 | uint bc_len; | ||
62 | int bc_refcount; | ||
63 | struct list_head bc_list; | ||
64 | }; | ||
65 | |||
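The two recovery passes drive this structure in tandem: pass 1 counts each XFS_BLF_CANCEL item into the hash table, and pass 2 suppresses replay on a match, dropping a reference per cancel item and freeing the entry on the last one. Condensed from the rewritten functions below:

	/* pass 1: bump (or create) the refcount for a cancel item */
	bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
	list_for_each_entry(bcp, bucket, bc_list)
		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
			bcp->bc_refcount++;

	/* pass 2: a match suppresses replay; cancel items drop a reference */
	if (flags & XFS_BLF_CANCEL) {
		if (--bcp->bc_refcount == 0) {
			list_del(&bcp->bc_list);
			kmem_free(bcp);
		}
	}
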
66 | /* | ||
56 | * Sector aligned buffer routines for buffer create/read/write/access | 67 | * Sector aligned buffer routines for buffer create/read/write/access |
57 | */ | 68 | */ |
58 | 69 | ||
@@ -81,7 +92,7 @@ xlog_get_bp( | |||
81 | int nbblks) | 92 | int nbblks) |
82 | { | 93 | { |
83 | if (!xlog_buf_bbcount_valid(log, nbblks)) { | 94 | if (!xlog_buf_bbcount_valid(log, nbblks)) { |
84 | xlog_warn("XFS: Invalid block length (0x%x) given for buffer", | 95 | xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", |
85 | nbblks); | 96 | nbblks); |
86 | XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); | 97 | XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); |
87 | return NULL; | 98 | return NULL; |
@@ -90,7 +101,7 @@ xlog_get_bp( | |||
90 | /* | 101 | /* |
91 | * We do log I/O in units of log sectors (a power-of-2 | 102 | * We do log I/O in units of log sectors (a power-of-2 |
92 | * multiple of the basic block size), so we round up the | 103 | * multiple of the basic block size), so we round up the |
93 | * requested size to acommodate the basic blocks required | 104 | * requested size to accommodate the basic blocks required |
94 | * for complete log sectors. | 105 | * for complete log sectors. |
95 | * | 106 | * |
96 | * In addition, the buffer may be used for a non-sector- | 107 | * In addition, the buffer may be used for a non-sector- |
@@ -101,13 +112,14 @@ xlog_get_bp( | |||
101 | * an issue. Nor will this be a problem if the log I/O is | 112 | * an issue. Nor will this be a problem if the log I/O is |
102 | * done in basic blocks (sector size 1). But otherwise we | 113 | * done in basic blocks (sector size 1). But otherwise we |
103 | * extend the buffer by one extra log sector to ensure | 114 | * extend the buffer by one extra log sector to ensure |
104 | * there's space to accomodate this possiblility. | 115 | * there's space to accommodate this possibility. |
105 | */ | 116 | */ |
106 | if (nbblks > 1 && log->l_sectBBsize > 1) | 117 | if (nbblks > 1 && log->l_sectBBsize > 1) |
107 | nbblks += log->l_sectBBsize; | 118 | nbblks += log->l_sectBBsize; |
108 | nbblks = round_up(nbblks, log->l_sectBBsize); | 119 | nbblks = round_up(nbblks, log->l_sectBBsize); |
109 | 120 | ||
110 | return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); | 121 | return xfs_buf_get_uncached(log->l_mp->m_logdev_targp, |
122 | BBTOB(nbblks), 0); | ||
111 | } | 123 | } |
112 | 124 | ||
113 | STATIC void | 125 | STATIC void |
@@ -148,7 +160,7 @@ xlog_bread_noalign( | |||
148 | int error; | 160 | int error; |
149 | 161 | ||
150 | if (!xlog_buf_bbcount_valid(log, nbblks)) { | 162 | if (!xlog_buf_bbcount_valid(log, nbblks)) { |
151 | xlog_warn("XFS: Invalid block length (0x%x) given for buffer", | 163 | xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", |
152 | nbblks); | 164 | nbblks); |
153 | XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); | 165 | XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); |
154 | return EFSCORRUPTED; | 166 | return EFSCORRUPTED; |
@@ -167,7 +179,7 @@ xlog_bread_noalign( | |||
167 | XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); | 179 | XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); |
168 | 180 | ||
169 | xfsbdstrat(log->l_mp, bp); | 181 | xfsbdstrat(log->l_mp, bp); |
170 | error = xfs_iowait(bp); | 182 | error = xfs_buf_iowait(bp); |
171 | if (error) | 183 | if (error) |
172 | xfs_ioerror_alert("xlog_bread", log->l_mp, | 184 | xfs_ioerror_alert("xlog_bread", log->l_mp, |
173 | bp, XFS_BUF_ADDR(bp)); | 185 | bp, XFS_BUF_ADDR(bp)); |
@@ -193,6 +205,35 @@ xlog_bread( | |||
193 | } | 205 | } |
194 | 206 | ||
195 | /* | 207 | /* |
208 | * Read at an offset into the buffer. Returns with the buffer in its original | ||
209 | * state regardless of the result of the read. | ||
210 | */ | ||
211 | STATIC int | ||
212 | xlog_bread_offset( | ||
213 | xlog_t *log, | ||
214 | xfs_daddr_t blk_no, /* block to read from */ | ||
215 | int nbblks, /* blocks to read */ | ||
216 | xfs_buf_t *bp, | ||
217 | xfs_caddr_t offset) | ||
218 | { | ||
219 | xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); | ||
220 | int orig_len = bp->b_buffer_length; | ||
221 | int error, error2; | ||
222 | |||
223 | error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); | ||
224 | if (error) | ||
225 | return error; | ||
226 | |||
227 | error = xlog_bread_noalign(log, blk_no, nbblks, bp); | ||
228 | |||
229 | /* must reset buffer pointer even on error */ | ||
230 | error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); | ||
231 | if (error) | ||
232 | return error; | ||
233 | return error2; | ||
234 | } | ||
235 | |||
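The helper restores the original buffer pointer unconditionally, with a read failure taking precedence over a failed restore, so callers can carve split reads out of one large buffer. The rewritten hunk in xlog_write_log_records() further down uses it exactly that way:

	/* re-read one sector's worth into the tail of the existing mapping */
	offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
	error = xlog_bread_offset(log, ealign, sectbb, bp, offset);
	if (error)
		break;
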
236 | /* | ||
196 | * Write out the buffer at the given block for the given number of blocks. | 237 | * Write out the buffer at the given block for the given number of blocks. |
197 | * The buffer is kept locked across the write and is returned locked. | 238 | * The buffer is kept locked across the write and is returned locked. |
198 | * This can only be used for synchronous log writes. | 239 | * This can only be used for synchronous log writes. |
@@ -207,7 +248,7 @@ xlog_bwrite( | |||
207 | int error; | 248 | int error; |
208 | 249 | ||
209 | if (!xlog_buf_bbcount_valid(log, nbblks)) { | 250 | if (!xlog_buf_bbcount_valid(log, nbblks)) { |
210 | xlog_warn("XFS: Invalid block length (0x%x) given for buffer", | 251 | xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", |
211 | nbblks); | 252 | nbblks); |
212 | XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); | 253 | XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); |
213 | return EFSCORRUPTED; | 254 | return EFSCORRUPTED; |
@@ -242,9 +283,9 @@ xlog_header_check_dump( | |||
242 | xfs_mount_t *mp, | 283 | xfs_mount_t *mp, |
243 | xlog_rec_header_t *head) | 284 | xlog_rec_header_t *head) |
244 | { | 285 | { |
245 | cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", | 286 | xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n", |
246 | __func__, &mp->m_sb.sb_uuid, XLOG_FMT); | 287 | __func__, &mp->m_sb.sb_uuid, XLOG_FMT); |
247 | cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", | 288 | xfs_debug(mp, " log : uuid = %pU, fmt = %d\n", |
248 | &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); | 289 | &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); |
249 | } | 290 | } |
250 | #else | 291 | #else |
@@ -267,15 +308,15 @@ xlog_header_check_recover( | |||
267 | * a dirty log created in IRIX. | 308 | * a dirty log created in IRIX. |
268 | */ | 309 | */ |
269 | if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { | 310 | if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { |
270 | xlog_warn( | 311 | xfs_warn(mp, |
271 | "XFS: dirty log written in incompatible format - can't recover"); | 312 | "dirty log written in incompatible format - can't recover"); |
272 | xlog_header_check_dump(mp, head); | 313 | xlog_header_check_dump(mp, head); |
273 | XFS_ERROR_REPORT("xlog_header_check_recover(1)", | 314 | XFS_ERROR_REPORT("xlog_header_check_recover(1)", |
274 | XFS_ERRLEVEL_HIGH, mp); | 315 | XFS_ERRLEVEL_HIGH, mp); |
275 | return XFS_ERROR(EFSCORRUPTED); | 316 | return XFS_ERROR(EFSCORRUPTED); |
276 | } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { | 317 | } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { |
277 | xlog_warn( | 318 | xfs_warn(mp, |
278 | "XFS: dirty log entry has mismatched uuid - can't recover"); | 319 | "dirty log entry has mismatched uuid - can't recover"); |
279 | xlog_header_check_dump(mp, head); | 320 | xlog_header_check_dump(mp, head); |
280 | XFS_ERROR_REPORT("xlog_header_check_recover(2)", | 321 | XFS_ERROR_REPORT("xlog_header_check_recover(2)", |
281 | XFS_ERRLEVEL_HIGH, mp); | 322 | XFS_ERRLEVEL_HIGH, mp); |
@@ -300,9 +341,9 @@ xlog_header_check_mount( | |||
300 | * h_fs_uuid is nil, we assume this log was last mounted | 341 | * h_fs_uuid is nil, we assume this log was last mounted |
301 | * by IRIX and continue. | 342 | * by IRIX and continue. |
302 | */ | 343 | */ |
303 | xlog_warn("XFS: nil uuid in log - IRIX style log"); | 344 | xfs_warn(mp, "nil uuid in log - IRIX style log"); |
304 | } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { | 345 | } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { |
305 | xlog_warn("XFS: log has mismatched uuid - can't recover"); | 346 | xfs_warn(mp, "log has mismatched uuid - can't recover"); |
306 | xlog_header_check_dump(mp, head); | 347 | xlog_header_check_dump(mp, head); |
307 | XFS_ERROR_REPORT("xlog_header_check_mount", | 348 | XFS_ERROR_REPORT("xlog_header_check_mount", |
308 | XFS_ERRLEVEL_HIGH, mp); | 349 | XFS_ERRLEVEL_HIGH, mp); |
@@ -321,12 +362,13 @@ xlog_recover_iodone( | |||
321 | * this during recovery. One strike! | 362 | * this during recovery. One strike! |
322 | */ | 363 | */ |
323 | xfs_ioerror_alert("xlog_recover_iodone", | 364 | xfs_ioerror_alert("xlog_recover_iodone", |
324 | bp->b_mount, bp, XFS_BUF_ADDR(bp)); | 365 | bp->b_target->bt_mount, bp, |
325 | xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); | 366 | XFS_BUF_ADDR(bp)); |
367 | xfs_force_shutdown(bp->b_target->bt_mount, | ||
368 | SHUTDOWN_META_IO_ERROR); | ||
326 | } | 369 | } |
327 | bp->b_mount = NULL; | ||
328 | XFS_BUF_CLR_IODONE_FUNC(bp); | 370 | XFS_BUF_CLR_IODONE_FUNC(bp); |
329 | xfs_biodone(bp); | 371 | xfs_buf_ioend(bp, 0); |
330 | } | 372 | } |
331 | 373 | ||
332 | /* | 374 | /* |
@@ -477,8 +519,8 @@ xlog_find_verify_log_record( | |||
477 | for (i = (*last_blk) - 1; i >= 0; i--) { | 519 | for (i = (*last_blk) - 1; i >= 0; i--) { |
478 | if (i < start_blk) { | 520 | if (i < start_blk) { |
479 | /* valid log record not found */ | 521 | /* valid log record not found */ |
480 | xlog_warn( | 522 | xfs_warn(log->l_mp, |
481 | "XFS: Log inconsistent (didn't find previous header)"); | 523 | "Log inconsistent (didn't find previous header)"); |
482 | ASSERT(0); | 524 | ASSERT(0); |
483 | error = XFS_ERROR(EIO); | 525 | error = XFS_ERROR(EIO); |
484 | goto out; | 526 | goto out; |
@@ -578,12 +620,12 @@ xlog_find_head( | |||
578 | * mkfs etc write a dummy unmount record to a fresh | 620 | * mkfs etc write a dummy unmount record to a fresh |
579 | * log so we can store the uuid in there | 621 | * log so we can store the uuid in there |
580 | */ | 622 | */ |
581 | xlog_warn("XFS: totally zeroed log"); | 623 | xfs_warn(log->l_mp, "totally zeroed log"); |
582 | } | 624 | } |
583 | 625 | ||
584 | return 0; | 626 | return 0; |
585 | } else if (error) { | 627 | } else if (error) { |
586 | xlog_warn("XFS: empty log check failed"); | 628 | xfs_warn(log->l_mp, "empty log check failed"); |
587 | return error; | 629 | return error; |
588 | } | 630 | } |
589 | 631 | ||
@@ -806,7 +848,7 @@ validate_head: | |||
806 | xlog_put_bp(bp); | 848 | xlog_put_bp(bp); |
807 | 849 | ||
808 | if (error) | 850 | if (error) |
809 | xlog_warn("XFS: failed to find log head"); | 851 | xfs_warn(log->l_mp, "failed to find log head"); |
810 | return error; | 852 | return error; |
811 | } | 853 | } |
812 | 854 | ||
@@ -899,7 +941,7 @@ xlog_find_tail( | |||
899 | } | 941 | } |
900 | } | 942 | } |
901 | if (!found) { | 943 | if (!found) { |
902 | xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); | 944 | xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); |
903 | ASSERT(0); | 945 | ASSERT(0); |
904 | return XFS_ERROR(EIO); | 946 | return XFS_ERROR(EIO); |
905 | } | 947 | } |
@@ -923,12 +965,12 @@ xlog_find_tail( | |||
923 | log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); | 965 | log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); |
924 | if (found == 2) | 966 | if (found == 2) |
925 | log->l_curr_cycle++; | 967 | log->l_curr_cycle++; |
926 | log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); | 968 | atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); |
927 | log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); | 969 | atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); |
928 | log->l_grant_reserve_cycle = log->l_curr_cycle; | 970 | xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, |
929 | log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); | 971 | BBTOB(log->l_curr_block)); |
930 | log->l_grant_write_cycle = log->l_curr_cycle; | 972 | xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, |
931 | log->l_grant_write_bytes = BBTOB(log->l_curr_block); | 973 | BBTOB(log->l_curr_block)); |
932 | 974 | ||
933 | /* | 975 | /* |
934 | * Look for unmount record. If we find it, then we know there | 976 | * Look for unmount record. If we find it, then we know there |
@@ -958,7 +1000,7 @@ xlog_find_tail( | |||
958 | } | 1000 | } |
959 | after_umount_blk = (i + hblks + (int) | 1001 | after_umount_blk = (i + hblks + (int) |
960 | BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; | 1002 | BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; |
961 | tail_lsn = log->l_tail_lsn; | 1003 | tail_lsn = atomic64_read(&log->l_tail_lsn); |
962 | if (*head_blk == after_umount_blk && | 1004 | if (*head_blk == after_umount_blk && |
963 | be32_to_cpu(rhead->h_num_logops) == 1) { | 1005 | be32_to_cpu(rhead->h_num_logops) == 1) { |
964 | umount_data_blk = (i + hblks) % log->l_logBBsize; | 1006 | umount_data_blk = (i + hblks) % log->l_logBBsize; |
@@ -973,12 +1015,10 @@ xlog_find_tail( | |||
973 | * log records will point recovery to after the | 1015 | * log records will point recovery to after the |
974 | * current unmount record. | 1016 | * current unmount record. |
975 | */ | 1017 | */ |
976 | log->l_tail_lsn = | 1018 | xlog_assign_atomic_lsn(&log->l_tail_lsn, |
977 | xlog_assign_lsn(log->l_curr_cycle, | 1019 | log->l_curr_cycle, after_umount_blk); |
978 | after_umount_blk); | 1020 | xlog_assign_atomic_lsn(&log->l_last_sync_lsn, |
979 | log->l_last_sync_lsn = | 1021 | log->l_curr_cycle, after_umount_blk); |
980 | xlog_assign_lsn(log->l_curr_cycle, | ||
981 | after_umount_blk); | ||
982 | *tail_blk = after_umount_blk; | 1022 | *tail_blk = after_umount_blk; |
983 | 1023 | ||
984 | /* | 1024 | /* |
@@ -1017,7 +1057,7 @@ done: | |||
1017 | xlog_put_bp(bp); | 1057 | xlog_put_bp(bp); |
1018 | 1058 | ||
1019 | if (error) | 1059 | if (error) |
1020 | xlog_warn("XFS: failed to locate log tail"); | 1060 | xfs_warn(log->l_mp, "failed to locate log tail"); |
1021 | return error; | 1061 | return error; |
1022 | } | 1062 | } |
1023 | 1063 | ||
@@ -1081,7 +1121,8 @@ xlog_find_zeroed( | |||
1081 | * the first block must be 1. If it's not, maybe we're | 1121 | * the first block must be 1. If it's not, maybe we're |
1082 | * not looking at a log... Bail out. | 1122 | * not looking at a log... Bail out. |
1083 | */ | 1123 | */ |
1084 | xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); | 1124 | xfs_warn(log->l_mp, |
1125 | "Log inconsistent or not a log (last==0, first!=1)"); | ||
1085 | return XFS_ERROR(EINVAL); | 1126 | return XFS_ERROR(EINVAL); |
1086 | } | 1127 | } |
1087 | 1128 | ||
@@ -1217,20 +1258,12 @@ xlog_write_log_records( | |||
1217 | */ | 1258 | */ |
1218 | ealign = round_down(end_block, sectbb); | 1259 | ealign = round_down(end_block, sectbb); |
1219 | if (j == 0 && (start_block + endcount > ealign)) { | 1260 | if (j == 0 && (start_block + endcount > ealign)) { |
1220 | offset = XFS_BUF_PTR(bp); | 1261 | offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); |
1221 | balign = BBTOB(ealign - start_block); | 1262 | error = xlog_bread_offset(log, ealign, sectbb, |
1222 | error = XFS_BUF_SET_PTR(bp, offset + balign, | 1263 | bp, offset); |
1223 | BBTOB(sectbb)); | ||
1224 | if (error) | ||
1225 | break; | ||
1226 | |||
1227 | error = xlog_bread_noalign(log, ealign, sectbb, bp); | ||
1228 | if (error) | 1264 | if (error) |
1229 | break; | 1265 | break; |
1230 | 1266 | ||
1231 | error = XFS_BUF_SET_PTR(bp, offset, bufblks); | ||
1232 | if (error) | ||
1233 | break; | ||
1234 | } | 1267 | } |
1235 | 1268 | ||
1236 | offset = xlog_align(log, start_block, endcount, bp); | 1269 | offset = xlog_align(log, start_block, endcount, bp); |
@@ -1495,8 +1528,8 @@ xlog_recover_add_to_trans( | |||
1495 | if (list_empty(&trans->r_itemq)) { | 1528 | if (list_empty(&trans->r_itemq)) { |
1496 | /* we need to catch log corruptions here */ | 1529 | /* we need to catch log corruptions here */ |
1497 | if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { | 1530 | if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { |
1498 | xlog_warn("XFS: xlog_recover_add_to_trans: " | 1531 | xfs_warn(log->l_mp, "%s: bad header magic number", |
1499 | "bad header magic number"); | 1532 | __func__); |
1500 | ASSERT(0); | 1533 | ASSERT(0); |
1501 | return XFS_ERROR(EIO); | 1534 | return XFS_ERROR(EIO); |
1502 | } | 1535 | } |
@@ -1523,8 +1556,8 @@ xlog_recover_add_to_trans( | |||
1523 | if (item->ri_total == 0) { /* first region to be added */ | 1556 | if (item->ri_total == 0) { /* first region to be added */ |
1524 | if (in_f->ilf_size == 0 || | 1557 | if (in_f->ilf_size == 0 || |
1525 | in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { | 1558 | in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { |
1526 | xlog_warn( | 1559 | xfs_warn(log->l_mp, |
1527 | "XFS: bad number of regions (%d) in inode log format", | 1560 | "bad number of regions (%d) in inode log format", |
1528 | in_f->ilf_size); | 1561 | in_f->ilf_size); |
1529 | ASSERT(0); | 1562 | ASSERT(0); |
1530 | return XFS_ERROR(EIO); | 1563 | return XFS_ERROR(EIO); |
@@ -1581,8 +1614,9 @@ xlog_recover_reorder_trans( | |||
1581 | list_move_tail(&item->ri_list, &trans->r_itemq); | 1614 | list_move_tail(&item->ri_list, &trans->r_itemq); |
1582 | break; | 1615 | break; |
1583 | default: | 1616 | default: |
1584 | xlog_warn( | 1617 | xfs_warn(log->l_mp, |
1585 | "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); | 1618 | "%s: unrecognized type of log operation", |
1619 | __func__); | ||
1586 | ASSERT(0); | 1620 | ASSERT(0); |
1587 | return XFS_ERROR(EIO); | 1621 | return XFS_ERROR(EIO); |
1588 | } | 1622 | } |
@@ -1603,82 +1637,45 @@ xlog_recover_reorder_trans( | |||
1603 | * record in the table to tell us how many times we expect to see this | 1637 | * record in the table to tell us how many times we expect to see this |
1604 | * record during the second pass. | 1638 | * record during the second pass. |
1605 | */ | 1639 | */ |
1606 | STATIC void | 1640 | STATIC int |
1607 | xlog_recover_do_buffer_pass1( | 1641 | xlog_recover_buffer_pass1( |
1608 | xlog_t *log, | 1642 | struct log *log, |
1609 | xfs_buf_log_format_t *buf_f) | 1643 | xlog_recover_item_t *item) |
1610 | { | 1644 | { |
1611 | xfs_buf_cancel_t *bcp; | 1645 | xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; |
1612 | xfs_buf_cancel_t *nextp; | 1646 | struct list_head *bucket; |
1613 | xfs_buf_cancel_t *prevp; | 1647 | struct xfs_buf_cancel *bcp; |
1614 | xfs_buf_cancel_t **bucket; | ||
1615 | xfs_daddr_t blkno = 0; | ||
1616 | uint len = 0; | ||
1617 | ushort flags = 0; | ||
1618 | |||
1619 | switch (buf_f->blf_type) { | ||
1620 | case XFS_LI_BUF: | ||
1621 | blkno = buf_f->blf_blkno; | ||
1622 | len = buf_f->blf_len; | ||
1623 | flags = buf_f->blf_flags; | ||
1624 | break; | ||
1625 | } | ||
1626 | 1648 | ||
1627 | /* | 1649 | /* |
1628 | * If this isn't a cancel buffer item, then just return. | 1650 | * If this isn't a cancel buffer item, then just return. |
1629 | */ | 1651 | */ |
1630 | if (!(flags & XFS_BLF_CANCEL)) { | 1652 | if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { |
1631 | trace_xfs_log_recover_buf_not_cancel(log, buf_f); | 1653 | trace_xfs_log_recover_buf_not_cancel(log, buf_f); |
1632 | return; | 1654 | return 0; |
1633 | } | ||
1634 | |||
1635 | /* | ||
1636 | * Insert an xfs_buf_cancel record into the hash table of | ||
1637 | * them. If there is already an identical record, bump | ||
1638 | * its reference count. | ||
1639 | */ | ||
1640 | bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % | ||
1641 | XLOG_BC_TABLE_SIZE]; | ||
1642 | /* | ||
1643 | * If the hash bucket is empty then just insert a new record into | ||
1644 | * the bucket. | ||
1645 | */ | ||
1646 | if (*bucket == NULL) { | ||
1647 | bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), | ||
1648 | KM_SLEEP); | ||
1649 | bcp->bc_blkno = blkno; | ||
1650 | bcp->bc_len = len; | ||
1651 | bcp->bc_refcount = 1; | ||
1652 | bcp->bc_next = NULL; | ||
1653 | *bucket = bcp; | ||
1654 | return; | ||
1655 | } | 1655 | } |
1656 | 1656 | ||
1657 | /* | 1657 | /* |
1658 | * The hash bucket is not empty, so search for duplicates of our | 1658 | * Insert an xfs_buf_cancel record into the hash table of cancelled buffers. |
1659 | * record. If we find one them just bump its refcount. If not | 1659 | * If there is already an identical record, bump its reference count. |
1660 | * then add us at the end of the list. | ||
1661 | */ | 1660 | */ |
1662 | prevp = NULL; | 1661 | bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); |
1663 | nextp = *bucket; | 1662 | list_for_each_entry(bcp, bucket, bc_list) { |
1664 | while (nextp != NULL) { | 1663 | if (bcp->bc_blkno == buf_f->blf_blkno && |
1665 | if (nextp->bc_blkno == blkno && nextp->bc_len == len) { | 1664 | bcp->bc_len == buf_f->blf_len) { |
1666 | nextp->bc_refcount++; | 1665 | bcp->bc_refcount++; |
1667 | trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); | 1666 | trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); |
1668 | return; | 1667 | return 0; |
1669 | } | 1668 | } |
1670 | prevp = nextp; | 1669 | } |
1671 | nextp = nextp->bc_next; | 1670 | |
1672 | } | 1671 | bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); |
1673 | ASSERT(prevp != NULL); | 1672 | bcp->bc_blkno = buf_f->blf_blkno; |
1674 | bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), | 1673 | bcp->bc_len = buf_f->blf_len; |
1675 | KM_SLEEP); | ||
1676 | bcp->bc_blkno = blkno; | ||
1677 | bcp->bc_len = len; | ||
1678 | bcp->bc_refcount = 1; | 1674 | bcp->bc_refcount = 1; |
1679 | bcp->bc_next = NULL; | 1675 | list_add_tail(&bcp->bc_list, bucket); |
1680 | prevp->bc_next = bcp; | 1676 | |
1681 | trace_xfs_log_recover_buf_cancel_add(log, buf_f); | 1677 | trace_xfs_log_recover_buf_cancel_add(log, buf_f); |
1678 | return 0; | ||
1682 | } | 1679 | } |
1683 | 1680 | ||
1684 | /* | 1681 | /* |
@@ -1696,14 +1693,13 @@ xlog_recover_do_buffer_pass1( | |||
1696 | */ | 1693 | */ |
1697 | STATIC int | 1694 | STATIC int |
1698 | xlog_check_buffer_cancelled( | 1695 | xlog_check_buffer_cancelled( |
1699 | xlog_t *log, | 1696 | struct log *log, |
1700 | xfs_daddr_t blkno, | 1697 | xfs_daddr_t blkno, |
1701 | uint len, | 1698 | uint len, |
1702 | ushort flags) | 1699 | ushort flags) |
1703 | { | 1700 | { |
1704 | xfs_buf_cancel_t *bcp; | 1701 | struct list_head *bucket; |
1705 | xfs_buf_cancel_t *prevp; | 1702 | struct xfs_buf_cancel *bcp; |
1706 | xfs_buf_cancel_t **bucket; | ||
1707 | 1703 | ||
1708 | if (log->l_buf_cancel_table == NULL) { | 1704 | if (log->l_buf_cancel_table == NULL) { |
1709 | /* | 1705 | /* |
@@ -1714,128 +1710,70 @@ xlog_check_buffer_cancelled( | |||
1714 | return 0; | 1710 | return 0; |
1715 | } | 1711 | } |
1716 | 1712 | ||
1717 | bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % | ||
1718 | XLOG_BC_TABLE_SIZE]; | ||
1719 | bcp = *bucket; | ||
1720 | if (bcp == NULL) { | ||
1721 | /* | ||
1722 | * There is no corresponding entry in the table built | ||
1723 | * in pass one, so this buffer has not been cancelled. | ||
1724 | */ | ||
1725 | ASSERT(!(flags & XFS_BLF_CANCEL)); | ||
1726 | return 0; | ||
1727 | } | ||
1728 | |||
1729 | /* | 1713 | /* |
1730 | * Search for an entry in the buffer cancel table that | 1714 | * Search for an entry in the cancel table that matches our buffer. |
1731 | * matches our buffer. | ||
1732 | */ | 1715 | */ |
1733 | prevp = NULL; | 1716 | bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); |
1734 | while (bcp != NULL) { | 1717 | list_for_each_entry(bcp, bucket, bc_list) { |
1735 | if (bcp->bc_blkno == blkno && bcp->bc_len == len) { | 1718 | if (bcp->bc_blkno == blkno && bcp->bc_len == len) |
1736 | /* | 1719 | goto found; |
1737 | * We've go a match, so return 1 so that the | ||
1738 | * recovery of this buffer is cancelled. | ||
1739 | * If this buffer is actually a buffer cancel | ||
1740 | * log item, then decrement the refcount on the | ||
1741 | * one in the table and remove it if this is the | ||
1742 | * last reference. | ||
1743 | */ | ||
1744 | if (flags & XFS_BLF_CANCEL) { | ||
1745 | bcp->bc_refcount--; | ||
1746 | if (bcp->bc_refcount == 0) { | ||
1747 | if (prevp == NULL) { | ||
1748 | *bucket = bcp->bc_next; | ||
1749 | } else { | ||
1750 | prevp->bc_next = bcp->bc_next; | ||
1751 | } | ||
1752 | kmem_free(bcp); | ||
1753 | } | ||
1754 | } | ||
1755 | return 1; | ||
1756 | } | ||
1757 | prevp = bcp; | ||
1758 | bcp = bcp->bc_next; | ||
1759 | } | 1720 | } |
1721 | |||
1760 | /* | 1722 | /* |
1761 | * We didn't find a corresponding entry in the table, so | 1723 | * We didn't find a corresponding entry in the table, so return 0 so |
1762 | * return 0 so that the buffer is NOT cancelled. | 1724 | * that the buffer is NOT cancelled. |
1763 | */ | 1725 | */ |
1764 | ASSERT(!(flags & XFS_BLF_CANCEL)); | 1726 | ASSERT(!(flags & XFS_BLF_CANCEL)); |
1765 | return 0; | 1727 | return 0; |
1766 | } | ||
1767 | 1728 | ||
1768 | STATIC int | 1729 | found: |
1769 | xlog_recover_do_buffer_pass2( | 1730 | /* |
1770 | xlog_t *log, | 1731 | * We've go a match, so return 1 so that the recovery of this buffer |
1771 | xfs_buf_log_format_t *buf_f) | 1732 | * is cancelled. If this buffer is actually a buffer cancel log |
1772 | { | 1733 | * item, then decrement the refcount on the one in the table and |
1773 | xfs_daddr_t blkno = 0; | 1734 | * remove it if this is the last reference. |
1774 | ushort flags = 0; | 1735 | */ |
1775 | uint len = 0; | 1736 | if (flags & XFS_BLF_CANCEL) { |
1776 | 1737 | if (--bcp->bc_refcount == 0) { | |
1777 | switch (buf_f->blf_type) { | 1738 | list_del(&bcp->bc_list); |
1778 | case XFS_LI_BUF: | 1739 | kmem_free(bcp); |
1779 | blkno = buf_f->blf_blkno; | 1740 | } |
1780 | flags = buf_f->blf_flags; | ||
1781 | len = buf_f->blf_len; | ||
1782 | break; | ||
1783 | } | 1741 | } |
1784 | 1742 | return 1; | |
1785 | return xlog_check_buffer_cancelled(log, blkno, len, flags); | ||
1786 | } | 1743 | } |
1787 | 1744 | ||
1788 | /* | 1745 | /* |
1789 | * Perform recovery for a buffer full of inodes. In these buffers, | 1746 | * Perform recovery for a buffer full of inodes. In these buffers, the only |
1790 | * the only data which should be recovered is that which corresponds | 1747 | * data which should be recovered is that which corresponds to the |
1791 | * to the di_next_unlinked pointers in the on disk inode structures. | 1748 | * di_next_unlinked pointers in the on disk inode structures. The rest of the |
1792 | * The rest of the data for the inodes is always logged through the | 1749 | * data for the inodes is always logged through the inodes themselves rather |
1793 | * inodes themselves rather than the inode buffer and is recovered | 1750 | * than the inode buffer and is recovered in xlog_recover_inode_pass2(). |
1794 | * in xlog_recover_do_inode_trans(). | ||
1795 | * | 1751 | * |
1796 | * The only time when buffers full of inodes are fully recovered is | 1752 | * The only time when buffers full of inodes are fully recovered is when the |
1797 | * when the buffer is full of newly allocated inodes. In this case | 1753 | * buffer is full of newly allocated inodes. In this case the buffer will |
1798 | * the buffer will not be marked as an inode buffer and so will be | 1754 | * not be marked as an inode buffer and so will be sent to |
1799 | * sent to xlog_recover_do_reg_buffer() below during recovery. | 1755 | * xlog_recover_do_reg_buffer() below during recovery. |
1800 | */ | 1756 | */ |
1801 | STATIC int | 1757 | STATIC int |
1802 | xlog_recover_do_inode_buffer( | 1758 | xlog_recover_do_inode_buffer( |
1803 | xfs_mount_t *mp, | 1759 | struct xfs_mount *mp, |
1804 | xlog_recover_item_t *item, | 1760 | xlog_recover_item_t *item, |
1805 | xfs_buf_t *bp, | 1761 | struct xfs_buf *bp, |
1806 | xfs_buf_log_format_t *buf_f) | 1762 | xfs_buf_log_format_t *buf_f) |
1807 | { | 1763 | { |
1808 | int i; | 1764 | int i; |
1809 | int item_index; | 1765 | int item_index = 0; |
1810 | int bit; | 1766 | int bit = 0; |
1811 | int nbits; | 1767 | int nbits = 0; |
1812 | int reg_buf_offset; | 1768 | int reg_buf_offset = 0; |
1813 | int reg_buf_bytes; | 1769 | int reg_buf_bytes = 0; |
1814 | int next_unlinked_offset; | 1770 | int next_unlinked_offset; |
1815 | int inodes_per_buf; | 1771 | int inodes_per_buf; |
1816 | xfs_agino_t *logged_nextp; | 1772 | xfs_agino_t *logged_nextp; |
1817 | xfs_agino_t *buffer_nextp; | 1773 | xfs_agino_t *buffer_nextp; |
1818 | unsigned int *data_map = NULL; | ||
1819 | unsigned int map_size = 0; | ||
1820 | 1774 | ||
1821 | trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); | 1775 | trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); |
1822 | 1776 | ||
1823 | switch (buf_f->blf_type) { | ||
1824 | case XFS_LI_BUF: | ||
1825 | data_map = buf_f->blf_data_map; | ||
1826 | map_size = buf_f->blf_map_size; | ||
1827 | break; | ||
1828 | } | ||
1829 | /* | ||
1830 | * Set the variables corresponding to the current region to | ||
1831 | * 0 so that we'll initialize them on the first pass through | ||
1832 | * the loop. | ||
1833 | */ | ||
1834 | reg_buf_offset = 0; | ||
1835 | reg_buf_bytes = 0; | ||
1836 | bit = 0; | ||
1837 | nbits = 0; | ||
1838 | item_index = 0; | ||
1839 | inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; | 1777 | inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; |
1840 | for (i = 0; i < inodes_per_buf; i++) { | 1778 | for (i = 0; i < inodes_per_buf; i++) { |
1841 | next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + | 1779 | next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + |
@@ -1850,18 +1788,18 @@ xlog_recover_do_inode_buffer( | |||
1850 | * the current di_next_unlinked field. | 1788 | * the current di_next_unlinked field. |
1851 | */ | 1789 | */ |
1852 | bit += nbits; | 1790 | bit += nbits; |
1853 | bit = xfs_next_bit(data_map, map_size, bit); | 1791 | bit = xfs_next_bit(buf_f->blf_data_map, |
1792 | buf_f->blf_map_size, bit); | ||
1854 | 1793 | ||
1855 | /* | 1794 | /* |
1856 | * If there are no more logged regions in the | 1795 | * If there are no more logged regions in the |
1857 | * buffer, then we're done. | 1796 | * buffer, then we're done. |
1858 | */ | 1797 | */ |
1859 | if (bit == -1) { | 1798 | if (bit == -1) |
1860 | return 0; | 1799 | return 0; |
1861 | } | ||
1862 | 1800 | ||
1863 | nbits = xfs_contig_bits(data_map, map_size, | 1801 | nbits = xfs_contig_bits(buf_f->blf_data_map, |
1864 | bit); | 1802 | buf_f->blf_map_size, bit); |
1865 | ASSERT(nbits > 0); | 1803 | ASSERT(nbits > 0); |
1866 | reg_buf_offset = bit << XFS_BLF_SHIFT; | 1804 | reg_buf_offset = bit << XFS_BLF_SHIFT; |
1867 | reg_buf_bytes = nbits << XFS_BLF_SHIFT; | 1805 | reg_buf_bytes = nbits << XFS_BLF_SHIFT; |
@@ -1873,9 +1811,8 @@ xlog_recover_do_inode_buffer( | |||
1873 | * di_next_unlinked field, then move on to the next | 1811 | * di_next_unlinked field, then move on to the next |
1874 | * di_next_unlinked field. | 1812 | * di_next_unlinked field. |
1875 | */ | 1813 | */ |
1876 | if (next_unlinked_offset < reg_buf_offset) { | 1814 | if (next_unlinked_offset < reg_buf_offset) |
1877 | continue; | 1815 | continue; |
1878 | } | ||
1879 | 1816 | ||
1880 | ASSERT(item->ri_buf[item_index].i_addr != NULL); | 1817 | ASSERT(item->ri_buf[item_index].i_addr != NULL); |
1881 | ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); | 1818 | ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); |
@@ -1889,8 +1826,9 @@ xlog_recover_do_inode_buffer( | |||
1889 | logged_nextp = item->ri_buf[item_index].i_addr + | 1826 | logged_nextp = item->ri_buf[item_index].i_addr + |
1890 | next_unlinked_offset - reg_buf_offset; | 1827 | next_unlinked_offset - reg_buf_offset; |
1891 | if (unlikely(*logged_nextp == 0)) { | 1828 | if (unlikely(*logged_nextp == 0)) { |
1892 | xfs_fs_cmn_err(CE_ALERT, mp, | 1829 | xfs_alert(mp, |
1893 | "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", | 1830 | "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " |
1831 | "Trying to replay bad (0) inode di_next_unlinked field.", | ||
1894 | item, bp); | 1832 | item, bp); |
1895 | XFS_ERROR_REPORT("xlog_recover_do_inode_buf", | 1833 | XFS_ERROR_REPORT("xlog_recover_do_inode_buf", |
1896 | XFS_ERRLEVEL_LOW, mp); | 1834 | XFS_ERRLEVEL_LOW, mp); |
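Each iteration locates the on-disk di_next_unlinked field for inode slot i and copies in only the logged value; nothing else in the buffer is touched. A condensed sketch of the replay step, reconstructed from the surrounding context (the offsetof form and the xfs_buf_offset() call are assumptions about the elided lines):

	next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
			       offsetof(xfs_dinode_t, di_next_unlinked);
	buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset);
	*buffer_nextp = *logged_nextp;	/* replay just the unlinked pointer */
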
@@ -1911,36 +1849,29 @@ xlog_recover_do_inode_buffer( | |||
1911 | * given buffer. The bitmap in the buf log format structure indicates | 1849 | * given buffer. The bitmap in the buf log format structure indicates |
1912 | * where to place the logged data. | 1850 | * where to place the logged data. |
1913 | */ | 1851 | */ |
1914 | /*ARGSUSED*/ | ||
1915 | STATIC void | 1852 | STATIC void |
1916 | xlog_recover_do_reg_buffer( | 1853 | xlog_recover_do_reg_buffer( |
1917 | struct xfs_mount *mp, | 1854 | struct xfs_mount *mp, |
1918 | xlog_recover_item_t *item, | 1855 | xlog_recover_item_t *item, |
1919 | xfs_buf_t *bp, | 1856 | struct xfs_buf *bp, |
1920 | xfs_buf_log_format_t *buf_f) | 1857 | xfs_buf_log_format_t *buf_f) |
1921 | { | 1858 | { |
1922 | int i; | 1859 | int i; |
1923 | int bit; | 1860 | int bit; |
1924 | int nbits; | 1861 | int nbits; |
1925 | unsigned int *data_map = NULL; | ||
1926 | unsigned int map_size = 0; | ||
1927 | int error; | 1862 | int error; |
1928 | 1863 | ||
1929 | trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); | 1864 | trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); |
1930 | 1865 | ||
1931 | switch (buf_f->blf_type) { | ||
1932 | case XFS_LI_BUF: | ||
1933 | data_map = buf_f->blf_data_map; | ||
1934 | map_size = buf_f->blf_map_size; | ||
1935 | break; | ||
1936 | } | ||
1937 | bit = 0; | 1866 | bit = 0; |
1938 | i = 1; /* 0 is the buf format structure */ | 1867 | i = 1; /* 0 is the buf format structure */ |
1939 | while (1) { | 1868 | while (1) { |
1940 | bit = xfs_next_bit(data_map, map_size, bit); | 1869 | bit = xfs_next_bit(buf_f->blf_data_map, |
1870 | buf_f->blf_map_size, bit); | ||
1941 | if (bit == -1) | 1871 | if (bit == -1) |
1942 | break; | 1872 | break; |
1943 | nbits = xfs_contig_bits(data_map, map_size, bit); | 1873 | nbits = xfs_contig_bits(buf_f->blf_data_map, |
1874 | buf_f->blf_map_size, bit); | ||
1944 | ASSERT(nbits > 0); | 1875 | ASSERT(nbits > 0); |
1945 | ASSERT(item->ri_buf[i].i_addr != NULL); | 1876 | ASSERT(item->ri_buf[i].i_addr != NULL); |
1946 | ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); | 1877 | ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); |
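Both recovery paths now read the bitmap straight from buf_f using the same walk: xfs_next_bit() finds the next logged chunk and xfs_contig_bits() measures the run, with each bit covering one XFS_BLF_CHUNK of the buffer (so shifting by XFS_BLF_SHIFT converts a bit index into a byte offset). The shared pattern, roughly:

	bit = 0;
	while ((bit = xfs_next_bit(buf_f->blf_data_map,
				   buf_f->blf_map_size, bit)) != -1) {
		nbits = xfs_contig_bits(buf_f->blf_data_map,
					buf_f->blf_map_size, bit);
		/* logged bytes span [bit << XFS_BLF_SHIFT,
		   (bit + nbits) << XFS_BLF_SHIFT) */
		bit += nbits;
	}
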
@@ -1956,17 +1887,17 @@ xlog_recover_do_reg_buffer( | |||
1956 | if (buf_f->blf_flags & | 1887 | if (buf_f->blf_flags & |
1957 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { | 1888 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { |
1958 | if (item->ri_buf[i].i_addr == NULL) { | 1889 | if (item->ri_buf[i].i_addr == NULL) { |
1959 | cmn_err(CE_ALERT, | 1890 | xfs_alert(mp, |
1960 | "XFS: NULL dquot in %s.", __func__); | 1891 | "XFS: NULL dquot in %s.", __func__); |
1961 | goto next; | 1892 | goto next; |
1962 | } | 1893 | } |
1963 | if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { | 1894 | if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { |
1964 | cmn_err(CE_ALERT, | 1895 | xfs_alert(mp, |
1965 | "XFS: dquot too small (%d) in %s.", | 1896 | "XFS: dquot too small (%d) in %s.", |
1966 | item->ri_buf[i].i_len, __func__); | 1897 | item->ri_buf[i].i_len, __func__); |
1967 | goto next; | 1898 | goto next; |
1968 | } | 1899 | } |
1969 | error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, | 1900 | error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr, |
1970 | -1, 0, XFS_QMOPT_DOWARN, | 1901 | -1, 0, XFS_QMOPT_DOWARN, |
1971 | "dquot_buf_recover"); | 1902 | "dquot_buf_recover"); |
1972 | if (error) | 1903 | if (error) |
@@ -1991,6 +1922,7 @@ xlog_recover_do_reg_buffer( | |||
1991 | */ | 1922 | */ |
1992 | int | 1923 | int |
1993 | xfs_qm_dqcheck( | 1924 | xfs_qm_dqcheck( |
1925 | struct xfs_mount *mp, | ||
1994 | xfs_disk_dquot_t *ddq, | 1926 | xfs_disk_dquot_t *ddq, |
1995 | xfs_dqid_t id, | 1927 | xfs_dqid_t id, |
1996 | uint type, /* used only when IO_dorepair is true */ | 1928 | uint type, /* used only when IO_dorepair is true */ |
@@ -2017,14 +1949,14 @@ xfs_qm_dqcheck( | |||
2017 | */ | 1949 | */ |
2018 | if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { | 1950 | if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { |
2019 | if (flags & XFS_QMOPT_DOWARN) | 1951 | if (flags & XFS_QMOPT_DOWARN) |
2020 | cmn_err(CE_ALERT, | 1952 | xfs_alert(mp, |
2021 | "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", | 1953 | "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", |
2022 | str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); | 1954 | str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); |
2023 | errs++; | 1955 | errs++; |
2024 | } | 1956 | } |
2025 | if (ddq->d_version != XFS_DQUOT_VERSION) { | 1957 | if (ddq->d_version != XFS_DQUOT_VERSION) { |
2026 | if (flags & XFS_QMOPT_DOWARN) | 1958 | if (flags & XFS_QMOPT_DOWARN) |
2027 | cmn_err(CE_ALERT, | 1959 | xfs_alert(mp, |
2028 | "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", | 1960 | "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", |
2029 | str, id, ddq->d_version, XFS_DQUOT_VERSION); | 1961 | str, id, ddq->d_version, XFS_DQUOT_VERSION); |
2030 | errs++; | 1962 | errs++; |
@@ -2034,7 +1966,7 @@ xfs_qm_dqcheck( | |||
2034 | ddq->d_flags != XFS_DQ_PROJ && | 1966 | ddq->d_flags != XFS_DQ_PROJ && |
2035 | ddq->d_flags != XFS_DQ_GROUP) { | 1967 | ddq->d_flags != XFS_DQ_GROUP) { |
2036 | if (flags & XFS_QMOPT_DOWARN) | 1968 | if (flags & XFS_QMOPT_DOWARN) |
2037 | cmn_err(CE_ALERT, | 1969 | xfs_alert(mp, |
2038 | "%s : XFS dquot ID 0x%x, unknown flags 0x%x", | 1970 | "%s : XFS dquot ID 0x%x, unknown flags 0x%x", |
2039 | str, id, ddq->d_flags); | 1971 | str, id, ddq->d_flags); |
2040 | errs++; | 1972 | errs++; |
@@ -2042,7 +1974,7 @@ xfs_qm_dqcheck( | |||
2042 | 1974 | ||
2043 | if (id != -1 && id != be32_to_cpu(ddq->d_id)) { | 1975 | if (id != -1 && id != be32_to_cpu(ddq->d_id)) { |
2044 | if (flags & XFS_QMOPT_DOWARN) | 1976 | if (flags & XFS_QMOPT_DOWARN) |
2045 | cmn_err(CE_ALERT, | 1977 | xfs_alert(mp, |
2046 | "%s : ondisk-dquot 0x%p, ID mismatch: " | 1978 | "%s : ondisk-dquot 0x%p, ID mismatch: " |
2047 | "0x%x expected, found id 0x%x", | 1979 | "0x%x expected, found id 0x%x", |
2048 | str, ddq, id, be32_to_cpu(ddq->d_id)); | 1980 | str, ddq, id, be32_to_cpu(ddq->d_id)); |
@@ -2055,9 +1987,8 @@ xfs_qm_dqcheck( | |||
2055 | be64_to_cpu(ddq->d_blk_softlimit)) { | 1987 | be64_to_cpu(ddq->d_blk_softlimit)) { |
2056 | if (!ddq->d_btimer) { | 1988 | if (!ddq->d_btimer) { |
2057 | if (flags & XFS_QMOPT_DOWARN) | 1989 | if (flags & XFS_QMOPT_DOWARN) |
2058 | cmn_err(CE_ALERT, | 1990 | xfs_alert(mp, |
2059 | "%s : Dquot ID 0x%x (0x%p) " | 1991 | "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", |
2060 | "BLK TIMER NOT STARTED", | ||
2061 | str, (int)be32_to_cpu(ddq->d_id), ddq); | 1992 | str, (int)be32_to_cpu(ddq->d_id), ddq); |
2062 | errs++; | 1993 | errs++; |
2063 | } | 1994 | } |
@@ -2067,9 +1998,8 @@ xfs_qm_dqcheck( | |||
2067 | be64_to_cpu(ddq->d_ino_softlimit)) { | 1998 | be64_to_cpu(ddq->d_ino_softlimit)) { |
2068 | if (!ddq->d_itimer) { | 1999 | if (!ddq->d_itimer) { |
2069 | if (flags & XFS_QMOPT_DOWARN) | 2000 | if (flags & XFS_QMOPT_DOWARN) |
2070 | cmn_err(CE_ALERT, | 2001 | xfs_alert(mp, |
2071 | "%s : Dquot ID 0x%x (0x%p) " | 2002 | "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", |
2072 | "INODE TIMER NOT STARTED", | ||
2073 | str, (int)be32_to_cpu(ddq->d_id), ddq); | 2003 | str, (int)be32_to_cpu(ddq->d_id), ddq); |
2074 | errs++; | 2004 | errs++; |
2075 | } | 2005 | } |
@@ -2079,9 +2009,8 @@ xfs_qm_dqcheck( | |||
2079 | be64_to_cpu(ddq->d_rtb_softlimit)) { | 2009 | be64_to_cpu(ddq->d_rtb_softlimit)) { |
2080 | if (!ddq->d_rtbtimer) { | 2010 | if (!ddq->d_rtbtimer) { |
2081 | if (flags & XFS_QMOPT_DOWARN) | 2011 | if (flags & XFS_QMOPT_DOWARN) |
2082 | cmn_err(CE_ALERT, | 2012 | xfs_alert(mp, |
2083 | "%s : Dquot ID 0x%x (0x%p) " | 2013 | "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", |
2084 | "RTBLK TIMER NOT STARTED", | ||
2085 | str, (int)be32_to_cpu(ddq->d_id), ddq); | 2014 | str, (int)be32_to_cpu(ddq->d_id), ddq); |
2086 | errs++; | 2015 | errs++; |
2087 | } | 2016 | } |
@@ -2092,7 +2021,7 @@ xfs_qm_dqcheck( | |||
2092 | return errs; | 2021 | return errs; |
2093 | 2022 | ||
2094 | if (flags & XFS_QMOPT_DOWARN) | 2023 | if (flags & XFS_QMOPT_DOWARN) |
2095 | cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); | 2024 | xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); |
2096 | 2025 | ||
2097 | /* | 2026 | /* |
2098 | * Typically, a repair is only requested by quotacheck. | 2027 | * Typically, a repair is only requested by quotacheck. |
@@ -2174,77 +2103,46 @@ xlog_recover_do_dquot_buffer( | |||
2174 | * for more details on the implementation of the table of cancel records. | 2103 | * for more details on the implementation of the table of cancel records. |
2175 | */ | 2104 | */ |
2176 | STATIC int | 2105 | STATIC int |
2177 | xlog_recover_do_buffer_trans( | 2106 | xlog_recover_buffer_pass2( |
2178 | xlog_t *log, | 2107 | xlog_t *log, |
2179 | xlog_recover_item_t *item, | 2108 | xlog_recover_item_t *item) |
2180 | int pass) | ||
2181 | { | 2109 | { |
2182 | xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; | 2110 | xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; |
2183 | xfs_mount_t *mp; | 2111 | xfs_mount_t *mp = log->l_mp; |
2184 | xfs_buf_t *bp; | 2112 | xfs_buf_t *bp; |
2185 | int error; | 2113 | int error; |
2186 | int cancel; | ||
2187 | xfs_daddr_t blkno; | ||
2188 | int len; | ||
2189 | ushort flags; | ||
2190 | uint buf_flags; | 2114 | uint buf_flags; |
2191 | 2115 | ||
2192 | if (pass == XLOG_RECOVER_PASS1) { | 2116 | /* |
2193 | /* | 2117 | * In this pass we only want to recover all the buffers which have |
2194 | * In this pass we're only looking for buf items | 2118 | * not been cancelled and are not cancellation buffers themselves. |
2195 | * with the XFS_BLF_CANCEL bit set. | 2119 | */ |
2196 | */ | 2120 | if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, |
2197 | xlog_recover_do_buffer_pass1(log, buf_f); | 2121 | buf_f->blf_len, buf_f->blf_flags)) { |
2122 | trace_xfs_log_recover_buf_cancel(log, buf_f); | ||
2198 | return 0; | 2123 | return 0; |
2199 | } else { | ||
2200 | /* | ||
2201 | * In this pass we want to recover all the buffers | ||
2202 | * which have not been cancelled and are not | ||
2203 | * cancellation buffers themselves. The routine | ||
2204 | * we call here will tell us whether or not to | ||
2205 | * continue with the replay of this buffer. | ||
2206 | */ | ||
2207 | cancel = xlog_recover_do_buffer_pass2(log, buf_f); | ||
2208 | if (cancel) { | ||
2209 | trace_xfs_log_recover_buf_cancel(log, buf_f); | ||
2210 | return 0; | ||
2211 | } | ||
2212 | } | 2124 | } |
2125 | |||
2213 | trace_xfs_log_recover_buf_recover(log, buf_f); | 2126 | trace_xfs_log_recover_buf_recover(log, buf_f); |
2214 | switch (buf_f->blf_type) { | ||
2215 | case XFS_LI_BUF: | ||
2216 | blkno = buf_f->blf_blkno; | ||
2217 | len = buf_f->blf_len; | ||
2218 | flags = buf_f->blf_flags; | ||
2219 | break; | ||
2220 | default: | ||
2221 | xfs_fs_cmn_err(CE_ALERT, log->l_mp, | ||
2222 | "xfs_log_recover: unknown buffer type 0x%x, logdev %s", | ||
2223 | buf_f->blf_type, log->l_mp->m_logname ? | ||
2224 | log->l_mp->m_logname : "internal"); | ||
2225 | XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", | ||
2226 | XFS_ERRLEVEL_LOW, log->l_mp); | ||
2227 | return XFS_ERROR(EFSCORRUPTED); | ||
2228 | } | ||
2229 | 2127 | ||
2230 | mp = log->l_mp; | ||
2231 | buf_flags = XBF_LOCK; | 2128 | buf_flags = XBF_LOCK; |
2232 | if (!(flags & XFS_BLF_INODE_BUF)) | 2129 | if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) |
2233 | buf_flags |= XBF_MAPPED; | 2130 | buf_flags |= XBF_MAPPED; |
2234 | 2131 | ||
2235 | bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); | 2132 | bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, |
2133 | buf_flags); | ||
2236 | if (XFS_BUF_ISERROR(bp)) { | 2134 | if (XFS_BUF_ISERROR(bp)) { |
2237 | xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, | 2135 | xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, |
2238 | bp, blkno); | 2136 | bp, buf_f->blf_blkno); |
2239 | error = XFS_BUF_GETERROR(bp); | 2137 | error = XFS_BUF_GETERROR(bp); |
2240 | xfs_buf_relse(bp); | 2138 | xfs_buf_relse(bp); |
2241 | return error; | 2139 | return error; |
2242 | } | 2140 | } |
2243 | 2141 | ||
2244 | error = 0; | 2142 | error = 0; |
2245 | if (flags & XFS_BLF_INODE_BUF) { | 2143 | if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { |
2246 | error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); | 2144 | error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); |
2247 | } else if (flags & | 2145 | } else if (buf_f->blf_flags & |
2248 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { | 2146 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { |
2249 | xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); | 2147 | xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); |
2250 | } else { | 2148 | } else { |
@@ -2275,8 +2173,7 @@ xlog_recover_do_buffer_trans( | |||
2275 | XFS_BUF_STALE(bp); | 2173 | XFS_BUF_STALE(bp); |
2276 | error = xfs_bwrite(mp, bp); | 2174 | error = xfs_bwrite(mp, bp); |
2277 | } else { | 2175 | } else { |
2278 | ASSERT(bp->b_mount == NULL || bp->b_mount == mp); | 2176 | ASSERT(bp->b_target->bt_mount == mp); |
2279 | bp->b_mount = mp; | ||
2280 | XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); | 2177 | XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); |
2281 | xfs_bdwrite(mp, bp); | 2178 | xfs_bdwrite(mp, bp); |
2282 | } | 2179 | } |
@@ -2285,16 +2182,14 @@ xlog_recover_do_buffer_trans( | |||
2285 | } | 2182 | } |
2286 | 2183 | ||
2287 | STATIC int | 2184 | STATIC int |
2288 | xlog_recover_do_inode_trans( | 2185 | xlog_recover_inode_pass2( |
2289 | xlog_t *log, | 2186 | xlog_t *log, |
2290 | xlog_recover_item_t *item, | 2187 | xlog_recover_item_t *item) |
2291 | int pass) | ||
2292 | { | 2188 | { |
2293 | xfs_inode_log_format_t *in_f; | 2189 | xfs_inode_log_format_t *in_f; |
2294 | xfs_mount_t *mp; | 2190 | xfs_mount_t *mp = log->l_mp; |
2295 | xfs_buf_t *bp; | 2191 | xfs_buf_t *bp; |
2296 | xfs_dinode_t *dip; | 2192 | xfs_dinode_t *dip; |
2297 | xfs_ino_t ino; | ||
2298 | int len; | 2193 | int len; |
2299 | xfs_caddr_t src; | 2194 | xfs_caddr_t src; |
2300 | xfs_caddr_t dest; | 2195 | xfs_caddr_t dest; |
@@ -2304,10 +2199,6 @@ xlog_recover_do_inode_trans( | |||
2304 | xfs_icdinode_t *dicp; | 2199 | xfs_icdinode_t *dicp; |
2305 | int need_free = 0; | 2200 | int need_free = 0; |
2306 | 2201 | ||
2307 | if (pass == XLOG_RECOVER_PASS1) { | ||
2308 | return 0; | ||
2309 | } | ||
2310 | |||
2311 | if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { | 2202 | if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { |
2312 | in_f = item->ri_buf[0].i_addr; | 2203 | in_f = item->ri_buf[0].i_addr; |
2313 | } else { | 2204 | } else { |
@@ -2317,8 +2208,6 @@ xlog_recover_do_inode_trans( | |||
2317 | if (error) | 2208 | if (error) |
2318 | goto error; | 2209 | goto error; |
2319 | } | 2210 | } |
2320 | ino = in_f->ilf_ino; | ||
2321 | mp = log->l_mp; | ||
2322 | 2211 | ||
2323 | /* | 2212 | /* |
2324 | * Inode buffers can be freed, look out for it, | 2213 | * Inode buffers can be freed, look out for it, |
@@ -2351,10 +2240,10 @@ xlog_recover_do_inode_trans( | |||
2351 | */ | 2240 | */ |
2352 | if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { | 2241 | if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { |
2353 | xfs_buf_relse(bp); | 2242 | xfs_buf_relse(bp); |
2354 | xfs_fs_cmn_err(CE_ALERT, mp, | 2243 | xfs_alert(mp, |
2355 | "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", | 2244 | "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", |
2356 | dip, bp, ino); | 2245 | __func__, dip, bp, in_f->ilf_ino); |
2357 | XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", | 2246 | XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", |
2358 | XFS_ERRLEVEL_LOW, mp); | 2247 | XFS_ERRLEVEL_LOW, mp); |
2359 | error = EFSCORRUPTED; | 2248 | error = EFSCORRUPTED; |
2360 | goto error; | 2249 | goto error; |
@@ -2362,10 +2251,10 @@ xlog_recover_do_inode_trans( | |||
2362 | dicp = item->ri_buf[1].i_addr; | 2251 | dicp = item->ri_buf[1].i_addr; |
2363 | if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { | 2252 | if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { |
2364 | xfs_buf_relse(bp); | 2253 | xfs_buf_relse(bp); |
2365 | xfs_fs_cmn_err(CE_ALERT, mp, | 2254 | xfs_alert(mp, |
2366 | "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", | 2255 | "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", |
2367 | item, ino); | 2256 | __func__, item, in_f->ilf_ino); |
2368 | XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", | 2257 | XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", |
2369 | XFS_ERRLEVEL_LOW, mp); | 2258 | XFS_ERRLEVEL_LOW, mp); |
2370 | error = EFSCORRUPTED; | 2259 | error = EFSCORRUPTED; |
2371 | goto error; | 2260 | goto error; |
@@ -2393,12 +2282,13 @@ xlog_recover_do_inode_trans( | |||
2393 | if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { | 2282 | if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { |
2394 | if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && | 2283 | if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && |
2395 | (dicp->di_format != XFS_DINODE_FMT_BTREE)) { | 2284 | (dicp->di_format != XFS_DINODE_FMT_BTREE)) { |
2396 | XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", | 2285 | XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", |
2397 | XFS_ERRLEVEL_LOW, mp, dicp); | 2286 | XFS_ERRLEVEL_LOW, mp, dicp); |
2398 | xfs_buf_relse(bp); | 2287 | xfs_buf_relse(bp); |
2399 | xfs_fs_cmn_err(CE_ALERT, mp, | 2288 | xfs_alert(mp, |
2400 | "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", | 2289 | "%s: Bad regular inode log record, rec ptr 0x%p, " |
2401 | item, dip, bp, ino); | 2290 | "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", |
2291 | __func__, item, dip, bp, in_f->ilf_ino); | ||
2402 | error = EFSCORRUPTED; | 2292 | error = EFSCORRUPTED; |
2403 | goto error; | 2293 | goto error; |
2404 | } | 2294 | } |
@@ -2406,45 +2296,48 @@ xlog_recover_do_inode_trans( | |||
2406 | if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && | 2296 | if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && |
2407 | (dicp->di_format != XFS_DINODE_FMT_BTREE) && | 2297 | (dicp->di_format != XFS_DINODE_FMT_BTREE) && |
2408 | (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { | 2298 | (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { |
2409 | XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", | 2299 | XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", |
2410 | XFS_ERRLEVEL_LOW, mp, dicp); | 2300 | XFS_ERRLEVEL_LOW, mp, dicp); |
2411 | xfs_buf_relse(bp); | 2301 | xfs_buf_relse(bp); |
2412 | xfs_fs_cmn_err(CE_ALERT, mp, | 2302 | xfs_alert(mp, |
2413 | "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", | 2303 | "%s: Bad dir inode log record, rec ptr 0x%p, " |
2414 | item, dip, bp, ino); | 2304 | "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", |
2305 | __func__, item, dip, bp, in_f->ilf_ino); | ||
2415 | error = EFSCORRUPTED; | 2306 | error = EFSCORRUPTED; |
2416 | goto error; | 2307 | goto error; |
2417 | } | 2308 | } |
2418 | } | 2309 | } |
2419 | if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ | 2310 | if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ |
2420 | XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", | 2311 | XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", |
2421 | XFS_ERRLEVEL_LOW, mp, dicp); | 2312 | XFS_ERRLEVEL_LOW, mp, dicp); |
2422 | xfs_buf_relse(bp); | 2313 | xfs_buf_relse(bp); |
2423 | xfs_fs_cmn_err(CE_ALERT, mp, | 2314 | xfs_alert(mp, |
2424 | "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", | 2315 | "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " |
2425 | item, dip, bp, ino, | 2316 | "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", |
2317 | __func__, item, dip, bp, in_f->ilf_ino, | ||
2426 | dicp->di_nextents + dicp->di_anextents, | 2318 | dicp->di_nextents + dicp->di_anextents, |
2427 | dicp->di_nblocks); | 2319 | dicp->di_nblocks); |
2428 | error = EFSCORRUPTED; | 2320 | error = EFSCORRUPTED; |
2429 | goto error; | 2321 | goto error; |
2430 | } | 2322 | } |
2431 | if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { | 2323 | if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { |
2432 | XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", | 2324 | XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", |
2433 | XFS_ERRLEVEL_LOW, mp, dicp); | 2325 | XFS_ERRLEVEL_LOW, mp, dicp); |
2434 | xfs_buf_relse(bp); | 2326 | xfs_buf_relse(bp); |
2435 | xfs_fs_cmn_err(CE_ALERT, mp, | 2327 | xfs_alert(mp, |
2436 | "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", | 2328 | "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " |
2437 | item, dip, bp, ino, dicp->di_forkoff); | 2329 | "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, |
2330 | item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); | ||
2438 | error = EFSCORRUPTED; | 2331 | error = EFSCORRUPTED; |
2439 | goto error; | 2332 | goto error; |
2440 | } | 2333 | } |
2441 | if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { | 2334 | if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { |
2442 | XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", | 2335 | XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", |
2443 | XFS_ERRLEVEL_LOW, mp, dicp); | 2336 | XFS_ERRLEVEL_LOW, mp, dicp); |
2444 | xfs_buf_relse(bp); | 2337 | xfs_buf_relse(bp); |
2445 | xfs_fs_cmn_err(CE_ALERT, mp, | 2338 | xfs_alert(mp, |
2446 | "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", | 2339 | "%s: Bad inode log record length %d, rec ptr 0x%p", |
2447 | item->ri_buf[1].i_len, item); | 2340 | __func__, item->ri_buf[1].i_len, item); |
2448 | error = EFSCORRUPTED; | 2341 | error = EFSCORRUPTED; |
2449 | goto error; | 2342 | goto error; |
2450 | } | 2343 | } |
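The validation hunks above carry the logging rework that runs through the whole commit: xfs_fs_cmn_err(CE_ALERT, mp, ...) becomes xfs_alert(mp, ...), the function name moves out of the format string into a __func__ argument, and the cached ino local is dropped in favour of reading in_f->ilf_ino at each report site. A minimal user-space sketch of that helper shape, with illustrative names (demo_mount and demo_alert are not XFS symbols; the real helpers are added in xfs_message.c):

	#include <stdarg.h>
	#include <stdio.h>

	struct demo_mount { const char *fsname; };

	/* prefix every message with the filesystem identity, like xfs_alert() */
	static void demo_alert(const struct demo_mount *mp, const char *fmt, ...)
	{
		va_list ap;

		fprintf(stderr, "XFS (%s): ", mp->fsname);
		va_start(ap, fmt);
		vfprintf(stderr, fmt, ap);
		va_end(ap);
		fputc('\n', stderr);
	}

	int main(void)
	{
		struct demo_mount m = { .fsname = "sda1" };

		demo_alert(&m, "%s: Bad inode magic number, ino %lld",
			   __func__, 128LL);
		return 0;
	}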
@@ -2531,7 +2424,7 @@ xlog_recover_do_inode_trans( | |||
2531 | break; | 2424 | break; |
2532 | 2425 | ||
2533 | default: | 2426 | default: |
2534 | xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); | 2427 | xfs_warn(log->l_mp, "%s: Invalid flag", __func__); |
2535 | ASSERT(0); | 2428 | ASSERT(0); |
2536 | xfs_buf_relse(bp); | 2429 | xfs_buf_relse(bp); |
2537 | error = EIO; | 2430 | error = EIO; |
@@ -2540,8 +2433,7 @@ xlog_recover_do_inode_trans( | |||
2540 | } | 2433 | } |
2541 | 2434 | ||
2542 | write_inode_buffer: | 2435 | write_inode_buffer: |
2543 | ASSERT(bp->b_mount == NULL || bp->b_mount == mp); | 2436 | ASSERT(bp->b_target->bt_mount == mp); |
2544 | bp->b_mount = mp; | ||
2545 | XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); | 2437 | XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); |
2546 | xfs_bdwrite(mp, bp); | 2438 | xfs_bdwrite(mp, bp); |
2547 | error: | 2439 | error: |
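The write_inode_buffer hunk is part of the same simplification that reappears below in the dquot path: buffers stop carrying their own b_mount pointer, since every buffer already reaches its mount through the shared buffer target, so recovery merely asserts the invariant instead of re-establishing it per buffer. A compilable sketch with stand-in types (all demo_* names are illustrative):

	#include <assert.h>

	struct demo_mount { int dummy; };
	struct demo_buftarg { struct demo_mount *bt_mount; };
	struct demo_buf { struct demo_buftarg *b_target; };

	int main(void)
	{
		struct demo_mount m;
		struct demo_buftarg t = { .bt_mount = &m };
		struct demo_buf bp = { .b_target = &t };

		/* was: ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
		 *      bp->b_mount = mp;  -- now just check the invariant */
		assert(bp.b_target->bt_mount == &m);
		return 0;
	}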
@@ -2556,18 +2448,11 @@ error: | |||
2556 | * of that type. | 2448 | * of that type. |
2557 | */ | 2449 | */ |
2558 | STATIC int | 2450 | STATIC int |
2559 | xlog_recover_do_quotaoff_trans( | 2451 | xlog_recover_quotaoff_pass1( |
2560 | xlog_t *log, | 2452 | xlog_t *log, |
2561 | xlog_recover_item_t *item, | 2453 | xlog_recover_item_t *item) |
2562 | int pass) | ||
2563 | { | 2454 | { |
2564 | xfs_qoff_logformat_t *qoff_f; | 2455 | xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; |
2565 | |||
2566 | if (pass == XLOG_RECOVER_PASS2) { | ||
2567 | return (0); | ||
2568 | } | ||
2569 | |||
2570 | qoff_f = item->ri_buf[0].i_addr; | ||
2571 | ASSERT(qoff_f); | 2456 | ASSERT(qoff_f); |
2572 | 2457 | ||
2573 | /* | 2458 | /* |
@@ -2588,22 +2473,17 @@ xlog_recover_do_quotaoff_trans( | |||
2588 | * Recover a dquot record | 2473 | * Recover a dquot record |
2589 | */ | 2474 | */ |
2590 | STATIC int | 2475 | STATIC int |
2591 | xlog_recover_do_dquot_trans( | 2476 | xlog_recover_dquot_pass2( |
2592 | xlog_t *log, | 2477 | xlog_t *log, |
2593 | xlog_recover_item_t *item, | 2478 | xlog_recover_item_t *item) |
2594 | int pass) | ||
2595 | { | 2479 | { |
2596 | xfs_mount_t *mp; | 2480 | xfs_mount_t *mp = log->l_mp; |
2597 | xfs_buf_t *bp; | 2481 | xfs_buf_t *bp; |
2598 | struct xfs_disk_dquot *ddq, *recddq; | 2482 | struct xfs_disk_dquot *ddq, *recddq; |
2599 | int error; | 2483 | int error; |
2600 | xfs_dq_logformat_t *dq_f; | 2484 | xfs_dq_logformat_t *dq_f; |
2601 | uint type; | 2485 | uint type; |
2602 | 2486 | ||
2603 | if (pass == XLOG_RECOVER_PASS1) { | ||
2604 | return 0; | ||
2605 | } | ||
2606 | mp = log->l_mp; | ||
2607 | 2487 | ||
2608 | /* | 2488 | /* |
2609 | * Filesystems are required to send in quota flags at mount time. | 2489 | * Filesystems are required to send in quota flags at mount time. |
@@ -2613,13 +2493,11 @@ xlog_recover_do_dquot_trans( | |||
2613 | 2493 | ||
2614 | recddq = item->ri_buf[1].i_addr; | 2494 | recddq = item->ri_buf[1].i_addr; |
2615 | if (recddq == NULL) { | 2495 | if (recddq == NULL) { |
2616 | cmn_err(CE_ALERT, | 2496 | xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); |
2617 | "XFS: NULL dquot in %s.", __func__); | ||
2618 | return XFS_ERROR(EIO); | 2497 | return XFS_ERROR(EIO); |
2619 | } | 2498 | } |
2620 | if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { | 2499 | if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { |
2621 | cmn_err(CE_ALERT, | 2500 | xfs_alert(log->l_mp, "dquot too small (%d) in %s.", |
2622 | "XFS: dquot too small (%d) in %s.", | ||
2623 | item->ri_buf[1].i_len, __func__); | 2501 | item->ri_buf[1].i_len, __func__); |
2624 | return XFS_ERROR(EIO); | 2502 | return XFS_ERROR(EIO); |
2625 | } | 2503 | } |
@@ -2644,12 +2522,10 @@ xlog_recover_do_dquot_trans( | |||
2644 | */ | 2522 | */ |
2645 | dq_f = item->ri_buf[0].i_addr; | 2523 | dq_f = item->ri_buf[0].i_addr; |
2646 | ASSERT(dq_f); | 2524 | ASSERT(dq_f); |
2647 | if ((error = xfs_qm_dqcheck(recddq, | 2525 | error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, |
2648 | dq_f->qlf_id, | 2526 | "xlog_recover_dquot_pass2 (log copy)"); |
2649 | 0, XFS_QMOPT_DOWARN, | 2527 | if (error) |
2650 | "xlog_recover_do_dquot_trans (log copy)"))) { | ||
2651 | return XFS_ERROR(EIO); | 2528 | return XFS_ERROR(EIO); |
2652 | } | ||
2653 | ASSERT(dq_f->qlf_len == 1); | 2529 | ASSERT(dq_f->qlf_len == 1); |
2654 | 2530 | ||
2655 | error = xfs_read_buf(mp, mp->m_ddev_targp, | 2531 | error = xfs_read_buf(mp, mp->m_ddev_targp, |
@@ -2669,8 +2545,9 @@ xlog_recover_do_dquot_trans( | |||
2669 | * was among a chunk of dquots created earlier, and we did some | 2545 | * was among a chunk of dquots created earlier, and we did some |
2670 | * minimal initialization then. | 2546 | * minimal initialization then. |
2671 | */ | 2547 | */ |
2672 | if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, | 2548 | error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, |
2673 | "xlog_recover_do_dquot_trans")) { | 2549 | "xlog_recover_dquot_pass2"); |
2550 | if (error) { | ||
2674 | xfs_buf_relse(bp); | 2551 | xfs_buf_relse(bp); |
2675 | return XFS_ERROR(EIO); | 2552 | return XFS_ERROR(EIO); |
2676 | } | 2553 | } |
@@ -2678,8 +2555,7 @@ xlog_recover_do_dquot_trans( | |||
2678 | memcpy(ddq, recddq, item->ri_buf[1].i_len); | 2555 | memcpy(ddq, recddq, item->ri_buf[1].i_len); |
2679 | 2556 | ||
2680 | ASSERT(dq_f->qlf_size == 2); | 2557 | ASSERT(dq_f->qlf_size == 2); |
2681 | ASSERT(bp->b_mount == NULL || bp->b_mount == mp); | 2558 | ASSERT(bp->b_target->bt_mount == mp); |
2682 | bp->b_mount = mp; | ||
2683 | XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); | 2559 | XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); |
2684 | xfs_bdwrite(mp, bp); | 2560 | xfs_bdwrite(mp, bp); |
2685 | 2561 | ||
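Besides the renaming, the dquot hunks show xfs_qm_dqcheck() gaining the mount as its first argument and having its result captured in error rather than tested inline, matching the kernel's usual error-flow style. The idiom, reduced to a trivial compilable stand-in (check_thing and replay_thing are not XFS functions):

	#include <errno.h>
	#include <stdio.h>

	static int check_thing(int id)		/* stands in for xfs_qm_dqcheck() */
	{
		return id < 0 ? -EINVAL : 0;
	}

	static int replay_thing(int id)
	{
		int error;

		error = check_thing(id);	/* was: if (check_thing(id)) ... */
		if (error)
			return error;
		printf("replaying dquot %d\n", id);
		return 0;
	}

	int main(void)
	{
		return replay_thing(7);
	}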
@@ -2694,38 +2570,31 @@ xlog_recover_do_dquot_trans( | |||
2694 | * LSN. | 2570 | * LSN. |
2695 | */ | 2571 | */ |
2696 | STATIC int | 2572 | STATIC int |
2697 | xlog_recover_do_efi_trans( | 2573 | xlog_recover_efi_pass2( |
2698 | xlog_t *log, | 2574 | xlog_t *log, |
2699 | xlog_recover_item_t *item, | 2575 | xlog_recover_item_t *item, |
2700 | xfs_lsn_t lsn, | 2576 | xfs_lsn_t lsn) |
2701 | int pass) | ||
2702 | { | 2577 | { |
2703 | int error; | 2578 | int error; |
2704 | xfs_mount_t *mp; | 2579 | xfs_mount_t *mp = log->l_mp; |
2705 | xfs_efi_log_item_t *efip; | 2580 | xfs_efi_log_item_t *efip; |
2706 | xfs_efi_log_format_t *efi_formatp; | 2581 | xfs_efi_log_format_t *efi_formatp; |
2707 | 2582 | ||
2708 | if (pass == XLOG_RECOVER_PASS1) { | ||
2709 | return 0; | ||
2710 | } | ||
2711 | |||
2712 | efi_formatp = item->ri_buf[0].i_addr; | 2583 | efi_formatp = item->ri_buf[0].i_addr; |
2713 | 2584 | ||
2714 | mp = log->l_mp; | ||
2715 | efip = xfs_efi_init(mp, efi_formatp->efi_nextents); | 2585 | efip = xfs_efi_init(mp, efi_formatp->efi_nextents); |
2716 | if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), | 2586 | if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), |
2717 | &(efip->efi_format)))) { | 2587 | &(efip->efi_format)))) { |
2718 | xfs_efi_item_free(efip); | 2588 | xfs_efi_item_free(efip); |
2719 | return error; | 2589 | return error; |
2720 | } | 2590 | } |
2721 | efip->efi_next_extent = efi_formatp->efi_nextents; | 2591 | atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); |
2722 | efip->efi_flags |= XFS_EFI_COMMITTED; | ||
2723 | 2592 | ||
2724 | spin_lock(&log->l_ailp->xa_lock); | 2593 | spin_lock(&log->l_ailp->xa_lock); |
2725 | /* | 2594 | /* |
2726 | * xfs_trans_ail_update() drops the AIL lock. | 2595 | * xfs_trans_ail_update() drops the AIL lock. |
2727 | */ | 2596 | */ |
2728 | xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); | 2597 | xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); |
2729 | return 0; | 2598 | return 0; |
2730 | } | 2599 | } |
2731 | 2600 | ||
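For EFIs, efi_next_extent becomes an atomic_t and the XFS_EFI_COMMITTED flag disappears, so recovery seeds the counter with atomic_set() instead of a plain store plus a flag update. A user-space analogue with C11 atomics standing in for the kernel's atomic_t (demo_efi is illustrative):

	#include <stdatomic.h>
	#include <stdio.h>

	struct demo_efi {
		atomic_int next_extent;	/* was: plain int guarded by a flag */
	};

	int main(void)
	{
		struct demo_efi efi;

		atomic_store(&efi.next_extent, 4);	/* atomic_set() in the diff */
		printf("%d extents to replay\n", atomic_load(&efi.next_extent));
		return 0;
	}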
@@ -2738,11 +2607,10 @@ xlog_recover_do_efi_trans( | |||
2738 | * efd format structure. If we find it, we remove the efi from the | 2607 | * efd format structure. If we find it, we remove the efi from the |
2739 | * AIL and free it. | 2608 | * AIL and free it. |
2740 | */ | 2609 | */ |
2741 | STATIC void | 2610 | STATIC int |
2742 | xlog_recover_do_efd_trans( | 2611 | xlog_recover_efd_pass2( |
2743 | xlog_t *log, | 2612 | xlog_t *log, |
2744 | xlog_recover_item_t *item, | 2613 | xlog_recover_item_t *item) |
2745 | int pass) | ||
2746 | { | 2614 | { |
2747 | xfs_efd_log_format_t *efd_formatp; | 2615 | xfs_efd_log_format_t *efd_formatp; |
2748 | xfs_efi_log_item_t *efip = NULL; | 2616 | xfs_efi_log_item_t *efip = NULL; |
@@ -2751,10 +2619,6 @@ xlog_recover_do_efd_trans( | |||
2751 | struct xfs_ail_cursor cur; | 2619 | struct xfs_ail_cursor cur; |
2752 | struct xfs_ail *ailp = log->l_ailp; | 2620 | struct xfs_ail *ailp = log->l_ailp; |
2753 | 2621 | ||
2754 | if (pass == XLOG_RECOVER_PASS1) { | ||
2755 | return; | ||
2756 | } | ||
2757 | |||
2758 | efd_formatp = item->ri_buf[0].i_addr; | 2622 | efd_formatp = item->ri_buf[0].i_addr; |
2759 | ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + | 2623 | ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + |
2760 | ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || | 2624 | ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || |
@@ -2786,62 +2650,6 @@ xlog_recover_do_efd_trans( | |||
2786 | } | 2650 | } |
2787 | xfs_trans_ail_cursor_done(ailp, &cur); | 2651 | xfs_trans_ail_cursor_done(ailp, &cur); |
2788 | spin_unlock(&ailp->xa_lock); | 2652 | spin_unlock(&ailp->xa_lock); |
2789 | } | ||
2790 | |||
2791 | /* | ||
2792 | * Perform the transaction | ||
2793 | * | ||
2794 | * If the transaction modifies a buffer or inode, do it now. Otherwise, | ||
2795 | * EFIs and EFDs get queued up by adding entries into the AIL for them. | ||
2796 | */ | ||
2797 | STATIC int | ||
2798 | xlog_recover_do_trans( | ||
2799 | xlog_t *log, | ||
2800 | xlog_recover_t *trans, | ||
2801 | int pass) | ||
2802 | { | ||
2803 | int error = 0; | ||
2804 | xlog_recover_item_t *item; | ||
2805 | |||
2806 | error = xlog_recover_reorder_trans(log, trans, pass); | ||
2807 | if (error) | ||
2808 | return error; | ||
2809 | |||
2810 | list_for_each_entry(item, &trans->r_itemq, ri_list) { | ||
2811 | trace_xfs_log_recover_item_recover(log, trans, item, pass); | ||
2812 | switch (ITEM_TYPE(item)) { | ||
2813 | case XFS_LI_BUF: | ||
2814 | error = xlog_recover_do_buffer_trans(log, item, pass); | ||
2815 | break; | ||
2816 | case XFS_LI_INODE: | ||
2817 | error = xlog_recover_do_inode_trans(log, item, pass); | ||
2818 | break; | ||
2819 | case XFS_LI_EFI: | ||
2820 | error = xlog_recover_do_efi_trans(log, item, | ||
2821 | trans->r_lsn, pass); | ||
2822 | break; | ||
2823 | case XFS_LI_EFD: | ||
2824 | xlog_recover_do_efd_trans(log, item, pass); | ||
2825 | error = 0; | ||
2826 | break; | ||
2827 | case XFS_LI_DQUOT: | ||
2828 | error = xlog_recover_do_dquot_trans(log, item, pass); | ||
2829 | break; | ||
2830 | case XFS_LI_QUOTAOFF: | ||
2831 | error = xlog_recover_do_quotaoff_trans(log, item, | ||
2832 | pass); | ||
2833 | break; | ||
2834 | default: | ||
2835 | xlog_warn( | ||
2836 | "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item)); | ||
2837 | ASSERT(0); | ||
2838 | error = XFS_ERROR(EIO); | ||
2839 | break; | ||
2840 | } | ||
2841 | |||
2842 | if (error) | ||
2843 | return error; | ||
2844 | } | ||
2845 | 2653 | ||
2846 | return 0; | 2654 | return 0; |
2847 | } | 2655 | } |
@@ -2853,7 +2661,7 @@ xlog_recover_do_trans( | |||
2853 | */ | 2661 | */ |
2854 | STATIC void | 2662 | STATIC void |
2855 | xlog_recover_free_trans( | 2663 | xlog_recover_free_trans( |
2856 | xlog_recover_t *trans) | 2664 | struct xlog_recover *trans) |
2857 | { | 2665 | { |
2858 | xlog_recover_item_t *item, *n; | 2666 | xlog_recover_item_t *item, *n; |
2859 | int i; | 2667 | int i; |
@@ -2872,26 +2680,103 @@ xlog_recover_free_trans( | |||
2872 | } | 2680 | } |
2873 | 2681 | ||
2874 | STATIC int | 2682 | STATIC int |
2683 | xlog_recover_commit_pass1( | ||
2684 | struct log *log, | ||
2685 | struct xlog_recover *trans, | ||
2686 | xlog_recover_item_t *item) | ||
2687 | { | ||
2688 | trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); | ||
2689 | |||
2690 | switch (ITEM_TYPE(item)) { | ||
2691 | case XFS_LI_BUF: | ||
2692 | return xlog_recover_buffer_pass1(log, item); | ||
2693 | case XFS_LI_QUOTAOFF: | ||
2694 | return xlog_recover_quotaoff_pass1(log, item); | ||
2695 | case XFS_LI_INODE: | ||
2696 | case XFS_LI_EFI: | ||
2697 | case XFS_LI_EFD: | ||
2698 | case XFS_LI_DQUOT: | ||
2699 | /* nothing to do in pass 1 */ | ||
2700 | return 0; | ||
2701 | default: | ||
2702 | xfs_warn(log->l_mp, "%s: invalid item type (%d)", | ||
2703 | __func__, ITEM_TYPE(item)); | ||
2704 | ASSERT(0); | ||
2705 | return XFS_ERROR(EIO); | ||
2706 | } | ||
2707 | } | ||
2708 | |||
2709 | STATIC int | ||
2710 | xlog_recover_commit_pass2( | ||
2711 | struct log *log, | ||
2712 | struct xlog_recover *trans, | ||
2713 | xlog_recover_item_t *item) | ||
2714 | { | ||
2715 | trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); | ||
2716 | |||
2717 | switch (ITEM_TYPE(item)) { | ||
2718 | case XFS_LI_BUF: | ||
2719 | return xlog_recover_buffer_pass2(log, item); | ||
2720 | case XFS_LI_INODE: | ||
2721 | return xlog_recover_inode_pass2(log, item); | ||
2722 | case XFS_LI_EFI: | ||
2723 | return xlog_recover_efi_pass2(log, item, trans->r_lsn); | ||
2724 | case XFS_LI_EFD: | ||
2725 | return xlog_recover_efd_pass2(log, item); | ||
2726 | case XFS_LI_DQUOT: | ||
2727 | return xlog_recover_dquot_pass2(log, item); | ||
2728 | case XFS_LI_QUOTAOFF: | ||
2729 | /* nothing to do in pass2 */ | ||
2730 | return 0; | ||
2731 | default: | ||
2732 | xfs_warn(log->l_mp, "%s: invalid item type (%d)", | ||
2733 | __func__, ITEM_TYPE(item)); | ||
2734 | ASSERT(0); | ||
2735 | return XFS_ERROR(EIO); | ||
2736 | } | ||
2737 | } | ||
2738 | |||
2739 | /* | ||
2740 | * Perform the transaction. | ||
2741 | * | ||
2742 | * If the transaction modifies a buffer or inode, do it now. Otherwise, | ||
2743 | * EFIs and EFDs get queued up by adding entries into the AIL for them. | ||
2744 | */ | ||
2745 | STATIC int | ||
2875 | xlog_recover_commit_trans( | 2746 | xlog_recover_commit_trans( |
2876 | xlog_t *log, | 2747 | struct log *log, |
2877 | xlog_recover_t *trans, | 2748 | struct xlog_recover *trans, |
2878 | int pass) | 2749 | int pass) |
2879 | { | 2750 | { |
2880 | int error; | 2751 | int error = 0; |
2752 | xlog_recover_item_t *item; | ||
2881 | 2753 | ||
2882 | hlist_del(&trans->r_list); | 2754 | hlist_del(&trans->r_list); |
2883 | if ((error = xlog_recover_do_trans(log, trans, pass))) | 2755 | |
2756 | error = xlog_recover_reorder_trans(log, trans, pass); | ||
2757 | if (error) | ||
2884 | return error; | 2758 | return error; |
2885 | xlog_recover_free_trans(trans); /* no error */ | 2759 | |
2760 | list_for_each_entry(item, &trans->r_itemq, ri_list) { | ||
2761 | if (pass == XLOG_RECOVER_PASS1) | ||
2762 | error = xlog_recover_commit_pass1(log, trans, item); | ||
2763 | else | ||
2764 | error = xlog_recover_commit_pass2(log, trans, item); | ||
2765 | if (error) | ||
2766 | return error; | ||
2767 | } | ||
2768 | |||
2769 | xlog_recover_free_trans(trans); | ||
2886 | return 0; | 2770 | return 0; |
2887 | } | 2771 | } |
2888 | 2772 | ||
2889 | STATIC int | 2773 | STATIC int |
2890 | xlog_recover_unmount_trans( | 2774 | xlog_recover_unmount_trans( |
2775 | struct log *log, | ||
2891 | xlog_recover_t *trans) | 2776 | xlog_recover_t *trans) |
2892 | { | 2777 | { |
2893 | /* Do nothing now */ | 2778 | /* Do nothing now */ |
2894 | xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); | 2779 | xfs_warn(log->l_mp, "%s: Unmount LR", __func__); |
2895 | return 0; | 2780 | return 0; |
2896 | } | 2781 | } |
2897 | 2782 | ||
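This is the heart of the refactor: the removed xlog_recover_do_trans(), whose handlers each tested the pass argument and returned early, is split into xlog_recover_commit_pass1() and xlog_recover_commit_pass2(), and xlog_recover_commit_trans() picks the dispatcher once per item. The shape of that change as a minimal compilable sketch (the item types and handlers are stand-ins, not the XFS ones):

	#include <stdio.h>

	enum item_type { IT_BUF, IT_INODE };
	enum pass { PASS1, PASS2 };

	static int commit_pass1(enum item_type t)
	{
		switch (t) {
		case IT_BUF:	puts("pass1: record cancelled buffer"); return 0;
		case IT_INODE:	return 0;	/* nothing to do in pass 1 */
		}
		return -1;
	}

	static int commit_pass2(enum item_type t)
	{
		switch (t) {
		case IT_BUF:	puts("pass2: replay buffer"); return 0;
		case IT_INODE:	puts("pass2: replay inode"); return 0;
		}
		return -1;
	}

	/* one test of `pass` here replaces a test in every handler */
	static int commit_item(enum item_type t, enum pass pass)
	{
		return pass == PASS1 ? commit_pass1(t) : commit_pass2(t);
	}

	int main(void)
	{
		return commit_item(IT_INODE, PASS2);
	}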
@@ -2934,8 +2819,8 @@ xlog_recover_process_data( | |||
2934 | dp += sizeof(xlog_op_header_t); | 2819 | dp += sizeof(xlog_op_header_t); |
2935 | if (ohead->oh_clientid != XFS_TRANSACTION && | 2820 | if (ohead->oh_clientid != XFS_TRANSACTION && |
2936 | ohead->oh_clientid != XFS_LOG) { | 2821 | ohead->oh_clientid != XFS_LOG) { |
2937 | xlog_warn( | 2822 | xfs_warn(log->l_mp, "%s: bad clientid 0x%x", |
2938 | "XFS: xlog_recover_process_data: bad clientid"); | 2823 | __func__, ohead->oh_clientid); |
2939 | ASSERT(0); | 2824 | ASSERT(0); |
2940 | return (XFS_ERROR(EIO)); | 2825 | return (XFS_ERROR(EIO)); |
2941 | } | 2826 | } |
@@ -2948,8 +2833,8 @@ xlog_recover_process_data( | |||
2948 | be64_to_cpu(rhead->h_lsn)); | 2833 | be64_to_cpu(rhead->h_lsn)); |
2949 | } else { | 2834 | } else { |
2950 | if (dp + be32_to_cpu(ohead->oh_len) > lp) { | 2835 | if (dp + be32_to_cpu(ohead->oh_len) > lp) { |
2951 | xlog_warn( | 2836 | xfs_warn(log->l_mp, "%s: bad length 0x%x", |
2952 | "XFS: xlog_recover_process_data: bad length"); | 2837 | __func__, be32_to_cpu(ohead->oh_len)); |
2953 | WARN_ON(1); | 2838 | WARN_ON(1); |
2954 | return (XFS_ERROR(EIO)); | 2839 | return (XFS_ERROR(EIO)); |
2955 | } | 2840 | } |
@@ -2962,7 +2847,7 @@ xlog_recover_process_data( | |||
2962 | trans, pass); | 2847 | trans, pass); |
2963 | break; | 2848 | break; |
2964 | case XLOG_UNMOUNT_TRANS: | 2849 | case XLOG_UNMOUNT_TRANS: |
2965 | error = xlog_recover_unmount_trans(trans); | 2850 | error = xlog_recover_unmount_trans(log, trans); |
2966 | break; | 2851 | break; |
2967 | case XLOG_WAS_CONT_TRANS: | 2852 | case XLOG_WAS_CONT_TRANS: |
2968 | error = xlog_recover_add_to_cont_trans(log, | 2853 | error = xlog_recover_add_to_cont_trans(log, |
@@ -2970,8 +2855,8 @@ xlog_recover_process_data( | |||
2970 | be32_to_cpu(ohead->oh_len)); | 2855 | be32_to_cpu(ohead->oh_len)); |
2971 | break; | 2856 | break; |
2972 | case XLOG_START_TRANS: | 2857 | case XLOG_START_TRANS: |
2973 | xlog_warn( | 2858 | xfs_warn(log->l_mp, "%s: bad transaction", |
2974 | "XFS: xlog_recover_process_data: bad transaction"); | 2859 | __func__); |
2975 | ASSERT(0); | 2860 | ASSERT(0); |
2976 | error = XFS_ERROR(EIO); | 2861 | error = XFS_ERROR(EIO); |
2977 | break; | 2862 | break; |
@@ -2981,8 +2866,8 @@ xlog_recover_process_data( | |||
2981 | dp, be32_to_cpu(ohead->oh_len)); | 2866 | dp, be32_to_cpu(ohead->oh_len)); |
2982 | break; | 2867 | break; |
2983 | default: | 2868 | default: |
2984 | xlog_warn( | 2869 | xfs_warn(log->l_mp, "%s: bad flag 0x%x", |
2985 | "XFS: xlog_recover_process_data: bad flag"); | 2870 | __func__, flags); |
2986 | ASSERT(0); | 2871 | ASSERT(0); |
2987 | error = XFS_ERROR(EIO); | 2872 | error = XFS_ERROR(EIO); |
2988 | break; | 2873 | break; |
@@ -3012,7 +2897,7 @@ xlog_recover_process_efi( | |||
3012 | xfs_extent_t *extp; | 2897 | xfs_extent_t *extp; |
3013 | xfs_fsblock_t startblock_fsb; | 2898 | xfs_fsblock_t startblock_fsb; |
3014 | 2899 | ||
3015 | ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); | 2900 | ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); |
3016 | 2901 | ||
3017 | /* | 2902 | /* |
3018 | * First check the validity of the extents described by the | 2903 | * First check the validity of the extents described by the |
@@ -3051,7 +2936,7 @@ xlog_recover_process_efi( | |||
3051 | extp->ext_len); | 2936 | extp->ext_len); |
3052 | } | 2937 | } |
3053 | 2938 | ||
3054 | efip->efi_flags |= XFS_EFI_RECOVERED; | 2939 | set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); |
3055 | error = xfs_trans_commit(tp, 0); | 2940 | error = xfs_trans_commit(tp, 0); |
3056 | return error; | 2941 | return error; |
3057 | 2942 | ||
@@ -3108,7 +2993,7 @@ xlog_recover_process_efis( | |||
3108 | * Skip EFIs that we've already processed. | 2993 | * Skip EFIs that we've already processed. |
3109 | */ | 2994 | */ |
3110 | efip = (xfs_efi_log_item_t *)lip; | 2995 | efip = (xfs_efi_log_item_t *)lip; |
3111 | if (efip->efi_flags & XFS_EFI_RECOVERED) { | 2996 | if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { |
3112 | lip = xfs_trans_ail_cursor_next(ailp, &cur); | 2997 | lip = xfs_trans_ail_cursor_next(ailp, &cur); |
3113 | continue; | 2998 | continue; |
3114 | } | 2999 | } |
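The efi_flags accesses above move from plain read-modify-write ("efi_flags |= XFS_EFI_RECOVERED") to set_bit()/test_bit(), which operate atomically on an unsigned long and need no external lock. A rough user-space analogue, with a compiler builtin standing in for set_bit() (DEMO_RECOVERED is illustrative):

	#include <stdio.h>

	#define DEMO_RECOVERED	0	/* a bit number, like XFS_EFI_RECOVERED */

	static unsigned long demo_flags;

	int main(void)
	{
		__atomic_fetch_or(&demo_flags, 1UL << DEMO_RECOVERED,
				  __ATOMIC_RELAXED);		/* ~ set_bit() */
		if (demo_flags & (1UL << DEMO_RECOVERED))	/* ~ test_bit() */
			puts("EFI already recovered, skip it");
		return 0;
	}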
@@ -3167,8 +3052,7 @@ xlog_recover_clear_agi_bucket( | |||
3167 | out_abort: | 3052 | out_abort: |
3168 | xfs_trans_cancel(tp, XFS_TRANS_ABORT); | 3053 | xfs_trans_cancel(tp, XFS_TRANS_ABORT); |
3169 | out_error: | 3054 | out_error: |
3170 | xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " | 3055 | xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); |
3171 | "failed to clear agi %d. Continuing.", agno); | ||
3172 | return; | 3056 | return; |
3173 | } | 3057 | } |
3174 | 3058 | ||
@@ -3419,7 +3303,7 @@ xlog_valid_rec_header( | |||
3419 | if (unlikely( | 3303 | if (unlikely( |
3420 | (!rhead->h_version || | 3304 | (!rhead->h_version || |
3421 | (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { | 3305 | (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { |
3422 | xlog_warn("XFS: %s: unrecognised log version (%d).", | 3306 | xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", |
3423 | __func__, be32_to_cpu(rhead->h_version)); | 3307 | __func__, be32_to_cpu(rhead->h_version)); |
3424 | return XFS_ERROR(EIO); | 3308 | return XFS_ERROR(EIO); |
3425 | } | 3309 | } |
@@ -3585,19 +3469,9 @@ xlog_do_recovery_pass( | |||
3585 | * - order is important. | 3469 | * - order is important. |
3586 | */ | 3470 | */ |
3587 | wrapped_hblks = hblks - split_hblks; | 3471 | wrapped_hblks = hblks - split_hblks; |
3588 | error = XFS_BUF_SET_PTR(hbp, | 3472 | error = xlog_bread_offset(log, 0, |
3589 | offset + BBTOB(split_hblks), | 3473 | wrapped_hblks, hbp, |
3590 | BBTOB(hblks - split_hblks)); | 3474 | offset + BBTOB(split_hblks)); |
3591 | if (error) | ||
3592 | goto bread_err2; | ||
3593 | |||
3594 | error = xlog_bread_noalign(log, 0, | ||
3595 | wrapped_hblks, hbp); | ||
3596 | if (error) | ||
3597 | goto bread_err2; | ||
3598 | |||
3599 | error = XFS_BUF_SET_PTR(hbp, offset, | ||
3600 | BBTOB(hblks)); | ||
3601 | if (error) | 3475 | if (error) |
3602 | goto bread_err2; | 3476 | goto bread_err2; |
3603 | } | 3477 | } |
@@ -3648,19 +3522,9 @@ xlog_do_recovery_pass( | |||
3648 | * _first_, then the log start (LR header end) | 3522 | * _first_, then the log start (LR header end) |
3649 | * - order is important. | 3523 | * - order is important. |
3650 | */ | 3524 | */ |
3651 | error = XFS_BUF_SET_PTR(dbp, | 3525 | error = xlog_bread_offset(log, 0, |
3652 | offset + BBTOB(split_bblks), | 3526 | bblks - split_bblks, dbp, |
3653 | BBTOB(bblks - split_bblks)); | 3527 | offset + BBTOB(split_bblks)); |
3654 | if (error) | ||
3655 | goto bread_err2; | ||
3656 | |||
3657 | error = xlog_bread_noalign(log, wrapped_hblks, | ||
3658 | bblks - split_bblks, | ||
3659 | dbp); | ||
3660 | if (error) | ||
3661 | goto bread_err2; | ||
3662 | |||
3663 | error = XFS_BUF_SET_PTR(dbp, offset, h_size); | ||
3664 | if (error) | 3528 | if (error) |
3665 | goto bread_err2; | 3529 | goto bread_err2; |
3666 | } | 3530 | } |
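Both wrap-around reads collapse the old three-step dance -- re-point the buffer, read, restore the pointer -- into a single xlog_bread_offset() call; note the wrapped data blocks land in dbp, the data buffer the old code used, not in the header buffer hbp. The helper itself is introduced elsewhere in the commit and is not shown in this section; a sketch consistent with the deleted open-coded sequence (not necessarily the exact upstream body) would be:

	STATIC int
	xlog_bread_offset(
		xlog_t		*log,
		xfs_daddr_t	blk_no,		/* block to read from */
		int		nbblks,		/* blocks to read */
		xfs_buf_t	*bp,
		xfs_caddr_t	offset)		/* where in bp to land the data */
	{
		xfs_caddr_t	orig_offset = XFS_BUF_PTR(bp);
		int		orig_len = bp->b_buffer_length;
		int		error, error2;

		error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
		if (error)
			return error;

		error = xlog_bread_noalign(log, blk_no, nbblks, bp);

		/* must reset the buffer pointer even on error */
		error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
		if (error)
			return error;
		return error2;
	}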
@@ -3725,7 +3589,7 @@ xlog_do_log_recovery( | |||
3725 | xfs_daddr_t head_blk, | 3589 | xfs_daddr_t head_blk, |
3726 | xfs_daddr_t tail_blk) | 3590 | xfs_daddr_t tail_blk) |
3727 | { | 3591 | { |
3728 | int error; | 3592 | int error, i; |
3729 | 3593 | ||
3730 | ASSERT(head_blk != tail_blk); | 3594 | ASSERT(head_blk != tail_blk); |
3731 | 3595 | ||
@@ -3733,10 +3597,12 @@ xlog_do_log_recovery( | |||
3733 | * First do a pass to find all of the cancelled buf log items. | 3597 | * First do a pass to find all of the cancelled buf log items. |
3734 | * Store them in the buf_cancel_table for use in the second pass. | 3598 | * Store them in the buf_cancel_table for use in the second pass. |
3735 | */ | 3599 | */ |
3736 | log->l_buf_cancel_table = | 3600 | log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * |
3737 | (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * | 3601 | sizeof(struct list_head), |
3738 | sizeof(xfs_buf_cancel_t*), | ||
3739 | KM_SLEEP); | 3602 | KM_SLEEP); |
3603 | for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) | ||
3604 | INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); | ||
3605 | |||
3740 | error = xlog_do_recovery_pass(log, head_blk, tail_blk, | 3606 | error = xlog_do_recovery_pass(log, head_blk, tail_blk, |
3741 | XLOG_RECOVER_PASS1); | 3607 | XLOG_RECOVER_PASS1); |
3742 | if (error != 0) { | 3608 | if (error != 0) { |
@@ -3755,7 +3621,7 @@ xlog_do_log_recovery( | |||
3755 | int i; | 3621 | int i; |
3756 | 3622 | ||
3757 | for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) | 3623 | for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) |
3758 | ASSERT(log->l_buf_cancel_table[i] == NULL); | 3624 | ASSERT(list_empty(&log->l_buf_cancel_table[i])); |
3759 | } | 3625 | } |
3760 | #endif /* DEBUG */ | 3626 | #endif /* DEBUG */ |
3761 | 3627 | ||
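The buffer-cancellation table changes representation here: an array of xfs_buf_cancel_t pointers, where empty meant NULL, becomes an array of list heads, where empty means list_empty() -- hence the allocation sized by struct list_head and the INIT_LIST_HEAD() loop. A self-contained user-space analogue with a minimal circular list like the kernel's <linux/list.h>:

	#include <stdio.h>
	#include <stdlib.h>

	struct list_head { struct list_head *next, *prev; };

	static void INIT_LIST_HEAD(struct list_head *h)
	{
		h->next = h->prev = h;	/* an empty list points at itself */
	}

	static int list_empty(const struct list_head *h)
	{
		return h->next == h;
	}

	#define TABLE_SIZE 64		/* stands in for XLOG_BC_TABLE_SIZE */

	int main(void)
	{
		struct list_head *table = calloc(TABLE_SIZE, sizeof(*table));
		int i;

		for (i = 0; i < TABLE_SIZE; i++)
			INIT_LIST_HEAD(&table[i]);
		printf("bucket 0 empty: %d\n", list_empty(&table[0]));
		free(table);
		return 0;
	}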
@@ -3817,7 +3683,7 @@ xlog_do_recover( | |||
3817 | XFS_BUF_READ(bp); | 3683 | XFS_BUF_READ(bp); |
3818 | XFS_BUF_UNASYNC(bp); | 3684 | XFS_BUF_UNASYNC(bp); |
3819 | xfsbdstrat(log->l_mp, bp); | 3685 | xfsbdstrat(log->l_mp, bp); |
3820 | error = xfs_iowait(bp); | 3686 | error = xfs_buf_iowait(bp); |
3821 | if (error) { | 3687 | if (error) { |
3822 | xfs_ioerror_alert("xlog_do_recover", | 3688 | xfs_ioerror_alert("xlog_do_recover", |
3823 | log->l_mp, bp, XFS_BUF_ADDR(bp)); | 3689 | log->l_mp, bp, XFS_BUF_ADDR(bp)); |
@@ -3875,10 +3741,9 @@ xlog_recover( | |||
3875 | return error; | 3741 | return error; |
3876 | } | 3742 | } |
3877 | 3743 | ||
3878 | cmn_err(CE_NOTE, | 3744 | xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", |
3879 | "Starting XFS recovery on filesystem: %s (logdev: %s)", | 3745 | log->l_mp->m_logname ? log->l_mp->m_logname |
3880 | log->l_mp->m_fsname, log->l_mp->m_logname ? | 3746 | : "internal"); |
3881 | log->l_mp->m_logname : "internal"); | ||
3882 | 3747 | ||
3883 | error = xlog_do_recover(log, head_blk, tail_blk); | 3748 | error = xlog_do_recover(log, head_blk, tail_blk); |
3884 | log->l_flags |= XLOG_RECOVERY_NEEDED; | 3749 | log->l_flags |= XLOG_RECOVERY_NEEDED; |
@@ -3911,9 +3776,7 @@ xlog_recover_finish( | |||
3911 | int error; | 3776 | int error; |
3912 | error = xlog_recover_process_efis(log); | 3777 | error = xlog_recover_process_efis(log); |
3913 | if (error) { | 3778 | if (error) { |
3914 | cmn_err(CE_ALERT, | 3779 | xfs_alert(log->l_mp, "Failed to recover EFIs"); |
3915 | "Failed to recover EFIs on filesystem: %s", | ||
3916 | log->l_mp->m_fsname); | ||
3917 | return error; | 3780 | return error; |
3918 | } | 3781 | } |
3919 | /* | 3782 | /* |
@@ -3928,15 +3791,12 @@ xlog_recover_finish( | |||
3928 | 3791 | ||
3929 | xlog_recover_check_summary(log); | 3792 | xlog_recover_check_summary(log); |
3930 | 3793 | ||
3931 | cmn_err(CE_NOTE, | 3794 | xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", |
3932 | "Ending XFS recovery on filesystem: %s (logdev: %s)", | 3795 | log->l_mp->m_logname ? log->l_mp->m_logname |
3933 | log->l_mp->m_fsname, log->l_mp->m_logname ? | 3796 | : "internal"); |
3934 | log->l_mp->m_logname : "internal"); | ||
3935 | log->l_flags &= ~XLOG_RECOVERY_NEEDED; | 3797 | log->l_flags &= ~XLOG_RECOVERY_NEEDED; |
3936 | } else { | 3798 | } else { |
3937 | cmn_err(CE_DEBUG, | 3799 | xfs_info(log->l_mp, "Ending clean mount"); |
3938 | "!Ending clean XFS mount for filesystem: %s\n", | ||
3939 | log->l_mp->m_fsname); | ||
3940 | } | 3800 | } |
3941 | return 0; | 3801 | return 0; |
3942 | } | 3802 | } |
@@ -3969,10 +3829,8 @@ xlog_recover_check_summary( | |||
3969 | for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { | 3829 | for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { |
3970 | error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); | 3830 | error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); |
3971 | if (error) { | 3831 | if (error) { |
3972 | xfs_fs_cmn_err(CE_ALERT, mp, | 3832 | xfs_alert(mp, "%s agf read failed agno %d error %d", |
3973 | "xlog_recover_check_summary(agf)" | 3833 | __func__, agno, error); |
3974 | "agf read failed agno %d error %d", | ||
3975 | agno, error); | ||
3976 | } else { | 3834 | } else { |
3977 | agfp = XFS_BUF_TO_AGF(agfbp); | 3835 | agfp = XFS_BUF_TO_AGF(agfbp); |
3978 | freeblks += be32_to_cpu(agfp->agf_freeblks) + | 3836 | freeblks += be32_to_cpu(agfp->agf_freeblks) + |
@@ -3981,7 +3839,10 @@ xlog_recover_check_summary( | |||
3981 | } | 3839 | } |
3982 | 3840 | ||
3983 | error = xfs_read_agi(mp, NULL, agno, &agibp); | 3841 | error = xfs_read_agi(mp, NULL, agno, &agibp); |
3984 | if (!error) { | 3842 | if (error) { |
3843 | xfs_alert(mp, "%s agi read failed agno %d error %d", | ||
3844 | __func__, agno, error); | ||
3845 | } else { | ||
3985 | struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); | 3846 | struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); |
3986 | 3847 | ||
3987 | itotal += be32_to_cpu(agi->agi_count); | 3848 | itotal += be32_to_cpu(agi->agi_count); |
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aeb9d72ebf6e..b49b82363d20 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c | |||
@@ -52,16 +52,11 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, | |||
52 | int); | 52 | int); |
53 | STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, | 53 | STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, |
54 | int); | 54 | int); |
55 | STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, | ||
56 | int64_t, int); | ||
57 | STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); | 55 | STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); |
58 | |||
59 | #else | 56 | #else |
60 | 57 | ||
61 | #define xfs_icsb_balance_counter(mp, a, b) do { } while (0) | 58 | #define xfs_icsb_balance_counter(mp, a, b) do { } while (0) |
62 | #define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) | 59 | #define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) |
63 | #define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) | ||
64 | |||
65 | #endif | 60 | #endif |
66 | 61 | ||
67 | static const struct { | 62 | static const struct { |
@@ -138,9 +133,7 @@ xfs_uuid_mount( | |||
138 | return 0; | 133 | return 0; |
139 | 134 | ||
140 | if (uuid_is_nil(uuid)) { | 135 | if (uuid_is_nil(uuid)) { |
141 | cmn_err(CE_WARN, | 136 | xfs_warn(mp, "Filesystem has nil UUID - can't mount"); |
142 | "XFS: Filesystem %s has nil UUID - can't mount", | ||
143 | mp->m_fsname); | ||
144 | return XFS_ERROR(EINVAL); | 137 | return XFS_ERROR(EINVAL); |
145 | } | 138 | } |
146 | 139 | ||
@@ -168,8 +161,7 @@ xfs_uuid_mount( | |||
168 | 161 | ||
169 | out_duplicate: | 162 | out_duplicate: |
170 | mutex_unlock(&xfs_uuid_table_mutex); | 163 | mutex_unlock(&xfs_uuid_table_mutex); |
171 | cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount", | 164 | xfs_warn(mp, "Filesystem has duplicate UUID - can't mount"); |
172 | mp->m_fsname); | ||
173 | return XFS_ERROR(EINVAL); | 165 | return XFS_ERROR(EINVAL); |
174 | } | 166 | } |
175 | 167 | ||
@@ -199,6 +191,8 @@ xfs_uuid_unmount( | |||
199 | 191 | ||
200 | /* | 192 | /* |
201 | * Reference counting access wrappers to the perag structures. | 193 | * Reference counting access wrappers to the perag structures. |
194 | * Because we never free per-ag structures, the only thing we | ||
195 | * have to protect against changes is the tree structure itself. | ||
202 | */ | 196 | */ |
203 | struct xfs_perag * | 197 | struct xfs_perag * |
204 | xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) | 198 | xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) |
@@ -206,19 +200,43 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) | |||
206 | struct xfs_perag *pag; | 200 | struct xfs_perag *pag; |
207 | int ref = 0; | 201 | int ref = 0; |
208 | 202 | ||
209 | spin_lock(&mp->m_perag_lock); | 203 | rcu_read_lock(); |
210 | pag = radix_tree_lookup(&mp->m_perag_tree, agno); | 204 | pag = radix_tree_lookup(&mp->m_perag_tree, agno); |
211 | if (pag) { | 205 | if (pag) { |
212 | ASSERT(atomic_read(&pag->pag_ref) >= 0); | 206 | ASSERT(atomic_read(&pag->pag_ref) >= 0); |
213 | /* catch leaks in the positive direction during testing */ | ||
214 | ASSERT(atomic_read(&pag->pag_ref) < 1000); | ||
215 | ref = atomic_inc_return(&pag->pag_ref); | 207 | ref = atomic_inc_return(&pag->pag_ref); |
216 | } | 208 | } |
217 | spin_unlock(&mp->m_perag_lock); | 209 | rcu_read_unlock(); |
218 | trace_xfs_perag_get(mp, agno, ref, _RET_IP_); | 210 | trace_xfs_perag_get(mp, agno, ref, _RET_IP_); |
219 | return pag; | 211 | return pag; |
220 | } | 212 | } |
221 | 213 | ||
214 | /* | ||
215 | * search from @first to find the next perag with the given tag set. | ||
216 | */ | ||
217 | struct xfs_perag * | ||
218 | xfs_perag_get_tag( | ||
219 | struct xfs_mount *mp, | ||
220 | xfs_agnumber_t first, | ||
221 | int tag) | ||
222 | { | ||
223 | struct xfs_perag *pag; | ||
224 | int found; | ||
225 | int ref; | ||
226 | |||
227 | rcu_read_lock(); | ||
228 | found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, | ||
229 | (void **)&pag, first, 1, tag); | ||
230 | if (found <= 0) { | ||
231 | rcu_read_unlock(); | ||
232 | return NULL; | ||
233 | } | ||
234 | ref = atomic_inc_return(&pag->pag_ref); | ||
235 | rcu_read_unlock(); | ||
236 | trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); | ||
237 | return pag; | ||
238 | } | ||
239 | |||
222 | void | 240 | void |
223 | xfs_perag_put(struct xfs_perag *pag) | 241 | xfs_perag_put(struct xfs_perag *pag) |
224 | { | 242 | { |
@@ -229,10 +247,18 @@ xfs_perag_put(struct xfs_perag *pag) | |||
229 | trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); | 247 | trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); |
230 | } | 248 | } |
231 | 249 | ||
250 | STATIC void | ||
251 | __xfs_free_perag( | ||
252 | struct rcu_head *head) | ||
253 | { | ||
254 | struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); | ||
255 | |||
256 | ASSERT(atomic_read(&pag->pag_ref) == 0); | ||
257 | kmem_free(pag); | ||
258 | } | ||
259 | |||
232 | /* | 260 | /* |
233 | * Free up the resources associated with a mount structure. Assume that | 261 | * Free up the per-ag resources associated with the mount structure. |
234 | * the structure was initially zeroed, so we can tell which fields got | ||
235 | * initialized. | ||
236 | */ | 262 | */ |
237 | STATIC void | 263 | STATIC void |
238 | xfs_free_perag( | 264 | xfs_free_perag( |
@@ -244,10 +270,10 @@ xfs_free_perag( | |||
244 | for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { | 270 | for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { |
245 | spin_lock(&mp->m_perag_lock); | 271 | spin_lock(&mp->m_perag_lock); |
246 | pag = radix_tree_delete(&mp->m_perag_tree, agno); | 272 | pag = radix_tree_delete(&mp->m_perag_tree, agno); |
273 | spin_unlock(&mp->m_perag_lock); | ||
247 | ASSERT(pag); | 274 | ASSERT(pag); |
248 | ASSERT(atomic_read(&pag->pag_ref) == 0); | 275 | ASSERT(atomic_read(&pag->pag_ref) == 0); |
249 | spin_unlock(&mp->m_perag_lock); | 276 | call_rcu(&pag->rcu_head, __xfs_free_perag); |
250 | kmem_free(pag); | ||
251 | } | 277 | } |
252 | } | 278 | } |
253 | 279 | ||
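The per-ag lookup and free above form the classic RCU pairing: xfs_perag_get() now walks the radix tree under rcu_read_lock() and takes a reference, while xfs_free_perag() deletes under the spinlock and defers the kmem_free() through call_rcu(), so lockless readers drain before the memory disappears. A user-space analogue assuming liburcu (link with -lurcu; its API intentionally mirrors the kernel's, and demo_perag is a stand-in):

	#include <stddef.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <urcu.h>	/* rcu_register_thread(), call_rcu(), ... */

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct demo_perag {	/* stand-in for struct xfs_perag */
		int		agno;
		struct rcu_head	rcu_head;
	};

	static void demo_free_perag(struct rcu_head *head)
	{
		struct demo_perag *pag =
			container_of(head, struct demo_perag, rcu_head);

		printf("freeing ag %d after the grace period\n", pag->agno);
		free(pag);
	}

	int main(void)
	{
		struct demo_perag *pag = malloc(sizeof(*pag));

		pag->agno = 0;
		rcu_register_thread();
		/* ...radix_tree_delete() would happen here, under the lock... */
		call_rcu(&pag->rcu_head, demo_free_perag);
		rcu_barrier();	/* wait for pending callbacks to finish */
		rcu_unregister_thread();
		return 0;
	}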
@@ -282,6 +308,8 @@ xfs_mount_validate_sb( | |||
282 | xfs_sb_t *sbp, | 308 | xfs_sb_t *sbp, |
283 | int flags) | 309 | int flags) |
284 | { | 310 | { |
311 | int loud = !(flags & XFS_MFSI_QUIET); | ||
312 | |||
285 | /* | 313 | /* |
286 | * If the log device and data device have the | 314 | * If the log device and data device have the |
287 | * same device number, the log is internal. | 315 | * same device number, the log is internal. |
@@ -290,28 +318,32 @@ xfs_mount_validate_sb( | |||
290 | * a volume filesystem in a non-volume manner. | 318 | * a volume filesystem in a non-volume manner. |
291 | */ | 319 | */ |
292 | if (sbp->sb_magicnum != XFS_SB_MAGIC) { | 320 | if (sbp->sb_magicnum != XFS_SB_MAGIC) { |
293 | xfs_fs_mount_cmn_err(flags, "bad magic number"); | 321 | if (loud) |
322 | xfs_warn(mp, "bad magic number"); | ||
294 | return XFS_ERROR(EWRONGFS); | 323 | return XFS_ERROR(EWRONGFS); |
295 | } | 324 | } |
296 | 325 | ||
297 | if (!xfs_sb_good_version(sbp)) { | 326 | if (!xfs_sb_good_version(sbp)) { |
298 | xfs_fs_mount_cmn_err(flags, "bad version"); | 327 | if (loud) |
328 | xfs_warn(mp, "bad version"); | ||
299 | return XFS_ERROR(EWRONGFS); | 329 | return XFS_ERROR(EWRONGFS); |
300 | } | 330 | } |
301 | 331 | ||
302 | if (unlikely( | 332 | if (unlikely( |
303 | sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { | 333 | sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { |
304 | xfs_fs_mount_cmn_err(flags, | 334 | if (loud) |
305 | "filesystem is marked as having an external log; " | 335 | xfs_warn(mp, |
306 | "specify logdev on the\nmount command line."); | 336 | "filesystem is marked as having an external log; " |
337 | "specify logdev on the mount command line."); | ||
307 | return XFS_ERROR(EINVAL); | 338 | return XFS_ERROR(EINVAL); |
308 | } | 339 | } |
309 | 340 | ||
310 | if (unlikely( | 341 | if (unlikely( |
311 | sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { | 342 | sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { |
312 | xfs_fs_mount_cmn_err(flags, | 343 | if (loud) |
313 | "filesystem is marked as having an internal log; " | 344 | xfs_warn(mp, |
314 | "do not specify logdev on\nthe mount command line."); | 345 | "filesystem is marked as having an internal log; " |
346 | "do not specify logdev on the mount command line."); | ||
315 | return XFS_ERROR(EINVAL); | 347 | return XFS_ERROR(EINVAL); |
316 | } | 348 | } |
317 | 349 | ||
@@ -340,7 +372,8 @@ xfs_mount_validate_sb( | |||
340 | (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || | 372 | (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || |
341 | (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || | 373 | (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || |
342 | (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { | 374 | (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { |
343 | xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); | 375 | if (loud) |
376 | xfs_warn(mp, "SB sanity check 1 failed"); | ||
344 | return XFS_ERROR(EFSCORRUPTED); | 377 | return XFS_ERROR(EFSCORRUPTED); |
345 | } | 378 | } |
346 | 379 | ||
@@ -353,7 +386,8 @@ xfs_mount_validate_sb( | |||
353 | (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || | 386 | (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || |
354 | sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * | 387 | sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * |
355 | sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { | 388 | sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { |
356 | xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); | 389 | if (loud) |
390 | xfs_warn(mp, "SB sanity check 2 failed"); | ||
357 | return XFS_ERROR(EFSCORRUPTED); | 391 | return XFS_ERROR(EFSCORRUPTED); |
358 | } | 392 | } |
359 | 393 | ||
@@ -361,12 +395,12 @@ xfs_mount_validate_sb( | |||
361 | * Until this is fixed only page-sized or smaller data blocks work. | 395 | * Until this is fixed only page-sized or smaller data blocks work. |
362 | */ | 396 | */ |
363 | if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { | 397 | if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { |
364 | xfs_fs_mount_cmn_err(flags, | 398 | if (loud) { |
365 | "file system with blocksize %d bytes", | 399 | xfs_warn(mp, |
366 | sbp->sb_blocksize); | 400 | "File system with blocksize %d bytes. " |
367 | xfs_fs_mount_cmn_err(flags, | 401 | "Only pagesize (%ld) or less will currently work.", |
368 | "only pagesize (%ld) or less will currently work.", | 402 | sbp->sb_blocksize, PAGE_SIZE); |
369 | PAGE_SIZE); | 403 | } |
370 | return XFS_ERROR(ENOSYS); | 404 | return XFS_ERROR(ENOSYS); |
371 | } | 405 | } |
372 | 406 | ||
@@ -380,21 +414,23 @@ xfs_mount_validate_sb( | |||
380 | case 2048: | 414 | case 2048: |
381 | break; | 415 | break; |
382 | default: | 416 | default: |
383 | xfs_fs_mount_cmn_err(flags, | 417 | if (loud) |
384 | "inode size of %d bytes not supported", | 418 | xfs_warn(mp, "inode size of %d bytes not supported", |
385 | sbp->sb_inodesize); | 419 | sbp->sb_inodesize); |
386 | return XFS_ERROR(ENOSYS); | 420 | return XFS_ERROR(ENOSYS); |
387 | } | 421 | } |
388 | 422 | ||
389 | if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || | 423 | if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || |
390 | xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { | 424 | xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { |
391 | xfs_fs_mount_cmn_err(flags, | 425 | if (loud) |
392 | "file system too large to be mounted on this system."); | 426 | xfs_warn(mp, |
427 | "file system too large to be mounted on this system."); | ||
393 | return XFS_ERROR(EFBIG); | 428 | return XFS_ERROR(EFBIG); |
394 | } | 429 | } |
395 | 430 | ||
396 | if (unlikely(sbp->sb_inprogress)) { | 431 | if (unlikely(sbp->sb_inprogress)) { |
397 | xfs_fs_mount_cmn_err(flags, "file system busy"); | 432 | if (loud) |
433 | xfs_warn(mp, "file system busy"); | ||
398 | return XFS_ERROR(EFSCORRUPTED); | 434 | return XFS_ERROR(EFSCORRUPTED); |
399 | } | 435 | } |
400 | 436 | ||
@@ -402,8 +438,9 @@ xfs_mount_validate_sb( | |||
402 | * Version 1 directory format has never worked on Linux. | 438 | * Version 1 directory format has never worked on Linux. |
403 | */ | 439 | */ |
404 | if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { | 440 | if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { |
405 | xfs_fs_mount_cmn_err(flags, | 441 | if (loud) |
406 | "file system using version 1 directory format"); | 442 | xfs_warn(mp, |
443 | "file system using version 1 directory format"); | ||
407 | return XFS_ERROR(ENOSYS); | 444 | return XFS_ERROR(ENOSYS); |
408 | } | 445 | } |
409 | 446 | ||
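Every check in xfs_mount_validate_sb() gets the same conversion: instead of the self-filtering xfs_fs_mount_cmn_err(flags, ...), the quiet flag is evaluated once into "int loud = !(flags & XFS_MFSI_QUIET)" and ordinary xfs_warn() calls are guarded with it, so probing mounts stay silent. A compilable miniature of the pattern (DEMO_QUIET and demo_validate are stand-ins; 0x58465342 is the real "XFSB" magic):

	#include <stdio.h>

	#define DEMO_QUIET	(1 << 0)	/* stands in for XFS_MFSI_QUIET */

	static int demo_validate(unsigned int magic, int flags)
	{
		int loud = !(flags & DEMO_QUIET);

		if (magic != 0x58465342) {	/* XFS_SB_MAGIC, "XFSB" */
			if (loud)
				fprintf(stderr, "bad magic number\n");
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		return demo_validate(0x58465342, DEMO_QUIET) ? 1 : 0;
	}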
@@ -443,8 +480,11 @@ xfs_initialize_perag( | |||
443 | goto out_unwind; | 480 | goto out_unwind; |
444 | pag->pag_agno = index; | 481 | pag->pag_agno = index; |
445 | pag->pag_mount = mp; | 482 | pag->pag_mount = mp; |
446 | rwlock_init(&pag->pag_ici_lock); | 483 | spin_lock_init(&pag->pag_ici_lock); |
484 | mutex_init(&pag->pag_ici_reclaim_lock); | ||
447 | INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); | 485 | INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); |
486 | spin_lock_init(&pag->pag_buf_lock); | ||
487 | pag->pag_buf_tree = RB_ROOT; | ||
448 | 488 | ||
449 | if (radix_tree_preload(GFP_NOFS)) | 489 | if (radix_tree_preload(GFP_NOFS)) |
450 | goto out_unwind; | 490 | goto out_unwind; |
@@ -639,9 +679,9 @@ int | |||
639 | xfs_readsb(xfs_mount_t *mp, int flags) | 679 | xfs_readsb(xfs_mount_t *mp, int flags) |
640 | { | 680 | { |
641 | unsigned int sector_size; | 681 | unsigned int sector_size; |
642 | unsigned int extra_flags; | ||
643 | xfs_buf_t *bp; | 682 | xfs_buf_t *bp; |
644 | int error; | 683 | int error; |
684 | int loud = !(flags & XFS_MFSI_QUIET); | ||
645 | 685 | ||
646 | ASSERT(mp->m_sb_bp == NULL); | 686 | ASSERT(mp->m_sb_bp == NULL); |
647 | ASSERT(mp->m_ddev_targp != NULL); | 687 | ASSERT(mp->m_ddev_targp != NULL); |
@@ -652,39 +692,37 @@ xfs_readsb(xfs_mount_t *mp, int flags) | |||
652 | * access to the superblock. | 692 | * access to the superblock. |
653 | */ | 693 | */ |
654 | sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); | 694 | sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); |
655 | extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED; | ||
656 | 695 | ||
657 | bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), | 696 | reread: |
658 | extra_flags); | 697 | bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, |
659 | if (!bp || XFS_BUF_ISERROR(bp)) { | 698 | XFS_SB_DADDR, sector_size, 0); |
660 | xfs_fs_mount_cmn_err(flags, "SB read failed"); | 699 | if (!bp) { |
661 | error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; | 700 | if (loud) |
662 | goto fail; | 701 | xfs_warn(mp, "SB buffer read failed"); |
702 | return EIO; | ||
663 | } | 703 | } |
664 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
665 | ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); | ||
666 | 704 | ||
667 | /* | 705 | /* |
668 | * Initialize the mount structure from the superblock. | 706 | * Initialize the mount structure from the superblock. |
669 | * But first do some basic consistency checking. | 707 | * But first do some basic consistency checking. |
670 | */ | 708 | */ |
671 | xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); | 709 | xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); |
672 | |||
673 | error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); | 710 | error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); |
674 | if (error) { | 711 | if (error) { |
675 | xfs_fs_mount_cmn_err(flags, "SB validate failed"); | 712 | if (loud) |
676 | goto fail; | 713 | xfs_warn(mp, "SB validate failed"); |
714 | goto release_buf; | ||
677 | } | 715 | } |
678 | 716 | ||
679 | /* | 717 | /* |
680 | * We must be able to do sector-sized and sector-aligned IO. | 718 | * We must be able to do sector-sized and sector-aligned IO. |
681 | */ | 719 | */ |
682 | if (sector_size > mp->m_sb.sb_sectsize) { | 720 | if (sector_size > mp->m_sb.sb_sectsize) { |
683 | xfs_fs_mount_cmn_err(flags, | 721 | if (loud) |
684 | "device supports only %u byte sectors (not %u)", | 722 | xfs_warn(mp, "device supports %u byte sectors (not %u)", |
685 | sector_size, mp->m_sb.sb_sectsize); | 723 | sector_size, mp->m_sb.sb_sectsize); |
686 | error = ENOSYS; | 724 | error = ENOSYS; |
687 | goto fail; | 725 | goto release_buf; |
688 | } | 726 | } |
689 | 727 | ||
690 | /* | 728 | /* |
@@ -692,33 +730,20 @@ xfs_readsb(xfs_mount_t *mp, int flags) | |||
692 | * re-read the superblock so the buffer is correctly sized. | 730 | * re-read the superblock so the buffer is correctly sized. |
693 | */ | 731 | */ |
694 | if (sector_size < mp->m_sb.sb_sectsize) { | 732 | if (sector_size < mp->m_sb.sb_sectsize) { |
695 | XFS_BUF_UNMANAGE(bp); | ||
696 | xfs_buf_relse(bp); | 733 | xfs_buf_relse(bp); |
697 | sector_size = mp->m_sb.sb_sectsize; | 734 | sector_size = mp->m_sb.sb_sectsize; |
698 | bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, | 735 | goto reread; |
699 | BTOBB(sector_size), extra_flags); | ||
700 | if (!bp || XFS_BUF_ISERROR(bp)) { | ||
701 | xfs_fs_mount_cmn_err(flags, "SB re-read failed"); | ||
702 | error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; | ||
703 | goto fail; | ||
704 | } | ||
705 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
706 | ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); | ||
707 | } | 736 | } |
708 | 737 | ||
709 | /* Initialize per-cpu counters */ | 738 | /* Initialize per-cpu counters */ |
710 | xfs_icsb_reinit_counters(mp); | 739 | xfs_icsb_reinit_counters(mp); |
711 | 740 | ||
712 | mp->m_sb_bp = bp; | 741 | mp->m_sb_bp = bp; |
713 | xfs_buf_relse(bp); | 742 | xfs_buf_unlock(bp); |
714 | ASSERT(XFS_BUF_VALUSEMA(bp) > 0); | ||
715 | return 0; | 743 | return 0; |
716 | 744 | ||
717 | fail: | 745 | release_buf: |
718 | if (bp) { | 746 | xfs_buf_relse(bp); |
719 | XFS_BUF_UNMANAGE(bp); | ||
720 | xfs_buf_relse(bp); | ||
721 | } | ||
722 | return error; | 747 | return error; |
723 | } | 748 | } |
724 | 749 | ||
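The rewritten xfs_readsb() replaces the duplicated second read with a "reread" loop: read the superblock uncached at the device's minimum sector size, validate it, and if the superblock declares a larger sector size, release the buffer and jump back to read it again at the correct size. The control flow as a compilable toy (read_sb() stands in for xfs_buf_read_uncached() plus xfs_sb_from_disk()):

	#include <stdio.h>

	static int read_sb(int sector_size, int *sb_sectsize)
	{
		*sb_sectsize = 4096;	/* pretend the SB declares 4k sectors */
		printf("read %d-byte superblock\n", sector_size);
		return 0;
	}

	int main(void)
	{
		int sector_size = 512;	/* device minimum, the first guess */
		int sb_sectsize;

	reread:
		if (read_sb(sector_size, &sb_sectsize))
			return 1;
		if (sector_size < sb_sectsize) {
			sector_size = sb_sectsize;	/* loop exactly once */
			goto reread;
		}
		return 0;
	}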
@@ -839,8 +864,7 @@ xfs_update_alignment(xfs_mount_t *mp) | |||
839 | if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || | 864 | if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || |
840 | (BBTOB(mp->m_swidth) & mp->m_blockmask)) { | 865 | (BBTOB(mp->m_swidth) & mp->m_blockmask)) { |
841 | if (mp->m_flags & XFS_MOUNT_RETERR) { | 866 | if (mp->m_flags & XFS_MOUNT_RETERR) { |
842 | cmn_err(CE_WARN, | 867 | xfs_warn(mp, "alignment check 1 failed"); |
843 | "XFS: alignment check 1 failed"); | ||
844 | return XFS_ERROR(EINVAL); | 868 | return XFS_ERROR(EINVAL); |
845 | } | 869 | } |
846 | mp->m_dalign = mp->m_swidth = 0; | 870 | mp->m_dalign = mp->m_swidth = 0; |
@@ -853,8 +877,9 @@ xfs_update_alignment(xfs_mount_t *mp) | |||
853 | if (mp->m_flags & XFS_MOUNT_RETERR) { | 877 | if (mp->m_flags & XFS_MOUNT_RETERR) { |
854 | return XFS_ERROR(EINVAL); | 878 | return XFS_ERROR(EINVAL); |
855 | } | 879 | } |
856 | xfs_fs_cmn_err(CE_WARN, mp, | 880 | xfs_warn(mp, |
857 | "stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", | 881 | "stripe alignment turned off: sunit(%d)/swidth(%d) " |
882 | "incompatible with agsize(%d)", | ||
858 | mp->m_dalign, mp->m_swidth, | 883 | mp->m_dalign, mp->m_swidth, |
859 | sbp->sb_agblocks); | 884 | sbp->sb_agblocks); |
860 | 885 | ||
@@ -864,9 +889,9 @@ xfs_update_alignment(xfs_mount_t *mp) | |||
864 | mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); | 889 | mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); |
865 | } else { | 890 | } else { |
866 | if (mp->m_flags & XFS_MOUNT_RETERR) { | 891 | if (mp->m_flags & XFS_MOUNT_RETERR) { |
867 | xfs_fs_cmn_err(CE_WARN, mp, | 892 | xfs_warn(mp, |
868 | "stripe alignment turned off: sunit(%d) less than bsize(%d)", | 893 | "stripe alignment turned off: sunit(%d) less than bsize(%d)", |
869 | mp->m_dalign, | 894 | mp->m_dalign, |
870 | mp->m_blockmask +1); | 895 | mp->m_blockmask +1); |
871 | return XFS_ERROR(EINVAL); | 896 | return XFS_ERROR(EINVAL); |
872 | } | 897 | } |
@@ -961,6 +986,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp) | |||
961 | } | 986 | } |
962 | 987 | ||
963 | /* | 988 | /* |
989 | * precalculate the low space thresholds for dynamic speculative preallocation. | ||
990 | */ | ||
991 | void | ||
992 | xfs_set_low_space_thresholds( | ||
993 | struct xfs_mount *mp) | ||
994 | { | ||
995 | int i; | ||
996 | |||
997 | for (i = 0; i < XFS_LOWSP_MAX; i++) { | ||
998 | __uint64_t space = mp->m_sb.sb_dblocks; | ||
999 | |||
1000 | do_div(space, 100); | ||
1001 | mp->m_low_space[i] = space * (i + 1); | ||
1002 | } | ||
1003 | } | ||
1004 | |||
1005 | |||
1006 | /* | ||
964 | * Set whether we're using inode alignment. | 1007 | * Set whether we're using inode alignment. |
965 | */ | 1008 | */ |
966 | STATIC void | 1009 | STATIC void |
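The new xfs_set_low_space_thresholds() precomputes percentages of the data-block count for the speculative-preallocation throttle: slot i ends up holding (i + 1) percent of sb_dblocks, with do_div() handling the 64-bit division. The same arithmetic in plain C (LOWSP_MAX and the block count are illustrative values):

	#include <inttypes.h>
	#include <stdio.h>

	#define LOWSP_MAX 5	/* stands in for XFS_LOWSP_MAX */

	int main(void)
	{
		uint64_t dblocks = 26214400;	/* e.g. 100 GiB of 4k blocks */
		uint64_t low_space[LOWSP_MAX];
		int i;

		for (i = 0; i < LOWSP_MAX; i++) {
			uint64_t space = dblocks / 100;	/* do_div(space, 100) */

			low_space[i] = space * (i + 1);
			printf("threshold %d: %" PRIu64 " blocks (%d%%)\n",
			       i, low_space[i], i + 1);
		}
		return 0;
	}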
@@ -991,42 +1034,35 @@ xfs_check_sizes(xfs_mount_t *mp) | |||
991 | { | 1034 | { |
992 | xfs_buf_t *bp; | 1035 | xfs_buf_t *bp; |
993 | xfs_daddr_t d; | 1036 | xfs_daddr_t d; |
994 | int error; | ||
995 | 1037 | ||
996 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); | 1038 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); |
997 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { | 1039 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { |
998 | cmn_err(CE_WARN, "XFS: size check 1 failed"); | 1040 | xfs_warn(mp, "filesystem size mismatch detected"); |
999 | return XFS_ERROR(EFBIG); | 1041 | return XFS_ERROR(EFBIG); |
1000 | } | 1042 | } |
1001 | error = xfs_read_buf(mp, mp->m_ddev_targp, | 1043 | bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, |
1002 | d - XFS_FSS_TO_BB(mp, 1), | 1044 | d - XFS_FSS_TO_BB(mp, 1), |
1003 | XFS_FSS_TO_BB(mp, 1), 0, &bp); | 1045 | BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); |
1004 | if (!error) { | 1046 | if (!bp) { |
1005 | xfs_buf_relse(bp); | 1047 | xfs_warn(mp, "last sector read failed"); |
1006 | } else { | 1048 | return EIO; |
1007 | cmn_err(CE_WARN, "XFS: size check 2 failed"); | ||
1008 | if (error == ENOSPC) | ||
1009 | error = XFS_ERROR(EFBIG); | ||
1010 | return error; | ||
1011 | } | 1049 | } |
1050 | xfs_buf_relse(bp); | ||
1012 | 1051 | ||
1013 | if (mp->m_logdev_targp != mp->m_ddev_targp) { | 1052 | if (mp->m_logdev_targp != mp->m_ddev_targp) { |
1014 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); | 1053 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); |
1015 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { | 1054 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { |
1016 | cmn_err(CE_WARN, "XFS: size check 3 failed"); | 1055 | xfs_warn(mp, "log size mismatch detected"); |
1017 | return XFS_ERROR(EFBIG); | 1056 | return XFS_ERROR(EFBIG); |
1018 | } | 1057 | } |
1019 | error = xfs_read_buf(mp, mp->m_logdev_targp, | 1058 | bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, |
1020 | d - XFS_FSB_TO_BB(mp, 1), | 1059 | d - XFS_FSB_TO_BB(mp, 1), |
1021 | XFS_FSB_TO_BB(mp, 1), 0, &bp); | 1060 | XFS_FSB_TO_B(mp, 1), 0); |
1022 | if (!error) { | 1061 | if (!bp) { |
1023 | xfs_buf_relse(bp); | 1062 | xfs_warn(mp, "log device read failed"); |
1024 | } else { | 1063 | return EIO; |
1025 | cmn_err(CE_WARN, "XFS: size check 3 failed"); | ||
1026 | if (error == ENOSPC) | ||
1027 | error = XFS_ERROR(EFBIG); | ||
1028 | return error; | ||
1029 | } | 1064 | } |
1065 | xfs_buf_relse(bp); | ||
1030 | } | 1066 | } |
1031 | return 0; | 1067 | return 0; |
1032 | } | 1068 | } |
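xfs_check_sizes() keeps its logic but swaps xfs_read_buf() for xfs_buf_read_uncached(): work out the last block the superblock claims to own and prove the device really is that big by reading it, for the data device and, when external, the log device. A user-space miniature of the idea, with pread() standing in for the uncached buffer read:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#define SECTOR	512

	static int check_last_sector(int fd, off_t claimed_bytes)
	{
		char buf[SECTOR];

		if (pread(fd, buf, SECTOR, claimed_bytes - SECTOR) != SECTOR) {
			fprintf(stderr, "last sector read failed\n");
			return -1;	/* device smaller than claimed */
		}
		return 0;
	}

	int main(int argc, char **argv)
	{
		int fd = open(argc > 1 ? argv[1] : "/dev/zero", O_RDONLY);

		if (fd < 0)
			return 1;
		return check_last_sector(fd, 4096) ? 1 : 0;
	}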
@@ -1061,7 +1097,7 @@ xfs_mount_reset_sbqflags( | |||
1061 | return 0; | 1097 | return 0; |
1062 | 1098 | ||
1063 | #ifdef QUOTADEBUG | 1099 | #ifdef QUOTADEBUG |
1064 | xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); | 1100 | xfs_notice(mp, "Writing superblock quota changes"); |
1065 | #endif | 1101 | #endif |
1066 | 1102 | ||
1067 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); | 1103 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); |
@@ -1069,8 +1105,7 @@ xfs_mount_reset_sbqflags( | |||
1069 | XFS_DEFAULT_LOG_COUNT); | 1105 | XFS_DEFAULT_LOG_COUNT); |
1070 | if (error) { | 1106 | if (error) { |
1071 | xfs_trans_cancel(tp, 0); | 1107 | xfs_trans_cancel(tp, 0); |
1072 | xfs_fs_cmn_err(CE_ALERT, mp, | 1108 | xfs_alert(mp, "%s: Superblock update failed!", __func__); |
1073 | "xfs_mount_reset_sbqflags: Superblock update failed!"); | ||
1074 | return error; | 1109 | return error; |
1075 | } | 1110 | } |
1076 | 1111 | ||
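Much of this commit is the mechanical conversion from cmn_err()/xfs_fs_cmn_err() to the xfs_warn()/xfs_notice()/xfs_alert() family: the mount pointer identifies the filesystem, so hand-written "XFS:" prefixes and mp->m_fsname arguments disappear, and __func__ replaces spelled-out function names. A user-space stand-in showing the shape of such a helper (the prefix format is assumed for illustration):

#include <stdarg.h>
#include <stdio.h>

struct mount { const char *fsname; };

/* stand-in for xfs_warn()/xfs_notice(): the mount argument lets the
 * helper prefix every message with the filesystem name, so callers no
 * longer embed "XFS: ..." or mp->m_fsname by hand */
static void fs_warn(struct mount *mp, const char *fmt, ...)
{
    va_list ap;

    fprintf(stderr, "XFS (%s): ", mp->fsname);
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
    fputc('\n', stderr);
}

int main(void)
{
    struct mount m = { "sda1" };

    fs_warn(&m, "%s: Superblock update failed!", __func__);
    return 0;
}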
@@ -1136,8 +1171,7 @@ xfs_mountfs( | |||
1136 | * transaction subsystem is online. | 1171 | * transaction subsystem is online. |
1137 | */ | 1172 | */ |
1138 | if (xfs_sb_has_mismatched_features2(sbp)) { | 1173 | if (xfs_sb_has_mismatched_features2(sbp)) { |
1139 | cmn_err(CE_WARN, | 1174 | xfs_warn(mp, "correcting sb_features alignment problem"); |
1140 | "XFS: correcting sb_features alignment problem"); | ||
1141 | sbp->sb_features2 |= sbp->sb_bad_features2; | 1175 | sbp->sb_features2 |= sbp->sb_bad_features2; |
1142 | sbp->sb_bad_features2 = sbp->sb_features2; | 1176 | sbp->sb_bad_features2 = sbp->sb_features2; |
1143 | mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; | 1177 | mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; |
@@ -1189,6 +1223,9 @@ xfs_mountfs( | |||
1189 | */ | 1223 | */ |
1190 | xfs_set_rw_sizes(mp); | 1224 | xfs_set_rw_sizes(mp); |
1191 | 1225 | ||
1226 | /* set the low space thresholds for dynamic preallocation */ | ||
1227 | xfs_set_low_space_thresholds(mp); | ||
1228 | |||
1192 | /* | 1229 | /* |
1193 | * Set the inode cluster size. | 1230 | * Set the inode cluster size. |
1194 | * This may still be overridden by the file system | 1231 | * This may still be overridden by the file system |
@@ -1213,7 +1250,7 @@ xfs_mountfs( | |||
1213 | */ | 1250 | */ |
1214 | error = xfs_rtmount_init(mp); | 1251 | error = xfs_rtmount_init(mp); |
1215 | if (error) { | 1252 | if (error) { |
1216 | cmn_err(CE_WARN, "XFS: RT mount failed"); | 1253 | xfs_warn(mp, "RT mount failed"); |
1217 | goto out_remove_uuid; | 1254 | goto out_remove_uuid; |
1218 | } | 1255 | } |
1219 | 1256 | ||
@@ -1244,12 +1281,12 @@ xfs_mountfs( | |||
1244 | INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); | 1281 | INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); |
1245 | error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); | 1282 | error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); |
1246 | if (error) { | 1283 | if (error) { |
1247 | cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); | 1284 | xfs_warn(mp, "Failed per-ag init: %d", error); |
1248 | goto out_remove_uuid; | 1285 | goto out_remove_uuid; |
1249 | } | 1286 | } |
1250 | 1287 | ||
1251 | if (!sbp->sb_logblocks) { | 1288 | if (!sbp->sb_logblocks) { |
1252 | cmn_err(CE_WARN, "XFS: no log defined"); | 1289 | xfs_warn(mp, "no log defined"); |
1253 | XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); | 1290 | XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); |
1254 | error = XFS_ERROR(EFSCORRUPTED); | 1291 | error = XFS_ERROR(EFSCORRUPTED); |
1255 | goto out_free_perag; | 1292 | goto out_free_perag; |
@@ -1262,7 +1299,7 @@ xfs_mountfs( | |||
1262 | XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), | 1299 | XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), |
1263 | XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); | 1300 | XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); |
1264 | if (error) { | 1301 | if (error) { |
1265 | cmn_err(CE_WARN, "XFS: log mount failed"); | 1302 | xfs_warn(mp, "log mount failed"); |
1266 | goto out_free_perag; | 1303 | goto out_free_perag; |
1267 | } | 1304 | } |
1268 | 1305 | ||
@@ -1299,16 +1336,14 @@ xfs_mountfs( | |||
1299 | */ | 1336 | */ |
1300 | error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); | 1337 | error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); |
1301 | if (error) { | 1338 | if (error) { |
1302 | cmn_err(CE_WARN, "XFS: failed to read root inode"); | 1339 | xfs_warn(mp, "failed to read root inode"); |
1303 | goto out_log_dealloc; | 1340 | goto out_log_dealloc; |
1304 | } | 1341 | } |
1305 | 1342 | ||
1306 | ASSERT(rip != NULL); | 1343 | ASSERT(rip != NULL); |
1307 | 1344 | ||
1308 | if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { | 1345 | if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { |
1309 | cmn_err(CE_WARN, "XFS: corrupted root inode"); | 1346 | xfs_warn(mp, "corrupted root inode %llu: not a directory", |
1310 | cmn_err(CE_WARN, "Device %s - root %llu is not a directory", | ||
1311 | XFS_BUFTARG_NAME(mp->m_ddev_targp), | ||
1312 | (unsigned long long)rip->i_ino); | 1347 | (unsigned long long)rip->i_ino); |
1313 | xfs_iunlock(rip, XFS_ILOCK_EXCL); | 1348 | xfs_iunlock(rip, XFS_ILOCK_EXCL); |
1314 | XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, | 1349 | XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, |
@@ -1328,7 +1363,7 @@ xfs_mountfs( | |||
1328 | /* | 1363 | /* |
1329 | * Free up the root inode. | 1364 | * Free up the root inode. |
1330 | */ | 1365 | */ |
1331 | cmn_err(CE_WARN, "XFS: failed to read RT inodes"); | 1366 | xfs_warn(mp, "failed to read RT inodes"); |
1332 | goto out_rele_rip; | 1367 | goto out_rele_rip; |
1333 | } | 1368 | } |
1334 | 1369 | ||
@@ -1340,7 +1375,7 @@ xfs_mountfs( | |||
1340 | if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { | 1375 | if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { |
1341 | error = xfs_mount_log_sb(mp, mp->m_update_flags); | 1376 | error = xfs_mount_log_sb(mp, mp->m_update_flags); |
1342 | if (error) { | 1377 | if (error) { |
1343 | cmn_err(CE_WARN, "XFS: failed to write sb changes"); | 1378 | xfs_warn(mp, "failed to write sb changes"); |
1344 | goto out_rtunmount; | 1379 | goto out_rtunmount; |
1345 | } | 1380 | } |
1346 | } | 1381 | } |
@@ -1361,10 +1396,7 @@ xfs_mountfs( | |||
1361 | * quotachecked license. | 1396 | * quotachecked license. |
1362 | */ | 1397 | */ |
1363 | if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { | 1398 | if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { |
1364 | cmn_err(CE_NOTE, | 1399 | xfs_notice(mp, "resetting quota flags"); |
1365 | "XFS: resetting qflags for filesystem %s", | ||
1366 | mp->m_fsname); | ||
1367 | |||
1368 | error = xfs_mount_reset_sbqflags(mp); | 1400 | error = xfs_mount_reset_sbqflags(mp); |
1369 | if (error) | 1401 | if (error) |
1370 | return error; | 1402 | return error; |
@@ -1378,7 +1410,7 @@ xfs_mountfs( | |||
1378 | */ | 1410 | */ |
1379 | error = xfs_log_mount_finish(mp); | 1411 | error = xfs_log_mount_finish(mp); |
1380 | if (error) { | 1412 | if (error) { |
1381 | cmn_err(CE_WARN, "XFS: log mount finish failed"); | 1413 | xfs_warn(mp, "log mount finish failed"); |
1382 | goto out_rtunmount; | 1414 | goto out_rtunmount; |
1383 | } | 1415 | } |
1384 | 1416 | ||
@@ -1407,8 +1439,8 @@ xfs_mountfs( | |||
1407 | resblks = xfs_default_resblks(mp); | 1439 | resblks = xfs_default_resblks(mp); |
1408 | error = xfs_reserve_blocks(mp, &resblks, NULL); | 1440 | error = xfs_reserve_blocks(mp, &resblks, NULL); |
1409 | if (error) | 1441 | if (error) |
1410 | cmn_err(CE_WARN, "XFS: Unable to allocate reserve " | 1442 | xfs_warn(mp, |
1411 | "blocks. Continuing without a reserve pool."); | 1443 | "Unable to allocate reserve blocks. Continuing without reserve pool."); |
1412 | } | 1444 | } |
1413 | 1445 | ||
1414 | return 0; | 1446 | return 0; |
@@ -1497,12 +1529,12 @@ xfs_unmountfs( | |||
1497 | resblks = 0; | 1529 | resblks = 0; |
1498 | error = xfs_reserve_blocks(mp, &resblks, NULL); | 1530 | error = xfs_reserve_blocks(mp, &resblks, NULL); |
1499 | if (error) | 1531 | if (error) |
1500 | cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " | 1532 | xfs_warn(mp, "Unable to free reserved block pool. " |
1501 | "Freespace may not be correct on next mount."); | 1533 | "Freespace may not be correct on next mount."); |
1502 | 1534 | ||
1503 | error = xfs_log_sbcount(mp, 1); | 1535 | error = xfs_log_sbcount(mp, 1); |
1504 | if (error) | 1536 | if (error) |
1505 | cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " | 1537 | xfs_warn(mp, "Unable to update superblock counters. " |
1506 | "Freespace may not be correct on next mount."); | 1538 | "Freespace may not be correct on next mount."); |
1507 | xfs_unmountfs_writesb(mp); | 1539 | xfs_unmountfs_writesb(mp); |
1508 | xfs_unmountfs_wait(mp); /* wait for async bufs */ | 1540 | xfs_unmountfs_wait(mp); /* wait for async bufs */ |
@@ -1601,7 +1633,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) | |||
1601 | XFS_BUF_UNASYNC(sbp); | 1633 | XFS_BUF_UNASYNC(sbp); |
1602 | ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); | 1634 | ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); |
1603 | xfsbdstrat(mp, sbp); | 1635 | xfsbdstrat(mp, sbp); |
1604 | error = xfs_iowait(sbp); | 1636 | error = xfs_buf_iowait(sbp); |
1605 | if (error) | 1637 | if (error) |
1606 | xfs_ioerror_alert("xfs_unmountfs_writesb", | 1638 | xfs_ioerror_alert("xfs_unmountfs_writesb", |
1607 | mp, sbp, XFS_BUF_ADDR(sbp)); | 1639 | mp, sbp, XFS_BUF_ADDR(sbp)); |
@@ -1832,135 +1864,72 @@ xfs_mod_incore_sb_unlocked( | |||
1832 | */ | 1864 | */ |
1833 | int | 1865 | int |
1834 | xfs_mod_incore_sb( | 1866 | xfs_mod_incore_sb( |
1835 | xfs_mount_t *mp, | 1867 | struct xfs_mount *mp, |
1836 | xfs_sb_field_t field, | 1868 | xfs_sb_field_t field, |
1837 | int64_t delta, | 1869 | int64_t delta, |
1838 | int rsvd) | 1870 | int rsvd) |
1839 | { | 1871 | { |
1840 | int status; | 1872 | int status; |
1841 | 1873 | ||
1842 | /* check for per-cpu counters */ | ||
1843 | switch (field) { | ||
1844 | #ifdef HAVE_PERCPU_SB | 1874 | #ifdef HAVE_PERCPU_SB |
1845 | case XFS_SBS_ICOUNT: | 1875 | ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); |
1846 | case XFS_SBS_IFREE: | ||
1847 | case XFS_SBS_FDBLOCKS: | ||
1848 | if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { | ||
1849 | status = xfs_icsb_modify_counters(mp, field, | ||
1850 | delta, rsvd); | ||
1851 | break; | ||
1852 | } | ||
1853 | /* FALLTHROUGH */ | ||
1854 | #endif | 1876 | #endif |
1855 | default: | 1877 | spin_lock(&mp->m_sb_lock); |
1856 | spin_lock(&mp->m_sb_lock); | 1878 | status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); |
1857 | status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); | 1879 | spin_unlock(&mp->m_sb_lock); |
1858 | spin_unlock(&mp->m_sb_lock); | ||
1859 | break; | ||
1860 | } | ||
1861 | 1880 | ||
1862 | return status; | 1881 | return status; |
1863 | } | 1882 | } |
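After this rewrite the per-cpu superblock fields (XFS_SBS_ICOUNT through XFS_SBS_FDBLOCKS) may no longer be funneled through xfs_mod_incore_sb(); the new ASSERT enforces the split, and hot paths such as xfs_trans_reserve() (see the xfs_trans.c hunk near the end of this diff) call xfs_icsb_modify_counters() directly. A small runnable sketch of that contract, with assert() standing in for ASSERT and stubbed counter routines:

#include <assert.h>

enum sb_field { SBS_ICOUNT, SBS_IFREE, SBS_FDBLOCKS, SBS_FREXTENTS };

/* fast path: per-cpu counter machinery (stubbed here) */
static int icsb_modify_counters(enum sb_field f, long delta) { return 0; }

/* slow path: single global lock; asserts it is never handed a per-cpu
 * field, mirroring the ASSERT added to xfs_mod_incore_sb() above */
static int mod_incore_sb(enum sb_field f, long delta)
{
    assert(f < SBS_ICOUNT || f > SBS_FDBLOCKS);
    /* lock, apply delta, unlock */
    return 0;
}

int main(void)
{
    icsb_modify_counters(SBS_FDBLOCKS, -8);   /* hot counters go here */
    mod_incore_sb(SBS_FREXTENTS, -1);         /* everything else here */
    return 0;
}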
1864 | 1883 | ||
1865 | /* | 1884 | /* |
1866 | * xfs_mod_incore_sb_batch() is used to change more than one field | 1885 | * Change more than one field in the in-core superblock structure at a time. |
1867 | * in the in-core superblock structure at a time. This modification | ||
1868 | * is protected by a lock internal to this module. The fields and | ||
1869 | * changes to those fields are specified in the array of xfs_mod_sb | ||
1870 | * structures passed in. | ||
1871 | * | 1886 | * |
1872 | * Either all of the specified deltas will be applied or none of | 1887 | * The fields and changes to those fields are specified in the array of |
1873 | * them will. If any modified field dips below 0, then all modifications | 1888 | * xfs_mod_sb structures passed in. Either all of the specified deltas |
1874 | * will be backed out and EINVAL will be returned. | 1889 | * will be applied or none of them will. If any modified field dips below 0, |
1890 | * then all modifications will be backed out and EINVAL will be returned. | ||
1891 | * | ||
1892 | * Note that this function may not be used for the superblock values that | ||
1893 | * are tracked with the in-memory per-cpu counters - a direct call to | ||
1894 | * xfs_icsb_modify_counters is required for these. | ||
1875 | */ | 1895 | */ |
1876 | int | 1896 | int |
1877 | xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) | 1897 | xfs_mod_incore_sb_batch( |
1898 | struct xfs_mount *mp, | ||
1899 | xfs_mod_sb_t *msb, | ||
1900 | uint nmsb, | ||
1901 | int rsvd) | ||
1878 | { | 1902 | { |
1879 | int status=0; | 1903 | xfs_mod_sb_t *msbp; |
1880 | xfs_mod_sb_t *msbp; | 1904 | int error = 0; |
1881 | 1905 | ||
1882 | /* | 1906 | /* |
1883 | * Loop through the array of mod structures and apply each | 1907 | * Loop through the array of mod structures and apply each individually. |
1884 | * individually. If any fail, then back out all those | 1908 | * If any fail, then back out all those which have already been applied. |
1885 | * which have already been applied. Do all of this within | 1909 | * Do all of this within the scope of the m_sb_lock so that all of the |
1886 | * the scope of the m_sb_lock so that all of the changes will | 1910 | * changes will be atomic. |
1887 | * be atomic. | ||
1888 | */ | 1911 | */ |
1889 | spin_lock(&mp->m_sb_lock); | 1912 | spin_lock(&mp->m_sb_lock); |
1890 | msbp = &msb[0]; | 1913 | for (msbp = msb; msbp < (msb + nmsb); msbp++) { |
1891 | for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { | 1914 | ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || |
1892 | /* | 1915 | msbp->msb_field > XFS_SBS_FDBLOCKS); |
1893 | * Apply the delta at index n. If it fails, break | ||
1894 | * from the loop so we'll fall into the undo loop | ||
1895 | * below. | ||
1896 | */ | ||
1897 | switch (msbp->msb_field) { | ||
1898 | #ifdef HAVE_PERCPU_SB | ||
1899 | case XFS_SBS_ICOUNT: | ||
1900 | case XFS_SBS_IFREE: | ||
1901 | case XFS_SBS_FDBLOCKS: | ||
1902 | if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { | ||
1903 | spin_unlock(&mp->m_sb_lock); | ||
1904 | status = xfs_icsb_modify_counters(mp, | ||
1905 | msbp->msb_field, | ||
1906 | msbp->msb_delta, rsvd); | ||
1907 | spin_lock(&mp->m_sb_lock); | ||
1908 | break; | ||
1909 | } | ||
1910 | /* FALLTHROUGH */ | ||
1911 | #endif | ||
1912 | default: | ||
1913 | status = xfs_mod_incore_sb_unlocked(mp, | ||
1914 | msbp->msb_field, | ||
1915 | msbp->msb_delta, rsvd); | ||
1916 | break; | ||
1917 | } | ||
1918 | 1916 | ||
1919 | if (status != 0) { | 1917 | error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, |
1920 | break; | 1918 | msbp->msb_delta, rsvd); |
1921 | } | 1919 | if (error) |
1920 | goto unwind; | ||
1922 | } | 1921 | } |
1922 | spin_unlock(&mp->m_sb_lock); | ||
1923 | return 0; | ||
1923 | 1924 | ||
1924 | /* | 1925 | unwind: |
1925 | * If we didn't complete the loop above, then back out | 1926 | while (--msbp >= msb) { |
1926 | * any changes made to the superblock. If you add code | 1927 | error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, |
1927 | * between the loop above and here, make sure that you | 1928 | -msbp->msb_delta, rsvd); |
1928 | * preserve the value of status. Loop back until | 1929 | ASSERT(error == 0); |
1929 | * we step below the beginning of the array. Make sure | ||
1930 | * we don't touch anything back there. | ||
1931 | */ | ||
1932 | if (status != 0) { | ||
1933 | msbp--; | ||
1934 | while (msbp >= msb) { | ||
1935 | switch (msbp->msb_field) { | ||
1936 | #ifdef HAVE_PERCPU_SB | ||
1937 | case XFS_SBS_ICOUNT: | ||
1938 | case XFS_SBS_IFREE: | ||
1939 | case XFS_SBS_FDBLOCKS: | ||
1940 | if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { | ||
1941 | spin_unlock(&mp->m_sb_lock); | ||
1942 | status = xfs_icsb_modify_counters(mp, | ||
1943 | msbp->msb_field, | ||
1944 | -(msbp->msb_delta), | ||
1945 | rsvd); | ||
1946 | spin_lock(&mp->m_sb_lock); | ||
1947 | break; | ||
1948 | } | ||
1949 | /* FALLTHROUGH */ | ||
1950 | #endif | ||
1951 | default: | ||
1952 | status = xfs_mod_incore_sb_unlocked(mp, | ||
1953 | msbp->msb_field, | ||
1954 | -(msbp->msb_delta), | ||
1955 | rsvd); | ||
1956 | break; | ||
1957 | } | ||
1958 | ASSERT(status == 0); | ||
1959 | msbp--; | ||
1960 | } | ||
1961 | } | 1930 | } |
1962 | spin_unlock(&mp->m_sb_lock); | 1931 | spin_unlock(&mp->m_sb_lock); |
1963 | return status; | 1932 | return error; |
1964 | } | 1933 | } |
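The rewritten batch function is the classic all-or-nothing pattern: apply each delta in order under one lock, and on the first failure walk backwards over the entries already applied, negating each delta. A self-contained sketch of the same idea (index-based rather than the kernel's pointer walk, but equivalent):

#include <stdio.h>

struct mod { int field; long delta; };

static long counters[4];

static int apply(int field, long delta)
{
    if (counters[field] + delta < 0)
        return -1;                    /* would dip below zero */
    counters[field] += delta;
    return 0;
}

static int apply_batch(const struct mod *msb, unsigned nmsb)
{
    unsigned i;
    int error = 0;

    for (i = 0; i < nmsb; i++) {
        error = apply(msb[i].field, msb[i].delta);
        if (error)
            goto unwind;
    }
    return 0;

unwind:
    while (i--)                       /* back out only what was applied */
        apply(msb[i].field, -msb[i].delta);
    return error;
}

int main(void)
{
    struct mod batch[] = { { 0, 5 }, { 1, -3 } };

    /* second entry fails, so the first is unwound and -1 is returned */
    printf("%d\n", apply_batch(batch, 2));
    return 0;
}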
1965 | 1934 | ||
1966 | /* | 1935 | /* |
@@ -1998,18 +1967,13 @@ xfs_getsb( | |||
1998 | */ | 1967 | */ |
1999 | void | 1968 | void |
2000 | xfs_freesb( | 1969 | xfs_freesb( |
2001 | xfs_mount_t *mp) | 1970 | struct xfs_mount *mp) |
2002 | { | 1971 | { |
2003 | xfs_buf_t *bp; | 1972 | struct xfs_buf *bp = mp->m_sb_bp; |
2004 | 1973 | ||
2005 | /* | 1974 | xfs_buf_lock(bp); |
2006 | * Use xfs_getsb() so that the buffer will be locked | ||
2007 | * when we call xfs_buf_relse(). | ||
2008 | */ | ||
2009 | bp = xfs_getsb(mp, 0); | ||
2010 | XFS_BUF_UNMANAGE(bp); | ||
2011 | xfs_buf_relse(bp); | ||
2012 | mp->m_sb_bp = NULL; | 1975 | mp->m_sb_bp = NULL; |
1976 | xfs_buf_relse(bp); | ||
2013 | } | 1977 | } |
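The new xfs_freesb() no longer takes a second reference via xfs_getsb(); it locks the buffer it already owns, detaches mp->m_sb_bp while the lock is held, and only then drops the buffer. A generic illustration of that detach-under-lock ordering, with a pthread mutex standing in for the buffer lock (purely illustrative, not the XFS buffer API):

#include <pthread.h>
#include <stdlib.h>

struct buffer {
    pthread_mutex_t lock;
    /* ... data ... */
};

static struct buffer *shared_bp;      /* analogous to mp->m_sb_bp */

static void free_shared_buffer(void)
{
    struct buffer *bp = shared_bp;

    pthread_mutex_lock(&bp->lock);    /* xfs_buf_lock(bp) */
    shared_bp = NULL;                 /* detach before the final release */
    pthread_mutex_unlock(&bp->lock);  /* xfs_buf_relse() unlocks and... */
    free(bp);                         /* ...drops the last reference */
}

int main(void)
{
    shared_bp = calloc(1, sizeof(*shared_bp));
    pthread_mutex_init(&shared_bp->lock, NULL);
    free_shared_buffer();
    return 0;
}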
2014 | 1978 | ||
2015 | /* | 1979 | /* |
@@ -2053,10 +2017,8 @@ xfs_dev_is_read_only( | |||
2053 | if (xfs_readonly_buftarg(mp->m_ddev_targp) || | 2017 | if (xfs_readonly_buftarg(mp->m_ddev_targp) || |
2054 | xfs_readonly_buftarg(mp->m_logdev_targp) || | 2018 | xfs_readonly_buftarg(mp->m_logdev_targp) || |
2055 | (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { | 2019 | (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { |
2056 | cmn_err(CE_NOTE, | 2020 | xfs_notice(mp, "%s required on read-only device.", message); |
2057 | "XFS: %s required on read-only device.", message); | 2021 | xfs_notice(mp, "write access unavailable, cannot proceed."); |
2058 | cmn_err(CE_NOTE, | ||
2059 | "XFS: write access unavailable, cannot proceed."); | ||
2060 | return EROFS; | 2022 | return EROFS; |
2061 | } | 2023 | } |
2062 | return 0; | 2024 | return 0; |
@@ -2496,7 +2458,7 @@ xfs_icsb_balance_counter( | |||
2496 | spin_unlock(&mp->m_sb_lock); | 2458 | spin_unlock(&mp->m_sb_lock); |
2497 | } | 2459 | } |
2498 | 2460 | ||
2499 | STATIC int | 2461 | int |
2500 | xfs_icsb_modify_counters( | 2462 | xfs_icsb_modify_counters( |
2501 | xfs_mount_t *mp, | 2463 | xfs_mount_t *mp, |
2502 | xfs_sb_field_t field, | 2464 | xfs_sb_field_t field, |
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 622da2179a57..3d68bb267c5f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h | |||
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations { | |||
53 | 53 | ||
54 | #include "xfs_sync.h" | 54 | #include "xfs_sync.h" |
55 | 55 | ||
56 | struct cred; | ||
57 | struct log; | 56 | struct log; |
58 | struct xfs_mount_args; | 57 | struct xfs_mount_args; |
59 | struct xfs_inode; | 58 | struct xfs_inode; |
@@ -91,6 +90,8 @@ extern void xfs_icsb_reinit_counters(struct xfs_mount *); | |||
91 | extern void xfs_icsb_destroy_counters(struct xfs_mount *); | 90 | extern void xfs_icsb_destroy_counters(struct xfs_mount *); |
92 | extern void xfs_icsb_sync_counters(struct xfs_mount *, int); | 91 | extern void xfs_icsb_sync_counters(struct xfs_mount *, int); |
93 | extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); | 92 | extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); |
93 | extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, | ||
94 | int64_t, int); | ||
94 | 95 | ||
95 | #else | 96 | #else |
96 | #define xfs_icsb_init_counters(mp) (0) | 97 | #define xfs_icsb_init_counters(mp) (0) |
@@ -98,8 +99,20 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); | |||
98 | #define xfs_icsb_reinit_counters(mp) do { } while (0) | 99 | #define xfs_icsb_reinit_counters(mp) do { } while (0) |
99 | #define xfs_icsb_sync_counters(mp, flags) do { } while (0) | 100 | #define xfs_icsb_sync_counters(mp, flags) do { } while (0) |
100 | #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) | 101 | #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) |
102 | #define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ | ||
103 | xfs_mod_incore_sb(mp, field, delta, rsvd) | ||
101 | #endif | 104 | #endif |
102 | 105 | ||
106 | /* dynamic preallocation free space thresholds, 5% down to 1% */ | ||
107 | enum { | ||
108 | XFS_LOWSP_1_PCNT = 0, | ||
109 | XFS_LOWSP_2_PCNT, | ||
110 | XFS_LOWSP_3_PCNT, | ||
111 | XFS_LOWSP_4_PCNT, | ||
112 | XFS_LOWSP_5_PCNT, | ||
113 | XFS_LOWSP_MAX, | ||
114 | }; | ||
115 | |||
103 | typedef struct xfs_mount { | 116 | typedef struct xfs_mount { |
104 | struct super_block *m_super; | 117 | struct super_block *m_super; |
105 | xfs_tid_t m_tid; /* next unused tid for fs */ | 118 | xfs_tid_t m_tid; /* next unused tid for fs */ |
@@ -190,15 +203,14 @@ typedef struct xfs_mount { | |||
190 | struct mutex m_icsb_mutex; /* balancer sync lock */ | 203 | struct mutex m_icsb_mutex; /* balancer sync lock */ |
191 | #endif | 204 | #endif |
192 | struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ | 205 | struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ |
193 | struct task_struct *m_sync_task; /* generalised sync thread */ | 206 | struct delayed_work m_sync_work; /* background sync work */ |
194 | xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ | 207 | struct delayed_work m_reclaim_work; /* background inode reclaim */ |
195 | struct list_head m_sync_list; /* sync thread work item list */ | 208 | struct work_struct m_flush_work; /* background inode flush */ |
196 | spinlock_t m_sync_lock; /* work item list lock */ | ||
197 | int m_sync_seq; /* sync thread generation no. */ | ||
198 | wait_queue_head_t m_wait_single_sync_task; | ||
199 | __int64_t m_update_flags; /* sb flags we need to update | 209 | __int64_t m_update_flags; /* sb flags we need to update |
200 | on the next remount,rw */ | 210 | on the next remount,rw */ |
201 | struct shrinker m_inode_shrink; /* inode reclaim shrinker */ | 211 | struct shrinker m_inode_shrink; /* inode reclaim shrinker */ |
212 | int64_t m_low_space[XFS_LOWSP_MAX]; | ||
213 | /* low free space thresholds */ | ||
202 | } xfs_mount_t; | 214 | } xfs_mount_t; |
203 | 215 | ||
204 | /* | 216 | /* |
@@ -212,6 +224,7 @@ typedef struct xfs_mount { | |||
212 | #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem | 224 | #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem |
213 | operations, typically for | 225 | operations, typically for |
214 | disk errors in metadata */ | 226 | disk errors in metadata */ |
227 | #define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ | ||
215 | #define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to | 228 | #define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to |
216 | user */ | 229 | user */ |
217 | #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment | 230 | #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment |
@@ -232,8 +245,6 @@ typedef struct xfs_mount { | |||
232 | #define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ | 245 | #define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ |
233 | #define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred | 246 | #define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred |
234 | * I/O size in stat() */ | 247 | * I/O size in stat() */ |
235 | #define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock | ||
236 | counters */ | ||
237 | #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams | 248 | #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams |
238 | allocator */ | 249 | allocator */ |
239 | #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ | 250 | #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ |
@@ -327,6 +338,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) | |||
327 | * perag get/put wrappers for ref counting | 338 | * perag get/put wrappers for ref counting |
328 | */ | 339 | */ |
329 | struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); | 340 | struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); |
341 | struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, | ||
342 | int tag); | ||
330 | void xfs_perag_put(struct xfs_perag *pag); | 343 | void xfs_perag_put(struct xfs_perag *pag); |
331 | 344 | ||
332 | /* | 345 | /* |
@@ -376,6 +389,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); | |||
376 | 389 | ||
377 | extern int xfs_dev_is_read_only(struct xfs_mount *, char *); | 390 | extern int xfs_dev_is_read_only(struct xfs_mount *, char *); |
378 | 391 | ||
392 | extern void xfs_set_low_space_thresholds(struct xfs_mount *); | ||
393 | |||
379 | #endif /* __KERNEL__ */ | 394 | #endif /* __KERNEL__ */ |
380 | 395 | ||
381 | extern void xfs_mod_sb(struct xfs_trans *, __int64_t); | 396 | extern void xfs_mod_sb(struct xfs_trans *, __int64_t); |
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 45ce15dc5b2b..4aff56395732 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c | |||
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void) | |||
309 | if (!xfs_mru_elem_zone) | 309 | if (!xfs_mru_elem_zone) |
310 | goto out; | 310 | goto out; |
311 | 311 | ||
312 | xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); | 312 | xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); |
313 | if (!xfs_mru_reap_wq) | 313 | if (!xfs_mru_reap_wq) |
314 | goto out_destroy_mru_elem_zone; | 314 | goto out_destroy_mru_elem_zone; |
315 | 315 | ||
@@ -408,7 +408,7 @@ xfs_mru_cache_flush( | |||
408 | spin_lock(&mru->lock); | 408 | spin_lock(&mru->lock); |
409 | if (mru->queued) { | 409 | if (mru->queued) { |
410 | spin_unlock(&mru->lock); | 410 | spin_unlock(&mru->lock); |
411 | cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); | 411 | cancel_delayed_work_sync(&mru->work); |
412 | spin_lock(&mru->lock); | 412 | spin_lock(&mru->lock); |
413 | } | 413 | } |
414 | 414 | ||
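The xfs_mru_cache.c hunks swap the long-deprecated create_singlethread_workqueue() / cancel_rearming_delayed_workqueue() pair for alloc_workqueue() and cancel_delayed_work_sync(). WQ_MEM_RECLAIM gives the queue a rescuer thread so it can make forward progress under memory pressure, max_active = 1 preserves the single-threaded ordering, and cancel_delayed_work_sync() waits out the handler even if it rearms itself. A minimal module-shaped sketch of the pattern (demo names, not the XFS code):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *reap_wq;
static struct delayed_work reap_work;

static void reap_fn(struct work_struct *work)
{
    /* periodic reaping would go here; the work rearms itself */
    queue_delayed_work(reap_wq, &reap_work, HZ);
}

static int __init demo_init(void)
{
    /* WQ_MEM_RECLAIM: rescuer thread guarantees progress in reclaim;
     * max_active = 1 keeps execution single-threaded */
    reap_wq = alloc_workqueue("demo_reap", WQ_MEM_RECLAIM, 1);
    if (!reap_wq)
        return -ENOMEM;
    INIT_DELAYED_WORK(&reap_work, reap_fn);
    queue_delayed_work(reap_wq, &reap_work, HZ);
    return 0;
}

static void __exit demo_exit(void)
{
    /* replaces cancel_rearming_delayed_workqueue(): cancels and waits,
     * coping with the handler requeueing itself */
    cancel_delayed_work_sync(&reap_work);
    destroy_workqueue(reap_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");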
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index e0e64b113bd6..a595f29567fe 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h | |||
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, | |||
346 | #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) | 346 | #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) |
347 | #define xfs_trans_apply_dquot_deltas(tp) | 347 | #define xfs_trans_apply_dquot_deltas(tp) |
348 | #define xfs_trans_unreserve_and_mod_dquots(tp) | 348 | #define xfs_trans_unreserve_and_mod_dquots(tp) |
349 | #define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) | 349 | static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, |
350 | #define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) | 350 | struct xfs_inode *ip, long nblks, long ninos, uint flags) |
351 | { | ||
352 | return 0; | ||
353 | } | ||
354 | static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, | ||
355 | struct xfs_mount *mp, struct xfs_dquot *udqp, | ||
356 | struct xfs_dquot *gdqp, long nblks, long ninos, uint flags) | ||
357 | { | ||
358 | return 0; | ||
359 | } | ||
351 | #define xfs_qm_vop_create_dqattach(tp, ip, u, g) | 360 | #define xfs_qm_vop_create_dqattach(tp, ip, u, g) |
352 | #define xfs_qm_vop_rename_dqattach(it) (0) | 361 | #define xfs_qm_vop_rename_dqattach(it) (0) |
353 | #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) | 362 | #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) |
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, | |||
357 | #define xfs_qm_dqdetach(ip) | 366 | #define xfs_qm_dqdetach(ip) |
358 | #define xfs_qm_dqrele(d) | 367 | #define xfs_qm_dqrele(d) |
359 | #define xfs_qm_statvfs(ip, s) | 368 | #define xfs_qm_statvfs(ip, s) |
360 | #define xfs_qm_sync(mp, fl) (0) | 369 | static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) |
370 | { | ||
371 | return 0; | ||
372 | } | ||
361 | #define xfs_qm_newmount(mp, a, b) (0) | 373 | #define xfs_qm_newmount(mp, a, b) (0) |
362 | #define xfs_qm_mount_quotas(mp) | 374 | #define xfs_qm_mount_quotas(mp) |
363 | #define xfs_qm_unmount(mp) | 375 | #define xfs_qm_unmount(mp) |
364 | #define xfs_qm_unmount_quotas(mp) (0) | 376 | #define xfs_qm_unmount_quotas(mp) |
365 | #endif /* CONFIG_XFS_QUOTA */ | 377 | #endif /* CONFIG_XFS_QUOTA */ |
366 | 378 | ||
367 | #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ | 379 | #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ |
@@ -370,7 +382,8 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, | |||
370 | xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ | 382 | xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ |
371 | f | XFS_QMOPT_RES_REGBLKS) | 383 | f | XFS_QMOPT_RES_REGBLKS) |
372 | 384 | ||
373 | extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); | 385 | extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, |
386 | xfs_dqid_t, uint, uint, char *); | ||
374 | extern int xfs_mount_reset_sbqflags(struct xfs_mount *); | 387 | extern int xfs_mount_reset_sbqflags(struct xfs_mount *); |
375 | 388 | ||
376 | #endif /* __KERNEL__ */ | 389 | #endif /* __KERNEL__ */ |
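Converting the !CONFIG_XFS_QUOTA stubs from object-like macros returning (0) to static inline functions keeps the argument types checked and the arguments evaluated in both configurations, so a quota-disabled build can no longer hide a miscall. A minimal illustration of the difference (hypothetical names):

struct inode;

/* macro stub: arguments are never evaluated or type-checked */
#define reserve_quota_macro(ip, nblks) (0)

/* inline stub: the signature is enforced even in the "disabled" build */
static inline int reserve_quota_inline(struct inode *ip, long nblks)
{
    return 0;
}

int main(void)
{
    /* reserve_quota_macro("oops", "wrong types") would still compile;
     * the same call through the inline version is rejected */
    int error = reserve_quota_inline((struct inode *)0, 16);

    return error ? 1 : 0;
}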
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h deleted file mode 100644 index 2dec79edb510..000000000000 --- a/fs/xfs/xfs_refcache.h +++ /dev/null | |||
@@ -1,52 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #ifndef __XFS_REFCACHE_H__ | ||
19 | #define __XFS_REFCACHE_H__ | ||
20 | |||
21 | #ifdef HAVE_REFCACHE | ||
22 | /* | ||
23 | * Maximum size (in inodes) for the NFS reference cache | ||
24 | */ | ||
25 | #define XFS_REFCACHE_SIZE_MAX 512 | ||
26 | |||
27 | struct xfs_inode; | ||
28 | struct xfs_mount; | ||
29 | |||
30 | extern void xfs_refcache_insert(struct xfs_inode *); | ||
31 | extern void xfs_refcache_purge_ip(struct xfs_inode *); | ||
32 | extern void xfs_refcache_purge_mp(struct xfs_mount *); | ||
33 | extern void xfs_refcache_purge_some(struct xfs_mount *); | ||
34 | extern void xfs_refcache_resize(int); | ||
35 | extern void xfs_refcache_destroy(void); | ||
36 | |||
37 | extern void xfs_refcache_iunlock(struct xfs_inode *, uint); | ||
38 | |||
39 | #else | ||
40 | |||
41 | #define xfs_refcache_insert(ip) do { } while (0) | ||
42 | #define xfs_refcache_purge_ip(ip) do { } while (0) | ||
43 | #define xfs_refcache_purge_mp(mp) do { } while (0) | ||
44 | #define xfs_refcache_purge_some(mp) do { } while (0) | ||
45 | #define xfs_refcache_resize(size) do { } while (0) | ||
46 | #define xfs_refcache_destroy() do { } while (0) | ||
47 | |||
48 | #define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags) | ||
49 | |||
50 | #endif | ||
51 | |||
52 | #endif /* __XFS_REFCACHE_H__ */ | ||
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 8fca957200df..77a59891734e 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c | |||
@@ -183,7 +183,7 @@ xfs_rename( | |||
183 | * tree quota mechanism would be circumvented. | 183 | * tree quota mechanism would be circumvented. |
184 | */ | 184 | */ |
185 | if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && | 185 | if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && |
186 | (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { | 186 | (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { |
187 | error = XFS_ERROR(EXDEV); | 187 | error = XFS_ERROR(EXDEV); |
188 | goto error_return; | 188 | goto error_return; |
189 | } | 189 | } |
@@ -211,7 +211,9 @@ xfs_rename( | |||
211 | goto error_return; | 211 | goto error_return; |
212 | if (error) | 212 | if (error) |
213 | goto abort_return; | 213 | goto abort_return; |
214 | xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 214 | |
215 | xfs_trans_ichgtime(tp, target_dp, | ||
216 | XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | ||
215 | 217 | ||
216 | if (new_parent && src_is_directory) { | 218 | if (new_parent && src_is_directory) { |
217 | error = xfs_bumplink(tp, target_dp); | 219 | error = xfs_bumplink(tp, target_dp); |
@@ -249,7 +251,9 @@ xfs_rename( | |||
249 | &first_block, &free_list, spaceres); | 251 | &first_block, &free_list, spaceres); |
250 | if (error) | 252 | if (error) |
251 | goto abort_return; | 253 | goto abort_return; |
252 | xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 254 | |
255 | xfs_trans_ichgtime(tp, target_dp, | ||
256 | XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | ||
253 | 257 | ||
254 | /* | 258 | /* |
255 | * Decrement the link count on the target since the target | 259 | * Decrement the link count on the target since the target |
@@ -292,7 +296,8 @@ xfs_rename( | |||
292 | * inode isn't really being changed, but old unix file systems did | 296 | * inode isn't really being changed, but old unix file systems did |
293 | * it and some incremental backup programs won't work without it. | 297 | * it and some incremental backup programs won't work without it. |
294 | */ | 298 | */ |
295 | xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); | 299 | xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); |
300 | xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); | ||
296 | 301 | ||
297 | /* | 302 | /* |
298 | * Adjust the link count on src_dp. This is necessary when | 303 | * Adjust the link count on src_dp. This is necessary when |
@@ -315,7 +320,7 @@ xfs_rename( | |||
315 | if (error) | 320 | if (error) |
316 | goto abort_return; | 321 | goto abort_return; |
317 | 322 | ||
318 | xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 323 | xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
319 | xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); | 324 | xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); |
320 | if (new_parent) | 325 | if (new_parent) |
321 | xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); | 326 | xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); |
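The rename path now stamps timestamps with xfs_trans_ichgtime(), which takes the transaction so the change happens in its scope, and where the old helper was implicit the new code follows up with an explicit xfs_trans_log_inode(). A rough user-space sketch of that shape (stub types, not the kernel structures):

#include <time.h>

struct inode { time_t mtime, ctime; unsigned logged_fields; };
struct trans { int id; };

#define ILOG_CORE    0x1
#define ICHGTIME_MOD 0x1
#define ICHGTIME_CHG 0x2

/* stand-in for xfs_trans_ichgtime(): the transaction parameter ties the
 * timestamp update to tp's scope so it can be logged and replayed */
static void trans_ichgtime(struct trans *tp, struct inode *ip, int flags)
{
    time_t now = time(NULL);

    if (flags & ICHGTIME_MOD)
        ip->mtime = now;
    if (flags & ICHGTIME_CHG)
        ip->ctime = now;
}

static void trans_log_inode(struct trans *tp, struct inode *ip, unsigned f)
{
    ip->logged_fields |= f;   /* mark the core for inclusion in the log */
}

int main(void)
{
    struct trans tp = { 1 };
    struct inode src = { 0 };

    trans_ichgtime(&tp, &src, ICHGTIME_CHG);
    trans_log_inode(&tp, &src, ILOG_CORE);   /* now explicit, per the hunk */
    return 0;
}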
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 891260fea11e..8f76fdff4f46 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include "xfs_trans_space.h" | 39 | #include "xfs_trans_space.h" |
40 | #include "xfs_utils.h" | 40 | #include "xfs_utils.h" |
41 | #include "xfs_trace.h" | 41 | #include "xfs_trace.h" |
42 | #include "xfs_buf.h" | ||
42 | 43 | ||
43 | 44 | ||
44 | /* | 45 | /* |
@@ -75,7 +76,7 @@ xfs_growfs_rt_alloc( | |||
75 | xfs_mount_t *mp, /* file system mount point */ | 76 | xfs_mount_t *mp, /* file system mount point */ |
76 | xfs_extlen_t oblocks, /* old count of blocks */ | 77 | xfs_extlen_t oblocks, /* old count of blocks */ |
77 | xfs_extlen_t nblocks, /* new count of blocks */ | 78 | xfs_extlen_t nblocks, /* new count of blocks */ |
78 | xfs_ino_t ino) /* inode number (bitmap/summary) */ | 79 | xfs_inode_t *ip) /* inode (bitmap/summary) */ |
79 | { | 80 | { |
80 | xfs_fileoff_t bno; /* block number in file */ | 81 | xfs_fileoff_t bno; /* block number in file */ |
81 | xfs_buf_t *bp; /* temporary buffer for zeroing */ | 82 | xfs_buf_t *bp; /* temporary buffer for zeroing */ |
@@ -85,7 +86,6 @@ xfs_growfs_rt_alloc( | |||
85 | xfs_fsblock_t firstblock; /* first block allocated in xaction */ | 86 | xfs_fsblock_t firstblock; /* first block allocated in xaction */ |
86 | xfs_bmap_free_t flist; /* list of freed blocks */ | 87 | xfs_bmap_free_t flist; /* list of freed blocks */ |
87 | xfs_fsblock_t fsbno; /* filesystem block for bno */ | 88 | xfs_fsblock_t fsbno; /* filesystem block for bno */ |
88 | xfs_inode_t *ip; /* pointer to incore inode */ | ||
89 | xfs_bmbt_irec_t map; /* block map output */ | 89 | xfs_bmbt_irec_t map; /* block map output */ |
90 | int nmap; /* number of block maps */ | 90 | int nmap; /* number of block maps */ |
91 | int resblks; /* space reservation */ | 91 | int resblks; /* space reservation */ |
@@ -111,9 +111,9 @@ xfs_growfs_rt_alloc( | |||
111 | /* | 111 | /* |
112 | * Lock the inode. | 112 | * Lock the inode. |
113 | */ | 113 | */ |
114 | if ((error = xfs_trans_iget(mp, tp, ino, 0, | 114 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
115 | XFS_ILOCK_EXCL, &ip))) | 115 | xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); |
116 | goto error_cancel; | 116 | |
117 | xfs_bmap_init(&flist, &firstblock); | 117 | xfs_bmap_init(&flist, &firstblock); |
118 | /* | 118 | /* |
119 | * Allocate blocks to the bitmap file. | 119 | * Allocate blocks to the bitmap file. |
@@ -154,9 +154,8 @@ xfs_growfs_rt_alloc( | |||
154 | /* | 154 | /* |
155 | * Lock the bitmap inode. | 155 | * Lock the bitmap inode. |
156 | */ | 156 | */ |
157 | if ((error = xfs_trans_iget(mp, tp, ino, 0, | 157 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
158 | XFS_ILOCK_EXCL, &ip))) | 158 | xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); |
159 | goto error_cancel; | ||
160 | /* | 159 | /* |
161 | * Get a buffer for the block. | 160 | * Get a buffer for the block. |
162 | */ | 161 | */ |
@@ -1853,7 +1852,6 @@ xfs_growfs_rt( | |||
1853 | xfs_rtblock_t bmbno; /* bitmap block number */ | 1852 | xfs_rtblock_t bmbno; /* bitmap block number */ |
1854 | xfs_buf_t *bp; /* temporary buffer */ | 1853 | xfs_buf_t *bp; /* temporary buffer */ |
1855 | int error; /* error return value */ | 1854 | int error; /* error return value */ |
1856 | xfs_inode_t *ip; /* bitmap inode, used as lock */ | ||
1857 | xfs_mount_t *nmp; /* new (fake) mount structure */ | 1855 | xfs_mount_t *nmp; /* new (fake) mount structure */ |
1858 | xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ | 1856 | xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ |
1859 | xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ | 1857 | xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ |
@@ -1883,13 +1881,13 @@ xfs_growfs_rt( | |||
1883 | /* | 1881 | /* |
1884 | * Read in the last block of the device, make sure it exists. | 1882 | * Read in the last block of the device, make sure it exists. |
1885 | */ | 1883 | */ |
1886 | error = xfs_read_buf(mp, mp->m_rtdev_targp, | 1884 | bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, |
1887 | XFS_FSB_TO_BB(mp, nrblocks - 1), | 1885 | XFS_FSB_TO_BB(mp, nrblocks - 1), |
1888 | XFS_FSB_TO_BB(mp, 1), 0, &bp); | 1886 | XFS_FSB_TO_B(mp, 1), 0); |
1889 | if (error) | 1887 | if (!bp) |
1890 | return error; | 1888 | return EIO; |
1891 | ASSERT(bp); | ||
1892 | xfs_buf_relse(bp); | 1889 | xfs_buf_relse(bp); |
1890 | |||
1893 | /* | 1891 | /* |
1894 | * Calculate new parameters. These are the final values to be reached. | 1892 | * Calculate new parameters. These are the final values to be reached. |
1895 | */ | 1893 | */ |
@@ -1917,11 +1915,11 @@ xfs_growfs_rt( | |||
1917 | /* | 1915 | /* |
1918 | * Allocate space to the bitmap and summary files, as necessary. | 1916 | * Allocate space to the bitmap and summary files, as necessary. |
1919 | */ | 1917 | */ |
1920 | if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, | 1918 | error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); |
1921 | mp->m_sb.sb_rbmino))) | 1919 | if (error) |
1922 | return error; | 1920 | return error; |
1923 | if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, | 1921 | error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); |
1924 | mp->m_sb.sb_rsumino))) | 1922 | if (error) |
1925 | return error; | 1923 | return error; |
1926 | /* | 1924 | /* |
1927 | * Allocate a new (fake) mount/sb. | 1925 | * Allocate a new (fake) mount/sb. |
@@ -1971,10 +1969,8 @@ xfs_growfs_rt( | |||
1971 | /* | 1969 | /* |
1972 | * Lock out other callers by grabbing the bitmap inode lock. | 1970 | * Lock out other callers by grabbing the bitmap inode lock. |
1973 | */ | 1971 | */ |
1974 | if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, | 1972 | xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); |
1975 | XFS_ILOCK_EXCL, &ip))) | 1973 | xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); |
1976 | goto error_cancel; | ||
1977 | ASSERT(ip == mp->m_rbmip); | ||
1978 | /* | 1974 | /* |
1979 | * Update the bitmap inode's size. | 1975 | * Update the bitmap inode's size. |
1980 | */ | 1976 | */ |
@@ -1985,10 +1981,8 @@ xfs_growfs_rt( | |||
1985 | /* | 1981 | /* |
1986 | * Get the summary inode into the transaction. | 1982 | * Get the summary inode into the transaction. |
1987 | */ | 1983 | */ |
1988 | if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, | 1984 | xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); |
1989 | XFS_ILOCK_EXCL, &ip))) | 1985 | xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); |
1990 | goto error_cancel; | ||
1991 | ASSERT(ip == mp->m_rsumip); | ||
1992 | /* | 1986 | /* |
1993 | * Update the summary inode's size. | 1987 | * Update the summary inode's size. |
1994 | */ | 1988 | */ |
@@ -2074,15 +2068,15 @@ xfs_rtallocate_extent( | |||
2074 | xfs_extlen_t prod, /* extent product factor */ | 2068 | xfs_extlen_t prod, /* extent product factor */ |
2075 | xfs_rtblock_t *rtblock) /* out: start block allocated */ | 2069 | xfs_rtblock_t *rtblock) /* out: start block allocated */ |
2076 | { | 2070 | { |
2071 | xfs_mount_t *mp = tp->t_mountp; | ||
2077 | int error; /* error value */ | 2072 | int error; /* error value */ |
2078 | xfs_inode_t *ip; /* inode for bitmap file */ | ||
2079 | xfs_mount_t *mp; /* file system mount structure */ | ||
2080 | xfs_rtblock_t r; /* result allocated block */ | 2073 | xfs_rtblock_t r; /* result allocated block */ |
2081 | xfs_fsblock_t sb; /* summary file block number */ | 2074 | xfs_fsblock_t sb; /* summary file block number */ |
2082 | xfs_buf_t *sumbp; /* summary file block buffer */ | 2075 | xfs_buf_t *sumbp; /* summary file block buffer */ |
2083 | 2076 | ||
2077 | ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); | ||
2084 | ASSERT(minlen > 0 && minlen <= maxlen); | 2078 | ASSERT(minlen > 0 && minlen <= maxlen); |
2085 | mp = tp->t_mountp; | 2079 | |
2086 | /* | 2080 | /* |
2087 | * If prod is set then figure out what to do to minlen and maxlen. | 2081 | * If prod is set then figure out what to do to minlen and maxlen. |
2088 | */ | 2082 | */ |
@@ -2098,12 +2092,7 @@ xfs_rtallocate_extent( | |||
2098 | return 0; | 2092 | return 0; |
2099 | } | 2093 | } |
2100 | } | 2094 | } |
2101 | /* | 2095 | |
2102 | * Lock out other callers by grabbing the bitmap inode lock. | ||
2103 | */ | ||
2104 | if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, | ||
2105 | XFS_ILOCK_EXCL, &ip))) | ||
2106 | return error; | ||
2107 | sumbp = NULL; | 2096 | sumbp = NULL; |
2108 | /* | 2097 | /* |
2109 | * Allocate by size, or near another block, or exactly at some block. | 2098 | * Allocate by size, or near another block, or exactly at some block. |
@@ -2122,11 +2111,12 @@ xfs_rtallocate_extent( | |||
2122 | len, &sumbp, &sb, prod, &r); | 2111 | len, &sumbp, &sb, prod, &r); |
2123 | break; | 2112 | break; |
2124 | default: | 2113 | default: |
2114 | error = EIO; | ||
2125 | ASSERT(0); | 2115 | ASSERT(0); |
2126 | } | 2116 | } |
2127 | if (error) { | 2117 | if (error) |
2128 | return error; | 2118 | return error; |
2129 | } | 2119 | |
2130 | /* | 2120 | /* |
2131 | * If it worked, update the superblock. | 2121 | * If it worked, update the superblock. |
2132 | */ | 2122 | */ |
@@ -2154,7 +2144,6 @@ xfs_rtfree_extent( | |||
2154 | xfs_extlen_t len) /* length of extent freed */ | 2144 | xfs_extlen_t len) /* length of extent freed */ |
2155 | { | 2145 | { |
2156 | int error; /* error value */ | 2146 | int error; /* error value */ |
2157 | xfs_inode_t *ip; /* bitmap file inode */ | ||
2158 | xfs_mount_t *mp; /* file system mount structure */ | 2147 | xfs_mount_t *mp; /* file system mount structure */ |
2159 | xfs_fsblock_t sb; /* summary file block number */ | 2148 | xfs_fsblock_t sb; /* summary file block number */ |
2160 | xfs_buf_t *sumbp; /* summary file block buffer */ | 2149 | xfs_buf_t *sumbp; /* summary file block buffer */ |
@@ -2163,9 +2152,9 @@ xfs_rtfree_extent( | |||
2163 | /* | 2152 | /* |
2164 | * Synchronize by locking the bitmap inode. | 2153 | * Synchronize by locking the bitmap inode. |
2165 | */ | 2154 | */ |
2166 | if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, | 2155 | xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); |
2167 | XFS_ILOCK_EXCL, &ip))) | 2156 | xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); |
2168 | return error; | 2157 | |
2169 | #if defined(__KERNEL__) && defined(DEBUG) | 2158 | #if defined(__KERNEL__) && defined(DEBUG) |
2170 | /* | 2159 | /* |
2171 | * Check to see that this whole range is currently allocated. | 2160 | * Check to see that this whole range is currently allocated. |
@@ -2198,10 +2187,10 @@ xfs_rtfree_extent( | |||
2198 | */ | 2187 | */ |
2199 | if (tp->t_frextents_delta + mp->m_sb.sb_frextents == | 2188 | if (tp->t_frextents_delta + mp->m_sb.sb_frextents == |
2200 | mp->m_sb.sb_rextents) { | 2189 | mp->m_sb.sb_rextents) { |
2201 | if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) | 2190 | if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) |
2202 | ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; | 2191 | mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; |
2203 | *(__uint64_t *)&ip->i_d.di_atime = 0; | 2192 | *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; |
2204 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 2193 | xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); |
2205 | } | 2194 | } |
2206 | return 0; | 2195 | return 0; |
2207 | } | 2196 | } |
@@ -2215,15 +2204,14 @@ xfs_rtmount_init( | |||
2215 | { | 2204 | { |
2216 | xfs_buf_t *bp; /* buffer for last block of subvolume */ | 2205 | xfs_buf_t *bp; /* buffer for last block of subvolume */ |
2217 | xfs_daddr_t d; /* address of last block of subvolume */ | 2206 | xfs_daddr_t d; /* address of last block of subvolume */ |
2218 | int error; /* error return value */ | ||
2219 | xfs_sb_t *sbp; /* filesystem superblock copy in mount */ | 2207 | xfs_sb_t *sbp; /* filesystem superblock copy in mount */ |
2220 | 2208 | ||
2221 | sbp = &mp->m_sb; | 2209 | sbp = &mp->m_sb; |
2222 | if (sbp->sb_rblocks == 0) | 2210 | if (sbp->sb_rblocks == 0) |
2223 | return 0; | 2211 | return 0; |
2224 | if (mp->m_rtdev_targp == NULL) { | 2212 | if (mp->m_rtdev_targp == NULL) { |
2225 | cmn_err(CE_WARN, | 2213 | xfs_warn(mp, |
2226 | "XFS: This filesystem has a realtime volume, use rtdev=device option"); | 2214 | "Filesystem has a realtime volume, use rtdev=device option"); |
2227 | return XFS_ERROR(ENODEV); | 2215 | return XFS_ERROR(ENODEV); |
2228 | } | 2216 | } |
2229 | mp->m_rsumlevels = sbp->sb_rextslog + 1; | 2217 | mp->m_rsumlevels = sbp->sb_rextslog + 1; |
@@ -2237,20 +2225,17 @@ xfs_rtmount_init( | |||
2237 | */ | 2225 | */ |
2238 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); | 2226 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); |
2239 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { | 2227 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { |
2240 | cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", | 2228 | xfs_warn(mp, "realtime mount -- %llu != %llu", |
2241 | (unsigned long long) XFS_BB_TO_FSB(mp, d), | 2229 | (unsigned long long) XFS_BB_TO_FSB(mp, d), |
2242 | (unsigned long long) mp->m_sb.sb_rblocks); | 2230 | (unsigned long long) mp->m_sb.sb_rblocks); |
2243 | return XFS_ERROR(EFBIG); | 2231 | return XFS_ERROR(EFBIG); |
2244 | } | 2232 | } |
2245 | error = xfs_read_buf(mp, mp->m_rtdev_targp, | 2233 | bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, |
2246 | d - XFS_FSB_TO_BB(mp, 1), | 2234 | d - XFS_FSB_TO_BB(mp, 1), |
2247 | XFS_FSB_TO_BB(mp, 1), 0, &bp); | 2235 | XFS_FSB_TO_B(mp, 1), 0); |
2248 | if (error) { | 2236 | if (!bp) { |
2249 | cmn_err(CE_WARN, | 2237 | xfs_warn(mp, "realtime device size check failed"); |
2250 | "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); | 2238 | return EIO; |
2251 | if (error == ENOSPC) | ||
2252 | return XFS_ERROR(EFBIG); | ||
2253 | return error; | ||
2254 | } | 2239 | } |
2255 | xfs_buf_relse(bp); | 2240 | xfs_buf_relse(bp); |
2256 | return 0; | 2241 | return 0; |
@@ -2309,20 +2294,16 @@ xfs_rtpick_extent( | |||
2309 | xfs_rtblock_t *pick) /* result rt extent */ | 2294 | xfs_rtblock_t *pick) /* result rt extent */ |
2310 | { | 2295 | { |
2311 | xfs_rtblock_t b; /* result block */ | 2296 | xfs_rtblock_t b; /* result block */ |
2312 | int error; /* error return value */ | ||
2313 | xfs_inode_t *ip; /* bitmap incore inode */ | ||
2314 | int log2; /* log of sequence number */ | 2297 | int log2; /* log of sequence number */ |
2315 | __uint64_t resid; /* residual after log removed */ | 2298 | __uint64_t resid; /* residual after log removed */ |
2316 | __uint64_t seq; /* sequence number of file creation */ | 2299 | __uint64_t seq; /* sequence number of file creation */ |
2317 | __uint64_t *seqp; /* pointer to seqno in inode */ | 2300 | __uint64_t *seqp; /* pointer to seqno in inode */ |
2318 | 2301 | ||
2319 | if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, | 2302 | ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); |
2320 | XFS_ILOCK_EXCL, &ip))) | 2303 | |
2321 | return error; | 2304 | seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; |
2322 | ASSERT(ip == mp->m_rbmip); | 2305 | if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { |
2323 | seqp = (__uint64_t *)&ip->i_d.di_atime; | 2306 | mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; |
2324 | if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { | ||
2325 | ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; | ||
2326 | *seqp = 0; | 2307 | *seqp = 0; |
2327 | } | 2308 | } |
2328 | seq = *seqp; | 2309 | seq = *seqp; |
@@ -2338,7 +2319,7 @@ xfs_rtpick_extent( | |||
2338 | b = mp->m_sb.sb_rextents - len; | 2319 | b = mp->m_sb.sb_rextents - len; |
2339 | } | 2320 | } |
2340 | *seqp = seq + 1; | 2321 | *seqp = seq + 1; |
2341 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 2322 | xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); |
2342 | *pick = b; | 2323 | *pick = b; |
2343 | return 0; | 2324 | return 0; |
2344 | } | 2325 | } |
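A recurring change through xfs_rtalloc.c: instead of each routine re-acquiring the bitmap inode with xfs_trans_iget(), the caller takes the lock once (xfs_ilock() plus xfs_trans_ijoin_ref()) and leaf routines such as xfs_rtallocate_extent() and xfs_rtpick_extent() merely assert it with xfs_isilocked(). A generic sketch of that caller-holds-lock contract (pthread and assert as stand-ins):

#include <assert.h>
#include <pthread.h>

static pthread_mutex_t bitmap_lock = PTHREAD_MUTEX_INITIALIZER;
static int bitmap_locked;

/* leaf routine: it no longer takes the lock itself, it documents and
 * checks the caller's obligation instead */
static void pick_extent(void)
{
    assert(bitmap_locked);    /* ASSERT(xfs_isilocked(...)) equivalent */
    /* ... scan the bitmap ... */
}

int main(void)
{
    pthread_mutex_lock(&bitmap_lock);   /* xfs_ilock(mp->m_rbmip, ...) */
    bitmap_locked = 1;
    pick_extent();
    bitmap_locked = 0;
    pthread_mutex_unlock(&bitmap_lock);
    return 0;
}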
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index ff614c29b441..09e1f4f35e97 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h | |||
@@ -154,7 +154,7 @@ xfs_rtmount_init( | |||
154 | if (mp->m_sb.sb_rblocks == 0) | 154 | if (mp->m_sb.sb_rblocks == 0) |
155 | return 0; | 155 | return 0; |
156 | 156 | ||
157 | cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); | 157 | xfs_warn(mp, "Not built with CONFIG_XFS_RT"); |
158 | return ENOSYS; | 158 | return ENOSYS; |
159 | } | 159 | } |
160 | # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) | 160 | # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) |
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 56861d5daaef..d6d6fdfe9422 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c | |||
@@ -49,9 +49,9 @@ xfs_do_force_shutdown( | |||
49 | logerror = flags & SHUTDOWN_LOG_IO_ERROR; | 49 | logerror = flags & SHUTDOWN_LOG_IO_ERROR; |
50 | 50 | ||
51 | if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { | 51 | if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { |
52 | cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " | 52 | xfs_notice(mp, |
53 | "line %d of file %s. Return address = 0x%p", | 53 | "%s(0x%x) called from line %d of file %s. Return address = 0x%p", |
54 | mp->m_fsname, flags, lnnum, fname, __return_address); | 54 | __func__, flags, lnnum, fname, __return_address); |
55 | } | 55 | } |
56 | /* | 56 | /* |
57 | * No need to duplicate efforts. | 57 | * No need to duplicate efforts. |
@@ -69,30 +69,25 @@ xfs_do_force_shutdown( | |||
69 | return; | 69 | return; |
70 | 70 | ||
71 | if (flags & SHUTDOWN_CORRUPT_INCORE) { | 71 | if (flags & SHUTDOWN_CORRUPT_INCORE) { |
72 | xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, | 72 | xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, |
73 | "Corruption of in-memory data detected. Shutting down filesystem: %s", | 73 | "Corruption of in-memory data detected. Shutting down filesystem"); |
74 | mp->m_fsname); | 74 | if (XFS_ERRLEVEL_HIGH <= xfs_error_level) |
75 | if (XFS_ERRLEVEL_HIGH <= xfs_error_level) { | ||
76 | xfs_stack_trace(); | 75 | xfs_stack_trace(); |
77 | } | ||
78 | } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { | 76 | } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { |
79 | if (logerror) { | 77 | if (logerror) { |
80 | xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, | 78 | xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, |
81 | "Log I/O Error Detected. Shutting down filesystem: %s", | 79 | "Log I/O Error Detected. Shutting down filesystem"); |
82 | mp->m_fsname); | ||
83 | } else if (flags & SHUTDOWN_DEVICE_REQ) { | 80 | } else if (flags & SHUTDOWN_DEVICE_REQ) { |
84 | xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, | 81 | xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, |
85 | "All device paths lost. Shutting down filesystem: %s", | 82 | "All device paths lost. Shutting down filesystem"); |
86 | mp->m_fsname); | ||
87 | } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { | 83 | } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { |
88 | xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, | 84 | xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, |
89 | "I/O Error Detected. Shutting down filesystem: %s", | 85 | "I/O Error Detected. Shutting down filesystem"); |
90 | mp->m_fsname); | ||
91 | } | 86 | } |
92 | } | 87 | } |
93 | if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { | 88 | if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { |
94 | cmn_err(CE_ALERT, "Please umount the filesystem, " | 89 | xfs_alert(mp, |
95 | "and rectify the problem(s)"); | 90 | "Please umount the filesystem and rectify the problem(s)"); |
96 | } | 91 | } |
97 | } | 92 | } |
98 | 93 | ||
@@ -106,10 +101,9 @@ xfs_ioerror_alert( | |||
106 | xfs_buf_t *bp, | 101 | xfs_buf_t *bp, |
107 | xfs_daddr_t blkno) | 102 | xfs_daddr_t blkno) |
108 | { | 103 | { |
109 | cmn_err(CE_ALERT, | 104 | xfs_alert(mp, |
110 | "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" | 105 | "I/O error occurred: meta-data dev %s block 0x%llx" |
111 | " (\"%s\") error %d buf count %zd", | 106 | " (\"%s\") error %d buf count %zd", |
112 | (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname, | ||
113 | XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), | 107 | XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), |
114 | (__uint64_t)blkno, func, | 108 | (__uint64_t)blkno, func, |
115 | XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); | 109 | XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); |
@@ -173,17 +167,9 @@ xfs_extlen_t | |||
173 | xfs_get_extsz_hint( | 167 | xfs_get_extsz_hint( |
174 | struct xfs_inode *ip) | 168 | struct xfs_inode *ip) |
175 | { | 169 | { |
176 | xfs_extlen_t extsz; | 170 | if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) |
177 | 171 | return ip->i_d.di_extsize; | |
178 | if (unlikely(XFS_IS_REALTIME_INODE(ip))) { | 172 | if (XFS_IS_REALTIME_INODE(ip)) |
179 | extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) | 173 | return ip->i_mount->m_sb.sb_rextsize; |
180 | ? ip->i_d.di_extsize | 174 | return 0; |
181 | : ip->i_mount->m_sb.sb_rextsize; | ||
182 | ASSERT(extsz); | ||
183 | } else { | ||
184 | extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) | ||
185 | ? ip->i_d.di_extsize : 0; | ||
186 | } | ||
187 | |||
188 | return extsz; | ||
189 | } | 175 | } |
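The flattened xfs_get_extsz_hint() reads as a priority list: an explicit per-inode extent-size hint wins, a realtime inode falls back to the filesystem's realtime extent size, and anything else gets 0 (no hint). Note the behavior change folded in: a realtime inode with XFS_DIFLAG_EXTSIZE set but a zero di_extsize now falls through to sb_rextsize instead of tripping the old ASSERT. A standalone restatement of the decision table:

#include <stdio.h>

#define DIFLAG_EXTSIZE  0x1
#define DIFLAG_REALTIME 0x2

static unsigned get_extsz_hint(unsigned flags, unsigned di_extsize,
                               unsigned sb_rextsize)
{
    if ((flags & DIFLAG_EXTSIZE) && di_extsize)
        return di_extsize;            /* explicit hint wins */
    if (flags & DIFLAG_REALTIME)
        return sb_rextsize;           /* realtime default */
    return 0;                         /* no hint */
}

int main(void)
{
    printf("%u\n", get_extsz_hint(DIFLAG_EXTSIZE, 16, 4));   /* 16 */
    printf("%u\n", get_extsz_hint(DIFLAG_REALTIME, 0, 4));   /* 4 */
    printf("%u\n", get_extsz_hint(0, 0, 4));                 /* 0 */
    return 0;
}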
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 1b017c657494..1eb2ba586814 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h | |||
@@ -80,10 +80,12 @@ struct xfs_mount; | |||
80 | #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 | 80 | #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 |
81 | #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ | 81 | #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ |
82 | #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ | 82 | #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ |
83 | #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ | ||
83 | 84 | ||
84 | #define XFS_SB_VERSION2_OKREALFBITS \ | 85 | #define XFS_SB_VERSION2_OKREALFBITS \ |
85 | (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ | 86 | (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ |
86 | XFS_SB_VERSION2_ATTR2BIT) | 87 | XFS_SB_VERSION2_ATTR2BIT | \ |
88 | XFS_SB_VERSION2_PROJID32BIT) | ||
87 | #define XFS_SB_VERSION2_OKSASHFBITS \ | 89 | #define XFS_SB_VERSION2_OKSASHFBITS \ |
88 | (0) | 90 | (0) |
89 | #define XFS_SB_VERSION2_OKREALBITS \ | 91 | #define XFS_SB_VERSION2_OKREALBITS \ |
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) | |||
495 | sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; | 497 | sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; |
496 | } | 498 | } |
497 | 499 | ||
500 | static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) | ||
501 | { | ||
502 | return xfs_sb_version_hasmorebits(sbp) && | ||
503 | (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); | ||
504 | } | ||
505 | |||
498 | /* | 506 | /* |
499 | * end of superblock version macros | 507 | * end of superblock version macros |
500 | */ | 508 | */ |
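The new xfs_sb_version_hasprojid32bit() helper is a two-level feature test: a bit in sb_features2 only counts if the MOREBITS flag in the primary version word says the features2 field is valid at all. A minimal userspace sketch of the same gating pattern (constants and field names here are illustrative stand-ins, not the on-disk XFS values):

    #include <stdint.h>
    #include <stdbool.h>

    #define SB_VERSION_MOREBITS  0x8000u   /* "features2 is valid" flag */
    #define SB_FEAT2_PROJID32    0x0080u   /* 32-bit project IDs */

    struct sb {
        uint16_t versionnum;   /* primary version/feature word */
        uint32_t features2;    /* extended feature bits */
    };

    static bool sb_has_projid32(const struct sb *sbp)
    {
        /* The features2 bit is meaningful only when MOREBITS is set. */
        return (sbp->versionnum & SB_VERSION_MOREBITS) &&
               (sbp->features2 & SB_FEAT2_PROJID32);
    }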
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 1c47edaea0d2..c83f63b33aae 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c | |||
@@ -608,10 +608,8 @@ STATIC void | |||
608 | xfs_trans_free( | 608 | xfs_trans_free( |
609 | struct xfs_trans *tp) | 609 | struct xfs_trans *tp) |
610 | { | 610 | { |
611 | struct xfs_busy_extent *busyp, *n; | 611 | xfs_alloc_busy_sort(&tp->t_busy); |
612 | 612 | xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); | |
613 | list_for_each_entry_safe(busyp, n, &tp->t_busy, list) | ||
614 | xfs_alloc_busy_clear(tp->t_mountp, busyp); | ||
615 | 613 | ||
616 | atomic_dec(&tp->t_mountp->m_active_trans); | 614 | atomic_dec(&tp->t_mountp->m_active_trans); |
617 | xfs_trans_free_dqinfo(tp); | 615 | xfs_trans_free_dqinfo(tp); |
@@ -696,7 +694,7 @@ xfs_trans_reserve( | |||
696 | * fail if the count would go below zero. | 694 | * fail if the count would go below zero. |
697 | */ | 695 | */ |
698 | if (blocks > 0) { | 696 | if (blocks > 0) { |
699 | error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, | 697 | error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, |
700 | -((int64_t)blocks), rsvd); | 698 | -((int64_t)blocks), rsvd); |
701 | if (error != 0) { | 699 | if (error != 0) { |
702 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); | 700 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); |
@@ -767,7 +765,7 @@ undo_log: | |||
767 | 765 | ||
768 | undo_blocks: | 766 | undo_blocks: |
769 | if (blocks > 0) { | 767 | if (blocks > 0) { |
770 | (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, | 768 | xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, |
771 | (int64_t)blocks, rsvd); | 769 | (int64_t)blocks, rsvd); |
772 | tp->t_blk_res = 0; | 770 | tp->t_blk_res = 0; |
773 | } | 771 | } |
@@ -1009,7 +1007,7 @@ void | |||
1009 | xfs_trans_unreserve_and_mod_sb( | 1007 | xfs_trans_unreserve_and_mod_sb( |
1010 | xfs_trans_t *tp) | 1008 | xfs_trans_t *tp) |
1011 | { | 1009 | { |
1012 | xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ | 1010 | xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ |
1013 | xfs_mod_sb_t *msbp; | 1011 | xfs_mod_sb_t *msbp; |
1014 | xfs_mount_t *mp = tp->t_mountp; | 1012 | xfs_mount_t *mp = tp->t_mountp; |
1015 | /* REFERENCED */ | 1013 | /* REFERENCED */ |
@@ -1017,55 +1015,61 @@ xfs_trans_unreserve_and_mod_sb( | |||
1017 | int rsvd; | 1015 | int rsvd; |
1018 | int64_t blkdelta = 0; | 1016 | int64_t blkdelta = 0; |
1019 | int64_t rtxdelta = 0; | 1017 | int64_t rtxdelta = 0; |
1018 | int64_t idelta = 0; | ||
1019 | int64_t ifreedelta = 0; | ||
1020 | 1020 | ||
1021 | msbp = msb; | 1021 | msbp = msb; |
1022 | rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; | 1022 | rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; |
1023 | 1023 | ||
1024 | /* calculate free blocks delta */ | 1024 | /* calculate deltas */ |
1025 | if (tp->t_blk_res > 0) | 1025 | if (tp->t_blk_res > 0) |
1026 | blkdelta = tp->t_blk_res; | 1026 | blkdelta = tp->t_blk_res; |
1027 | |||
1028 | if ((tp->t_fdblocks_delta != 0) && | 1027 | if ((tp->t_fdblocks_delta != 0) && |
1029 | (xfs_sb_version_haslazysbcount(&mp->m_sb) || | 1028 | (xfs_sb_version_haslazysbcount(&mp->m_sb) || |
1030 | (tp->t_flags & XFS_TRANS_SB_DIRTY))) | 1029 | (tp->t_flags & XFS_TRANS_SB_DIRTY))) |
1031 | blkdelta += tp->t_fdblocks_delta; | 1030 | blkdelta += tp->t_fdblocks_delta; |
1032 | 1031 | ||
1033 | if (blkdelta != 0) { | ||
1034 | msbp->msb_field = XFS_SBS_FDBLOCKS; | ||
1035 | msbp->msb_delta = blkdelta; | ||
1036 | msbp++; | ||
1037 | } | ||
1038 | |||
1039 | /* calculate free realtime extents delta */ | ||
1040 | if (tp->t_rtx_res > 0) | 1032 | if (tp->t_rtx_res > 0) |
1041 | rtxdelta = tp->t_rtx_res; | 1033 | rtxdelta = tp->t_rtx_res; |
1042 | |||
1043 | if ((tp->t_frextents_delta != 0) && | 1034 | if ((tp->t_frextents_delta != 0) && |
1044 | (tp->t_flags & XFS_TRANS_SB_DIRTY)) | 1035 | (tp->t_flags & XFS_TRANS_SB_DIRTY)) |
1045 | rtxdelta += tp->t_frextents_delta; | 1036 | rtxdelta += tp->t_frextents_delta; |
1046 | 1037 | ||
1038 | if (xfs_sb_version_haslazysbcount(&mp->m_sb) || | ||
1039 | (tp->t_flags & XFS_TRANS_SB_DIRTY)) { | ||
1040 | idelta = tp->t_icount_delta; | ||
1041 | ifreedelta = tp->t_ifree_delta; | ||
1042 | } | ||
1043 | |||
1044 | /* apply the per-cpu counters */ | ||
1045 | if (blkdelta) { | ||
1046 | error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, | ||
1047 | blkdelta, rsvd); | ||
1048 | if (error) | ||
1049 | goto out; | ||
1050 | } | ||
1051 | |||
1052 | if (idelta) { | ||
1053 | error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, | ||
1054 | idelta, rsvd); | ||
1055 | if (error) | ||
1056 | goto out_undo_fdblocks; | ||
1057 | } | ||
1058 | |||
1059 | if (ifreedelta) { | ||
1060 | error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, | ||
1061 | ifreedelta, rsvd); | ||
1062 | if (error) | ||
1063 | goto out_undo_icount; | ||
1064 | } | ||
1065 | |||
1066 | /* apply remaining deltas */ | ||
1047 | if (rtxdelta != 0) { | 1067 | if (rtxdelta != 0) { |
1048 | msbp->msb_field = XFS_SBS_FREXTENTS; | 1068 | msbp->msb_field = XFS_SBS_FREXTENTS; |
1049 | msbp->msb_delta = rtxdelta; | 1069 | msbp->msb_delta = rtxdelta; |
1050 | msbp++; | 1070 | msbp++; |
1051 | } | 1071 | } |
1052 | 1072 | ||
1053 | /* apply remaining deltas */ | ||
1054 | |||
1055 | if (xfs_sb_version_haslazysbcount(&mp->m_sb) || | ||
1056 | (tp->t_flags & XFS_TRANS_SB_DIRTY)) { | ||
1057 | if (tp->t_icount_delta != 0) { | ||
1058 | msbp->msb_field = XFS_SBS_ICOUNT; | ||
1059 | msbp->msb_delta = tp->t_icount_delta; | ||
1060 | msbp++; | ||
1061 | } | ||
1062 | if (tp->t_ifree_delta != 0) { | ||
1063 | msbp->msb_field = XFS_SBS_IFREE; | ||
1064 | msbp->msb_delta = tp->t_ifree_delta; | ||
1065 | msbp++; | ||
1066 | } | ||
1067 | } | ||
1068 | |||
1069 | if (tp->t_flags & XFS_TRANS_SB_DIRTY) { | 1073 | if (tp->t_flags & XFS_TRANS_SB_DIRTY) { |
1070 | if (tp->t_dblocks_delta != 0) { | 1074 | if (tp->t_dblocks_delta != 0) { |
1071 | msbp->msb_field = XFS_SBS_DBLOCKS; | 1075 | msbp->msb_field = XFS_SBS_DBLOCKS; |
@@ -1115,8 +1119,24 @@ xfs_trans_unreserve_and_mod_sb( | |||
1115 | if (msbp > msb) { | 1119 | if (msbp > msb) { |
1116 | error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, | 1120 | error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, |
1117 | (uint)(msbp - msb), rsvd); | 1121 | (uint)(msbp - msb), rsvd); |
1118 | ASSERT(error == 0); | 1122 | if (error) |
1123 | goto out_undo_ifreecount; | ||
1119 | } | 1124 | } |
1125 | |||
1126 | return; | ||
1127 | |||
1128 | out_undo_ifreecount: | ||
1129 | if (ifreedelta) | ||
1130 | xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); | ||
1131 | out_undo_icount: | ||
1132 | if (idelta) | ||
1133 | xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); | ||
1134 | out_undo_fdblocks: | ||
1135 | if (blkdelta) | ||
1136 | xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); | ||
1137 | out: | ||
1138 | ASSERT(error == 0); | ||
1139 | return; | ||
1120 | } | 1140 | } |
1121 | 1141 | ||
1122 | /* | 1142 | /* |
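The reworked xfs_trans_unreserve_and_mod_sb() error path above uses the kernel's stacked-goto unwind idiom: counters are applied in order, and a failure part-way through backs out exactly the steps that already succeeded, in reverse. A standalone sketch of the idiom, with hypothetical counters in place of the XFS per-cpu superblock fields:

    /* Apply a delta to a counter, refusing to let it go negative. */
    static int apply(long *counter, long delta)
    {
        if (*counter + delta < 0)
            return -1;
        *counter += delta;
        return 0;
    }

    /* All-or-nothing update: each failure label undoes only the
     * steps that already succeeded, in reverse order. */
    int apply_deltas(long *blocks, long *icount, long *ifree,
                     long blkdelta, long idelta, long ifreedelta)
    {
        if (apply(blocks, blkdelta))
            goto out;
        if (apply(icount, idelta))
            goto out_undo_blocks;
        if (apply(ifree, ifreedelta))
            goto out_undo_icount;
        return 0;

    out_undo_icount:
        apply(icount, -idelta);   /* cannot fail: removing what we added */
    out_undo_blocks:
        apply(blocks, -blkdelta);
    out:
        return -1;
    }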
@@ -1328,7 +1348,7 @@ xfs_trans_fill_vecs( | |||
1328 | * they could be immediately flushed and we'd have to race with the flusher | 1348 | * they could be immediately flushed and we'd have to race with the flusher |
1329 | * trying to pull the item from the AIL as we add it. | 1349 | * trying to pull the item from the AIL as we add it. |
1330 | */ | 1350 | */ |
1331 | void | 1351 | static void |
1332 | xfs_trans_item_committed( | 1352 | xfs_trans_item_committed( |
1333 | struct xfs_log_item *lip, | 1353 | struct xfs_log_item *lip, |
1334 | xfs_lsn_t commit_lsn, | 1354 | xfs_lsn_t commit_lsn, |
@@ -1341,7 +1361,7 @@ xfs_trans_item_committed( | |||
1341 | lip->li_flags |= XFS_LI_ABORTED; | 1361 | lip->li_flags |= XFS_LI_ABORTED; |
1342 | item_lsn = IOP_COMMITTED(lip, commit_lsn); | 1362 | item_lsn = IOP_COMMITTED(lip, commit_lsn); |
1343 | 1363 | ||
1344 | /* If the committed routine returns -1, item has been freed. */ | 1364 | /* item_lsn of -1 means the item needs no further processing */ |
1345 | if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) | 1365 | if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) |
1346 | return; | 1366 | return; |
1347 | 1367 | ||
@@ -1389,15 +1409,12 @@ xfs_trans_item_committed( | |||
1389 | */ | 1409 | */ |
1390 | STATIC void | 1410 | STATIC void |
1391 | xfs_trans_committed( | 1411 | xfs_trans_committed( |
1392 | struct xfs_trans *tp, | 1412 | void *arg, |
1393 | int abortflag) | 1413 | int abortflag) |
1394 | { | 1414 | { |
1415 | struct xfs_trans *tp = arg; | ||
1395 | struct xfs_log_item_desc *lidp, *next; | 1416 | struct xfs_log_item_desc *lidp, *next; |
1396 | 1417 | ||
1397 | /* Call the transaction's completion callback if there is one. */ | ||
1398 | if (tp->t_callback != NULL) | ||
1399 | tp->t_callback(tp, tp->t_callarg); | ||
1400 | |||
1401 | list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { | 1418 | list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { |
1402 | xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); | 1419 | xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); |
1403 | xfs_trans_free_item_desc(lidp); | 1420 | xfs_trans_free_item_desc(lidp); |
@@ -1406,21 +1423,120 @@ xfs_trans_committed( | |||
1406 | xfs_trans_free(tp); | 1423 | xfs_trans_free(tp); |
1407 | } | 1424 | } |
1408 | 1425 | ||
1426 | static inline void | ||
1427 | xfs_log_item_batch_insert( | ||
1428 | struct xfs_ail *ailp, | ||
1429 | struct xfs_log_item **log_items, | ||
1430 | int nr_items, | ||
1431 | xfs_lsn_t commit_lsn) | ||
1432 | { | ||
1433 | int i; | ||
1434 | |||
1435 | spin_lock(&ailp->xa_lock); | ||
1436 | /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ | ||
1437 | xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn); | ||
1438 | |||
1439 | for (i = 0; i < nr_items; i++) | ||
1440 | IOP_UNPIN(log_items[i], 0); | ||
1441 | } | ||
1442 | |||
1443 | /* | ||
1444 | * Bulk operation version of xfs_trans_committed that takes a log vector of | ||
1445 | * items to insert into the AIL. This uses bulk AIL insertion techniques to | ||
1446 | * minimise lock traffic. | ||
1447 | * | ||
1448 | * If we are called with the aborted flag set, it is because a log write during | ||
1449 | * a CIL checkpoint commit has failed. In this case, all the items in the | ||
1450 | * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which | ||
1451 | * means that checkpoint commit abort handling is treated exactly the same | ||
1452 | * as an iclog write error even though we haven't started any IO yet. Hence in | ||
1453 | * this case all we need to do is IOP_COMMITTED processing, followed by an | ||
1454 | * IOP_UNPIN(aborted) call. | ||
1455 | */ | ||
1456 | void | ||
1457 | xfs_trans_committed_bulk( | ||
1458 | struct xfs_ail *ailp, | ||
1459 | struct xfs_log_vec *log_vector, | ||
1460 | xfs_lsn_t commit_lsn, | ||
1461 | int aborted) | ||
1462 | { | ||
1463 | #define LOG_ITEM_BATCH_SIZE 32 | ||
1464 | struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; | ||
1465 | struct xfs_log_vec *lv; | ||
1466 | int i = 0; | ||
1467 | |||
1468 | /* unpin all the log items */ | ||
1469 | for (lv = log_vector; lv; lv = lv->lv_next ) { | ||
1470 | struct xfs_log_item *lip = lv->lv_item; | ||
1471 | xfs_lsn_t item_lsn; | ||
1472 | |||
1473 | if (aborted) | ||
1474 | lip->li_flags |= XFS_LI_ABORTED; | ||
1475 | item_lsn = IOP_COMMITTED(lip, commit_lsn); | ||
1476 | |||
1477 | /* item_lsn of -1 means the item needs no further processing */ | ||
1478 | if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) | ||
1479 | continue; | ||
1480 | |||
1481 | /* | ||
1482 | * if we are aborting the operation, no point in inserting the | ||
1483 | * object into the AIL as we are in a shutdown situation. | ||
1484 | */ | ||
1485 | if (aborted) { | ||
1486 | ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); | ||
1487 | IOP_UNPIN(lip, 1); | ||
1488 | continue; | ||
1489 | } | ||
1490 | |||
1491 | if (item_lsn != commit_lsn) { | ||
1492 | |||
1493 | /* | ||
1494 | * Not a bulk update option due to unusual item_lsn. | ||
1495 | * Push into AIL immediately, rechecking the lsn once | ||
1496 | * we have the ail lock. Then unpin the item. | ||
1497 | */ | ||
1498 | spin_lock(&ailp->xa_lock); | ||
1499 | if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) | ||
1500 | xfs_trans_ail_update(ailp, lip, item_lsn); | ||
1501 | else | ||
1502 | spin_unlock(&ailp->xa_lock); | ||
1503 | IOP_UNPIN(lip, 0); | ||
1504 | continue; | ||
1505 | } | ||
1506 | |||
1507 | /* Item is a candidate for bulk AIL insert. */ | ||
1508 | log_items[i++] = lv->lv_item; | ||
1509 | if (i >= LOG_ITEM_BATCH_SIZE) { | ||
1510 | xfs_log_item_batch_insert(ailp, log_items, | ||
1511 | LOG_ITEM_BATCH_SIZE, commit_lsn); | ||
1512 | i = 0; | ||
1513 | } | ||
1514 | } | ||
1515 | |||
1516 | /* make sure we insert the remainder! */ | ||
1517 | if (i) | ||
1518 | xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn); | ||
1519 | } | ||
1520 | |||
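The LOG_ITEM_BATCH_SIZE loop above amortises the AIL lock: instead of one lock round trip per log item, the lock is taken once per batch of 32, with a final flush for any remainder (the trailing "if (i)"). The shape of the pattern reduced to a userspace sketch, with a pthread mutex standing in for the spinlock:

    #include <pthread.h>

    #define BATCH 32

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static long total;

    /* Fold n values into a shared total, taking the lock once per
     * batch of up to BATCH values instead of once per value. */
    void add_bulk(const long *vals, int n)
    {
        long sum = 0;
        int i, in_batch = 0;

        for (i = 0; i < n; i++) {
            sum += vals[i];
            if (++in_batch == BATCH) {
                pthread_mutex_lock(&lock);
                total += sum;
                pthread_mutex_unlock(&lock);
                sum = 0;
                in_batch = 0;
            }
        }
        if (in_batch) {          /* flush the remainder */
            pthread_mutex_lock(&lock);
            total += sum;
            pthread_mutex_unlock(&lock);
        }
    }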
1409 | /* | 1521 | /* |
1410 | * Called from the trans_commit code when we notice that | 1522 | * Called from the trans_commit code when we notice that the filesystem is in |
1411 | * the filesystem is in the middle of a forced shutdown. | 1523 | * the middle of a forced shutdown. |
1524 | * | ||
1525 | * When we are called here, we have already pinned all the items in the | ||
1526 | * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called | ||
1527 | * so we can simply walk the items in the transaction, unpin them with an abort | ||
1528 | * flag and then free the items. Note that unpinning the items can result in | ||
1529 | * them being freed immediately, so we need to use a safe list traversal method | ||
1530 | * here. | ||
1412 | */ | 1531 | */ |
1413 | STATIC void | 1532 | STATIC void |
1414 | xfs_trans_uncommit( | 1533 | xfs_trans_uncommit( |
1415 | struct xfs_trans *tp, | 1534 | struct xfs_trans *tp, |
1416 | uint flags) | 1535 | uint flags) |
1417 | { | 1536 | { |
1418 | struct xfs_log_item_desc *lidp; | 1537 | struct xfs_log_item_desc *lidp, *n; |
1419 | 1538 | ||
1420 | list_for_each_entry(lidp, &tp->t_items, lid_trans) { | 1539 | list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) { |
1421 | /* | ||
1422 | * Unpin all but those that aren't dirty. | ||
1423 | */ | ||
1424 | if (lidp->lid_flags & XFS_LID_DIRTY) | 1540 | if (lidp->lid_flags & XFS_LID_DIRTY) |
1425 | IOP_UNPIN(lidp->lid_item, 1); | 1541 | IOP_UNPIN(lidp->lid_item, 1); |
1426 | } | 1542 | } |
@@ -1525,7 +1641,7 @@ xfs_trans_commit_iclog( | |||
1525 | * running in simulation mode (the log is explicitly turned | 1641 | * running in simulation mode (the log is explicitly turned |
1526 | * off). | 1642 | * off). |
1527 | */ | 1643 | */ |
1528 | tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; | 1644 | tp->t_logcb.cb_func = xfs_trans_committed; |
1529 | tp->t_logcb.cb_arg = tp; | 1645 | tp->t_logcb.cb_arg = tp; |
1530 | 1646 | ||
1531 | /* | 1647 | /* |
@@ -1637,7 +1753,6 @@ xfs_trans_commit_cil( | |||
1637 | int flags) | 1753 | int flags) |
1638 | { | 1754 | { |
1639 | struct xfs_log_vec *log_vector; | 1755 | struct xfs_log_vec *log_vector; |
1640 | int error; | ||
1641 | 1756 | ||
1642 | /* | 1757 | /* |
1643 | * Get each log item to allocate a vector structure for | 1758 | * Get each log item to allocate a vector structure for |
@@ -1648,9 +1763,7 @@ xfs_trans_commit_cil( | |||
1648 | if (!log_vector) | 1763 | if (!log_vector) |
1649 | return ENOMEM; | 1764 | return ENOMEM; |
1650 | 1765 | ||
1651 | error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); | 1766 | xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); |
1652 | if (error) | ||
1653 | return error; | ||
1654 | 1767 | ||
1655 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); | 1768 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); |
1656 | xfs_trans_free(tp); | 1769 | xfs_trans_free(tp); |
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c13c0f97b494..06a9759b6352 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
@@ -294,8 +294,8 @@ struct xfs_log_item_desc { | |||
294 | #define XFS_ALLOC_BTREE_REF 2 | 294 | #define XFS_ALLOC_BTREE_REF 2 |
295 | #define XFS_BMAP_BTREE_REF 2 | 295 | #define XFS_BMAP_BTREE_REF 2 |
296 | #define XFS_DIR_BTREE_REF 2 | 296 | #define XFS_DIR_BTREE_REF 2 |
297 | #define XFS_INO_REF 2 | ||
297 | #define XFS_ATTR_BTREE_REF 1 | 298 | #define XFS_ATTR_BTREE_REF 1 |
298 | #define XFS_INO_REF 1 | ||
299 | #define XFS_DQUOT_REF 1 | 299 | #define XFS_DQUOT_REF 1 |
300 | 300 | ||
301 | #ifdef __KERNEL__ | 301 | #ifdef __KERNEL__ |
@@ -399,8 +399,6 @@ typedef struct xfs_trans { | |||
399 | * transaction. */ | 399 | * transaction. */ |
400 | struct xfs_mount *t_mountp; /* ptr to fs mount struct */ | 400 | struct xfs_mount *t_mountp; /* ptr to fs mount struct */ |
401 | struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ | 401 | struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ |
402 | xfs_trans_callback_t t_callback; /* transaction callback */ | ||
403 | void *t_callarg; /* callback arg */ | ||
404 | unsigned int t_flags; /* misc flags */ | 402 | unsigned int t_flags; /* misc flags */ |
405 | int64_t t_icount_delta; /* superblock icount change */ | 403 | int64_t t_icount_delta; /* superblock icount change */ |
406 | int64_t t_ifree_delta; /* superblock ifree change */ | 404 | int64_t t_ifree_delta; /* superblock ifree change */ |
@@ -471,8 +469,7 @@ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); | |||
471 | void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); | 469 | void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); |
472 | void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); | 470 | void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); |
473 | void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); | 471 | void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); |
474 | int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, | 472 | void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); |
475 | xfs_ino_t , uint, uint, struct xfs_inode **); | ||
476 | void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); | 473 | void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); |
477 | void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); | 474 | void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); |
478 | void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); | 475 | void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); |
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index dc9069568ff7..5fc2380092c8 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c | |||
@@ -28,74 +28,138 @@ | |||
28 | #include "xfs_trans_priv.h" | 28 | #include "xfs_trans_priv.h" |
29 | #include "xfs_error.h" | 29 | #include "xfs_error.h" |
30 | 30 | ||
31 | STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); | 31 | struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ |
32 | STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); | ||
33 | STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); | ||
34 | STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); | ||
35 | 32 | ||
36 | #ifdef DEBUG | 33 | #ifdef DEBUG |
37 | STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *); | 34 | /* |
38 | #else | 35 | * Check that the list is sorted as it should be. |
36 | */ | ||
37 | STATIC void | ||
38 | xfs_ail_check( | ||
39 | struct xfs_ail *ailp, | ||
40 | xfs_log_item_t *lip) | ||
41 | { | ||
42 | xfs_log_item_t *prev_lip; | ||
43 | |||
44 | if (list_empty(&ailp->xa_ail)) | ||
45 | return; | ||
46 | |||
47 | /* | ||
48 | * Check the next and previous entries are valid. | ||
49 | */ | ||
50 | ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); | ||
51 | prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); | ||
52 | if (&prev_lip->li_ail != &ailp->xa_ail) | ||
53 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); | ||
54 | |||
55 | prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); | ||
56 | if (&prev_lip->li_ail != &ailp->xa_ail) | ||
57 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); | ||
58 | |||
59 | |||
60 | #ifdef XFS_TRANS_DEBUG | ||
61 | /* | ||
62 | * Walk the list checking lsn ordering, and that every entry has the | ||
63 | * XFS_LI_IN_AIL flag set. This is really expensive, so only do it | ||
64 | * when specifically debugging the transaction subsystem. | ||
65 | */ | ||
66 | prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); | ||
67 | list_for_each_entry(lip, &ailp->xa_ail, li_ail) { | ||
68 | if (&prev_lip->li_ail != &ailp->xa_ail) | ||
69 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); | ||
70 | ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); | ||
71 | prev_lip = lip; | ||
72 | } | ||
73 | #endif /* XFS_TRANS_DEBUG */ | ||
74 | } | ||
75 | #else /* !DEBUG */ | ||
39 | #define xfs_ail_check(a,l) | 76 | #define xfs_ail_check(a,l) |
40 | #endif /* DEBUG */ | 77 | #endif /* DEBUG */ |
41 | 78 | ||
79 | /* | ||
80 | * Return a pointer to the first item in the AIL. If the AIL is empty, then | ||
81 | * return NULL. | ||
82 | */ | ||
83 | static xfs_log_item_t * | ||
84 | xfs_ail_min( | ||
85 | struct xfs_ail *ailp) | ||
86 | { | ||
87 | if (list_empty(&ailp->xa_ail)) | ||
88 | return NULL; | ||
89 | |||
90 | return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * Return a pointer to the last item in the AIL. If the AIL is empty, then | ||
95 | * return NULL. | ||
96 | */ | ||
97 | static xfs_log_item_t * | ||
98 | xfs_ail_max( | ||
99 | struct xfs_ail *ailp) | ||
100 | { | ||
101 | if (list_empty(&ailp->xa_ail)) | ||
102 | return NULL; | ||
103 | |||
104 | return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail); | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Return a pointer to the item which follows the given item in the AIL. If | ||
109 | * the given item is the last item in the list, then return NULL. | ||
110 | */ | ||
111 | static xfs_log_item_t * | ||
112 | xfs_ail_next( | ||
113 | struct xfs_ail *ailp, | ||
114 | xfs_log_item_t *lip) | ||
115 | { | ||
116 | if (lip->li_ail.next == &ailp->xa_ail) | ||
117 | return NULL; | ||
118 | |||
119 | return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); | ||
120 | } | ||
42 | 121 | ||
43 | /* | 122 | /* |
44 | * This is called by the log manager code to determine the LSN | 123 | * This is called by the log manager code to determine the LSN of the tail of |
45 | * of the tail of the log. This is exactly the LSN of the first | 124 | * the log. This is exactly the LSN of the first item in the AIL. If the AIL |
46 | * item in the AIL. If the AIL is empty, then this function | 125 | * is empty, then this function returns 0. |
47 | * returns 0. | ||
48 | * | 126 | * |
49 | * We need the AIL lock in order to get a coherent read of the | 127 | * We need the AIL lock in order to get a coherent read of the lsn of the last |
50 | * lsn of the last item in the AIL. | 128 | * item in the AIL. |
51 | */ | 129 | */ |
52 | xfs_lsn_t | 130 | xfs_lsn_t |
53 | xfs_trans_ail_tail( | 131 | xfs_ail_min_lsn( |
54 | struct xfs_ail *ailp) | 132 | struct xfs_ail *ailp) |
55 | { | 133 | { |
56 | xfs_lsn_t lsn; | 134 | xfs_lsn_t lsn = 0; |
57 | xfs_log_item_t *lip; | 135 | xfs_log_item_t *lip; |
58 | 136 | ||
59 | spin_lock(&ailp->xa_lock); | 137 | spin_lock(&ailp->xa_lock); |
60 | lip = xfs_ail_min(ailp); | 138 | lip = xfs_ail_min(ailp); |
61 | if (lip == NULL) { | 139 | if (lip) |
62 | lsn = (xfs_lsn_t)0; | ||
63 | } else { | ||
64 | lsn = lip->li_lsn; | 140 | lsn = lip->li_lsn; |
65 | } | ||
66 | spin_unlock(&ailp->xa_lock); | 141 | spin_unlock(&ailp->xa_lock); |
67 | 142 | ||
68 | return lsn; | 143 | return lsn; |
69 | } | 144 | } |
70 | 145 | ||
71 | /* | 146 | /* |
72 | * xfs_trans_push_ail | 147 | * Return the maximum lsn held in the AIL, or zero if the AIL is empty. |
73 | * | ||
74 | * This routine is called to move the tail of the AIL forward. It does this by | ||
75 | * trying to flush items in the AIL whose lsns are below the given | ||
76 | * threshold_lsn. | ||
77 | * | ||
78 | * the push is run asynchronously in a separate thread, so we return the tail | ||
79 | * of the log right now instead of the tail after the push. This means we will | ||
80 | * either continue right away, or we will sleep waiting on the async thread to | ||
81 | * do its work. | ||
82 | * | ||
83 | * We do this unlocked - we only need to know whether there is anything in the | ||
84 | * AIL at the time we are called. We don't need to access the contents of | ||
85 | * any of the objects, so the lock is not needed. | ||
86 | */ | 148 | */ |
87 | void | 149 | static xfs_lsn_t |
88 | xfs_trans_ail_push( | 150 | xfs_ail_max_lsn( |
89 | struct xfs_ail *ailp, | 151 | struct xfs_ail *ailp) |
90 | xfs_lsn_t threshold_lsn) | ||
91 | { | 152 | { |
92 | xfs_log_item_t *lip; | 153 | xfs_lsn_t lsn = 0; |
154 | xfs_log_item_t *lip; | ||
93 | 155 | ||
94 | lip = xfs_ail_min(ailp); | 156 | spin_lock(&ailp->xa_lock); |
95 | if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { | 157 | lip = xfs_ail_max(ailp); |
96 | if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) | 158 | if (lip) |
97 | xfsaild_wakeup(ailp, threshold_lsn); | 159 | lsn = lip->li_lsn; |
98 | } | 160 | spin_unlock(&ailp->xa_lock); |
161 | |||
162 | return lsn; | ||
99 | } | 163 | } |
100 | 164 | ||
101 | /* | 165 | /* |
@@ -236,35 +300,78 @@ out: | |||
236 | } | 300 | } |
237 | 301 | ||
238 | /* | 302 | /* |
239 | * xfsaild_push does the work of pushing on the AIL. Returning a timeout of | 303 | * splice the log item list into the AIL at the given LSN. |
240 | * zero indicates that the caller should sleep until woken. | ||
241 | */ | 304 | */ |
242 | long | 305 | static void |
243 | xfsaild_push( | 306 | xfs_ail_splice( |
244 | struct xfs_ail *ailp, | 307 | struct xfs_ail *ailp, |
245 | xfs_lsn_t *last_lsn) | 308 | struct list_head *list, |
309 | xfs_lsn_t lsn) | ||
246 | { | 310 | { |
247 | long tout = 0; | 311 | xfs_log_item_t *next_lip; |
248 | xfs_lsn_t last_pushed_lsn = *last_lsn; | 312 | |
249 | xfs_lsn_t target = ailp->xa_target; | 313 | /* If the list is empty, just insert the item. */ |
250 | xfs_lsn_t lsn; | 314 | if (list_empty(&ailp->xa_ail)) { |
251 | xfs_log_item_t *lip; | 315 | list_splice(list, &ailp->xa_ail); |
252 | int flush_log, count, stuck; | 316 | return; |
253 | xfs_mount_t *mp = ailp->xa_mount; | 317 | } |
318 | |||
319 | list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { | ||
320 | if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) | ||
321 | break; | ||
322 | } | ||
323 | |||
324 | ASSERT(&next_lip->li_ail == &ailp->xa_ail || | ||
325 | XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0); | ||
326 | |||
327 | list_splice_init(list, &next_lip->li_ail); | ||
328 | } | ||
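xfs_ail_splice() walks the list in reverse because log items are almost always inserted at or near the tail of the LSN-sorted AIL, so the scan usually stops after a step or two. A self-contained sketch of the same reverse-scan splice on a sentinel-headed circular doubly linked list (simplified node type, not the kernel's list.h):

    struct node {
        int lsn;                    /* sort key, ascending */
        struct node *prev, *next;
    };

    /* Link the chain first..last in right after pos. */
    static void link_after(struct node *pos, struct node *first,
                           struct node *last)
    {
        last->next = pos->next;
        pos->next->prev = last;
        pos->next = first;
        first->prev = pos;
    }

    /* Splice a chain of items sharing "lsn" into the sorted list,
     * scanning backwards from the tail for the insertion point. */
    void splice_sorted(struct node *head, struct node *first,
                       struct node *last, int lsn)
    {
        struct node *p;

        for (p = head->prev; p != head; p = p->prev)
            if (p->lsn <= lsn)
                break;
        /* p is the last entry with key <= lsn, or the sentinel. */
        link_after(p, first, last);
    }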
329 | |||
330 | /* | ||
331 | * Delete the given item from the AIL. Return a pointer to the item. | ||
332 | */ | ||
333 | static void | ||
334 | xfs_ail_delete( | ||
335 | struct xfs_ail *ailp, | ||
336 | xfs_log_item_t *lip) | ||
337 | { | ||
338 | xfs_ail_check(ailp, lip); | ||
339 | list_del(&lip->li_ail); | ||
340 | xfs_trans_ail_cursor_clear(ailp, lip); | ||
341 | } | ||
342 | |||
343 | /* | ||
344 | * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself | ||
345 | * to run at a later time if there is more work to do to complete the push. | ||
346 | */ | ||
347 | STATIC void | ||
348 | xfs_ail_worker( | ||
349 | struct work_struct *work) | ||
350 | { | ||
351 | struct xfs_ail *ailp = container_of(to_delayed_work(work), | ||
352 | struct xfs_ail, xa_work); | ||
353 | xfs_mount_t *mp = ailp->xa_mount; | ||
254 | struct xfs_ail_cursor *cur = &ailp->xa_cursors; | 354 | struct xfs_ail_cursor *cur = &ailp->xa_cursors; |
255 | int push_xfsbufd = 0; | 355 | xfs_log_item_t *lip; |
356 | xfs_lsn_t lsn; | ||
357 | xfs_lsn_t target; | ||
358 | long tout = 10; | ||
359 | int flush_log = 0; | ||
360 | int stuck = 0; | ||
361 | int count = 0; | ||
362 | int push_xfsbufd = 0; | ||
256 | 363 | ||
257 | spin_lock(&ailp->xa_lock); | 364 | spin_lock(&ailp->xa_lock); |
365 | target = ailp->xa_target; | ||
258 | xfs_trans_ail_cursor_init(ailp, cur); | 366 | xfs_trans_ail_cursor_init(ailp, cur); |
259 | lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn); | 367 | lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn); |
260 | if (!lip || XFS_FORCED_SHUTDOWN(mp)) { | 368 | if (!lip || XFS_FORCED_SHUTDOWN(mp)) { |
261 | /* | 369 | /* |
262 | * AIL is empty or our push has reached the end. | 370 | * AIL is empty or our push has reached the end. |
263 | */ | 371 | */ |
264 | xfs_trans_ail_cursor_done(ailp, cur); | 372 | xfs_trans_ail_cursor_done(ailp, cur); |
265 | spin_unlock(&ailp->xa_lock); | 373 | spin_unlock(&ailp->xa_lock); |
266 | *last_lsn = 0; | 374 | goto out_done; |
267 | return tout; | ||
268 | } | 375 | } |
269 | 376 | ||
270 | XFS_STATS_INC(xs_push_ail); | 377 | XFS_STATS_INC(xs_push_ail); |
@@ -281,8 +388,7 @@ xfsaild_push( | |||
281 | * lots of contention on the AIL lists. | 388 | * lots of contention on the AIL lists. |
282 | */ | 389 | */ |
283 | lsn = lip->li_lsn; | 390 | lsn = lip->li_lsn; |
284 | flush_log = stuck = count = 0; | 391 | while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { |
285 | while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { | ||
286 | int lock_result; | 392 | int lock_result; |
287 | /* | 393 | /* |
288 | * If we can lock the item without sleeping, unlock the AIL | 394 | * If we can lock the item without sleeping, unlock the AIL |
@@ -301,13 +407,13 @@ xfsaild_push( | |||
301 | case XFS_ITEM_SUCCESS: | 407 | case XFS_ITEM_SUCCESS: |
302 | XFS_STATS_INC(xs_push_ail_success); | 408 | XFS_STATS_INC(xs_push_ail_success); |
303 | IOP_PUSH(lip); | 409 | IOP_PUSH(lip); |
304 | last_pushed_lsn = lsn; | 410 | ailp->xa_last_pushed_lsn = lsn; |
305 | break; | 411 | break; |
306 | 412 | ||
307 | case XFS_ITEM_PUSHBUF: | 413 | case XFS_ITEM_PUSHBUF: |
308 | XFS_STATS_INC(xs_push_ail_pushbuf); | 414 | XFS_STATS_INC(xs_push_ail_pushbuf); |
309 | IOP_PUSHBUF(lip); | 415 | IOP_PUSHBUF(lip); |
310 | last_pushed_lsn = lsn; | 416 | ailp->xa_last_pushed_lsn = lsn; |
311 | push_xfsbufd = 1; | 417 | push_xfsbufd = 1; |
312 | break; | 418 | break; |
313 | 419 | ||
@@ -319,7 +425,7 @@ xfsaild_push( | |||
319 | 425 | ||
320 | case XFS_ITEM_LOCKED: | 426 | case XFS_ITEM_LOCKED: |
321 | XFS_STATS_INC(xs_push_ail_locked); | 427 | XFS_STATS_INC(xs_push_ail_locked); |
322 | last_pushed_lsn = lsn; | 428 | ailp->xa_last_pushed_lsn = lsn; |
323 | stuck++; | 429 | stuck++; |
324 | break; | 430 | break; |
325 | 431 | ||
@@ -374,9 +480,27 @@ xfsaild_push( | |||
374 | wake_up_process(mp->m_ddev_targp->bt_task); | 480 | wake_up_process(mp->m_ddev_targp->bt_task); |
375 | } | 481 | } |
376 | 482 | ||
483 | /* assume we have more work to do in a short while */ | ||
484 | out_done: | ||
377 | if (!count) { | 485 | if (!count) { |
378 | /* We're past our target or empty, so idle */ | 486 | /* We're past our target or empty, so idle */ |
379 | last_pushed_lsn = 0; | 487 | ailp->xa_last_pushed_lsn = 0; |
488 | |||
489 | /* | ||
490 | * We clear the XFS_AIL_PUSHING_BIT first before checking | ||
491 | * whether the target has changed. If the target has changed, | ||
492 | * this pushes the requeue race directly onto the result of the | ||
493 | * atomic test/set bit, so we are guaranteed that either the | ||
494 | * pusher that changed the target or ourselves will requeue | ||
495 | * the work (but not both). | ||
496 | */ | ||
497 | clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); | ||
498 | smp_rmb(); | ||
499 | if (XFS_LSN_CMP(ailp->xa_target, target) == 0 || | ||
500 | test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) | ||
501 | return; | ||
502 | |||
503 | tout = 50; | ||
380 | } else if (XFS_LSN_CMP(lsn, target) >= 0) { | 504 | } else if (XFS_LSN_CMP(lsn, target) >= 0) { |
381 | /* | 505 | /* |
382 | * We reached the target so wait a bit longer for I/O to | 506 | * We reached the target so wait a bit longer for I/O to |
@@ -384,7 +508,7 @@ xfsaild_push( | |||
384 | * start the next scan from the start of the AIL. | 508 | * start the next scan from the start of the AIL. |
385 | */ | 509 | */ |
386 | tout = 50; | 510 | tout = 50; |
387 | last_pushed_lsn = 0; | 511 | ailp->xa_last_pushed_lsn = 0; |
388 | } else if ((stuck * 100) / count > 90) { | 512 | } else if ((stuck * 100) / count > 90) { |
389 | /* | 513 | /* |
390 | * Either there is a lot of contention on the AIL or we | 514 | * Either there is a lot of contention on the AIL or we |
@@ -396,14 +520,61 @@ xfsaild_push( | |||
396 | * continuing from where we were. | 520 | * continuing from where we were. |
397 | */ | 521 | */ |
398 | tout = 20; | 522 | tout = 20; |
399 | } else { | ||
400 | /* more to do, but wait a short while before continuing */ | ||
401 | tout = 10; | ||
402 | } | 523 | } |
403 | *last_lsn = last_pushed_lsn; | 524 | |
404 | return tout; | 525 | /* There is more to do, requeue us. */ |
526 | queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, | ||
527 | msecs_to_jiffies(tout)); | ||
405 | } | 528 | } |
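The clear-then-recheck sequence at the end of the worker pairs with the test_and_set_bit() in xfs_ail_push() below so that exactly one side requeues the work. A compressed model of that handshake using C11 atomics in place of the kernel bitops; queue_work() is a hypothetical stand-in for queue_delayed_work():

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool pushing;        /* models XFS_AIL_PUSHING_BIT */
    static _Atomic long push_target;   /* models ailp->xa_target */

    static void queue_work(void) { /* stand-in for queue_delayed_work() */ }

    /* Pusher: publish the new target, then queue the worker only if
     * no push is already in flight. */
    void push(long new_target)
    {
        atomic_store(&push_target, new_target);
        if (!atomic_exchange(&pushing, true))
            queue_work();
    }

    /* Worker, on going idle: clear the flag *before* re-reading the
     * target, so a racing pusher either still sees the flag set (and
     * skips queueing) or loses the atomic_exchange() to us. Either
     * way, exactly one party requeues. */
    void worker_idle(long scanned_target)
    {
        atomic_store(&pushing, false);
        if (atomic_load(&push_target) == scanned_target)
            return;                 /* target unchanged: stay idle */
        if (atomic_exchange(&pushing, true))
            return;                 /* the pusher already requeued */
        queue_work();               /* we saw the change: requeue */
    }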
406 | 529 | ||
530 | /* | ||
531 | * This routine is called to move the tail of the AIL forward. It does this by | ||
532 | * trying to flush items in the AIL whose lsns are below the given | ||
533 | * threshold_lsn. | ||
534 | * | ||
535 | * The push is run asynchronously in a workqueue, which means the caller needs | ||
536 | * to handle waiting on the async flush for space to become available. | ||
537 | * We don't want to interrupt any push that is in progress, hence we only queue | ||
538 | * work if we set the pushing bit appropriately. | ||
539 | * | ||
540 | * We do this unlocked - we only need to know whether there is anything in the | ||
541 | * AIL at the time we are called. We don't need to access the contents of | ||
542 | * any of the objects, so the lock is not needed. | ||
543 | */ | ||
544 | void | ||
545 | xfs_ail_push( | ||
546 | struct xfs_ail *ailp, | ||
547 | xfs_lsn_t threshold_lsn) | ||
548 | { | ||
549 | xfs_log_item_t *lip; | ||
550 | |||
551 | lip = xfs_ail_min(ailp); | ||
552 | if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) || | ||
553 | XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0) | ||
554 | return; | ||
555 | |||
556 | /* | ||
557 | * Ensure that the new target is noticed in push code before it clears | ||
558 | * the XFS_AIL_PUSHING_BIT. | ||
559 | */ | ||
560 | smp_wmb(); | ||
561 | xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); | ||
562 | if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) | ||
563 | queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); | ||
564 | } | ||
565 | |||
566 | /* | ||
567 | * Push out all items in the AIL immediately | ||
568 | */ | ||
569 | void | ||
570 | xfs_ail_push_all( | ||
571 | struct xfs_ail *ailp) | ||
572 | { | ||
573 | xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp); | ||
574 | |||
575 | if (threshold_lsn) | ||
576 | xfs_ail_push(ailp, threshold_lsn); | ||
577 | } | ||
407 | 578 | ||
408 | /* | 579 | /* |
409 | * This is to be called when an item is unlocked that may have | 580 | * This is to be called when an item is unlocked that may have |
@@ -449,129 +620,152 @@ xfs_trans_unlocked_item( | |||
449 | xfs_log_move_tail(ailp->xa_mount, 1); | 620 | xfs_log_move_tail(ailp->xa_mount, 1); |
450 | } /* xfs_trans_unlocked_item */ | 621 | } /* xfs_trans_unlocked_item */ |
451 | 622 | ||
452 | |||
453 | /* | 623 | /* |
454 | * Update the position of the item in the AIL with the new | 624 | * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
455 | * lsn. If it is not yet in the AIL, add it. Otherwise, move | 625 | * |
456 | * it to its new position by removing it and re-adding it. | 626 | * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
627 | * positioned at the same LSN in the AIL. If an item is not in the AIL, it will | ||
628 | * be added. Otherwise, it will be repositioned by removing it and re-adding | ||
629 | * it to the AIL. If we move the first item in the AIL, update the log tail to | ||
630 | * match the new minimum LSN in the AIL. | ||
631 | * | ||
632 | * This function takes the AIL lock once to execute the update operations on | ||
633 | * all the items in the array, and as such should not be called with the AIL | ||
634 | * lock held. As a result, once we have the AIL lock, we need to check each log | ||
635 | * item LSN to confirm it needs to be moved forward in the AIL. | ||
457 | * | 636 | * |
458 | * Wakeup anyone with an lsn less than the item's lsn. If the item | 637 | * To optimise the insert operation, we delete all the items from the AIL in |
459 | * we move in the AIL is the minimum one, update the tail lsn in the | 638 | * the first pass, moving them into a temporary list, then splice the temporary |
460 | * log manager. | 639 | * list into the correct position in the AIL. This avoids needing to do an |
640 | * insert operation on every item. | ||
461 | * | 641 | * |
462 | * This function must be called with the AIL lock held. The lock | 642 | * This function must be called with the AIL lock held. The lock is dropped |
463 | * is dropped before returning. | 643 | * before returning. |
464 | */ | 644 | */ |
465 | void | 645 | void |
466 | xfs_trans_ail_update( | 646 | xfs_trans_ail_update_bulk( |
467 | struct xfs_ail *ailp, | 647 | struct xfs_ail *ailp, |
468 | xfs_log_item_t *lip, | 648 | struct xfs_log_item **log_items, |
469 | xfs_lsn_t lsn) __releases(ailp->xa_lock) | 649 | int nr_items, |
650 | xfs_lsn_t lsn) __releases(ailp->xa_lock) | ||
470 | { | 651 | { |
471 | xfs_log_item_t *dlip = NULL; | 652 | xfs_log_item_t *mlip; |
472 | xfs_log_item_t *mlip; /* ptr to minimum lip */ | ||
473 | xfs_lsn_t tail_lsn; | 653 | xfs_lsn_t tail_lsn; |
654 | int mlip_changed = 0; | ||
655 | int i; | ||
656 | LIST_HEAD(tmp); | ||
474 | 657 | ||
475 | mlip = xfs_ail_min(ailp); | 658 | mlip = xfs_ail_min(ailp); |
476 | 659 | ||
477 | if (lip->li_flags & XFS_LI_IN_AIL) { | 660 | for (i = 0; i < nr_items; i++) { |
478 | dlip = xfs_ail_delete(ailp, lip); | 661 | struct xfs_log_item *lip = log_items[i]; |
479 | ASSERT(dlip == lip); | 662 | if (lip->li_flags & XFS_LI_IN_AIL) { |
480 | xfs_trans_ail_cursor_clear(ailp, dlip); | 663 | /* check if we really need to move the item */ |
481 | } else { | 664 | if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) |
482 | lip->li_flags |= XFS_LI_IN_AIL; | 665 | continue; |
666 | |||
667 | xfs_ail_delete(ailp, lip); | ||
668 | if (mlip == lip) | ||
669 | mlip_changed = 1; | ||
670 | } else { | ||
671 | lip->li_flags |= XFS_LI_IN_AIL; | ||
672 | } | ||
673 | lip->li_lsn = lsn; | ||
674 | list_add(&lip->li_ail, &tmp); | ||
483 | } | 675 | } |
484 | 676 | ||
485 | lip->li_lsn = lsn; | 677 | xfs_ail_splice(ailp, &tmp, lsn); |
486 | xfs_ail_insert(ailp, lip); | ||
487 | 678 | ||
488 | if (mlip == dlip) { | 679 | if (!mlip_changed) { |
489 | mlip = xfs_ail_min(ailp); | ||
490 | /* | ||
491 | * It is not safe to access mlip after the AIL lock is | ||
492 | * dropped, so we must get a copy of li_lsn before we do | ||
493 | * so. This is especially important on 32-bit platforms | ||
494 | * where accessing and updating 64-bit values like li_lsn | ||
495 | * is not atomic. | ||
496 | */ | ||
497 | tail_lsn = mlip->li_lsn; | ||
498 | spin_unlock(&ailp->xa_lock); | ||
499 | xfs_log_move_tail(ailp->xa_mount, tail_lsn); | ||
500 | } else { | ||
501 | spin_unlock(&ailp->xa_lock); | 680 | spin_unlock(&ailp->xa_lock); |
681 | return; | ||
502 | } | 682 | } |
503 | 683 | ||
504 | 684 | /* | |
505 | } /* xfs_trans_update_ail */ | 685 | * It is not safe to access mlip after the AIL lock is dropped, so we |
686 | * must get a copy of li_lsn before we do so. This is especially | ||
687 | * important on 32-bit platforms where accessing and updating 64-bit | ||
688 | * values like li_lsn is not atomic. | ||
689 | */ | ||
690 | mlip = xfs_ail_min(ailp); | ||
691 | tail_lsn = mlip->li_lsn; | ||
692 | spin_unlock(&ailp->xa_lock); | ||
693 | xfs_log_move_tail(ailp->xa_mount, tail_lsn); | ||
694 | } | ||
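Copying li_lsn while still holding the lock matters because, as the comment notes, a 64-bit load on a 32-bit machine takes two instructions, so an unlocked reader can see half of an old value combined with half of a new one. A userspace illustration of the safe snapshot pattern, with a pthread mutex in place of the spinlock:

    #include <stdint.h>
    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t shared_lsn;     /* only updated under "lock" */

    /* Snapshot the 64-bit value while writers are excluded; the copy
     * stays coherent after the unlock, unlike a direct unlocked read,
     * which may tear on 32-bit targets. */
    uint64_t read_lsn(void)
    {
        uint64_t copy;

        pthread_mutex_lock(&lock);
        copy = shared_lsn;
        pthread_mutex_unlock(&lock);
        return copy;
    }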
506 | 695 | ||
507 | /* | 696 | /* |
508 | * Delete the given item from the AIL. It must already be in | 697 | * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL |
509 | * the AIL. | 698 | * |
699 | * @xfs_trans_ail_delete_bulk takes an array of log items that all need to be | ||
700 | * removed from the AIL. The caller is already holding the AIL lock, and has done | ||
701 | * all the checks necessary to ensure the items passed in via @log_items are | ||
702 | * ready for deletion. This includes checking that the items are in the AIL. | ||
510 | * | 703 | * |
511 | * Wakeup anyone with an lsn less than item's lsn. If the item | 704 | * For each log item to be removed, unlink it from the AIL, clear the IN_AIL |
512 | * we delete in the AIL is the minimum one, update the tail lsn in the | 705 | * flag from the item and reset the item's lsn to 0. If we remove the first |
513 | * log manager. | 706 | * item in the AIL, update the log tail to match the new minimum LSN in the |
707 | * AIL. | ||
514 | * | 708 | * |
515 | * Clear the IN_AIL flag from the item, reset its lsn to 0, and | 709 | * This function will not drop the AIL lock until all items are removed from |
516 | * bump the AIL's generation count to indicate that the tree | 710 | * the AIL to minimise the amount of lock traffic on the AIL. This does not |
517 | * has changed. | 711 | * greatly increase the AIL hold time, but does significantly reduce the amount |
712 | * of traffic on the lock, especially during IO completion. | ||
518 | * | 713 | * |
519 | * This function must be called with the AIL lock held. The lock | 714 | * This function must be called with the AIL lock held. The lock is dropped |
520 | * is dropped before returning. | 715 | * before returning. |
521 | */ | 716 | */ |
522 | void | 717 | void |
523 | xfs_trans_ail_delete( | 718 | xfs_trans_ail_delete_bulk( |
524 | struct xfs_ail *ailp, | 719 | struct xfs_ail *ailp, |
525 | xfs_log_item_t *lip) __releases(ailp->xa_lock) | 720 | struct xfs_log_item **log_items, |
721 | int nr_items) __releases(ailp->xa_lock) | ||
526 | { | 722 | { |
527 | xfs_log_item_t *dlip; | ||
528 | xfs_log_item_t *mlip; | 723 | xfs_log_item_t *mlip; |
529 | xfs_lsn_t tail_lsn; | 724 | xfs_lsn_t tail_lsn; |
725 | int mlip_changed = 0; | ||
726 | int i; | ||
530 | 727 | ||
531 | if (lip->li_flags & XFS_LI_IN_AIL) { | 728 | mlip = xfs_ail_min(ailp); |
532 | mlip = xfs_ail_min(ailp); | ||
533 | dlip = xfs_ail_delete(ailp, lip); | ||
534 | ASSERT(dlip == lip); | ||
535 | xfs_trans_ail_cursor_clear(ailp, dlip); | ||
536 | 729 | ||
730 | for (i = 0; i < nr_items; i++) { | ||
731 | struct xfs_log_item *lip = log_items[i]; | ||
732 | if (!(lip->li_flags & XFS_LI_IN_AIL)) { | ||
733 | struct xfs_mount *mp = ailp->xa_mount; | ||
537 | 734 | ||
538 | lip->li_flags &= ~XFS_LI_IN_AIL; | ||
539 | lip->li_lsn = 0; | ||
540 | |||
541 | if (mlip == dlip) { | ||
542 | mlip = xfs_ail_min(ailp); | ||
543 | /* | ||
544 | * It is not safe to access mlip after the AIL lock | ||
545 | * is dropped, so we must get a copy of li_lsn | ||
546 | * before we do so. This is especially important | ||
547 | * on 32-bit platforms where accessing and updating | ||
548 | * 64-bit values like li_lsn is not atomic. | ||
549 | */ | ||
550 | tail_lsn = mlip ? mlip->li_lsn : 0; | ||
551 | spin_unlock(&ailp->xa_lock); | ||
552 | xfs_log_move_tail(ailp->xa_mount, tail_lsn); | ||
553 | } else { | ||
554 | spin_unlock(&ailp->xa_lock); | 735 | spin_unlock(&ailp->xa_lock); |
736 | if (!XFS_FORCED_SHUTDOWN(mp)) { | ||
737 | xfs_alert_tag(mp, XFS_PTAG_AILDELETE, | ||
738 | "%s: attempting to delete a log item that is not in the AIL", | ||
739 | __func__); | ||
740 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | ||
741 | } | ||
742 | return; | ||
555 | } | 743 | } |
744 | |||
745 | xfs_ail_delete(ailp, lip); | ||
746 | lip->li_flags &= ~XFS_LI_IN_AIL; | ||
747 | lip->li_lsn = 0; | ||
748 | if (mlip == lip) | ||
749 | mlip_changed = 1; | ||
556 | } | 750 | } |
557 | else { | ||
558 | /* | ||
559 | * If the file system is not being shutdown, we are in | ||
560 | * serious trouble if we get to this stage. | ||
561 | */ | ||
562 | struct xfs_mount *mp = ailp->xa_mount; | ||
563 | 751 | ||
752 | if (!mlip_changed) { | ||
564 | spin_unlock(&ailp->xa_lock); | 753 | spin_unlock(&ailp->xa_lock); |
565 | if (!XFS_FORCED_SHUTDOWN(mp)) { | 754 | return; |
566 | xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, | ||
567 | "%s: attempting to delete a log item that is not in the AIL", | ||
568 | __func__); | ||
569 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | ||
570 | } | ||
571 | } | 755 | } |
572 | } | ||
573 | |||
574 | 756 | ||
757 | /* | ||
758 | * It is not safe to access mlip after the AIL lock is dropped, so we | ||
759 | * must get a copy of li_lsn before we do so. This is especially | ||
760 | * important on 32-bit platforms where accessing and updating 64-bit | ||
761 | * values like li_lsn is not atomic. It is possible we've emptied the | ||
762 | * AIL here, so if that is the case, pass an LSN of 0 to the tail move. | ||
763 | */ | ||
764 | mlip = xfs_ail_min(ailp); | ||
765 | tail_lsn = mlip ? mlip->li_lsn : 0; | ||
766 | spin_unlock(&ailp->xa_lock); | ||
767 | xfs_log_move_tail(ailp->xa_mount, tail_lsn); | ||
768 | } | ||
575 | 769 | ||
576 | /* | 770 | /* |
577 | * The active item list (AIL) is a doubly linked list of log | 771 | * The active item list (AIL) is a doubly linked list of log |
@@ -592,7 +786,6 @@ xfs_trans_ail_init( | |||
592 | xfs_mount_t *mp) | 786 | xfs_mount_t *mp) |
593 | { | 787 | { |
594 | struct xfs_ail *ailp; | 788 | struct xfs_ail *ailp; |
595 | int error; | ||
596 | 789 | ||
597 | ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); | 790 | ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); |
598 | if (!ailp) | 791 | if (!ailp) |
@@ -601,15 +794,9 @@ xfs_trans_ail_init( | |||
601 | ailp->xa_mount = mp; | 794 | ailp->xa_mount = mp; |
602 | INIT_LIST_HEAD(&ailp->xa_ail); | 795 | INIT_LIST_HEAD(&ailp->xa_ail); |
603 | spin_lock_init(&ailp->xa_lock); | 796 | spin_lock_init(&ailp->xa_lock); |
604 | error = xfsaild_start(ailp); | 797 | INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); |
605 | if (error) | ||
606 | goto out_free_ailp; | ||
607 | mp->m_ail = ailp; | 798 | mp->m_ail = ailp; |
608 | return 0; | 799 | return 0; |
609 | |||
610 | out_free_ailp: | ||
611 | kmem_free(ailp); | ||
612 | return error; | ||
613 | } | 800 | } |
614 | 801 | ||
615 | void | 802 | void |
@@ -618,135 +805,6 @@ xfs_trans_ail_destroy( | |||
618 | { | 805 | { |
619 | struct xfs_ail *ailp = mp->m_ail; | 806 | struct xfs_ail *ailp = mp->m_ail; |
620 | 807 | ||
621 | xfsaild_stop(ailp); | 808 | cancel_delayed_work_sync(&ailp->xa_work); |
622 | kmem_free(ailp); | 809 | kmem_free(ailp); |
623 | } | 810 | } |
624 | |||
625 | /* | ||
626 | * Insert the given log item into the AIL. | ||
627 | * We almost always insert at the end of the list, so on inserts | ||
628 | * we search from the end of the list to find where the | ||
629 | * new item belongs. | ||
630 | */ | ||
631 | STATIC void | ||
632 | xfs_ail_insert( | ||
633 | struct xfs_ail *ailp, | ||
634 | xfs_log_item_t *lip) | ||
635 | /* ARGSUSED */ | ||
636 | { | ||
637 | xfs_log_item_t *next_lip; | ||
638 | |||
639 | /* | ||
640 | * If the list is empty, just insert the item. | ||
641 | */ | ||
642 | if (list_empty(&ailp->xa_ail)) { | ||
643 | list_add(&lip->li_ail, &ailp->xa_ail); | ||
644 | return; | ||
645 | } | ||
646 | |||
647 | list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { | ||
648 | if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) | ||
649 | break; | ||
650 | } | ||
651 | |||
652 | ASSERT((&next_lip->li_ail == &ailp->xa_ail) || | ||
653 | (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); | ||
654 | |||
655 | list_add(&lip->li_ail, &next_lip->li_ail); | ||
656 | |||
657 | xfs_ail_check(ailp, lip); | ||
658 | return; | ||
659 | } | ||
660 | |||
661 | /* | ||
662 | * Delete the given item from the AIL. Return a pointer to the item. | ||
663 | */ | ||
664 | /*ARGSUSED*/ | ||
665 | STATIC xfs_log_item_t * | ||
666 | xfs_ail_delete( | ||
667 | struct xfs_ail *ailp, | ||
668 | xfs_log_item_t *lip) | ||
669 | /* ARGSUSED */ | ||
670 | { | ||
671 | xfs_ail_check(ailp, lip); | ||
672 | |||
673 | list_del(&lip->li_ail); | ||
674 | |||
675 | return lip; | ||
676 | } | ||
677 | |||
678 | /* | ||
679 | * Return a pointer to the first item in the AIL. | ||
680 | * If the AIL is empty, then return NULL. | ||
681 | */ | ||
682 | STATIC xfs_log_item_t * | ||
683 | xfs_ail_min( | ||
684 | struct xfs_ail *ailp) | ||
685 | /* ARGSUSED */ | ||
686 | { | ||
687 | if (list_empty(&ailp->xa_ail)) | ||
688 | return NULL; | ||
689 | |||
690 | return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); | ||
691 | } | ||
692 | |||
693 | /* | ||
694 | * Return a pointer to the item which follows | ||
695 | * the given item in the AIL. If the given item | ||
696 | * is the last item in the list, then return NULL. | ||
697 | */ | ||
698 | STATIC xfs_log_item_t * | ||
699 | xfs_ail_next( | ||
700 | struct xfs_ail *ailp, | ||
701 | xfs_log_item_t *lip) | ||
702 | /* ARGSUSED */ | ||
703 | { | ||
704 | if (lip->li_ail.next == &ailp->xa_ail) | ||
705 | return NULL; | ||
706 | |||
707 | return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); | ||
708 | } | ||
709 | |||
710 | #ifdef DEBUG | ||
711 | /* | ||
712 | * Check that the list is sorted as it should be. | ||
713 | */ | ||
714 | STATIC void | ||
715 | xfs_ail_check( | ||
716 | struct xfs_ail *ailp, | ||
717 | xfs_log_item_t *lip) | ||
718 | { | ||
719 | xfs_log_item_t *prev_lip; | ||
720 | |||
721 | if (list_empty(&ailp->xa_ail)) | ||
722 | return; | ||
723 | |||
724 | /* | ||
725 | * Check the next and previous entries are valid. | ||
726 | */ | ||
727 | ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); | ||
728 | prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); | ||
729 | if (&prev_lip->li_ail != &ailp->xa_ail) | ||
730 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); | ||
731 | |||
732 | prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); | ||
733 | if (&prev_lip->li_ail != &ailp->xa_ail) | ||
734 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); | ||
735 | |||
736 | |||
737 | #ifdef XFS_TRANS_DEBUG | ||
738 | /* | ||
739 | * Walk the list checking lsn ordering, and that every entry has the | ||
740 | * XFS_LI_IN_AIL flag set. This is really expensive, so only do it | ||
741 | * when specifically debugging the transaction subsystem. | ||
742 | */ | ||
743 | prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); | ||
744 | list_for_each_entry(lip, &ailp->xa_ail, li_ail) { | ||
745 | if (&prev_lip->li_ail != &ailp->xa_ail) | ||
746 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); | ||
747 | ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); | ||
748 | prev_lip = lip; | ||
749 | } | ||
750 | #endif /* XFS_TRANS_DEBUG */ | ||
751 | } | ||
752 | #endif /* DEBUG */ | ||
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 90af025e6839..03b3b7f85a3b 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c | |||
@@ -305,7 +305,7 @@ xfs_trans_read_buf( | |||
305 | if (xfs_error_target == target) { | 305 | if (xfs_error_target == target) { |
306 | if (((xfs_req_num++) % xfs_error_mod) == 0) { | 306 | if (((xfs_req_num++) % xfs_error_mod) == 0) { |
307 | xfs_buf_relse(bp); | 307 | xfs_buf_relse(bp); |
308 | cmn_err(CE_DEBUG, "Returning error!\n"); | 308 | xfs_debug(mp, "Returning error!"); |
309 | return XFS_ERROR(EIO); | 309 | return XFS_ERROR(EIO); |
310 | } | 310 | } |
311 | } | 311 | } |
@@ -336,7 +336,7 @@ xfs_trans_read_buf( | |||
336 | ASSERT(!XFS_BUF_ISASYNC(bp)); | 336 | ASSERT(!XFS_BUF_ISASYNC(bp)); |
337 | XFS_BUF_READ(bp); | 337 | XFS_BUF_READ(bp); |
338 | xfsbdstrat(tp->t_mountp, bp); | 338 | xfsbdstrat(tp->t_mountp, bp); |
339 | error = xfs_iowait(bp); | 339 | error = xfs_buf_iowait(bp); |
340 | if (error) { | 340 | if (error) { |
341 | xfs_ioerror_alert("xfs_trans_read_buf", mp, | 341 | xfs_ioerror_alert("xfs_trans_read_buf", mp, |
342 | bp, blkno); | 342 | bp, blkno); |
@@ -383,7 +383,8 @@ xfs_trans_read_buf( | |||
383 | bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); | 383 | bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); |
384 | if (bp == NULL) { | 384 | if (bp == NULL) { |
385 | *bpp = NULL; | 385 | *bpp = NULL; |
386 | return 0; | 386 | return (flags & XBF_TRYLOCK) ? |
387 | 0 : XFS_ERROR(ENOMEM); | ||
387 | } | 388 | } |
388 | if (XFS_BUF_GETERROR(bp) != 0) { | 389 | if (XFS_BUF_GETERROR(bp) != 0) { |
389 | XFS_BUF_SUPER_STALE(bp); | 390 | XFS_BUF_SUPER_STALE(bp); |
@@ -403,7 +404,7 @@ xfs_trans_read_buf( | |||
403 | xfs_force_shutdown(tp->t_mountp, | 404 | xfs_force_shutdown(tp->t_mountp, |
404 | SHUTDOWN_META_IO_ERROR); | 405 | SHUTDOWN_META_IO_ERROR); |
405 | xfs_buf_relse(bp); | 406 | xfs_buf_relse(bp); |
406 | cmn_err(CE_DEBUG, "Returning trans error!\n"); | 407 | xfs_debug(mp, "Returning trans error!"); |
407 | return XFS_ERROR(EIO); | 408 | return XFS_ERROR(EIO); |
408 | } | 409 | } |
409 | } | 410 | } |
@@ -427,7 +428,7 @@ shutdown_abort: | |||
427 | */ | 428 | */ |
428 | #if defined(DEBUG) | 429 | #if defined(DEBUG) |
429 | if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) | 430 | if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) |
430 | cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); | 431 | xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); |
431 | #endif | 432 | #endif |
432 | ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != | 433 | ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != |
433 | (XBF_STALE|XBF_DELWRI)); | 434 | (XBF_STALE|XBF_DELWRI)); |
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index f783d5e9fa70..f7590f5badea 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c | |||
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, | |||
69 | tp->t_flags |= XFS_TRANS_DIRTY; | 69 | tp->t_flags |= XFS_TRANS_DIRTY; |
70 | efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; | 70 | efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; |
71 | 71 | ||
72 | next_extent = efip->efi_next_extent; | 72 | /* |
73 | * atomic_inc_return gives us the value after the increment; | ||
74 | * we want to use it as an array index so we need to subtract 1 from | ||
75 | * it. | ||
76 | */ | ||
77 | next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; | ||
73 | ASSERT(next_extent < efip->efi_format.efi_nextents); | 78 | ASSERT(next_extent < efip->efi_format.efi_nextents); |
74 | extp = &(efip->efi_format.efi_extents[next_extent]); | 79 | extp = &(efip->efi_format.efi_extents[next_extent]); |
75 | extp->ext_start = start_block; | 80 | extp->ext_start = start_block; |
76 | extp->ext_len = ext_len; | 81 | extp->ext_len = ext_len; |
77 | efip->efi_next_extent++; | ||
78 | } | 82 | } |
79 | 83 | ||
80 | 84 | ||
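The switch to atomic_inc_return() above is the standard lock-free slot-reservation idiom: one atomic operation both claims an array index and advances the cursor, so concurrent callers can never be handed the same extent slot. The "- 1" exists only because atomic_inc_return() yields the post-increment value; in C11 the fetch-and-add form returns the old value directly, as in this sketch (NSLOTS is an arbitrary illustrative bound):

    #include <stdatomic.h>

    #define NSLOTS 16

    static _Atomic int next_slot;
    static long slots[NSLOTS];

    /* Claim the next free slot: fetch_add returns the pre-increment
     * cursor, which is exactly the index this caller now owns. */
    int claim_slot(long value)
    {
        int idx = atomic_fetch_add(&next_slot, 1);
        if (idx >= NSLOTS)
            return -1;              /* table full */
        slots[idx] = value;
        return idx;
    }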
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index cdc53a1050c5..048b0c689d3e 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c | |||
@@ -44,28 +44,6 @@ xfs_trans_inode_broot_debug( | |||
44 | #endif | 44 | #endif |
45 | 45 | ||
46 | /* | 46 | /* |
47 | * Get an inode and join it to the transaction. | ||
48 | */ | ||
49 | int | ||
50 | xfs_trans_iget( | ||
51 | xfs_mount_t *mp, | ||
52 | xfs_trans_t *tp, | ||
53 | xfs_ino_t ino, | ||
54 | uint flags, | ||
55 | uint lock_flags, | ||
56 | xfs_inode_t **ipp) | ||
57 | { | ||
58 | int error; | ||
59 | |||
60 | error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp); | ||
61 | if (!error && tp) { | ||
62 | xfs_trans_ijoin(tp, *ipp); | ||
63 | (*ipp)->i_itemp->ili_lock_flags = lock_flags; | ||
64 | } | ||
65 | return error; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Add a locked inode to the transaction. | 47 | * Add a locked inode to the transaction. |
70 | * | 48 | * |
71 | * The inode must be locked, and it cannot be associated with any transaction. | 49 | * The inode must be locked, and it cannot be associated with any transaction. |
@@ -103,7 +81,7 @@ xfs_trans_ijoin( | |||
103 | * | 81 | * |
104 | * | 82 | * |
105 | * Grabs a reference to the inode which will be dropped when the transaction | 83 | * Grabs a reference to the inode which will be dropped when the transaction |
106 | * is commited. The inode will also be unlocked at that point. The inode | 84 | * is committed. The inode will also be unlocked at that point. The inode |
107 | * must be locked, and it cannot be associated with any transaction. | 85 | * must be locked, and it cannot be associated with any transaction. |
108 | */ | 86 | */ |
109 | void | 87 | void |
@@ -118,6 +96,36 @@ xfs_trans_ijoin_ref( | |||
118 | } | 96 | } |
119 | 97 | ||
120 | /* | 98 | /* |
99 | * Transactional inode timestamp update. Requires the inode to be locked and | ||
100 | * joined to the transaction supplied. Relies on the transaction subsystem to | ||
101 | * track dirty state and update/writeback the inode accordingly. | ||
102 | */ | ||
103 | void | ||
104 | xfs_trans_ichgtime( | ||
105 | struct xfs_trans *tp, | ||
106 | struct xfs_inode *ip, | ||
107 | int flags) | ||
108 | { | ||
109 | struct inode *inode = VFS_I(ip); | ||
110 | timespec_t tv; | ||
111 | |||
112 | ASSERT(tp); | ||
113 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | ||
114 | ASSERT(ip->i_transp == tp); | ||
115 | |||
116 | tv = current_fs_time(inode->i_sb); | ||
117 | |||
118 | if ((flags & XFS_ICHGTIME_MOD) && | ||
119 | !timespec_equal(&inode->i_mtime, &tv)) { | ||
120 | inode->i_mtime = tv; | ||
121 | } | ||
122 | if ((flags & XFS_ICHGTIME_CHG) && | ||
123 | !timespec_equal(&inode->i_ctime, &tv)) { | ||
124 | inode->i_ctime = tv; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | /* | ||
121 | * This is called to mark the fields indicated in fieldmask as needing | 129 | * This is called to mark the fields indicated in fieldmask as needing |
122 | * to be logged when the transaction is committed. The inode must | 130 | * to be logged when the transaction is committed. The inode must |
123 | * already be associated with the given transaction. | 131 | * already be associated with the given transaction. |
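The new xfs_trans_ichgtime() brings timestamp updates under transaction control: the times are written to the VFS inode, dirty tracking and writeback are left to the transaction subsystem, and the caller must log the inode core in the same transaction. The call sites converted later in this commit all follow the same two-line pattern:

	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);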
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 62da86c90de5..6b164e9e9a1f 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h | |||
@@ -22,15 +22,17 @@ struct xfs_log_item; | |||
22 | struct xfs_log_item_desc; | 22 | struct xfs_log_item_desc; |
23 | struct xfs_mount; | 23 | struct xfs_mount; |
24 | struct xfs_trans; | 24 | struct xfs_trans; |
25 | struct xfs_ail; | ||
26 | struct xfs_log_vec; | ||
25 | 27 | ||
26 | void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); | 28 | void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); |
27 | void xfs_trans_del_item(struct xfs_log_item *); | 29 | void xfs_trans_del_item(struct xfs_log_item *); |
28 | void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, | 30 | void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, |
29 | int flags); | 31 | int flags); |
30 | void xfs_trans_item_committed(struct xfs_log_item *lip, | ||
31 | xfs_lsn_t commit_lsn, int aborted); | ||
32 | void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); | 32 | void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); |
33 | 33 | ||
34 | void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, | ||
35 | xfs_lsn_t commit_lsn, int aborted); | ||
34 | /* | 36 | /* |
35 | * AIL traversal cursor. | 37 | * AIL traversal cursor. |
36 | * | 38 | * |
@@ -63,28 +65,52 @@ struct xfs_ail_cursor { | |||
63 | struct xfs_ail { | 65 | struct xfs_ail { |
64 | struct xfs_mount *xa_mount; | 66 | struct xfs_mount *xa_mount; |
65 | struct list_head xa_ail; | 67 | struct list_head xa_ail; |
66 | uint xa_gen; | ||
67 | struct task_struct *xa_task; | ||
68 | xfs_lsn_t xa_target; | 68 | xfs_lsn_t xa_target; |
69 | struct xfs_ail_cursor xa_cursors; | 69 | struct xfs_ail_cursor xa_cursors; |
70 | spinlock_t xa_lock; | 70 | spinlock_t xa_lock; |
71 | struct delayed_work xa_work; | ||
72 | xfs_lsn_t xa_last_pushed_lsn; | ||
73 | unsigned long xa_flags; | ||
71 | }; | 74 | }; |
72 | 75 | ||
76 | #define XFS_AIL_PUSHING_BIT 0 | ||
77 | |||
73 | /* | 78 | /* |
74 | * From xfs_trans_ail.c | 79 | * From xfs_trans_ail.c |
75 | */ | 80 | */ |
76 | void xfs_trans_ail_update(struct xfs_ail *ailp, | 81 | |
77 | struct xfs_log_item *lip, xfs_lsn_t lsn) | 82 | extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ |
78 | __releases(ailp->xa_lock); | 83 | |
79 | void xfs_trans_ail_delete(struct xfs_ail *ailp, | 84 | void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, |
80 | struct xfs_log_item *lip) | 85 | struct xfs_log_item **log_items, int nr_items, |
81 | __releases(ailp->xa_lock); | 86 | xfs_lsn_t lsn) __releases(ailp->xa_lock); |
82 | void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); | 87 | static inline void |
88 | xfs_trans_ail_update( | ||
89 | struct xfs_ail *ailp, | ||
90 | struct xfs_log_item *lip, | ||
91 | xfs_lsn_t lsn) __releases(ailp->xa_lock) | ||
92 | { | ||
93 | xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn); | ||
94 | } | ||
95 | |||
96 | void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, | ||
97 | struct xfs_log_item **log_items, int nr_items) | ||
98 | __releases(ailp->xa_lock); | ||
99 | static inline void | ||
100 | xfs_trans_ail_delete( | ||
101 | struct xfs_ail *ailp, | ||
102 | xfs_log_item_t *lip) __releases(ailp->xa_lock) | ||
103 | { | ||
104 | xfs_trans_ail_delete_bulk(ailp, &lip, 1); | ||
105 | } | ||
106 | |||
107 | void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); | ||
108 | void xfs_ail_push_all(struct xfs_ail *); | ||
109 | xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); | ||
110 | |||
83 | void xfs_trans_unlocked_item(struct xfs_ail *, | 111 | void xfs_trans_unlocked_item(struct xfs_ail *, |
84 | xfs_log_item_t *); | 112 | xfs_log_item_t *); |
85 | 113 | ||
86 | xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp); | ||
87 | |||
88 | struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, | 114 | struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, |
89 | struct xfs_ail_cursor *cur, | 115 | struct xfs_ail_cursor *cur, |
90 | xfs_lsn_t lsn); | 116 | xfs_lsn_t lsn); |
@@ -93,11 +119,6 @@ struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp, | |||
93 | void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, | 119 | void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, |
94 | struct xfs_ail_cursor *cur); | 120 | struct xfs_ail_cursor *cur); |
95 | 121 | ||
96 | long xfsaild_push(struct xfs_ail *, xfs_lsn_t *); | ||
97 | void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t); | ||
98 | int xfsaild_start(struct xfs_ail *); | ||
99 | void xfsaild_stop(struct xfs_ail *); | ||
100 | |||
101 | #if BITS_PER_LONG != 64 | 122 | #if BITS_PER_LONG != 64 |
102 | static inline void | 123 | static inline void |
103 | xfs_trans_ail_copy_lsn( | 124 | xfs_trans_ail_copy_lsn( |
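Two changes land in this header. First, the single-item AIL update/delete entry points become static inline wrappers around new bulk variants, so the bulk path holds the only real implementation. Second, the dedicated xfsaild thread interface (xfsaild_start/stop/wakeup) gives way to a delayed work item (xa_work) on the shared xfs_ail_wq workqueue, with XFS_AIL_PUSHING_BIT in xa_flags ensuring only one push is queued at a time. A sketch of that gating idiom, assuming the fields declared above (the helper name is hypothetical):

	/* queue an AIL push unless one is already pending */
	static void ail_schedule_push(struct xfs_ail *ailp)
	{
		if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
			queue_delayed_work(xfs_ail_wq, &ailp->xa_work,
					   msecs_to_jiffies(10));
	}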
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 320775295e32..65584b55607d 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h | |||
@@ -73,10 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */ | |||
73 | typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ | 73 | typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ |
74 | typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ | 74 | typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ |
75 | 75 | ||
76 | typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ | ||
77 | |||
78 | typedef __uint32_t xlog_tid_t; /* transaction ID type */ | ||
79 | |||
80 | /* | 76 | /* |
81 | * These types are 64 bits on disk but are either 32 or 64 bits in memory. | 77 | * These types are 64 bits on disk but are either 32 or 64 bits in memory. |
82 | * Disk based types: | 78 | * Disk based types: |
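xfs_prid_t existed only to truncate the generic prid_t to 16 bits; with project IDs widened to 32 bits (stored as a hi/lo pair in the inode core), callers now use prid_t directly together with the xfs_get_projid() accessor seen elsewhere in this commit, and the log-private xlog_tid_t leaves the shared types header as well. A sketch of what the accessor does, assuming the split on-disk fields of this era:

	static inline prid_t
	xfs_get_projid(struct xfs_inode *ip)
	{
		return (prid_t)ip->i_d.di_projid_hi << 16 |
				ip->i_d.di_projid_lo;
	}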
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index b7d5769d2df0..8b32d1a4c5a1 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c | |||
@@ -56,7 +56,6 @@ xfs_dir_ialloc( | |||
56 | mode_t mode, | 56 | mode_t mode, |
57 | xfs_nlink_t nlink, | 57 | xfs_nlink_t nlink, |
58 | xfs_dev_t rdev, | 58 | xfs_dev_t rdev, |
59 | cred_t *credp, | ||
60 | prid_t prid, /* project id */ | 59 | prid_t prid, /* project id */ |
61 | int okalloc, /* ok to allocate new space */ | 60 | int okalloc, /* ok to allocate new space */ |
62 | xfs_inode_t **ipp, /* pointer to inode; it will be | 61 | xfs_inode_t **ipp, /* pointer to inode; it will be |
@@ -93,7 +92,7 @@ xfs_dir_ialloc( | |||
93 | * transaction commit so that no other process can steal | 92 | * transaction commit so that no other process can steal |
94 | * the inode(s) that we've just allocated. | 93 | * the inode(s) that we've just allocated. |
95 | */ | 94 | */ |
96 | code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc, | 95 | code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, |
97 | &ialloc_context, &call_again, &ip); | 96 | &ialloc_context, &call_again, &ip); |
98 | 97 | ||
99 | /* | 98 | /* |
@@ -197,7 +196,7 @@ xfs_dir_ialloc( | |||
197 | * other allocations in this allocation group, | 196 | * other allocations in this allocation group, |
198 | * this call should always succeed. | 197 | * this call should always succeed. |
199 | */ | 198 | */ |
200 | code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, | 199 | code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, |
201 | okalloc, &ialloc_context, &call_again, &ip); | 200 | okalloc, &ialloc_context, &call_again, &ip); |
202 | 201 | ||
203 | /* | 202 | /* |
@@ -235,7 +234,7 @@ xfs_droplink( | |||
235 | { | 234 | { |
236 | int error; | 235 | int error; |
237 | 236 | ||
238 | xfs_ichgtime(ip, XFS_ICHGTIME_CHG); | 237 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); |
239 | 238 | ||
240 | ASSERT (ip->i_d.di_nlink > 0); | 239 | ASSERT (ip->i_d.di_nlink > 0); |
241 | ip->i_d.di_nlink--; | 240 | ip->i_d.di_nlink--; |
@@ -299,7 +298,7 @@ xfs_bumplink( | |||
299 | { | 298 | { |
300 | if (ip->i_d.di_nlink >= XFS_MAXLINK) | 299 | if (ip->i_d.di_nlink >= XFS_MAXLINK) |
301 | return XFS_ERROR(EMLINK); | 300 | return XFS_ERROR(EMLINK); |
302 | xfs_ichgtime(ip, XFS_ICHGTIME_CHG); | 301 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); |
303 | 302 | ||
304 | ASSERT(ip->i_d.di_nlink > 0); | 303 | ASSERT(ip->i_d.di_nlink > 0); |
305 | ip->i_d.di_nlink++; | 304 | ip->i_d.di_nlink++; |
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index f55b9678264f..456fca314933 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h | |||
@@ -19,8 +19,7 @@ | |||
19 | #define __XFS_UTILS_H__ | 19 | #define __XFS_UTILS_H__ |
20 | 20 | ||
21 | extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, | 21 | extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, |
22 | xfs_dev_t, cred_t *, prid_t, int, | 22 | xfs_dev_t, prid_t, int, xfs_inode_t **, int *); |
23 | xfs_inode_t **, int *); | ||
24 | extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); | 23 | extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); |
25 | extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); | 24 | extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); |
26 | extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); | 25 | extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); |
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4c7c7bfb2b2f..619720705bc6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
@@ -114,7 +114,7 @@ xfs_setattr( | |||
114 | */ | 114 | */ |
115 | ASSERT(udqp == NULL); | 115 | ASSERT(udqp == NULL); |
116 | ASSERT(gdqp == NULL); | 116 | ASSERT(gdqp == NULL); |
117 | code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, | 117 | code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), |
118 | qflags, &udqp, &gdqp); | 118 | qflags, &udqp, &gdqp); |
119 | if (code) | 119 | if (code) |
120 | return code; | 120 | return code; |
@@ -184,8 +184,11 @@ xfs_setattr( | |||
184 | ip->i_size == 0 && ip->i_d.di_nextents == 0) { | 184 | ip->i_size == 0 && ip->i_d.di_nextents == 0) { |
185 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 185 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
186 | lock_flags &= ~XFS_ILOCK_EXCL; | 186 | lock_flags &= ~XFS_ILOCK_EXCL; |
187 | if (mask & ATTR_CTIME) | 187 | if (mask & ATTR_CTIME) { |
188 | xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 188 | inode->i_mtime = inode->i_ctime = |
189 | current_fs_time(inode->i_sb); | ||
190 | xfs_mark_inode_dirty_sync(ip); | ||
191 | } | ||
189 | code = 0; | 192 | code = 0; |
190 | goto error_return; | 193 | goto error_return; |
191 | } | 194 | } |
@@ -950,40 +953,62 @@ xfs_release( | |||
950 | * If we previously truncated this file and removed old data | 953 | * If we previously truncated this file and removed old data |
951 | * in the process, we want to initiate "early" writeout on | 954 | * in the process, we want to initiate "early" writeout on |
952 | * the last close. This is an attempt to combat the notorious | 955 | * the last close. This is an attempt to combat the notorious |
953 | * NULL files problem which is particularly noticable from a | 956 | * NULL files problem which is particularly noticeable from a |
954 | * truncate down, buffered (re-)write (delalloc), followed by | 957 | * truncate down, buffered (re-)write (delalloc), followed by |
955 | * a crash. What we are effectively doing here is | 958 | * a crash. What we are effectively doing here is |
956 | * significantly reducing the time window where we'd otherwise | 959 | * significantly reducing the time window where we'd otherwise |
957 | * be exposed to that problem. | 960 | * be exposed to that problem. |
958 | */ | 961 | */ |
959 | truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); | 962 | truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); |
960 | if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) | 963 | if (truncated) { |
961 | xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); | 964 | xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); |
965 | if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) | ||
966 | xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); | ||
967 | } | ||
962 | } | 968 | } |
963 | 969 | ||
964 | if (ip->i_d.di_nlink != 0) { | 970 | if (ip->i_d.di_nlink == 0) |
965 | if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && | 971 | return 0; |
966 | ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || | ||
967 | ip->i_delayed_blks > 0)) && | ||
968 | (ip->i_df.if_flags & XFS_IFEXTENTS)) && | ||
969 | (!(ip->i_d.di_flags & | ||
970 | (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { | ||
971 | 972 | ||
972 | /* | 973 | if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && |
973 | * If we can't get the iolock just skip truncating | 974 | ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || |
974 | * the blocks past EOF because we could deadlock | 975 | ip->i_delayed_blks > 0)) && |
975 | * with the mmap_sem otherwise. We'll get another | 976 | (ip->i_df.if_flags & XFS_IFEXTENTS)) && |
976 | * chance to drop them once the last reference to | 977 | (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { |
977 | * the inode is dropped, so we'll never leak blocks | 978 | |
978 | * permanently. | 979 | /* |
979 | */ | 980 | * If we can't get the iolock just skip truncating the blocks |
980 | error = xfs_free_eofblocks(mp, ip, | 981 | * past EOF because we could deadlock with the mmap_sem |
981 | XFS_FREE_EOF_TRYLOCK); | 982 | * otherwise. We'll get another chance to drop them once the |
982 | if (error) | 983 | * last reference to the inode is dropped, so we'll never leak |
983 | return error; | 984 | * blocks permanently. |
984 | } | 985 | * |
985 | } | 986 | * Further, check if the inode is being opened, written and |
987 | * closed frequently and we have delayed allocation blocks | ||
988 | * outstanding (e.g. streaming writes from the NFS server), | ||
989 | * truncating the blocks past EOF will cause fragmentation to | ||
990 | * occur. | ||
991 | * | ||
992 | * In this case don't do the truncation, either, but we have to | ||
993 | * be careful how we detect this case. Blocks beyond EOF show | ||
994 | * up as i_delayed_blks even when the inode is clean, so we | ||
995 | * need to truncate them away first before checking for a dirty | ||
996 | * release. Hence on the first dirty close we will still remove | ||
997 | * the speculative allocation, but after that we will leave it | ||
998 | * in place. | ||
999 | */ | ||
1000 | if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) | ||
1001 | return 0; | ||
1002 | |||
1003 | error = xfs_free_eofblocks(mp, ip, | ||
1004 | XFS_FREE_EOF_TRYLOCK); | ||
1005 | if (error) | ||
1006 | return error; | ||
986 | 1007 | ||
1008 | /* delalloc blocks after truncation means it really is dirty */ | ||
1009 | if (ip->i_delayed_blks) | ||
1010 | xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); | ||
1011 | } | ||
987 | return 0; | 1012 | return 0; |
988 | } | 1013 | } |
989 | 1014 | ||
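The rewritten tail of xfs_release() implements the policy the new comment describes: trim speculative EOF preallocation on the first dirty close only, and leave it alone afterwards so open-write-close workloads (such as streaming writes through an NFS server) do not fragment the file. Condensed control flow of the new logic, paraphrasing the code above:

	if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
		return 0;		/* already trimmed once */

	error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_TRYLOCK);
	if (error)
		return error;

	/* delalloc blocks surviving the truncate mean a dirty workload */
	if (ip->i_delayed_blks)
		xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);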
@@ -1167,9 +1192,8 @@ xfs_inactive( | |||
1167 | * inode might be lost for a long time or forever. | 1192 | * inode might be lost for a long time or forever. |
1168 | */ | 1193 | */ |
1169 | if (!XFS_FORCED_SHUTDOWN(mp)) { | 1194 | if (!XFS_FORCED_SHUTDOWN(mp)) { |
1170 | cmn_err(CE_NOTE, | 1195 | xfs_notice(mp, "%s: xfs_ifree returned error %d", |
1171 | "xfs_inactive: xfs_ifree() returned an error = %d on %s", | 1196 | __func__, error); |
1172 | error, mp->m_fsname); | ||
1173 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); | 1197 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); |
1174 | } | 1198 | } |
1175 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); | 1199 | xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); |
@@ -1186,12 +1210,12 @@ xfs_inactive( | |||
1186 | */ | 1210 | */ |
1187 | error = xfs_bmap_finish(&tp, &free_list, &committed); | 1211 | error = xfs_bmap_finish(&tp, &free_list, &committed); |
1188 | if (error) | 1212 | if (error) |
1189 | xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " | 1213 | xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", |
1190 | "xfs_bmap_finish() returned error %d", error); | 1214 | __func__, error); |
1191 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 1215 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); |
1192 | if (error) | 1216 | if (error) |
1193 | xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " | 1217 | xfs_notice(mp, "%s: xfs_trans_commit returned error %d", |
1194 | "xfs_trans_commit() returned error %d", error); | 1218 | __func__, error); |
1195 | } | 1219 | } |
1196 | 1220 | ||
1197 | /* | 1221 | /* |
@@ -1253,8 +1277,7 @@ xfs_create( | |||
1253 | struct xfs_name *name, | 1277 | struct xfs_name *name, |
1254 | mode_t mode, | 1278 | mode_t mode, |
1255 | xfs_dev_t rdev, | 1279 | xfs_dev_t rdev, |
1256 | xfs_inode_t **ipp, | 1280 | xfs_inode_t **ipp) |
1257 | cred_t *credp) | ||
1258 | { | 1281 | { |
1259 | int is_dir = S_ISDIR(mode); | 1282 | int is_dir = S_ISDIR(mode); |
1260 | struct xfs_mount *mp = dp->i_mount; | 1283 | struct xfs_mount *mp = dp->i_mount; |
@@ -1266,7 +1289,7 @@ xfs_create( | |||
1266 | boolean_t unlock_dp_on_error = B_FALSE; | 1289 | boolean_t unlock_dp_on_error = B_FALSE; |
1267 | uint cancel_flags; | 1290 | uint cancel_flags; |
1268 | int committed; | 1291 | int committed; |
1269 | xfs_prid_t prid; | 1292 | prid_t prid; |
1270 | struct xfs_dquot *udqp = NULL; | 1293 | struct xfs_dquot *udqp = NULL; |
1271 | struct xfs_dquot *gdqp = NULL; | 1294 | struct xfs_dquot *gdqp = NULL; |
1272 | uint resblks; | 1295 | uint resblks; |
@@ -1279,9 +1302,9 @@ xfs_create( | |||
1279 | return XFS_ERROR(EIO); | 1302 | return XFS_ERROR(EIO); |
1280 | 1303 | ||
1281 | if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) | 1304 | if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) |
1282 | prid = dp->i_d.di_projid; | 1305 | prid = xfs_get_projid(dp); |
1283 | else | 1306 | else |
1284 | prid = dfltprid; | 1307 | prid = XFS_PROJID_DEFAULT; |
1285 | 1308 | ||
1286 | /* | 1309 | /* |
1287 | * Make sure that we have allocated dquot(s) on disk. | 1310 | * Make sure that we have allocated dquot(s) on disk. |
@@ -1289,7 +1312,7 @@ xfs_create( | |||
1289 | error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, | 1312 | error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, |
1290 | XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); | 1313 | XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); |
1291 | if (error) | 1314 | if (error) |
1292 | goto std_return; | 1315 | return error; |
1293 | 1316 | ||
1294 | if (is_dir) { | 1317 | if (is_dir) { |
1295 | rdev = 0; | 1318 | rdev = 0; |
@@ -1360,7 +1383,7 @@ xfs_create( | |||
1360 | * entry pointing to them, but a directory also the "." entry | 1383 | * entry pointing to them, but a directory also the "." entry |
1361 | * pointing to itself. | 1384 | * pointing to itself. |
1362 | */ | 1385 | */ |
1363 | error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp, | 1386 | error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, |
1364 | prid, resblks > 0, &ip, &committed); | 1387 | prid, resblks > 0, &ip, &committed); |
1365 | if (error) { | 1388 | if (error) { |
1366 | if (error == ENOSPC) | 1389 | if (error == ENOSPC) |
@@ -1369,12 +1392,6 @@ xfs_create( | |||
1369 | } | 1392 | } |
1370 | 1393 | ||
1371 | /* | 1394 | /* |
1372 | * At this point, we've gotten a newly allocated inode. | ||
1373 | * It is locked (and joined to the transaction). | ||
1374 | */ | ||
1375 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | ||
1376 | |||
1377 | /* | ||
1378 | * Now we join the directory inode to the transaction. We do not do it | 1395 | * Now we join the directory inode to the transaction. We do not do it |
1379 | * earlier because xfs_dir_ialloc might commit the previous transaction | 1396 | * earlier because xfs_dir_ialloc might commit the previous transaction |
1380 | * (and release all the locks). An error from here on will result in | 1397 | * (and release all the locks). An error from here on will result in |
@@ -1391,7 +1408,7 @@ xfs_create( | |||
1391 | ASSERT(error != ENOSPC); | 1408 | ASSERT(error != ENOSPC); |
1392 | goto out_trans_abort; | 1409 | goto out_trans_abort; |
1393 | } | 1410 | } |
1394 | xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 1411 | xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
1395 | xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); | 1412 | xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); |
1396 | 1413 | ||
1397 | if (is_dir) { | 1414 | if (is_dir) { |
@@ -1419,22 +1436,13 @@ xfs_create( | |||
1419 | */ | 1436 | */ |
1420 | xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); | 1437 | xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); |
1421 | 1438 | ||
1422 | /* | ||
1423 | * xfs_trans_commit normally decrements the vnode ref count | ||
1424 | * when it unlocks the inode. Since we want to return the | ||
1425 | * vnode to the caller, we bump the vnode ref count now. | ||
1426 | */ | ||
1427 | IHOLD(ip); | ||
1428 | |||
1429 | error = xfs_bmap_finish(&tp, &free_list, &committed); | 1439 | error = xfs_bmap_finish(&tp, &free_list, &committed); |
1430 | if (error) | 1440 | if (error) |
1431 | goto out_abort_rele; | 1441 | goto out_bmap_cancel; |
1432 | 1442 | ||
1433 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 1443 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); |
1434 | if (error) { | 1444 | if (error) |
1435 | IRELE(ip); | 1445 | goto out_release_inode; |
1436 | goto out_dqrele; | ||
1437 | } | ||
1438 | 1446 | ||
1439 | xfs_qm_dqrele(udqp); | 1447 | xfs_qm_dqrele(udqp); |
1440 | xfs_qm_dqrele(gdqp); | 1448 | xfs_qm_dqrele(gdqp); |
@@ -1448,27 +1456,21 @@ xfs_create( | |||
1448 | cancel_flags |= XFS_TRANS_ABORT; | 1456 | cancel_flags |= XFS_TRANS_ABORT; |
1449 | out_trans_cancel: | 1457 | out_trans_cancel: |
1450 | xfs_trans_cancel(tp, cancel_flags); | 1458 | xfs_trans_cancel(tp, cancel_flags); |
1451 | out_dqrele: | 1459 | out_release_inode: |
1460 | /* | ||
1461 | * Wait until after the current transaction is aborted to | ||
1462 | * release the inode. This prevents recursive transactions | ||
1463 | * and deadlocks from xfs_inactive. | ||
1464 | */ | ||
1465 | if (ip) | ||
1466 | IRELE(ip); | ||
1467 | |||
1452 | xfs_qm_dqrele(udqp); | 1468 | xfs_qm_dqrele(udqp); |
1453 | xfs_qm_dqrele(gdqp); | 1469 | xfs_qm_dqrele(gdqp); |
1454 | 1470 | ||
1455 | if (unlock_dp_on_error) | 1471 | if (unlock_dp_on_error) |
1456 | xfs_iunlock(dp, XFS_ILOCK_EXCL); | 1472 | xfs_iunlock(dp, XFS_ILOCK_EXCL); |
1457 | std_return: | ||
1458 | return error; | 1473 | return error; |
1459 | |||
1460 | out_abort_rele: | ||
1461 | /* | ||
1462 | * Wait until after the current transaction is aborted to | ||
1463 | * release the inode. This prevents recursive transactions | ||
1464 | * and deadlocks from xfs_inactive. | ||
1465 | */ | ||
1466 | xfs_bmap_cancel(&free_list); | ||
1467 | cancel_flags |= XFS_TRANS_ABORT; | ||
1468 | xfs_trans_cancel(tp, cancel_flags); | ||
1469 | IRELE(ip); | ||
1470 | unlock_dp_on_error = B_FALSE; | ||
1471 | goto out_dqrele; | ||
1472 | } | 1474 | } |
1473 | 1475 | ||
1474 | #ifdef DEBUG | 1476 | #ifdef DEBUG |
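xfs_create()'s error handling collapses the two unwind paths (out_abort_rele and out_dqrele) into a single ladder ending at out_release_inode. With the compensating IHOLD() gone, the caller's inode reference is dropped exactly once, and only after any transaction abort, for the reason given in the new comment; condensed from the hunks above, the ladder is:

	 out_bmap_cancel:
		xfs_bmap_cancel(&free_list);
		cancel_flags |= XFS_TRANS_ABORT;
	 out_trans_cancel:
		xfs_trans_cancel(tp, cancel_flags);
	 out_release_inode:
		/* after the abort, so IRELE() cannot recurse into
		 * xfs_inactive() inside a live transaction */
		if (ip)
			IRELE(ip);
		xfs_qm_dqrele(udqp);
		xfs_qm_dqrele(gdqp);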
@@ -1742,7 +1744,7 @@ xfs_remove( | |||
1742 | ASSERT(error != ENOENT); | 1744 | ASSERT(error != ENOENT); |
1743 | goto out_bmap_cancel; | 1745 | goto out_bmap_cancel; |
1744 | } | 1746 | } |
1745 | xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 1747 | xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
1746 | 1748 | ||
1747 | if (is_dir) { | 1749 | if (is_dir) { |
1748 | /* | 1750 | /* |
@@ -1880,7 +1882,7 @@ xfs_link( | |||
1880 | * the tree quota mechanism could be circumvented. | 1882 | * the tree quota mechanism could be circumvented. |
1881 | */ | 1883 | */ |
1882 | if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && | 1884 | if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && |
1883 | (tdp->i_d.di_projid != sip->i_d.di_projid))) { | 1885 | (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { |
1884 | error = XFS_ERROR(EXDEV); | 1886 | error = XFS_ERROR(EXDEV); |
1885 | goto error_return; | 1887 | goto error_return; |
1886 | } | 1888 | } |
@@ -1895,7 +1897,7 @@ xfs_link( | |||
1895 | &first_block, &free_list, resblks); | 1897 | &first_block, &free_list, resblks); |
1896 | if (error) | 1898 | if (error) |
1897 | goto abort_return; | 1899 | goto abort_return; |
1898 | xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 1900 | xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
1899 | xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); | 1901 | xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); |
1900 | 1902 | ||
1901 | error = xfs_bumplink(tp, sip); | 1903 | error = xfs_bumplink(tp, sip); |
@@ -1933,8 +1935,7 @@ xfs_symlink( | |||
1933 | struct xfs_name *link_name, | 1935 | struct xfs_name *link_name, |
1934 | const char *target_path, | 1936 | const char *target_path, |
1935 | mode_t mode, | 1937 | mode_t mode, |
1936 | xfs_inode_t **ipp, | 1938 | xfs_inode_t **ipp) |
1937 | cred_t *credp) | ||
1938 | { | 1939 | { |
1939 | xfs_mount_t *mp = dp->i_mount; | 1940 | xfs_mount_t *mp = dp->i_mount; |
1940 | xfs_trans_t *tp; | 1941 | xfs_trans_t *tp; |
@@ -1955,7 +1956,7 @@ xfs_symlink( | |||
1955 | int byte_cnt; | 1956 | int byte_cnt; |
1956 | int n; | 1957 | int n; |
1957 | xfs_buf_t *bp; | 1958 | xfs_buf_t *bp; |
1958 | xfs_prid_t prid; | 1959 | prid_t prid; |
1959 | struct xfs_dquot *udqp, *gdqp; | 1960 | struct xfs_dquot *udqp, *gdqp; |
1960 | uint resblks; | 1961 | uint resblks; |
1961 | 1962 | ||
@@ -1978,9 +1979,9 @@ xfs_symlink( | |||
1978 | 1979 | ||
1979 | udqp = gdqp = NULL; | 1980 | udqp = gdqp = NULL; |
1980 | if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) | 1981 | if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) |
1981 | prid = dp->i_d.di_projid; | 1982 | prid = xfs_get_projid(dp); |
1982 | else | 1983 | else |
1983 | prid = (xfs_prid_t)dfltprid; | 1984 | prid = XFS_PROJID_DEFAULT; |
1984 | 1985 | ||
1985 | /* | 1986 | /* |
1986 | * Make sure that we have allocated dquot(s) on disk. | 1987 | * Make sure that we have allocated dquot(s) on disk. |
@@ -2046,8 +2047,8 @@ xfs_symlink( | |||
2046 | /* | 2047 | /* |
2047 | * Allocate an inode for the symlink. | 2048 | * Allocate an inode for the symlink. |
2048 | */ | 2049 | */ |
2049 | error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), | 2050 | error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, |
2050 | 1, 0, credp, prid, resblks > 0, &ip, NULL); | 2051 | prid, resblks > 0, &ip, NULL); |
2051 | if (error) { | 2052 | if (error) { |
2052 | if (error == ENOSPC) | 2053 | if (error == ENOSPC) |
2053 | goto error_return; | 2054 | goto error_return; |
@@ -2094,9 +2095,8 @@ xfs_symlink( | |||
2094 | XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, | 2095 | XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, |
2095 | &first_block, resblks, mval, &nmaps, | 2096 | &first_block, resblks, mval, &nmaps, |
2096 | &free_list); | 2097 | &free_list); |
2097 | if (error) { | 2098 | if (error) |
2098 | goto error1; | 2099 | goto error2; |
2099 | } | ||
2100 | 2100 | ||
2101 | if (resblks) | 2101 | if (resblks) |
2102 | resblks -= fs_blocks; | 2102 | resblks -= fs_blocks; |
@@ -2128,8 +2128,8 @@ xfs_symlink( | |||
2128 | error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, | 2128 | error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, |
2129 | &first_block, &free_list, resblks); | 2129 | &first_block, &free_list, resblks); |
2130 | if (error) | 2130 | if (error) |
2131 | goto error1; | 2131 | goto error2; |
2132 | xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 2132 | xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
2133 | xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); | 2133 | xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); |
2134 | 2134 | ||
2135 | /* | 2135 | /* |
@@ -2141,13 +2141,6 @@ xfs_symlink( | |||
2141 | xfs_trans_set_sync(tp); | 2141 | xfs_trans_set_sync(tp); |
2142 | } | 2142 | } |
2143 | 2143 | ||
2144 | /* | ||
2145 | * xfs_trans_commit normally decrements the vnode ref count | ||
2146 | * when it unlocks the inode. Since we want to return the | ||
2147 | * vnode to the caller, we bump the vnode ref count now. | ||
2148 | */ | ||
2149 | IHOLD(ip); | ||
2150 | |||
2151 | error = xfs_bmap_finish(&tp, &free_list, &committed); | 2144 | error = xfs_bmap_finish(&tp, &free_list, &committed); |
2152 | if (error) { | 2145 | if (error) { |
2153 | goto error2; | 2146 | goto error2; |
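xfs_symlink() gets the same treatment as xfs_create(): the pre-commit IHOLD() disappears because, with the reworked transaction/inode joining in this merge, xfs_trans_commit() appears to no longer unlock or release a joined inode, so no compensating reference bump is needed to hand the new inode back to the caller.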
@@ -2272,7 +2265,7 @@ xfs_alloc_file_space( | |||
2272 | count = len; | 2265 | count = len; |
2273 | imapp = &imaps[0]; | 2266 | imapp = &imaps[0]; |
2274 | nimaps = 1; | 2267 | nimaps = 1; |
2275 | bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); | 2268 | bmapi_flag = XFS_BMAPI_WRITE | alloc_type; |
2276 | startoffset_fsb = XFS_B_TO_FSBT(mp, offset); | 2269 | startoffset_fsb = XFS_B_TO_FSBT(mp, offset); |
2277 | allocatesize_fsb = XFS_B_TO_FSB(mp, count); | 2270 | allocatesize_fsb = XFS_B_TO_FSB(mp, count); |
2278 | 2271 | ||
@@ -2431,9 +2424,9 @@ xfs_zero_remaining_bytes( | |||
2431 | if (endoff > ip->i_size) | 2424 | if (endoff > ip->i_size) |
2432 | endoff = ip->i_size; | 2425 | endoff = ip->i_size; |
2433 | 2426 | ||
2434 | bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, | 2427 | bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? |
2435 | XFS_IS_REALTIME_INODE(ip) ? | 2428 | mp->m_rtdev_targp : mp->m_ddev_targp, |
2436 | mp->m_rtdev_targp : mp->m_ddev_targp); | 2429 | mp->m_sb.sb_blocksize, XBF_DONT_BLOCK); |
2437 | if (!bp) | 2430 | if (!bp) |
2438 | return XFS_ERROR(ENOMEM); | 2431 | return XFS_ERROR(ENOMEM); |
2439 | 2432 | ||
@@ -2459,7 +2452,7 @@ xfs_zero_remaining_bytes( | |||
2459 | XFS_BUF_READ(bp); | 2452 | XFS_BUF_READ(bp); |
2460 | XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); | 2453 | XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); |
2461 | xfsbdstrat(mp, bp); | 2454 | xfsbdstrat(mp, bp); |
2462 | error = xfs_iowait(bp); | 2455 | error = xfs_buf_iowait(bp); |
2463 | if (error) { | 2456 | if (error) { |
2464 | xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", | 2457 | xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", |
2465 | mp, bp, XFS_BUF_ADDR(bp)); | 2458 | mp, bp, XFS_BUF_ADDR(bp)); |
@@ -2472,7 +2465,7 @@ xfs_zero_remaining_bytes( | |||
2472 | XFS_BUF_UNREAD(bp); | 2465 | XFS_BUF_UNREAD(bp); |
2473 | XFS_BUF_WRITE(bp); | 2466 | XFS_BUF_WRITE(bp); |
2474 | xfsbdstrat(mp, bp); | 2467 | xfsbdstrat(mp, bp); |
2475 | error = xfs_iowait(bp); | 2468 | error = xfs_buf_iowait(bp); |
2476 | if (error) { | 2469 | if (error) { |
2477 | xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", | 2470 | xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", |
2478 | mp, bp, XFS_BUF_ADDR(bp)); | 2471 | mp, bp, XFS_BUF_ADDR(bp)); |
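The buffer calls in xfs_zero_remaining_bytes() track two renames from the buffer-cache cleanup elsewhere in this merge: xfs_buf_get_noaddr() becomes xfs_buf_get_uncached(), now taking the target first plus an explicit flags argument, and xfs_iowait() becomes xfs_buf_iowait(). The converted synchronous read then looks like:

	bp = xfs_buf_get_uncached(mp->m_ddev_targp,
				  mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
	if (!bp)
		return XFS_ERROR(ENOMEM);

	XFS_BUF_READ(bp);
	XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
	xfsbdstrat(mp, bp);
	error = xfs_buf_iowait(bp);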
@@ -2711,6 +2704,7 @@ xfs_change_file_space( | |||
2711 | xfs_off_t llen; | 2704 | xfs_off_t llen; |
2712 | xfs_trans_t *tp; | 2705 | xfs_trans_t *tp; |
2713 | struct iattr iattr; | 2706 | struct iattr iattr; |
2707 | int prealloc_type; | ||
2714 | 2708 | ||
2715 | if (!S_ISREG(ip->i_d.di_mode)) | 2709 | if (!S_ISREG(ip->i_d.di_mode)) |
2716 | return XFS_ERROR(EINVAL); | 2710 | return XFS_ERROR(EINVAL); |
@@ -2753,12 +2747,17 @@ xfs_change_file_space( | |||
2753 | * size to be changed. | 2747 | * size to be changed. |
2754 | */ | 2748 | */ |
2755 | setprealloc = clrprealloc = 0; | 2749 | setprealloc = clrprealloc = 0; |
2750 | prealloc_type = XFS_BMAPI_PREALLOC; | ||
2756 | 2751 | ||
2757 | switch (cmd) { | 2752 | switch (cmd) { |
2753 | case XFS_IOC_ZERO_RANGE: | ||
2754 | prealloc_type |= XFS_BMAPI_CONVERT; | ||
2755 | xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); | ||
2756 | /* FALLTHRU */ | ||
2758 | case XFS_IOC_RESVSP: | 2757 | case XFS_IOC_RESVSP: |
2759 | case XFS_IOC_RESVSP64: | 2758 | case XFS_IOC_RESVSP64: |
2760 | error = xfs_alloc_file_space(ip, startoffset, bf->l_len, | 2759 | error = xfs_alloc_file_space(ip, startoffset, bf->l_len, |
2761 | 1, attr_flags); | 2760 | prealloc_type, attr_flags); |
2762 | if (error) | 2761 | if (error) |
2763 | return error; | 2762 | return error; |
2764 | setprealloc = 1; | 2763 | setprealloc = 1; |
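The new XFS_IOC_ZERO_RANGE case reuses the RESVSP preallocation path but ORs in XFS_BMAPI_CONVERT, so existing extents over the range are converted to unwritten rather than merely reserved, after any cached pages for the range are tossed; subsequent reads then return zeroes without data I/O having been issued. A hypothetical userspace invocation (struct and ioctl number per xfs_fs.h, via the xfsprogs headers):

	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <xfs/xfs.h>	/* xfs_flock64_t, XFS_IOC_ZERO_RANGE */

	xfs_flock64_t bf = {
		.l_whence = SEEK_SET,	/* l_start is absolute */
		.l_start  = offset,	/* byte range to zero */
		.l_len    = length,
	};

	if (ioctl(fd, XFS_IOC_ZERO_RANGE, &bf) < 0)
		perror("XFS_IOC_ZERO_RANGE");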
@@ -2827,7 +2826,7 @@ xfs_change_file_space( | |||
2827 | if (ip->i_d.di_mode & S_IXGRP) | 2826 | if (ip->i_d.di_mode & S_IXGRP) |
2828 | ip->i_d.di_mode &= ~S_ISGID; | 2827 | ip->i_d.di_mode &= ~S_ISGID; |
2829 | 2828 | ||
2830 | xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 2829 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
2831 | } | 2830 | } |
2832 | if (setprealloc) | 2831 | if (setprealloc) |
2833 | ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; | 2832 | ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; |
@@ -2835,7 +2834,8 @@ xfs_change_file_space( | |||
2835 | ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; | 2834 | ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; |
2836 | 2835 | ||
2837 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 2836 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
2838 | xfs_trans_set_sync(tp); | 2837 | if (attr_flags & XFS_ATTR_SYNC) |
2838 | xfs_trans_set_sync(tp); | ||
2839 | 2839 | ||
2840 | error = xfs_trans_commit(tp, 0); | 2840 | error = xfs_trans_commit(tp, 0); |
2841 | 2841 | ||
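Making xfs_trans_set_sync() conditional on the new XFS_ATTR_SYNC flag (added to xfs_vnodeops.h below) stops space preallocation from forcing a synchronous transaction commit unconditionally; callers now opt in only when the open file demands synchronous semantics. A hypothetical caller-side sketch, assuming the ioctl entry point translates file flags this way:

	int attr_flags = 0;

	if (filp->f_flags & (O_NDELAY | O_NONBLOCK))
		attr_flags |= XFS_ATTR_NONBLOCK;
	if (filp->f_flags & O_DSYNC)	/* sync semantics requested */
		attr_flags |= XFS_ATTR_SYNC;

	error = xfs_change_file_space(ip, cmd, &bf, pos, attr_flags);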
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index d8dfa8d0dadd..3bcd23353d6c 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h | |||
@@ -2,7 +2,6 @@ | |||
2 | #define _XFS_VNODEOPS_H 1 | 2 | #define _XFS_VNODEOPS_H 1 |
3 | 3 | ||
4 | struct attrlist_cursor_kern; | 4 | struct attrlist_cursor_kern; |
5 | struct cred; | ||
6 | struct file; | 5 | struct file; |
7 | struct iattr; | 6 | struct iattr; |
8 | struct inode; | 7 | struct inode; |
@@ -19,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags); | |||
19 | #define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ | 18 | #define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ |
20 | #define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ | 19 | #define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ |
21 | #define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ | 20 | #define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ |
21 | #define XFS_ATTR_SYNC 0x10 /* synchronous operation required */ | ||
22 | 22 | ||
23 | int xfs_readlink(struct xfs_inode *ip, char *link); | 23 | int xfs_readlink(struct xfs_inode *ip, char *link); |
24 | int xfs_release(struct xfs_inode *ip); | 24 | int xfs_release(struct xfs_inode *ip); |
@@ -26,7 +26,7 @@ int xfs_inactive(struct xfs_inode *ip); | |||
26 | int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, | 26 | int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, |
27 | struct xfs_inode **ipp, struct xfs_name *ci_name); | 27 | struct xfs_inode **ipp, struct xfs_name *ci_name); |
28 | int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, | 28 | int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, |
29 | xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp); | 29 | xfs_dev_t rdev, struct xfs_inode **ipp); |
30 | int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, | 30 | int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, |
31 | struct xfs_inode *ip); | 31 | struct xfs_inode *ip); |
32 | int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, | 32 | int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, |
@@ -34,8 +34,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, | |||
34 | int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, | 34 | int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, |
35 | xfs_off_t *offset, filldir_t filldir); | 35 | xfs_off_t *offset, filldir_t filldir); |
36 | int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, | 36 | int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, |
37 | const char *target_path, mode_t mode, struct xfs_inode **ipp, | 37 | const char *target_path, mode_t mode, struct xfs_inode **ipp); |
38 | cred_t *credp); | ||
39 | int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); | 38 | int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); |
40 | int xfs_change_file_space(struct xfs_inode *ip, int cmd, | 39 | int xfs_change_file_space(struct xfs_inode *ip, int cmd, |
41 | xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); | 40 | xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); |