author     Stefan Richter <stefanr@s5r6.in-berlin.de>   2011-05-10 14:52:07 -0400
committer  Stefan Richter <stefanr@s5r6.in-berlin.de>   2011-05-10 16:50:41 -0400
commit     020abf03cd659388f94cb328e1e1df0656e0d7ff (patch)
tree       40d05011708ad1b4a05928d167eb120420581aa6 /fs/xfs
parent     0ff8fbc61727c926883eec381fbd3d32d1fab504 (diff)
parent     693d92a1bbc9e42681c42ed190bd42b636ca876f (diff)
Merge tag 'v2.6.39-rc7'
in order to pull in changes in drivers/media/dvb/firewire/ and sound/firewire/.
Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/Makefile  13
-rw-r--r--  fs/xfs/linux-2.6/kmem.c  9
-rw-r--r--  fs/xfs/linux-2.6/sv.h  59
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c  11
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c  437
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h  16
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c  607
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h  69
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c  193
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h  8
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c  16
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c  591
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c  38
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c  65
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h  24
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.c  126
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.h  40
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c  292
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c  366
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h  2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c  25
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h  92
-rw-r--r--  fs/xfs/quota/xfs_dquot.c  51
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c  5
-rw-r--r--  fs/xfs/quota/xfs_qm.c  102
-rw-r--r--  fs/xfs/quota/xfs_qm.h  5
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c  5
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c  91
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c  5
-rw-r--r--  fs/xfs/support/debug.c  115
-rw-r--r--  fs/xfs/support/debug.h  54
-rw-r--r--  fs/xfs/xfs_acl.h  2
-rw-r--r--  fs/xfs/xfs_ag.h  2
-rw-r--r--  fs/xfs/xfs_alloc.c  545
-rw-r--r--  fs/xfs/xfs_alloc.h  41
-rw-r--r--  fs/xfs/xfs_attr_leaf.c  4
-rw-r--r--  fs/xfs/xfs_bmap.c  85
-rw-r--r--  fs/xfs/xfs_btree.c  9
-rw-r--r--  fs/xfs/xfs_buf_item.c  200
-rw-r--r--  fs/xfs/xfs_buf_item.h  11
-rw-r--r--  fs/xfs/xfs_da_btree.c  9
-rw-r--r--  fs/xfs/xfs_dfrag.c  4
-rw-r--r--  fs/xfs/xfs_dir2.c  2
-rw-r--r--  fs/xfs/xfs_dir2_node.c  25
-rw-r--r--  fs/xfs/xfs_error.c  53
-rw-r--r--  fs/xfs/xfs_error.h  29
-rw-r--r--  fs/xfs/xfs_extfree_item.c  96
-rw-r--r--  fs/xfs/xfs_extfree_item.h  11
-rw-r--r--  fs/xfs/xfs_fsops.c  20
-rw-r--r--  fs/xfs/xfs_fsops.h  2
-rw-r--r--  fs/xfs/xfs_ialloc.c  82
-rw-r--r--  fs/xfs/xfs_iget.c  90
-rw-r--r--  fs/xfs/xfs_inode.c  187
-rw-r--r--  fs/xfs/xfs_inode.h  42
-rw-r--r--  fs/xfs/xfs_inode_item.c  163
-rw-r--r--  fs/xfs/xfs_iomap.c  250
-rw-r--r--  fs/xfs/xfs_iomap.h  27
-rw-r--r--  fs/xfs/xfs_itable.c  2
-rw-r--r--  fs/xfs/xfs_log.c  885
-rw-r--r--  fs/xfs/xfs_log.h  2
-rw-r--r--  fs/xfs/xfs_log_cil.c  32
-rw-r--r--  fs/xfs/xfs_log_priv.h  132
-rw-r--r--  fs/xfs/xfs_log_recover.c  819
-rw-r--r--  fs/xfs/xfs_mount.c  171
-rw-r--r--  fs/xfs/xfs_mount.h  23
-rw-r--r--  fs/xfs/xfs_mru_cache.c  4
-rw-r--r--  fs/xfs/xfs_quota.h  3
-rw-r--r--  fs/xfs/xfs_rename.c  1
-rw-r--r--  fs/xfs/xfs_rtalloc.c  92
-rw-r--r--  fs/xfs/xfs_rtalloc.h  2
-rw-r--r--  fs/xfs/xfs_rw.c  58
-rw-r--r--  fs/xfs/xfs_trans.c  122
-rw-r--r--  fs/xfs/xfs_trans.h  4
-rw-r--r--  fs/xfs/xfs_trans_ail.c  629
-rw-r--r--  fs/xfs/xfs_trans_buf.c  9
-rw-r--r--  fs/xfs/xfs_trans_extfree.c  8
-rw-r--r--  fs/xfs/xfs_trans_inode.c  24
-rw-r--r--  fs/xfs/xfs_trans_priv.h  57
-rw-r--r--  fs/xfs/xfs_vnodeops.c  140
-rw-r--r--  fs/xfs/xfs_vnodeops.h  1
80 files changed, 4461 insertions, 4282 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..284a7c89697e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,14 +16,11 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 19ccflags-y := -I$(src) -I$(src)/linux-2.6
20ccflags-$(CONFIG_XFS_DEBUG) += -g
20 21
21XFS_LINUX := linux-2.6 22XFS_LINUX := linux-2.6
22 23
23ifeq ($(CONFIG_XFS_DEBUG),y)
24 EXTRA_CFLAGS += -g
25endif
26
27obj-$(CONFIG_XFS_FS) += xfs.o 24obj-$(CONFIG_XFS_FS) += xfs.o
28 25
29xfs-y += linux-2.6/xfs_trace.o 26xfs-y += linux-2.6/xfs_trace.o
@@ -98,17 +95,17 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
98 kmem.o \ 95 kmem.o \
99 xfs_aops.o \ 96 xfs_aops.o \
100 xfs_buf.o \ 97 xfs_buf.o \
98 xfs_discard.o \
101 xfs_export.o \ 99 xfs_export.o \
102 xfs_file.o \ 100 xfs_file.o \
103 xfs_fs_subr.o \ 101 xfs_fs_subr.o \
104 xfs_globals.o \ 102 xfs_globals.o \
105 xfs_ioctl.o \ 103 xfs_ioctl.o \
106 xfs_iops.o \ 104 xfs_iops.o \
105 xfs_message.o \
107 xfs_super.o \ 106 xfs_super.o \
108 xfs_sync.o \ 107 xfs_sync.o \
109 xfs_xattr.o) 108 xfs_xattr.o)
110 109
111# Objects in support/ 110# Objects in support/
112xfs-y += $(addprefix support/, \ 111xfs-y += support/uuid.o
113 debug.o \
114 uuid.o)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb6..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26#include "xfs_message.h"
26 27
27/* 28/*
28 * Greedy allocation. May fail and may return vmalloced memory. 29 * Greedy allocation. May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 57 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
57 return ptr; 58 return ptr;
58 if (!(++retries % 100)) 59 if (!(++retries % 100))
59 printk(KERN_ERR "XFS: possible memory allocation " 60 xfs_err(NULL,
60 "deadlock in %s (mode:0x%x)\n", 61 "possible memory allocation deadlock in %s (mode:0x%x)",
61 __func__, lflags); 62 __func__, lflags);
62 congestion_wait(BLK_RW_ASYNC, HZ/50); 63 congestion_wait(BLK_RW_ASYNC, HZ/50);
63 } while (1); 64 } while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
112 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 113 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
113 return ptr; 114 return ptr;
114 if (!(++retries % 100)) 115 if (!(++retries % 100))
115 printk(KERN_ERR "XFS: possible memory allocation " 116 xfs_err(NULL,
116 "deadlock in %s (mode:0x%x)\n", 117 "possible memory allocation deadlock in %s (mode:0x%x)",
117 __func__, lflags); 118 __func__, lflags);
118 congestion_wait(BLK_RW_ASYNC, HZ/50); 119 congestion_wait(BLK_RW_ASYNC, HZ/50);
119 } while (1); 120 } while (1);
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3d..39f4f809bb68 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
219} 219}
220 220
221int 221int
222xfs_check_acl(struct inode *inode, int mask) 222xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
223{ 223{
224 struct xfs_inode *ip = XFS_I(inode); 224 struct xfs_inode *ip;
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 ip = XFS_I(inode);
228 trace_xfs_check_acl(ip); 229 trace_xfs_check_acl(ip);
229 230
230 /* 231 /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
234 if (!XFS_IFORK_Q(ip)) 235 if (!XFS_IFORK_Q(ip))
235 return -EAGAIN; 236 return -EAGAIN;
236 237
238 if (flags & IPERM_FLAG_RCU) {
239 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
240 return -ECHILD;
241 return -EAGAIN;
242 }
243
237 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); 244 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
238 if (IS_ERR(acl)) 245 if (IS_ERR(acl))
239 return PTR_ERR(acl); 246 return PTR_ERR(acl);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f61223ed6..79ce38be15a1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
327 330
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
342
343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -378,28 +413,19 @@ xfs_submit_ioend_bio(
378 if (xfs_ioend_new_eof(ioend)) 413 if (xfs_ioend_new_eof(ioend))
379 xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); 414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
382 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 417}
386 418
387STATIC struct bio * 419STATIC struct bio *
388xfs_alloc_ioend_bio( 420xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 421 struct buffer_head *bh)
390{ 422{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 423 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 424 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 425
399 ASSERT(bio->bi_private == NULL); 426 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 427 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 428 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 429 return bio;
404} 430}
405 431
@@ -470,9 +496,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 496 /* Pass 1 - start writeback */
471 do { 497 do {
472 next = ioend->io_list; 498 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 499 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 500 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 501 } while ((ioend = next) != NULL);
477 502
478 /* Pass 2 - submit I/O */ 503 /* Pass 2 - submit I/O */
@@ -600,117 +625,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 625 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 626 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 627
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 628 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 629 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 630 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 631 clear_buffer_unwritten(bh);
609} 632}
610 633
611/* 634/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 635 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 636 * or delayed allocate extent.
716 */ 637 */
@@ -731,9 +652,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 652 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 653 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 654 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 655 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 656 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 657 acceptable = (type == IO_OVERWRITE);
737 else 658 else
738 break; 659 break;
739 } while ((bh = bh->b_this_page) != head); 660 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +679,7 @@ xfs_convert_page(
758 loff_t tindex, 679 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 680 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 681 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 682 struct writeback_control *wbc)
762 int all_bh)
763{ 683{
764 struct buffer_head *bh, *head; 684 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 685 xfs_off_t end_offset;
@@ -814,37 +734,30 @@ xfs_convert_page(
814 continue; 734 continue;
815 } 735 }
816 736
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 737 if (buffer_unwritten(bh) || buffer_delay(bh) ||
738 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 739 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 740 type = IO_UNWRITTEN;
741 else if (buffer_delay(bh))
742 type = IO_DELALLOC;
820 else 743 else
821 type = IO_DELAY; 744 type = IO_OVERWRITE;
822 745
823 if (!xfs_imap_valid(inode, imap, offset)) { 746 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 747 done = 1;
825 continue; 748 continue;
826 } 749 }
827 750
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 751 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 752 if (type != IO_OVERWRITE)
830 753 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 754 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 755 ioendp, done);
834 756
835 page_dirty--; 757 page_dirty--;
836 count++; 758 count++;
837 } else { 759 } else {
838 type = IO_NEW; 760 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 761 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 762 } while (offset += len, (bh = bh->b_this_page) != head);
850 763
@@ -876,7 +789,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 789 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 790 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 791 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 792 pgoff_t tlast)
881{ 793{
882 struct pagevec pvec; 794 struct pagevec pvec;
@@ -891,7 +803,7 @@ xfs_cluster_write(
891 803
892 for (i = 0; i < pagevec_count(&pvec); i++) { 804 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 805 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 806 imap, ioendp, wbc);
895 if (done) 807 if (done)
896 break; 808 break;
897 } 809 }
@@ -935,13 +847,13 @@ xfs_aops_discard_page(
935 struct buffer_head *bh, *head; 847 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 848 loff_t offset = page_offset(page);
937 849
938 if (!xfs_is_delayed_page(page, IO_DELAY)) 850 if (!xfs_is_delayed_page(page, IO_DELALLOC))
939 goto out_invalidate; 851 goto out_invalidate;
940 852
941 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 853 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
942 goto out_invalidate; 854 goto out_invalidate;
943 855
944 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 856 xfs_alert(ip->i_mount,
945 "page discard on page %p, inode 0x%llx, offset %llu.", 857 "page discard on page %p, inode 0x%llx, offset %llu.",
946 page, ip->i_ino, offset); 858 page, ip->i_ino, offset);
947 859
@@ -959,7 +871,7 @@ xfs_aops_discard_page(
959 if (error) { 871 if (error) {
960 /* something screwed, just bail */ 872 /* something screwed, just bail */
961 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 873 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
962 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 874 xfs_alert(ip->i_mount,
963 "page discard unable to remove delalloc mapping."); 875 "page discard unable to remove delalloc mapping.");
964 } 876 }
965 break; 877 break;
@@ -1002,10 +914,10 @@ xfs_vm_writepage(
1002 unsigned int type; 914 unsigned int type;
1003 __uint64_t end_offset; 915 __uint64_t end_offset;
1004 pgoff_t end_index, last_index; 916 pgoff_t end_index, last_index;
1005 ssize_t size, len; 917 ssize_t len;
1006 int flags, err, imap_valid = 0, uptodate = 1; 918 int err, imap_valid = 0, uptodate = 1;
1007 int count = 0; 919 int count = 0;
1008 int all_bh = 0; 920 int nonblocking = 0;
1009 921
1010 trace_xfs_writepage(inode, page, 0); 922 trace_xfs_writepage(inode, page, 0);
1011 923
@@ -1056,10 +968,14 @@ xfs_vm_writepage(
1056 968
1057 bh = head = page_buffers(page); 969 bh = head = page_buffers(page);
1058 offset = page_offset(page); 970 offset = page_offset(page);
1059 flags = BMAPI_READ; 971 type = IO_OVERWRITE;
1060 type = IO_NEW; 972
973 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
974 nonblocking = 1;
1061 975
1062 do { 976 do {
977 int new_ioend = 0;
978
1063 if (offset >= end_offset) 979 if (offset >= end_offset)
1064 break; 980 break;
1065 if (!buffer_uptodate(bh)) 981 if (!buffer_uptodate(bh))
@@ -1076,90 +992,54 @@ xfs_vm_writepage(
1076 continue; 992 continue;
1077 } 993 }
1078 994
1079 if (imap_valid) 995 if (buffer_unwritten(bh)) {
1080 imap_valid = xfs_imap_valid(inode, &imap, offset); 996 if (type != IO_UNWRITTEN) {
1081
1082 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1083 int new_ioend = 0;
1084
1085 /*
1086 * Make sure we don't use a read-only iomap
1087 */
1088 if (flags == BMAPI_READ)
1089 imap_valid = 0;
1090
1091 if (buffer_unwritten(bh)) {
1092 type = IO_UNWRITTEN; 997 type = IO_UNWRITTEN;
1093 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 998 imap_valid = 0;
1094 } else if (buffer_delay(bh)) {
1095 type = IO_DELAY;
1096 flags = BMAPI_ALLOCATE;
1097
1098 if (wbc->sync_mode == WB_SYNC_NONE)
1099 flags |= BMAPI_TRYLOCK;
1100 }
1101
1102 if (!imap_valid) {
1103 /*
1104 * If we didn't have a valid mapping then we
1105 * need to ensure that we put the new mapping
1106 * in a new ioend structure. This needs to be
1107 * done to ensure that the ioends correctly
1108 * reflect the block mappings at io completion
1109 * for unwritten extent conversion.
1110 */
1111 new_ioend = 1;
1112 err = xfs_map_blocks(inode, offset, len,
1113 &imap, flags);
1114 if (err)
1115 goto error;
1116 imap_valid = xfs_imap_valid(inode, &imap,
1117 offset);
1118 } 999 }
1119 if (imap_valid) { 1000 } else if (buffer_delay(bh)) {
1120 xfs_map_at_offset(inode, bh, &imap, offset); 1001 if (type != IO_DELALLOC) {
1121 xfs_add_to_ioend(inode, bh, offset, type, 1002 type = IO_DELALLOC;
1122 &ioend, new_ioend); 1003 imap_valid = 0;
1123 count++;
1124 } 1004 }
1125 } else if (buffer_uptodate(bh)) { 1005 } else if (buffer_uptodate(bh)) {
1126 /* 1006 if (type != IO_OVERWRITE) {
1127 * we got here because the buffer is already mapped. 1007 type = IO_OVERWRITE;
1128 * That means it must already have extents allocated 1008 imap_valid = 0;
1129 * underneath it. Map the extent by reading it.
1130 */
1131 if (!imap_valid || flags != BMAPI_READ) {
1132 flags = BMAPI_READ;
1133 size = xfs_probe_cluster(inode, page, bh, head);
1134 err = xfs_map_blocks(inode, offset, size,
1135 &imap, flags);
1136 if (err)
1137 goto error;
1138 imap_valid = xfs_imap_valid(inode, &imap,
1139 offset);
1140 } 1009 }
1010 } else {
1011 if (PageUptodate(page)) {
1012 ASSERT(buffer_mapped(bh));
1013 imap_valid = 0;
1014 }
1015 continue;
1016 }
1141 1017
1018 if (imap_valid)
1019 imap_valid = xfs_imap_valid(inode, &imap, offset);
1020 if (!imap_valid) {
1142 /* 1021 /*
1143 * We set the type to IO_NEW in case we are doing a 1022 * If we didn't have a valid mapping then we need to
1144 * small write at EOF that is extending the file but 1023 * put the new mapping into a separate ioend structure.
1145 * without needing an allocation. We need to update the 1024 * This ensures non-contiguous extents always have
1146 * file size on I/O completion in this case so it is 1025 * separate ioends, which is particularly important
1147 * the same case as having just allocated a new extent 1026 * for unwritten extent conversion at I/O completion
1148 * that we are writing into for the first time. 1027 * time.
1149 */ 1028 */
1150 type = IO_NEW; 1029 new_ioend = 1;
1151 if (trylock_buffer(bh)) { 1030 err = xfs_map_blocks(inode, offset, &imap, type,
1152 if (imap_valid) 1031 nonblocking);
1153 all_bh = 1; 1032 if (err)
1154 xfs_add_to_ioend(inode, bh, offset, type, 1033 goto error;
1155 &ioend, !imap_valid); 1034 imap_valid = xfs_imap_valid(inode, &imap, offset);
1156 count++; 1035 }
1157 } else { 1036 if (imap_valid) {
1158 imap_valid = 0; 1037 lock_buffer(bh);
1159 } 1038 if (type != IO_OVERWRITE)
1160 } else if (PageUptodate(page)) { 1039 xfs_map_at_offset(inode, bh, &imap, offset);
1161 ASSERT(buffer_mapped(bh)); 1040 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1162 imap_valid = 0; 1041 new_ioend);
1042 count++;
1163 } 1043 }
1164 1044
1165 if (!iohead) 1045 if (!iohead)
@@ -1188,7 +1068,7 @@ xfs_vm_writepage(
1188 end_index = last_index; 1068 end_index = last_index;
1189 1069
1190 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1070 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1191 wbc, all_bh, end_index); 1071 wbc, end_index);
1192 } 1072 }
1193 1073
1194 if (iohead) 1074 if (iohead)
@@ -1257,13 +1137,19 @@ __xfs_get_blocks(
1257 int create, 1137 int create,
1258 int direct) 1138 int direct)
1259{ 1139{
1260 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1140 struct xfs_inode *ip = XFS_I(inode);
1141 struct xfs_mount *mp = ip->i_mount;
1142 xfs_fileoff_t offset_fsb, end_fsb;
1143 int error = 0;
1144 int lockmode = 0;
1261 struct xfs_bmbt_irec imap; 1145 struct xfs_bmbt_irec imap;
1146 int nimaps = 1;
1262 xfs_off_t offset; 1147 xfs_off_t offset;
1263 ssize_t size; 1148 ssize_t size;
1264 int nimap = 1;
1265 int new = 0; 1149 int new = 0;
1266 int error; 1150
1151 if (XFS_FORCED_SHUTDOWN(mp))
1152 return -XFS_ERROR(EIO);
1267 1153
1268 offset = (xfs_off_t)iblock << inode->i_blkbits; 1154 offset = (xfs_off_t)iblock << inode->i_blkbits;
1269 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1155 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1158,45 @@ __xfs_get_blocks(
1272 if (!create && direct && offset >= i_size_read(inode)) 1158 if (!create && direct && offset >= i_size_read(inode))
1273 return 0; 1159 return 0;
1274 1160
1275 if (direct && create) 1161 if (create) {
1276 flags |= BMAPI_DIRECT; 1162 lockmode = XFS_ILOCK_EXCL;
1163 xfs_ilock(ip, lockmode);
1164 } else {
1165 lockmode = xfs_ilock_map_shared(ip);
1166 }
1167
1168 ASSERT(offset <= mp->m_maxioffset);
1169 if (offset + size > mp->m_maxioffset)
1170 size = mp->m_maxioffset - offset;
1171 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1172 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1277 1173
1278 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1174 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1279 &new); 1175 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1280 if (error) 1176 if (error)
1281 return -error; 1177 goto out_unlock;
1282 if (nimap == 0) 1178
1283 return 0; 1179 if (create &&
1180 (!nimaps ||
1181 (imap.br_startblock == HOLESTARTBLOCK ||
1182 imap.br_startblock == DELAYSTARTBLOCK))) {
1183 if (direct) {
1184 error = xfs_iomap_write_direct(ip, offset, size,
1185 &imap, nimaps);
1186 } else {
1187 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1188 }
1189 if (error)
1190 goto out_unlock;
1191
1192 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1193 } else if (nimaps) {
1194 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1195 } else {
1196 trace_xfs_get_blocks_notfound(ip, offset, size);
1197 goto out_unlock;
1198 }
1199 xfs_iunlock(ip, lockmode);
1284 1200
1285 if (imap.br_startblock != HOLESTARTBLOCK && 1201 if (imap.br_startblock != HOLESTARTBLOCK &&
1286 imap.br_startblock != DELAYSTARTBLOCK) { 1202 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1263,10 @@ __xfs_get_blocks(
1347 } 1263 }
1348 1264
1349 return 0; 1265 return 0;
1266
1267out_unlock:
1268 xfs_iunlock(ip, lockmode);
1269 return -error;
1350} 1270}
1351 1271
1352int 1272int
@@ -1375,7 +1295,7 @@ xfs_get_blocks_direct(
1375 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1295 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1376 * need to issue a transaction to convert the range from unwritten to written 1296 * need to issue a transaction to convert the range from unwritten to written
1377 * extents. In case this is regular synchronous I/O we just call xfs_end_io 1297 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1378 * to do this and we are done. But in case this was a successfull AIO 1298 * to do this and we are done. But in case this was a successful AIO
1379 * request this handler is called from interrupt context, from which we 1299 * request this handler is called from interrupt context, from which we
1380 * can't start transactions. In that case offload the I/O completion to 1300 * can't start transactions. In that case offload the I/O completion to
1381 * the workqueues we also use for buffered I/O completion. 1301 * the workqueues we also use for buffered I/O completion.
@@ -1434,7 +1354,7 @@ xfs_vm_direct_IO(
1434 ssize_t ret; 1354 ssize_t ret;
1435 1355
1436 if (rw & WRITE) { 1356 if (rw & WRITE) {
1437 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1357 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1438 1358
1439 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1359 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1440 offset, nr_segs, 1360 offset, nr_segs,
@@ -1490,7 +1410,7 @@ xfs_vm_write_failed(
1490 if (error) { 1410 if (error) {
1491 /* something screwed, just bail */ 1411 /* something screwed, just bail */
1492 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1412 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1493 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 1413 xfs_alert(ip->i_mount,
1494 "xfs_vm_write_failed: unable to clean up ino %lld", 1414 "xfs_vm_write_failed: unable to clean up ino %lld",
1495 ip->i_ino); 1415 ip->i_ino);
1496 } 1416 }
@@ -1574,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
1574 .readpages = xfs_vm_readpages, 1494 .readpages = xfs_vm_readpages,
1575 .writepage = xfs_vm_writepage, 1495 .writepage = xfs_vm_writepage,
1576 .writepages = xfs_vm_writepages, 1496 .writepages = xfs_vm_writepages,
1577 .sync_page = block_sync_page,
1578 .releasepage = xfs_vm_releasepage, 1497 .releasepage = xfs_vm_releasepage,
1579 .invalidatepage = xfs_vm_invalidatepage, 1498 .invalidatepage = xfs_vm_invalidatepage,
1580 .write_begin = xfs_vm_write_begin, 1499 .write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6e9e31..9ef9ed2cfe2e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -99,77 +94,79 @@ xfs_buf_vmap_len(
99} 94}
100 95
101/* 96/*
102 * Page Region interfaces. 97 * xfs_buf_lru_add - add a buffer to the LRU.
103 * 98 *
104 * For pages in filesystems where the blocksize is smaller than the 99 * The LRU takes a new reference to the buffer so that it will only be freed
105 * pagesize, we use the page->private field (long) to hold a bitmap 100 * once the shrinker takes the buffer off the LRU.
106 * of uptodate regions within the page.
107 *
108 * Each such region is "bytes per page / bits per long" bytes long.
109 *
110 * NBPPR == number-of-bytes-per-page-region
111 * BTOPR == bytes-to-page-region (rounded up)
112 * BTOPRT == bytes-to-page-region-truncated (rounded down)
113 */ 101 */
114#if (BITS_PER_LONG == 32) 102STATIC void
115#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 103xfs_buf_lru_add(
116#elif (BITS_PER_LONG == 64) 104 struct xfs_buf *bp)
117#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
118#else
119#error BITS_PER_LONG must be 32 or 64
120#endif
121#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
122#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
123#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
124
125STATIC unsigned long
126page_region_mask(
127 size_t offset,
128 size_t length)
129{ 105{
130 unsigned long mask; 106 struct xfs_buftarg *btp = bp->b_target;
131 int first, final;
132
133 first = BTOPR(offset);
134 final = BTOPRT(offset + length - 1);
135 first = min(first, final);
136
137 mask = ~0UL;
138 mask <<= BITS_PER_LONG - (final - first);
139 mask >>= BITS_PER_LONG - (final);
140
141 ASSERT(offset + length <= PAGE_CACHE_SIZE);
142 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
143 107
144 return mask; 108 spin_lock(&btp->bt_lru_lock);
109 if (list_empty(&bp->b_lru)) {
110 atomic_inc(&bp->b_hold);
111 list_add_tail(&bp->b_lru, &btp->bt_lru);
112 btp->bt_lru_nr++;
113 }
114 spin_unlock(&btp->bt_lru_lock);
145} 115}
146 116
117/*
118 * xfs_buf_lru_del - remove a buffer from the LRU
119 *
120 * The unlocked check is safe here because it only occurs when there are not
121 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
122 * to optimise the shrinker removing the buffer from the LRU and calling
123 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
124 * bt_lru_lock.
125 */
147STATIC void 126STATIC void
148set_page_region( 127xfs_buf_lru_del(
149 struct page *page, 128 struct xfs_buf *bp)
150 size_t offset,
151 size_t length)
152{ 129{
153 set_page_private(page, 130 struct xfs_buftarg *btp = bp->b_target;
154 page_private(page) | page_region_mask(offset, length));
155 if (page_private(page) == ~0UL)
156 SetPageUptodate(page);
157}
158 131
159STATIC int 132 if (list_empty(&bp->b_lru))
160test_page_region( 133 return;
161 struct page *page,
162 size_t offset,
163 size_t length)
164{
165 unsigned long mask = page_region_mask(offset, length);
166 134
167 return (mask && (page_private(page) & mask) == mask); 135 spin_lock(&btp->bt_lru_lock);
136 if (!list_empty(&bp->b_lru)) {
137 list_del_init(&bp->b_lru);
138 btp->bt_lru_nr--;
139 }
140 spin_unlock(&btp->bt_lru_lock);
168} 141}
169 142
170/* 143/*
171 * Internal xfs_buf_t object manipulation 144 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
145 * b_lru_ref count so that the buffer is freed immediately when the buffer
146 * reference count falls to zero. If the buffer is already on the LRU, we need
147 * to remove the reference that LRU holds on the buffer.
148 *
149 * This prevents build-up of stale buffers on the LRU.
172 */ 150 */
151void
152xfs_buf_stale(
153 struct xfs_buf *bp)
154{
155 bp->b_flags |= XBF_STALE;
156 atomic_set(&(bp)->b_lru_ref, 0);
157 if (!list_empty(&bp->b_lru)) {
158 struct xfs_buftarg *btp = bp->b_target;
159
160 spin_lock(&btp->bt_lru_lock);
161 if (!list_empty(&bp->b_lru)) {
162 list_del_init(&bp->b_lru);
163 btp->bt_lru_nr--;
164 atomic_dec(&bp->b_hold);
165 }
166 spin_unlock(&btp->bt_lru_lock);
167 }
168 ASSERT(atomic_read(&bp->b_hold) >= 1);
169}
173 170
174STATIC void 171STATIC void
175_xfs_buf_initialize( 172_xfs_buf_initialize(
@@ -186,7 +183,9 @@ _xfs_buf_initialize(
186 183
187 memset(bp, 0, sizeof(xfs_buf_t)); 184 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 185 atomic_set(&bp->b_hold, 1);
186 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 187 init_completion(&bp->b_iowait);
188 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 189 INIT_LIST_HEAD(&bp->b_list);
191 RB_CLEAR_NODE(&bp->b_rbnode); 190 RB_CLEAR_NODE(&bp->b_rbnode);
192 sema_init(&bp->b_sema, 0); /* held, no waiters */ 191 sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,7 +261,9 @@ xfs_buf_free(
262{ 261{
263 trace_xfs_buf_free(bp, _RET_IP_); 262 trace_xfs_buf_free(bp, _RET_IP_);
264 263
265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 264 ASSERT(list_empty(&bp->b_lru));
265
266 if (bp->b_flags & _XBF_PAGES) {
266 uint i; 267 uint i;
267 268
268 if (xfs_buf_is_vmapped(bp)) 269 if (xfs_buf_is_vmapped(bp))
@@ -272,56 +273,77 @@ xfs_buf_free(
272 for (i = 0; i < bp->b_page_count; i++) { 273 for (i = 0; i < bp->b_page_count; i++) {
273 struct page *page = bp->b_pages[i]; 274 struct page *page = bp->b_pages[i];
274 275
275 if (bp->b_flags & _XBF_PAGE_CACHE) 276 __free_page(page);
276 ASSERT(!PagePrivate(page));
277 page_cache_release(page);
278 } 277 }
279 } 278 } else if (bp->b_flags & _XBF_KMEM)
279 kmem_free(bp->b_addr);
280 _xfs_buf_free_pages(bp); 280 _xfs_buf_free_pages(bp);
281 xfs_buf_deallocate(bp); 281 xfs_buf_deallocate(bp);
282} 282}
283 283
284/* 284/*
285 * Finds all pages for buffer in question and builds it's page list. 285 * Allocates all the pages for buffer in question and builds it's page list.
286 */ 286 */
287STATIC int 287STATIC int
288_xfs_buf_lookup_pages( 288xfs_buf_allocate_memory(
289 xfs_buf_t *bp, 289 xfs_buf_t *bp,
290 uint flags) 290 uint flags)
291{ 291{
292 struct address_space *mapping = bp->b_target->bt_mapping;
293 size_t blocksize = bp->b_target->bt_bsize;
294 size_t size = bp->b_count_desired; 292 size_t size = bp->b_count_desired;
295 size_t nbytes, offset; 293 size_t nbytes, offset;
296 gfp_t gfp_mask = xb_to_gfp(flags); 294 gfp_t gfp_mask = xb_to_gfp(flags);
297 unsigned short page_count, i; 295 unsigned short page_count, i;
298 pgoff_t first;
299 xfs_off_t end; 296 xfs_off_t end;
300 int error; 297 int error;
301 298
299 /*
300 * for buffers that are contained within a single page, just allocate
301 * the memory from the heap - there's no need for the complexity of
302 * page arrays to keep allocation down to order 0.
303 */
304 if (bp->b_buffer_length < PAGE_SIZE) {
305 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
306 if (!bp->b_addr) {
307 /* low memory - use alloc_page loop instead */
308 goto use_alloc_page;
309 }
310
311 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
312 PAGE_MASK) !=
313 ((unsigned long)bp->b_addr & PAGE_MASK)) {
314 /* b_addr spans two pages - use alloc_page instead */
315 kmem_free(bp->b_addr);
316 bp->b_addr = NULL;
317 goto use_alloc_page;
318 }
319 bp->b_offset = offset_in_page(bp->b_addr);
320 bp->b_pages = bp->b_page_array;
321 bp->b_pages[0] = virt_to_page(bp->b_addr);
322 bp->b_page_count = 1;
323 bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
324 return 0;
325 }
326
327use_alloc_page:
302 end = bp->b_file_offset + bp->b_buffer_length; 328 end = bp->b_file_offset + bp->b_buffer_length;
303 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 329 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
304
305 error = _xfs_buf_get_pages(bp, page_count, flags); 330 error = _xfs_buf_get_pages(bp, page_count, flags);
306 if (unlikely(error)) 331 if (unlikely(error))
307 return error; 332 return error;
308 bp->b_flags |= _XBF_PAGE_CACHE;
309 333
310 offset = bp->b_offset; 334 offset = bp->b_offset;
311 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 335 bp->b_flags |= _XBF_PAGES;
312 336
313 for (i = 0; i < bp->b_page_count; i++) { 337 for (i = 0; i < bp->b_page_count; i++) {
314 struct page *page; 338 struct page *page;
315 uint retries = 0; 339 uint retries = 0;
316 340retry:
317 retry: 341 page = alloc_page(gfp_mask);
318 page = find_or_create_page(mapping, first + i, gfp_mask);
319 if (unlikely(page == NULL)) { 342 if (unlikely(page == NULL)) {
320 if (flags & XBF_READ_AHEAD) { 343 if (flags & XBF_READ_AHEAD) {
321 bp->b_page_count = i; 344 bp->b_page_count = i;
322 for (i = 0; i < bp->b_page_count; i++) 345 error = ENOMEM;
323 unlock_page(bp->b_pages[i]); 346 goto out_free_pages;
324 return -ENOMEM;
325 } 347 }
326 348
327 /* 349 /*
@@ -331,65 +353,55 @@ _xfs_buf_lookup_pages(
331 * handle buffer allocation failures we can't do much. 353 * handle buffer allocation failures we can't do much.
332 */ 354 */
333 if (!(++retries % 100)) 355 if (!(++retries % 100))
334 printk(KERN_ERR 356 xfs_err(NULL,
335 "XFS: possible memory allocation " 357 "possible memory allocation deadlock in %s (mode:0x%x)",
336 "deadlock in %s (mode:0x%x)\n",
337 __func__, gfp_mask); 358 __func__, gfp_mask);
338 359
339 XFS_STATS_INC(xb_page_retries); 360 XFS_STATS_INC(xb_page_retries);
340 xfsbufd_wakeup(NULL, 0, gfp_mask);
341 congestion_wait(BLK_RW_ASYNC, HZ/50); 361 congestion_wait(BLK_RW_ASYNC, HZ/50);
342 goto retry; 362 goto retry;
343 } 363 }
344 364
345 XFS_STATS_INC(xb_page_found); 365 XFS_STATS_INC(xb_page_found);
346 366
347 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 367 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
348 size -= nbytes; 368 size -= nbytes;
349
350 ASSERT(!PagePrivate(page));
351 if (!PageUptodate(page)) {
352 page_count--;
353 if (blocksize >= PAGE_CACHE_SIZE) {
354 if (flags & XBF_READ)
355 bp->b_flags |= _XBF_PAGE_LOCKED;
356 } else if (!PagePrivate(page)) {
357 if (test_page_region(page, offset, nbytes))
358 page_count++;
359 }
360 }
361
362 bp->b_pages[i] = page; 369 bp->b_pages[i] = page;
363 offset = 0; 370 offset = 0;
364 } 371 }
372 return 0;
365 373
366 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 374out_free_pages:
367 for (i = 0; i < bp->b_page_count; i++) 375 for (i = 0; i < bp->b_page_count; i++)
368 unlock_page(bp->b_pages[i]); 376 __free_page(bp->b_pages[i]);
369 }
370
371 if (page_count == bp->b_page_count)
372 bp->b_flags |= XBF_DONE;
373
374 return error; 377 return error;
375} 378}
376 379
377/* 380/*
378 * Map buffer into kernel address-space if nessecary. 381 * Map buffer into kernel address-space if necessary.
379 */ 382 */
380STATIC int 383STATIC int
381_xfs_buf_map_pages( 384_xfs_buf_map_pages(
382 xfs_buf_t *bp, 385 xfs_buf_t *bp,
383 uint flags) 386 uint flags)
384{ 387{
385 /* A single page buffer is always mappable */ 388 ASSERT(bp->b_flags & _XBF_PAGES);
386 if (bp->b_page_count == 1) { 389 if (bp->b_page_count == 1) {
390 /* A single page buffer is always mappable */
387 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 391 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
388 bp->b_flags |= XBF_MAPPED; 392 bp->b_flags |= XBF_MAPPED;
389 } else if (flags & XBF_MAPPED) { 393 } else if (flags & XBF_MAPPED) {
390 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 394 int retried = 0;
391 -1, PAGE_KERNEL); 395
392 if (unlikely(bp->b_addr == NULL)) 396 do {
397 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
398 -1, PAGE_KERNEL);
399 if (bp->b_addr)
400 break;
401 vm_unmap_aliases();
402 } while (retried++ <= 1);
403
404 if (!bp->b_addr)
393 return -ENOMEM; 405 return -ENOMEM;
394 bp->b_addr += bp->b_offset; 406 bp->b_addr += bp->b_offset;
395 bp->b_flags |= XBF_MAPPED; 407 bp->b_flags |= XBF_MAPPED;
@@ -500,9 +512,14 @@ found:
500 } 512 }
501 } 513 }
502 514
515 /*
516 * if the buffer is stale, clear all the external state associated with
517 * it. We need to keep flags such as how we allocated the buffer memory
518 * intact here.
519 */
503 if (bp->b_flags & XBF_STALE) { 520 if (bp->b_flags & XBF_STALE) {
504 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 521 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
505 bp->b_flags &= XBF_MAPPED; 522 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
506 } 523 }
507 524
508 trace_xfs_buf_find(bp, flags, _RET_IP_); 525 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -523,7 +540,7 @@ xfs_buf_get(
523 xfs_buf_flags_t flags) 540 xfs_buf_flags_t flags)
524{ 541{
525 xfs_buf_t *bp, *new_bp; 542 xfs_buf_t *bp, *new_bp;
526 int error = 0, i; 543 int error = 0;
527 544
528 new_bp = xfs_buf_allocate(flags); 545 new_bp = xfs_buf_allocate(flags);
529 if (unlikely(!new_bp)) 546 if (unlikely(!new_bp))
@@ -531,7 +548,7 @@ xfs_buf_get(
531 548
532 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 549 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
533 if (bp == new_bp) { 550 if (bp == new_bp) {
534 error = _xfs_buf_lookup_pages(bp, flags); 551 error = xfs_buf_allocate_memory(bp, flags);
535 if (error) 552 if (error)
536 goto no_buffer; 553 goto no_buffer;
537 } else { 554 } else {
@@ -540,14 +557,11 @@ xfs_buf_get(
540 return NULL; 557 return NULL;
541 } 558 }
542 559
543 for (i = 0; i < bp->b_page_count; i++)
544 mark_page_accessed(bp->b_pages[i]);
545
546 if (!(bp->b_flags & XBF_MAPPED)) { 560 if (!(bp->b_flags & XBF_MAPPED)) {
547 error = _xfs_buf_map_pages(bp, flags); 561 error = _xfs_buf_map_pages(bp, flags);
548 if (unlikely(error)) { 562 if (unlikely(error)) {
549 printk(KERN_WARNING "%s: failed to map pages\n", 563 xfs_warn(target->bt_mount,
550 __func__); 564 "%s: failed to map pages\n", __func__);
551 goto no_buffer; 565 goto no_buffer;
552 } 566 }
553 } 567 }
@@ -641,10 +655,7 @@ xfs_buf_readahead(
641 xfs_off_t ioff, 655 xfs_off_t ioff,
642 size_t isize) 656 size_t isize)
643{ 657{
644 struct backing_dev_info *bdi; 658 if (bdi_read_congested(target->bt_bdi))
645
646 bdi = target->bt_mapping->backing_dev_info;
647 if (bdi_read_congested(bdi))
648 return; 659 return;
649 660
650 xfs_buf_read(target, ioff, isize, 661 xfs_buf_read(target, ioff, isize,
@@ -722,10 +733,10 @@ xfs_buf_associate_memory(
722 size_t buflen; 733 size_t buflen;
723 int page_count; 734 int page_count;
724 735
725 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 736 pageaddr = (unsigned long)mem & PAGE_MASK;
726 offset = (unsigned long)mem - pageaddr; 737 offset = (unsigned long)mem - pageaddr;
727 buflen = PAGE_CACHE_ALIGN(len + offset); 738 buflen = PAGE_ALIGN(len + offset);
728 page_count = buflen >> PAGE_CACHE_SHIFT; 739 page_count = buflen >> PAGE_SHIFT;
729 740
730 /* Free any previous set of page pointers */ 741 /* Free any previous set of page pointers */
731 if (bp->b_pages) 742 if (bp->b_pages)
@@ -742,13 +753,12 @@ xfs_buf_associate_memory(
742 753
743 for (i = 0; i < bp->b_page_count; i++) { 754 for (i = 0; i < bp->b_page_count; i++) {
744 bp->b_pages[i] = mem_to_page((void *)pageaddr); 755 bp->b_pages[i] = mem_to_page((void *)pageaddr);
745 pageaddr += PAGE_CACHE_SIZE; 756 pageaddr += PAGE_SIZE;
746 } 757 }
747 758
748 bp->b_count_desired = len; 759 bp->b_count_desired = len;
749 bp->b_buffer_length = buflen; 760 bp->b_buffer_length = buflen;
750 bp->b_flags |= XBF_MAPPED; 761 bp->b_flags |= XBF_MAPPED;
751 bp->b_flags &= ~_XBF_PAGE_LOCKED;
752 762
753 return 0; 763 return 0;
754} 764}
@@ -781,8 +791,8 @@ xfs_buf_get_uncached(
781 791
782 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 792 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
783 if (unlikely(error)) { 793 if (unlikely(error)) {
784 printk(KERN_WARNING "%s: failed to map pages\n", 794 xfs_warn(target->bt_mount,
785 __func__); 795 "%s: failed to map pages\n", __func__);
786 goto fail_free_mem; 796 goto fail_free_mem;
787 } 797 }
788 798
@@ -827,7 +837,7 @@ xfs_buf_rele(
827 trace_xfs_buf_rele(bp, _RET_IP_); 837 trace_xfs_buf_rele(bp, _RET_IP_);
828 838
829 if (!pag) { 839 if (!pag) {
830 ASSERT(!bp->b_relse); 840 ASSERT(list_empty(&bp->b_lru));
831 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 841 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
832 if (atomic_dec_and_test(&bp->b_hold)) 842 if (atomic_dec_and_test(&bp->b_hold))
833 xfs_buf_free(bp); 843 xfs_buf_free(bp);
@@ -835,13 +845,15 @@ xfs_buf_rele(
835 } 845 }
836 846
837 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 847 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
848
838 ASSERT(atomic_read(&bp->b_hold) > 0); 849 ASSERT(atomic_read(&bp->b_hold) > 0);
839 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { 850 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
840 if (bp->b_relse) { 851 if (!(bp->b_flags & XBF_STALE) &&
841 atomic_inc(&bp->b_hold); 852 atomic_read(&bp->b_lru_ref)) {
853 xfs_buf_lru_add(bp);
842 spin_unlock(&pag->pag_buf_lock); 854 spin_unlock(&pag->pag_buf_lock);
843 bp->b_relse(bp);
844 } else { 855 } else {
856 xfs_buf_lru_del(bp);
845 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 857 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
846 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 858 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
847 spin_unlock(&pag->pag_buf_lock); 859 spin_unlock(&pag->pag_buf_lock);
@@ -853,20 +865,7 @@ xfs_buf_rele(
853 865
854 866
855/* 867/*
856 * Mutual exclusion on buffers. Locking model: 868 * Lock a buffer object, if it is not already locked.
857 *
858 * Buffers associated with inodes for which buffer locking
859 * is not enabled are not protected by semaphores, and are
860 * assumed to be exclusively owned by the caller. There is a
861 * spinlock in the buffer, used by the caller when concurrent
862 * access is possible.
863 */
864
865/*
866 * Locks a buffer object, if it is not already locked. Note that this in
867 * no way locks the underlying pages, so it is only useful for
868 * synchronizing concurrent use of buffer objects, not for synchronizing
869 * independent access to the underlying pages.
870 * 869 *
871 * If we come across a stale, pinned, locked buffer, we know that we are 870 * If we come across a stale, pinned, locked buffer, we know that we are
872 * being asked to lock a buffer that has been reallocated. Because it is 871 * being asked to lock a buffer that has been reallocated. Because it is
@@ -900,10 +899,7 @@ xfs_buf_lock_value(
900} 899}
901 900
902/* 901/*
903 * Locks a buffer object. 902 * Lock a buffer object.
904 * Note that this in no way locks the underlying pages, so it is only
905 * useful for synchronizing concurrent use of buffer objects, not for
906 * synchronizing independent access to the underlying pages.
907 * 903 *
908 * If we come across a stale, pinned, locked buffer, we know that we 904 * If we come across a stale, pinned, locked buffer, we know that we
909 * are being asked to lock a buffer that has been reallocated. Because 905 * are being asked to lock a buffer that has been reallocated. Because
@@ -919,8 +915,6 @@ xfs_buf_lock(
919 915
920 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 916 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
921 xfs_log_force(bp->b_target->bt_mount, 0); 917 xfs_log_force(bp->b_target->bt_mount, 0);
922 if (atomic_read(&bp->b_io_remaining))
923 blk_run_address_space(bp->b_target->bt_mapping);
924 down(&bp->b_sema); 918 down(&bp->b_sema);
925 XB_SET_OWNER(bp); 919 XB_SET_OWNER(bp);
926 920
@@ -964,9 +958,7 @@ xfs_buf_wait_unpin(
964 set_current_state(TASK_UNINTERRUPTIBLE); 958 set_current_state(TASK_UNINTERRUPTIBLE);
965 if (atomic_read(&bp->b_pin_count) == 0) 959 if (atomic_read(&bp->b_pin_count) == 0)
966 break; 960 break;
967 if (atomic_read(&bp->b_io_remaining)) 961 io_schedule();
968 blk_run_address_space(bp->b_target->bt_mapping);
969 schedule();
970 } 962 }
971 remove_wait_queue(&bp->b_waiters, &wait); 963 remove_wait_queue(&bp->b_waiters, &wait);
972 set_current_state(TASK_RUNNING); 964 set_current_state(TASK_RUNNING);
@@ -1178,10 +1170,8 @@ _xfs_buf_ioend(
1178 xfs_buf_t *bp, 1170 xfs_buf_t *bp,
1179 int schedule) 1171 int schedule)
1180{ 1172{
1181 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1173 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1182 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1183 xfs_buf_ioend(bp, schedule); 1174 xfs_buf_ioend(bp, schedule);
1184 }
1185} 1175}
1186 1176
1187STATIC void 1177STATIC void
@@ -1190,35 +1180,12 @@ xfs_buf_bio_end_io(
1190 int error) 1180 int error)
1191{ 1181{
1192 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1182 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1193 unsigned int blocksize = bp->b_target->bt_bsize;
1194 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1195 1183
1196 xfs_buf_ioerror(bp, -error); 1184 xfs_buf_ioerror(bp, -error);
1197 1185
1198 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1186 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1199 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1187 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1200 1188
1201 do {
1202 struct page *page = bvec->bv_page;
1203
1204 ASSERT(!PagePrivate(page));
1205 if (unlikely(bp->b_error)) {
1206 if (bp->b_flags & XBF_READ)
1207 ClearPageUptodate(page);
1208 } else if (blocksize >= PAGE_CACHE_SIZE) {
1209 SetPageUptodate(page);
1210 } else if (!PagePrivate(page) &&
1211 (bp->b_flags & _XBF_PAGE_CACHE)) {
1212 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1213 }
1214
1215 if (--bvec >= bio->bi_io_vec)
1216 prefetchw(&bvec->bv_page->flags);
1217
1218 if (bp->b_flags & _XBF_PAGE_LOCKED)
1219 unlock_page(page);
1220 } while (bvec >= bio->bi_io_vec);
1221
1222 _xfs_buf_ioend(bp, 1); 1189 _xfs_buf_ioend(bp, 1);
1223 bio_put(bio); 1190 bio_put(bio);
1224} 1191}
@@ -1232,7 +1199,6 @@ _xfs_buf_ioapply(
1232 int offset = bp->b_offset; 1199 int offset = bp->b_offset;
1233 int size = bp->b_count_desired; 1200 int size = bp->b_count_desired;
1234 sector_t sector = bp->b_bn; 1201 sector_t sector = bp->b_bn;
1235 unsigned int blocksize = bp->b_target->bt_bsize;
1236 1202
1237 total_nr_pages = bp->b_page_count; 1203 total_nr_pages = bp->b_page_count;
1238 map_i = 0; 1204 map_i = 0;
@@ -1253,29 +1219,6 @@ _xfs_buf_ioapply(
1253 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1219 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1254 } 1220 }
1255 1221
1256 /* Special code path for reading a sub page size buffer in --
1257 * we populate up the whole page, and hence the other metadata
1258 * in the same page. This optimization is only valid when the
1259 * filesystem block size is not smaller than the page size.
1260 */
1261 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1262 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1263 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1264 (blocksize >= PAGE_CACHE_SIZE)) {
1265 bio = bio_alloc(GFP_NOIO, 1);
1266
1267 bio->bi_bdev = bp->b_target->bt_bdev;
1268 bio->bi_sector = sector - (offset >> BBSHIFT);
1269 bio->bi_end_io = xfs_buf_bio_end_io;
1270 bio->bi_private = bp;
1271
1272 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1273 size = 0;
1274
1275 atomic_inc(&bp->b_io_remaining);
1276
1277 goto submit_io;
1278 }
1279 1222
1280next_chunk: 1223next_chunk:
1281 atomic_inc(&bp->b_io_remaining); 1224 atomic_inc(&bp->b_io_remaining);
@@ -1289,8 +1232,9 @@ next_chunk:
1289 bio->bi_end_io = xfs_buf_bio_end_io; 1232 bio->bi_end_io = xfs_buf_bio_end_io;
1290 bio->bi_private = bp; 1233 bio->bi_private = bp;
1291 1234
1235
1292 for (; size && nr_pages; nr_pages--, map_i++) { 1236 for (; size && nr_pages; nr_pages--, map_i++) {
1293 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1237 int rbytes, nbytes = PAGE_SIZE - offset;
1294 1238
1295 if (nbytes > size) 1239 if (nbytes > size)
1296 nbytes = size; 1240 nbytes = size;
@@ -1305,7 +1249,6 @@ next_chunk:
1305 total_nr_pages--; 1249 total_nr_pages--;
1306 } 1250 }
1307 1251
1308submit_io:
1309 if (likely(bio->bi_size)) { 1252 if (likely(bio->bi_size)) {
1310 if (xfs_buf_is_vmapped(bp)) { 1253 if (xfs_buf_is_vmapped(bp)) {
1311 flush_kernel_vmap_range(bp->b_addr, 1254 flush_kernel_vmap_range(bp->b_addr,
@@ -1315,18 +1258,7 @@ submit_io:
1315 if (size) 1258 if (size)
1316 goto next_chunk; 1259 goto next_chunk;
1317 } else { 1260 } else {
1318 /*
1319 * if we get here, no pages were added to the bio. However,
1320 * we can't just error out here - if the pages are locked then
1321 * we have to unlock them otherwise we can hang on a later
1322 * access to the page.
1323 */
1324 xfs_buf_ioerror(bp, EIO); 1261 xfs_buf_ioerror(bp, EIO);
1325 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1326 int i;
1327 for (i = 0; i < bp->b_page_count; i++)
1328 unlock_page(bp->b_pages[i]);
1329 }
1330 bio_put(bio); 1262 bio_put(bio);
1331 } 1263 }
1332} 1264}
@@ -1371,8 +1303,6 @@ xfs_buf_iowait(
1371{ 1303{
1372 trace_xfs_buf_iowait(bp, _RET_IP_); 1304 trace_xfs_buf_iowait(bp, _RET_IP_);
1373 1305
1374 if (atomic_read(&bp->b_io_remaining))
1375 blk_run_address_space(bp->b_target->bt_mapping);
1376 wait_for_completion(&bp->b_iowait); 1306 wait_for_completion(&bp->b_iowait);
1377 1307
1378 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1308 trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1390,8 +1320,8 @@ xfs_buf_offset(
1390 return XFS_BUF_PTR(bp) + offset; 1320 return XFS_BUF_PTR(bp) + offset;
1391 1321
1392 offset += bp->b_offset; 1322 offset += bp->b_offset;
1393 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1323 page = bp->b_pages[offset >> PAGE_SHIFT];
1394 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1324 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1395} 1325}
1396 1326
1397/* 1327/*
@@ -1413,9 +1343,9 @@ xfs_buf_iomove(
1413 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1343 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1414 cpoff = xfs_buf_poff(boff + bp->b_offset); 1344 cpoff = xfs_buf_poff(boff + bp->b_offset);
1415 csize = min_t(size_t, 1345 csize = min_t(size_t,
1416 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1346 PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1417 1347
1418 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1348 ASSERT(((csize + cpoff) <= PAGE_SIZE));
1419 1349
1420 switch (mode) { 1350 switch (mode) {
1421 case XBRW_ZERO: 1351 case XBRW_ZERO:
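The two hunks above swap PAGE_CACHE_* for the plain PAGE_* macros while keeping the same page/offset arithmetic: the page index is the buffer offset shifted down by the page shift, and the byte within that page is the offset masked by PAGE_SIZE-1. A standalone sanity-check of that split, assuming 4k pages (names here are illustrative, not kernel API):

/* Sketch of the page index / in-page offset split used by xfs_buf_offset(). */
#include <assert.h>
#include <stddef.h>

#define SKETCH_PAGE_SHIFT	12			/* assume 4k pages */
#define SKETCH_PAGE_SIZE	(1UL << SKETCH_PAGE_SHIFT)

static void split_offset(size_t offset, size_t *page_index, size_t *in_page)
{
	*page_index = offset >> SKETCH_PAGE_SHIFT;		/* which backing page */
	*in_page    = offset & (SKETCH_PAGE_SIZE - 1);		/* byte inside it */
}

int main(void)
{
	size_t idx, off;

	split_offset(9000, &idx, &off);
	assert(idx == 2 && off == 9000 - 2 * SKETCH_PAGE_SIZE);
	return 0;
}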
@@ -1438,51 +1368,84 @@ xfs_buf_iomove(
1438 */ 1368 */
1439 1369
1440/* 1370/*
1441 * Wait for any bufs with callbacks that have been submitted but 1371 * Wait for any bufs with callbacks that have been submitted but have not yet
1442 * have not yet returned... walk the hash list for the target. 1372 * returned. These buffers will have an elevated hold count, so wait on those
1373 * while freeing all the buffers only held by the LRU.
1443 */ 1374 */
1444void 1375void
1445xfs_wait_buftarg( 1376xfs_wait_buftarg(
1446 struct xfs_buftarg *btp) 1377 struct xfs_buftarg *btp)
1447{ 1378{
1448 struct xfs_perag *pag; 1379 struct xfs_buf *bp;
1449 uint i;
1450 1380
1451 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { 1381restart:
1452 pag = xfs_perag_get(btp->bt_mount, i); 1382 spin_lock(&btp->bt_lru_lock);
1453 spin_lock(&pag->pag_buf_lock); 1383 while (!list_empty(&btp->bt_lru)) {
1454 while (rb_first(&pag->pag_buf_tree)) { 1384 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1455 spin_unlock(&pag->pag_buf_lock); 1385 if (atomic_read(&bp->b_hold) > 1) {
1386 spin_unlock(&btp->bt_lru_lock);
1456 delay(100); 1387 delay(100);
1457 spin_lock(&pag->pag_buf_lock); 1388 goto restart;
1458 } 1389 }
1459 spin_unlock(&pag->pag_buf_lock); 1390 /*
 1460 xfs_perag_put(pag); 1391 * clear the LRU reference count so the buffer doesn't get
1392 * ignored in xfs_buf_rele().
1393 */
1394 atomic_set(&bp->b_lru_ref, 0);
1395 spin_unlock(&btp->bt_lru_lock);
1396 xfs_buf_rele(bp);
1397 spin_lock(&btp->bt_lru_lock);
1461 } 1398 }
1399 spin_unlock(&btp->bt_lru_lock);
1462} 1400}
1463 1401
1464/* 1402int
1465 * buftarg list for delwrite queue processing 1403xfs_buftarg_shrink(
1466 */ 1404 struct shrinker *shrink,
1467static LIST_HEAD(xfs_buftarg_list); 1405 int nr_to_scan,
1468static DEFINE_SPINLOCK(xfs_buftarg_lock); 1406 gfp_t mask)
1469
1470STATIC void
1471xfs_register_buftarg(
1472 xfs_buftarg_t *btp)
1473{ 1407{
1474 spin_lock(&xfs_buftarg_lock); 1408 struct xfs_buftarg *btp = container_of(shrink,
1475 list_add(&btp->bt_list, &xfs_buftarg_list); 1409 struct xfs_buftarg, bt_shrinker);
1476 spin_unlock(&xfs_buftarg_lock); 1410 struct xfs_buf *bp;
1477} 1411 LIST_HEAD(dispose);
1478 1412
1479STATIC void 1413 if (!nr_to_scan)
1480xfs_unregister_buftarg( 1414 return btp->bt_lru_nr;
1481 xfs_buftarg_t *btp) 1415
1482{ 1416 spin_lock(&btp->bt_lru_lock);
1483 spin_lock(&xfs_buftarg_lock); 1417 while (!list_empty(&btp->bt_lru)) {
1484 list_del(&btp->bt_list); 1418 if (nr_to_scan-- <= 0)
1485 spin_unlock(&xfs_buftarg_lock); 1419 break;
1420
1421 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1422
1423 /*
1424 * Decrement the b_lru_ref count unless the value is already
1425 * zero. If the value is already zero, we need to reclaim the
1426 * buffer, otherwise it gets another trip through the LRU.
1427 */
1428 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1429 list_move_tail(&bp->b_lru, &btp->bt_lru);
1430 continue;
1431 }
1432
1433 /*
1434 * remove the buffer from the LRU now to avoid needing another
1435 * lock round trip inside xfs_buf_rele().
1436 */
1437 list_move(&bp->b_lru, &dispose);
1438 btp->bt_lru_nr--;
1439 }
1440 spin_unlock(&btp->bt_lru_lock);
1441
1442 while (!list_empty(&dispose)) {
1443 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1444 list_del_init(&bp->b_lru);
1445 xfs_buf_rele(bp);
1446 }
1447
1448 return btp->bt_lru_nr;
1486} 1449}
1487 1450
1488void 1451void
@@ -1490,17 +1453,13 @@ xfs_free_buftarg(
1490 struct xfs_mount *mp, 1453 struct xfs_mount *mp,
1491 struct xfs_buftarg *btp) 1454 struct xfs_buftarg *btp)
1492{ 1455{
1456 unregister_shrinker(&btp->bt_shrinker);
1457
1493 xfs_flush_buftarg(btp, 1); 1458 xfs_flush_buftarg(btp, 1);
1494 if (mp->m_flags & XFS_MOUNT_BARRIER) 1459 if (mp->m_flags & XFS_MOUNT_BARRIER)
1495 xfs_blkdev_issue_flush(btp); 1460 xfs_blkdev_issue_flush(btp);
1496 iput(btp->bt_mapping->host);
1497 1461
1498 /* Unregister the buftarg first so that we don't get a
1499 * wakeup finding a non-existent task
1500 */
1501 xfs_unregister_buftarg(btp);
1502 kthread_stop(btp->bt_task); 1462 kthread_stop(btp->bt_task);
1503
1504 kmem_free(btp); 1463 kmem_free(btp);
1505} 1464}
1506 1465
@@ -1516,21 +1475,12 @@ xfs_setsize_buftarg_flags(
1516 btp->bt_smask = sectorsize - 1; 1475 btp->bt_smask = sectorsize - 1;
1517 1476
1518 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1477 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1519 printk(KERN_WARNING 1478 xfs_warn(btp->bt_mount,
1520 "XFS: Cannot set_blocksize to %u on device %s\n", 1479 "Cannot set_blocksize to %u on device %s\n",
1521 sectorsize, XFS_BUFTARG_NAME(btp)); 1480 sectorsize, XFS_BUFTARG_NAME(btp));
1522 return EINVAL; 1481 return EINVAL;
1523 } 1482 }
1524 1483
1525 if (verbose &&
1526 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1527 printk(KERN_WARNING
1528 "XFS: %u byte sectors in use on device %s. "
1529 "This is suboptimal; %u or greater is ideal.\n",
1530 sectorsize, XFS_BUFTARG_NAME(btp),
1531 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1532 }
1533
1534 return 0; 1484 return 0;
1535} 1485}
1536 1486
@@ -1545,7 +1495,7 @@ xfs_setsize_buftarg_early(
1545 struct block_device *bdev) 1495 struct block_device *bdev)
1546{ 1496{
1547 return xfs_setsize_buftarg_flags(btp, 1497 return xfs_setsize_buftarg_flags(btp,
1548 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); 1498 PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1549} 1499}
1550 1500
1551int 1501int
@@ -1558,59 +1508,17 @@ xfs_setsize_buftarg(
1558} 1508}
1559 1509
1560STATIC int 1510STATIC int
1561xfs_mapping_buftarg(
1562 xfs_buftarg_t *btp,
1563 struct block_device *bdev)
1564{
1565 struct backing_dev_info *bdi;
1566 struct inode *inode;
1567 struct address_space *mapping;
1568 static const struct address_space_operations mapping_aops = {
1569 .sync_page = block_sync_page,
1570 .migratepage = fail_migrate_page,
1571 };
1572
1573 inode = new_inode(bdev->bd_inode->i_sb);
1574 if (!inode) {
1575 printk(KERN_WARNING
1576 "XFS: Cannot allocate mapping inode for device %s\n",
1577 XFS_BUFTARG_NAME(btp));
1578 return ENOMEM;
1579 }
1580 inode->i_ino = get_next_ino();
1581 inode->i_mode = S_IFBLK;
1582 inode->i_bdev = bdev;
1583 inode->i_rdev = bdev->bd_dev;
1584 bdi = blk_get_backing_dev_info(bdev);
1585 if (!bdi)
1586 bdi = &default_backing_dev_info;
1587 mapping = &inode->i_data;
1588 mapping->a_ops = &mapping_aops;
1589 mapping->backing_dev_info = bdi;
1590 mapping_set_gfp_mask(mapping, GFP_NOFS);
1591 btp->bt_mapping = mapping;
1592 return 0;
1593}
1594
1595STATIC int
1596xfs_alloc_delwrite_queue( 1511xfs_alloc_delwrite_queue(
1597 xfs_buftarg_t *btp, 1512 xfs_buftarg_t *btp,
1598 const char *fsname) 1513 const char *fsname)
1599{ 1514{
1600 int error = 0;
1601
1602 INIT_LIST_HEAD(&btp->bt_list);
1603 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1515 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1604 spin_lock_init(&btp->bt_delwrite_lock); 1516 spin_lock_init(&btp->bt_delwrite_lock);
1605 btp->bt_flags = 0; 1517 btp->bt_flags = 0;
1606 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1518 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1607 if (IS_ERR(btp->bt_task)) { 1519 if (IS_ERR(btp->bt_task))
1608 error = PTR_ERR(btp->bt_task); 1520 return PTR_ERR(btp->bt_task);
1609 goto out_error; 1521 return 0;
1610 }
1611 xfs_register_buftarg(btp);
1612out_error:
1613 return error;
1614} 1522}
1615 1523
1616xfs_buftarg_t * 1524xfs_buftarg_t *
@@ -1627,12 +1535,19 @@ xfs_alloc_buftarg(
1627 btp->bt_mount = mp; 1535 btp->bt_mount = mp;
1628 btp->bt_dev = bdev->bd_dev; 1536 btp->bt_dev = bdev->bd_dev;
1629 btp->bt_bdev = bdev; 1537 btp->bt_bdev = bdev;
1630 if (xfs_setsize_buftarg_early(btp, bdev)) 1538 btp->bt_bdi = blk_get_backing_dev_info(bdev);
1539 if (!btp->bt_bdi)
1631 goto error; 1540 goto error;
1632 if (xfs_mapping_buftarg(btp, bdev)) 1541
1542 INIT_LIST_HEAD(&btp->bt_lru);
1543 spin_lock_init(&btp->bt_lru_lock);
1544 if (xfs_setsize_buftarg_early(btp, bdev))
1633 goto error; 1545 goto error;
1634 if (xfs_alloc_delwrite_queue(btp, fsname)) 1546 if (xfs_alloc_delwrite_queue(btp, fsname))
1635 goto error; 1547 goto error;
1548 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1549 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1550 register_shrinker(&btp->bt_shrinker);
1636 return btp; 1551 return btp;
1637 1552
1638error: 1553error:
@@ -1737,27 +1652,6 @@ xfs_buf_runall_queues(
1737 flush_workqueue(queue); 1652 flush_workqueue(queue);
1738} 1653}
1739 1654
1740STATIC int
1741xfsbufd_wakeup(
1742 struct shrinker *shrink,
1743 int priority,
1744 gfp_t mask)
1745{
1746 xfs_buftarg_t *btp;
1747
1748 spin_lock(&xfs_buftarg_lock);
1749 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1750 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1751 continue;
1752 if (list_empty(&btp->bt_delwrite_queue))
1753 continue;
1754 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1755 wake_up_process(btp->bt_task);
1756 }
1757 spin_unlock(&xfs_buftarg_lock);
1758 return 0;
1759}
1760
1761/* 1655/*
1762 * Move as many buffers as specified to the supplied list 1656 * Move as many buffers as specified to the supplied list
 1763 * indicating if we skipped any buffers to prevent deadlocks. 1657 * indicating if we skipped any buffers to prevent deadlocks.
@@ -1845,8 +1739,8 @@ xfsbufd(
1845 do { 1739 do {
1846 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1740 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1847 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); 1741 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1848 int count = 0;
1849 struct list_head tmp; 1742 struct list_head tmp;
1743 struct blk_plug plug;
1850 1744
1851 if (unlikely(freezing(current))) { 1745 if (unlikely(freezing(current))) {
1852 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1746 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1862,16 +1756,15 @@ xfsbufd(
1862 1756
1863 xfs_buf_delwri_split(target, &tmp, age); 1757 xfs_buf_delwri_split(target, &tmp, age);
1864 list_sort(NULL, &tmp, xfs_buf_cmp); 1758 list_sort(NULL, &tmp, xfs_buf_cmp);
1759
1760 blk_start_plug(&plug);
1865 while (!list_empty(&tmp)) { 1761 while (!list_empty(&tmp)) {
1866 struct xfs_buf *bp; 1762 struct xfs_buf *bp;
1867 bp = list_first_entry(&tmp, struct xfs_buf, b_list); 1763 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1868 list_del_init(&bp->b_list); 1764 list_del_init(&bp->b_list);
1869 xfs_bdstrat_cb(bp); 1765 xfs_bdstrat_cb(bp);
1870 count++;
1871 } 1766 }
1872 if (count) 1767 blk_finish_plug(&plug);
1873 blk_run_address_space(target->bt_mapping);
1874
1875 } while (!kthread_should_stop()); 1768 } while (!kthread_should_stop());
1876 1769
1877 return 0; 1770 return 0;
@@ -1891,6 +1784,7 @@ xfs_flush_buftarg(
1891 int pincount = 0; 1784 int pincount = 0;
1892 LIST_HEAD(tmp_list); 1785 LIST_HEAD(tmp_list);
1893 LIST_HEAD(wait_list); 1786 LIST_HEAD(wait_list);
1787 struct blk_plug plug;
1894 1788
1895 xfs_buf_runall_queues(xfsconvertd_workqueue); 1789 xfs_buf_runall_queues(xfsconvertd_workqueue);
1896 xfs_buf_runall_queues(xfsdatad_workqueue); 1790 xfs_buf_runall_queues(xfsdatad_workqueue);
@@ -1905,6 +1799,8 @@ xfs_flush_buftarg(
1905 * we do that after issuing all the IO. 1799 * we do that after issuing all the IO.
1906 */ 1800 */
1907 list_sort(NULL, &tmp_list, xfs_buf_cmp); 1801 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1802
1803 blk_start_plug(&plug);
1908 while (!list_empty(&tmp_list)) { 1804 while (!list_empty(&tmp_list)) {
1909 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); 1805 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1910 ASSERT(target == bp->b_target); 1806 ASSERT(target == bp->b_target);
@@ -1915,10 +1811,10 @@ xfs_flush_buftarg(
1915 } 1811 }
1916 xfs_bdstrat_cb(bp); 1812 xfs_bdstrat_cb(bp);
1917 } 1813 }
1814 blk_finish_plug(&plug);
1918 1815
1919 if (wait) { 1816 if (wait) {
1920 /* Expedite and wait for IO to complete. */ 1817 /* Wait for IO to complete. */
1921 blk_run_address_space(target->bt_mapping);
1922 while (!list_empty(&wait_list)) { 1818 while (!list_empty(&wait_list)) {
1923 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1819 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1924 1820
@@ -1944,15 +1840,15 @@ xfs_buf_init(void)
1944 if (!xfslogd_workqueue) 1840 if (!xfslogd_workqueue)
1945 goto out_free_buf_zone; 1841 goto out_free_buf_zone;
1946 1842
1947 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1843 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
1948 if (!xfsdatad_workqueue) 1844 if (!xfsdatad_workqueue)
1949 goto out_destroy_xfslogd_workqueue; 1845 goto out_destroy_xfslogd_workqueue;
1950 1846
1951 xfsconvertd_workqueue = create_workqueue("xfsconvertd"); 1847 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
1848 WQ_MEM_RECLAIM, 1);
1952 if (!xfsconvertd_workqueue) 1849 if (!xfsconvertd_workqueue)
1953 goto out_destroy_xfsdatad_workqueue; 1850 goto out_destroy_xfsdatad_workqueue;
1954 1851
1955 register_shrinker(&xfs_buf_shake);
1956 return 0; 1852 return 0;
1957 1853
1958 out_destroy_xfsdatad_workqueue: 1854 out_destroy_xfsdatad_workqueue:
@@ -1968,7 +1864,6 @@ xfs_buf_init(void)
1968void 1864void
1969xfs_buf_terminate(void) 1865xfs_buf_terminate(void)
1970{ 1866{
1971 unregister_shrinker(&xfs_buf_shake);
1972 destroy_workqueue(xfsconvertd_workqueue); 1867 destroy_workqueue(xfsconvertd_workqueue);
1973 destroy_workqueue(xfsdatad_workqueue); 1868 destroy_workqueue(xfsdatad_workqueue);
1974 destroy_workqueue(xfslogd_workqueue); 1869 destroy_workqueue(xfslogd_workqueue);
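The buftarg shrinker introduced in this file works in two phases: under bt_lru_lock it decays b_lru_ref, giving recently used buffers another trip around the LRU, and moves expired buffers onto a private dispose list; the xfs_buf_rele() calls then happen after the lock is dropped. A minimal user-space model of that scan, with purely illustrative names (none of this is kernel API):

/* Illustrative user-space model of the scan in xfs_buftarg_shrink(). */
#include <stddef.h>

struct sbuf {
	struct sbuf	*next;
	int		lru_ref;	/* models b_lru_ref */
};

/*
 * Scan up to nr_to_scan buffers.  A buffer with a positive reference
 * count is decremented and re-queued at the tail for another trip
 * through the LRU; a buffer that has already decayed to zero is moved
 * to a private dispose list so it can be released after the lock
 * (not modelled here) has been dropped.
 */
static struct sbuf *lru_scan(struct sbuf **lru, int nr_to_scan)
{
	struct sbuf *dispose = NULL;

	while (*lru && nr_to_scan-- > 0) {
		struct sbuf *bp = *lru;
		struct sbuf **tail;

		*lru = bp->next;
		bp->next = NULL;
		if (bp->lru_ref > 0) {
			bp->lru_ref--;			/* second chance */
			for (tail = lru; *tail; tail = &(*tail)->next)
				;
			*tail = bp;
			continue;
		}
		bp->next = dispose;			/* reclaim candidate */
		dispose = bp;
	}
	return dispose;
}

The kernel code does the same with list_move_tail()/list_move() on bt_lru and then walks the dispose list calling xfs_buf_rele().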
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf98..a9a1c4512645 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -61,30 +61,11 @@ typedef enum {
61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ 61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
62 62
63/* flags used only internally */ 63/* flags used only internally */
64#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
65#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ 64#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
66#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ 65#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
66#define _XBF_KMEM (1 << 20)/* backed by heap memory */
67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ 67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
68 68
69/*
70 * Special flag for supporting metadata blocks smaller than a FSB.
71 *
72 * In this case we can have multiple xfs_buf_t on a single page and
73 * need to lock out concurrent xfs_buf_t readers as they only
74 * serialise access to the buffer.
75 *
76 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
77 * between reads of the page. Hence we can have one thread read the
78 * page and modify it, but then race with another thread that thinks
79 * the page is not up-to-date and hence reads it again.
80 *
 81 * The result is that the first modification to the page is lost.
82 * This sort of AGF/AGI reading race can happen when unlinking inodes
83 * that require truncation and results in the AGI unlinked list
84 * modifications being lost.
85 */
86#define _XBF_PAGE_LOCKED (1 << 22)
87
88typedef unsigned int xfs_buf_flags_t; 69typedef unsigned int xfs_buf_flags_t;
89 70
90#define XFS_BUF_FLAGS \ 71#define XFS_BUF_FLAGS \
@@ -100,12 +81,10 @@ typedef unsigned int xfs_buf_flags_t;
100 { XBF_LOCK, "LOCK" }, /* should never be set */\ 81 { XBF_LOCK, "LOCK" }, /* should never be set */\
101 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 82 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
102 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ 83 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
103 { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
104 { _XBF_PAGES, "PAGES" }, \ 84 { _XBF_PAGES, "PAGES" }, \
105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 85 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 86 { _XBF_KMEM, "KMEM" }, \
107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" } 87 { _XBF_DELWRI_Q, "DELWRI_Q" }
108
109 88
110typedef enum { 89typedef enum {
111 XBT_FORCE_SLEEP = 0, 90 XBT_FORCE_SLEEP = 0,
@@ -120,7 +99,7 @@ typedef struct xfs_bufhash {
120typedef struct xfs_buftarg { 99typedef struct xfs_buftarg {
121 dev_t bt_dev; 100 dev_t bt_dev;
122 struct block_device *bt_bdev; 101 struct block_device *bt_bdev;
123 struct address_space *bt_mapping; 102 struct backing_dev_info *bt_bdi;
124 struct xfs_mount *bt_mount; 103 struct xfs_mount *bt_mount;
125 unsigned int bt_bsize; 104 unsigned int bt_bsize;
126 unsigned int bt_sshift; 105 unsigned int bt_sshift;
@@ -128,27 +107,19 @@ typedef struct xfs_buftarg {
128 107
129 /* per device delwri queue */ 108 /* per device delwri queue */
130 struct task_struct *bt_task; 109 struct task_struct *bt_task;
131 struct list_head bt_list;
132 struct list_head bt_delwrite_queue; 110 struct list_head bt_delwrite_queue;
133 spinlock_t bt_delwrite_lock; 111 spinlock_t bt_delwrite_lock;
134 unsigned long bt_flags; 112 unsigned long bt_flags;
135} xfs_buftarg_t;
136 113
137/* 114 /* LRU control structures */
138 * xfs_buf_t: Buffer structure for pagecache-based buffers 115 struct shrinker bt_shrinker;
139 * 116 struct list_head bt_lru;
140 * This buffer structure is used by the pagecache buffer management routines 117 spinlock_t bt_lru_lock;
141 * to refer to an assembly of pages forming a logical buffer. 118 unsigned int bt_lru_nr;
142 * 119} xfs_buftarg_t;
143 * The buffer structure is used on a temporary basis only, and discarded when
144 * released. The real data storage is recorded in the pagecache. Buffers are
145 * hashed to the block device on which the file system resides.
146 */
147 120
148struct xfs_buf; 121struct xfs_buf;
149typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 122typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
150typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
151typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
152 123
153#define XB_PAGES 2 124#define XB_PAGES 2
154 125
@@ -164,9 +135,11 @@ typedef struct xfs_buf {
164 xfs_off_t b_file_offset; /* offset in file */ 135 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */ 136 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */ 137 atomic_t b_hold; /* reference count */
138 atomic_t b_lru_ref; /* lru reclaim ref count */
167 xfs_buf_flags_t b_flags; /* status flags */ 139 xfs_buf_flags_t b_flags; /* status flags */
168 struct semaphore b_sema; /* semaphore for lockables */ 140 struct semaphore b_sema; /* semaphore for lockables */
169 141
142 struct list_head b_lru; /* lru list */
170 wait_queue_head_t b_waiters; /* unpin waiters */ 143 wait_queue_head_t b_waiters; /* unpin waiters */
171 struct list_head b_list; 144 struct list_head b_list;
172 struct xfs_perag *b_pag; /* contains rbtree root */ 145 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -176,7 +149,6 @@ typedef struct xfs_buf {
176 void *b_addr; /* virtual address of buffer */ 149 void *b_addr; /* virtual address of buffer */
177 struct work_struct b_iodone_work; 150 struct work_struct b_iodone_work;
178 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 151 xfs_buf_iodone_t b_iodone; /* I/O completion function */
179 xfs_buf_relse_t b_relse; /* releasing function */
180 struct completion b_iowait; /* queue for I/O waiters */ 152 struct completion b_iowait; /* queue for I/O waiters */
181 void *b_fspriv; 153 void *b_fspriv;
182 void *b_fspriv2; 154 void *b_fspriv2;
@@ -264,7 +236,8 @@ extern void xfs_buf_terminate(void);
264#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 236#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
265 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 237 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
266 238
267#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 239void xfs_buf_stale(struct xfs_buf *bp);
240#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
268#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 241#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
269#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 242#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
270#define XFS_BUF_SUPER_STALE(bp) do { \ 243#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -315,7 +288,6 @@ extern void xfs_buf_terminate(void);
315#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 288#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
316#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 289#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
317#define XFS_BUF_SET_START(bp) do { } while (0) 290#define XFS_BUF_SET_START(bp) do { } while (0)
318#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
319 291
320#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 292#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
321#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 293#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -328,9 +300,15 @@ extern void xfs_buf_terminate(void);
328#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 300#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
329#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 301#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
330 302
331#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 303static inline void
304xfs_buf_set_ref(
305 struct xfs_buf *bp,
306 int lru_ref)
307{
308 atomic_set(&bp->b_lru_ref, lru_ref);
309}
310#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
332#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 311#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
333#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
334 312
335#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 313#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
336 314
@@ -346,8 +324,7 @@ extern void xfs_buf_terminate(void);
346 324
347static inline void xfs_buf_relse(xfs_buf_t *bp) 325static inline void xfs_buf_relse(xfs_buf_t *bp)
348{ 326{
349 if (!bp->b_relse) 327 xfs_buf_unlock(bp);
350 xfs_buf_unlock(bp);
351 xfs_buf_rele(bp); 328 xfs_buf_rele(bp);
352} 329}
353 330
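With XFS_BUF_SET_VTYPE_REF now routed through xfs_buf_set_ref(), a buffer's b_lru_ref value is effectively the number of shrinker passes it survives before it becomes a reclaim candidate: each pass decrements the count, and the pass after it reaches zero disposes of the buffer. A tiny sketch of that arithmetic, assuming the scan visits the buffer once per pass:

/* Sketch only: how many shrinker passes a buffer with a given b_lru_ref survives. */
static int passes_until_reclaim(int lru_ref)
{
	int passes = 0;

	while (lru_ref > 0) {		/* each pass decays the count ... */
		lru_ref--;
		passes++;
	}
	return passes + 1;		/* ... and the next pass reclaims it */
}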
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..d61611c88012
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
 99 * supposed to discard, skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (!blk_queue_discard(q))
156 return -XFS_ERROR(EOPNOTSUPP);
157 if (copy_from_user(&range, urange, sizeof(range)))
158 return -XFS_ERROR(EFAULT);
159
160 /*
161 * Truncating down the len isn't actually quite correct, but using
162 * XFS_B_TO_FSB would mean we trivially get overflows for values
163 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
164 * used by the fstrim application. In the end it really doesn't
165 * matter as trimming blocks is an advisory interface.
166 */
167 start = XFS_B_TO_FSBT(mp, range.start);
168 len = XFS_B_TO_FSBT(mp, range.len);
169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
170
171 start_agno = XFS_FSB_TO_AGNO(mp, start);
172 if (start_agno >= mp->m_sb.sb_agcount)
173 return -XFS_ERROR(EINVAL);
174
175 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
176 if (end_agno >= mp->m_sb.sb_agcount)
177 end_agno = mp->m_sb.sb_agcount - 1;
178
179 for (agno = start_agno; agno <= end_agno; agno++) {
180 error = -xfs_trim_extents(mp, agno, start, len, minlen,
181 &blocks_trimmed);
182 if (error)
183 last_error = error;
184 }
185
186 if (last_error)
187 return last_error;
188
189 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
190 if (copy_to_user(urange, &range, sizeof(range)))
191 return -XFS_ERROR(EFAULT);
192 return 0;
193}
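xfs_ioc_trim() above is dispatched from the generic FITRIM ioctl (see the xfs_file_ioctl() hunk later in this diff), so user space drives it through the standard fstrim_range interface from linux/fs.h. A minimal caller, roughly what the fstrim(8) utility does, with error handling kept short for clarity:

/* Minimal FITRIM caller against a mounted filesystem. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.len = UINT64_MAX;		/* trim the whole filesystem */
	range.minlen = 0;		/* raised to the discard granularity by the kernel */

	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}

	/* range.len comes back as the number of bytes actually trimmed,
	 * matching the copy_to_user() at the end of xfs_ioc_trim(). */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}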
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
7
8#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
 74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we only allocate inodes that do not fit into 32 bits any
78 * large enough filesystem may contain them, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
@@ -81,8 +89,10 @@ xfs_fs_encode_fh(
81 * seven combinations work. The real answer is "don't use v2". 89 * seven combinations work. The real answer is "don't use v2".
82 */ 90 */
83 len = xfs_fileid_length(fileid_type); 91 len = xfs_fileid_length(fileid_type);
84 if (*max_len < len) 92 if (*max_len < len) {
93 *max_len = len;
85 return 255; 94 return 255;
95 }
86 *max_len = len; 96 *max_len = len;
87 97
88 switch (fileid_type) { 98 switch (fileid_type) {
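The encode_fh change above matters because a caller probing with a too-small buffer now gets the required handle length back through *max_len along with the 255 return. The user-space pattern that relies on this is the probe-then-retry loop around name_to_handle_at(2), added around this same kernel release; a hedged sketch, assuming a libc that exposes the wrapper and abbreviating error handling:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	struct file_handle *fh;
	int mount_id;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <path>\n", argv[0]);
		return 1;
	}

	/* First call with no handle space: expected to fail with EOVERFLOW
	 * while filling in the handle_bytes the filesystem needs. */
	fh = calloc(1, sizeof(*fh));
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) < 0 &&
	    errno != EOVERFLOW) {
		perror("name_to_handle_at");
		return 1;
	}

	/* Retry with the size the kernel reported. */
	fh = realloc(fh, sizeof(*fh) + fh->handle_bytes);
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) < 0) {
		perror("name_to_handle_at");
		return 1;
	}
	printf("handle type %d, %u bytes\n", fh->handle_type, fh->handle_bytes);
	free(fh);
	return 0;
}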
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..f4213ba1ff85 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +297,21 @@ xfs_file_aio_read(
262 if (XFS_FORCED_SHUTDOWN(mp)) 297 if (XFS_FORCED_SHUTDOWN(mp))
263 return -EIO; 298 return -EIO;
264 299
265 if (unlikely(ioflags & IO_ISDIRECT))
266 mutex_lock(&inode->i_mutex);
267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
268
269 if (unlikely(ioflags & IO_ISDIRECT)) { 300 if (unlikely(ioflags & IO_ISDIRECT)) {
301 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
302
270 if (inode->i_mapping->nrpages) { 303 if (inode->i_mapping->nrpages) {
271 ret = -xfs_flushinval_pages(ip, 304 ret = -xfs_flushinval_pages(ip,
272 (iocb->ki_pos & PAGE_CACHE_MASK), 305 (iocb->ki_pos & PAGE_CACHE_MASK),
273 -1, FI_REMAPF_LOCKED); 306 -1, FI_REMAPF_LOCKED);
307 if (ret) {
308 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310 }
274 } 311 }
275 mutex_unlock(&inode->i_mutex); 312 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
276 if (ret) { 313 } else
277 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 314 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
278 return ret;
279 }
280 }
281 315
282 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 316 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
283 317
@@ -285,7 +319,7 @@ xfs_file_aio_read(
285 if (ret > 0) 319 if (ret > 0)
286 XFS_STATS_ADD(xs_read_bytes, ret); 320 XFS_STATS_ADD(xs_read_bytes, ret);
287 321
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
289 return ret; 323 return ret;
290} 324}
291 325
@@ -309,7 +343,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 343 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 344 return -EIO;
311 345
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 346 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 347
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 349
@@ -317,10 +351,61 @@ xfs_file_splice_read(
317 if (ret > 0) 351 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 352 XFS_STATS_ADD(xs_read_bytes, ret);
319 353
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 354 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 355 return ret;
322} 356}
323 357
358STATIC void
359xfs_aio_write_isize_update(
360 struct inode *inode,
361 loff_t *ppos,
362 ssize_t bytes_written)
363{
364 struct xfs_inode *ip = XFS_I(inode);
365 xfs_fsize_t isize = i_size_read(inode);
366
367 if (bytes_written > 0)
368 XFS_STATS_ADD(xs_write_bytes, bytes_written);
369
370 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
371 *ppos > isize))
372 *ppos = isize;
373
374 if (*ppos > ip->i_size) {
375 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
376 if (*ppos > ip->i_size)
377 ip->i_size = *ppos;
378 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
379 }
380}
381
382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occurred. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back.
387 */
388STATIC void
389xfs_aio_write_newsize_update(
390 struct xfs_inode *ip)
391{
392 if (ip->i_new_size) {
393 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
394 ip->i_new_size = 0;
395 if (ip->i_d.di_size > ip->i_size)
396 ip->i_d.di_size = ip->i_size;
397 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
398 }
399}
400
401/*
402 * xfs_file_splice_write() does not use xfs_rw_ilock() because
403 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
 404 * could cause lock inversions between the aio_write path and the splice path
405 * if someone is doing concurrent splice(2) based writes and write(2) based
406 * writes to the same inode. The only real way to fix this is to re-implement
407 * the generic code here with correct locking orders.
408 */
324STATIC ssize_t 409STATIC ssize_t
325xfs_file_splice_write( 410xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 411 struct pipe_inode_info *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
331{ 416{
332 struct inode *inode = outfilp->f_mapping->host; 417 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 418 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 419 xfs_fsize_t new_size;
335 int ioflags = 0; 420 int ioflags = 0;
336 ssize_t ret; 421 ssize_t ret;
337 422
@@ -355,27 +440,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 440 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 441
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 442 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360
361 isize = i_size_read(inode);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 443
372 if (ip->i_new_size) { 444 xfs_aio_write_isize_update(inode, ppos, ret);
373 xfs_ilock(ip, XFS_ILOCK_EXCL); 445 xfs_aio_write_newsize_update(ip);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 447 return ret;
381} 448}
@@ -562,247 +629,318 @@ out_lock:
562 return error; 629 return error;
563} 630}
564 631
632/*
633 * Common pre-write limit and setup checks.
634 *
635 * Returns with iolock held according to @iolock.
636 */
565STATIC ssize_t 637STATIC ssize_t
566xfs_file_aio_write( 638xfs_file_aio_write_checks(
567 struct kiocb *iocb, 639 struct file *file,
568 const struct iovec *iovp, 640 loff_t *pos,
569 unsigned long nr_segs, 641 size_t *count,
570 loff_t pos) 642 int *iolock)
571{ 643{
572 struct file *file = iocb->ki_filp; 644 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 645 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 646 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 647 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 648
584 XFS_STATS_INC(xs_write_calls); 649 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
650 if (error) {
651 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
652 *iolock = 0;
653 return error;
654 }
585 655
586 BUG_ON(iocb->ki_pos != pos); 656 new_size = *pos + *count;
657 if (new_size > ip->i_size)
658 ip->i_new_size = new_size;
587 659
588 if (unlikely(file->f_flags & O_DIRECT)) 660 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 661 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME) 662
591 ioflags |= IO_INVIS; 663 /*
664 * If the offset is beyond the size of the file, we need to zero any
665 * blocks that fall between the existing EOF and the start of this
666 * write.
667 */
668 if (*pos > ip->i_size)
669 error = -xfs_zero_eof(ip, *pos, ip->i_size);
592 670
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 672 if (error)
595 return error; 673 return error;
596 674
597 count = ocount; 675 /*
598 if (count == 0) 676 * If we're writing the file then make sure to clear the setuid and
599 return 0; 677 * setgid bits if the process is not being run by root. This keeps
600 678 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 679 */
680 return file_remove_suid(file);
602 681
603 if (XFS_FORCED_SHUTDOWN(mp)) 682}
604 return -EIO;
605 683
606relock: 684/*
607 if (ioflags & IO_ISDIRECT) { 685 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 686 *
609 need_i_mutex = 0; 687 * Lock the inode appropriately to prepare for and issue a direct IO write.
610 } else { 688 * By separating it from the buffered write path we remove all the tricky to
611 iolock = XFS_IOLOCK_EXCL; 689 * follow locking changes and looping.
612 need_i_mutex = 1; 690 *
613 mutex_lock(&inode->i_mutex); 691 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
692 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
693 * pages are flushed out.
694 *
695 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
696 * allowing them to be done in parallel with reads and other direct IO writes.
697 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
698 * needs to do sub-block zeroing and that requires serialisation against other
699 * direct IOs to the same block. In this case we need to serialise the
700 * submission of the unaligned IOs so that we don't get racing block zeroing in
701 * the dio layer. To avoid the problem with aio, we also need to wait for
702 * outstanding IOs to complete so that unwritten extent conversion is completed
703 * before we try to map the overlapping block. This is currently implemented by
704 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
705 *
706 * Returns with locks held indicated by @iolock and errors indicated by
707 * negative return values.
708 */
709STATIC ssize_t
710xfs_file_dio_aio_write(
711 struct kiocb *iocb,
712 const struct iovec *iovp,
713 unsigned long nr_segs,
714 loff_t pos,
715 size_t ocount,
716 int *iolock)
717{
718 struct file *file = iocb->ki_filp;
719 struct address_space *mapping = file->f_mapping;
720 struct inode *inode = mapping->host;
721 struct xfs_inode *ip = XFS_I(inode);
722 struct xfs_mount *mp = ip->i_mount;
723 ssize_t ret = 0;
724 size_t count = ocount;
725 int unaligned_io = 0;
726 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
727 mp->m_rtdev_targp : mp->m_ddev_targp;
728
729 *iolock = 0;
730 if ((pos & target->bt_smask) || (count & target->bt_smask))
731 return -XFS_ERROR(EINVAL);
732
733 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
734 unaligned_io = 1;
735
736 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
737 *iolock = XFS_IOLOCK_EXCL;
738 else
739 *iolock = XFS_IOLOCK_SHARED;
740 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
741
742 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
743 if (ret)
744 return ret;
745
746 if (mapping->nrpages) {
747 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
748 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
749 FI_REMAPF_LOCKED);
750 if (ret)
751 return ret;
614 } 752 }
615 753
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 754 /*
617 755 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 756 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 757 */
620 S_ISBLK(inode->i_mode)); 758 if (unaligned_io)
621 if (error) { 759 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 760 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 761 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
762 *iolock = XFS_IOLOCK_SHARED;
624 } 763 }
625 764
626 if (ioflags & IO_ISDIRECT) { 765 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 766 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 767 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 768
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 769 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 770 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 771 return ret;
634 } 772}
635 773
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 774STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 775xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 776 struct kiocb *iocb,
639 need_i_mutex = 1; 777 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 778 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 779 loff_t pos,
642 goto start; 780 size_t ocount,
643 } 781 int *iolock)
644 } 782{
783 struct file *file = iocb->ki_filp;
784 struct address_space *mapping = file->f_mapping;
785 struct inode *inode = mapping->host;
786 struct xfs_inode *ip = XFS_I(inode);
787 ssize_t ret;
788 int enospc = 0;
789 size_t count = ocount;
645 790
646 new_size = pos + count; 791 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 792 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 793
650 if (likely(!(ioflags & IO_INVIS))) 794 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 795 if (ret)
796 return ret;
652 797
798 /* We can write back this queue in page reclaim */
799 current->backing_dev_info = mapping->backing_dev_info;
800
801write_retry:
802 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
803 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
804 pos, &iocb->ki_pos, count, ret);
653 /* 805 /*
654 * If the offset is beyond the size of the file, we have a couple 806 * if we just got an ENOSPC, flush the inode now we aren't holding any
655 * of things to do. First, if there is already space allocated 807 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 808 */
661 809 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 810 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 811 if (ret)
664 if (error) { 812 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 813 enospc = 1;
666 goto out_unlock_internal; 814 goto write_retry;
667 }
668 } 815 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 816 current->backing_dev_info = NULL;
817 return ret;
818}
670 819
671 /* 820STATIC ssize_t
672 * If we're writing the file then make sure to clear the 821xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 822 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 823 const struct iovec *iovp,
675 * setgid binaries. 824 unsigned long nr_segs,
676 */ 825 loff_t pos)
677 error = -file_remove_suid(file); 826{
678 if (unlikely(error)) 827 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 828 struct address_space *mapping = file->f_mapping;
829 struct inode *inode = mapping->host;
830 struct xfs_inode *ip = XFS_I(inode);
831 ssize_t ret;
832 int iolock;
833 size_t ocount = 0;
680 834
681 /* We can write back this queue in page reclaim */ 835 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 836
684 if ((ioflags & IO_ISDIRECT)) { 837 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 838
694 if (need_i_mutex) { 839 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 840 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 841 return ret;
697 mutex_unlock(&inode->i_mutex);
698 842
699 iolock = XFS_IOLOCK_SHARED; 843 if (ocount == 0)
700 need_i_mutex = 0; 844 return 0;
701 }
702 845
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 846 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 847
707 /* 848 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 849 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 850
714 pos += ret; 851 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 852 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
853 ocount, &iolock);
854 else
855 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
856 ocount, &iolock);
716 857
717 ioflags &= ~IO_ISDIRECT; 858 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 859
725write_retry: 860 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 861 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 862
743 current->backing_dev_info = NULL; 863 /* Handle various SYNC-type writes */
864 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
865 loff_t end = pos + ret - 1;
866 int error, error2;
744 867
745 isize = i_size_read(inode); 868 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 869 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 870 xfs_rw_ilock(ip, iolock);
748 871
749 if (iocb->ki_pos > ip->i_size) { 872 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 873 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 874 if (error)
752 ip->i_size = iocb->ki_pos; 875 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 876 else if (error2)
877 ret = error2;
754 } 878 }
755 879
756 error = -ret; 880out_unlock:
757 if (ret <= 0) 881 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 882 xfs_rw_iunlock(ip, iolock);
883 return ret;
884}
759 885
760 XFS_STATS_ADD(xs_write_bytes, ret); 886STATIC long
887xfs_file_fallocate(
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
892{
893 struct inode *inode = file->f_path.dentry->d_inode;
894 long error;
895 loff_t new_size = 0;
896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP;
899 int attr_flags = XFS_ATTR_NOLOCK;
761 900
762 /* Handle various SYNC-type writes */ 901 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 902 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 903
767 xfs_iunlock(ip, iolock); 904 bf.l_whence = 0;
768 if (need_i_mutex) 905 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 906 bf.l_len = len;
770 907
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 908 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 909
778 error2 = -xfs_file_fsync(file, 910 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 911 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 912
781 error = error2; 913 /* check the new inode size is valid before allocating */
914 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
915 offset + len > i_size_read(inode)) {
916 new_size = offset + len;
917 error = inode_newsize_ok(inode, new_size);
918 if (error)
919 goto out_unlock;
782 } 920 }
783 921
784 out_unlock_internal: 922 if (file->f_flags & O_DSYNC)
785 if (ip->i_new_size) { 923 attr_flags |= XFS_ATTR_SYNC;
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 924
787 ip->i_new_size = 0; 925 error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
788 /* 926 if (error)
789 * If this was a direct or synchronous I/O that failed (such 927 goto out_unlock;
790 * as ENOSPC) then part of the I/O may have been written to 928
 791 * disk before the error occurred. In this case the on-disk 929 /* Change file size if needed */
792 * file size may have been adjusted beyond the in-memory file 930 if (new_size) {
793 * size and now needs to be truncated back. 931 struct iattr iattr;
794 */ 932
795 if (ip->i_d.di_size > ip->i_size) 933 iattr.ia_valid = ATTR_SIZE;
796 ip->i_d.di_size = ip->i_size; 934 iattr.ia_size = new_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL); 935 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
798 } 936 }
799 xfs_iunlock(ip, iolock); 937
800 out_unlock_mutex: 938out_unlock:
801 if (need_i_mutex) 939 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 940 return error;
803 return -error;
804} 941}
805 942
943
806STATIC int 944STATIC int
807xfs_file_open( 945xfs_file_open(
808 struct inode *inode, 946 struct inode *inode,
@@ -921,6 +1059,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1059 .open = xfs_file_open,
922 .release = xfs_file_release, 1060 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1061 .fsync = xfs_file_fsync,
1062 .fallocate = xfs_file_fallocate,
924}; 1063};
925 1064
926const struct file_operations xfs_dir_file_operations = { 1065const struct file_operations xfs_dir_file_operations = {
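The rewritten write path above splits direct and buffered writes, and the direct IO case picks its iolock up front: unaligned IO, cached pages, or writing beyond EOF all force IOLOCK_EXCL, while everything else runs under IOLOCK_SHARED (and the exclusive lock is demoted back to shared once the page cache has been flushed, unless the IO is unaligned). A small standalone model of that decision, illustrative names only:

/* Illustrative model of the iolock choice in xfs_file_dio_aio_write(). */
#include <stdbool.h>
#include <stddef.h>

enum iolock { IOLOCK_SHARED, IOLOCK_EXCL };

/*
 * Unaligned IO needs sub-block zeroing in the dio layer, cached pages
 * need flushing, and writes beyond EOF need the gap zeroed -- all of
 * which require exclusive access.  Everything else can run shared and
 * therefore in parallel with reads and other direct IO writes.
 */
static enum iolock dio_write_iolock(unsigned long long pos, size_t count,
				    unsigned long long blockmask,
				    bool cached_pages, bool beyond_eof)
{
	bool unaligned = (pos & blockmask) || ((pos + count) & blockmask);

	if (unaligned || cached_pages || beyond_eof)
		return IOLOCK_EXCL;
	return IOLOCK_SHARED;
}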
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9e392e..acca2c5ca3fa 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -623,6 +624,10 @@ xfs_ioc_space(
623 624
624 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
625 attr_flags |= XFS_ATTR_NONBLOCK; 626 attr_flags |= XFS_ATTR_NONBLOCK;
627
628 if (filp->f_flags & O_DSYNC)
629 attr_flags |= XFS_ATTR_SYNC;
630
626 if (ioflags & IO_INVIS) 631 if (ioflags & IO_INVIS)
627 attr_flags |= XFS_ATTR_DMI; 632 attr_flags |= XFS_ATTR_DMI;
628 633
@@ -694,14 +699,19 @@ xfs_ioc_fsgeometry_v1(
694 xfs_mount_t *mp, 699 xfs_mount_t *mp,
695 void __user *arg) 700 void __user *arg)
696{ 701{
697 xfs_fsop_geom_v1_t fsgeo; 702 xfs_fsop_geom_t fsgeo;
698 int error; 703 int error;
699 704
700 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); 705 error = xfs_fs_geometry(mp, &fsgeo, 3);
701 if (error) 706 if (error)
702 return -error; 707 return -error;
703 708
704 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 709 /*
710 * Caller should have passed an argument of type
711 * xfs_fsop_geom_v1_t. This is a proper subset of the
712 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
713 */
714 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
705 return -XFS_ERROR(EFAULT); 715 return -XFS_ERROR(EFAULT);
706 return 0; 716 return 0;
707} 717}
@@ -984,10 +994,22 @@ xfs_ioctl_setattr(
984 994
985 /* 995 /*
986 * Extent size must be a multiple of the appropriate block 996 * Extent size must be a multiple of the appropriate block
987 * size, if set at all. 997 * size, if set at all. It must also be smaller than the
998 * maximum extent size supported by the filesystem.
999 *
1000 * Also, for non-realtime files, limit the extent size hint to
1001 * half the size of the AGs in the filesystem so alignment
1002 * doesn't result in extents larger than an AG.
988 */ 1003 */
989 if (fa->fsx_extsize != 0) { 1004 if (fa->fsx_extsize != 0) {
990 xfs_extlen_t size; 1005 xfs_extlen_t size;
1006 xfs_fsblock_t extsize_fsb;
1007
1008 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1009 if (extsize_fsb > MAXEXTLEN) {
1010 code = XFS_ERROR(EINVAL);
1011 goto error_return;
1012 }
991 1013
992 if (XFS_IS_REALTIME_INODE(ip) || 1014 if (XFS_IS_REALTIME_INODE(ip) ||
993 ((mask & FSX_XFLAGS) && 1015 ((mask & FSX_XFLAGS) &&
@@ -996,6 +1018,10 @@ xfs_ioctl_setattr(
996 mp->m_sb.sb_blocklog; 1018 mp->m_sb.sb_blocklog;
997 } else { 1019 } else {
998 size = mp->m_sb.sb_blocksize; 1020 size = mp->m_sb.sb_blocksize;
1021 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1022 code = XFS_ERROR(EINVAL);
1023 goto error_return;
1024 }
999 } 1025 }
1000 1026
1001 if (fa->fsx_extsize % size) { 1027 if (fa->fsx_extsize % size) {
@@ -1294,6 +1320,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1320 trace_xfs_file_ioctl(ip);
1295 1321
1296 switch (cmd) { 1322 switch (cmd) {
1323 case FITRIM:
1324 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1325 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1326 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1327 case XFS_IOC_RESVSP:
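
The new FITRIM case dispatches batched discard requests to xfs_ioc_trim(). A minimal userspace sketch of how the ioctl might be issued is shown below; the mount-point argument and the choice to trim the whole filesystem are illustrative, and the byte count reported back in range.len is only meaningful when the call succeeds.

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        struct fstrim_range range;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        memset(&range, 0, sizeof(range));
        range.start = 0;
        range.len = (__u64)-1;  /* trim the whole filesystem */
        range.minlen = 0;       /* let the filesystem pick a minimum extent */
        if (ioctl(fd, FITRIM, &range) < 0)
                perror("FITRIM");
        else
                printf("trimmed %llu bytes\n",
                       (unsigned long long)range.len);
        close(fd);
        return 0;
}
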
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 94d5fd6a2973..dd21784525a8 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -71,7 +70,7 @@ xfs_synchronize_times(
71 70
72/* 71/*
73 * If the linux inode is valid, mark it dirty. 72 * If the linux inode is valid, mark it dirty.
74 * Used when commiting a dirty inode into a transaction so that 73 * Used when committing a dirty inode into a transaction so that
75 * the inode will get written back by the linux code 74 * the inode will get written back by the linux code
76 */ 75 */
77void 76void
@@ -103,7 +102,8 @@ xfs_mark_inode_dirty(
103STATIC int 102STATIC int
104xfs_init_security( 103xfs_init_security(
105 struct inode *inode, 104 struct inode *inode,
106 struct inode *dir) 105 struct inode *dir,
106 const struct qstr *qstr)
107{ 107{
108 struct xfs_inode *ip = XFS_I(inode); 108 struct xfs_inode *ip = XFS_I(inode);
109 size_t length; 109 size_t length;
@@ -111,7 +111,7 @@ xfs_init_security(
111 unsigned char *name; 111 unsigned char *name;
112 int error; 112 int error;
113 113
114 error = security_inode_init_security(inode, dir, (char **)&name, 114 error = security_inode_init_security(inode, dir, qstr, (char **)&name,
115 &value, &length); 115 &value, &length);
116 if (error) { 116 if (error) {
117 if (error == -EOPNOTSUPP) 117 if (error == -EOPNOTSUPP)
@@ -195,7 +195,7 @@ xfs_vn_mknod(
195 195
196 inode = VFS_I(ip); 196 inode = VFS_I(ip);
197 197
198 error = xfs_init_security(inode, dir); 198 error = xfs_init_security(inode, dir, &dentry->d_name);
199 if (unlikely(error)) 199 if (unlikely(error))
200 goto out_cleanup_inode; 200 goto out_cleanup_inode;
201 201
@@ -368,7 +368,7 @@ xfs_vn_symlink(
368 368
369 inode = VFS_I(cip); 369 inode = VFS_I(cip);
370 370
371 error = xfs_init_security(inode, dir); 371 error = xfs_init_security(inode, dir, &dentry->d_name);
372 if (unlikely(error)) 372 if (unlikely(error))
373 goto out_cleanup_inode; 373 goto out_cleanup_inode;
374 374
@@ -505,58 +505,6 @@ xfs_vn_setattr(
505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
506} 506}
507 507
508STATIC long
509xfs_vn_fallocate(
510 struct inode *inode,
511 int mode,
512 loff_t offset,
513 loff_t len)
514{
515 long error;
516 loff_t new_size = 0;
517 xfs_flock64_t bf;
518 xfs_inode_t *ip = XFS_I(inode);
519
520 /* preallocation on directories not yet supported */
521 error = -ENODEV;
522 if (S_ISDIR(inode->i_mode))
523 goto out_error;
524
525 bf.l_whence = 0;
526 bf.l_start = offset;
527 bf.l_len = len;
528
529 xfs_ilock(ip, XFS_IOLOCK_EXCL);
530
531 /* check the new inode size is valid before allocating */
532 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
533 offset + len > i_size_read(inode)) {
534 new_size = offset + len;
535 error = inode_newsize_ok(inode, new_size);
536 if (error)
537 goto out_unlock;
538 }
539
540 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
541 0, XFS_ATTR_NOLOCK);
542 if (error)
543 goto out_unlock;
544
545 /* Change file size if needed */
546 if (new_size) {
547 struct iattr iattr;
548
549 iattr.ia_valid = ATTR_SIZE;
550 iattr.ia_size = new_size;
551 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
552 }
553
554out_unlock:
555 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
556out_error:
557 return error;
558}
559
560#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 508#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
561 509
562/* 510/*
@@ -650,7 +598,6 @@ static const struct inode_operations xfs_inode_operations = {
650 .getxattr = generic_getxattr, 598 .getxattr = generic_getxattr,
651 .removexattr = generic_removexattr, 599 .removexattr = generic_removexattr,
652 .listxattr = xfs_vn_listxattr, 600 .listxattr = xfs_vn_listxattr,
653 .fallocate = xfs_vn_fallocate,
654 .fiemap = xfs_vn_fiemap, 601 .fiemap = xfs_vn_fiemap,
655}; 602};
656 603
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff79..244be9cbfe78 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,10 +37,8 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h>
44#include <support/uuid.h> 42#include <support/uuid.h>
45 43
46#include <linux/semaphore.h> 44#include <linux/semaphore.h>
@@ -87,6 +85,7 @@
87#include <xfs_aops.h> 85#include <xfs_aops.h>
88#include <xfs_super.h> 86#include <xfs_super.h>
89#include <xfs_buf.h> 87#include <xfs_buf.h>
88#include <xfs_message.h>
90 89
91/* 90/*
92 * Feature macros (disable/enable) 91 * Feature macros (disable/enable)
@@ -281,4 +280,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
281#define __arch_pack 280#define __arch_pack
282#endif 281#endif
283 282
283#define ASSERT_ALWAYS(expr) \
284 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
285
286#ifndef DEBUG
287#define ASSERT(expr) ((void)0)
288
289#ifndef STATIC
290# define STATIC static noinline
291#endif
292
293#else /* DEBUG */
294
295#define ASSERT(expr) \
296 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
297
298#ifndef STATIC
299# define STATIC noinline
300#endif
301
302#endif /* DEBUG */
303
284#endif /* __XFS_LINUX__ */ 304#endif /* __XFS_LINUX__ */
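
With support/debug.h gone, the ASSERT/ASSERT_ALWAYS macros now live in xfs_linux.h and ASSERT compiles down to nothing unless DEBUG is defined. The userspace sketch below mirrors that compile-away pattern; the MY_* names and the my_assfail() helper are invented for the example, and the kernel's unlikely() annotation is dropped for portability.

#include <stdio.h>
#include <stdlib.h>

static void my_assfail(const char *expr, const char *file, int line)
{
        fprintf(stderr, "Assertion failed: %s, file: %s, line: %d\n",
                expr, file, line);
        abort();
}

#define MY_ASSERT_ALWAYS(expr) \
        ((expr) ? (void)0 : my_assfail(#expr, __FILE__, __LINE__))

#ifdef DEBUG
# define MY_ASSERT(expr)  MY_ASSERT_ALWAYS(expr)
#else
# define MY_ASSERT(expr)  ((void)0)
#endif

int main(void)
{
        MY_ASSERT(1 + 1 == 2);        /* compiled out unless built with -DDEBUG */
        MY_ASSERT_ALWAYS(2 + 2 == 4); /* always checked */
        printf("assertions passed\n");
        return 0;
}
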
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 000000000000..9f76cceb678d
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,126 @@
1/*
2 * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27
28/*
29 * XFS logging functions
30 */
31static void
32__xfs_printk(
33 const char *level,
34 const struct xfs_mount *mp,
35 struct va_format *vaf)
36{
37 if (mp && mp->m_fsname) {
38 printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
39 return;
40 }
41 printk("%sXFS: %pV\n", level, vaf);
42}
43
44void xfs_printk(
45 const char *level,
46 const struct xfs_mount *mp,
47 const char *fmt, ...)
48{
49 struct va_format vaf;
50 va_list args;
51
52 va_start(args, fmt);
53
54 vaf.fmt = fmt;
55 vaf.va = &args;
56
57 __xfs_printk(level, mp, &vaf);
58 va_end(args);
59}
60
61#define define_xfs_printk_level(func, kern_level) \
62void func(const struct xfs_mount *mp, const char *fmt, ...) \
63{ \
64 struct va_format vaf; \
65 va_list args; \
66 \
67 va_start(args, fmt); \
68 \
69 vaf.fmt = fmt; \
70 vaf.va = &args; \
71 \
72 __xfs_printk(kern_level, mp, &vaf); \
73 va_end(args); \
74} \
75
76define_xfs_printk_level(xfs_emerg, KERN_EMERG);
77define_xfs_printk_level(xfs_alert, KERN_ALERT);
78define_xfs_printk_level(xfs_crit, KERN_CRIT);
79define_xfs_printk_level(xfs_err, KERN_ERR);
80define_xfs_printk_level(xfs_warn, KERN_WARNING);
81define_xfs_printk_level(xfs_notice, KERN_NOTICE);
82define_xfs_printk_level(xfs_info, KERN_INFO);
83#ifdef DEBUG
84define_xfs_printk_level(xfs_debug, KERN_DEBUG);
85#endif
86
87void
88xfs_alert_tag(
89 const struct xfs_mount *mp,
90 int panic_tag,
91 const char *fmt, ...)
92{
93 struct va_format vaf;
94 va_list args;
95 int do_panic = 0;
96
97 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
98 xfs_printk(KERN_ALERT, mp,
99 "XFS: Transforming an alert into a BUG.");
100 do_panic = 1;
101 }
102
103 va_start(args, fmt);
104
105 vaf.fmt = fmt;
106 vaf.va = &args;
107
108 __xfs_printk(KERN_ALERT, mp, &vaf);
109 va_end(args);
110
111 BUG_ON(do_panic);
112}
113
114void
115assfail(char *expr, char *file, int line)
116{
117 xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
118 expr, file, line);
119 BUG();
120}
121
122void
123xfs_hex_dump(void *p, int length)
124{
125 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
126}
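
The new xfs_message.c centralises logging behind a single __xfs_printk() helper and generates one thin wrapper per severity with a macro. The kernel version forwards the va_list through struct va_format and printk's %pV extension; since there is no userspace equivalent, the sketch below approximates the same wrapper-generation pattern with vfprintf(). All demo_* names and the numeric level prefixes are illustrative only.

#include <stdarg.h>
#include <stdio.h>

/* Userspace stand-in for __xfs_printk(): prefix with level and fs name. */
static void demo_printk(const char *level, const char *fsname,
                        const char *fmt, va_list args)
{
        if (fsname)
                fprintf(stderr, "%sXFS (%s): ", level, fsname);
        else
                fprintf(stderr, "%sXFS: ", level);
        vfprintf(stderr, fmt, args);
        fputc('\n', stderr);
}

/* Generate one variadic wrapper per log level, as define_xfs_printk_level does. */
#define define_demo_level(func, level)                          \
static void func(const char *fsname, const char *fmt, ...)     \
{                                                               \
        va_list args;                                           \
        va_start(args, fmt);                                    \
        demo_printk(level, fsname, fmt, args);                  \
        va_end(args);                                           \
}

define_demo_level(demo_warn, "<4>")
define_demo_level(demo_info, "<6>")

int main(void)
{
        demo_warn("sda1", "%s option requires an argument", "logbufs");
        demo_info(NULL, "mounted with %d log buffers", 8);
        return 0;
}
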
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 000000000000..f1b3fc1b6c4e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,40 @@
1#ifndef __XFS_MESSAGE_H
2#define __XFS_MESSAGE_H 1
3
4struct xfs_mount;
5
6extern void xfs_printk(const char *level, const struct xfs_mount *mp,
7 const char *fmt, ...)
8 __attribute__ ((format (printf, 3, 4)));
9extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
10 __attribute__ ((format (printf, 2, 3)));
11extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
12 __attribute__ ((format (printf, 2, 3)));
13extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
14 const char *fmt, ...)
15 __attribute__ ((format (printf, 3, 4)));
16extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
17 __attribute__ ((format (printf, 2, 3)));
18extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
19 __attribute__ ((format (printf, 2, 3)));
20extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
21 __attribute__ ((format (printf, 2, 3)));
22extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
23 __attribute__ ((format (printf, 2, 3)));
24extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
25 __attribute__ ((format (printf, 2, 3)));
26
27#ifdef DEBUG
28extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
29 __attribute__ ((format (printf, 2, 3)));
30#else
31static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
32{
33}
34#endif
35
36extern void assfail(char *expr, char *f, int l);
37
38extern void xfs_hex_dump(void *p, int length);
39
40#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 064f964d4f3c..b38e58d02299 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -173,6 +173,15 @@ xfs_parseargs(
173 __uint8_t iosizelog = 0; 173 __uint8_t iosizelog = 0;
174 174
175 /* 175 /*
176 * set up the mount name first so all the errors will refer to the
177 * correct device.
178 */
179 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
180 if (!mp->m_fsname)
181 return ENOMEM;
182 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
183
184 /*
176 * Copy binary VFS mount flags we are interested in. 185 * Copy binary VFS mount flags we are interested in.
177 */ 186 */
178 if (sb->s_flags & MS_RDONLY) 187 if (sb->s_flags & MS_RDONLY)
@@ -189,6 +198,7 @@ xfs_parseargs(
189 mp->m_flags |= XFS_MOUNT_BARRIER; 198 mp->m_flags |= XFS_MOUNT_BARRIER;
190 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 199 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
191 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 200 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
201 mp->m_flags |= XFS_MOUNT_DELAYLOG;
192 202
193 /* 203 /*
194 * These can be overridden by the mount option parsing. 204 * These can be overridden by the mount option parsing.
@@ -207,24 +217,21 @@ xfs_parseargs(
207 217
208 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 218 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
209 if (!value || !*value) { 219 if (!value || !*value) {
210 cmn_err(CE_WARN, 220 xfs_warn(mp, "%s option requires an argument",
211 "XFS: %s option requires an argument",
212 this_char); 221 this_char);
213 return EINVAL; 222 return EINVAL;
214 } 223 }
215 mp->m_logbufs = simple_strtoul(value, &eov, 10); 224 mp->m_logbufs = simple_strtoul(value, &eov, 10);
216 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
217 if (!value || !*value) { 226 if (!value || !*value) {
218 cmn_err(CE_WARN, 227 xfs_warn(mp, "%s option requires an argument",
219 "XFS: %s option requires an argument",
220 this_char); 228 this_char);
221 return EINVAL; 229 return EINVAL;
222 } 230 }
223 mp->m_logbsize = suffix_strtoul(value, &eov, 10); 231 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
224 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 232 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
225 if (!value || !*value) { 233 if (!value || !*value) {
226 cmn_err(CE_WARN, 234 xfs_warn(mp, "%s option requires an argument",
227 "XFS: %s option requires an argument",
228 this_char); 235 this_char);
229 return EINVAL; 236 return EINVAL;
230 } 237 }
@@ -232,14 +239,12 @@ xfs_parseargs(
232 if (!mp->m_logname) 239 if (!mp->m_logname)
233 return ENOMEM; 240 return ENOMEM;
234 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 241 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
235 cmn_err(CE_WARN, 242 xfs_warn(mp, "%s option not allowed on this system",
236 "XFS: %s option not allowed on this system",
237 this_char); 243 this_char);
238 return EINVAL; 244 return EINVAL;
239 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 245 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
240 if (!value || !*value) { 246 if (!value || !*value) {
241 cmn_err(CE_WARN, 247 xfs_warn(mp, "%s option requires an argument",
242 "XFS: %s option requires an argument",
243 this_char); 248 this_char);
244 return EINVAL; 249 return EINVAL;
245 } 250 }
@@ -248,8 +253,7 @@ xfs_parseargs(
248 return ENOMEM; 253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 254 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 xfs_warn(mp, "%s option requires an argument",
252 "XFS: %s option requires an argument",
253 this_char); 257 this_char);
254 return EINVAL; 258 return EINVAL;
255 } 259 }
@@ -257,8 +261,7 @@ xfs_parseargs(
257 iosizelog = ffs(iosize) - 1; 261 iosizelog = ffs(iosize) - 1;
258 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 262 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
259 if (!value || !*value) { 263 if (!value || !*value) {
260 cmn_err(CE_WARN, 264 xfs_warn(mp, "%s option requires an argument",
261 "XFS: %s option requires an argument",
262 this_char); 265 this_char);
263 return EINVAL; 266 return EINVAL;
264 } 267 }
@@ -280,16 +283,14 @@ xfs_parseargs(
280 mp->m_flags |= XFS_MOUNT_SWALLOC; 283 mp->m_flags |= XFS_MOUNT_SWALLOC;
281 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 284 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
282 if (!value || !*value) { 285 if (!value || !*value) {
283 cmn_err(CE_WARN, 286 xfs_warn(mp, "%s option requires an argument",
284 "XFS: %s option requires an argument",
285 this_char); 287 this_char);
286 return EINVAL; 288 return EINVAL;
287 } 289 }
288 dsunit = simple_strtoul(value, &eov, 10); 290 dsunit = simple_strtoul(value, &eov, 10);
289 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { 291 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
290 if (!value || !*value) { 292 if (!value || !*value) {
291 cmn_err(CE_WARN, 293 xfs_warn(mp, "%s option requires an argument",
292 "XFS: %s option requires an argument",
293 this_char); 294 this_char);
294 return EINVAL; 295 return EINVAL;
295 } 296 }
@@ -297,8 +298,7 @@ xfs_parseargs(
297 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 298 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
298 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 299 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
299#if !XFS_BIG_INUMS 300#if !XFS_BIG_INUMS
300 cmn_err(CE_WARN, 301 xfs_warn(mp, "%s option not allowed on this system",
301 "XFS: %s option not allowed on this system",
302 this_char); 302 this_char);
303 return EINVAL; 303 return EINVAL;
304#endif 304#endif
@@ -356,20 +356,19 @@ xfs_parseargs(
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
358 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
359 cmn_err(CE_WARN, 359 xfs_warn(mp,
360 "XFS: ihashsize no longer used, option is deprecated."); 360 "ihashsize no longer used, option is deprecated.");
361 } else if (!strcmp(this_char, "osyncisdsync")) { 361 } else if (!strcmp(this_char, "osyncisdsync")) {
362 cmn_err(CE_WARN, 362 xfs_warn(mp,
363 "XFS: osyncisdsync has no effect, option is deprecated."); 363 "osyncisdsync has no effect, option is deprecated.");
364 } else if (!strcmp(this_char, "osyncisosync")) { 364 } else if (!strcmp(this_char, "osyncisosync")) {
365 cmn_err(CE_WARN, 365 xfs_warn(mp,
366 "XFS: osyncisosync has no effect, option is deprecated."); 366 "osyncisosync has no effect, option is deprecated.");
367 } else if (!strcmp(this_char, "irixsgid")) { 367 } else if (!strcmp(this_char, "irixsgid")) {
368 cmn_err(CE_WARN, 368 xfs_warn(mp,
369 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 369 "irixsgid is now a sysctl(2) variable, option is deprecated.");
370 } else { 370 } else {
371 cmn_err(CE_WARN, 371 xfs_warn(mp, "unknown mount option [%s].", this_char);
372 "XFS: unknown mount option [%s].", this_char);
373 return EINVAL; 372 return EINVAL;
374 } 373 }
375 } 374 }
@@ -379,40 +378,37 @@ xfs_parseargs(
379 */ 378 */
380 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && 379 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
381 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 380 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
382 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); 381 xfs_warn(mp, "no-recovery mounts must be read-only.");
383 return EINVAL; 382 return EINVAL;
384 } 383 }
385 384
386 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { 385 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
387 cmn_err(CE_WARN, 386 xfs_warn(mp,
388 "XFS: sunit and swidth options incompatible with the noalign option"); 387 "sunit and swidth options incompatible with the noalign option");
389 return EINVAL; 388 return EINVAL;
390 } 389 }
391 390
392#ifndef CONFIG_XFS_QUOTA 391#ifndef CONFIG_XFS_QUOTA
393 if (XFS_IS_QUOTA_RUNNING(mp)) { 392 if (XFS_IS_QUOTA_RUNNING(mp)) {
394 cmn_err(CE_WARN, 393 xfs_warn(mp, "quota support not available in this kernel.");
395 "XFS: quota support not available in this kernel.");
396 return EINVAL; 394 return EINVAL;
397 } 395 }
398#endif 396#endif
399 397
400 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 398 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
401 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 399 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
402 cmn_err(CE_WARN, 400 xfs_warn(mp, "cannot mount with both project and group quota");
403 "XFS: cannot mount with both project and group quota");
404 return EINVAL; 401 return EINVAL;
405 } 402 }
406 403
407 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 404 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
408 cmn_err(CE_WARN, 405 xfs_warn(mp, "sunit and swidth must be specified together");
409 "XFS: sunit and swidth must be specified together");
410 return EINVAL; 406 return EINVAL;
411 } 407 }
412 408
413 if (dsunit && (dswidth % dsunit != 0)) { 409 if (dsunit && (dswidth % dsunit != 0)) {
414 cmn_err(CE_WARN, 410 xfs_warn(mp,
415 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", 411 "stripe width (%d) must be a multiple of the stripe unit (%d)",
416 dswidth, dsunit); 412 dswidth, dsunit);
417 return EINVAL; 413 return EINVAL;
418 } 414 }
@@ -438,8 +434,7 @@ done:
438 mp->m_logbufs != 0 && 434 mp->m_logbufs != 0 &&
439 (mp->m_logbufs < XLOG_MIN_ICLOGS || 435 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
440 mp->m_logbufs > XLOG_MAX_ICLOGS)) { 436 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
441 cmn_err(CE_WARN, 437 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
442 "XFS: invalid logbufs value: %d [not %d-%d]",
443 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); 438 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
444 return XFS_ERROR(EINVAL); 439 return XFS_ERROR(EINVAL);
445 } 440 }
@@ -448,22 +443,16 @@ done:
448 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || 443 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
449 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || 444 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
450 !is_power_of_2(mp->m_logbsize))) { 445 !is_power_of_2(mp->m_logbsize))) {
451 cmn_err(CE_WARN, 446 xfs_warn(mp,
452 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", 447 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
453 mp->m_logbsize); 448 mp->m_logbsize);
454 return XFS_ERROR(EINVAL); 449 return XFS_ERROR(EINVAL);
455 } 450 }
456 451
457 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
458 if (!mp->m_fsname)
459 return ENOMEM;
460 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
461
462 if (iosizelog) { 452 if (iosizelog) {
463 if (iosizelog > XFS_MAX_IO_LOG || 453 if (iosizelog > XFS_MAX_IO_LOG ||
464 iosizelog < XFS_MIN_IO_LOG) { 454 iosizelog < XFS_MIN_IO_LOG) {
465 cmn_err(CE_WARN, 455 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
466 "XFS: invalid log iosize: %d [not %d-%d]",
467 iosizelog, XFS_MIN_IO_LOG, 456 iosizelog, XFS_MIN_IO_LOG,
468 XFS_MAX_IO_LOG); 457 XFS_MAX_IO_LOG);
469 return XFS_ERROR(EINVAL); 458 return XFS_ERROR(EINVAL);
@@ -606,10 +595,11 @@ xfs_blkdev_get(
606{ 595{
607 int error = 0; 596 int error = 0;
608 597
609 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 598 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
599 mp);
610 if (IS_ERR(*bdevp)) { 600 if (IS_ERR(*bdevp)) {
611 error = PTR_ERR(*bdevp); 601 error = PTR_ERR(*bdevp);
612 printk("XFS: Invalid device [%s], error=%d\n", name, error); 602 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
613 } 603 }
614 604
615 return -error; 605 return -error;
@@ -620,7 +610,7 @@ xfs_blkdev_put(
620 struct block_device *bdev) 610 struct block_device *bdev)
621{ 611{
622 if (bdev) 612 if (bdev)
623 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 613 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
624} 614}
625 615
626/* 616/*
@@ -663,23 +653,23 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
663 int error; 653 int error;
664 654
665 if (mp->m_logdev_targp != mp->m_ddev_targp) { 655 if (mp->m_logdev_targp != mp->m_ddev_targp) {
666 xfs_fs_cmn_err(CE_NOTE, mp, 656 xfs_notice(mp,
667 "Disabling barriers, not supported with external log device"); 657 "Disabling barriers, not supported with external log device");
668 mp->m_flags &= ~XFS_MOUNT_BARRIER; 658 mp->m_flags &= ~XFS_MOUNT_BARRIER;
669 return; 659 return;
670 } 660 }
671 661
672 if (xfs_readonly_buftarg(mp->m_ddev_targp)) { 662 if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
673 xfs_fs_cmn_err(CE_NOTE, mp, 663 xfs_notice(mp,
674 "Disabling barriers, underlying device is readonly"); 664 "Disabling barriers, underlying device is readonly");
675 mp->m_flags &= ~XFS_MOUNT_BARRIER; 665 mp->m_flags &= ~XFS_MOUNT_BARRIER;
676 return; 666 return;
677 } 667 }
678 668
679 error = xfs_barrier_test(mp); 669 error = xfs_barrier_test(mp);
680 if (error) { 670 if (error) {
681 xfs_fs_cmn_err(CE_NOTE, mp, 671 xfs_notice(mp,
682 "Disabling barriers, trial barrier write failed"); 672 "Disabling barriers, trial barrier write failed");
683 mp->m_flags &= ~XFS_MOUNT_BARRIER; 673 mp->m_flags &= ~XFS_MOUNT_BARRIER;
684 return; 674 return;
685 } 675 }
@@ -742,8 +732,8 @@ xfs_open_devices(
742 goto out_close_logdev; 732 goto out_close_logdev;
743 733
744 if (rtdev == ddev || rtdev == logdev) { 734 if (rtdev == ddev || rtdev == logdev) {
745 cmn_err(CE_WARN, 735 xfs_warn(mp,
746 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); 736 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
747 error = EINVAL; 737 error = EINVAL;
748 goto out_close_rtdev; 738 goto out_close_rtdev;
749 } 739 }
@@ -826,63 +816,6 @@ xfs_setup_devices(
826 return 0; 816 return 0;
827} 817}
828 818
829/*
830 * XFS AIL push thread support
831 */
832void
833xfsaild_wakeup(
834 struct xfs_ail *ailp,
835 xfs_lsn_t threshold_lsn)
836{
837 ailp->xa_target = threshold_lsn;
838 wake_up_process(ailp->xa_task);
839}
840
841STATIC int
842xfsaild(
843 void *data)
844{
845 struct xfs_ail *ailp = data;
846 xfs_lsn_t last_pushed_lsn = 0;
847 long tout = 0; /* milliseconds */
848
849 while (!kthread_should_stop()) {
850 schedule_timeout_interruptible(tout ?
851 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
852
853 /* swsusp */
854 try_to_freeze();
855
856 ASSERT(ailp->xa_mount->m_log);
857 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
858 continue;
859
860 tout = xfsaild_push(ailp, &last_pushed_lsn);
861 }
862
863 return 0;
864} /* xfsaild */
865
866int
867xfsaild_start(
868 struct xfs_ail *ailp)
869{
870 ailp->xa_target = 0;
871 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
872 ailp->xa_mount->m_fsname);
873 if (IS_ERR(ailp->xa_task))
874 return -PTR_ERR(ailp->xa_task);
875 return 0;
876}
877
878void
879xfsaild_stop(
880 struct xfs_ail *ailp)
881{
882 kthread_stop(ailp->xa_task);
883}
884
885
886/* Catch misguided souls that try to use this interface on XFS */ 819/* Catch misguided souls that try to use this interface on XFS */
887STATIC struct inode * 820STATIC struct inode *
888xfs_fs_alloc_inode( 821xfs_fs_alloc_inode(
@@ -935,7 +868,7 @@ out_reclaim:
935 * Slab object creation initialisation for the XFS inode. 868 * Slab object creation initialisation for the XFS inode.
936 * This covers only the idempotent fields in the XFS inode; 869 * This covers only the idempotent fields in the XFS inode;
937 * all other fields need to be initialised on allocation 870 * all other fields need to be initialised on allocation
938 * from the slab. This avoids the need to repeatedly intialise 871 * from the slab. This avoids the need to repeatedly initialise
939 * fields in the xfs inode that left in the initialise state 872 * fields in the xfs inode that left in the initialise state
940 * when freeing the inode. 873 * when freeing the inode.
941 */ 874 */
@@ -1076,7 +1009,7 @@ xfs_fs_write_inode(
1076 error = 0; 1009 error = 0;
1077 goto out_unlock; 1010 goto out_unlock;
1078 } 1011 }
1079 error = xfs_iflush(ip, 0); 1012 error = xfs_iflush(ip, SYNC_TRYLOCK);
1080 } 1013 }
1081 1014
1082 out_unlock: 1015 out_unlock:
@@ -1118,6 +1051,8 @@ xfs_fs_evict_inode(
1118 */ 1051 */
1119 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1052 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1120 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1053 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1054 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1055 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1121 1056
1122 xfs_inactive(ip); 1057 xfs_inactive(ip);
1123} 1058}
@@ -1187,22 +1122,12 @@ xfs_fs_sync_fs(
1187 return -error; 1122 return -error;
1188 1123
1189 if (laptop_mode) { 1124 if (laptop_mode) {
1190 int prev_sync_seq = mp->m_sync_seq;
1191
1192 /* 1125 /*
1193 * The disk must be active because we're syncing. 1126 * The disk must be active because we're syncing.
1194 * We schedule xfssyncd now (now that the disk is 1127 * We schedule xfssyncd now (now that the disk is
1195 * active) instead of later (when it might not be). 1128 * active) instead of later (when it might not be).
1196 */ 1129 */
1197 wake_up_process(mp->m_sync_task); 1130 flush_delayed_work_sync(&mp->m_sync_work);
1198 /*
1199 * We have to wait for the sync iteration to complete.
1200 * If we don't, the disk activity caused by the sync
1201 * will come after the sync is completed, and that
1202 * triggers another sync from laptop mode.
1203 */
1204 wait_event(mp->m_wait_single_sync_task,
1205 mp->m_sync_seq != prev_sync_seq);
1206 } 1131 }
1207 1132
1208 return 0; 1133 return 0;
@@ -1330,8 +1255,8 @@ xfs_fs_remount(
1330 * options that we can't actually change. 1255 * options that we can't actually change.
1331 */ 1256 */
1332#if 0 1257#if 0
1333 printk(KERN_INFO 1258 xfs_info(mp,
1334 "XFS: mount option \"%s\" not supported for remount\n", p); 1259 "mount option \"%s\" not supported for remount\n", p);
1335 return -EINVAL; 1260 return -EINVAL;
1336#else 1261#else
1337 break; 1262 break;
@@ -1352,8 +1277,7 @@ xfs_fs_remount(
1352 if (mp->m_update_flags) { 1277 if (mp->m_update_flags) {
1353 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1278 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1354 if (error) { 1279 if (error) {
1355 cmn_err(CE_WARN, 1280 xfs_warn(mp, "failed to write sb changes");
1356 "XFS: failed to write sb changes");
1357 return error; 1281 return error;
1358 } 1282 }
1359 mp->m_update_flags = 0; 1283 mp->m_update_flags = 0;
@@ -1399,7 +1323,7 @@ xfs_fs_freeze(
1399 1323
1400 xfs_save_resvblks(mp); 1324 xfs_save_resvblks(mp);
1401 xfs_quiesce_attr(mp); 1325 xfs_quiesce_attr(mp);
1402 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1326 return -xfs_fs_log_dummy(mp);
1403} 1327}
1404 1328
1405STATIC int 1329STATIC int
@@ -1437,15 +1361,15 @@ xfs_finish_flags(
1437 mp->m_logbsize = mp->m_sb.sb_logsunit; 1361 mp->m_logbsize = mp->m_sb.sb_logsunit;
1438 } else if (mp->m_logbsize > 0 && 1362 } else if (mp->m_logbsize > 0 &&
1439 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1363 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1440 cmn_err(CE_WARN, 1364 xfs_warn(mp,
1441 "XFS: logbuf size must be greater than or equal to log stripe size"); 1365 "logbuf size must be greater than or equal to log stripe size");
1442 return XFS_ERROR(EINVAL); 1366 return XFS_ERROR(EINVAL);
1443 } 1367 }
1444 } else { 1368 } else {
1445 /* Fail a mount if the logbuf is larger than 32K */ 1369 /* Fail a mount if the logbuf is larger than 32K */
1446 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1370 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1447 cmn_err(CE_WARN, 1371 xfs_warn(mp,
1448 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1372 "logbuf size for version 1 logs must be 16K or 32K");
1449 return XFS_ERROR(EINVAL); 1373 return XFS_ERROR(EINVAL);
1450 } 1374 }
1451 } 1375 }
@@ -1462,8 +1386,8 @@ xfs_finish_flags(
1462 * prohibit r/w mounts of read-only filesystems 1386 * prohibit r/w mounts of read-only filesystems
1463 */ 1387 */
1464 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1388 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1465 cmn_err(CE_WARN, 1389 xfs_warn(mp,
1466 "XFS: cannot mount a read-only filesystem as read-write"); 1390 "cannot mount a read-only filesystem as read-write");
1467 return XFS_ERROR(EROFS); 1391 return XFS_ERROR(EROFS);
1468 } 1392 }
1469 1393
@@ -1487,9 +1411,6 @@ xfs_fs_fill_super(
1487 spin_lock_init(&mp->m_sb_lock); 1411 spin_lock_init(&mp->m_sb_lock);
1488 mutex_init(&mp->m_growlock); 1412 mutex_init(&mp->m_growlock);
1489 atomic_set(&mp->m_active_trans, 0); 1413 atomic_set(&mp->m_active_trans, 0);
1490 INIT_LIST_HEAD(&mp->m_sync_list);
1491 spin_lock_init(&mp->m_sync_lock);
1492 init_waitqueue_head(&mp->m_wait_single_sync_task);
1493 1414
1494 mp->m_super = sb; 1415 mp->m_super = sb;
1495 sb->s_fs_info = mp; 1416 sb->s_fs_info = mp;
@@ -1536,10 +1457,14 @@ xfs_fs_fill_super(
1536 if (error) 1457 if (error)
1537 goto out_free_sb; 1458 goto out_free_sb;
1538 1459
1539 error = xfs_mountfs(mp); 1460 /*
1540 if (error) 1461 * we must configure the block size in the superblock before we run the
1541 goto out_filestream_unmount; 1462 * full mount process as the mount process can lookup and cache inodes.
1542 1463 * For the same reason we must also initialise the syncd and register
1464 * the inode cache shrinker so that inodes can be reclaimed during
1465 * operations like a quotacheck that iterate all inodes in the
1466 * filesystem.
1467 */
1543 sb->s_magic = XFS_SB_MAGIC; 1468 sb->s_magic = XFS_SB_MAGIC;
1544 sb->s_blocksize = mp->m_sb.sb_blocksize; 1469 sb->s_blocksize = mp->m_sb.sb_blocksize;
1545 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1470 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1547,6 +1472,16 @@ xfs_fs_fill_super(
1547 sb->s_time_gran = 1; 1472 sb->s_time_gran = 1;
1548 set_posix_acl_flag(sb); 1473 set_posix_acl_flag(sb);
1549 1474
1475 error = xfs_syncd_init(mp);
1476 if (error)
1477 goto out_filestream_unmount;
1478
1479 xfs_inode_shrinker_register(mp);
1480
1481 error = xfs_mountfs(mp);
1482 if (error)
1483 goto out_syncd_stop;
1484
1550 root = igrab(VFS_I(mp->m_rootip)); 1485 root = igrab(VFS_I(mp->m_rootip));
1551 if (!root) { 1486 if (!root) {
1552 error = ENOENT; 1487 error = ENOENT;
@@ -1562,14 +1497,11 @@ xfs_fs_fill_super(
1562 goto fail_vnrele; 1497 goto fail_vnrele;
1563 } 1498 }
1564 1499
1565 error = xfs_syncd_init(mp);
1566 if (error)
1567 goto fail_vnrele;
1568
1569 xfs_inode_shrinker_register(mp);
1570
1571 return 0; 1500 return 0;
1572 1501
1502 out_syncd_stop:
1503 xfs_inode_shrinker_unregister(mp);
1504 xfs_syncd_stop(mp);
1573 out_filestream_unmount: 1505 out_filestream_unmount:
1574 xfs_filestream_unmount(mp); 1506 xfs_filestream_unmount(mp);
1575 out_free_sb: 1507 out_free_sb:
@@ -1593,6 +1525,9 @@ xfs_fs_fill_super(
1593 } 1525 }
1594 1526
1595 fail_unmount: 1527 fail_unmount:
1528 xfs_inode_shrinker_unregister(mp);
1529 xfs_syncd_stop(mp);
1530
1596 /* 1531 /*
1597 * Blow away any referenced inode in the filestreams cache. 1532 * Blow away any referenced inode in the filestreams cache.
1598 * This can and will cause log traffic as inodes go inactive 1533 * This can and will cause log traffic as inodes go inactive
@@ -1782,6 +1717,38 @@ xfs_destroy_zones(void)
1782} 1717}
1783 1718
1784STATIC int __init 1719STATIC int __init
1720xfs_init_workqueues(void)
1721{
1722 /*
 1723 * max_active is set to 8 to give enough concurrency to allow
1724 * multiple work operations on each CPU to run. This allows multiple
1725 * filesystems to be running sync work concurrently, and scales with
1726 * the number of CPUs in the system.
1727 */
1728 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
1729 if (!xfs_syncd_wq)
1730 goto out;
1731
1732 xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
1733 if (!xfs_ail_wq)
1734 goto out_destroy_syncd;
1735
1736 return 0;
1737
1738out_destroy_syncd:
1739 destroy_workqueue(xfs_syncd_wq);
1740out:
1741 return -ENOMEM;
1742}
1743
1744STATIC void
1745xfs_destroy_workqueues(void)
1746{
1747 destroy_workqueue(xfs_ail_wq);
1748 destroy_workqueue(xfs_syncd_wq);
1749}
1750
1751STATIC int __init
1785init_xfs_fs(void) 1752init_xfs_fs(void)
1786{ 1753{
1787 int error; 1754 int error;
@@ -1796,10 +1763,14 @@ init_xfs_fs(void)
1796 if (error) 1763 if (error)
1797 goto out; 1764 goto out;
1798 1765
1799 error = xfs_mru_cache_init(); 1766 error = xfs_init_workqueues();
1800 if (error) 1767 if (error)
1801 goto out_destroy_zones; 1768 goto out_destroy_zones;
1802 1769
1770 error = xfs_mru_cache_init();
1771 if (error)
1772 goto out_destroy_wq;
1773
1803 error = xfs_filestream_init(); 1774 error = xfs_filestream_init();
1804 if (error) 1775 if (error)
1805 goto out_mru_cache_uninit; 1776 goto out_mru_cache_uninit;
@@ -1816,6 +1787,10 @@ init_xfs_fs(void)
1816 if (error) 1787 if (error)
1817 goto out_cleanup_procfs; 1788 goto out_cleanup_procfs;
1818 1789
1790 error = xfs_init_workqueues();
1791 if (error)
1792 goto out_sysctl_unregister;
1793
1819 vfs_initquota(); 1794 vfs_initquota();
1820 1795
1821 error = register_filesystem(&xfs_fs_type); 1796 error = register_filesystem(&xfs_fs_type);
@@ -1833,6 +1808,8 @@ init_xfs_fs(void)
1833 xfs_filestream_uninit(); 1808 xfs_filestream_uninit();
1834 out_mru_cache_uninit: 1809 out_mru_cache_uninit:
1835 xfs_mru_cache_uninit(); 1810 xfs_mru_cache_uninit();
1811 out_destroy_wq:
1812 xfs_destroy_workqueues();
1836 out_destroy_zones: 1813 out_destroy_zones:
1837 xfs_destroy_zones(); 1814 xfs_destroy_zones();
1838 out: 1815 out:
@@ -1849,6 +1826,7 @@ exit_xfs_fs(void)
1849 xfs_buf_terminate(); 1826 xfs_buf_terminate();
1850 xfs_filestream_uninit(); 1827 xfs_filestream_uninit();
1851 xfs_mru_cache_uninit(); 1828 xfs_mru_cache_uninit();
1829 xfs_destroy_workqueues();
1852 xfs_destroy_zones(); 1830 xfs_destroy_zones();
1853} 1831}
1854 1832
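
xfs_init_workqueues()/xfs_destroy_workqueues() and the reworked init_xfs_fs() error paths follow the usual goto-unwind idiom: each successfully initialised resource gains a matching cleanup label, and module exit tears everything down in reverse order. A self-contained userspace sketch of that shape (with made-up step names standing in for the real zone, workqueue and MRU-cache setup) might look like this:

#include <stdio.h>

static int  init_zones(void)         { puts("zones up");   return 0; }
static void destroy_zones(void)      { puts("zones down"); }
static int  init_workqueues(void)    { puts("wq up");      return 0; }
static void destroy_workqueues(void) { puts("wq down");    }
static int  init_mru_cache(void)     { puts("mru up");     return 0; }
static void uninit_mru_cache(void)   { puts("mru down");   }

static int init_all(void)
{
        int error;

        error = init_zones();
        if (error)
                goto out;
        error = init_workqueues();
        if (error)
                goto out_destroy_zones;
        error = init_mru_cache();
        if (error)
                goto out_destroy_wq;
        return 0;

 out_destroy_wq:
        destroy_workqueues();
 out_destroy_zones:
        destroy_zones();
 out:
        return error;
}

static void exit_all(void)
{
        /* teardown mirrors init in reverse order */
        uninit_mru_cache();
        destroy_workqueues();
        destroy_zones();
}

int main(void)
{
        if (init_all())
                return 1;
        exit_all();
        return 0;
}
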
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1c..e4f9c1b0836c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -39,6 +40,8 @@
39#include <linux/kthread.h> 40#include <linux/kthread.h>
40#include <linux/freezer.h> 41#include <linux/freezer.h>
41 42
43struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
44
42/* 45/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and 46 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between 47 * radix tree lookups to a minimum. The batch size is a trade off between
@@ -53,14 +56,30 @@ xfs_inode_ag_walk_grab(
53{ 56{
54 struct inode *inode = VFS_I(ip); 57 struct inode *inode = VFS_I(ip);
55 58
59 ASSERT(rcu_read_lock_held());
60
61 /*
62 * check for stale RCU freed inode
63 *
64 * If the inode has been reallocated, it doesn't matter if it's not in
65 * the AG we are walking - we are walking for writeback, so if it
66 * passes all the "valid inode" checks and is dirty, then we'll write
67 * it back anyway. If it has been reallocated and still being
68 * initialised, the XFS_INEW check below will catch it.
69 */
70 spin_lock(&ip->i_flags_lock);
71 if (!ip->i_ino)
72 goto out_unlock_noent;
73
74 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
75 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
76 goto out_unlock_noent;
77 spin_unlock(&ip->i_flags_lock);
78
56 /* nothing to sync during shutdown */ 79 /* nothing to sync during shutdown */
57 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 80 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
58 return EFSCORRUPTED; 81 return EFSCORRUPTED;
59 82
60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
62 return ENOENT;
63
 64 /* If we can't grab the inode, it must be on its way to reclaim. */ 83
65 if (!igrab(inode)) 84 if (!igrab(inode))
66 return ENOENT; 85 return ENOENT;
@@ -72,6 +91,10 @@ xfs_inode_ag_walk_grab(
72 91
73 /* inode is valid */ 92 /* inode is valid */
74 return 0; 93 return 0;
94
95out_unlock_noent:
96 spin_unlock(&ip->i_flags_lock);
97 return ENOENT;
75} 98}
76 99
77STATIC int 100STATIC int
@@ -98,12 +121,12 @@ restart:
98 int error = 0; 121 int error = 0;
99 int i; 122 int i;
100 123
101 read_lock(&pag->pag_ici_lock); 124 rcu_read_lock();
102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 125 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
103 (void **)batch, first_index, 126 (void **)batch, first_index,
104 XFS_LOOKUP_BATCH); 127 XFS_LOOKUP_BATCH);
105 if (!nr_found) { 128 if (!nr_found) {
106 read_unlock(&pag->pag_ici_lock); 129 rcu_read_unlock();
107 break; 130 break;
108 } 131 }
109 132
@@ -118,18 +141,26 @@ restart:
118 batch[i] = NULL; 141 batch[i] = NULL;
119 142
120 /* 143 /*
121 * Update the index for the next lookup. Catch overflows 144 * Update the index for the next lookup. Catch
122 * into the next AG range which can occur if we have inodes 145 * overflows into the next AG range which can occur if
123 * in the last block of the AG and we are currently 146 * we have inodes in the last block of the AG and we
124 * pointing to the last inode. 147 * are currently pointing to the last inode.
148 *
149 * Because we may see inodes that are from the wrong AG
150 * due to RCU freeing and reallocation, only update the
 151 * index if it lies in this AG. It was a race that led
152 * us to see this inode, so another lookup from the
153 * same index will not find it again.
125 */ 154 */
155 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
156 continue;
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 157 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 158 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1; 159 done = 1;
129 } 160 }
130 161
131 /* unlock now we've grabbed the inodes. */ 162 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock); 163 rcu_read_unlock();
133 164
134 for (i = 0; i < nr_found; i++) { 165 for (i = 0; i < nr_found; i++) {
135 if (!batch[i]) 166 if (!batch[i])
@@ -334,7 +365,7 @@ xfs_quiesce_data(
334 365
335 /* mark the log as covered if needed */ 366 /* mark the log as covered if needed */
336 if (xfs_log_need_covered(mp)) 367 if (xfs_log_need_covered(mp))
337 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 368 error2 = xfs_fs_log_dummy(mp);
338 369
339 /* flush data-only devices */ 370 /* flush data-only devices */
340 if (mp->m_rtdev_targp) 371 if (mp->m_rtdev_targp)
@@ -373,7 +404,7 @@ xfs_quiesce_fs(
373/* 404/*
374 * Second stage of a quiesce. The data is already synced, now we have to take 405 * Second stage of a quiesce. The data is already synced, now we have to take
375 * care of the metadata. New transactions are already blocked, so we need to 406 * care of the metadata. New transactions are already blocked, so we need to
376 * wait for any remaining transactions to drain out before proceding. 407 * wait for any remaining transactions to drain out before proceeding.
377 */ 408 */
378void 409void
379xfs_quiesce_attr( 410xfs_quiesce_attr(
@@ -397,69 +428,18 @@ xfs_quiesce_attr(
397 /* Push the superblock and write an unmount record */ 428 /* Push the superblock and write an unmount record */
398 error = xfs_log_sbcount(mp, 1); 429 error = xfs_log_sbcount(mp, 1);
399 if (error) 430 if (error)
400 xfs_fs_cmn_err(CE_WARN, mp, 431 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
401 "xfs_attr_quiesce: failed to log sb changes. "
402 "Frozen image may not be consistent."); 432 "Frozen image may not be consistent.");
403 xfs_log_unmount_write(mp); 433 xfs_log_unmount_write(mp);
404 xfs_unmountfs_writesb(mp); 434 xfs_unmountfs_writesb(mp);
405} 435}
406 436
407/* 437static void
408 * Enqueue a work item to be picked up by the vfs xfssyncd thread. 438xfs_syncd_queue_sync(
409 * Doing this has two advantages: 439 struct xfs_mount *mp)
410 * - It saves on stack space, which is tight in certain situations
411 * - It can be used (with care) as a mechanism to avoid deadlocks.
412 * Flushing while allocating in a full filesystem requires both.
413 */
414STATIC void
415xfs_syncd_queue_work(
416 struct xfs_mount *mp,
417 void *data,
418 void (*syncer)(struct xfs_mount *, void *),
419 struct completion *completion)
420{ 440{
421 struct xfs_sync_work *work; 441 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
422 442 msecs_to_jiffies(xfs_syncd_centisecs * 10));
423 work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
424 INIT_LIST_HEAD(&work->w_list);
425 work->w_syncer = syncer;
426 work->w_data = data;
427 work->w_mount = mp;
428 work->w_completion = completion;
429 spin_lock(&mp->m_sync_lock);
430 list_add_tail(&work->w_list, &mp->m_sync_list);
431 spin_unlock(&mp->m_sync_lock);
432 wake_up_process(mp->m_sync_task);
433}
434
435/*
436 * Flush delayed allocate data, attempting to free up reserved space
437 * from existing allocations. At this point a new allocation attempt
438 * has failed with ENOSPC and we are in the process of scratching our
439 * heads, looking about for more room...
440 */
441STATIC void
442xfs_flush_inodes_work(
443 struct xfs_mount *mp,
444 void *arg)
445{
446 struct inode *inode = arg;
447 xfs_sync_data(mp, SYNC_TRYLOCK);
448 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
449 iput(inode);
450}
451
452void
453xfs_flush_inodes(
454 xfs_inode_t *ip)
455{
456 struct inode *inode = VFS_I(ip);
457 DECLARE_COMPLETION_ONSTACK(completion);
458
459 igrab(inode);
460 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
461 wait_for_completion(&completion);
462 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
463} 443}
464 444
465/* 445/*
@@ -469,84 +449,119 @@ xfs_flush_inodes(
469 */ 449 */
470STATIC void 450STATIC void
471xfs_sync_worker( 451xfs_sync_worker(
472 struct xfs_mount *mp, 452 struct work_struct *work)
473 void *unused)
474{ 453{
454 struct xfs_mount *mp = container_of(to_delayed_work(work),
455 struct xfs_mount, m_sync_work);
475 int error; 456 int error;
476 457
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 458 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, 0);
479 xfs_reclaim_inodes(mp, 0);
480 /* dgc: errors ignored here */ 459 /* dgc: errors ignored here */
481 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
482 if (mp->m_super->s_frozen == SB_UNFROZEN && 460 if (mp->m_super->s_frozen == SB_UNFROZEN &&
483 xfs_log_need_covered(mp)) 461 xfs_log_need_covered(mp))
484 error = xfs_fs_log_dummy(mp, 0); 462 error = xfs_fs_log_dummy(mp);
463 else
464 xfs_log_force(mp, 0);
465 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
466
467 /* start pushing all the metadata that is currently dirty */
468 xfs_ail_push_all(mp->m_ail);
485 } 469 }
486 mp->m_sync_seq++; 470
487 wake_up(&mp->m_wait_single_sync_task); 471 /* queue us up again */
472 xfs_syncd_queue_sync(mp);
488} 473}
489 474
490STATIC int 475/*
491xfssyncd( 476 * Queue a new inode reclaim pass if there are reclaimable inodes and there
492 void *arg) 477 * isn't a reclaim pass already in progress. By default it runs every 5s based
 478 * on the xfs syncd work default of 30s. Perhaps this should have its own
479 * tunable, but that can be done if this method proves to be ineffective or too
480 * aggressive.
481 */
482static void
483xfs_syncd_queue_reclaim(
484 struct xfs_mount *mp)
493{ 485{
494 struct xfs_mount *mp = arg;
495 long timeleft;
496 xfs_sync_work_t *work, *n;
497 LIST_HEAD (tmp);
498
499 set_freezable();
500 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
501 for (;;) {
502 if (list_empty(&mp->m_sync_list))
503 timeleft = schedule_timeout_interruptible(timeleft);
504 /* swsusp */
505 try_to_freeze();
506 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
507 break;
508 486
509 spin_lock(&mp->m_sync_lock); 487 /*
510 /* 488 * We can have inodes enter reclaim after we've shut down the syncd
511 * We can get woken by laptop mode, to do a sync - 489 * workqueue during unmount, so don't allow reclaim work to be queued
512 * that's the (only!) case where the list would be 490 * during unmount.
513 * empty with time remaining. 491 */
514 */ 492 if (!(mp->m_super->s_flags & MS_ACTIVE))
515 if (!timeleft || list_empty(&mp->m_sync_list)) { 493 return;
516 if (!timeleft)
517 timeleft = xfs_syncd_centisecs *
518 msecs_to_jiffies(10);
519 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
520 list_add_tail(&mp->m_sync_work.w_list,
521 &mp->m_sync_list);
522 }
523 list_splice_init(&mp->m_sync_list, &tmp);
524 spin_unlock(&mp->m_sync_lock);
525 494
526 list_for_each_entry_safe(work, n, &tmp, w_list) { 495 rcu_read_lock();
527 (*work->w_syncer)(mp, work->w_data); 496 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
528 list_del(&work->w_list); 497 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
529 if (work == &mp->m_sync_work) 498 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
530 continue;
531 if (work->w_completion)
532 complete(work->w_completion);
533 kmem_free(work);
534 }
535 } 499 }
500 rcu_read_unlock();
501}
536 502
537 return 0; 503/*
504 * This is a fast pass over the inode cache to try to get reclaim moving on as
505 * many inodes as possible in a short period of time. It kicks itself every few
506 * seconds, as well as being kicked by the inode cache shrinker when memory
507 * goes low. It scans as quickly as possible avoiding locked inodes or those
508 * already being flushed, and once done schedules a future pass.
509 */
510STATIC void
511xfs_reclaim_worker(
512 struct work_struct *work)
513{
514 struct xfs_mount *mp = container_of(to_delayed_work(work),
515 struct xfs_mount, m_reclaim_work);
516
517 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
518 xfs_syncd_queue_reclaim(mp);
519}
520
521/*
522 * Flush delayed allocate data, attempting to free up reserved space
523 * from existing allocations. At this point a new allocation attempt
524 * has failed with ENOSPC and we are in the process of scratching our
525 * heads, looking about for more room.
526 *
527 * Queue a new data flush if there isn't one already in progress and
528 * wait for completion of the flush. This means that we only ever have one
529 * inode flush in progress no matter how many ENOSPC events are occurring and
530 * so will prevent the system from bogging down due to every concurrent
531 * ENOSPC event scanning all the active inodes in the system for writeback.
532 */
533void
534xfs_flush_inodes(
535 struct xfs_inode *ip)
536{
537 struct xfs_mount *mp = ip->i_mount;
538
539 queue_work(xfs_syncd_wq, &mp->m_flush_work);
540 flush_work_sync(&mp->m_flush_work);
541}
542
543STATIC void
544xfs_flush_worker(
545 struct work_struct *work)
546{
547 struct xfs_mount *mp = container_of(work,
548 struct xfs_mount, m_flush_work);
549
550 xfs_sync_data(mp, SYNC_TRYLOCK);
551 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
538} 552}
539 553
540int 554int
541xfs_syncd_init( 555xfs_syncd_init(
542 struct xfs_mount *mp) 556 struct xfs_mount *mp)
543{ 557{
544 mp->m_sync_work.w_syncer = xfs_sync_worker; 558 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
545 mp->m_sync_work.w_mount = mp; 559 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
546 mp->m_sync_work.w_completion = NULL; 560 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
547 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); 561
548 if (IS_ERR(mp->m_sync_task)) 562 xfs_syncd_queue_sync(mp);
549 return -PTR_ERR(mp->m_sync_task); 563 xfs_syncd_queue_reclaim(mp);
564
550 return 0; 565 return 0;
551} 566}
552 567
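
A quick sanity check of the scheduling intervals described in the xfs_syncd_queue_sync() and xfs_syncd_queue_reclaim() comments above: with the default xfs_syncd_centisecs of 3000 (the 30 seconds the comment refers to), the reclaim work requeues itself at one sixth of that, i.e. every 5 seconds. The arithmetic, in a trivially runnable form:

#include <stdio.h>

int main(void)
{
        /* default syncd interval, in centiseconds (30 seconds) */
        int xfs_syncd_centisecs = 3000;

        int sync_ms    = xfs_syncd_centisecs * 10;      /* 30000 ms */
        int reclaim_ms = xfs_syncd_centisecs / 6 * 10;  /*  5000 ms */

        printf("sync work every %d ms, reclaim work every %d ms\n",
               sync_ms, reclaim_ms);
        return 0;
}
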
@@ -554,7 +569,9 @@ void
554xfs_syncd_stop( 569xfs_syncd_stop(
555 struct xfs_mount *mp) 570 struct xfs_mount *mp)
556{ 571{
557 kthread_stop(mp->m_sync_task); 572 cancel_delayed_work_sync(&mp->m_sync_work);
573 cancel_delayed_work_sync(&mp->m_reclaim_work);
574 cancel_work_sync(&mp->m_flush_work);
558} 575}
559 576
560void 577void
@@ -573,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
573 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 590 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
574 XFS_ICI_RECLAIM_TAG); 591 XFS_ICI_RECLAIM_TAG);
575 spin_unlock(&ip->i_mount->m_perag_lock); 592 spin_unlock(&ip->i_mount->m_perag_lock);
593
594 /* schedule periodic background inode reclaim */
595 xfs_syncd_queue_reclaim(ip->i_mount);
596
576 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 597 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
577 -1, _RET_IP_); 598 -1, _RET_IP_);
578 } 599 }
@@ -592,12 +613,12 @@ xfs_inode_set_reclaim_tag(
592 struct xfs_perag *pag; 613 struct xfs_perag *pag;
593 614
594 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 615 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
595 write_lock(&pag->pag_ici_lock); 616 spin_lock(&pag->pag_ici_lock);
596 spin_lock(&ip->i_flags_lock); 617 spin_lock(&ip->i_flags_lock);
597 __xfs_inode_set_reclaim_tag(pag, ip); 618 __xfs_inode_set_reclaim_tag(pag, ip);
598 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 619 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
599 spin_unlock(&ip->i_flags_lock); 620 spin_unlock(&ip->i_flags_lock);
600 write_unlock(&pag->pag_ici_lock); 621 spin_unlock(&pag->pag_ici_lock);
601 xfs_perag_put(pag); 622 xfs_perag_put(pag);
602} 623}
603 624
@@ -639,9 +660,14 @@ xfs_reclaim_inode_grab(
639 struct xfs_inode *ip, 660 struct xfs_inode *ip,
640 int flags) 661 int flags)
641{ 662{
663 ASSERT(rcu_read_lock_held());
664
665 /* quick check for stale RCU freed inode */
666 if (!ip->i_ino)
667 return 1;
642 668
643 /* 669 /*
644 * do some unlocked checks first to avoid unnecceary lock traffic. 670 * do some unlocked checks first to avoid unnecessary lock traffic.
645 * The first is a flush lock check, the second is a already in reclaim 671 * The first is a flush lock check, the second is a already in reclaim
646 * check. Only do these checks if we are not going to block on locks. 672 * check. Only do these checks if we are not going to block on locks.
647 */ 673 */
@@ -654,11 +680,16 @@ xfs_reclaim_inode_grab(
654 * The radix tree lock here protects a thread in xfs_iget from racing 680 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the 681 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us. 682 * XFS_IRECLAIM flag set it will not touch us.
683 *
684 * Due to RCU lookup, we may find inodes that have been freed and only
685 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
 686	 * aren't candidates for reclaim at all, so we must check that
 687	 * XFS_IRECLAIMABLE is set before proceeding to reclaim.
657 */ 688 */
658 spin_lock(&ip->i_flags_lock); 689 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); 690 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { 691 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */ 692 /* not a reclaim candidate. */
662 spin_unlock(&ip->i_flags_lock); 693 spin_unlock(&ip->i_flags_lock);
663 return 1; 694 return 1;
664 } 695 }
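
The revalidation added to xfs_reclaim_inode_grab() above is the general rule for any lockless RCU lookup: the object you found may already have been freed and reused, so it must be re-checked and claimed under its own lock before anything trusts it. A minimal sketch of that pattern with an invented struct obj and hypothetical OBJ_VALID/OBJ_BUSY flags (not the XFS inode flags):

#include <linux/spinlock.h>
#include <linux/errno.h>

#define OBJ_VALID	0x1	/* object is live and eligible for processing */
#define OBJ_BUSY	0x2	/* another thread has already claimed it */

struct obj {
	spinlock_t	lock;
	unsigned long	key;		/* zeroed when the object is freed */
	unsigned int	flags;
};

/* Called under rcu_read_lock(); returns 0 if the object was claimed. */
static int obj_grab(struct obj *o)
{
	/* cheap unlocked check for an object that has already been freed */
	if (!o->key)
		return -ENOENT;

	spin_lock(&o->lock);
	if (!(o->flags & OBJ_VALID) || (o->flags & OBJ_BUSY)) {
		/* freed, reused, or already being processed: skip it */
		spin_unlock(&o->lock);
		return -ENOENT;
	}
	o->flags |= OBJ_BUSY;	/* claim it so concurrent walkers skip it */
	spin_unlock(&o->lock);
	return 0;
}
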
@@ -723,8 +754,10 @@ xfs_reclaim_inode(
723 struct xfs_perag *pag, 754 struct xfs_perag *pag,
724 int sync_mode) 755 int sync_mode)
725{ 756{
726 int error = 0; 757 int error;
727 758
759restart:
760 error = 0;
728 xfs_ilock(ip, XFS_ILOCK_EXCL); 761 xfs_ilock(ip, XFS_ILOCK_EXCL);
729 if (!xfs_iflock_nowait(ip)) { 762 if (!xfs_iflock_nowait(ip)) {
730 if (!(sync_mode & SYNC_WAIT)) 763 if (!(sync_mode & SYNC_WAIT))
@@ -750,9 +783,31 @@ xfs_reclaim_inode(
750 if (xfs_inode_clean(ip)) 783 if (xfs_inode_clean(ip))
751 goto reclaim; 784 goto reclaim;
752 785
753 /* Now we have an inode that needs flushing */ 786 /*
754 error = xfs_iflush(ip, sync_mode); 787 * Now we have an inode that needs flushing.
788 *
789 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
790 * reclaim as we can deadlock with inode cluster removal.
791 * xfs_ifree_cluster() can lock the inode buffer before it locks the
792 * ip->i_lock, and we are doing the exact opposite here. As a result,
793 * doing a blocking xfs_itobp() to get the cluster buffer will result
794 * in an ABBA deadlock with xfs_ifree_cluster().
795 *
 796	 * As xfs_ifree_cluster() must gather all inodes that are active in the
797 * cache to mark them stale, if we hit this case we don't actually want
798 * to do IO here - we want the inode marked stale so we can simply
799 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
800 * just unlock the inode, back off and try again. Hopefully the next
801 * pass through will see the stale flag set on the inode.
802 */
803 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
755 if (sync_mode & SYNC_WAIT) { 804 if (sync_mode & SYNC_WAIT) {
805 if (error == EAGAIN) {
806 xfs_iunlock(ip, XFS_ILOCK_EXCL);
807 /* backoff longer than in xfs_ifree_cluster */
808 delay(2);
809 goto restart;
810 }
756 xfs_iflock(ip); 811 xfs_iflock(ip);
757 goto reclaim; 812 goto reclaim;
758 } 813 }
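
The long comment above is a textbook ABBA-avoidance recipe: never block on the second lock, and on contention drop what you hold, back off, and retry from the top. A condensed sketch of the same idea with two ordinary mutexes; the structure, helpers and delay value are illustrative only, not XFS code:

#include <linux/mutex.h>
#include <linux/delay.h>

struct two_locked {
	struct mutex	a;	/* this path takes a first */
	struct mutex	b;	/* another path takes b first, then wants a */
};

static void touch_both(struct two_locked *r)
{
restart:
	mutex_lock(&r->a);
	if (!mutex_trylock(&r->b)) {
		/*
		 * Contended: the opposite-order path probably holds b and is
		 * waiting for a.  Drop a, back off, and retry rather than
		 * blocking here and completing the ABBA deadlock.
		 */
		mutex_unlock(&r->a);
		msleep(2);
		goto restart;
	}

	/* ... both locks held, do the work ... */

	mutex_unlock(&r->b);
	mutex_unlock(&r->a);
}
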
@@ -767,7 +822,7 @@ xfs_reclaim_inode(
767 * pass on the error. 822 * pass on the error.
768 */ 823 */
769 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 824 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
770 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 825 xfs_warn(ip->i_mount,
771 "inode 0x%llx background reclaim flush failed with %d", 826 "inode 0x%llx background reclaim flush failed with %d",
772 (long long)ip->i_ino, error); 827 (long long)ip->i_ino, error);
773 } 828 }
@@ -795,12 +850,12 @@ reclaim:
795 * added to the tree assert that it's been there before to catch 850 * added to the tree assert that it's been there before to catch
796 * problems with the inode life time early on. 851 * problems with the inode life time early on.
797 */ 852 */
798 write_lock(&pag->pag_ici_lock); 853 spin_lock(&pag->pag_ici_lock);
799 if (!radix_tree_delete(&pag->pag_ici_root, 854 if (!radix_tree_delete(&pag->pag_ici_root,
800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 855 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
801 ASSERT(0); 856 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip); 857 __xfs_inode_clear_reclaim(pag, ip);
803 write_unlock(&pag->pag_ici_lock); 858 spin_unlock(&pag->pag_ici_lock);
804 859
805 /* 860 /*
806 * Here we do an (almost) spurious inode lock in order to coordinate 861 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +919,14 @@ restart:
864 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 919 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
865 int i; 920 int i;
866 921
867 write_lock(&pag->pag_ici_lock); 922 rcu_read_lock();
868 nr_found = radix_tree_gang_lookup_tag( 923 nr_found = radix_tree_gang_lookup_tag(
869 &pag->pag_ici_root, 924 &pag->pag_ici_root,
870 (void **)batch, first_index, 925 (void **)batch, first_index,
871 XFS_LOOKUP_BATCH, 926 XFS_LOOKUP_BATCH,
872 XFS_ICI_RECLAIM_TAG); 927 XFS_ICI_RECLAIM_TAG);
873 if (!nr_found) { 928 if (!nr_found) {
874 write_unlock(&pag->pag_ici_lock); 929 rcu_read_unlock();
875 break; 930 break;
876 } 931 }
877 932
@@ -891,14 +946,24 @@ restart:
891 * occur if we have inodes in the last block of 946 * occur if we have inodes in the last block of
892 * the AG and we are currently pointing to the 947 * the AG and we are currently pointing to the
893 * last inode. 948 * last inode.
949 *
950 * Because we may see inodes that are from the
951 * wrong AG due to RCU freeing and
952 * reallocation, only update the index if it
 953	 * lies in this AG. It was a race that led us
954 * to see this inode, so another lookup from
955 * the same index will not find it again.
894 */ 956 */
957 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
958 pag->pag_agno)
959 continue;
895 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 960 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
896 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 961 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
897 done = 1; 962 done = 1;
898 } 963 }
899 964
900 /* unlock now we've grabbed the inodes. */ 965 /* unlock now we've grabbed the inodes. */
901 write_unlock(&pag->pag_ici_lock); 966 rcu_read_unlock();
902 967
903 for (i = 0; i < nr_found; i++) { 968 for (i = 0; i < nr_found; i++) {
904 if (!batch[i]) 969 if (!batch[i])
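
The hunk above swaps the pag_ici_lock for an RCU read lock around the batched radix tree lookup. A condensed sketch of that walk pattern, assuming the stored objects are RCU-freed and record the index they were inserted under (all names are illustrative):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

#define WALK_BATCH	32
#define WALK_TAG	0

struct obj {
	unsigned long	index;	/* key the object was inserted under */
	/* ... */
};

static void walk_tagged(struct radix_tree_root *root)
{
	unsigned long first_index = 0;
	int done = 0;

	while (!done) {
		struct obj *batch[WALK_BATCH];
		unsigned int nr, i;

		rcu_read_lock();
		nr = radix_tree_gang_lookup_tag(root, (void **)batch,
						first_index, WALK_BATCH,
						WALK_TAG);
		if (!nr) {
			rcu_read_unlock();
			break;
		}

		/* advance the cursor before dropping the RCU read lock */
		for (i = 0; i < nr; i++) {
			first_index = batch[i]->index + 1;
			if (first_index == 0)
				done = 1;	/* index wrapped: last batch */
		}
		rcu_read_unlock();

		for (i = 0; i < nr; i++) {
			/*
			 * Each entry may have been freed and reused since the
			 * lookup; revalidate it under its own lock (as the
			 * grab helper above does) before touching it.
			 */
		}
	}
}
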
@@ -945,7 +1010,13 @@ xfs_reclaim_inodes(
945} 1010}
946 1011
947/* 1012/*
948 * Shrinker infrastructure. 1013 * Inode cache shrinker.
1014 *
1015 * When called we make sure that there is a background (fast) inode reclaim in
1016	 * progress, while we will throttle the speed of reclaim by doing synchronous
1017 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1018 * them to be cleaned, which we hope will not be very long due to the
1019 * background walker having already kicked the IO off on those dirty inodes.
949 */ 1020 */
950static int 1021static int
951xfs_reclaim_inode_shrink( 1022xfs_reclaim_inode_shrink(
@@ -960,10 +1031,15 @@ xfs_reclaim_inode_shrink(
960 1031
961 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 1032 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
962 if (nr_to_scan) { 1033 if (nr_to_scan) {
1034 /* kick background reclaimer and push the AIL */
1035 xfs_syncd_queue_reclaim(mp);
1036 xfs_ail_push_all(mp->m_ail);
1037
963 if (!(gfp_mask & __GFP_FS)) 1038 if (!(gfp_mask & __GFP_FS))
964 return -1; 1039 return -1;
965 1040
966 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); 1041 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
1042 &nr_to_scan);
967 /* terminate if we don't exhaust the scan */ 1043 /* terminate if we don't exhaust the scan */
968 if (nr_to_scan > 0) 1044 if (nr_to_scan > 0)
969 return -1; 1045 return -1;
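
xfs_syncd_init()/xfs_syncd_stop() above replace the old xfssyncd kthread with work items queued on the xfs_syncd_wq workqueue declared in xfs_sync.h below. A stripped-down sketch of that self-rearming delayed-work pattern; the workqueue name and the 30-second period are placeholders, not the values XFS uses:

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static struct workqueue_struct	*sync_wq;
static struct delayed_work	sync_work;

static void sync_worker(struct work_struct *work)
{
	/* ... periodic flush/reclaim pass goes here ... */

	/* re-arm for the next period */
	queue_delayed_work(sync_wq, &sync_work, msecs_to_jiffies(30 * 1000));
}

static int sync_init(void)
{
	sync_wq = alloc_workqueue("example_syncd", 0, 0);
	if (!sync_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&sync_work, sync_worker);
	queue_delayed_work(sync_wq, &sync_work, msecs_to_jiffies(30 * 1000));
	return 0;
}

static void sync_stop(void)
{
	/* cancels and waits; safe even though the work re-queues itself */
	cancel_delayed_work_sync(&sync_work);
	destroy_workqueue(sync_wq);
}

Shutdown reduces to cancel_delayed_work_sync() plus tearing down the queue, with no kthread_stop() handshake to get wrong.
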
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 32ba6628290c..e3a6ad27415f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -32,6 +32,8 @@ typedef struct xfs_sync_work {
32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
34 34
35extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
36
35int xfs_syncd_init(struct xfs_mount *mp); 37int xfs_syncd_init(struct xfs_mount *mp);
36void xfs_syncd_stop(struct xfs_mount *mp); 38void xfs_syncd_stop(struct xfs_mount *mp);
37 39
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -36,7 +37,7 @@ xfs_stats_clear_proc_handler(
36 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); 37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
37 38
38 if (!ret && write && *valp) { 39 if (!ret && write && *valp) {
39 printk("XFS Clearing xfsstats\n"); 40 xfs_notice(NULL, "Clearing xfsstats");
40 for_each_possible_cpu(c) { 41 for_each_possible_cpu(c) {
41 preempt_disable(); 42 preempt_disable();
42 /* save vn_active, it's a universal truth! */ 43 /* save vn_active, it's a universal truth! */
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
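
xfs_panic_mask_proc_handler() above shows the usual way to post-process a sysctl write: let proc_dointvec_minmax() do the parsing and the extra1/extra2 bounds check, then act on the stored value only when the write succeeded. A generic sketch of the same wrapper shape (the tunable, its bounds, and the apply step are hypothetical, and registration of the table is omitted):

#include <linux/sysctl.h>

static int my_tunable;
static int my_tunable_min = 0;
static int my_tunable_max = 255;

static int my_tunable_handler(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	/* parse the write and enforce the extra1/extra2 bounds */
	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
	if (!ret && write) {
		/* value accepted: propagate it to wherever it takes effect */
	}
	return ret;
}

static struct ctl_table my_table[] = {
	{
		.procname	= "my_tunable",
		.data		= &my_tunable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= my_tunable_handler,
		.extra1		= &my_tunable_min,
		.extra2		= &my_tunable_max,
	},
	{ }
};
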
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c594..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
766 __field(int, curr_res) 766 __field(int, curr_res)
767 __field(int, unit_res) 767 __field(int, unit_res)
768 __field(unsigned int, flags) 768 __field(unsigned int, flags)
769 __field(void *, reserve_headq) 769 __field(int, reserveq)
770 __field(void *, write_headq) 770 __field(int, writeq)
771 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
772 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
773 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
784 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
785 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
786 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
787 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
788 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
789 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
790 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
791 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
792 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
793 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
794 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
795 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
796 ), 798 ),
797 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
798 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
799 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
800 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
801 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
802 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
807 __entry->curr_res, 809 __entry->curr_res,
808 __entry->unit_res, 810 __entry->unit_res,
809 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
810 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
811 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
812 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
813 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
814 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
838DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
935DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
936DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
937 941
938DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
939 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
940 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
941 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
942 TP_STRUCT__entry( 946 TP_STRUCT__entry(
943 __field(dev_t, dev) 947 __field(dev_t, dev)
944 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
946 __field(loff_t, new_size) 950 __field(loff_t, new_size)
947 __field(loff_t, offset) 951 __field(loff_t, offset)
948 __field(size_t, count) 952 __field(size_t, count)
949 __field(int, flags) 953 __field(int, type)
950 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
951 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
952 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
958 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
959 __entry->offset = offset; 963 __entry->offset = offset;
960 __entry->count = count; 964 __entry->count = count;
961 __entry->flags = flags; 965 __entry->type = type;
962 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
963 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
964 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
965 ), 969 ),
966 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
967 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
968 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
969 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
970 __entry->ino, 974 __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
972 __entry->new_size, 976 __entry->new_size,
973 __entry->offset, 977 __entry->offset,
974 __entry->count, 978 __entry->count,
975 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
976 __entry->startoff, 980 __entry->startoff,
977 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
978 __entry->blockcount) 982 __entry->blockcount)
979) 983)
980 984
981#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
982DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
983 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
984 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
985 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
986DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
987DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
988DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
989 994
990DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
991 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1022 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1023DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1024DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1025 1031
1026 1032
1027TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1420 TP_PROTO(struct xfs_alloc_arg *args), \ 1426 TP_PROTO(struct xfs_alloc_arg *args), \
1421 TP_ARGS(args)) 1427 TP_ARGS(args))
1422DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1428DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1429DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1430DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1424DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1431DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1432DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
@@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1752DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1754 1761
1762DECLARE_EVENT_CLASS(xfs_discard_class,
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1764 xfs_agblock_t agbno, xfs_extlen_t len),
1765 TP_ARGS(mp, agno, agbno, len),
1766 TP_STRUCT__entry(
1767 __field(dev_t, dev)
1768 __field(xfs_agnumber_t, agno)
1769 __field(xfs_agblock_t, agbno)
1770 __field(xfs_extlen_t, len)
1771 ),
1772 TP_fast_assign(
1773 __entry->dev = mp->m_super->s_dev;
1774 __entry->agno = agno;
1775 __entry->agbno = agbno;
1776 __entry->len = len;
1777 ),
1778 TP_printk("dev %d:%d agno %u agbno %u len %u\n",
1779 MAJOR(__entry->dev), MINOR(__entry->dev),
1780 __entry->agno,
1781 __entry->agbno,
1782 __entry->len)
1783)
1784
1785#define DEFINE_DISCARD_EVENT(name) \
1786DEFINE_EVENT(xfs_discard_class, name, \
1787 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1788 xfs_agblock_t agbno, xfs_extlen_t len), \
1789 TP_ARGS(mp, agno, agbno, len))
1790DEFINE_DISCARD_EVENT(xfs_discard_extent);
1791DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1792DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1793DEFINE_DISCARD_EVENT(xfs_discard_busy);
1794
1755#endif /* _TRACE_XFS_H */ 1795#endif /* _TRACE_XFS_H */
1756 1796
1757#undef TRACE_INCLUDE_PATH 1797#undef TRACE_INCLUDE_PATH
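
The xfs_discard_class events added above follow the standard tracepoint idiom: one DECLARE_EVENT_CLASS() describing the record layout and formatting, then one cheap DEFINE_EVENT() per call site, usually via a small wrapper macro. A minimal sketch of the same idiom for a hypothetical subsystem, assuming the header lives under include/trace/events/ so the default include path applies:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM example

#if !defined(_TRACE_EXAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXAMPLE_H

#include <linux/tracepoint.h>

DECLARE_EVENT_CLASS(example_range_class,
	TP_PROTO(unsigned int id, unsigned long start, unsigned long len),
	TP_ARGS(id, start, len),
	TP_STRUCT__entry(
		__field(unsigned int, id)
		__field(unsigned long, start)
		__field(unsigned long, len)
	),
	TP_fast_assign(
		__entry->id = id;
		__entry->start = start;
		__entry->len = len;
	),
	TP_printk("id %u start %lu len %lu",
		  __entry->id, __entry->start, __entry->len)
)

#define DEFINE_RANGE_EVENT(name) \
DEFINE_EVENT(example_range_class, name, \
	TP_PROTO(unsigned int id, unsigned long start, unsigned long len), \
	TP_ARGS(id, start, len))
DEFINE_RANGE_EVENT(example_range_start);
DEFINE_RANGE_EVENT(example_range_done);

#endif /* _TRACE_EXAMPLE_H */

/* This part must be outside the multi-read protection */
#include <trace/define_trace.h>

Each DEFINE_EVENT() reuses the class's assign and print code, so adding another probe point costs little more than a new name.
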
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a83a12..6fa214603819 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
@@ -545,9 +544,10 @@ xfs_qm_dqtobp(
545 /* 544 /*
546 * A simple sanity check in case we got a corrupted dquot... 545 * A simple sanity check in case we got a corrupted dquot...
547 */ 546 */
548 if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, 547 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
549 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), 548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
550 "dqtobp")) { 549 "dqtobp");
550 if (error) {
551 if (!(flags & XFS_QMOPT_DQREPAIR)) { 551 if (!(flags & XFS_QMOPT_DQREPAIR)) {
552 xfs_trans_brelse(tp, bp); 552 xfs_trans_brelse(tp, bp);
553 return XFS_ERROR(EIO); 553 return XFS_ERROR(EIO);
@@ -600,7 +600,7 @@ xfs_qm_dqread(
600 600
601 /* 601 /*
602 * Reservation counters are defined as reservation plus current usage 602 * Reservation counters are defined as reservation plus current usage
603 * to avoid having to add everytime. 603 * to avoid having to add every time.
604 */ 604 */
605 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); 605 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
606 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); 606 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
@@ -828,7 +828,7 @@ xfs_qm_dqget(
828 if (xfs_do_dqerror) { 828 if (xfs_do_dqerror) {
829 if ((xfs_dqerror_target == mp->m_ddev_targp) && 829 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
830 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { 830 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
831 cmn_err(CE_DEBUG, "Returning error in dqget"); 831 xfs_debug(mp, "Returning error in dqget");
832 return (EIO); 832 return (EIO);
833 } 833 }
834 } 834 }
@@ -1208,8 +1208,9 @@ xfs_qm_dqflush(
1208 /* 1208 /*
1209 * A simple sanity check in case we got a corrupted dquot.. 1209 * A simple sanity check in case we got a corrupted dquot..
1210 */ 1210 */
1211 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 1211 error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)");
1213 if (error) {
1213 xfs_buf_relse(bp); 1214 xfs_buf_relse(bp);
1214 xfs_dqfunlock(dqp); 1215 xfs_dqfunlock(dqp);
1215 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1216 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1392,8 +1393,8 @@ xfs_qm_dqpurge(
1392 */ 1393 */
1393 error = xfs_qm_dqflush(dqp, SYNC_WAIT); 1394 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1394 if (error) 1395 if (error)
1395 xfs_fs_cmn_err(CE_WARN, mp, 1396 xfs_warn(mp, "%s: dquot %p flush failed",
1396 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1397 __func__, dqp);
1397 xfs_dqflock(dqp); 1398 xfs_dqflock(dqp);
1398 } 1399 }
1399 ASSERT(atomic_read(&dqp->q_pincount) == 0); 1400 ASSERT(atomic_read(&dqp->q_pincount) == 0);
@@ -1426,36 +1427,38 @@ xfs_qm_dqpurge(
1426void 1427void
1427xfs_qm_dqprint(xfs_dquot_t *dqp) 1428xfs_qm_dqprint(xfs_dquot_t *dqp)
1428{ 1429{
1429 cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); 1430 struct xfs_mount *mp = dqp->q_mount;
1430 cmn_err(CE_DEBUG, "---- dquotID = %d", 1431
1432 xfs_debug(mp, "-----------KERNEL DQUOT----------------");
1433 xfs_debug(mp, "---- dquotID = %d",
1431 (int)be32_to_cpu(dqp->q_core.d_id)); 1434 (int)be32_to_cpu(dqp->q_core.d_id));
1432 cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); 1435 xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp));
1433 cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); 1436 xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount);
1434 cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); 1437 xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno);
1435 cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); 1438 xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
1436 cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", 1439 xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)",
1437 be64_to_cpu(dqp->q_core.d_blk_hardlimit), 1440 be64_to_cpu(dqp->q_core.d_blk_hardlimit),
1438 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); 1441 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
1439 cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", 1442 xfs_debug(mp, "---- blkslimit = %Lu (0x%x)",
1440 be64_to_cpu(dqp->q_core.d_blk_softlimit), 1443 be64_to_cpu(dqp->q_core.d_blk_softlimit),
1441 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); 1444 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
1442 cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", 1445 xfs_debug(mp, "---- inohlimit = %Lu (0x%x)",
1443 be64_to_cpu(dqp->q_core.d_ino_hardlimit), 1446 be64_to_cpu(dqp->q_core.d_ino_hardlimit),
1444 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); 1447 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
1445 cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", 1448 xfs_debug(mp, "---- inoslimit = %Lu (0x%x)",
1446 be64_to_cpu(dqp->q_core.d_ino_softlimit), 1449 be64_to_cpu(dqp->q_core.d_ino_softlimit),
1447 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); 1450 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
1448 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 1451 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
1449 be64_to_cpu(dqp->q_core.d_bcount), 1452 be64_to_cpu(dqp->q_core.d_bcount),
1450 (int)be64_to_cpu(dqp->q_core.d_bcount)); 1453 (int)be64_to_cpu(dqp->q_core.d_bcount));
1451 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 1454 xfs_debug(mp, "---- icount = %Lu (0x%x)",
1452 be64_to_cpu(dqp->q_core.d_icount), 1455 be64_to_cpu(dqp->q_core.d_icount),
1453 (int)be64_to_cpu(dqp->q_core.d_icount)); 1456 (int)be64_to_cpu(dqp->q_core.d_icount));
1454 cmn_err(CE_DEBUG, "---- btimer = %d", 1457 xfs_debug(mp, "---- btimer = %d",
1455 (int)be32_to_cpu(dqp->q_core.d_btimer)); 1458 (int)be32_to_cpu(dqp->q_core.d_btimer));
1456 cmn_err(CE_DEBUG, "---- itimer = %d", 1459 xfs_debug(mp, "---- itimer = %d",
1457 (int)be32_to_cpu(dqp->q_core.d_itimer)); 1460 (int)be32_to_cpu(dqp->q_core.d_itimer));
1458 cmn_err(CE_DEBUG, "---------------------------"); 1461 xfs_debug(mp, "---------------------------");
1459} 1462}
1460#endif 1463#endif
1461 1464
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 2a1f3dc10a02..9e0e2fa3f2c8 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -136,9 +136,8 @@ xfs_qm_dquot_logitem_push(
136 */ 136 */
137 error = xfs_qm_dqflush(dqp, 0); 137 error = xfs_qm_dqflush(dqp, 0);
138 if (error) 138 if (error)
139 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 139 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
140 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 140 __func__, error, dqp);
141 error, dqp);
142 xfs_dqunlock(dqp); 141 xfs_dqunlock(dqp);
143} 142}
144 143
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde8..69228aa8605a 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -80,7 +80,7 @@ xfs_qm_dquot_list_print(
80 int i = 0; 80 int i = 0;
81 81
82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { 82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
83 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " 83 xfs_debug(mp, " %d. \"%d (%s)\" "
84 "bcnt = %lld, icnt = %lld, refs = %d", 84 "bcnt = %lld, icnt = %lld, refs = %d",
85 i++, be32_to_cpu(dqp->q_core.d_id), 85 i++, be32_to_cpu(dqp->q_core.d_id),
86 DQFLAGTO_TYPESTR(dqp), 86 DQFLAGTO_TYPESTR(dqp),
@@ -205,7 +205,7 @@ xfs_qm_destroy(
205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { 205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
206 xfs_dqlock(dqp); 206 xfs_dqlock(dqp);
207#ifdef QUOTADEBUG 207#ifdef QUOTADEBUG
208 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); 208 xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp);
209#endif 209#endif
210 list_del_init(&dqp->q_freelist); 210 list_del_init(&dqp->q_freelist);
211 xfs_Gqm->qm_dqfrlist_cnt--; 211 xfs_Gqm->qm_dqfrlist_cnt--;
@@ -341,9 +341,7 @@ xfs_qm_mount_quotas(
341 * quotas immediately. 341 * quotas immediately.
342 */ 342 */
343 if (mp->m_sb.sb_rextents) { 343 if (mp->m_sb.sb_rextents) {
344 cmn_err(CE_NOTE, 344 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
345 "Cannot turn on quotas for realtime filesystem %s",
346 mp->m_fsname);
347 mp->m_qflags = 0; 345 mp->m_qflags = 0;
348 goto write_changes; 346 goto write_changes;
349 } 347 }
@@ -402,14 +400,13 @@ xfs_qm_mount_quotas(
402 * off, but the on disk superblock doesn't know that ! 400 * off, but the on disk superblock doesn't know that !
403 */ 401 */
404 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); 402 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
405 xfs_fs_cmn_err(CE_ALERT, mp, 403 xfs_alert(mp, "%s: Superblock update failed!",
406 "XFS mount_quotas: Superblock update failed!"); 404 __func__);
407 } 405 }
408 } 406 }
409 407
410 if (error) { 408 if (error) {
411 xfs_fs_cmn_err(CE_WARN, mp, 409 xfs_warn(mp, "Failed to initialize disk quotas.");
412 "Failed to initialize disk quotas.");
413 return; 410 return;
414 } 411 }
415 412
@@ -464,12 +461,10 @@ xfs_qm_dqflush_all(
464 struct xfs_quotainfo *q = mp->m_quotainfo; 461 struct xfs_quotainfo *q = mp->m_quotainfo;
465 int recl; 462 int recl;
466 struct xfs_dquot *dqp; 463 struct xfs_dquot *dqp;
467 int niters;
468 int error; 464 int error;
469 465
470 if (!q) 466 if (!q)
471 return 0; 467 return 0;
472 niters = 0;
473again: 468again:
474 mutex_lock(&q->qi_dqlist_lock); 469 mutex_lock(&q->qi_dqlist_lock);
475 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { 470 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
@@ -1230,13 +1225,6 @@ xfs_qm_qino_alloc(
1230 } 1225 }
1231 1226
1232 /* 1227 /*
1233 * Keep an extra reference to this quota inode. This inode is
1234 * locked exclusively and joined to the transaction already.
1235 */
1236 ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
1237 IHOLD(*ip);
1238
1239 /*
1240 * Make the changes in the superblock, and log those too. 1228 * Make the changes in the superblock, and log those too.
1241 * sbfields arg may contain fields other than *QUOTINO; 1229 * sbfields arg may contain fields other than *QUOTINO;
1242 * VERSIONNUM for example. 1230 * VERSIONNUM for example.
@@ -1264,7 +1252,7 @@ xfs_qm_qino_alloc(
1264 xfs_mod_sb(tp, sbfields); 1252 xfs_mod_sb(tp, sbfields);
1265 1253
1266 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 1254 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
1267 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); 1255 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
1268 return error; 1256 return error;
1269 } 1257 }
1270 return 0; 1258 return 0;
@@ -1299,7 +1287,7 @@ xfs_qm_reset_dqcounts(
1299 * output any warnings because it's perfectly possible to 1287 * output any warnings because it's perfectly possible to
1300 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 1288 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
1301 */ 1289 */
1302 (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, 1290 (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
1303 "xfs_quotacheck"); 1291 "xfs_quotacheck");
1304 ddq->d_bcount = 0; 1292 ddq->d_bcount = 0;
1305 ddq->d_icount = 0; 1293 ddq->d_icount = 0;
@@ -1324,14 +1312,9 @@ xfs_qm_dqiter_bufs(
1324{ 1312{
1325 xfs_buf_t *bp; 1313 xfs_buf_t *bp;
1326 int error; 1314 int error;
1327 int notcommitted;
1328 int incr;
1329 int type; 1315 int type;
1330 1316
1331 ASSERT(blkcnt > 0); 1317 ASSERT(blkcnt > 0);
1332 notcommitted = 0;
1333 incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
1334 XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
1335 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : 1318 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
1336 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); 1319 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
1337 error = 0; 1320 error = 0;
@@ -1676,7 +1659,7 @@ xfs_qm_quotacheck(
1676 */ 1659 */
1677 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); 1660 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1678 1661
1679 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1662 xfs_notice(mp, "Quotacheck needed: Please wait.");
1680 1663
1681 /* 1664 /*
1682 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset 1665 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
@@ -1754,9 +1737,9 @@ xfs_qm_quotacheck(
1754 1737
1755 error_return: 1738 error_return:
1756 if (error) { 1739 if (error) {
1757 cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " 1740 xfs_warn(mp,
1758 "Disabling quotas.", 1741 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
1759 mp->m_fsname, error); 1742 error);
1760 /* 1743 /*
1761 * We must turn off quotas. 1744 * We must turn off quotas.
1762 */ 1745 */
@@ -1764,12 +1747,11 @@ xfs_qm_quotacheck(
1764 ASSERT(xfs_Gqm != NULL); 1747 ASSERT(xfs_Gqm != NULL);
1765 xfs_qm_destroy_quotainfo(mp); 1748 xfs_qm_destroy_quotainfo(mp);
1766 if (xfs_mount_reset_sbqflags(mp)) { 1749 if (xfs_mount_reset_sbqflags(mp)) {
1767 cmn_err(CE_WARN, "XFS quotacheck %s: " 1750 xfs_warn(mp,
1768 "Failed to reset quota flags.", mp->m_fsname); 1751 "Quotacheck: Failed to reset quota flags.");
1769 } 1752 }
1770 } else { 1753 } else
1771 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); 1754 xfs_notice(mp, "Quotacheck: Done.");
1772 }
1773 return (error); 1755 return (error);
1774} 1756}
1775 1757
@@ -1863,12 +1845,14 @@ xfs_qm_dqreclaim_one(void)
1863 xfs_dquot_t *dqpout; 1845 xfs_dquot_t *dqpout;
1864 xfs_dquot_t *dqp; 1846 xfs_dquot_t *dqp;
1865 int restarts; 1847 int restarts;
1848 int startagain;
1866 1849
1867 restarts = 0; 1850 restarts = 0;
1868 dqpout = NULL; 1851 dqpout = NULL;
1869 1852
1870 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1853 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1871startagain: 1854again:
1855 startagain = 0;
1872 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1856 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1873 1857
1874 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1858 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1869,10 @@ startagain:
1885 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1869 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1886 1870
1887 trace_xfs_dqreclaim_want(dqp); 1871 trace_xfs_dqreclaim_want(dqp);
1888
1889 xfs_dqunlock(dqp);
1890 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1891 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1892 return NULL;
1893 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1872 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1894 goto startagain; 1873 restarts++;
1874 startagain = 1;
1875 goto dqunlock;
1895 } 1876 }
1896 1877
1897 /* 1878 /*
@@ -1906,23 +1887,20 @@ startagain:
1906 ASSERT(list_empty(&dqp->q_mplist)); 1887 ASSERT(list_empty(&dqp->q_mplist));
1907 list_del_init(&dqp->q_freelist); 1888 list_del_init(&dqp->q_freelist);
1908 xfs_Gqm->qm_dqfrlist_cnt--; 1889 xfs_Gqm->qm_dqfrlist_cnt--;
1909 xfs_dqunlock(dqp);
1910 dqpout = dqp; 1890 dqpout = dqp;
1911 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1891 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1912 break; 1892 goto dqunlock;
1913 } 1893 }
1914 1894
1915 ASSERT(dqp->q_hash); 1895 ASSERT(dqp->q_hash);
1916 ASSERT(!list_empty(&dqp->q_mplist)); 1896 ASSERT(!list_empty(&dqp->q_mplist));
1917 1897
1918 /* 1898 /*
1919 * Try to grab the flush lock. If this dquot is in the process of 1899 * Try to grab the flush lock. If this dquot is in the process
1920 * getting flushed to disk, we don't want to reclaim it. 1900 * of getting flushed to disk, we don't want to reclaim it.
1921 */ 1901 */
1922 if (!xfs_dqflock_nowait(dqp)) { 1902 if (!xfs_dqflock_nowait(dqp))
1923 xfs_dqunlock(dqp); 1903 goto dqunlock;
1924 continue;
1925 }
1926 1904
1927 /* 1905 /*
1928 * We have the flush lock so we know that this is not in the 1906 * We have the flush lock so we know that this is not in the
@@ -1941,11 +1919,10 @@ startagain:
1941 */ 1919 */
1942 error = xfs_qm_dqflush(dqp, 0); 1920 error = xfs_qm_dqflush(dqp, 0);
1943 if (error) { 1921 if (error) {
1944 xfs_fs_cmn_err(CE_WARN, mp, 1922 xfs_warn(mp, "%s: dquot %p flush failed",
1945 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1923 __func__, dqp);
1946 } 1924 }
1947 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1925 goto dqunlock;
1948 continue;
1949 } 1926 }
1950 1927
1951 /* 1928 /*
@@ -1967,13 +1944,8 @@ startagain:
1967 */ 1944 */
1968 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1945 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
1969 restarts++; 1946 restarts++;
1970 mutex_unlock(&dqp->q_hash->qh_lock); 1947 startagain = 1;
1971 xfs_dqfunlock(dqp); 1948 goto qhunlock;
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 goto startagain;
1977 } 1949 }
1978 1950
1979 ASSERT(dqp->q_nrefs == 0); 1951 ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1958,20 @@ startagain:
1986 xfs_Gqm->qm_dqfrlist_cnt--; 1958 xfs_Gqm->qm_dqfrlist_cnt--;
1987 dqpout = dqp; 1959 dqpout = dqp;
1988 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1960 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1961qhunlock:
1989 mutex_unlock(&dqp->q_hash->qh_lock); 1962 mutex_unlock(&dqp->q_hash->qh_lock);
1990dqfunlock: 1963dqfunlock:
1991 xfs_dqfunlock(dqp); 1964 xfs_dqfunlock(dqp);
1965dqunlock:
1992 xfs_dqunlock(dqp); 1966 xfs_dqunlock(dqp);
1993 if (dqpout) 1967 if (dqpout)
1994 break; 1968 break;
1995 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1969 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1996 return NULL; 1970 break;
1971 if (startagain) {
1972 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1973 goto again;
1974 }
1997 } 1975 }
1998 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1976 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1999 return dqpout; 1977 return dqpout;
@@ -2119,7 +2097,7 @@ xfs_qm_write_sb_changes(
2119 int error; 2097 int error;
2120 2098
2121#ifdef QUOTADEBUG 2099#ifdef QUOTADEBUG
2122 cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); 2100 xfs_notice(mp, "Writing superblock quota changes");
2123#endif 2101#endif
2124 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 2102 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
2125 if ((error = xfs_trans_reserve(tp, 0, 2103 if ((error = xfs_trans_reserve(tp, 0,
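
The xfs_qm_dqreclaim_one() rework above collapses several open-coded unlock-and-restart exits into a single ladder of unlock labels plus a startagain flag, so every exit path drops exactly the locks it still holds. A compact sketch of that unlock-ladder style with generic mutexes (nothing dquot-specific):

#include <linux/mutex.h>
#include <linux/errno.h>

struct item {
	struct mutex	a;	/* outermost lock, always taken */
	struct mutex	b;	/* only trylocked */
	struct mutex	c;	/* only trylocked, innermost */
};

static int process_item(struct item *it)
{
	int ret = -EAGAIN;

	mutex_lock(&it->a);
	if (!mutex_trylock(&it->b))
		goto out_a;
	if (!mutex_trylock(&it->c))
		goto out_b;

	/* ... all three locks held: do the real work ... */
	ret = 0;

	mutex_unlock(&it->c);
out_b:
	mutex_unlock(&it->b);
out_a:
	mutex_unlock(&it->a);
	return ret;
}

A caller that sees -EAGAIN can drop any outer list lock, optionally back off, and restart its scan, which is what the startagain flag above arranges.
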
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index c9446f1c726d..567b29b9f1b3 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -65,11 +65,6 @@ extern kmem_zone_t *qm_dqtrxzone;
65 * block in the dquot/xqm code. 65 * block in the dquot/xqm code.
66 */ 66 */
67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
68/*
69 * When doing a quotacheck, we log dquot clusters of this many FSBs at most
70 * in a single transaction. We don't want to ask for too huge a log reservation.
71 */
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 68
74typedef xfs_dqhash_t xfs_dqlist_t; 69typedef xfs_dqhash_t xfs_dqlist_t;
75 70
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 45b5cb1788ab..a0a829addca9 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -119,8 +119,7 @@ xfs_qm_newmount(
119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
121 xfs_dev_is_read_only(mp, "changing quota state")) { 121 xfs_dev_is_read_only(mp, "changing quota state")) {
122 cmn_err(CE_WARN, 122 xfs_warn(mp, "please mount with%s%s%s%s.",
123 "XFS: please mount with%s%s%s%s.",
124 (!quotaondisk ? "out quota" : ""), 123 (!quotaondisk ? "out quota" : ""),
125 (uquotaondisk ? " usrquota" : ""), 124 (uquotaondisk ? " usrquota" : ""),
126 (pquotaondisk ? " prjquota" : ""), 125 (pquotaondisk ? " prjquota" : ""),
@@ -135,7 +134,7 @@ xfs_qm_newmount(
135 */ 134 */
136 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { 135 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
137 /* 136 /*
138 * If an error occured, qm_mount_quotas code 137 * If an error occurred, qm_mount_quotas code
139 * has already disabled quotas. So, just finish 138 * has already disabled quotas. So, just finish
140 * mounting, and get on with the boring life 139 * mounting, and get on with the boring life
141 * without disk quotas. 140 * without disk quotas.
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index bdebc183223e..2dadb15d5ca9 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -41,12 +41,6 @@
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43 43
44#ifdef DEBUG
45# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
46#else
47# define qdprintk(s, args...) do { } while (0)
48#endif
49
50STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
51STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
52 uint); 46 uint);
@@ -178,7 +172,7 @@ xfs_qm_scall_quotaoff(
178 /* 172 /*
179 * Next we make the changes in the quota flag in the mount struct. 173 * Next we make the changes in the quota flag in the mount struct.
180 * This isn't protected by a particular lock directly, because we 174 * This isn't protected by a particular lock directly, because we
181 * don't want to take a mrlock everytime we depend on quotas being on. 175 * don't want to take a mrlock every time we depend on quotas being on.
182 */ 176 */
183 mp->m_qflags &= ~(flags); 177 mp->m_qflags &= ~(flags);
184 178
@@ -294,7 +288,8 @@ xfs_qm_scall_trunc_qfiles(
294 int error = 0, error2 = 0; 288 int error = 0, error2 = 0;
295 289
296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 290 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 291 xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
292 __func__, flags, mp->m_qflags);
298 return XFS_ERROR(EINVAL); 293 return XFS_ERROR(EINVAL);
299 } 294 }
300 295
@@ -318,20 +313,19 @@ xfs_qm_scall_quotaon(
318{ 313{
319 int error; 314 int error;
320 uint qf; 315 uint qf;
321 uint accflags;
322 __int64_t sbflags; 316 __int64_t sbflags;
323 317
324 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 318 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
325 /* 319 /*
326 * Switching on quota accounting must be done at mount time. 320 * Switching on quota accounting must be done at mount time.
327 */ 321 */
328 accflags = flags & XFS_ALL_QUOTA_ACCT;
329 flags &= ~(XFS_ALL_QUOTA_ACCT); 322 flags &= ~(XFS_ALL_QUOTA_ACCT);
330 323
331 sbflags = 0; 324 sbflags = 0;
332 325
333 if (flags == 0) { 326 if (flags == 0) {
334 qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); 327 xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
328 __func__, mp->m_qflags);
335 return XFS_ERROR(EINVAL); 329 return XFS_ERROR(EINVAL);
336 } 330 }
337 331
@@ -352,12 +346,13 @@ xfs_qm_scall_quotaon(
352 (flags & XFS_GQUOTA_ACCT) == 0 && 346 (flags & XFS_GQUOTA_ACCT) == 0 &&
353 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && 347 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
354 (flags & XFS_OQUOTA_ENFD))) { 348 (flags & XFS_OQUOTA_ENFD))) {
355 qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", 349 xfs_debug(mp,
356 flags, mp->m_sb.sb_qflags); 350 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
351 __func__, flags, mp->m_sb.sb_qflags);
357 return XFS_ERROR(EINVAL); 352 return XFS_ERROR(EINVAL);
358 } 353 }
359 /* 354 /*
360	 * If everything's upto-date incore, then don't waste time.	 355	 * If everything's up to date incore, then don't waste time.
361 */ 356 */
362 if ((mp->m_qflags & flags) == flags) 357 if ((mp->m_qflags & flags) == flags)
363 return XFS_ERROR(EEXIST); 358 return XFS_ERROR(EEXIST);
@@ -541,7 +536,7 @@ xfs_qm_scall_setqlim(
541 q->qi_bsoftlimit = soft; 536 q->qi_bsoftlimit = soft;
542 } 537 }
543 } else { 538 } else {
544 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 539 xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
545 } 540 }
546 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? 541 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
547 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : 542 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -557,7 +552,7 @@ xfs_qm_scall_setqlim(
557 q->qi_rtbsoftlimit = soft; 552 q->qi_rtbsoftlimit = soft;
558 } 553 }
559 } else { 554 } else {
560 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 555 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
561 } 556 }
562 557
563 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? 558 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -574,7 +569,7 @@ xfs_qm_scall_setqlim(
574 q->qi_isoftlimit = soft; 569 q->qi_isoftlimit = soft;
575 } 570 }
576 } else { 571 } else {
577 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 572 xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
578 } 573 }
579 574
580 /* 575 /*
@@ -939,10 +934,11 @@ struct mutex qcheck_lock;
939#define DQTEST_LIST_PRINT(l, NXT, title) \ 934#define DQTEST_LIST_PRINT(l, NXT, title) \
940{ \ 935{ \
941 xfs_dqtest_t *dqp; int i = 0;\ 936 xfs_dqtest_t *dqp; int i = 0;\
942 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 937 xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \
943 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ 938 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
944 dqp = (xfs_dqtest_t *)dqp->NXT) { \ 939 dqp = (xfs_dqtest_t *)dqp->NXT) { \
945 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ 940 xfs_debug(dqp->q_mount, \
941 " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
946 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ 942 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
947 dqp->d_bcount, dqp->d_icount); } \ 943 dqp->d_bcount, dqp->d_icount); } \
948} 944}
@@ -966,16 +962,17 @@ xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
966} 962}
967STATIC void 963STATIC void
968xfs_qm_dqtest_print( 964xfs_qm_dqtest_print(
969 xfs_dqtest_t *d) 965 struct xfs_mount *mp,
966 struct dqtest *d)
970{ 967{
971 cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); 968 xfs_debug(mp, "-----------DQTEST DQUOT----------------");
972 cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); 969 xfs_debug(mp, "---- dquot ID = %d", d->d_id);
973 cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); 970 xfs_debug(mp, "---- fs = 0x%p", d->q_mount);
974 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 971 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
975 d->d_bcount, (int)d->d_bcount); 972 d->d_bcount, (int)d->d_bcount);
976 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 973 xfs_debug(mp, "---- icount = %Lu (0x%x)",
977 d->d_icount, (int)d->d_icount); 974 d->d_icount, (int)d->d_icount);
978 cmn_err(CE_DEBUG, "---------------------------"); 975 xfs_debug(mp, "---------------------------");
979} 976}
980 977
981STATIC void 978STATIC void
@@ -989,12 +986,14 @@ xfs_qm_dqtest_failed(
989{ 986{
990 qmtest_nfails++; 987 qmtest_nfails++;
991 if (error) 988 if (error)
992 cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", 989 xfs_debug(dqp->q_mount,
993 d->d_id, error, reason); 990 "quotacheck failed id=%d, err=%d\nreason: %s",
991 d->d_id, error, reason);
994 else 992 else
995 cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", 993 xfs_debug(dqp->q_mount,
996 d->d_id, reason, (int)a, (int)b); 994 "quotacheck failed id=%d (%s) [%d != %d]",
997 xfs_qm_dqtest_print(d); 995 d->d_id, reason, (int)a, (int)b);
996 xfs_qm_dqtest_print(dqp->q_mount, d);
998 if (dqp) 997 if (dqp)
999 xfs_qm_dqprint(dqp); 998 xfs_qm_dqprint(dqp);
1000} 999}
@@ -1021,9 +1020,9 @@ xfs_dqtest_cmp2(
1021 be64_to_cpu(dqp->q_core.d_bcount) >= 1020 be64_to_cpu(dqp->q_core.d_bcount) >=
1022 be64_to_cpu(dqp->q_core.d_blk_softlimit)) { 1021 be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
1023 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { 1022 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
1024 cmn_err(CE_DEBUG, 1023 xfs_debug(dqp->q_mount,
1025 "%d [%s] [0x%p] BLK TIMER NOT STARTED", 1024 "%d [%s] BLK TIMER NOT STARTED",
1026 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1025 d->d_id, DQFLAGTO_TYPESTR(d));
1027 err++; 1026 err++;
1028 } 1027 }
1029 } 1028 }
@@ -1031,16 +1030,16 @@ xfs_dqtest_cmp2(
1031 be64_to_cpu(dqp->q_core.d_icount) >= 1030 be64_to_cpu(dqp->q_core.d_icount) >=
1032 be64_to_cpu(dqp->q_core.d_ino_softlimit)) { 1031 be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
1033 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { 1032 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
1034 cmn_err(CE_DEBUG, 1033 xfs_debug(dqp->q_mount,
1035 "%d [%s] [0x%p] INO TIMER NOT STARTED", 1034 "%d [%s] INO TIMER NOT STARTED",
1036 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1035 d->d_id, DQFLAGTO_TYPESTR(d));
1037 err++; 1036 err++;
1038 } 1037 }
1039 } 1038 }
1040#ifdef QUOTADEBUG 1039#ifdef QUOTADEBUG
1041 if (!err) { 1040 if (!err) {
1042 cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", 1041 xfs_debug(dqp->q_mount, "%d [%s] qchecked",
1043 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1042 d->d_id, DQFLAGTO_TYPESTR(d));
1044 } 1043 }
1045#endif 1044#endif
1046 return (err); 1045 return (err);
@@ -1137,8 +1136,8 @@ xfs_qm_internalqcheck_adjust(
1137 1136
1138 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1137 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1139 *res = BULKSTAT_RV_NOTHING; 1138 *res = BULKSTAT_RV_NOTHING;
1140 qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", 1139 xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n",
1141 (unsigned long long) ino, 1140 __func__, (unsigned long long) ino,
1142 (unsigned long long) mp->m_sb.sb_uquotino, 1141 (unsigned long long) mp->m_sb.sb_uquotino,
1143 (unsigned long long) mp->m_sb.sb_gquotino); 1142 (unsigned long long) mp->m_sb.sb_gquotino);
1144 return XFS_ERROR(EINVAL); 1143 return XFS_ERROR(EINVAL);
@@ -1223,12 +1222,12 @@ xfs_qm_internalqcheck(
1223 xfs_qm_internalqcheck_adjust, 1222 xfs_qm_internalqcheck_adjust,
1224 0, NULL, &done); 1223 0, NULL, &done);
1225 if (error) { 1224 if (error) {
1226 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); 1225 xfs_debug(mp, "Bulkstat returned error 0x%x", error);
1227 break; 1226 break;
1228 } 1227 }
1229 } while (!done); 1228 } while (!done);
1230 1229
1231 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1230 xfs_debug(mp, "Checking results against system dquots");
1232 for (i = 0; i < qmtest_hashmask; i++) { 1231 for (i = 0; i < qmtest_hashmask; i++) {
1233 xfs_dqtest_t *d, *n; 1232 xfs_dqtest_t *d, *n;
1234 xfs_dqhash_t *h; 1233 xfs_dqhash_t *h;
@@ -1246,10 +1245,10 @@ xfs_qm_internalqcheck(
1246 } 1245 }
1247 1246
1248 if (qmtest_nfails) { 1247 if (qmtest_nfails) {
1249 cmn_err(CE_DEBUG, "******** quotacheck failed ********"); 1248 xfs_debug(mp, "******** quotacheck failed ********");
1250 cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); 1249 xfs_debug(mp, "failures = %d", qmtest_nfails);
1251 } else { 1250 } else {
1252 cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); 1251 xfs_debug(mp, "******** quotacheck successful! ********");
1253 } 1252 }
1254 kmem_free(qmtest_udqtab); 1253 kmem_free(qmtest_udqtab);
1255 kmem_free(qmtest_gdqtab); 1254 kmem_free(qmtest_gdqtab);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 7de91d1b75c0..2a3648731331 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -643,8 +643,9 @@ xfs_trans_dqresv(
643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && 643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { 644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
645#ifdef QUOTADEBUG 645#ifdef QUOTADEBUG
646 cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" 646 xfs_debug(mp,
647 " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); 647 "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?",
648 nblks, *resbcountp, hardlimit);
648#endif 649#endif
649 if (nblks > 0) { 650 if (nblks > 0) {
650 /* 651 /*
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index 975aa10e1a47..000000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,115 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <xfs.h>
19#include "debug.h"
20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_error.h"
27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void
40cmn_err(register int level, char *fmt, ...)
41{
42 char *fp = fmt;
43 int len;
44 ulong flags;
45 va_list ap;
46
47 level &= XFS_ERR_MASK;
48 if (level > XFS_MAX_ERR_LEVEL)
49 level = XFS_MAX_ERR_LEVEL;
50 spin_lock_irqsave(&xfs_err_lock,flags);
51 va_start(ap, fmt);
52 if (*fmt == '!') fp++;
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62}
63
64void
65xfs_fs_vcmn_err(
66 int level,
67 struct xfs_mount *mp,
68 char *fmt,
69 va_list ap)
70{
71 unsigned long flags;
72 int len = 0;
73
74 level &= XFS_ERR_MASK;
75 if (level > XFS_MAX_ERR_LEVEL)
76 level = XFS_MAX_ERR_LEVEL;
77
78 spin_lock_irqsave(&xfs_err_lock,flags);
79
80 if (mp) {
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
82
83 /*
84 * Skip the printk if we can't print anything useful
85 * due to an over-long device name.
86 */
87 if (len >= sizeof(message))
88 goto out;
89 }
90
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
92 if (len >= sizeof(message))
93 len = sizeof(message) - 1;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96
97 printk("%s%s\n", err_level[level], message);
98 out:
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100
101 BUG_ON(level == CE_PANIC);
102}
103
104void
105assfail(char *expr, char *file, int line)
106{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
108 BUG();
109}
110
111void
112xfs_hex_dump(void *p, int length)
113{
114 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
115}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index d2d20462fd4f..000000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,54 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_DEBUG_H__
19#define __XFS_SUPPORT_DEBUG_H__
20
21#include <stdarg.h>
22
23#define CE_DEBUG 7 /* debug */
24#define CE_CONT 6 /* continuation */
25#define CE_NOTE 5 /* notice */
26#define CE_WARN 4 /* warning */
27#define CE_ALERT 1 /* alert */
28#define CE_PANIC 0 /* panic */
29
30extern void cmn_err(int, char *, ...)
31 __attribute__ ((format (printf, 2, 3)));
32extern void assfail(char *expr, char *f, int l);
33
34#define ASSERT_ALWAYS(expr) \
35 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
36
37#ifndef DEBUG
38#define ASSERT(expr) ((void)0)
39
40#ifndef STATIC
41# define STATIC static noinline
42#endif
43
44#else /* DEBUG */
45
46#define ASSERT(expr) \
47 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
48
49#ifndef STATIC
50# define STATIC noinline
51#endif
52
53#endif /* DEBUG */
54#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d7..11dd72070cbb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
 #ifdef CONFIG_XFS_POSIX_ACL
-extern int xfs_check_acl(struct inode *inode, int mask);
+extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
 extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
 extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a6c022..58632cc17f2d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
 
 	atomic_t	pagf_fstrms;	/* # of filestreams active in this AG */
 
-	rwlock_t	pag_ici_lock;	/* incore inode lock */
+	spinlock_t	pag_ici_lock;	/* incore inode cache lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
 	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc439ca5..27d64d752eab 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-static int
-xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
-		    xfs_agblock_t bno, xfs_extlen_t len);
-
 /*
  * Prototypes for per-ag allocation routines
  */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
  * Lookup the first record less than or equal to [bno, len]
  * in the btree given by cur.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_alloc_lookup_le(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		bno,	/* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
 /*
  * Get the data from the pointed-to record.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_alloc_get_rec(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		*bno,	/* output: starting block of extent */
@@ -151,10 +147,9 @@ xfs_alloc_get_rec(
151 */ 147 */
152STATIC void 148STATIC void
153xfs_alloc_compute_aligned( 149xfs_alloc_compute_aligned(
150 xfs_alloc_arg_t *args, /* allocation argument structure */
154 xfs_agblock_t foundbno, /* starting block in found extent */ 151 xfs_agblock_t foundbno, /* starting block in found extent */
155 xfs_extlen_t foundlen, /* length in found extent */ 152 xfs_extlen_t foundlen, /* length in found extent */
156 xfs_extlen_t alignment, /* alignment for allocation */
157 xfs_extlen_t minlen, /* minimum length for allocation */
158 xfs_agblock_t *resbno, /* result block number */ 153 xfs_agblock_t *resbno, /* result block number */
159 xfs_extlen_t *reslen) /* result length */ 154 xfs_extlen_t *reslen) /* result length */
160{ 155{
@@ -162,8 +157,8 @@ xfs_alloc_compute_aligned(
162 xfs_extlen_t diff; 157 xfs_extlen_t diff;
163 xfs_extlen_t len; 158 xfs_extlen_t len;
164 159
165 if (alignment > 1 && foundlen >= minlen) { 160 if (args->alignment > 1 && foundlen >= args->minlen) {
166 bno = roundup(foundbno, alignment); 161 bno = roundup(foundbno, args->alignment);
167 diff = bno - foundbno; 162 diff = bno - foundbno;
168 len = diff >= foundlen ? 0 : foundlen - diff; 163 len = diff >= foundlen ? 0 : foundlen - diff;
169 } else { 164 } else {
@@ -468,6 +463,27 @@ xfs_alloc_read_agfl(
468 return 0; 463 return 0;
469} 464}
470 465
466STATIC int
467xfs_alloc_update_counters(
468 struct xfs_trans *tp,
469 struct xfs_perag *pag,
470 struct xfs_buf *agbp,
471 long len)
472{
473 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
474
475 pag->pagf_freeblks += len;
476 be32_add_cpu(&agf->agf_freeblks, len);
477
478 xfs_trans_agblocks_delta(tp, len);
479 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
480 be32_to_cpu(agf->agf_length)))
481 return EFSCORRUPTED;
482
483 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
484 return 0;
485}
486
471/* 487/*
472 * Allocation group level functions. 488 * Allocation group level functions.
473 */ 489 */
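
The hunk above introduces xfs_alloc_update_counters(), which folds the per-AG and on-disk AGF free-block accounting into one place and reports a bad counter as EFSCORRUPTED instead of asserting. The standalone C sketch below is not kernel code: the struct names, the numeric EFSCORRUPTED value and the missing endian conversion and logging are simplifications used only to show the "apply delta, then validate against the AG length" pattern.

#include <stdint.h>
#include <stdio.h>

#define EFSCORRUPTED 990	/* placeholder for the XFS error value */

struct fake_agf   { uint32_t agf_freeblks; uint32_t agf_length; };
struct fake_perag { long pagf_freeblks; };

/* Apply the delta to both counters, then sanity-check the result. */
static int fake_alloc_update_counters(struct fake_perag *pag,
				      struct fake_agf *agf, long len)
{
	pag->pagf_freeblks += len;	/* in-core per-AG counter */
	agf->agf_freeblks += len;	/* on-disk (logged) counter */

	/* A free-block count above the AG size means the AGF is corrupt. */
	if (agf->agf_freeblks > agf->agf_length)
		return EFSCORRUPTED;
	return 0;
}

int main(void)
{
	struct fake_agf   agf = { .agf_freeblks = 100, .agf_length = 1000 };
	struct fake_perag pag = { .pagf_freeblks = 100 };

	printf("alloc 40 blocks -> %d\n",
	       fake_alloc_update_counters(&pag, &agf, -40));
	printf("free 2000 blocks -> %d (corruption detected)\n",
	       fake_alloc_update_counters(&pag, &agf, 2000));
	return 0;
}
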
@@ -509,49 +525,44 @@ xfs_alloc_ag_vextent(
509 ASSERT(0); 525 ASSERT(0);
510 /* NOTREACHED */ 526 /* NOTREACHED */
511 } 527 }
512 if (error) 528
529 if (error || args->agbno == NULLAGBLOCK)
513 return error; 530 return error;
514 /*
515 * If the allocation worked, need to change the agf structure
516 * (and log it), and the superblock.
517 */
518 if (args->agbno != NULLAGBLOCK) {
519 xfs_agf_t *agf; /* allocation group freelist header */
520 long slen = (long)args->len;
521 531
522 ASSERT(args->len >= args->minlen && args->len <= args->maxlen); 532 ASSERT(args->len >= args->minlen);
523 ASSERT(!(args->wasfromfl) || !args->isfl); 533 ASSERT(args->len <= args->maxlen);
524 ASSERT(args->agbno % args->alignment == 0); 534 ASSERT(!args->wasfromfl || !args->isfl);
525 if (!(args->wasfromfl)) { 535 ASSERT(args->agbno % args->alignment == 0);
526 536
527 agf = XFS_BUF_TO_AGF(args->agbp); 537 if (!args->wasfromfl) {
528 be32_add_cpu(&agf->agf_freeblks, -(args->len)); 538 error = xfs_alloc_update_counters(args->tp, args->pag,
529 xfs_trans_agblocks_delta(args->tp, 539 args->agbp,
530 -((long)(args->len))); 540 -((long)(args->len)));
531 args->pag->pagf_freeblks -= args->len; 541 if (error)
532 ASSERT(be32_to_cpu(agf->agf_freeblks) <= 542 return error;
533 be32_to_cpu(agf->agf_length)); 543
534 xfs_alloc_log_agf(args->tp, args->agbp, 544 /*
535 XFS_AGF_FREEBLKS); 545 * Search the busylist for these blocks and mark the
536 /* 546 * transaction as synchronous if blocks are found. This
537 * Search the busylist for these blocks and mark the 547 * avoids the need to block due to a synchronous log
538 * transaction as synchronous if blocks are found. This 548 * force to ensure correct ordering as the synchronous
539 * avoids the need to block due to a synchronous log 549 * transaction will guarantee that for us.
540 * force to ensure correct ordering as the synchronous 550 */
541 * transaction will guarantee that for us. 551 if (xfs_alloc_busy_search(args->mp, args->agno,
542 */ 552 args->agbno, args->len))
543 if (xfs_alloc_busy_search(args->mp, args->agno, 553 xfs_trans_set_sync(args->tp);
544 args->agbno, args->len))
545 xfs_trans_set_sync(args->tp);
546 }
547 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp,
549 args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
550 XFS_TRANS_SB_FDBLOCKS, -slen);
551 XFS_STATS_INC(xs_allocx);
552 XFS_STATS_ADD(xs_allocb, args->len);
553 } 554 }
554 return 0; 555
556 if (!args->isfl) {
557 xfs_trans_mod_sb(args->tp, args->wasdel ?
558 XFS_TRANS_SB_RES_FDBLOCKS :
559 XFS_TRANS_SB_FDBLOCKS,
560 -((long)(args->len)));
561 }
562
563 XFS_STATS_INC(xs_allocx);
564 XFS_STATS_ADD(xs_allocb, args->len);
565 return error;
555} 566}
556 567
557/* 568/*
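
The reworked tail of xfs_alloc_ag_vextent() above checks the just-allocated range against the busy-extent list and makes the transaction synchronous on overlap, so reuse of recently freed blocks is ordered by the log. The userspace sketch below only illustrates that overlap test; the linear array and names are assumptions for illustration, a simplification of the kernel's per-AG busy extent tracking.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t agblock_t;
typedef uint32_t extlen_t;

/* One "busy" extent: freed in a committed but not yet stable transaction. */
struct busy_extent { agblock_t bno; extlen_t len; };

/* Sketch of the search: does [bno, bno + len) overlap any busy extent? */
static bool busy_search(const struct busy_extent *busy, int nbusy,
			agblock_t bno, extlen_t len)
{
	for (int i = 0; i < nbusy; i++) {
		agblock_t bend = busy[i].bno + busy[i].len;
		if (bno < bend && busy[i].bno < bno + len)
			return true;
	}
	return false;
}

int main(void)
{
	struct busy_extent busy[] = { { 100, 16 }, { 400, 8 } };
	bool sync_needed = busy_search(busy, 2, 104, 4);

	/* The real code calls xfs_trans_set_sync() here so the allocating
	 * transaction cannot reach disk before the freeing one. */
	printf("overlaps busy extent: %s -> %s\n",
	       sync_needed ? "yes" : "no",
	       sync_needed ? "commit synchronously" : "async commit is fine");
	return 0;
}
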
@@ -577,61 +588,58 @@ xfs_alloc_ag_vextent_exact(
577 xfs_extlen_t rlen; /* length of returned extent */ 588 xfs_extlen_t rlen; /* length of returned extent */
578 589
579 ASSERT(args->alignment == 1); 590 ASSERT(args->alignment == 1);
591
580 /* 592 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 593 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 594 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 595 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 596 args->agno, XFS_BTNUM_BNO);
597
585 /* 598 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 599 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 600 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 601 * if any free block does.
589 */ 602 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 603 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
604 if (error)
591 goto error0; 605 goto error0;
592 if (!i) { 606 if (!i)
593 /* 607 goto not_found;
594 * Didn't find it, return null. 608
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 609 /*
601 * Grab the freespace record. 610 * Grab the freespace record.
602 */ 611 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 612 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
613 if (error)
604 goto error0; 614 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 615 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 616 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 617 minend = args->agbno + args->minlen;
608 maxend = args->agbno + args->maxlen; 618 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen; 619 fend = fbno + flen;
620
610 /* 621 /*
611 * Give up if the freespace isn't long enough for the minimum request. 622 * Give up if the freespace isn't long enough for the minimum request.
612 */ 623 */
613 if (fend < minend) { 624 if (fend < minend)
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 625 goto not_found;
615 args->agbno = NULLAGBLOCK; 626
616 return 0;
617 }
618 /* 627 /*
619 * End of extent will be smaller of the freespace end and the 628 * End of extent will be smaller of the freespace end and the
620 * maximal requested end. 629 * maximal requested end.
621 */ 630 *
622 end = XFS_AGBLOCK_MIN(fend, maxend);
623 /*
624 * Fix the length according to mod and prod if given. 631 * Fix the length according to mod and prod if given.
625 */ 632 */
633 end = XFS_AGBLOCK_MIN(fend, maxend);
626 args->len = end - args->agbno; 634 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 635 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 636 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 637 goto not_found;
630 return 0; 638
631 }
632 rlen = args->len; 639 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 640 ASSERT(args->agbno + rlen <= fend);
634 end = args->agbno + rlen; 641 end = args->agbno + rlen;
642
635 /* 643 /*
636 * We are allocating agbno for rlen [agbno .. end] 644 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 645 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +648,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 648 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 649 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 650 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 651 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 652 args->len, XFSA_FIXUP_BNO_OK);
653 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 654 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 655 goto error0;
647 } 656 }
657
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 658 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 659 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 660
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 661 args->wasfromfl = 0;
662 trace_xfs_alloc_exact_done(args);
663 return 0;
664
665not_found:
666 /* Didn't find it, return null. */
667 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
668 args->agbno = NULLAGBLOCK;
669 trace_xfs_alloc_exact_notfound(args);
653 return 0; 670 return 0;
654 671
655error0: 672error0:
@@ -659,6 +676,94 @@ error0:
659} 676}
660 677
661/* 678/*
679 * Search the btree in a given direction via the search cursor and compare
680 * the records found against the good extent we've already found.
681 */
682STATIC int
683xfs_alloc_find_best_extent(
684 struct xfs_alloc_arg *args, /* allocation argument structure */
685 struct xfs_btree_cur **gcur, /* good cursor */
686 struct xfs_btree_cur **scur, /* searching cursor */
687 xfs_agblock_t gdiff, /* difference for search comparison */
688 xfs_agblock_t *sbno, /* extent found by search */
689 xfs_extlen_t *slen,
690 xfs_extlen_t *slena, /* aligned length */
691 int dir) /* 0 = search right, 1 = search left */
692{
693 xfs_agblock_t bno;
694 xfs_agblock_t new;
695 xfs_agblock_t sdiff;
696 int error;
697 int i;
698
699 /* The good extent is perfect, no need to search. */
700 if (!gdiff)
701 goto out_use_good;
702
703 /*
704 * Look until we find a better one, run out of space or run off the end.
705 */
706 do {
707 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
708 if (error)
709 goto error0;
710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
711 xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
712
713 /*
714 * The good extent is closer than this one.
715 */
716 if (!dir) {
717 if (bno >= args->agbno + gdiff)
718 goto out_use_good;
719 } else {
720 if (bno <= args->agbno - gdiff)
721 goto out_use_good;
722 }
723
724 /*
725 * Same distance, compare length and pick the best.
726 */
727 if (*slena >= args->minlen) {
728 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
729 xfs_alloc_fix_len(args);
730
731 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
732 args->alignment, *sbno,
733 *slen, &new);
734
735 /*
736 * Choose closer size and invalidate other cursor.
737 */
738 if (sdiff < gdiff)
739 goto out_use_search;
740 goto out_use_good;
741 }
742
743 if (!dir)
744 error = xfs_btree_increment(*scur, 0, &i);
745 else
746 error = xfs_btree_decrement(*scur, 0, &i);
747 if (error)
748 goto error0;
749 } while (i);
750
751out_use_good:
752 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
753 *scur = NULL;
754 return 0;
755
756out_use_search:
757 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
758 *gcur = NULL;
759 return 0;
760
761error0:
762 /* caller invalidates cursors */
763 return error;
764}
765
766/*
662 * Allocate a variable extent near bno in the allocation group agno. 767 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 768 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 769 * and of the form k * prod + mod unless there's nothing that large.
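
xfs_alloc_find_best_extent() above keeps searching only while a candidate could still be closer to the wanted block than the extent already in hand. The sketch below reproduces that distance comparison for a left-hand search with made-up block numbers; it deliberately omits the alignment and length trimming the kernel routine also performs, so it is an illustration, not the real function.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t agblock_t;

/* Distance of a candidate start block from the wanted block. */
static agblock_t diff_from_wanted(agblock_t wanted, agblock_t candidate)
{
	return candidate > wanted ? candidate - wanted : wanted - candidate;
}

int main(void)
{
	agblock_t wanted = 1000;
	agblock_t good   = 1020;			/* extent already found, to the right */
	agblock_t gdiff  = diff_from_wanted(wanted, good);	/* 20 */

	/* Candidates to the left, in the order a btree decrement visits them. */
	agblock_t left[] = { 995, 970, 940 };

	for (size_t i = 0; i < sizeof(left) / sizeof(left[0]); i++) {
		/* Searching left: once a candidate starts at or before
		 * wanted - gdiff it can only be farther away; keep "good". */
		if (left[i] <= wanted - gdiff) {
			printf("bno %u: beyond wanted - gdiff, keep the good extent\n",
			       left[i]);
			break;
		}
		agblock_t sdiff = diff_from_wanted(wanted, left[i]);
		if (sdiff < gdiff) {
			printf("bno %u: sdiff %u < gdiff %u, use this candidate\n",
			       left[i], sdiff, gdiff);
			break;
		}
		printf("bno %u: sdiff %u >= gdiff %u, keep searching\n",
		       left[i], sdiff, gdiff);
	}
	return 0;
}
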
@@ -775,8 +880,8 @@ xfs_alloc_ag_vextent_near(
775 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 880 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
776 goto error0; 881 goto error0;
777 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 882 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
778 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 883 xfs_alloc_compute_aligned(args, ltbno, ltlen,
779 args->minlen, &ltbnoa, &ltlena); 884 &ltbnoa, &ltlena);
780 if (ltlena < args->minlen) 885 if (ltlena < args->minlen)
781 continue; 886 continue;
782 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 887 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
@@ -896,8 +1001,8 @@ xfs_alloc_ag_vextent_near(
896 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 1001 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
897 goto error0; 1002 goto error0;
898 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1003 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
899 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 1004 xfs_alloc_compute_aligned(args, ltbno, ltlen,
900 args->minlen, &ltbnoa, &ltlena); 1005 &ltbnoa, &ltlena);
901 if (ltlena >= args->minlen) 1006 if (ltlena >= args->minlen)
902 break; 1007 break;
903 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1008 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -912,8 +1017,8 @@ xfs_alloc_ag_vextent_near(
912 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 1017 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
913 goto error0; 1018 goto error0;
914 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1019 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
915 xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, 1020 xfs_alloc_compute_aligned(args, gtbno, gtlen,
916 args->minlen, &gtbnoa, &gtlena); 1021 &gtbnoa, &gtlena);
917 if (gtlena >= args->minlen) 1022 if (gtlena >= args->minlen)
918 break; 1023 break;
919 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1024 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -925,203 +1030,45 @@ xfs_alloc_ag_vextent_near(
925 } 1030 }
926 } 1031 }
927 } while (bno_cur_lt || bno_cur_gt); 1032 } while (bno_cur_lt || bno_cur_gt);
1033
928 /* 1034 /*
929 * Got both cursors still active, need to find better entry. 1035 * Got both cursors still active, need to find better entry.
930 */ 1036 */
931 if (bno_cur_lt && bno_cur_gt) { 1037 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1038 if (ltlena >= args->minlen) {
936 /* 1039 /*
937 * Fix up the length. 1040 * Left side is good, look for a right side entry.
938 */ 1041 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1042 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1043 xfs_alloc_fix_len(args);
941 rlen = args->len; 1044 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
943 args->alignment, ltbno, ltlen, &ltnew); 1045 args->alignment, ltbno, ltlen, &ltnew);
1046
1047 error = xfs_alloc_find_best_extent(args,
1048 &bno_cur_lt, &bno_cur_gt,
1049 ltdiff, &gtbno, &gtlen, &gtlena,
1050 0 /* search right */);
1051 } else {
1052 ASSERT(gtlena >= args->minlen);
1053
944 /* 1054 /*
945 * Not perfect. 1055 * Right side is good, look for a left side entry.
946 */
947 if (ltdiff) {
948 /*
949 * Look until we find a better one, run out of
950 * space, or run off the end.
951 */
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /*
1034 * Fix up the length.
1035 */ 1056 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1057 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1058 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1059 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1040 args->alignment, gtbno, gtlen, &gtnew); 1060 args->alignment, gtbno, gtlen, &gtnew);
1041 /* 1061
1042 * Right side entry isn't perfect. 1062 error = xfs_alloc_find_best_extent(args,
1043 */ 1063 &bno_cur_gt, &bno_cur_lt,
1044 if (gtdiff) { 1064 gtdiff, &ltbno, &ltlen, &ltlena,
1045 /* 1065 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1066 }
1067
1068 if (error)
1069 goto error0;
1124 } 1070 }
1071
1125 /* 1072 /*
1126 * If we couldn't get anything, give up. 1073 * If we couldn't get anything, give up.
1127 */ 1074 */
@@ -1130,6 +1077,7 @@ xfs_alloc_ag_vextent_near(
1130 args->agbno = NULLAGBLOCK; 1077 args->agbno = NULLAGBLOCK;
1131 return 0; 1078 return 0;
1132 } 1079 }
1080
1133 /* 1081 /*
1134 * At this point we have selected a freespace entry, either to the 1082 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1083 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1094,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1094 j = 1;
1147 } else 1095 } else
1148 j = 0; 1096 j = 0;
1097
1149 /* 1098 /*
1150 * Fix up the length and compute the useful address. 1099 * Fix up the length and compute the useful address.
1151 */ 1100 */
@@ -1248,8 +1197,7 @@ xfs_alloc_ag_vextent_size(
1248 * once aligned; if not, we search left for something better. 1197 * once aligned; if not, we search left for something better.
1249 * This can't happen in the second case above. 1198 * This can't happen in the second case above.
1250 */ 1199 */
1251 xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, 1200 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1252 &rbno, &rlen);
1253 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1201 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1254 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1202 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1255 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1203 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1274,8 +1222,8 @@ xfs_alloc_ag_vextent_size(
1274 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1222 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1275 if (flen < bestrlen) 1223 if (flen < bestrlen)
1276 break; 1224 break;
1277 xfs_alloc_compute_aligned(fbno, flen, args->alignment, 1225 xfs_alloc_compute_aligned(args, fbno, flen,
1278 args->minlen, &rbno, &rlen); 1226 &rbno, &rlen);
1279 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1227 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1280 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1228 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1281 (rlen <= flen && rbno + rlen <= fbno + flen), 1229 (rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1453,6 +1401,7 @@ xfs_free_ag_extent(
1453 xfs_mount_t *mp; /* mount point struct for filesystem */ 1401 xfs_mount_t *mp; /* mount point struct for filesystem */
1454 xfs_agblock_t nbno; /* new starting block of freespace */ 1402 xfs_agblock_t nbno; /* new starting block of freespace */
1455 xfs_extlen_t nlen; /* new length of freespace */ 1403 xfs_extlen_t nlen; /* new length of freespace */
1404 xfs_perag_t *pag; /* per allocation group data */
1456 1405
1457 mp = tp->t_mountp; 1406 mp = tp->t_mountp;
1458 /* 1407 /*
@@ -1651,30 +1600,20 @@ xfs_free_ag_extent(
1651 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1652 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1601 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1653 cnt_cur = NULL; 1602 cnt_cur = NULL;
1603
1654 /* 1604 /*
1655 * Update the freespace totals in the ag and superblock. 1605 * Update the freespace totals in the ag and superblock.
1656 */ 1606 */
1657 { 1607 pag = xfs_perag_get(mp, agno);
1658 xfs_agf_t *agf; 1608 error = xfs_alloc_update_counters(tp, pag, agbp, len);
1659 xfs_perag_t *pag; /* per allocation group data */ 1609 xfs_perag_put(pag);
1660 1610 if (error)
1661 pag = xfs_perag_get(mp, agno); 1611 goto error0;
1662 pag->pagf_freeblks += len; 1612
1663 xfs_perag_put(pag); 1613 if (!isfl)
1664 1614 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1665 agf = XFS_BUF_TO_AGF(agbp); 1615 XFS_STATS_INC(xs_freex);
1666 be32_add_cpu(&agf->agf_freeblks, len); 1616 XFS_STATS_ADD(xs_freeb, len);
1667 xfs_trans_agblocks_delta(tp, len);
1668 XFS_WANT_CORRUPTED_GOTO(
1669 be32_to_cpu(agf->agf_freeblks) <=
1670 be32_to_cpu(agf->agf_length),
1671 error0);
1672 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
1673 if (!isfl)
1674 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1675 XFS_STATS_INC(xs_freex);
1676 XFS_STATS_ADD(xs_freeb, len);
1677 }
1678 1617
1679 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1618 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1680 1619
@@ -2456,17 +2395,33 @@ xfs_free_extent(
2456 memset(&args, 0, sizeof(xfs_alloc_arg_t)); 2395 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2457 args.tp = tp; 2396 args.tp = tp;
2458 args.mp = tp->t_mountp; 2397 args.mp = tp->t_mountp;
2398
2399 /*
 2400 * validate that the block number is legal - this enables us to detect
2401 * and handle a silent filesystem corruption rather than crashing.
2402 */
2459 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2403 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2460 ASSERT(args.agno < args.mp->m_sb.sb_agcount); 2404 if (args.agno >= args.mp->m_sb.sb_agcount)
2405 return EFSCORRUPTED;
2406
2461 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2407 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2408 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2409 return EFSCORRUPTED;
2410
2462 args.pag = xfs_perag_get(args.mp, args.agno); 2411 args.pag = xfs_perag_get(args.mp, args.agno);
2463 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) 2412 ASSERT(args.pag);
2413
2414 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2415 if (error)
2464 goto error0; 2416 goto error0;
2465#ifdef DEBUG 2417
2466 ASSERT(args.agbp != NULL); 2418 /* validate the extent size is legal now we have the agf locked */
2467 ASSERT((args.agbno + len) <= 2419 if (args.agbno + len >
2468 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); 2420 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
2469#endif 2421 error = EFSCORRUPTED;
2422 goto error0;
2423 }
2424
2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2425 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2471error0: 2426error0:
2472 xfs_perag_put(args.pag); 2427 xfs_perag_put(args.pag);
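
The xfs_free_extent() change above replaces DEBUG-only ASSERTs with range checks so a corrupt block number is reported as EFSCORRUPTED instead of crashing. A hedged standalone sketch of that validation follows; the geometry struct, the agblklog-based split and the numeric EFSCORRUPTED value are illustrative assumptions standing in for the real XFS_FSB_TO_AGNO/XFS_FSB_TO_AGBNO macros and the AGF-length check done once the AGF is locked.

#include <stdint.h>
#include <stdio.h>

#define EFSCORRUPTED 990	/* placeholder for the XFS error value */

/* Hypothetical geometry: the low agblklog bits of a filesystem block number
 * are the block-within-AG, the remaining bits are the AG number. */
struct geom { uint32_t agcount; uint32_t agblocks; uint32_t agblklog; };

static int check_free_extent(const struct geom *g, uint64_t fsbno, uint32_t len)
{
	uint32_t agno  = (uint32_t)(fsbno >> g->agblklog);
	uint32_t agbno = (uint32_t)(fsbno & ((1ULL << g->agblklog) - 1));

	/* Silent corruption (e.g. a bad btree pointer) shows up as an
	 * out-of-range AG or block number: report it instead of crashing. */
	if (agno >= g->agcount)
		return EFSCORRUPTED;
	if (agbno >= g->agblocks)
		return EFSCORRUPTED;
	/* Approximate the locked AGF length check with sb geometry here. */
	if (agbno + len > g->agblocks)
		return EFSCORRUPTED;
	return 0;
}

int main(void)
{
	struct geom g = { .agcount = 4, .agblocks = 4096, .agblklog = 12 };

	printf("valid extent    -> %d\n", check_free_extent(&g, (2ULL << 12) + 100, 16));
	printf("bad AG number   -> %d\n", check_free_extent(&g, (9ULL << 12) + 100, 16));
	printf("runs off AG end -> %d\n", check_free_extent(&g, (1ULL << 12) + 4090, 16));
	return 0;
}
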
@@ -2676,7 +2631,7 @@ restart:
2676 * will require a synchronous transaction, but it can still be 2631 * will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial or exact match. 2632 * used to distinguish between a partial or exact match.
2678 */ 2633 */
2679static int 2634int
2680xfs_alloc_busy_search( 2635xfs_alloc_busy_search(
2681 struct xfs_mount *mp, 2636 struct xfs_mount *mp,
2682 xfs_agnumber_t agno, 2637 xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..d0b3bc72005b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
74#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
75 76
76/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
 79 * allocation maximum size to the size of the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
86 * The AG headers are sector sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
93/*
77 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
78 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
79 * down several levels of the stack. 96 * down several levels of the stack.
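
To make the new XFS_ALLOC_AG_MAX_USABLE() limit concrete, the sketch below evaluates the same idea for a hypothetical geometry (512-byte sectors, 4 KiB blocks, a 1 GiB AG); with those assumptions the four sector-sized headers round up to one block, so eight blocks per AG are set aside. This is an illustration of the comment above, not the kernel macro itself.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical geometry: 4096-byte filesystem blocks, 512-byte sectors. */
#define BLOCK_SIZE	4096u
#define SECTOR_SIZE	512u

/* Rough equivalent of XFS_ALLOC_AG_MAX_USABLE(): take the AG size, subtract
 * the space the four sector-sized AG headers occupy (rounded up to whole
 * filesystem blocks) and the 7 single-block structures (btree roots plus
 * the AGFL blocks that XFS_ALLOC_SET_ASIDE() reserves). */
static uint32_t ag_max_usable(uint32_t agblocks)
{
	uint32_t hdr_bytes  = 4 * SECTOR_SIZE;	/* sb, AGF, AGI, AGFL */
	uint32_t hdr_blocks = (hdr_bytes + BLOCK_SIZE - 1) / BLOCK_SIZE;

	return agblocks - hdr_blocks - 7;
}

int main(void)
{
	/* A 1 GiB allocation group at 4 KiB blocks has 262144 blocks. */
	printf("max usable blocks: %u of %u\n", ag_max_usable(262144), 262144);
	return 0;
}
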
@@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 135 struct xfs_perag *pag);
119 136
120#ifdef __KERNEL__ 137#ifdef __KERNEL__
121
122void 138void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
124 xfs_agnumber_t agno, 140 xfs_agblock_t bno, xfs_extlen_t len);
125 xfs_agblock_t bno,
126 xfs_extlen_t len);
127 141
128void 142void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
130 144
145int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
147 xfs_agblock_t bno, xfs_extlen_t len);
131#endif /* __KERNEL__ */ 148#endif /* __KERNEL__ */
132 149
133/* 150/*
@@ -205,4 +222,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 222 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 223 xfs_extlen_t len); /* length of extent */
207 224
225int /* error */
226xfs_alloc_lookup_le(
227 struct xfs_btree_cur *cur, /* btree cursor */
228 xfs_agblock_t bno, /* starting block of extent */
229 xfs_extlen_t len, /* length of extent */
230 int *stat); /* success/failure */
231
232int /* error */
233xfs_alloc_get_rec(
234 struct xfs_btree_cur *cur, /* btree cursor */
235 xfs_agblock_t *bno, /* output: starting block of extent */
236 xfs_extlen_t *len, /* output: length of extent */
237 int *stat); /* output: success/failure */
238
208#endif /* __XFS_ALLOC_H__ */ 239#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb6..71e90dc2aeb1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4111cd3966c7..fa00788de2f5 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1038 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1039 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1040 * This case is avoided almost all the time.
1041 *
1042 * We start with a delayed allocation:
1043 *
1044 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1045 * PREV @ idx
1046 *
1047 * and we are allocating:
1048 * +rrrrrrrrrrrrrrrrr+
1049 * new
1050 *
1051 * and we set it up for insertion as:
1052 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1053 * new
1054 * PREV @ idx LEFT RIGHT
1055 * inserted at idx + 1
1041 */ 1056 */
1042 temp = new->br_startoff - PREV.br_startoff; 1057 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state;
1063 RIGHT.br_startblock = nullstartblock(
1064 (int)xfs_bmap_worst_indlen(ip, temp2));
1065 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
1052 ip->i_df.if_lastex = idx + 1; 1069 ip->i_df.if_lastex = idx + 1;
1053 ip->i_d.di_nextents++; 1070 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1071 if (cur == NULL)
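
The diagram and code above split a delayed extent into a left delayed piece, the newly allocated real extent, and a right delayed piece. The short sketch below computes the same temp/temp2 offsets and lengths for made-up numbers so the three-way split is easy to follow; it deliberately omits the indirect-block reservation (nullstartblock/xfs_bmap_worst_indlen) that the real code attaches to the right piece.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t fileoff_t;
typedef uint64_t filblks_t;

struct ext { fileoff_t startoff; filblks_t blockcount; };

int main(void)
{
	/* A delayed extent PREV and a real allocation landing in its middle,
	 * as in the diagram above (all numbers are made up). */
	struct ext PREV   = { .startoff = 100, .blockcount = 50 };
	struct ext newext = { .startoff = 120, .blockcount = 10 };

	fileoff_t new_endoff = newext.startoff + newext.blockcount;

	/* temp: what is left of PREV in front of the new allocation;
	 * PREV is truncated to this length and becomes the left piece. */
	filblks_t temp  = newext.startoff - PREV.startoff;

	/* temp2: what is left of PREV behind the new allocation; it becomes
	 * the right piece and stays delayed. */
	filblks_t temp2 = PREV.startoff + PREV.blockcount - new_endoff;

	printf("left  (delayed): off=%llu len=%llu\n",
	       (unsigned long long)PREV.startoff, (unsigned long long)temp);
	printf("new   (real)   : off=%llu len=%llu\n",
	       (unsigned long long)newext.startoff,
	       (unsigned long long)newext.blockcount);
	printf("right (delayed): off=%llu len=%llu\n",
	       (unsigned long long)new_endoff, (unsigned long long)temp2);
	return 0;
}
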
@@ -2348,6 +2365,13 @@ xfs_bmap_rtalloc(
2348 */ 2365 */
2349 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) 2366 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
2350 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; 2367 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
2368
2369 /*
2370 * Lock out other modifications to the RT bitmap inode.
2371 */
2372 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2373 xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2374
2351 /* 2375 /*
2352 * If it's an allocation to an empty file at offset 0, 2376 * If it's an allocation to an empty file at offset 0,
2353 * pick an extent that will space things out in the rt area. 2377 * pick an extent that will space things out in the rt area.
@@ -2430,7 +2454,7 @@ xfs_bmap_btalloc_nullfb(
2430 startag = ag = 0; 2454 startag = ag = 0;
2431 2455
2432 pag = xfs_perag_get(mp, ag); 2456 pag = xfs_perag_get(mp, ag);
2433 while (*blen < ap->alen) { 2457 while (*blen < args->maxlen) {
2434 if (!pag->pagf_init) { 2458 if (!pag->pagf_init) {
2435 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2459 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2436 XFS_ALLOC_FLAG_TRYLOCK); 2460 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2476,7 @@ xfs_bmap_btalloc_nullfb(
2452 notinit = 1; 2476 notinit = 1;
2453 2477
2454 if (xfs_inode_is_filestream(ap->ip)) { 2478 if (xfs_inode_is_filestream(ap->ip)) {
2455 if (*blen >= ap->alen) 2479 if (*blen >= args->maxlen)
2456 break; 2480 break;
2457 2481
2458 if (ap->userdata) { 2482 if (ap->userdata) {
@@ -2498,14 +2522,14 @@ xfs_bmap_btalloc_nullfb(
2498 * If the best seen length is less than the request 2522 * If the best seen length is less than the request
2499 * length, use the best as the minimum. 2523 * length, use the best as the minimum.
2500 */ 2524 */
2501 else if (*blen < ap->alen) 2525 else if (*blen < args->maxlen)
2502 args->minlen = *blen; 2526 args->minlen = *blen;
2503 /* 2527 /*
2504 * Otherwise we've seen an extent as big as alen, 2528 * Otherwise we've seen an extent as big as maxlen,
2505 * use that as the minimum. 2529 * use that as the minimum.
2506 */ 2530 */
2507 else 2531 else
2508 args->minlen = ap->alen; 2532 args->minlen = args->maxlen;
2509 2533
2510 /* 2534 /*
2511 * set the failure fallback case to look in the selected 2535 * set the failure fallback case to look in the selected
@@ -2573,7 +2597,9 @@ xfs_bmap_btalloc(
2573 args.tp = ap->tp; 2597 args.tp = ap->tp;
2574 args.mp = mp; 2598 args.mp = mp;
2575 args.fsbno = ap->rval; 2599 args.fsbno = ap->rval;
2576 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2600
2601 /* Trim the allocation back to the maximum an AG can fit. */
2602 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2577 args.firstblock = ap->firstblock; 2603 args.firstblock = ap->firstblock;
2578 blen = 0; 2604 blen = 0;
2579 if (nullfb) { 2605 if (nullfb) {
@@ -2621,7 +2647,7 @@ xfs_bmap_btalloc(
2621 /* 2647 /*
2622 * Adjust for alignment 2648 * Adjust for alignment
2623 */ 2649 */
2624 if (blen > args.alignment && blen <= ap->alen) 2650 if (blen > args.alignment && blen <= args.maxlen)
2625 args.minlen = blen - args.alignment; 2651 args.minlen = blen - args.alignment;
2626 args.minalignslop = 0; 2652 args.minalignslop = 0;
2627 } else { 2653 } else {
@@ -2640,7 +2666,7 @@ xfs_bmap_btalloc(
2640 * of minlen+alignment+slop doesn't go up 2666 * of minlen+alignment+slop doesn't go up
2641 * between the calls. 2667 * between the calls.
2642 */ 2668 */
2643 if (blen > mp->m_dalign && blen <= ap->alen) 2669 if (blen > mp->m_dalign && blen <= args.maxlen)
2644 nextminlen = blen - mp->m_dalign; 2670 nextminlen = blen - mp->m_dalign;
2645 else 2671 else
2646 nextminlen = args.minlen; 2672 nextminlen = args.minlen;
@@ -3500,7 +3526,7 @@ xfs_bmap_search_extents(
3500 3526
3501 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && 3527 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
3502 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { 3528 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
3503 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 3529 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
3504 "Access to block zero in inode %llu " 3530 "Access to block zero in inode %llu "
3505 "start_block: %llx start_off: %llx " 3531 "start_block: %llx start_off: %llx "
3506 "blkcnt: %llx extent-state: %x lastx: %x\n", 3532 "blkcnt: %llx extent-state: %x lastx: %x\n",
@@ -4174,12 +4200,11 @@ xfs_bmap_read_extents(
4174 num_recs = xfs_btree_get_numrecs(block); 4200 num_recs = xfs_btree_get_numrecs(block);
4175 if (unlikely(i + num_recs > room)) { 4201 if (unlikely(i + num_recs > room)) {
4176 ASSERT(i + num_recs <= room); 4202 ASSERT(i + num_recs <= room);
4177 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 4203 xfs_warn(ip->i_mount,
4178 "corrupt dinode %Lu, (btree extents).", 4204 "corrupt dinode %Lu, (btree extents).",
4179 (unsigned long long) ip->i_ino); 4205 (unsigned long long) ip->i_ino);
4180 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", 4206 XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
4181 XFS_ERRLEVEL_LOW, 4207 XFS_ERRLEVEL_LOW, ip->i_mount, block);
4182 ip->i_mount);
4183 goto error0; 4208 goto error0;
4184 } 4209 }
4185 XFS_WANT_CORRUPTED_GOTO( 4210 XFS_WANT_CORRUPTED_GOTO(
@@ -4485,6 +4510,16 @@ xfs_bmapi(
4485 /* Figure out the extent size, adjust alen */ 4510 /* Figure out the extent size, adjust alen */
4486 extsz = xfs_get_extsz_hint(ip); 4511 extsz = xfs_get_extsz_hint(ip);
4487 if (extsz) { 4512 if (extsz) {
4513 /*
4514 * make sure we don't exceed a single
4515 * extent length when we align the
4516 * extent by reducing length we are
4517 * going to allocate by the maximum
4518 * amount extent size aligment may
4519 * require.
4520 */
4521 alen = XFS_FILBLKS_MIN(len,
4522 MAXEXTLEN - (2 * extsz - 1));
4488 error = xfs_bmap_extsize_align(mp, 4523 error = xfs_bmap_extsize_align(mp,
4489 &got, &prev, extsz, 4524 &got, &prev, extsz,
4490 rt, eof, 4525 rt, eof,
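
The new clamp above, alen = min(len, MAXEXTLEN - (2 * extsz - 1)), keeps the request within a single extent even after extent-size alignment grows it at both ends. The sketch below checks that worst case for an assumed extent size hint; MAXEXTLEN is the 21-bit XFS extent length limit, the other numbers are made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define MAXEXTLEN	((1u << 21) - 1)	/* max blocks in one XFS extent */

int main(void)
{
	uint32_t extsz = 4096;		/* hypothetical extent size hint, in blocks */
	uint64_t len   = 10u << 20;	/* requested length, larger than MAXEXTLEN */

	/* Clamp the request so that even worst-case alignment growth
	 * (up to extsz - 1 blocks at the start and again at the end)
	 * cannot push the aligned extent past MAXEXTLEN. */
	uint64_t alen = len;
	if (alen > MAXEXTLEN - (2 * extsz - 1))
		alen = MAXEXTLEN - (2 * extsz - 1);

	/* Worst-case length after xfs_bmap_extsize_align()-style rounding. */
	uint64_t worst = alen + 2 * (extsz - 1);

	printf("alen=%llu worst-case aligned=%llu (MAXEXTLEN=%u)\n",
	       (unsigned long long)alen, (unsigned long long)worst, MAXEXTLEN);
	return 0;
}
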
@@ -5743,7 +5778,7 @@ xfs_check_block(
5743 else 5778 else
5744 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); 5779 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
5745 if (*thispa == *pp) { 5780 if (*thispa == *pp) {
5746 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 5781 xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
5747 __func__, j, i, 5782 __func__, j, i,
5748 (unsigned long long)be64_to_cpu(*thispa)); 5783 (unsigned long long)be64_to_cpu(*thispa));
5749 panic("%s: ptrs are equal in node\n", 5784 panic("%s: ptrs are equal in node\n",
@@ -5908,11 +5943,11 @@ xfs_bmap_check_leaf_extents(
5908 return; 5943 return;
5909 5944
5910error0: 5945error0:
5911 cmn_err(CE_WARN, "%s: at error0", __func__); 5946 xfs_warn(mp, "%s: at error0", __func__);
5912 if (bp_release) 5947 if (bp_release)
5913 xfs_trans_brelse(NULL, bp); 5948 xfs_trans_brelse(NULL, bp);
5914error_norelse: 5949error_norelse:
5915 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", 5950 xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
5916 __func__, i); 5951 __func__, i);
5917 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); 5952 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
5918 return; 5953 return;
@@ -6115,7 +6150,7 @@ xfs_bmap_punch_delalloc_range(
6115 if (error) { 6150 if (error) {
6116 /* something screwed, just bail */ 6151 /* something screwed, just bail */
6117 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 6152 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6118 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 6153 xfs_alert(ip->i_mount,
6119 "Failed delalloc mapping lookup ino %lld fsb %lld.", 6154 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6120 ip->i_ino, start_fsb); 6155 ip->i_ino, start_fsb);
6121 } 6156 }
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca8da7e..2f9e97c128a0 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
 		return error;
 	}
 	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-	if (bp != NULL) {
+	if (bp)
 		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
-	}
 	*bpp = bp;
 	return 0;
 }
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
 	switch (cur->bc_btnum) {
 	case XFS_BTNUM_BNO:
 	case XFS_BTNUM_CNT:
-		XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
 		break;
 	case XFS_BTNUM_INO:
-		XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
 		break;
 	case XFS_BTNUM_BMAP:
-		XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
 		break;
 	default:
 		ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d54c5b..7b7e005e3dcc 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -130,10 +130,12 @@ xfs_buf_item_log_check(
130 orig = bip->bli_orig; 130 orig = bip->bli_orig;
131 buffer = XFS_BUF_PTR(bp); 131 buffer = XFS_BUF_PTR(bp);
132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) 133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
134 cmn_err(CE_PANIC, 134 xfs_emerg(bp->b_mount,
135 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", 135 "%s: bip %x buffer %x orig %x index %d",
136 bip, bp, orig, x); 136 __func__, bip, bp, orig, x);
137 ASSERT(0);
138 }
137 } 139 }
138} 140}
139#else 141#else
@@ -141,8 +143,7 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 143#define xfs_buf_item_log_check(x)
142#endif 144#endif
143 145
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 146STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
146 147
147/* 148/*
148 * This returns the number of log iovecs needed to log the 149 * This returns the number of log iovecs needed to log the
@@ -428,13 +429,15 @@ xfs_buf_item_unpin(
428 429
429 if (remove) { 430 if (remove) {
430 /* 431 /*
431 * We have to remove the log item from the transaction 432 * If we are in a transaction context, we have to
432 * as we are about to release our reference to the 433 * remove the log item from the transaction as we are
433 * buffer. If we don't, the unlock that occurs later 434 * about to release our reference to the buffer. If we
434 * in xfs_trans_uncommit() will ry to reference the 435 * don't, the unlock that occurs later in
436 * xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on. 437 * buffer which we no longer have a hold on.
436 */ 438 */
437 xfs_trans_del_item(lip); 439 if (lip->li_desc)
440 xfs_trans_del_item(lip);
438 441
439 /* 442 /*
440 * Since the transaction no longer refers to the buffer, 443 * Since the transaction no longer refers to the buffer,
@@ -450,7 +453,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 453 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 454 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 455 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 456 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 457 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 458 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 459 } else {
@@ -918,15 +921,26 @@ xfs_buf_attach_iodone(
918 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 921 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
919} 922}
920 923
924/*
925 * We can have many callbacks on a buffer. Running the callbacks individually
926 * can cause a lot of contention on the AIL lock, so we allow for a single
927 * callback to be able to scan the remaining lip->li_bio_list for other items
928 * of the same type and callback to be processed in the first call.
929 *
930 * As a result, the loop walking the callback list below will also modify the
931 * list. it removes the first item from the list and then runs the callback.
932 * The loop then restarts from the new head of the list. This allows the
933 * callback to scan and modify the list attached to the buffer and we don't
934 * have to care about maintaining a next item pointer.
935 */
921STATIC void 936STATIC void
922xfs_buf_do_callbacks( 937xfs_buf_do_callbacks(
923 xfs_buf_t *bp, 938 struct xfs_buf *bp)
924 xfs_log_item_t *lip)
925{ 939{
926 xfs_log_item_t *nlip; 940 struct xfs_log_item *lip;
927 941
928 while (lip != NULL) { 942 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
929 nlip = lip->li_bio_list; 943 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
930 ASSERT(lip->li_cb != NULL); 944 ASSERT(lip->li_cb != NULL);
931 /* 945 /*
932 * Clear the next pointer so we don't have any 946 * Clear the next pointer so we don't have any
@@ -936,7 +950,6 @@ xfs_buf_do_callbacks(
936 */ 950 */
937 lip->li_bio_list = NULL; 951 lip->li_bio_list = NULL;
938 lip->li_cb(bp, lip); 952 lip->li_cb(bp, lip);
939 lip = nlip;
940 } 953 }
941} 954}
942 955
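
The comment above explains why xfs_buf_do_callbacks() now pops items off the head of the buffer's callback list and restarts from the new head each time, so one callback can consume further items of its own type. The minimal C sketch below (toy structs, no locking, no XFS types) shows just that list-walk shape.

#include <stdio.h>

/* Minimal stand-in for a log item: a singly linked callback list hanging
 * off a buffer, where each callback may itself consume more list entries. */
struct item {
	struct item *next;
	void (*cb)(struct item *);
};

struct buf { struct item *head; };

static void print_cb(struct item *ip) { printf("callback on %p\n", (void *)ip); }

/* Same shape as the reworked xfs_buf_do_callbacks(): re-read the head each
 * iteration, unhook the first item, clear its link, run its callback.
 * Because the head is re-read, a callback is free to strip further items
 * from the list before we come back around. */
static void do_callbacks(struct buf *bp)
{
	struct item *ip;

	while ((ip = bp->head) != NULL) {
		bp->head = ip->next;	/* pop from the head */
		ip->next = NULL;	/* item no longer sees the list */
		ip->cb(ip);
	}
}

int main(void)
{
	struct item c = { NULL, print_cb }, b = { &c, print_cb }, a = { &b, print_cb };
	struct buf bp = { &a };

	do_callbacks(&bp);
	return 0;
}
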
@@ -949,128 +962,75 @@ xfs_buf_do_callbacks(
949 */ 962 */
950void 963void
951xfs_buf_iodone_callbacks( 964xfs_buf_iodone_callbacks(
952 xfs_buf_t *bp) 965 struct xfs_buf *bp)
953{ 966{
954 xfs_log_item_t *lip; 967 struct xfs_log_item *lip = bp->b_fspriv;
955 static ulong lasttime; 968 struct xfs_mount *mp = lip->li_mountp;
956 static xfs_buftarg_t *lasttarg; 969 static ulong lasttime;
957 xfs_mount_t *mp; 970 static xfs_buftarg_t *lasttarg;
958 971
959 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 972 if (likely(!XFS_BUF_GETERROR(bp)))
960 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 973 goto do_callbacks;
961 974
962 if (XFS_BUF_GETERROR(bp) != 0) { 975 /*
963 /* 976 * If we've already decided to shutdown the filesystem because of
964 * If we've already decided to shutdown the filesystem 977 * I/O errors, there's no point in giving this a retry.
965 * because of IO errors, there's no point in giving this 978 */
966 * a retry. 979 if (XFS_FORCED_SHUTDOWN(mp)) {
967 */ 980 XFS_BUF_SUPER_STALE(bp);
968 mp = lip->li_mountp; 981 trace_xfs_buf_item_iodone(bp, _RET_IP_);
969 if (XFS_FORCED_SHUTDOWN(mp)) { 982 goto do_callbacks;
970 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 983 }
971 XFS_BUF_SUPER_STALE(bp);
972 trace_xfs_buf_item_iodone(bp, _RET_IP_);
973 xfs_buf_do_callbacks(bp, lip);
974 XFS_BUF_SET_FSPRIVATE(bp, NULL);
975 XFS_BUF_CLR_IODONE_FUNC(bp);
976 xfs_buf_ioend(bp, 0);
977 return;
978 }
979 984
980 if ((XFS_BUF_TARGET(bp) != lasttarg) || 985 if (XFS_BUF_TARGET(bp) != lasttarg ||
981 (time_after(jiffies, (lasttime + 5*HZ)))) { 986 time_after(jiffies, (lasttime + 5*HZ))) {
982 lasttime = jiffies; 987 lasttime = jiffies;
983 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 988 xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
984 " block 0x%llx in %s", 989 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
985 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 990 (__uint64_t)XFS_BUF_ADDR(bp));
986 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 991 }
987 } 992 lasttarg = XFS_BUF_TARGET(bp);
988 lasttarg = XFS_BUF_TARGET(bp);
989 993
990 if (XFS_BUF_ISASYNC(bp)) { 994 /*
991 /* 995 * If the write was asynchronous then no one will be looking for the
992 * If the write was asynchronous then noone will be 996 * error. Clear the error state and write the buffer out again.
993 * looking for the error. Clear the error state 997 *
994 * and write the buffer out again delayed write. 998 * During sync or umount we'll write all pending buffers again
995 * 999 * synchronous, which will catch these errors if they keep hanging
996 * XXXsup This is OK, so long as we catch these 1000 * around.
997 * before we start the umount; we don't want these 1001 */
998 * DELWRI metadata bufs to be hanging around. 1002 if (XFS_BUF_ISASYNC(bp)) {
999 */ 1003 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1000 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1004
1001 1005 if (!XFS_BUF_ISSTALE(bp)) {
1002 if (!(XFS_BUF_ISSTALE(bp))) { 1006 XFS_BUF_DELAYWRITE(bp);
1003 XFS_BUF_DELAYWRITE(bp);
1004 XFS_BUF_DONE(bp);
1005 XFS_BUF_SET_START(bp);
1006 }
1007 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1008 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1009 xfs_buf_relse(bp);
1010 } else {
1011 /*
1012 * If the write of the buffer was not asynchronous,
1013 * then we want to make sure to return the error
1014 * to the caller of bwrite(). Because of this we
1015 * cannot clear the B_ERROR state at this point.
1016 * Instead we install a callback function that
1017 * will be called when the buffer is released, and
1018 * that routine will clear the error state and
1019 * set the buffer to be written out again after
1020 * some delay.
1021 */
1022 /* We actually overwrite the existing b-relse
1023 function at times, but we're gonna be shutting down
1024 anyway. */
1025 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1026 XFS_BUF_DONE(bp); 1007 XFS_BUF_DONE(bp);
1027 XFS_BUF_FINISH_IOWAIT(bp); 1008 XFS_BUF_SET_START(bp);
1028 } 1009 }
1010 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1011 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1012 xfs_buf_relse(bp);
1029 return; 1013 return;
1030 } 1014 }
1031 1015
1032 xfs_buf_do_callbacks(bp, lip); 1016 /*
1033 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1017 * If the write of the buffer was synchronous, we want to make
1034 XFS_BUF_CLR_IODONE_FUNC(bp); 1018 * sure to return the error to the caller of xfs_bwrite().
1035 xfs_buf_ioend(bp, 0); 1019 */
1036}
1037
1038/*
1039 * This is a callback routine attached to a buffer which gets an error
1040 * when being written out synchronously.
1041 */
1042STATIC void
1043xfs_buf_error_relse(
1044 xfs_buf_t *bp)
1045{
1046 xfs_log_item_t *lip;
1047 xfs_mount_t *mp;
1048
1049 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1050 mp = (xfs_mount_t *)lip->li_mountp;
1051 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1052
1053 XFS_BUF_STALE(bp); 1020 XFS_BUF_STALE(bp);
1054 XFS_BUF_DONE(bp); 1021 XFS_BUF_DONE(bp);
1055 XFS_BUF_UNDELAYWRITE(bp); 1022 XFS_BUF_UNDELAYWRITE(bp);
1056 XFS_BUF_ERROR(bp,0);
1057 1023
1058 trace_xfs_buf_error_relse(bp, _RET_IP_); 1024 trace_xfs_buf_error_relse(bp, _RET_IP_);
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1059 1026
1060 if (! XFS_FORCED_SHUTDOWN(mp)) 1027do_callbacks:
1061 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1028 xfs_buf_do_callbacks(bp);
1062 /*
1063 * We have to unpin the pinned buffers so do the
1064 * callbacks.
1065 */
1066 xfs_buf_do_callbacks(bp, lip);
1067 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1029 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1068 XFS_BUF_CLR_IODONE_FUNC(bp); 1030 XFS_BUF_CLR_IODONE_FUNC(bp);
1069 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1031 xfs_buf_ioend(bp, 0);
1070 xfs_buf_relse(bp);
1071} 1032}
1072 1033
1073
1074/* 1034/*
1075 * This is the iodone() function for buffers which have been 1035 * This is the iodone() function for buffers which have been
1076 * logged. It is called when they are eventually flushed out. 1036 * logged. It is called when they are eventually flushed out.
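
The resulting xfs_buf_iodone_callbacks() folds the old xfs_buf_error_relse() handling into one function behind the do_callbacks label: shutdown buffers are super-staled immediately, async write errors are cleared and requeued, and sync write errors stale the buffer and force a shutdown before the callbacks run. A minimal userspace sketch of that control flow, with invented names (handle_write_error, force_shutdown) standing in for the kernel helpers:

    /*
     * Hypothetical, simplified model of the control flow above. It is not the
     * kernel code: handle_write_error() and force_shutdown() are invented
     * stand-ins for xfs_buf_iodone_callbacks() and xfs_force_shutdown().
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct buf {
        bool async;     /* XFS_BUF_ISASYNC() analogue */
        bool stale;     /* XFS_BUF_ISSTALE() analogue */
        bool error;
    };

    static void force_shutdown(void)
    {
        printf("force filesystem shutdown\n");
    }

    static void handle_write_error(struct buf *bp, bool already_shut_down)
    {
        if (already_shut_down) {
            bp->stale = true;               /* XFS_BUF_SUPER_STALE() analogue */
            goto do_callbacks;
        }

        if (bp->async) {
            /*
             * Nobody waits for an async write, so clear the error and requeue
             * the buffer; sync or unmount writeback catches it if it persists.
             */
            bp->error = false;
            if (!bp->stale)
                printf("requeue buffer as delayed write\n");
            return;
        }

        /*
         * A sync write must return the error to its caller, so stale the
         * buffer and shut down before running the callbacks.
         */
        bp->stale = true;
        force_shutdown();

    do_callbacks:
        printf("run attached log item callbacks, complete the I/O\n");
    }

    int main(void)
    {
        struct buf b = { .async = true, .error = true };

        handle_write_error(&b, false);
        return 0;
    }
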
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c7..b6ecd2061e7c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 1c00bedb3175..6102ac6d1dff 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1995,13 +1995,12 @@ xfs_da_do_buf(
1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); 1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
1996 if (unlikely(error == EFSCORRUPTED)) { 1996 if (unlikely(error == EFSCORRUPTED)) {
1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) { 1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
1998 cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", 1998 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
1999 (long long)bno); 1999 __func__, (long long)bno,
2000 cmn_err(CE_ALERT, "dir: inode %lld\n",
2001 (long long)dp->i_ino); 2000 (long long)dp->i_ino);
2002 for (i = 0; i < nmap; i++) { 2001 for (i = 0; i < nmap; i++) {
2003 cmn_err(CE_ALERT, 2002 xfs_alert(mp,
2004 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", 2003"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2005 i, 2004 i,
2006 (long long)mapp[i].br_startoff, 2005 (long long)mapp[i].br_startoff,
2007 (long long)mapp[i].br_startblock, 2006 (long long)mapp[i].br_startblock,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e60490bc00a6..be628677c288 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -270,9 +270,9 @@ xfs_swap_extents(
270 /* check inode formats now that data is flushed */ 270 /* check inode formats now that data is flushed */
271 error = xfs_swap_extents_check_format(ip, tip); 271 error = xfs_swap_extents_check_format(ip, tip);
272 if (error) { 272 if (error) {
273 xfs_fs_cmn_err(CE_NOTE, mp, 273 xfs_notice(mp,
274 "%s: inode 0x%llx format is incompatible for exchanging.", 274 "%s: inode 0x%llx format is incompatible for exchanging.",
275 __FILE__, ip->i_ino); 275 __func__, ip->i_ino);
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a1321bc7f192..dba7a71cedf3 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -159,7 +159,7 @@ xfs_dir_ino_validate(
159 XFS_AGINO_TO_INO(mp, agno, agino) == ino; 159 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, 160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
161 XFS_RANDOM_DIR_INO_VALIDATE))) { 161 XFS_RANDOM_DIR_INO_VALIDATE))) {
162 xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", 162 xfs_warn(mp, "Invalid inode number 0x%Lx",
163 (unsigned long long) ino); 163 (unsigned long long) ino);
164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); 164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
165 return XFS_ERROR(EFSCORRUPTED); 165 return XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index f9a0864b696a..a0aab7d3294f 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -899,10 +899,9 @@ xfs_dir2_leafn_rebalance(
899 if(blk2->index < 0) { 899 if(blk2->index < 0) {
900 state->inleaf = 1; 900 state->inleaf = 1;
901 blk2->index = 0; 901 blk2->index = 0;
902 cmn_err(CE_ALERT, 902 xfs_alert(args->dp->i_mount,
903 "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " 903 "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
904 "blk1->index %d\n", 904 __func__, blk1->index);
905 blk1->index);
906 } 905 }
907} 906}
908 907
@@ -1641,26 +1640,22 @@ xfs_dir2_node_addname_int(
1641 } 1640 }
1642 1641
1643 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { 1642 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
1644 cmn_err(CE_ALERT, 1643 xfs_alert(mp,
1645 "xfs_dir2_node_addname_int: dir ino " 1644 "%s: dir ino " "%llu needed freesp block %lld for\n"
1646 "%llu needed freesp block %lld for\n" 1645 " data block %lld, got %lld ifbno %llu lastfbno %d",
1647 " data block %lld, got %lld\n" 1646 __func__, (unsigned long long)dp->i_ino,
1648 " ifbno %llu lastfbno %d\n",
1649 (unsigned long long)dp->i_ino,
1650 (long long)xfs_dir2_db_to_fdb(mp, dbno), 1647 (long long)xfs_dir2_db_to_fdb(mp, dbno),
1651 (long long)dbno, (long long)fbno, 1648 (long long)dbno, (long long)fbno,
1652 (unsigned long long)ifbno, lastfbno); 1649 (unsigned long long)ifbno, lastfbno);
1653 if (fblk) { 1650 if (fblk) {
1654 cmn_err(CE_ALERT, 1651 xfs_alert(mp,
1655 " fblk 0x%p blkno %llu " 1652 " fblk 0x%p blkno %llu index %d magic 0x%x",
1656 "index %d magic 0x%x\n",
1657 fblk, 1653 fblk,
1658 (unsigned long long)fblk->blkno, 1654 (unsigned long long)fblk->blkno,
1659 fblk->index, 1655 fblk->index,
1660 fblk->magic); 1656 fblk->magic);
1661 } else { 1657 } else {
1662 cmn_err(CE_ALERT, 1658 xfs_alert(mp, " ... fblk is NULL");
1663 " ... fblk is NULL\n");
1664 } 1659 }
1665 XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 1660 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1666 XFS_ERRLEVEL_LOW, mp); 1661 XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c78cc6a3d87c..39f06336b99d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -48,7 +48,7 @@ xfs_error_trap(int e)
48 break; 48 break;
49 if (e != xfs_etrap[i]) 49 if (e != xfs_etrap[i])
50 continue; 50 continue;
51 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); 51 xfs_notice(NULL, "%s: error %d", __func__, e);
52 BUG(); 52 BUG();
53 break; 53 break;
54 } 54 }
@@ -74,7 +74,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
74 74
75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { 76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
77 cmn_err(CE_WARN, 77 xfs_warn(NULL,
78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", 78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
79 expression, file, line, xfs_etest_fsname[i]); 79 expression, file, line, xfs_etest_fsname[i]);
80 return 1; 80 return 1;
@@ -95,14 +95,14 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
95 95
96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { 97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
98 cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); 98 xfs_warn(mp, "error tag #%d on", error_tag);
99 return 0; 99 return 0;
100 } 100 }
101 } 101 }
102 102
103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
104 if (xfs_etest[i] == 0) { 104 if (xfs_etest[i] == 0) {
105 cmn_err(CE_WARN, "Turned on XFS error tag #%d", 105 xfs_warn(mp, "Turned on XFS error tag #%d",
106 error_tag); 106 error_tag);
107 xfs_etest[i] = error_tag; 107 xfs_etest[i] = error_tag;
108 xfs_etest_fsid[i] = fsid; 108 xfs_etest_fsid[i] = fsid;
@@ -114,7 +114,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
114 } 114 }
115 } 115 }
116 116
117 cmn_err(CE_WARN, "error tag overflow, too many turned on"); 117 xfs_warn(mp, "error tag overflow, too many turned on");
118 118
119 return 1; 119 return 1;
120} 120}
@@ -133,7 +133,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && 133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
134 xfs_etest[i] != 0) { 134 xfs_etest[i] != 0) {
135 cleared = 1; 135 cleared = 1;
136 cmn_err(CE_WARN, "Clearing XFS error tag #%d", 136 xfs_warn(mp, "Clearing XFS error tag #%d",
137 xfs_etest[i]); 137 xfs_etest[i]);
138 xfs_etest[i] = 0; 138 xfs_etest[i] = 0;
139 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
@@ -144,45 +144,12 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
144 } 144 }
145 145
146 if (loud || cleared) 146 if (loud || cleared)
147 cmn_err(CE_WARN, 147 xfs_warn(mp, "Cleared all XFS error tags for filesystem");
148 "Cleared all XFS error tags for filesystem \"%s\"",
149 mp->m_fsname);
150 148
151 return 0; 149 return 0;
152} 150}
153#endif /* DEBUG */ 151#endif /* DEBUG */
154 152
155
156void
157xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
158{
159 va_list ap;
160
161 va_start(ap, fmt);
162 xfs_fs_vcmn_err(level, mp, fmt, ap);
163 va_end(ap);
164}
165
166void
167xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
168{
169 va_list ap;
170
171#ifdef DEBUG
172 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
173#endif
174
175 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
176 && (level & CE_ALERT)) {
177 level &= ~CE_ALERT;
178 level |= CE_PANIC;
179 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
180 }
181 va_start(ap, fmt);
182 xfs_fs_vcmn_err(level, mp, fmt, ap);
183 va_end(ap);
184}
185
186void 153void
187xfs_error_report( 154xfs_error_report(
188 const char *tag, 155 const char *tag,
@@ -193,9 +160,8 @@ xfs_error_report(
193 inst_t *ra) 160 inst_t *ra)
194{ 161{
195 if (level <= xfs_error_level) { 162 if (level <= xfs_error_level) {
196 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 163 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
197 CE_ALERT, mp, 164 "Internal error %s at line %d of file %s. Caller 0x%p\n",
198 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
199 tag, linenum, filename, ra); 165 tag, linenum, filename, ra);
200 166
201 xfs_stack_trace(); 167 xfs_stack_trace();
@@ -215,4 +181,5 @@ xfs_corruption_error(
215 if (level <= xfs_error_level) 181 if (level <= xfs_error_level)
216 xfs_hex_dump(p, 16); 182 xfs_hex_dump(p, 16);
217 xfs_error_report(tag, level, mp, filename, linenum, ra); 183 xfs_error_report(tag, level, mp, filename, linenum, ra);
184 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
218} 185}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index f338847f80b8..079a367f44ee 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
137 (rf)))) 137 (rf))))
138 138
139extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
140extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
141#else 141#else
142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
143#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -145,10 +145,8 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
145#endif /* DEBUG */ 145#endif /* DEBUG */
146 146
147/* 147/*
148 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into 148 * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
149 * a panic by setting xfs_panic_mask in a 149 * a panic by setting xfs_panic_mask in a sysctl.
150 * sysctl. update xfs_max[XFS_PARAM] if
151 * more are added.
152 */ 150 */
153#define XFS_NO_PTAG 0 151#define XFS_NO_PTAG 0
154#define XFS_PTAG_IFLUSH 0x00000001 152#define XFS_PTAG_IFLUSH 0x00000001
@@ -160,23 +158,4 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
160#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 158#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
161#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
162 160
163struct xfs_mount;
164
165extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
166 char *fmt, va_list ap)
167 __attribute__ ((format (printf, 3, 0)));
168extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
169 char *fmt, ...)
170 __attribute__ ((format (printf, 4, 5)));
171extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
172 __attribute__ ((format (printf, 3, 4)));
173
174extern void xfs_hex_dump(void *p, int length);
175
176#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
177 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
178
179#define xfs_fs_mount_cmn_err(f, fmt, args...) \
180 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args))
181
182#endif /* __XFS_ERROR_H__ */ 161#endif /* __XFS_ERROR_H__ */
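
The panic tags that remain in xfs_error.h control whether a tagged alert escalates to a kernel BUG() when the corresponding bit is set in xfs_panic_mask. A rough userspace model of that mechanism; all names are invented and abort() stands in for BUG():

    /*
     * Hypothetical model of the panic-tag idea: an alert carrying a tag
     * becomes fatal when that tag is set in a runtime mask.
     */
    #include <stdio.h>
    #include <stdlib.h>

    #define PTAG_IFLUSH         0x00000001u
    #define PTAG_ERROR_REPORT   0x00000010u

    static unsigned int panic_mask;        /* set via sysctl in the kernel */

    static void alert_tag(unsigned int tag, const char *msg)
    {
        fprintf(stderr, "ALERT: %s\n", msg);
        if (panic_mask & tag)
            abort();                       /* BUG() analogue */
    }

    int main(void)
    {
        panic_mask = PTAG_ERROR_REPORT;
        alert_tag(PTAG_IFLUSH, "survivable alert");
        alert_tag(PTAG_ERROR_REPORT, "this tag is configured to panic");
        return 0;
    }
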
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf562..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
71
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,15 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114 138
115 spin_lock(&ailp->xa_lock); 139 if (remove) {
116 if (efip->efi_flags & XFS_EFI_CANCELED) { 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
117 if (remove) 141 if (lip->li_desc)
118 xfs_trans_del_item(lip); 142 xfs_trans_del_item(lip);
119
120 /* xfs_trans_ail_delete() drops the AIL lock. */
121 xfs_trans_ail_delete(ailp, lip);
122 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
123 } else { 144 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 145 }
146 __xfs_efi_release(efip);
127} 147}
128 148
129/* 149/*
@@ -152,16 +172,20 @@ xfs_efi_item_unlock(
152} 172}
153 173
154/* 174/*
155 * The EFI is logged only once and cannot be moved in the log, so 175 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 176 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 177 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 178 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
179 * when processing the EFD.
159 */ 180 */
160STATIC xfs_lsn_t 181STATIC xfs_lsn_t
161xfs_efi_item_committed( 182xfs_efi_item_committed(
162 struct xfs_log_item *lip, 183 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 184 xfs_lsn_t lsn)
164{ 185{
186 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
187
188 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 189 return lsn;
166} 190}
167 191
@@ -230,6 +254,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 254 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 255 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 256 efip->efi_format.efi_id = (__psint_t)(void*)efip;
257 atomic_set(&efip->efi_next_extent, 0);
233 258
234 return efip; 259 return efip;
235} 260}
@@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 314}
290 315
291/* 316/*
292 * This is called by the efd item code below to release references to 317 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 318 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 319 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 320 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 321 */
302void 322void
303xfs_efi_release(xfs_efi_log_item_t *efip, 323xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 324 uint nextents)
305{ 325{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 326 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 327 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 328 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 329}
324 330
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 331static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
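
The EFI now carries an atomic extent count and is freed by whichever caller drives it to zero, instead of counting under the AIL lock. A small C11 sketch of that release pattern, with invented names (efi_like, efi_release); it models only the counter, not the XFS_EFI_COMMITTED bit or the AIL removal:

    /*
     * Sketch, not the kernel API: each EFD-side caller subtracts the extents
     * it logged, and only the caller that drives the counter to zero performs
     * the final free.
     */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct efi_like {
        atomic_uint next_extent;    /* extents still to be accounted for */
    };

    static struct efi_like *efi_alloc(unsigned int nextents)
    {
        struct efi_like *efi = malloc(sizeof(*efi));

        atomic_init(&efi->next_extent, nextents);
        return efi;
    }

    static void efi_release(struct efi_like *efi, unsigned int nextents)
    {
        /* atomic_sub_and_test() analogue: true only for the last caller */
        if (atomic_fetch_sub(&efi->next_extent, nextents) == nextents) {
            printf("last reference dropped: remove from AIL and free\n");
            free(efi);
        }
    }

    int main(void)
    {
        struct efi_like *efi = efi_alloc(16);

        efi_release(efi, 10);   /* partial EFD: item stays alive */
        efi_release(efi, 6);    /* final EFD frees the item */
        return 0;
    }

In the hunk above the same role is played by atomic_sub_and_test() on efi_next_extent, with test_and_clear_bit(XFS_EFI_COMMITTED) in __xfs_efi_release() deciding whether AIL removal is still needed.
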
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf64..375f68e42531 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
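
XFS_EFI_RECOVERED and XFS_EFI_COMMITTED change from bit masks in a uint to bit numbers in an unsigned long so they can be passed to set_bit()/test_and_clear_bit(). A tiny sketch of the difference, using plain shifts rather than the kernel bitops:

    /*
     * Illustrative only: the old values were masks combined with |/&, the new
     * ones are bit numbers for the kernel's atomic bitops.
     */
    #include <assert.h>

    #define EFI_RECOVERED   1   /* bit number, not a mask */
    #define EFI_COMMITTED   2

    int main(void)
    {
        unsigned long flags = 0;

        flags |= 1UL << EFI_COMMITTED;              /* set_bit() analogue */
        assert(flags & (1UL << EFI_COMMITTED));     /* test_bit() analogue */
        assert(!(flags & (1UL << EFI_RECOVERED)));
        return 0;
    }
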
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814af..9153d2c77caf 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
53 xfs_fsop_geom_t *geo, 53 xfs_fsop_geom_t *geo,
54 int new_version) 54 int new_version)
55{ 55{
56
57 memset(geo, 0, sizeof(*geo));
58
56 geo->blocksize = mp->m_sb.sb_blocksize; 59 geo->blocksize = mp->m_sb.sb_blocksize;
57 geo->rtextsize = mp->m_sb.sb_rextsize; 60 geo->rtextsize = mp->m_sb.sb_rextsize;
58 geo->agblocks = mp->m_sb.sb_agblocks; 61 geo->agblocks = mp->m_sb.sb_agblocks;
@@ -374,6 +377,7 @@ xfs_growfs_data_private(
374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 377 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
375 } else 378 } else
376 mp->m_maxicount = 0; 379 mp->m_maxicount = 0;
380 xfs_set_low_space_thresholds(mp);
377 381
378 /* update secondary superblocks. */ 382 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 383 for (agno = 1; agno < nagcount; agno++) {
@@ -381,8 +385,8 @@ xfs_growfs_data_private(
381 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 385 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
382 XFS_FSS_TO_BB(mp, 1), 0, &bp); 386 XFS_FSS_TO_BB(mp, 1), 0, &bp);
383 if (error) { 387 if (error) {
384 xfs_fs_cmn_err(CE_WARN, mp, 388 xfs_warn(mp,
385 "error %d reading secondary superblock for ag %d", 389 "error %d reading secondary superblock for ag %d",
386 error, agno); 390 error, agno);
387 break; 391 break;
388 } 392 }
@@ -395,7 +399,7 @@ xfs_growfs_data_private(
395 if (!(error = xfs_bwrite(mp, bp))) { 399 if (!(error = xfs_bwrite(mp, bp))) {
396 continue; 400 continue;
397 } else { 401 } else {
398 xfs_fs_cmn_err(CE_WARN, mp, 402 xfs_warn(mp,
399 "write error %d updating secondary superblock for ag %d", 403 "write error %d updating secondary superblock for ag %d",
400 error, agno); 404 error, agno);
401 break; /* no point in continuing */ 405 break; /* no point in continuing */
@@ -611,12 +615,13 @@ out:
611 * 615 *
612 * We cannot use an inode here for this - that will push dirty state back up 616 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from 617 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead. 618 * making progress. Hence we log a field in the superblock instead and use a
619 * synchronous transaction to ensure the superblock is immediately unpinned
620 * and can be written back.
615 */ 621 */
616int 622int
617xfs_fs_log_dummy( 623xfs_fs_log_dummy(
618 xfs_mount_t *mp, 624 xfs_mount_t *mp)
619 int flags)
620{ 625{
621 xfs_trans_t *tp; 626 xfs_trans_t *tp;
622 int error; 627 int error;
@@ -631,8 +636,7 @@ xfs_fs_log_dummy(
631 636
632 /* log the UUID because it is an unchanging field */ 637 /* log the UUID because it is an unchanging field */
633 xfs_mod_sb(tp, XFS_SB_UUID); 638 xfs_mod_sb(tp, XFS_SB_UUID);
634 if (flags & SYNC_WAIT) 639 xfs_trans_set_sync(tp);
635 xfs_trans_set_sync(tp);
636 return xfs_trans_commit(tp, 0); 640 return xfs_trans_commit(tp, 0);
637} 641}
638 642
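
The added memset() in xfs_fs_geometry() gives every field of the output structure a defined value before the version-dependent fields are filled in, so nothing uninitialised is copied back to userspace. A generic sketch of the same idea with invented structure and field names:

    /*
     * Sketch only: zero the whole output structure up front so fields that a
     * given version does not set cannot carry indeterminate memory back to
     * the caller.
     */
    #include <stdio.h>
    #include <string.h>

    struct geom_out {
        unsigned int blocksize;
        unsigned int agcount;
        unsigned int v4_only_field;     /* only filled for new_version >= 4 */
    };

    static void fill_geometry(struct geom_out *geo, int new_version)
    {
        memset(geo, 0, sizeof(*geo));   /* defined contents for every field */

        geo->blocksize = 4096;
        geo->agcount = 16;
        if (new_version >= 4)
            geo->v4_only_field = 1;
    }

    int main(void)
    {
        struct geom_out g;

        fill_geometry(&g, 3);
        printf("v4_only_field = %u\n", g.v4_only_field);    /* 0, not garbage */
        return 0;
    }
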
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0626a32c3447..84ebeec16642 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1055,28 +1055,23 @@ xfs_difree(
1055 */ 1055 */
1056 agno = XFS_INO_TO_AGNO(mp, inode); 1056 agno = XFS_INO_TO_AGNO(mp, inode);
1057 if (agno >= mp->m_sb.sb_agcount) { 1057 if (agno >= mp->m_sb.sb_agcount) {
1058 cmn_err(CE_WARN, 1058 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1059 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", 1059 __func__, agno, mp->m_sb.sb_agcount);
1060 agno, mp->m_sb.sb_agcount, mp->m_fsname);
1061 ASSERT(0); 1060 ASSERT(0);
1062 return XFS_ERROR(EINVAL); 1061 return XFS_ERROR(EINVAL);
1063 } 1062 }
1064 agino = XFS_INO_TO_AGINO(mp, inode); 1063 agino = XFS_INO_TO_AGINO(mp, inode);
1065 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { 1064 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
1066 cmn_err(CE_WARN, 1065 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
1067 "xfs_difree: inode != XFS_AGINO_TO_INO() " 1066 __func__, (unsigned long long)inode,
1068 "(%llu != %llu) on %s. Returning EINVAL.", 1067 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1069 (unsigned long long)inode,
1070 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
1071 mp->m_fsname);
1072 ASSERT(0); 1068 ASSERT(0);
1073 return XFS_ERROR(EINVAL); 1069 return XFS_ERROR(EINVAL);
1074 } 1070 }
1075 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1071 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1076 if (agbno >= mp->m_sb.sb_agblocks) { 1072 if (agbno >= mp->m_sb.sb_agblocks) {
1077 cmn_err(CE_WARN, 1073 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1078 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", 1074 __func__, agbno, mp->m_sb.sb_agblocks);
1079 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
1080 ASSERT(0); 1075 ASSERT(0);
1081 return XFS_ERROR(EINVAL); 1076 return XFS_ERROR(EINVAL);
1082 } 1077 }
@@ -1085,9 +1080,8 @@ xfs_difree(
1085 */ 1080 */
1086 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1081 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1087 if (error) { 1082 if (error) {
1088 cmn_err(CE_WARN, 1083 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
1089 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1084 __func__, error);
1090 error, mp->m_fsname);
1091 return error; 1085 return error;
1092 } 1086 }
1093 agi = XFS_BUF_TO_AGI(agbp); 1087 agi = XFS_BUF_TO_AGI(agbp);
@@ -1106,17 +1100,15 @@ xfs_difree(
1106 * Look for the entry describing this inode. 1100 * Look for the entry describing this inode.
1107 */ 1101 */
1108 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { 1102 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1109 cmn_err(CE_WARN, 1103 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1110 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", 1104 __func__, error);
1111 error, mp->m_fsname);
1112 goto error0; 1105 goto error0;
1113 } 1106 }
1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1107 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1115 error = xfs_inobt_get_rec(cur, &rec, &i); 1108 error = xfs_inobt_get_rec(cur, &rec, &i);
1116 if (error) { 1109 if (error) {
1117 cmn_err(CE_WARN, 1110 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1118 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1111 __func__, error);
1119 error, mp->m_fsname);
1120 goto error0; 1112 goto error0;
1121 } 1113 }
1122 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1157,8 +1149,8 @@ xfs_difree(
1157 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1149 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1158 1150
1159 if ((error = xfs_btree_delete(cur, &i))) { 1151 if ((error = xfs_btree_delete(cur, &i))) {
1160 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", 1152 xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1161 error, mp->m_fsname); 1153 __func__, error);
1162 goto error0; 1154 goto error0;
1163 } 1155 }
1164 1156
@@ -1170,9 +1162,8 @@ xfs_difree(
1170 1162
1171 error = xfs_inobt_update(cur, &rec); 1163 error = xfs_inobt_update(cur, &rec);
1172 if (error) { 1164 if (error) {
1173 cmn_err(CE_WARN, 1165 xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1174 "xfs_difree: xfs_inobt_update returned an error %d on %s.", 1166 __func__, error);
1175 error, mp->m_fsname);
1176 goto error0; 1167 goto error0;
1177 } 1168 }
1178 1169
@@ -1218,10 +1209,9 @@ xfs_imap_lookup(
1218 1209
1219 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1210 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1220 if (error) { 1211 if (error) {
1221 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1212 xfs_alert(mp,
1222 "xfs_ialloc_read_agi() returned " 1213 "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
1223 "error %d, agno %d", 1214 __func__, error, agno);
1224 error, agno);
1225 return error; 1215 return error;
1226 } 1216 }
1227 1217
@@ -1299,24 +1289,21 @@ xfs_imap(
1299 if (flags & XFS_IGET_UNTRUSTED) 1289 if (flags & XFS_IGET_UNTRUSTED)
1300 return XFS_ERROR(EINVAL); 1290 return XFS_ERROR(EINVAL);
1301 if (agno >= mp->m_sb.sb_agcount) { 1291 if (agno >= mp->m_sb.sb_agcount) {
1302 xfs_fs_cmn_err(CE_ALERT, mp, 1292 xfs_alert(mp,
1303 "xfs_imap: agno (%d) >= " 1293 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
1304 "mp->m_sb.sb_agcount (%d)", 1294 __func__, agno, mp->m_sb.sb_agcount);
1305 agno, mp->m_sb.sb_agcount);
1306 } 1295 }
1307 if (agbno >= mp->m_sb.sb_agblocks) { 1296 if (agbno >= mp->m_sb.sb_agblocks) {
1308 xfs_fs_cmn_err(CE_ALERT, mp, 1297 xfs_alert(mp,
1309 "xfs_imap: agbno (0x%llx) >= " 1298 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
1310 "mp->m_sb.sb_agblocks (0x%lx)", 1299 __func__, (unsigned long long)agbno,
1311 (unsigned long long) agbno, 1300 (unsigned long)mp->m_sb.sb_agblocks);
1312 (unsigned long) mp->m_sb.sb_agblocks);
1313 } 1301 }
1314 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1302 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1315 xfs_fs_cmn_err(CE_ALERT, mp, 1303 xfs_alert(mp,
1316 "xfs_imap: ino (0x%llx) != " 1304 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
1317 "XFS_AGINO_TO_INO(mp, agno, agino) " 1305 __func__, ino,
1318 "(0x%llx)", 1306 XFS_AGINO_TO_INO(mp, agno, agino));
1319 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1320 } 1307 }
1321 xfs_stack_trace(); 1308 xfs_stack_trace();
1322#endif /* DEBUG */ 1309#endif /* DEBUG */
@@ -1388,10 +1375,9 @@ out_map:
1388 */ 1375 */
1389 if ((imap->im_blkno + imap->im_len) > 1376 if ((imap->im_blkno + imap->im_len) >
1390 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 1377 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1391 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1378 xfs_alert(mp,
1392 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " 1379 "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
1393 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", 1380 __func__, (unsigned long long) imap->im_blkno,
1394 (unsigned long long) imap->im_blkno,
1395 (unsigned long long) imap->im_len, 1381 (unsigned long long) imap->im_len,
1396 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1382 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1397 return XFS_ERROR(EINVAL); 1383 return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0cdd26932d8e..cb9b6d1469f7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
48 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
49 * guarantee the locks are considered the same when there are multiple lock
50 * initialisation sites. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,12 +99,20 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
105STATIC void
106xfs_inode_free_callback(
107 struct rcu_head *head)
108{
109 struct inode *inode = container_of(head, struct inode, i_rcu);
110 struct xfs_inode *ip = XFS_I(inode);
111
112 INIT_LIST_HEAD(&inode->i_dentry);
113 kmem_zone_free(xfs_inode_zone, ip);
114}
115
94void 116void
95xfs_inode_free( 117xfs_inode_free(
96 struct xfs_inode *ip) 118 struct xfs_inode *ip)
@@ -134,7 +156,18 @@ xfs_inode_free(
134 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
135 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
136 158
137 kmem_zone_free(xfs_inode_zone, ip); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138} 171}
139 172
140/* 173/*
@@ -144,14 +177,29 @@ static int
144xfs_iget_cache_hit( 177xfs_iget_cache_hit(
145 struct xfs_perag *pag, 178 struct xfs_perag *pag,
146 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
147 int flags, 181 int flags,
148 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
149{ 183{
150 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
151 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
152 int error; 186 int error;
153 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
154 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
155 203
156 /* 204 /*
157 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -194,7 +242,7 @@ xfs_iget_cache_hit(
194 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
195 243
196 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
197 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
198 246
199 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
200 if (error) { 248 if (error) {
@@ -202,7 +250,7 @@ xfs_iget_cache_hit(
202 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
203 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
204 */ 252 */
205 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
206 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
207 255
208 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~XFS_INEW;
@@ -212,14 +260,20 @@ xfs_iget_cache_hit(
212 goto out_error; 260 goto out_error;
213 } 261 }
214 262
215 write_lock(&pag->pag_ici_lock); 263 spin_lock(&pag->pag_ici_lock);
216 spin_lock(&ip->i_flags_lock); 264 spin_lock(&ip->i_flags_lock);
217 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 265 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
218 ip->i_flags |= XFS_INEW; 266 ip->i_flags |= XFS_INEW;
219 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 267 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
220 inode->i_state = I_NEW; 268 inode->i_state = I_NEW;
269
270 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
271 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
272 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
273 &xfs_iolock_active, "xfs_iolock_active");
274
221 spin_unlock(&ip->i_flags_lock); 275 spin_unlock(&ip->i_flags_lock);
222 write_unlock(&pag->pag_ici_lock); 276 spin_unlock(&pag->pag_ici_lock);
223 } else { 277 } else {
224 /* If the VFS inode is being torn down, pause and try again. */ 278 /* If the VFS inode is being torn down, pause and try again. */
225 if (!igrab(inode)) { 279 if (!igrab(inode)) {
@@ -230,7 +284,7 @@ xfs_iget_cache_hit(
230 284
231 /* We've got a live one. */ 285 /* We've got a live one. */
232 spin_unlock(&ip->i_flags_lock); 286 spin_unlock(&ip->i_flags_lock);
233 read_unlock(&pag->pag_ici_lock); 287 rcu_read_unlock();
234 trace_xfs_iget_hit(ip); 288 trace_xfs_iget_hit(ip);
235 } 289 }
236 290
@@ -244,7 +298,7 @@ xfs_iget_cache_hit(
244 298
245out_error: 299out_error:
246 spin_unlock(&ip->i_flags_lock); 300 spin_unlock(&ip->i_flags_lock);
247 read_unlock(&pag->pag_ici_lock); 301 rcu_read_unlock();
248 return error; 302 return error;
249} 303}
250 304
@@ -297,7 +351,7 @@ xfs_iget_cache_miss(
297 BUG(); 351 BUG();
298 } 352 }
299 353
300 write_lock(&pag->pag_ici_lock); 354 spin_lock(&pag->pag_ici_lock);
301 355
302 /* insert the new inode */ 356 /* insert the new inode */
303 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 357 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -312,14 +366,14 @@ xfs_iget_cache_miss(
312 ip->i_udquot = ip->i_gdquot = NULL; 366 ip->i_udquot = ip->i_gdquot = NULL;
313 xfs_iflags_set(ip, XFS_INEW); 367 xfs_iflags_set(ip, XFS_INEW);
314 368
315 write_unlock(&pag->pag_ici_lock); 369 spin_unlock(&pag->pag_ici_lock);
316 radix_tree_preload_end(); 370 radix_tree_preload_end();
317 371
318 *ipp = ip; 372 *ipp = ip;
319 return 0; 373 return 0;
320 374
321out_preload_end: 375out_preload_end:
322 write_unlock(&pag->pag_ici_lock); 376 spin_unlock(&pag->pag_ici_lock);
323 radix_tree_preload_end(); 377 radix_tree_preload_end();
324 if (lock_flags) 378 if (lock_flags)
325 xfs_iunlock(ip, lock_flags); 379 xfs_iunlock(ip, lock_flags);
@@ -366,7 +420,7 @@ xfs_iget(
366 xfs_agino_t agino; 420 xfs_agino_t agino;
367 421
368 /* reject inode numbers outside existing AGs */ 422 /* reject inode numbers outside existing AGs */
369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 423 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
370 return EINVAL; 424 return EINVAL;
371 425
372 /* get the perag structure and ensure that it's inode capable */ 426 /* get the perag structure and ensure that it's inode capable */
@@ -375,15 +429,15 @@ xfs_iget(
375 429
376again: 430again:
377 error = 0; 431 error = 0;
378 read_lock(&pag->pag_ici_lock); 432 rcu_read_lock();
379 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 433 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
380 434
381 if (ip) { 435 if (ip) {
382 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 436 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
383 if (error) 437 if (error)
384 goto out_error_or_again; 438 goto out_error_or_again;
385 } else { 439 } else {
386 read_unlock(&pag->pag_ici_lock); 440 rcu_read_unlock();
387 XFS_STATS_INC(xs_ig_missed); 441 XFS_STATS_INC(xs_ig_missed);
388 442
389 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 443 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
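
With the inode cache lookups converted to rcu_read_lock(), a lookup can race with an inode being freed or recycled, so the inode number is zeroed before the RCU free and every cache hit re-checks it under i_flags_lock. A simplified userspace model of that revalidate-after-lookup step; the mutex stands in for the spinlock and there is no real RCU here:

    /*
     * Model only, not the kernel code: the cache frees objects lazily, so a
     * lookup can return a recycled object. The key is zeroed before the object
     * is queued for freeing, and every hit re-checks the key under the
     * object's lock.
     */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct cached_inode {
        pthread_mutex_t lock;
        unsigned long ino;      /* 0 means "being freed / recycled" */
    };

    static bool lookup_hit_is_valid(struct cached_inode *ip, unsigned long want_ino)
    {
        bool ok;

        pthread_mutex_lock(&ip->lock);
        ok = (ip->ino == want_ino);     /* stale or reallocated hits fail here */
        pthread_mutex_unlock(&ip->lock);

        if (!ok)
            printf("stale cache hit for inode %lu, retry the lookup\n", want_ino);
        return ok;
    }

    int main(void)
    {
        struct cached_inode ip = { PTHREAD_MUTEX_INITIALIZER, 0 };

        lookup_hit_is_valid(&ip, 42);   /* object already marked freed */
        ip.ino = 42;
        return lookup_hit_is_valid(&ip, 42) ? 0 : 1;
    }

In the hunk above this corresponds to the ip->i_ino != ino check added to xfs_iget_cache_hit() and the i_ino = 0 store done under i_flags_lock in xfs_inode_free().
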
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..a37480a6e023 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -110,8 +110,8 @@ xfs_inobp_check(
110 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 110 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
111 i * mp->m_sb.sb_inodesize); 111 i * mp->m_sb.sb_inodesize);
112 if (!dip->di_next_unlinked) { 112 if (!dip->di_next_unlinked) {
113 xfs_fs_cmn_err(CE_ALERT, mp, 113 xfs_alert(mp,
114 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 114 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
115 bp); 115 bp);
116 ASSERT(dip->di_next_unlinked); 116 ASSERT(dip->di_next_unlinked);
117 } 117 }
@@ -142,10 +142,9 @@ xfs_imap_to_bp(
142 (int)imap->im_len, buf_flags, &bp); 142 (int)imap->im_len, buf_flags, &bp);
143 if (error) { 143 if (error) {
144 if (error != EAGAIN) { 144 if (error != EAGAIN) {
145 cmn_err(CE_WARN, 145 xfs_warn(mp,
146 "xfs_imap_to_bp: xfs_trans_read_buf()returned " 146 "%s: xfs_trans_read_buf() returned error %d.",
147 "an error %d on %s. Returning error.", 147 __func__, error);
148 error, mp->m_fsname);
149 } else { 148 } else {
150 ASSERT(buf_flags & XBF_TRYLOCK); 149 ASSERT(buf_flags & XBF_TRYLOCK);
151 } 150 }
@@ -180,12 +179,11 @@ xfs_imap_to_bp(
180 XFS_CORRUPTION_ERROR("xfs_imap_to_bp", 179 XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
181 XFS_ERRLEVEL_HIGH, mp, dip); 180 XFS_ERRLEVEL_HIGH, mp, dip);
182#ifdef DEBUG 181#ifdef DEBUG
183 cmn_err(CE_PANIC, 182 xfs_emerg(mp,
184 "Device %s - bad inode magic/vsn " 183 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
185 "daddr %lld #%d (magic=%x)",
186 XFS_BUFTARG_NAME(mp->m_ddev_targp),
187 (unsigned long long)imap->im_blkno, i, 184 (unsigned long long)imap->im_blkno, i,
188 be16_to_cpu(dip->di_magic)); 185 be16_to_cpu(dip->di_magic));
186 ASSERT(0);
189#endif 187#endif
190 xfs_trans_brelse(tp, bp); 188 xfs_trans_brelse(tp, bp);
191 return XFS_ERROR(EFSCORRUPTED); 189 return XFS_ERROR(EFSCORRUPTED);
@@ -317,7 +315,7 @@ xfs_iformat(
317 if (unlikely(be32_to_cpu(dip->di_nextents) + 315 if (unlikely(be32_to_cpu(dip->di_nextents) +
318 be16_to_cpu(dip->di_anextents) > 316 be16_to_cpu(dip->di_anextents) >
319 be64_to_cpu(dip->di_nblocks))) { 317 be64_to_cpu(dip->di_nblocks))) {
320 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 318 xfs_warn(ip->i_mount,
321 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 319 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
322 (unsigned long long)ip->i_ino, 320 (unsigned long long)ip->i_ino,
323 (int)(be32_to_cpu(dip->di_nextents) + 321 (int)(be32_to_cpu(dip->di_nextents) +
@@ -330,8 +328,7 @@ xfs_iformat(
330 } 328 }
331 329
332 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 330 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
333 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 331 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
334 "corrupt dinode %Lu, forkoff = 0x%x.",
335 (unsigned long long)ip->i_ino, 332 (unsigned long long)ip->i_ino,
336 dip->di_forkoff); 333 dip->di_forkoff);
337 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 334 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
@@ -341,7 +338,7 @@ xfs_iformat(
341 338
342 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && 339 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
343 !ip->i_mount->m_rtdev_targp)) { 340 !ip->i_mount->m_rtdev_targp)) {
344 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 341 xfs_warn(ip->i_mount,
345 "corrupt dinode %Lu, has realtime flag set.", 342 "corrupt dinode %Lu, has realtime flag set.",
346 ip->i_ino); 343 ip->i_ino);
347 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", 344 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
@@ -373,9 +370,8 @@ xfs_iformat(
373 * no local regular files yet 370 * no local regular files yet
374 */ 371 */
375 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 372 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
376 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 373 xfs_warn(ip->i_mount,
377 "corrupt inode %Lu " 374 "corrupt inode %Lu (local format for regular file).",
378 "(local format for regular file).",
379 (unsigned long long) ip->i_ino); 375 (unsigned long long) ip->i_ino);
380 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 376 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
381 XFS_ERRLEVEL_LOW, 377 XFS_ERRLEVEL_LOW,
@@ -385,9 +381,8 @@ xfs_iformat(
385 381
386 di_size = be64_to_cpu(dip->di_size); 382 di_size = be64_to_cpu(dip->di_size);
387 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 383 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
388 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 384 xfs_warn(ip->i_mount,
389 "corrupt inode %Lu " 385 "corrupt inode %Lu (bad size %Ld for local inode).",
390 "(bad size %Ld for local inode).",
391 (unsigned long long) ip->i_ino, 386 (unsigned long long) ip->i_ino,
392 (long long) di_size); 387 (long long) di_size);
393 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 388 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
@@ -431,9 +426,8 @@ xfs_iformat(
431 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
432 427
433 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { 428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
434 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 429 xfs_warn(ip->i_mount,
435 "corrupt inode %Lu " 430 "corrupt inode %Lu (bad attr fork size %Ld).",
436 "(bad attr fork size %Ld).",
437 (unsigned long long) ip->i_ino, 431 (unsigned long long) ip->i_ino,
438 (long long) size); 432 (long long) size);
439 XFS_CORRUPTION_ERROR("xfs_iformat(8)", 433 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
@@ -488,9 +482,8 @@ xfs_iformat_local(
488 * kmem_alloc() or memcpy() below. 482 * kmem_alloc() or memcpy() below.
489 */ 483 */
490 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 484 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
491 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 485 xfs_warn(ip->i_mount,
492 "corrupt inode %Lu " 486 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
493 "(bad size %d for local fork, size = %d).",
494 (unsigned long long) ip->i_ino, size, 487 (unsigned long long) ip->i_ino, size,
495 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 488 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
496 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 489 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
@@ -547,8 +540,7 @@ xfs_iformat_extents(
547 * kmem_alloc() or memcpy() below. 540 * kmem_alloc() or memcpy() below.
548 */ 541 */
549 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 542 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
550 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 543 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
551 "corrupt inode %Lu ((a)extents = %d).",
552 (unsigned long long) ip->i_ino, nex); 544 (unsigned long long) ip->i_ino, nex);
553 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 545 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
554 ip->i_mount, dip); 546 ip->i_mount, dip);
@@ -623,11 +615,10 @@ xfs_iformat_btree(
623 || XFS_BMDR_SPACE_CALC(nrecs) > 615 || XFS_BMDR_SPACE_CALC(nrecs) >
624 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
625 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 617 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
626 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 618 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
627 "corrupt inode %Lu (btree).",
628 (unsigned long long) ip->i_ino); 619 (unsigned long long) ip->i_ino);
629 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 620 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
630 ip->i_mount); 621 ip->i_mount, dip);
631 return XFS_ERROR(EFSCORRUPTED); 622 return XFS_ERROR(EFSCORRUPTED);
632 } 623 }
633 624
@@ -813,11 +804,9 @@ xfs_iread(
813 */ 804 */
814 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { 805 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
815#ifdef DEBUG 806#ifdef DEBUG
816 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 807 xfs_alert(mp,
817 "dip->di_magic (0x%x) != " 808 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
818 "XFS_DINODE_MAGIC (0x%x)", 809 __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
819 be16_to_cpu(dip->di_magic),
820 XFS_DINODE_MAGIC);
821#endif /* DEBUG */ 810#endif /* DEBUG */
822 error = XFS_ERROR(EINVAL); 811 error = XFS_ERROR(EINVAL);
823 goto out_brelse; 812 goto out_brelse;
@@ -835,9 +824,8 @@ xfs_iread(
835 error = xfs_iformat(ip, dip); 824 error = xfs_iformat(ip, dip);
836 if (error) { 825 if (error) {
837#ifdef DEBUG 826#ifdef DEBUG
838 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 827 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
839 "xfs_iformat() returned error %d", 828 __func__, error);
840 error);
841#endif /* DEBUG */ 829#endif /* DEBUG */
842 goto out_brelse; 830 goto out_brelse;
843 } 831 }
@@ -887,7 +875,7 @@ xfs_iread(
887 * around for a while. This helps to keep recently accessed 875 * around for a while. This helps to keep recently accessed
888 * meta-data in-core longer. 876 * meta-data in-core longer.
889 */ 877 */
890 XFS_BUF_SET_REF(bp, XFS_INO_REF); 878 xfs_buf_set_ref(bp, XFS_INO_REF);
891 879
892 /* 880 /*
893 * Use xfs_trans_brelse() to release the buffer containing the 881 * Use xfs_trans_brelse() to release the buffer containing the
@@ -1016,8 +1004,8 @@ xfs_ialloc(
1016 * This is because we're setting fields here we need 1004 * This is because we're setting fields here we need
1017 * to prevent others from looking at until we're done. 1005 * to prevent others from looking at until we're done.
1018 */ 1006 */
1019 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1007 error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
1020 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1008 XFS_ILOCK_EXCL, &ip);
1021 if (error) 1009 if (error)
1022 return error; 1010 return error;
1023 ASSERT(ip != NULL); 1011 ASSERT(ip != NULL);
@@ -1166,6 +1154,7 @@ xfs_ialloc(
1166 /* 1154 /*
1167 * Log the new values stuffed into the inode. 1155 * Log the new values stuffed into the inode.
1168 */ 1156 */
1157 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
1169 xfs_trans_log_inode(tp, ip, flags); 1158 xfs_trans_log_inode(tp, ip, flags);
1170 1159
1171 /* now that we have an i_mode we can setup inode ops and unlock */ 1160 /* now that we have an i_mode we can setup inode ops and unlock */
@@ -1820,9 +1809,8 @@ xfs_iunlink_remove(
1820 */ 1809 */
1821 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1810 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1822 if (error) { 1811 if (error) {
1823 cmn_err(CE_WARN, 1812 xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1824 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1813 __func__, error);
1825 error, mp->m_fsname);
1826 return error; 1814 return error;
1827 } 1815 }
1828 next_agino = be32_to_cpu(dip->di_next_unlinked); 1816 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -1867,9 +1855,9 @@ xfs_iunlink_remove(
1867 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1855 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1868 &last_ibp, &last_offset, 0); 1856 &last_ibp, &last_offset, 0);
1869 if (error) { 1857 if (error) {
1870 cmn_err(CE_WARN, 1858 xfs_warn(mp,
1871 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1859 "%s: xfs_inotobp() returned error %d.",
1872 error, mp->m_fsname); 1860 __func__, error);
1873 return error; 1861 return error;
1874 } 1862 }
1875 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 1863 next_agino = be32_to_cpu(last_dip->di_next_unlinked);
@@ -1882,9 +1870,8 @@ xfs_iunlink_remove(
1882 */ 1870 */
1883 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1871 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1884 if (error) { 1872 if (error) {
1885 cmn_err(CE_WARN, 1873 xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1886 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1874 __func__, error);
1887 error, mp->m_fsname);
1888 return error; 1875 return error;
1889 } 1876 }
1890 next_agino = be32_to_cpu(dip->di_next_unlinked); 1877 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -2000,17 +1987,33 @@ xfs_ifree_cluster(
2000 */ 1987 */
2001 for (i = 0; i < ninodes; i++) { 1988 for (i = 0; i < ninodes; i++) {
2002retry: 1989retry:
2003 read_lock(&pag->pag_ici_lock); 1990 rcu_read_lock();
2004 ip = radix_tree_lookup(&pag->pag_ici_root, 1991 ip = radix_tree_lookup(&pag->pag_ici_root,
2005 XFS_INO_TO_AGINO(mp, (inum + i))); 1992 XFS_INO_TO_AGINO(mp, (inum + i)));
2006 1993
2007 /* Inode not in memory or stale, nothing to do */ 1994 /* Inode not in memory, nothing to do */
2008 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 1995 if (!ip) {
2009 read_unlock(&pag->pag_ici_lock); 1996 rcu_read_unlock();
2010 continue; 1997 continue;
2011 } 1998 }
2012 1999
2013 /* 2000 /*
2001 * because this is an RCU protected lookup, we could
2002 * find a recently freed or even reallocated inode
2003 * during the lookup. We need to check under the
2004 * i_flags_lock for a valid inode here. Skip it if it
2005 * is not valid, the wrong inode or stale.
2006 */
2007 spin_lock(&ip->i_flags_lock);
2008 if (ip->i_ino != inum + i ||
2009 __xfs_iflags_test(ip, XFS_ISTALE)) {
2010 spin_unlock(&ip->i_flags_lock);
2011 rcu_read_unlock();
2012 continue;
2013 }
2014 spin_unlock(&ip->i_flags_lock);
2015
2016 /*
2014 * Don't try to lock/unlock the current inode, but we 2017 * Don't try to lock/unlock the current inode, but we
2015 * _cannot_ skip the other inodes that we did not find 2018 * _cannot_ skip the other inodes that we did not find
2016 * in the list attached to the buffer and are not 2019 * in the list attached to the buffer and are not
@@ -2019,11 +2022,11 @@ retry:
2019 */ 2022 */
2020 if (ip != free_ip && 2023 if (ip != free_ip &&
2021 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2024 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2022 read_unlock(&pag->pag_ici_lock); 2025 rcu_read_unlock();
2023 delay(1); 2026 delay(1);
2024 goto retry; 2027 goto retry;
2025 } 2028 }
2026 read_unlock(&pag->pag_ici_lock); 2029 rcu_read_unlock();
2027 2030
2028 xfs_iflock(ip); 2031 xfs_iflock(ip);
2029 xfs_iflags_set(ip, XFS_ISTALE); 2032 xfs_iflags_set(ip, XFS_ISTALE);
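The change above swaps the pag_ici_lock read lock for an RCU read-side section, so the radix tree lookup only guarantees that the inode memory is not freed while rcu_read_lock() is held; the object it returns may already have been freed and reallocated. The new comment spells out the consequence: identity and staleness must be rechecked under i_flags_lock before the inode is trusted. Below is a minimal kernel-style sketch of that lookup-then-revalidate shape, using made-up names (struct obj, obj->id, OBJ_STALE) rather than the real XFS structures:

/* Sketch only: assumes <linux/radix-tree.h>, <linux/rcupdate.h>, <linux/spinlock.h>. */
#define OBJ_STALE       0x1

struct obj {
        spinlock_t      lock;           /* plays the role of i_flags_lock */
        unsigned long   id;             /* plays the role of i_ino */
        unsigned int    flags;
};

static void mark_obj_stale(struct radix_tree_root *tree, unsigned long id)
{
        struct obj      *o;

        rcu_read_lock();
        o = radix_tree_lookup(tree, id);
        if (!o) {
                rcu_read_unlock();
                return;                 /* not in memory, nothing to do */
        }

        /*
         * The RCU lookup can return a recently freed or even reallocated
         * object, so recheck identity and state under the object's own
         * lock before acting on it.
         */
        spin_lock(&o->lock);
        if (o->id != id || (o->flags & OBJ_STALE)) {
                spin_unlock(&o->lock);
                rcu_read_unlock();
                return;
        }
        o->flags |= OBJ_STALE;
        spin_unlock(&o->lock);
        rcu_read_unlock();
}

The real loop additionally has to back off with delay(1) and retry when xfs_ilock_nowait() fails, exactly as in the hunk above, because dropping rcu_read_lock() invalidates the pointer it looked up.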
@@ -2629,7 +2632,7 @@ xfs_iflush_cluster(
2629 2632
2630 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2633 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2631 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2634 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2632 read_lock(&pag->pag_ici_lock); 2635 rcu_read_lock();
2633 /* really need a gang lookup range call here */ 2636 /* really need a gang lookup range call here */
2634 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2637 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2635 first_index, inodes_per_cluster); 2638 first_index, inodes_per_cluster);
@@ -2640,9 +2643,21 @@ xfs_iflush_cluster(
2640 iq = ilist[i]; 2643 iq = ilist[i];
2641 if (iq == ip) 2644 if (iq == ip)
2642 continue; 2645 continue;
2643 /* if the inode lies outside this cluster, we're done. */ 2646
2644 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2647 /*
2645 break; 2648 * because this is an RCU protected lookup, we could find a
2649 * recently freed or even reallocated inode during the lookup.
2650 * We need to check under the i_flags_lock for a valid inode
2651 * here. Skip it if it is not valid or the wrong inode.
2652 */
2653 spin_lock(&ip->i_flags_lock);
2654 if (!ip->i_ino ||
2655 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2656 spin_unlock(&ip->i_flags_lock);
2657 continue;
2658 }
2659 spin_unlock(&ip->i_flags_lock);
2660
2646 /* 2661 /*
2647 * Do an un-protected check to see if the inode is dirty and 2662 * Do an un-protected check to see if the inode is dirty and
2648 * is a candidate for flushing. These checks will be repeated 2663 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2707,7 @@ xfs_iflush_cluster(
2692 } 2707 }
2693 2708
2694out_free: 2709out_free:
2695 read_unlock(&pag->pag_ici_lock); 2710 rcu_read_unlock();
2696 kmem_free(ilist); 2711 kmem_free(ilist);
2697out_put: 2712out_put:
2698 xfs_perag_put(pag); 2713 xfs_perag_put(pag);
@@ -2704,7 +2719,7 @@ cluster_corrupt_out:
2704 * Corruption detected in the clustering loop. Invalidate the 2719 * Corruption detected in the clustering loop. Invalidate the
2705 * inode buffer and shut down the filesystem. 2720 * inode buffer and shut down the filesystem.
2706 */ 2721 */
2707 read_unlock(&pag->pag_ici_lock); 2722 rcu_read_unlock();
2708 /* 2723 /*
2709 * Clean up the buffer. If it was B_DELWRI, just release it -- 2724 * Clean up the buffer. If it was B_DELWRI, just release it --
2710 * brelse can handle it with no problems. If not, shut down the 2725 * brelse can handle it with no problems. If not, shut down the
@@ -2774,7 +2789,7 @@ xfs_iflush(
2774 2789
2775 /* 2790 /*
2776 * We can't flush the inode until it is unpinned, so wait for it if we 2791 * We can't flush the inode until it is unpinned, so wait for it if we
2777 * are allowed to block. We know noone new can pin it, because we are 2792 * are allowed to block. We know no one new can pin it, because we are
2778 * holding the inode lock shared and you need to hold it exclusively to 2793 * holding the inode lock shared and you need to hold it exclusively to
2779 * pin the inode. 2794 * pin the inode.
2780 * 2795 *
@@ -2820,7 +2835,7 @@ xfs_iflush(
2820 * Get the buffer containing the on-disk inode. 2835 * Get the buffer containing the on-disk inode.
2821 */ 2836 */
2822 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2837 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2823 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); 2838 (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
2824 if (error || !bp) { 2839 if (error || !bp) {
2825 xfs_ifunlock(ip); 2840 xfs_ifunlock(ip);
2826 return error; 2841 return error;
@@ -2911,16 +2926,16 @@ xfs_iflush_int(
2911 2926
2912 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2927 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
2913 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2928 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2914 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2929 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2915 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2930 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2916 ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2931 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2917 goto corrupt_out; 2932 goto corrupt_out;
2918 } 2933 }
2919 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2934 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2920 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2935 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2921 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2936 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2922 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2937 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2923 ip->i_ino, ip, ip->i_d.di_magic); 2938 __func__, ip->i_ino, ip, ip->i_d.di_magic);
2924 goto corrupt_out; 2939 goto corrupt_out;
2925 } 2940 }
2926 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 2941 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
@@ -2928,9 +2943,9 @@ xfs_iflush_int(
2928 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2943 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2929 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2944 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2930 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2945 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2931 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2946 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2932 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 2947 "%s: Bad regular inode %Lu, ptr 0x%p",
2933 ip->i_ino, ip); 2948 __func__, ip->i_ino, ip);
2934 goto corrupt_out; 2949 goto corrupt_out;
2935 } 2950 }
2936 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 2951 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
@@ -2939,28 +2954,28 @@ xfs_iflush_int(
2939 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2954 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2940 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2955 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2941 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2956 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2942 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2957 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2943 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 2958 "%s: Bad directory inode %Lu, ptr 0x%p",
2944 ip->i_ino, ip); 2959 __func__, ip->i_ino, ip);
2945 goto corrupt_out; 2960 goto corrupt_out;
2946 } 2961 }
2947 } 2962 }
2948 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2963 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2949 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2964 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2950 XFS_RANDOM_IFLUSH_5)) { 2965 XFS_RANDOM_IFLUSH_5)) {
2951 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2966 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2952 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 2967 "%s: detected corrupt incore inode %Lu, "
2953 ip->i_ino, 2968 "total extents = %d, nblocks = %Ld, ptr 0x%p",
2969 __func__, ip->i_ino,
2954 ip->i_d.di_nextents + ip->i_d.di_anextents, 2970 ip->i_d.di_nextents + ip->i_d.di_anextents,
2955 ip->i_d.di_nblocks, 2971 ip->i_d.di_nblocks, ip);
2956 ip);
2957 goto corrupt_out; 2972 goto corrupt_out;
2958 } 2973 }
2959 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2974 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2960 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2975 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2961 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2976 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2962 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2977 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2963 ip->i_ino, ip->i_d.di_forkoff, ip); 2978 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2964 goto corrupt_out; 2979 goto corrupt_out;
2965 } 2980 }
2966 /* 2981 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e4cdc9..ff4e2a30227d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -111,7 +111,7 @@ struct xfs_imap {
111 * Generally, we do not want to hold the i_rlock while holding the 111 * Generally, we do not want to hold the i_rlock while holding the
112 * i_ilock. Hierarchy is i_iolock followed by i_rlock. 112 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
113 * 113 *
114 * xfs_iptr_t contains all the inode fields upto and including the 114 * xfs_iptr_t contains all the inode fields up to and including the
115 * i_mnext and i_mprev fields, it is used as a marker in the inode 115 * i_mnext and i_mprev fields, it is used as a marker in the inode
116 * chain off the mount structure by xfs_sync calls. 116 * chain off the mount structure by xfs_sync calls.
117 */ 117 */
@@ -336,7 +336,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
336 336
337/* 337/*
338 * Project quota id helpers (previously projid was 16bit only 338 * Project quota id helpers (previously projid was 16bit only
339 * and using two 16bit values to hold new 32bit projid was choosen 339 * and using two 16bit values to hold new 32bit projid was chosen
340 * to retain compatibility with "old" filesystems). 340 * to retain compatibility with "old" filesystems).
341 */ 341 */
342static inline prid_t 342static inline prid_t
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
376/* 376/*
377 * In-core inode flags. 377 * In-core inode flags.
378 */ 378 */
379#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 379#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
380#define XFS_ISTALE 0x0002 /* inode has been staled */ 380#define XFS_ISTALE 0x0002 /* inode has been staled */
381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
382#define XFS_INEW 0x0008 /* inode has just been allocated */ 382#define XFS_INEW 0x0008 /* inode has just been allocated */
383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
385#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
385 386
386/* 387/*
387 * Flags for inode locking. 388 * Flags for inode locking.
@@ -408,28 +409,35 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
408/* 409/*
409 * Flags for lockdep annotations. 410 * Flags for lockdep annotations.
410 * 411 *
411 * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes 412 * XFS_LOCK_PARENT - for directory operations that require locking a
412 * (ie directory operations that require locking a directory inode and 413 * parent directory inode and a child entry inode. The parent gets locked
413 * an entry inode). The first inode gets locked with this flag so it 414 * with this flag so it gets a lockdep subclass of 1 and the child entry
414 * gets a lockdep subclass of 1 and the second lock will have a lockdep 415 * lock will have a lockdep subclass of 0.
415 * subclass of 0. 416 *
417 * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
418 * inodes do not participate in the normal lock order, and thus have their
419 * own subclasses.
416 * 420 *
 417 * XFS_LOCK_INUMORDER - for locking several inodes at the same time 421 * XFS_LOCK_INUMORDER - for locking several inodes at the same time
418 * with xfs_lock_inodes(). This flag is used as the starting subclass 422 * with xfs_lock_inodes(). This flag is used as the starting subclass
419 * and each subsequent lock acquired will increment the subclass by one. 423 * and each subsequent lock acquired will increment the subclass by one.
420 * So the first lock acquired will have a lockdep subclass of 2, the 424 * So the first lock acquired will have a lockdep subclass of 4, the
421 * second lock will have a lockdep subclass of 3, and so on. It is 425 * second lock will have a lockdep subclass of 5, and so on. It is
422 * the responsibility of the class builder to shift this to the correct 426 * the responsibility of the class builder to shift this to the correct
423 * portion of the lock_mode lockdep mask. 427 * portion of the lock_mode lockdep mask.
424 */ 428 */
425#define XFS_LOCK_PARENT 1 429#define XFS_LOCK_PARENT 1
426#define XFS_LOCK_INUMORDER 2 430#define XFS_LOCK_RTBITMAP 2
431#define XFS_LOCK_RTSUM 3
432#define XFS_LOCK_INUMORDER 4
427 433
428#define XFS_IOLOCK_SHIFT 16 434#define XFS_IOLOCK_SHIFT 16
429#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 435#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
430 436
431#define XFS_ILOCK_SHIFT 24 437#define XFS_ILOCK_SHIFT 24
432#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 438#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
439#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
440#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
433 441
434#define XFS_IOLOCK_DEP_MASK 0x00ff0000 442#define XFS_IOLOCK_DEP_MASK 0x00ff0000
435#define XFS_ILOCK_DEP_MASK 0xff000000 443#define XFS_ILOCK_DEP_MASK 0xff000000
@@ -438,6 +446,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
438#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 446#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
439#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 447#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
440 448
449extern struct lock_class_key xfs_iolock_reclaimable;
450
441/* 451/*
442 * Flags for xfs_itruncate_start(). 452 * Flags for xfs_itruncate_start().
443 */ 453 */
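The functional content of this header change is how lockdep subclasses are packed into the lock_mode word, and that arithmetic can be checked in isolation. A small standalone program using only the constants from the hunk above (nothing beyond those macros is XFS code):

#include <assert.h>
#include <stdio.h>

/* Values copied from the hunk above. */
#define XFS_LOCK_PARENT         1
#define XFS_LOCK_RTBITMAP       2
#define XFS_LOCK_RTSUM          3
#define XFS_LOCK_INUMORDER      4

#define XFS_IOLOCK_SHIFT        16
#define XFS_ILOCK_SHIFT         24

#define XFS_IOLOCK_PARENT       (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
#define XFS_ILOCK_RTBITMAP      (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTSUM         (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)

#define XFS_ILOCK_DEP_MASK      0xff000000
#define XFS_ILOCK_DEP(flags)    (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)

int main(void)
{
        /* Each subclass lands in its own byte of the lock_mode word... */
        assert(XFS_IOLOCK_PARENT == 0x00010000);
        assert(XFS_ILOCK_RTBITMAP == 0x02000000);
        assert(XFS_ILOCK_RTSUM == 0x03000000);

        /* ...and XFS_ILOCK_DEP() recovers the subclass for lockdep. */
        assert(XFS_ILOCK_DEP(XFS_ILOCK_RTBITMAP) == XFS_LOCK_RTBITMAP);

        printf("lockdep subclass packing checks out\n");
        return 0;
}

With XFS_LOCK_INUMORDER now 4, the first inode locked by xfs_lock_inodes() gets subclass 4, which is why the comment above was updated from "2, 3, ..." to "4, 5, ...".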
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c8d30c453c3..576fdfe81d60 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -198,6 +198,41 @@ xfs_inode_item_size(
198} 198}
199 199
200/* 200/*
201 * xfs_inode_item_format_extents - convert in-core extents to on-disk form
202 *
203 * For either the data or attr fork in extent format, we need to endian convert
204 * the in-core extent as we place them into the on-disk inode. In this case, we
205 * need to do this conversion before we write the extents into the log. Because
206 * we don't have the disk inode to write into here, we allocate a buffer and
207 * format the extents into it via xfs_iextents_copy(). We free the buffer in
208 * the unlock routine after the copy for the log has been made.
209 *
210 * In the case of the data fork, the in-core and on-disk fork sizes can be
211 * different due to delayed allocation extents. We only log on-disk extents
212 * here, so always use the physical fork size to determine the size of the
213 * buffer we need to allocate.
214 */
215STATIC void
216xfs_inode_item_format_extents(
217 struct xfs_inode *ip,
218 struct xfs_log_iovec *vecp,
219 int whichfork,
220 int type)
221{
222 xfs_bmbt_rec_t *ext_buffer;
223
224 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
225 if (whichfork == XFS_DATA_FORK)
226 ip->i_itemp->ili_extents_buf = ext_buffer;
227 else
228 ip->i_itemp->ili_aextents_buf = ext_buffer;
229
230 vecp->i_addr = ext_buffer;
231 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork);
232 vecp->i_type = type;
233}
234
235/*
201 * This is called to fill in the vector of log iovecs for the 236 * This is called to fill in the vector of log iovecs for the
202 * given inode log item. It fills the first item with an inode 237 * given inode log item. It fills the first item with an inode
203 * log format structure, the second with the on-disk inode structure, 238 * log format structure, the second with the on-disk inode structure,
@@ -213,7 +248,6 @@ xfs_inode_item_format(
213 struct xfs_inode *ip = iip->ili_inode; 248 struct xfs_inode *ip = iip->ili_inode;
214 uint nvecs; 249 uint nvecs;
215 size_t data_bytes; 250 size_t data_bytes;
216 xfs_bmbt_rec_t *ext_buffer;
217 xfs_mount_t *mp; 251 xfs_mount_t *mp;
218 252
219 vecp->i_addr = &iip->ili_format; 253 vecp->i_addr = &iip->ili_format;
@@ -320,22 +354,8 @@ xfs_inode_item_format(
320 } else 354 } else
321#endif 355#endif
322 { 356 {
323 /* 357 xfs_inode_item_format_extents(ip, vecp,
324 * There are delayed allocation extents 358 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
325 * in the inode, or we need to convert
326 * the extents to on disk format.
327 * Use xfs_iextents_copy()
328 * to copy only the real extents into
329 * a separate buffer. We'll free the
330 * buffer in the unlock routine.
331 */
332 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
333 KM_SLEEP);
334 iip->ili_extents_buf = ext_buffer;
335 vecp->i_addr = ext_buffer;
336 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
337 XFS_DATA_FORK);
338 vecp->i_type = XLOG_REG_TYPE_IEXT;
339 } 359 }
340 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
341 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -445,19 +465,12 @@ xfs_inode_item_format(
445 */ 465 */
446 vecp->i_addr = ip->i_afp->if_u1.if_extents; 466 vecp->i_addr = ip->i_afp->if_u1.if_extents;
447 vecp->i_len = ip->i_afp->if_bytes; 467 vecp->i_len = ip->i_afp->if_bytes;
468 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
448#else 469#else
449 ASSERT(iip->ili_aextents_buf == NULL); 470 ASSERT(iip->ili_aextents_buf == NULL);
450 /* 471 xfs_inode_item_format_extents(ip, vecp,
451 * Need to endian flip before logging 472 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
452 */
453 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
454 KM_SLEEP);
455 iip->ili_aextents_buf = ext_buffer;
456 vecp->i_addr = ext_buffer;
457 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
458 XFS_ATTR_FORK);
459#endif 473#endif
460 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
461 iip->ili_format.ilf_asize = vecp->i_len; 474 iip->ili_format.ilf_asize = vecp->i_len;
462 vecp++; 475 vecp++;
463 nvecs++; 476 nvecs++;
@@ -760,11 +773,11 @@ xfs_inode_item_push(
 760 * Push the inode to its backing buffer. This will not remove the 773 * Push the inode to its backing buffer. This will not remove the
761 * inode from the AIL - a further push will be required to trigger a 774 * inode from the AIL - a further push will be required to trigger a
762 * buffer push. However, this allows all the dirty inodes to be pushed 775 * buffer push. However, this allows all the dirty inodes to be pushed
763 * to the buffer before it is pushed to disk. THe buffer IO completion 776 * to the buffer before it is pushed to disk. The buffer IO completion
764 * will pull th einode from the AIL, mark it clean and unlock the flush 777 * will pull the inode from the AIL, mark it clean and unlock the flush
765 * lock. 778 * lock.
766 */ 779 */
767 (void) xfs_iflush(ip, 0); 780 (void) xfs_iflush(ip, SYNC_TRYLOCK);
768 xfs_iunlock(ip, XFS_ILOCK_SHARED); 781 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769} 782}
770 783
@@ -842,15 +855,64 @@ xfs_inode_item_destroy(
842 * flushed to disk. It is responsible for removing the inode item 855 * flushed to disk. It is responsible for removing the inode item
843 * from the AIL if it has not been re-logged, and unlocking the inode's 856 * from the AIL if it has not been re-logged, and unlocking the inode's
844 * flush lock. 857 * flush lock.
858 *
859 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
860 * list for other inodes that will run this function. We remove them from the
861 * buffer list so we can process all the inode IO completions in one AIL lock
862 * traversal.
845 */ 863 */
846void 864void
847xfs_iflush_done( 865xfs_iflush_done(
848 struct xfs_buf *bp, 866 struct xfs_buf *bp,
849 struct xfs_log_item *lip) 867 struct xfs_log_item *lip)
850{ 868{
851 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 869 struct xfs_inode_log_item *iip;
852 xfs_inode_t *ip = iip->ili_inode; 870 struct xfs_log_item *blip;
871 struct xfs_log_item *next;
872 struct xfs_log_item *prev;
853 struct xfs_ail *ailp = lip->li_ailp; 873 struct xfs_ail *ailp = lip->li_ailp;
874 int need_ail = 0;
875
876 /*
877 * Scan the buffer IO completions for other inodes being completed and
878 * attach them to the current inode log item.
879 */
880 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
881 prev = NULL;
882 while (blip != NULL) {
883 if (lip->li_cb != xfs_iflush_done) {
884 prev = blip;
885 blip = blip->li_bio_list;
886 continue;
887 }
888
889 /* remove from list */
890 next = blip->li_bio_list;
891 if (!prev) {
892 XFS_BUF_SET_FSPRIVATE(bp, next);
893 } else {
894 prev->li_bio_list = next;
895 }
896
897 /* add to current list */
898 blip->li_bio_list = lip->li_bio_list;
899 lip->li_bio_list = blip;
900
901 /*
902 * while we have the item, do the unlocked check for needing
903 * the AIL lock.
904 */
905 iip = INODE_ITEM(blip);
906 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
907 need_ail++;
908
909 blip = next;
910 }
911
912 /* make sure we capture the state of the initial inode. */
913 iip = INODE_ITEM(lip);
914 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
915 need_ail++;
854 916
855 /* 917 /*
856 * We only want to pull the item from the AIL if it is 918 * We only want to pull the item from the AIL if it is
@@ -861,28 +923,37 @@ xfs_iflush_done(
861 * the lock since it's cheaper, and then we recheck while 923 * the lock since it's cheaper, and then we recheck while
862 * holding the lock before removing the inode from the AIL. 924 * holding the lock before removing the inode from the AIL.
863 */ 925 */
864 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 926 if (need_ail) {
927 struct xfs_log_item *log_items[need_ail];
928 int i = 0;
865 spin_lock(&ailp->xa_lock); 929 spin_lock(&ailp->xa_lock);
866 if (lip->li_lsn == iip->ili_flush_lsn) { 930 for (blip = lip; blip; blip = blip->li_bio_list) {
867 /* xfs_trans_ail_delete() drops the AIL lock. */ 931 iip = INODE_ITEM(blip);
868 xfs_trans_ail_delete(ailp, lip); 932 if (iip->ili_logged &&
869 } else { 933 blip->li_lsn == iip->ili_flush_lsn) {
870 spin_unlock(&ailp->xa_lock); 934 log_items[i++] = blip;
935 }
936 ASSERT(i <= need_ail);
871 } 937 }
938 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
939 xfs_trans_ail_delete_bulk(ailp, log_items, i);
872 } 940 }
873 941
874 iip->ili_logged = 0;
875 942
876 /* 943 /*
877 * Clear the ili_last_fields bits now that we know that the 944 * clean up and unlock the flush lock now we are done. We can clear the
878 * data corresponding to them is safely on disk. 945 * ili_last_fields bits now that we know that the data corresponding to
946 * them is safely on disk.
879 */ 947 */
880 iip->ili_last_fields = 0; 948 for (blip = lip; blip; blip = next) {
949 next = blip->li_bio_list;
950 blip->li_bio_list = NULL;
881 951
882 /* 952 iip = INODE_ITEM(blip);
883 * Release the inode's flush lock since we're done with it. 953 iip->ili_logged = 0;
884 */ 954 iip->ili_last_fields = 0;
885 xfs_ifunlock(ip); 955 xfs_ifunlock(iip->ili_inode);
956 }
886} 957}
887 958
888/* 959/*
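The reworked xfs_iflush_done() above reduces AIL lock traffic by first splicing every matching completion off the buffer's callback list and counting, without any lock, how many items actually need AIL removal, and only then taking xa_lock once for a bulk delete. The standalone sketch below models that gather-then-one-locked-bulk-operation shape with simplified types; ail_delete_bulk() is a stub standing in for xfs_trans_ail_delete_bulk(), and the recheck of li_lsn that the real code performs under the lock is elided.

#include <pthread.h>
#include <stddef.h>

/* Simplified stand-ins for struct xfs_log_item and the AIL. */
struct item {
        struct item     *next;                  /* li_bio_list analogue */
        int             needs_ail_removal;      /* "logged and lsn matches" */
};

struct ail {
        pthread_mutex_t lock;                   /* xa_lock analogue */
};

/* Stub for xfs_trans_ail_delete_bulk(); the real helper also drops the lock. */
static void ail_delete_bulk(struct ail *ailp, struct item **batch, int count)
{
        (void)ailp; (void)batch; (void)count;
}

static void complete_items(struct ail *ailp, struct item *head)
{
        struct item     *ip;
        int             need_ail = 0;

        /* Pass 1: unlocked scan, only count what needs the AIL lock. */
        for (ip = head; ip; ip = ip->next)
                if (ip->needs_ail_removal)
                        need_ail++;

        /* Pass 2: take the lock once, remove everything in one traversal. */
        if (need_ail) {
                struct item *batch[need_ail];
                int i = 0;

                pthread_mutex_lock(&ailp->lock);
                for (ip = head; ip; ip = ip->next)
                        if (ip->needs_ail_removal)
                                batch[i++] = ip;
                ail_delete_bulk(ailp, batch, i);
                pthread_mutex_unlock(&ailp->lock);
        }

        /* Pass 3 in the real code: clear state and unlock each flush lock. */
}

int main(void)
{
        struct ail  ailp = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct item b    = { .next = NULL, .needs_ail_removal = 1 };
        struct item a    = { .next = &b,   .needs_ail_removal = 0 };

        complete_items(&ailp, &a);
        return 0;
}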
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369f..091d82b94c4d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -220,11 +101,11 @@ xfs_iomap_eof_align_last_fsb(
220} 101}
221 102
222STATIC int 103STATIC int
223xfs_cmn_err_fsblock_zero( 104xfs_alert_fsblock_zero(
224 xfs_inode_t *ip, 105 xfs_inode_t *ip,
225 xfs_bmbt_irec_t *imap) 106 xfs_bmbt_irec_t *imap)
226{ 107{
227 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 108 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
228 "Access to block zero in inode %llu " 109 "Access to block zero in inode %llu "
229 "start_block: %llx start_off: %llx " 110 "start_block: %llx start_off: %llx "
230 "blkcnt: %llx extent-state: %x\n", 111 "blkcnt: %llx extent-state: %x\n",
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -366,11 +246,10 @@ xfs_iomap_write_direct(
366 } 246 }
367 247
368 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { 248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
369 error = xfs_cmn_err_fsblock_zero(ip, imap); 249 error = xfs_alert_fsblock_zero(ip, imap);
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
 272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
 327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
347 rounddown_pow_of_two(alloc_blocks));
348
349 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
350 freesp = mp->m_sb.sb_fdblocks;
351 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
352 shift = 2;
353 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
354 shift++;
355 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
356 shift++;
357 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
358 shift++;
359 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
360 shift++;
361 }
362 if (shift)
363 alloc_blocks >>= shift;
364 }
365
366 if (alloc_blocks < mp->m_writeio_blocks)
367 alloc_blocks = mp->m_writeio_blocks;
368
369 return alloc_blocks;
370}
371
372int
439xfs_iomap_write_delay( 373xfs_iomap_write_delay(
440 xfs_inode_t *ip, 374 xfs_inode_t *ip,
441 xfs_off_t offset, 375 xfs_off_t offset,
442 size_t count, 376 size_t count,
443 int ioflag, 377 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 378{
447 xfs_mount_t *mp = ip->i_mount; 379 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 380 xfs_fileoff_t offset_fsb;
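The new xfs_iomap_prealloc_size() above scales the speculative EOF preallocation with the file size (rounded down to a power of two and capped at MAXEXTLEN) and then halves it repeatedly as free space drops through the m_low_space thresholds. The standalone sketch below reproduces only that arithmetic so it can be checked with concrete numbers; the 5/4/3/2/1-percent thresholds and the 4 KiB block size are assumptions, since m_low_space[] is populated outside this excerpt.

#include <stdint.h>
#include <stdio.h>

/* Assumed constants: 4 KiB filesystem blocks, MAXEXTLEN = 2^21 - 1 blocks. */
#define BLOCK_SIZE      4096ULL
#define MAXEXTLEN       ((1ULL << 21) - 1)

static uint64_t rounddown_pow_of_two(uint64_t n)
{
        while (n & (n - 1))
                n &= n - 1;     /* clear low bits until one remains */
        return n;
}

/*
 * Mirror of the scaling logic in the hunk above.  freesp and dblocks are in
 * filesystem blocks; the percentage thresholds are an assumption about how
 * m_low_space[] is filled in, not something visible in this hunk.
 */
static uint64_t prealloc_size(uint64_t isize_bytes, uint64_t freesp,
                              uint64_t dblocks, uint64_t writeio_blocks)
{
        uint64_t alloc_blocks = isize_bytes / BLOCK_SIZE + 1;
        int shift = 0;

        alloc_blocks = rounddown_pow_of_two(alloc_blocks);
        if (alloc_blocks > MAXEXTLEN)
                alloc_blocks = MAXEXTLEN;

        if (freesp < dblocks * 5 / 100) {
                shift = 2;
                if (freesp < dblocks * 4 / 100) shift++;
                if (freesp < dblocks * 3 / 100) shift++;
                if (freesp < dblocks * 2 / 100) shift++;
                if (freesp < dblocks * 1 / 100) shift++;
        }
        alloc_blocks >>= shift;

        return alloc_blocks < writeio_blocks ? writeio_blocks : alloc_blocks;
}

int main(void)
{
        /* 1 GiB file, 50% free: full-size preallocation, 262144 blocks (1 GiB). */
        printf("%llu\n", (unsigned long long)
               prealloc_size(1ULL << 30, 1000000, 2000000, 16));

        /* Same file, 2.5% free: shift of 4, so 16384 blocks (64 MiB). */
        printf("%llu\n", (unsigned long long)
               prealloc_size(1ULL << 30, 50000, 2000000, 16));
        return 0;
}

In other words, a 1 GiB file keeps a further 1 GiB of delalloc beyond EOF while space is plentiful, but the same write against a filesystem with about 2.5% free only preallocates 64 MiB.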
@@ -469,16 +401,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 401 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 402 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 403
404
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 405 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 406 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 407 if (error)
475 return error; 408 return error;
476 409
477retry: 410retry:
478 if (prealloc) { 411 if (prealloc) {
412 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
413
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 414 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 415 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 416 last_fsb = ioalign + alloc_blocks;
482 } else { 417 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 418 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 419 }
@@ -496,22 +431,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 431 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 432 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 433 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 434 switch (error) {
435 case 0:
436 case ENOSPC:
437 case EDQUOT:
438 break;
439 default:
500 return XFS_ERROR(error); 440 return XFS_ERROR(error);
441 }
501 442
502 /* 443 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 444 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
 504 * then we must have run out of space - flush all other inodes with 445 * ENOSPC, flush all other inodes with delalloc blocks to free up
505 * delalloc blocks and retry without EOF preallocation. 446 * some of the excess reserved metadata space. For both cases, retry
447 * without EOF preallocation.
506 */ 448 */
507 if (nimaps == 0) { 449 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 450 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 451 if (flushed)
510 return XFS_ERROR(ENOSPC); 452 return XFS_ERROR(error ? error : ENOSPC);
511 453
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 454 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 455 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 456 xfs_flush_inodes(ip);
457 xfs_ilock(ip, XFS_ILOCK_EXCL);
458 }
515 459
516 flushed = 1; 460 flushed = 1;
517 error = 0; 461 error = 0;
@@ -520,11 +464,9 @@ retry:
520 } 464 }
521 465
522 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 466 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_alert_fsblock_zero(ip, &imap[0]);
524 468
525 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 470 return 0;
529} 471}
530 472
@@ -538,13 +480,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 480 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 481 * guarantee is that whatever we allocate fills the required range.
540 */ 482 */
541STATIC int 483int
542xfs_iomap_write_allocate( 484xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 485 xfs_inode_t *ip,
544 xfs_off_t offset, 486 xfs_off_t offset,
545 size_t count, 487 size_t count,
546 xfs_bmbt_irec_t *imap, 488 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 489{
549 xfs_mount_t *mp = ip->i_mount; 490 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 491 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +498,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 498 int error = 0;
558 int nres; 499 int nres;
559 500
560 *retmap = 0;
561
562 /* 501 /*
563 * Make sure that the dquots are there. 502 * Make sure that the dquots are there.
564 */ 503 */
@@ -675,12 +614,11 @@ xfs_iomap_write_allocate(
675 * covers at least part of the callers request 614 * covers at least part of the callers request
676 */ 615 */
677 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 616 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
678 return xfs_cmn_err_fsblock_zero(ip, imap); 617 return xfs_alert_fsblock_zero(ip, imap);
679 618
680 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 621 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 622 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 623 return 0;
686 } 624 }
@@ -786,7 +724,7 @@ xfs_iomap_write_unwritten(
786 return XFS_ERROR(error); 724 return XFS_ERROR(error);
787 725
788 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 726 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
789 return xfs_cmn_err_fsblock_zero(ip, &imap); 727 return xfs_alert_fsblock_zero(ip, &imap);
790 728
791 if ((numblks_fsb = imap.br_blockcount) == 0) { 729 if ((numblks_fsb = imap.br_blockcount) == 0) {
792 /* 730 /*
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50d..80615760959a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
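With the BMAPI_* multiplexer removed, callers pick one of the three allocators above directly instead of passing mode flags to xfs_iomap(). The fragment below is a hypothetical caller, shown only to illustrate the reworked signatures: example_map_write() and its 'direct' parameter are invented, locking and transaction context are omitted, and the real call sites (the writeback and write paths) are outside this excerpt.

/*
 * Hypothetical caller, not code from this series; it compiles only against
 * the XFS headers and exists to show the new prototypes in use.
 */
static int
example_map_write(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        size_t                  count,
        int                     direct,
        struct xfs_bmbt_irec    *imap)
{
        if (direct)
                /* nmaps is now passed by value as a hint, not returned. */
                return xfs_iomap_write_direct(ip, offset, count, imap, 1);

        /* Buffered write: set up a delayed allocation mapping instead. */
        return xfs_iomap_write_delay(ip, offset, count, imap);
}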
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index dc1882adaf54..751e94fe1f77 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -204,7 +204,6 @@ xfs_bulkstat(
204 xfs_agi_t *agi; /* agi header data */ 204 xfs_agi_t *agi; /* agi header data */
205 xfs_agino_t agino; /* inode # in allocation group */ 205 xfs_agino_t agino; /* inode # in allocation group */
206 xfs_agnumber_t agno; /* allocation group number */ 206 xfs_agnumber_t agno; /* allocation group number */
207 xfs_daddr_t bno; /* inode cluster start daddr */
208 int chunkidx; /* current index into inode chunk */ 207 int chunkidx; /* current index into inode chunk */
209 int clustidx; /* current index into inode cluster */ 208 int clustidx; /* current index into inode cluster */
210 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ 209 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
@@ -463,7 +462,6 @@ xfs_bulkstat(
463 mp->m_sb.sb_inopblog); 462 mp->m_sb.sb_inopblog);
464 } 463 }
465 ino = XFS_AGINO_TO_INO(mp, agno, agino); 464 ino = XFS_AGINO_TO_INO(mp, agno, agino);
466 bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
467 /* 465 /*
468 * Skip if this inode is free. 466 * Skip if this inode is free.
469 */ 467 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9f8a9e..b612ce4520ae 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
117{
118 if (tic == tic->t_next) {
119 *qp = NULL;
120 } else {
121 *qp = tic->t_next;
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 111
126 tic->t_next = tic->t_prev = NULL; 112 space -= bytes;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 113 if (space < 0) {
114 space += log->l_logsize;
115 cycle--;
116 }
117
118 old = head_val;
119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138
139 log->l_grant_reserve_bytes -= bytes;
140 if ((log)->l_grant_reserve_bytes < 0) {
141 log->l_grant_reserve_bytes += log->l_logsize;
142 log->l_grant_reserve_cycle--;
143 }
144 132
145} 133 do {
134 int tmp;
135 int cycle, space;
146 136
147static void 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
148xlog_grant_add_space_write(struct log *log, int bytes)
149{
150 int tmp = log->l_logsize - log->l_grant_write_bytes;
151 if (tmp > bytes)
152 log->l_grant_write_bytes += bytes;
153 else {
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158 138
159static void 139 tmp = log->l_logsize - space;
160xlog_grant_add_space_reserve(struct log *log, int bytes) 140 if (tmp > bytes)
161{ 141 space += bytes;
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes; 142 else {
163 if (tmp > bytes) 143 space = bytes - tmp;
164 log->l_grant_reserve_bytes += bytes; 144 cycle++;
165 else { 145 }
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
178static void 153static void
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -399,11 +374,10 @@ xfs_log_mount(
399 int error; 374 int error;
400 375
401 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
402 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 xfs_notice(mp, "Mounting Filesystem");
403 else { 378 else {
404 cmn_err(CE_NOTE, 379 xfs_notice(mp,
405 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
406 mp->m_fsname);
407 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 381 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
408 } 382 }
409 383
@@ -418,7 +392,7 @@ xfs_log_mount(
418 */ 392 */
419 error = xfs_trans_ail_init(mp); 393 error = xfs_trans_ail_init(mp);
420 if (error) { 394 if (error) {
421 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 395 xfs_warn(mp, "AIL initialisation failed: error %d", error);
422 goto out_free_log; 396 goto out_free_log;
423 } 397 }
424 mp->m_log->l_ailp = mp->m_ail; 398 mp->m_log->l_ailp = mp->m_ail;
@@ -438,7 +412,8 @@ xfs_log_mount(
438 if (readonly) 412 if (readonly)
439 mp->m_flags |= XFS_MOUNT_RDONLY; 413 mp->m_flags |= XFS_MOUNT_RDONLY;
440 if (error) { 414 if (error) {
441 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 415 xfs_warn(mp, "log mount/recovery failed: error %d",
416 error);
442 goto out_destroy_ail; 417 goto out_destroy_ail;
443 } 418 }
444 } 419 }
@@ -567,10 +542,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
567 */ 542 */
568 } 543 }
569 544
570 if (error) { 545 if (error)
571 xfs_fs_cmn_err(CE_ALERT, mp, 546 xfs_alert(mp, "%s: unmount record failed", __func__);
572 "xfs_log_unmount: unmount record failed");
573 }
574 547
575 548
576 spin_lock(&log->l_icloglock); 549 spin_lock(&log->l_icloglock);
@@ -584,8 +557,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 557 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 558 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 559 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 560 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 561 &log->l_icloglock);
589 } else { 562 } else {
590 spin_unlock(&log->l_icloglock); 563 spin_unlock(&log->l_icloglock);
591 } 564 }
@@ -625,8 +598,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 598 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 599 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 600
628 sv_wait(&iclog->ic_force_wait, PMEM, 601 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 602 &log->l_icloglock);
630 } else { 603 } else {
631 spin_unlock(&log->l_icloglock); 604 spin_unlock(&log->l_icloglock);
632 } 605 }
@@ -703,55 +676,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 676{
704 xlog_ticket_t *tic; 677 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 678 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 679 int need_bytes, free_bytes;
707 680
708 if (XLOG_FORCED_SHUTDOWN(log)) 681 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 682 return;
710 683
711 if (tail_lsn == 0) { 684 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 685 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717
718 spin_lock(&log->l_grant_lock);
719 686
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 687 /* tail_lsn == 1 implies that we weren't passed a valid value. */
721 * tail_lsn. 688 if (tail_lsn != 1)
722 */ 689 atomic64_set(&log->l_tail_lsn, tail_lsn);
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 690
727 if ((tic = log->l_write_headq)) { 691 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 692#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 693 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 694 panic("Recovery problem");
731#endif 695#endif
732 cycle = log->l_grant_write_cycle; 696 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 697 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 698 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 699 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 700
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 701 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 702 break;
740 tail_lsn = 0; 703 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 704 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 705 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 706 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 707 }
708 spin_unlock(&log->l_grant_write_lock);
745 } 709 }
746 if ((tic = log->l_reserve_headq)) { 710
711 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 712#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 713 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 714 panic("Recovery problem");
750#endif 715#endif
751 cycle = log->l_grant_reserve_cycle; 716 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 717 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 718 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 719 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 720 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 721 else
@@ -760,12 +724,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 724 break;
761 tail_lsn = 0; 725 tail_lsn = 0;
762 free_bytes -= need_bytes; 726 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 727 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 728 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 729 }
730 spin_unlock(&log->l_grant_reserve_lock);
766 } 731 }
767 spin_unlock(&log->l_grant_lock); 732}
768} /* xfs_log_move_tail */
769 733
770/* 734/*
771 * Determine if we have a transaction that has gone to disk 735 * Determine if we have a transaction that has gone to disk
@@ -797,7 +761,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
797 break; 761 break;
798 case XLOG_STATE_COVER_NEED: 762 case XLOG_STATE_COVER_NEED:
799 case XLOG_STATE_COVER_NEED2: 763 case XLOG_STATE_COVER_NEED2:
800 if (!xfs_trans_ail_tail(log->l_ailp) && 764 if (!xfs_ail_min_lsn(log->l_ailp) &&
801 xlog_iclogs_empty(log)) { 765 xlog_iclogs_empty(log)) {
802 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 766 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
803 log->l_covered_state = XLOG_STATE_COVER_DONE; 767 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -831,23 +795,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 795 * We may be holding the log iclog lock upon entering this routine.
832 */ 796 */
833xfs_lsn_t 797xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 798xlog_assign_tail_lsn(
799 struct xfs_mount *mp)
835{ 800{
836 xfs_lsn_t tail_lsn; 801 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 802 struct log *log = mp->m_log;
838 803
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 804 tail_lsn = xfs_ail_min_lsn(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 805 if (!tail_lsn)
841 if (tail_lsn != 0) { 806 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 807
808 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 809 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 810}
850
851 811
852/* 812/*
853 * Return the space in the log between the tail and the head. The head 813 * Return the space in the log between the tail and the head. The head
@@ -864,37 +824,42 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 824 * result is that we return the size of the log as the amount of space left.
865 */ 825 */
866STATIC int 826STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 827xlog_space_left(
868{ 828 struct log *log,
869 int free_bytes; 829 atomic64_t *head)
870 int tail_bytes; 830{
871 int tail_cycle; 831 int free_bytes;
872 832 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 833 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 834 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 835 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 836
877 } else if ((tail_cycle + 1) < cycle) { 837 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
838 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
839 tail_bytes = BBTOB(tail_bytes);
840 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
841 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
842 else if (tail_cycle + 1 < head_cycle)
878 return 0; 843 return 0;
879 } else if (tail_cycle < cycle) { 844 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 845 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 846 free_bytes = tail_bytes - head_bytes;
882 } else { 847 } else {
883 /* 848 /*
884 * The reservation head is behind the tail. 849 * The reservation head is behind the tail.
885 * In this case we just want to return the size of the 850 * In this case we just want to return the size of the
886 * log as the amount of space left. 851 * log as the amount of space left.
887 */ 852 */
888 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 853 xfs_alert(log->l_mp,
889 "xlog_space_left: head behind tail\n" 854 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 855 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 856 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 857 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 858 ASSERT(0);
894 free_bytes = log->l_logsize; 859 free_bytes = log->l_logsize;
895 } 860 }
896 return free_bytes; 861 return free_bytes;
897} /* xlog_space_left */ 862}
898 863
899 864
900/* 865/*
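
The rewritten xlog_space_left() above no longer takes a (cycle, bytes) pair; it cracks a single 64-bit grant head into those two values. A minimal, self-contained sketch of that packing, with the cycle in the high 32 bits and the byte offset in the low 32 bits — the layout is an assumption inferred from xlog_assign_lsn() in the xfs_log_priv.h hunk further down, and grant_head_val()/crack_grant_head() are illustrative stand-ins for the real xlog_assign_grant_head()/xlog_crack_grant_head() helpers:

#include <assert.h>
#include <stdint.h>

/* Pack a grant head: cycle number in the high 32 bits, byte offset low. */
static inline int64_t grant_head_val(int cycle, int bytes)
{
	return ((int64_t)cycle << 32) | (uint32_t)bytes;
}

/* Crack it back apart, the way xlog_space_left() consumes it above. */
static inline void crack_grant_head(int64_t val, int *cycle, int *bytes)
{
	*cycle = val >> 32;
	*bytes = val & 0xffffffff;
}

int main(void)
{
	int cycle, bytes;

	crack_grant_head(grant_head_val(7, 51200), &cycle, &bytes);
	assert(cycle == 7 && bytes == 51200);
	return 0;
}

Packing both halves into one atomic64_t is what lets the space calculation run without l_grant_lock: a single atomic read yields a mutually consistent cycle/bytes pair.
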
@@ -1034,7 +999,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1034 999
1035 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1000 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1036 if (!log) { 1001 if (!log) {
1037 xlog_warn("XFS: Log allocation failed: No memory!"); 1002 xfs_warn(mp, "Log allocation failed: No memory!");
1038 goto out; 1003 goto out;
1039 } 1004 }
1040 1005
@@ -1047,35 +1012,39 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1012 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1048 1013
1049 log->l_prev_block = -1; 1014 log->l_prev_block = -1;
1050 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1051 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1015 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1052 log->l_last_sync_lsn = log->l_tail_lsn; 1016 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1017 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1053 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1018 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1054 log->l_grant_reserve_cycle = 1; 1019 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1055 log->l_grant_write_cycle = 1; 1020 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1021 INIT_LIST_HEAD(&log->l_reserveq);
1022 INIT_LIST_HEAD(&log->l_writeq);
1023 spin_lock_init(&log->l_grant_reserve_lock);
1024 spin_lock_init(&log->l_grant_write_lock);
1056 1025
1057 error = EFSCORRUPTED; 1026 error = EFSCORRUPTED;
1058 if (xfs_sb_version_hassector(&mp->m_sb)) { 1027 if (xfs_sb_version_hassector(&mp->m_sb)) {
1059 log2_size = mp->m_sb.sb_logsectlog; 1028 log2_size = mp->m_sb.sb_logsectlog;
1060 if (log2_size < BBSHIFT) { 1029 if (log2_size < BBSHIFT) {
1061 xlog_warn("XFS: Log sector size too small " 1030 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1062 "(0x%x < 0x%x)", log2_size, BBSHIFT); 1031 log2_size, BBSHIFT);
1063 goto out_free_log; 1032 goto out_free_log;
1064 } 1033 }
1065 1034
1066 log2_size -= BBSHIFT; 1035 log2_size -= BBSHIFT;
1067 if (log2_size > mp->m_sectbb_log) { 1036 if (log2_size > mp->m_sectbb_log) {
1068 xlog_warn("XFS: Log sector size too large " 1037 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1069 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); 1038 log2_size, mp->m_sectbb_log);
1070 goto out_free_log; 1039 goto out_free_log;
1071 } 1040 }
1072 1041
1073 /* for larger sector sizes, must have v2 or external log */ 1042 /* for larger sector sizes, must have v2 or external log */
1074 if (log2_size && log->l_logBBstart > 0 && 1043 if (log2_size && log->l_logBBstart > 0 &&
1075 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1044 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1076 1045 xfs_warn(mp,
1077 xlog_warn("XFS: log sector size (0x%x) invalid " 1046 "log sector size (0x%x) invalid for configuration.",
1078 "for configuration.", log2_size); 1047 log2_size);
1079 goto out_free_log; 1048 goto out_free_log;
1080 } 1049 }
1081 } 1050 }
@@ -1094,8 +1063,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1094 log->l_xbuf = bp; 1063 log->l_xbuf = bp;
1095 1064
1096 spin_lock_init(&log->l_icloglock); 1065 spin_lock_init(&log->l_icloglock);
1097 spin_lock_init(&log->l_grant_lock); 1066 init_waitqueue_head(&log->l_flush_wait);
1098 sv_init(&log->l_flush_wait, 0, "flush_wait");
1099 1067
1100 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1068 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1101 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1069 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1119,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1151 1119
1152 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1120 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1153 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1121 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1154 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1122 init_waitqueue_head(&iclog->ic_force_wait);
1155 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1123 init_waitqueue_head(&iclog->ic_write_wait);
1156 1124
1157 iclogp = &iclog->ic_next; 1125 iclogp = &iclog->ic_next;
1158 } 1126 }
@@ -1167,15 +1135,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1167out_free_iclog: 1135out_free_iclog:
1168 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1136 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1169 prev_iclog = iclog->ic_next; 1137 prev_iclog = iclog->ic_next;
1170 if (iclog->ic_bp) { 1138 if (iclog->ic_bp)
1171 sv_destroy(&iclog->ic_force_wait);
1172 sv_destroy(&iclog->ic_write_wait);
1173 xfs_buf_free(iclog->ic_bp); 1139 xfs_buf_free(iclog->ic_bp);
1174 }
1175 kmem_free(iclog); 1140 kmem_free(iclog);
1176 } 1141 }
1177 spinlock_destroy(&log->l_icloglock); 1142 spinlock_destroy(&log->l_icloglock);
1178 spinlock_destroy(&log->l_grant_lock);
1179 xfs_buf_free(log->l_xbuf); 1143 xfs_buf_free(log->l_xbuf);
1180out_free_log: 1144out_free_log:
1181 kmem_free(log); 1145 kmem_free(log);
@@ -1223,61 +1187,60 @@ xlog_commit_record(
1223 * water mark. In this manner, we would be creating a low water mark. 1187 * water mark. In this manner, we would be creating a low water mark.
1224 */ 1188 */
1225STATIC void 1189STATIC void
1226xlog_grant_push_ail(xfs_mount_t *mp, 1190xlog_grant_push_ail(
1227 int need_bytes) 1191 struct log *log,
1192 int need_bytes)
1228{ 1193{
1229 xlog_t *log = mp->m_log; /* pointer to the log */ 1194 xfs_lsn_t threshold_lsn = 0;
1230 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1195 xfs_lsn_t last_sync_lsn;
1231 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1196 int free_blocks;
1232 int free_blocks; /* free blocks left to write to */ 1197 int free_bytes;
1233 int free_bytes; /* free bytes left to write to */ 1198 int threshold_block;
1234 int threshold_block; /* block in lsn we'd like to be at */ 1199 int threshold_cycle;
1235 int threshold_cycle; /* lsn cycle we'd like to be at */ 1200 int free_threshold;
1236 int free_threshold; 1201
1237 1202 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1238 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1203
1239 1204 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1240 spin_lock(&log->l_grant_lock); 1205 free_blocks = BTOBBT(free_bytes);
1241 free_bytes = xlog_space_left(log, 1206
1242 log->l_grant_reserve_cycle, 1207 /*
1243 log->l_grant_reserve_bytes); 1208 * Set the threshold for the minimum number of free blocks in the
1244 tail_lsn = log->l_tail_lsn; 1209 * log to the maximum of what the caller needs, one quarter of the
1245 free_blocks = BTOBBT(free_bytes); 1210 * log, and 256 blocks.
1246 1211 */
1247 /* 1212 free_threshold = BTOBB(need_bytes);
1248 * Set the threshold for the minimum number of free blocks in the 1213 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1249 * log to the maximum of what the caller needs, one quarter of the 1214 free_threshold = MAX(free_threshold, 256);
1250 * log, and 256 blocks. 1215 if (free_blocks >= free_threshold)
1251 */ 1216 return;
1252 free_threshold = BTOBB(need_bytes); 1217
1253 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1218 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1254 free_threshold = MAX(free_threshold, 256); 1219 &threshold_block);
1255 if (free_blocks < free_threshold) { 1220 threshold_block += free_threshold;
1256 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1257 threshold_cycle = CYCLE_LSN(tail_lsn);
1258 if (threshold_block >= log->l_logBBsize) { 1221 if (threshold_block >= log->l_logBBsize) {
1259 threshold_block -= log->l_logBBsize; 1222 threshold_block -= log->l_logBBsize;
1260 threshold_cycle += 1; 1223 threshold_cycle += 1;
1261 } 1224 }
1262 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1225 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1226 threshold_block);
1227 /*
1228 * Don't pass in an lsn greater than the lsn of the last
1229 * log record known to be on disk. Use a snapshot of the last sync lsn
1230 * so that it doesn't change between the compare and the set.
1231 */
1232 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1233 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1234 threshold_lsn = last_sync_lsn;
1263 1235
1264 /* Don't pass in an lsn greater than the lsn of the last 1236 /*
1265 * log record known to be on disk. 1237 * Get the transaction layer to kick the dirty buffers out to
1238 * disk asynchronously. No point in trying to do this if
1239 * the filesystem is shutting down.
1266 */ 1240 */
1267 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1241 if (!XLOG_FORCED_SHUTDOWN(log))
1268 threshold_lsn = log->l_last_sync_lsn; 1242 xfs_ail_push(log->l_ailp, threshold_lsn);
1269 } 1243}
1270 spin_unlock(&log->l_grant_lock);
1271
1272 /*
1273 * Get the transaction layer to kick the dirty buffers out to
1274 * disk asynchronously. No point in trying to do this if
1275 * the filesystem is shutting down.
1276 */
1277 if (threshold_lsn &&
1278 !XLOG_FORCED_SHUTDOWN(log))
1279 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1280} /* xlog_grant_push_ail */
1281 1244
1282/* 1245/*
1283 * The bdstrat callback function for log bufs. This gives us a central 1246 * The bdstrat callback function for log bufs. This gives us a central
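
The new xlog_grant_push_ail() above clamps its push target against a single snapshot of l_last_sync_lsn, so the value it compares is the same value it assigns. A small sketch of that snapshot-then-clamp pattern, assuming C11 atomics and a plain integer comparison standing in for XFS_LSN_CMP (equivalent here because an LSN packs the cycle above the block number, so a larger packed value is a later LSN):

#include <stdatomic.h>
#include <stdint.h>

/*
 * Clamp a proposed AIL push target to the last LSN known to be on disk.
 * Reading the atomic exactly once into a local keeps the compare and the
 * clamp working on the same value, even if the shared LSN moves meanwhile.
 */
static int64_t clamp_push_target(_Atomic int64_t *last_sync_lsn,
				 int64_t threshold_lsn)
{
	int64_t snapshot = atomic_load(last_sync_lsn);	/* one read only */

	if (threshold_lsn > snapshot)
		threshold_lsn = snapshot;
	return threshold_lsn;
}
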
@@ -1372,9 +1335,8 @@ xlog_sync(xlog_t *log,
1372 roundoff < BBTOB(1))); 1335 roundoff < BBTOB(1)));
1373 1336
1374 /* move grant heads by roundoff in sync */ 1337 /* move grant heads by roundoff in sync */
1375 spin_lock(&log->l_grant_lock); 1338 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1376 xlog_grant_add_space(log, roundoff); 1339 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1377 spin_unlock(&log->l_grant_lock);
1378 1340
1379 /* put cycle number in every block */ 1341 /* put cycle number in every block */
1380 xlog_pack_data(log, iclog, roundoff); 1342 xlog_pack_data(log, iclog, roundoff);
@@ -1489,15 +1451,12 @@ xlog_dealloc_log(xlog_t *log)
1489 1451
1490 iclog = log->l_iclog; 1452 iclog = log->l_iclog;
1491 for (i=0; i<log->l_iclog_bufs; i++) { 1453 for (i=0; i<log->l_iclog_bufs; i++) {
1492 sv_destroy(&iclog->ic_force_wait);
1493 sv_destroy(&iclog->ic_write_wait);
1494 xfs_buf_free(iclog->ic_bp); 1454 xfs_buf_free(iclog->ic_bp);
1495 next_iclog = iclog->ic_next; 1455 next_iclog = iclog->ic_next;
1496 kmem_free(iclog); 1456 kmem_free(iclog);
1497 iclog = next_iclog; 1457 iclog = next_iclog;
1498 } 1458 }
1499 spinlock_destroy(&log->l_icloglock); 1459 spinlock_destroy(&log->l_icloglock);
1500 spinlock_destroy(&log->l_grant_lock);
1501 1460
1502 xfs_buf_free(log->l_xbuf); 1461 xfs_buf_free(log->l_xbuf);
1503 log->l_mp->m_log = NULL; 1462 log->l_mp->m_log = NULL;
@@ -1602,38 +1561,36 @@ xlog_print_tic_res(
1602 "SWAPEXT" 1561 "SWAPEXT"
1603 }; 1562 };
1604 1563
1605 xfs_fs_cmn_err(CE_WARN, mp, 1564 xfs_warn(mp,
1606 "xfs_log_write: reservation summary:\n" 1565 "xfs_log_write: reservation summary:\n"
1607 " trans type = %s (%u)\n" 1566 " trans type = %s (%u)\n"
1608 " unit res = %d bytes\n" 1567 " unit res = %d bytes\n"
1609 " current res = %d bytes\n" 1568 " current res = %d bytes\n"
1610 " total reg = %u bytes (o/flow = %u bytes)\n" 1569 " total reg = %u bytes (o/flow = %u bytes)\n"
1611 " ophdrs = %u (ophdr space = %u bytes)\n" 1570 " ophdrs = %u (ophdr space = %u bytes)\n"
1612 " ophdr + reg = %u bytes\n" 1571 " ophdr + reg = %u bytes\n"
1613 " num regions = %u\n", 1572 " num regions = %u\n",
1614 ((ticket->t_trans_type <= 0 || 1573 ((ticket->t_trans_type <= 0 ||
1615 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 1574 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
1616 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 1575 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
1617 ticket->t_trans_type, 1576 ticket->t_trans_type,
1618 ticket->t_unit_res, 1577 ticket->t_unit_res,
1619 ticket->t_curr_res, 1578 ticket->t_curr_res,
1620 ticket->t_res_arr_sum, ticket->t_res_o_flow, 1579 ticket->t_res_arr_sum, ticket->t_res_o_flow,
1621 ticket->t_res_num_ophdrs, ophdr_spc, 1580 ticket->t_res_num_ophdrs, ophdr_spc,
1622 ticket->t_res_arr_sum + 1581 ticket->t_res_arr_sum +
1623 ticket->t_res_o_flow + ophdr_spc, 1582 ticket->t_res_o_flow + ophdr_spc,
1624 ticket->t_res_num); 1583 ticket->t_res_num);
1625 1584
1626 for (i = 0; i < ticket->t_res_num; i++) { 1585 for (i = 0; i < ticket->t_res_num; i++) {
1627 uint r_type = ticket->t_res_arr[i].r_type; 1586 uint r_type = ticket->t_res_arr[i].r_type;
1628 cmn_err(CE_WARN, 1587 xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
1629 "region[%u]: %s - %u bytes\n",
1630 i,
1631 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 1588 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
1632 "bad-rtype" : res_type_str[r_type-1]), 1589 "bad-rtype" : res_type_str[r_type-1]),
1633 ticket->t_res_arr[i].r_len); 1590 ticket->t_res_arr[i].r_len);
1634 } 1591 }
1635 1592
1636 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, 1593 xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1637 "xfs_log_write: reservation ran out. Need to up reservation"); 1594 "xfs_log_write: reservation ran out. Need to up reservation");
1638 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1595 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1639} 1596}
@@ -1721,7 +1678,7 @@ xlog_write_setup_ophdr(
1721 case XFS_LOG: 1678 case XFS_LOG:
1722 break; 1679 break;
1723 default: 1680 default:
1724 xfs_fs_cmn_err(CE_WARN, log->l_mp, 1681 xfs_warn(log->l_mp,
1725 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1682 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1726 ophdr->oh_clientid, ticket); 1683 ophdr->oh_clientid, ticket);
1727 return NULL; 1684 return NULL;
@@ -2232,7 +2189,7 @@ xlog_state_do_callback(
2232 lowest_lsn = xlog_get_lowest_lsn(log); 2189 lowest_lsn = xlog_get_lowest_lsn(log);
2233 if (lowest_lsn && 2190 if (lowest_lsn &&
2234 XFS_LSN_CMP(lowest_lsn, 2191 XFS_LSN_CMP(lowest_lsn,
2235 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2192 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2236 iclog = iclog->ic_next; 2193 iclog = iclog->ic_next;
2237 continue; /* Leave this iclog for 2194 continue; /* Leave this iclog for
2238 * another thread */ 2195 * another thread */
@@ -2240,23 +2197,21 @@ xlog_state_do_callback(
2240 2197
2241 iclog->ic_state = XLOG_STATE_CALLBACK; 2198 iclog->ic_state = XLOG_STATE_CALLBACK;
2242 2199
2243 spin_unlock(&log->l_icloglock);
2244 2200
2245 /* l_last_sync_lsn field protected by 2201 /*
2246 * l_grant_lock. Don't worry about iclog's lsn. 2202 * update the last_sync_lsn before we drop the
2247 * No one else can be here except us. 2203 * icloglock to ensure we are the only one that
2204 * can update it.
2248 */ 2205 */
2249 spin_lock(&log->l_grant_lock); 2206 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2250 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2207 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2251 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2208 atomic64_set(&log->l_last_sync_lsn,
2252 log->l_last_sync_lsn = 2209 be64_to_cpu(iclog->ic_header.h_lsn));
2253 be64_to_cpu(iclog->ic_header.h_lsn);
2254 spin_unlock(&log->l_grant_lock);
2255 2210
2256 } else { 2211 } else
2257 spin_unlock(&log->l_icloglock);
2258 ioerrors++; 2212 ioerrors++;
2259 } 2213
2214 spin_unlock(&log->l_icloglock);
2260 2215
2261 /* 2216 /*
2262 * Keep processing entries in the callback list until 2217 * Keep processing entries in the callback list until
@@ -2297,7 +2252,7 @@ xlog_state_do_callback(
2297 xlog_state_clean_log(log); 2252 xlog_state_clean_log(log);
2298 2253
2299 /* wake up threads waiting in xfs_log_force() */ 2254 /* wake up threads waiting in xfs_log_force() */
2300 sv_broadcast(&iclog->ic_force_wait); 2255 wake_up_all(&iclog->ic_force_wait);
2301 2256
2302 iclog = iclog->ic_next; 2257 iclog = iclog->ic_next;
2303 } while (first_iclog != iclog); 2258 } while (first_iclog != iclog);
@@ -2305,7 +2260,7 @@ xlog_state_do_callback(
2305 if (repeats > 5000) { 2260 if (repeats > 5000) {
2306 flushcnt += repeats; 2261 flushcnt += repeats;
2307 repeats = 0; 2262 repeats = 0;
2308 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2263 xfs_warn(log->l_mp,
2309 "%s: possible infinite loop (%d iterations)", 2264 "%s: possible infinite loop (%d iterations)",
2310 __func__, flushcnt); 2265 __func__, flushcnt);
2311 } 2266 }
@@ -2344,7 +2299,7 @@ xlog_state_do_callback(
2344 spin_unlock(&log->l_icloglock); 2299 spin_unlock(&log->l_icloglock);
2345 2300
2346 if (wake) 2301 if (wake)
2347 sv_broadcast(&log->l_flush_wait); 2302 wake_up_all(&log->l_flush_wait);
2348} 2303}
2349 2304
2350 2305
@@ -2395,7 +2350,7 @@ xlog_state_done_syncing(
2395 * iclog buffer, we wake them all, one will get to do the 2350 * iclog buffer, we wake them all, one will get to do the
2396 * I/O, the others get to wait for the result. 2351 * I/O, the others get to wait for the result.
2397 */ 2352 */
2398 sv_broadcast(&iclog->ic_write_wait); 2353 wake_up_all(&iclog->ic_write_wait);
2399 spin_unlock(&log->l_icloglock); 2354 spin_unlock(&log->l_icloglock);
2400 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2355 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2401} /* xlog_state_done_syncing */ 2356} /* xlog_state_done_syncing */
@@ -2444,7 +2399,7 @@ restart:
2444 XFS_STATS_INC(xs_log_noiclogs); 2399 XFS_STATS_INC(xs_log_noiclogs);
2445 2400
2446 /* Wait for log writes to have flushed */ 2401 /* Wait for log writes to have flushed */
2447 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2402 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2448 goto restart; 2403 goto restart;
2449 } 2404 }
2450 2405
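
Every sv_wait() call in this file becomes xlog_wait(), which sleeps on an ordinary wait_queue_head_t while giving up the spinlock that protects the queue. A sketch of what such a helper has to do — the body below is an assumption about the helper this series adds to xfs_log_priv.h, not a quote of it:

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/*
 * Queue the task exclusively, mark it sleeping, and only then drop the
 * lock and schedule.  Doing it in this order means a concurrent wake_up()
 * cannot be lost between releasing the lock and going to sleep.
 */
static inline void xlog_wait_sketch(wait_queue_head_t *wq, spinlock_t *lock)
	__releases(lock)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(wq, &wait);
	__set_current_state(TASK_UNINTERRUPTIBLE);
	spin_unlock(lock);
	schedule();
	remove_wait_queue(wq, &wait);
}

The lock is not re-taken on return, which is why the grant paths below jump back to their redo: labels and re-evaluate free space after waking.
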
@@ -2527,6 +2482,18 @@ restart:
2527 * 2482 *
2528 * Once a ticket gets put onto the reserveq, it will only return after 2483 * Once a ticket gets put onto the reserveq, it will only return after
2529 * the needed reservation is satisfied. 2484 * the needed reservation is satisfied.
2485 *
2486 * This function is structured so that it has a lock free fast path. This is
2487 * necessary because every new transaction reservation will come through this
2488 * path. Hence any lock will be globally hot if we take it unconditionally on
2489 * every pass.
2490 *
2491 * As tickets are only ever moved on and off the reserveq under the
2492 * l_grant_reserve_lock, we only need to take that lock if we are going
2493 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2494 * ticket was never added to the reserveq because the t_queue list head will be
2495 * empty and we hold the only reference to it so it can safely be checked
2496 * unlocked.
2530 */ 2497 */
2531STATIC int 2498STATIC int
2532xlog_grant_log_space(xlog_t *log, 2499xlog_grant_log_space(xlog_t *log,
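
The comment block above describes the lock-free fast path that the following hunks implement: peek at the reserve queue without the lock, and take l_grant_reserve_lock only once queueing looks necessary, rechecking under the lock. The shape of that check, reduced to a sketch (maybe_queue_ticket() is an illustrative name; the list and lock stand in for l_reserveq and l_grant_reserve_lock, and the real code keeps the lock held until it sleeps):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/*
 * Unlocked peek first; the lock is only taken on the slow path.  Because
 * the peek can race with the last waiter dequeueing itself, the emptiness
 * test is repeated under the lock before the ticket is added.
 */
static bool maybe_queue_ticket(spinlock_t *lock, struct list_head *queue,
			       struct list_head *ticket)
{
	if (list_empty_careful(queue))
		return false;			/* fast path: no waiters */

	spin_lock(lock);
	if (list_empty(queue)) {		/* raced: queue just drained */
		spin_unlock(lock);
		return false;
	}
	list_add_tail(ticket, queue);		/* join the line of waiters */
	spin_unlock(lock);
	return true;
}
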
@@ -2534,24 +2501,27 @@ xlog_grant_log_space(xlog_t *log,
2534{ 2501{
2535 int free_bytes; 2502 int free_bytes;
2536 int need_bytes; 2503 int need_bytes;
2537#ifdef DEBUG
2538 xfs_lsn_t tail_lsn;
2539#endif
2540
2541 2504
2542#ifdef DEBUG 2505#ifdef DEBUG
2543 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2506 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2544 panic("grant Recovery problem"); 2507 panic("grant Recovery problem");
2545#endif 2508#endif
2546 2509
2547 /* Is there space or do we need to sleep? */
2548 spin_lock(&log->l_grant_lock);
2549
2550 trace_xfs_log_grant_enter(log, tic); 2510 trace_xfs_log_grant_enter(log, tic);
2551 2511
2512 need_bytes = tic->t_unit_res;
2513 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2514 need_bytes *= tic->t_ocnt;
2515
2552 /* something is already sleeping; insert new transaction at end */ 2516 /* something is already sleeping; insert new transaction at end */
2553 if (log->l_reserve_headq) { 2517 if (!list_empty_careful(&log->l_reserveq)) {
2554 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2518 spin_lock(&log->l_grant_reserve_lock);
2519 /* recheck the queue now we are locked */
2520 if (list_empty(&log->l_reserveq)) {
2521 spin_unlock(&log->l_grant_reserve_lock);
2522 goto redo;
2523 }
2524 list_add_tail(&tic->t_queue, &log->l_reserveq);
2555 2525
2556 trace_xfs_log_grant_sleep1(log, tic); 2526 trace_xfs_log_grant_sleep1(log, tic);
2557 2527
@@ -2563,72 +2533,57 @@ xlog_grant_log_space(xlog_t *log,
2563 goto error_return; 2533 goto error_return;
2564 2534
2565 XFS_STATS_INC(xs_sleep_logspace); 2535 XFS_STATS_INC(xs_sleep_logspace);
2566 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2536 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2537
2567 /* 2538 /*
2568 * If we got an error, and the filesystem is shutting down, 2539 * If we got an error, and the filesystem is shutting down,
2569 * we'll catch it down below. So just continue... 2540 * we'll catch it down below. So just continue...
2570 */ 2541 */
2571 trace_xfs_log_grant_wake1(log, tic); 2542 trace_xfs_log_grant_wake1(log, tic);
2572 spin_lock(&log->l_grant_lock);
2573 } 2543 }
2574 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2575 need_bytes = tic->t_unit_res*tic->t_ocnt;
2576 else
2577 need_bytes = tic->t_unit_res;
2578 2544
2579redo: 2545redo:
2580 if (XLOG_FORCED_SHUTDOWN(log)) 2546 if (XLOG_FORCED_SHUTDOWN(log))
2581 goto error_return; 2547 goto error_return_unlocked;
2582 2548
2583 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2549 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2584 log->l_grant_reserve_bytes);
2585 if (free_bytes < need_bytes) { 2550 if (free_bytes < need_bytes) {
2586 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2551 spin_lock(&log->l_grant_reserve_lock);
2587 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2552 if (list_empty(&tic->t_queue))
2553 list_add_tail(&tic->t_queue, &log->l_reserveq);
2588 2554
2589 trace_xfs_log_grant_sleep2(log, tic); 2555 trace_xfs_log_grant_sleep2(log, tic);
2590 2556
2591 spin_unlock(&log->l_grant_lock);
2592 xlog_grant_push_ail(log->l_mp, need_bytes);
2593 spin_lock(&log->l_grant_lock);
2594
2595 XFS_STATS_INC(xs_sleep_logspace);
2596 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2597
2598 spin_lock(&log->l_grant_lock);
2599 if (XLOG_FORCED_SHUTDOWN(log)) 2557 if (XLOG_FORCED_SHUTDOWN(log))
2600 goto error_return; 2558 goto error_return;
2601 2559
2602 trace_xfs_log_grant_wake2(log, tic); 2560 xlog_grant_push_ail(log, need_bytes);
2603 2561
2562 XFS_STATS_INC(xs_sleep_logspace);
2563 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2564
2565 trace_xfs_log_grant_wake2(log, tic);
2604 goto redo; 2566 goto redo;
2605 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2567 }
2606 xlog_del_ticketq(&log->l_reserve_headq, tic);
2607 2568
2608 /* we've got enough space */ 2569 if (!list_empty(&tic->t_queue)) {
2609 xlog_grant_add_space(log, need_bytes); 2570 spin_lock(&log->l_grant_reserve_lock);
2610#ifdef DEBUG 2571 list_del_init(&tic->t_queue);
2611 tail_lsn = log->l_tail_lsn; 2572 spin_unlock(&log->l_grant_reserve_lock);
2612 /*
2613 * Check to make sure the grant write head didn't just over lap the
2614 * tail. If the cycles are the same, we can't be overlapping.
2615 * Otherwise, make sure that the cycles differ by exactly one and
2616 * check the byte count.
2617 */
2618 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2619 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2620 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2621 } 2573 }
2622#endif 2574
2575 /* we've got enough space */
2576 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2577 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2623 trace_xfs_log_grant_exit(log, tic); 2578 trace_xfs_log_grant_exit(log, tic);
2624 xlog_verify_grant_head(log, 1); 2579 xlog_verify_grant_tail(log);
2625 spin_unlock(&log->l_grant_lock);
2626 return 0; 2580 return 0;
2627 2581
2628 error_return: 2582error_return_unlocked:
2629 if (tic->t_flags & XLOG_TIC_IN_Q) 2583 spin_lock(&log->l_grant_reserve_lock);
2630 xlog_del_ticketq(&log->l_reserve_headq, tic); 2584error_return:
2631 2585 list_del_init(&tic->t_queue);
2586 spin_unlock(&log->l_grant_reserve_lock);
2632 trace_xfs_log_grant_error(log, tic); 2587 trace_xfs_log_grant_error(log, tic);
2633 2588
2634 /* 2589 /*
@@ -2638,7 +2593,6 @@ redo:
2638 */ 2593 */
2639 tic->t_curr_res = 0; 2594 tic->t_curr_res = 0;
2640 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2595 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2641 spin_unlock(&log->l_grant_lock);
2642 return XFS_ERROR(EIO); 2596 return XFS_ERROR(EIO);
2643} /* xlog_grant_log_space */ 2597} /* xlog_grant_log_space */
2644 2598
@@ -2646,17 +2600,14 @@ redo:
2646/* 2600/*
2647 * Replenish the byte reservation required by moving the grant write head. 2601 * Replenish the byte reservation required by moving the grant write head.
2648 * 2602 *
2649 * 2603 * Similar to xlog_grant_log_space, the function is structured to have a lock
2604 * free fast path.
2650 */ 2605 */
2651STATIC int 2606STATIC int
2652xlog_regrant_write_log_space(xlog_t *log, 2607xlog_regrant_write_log_space(xlog_t *log,
2653 xlog_ticket_t *tic) 2608 xlog_ticket_t *tic)
2654{ 2609{
2655 int free_bytes, need_bytes; 2610 int free_bytes, need_bytes;
2656 xlog_ticket_t *ntic;
2657#ifdef DEBUG
2658 xfs_lsn_t tail_lsn;
2659#endif
2660 2611
2661 tic->t_curr_res = tic->t_unit_res; 2612 tic->t_curr_res = tic->t_unit_res;
2662 xlog_tic_reset_res(tic); 2613 xlog_tic_reset_res(tic);
@@ -2669,12 +2620,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2669 panic("regrant Recovery problem"); 2620 panic("regrant Recovery problem");
2670#endif 2621#endif
2671 2622
2672 spin_lock(&log->l_grant_lock);
2673
2674 trace_xfs_log_regrant_write_enter(log, tic); 2623 trace_xfs_log_regrant_write_enter(log, tic);
2675
2676 if (XLOG_FORCED_SHUTDOWN(log)) 2624 if (XLOG_FORCED_SHUTDOWN(log))
2677 goto error_return; 2625 goto error_return_unlocked;
2678 2626
2679 /* If there are other waiters on the queue then give them a 2627 /* If there are other waiters on the queue then give them a
2680 * chance at logspace before us. Wake up the first waiters, 2628 * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2631,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2683 * this transaction. 2631 * this transaction.
2684 */ 2632 */
2685 need_bytes = tic->t_unit_res; 2633 need_bytes = tic->t_unit_res;
2686 if ((ntic = log->l_write_headq)) { 2634 if (!list_empty_careful(&log->l_writeq)) {
2687 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2635 struct xlog_ticket *ntic;
2688 log->l_grant_write_bytes); 2636
2689 do { 2637 spin_lock(&log->l_grant_write_lock);
2638 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2639 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2690 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2640 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2691 2641
2692 if (free_bytes < ntic->t_unit_res) 2642 if (free_bytes < ntic->t_unit_res)
2693 break; 2643 break;
2694 free_bytes -= ntic->t_unit_res; 2644 free_bytes -= ntic->t_unit_res;
2695 sv_signal(&ntic->t_wait); 2645 wake_up(&ntic->t_wait);
2696 ntic = ntic->t_next; 2646 }
2697 } while (ntic != log->l_write_headq);
2698
2699 if (ntic != log->l_write_headq) {
2700 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2701 xlog_ins_ticketq(&log->l_write_headq, tic);
2702 2647
2648 if (ntic != list_first_entry(&log->l_writeq,
2649 struct xlog_ticket, t_queue)) {
2650 if (list_empty(&tic->t_queue))
2651 list_add_tail(&tic->t_queue, &log->l_writeq);
2703 trace_xfs_log_regrant_write_sleep1(log, tic); 2652 trace_xfs_log_regrant_write_sleep1(log, tic);
2704 2653
2705 spin_unlock(&log->l_grant_lock); 2654 xlog_grant_push_ail(log, need_bytes);
2706 xlog_grant_push_ail(log->l_mp, need_bytes);
2707 spin_lock(&log->l_grant_lock);
2708 2655
2709 XFS_STATS_INC(xs_sleep_logspace); 2656 XFS_STATS_INC(xs_sleep_logspace);
2710 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2657 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2711 &log->l_grant_lock, s);
2712
2713 /* If we're shutting down, this tic is already
2714 * off the queue */
2715 spin_lock(&log->l_grant_lock);
2716 if (XLOG_FORCED_SHUTDOWN(log))
2717 goto error_return;
2718
2719 trace_xfs_log_regrant_write_wake1(log, tic); 2658 trace_xfs_log_regrant_write_wake1(log, tic);
2720 } 2659 } else
2660 spin_unlock(&log->l_grant_write_lock);
2721 } 2661 }
2722 2662
2723redo: 2663redo:
2724 if (XLOG_FORCED_SHUTDOWN(log)) 2664 if (XLOG_FORCED_SHUTDOWN(log))
2725 goto error_return; 2665 goto error_return_unlocked;
2726 2666
2727 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2667 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2728 log->l_grant_write_bytes);
2729 if (free_bytes < need_bytes) { 2668 if (free_bytes < need_bytes) {
2730 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2669 spin_lock(&log->l_grant_write_lock);
2731 xlog_ins_ticketq(&log->l_write_headq, tic); 2670 if (list_empty(&tic->t_queue))
2732 spin_unlock(&log->l_grant_lock); 2671 list_add_tail(&tic->t_queue, &log->l_writeq);
2733 xlog_grant_push_ail(log->l_mp, need_bytes);
2734 spin_lock(&log->l_grant_lock);
2735
2736 XFS_STATS_INC(xs_sleep_logspace);
2737 trace_xfs_log_regrant_write_sleep2(log, tic);
2738
2739 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2740 2672
2741 /* If we're shutting down, this tic is already off the queue */
2742 spin_lock(&log->l_grant_lock);
2743 if (XLOG_FORCED_SHUTDOWN(log)) 2673 if (XLOG_FORCED_SHUTDOWN(log))
2744 goto error_return; 2674 goto error_return;
2745 2675
2676 xlog_grant_push_ail(log, need_bytes);
2677
2678 XFS_STATS_INC(xs_sleep_logspace);
2679 trace_xfs_log_regrant_write_sleep2(log, tic);
2680 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2681
2746 trace_xfs_log_regrant_write_wake2(log, tic); 2682 trace_xfs_log_regrant_write_wake2(log, tic);
2747 goto redo; 2683 goto redo;
2748 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2684 }
2749 xlog_del_ticketq(&log->l_write_headq, tic);
2750 2685
2751 /* we've got enough space */ 2686 if (!list_empty(&tic->t_queue)) {
2752 xlog_grant_add_space_write(log, need_bytes); 2687 spin_lock(&log->l_grant_write_lock);
2753#ifdef DEBUG 2688 list_del_init(&tic->t_queue);
2754 tail_lsn = log->l_tail_lsn; 2689 spin_unlock(&log->l_grant_write_lock);
2755 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2756 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2757 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2758 } 2690 }
2759#endif
2760 2691
2692 /* we've got enough space */
2693 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2761 trace_xfs_log_regrant_write_exit(log, tic); 2694 trace_xfs_log_regrant_write_exit(log, tic);
2762 2695 xlog_verify_grant_tail(log);
2763 xlog_verify_grant_head(log, 1);
2764 spin_unlock(&log->l_grant_lock);
2765 return 0; 2696 return 0;
2766 2697
2767 2698
2699 error_return_unlocked:
2700 spin_lock(&log->l_grant_write_lock);
2768 error_return: 2701 error_return:
2769 if (tic->t_flags & XLOG_TIC_IN_Q) 2702 list_del_init(&tic->t_queue);
2770 xlog_del_ticketq(&log->l_reserve_headq, tic); 2703 spin_unlock(&log->l_grant_write_lock);
2771
2772 trace_xfs_log_regrant_write_error(log, tic); 2704 trace_xfs_log_regrant_write_error(log, tic);
2773 2705
2774 /* 2706 /*
@@ -2778,7 +2710,6 @@ redo:
2778 */ 2710 */
2779 tic->t_curr_res = 0; 2711 tic->t_curr_res = 0;
2780 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2712 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2781 spin_unlock(&log->l_grant_lock);
2782 return XFS_ERROR(EIO); 2713 return XFS_ERROR(EIO);
2783} /* xlog_regrant_write_log_space */ 2714} /* xlog_regrant_write_log_space */
2784 2715
@@ -2799,27 +2730,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2799 if (ticket->t_cnt > 0) 2730 if (ticket->t_cnt > 0)
2800 ticket->t_cnt--; 2731 ticket->t_cnt--;
2801 2732
2802 spin_lock(&log->l_grant_lock); 2733 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2803 xlog_grant_sub_space(log, ticket->t_curr_res); 2734 ticket->t_curr_res);
2735 xlog_grant_sub_space(log, &log->l_grant_write_head,
2736 ticket->t_curr_res);
2804 ticket->t_curr_res = ticket->t_unit_res; 2737 ticket->t_curr_res = ticket->t_unit_res;
2805 xlog_tic_reset_res(ticket); 2738 xlog_tic_reset_res(ticket);
2806 2739
2807 trace_xfs_log_regrant_reserve_sub(log, ticket); 2740 trace_xfs_log_regrant_reserve_sub(log, ticket);
2808 2741
2809 xlog_verify_grant_head(log, 1);
2810
2811 /* just return if we still have some of the pre-reserved space */ 2742 /* just return if we still have some of the pre-reserved space */
2812 if (ticket->t_cnt > 0) { 2743 if (ticket->t_cnt > 0)
2813 spin_unlock(&log->l_grant_lock);
2814 return; 2744 return;
2815 }
2816 2745
2817 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2746 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2747 ticket->t_unit_res);
2818 2748
2819 trace_xfs_log_regrant_reserve_exit(log, ticket); 2749 trace_xfs_log_regrant_reserve_exit(log, ticket);
2820 2750
2821 xlog_verify_grant_head(log, 0);
2822 spin_unlock(&log->l_grant_lock);
2823 ticket->t_curr_res = ticket->t_unit_res; 2751 ticket->t_curr_res = ticket->t_unit_res;
2824 xlog_tic_reset_res(ticket); 2752 xlog_tic_reset_res(ticket);
2825} /* xlog_regrant_reserve_log_space */ 2753} /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2771,29 @@ STATIC void
2843xlog_ungrant_log_space(xlog_t *log, 2771xlog_ungrant_log_space(xlog_t *log,
2844 xlog_ticket_t *ticket) 2772 xlog_ticket_t *ticket)
2845{ 2773{
2774 int bytes;
2775
2846 if (ticket->t_cnt > 0) 2776 if (ticket->t_cnt > 0)
2847 ticket->t_cnt--; 2777 ticket->t_cnt--;
2848 2778
2849 spin_lock(&log->l_grant_lock);
2850 trace_xfs_log_ungrant_enter(log, ticket); 2779 trace_xfs_log_ungrant_enter(log, ticket);
2851
2852 xlog_grant_sub_space(log, ticket->t_curr_res);
2853
2854 trace_xfs_log_ungrant_sub(log, ticket); 2780 trace_xfs_log_ungrant_sub(log, ticket);
2855 2781
2856 /* If this is a permanent reservation ticket, we may be able to free 2782 /*
2783 * If this is a permanent reservation ticket, we may be able to free
2857 * up more space based on the remaining count. 2784 * up more space based on the remaining count.
2858 */ 2785 */
2786 bytes = ticket->t_curr_res;
2859 if (ticket->t_cnt > 0) { 2787 if (ticket->t_cnt > 0) {
2860 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2788 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2861 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2789 bytes += ticket->t_unit_res*ticket->t_cnt;
2862 } 2790 }
2863 2791
2792 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2793 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2794
2864 trace_xfs_log_ungrant_exit(log, ticket); 2795 trace_xfs_log_ungrant_exit(log, ticket);
2865 2796
2866 xlog_verify_grant_head(log, 1);
2867 spin_unlock(&log->l_grant_lock);
2868 xfs_log_move_tail(log->l_mp, 1); 2797 xfs_log_move_tail(log->l_mp, 1);
2869} /* xlog_ungrant_log_space */ 2798} /* xlog_ungrant_log_space */
2870 2799
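
The reworked xlog_ungrant_log_space() above computes the bytes to hand back once and then subtracts that amount from both grant heads. A worked example of the accounting, using the ticket field names from the hunk (the numeric values are made up for illustration):

#include <assert.h>

/*
 * Bytes returned on ungrant: whatever is left of the current reservation,
 * plus one full unit for each count a permanent ticket still holds.
 */
static int ungrant_bytes(int t_curr_res, int t_unit_res, int t_cnt)
{
	int bytes = t_curr_res;

	if (t_cnt > 0)		/* permanent reservation remainder */
		bytes += t_unit_res * t_cnt;
	return bytes;
}

int main(void)
{
	/* 1000 unused bytes plus two outstanding 4096-byte units */
	assert(ungrant_bytes(1000, 4096, 2) == 1000 + 2 * 4096);
	return 0;
}
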
@@ -2901,11 +2830,11 @@ xlog_state_release_iclog(
2901 2830
2902 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2831 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2903 /* update tail before writing to iclog */ 2832 /* update tail before writing to iclog */
2904 xlog_assign_tail_lsn(log->l_mp); 2833 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2905 sync++; 2834 sync++;
2906 iclog->ic_state = XLOG_STATE_SYNCING; 2835 iclog->ic_state = XLOG_STATE_SYNCING;
2907 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2836 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2908 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2837 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2909 /* cycle incremented when incrementing curr_block */ 2838 /* cycle incremented when incrementing curr_block */
2910 } 2839 }
2911 spin_unlock(&log->l_icloglock); 2840 spin_unlock(&log->l_icloglock);
@@ -3088,7 +3017,7 @@ maybe_sleep:
3088 return XFS_ERROR(EIO); 3017 return XFS_ERROR(EIO);
3089 } 3018 }
3090 XFS_STATS_INC(xs_log_force_sleep); 3019 XFS_STATS_INC(xs_log_force_sleep);
3091 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3020 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3092 /* 3021 /*
3093 * No need to grab the log lock here since we're 3022 * No need to grab the log lock here since we're
3094 * only deciding whether or not to return EIO 3023 * only deciding whether or not to return EIO
@@ -3119,10 +3048,8 @@ xfs_log_force(
3119 int error; 3048 int error;
3120 3049
3121 error = _xfs_log_force(mp, flags, NULL); 3050 error = _xfs_log_force(mp, flags, NULL);
3122 if (error) { 3051 if (error)
3123 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3052 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3124 "error %d returned.", error);
3125 }
3126} 3053}
3127 3054
3128/* 3055/*
@@ -3206,8 +3133,8 @@ try_again:
3206 3133
3207 XFS_STATS_INC(xs_log_force_sleep); 3134 XFS_STATS_INC(xs_log_force_sleep);
3208 3135
3209 sv_wait(&iclog->ic_prev->ic_write_wait, 3136 xlog_wait(&iclog->ic_prev->ic_write_wait,
3210 PSWP, &log->l_icloglock, s); 3137 &log->l_icloglock);
3211 if (log_flushed) 3138 if (log_flushed)
3212 *log_flushed = 1; 3139 *log_flushed = 1;
3213 already_slept = 1; 3140 already_slept = 1;
@@ -3235,7 +3162,7 @@ try_again:
3235 return XFS_ERROR(EIO); 3162 return XFS_ERROR(EIO);
3236 } 3163 }
3237 XFS_STATS_INC(xs_log_force_sleep); 3164 XFS_STATS_INC(xs_log_force_sleep);
3238 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3165 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3239 /* 3166 /*
3240 * No need to grab the log lock here since we're 3167 * No need to grab the log lock here since we're
3241 * only deciding whether or not to return EIO 3168 * only deciding whether or not to return EIO
@@ -3271,10 +3198,8 @@ xfs_log_force_lsn(
3271 int error; 3198 int error;
3272 3199
3273 error = _xfs_log_force_lsn(mp, lsn, flags, NULL); 3200 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3274 if (error) { 3201 if (error)
3275 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3202 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3276 "error %d returned.", error);
3277 }
3278} 3203}
3279 3204
3280/* 3205/*
@@ -3310,10 +3235,8 @@ xfs_log_ticket_put(
3310 xlog_ticket_t *ticket) 3235 xlog_ticket_t *ticket)
3311{ 3236{
3312 ASSERT(atomic_read(&ticket->t_ref) > 0); 3237 ASSERT(atomic_read(&ticket->t_ref) > 0);
3313 if (atomic_dec_and_test(&ticket->t_ref)) { 3238 if (atomic_dec_and_test(&ticket->t_ref))
3314 sv_destroy(&ticket->t_wait);
3315 kmem_zone_free(xfs_log_ticket_zone, ticket); 3239 kmem_zone_free(xfs_log_ticket_zone, ticket);
3316 }
3317} 3240}
3318 3241
3319xlog_ticket_t * 3242xlog_ticket_t *
@@ -3435,6 +3358,7 @@ xlog_ticket_alloc(
3435 } 3358 }
3436 3359
3437 atomic_set(&tic->t_ref, 1); 3360 atomic_set(&tic->t_ref, 1);
3361 INIT_LIST_HEAD(&tic->t_queue);
3438 tic->t_unit_res = unit_bytes; 3362 tic->t_unit_res = unit_bytes;
3439 tic->t_curr_res = unit_bytes; 3363 tic->t_curr_res = unit_bytes;
3440 tic->t_cnt = cnt; 3364 tic->t_cnt = cnt;
@@ -3445,7 +3369,7 @@ xlog_ticket_alloc(
3445 tic->t_trans_type = 0; 3369 tic->t_trans_type = 0;
3446 if (xflags & XFS_LOG_PERM_RESERV) 3370 if (xflags & XFS_LOG_PERM_RESERV)
3447 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3371 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3448 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3372 init_waitqueue_head(&tic->t_wait);
3449 3373
3450 xlog_tic_reset_res(tic); 3374 xlog_tic_reset_res(tic);
3451 3375
@@ -3480,22 +3404,45 @@ xlog_verify_dest_ptr(
3480 } 3404 }
3481 3405
3482 if (!good_ptr) 3406 if (!good_ptr)
3483 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3407 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3484} 3408}
3485 3409
3410/*
 3411 * Check to make sure the grant write head didn't just overlap the tail. If
3412 * the cycles are the same, we can't be overlapping. Otherwise, make sure that
3413 * the cycles differ by exactly one and check the byte count.
3414 *
3415 * This check is run unlocked, so can give false positives. Rather than assert
3416 * on failures, use a warn-once flag and a panic tag to allow the admin to
3417 * determine if they want to panic the machine when such an error occurs. For
 3418 * debug kernels this will have the same effect as using an assert but, unlike
3419 * an assert, it can be turned off at runtime.
3420 */
3486STATIC void 3421STATIC void
3487xlog_verify_grant_head(xlog_t *log, int equals) 3422xlog_verify_grant_tail(
3488{ 3423 struct log *log)
3489 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3424{
3490 if (equals) 3425 int tail_cycle, tail_blocks;
3491 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3426 int cycle, space;
3492 else 3427
3493 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3428 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3494 } else { 3429 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3495 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3430 if (tail_cycle != cycle) {
3496 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3431 if (cycle - 1 != tail_cycle &&
3497 } 3432 !(log->l_flags & XLOG_TAIL_WARN)) {
3498} /* xlog_verify_grant_head */ 3433 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3434 "%s: cycle - 1 != tail_cycle", __func__);
3435 log->l_flags |= XLOG_TAIL_WARN;
3436 }
3437
3438 if (space > BBTOB(tail_blocks) &&
3439 !(log->l_flags & XLOG_TAIL_WARN)) {
3440 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3441 "%s: space > BBTOB(tail_blocks)", __func__);
3442 log->l_flags |= XLOG_TAIL_WARN;
3443 }
3444 }
3445}
3499 3446
3500/* check if it will fit */ 3447/* check if it will fit */
3501STATIC void 3448STATIC void
@@ -3509,16 +3456,16 @@ xlog_verify_tail_lsn(xlog_t *log,
3509 blocks = 3456 blocks =
3510 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3457 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3511 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3458 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3512 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3459 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3513 } else { 3460 } else {
3514 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3461 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3515 3462
3516 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3463 if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3517 xlog_panic("xlog_verify_tail_lsn: tail wrapped"); 3464 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3518 3465
3519 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3466 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3520 if (blocks < BTOBB(iclog->ic_offset) + 1) 3467 if (blocks < BTOBB(iclog->ic_offset) + 1)
3521 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3468 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3522 } 3469 }
3523} /* xlog_verify_tail_lsn */ 3470} /* xlog_verify_tail_lsn */
3524 3471
@@ -3558,22 +3505,23 @@ xlog_verify_iclog(xlog_t *log,
3558 icptr = log->l_iclog; 3505 icptr = log->l_iclog;
3559 for (i=0; i < log->l_iclog_bufs; i++) { 3506 for (i=0; i < log->l_iclog_bufs; i++) {
3560 if (icptr == NULL) 3507 if (icptr == NULL)
3561 xlog_panic("xlog_verify_iclog: invalid ptr"); 3508 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3562 icptr = icptr->ic_next; 3509 icptr = icptr->ic_next;
3563 } 3510 }
3564 if (icptr != log->l_iclog) 3511 if (icptr != log->l_iclog)
3565 xlog_panic("xlog_verify_iclog: corrupt iclog ring"); 3512 xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3566 spin_unlock(&log->l_icloglock); 3513 spin_unlock(&log->l_icloglock);
3567 3514
3568 /* check log magic numbers */ 3515 /* check log magic numbers */
3569 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) 3516 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM)
3570 xlog_panic("xlog_verify_iclog: invalid magic num"); 3517 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3571 3518
3572 ptr = (xfs_caddr_t) &iclog->ic_header; 3519 ptr = (xfs_caddr_t) &iclog->ic_header;
3573 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; 3520 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
3574 ptr += BBSIZE) { 3521 ptr += BBSIZE) {
3575 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) 3522 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
3576 xlog_panic("xlog_verify_iclog: unexpected magic num"); 3523 xfs_emerg(log->l_mp, "%s: unexpected magic num",
3524 __func__);
3577 } 3525 }
3578 3526
3579 /* check fields */ 3527 /* check fields */
@@ -3603,9 +3551,10 @@ xlog_verify_iclog(xlog_t *log,
3603 } 3551 }
3604 } 3552 }
3605 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) 3553 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3606 cmn_err(CE_WARN, "xlog_verify_iclog: " 3554 xfs_warn(log->l_mp,
3607 "invalid clientid %d op 0x%p offset 0x%lx", 3555 "%s: invalid clientid %d op 0x%p offset 0x%lx",
3608 clientid, ophead, (unsigned long)field_offset); 3556 __func__, clientid, ophead,
3557 (unsigned long)field_offset);
3609 3558
3610 /* check length */ 3559 /* check length */
3611 field_offset = (__psint_t) 3560 field_offset = (__psint_t)
@@ -3716,12 +3665,10 @@ xfs_log_force_umount(
3716 xlog_cil_force(log); 3665 xlog_cil_force(log);
3717 3666
3718 /* 3667 /*
 3719 * We must hold both the GRANT lock and the LOG lock, 3668 * mark the filesystem and the log as being in a shutdown state and wake
3720 * before we mark the filesystem SHUTDOWN and wake 3669 * everybody up to tell them the bad news.
3721 * everybody up to tell the bad news.
3722 */ 3670 */
3723 spin_lock(&log->l_icloglock); 3671 spin_lock(&log->l_icloglock);
3724 spin_lock(&log->l_grant_lock);
3725 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3672 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3726 if (mp->m_sb_bp) 3673 if (mp->m_sb_bp)
3727 XFS_BUF_DONE(mp->m_sb_bp); 3674 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3689,21 @@ xfs_log_force_umount(
3742 spin_unlock(&log->l_icloglock); 3689 spin_unlock(&log->l_icloglock);
3743 3690
3744 /* 3691 /*
3745 * We don't want anybody waiting for log reservations 3692 * We don't want anybody waiting for log reservations after this. That
3746 * after this. That means we have to wake up everybody 3693 * means we have to wake up everybody queued up on reserveq as well as
3747 * queued up on reserve_headq as well as write_headq. 3694 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3748 * In addition, we make sure in xlog_{re}grant_log_space 3695 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3749 * that we don't enqueue anything once the SHUTDOWN flag 3696 * action is protected by the grant locks.
3750 * is set, and this action is protected by the GRANTLOCK.
3751 */ 3697 */
3752 if ((tic = log->l_reserve_headq)) { 3698 spin_lock(&log->l_grant_reserve_lock);
3753 do { 3699 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3754 sv_signal(&tic->t_wait); 3700 wake_up(&tic->t_wait);
3755 tic = tic->t_next; 3701 spin_unlock(&log->l_grant_reserve_lock);
3756 } while (tic != log->l_reserve_headq); 3702
3757 } 3703 spin_lock(&log->l_grant_write_lock);
3758 3704 list_for_each_entry(tic, &log->l_writeq, t_queue)
3759 if ((tic = log->l_write_headq)) { 3705 wake_up(&tic->t_wait);
3760 do { 3706 spin_unlock(&log->l_grant_write_lock);
3761 sv_signal(&tic->t_wait);
3762 tic = tic->t_next;
3763 } while (tic != log->l_write_headq);
3764 }
3765 spin_unlock(&log->l_grant_lock);
3766 3707
3767 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3708 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3768 ASSERT(!logerror); 3709 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..3bd3291ef8d2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb5e97b..9ca59be08977 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 64 init_waitqueue_head(&cil->xc_commit_wait);
65 65
66 INIT_LIST_HEAD(&ctx->committing); 66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 67 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
361 int abort) 361 int abort)
362{ 362{
363 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
364 struct xfs_log_vec *lv;
365 int abortflag = abort ? XFS_LI_ABORTED : 0;
366 struct xfs_busy_extent *busyp, *n; 364 struct xfs_busy_extent *busyp, *n;
367 365
368 /* unpin all the log items */ 366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
369 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 367 ctx->start_lsn, abort);
370 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
371 abortflag);
372 }
373 368
374 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
375 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -548,7 +543,7 @@ xlog_cil_push(
548 543
549 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); 544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
550 if (error) 545 if (error)
551 goto out_abort; 546 goto out_abort_free_ticket;
552 547
553 /* 548 /*
554 * now that we've written the checkpoint into the log, strictly 549 * now that we've written the checkpoint into the log, strictly
@@ -568,14 +563,15 @@ restart:
568 * It is still being pushed! Wait for the push to 563 * It is still being pushed! Wait for the push to
569 * complete, then start again from the beginning. 564 * complete, then start again from the beginning.
570 */ 565 */
571 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 566 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
572 goto restart; 567 goto restart;
573 } 568 }
574 } 569 }
575 spin_unlock(&cil->xc_cil_lock); 570 spin_unlock(&cil->xc_cil_lock);
576 571
572 /* xfs_log_done always frees the ticket on error. */
577 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 573 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
578 if (error || commit_lsn == -1) 574 if (commit_lsn == -1)
579 goto out_abort; 575 goto out_abort;
580 576
581 /* attach all the transactions w/ busy extents to iclog */ 577 /* attach all the transactions w/ busy extents to iclog */
@@ -592,7 +588,7 @@ restart:
592 */ 588 */
593 spin_lock(&cil->xc_cil_lock); 589 spin_lock(&cil->xc_cil_lock);
594 ctx->commit_lsn = commit_lsn; 590 ctx->commit_lsn = commit_lsn;
595 sv_broadcast(&cil->xc_commit_wait); 591 wake_up_all(&cil->xc_commit_wait);
596 spin_unlock(&cil->xc_cil_lock); 592 spin_unlock(&cil->xc_cil_lock);
597 593
598 /* release the hounds! */ 594 /* release the hounds! */
@@ -605,6 +601,8 @@ out_free_ticket:
605 kmem_free(new_ctx); 601 kmem_free(new_ctx);
606 return 0; 602 return 0;
607 603
604out_abort_free_ticket:
605 xfs_log_ticket_put(tic);
608out_abort: 606out_abort:
609 xlog_cil_committed(ctx, XFS_LI_ABORTED); 607 xlog_cil_committed(ctx, XFS_LI_ABORTED);
610 return XFS_ERROR(EIO); 608 return XFS_ERROR(EIO);
@@ -627,7 +625,7 @@ out_abort:
627 * background commit, returns without it held once background commits are 625 * background commit, returns without it held once background commits are
628 * allowed again. 626 * allowed again.
629 */ 627 */
630int 628void
631xfs_log_commit_cil( 629xfs_log_commit_cil(
632 struct xfs_mount *mp, 630 struct xfs_mount *mp,
633 struct xfs_trans *tp, 631 struct xfs_trans *tp,
@@ -642,11 +640,6 @@ xfs_log_commit_cil(
642 if (flags & XFS_TRANS_RELEASE_LOG_RES) 640 if (flags & XFS_TRANS_RELEASE_LOG_RES)
643 log_flags = XFS_LOG_REL_PERM_RESERV; 641 log_flags = XFS_LOG_REL_PERM_RESERV;
644 642
645 if (XLOG_FORCED_SHUTDOWN(log)) {
646 xlog_cil_free_logvec(log_vector);
647 return XFS_ERROR(EIO);
648 }
649
650 /* 643 /*
651 * do all the hard work of formatting items (including memory 644 * do all the hard work of formatting items (including memory
652 * allocation) outside the CIL context lock. This prevents stalling CIL 645 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -706,7 +699,6 @@ xfs_log_commit_cil(
706 */ 699 */
707 if (push) 700 if (push)
708 xlog_cil_push(log, 0); 701 xlog_cil_push(log, 0);
709 return 0;
710} 702}
711 703
712/* 704/*
@@ -757,7 +749,7 @@ restart:
757 * It is still being pushed! Wait for the push to 749 * It is still being pushed! Wait for the push to
758 * complete, then start again from the beginning. 750 * complete, then start again from the beginning.
759 */ 751 */
760 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 752 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
761 goto restart; 753 goto restart;
762 } 754 }
763 if (ctx->sequence != sequence) 755 if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617f..5864850e9e34 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -89,10 +87,6 @@ static inline uint xlog_get_client_id(__be32 i)
89 return be32_to_cpu(i) >> 24; 87 return be32_to_cpu(i) >> 24;
90} 88}
91 89
92#define xlog_panic(args...) cmn_err(CE_PANIC, ## args)
93#define xlog_exit(args...) cmn_err(CE_PANIC, ## args)
94#define xlog_warn(args...) cmn_err(CE_WARN, ## args)
95
96/* 90/*
97 * In core log state 91 * In core log state
98 */ 92 */
@@ -133,12 +127,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 127 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 128#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 129#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 130
138#define XLOG_TIC_FLAGS \ 131#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 132 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 133 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 134
143#endif /* __KERNEL__ */ 135#endif /* __KERNEL__ */
144 136
@@ -152,6 +144,7 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 146 shutdown */
147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
155 148
156#ifdef __KERNEL__ 149#ifdef __KERNEL__
157/* 150/*
@@ -244,9 +237,8 @@ typedef struct xlog_res {
244} xlog_res_t; 237} xlog_res_t;
245 238
246typedef struct xlog_ticket { 239typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 240 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 241 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 242 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 243 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 244 int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +345,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 345 * and move everything else out to subsequent cachelines.
354 */ 346 */
355typedef struct xlog_in_core { 347typedef struct xlog_in_core {
356 sv_t ic_force_wait; 348 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 349 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 350 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 351 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 352 struct xfs_buf *ic_bp;
@@ -421,7 +413,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 413 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 414 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 415 struct list_head xc_committing;
424 sv_t xc_commit_wait; 416 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 417 xfs_lsn_t xc_current_sequence;
426}; 418};
427 419
@@ -491,7 +483,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 483 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 484 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 485 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 486 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 487 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 488 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 489 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +495,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 495 int l_logBBsize; /* size of log in BB chunks */
504 496
505 /* The following block of fields are changed while holding icloglock */ 497 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 498 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 499 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 500 int l_covered_state;/* state of "covering disk
509 * log entries" */ 501 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 502 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 503 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 504 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 505 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 506 * block increment */
518 int l_curr_block; /* current logical log block */ 507 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 508 int l_prev_block; /* previous logical log block */
520 509
521 /* The following block of fields are changed while holding grant_lock */ 510 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 511 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 512 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 513 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 514 * cacheline.
526 int l_grant_reserve_bytes; 515 */
527 int l_grant_write_cycle; 516 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 517 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
 518	/* lsn of 1st LR with unflushed buffers */
519 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
520
521 /*
 522	 * ticket grant locks, queues and accounting have their own cachelines
523 * as these are quite hot and can be operated on concurrently.
524 */
525 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
526 struct list_head l_reserveq;
527 atomic64_t l_grant_reserve_head;
528
529 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
530 struct list_head l_writeq;
531 atomic64_t l_grant_write_head;
529 532
530 /* The following field are used for debugging; need to hold icloglock */ 533 /* The following field are used for debugging; need to hold icloglock */
531#ifdef DEBUG 534#ifdef DEBUG
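
Editor's note: the comment in the hunk above explains why l_last_sync_lsn, l_tail_lsn and the two grant heads each get ____cacheline_aligned_in_smp — hot atomics are kept on separate cachelines so concurrent updates do not false-share. The following is a minimal userspace sketch of that layout idea (not part of the patch): struct demo_log is hypothetical, the field names are borrowed from the diff, plain long long stands in for atomic64_t, and the 64-byte line size is an assumption.

/*
 * Illustrative userspace sketch: give each hot 64-bit counter its own
 * 64-byte cacheline, analogous to ____cacheline_aligned_in_smp in the
 * new struct log layout. Compile and run to see the padded offsets.
 */
#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

#define CACHELINE 64	/* assumed cacheline size */

struct demo_log {
	alignas(CACHELINE) long long l_last_sync_lsn;	/* stands in for atomic64_t */
	alignas(CACHELINE) long long l_tail_lsn;
	alignas(CACHELINE) long long l_grant_reserve_head;
	alignas(CACHELINE) long long l_grant_write_head;
};

int main(void)
{
	printf("l_last_sync_lsn      offset %zu\n", offsetof(struct demo_log, l_last_sync_lsn));
	printf("l_tail_lsn           offset %zu\n", offsetof(struct demo_log, l_tail_lsn));
	printf("l_grant_reserve_head offset %zu\n", offsetof(struct demo_log, l_grant_reserve_head));
	printf("l_grant_write_head   offset %zu\n", offsetof(struct demo_log, l_grant_write_head));
	return 0;
}

Each field lands 64 bytes apart, so a writer hammering the tail LSN does not bounce the cacheline holding the grant heads.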
@@ -534,6 +537,9 @@ typedef struct log {
534 537
535} xlog_t; 538} xlog_t;
536 539
540#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
541 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
542
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 543#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 544
539/* common routines */ 545/* common routines */
@@ -562,6 +568,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 568 xlog_in_core_t **commit_iclog, uint flags);
563 569
564/* 570/*
571 * When we crack an atomic LSN, we sample it first so that the value will not
572 * change while we are cracking it into the component values. This means we
573 * will always get consistent component values to work from. This should always
574 * be used to sample and crack LSNs that are stored and updated in atomic
575 * variables.
576 */
577static inline void
578xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
579{
580 xfs_lsn_t val = atomic64_read(lsn);
581
582 *cycle = CYCLE_LSN(val);
583 *block = BLOCK_LSN(val);
584}
585
586/*
587 * Calculate and assign a value to an atomic LSN variable from component pieces.
588 */
589static inline void
590xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
591{
592 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
593}
594
595/*
596 * When we crack the grant head, we sample it first so that the value will not
597 * change while we are cracking it into the component values. This means we
598 * will always get consistent component values to work from.
599 */
600static inline void
601xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
602{
603 *cycle = val >> 32;
604 *space = val & 0xffffffff;
605}
606
607static inline void
608xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
609{
610 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
611}
612
613static inline int64_t
614xlog_assign_grant_head_val(int cycle, int space)
615{
616 return ((int64_t)cycle << 32) | space;
617}
618
619static inline void
620xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
621{
622 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
623}
624
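
Editor's note: the inline helpers added above pack a cycle number into the high 32 bits and a block (or space) count into the low 32 bits of one 64-bit word, so an LSN or grant head can be read and updated as a single atomic value. Below is a small self-contained sketch of the same packing (not part of the patch): pack_lsn()/crack_lsn() are hypothetical names, plain uint64_t replaces atomic64_t, and the bit layout matches xlog_assign_lsn()/xlog_crack_atomic_lsn() in the diff.

/*
 * Userspace sketch of the cycle/block packing used by the new
 * xlog_*_atomic_lsn() and grant head helpers: high 32 bits = cycle,
 * low 32 bits = block (or space).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t pack_lsn(uint32_t cycle, uint32_t block)
{
	return ((uint64_t)cycle << 32) | block;
}

static void crack_lsn(uint64_t lsn, uint32_t *cycle, uint32_t *block)
{
	/* Sample once, then split, so both halves come from the same value. */
	*cycle = (uint32_t)(lsn >> 32);
	*block = (uint32_t)(lsn & 0xffffffff);
}

int main(void)
{
	uint32_t cycle, block;
	uint64_t lsn = pack_lsn(7, 123456);

	crack_lsn(lsn, &cycle, &block);
	assert(cycle == 7 && block == 123456);
	printf("lsn=0x%llx cycle=%u block=%u\n",
	       (unsigned long long)lsn, cycle, block);
	return 0;
}

Sampling the atomic once before cracking is what keeps the cycle/block pair mutually consistent without taking a lock.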
625/*
565 * Committed Item List interfaces 626 * Committed Item List interfaces
566 */ 627 */
567int xlog_cil_init(struct log *log); 628int xlog_cil_init(struct log *log);
@@ -585,6 +646,21 @@ xlog_cil_force(struct log *log)
585 */ 646 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 647#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 648
649/*
650 * Wrapper function for waiting on a wait queue serialised against wakeups
651 * by a spinlock. This matches the semantics of all the wait queues used in the
652 * log code.
653 */
654static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
655{
656 DECLARE_WAITQUEUE(wait, current);
657
658 add_wait_queue_exclusive(wq, &wait);
659 __set_current_state(TASK_UNINTERRUPTIBLE);
660 spin_unlock(lock);
661 schedule();
662 remove_wait_queue(wq, &wait);
663}
588#endif /* __KERNEL__ */ 664#endif /* __KERNEL__ */
589 665
590#endif /* __XFS_LOG_PRIV_H__ */ 666#endif /* __XFS_LOG_PRIV_H__ */
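
Editor's note: xlog_wait() above queues the caller, drops the spinlock that the waker will also take, and sleeps; the matching wakeup side (for example xlog_cil_push() setting ctx->commit_lsn and calling wake_up_all() under xc_cil_lock earlier in this diff) publishes its state under the same lock before waking waiters, and waiters recheck state and retry ("goto restart"). The sketch below is only a userspace analogue of that discipline using a pthread mutex and condition variable — pusher, commit_wait and the values involved are hypothetical, and pthreads are a stand-in for the kernel wait queue, not the patch's API.

/*
 * Userspace analogue of the xlog_wait()/wake_up_all() pairing: the waker
 * updates state under the lock before broadcasting, and the waiter
 * rechecks the condition after every wakeup.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t commit_wait = PTHREAD_COND_INITIALIZER;
static long long commit_lsn = -1;	/* -1: commit not yet recorded */

static void *pusher(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	commit_lsn = 42;			/* publish state ... */
	pthread_cond_broadcast(&commit_wait);	/* ... then wake everyone */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, pusher, NULL);

	pthread_mutex_lock(&lock);
	while (commit_lsn == -1)		/* recheck after each wakeup */
		pthread_cond_wait(&commit_wait, &lock);
	pthread_mutex_unlock(&lock);

	printf("saw commit_lsn %lld\n", commit_lsn);
	pthread_join(t, NULL);
	return 0;
}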
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458c..5cc464a17c93 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
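
Editor's note: the struct xfs_buf_cancel introduced above is kept in a small hash table of list heads; the XLOG_BUF_CANCEL_BUCKET() macro added to xfs_log_priv.h earlier in this diff picks a bucket by blkno modulo the table size, pass 1 (xlog_recover_buffer_pass1) adds or refcounts entries, and pass 2 looks them up. A compact sketch of that scheme follows (not part of the patch): a plain singly linked list replaces struct list_head, the names buf_cancel/add_cancel/is_cancelled are hypothetical, and the 64-entry table size is assumed (the real XLOG_BC_TABLE_SIZE is defined elsewhere).

/*
 * Sketch of the recovery cancel table: bucket = blkno % table size,
 * one refcounted record per (blkno, len).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BC_TABLE_SIZE 64	/* assumed stand-in for XLOG_BC_TABLE_SIZE */

struct buf_cancel {
	uint64_t blkno;
	unsigned int len;
	int refcount;
	struct buf_cancel *next;
};

static struct buf_cancel *table[BC_TABLE_SIZE];

static struct buf_cancel **bucket_for(uint64_t blkno)
{
	return &table[blkno % BC_TABLE_SIZE];
}

/* Pass 1: remember a cancelled buffer, or bump the refcount of a duplicate. */
static void add_cancel(uint64_t blkno, unsigned int len)
{
	struct buf_cancel **bucket = bucket_for(blkno);
	struct buf_cancel *bcp;

	for (bcp = *bucket; bcp; bcp = bcp->next) {
		if (bcp->blkno == blkno && bcp->len == len) {
			bcp->refcount++;
			return;
		}
	}
	bcp = malloc(sizeof(*bcp));
	bcp->blkno = blkno;
	bcp->len = len;
	bcp->refcount = 1;
	bcp->next = *bucket;
	*bucket = bcp;
}

/* Pass 2: is this buffer cancelled? (lookup only in this sketch) */
static int is_cancelled(uint64_t blkno, unsigned int len)
{
	struct buf_cancel *bcp;

	for (bcp = *bucket_for(blkno); bcp; bcp = bcp->next)
		if (bcp->blkno == blkno && bcp->len == len)
			return 1;
	return 0;
}

int main(void)
{
	add_cancel(1000, 8);
	add_cancel(1000, 8);	/* duplicate: refcount becomes 2 */
	printf("cancelled(1000,8)=%d cancelled(2000,8)=%d\n",
	       is_cancelled(1000, 8), is_cancelled(2000, 8));
	return 0;
}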
@@ -81,7 +92,7 @@ xlog_get_bp(
81 int nbblks) 92 int nbblks)
82{ 93{
83 if (!xlog_buf_bbcount_valid(log, nbblks)) { 94 if (!xlog_buf_bbcount_valid(log, nbblks)) {
84 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 95 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
85 nbblks); 96 nbblks);
86 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
87 return NULL; 98 return NULL;
@@ -90,7 +101,7 @@ xlog_get_bp(
90 /* 101 /*
91 * We do log I/O in units of log sectors (a power-of-2 102 * We do log I/O in units of log sectors (a power-of-2
92 * multiple of the basic block size), so we round up the 103 * multiple of the basic block size), so we round up the
93 * requested size to acommodate the basic blocks required 104 * requested size to accommodate the basic blocks required
94 * for complete log sectors. 105 * for complete log sectors.
95 * 106 *
96 * In addition, the buffer may be used for a non-sector- 107 * In addition, the buffer may be used for a non-sector-
@@ -101,7 +112,7 @@ xlog_get_bp(
101 * an issue. Nor will this be a problem if the log I/O is 112 * an issue. Nor will this be a problem if the log I/O is
102 * done in basic blocks (sector size 1). But otherwise we 113 * done in basic blocks (sector size 1). But otherwise we
103 * extend the buffer by one extra log sector to ensure 114 * extend the buffer by one extra log sector to ensure
104 * there's space to accomodate this possiblility. 115 * there's space to accommodate this possibility.
105 */ 116 */
106 if (nbblks > 1 && log->l_sectBBsize > 1) 117 if (nbblks > 1 && log->l_sectBBsize > 1)
107 nbblks += log->l_sectBBsize; 118 nbblks += log->l_sectBBsize;
@@ -149,7 +160,7 @@ xlog_bread_noalign(
149 int error; 160 int error;
150 161
151 if (!xlog_buf_bbcount_valid(log, nbblks)) { 162 if (!xlog_buf_bbcount_valid(log, nbblks)) {
152 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 163 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
153 nbblks); 164 nbblks);
154 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
155 return EFSCORRUPTED; 166 return EFSCORRUPTED;
@@ -208,7 +219,7 @@ xlog_bwrite(
208 int error; 219 int error;
209 220
210 if (!xlog_buf_bbcount_valid(log, nbblks)) { 221 if (!xlog_buf_bbcount_valid(log, nbblks)) {
211 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 222 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
212 nbblks); 223 nbblks);
213 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
214 return EFSCORRUPTED; 225 return EFSCORRUPTED;
@@ -243,9 +254,9 @@ xlog_header_check_dump(
243 xfs_mount_t *mp, 254 xfs_mount_t *mp,
244 xlog_rec_header_t *head) 255 xlog_rec_header_t *head)
245{ 256{
246 cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", 257 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
247 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
248 cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", 259 xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
249 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
250} 261}
251#else 262#else
@@ -268,15 +279,15 @@ xlog_header_check_recover(
268 * a dirty log created in IRIX. 279 * a dirty log created in IRIX.
269 */ 280 */
270 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
271 xlog_warn( 282 xfs_warn(mp,
272 "XFS: dirty log written in incompatible format - can't recover"); 283 "dirty log written in incompatible format - can't recover");
273 xlog_header_check_dump(mp, head); 284 xlog_header_check_dump(mp, head);
274 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 285 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
275 XFS_ERRLEVEL_HIGH, mp); 286 XFS_ERRLEVEL_HIGH, mp);
276 return XFS_ERROR(EFSCORRUPTED); 287 return XFS_ERROR(EFSCORRUPTED);
277 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
278 xlog_warn( 289 xfs_warn(mp,
279 "XFS: dirty log entry has mismatched uuid - can't recover"); 290 "dirty log entry has mismatched uuid - can't recover");
280 xlog_header_check_dump(mp, head); 291 xlog_header_check_dump(mp, head);
281 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 292 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
282 XFS_ERRLEVEL_HIGH, mp); 293 XFS_ERRLEVEL_HIGH, mp);
@@ -301,9 +312,9 @@ xlog_header_check_mount(
301 * h_fs_uuid is nil, we assume this log was last mounted 312 * h_fs_uuid is nil, we assume this log was last mounted
302 * by IRIX and continue. 313 * by IRIX and continue.
303 */ 314 */
304 xlog_warn("XFS: nil uuid in log - IRIX style log"); 315 xfs_warn(mp, "nil uuid in log - IRIX style log");
305 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
306 xlog_warn("XFS: log has mismatched uuid - can't recover"); 317 xfs_warn(mp, "log has mismatched uuid - can't recover");
307 xlog_header_check_dump(mp, head); 318 xlog_header_check_dump(mp, head);
308 XFS_ERROR_REPORT("xlog_header_check_mount", 319 XFS_ERROR_REPORT("xlog_header_check_mount",
309 XFS_ERRLEVEL_HIGH, mp); 320 XFS_ERRLEVEL_HIGH, mp);
@@ -479,8 +490,8 @@ xlog_find_verify_log_record(
479 for (i = (*last_blk) - 1; i >= 0; i--) { 490 for (i = (*last_blk) - 1; i >= 0; i--) {
480 if (i < start_blk) { 491 if (i < start_blk) {
481 /* valid log record not found */ 492 /* valid log record not found */
482 xlog_warn( 493 xfs_warn(log->l_mp,
483 "XFS: Log inconsistent (didn't find previous header)"); 494 "Log inconsistent (didn't find previous header)");
484 ASSERT(0); 495 ASSERT(0);
485 error = XFS_ERROR(EIO); 496 error = XFS_ERROR(EIO);
486 goto out; 497 goto out;
@@ -580,12 +591,12 @@ xlog_find_head(
580 * mkfs etc write a dummy unmount record to a fresh 591 * mkfs etc write a dummy unmount record to a fresh
581 * log so we can store the uuid in there 592 * log so we can store the uuid in there
582 */ 593 */
583 xlog_warn("XFS: totally zeroed log"); 594 xfs_warn(log->l_mp, "totally zeroed log");
584 } 595 }
585 596
586 return 0; 597 return 0;
587 } else if (error) { 598 } else if (error) {
588 xlog_warn("XFS: empty log check failed"); 599 xfs_warn(log->l_mp, "empty log check failed");
589 return error; 600 return error;
590 } 601 }
591 602
@@ -808,7 +819,7 @@ validate_head:
808 xlog_put_bp(bp); 819 xlog_put_bp(bp);
809 820
810 if (error) 821 if (error)
811 xlog_warn("XFS: failed to find log head"); 822 xfs_warn(log->l_mp, "failed to find log head");
812 return error; 823 return error;
813} 824}
814 825
@@ -901,7 +912,7 @@ xlog_find_tail(
901 } 912 }
902 } 913 }
903 if (!found) { 914 if (!found) {
904 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 915 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
905 ASSERT(0); 916 ASSERT(0);
906 return XFS_ERROR(EIO); 917 return XFS_ERROR(EIO);
907 } 918 }
@@ -925,12 +936,12 @@ xlog_find_tail(
925 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 936 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
926 if (found == 2) 937 if (found == 2)
927 log->l_curr_cycle++; 938 log->l_curr_cycle++;
928 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 939 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
929 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 940 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
930 log->l_grant_reserve_cycle = log->l_curr_cycle; 941 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
931 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 942 BBTOB(log->l_curr_block));
932 log->l_grant_write_cycle = log->l_curr_cycle; 943 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
933 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 944 BBTOB(log->l_curr_block));
934 945
935 /* 946 /*
936 * Look for unmount record. If we find it, then we know there 947 * Look for unmount record. If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
960 } 971 }
961 after_umount_blk = (i + hblks + (int) 972 after_umount_blk = (i + hblks + (int)
962 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 973 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
963 tail_lsn = log->l_tail_lsn; 974 tail_lsn = atomic64_read(&log->l_tail_lsn);
964 if (*head_blk == after_umount_blk && 975 if (*head_blk == after_umount_blk &&
965 be32_to_cpu(rhead->h_num_logops) == 1) { 976 be32_to_cpu(rhead->h_num_logops) == 1) {
966 umount_data_blk = (i + hblks) % log->l_logBBsize; 977 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
975 * log records will point recovery to after the 986 * log records will point recovery to after the
976 * current unmount record. 987 * current unmount record.
977 */ 988 */
978 log->l_tail_lsn = 989 xlog_assign_atomic_lsn(&log->l_tail_lsn,
979 xlog_assign_lsn(log->l_curr_cycle, 990 log->l_curr_cycle, after_umount_blk);
980 after_umount_blk); 991 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
981 log->l_last_sync_lsn = 992 log->l_curr_cycle, after_umount_blk);
982 xlog_assign_lsn(log->l_curr_cycle,
983 after_umount_blk);
984 *tail_blk = after_umount_blk; 993 *tail_blk = after_umount_blk;
985 994
986 /* 995 /*
@@ -1019,7 +1028,7 @@ done:
1019 xlog_put_bp(bp); 1028 xlog_put_bp(bp);
1020 1029
1021 if (error) 1030 if (error)
1022 xlog_warn("XFS: failed to locate log tail"); 1031 xfs_warn(log->l_mp, "failed to locate log tail");
1023 return error; 1032 return error;
1024} 1033}
1025 1034
@@ -1083,7 +1092,8 @@ xlog_find_zeroed(
1083 * the first block must be 1. If it's not, maybe we're 1092 * the first block must be 1. If it's not, maybe we're
1084 * not looking at a log... Bail out. 1093 * not looking at a log... Bail out.
1085 */ 1094 */
1086 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1095 xfs_warn(log->l_mp,
1096 "Log inconsistent or not a log (last==0, first!=1)");
1087 return XFS_ERROR(EINVAL); 1097 return XFS_ERROR(EINVAL);
1088 } 1098 }
1089 1099
@@ -1497,8 +1507,8 @@ xlog_recover_add_to_trans(
1497 if (list_empty(&trans->r_itemq)) { 1507 if (list_empty(&trans->r_itemq)) {
1498 /* we need to catch log corruptions here */ 1508 /* we need to catch log corruptions here */
1499 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1509 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1500 xlog_warn("XFS: xlog_recover_add_to_trans: " 1510 xfs_warn(log->l_mp, "%s: bad header magic number",
1501 "bad header magic number"); 1511 __func__);
1502 ASSERT(0); 1512 ASSERT(0);
1503 return XFS_ERROR(EIO); 1513 return XFS_ERROR(EIO);
1504 } 1514 }
@@ -1525,8 +1535,8 @@ xlog_recover_add_to_trans(
1525 if (item->ri_total == 0) { /* first region to be added */ 1535 if (item->ri_total == 0) { /* first region to be added */
1526 if (in_f->ilf_size == 0 || 1536 if (in_f->ilf_size == 0 ||
1527 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 1537 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1528 xlog_warn( 1538 xfs_warn(log->l_mp,
1529 "XFS: bad number of regions (%d) in inode log format", 1539 "bad number of regions (%d) in inode log format",
1530 in_f->ilf_size); 1540 in_f->ilf_size);
1531 ASSERT(0); 1541 ASSERT(0);
1532 return XFS_ERROR(EIO); 1542 return XFS_ERROR(EIO);
@@ -1583,8 +1593,9 @@ xlog_recover_reorder_trans(
1583 list_move_tail(&item->ri_list, &trans->r_itemq); 1593 list_move_tail(&item->ri_list, &trans->r_itemq);
1584 break; 1594 break;
1585 default: 1595 default:
1586 xlog_warn( 1596 xfs_warn(log->l_mp,
1587 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); 1597 "%s: unrecognized type of log operation",
1598 __func__);
1588 ASSERT(0); 1599 ASSERT(0);
1589 return XFS_ERROR(EIO); 1600 return XFS_ERROR(EIO);
1590 } 1601 }
@@ -1605,82 +1616,45 @@ xlog_recover_reorder_trans(
1605 * record in the table to tell us how many times we expect to see this 1616 * record in the table to tell us how many times we expect to see this
1606 * record during the second pass. 1617 * record during the second pass.
1607 */ 1618 */
1608STATIC void 1619STATIC int
1609xlog_recover_do_buffer_pass1( 1620xlog_recover_buffer_pass1(
1610 xlog_t *log, 1621 struct log *log,
1611 xfs_buf_log_format_t *buf_f) 1622 xlog_recover_item_t *item)
1612{ 1623{
1613 xfs_buf_cancel_t *bcp; 1624 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1614 xfs_buf_cancel_t *nextp; 1625 struct list_head *bucket;
1615 xfs_buf_cancel_t *prevp; 1626 struct xfs_buf_cancel *bcp;
1616 xfs_buf_cancel_t **bucket;
1617 xfs_daddr_t blkno = 0;
1618 uint len = 0;
1619 ushort flags = 0;
1620
1621 switch (buf_f->blf_type) {
1622 case XFS_LI_BUF:
1623 blkno = buf_f->blf_blkno;
1624 len = buf_f->blf_len;
1625 flags = buf_f->blf_flags;
1626 break;
1627 }
1628 1627
1629 /* 1628 /*
1630 * If this isn't a cancel buffer item, then just return. 1629 * If this isn't a cancel buffer item, then just return.
1631 */ 1630 */
1632 if (!(flags & XFS_BLF_CANCEL)) { 1631 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1633 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1632 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1634 return; 1633 return 0;
1635 }
1636
1637 /*
1638 * Insert an xfs_buf_cancel record into the hash table of
1639 * them. If there is already an identical record, bump
1640 * its reference count.
1641 */
1642 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1643 XLOG_BC_TABLE_SIZE];
1644 /*
1645 * If the hash bucket is empty then just insert a new record into
1646 * the bucket.
1647 */
1648 if (*bucket == NULL) {
1649 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1650 KM_SLEEP);
1651 bcp->bc_blkno = blkno;
1652 bcp->bc_len = len;
1653 bcp->bc_refcount = 1;
1654 bcp->bc_next = NULL;
1655 *bucket = bcp;
1656 return;
1657 } 1634 }
1658 1635
1659 /* 1636 /*
1660 * The hash bucket is not empty, so search for duplicates of our 1637 * Insert an xfs_buf_cancel record into the hash table of them.
1661 * record. If we find one them just bump its refcount. If not 1638 * If there is already an identical record, bump its reference count.
1662 * then add us at the end of the list.
1663 */ 1639 */
1664 prevp = NULL; 1640 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1665 nextp = *bucket; 1641 list_for_each_entry(bcp, bucket, bc_list) {
1666 while (nextp != NULL) { 1642 if (bcp->bc_blkno == buf_f->blf_blkno &&
1667 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1643 bcp->bc_len == buf_f->blf_len) {
1668 nextp->bc_refcount++; 1644 bcp->bc_refcount++;
1669 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1645 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1670 return; 1646 return 0;
1671 } 1647 }
1672 prevp = nextp; 1648 }
1673 nextp = nextp->bc_next; 1649
1674 } 1650 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1675 ASSERT(prevp != NULL); 1651 bcp->bc_blkno = buf_f->blf_blkno;
1676 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1652 bcp->bc_len = buf_f->blf_len;
1677 KM_SLEEP);
1678 bcp->bc_blkno = blkno;
1679 bcp->bc_len = len;
1680 bcp->bc_refcount = 1; 1653 bcp->bc_refcount = 1;
1681 bcp->bc_next = NULL; 1654 list_add_tail(&bcp->bc_list, bucket);
1682 prevp->bc_next = bcp; 1655
1683 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1656 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1657 return 0;
1684} 1658}
1685 1659
1686/* 1660/*
@@ -1698,14 +1672,13 @@ xlog_recover_do_buffer_pass1(
1698 */ 1672 */
1699STATIC int 1673STATIC int
1700xlog_check_buffer_cancelled( 1674xlog_check_buffer_cancelled(
1701 xlog_t *log, 1675 struct log *log,
1702 xfs_daddr_t blkno, 1676 xfs_daddr_t blkno,
1703 uint len, 1677 uint len,
1704 ushort flags) 1678 ushort flags)
1705{ 1679{
1706 xfs_buf_cancel_t *bcp; 1680 struct list_head *bucket;
1707 xfs_buf_cancel_t *prevp; 1681 struct xfs_buf_cancel *bcp;
1708 xfs_buf_cancel_t **bucket;
1709 1682
1710 if (log->l_buf_cancel_table == NULL) { 1683 if (log->l_buf_cancel_table == NULL) {
1711 /* 1684 /*
@@ -1716,128 +1689,70 @@ xlog_check_buffer_cancelled(
1716 return 0; 1689 return 0;
1717 } 1690 }
1718 1691
1719 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1720 XLOG_BC_TABLE_SIZE];
1721 bcp = *bucket;
1722 if (bcp == NULL) {
1723 /*
1724 * There is no corresponding entry in the table built
1725 * in pass one, so this buffer has not been cancelled.
1726 */
1727 ASSERT(!(flags & XFS_BLF_CANCEL));
1728 return 0;
1729 }
1730
1731 /* 1692 /*
1732 * Search for an entry in the buffer cancel table that 1693 * Search for an entry in the cancel table that matches our buffer.
1733 * matches our buffer.
1734 */ 1694 */
1735 prevp = NULL; 1695 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1736 while (bcp != NULL) { 1696 list_for_each_entry(bcp, bucket, bc_list) {
1737 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1697 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1738 /* 1698 goto found;
1739 * We've go a match, so return 1 so that the
1740 * recovery of this buffer is cancelled.
1741 * If this buffer is actually a buffer cancel
1742 * log item, then decrement the refcount on the
1743 * one in the table and remove it if this is the
1744 * last reference.
1745 */
1746 if (flags & XFS_BLF_CANCEL) {
1747 bcp->bc_refcount--;
1748 if (bcp->bc_refcount == 0) {
1749 if (prevp == NULL) {
1750 *bucket = bcp->bc_next;
1751 } else {
1752 prevp->bc_next = bcp->bc_next;
1753 }
1754 kmem_free(bcp);
1755 }
1756 }
1757 return 1;
1758 }
1759 prevp = bcp;
1760 bcp = bcp->bc_next;
1761 } 1699 }
1700
1762 /* 1701 /*
1763 * We didn't find a corresponding entry in the table, so 1702 * We didn't find a corresponding entry in the table, so return 0 so
1764 * return 0 so that the buffer is NOT cancelled. 1703 * that the buffer is NOT cancelled.
1765 */ 1704 */
1766 ASSERT(!(flags & XFS_BLF_CANCEL)); 1705 ASSERT(!(flags & XFS_BLF_CANCEL));
1767 return 0; 1706 return 0;
1768}
1769 1707
1770STATIC int 1708found:
1771xlog_recover_do_buffer_pass2( 1709 /*
1772 	xlog_t			*log, 1710	 * We've got a match, so return 1 so that the recovery of this buffer
1773 xfs_buf_log_format_t *buf_f) 1711 * is cancelled. If this buffer is actually a buffer cancel log
1774{ 1712 * item, then decrement the refcount on the one in the table and
1775 xfs_daddr_t blkno = 0; 1713 * remove it if this is the last reference.
1776 ushort flags = 0; 1714 */
1777 uint len = 0; 1715 if (flags & XFS_BLF_CANCEL) {
1778 1716 if (--bcp->bc_refcount == 0) {
1779 switch (buf_f->blf_type) { 1717 list_del(&bcp->bc_list);
1780 case XFS_LI_BUF: 1718 kmem_free(bcp);
1781 blkno = buf_f->blf_blkno; 1719 }
1782 flags = buf_f->blf_flags;
1783 len = buf_f->blf_len;
1784 break;
1785 } 1720 }
1786 1721 return 1;
1787 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1788} 1722}
1789 1723
1790/* 1724/*
1791 * Perform recovery for a buffer full of inodes. In these buffers, 1725 * Perform recovery for a buffer full of inodes. In these buffers, the only
1792 * the only data which should be recovered is that which corresponds 1726 * data which should be recovered is that which corresponds to the
1793 * to the di_next_unlinked pointers in the on disk inode structures. 1727 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1794 * The rest of the data for the inodes is always logged through the 1728 * data for the inodes is always logged through the inodes themselves rather
1795 * inodes themselves rather than the inode buffer and is recovered 1729 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1796 * in xlog_recover_do_inode_trans().
1797 * 1730 *
1798 * The only time when buffers full of inodes are fully recovered is 1731 * The only time when buffers full of inodes are fully recovered is when the
1799 * when the buffer is full of newly allocated inodes. In this case 1732 * buffer is full of newly allocated inodes. In this case the buffer will
1800 * the buffer will not be marked as an inode buffer and so will be 1733 * not be marked as an inode buffer and so will be sent to
1801 * sent to xlog_recover_do_reg_buffer() below during recovery. 1734 * xlog_recover_do_reg_buffer() below during recovery.
1802 */ 1735 */
1803STATIC int 1736STATIC int
1804xlog_recover_do_inode_buffer( 1737xlog_recover_do_inode_buffer(
1805 xfs_mount_t *mp, 1738 struct xfs_mount *mp,
1806 xlog_recover_item_t *item, 1739 xlog_recover_item_t *item,
1807 xfs_buf_t *bp, 1740 struct xfs_buf *bp,
1808 xfs_buf_log_format_t *buf_f) 1741 xfs_buf_log_format_t *buf_f)
1809{ 1742{
1810 int i; 1743 int i;
1811 int item_index; 1744 int item_index = 0;
1812 int bit; 1745 int bit = 0;
1813 int nbits; 1746 int nbits = 0;
1814 int reg_buf_offset; 1747 int reg_buf_offset = 0;
1815 int reg_buf_bytes; 1748 int reg_buf_bytes = 0;
1816 int next_unlinked_offset; 1749 int next_unlinked_offset;
1817 int inodes_per_buf; 1750 int inodes_per_buf;
1818 xfs_agino_t *logged_nextp; 1751 xfs_agino_t *logged_nextp;
1819 xfs_agino_t *buffer_nextp; 1752 xfs_agino_t *buffer_nextp;
1820 unsigned int *data_map = NULL;
1821 unsigned int map_size = 0;
1822 1753
1823 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1754 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1824 1755
1825 switch (buf_f->blf_type) {
1826 case XFS_LI_BUF:
1827 data_map = buf_f->blf_data_map;
1828 map_size = buf_f->blf_map_size;
1829 break;
1830 }
1831 /*
1832 * Set the variables corresponding to the current region to
1833 * 0 so that we'll initialize them on the first pass through
1834 * the loop.
1835 */
1836 reg_buf_offset = 0;
1837 reg_buf_bytes = 0;
1838 bit = 0;
1839 nbits = 0;
1840 item_index = 0;
1841 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1756 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1842 for (i = 0; i < inodes_per_buf; i++) { 1757 for (i = 0; i < inodes_per_buf; i++) {
1843 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1758 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1767,18 @@ xlog_recover_do_inode_buffer(
1852 * the current di_next_unlinked field. 1767 * the current di_next_unlinked field.
1853 */ 1768 */
1854 bit += nbits; 1769 bit += nbits;
1855 bit = xfs_next_bit(data_map, map_size, bit); 1770 bit = xfs_next_bit(buf_f->blf_data_map,
1771 buf_f->blf_map_size, bit);
1856 1772
1857 /* 1773 /*
1858 * If there are no more logged regions in the 1774 * If there are no more logged regions in the
1859 * buffer, then we're done. 1775 * buffer, then we're done.
1860 */ 1776 */
1861 if (bit == -1) { 1777 if (bit == -1)
1862 return 0; 1778 return 0;
1863 }
1864 1779
1865 nbits = xfs_contig_bits(data_map, map_size, 1780 nbits = xfs_contig_bits(buf_f->blf_data_map,
1866 bit); 1781 buf_f->blf_map_size, bit);
1867 ASSERT(nbits > 0); 1782 ASSERT(nbits > 0);
1868 reg_buf_offset = bit << XFS_BLF_SHIFT; 1783 reg_buf_offset = bit << XFS_BLF_SHIFT;
1869 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1784 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1790,8 @@ xlog_recover_do_inode_buffer(
1875 * di_next_unlinked field, then move on to the next 1790 * di_next_unlinked field, then move on to the next
1876 * di_next_unlinked field. 1791 * di_next_unlinked field.
1877 */ 1792 */
1878 if (next_unlinked_offset < reg_buf_offset) { 1793 if (next_unlinked_offset < reg_buf_offset)
1879 continue; 1794 continue;
1880 }
1881 1795
1882 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1796 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1883 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1797 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1891,8 +1805,9 @@ xlog_recover_do_inode_buffer(
1891 logged_nextp = item->ri_buf[item_index].i_addr + 1805 logged_nextp = item->ri_buf[item_index].i_addr +
1892 next_unlinked_offset - reg_buf_offset; 1806 next_unlinked_offset - reg_buf_offset;
1893 if (unlikely(*logged_nextp == 0)) { 1807 if (unlikely(*logged_nextp == 0)) {
1894 xfs_fs_cmn_err(CE_ALERT, mp, 1808 xfs_alert(mp,
1895 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1809 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1810 "Trying to replay bad (0) inode di_next_unlinked field.",
1896 item, bp); 1811 item, bp);
1897 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1812 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1898 XFS_ERRLEVEL_LOW, mp); 1813 XFS_ERRLEVEL_LOW, mp);
@@ -1913,36 +1828,29 @@ xlog_recover_do_inode_buffer(
1913 * given buffer. The bitmap in the buf log format structure indicates 1828 * given buffer. The bitmap in the buf log format structure indicates
1914 * where to place the logged data. 1829 * where to place the logged data.
1915 */ 1830 */
1916/*ARGSUSED*/
1917STATIC void 1831STATIC void
1918xlog_recover_do_reg_buffer( 1832xlog_recover_do_reg_buffer(
1919 struct xfs_mount *mp, 1833 struct xfs_mount *mp,
1920 xlog_recover_item_t *item, 1834 xlog_recover_item_t *item,
1921 xfs_buf_t *bp, 1835 struct xfs_buf *bp,
1922 xfs_buf_log_format_t *buf_f) 1836 xfs_buf_log_format_t *buf_f)
1923{ 1837{
1924 int i; 1838 int i;
1925 int bit; 1839 int bit;
1926 int nbits; 1840 int nbits;
1927 unsigned int *data_map = NULL;
1928 unsigned int map_size = 0;
1929 int error; 1841 int error;
1930 1842
1931 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1843 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1932 1844
1933 switch (buf_f->blf_type) {
1934 case XFS_LI_BUF:
1935 data_map = buf_f->blf_data_map;
1936 map_size = buf_f->blf_map_size;
1937 break;
1938 }
1939 bit = 0; 1845 bit = 0;
1940 i = 1; /* 0 is the buf format structure */ 1846 i = 1; /* 0 is the buf format structure */
1941 while (1) { 1847 while (1) {
1942 bit = xfs_next_bit(data_map, map_size, bit); 1848 bit = xfs_next_bit(buf_f->blf_data_map,
1849 buf_f->blf_map_size, bit);
1943 if (bit == -1) 1850 if (bit == -1)
1944 break; 1851 break;
1945 nbits = xfs_contig_bits(data_map, map_size, bit); 1852 nbits = xfs_contig_bits(buf_f->blf_data_map,
1853 buf_f->blf_map_size, bit);
1946 ASSERT(nbits > 0); 1854 ASSERT(nbits > 0);
1947 ASSERT(item->ri_buf[i].i_addr != NULL); 1855 ASSERT(item->ri_buf[i].i_addr != NULL);
1948 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1856 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
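
Editor's note: after this change both xlog_recover_do_inode_buffer() and xlog_recover_do_reg_buffer() read blf_data_map/blf_map_size straight from the buf log format and walk it with xfs_next_bit()/xfs_contig_bits() to locate each logged region; bit << XFS_BLF_SHIFT gives the region's byte offset and nbits << XFS_BLF_SHIFT its length. The sketch below (not part of the patch) reimplements that walk in plain C: next_bit()/contig_bits() are simplified stand-ins for the kernel helpers, and the 128-byte chunk size (shift of 7) is an assumption for illustration.

/*
 * Sketch of the logged-region walk over a buf log item's dirty bitmap:
 * find the next set bit, count the contiguous run, convert to bytes.
 */
#include <stdio.h>

#define CHUNK_SHIFT 7	/* assumed stand-in for XFS_BLF_SHIFT */

/* Return index of next set bit at or after 'start', or -1 if none. */
static int next_bit(const unsigned int *map, int nbits_total, int start)
{
	for (int i = start; i < nbits_total; i++)
		if (map[i / 32] & (1u << (i % 32)))
			return i;
	return -1;
}

/* Count contiguous set bits starting at 'start'. */
static int contig_bits(const unsigned int *map, int nbits_total, int start)
{
	int n = 0;

	while (start + n < nbits_total &&
	       (map[(start + n) / 32] & (1u << ((start + n) % 32))))
		n++;
	return n;
}

int main(void)
{
	/* bits 0-2 and 8-9 set: two logged regions */
	unsigned int data_map[1] = { 0x307 };
	int map_bits = 32;
	int bit = 0;

	while ((bit = next_bit(data_map, map_bits, bit)) != -1) {
		int nbits = contig_bits(data_map, map_bits, bit);

		printf("region at byte %d, length %d bytes\n",
		       bit << CHUNK_SHIFT, nbits << CHUNK_SHIFT);
		bit += nbits;
	}
	return 0;
}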
@@ -1958,17 +1866,17 @@ xlog_recover_do_reg_buffer(
1958 if (buf_f->blf_flags & 1866 if (buf_f->blf_flags &
1959 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 1867 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1960 if (item->ri_buf[i].i_addr == NULL) { 1868 if (item->ri_buf[i].i_addr == NULL) {
1961 cmn_err(CE_ALERT, 1869 xfs_alert(mp,
1962 "XFS: NULL dquot in %s.", __func__); 1870 "XFS: NULL dquot in %s.", __func__);
1963 goto next; 1871 goto next;
1964 } 1872 }
1965 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 1873 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1966 cmn_err(CE_ALERT, 1874 xfs_alert(mp,
1967 "XFS: dquot too small (%d) in %s.", 1875 "XFS: dquot too small (%d) in %s.",
1968 item->ri_buf[i].i_len, __func__); 1876 item->ri_buf[i].i_len, __func__);
1969 goto next; 1877 goto next;
1970 } 1878 }
1971 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, 1879 error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1972 -1, 0, XFS_QMOPT_DOWARN, 1880 -1, 0, XFS_QMOPT_DOWARN,
1973 "dquot_buf_recover"); 1881 "dquot_buf_recover");
1974 if (error) 1882 if (error)
@@ -1993,6 +1901,7 @@ xlog_recover_do_reg_buffer(
1993 */ 1901 */
1994int 1902int
1995xfs_qm_dqcheck( 1903xfs_qm_dqcheck(
1904 struct xfs_mount *mp,
1996 xfs_disk_dquot_t *ddq, 1905 xfs_disk_dquot_t *ddq,
1997 xfs_dqid_t id, 1906 xfs_dqid_t id,
1998 uint type, /* used only when IO_dorepair is true */ 1907 uint type, /* used only when IO_dorepair is true */
@@ -2019,14 +1928,14 @@ xfs_qm_dqcheck(
2019 */ 1928 */
2020 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1929 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
2021 if (flags & XFS_QMOPT_DOWARN) 1930 if (flags & XFS_QMOPT_DOWARN)
2022 cmn_err(CE_ALERT, 1931 xfs_alert(mp,
2023 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1932 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2024 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1933 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
2025 errs++; 1934 errs++;
2026 } 1935 }
2027 if (ddq->d_version != XFS_DQUOT_VERSION) { 1936 if (ddq->d_version != XFS_DQUOT_VERSION) {
2028 if (flags & XFS_QMOPT_DOWARN) 1937 if (flags & XFS_QMOPT_DOWARN)
2029 cmn_err(CE_ALERT, 1938 xfs_alert(mp,
2030 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1939 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2031 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1940 str, id, ddq->d_version, XFS_DQUOT_VERSION);
2032 errs++; 1941 errs++;
@@ -2036,7 +1945,7 @@ xfs_qm_dqcheck(
2036 ddq->d_flags != XFS_DQ_PROJ && 1945 ddq->d_flags != XFS_DQ_PROJ &&
2037 ddq->d_flags != XFS_DQ_GROUP) { 1946 ddq->d_flags != XFS_DQ_GROUP) {
2038 if (flags & XFS_QMOPT_DOWARN) 1947 if (flags & XFS_QMOPT_DOWARN)
2039 cmn_err(CE_ALERT, 1948 xfs_alert(mp,
2040 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1949 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2041 str, id, ddq->d_flags); 1950 str, id, ddq->d_flags);
2042 errs++; 1951 errs++;
@@ -2044,7 +1953,7 @@ xfs_qm_dqcheck(
2044 1953
2045 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1954 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
2046 if (flags & XFS_QMOPT_DOWARN) 1955 if (flags & XFS_QMOPT_DOWARN)
2047 cmn_err(CE_ALERT, 1956 xfs_alert(mp,
2048 "%s : ondisk-dquot 0x%p, ID mismatch: " 1957 "%s : ondisk-dquot 0x%p, ID mismatch: "
2049 "0x%x expected, found id 0x%x", 1958 "0x%x expected, found id 0x%x",
2050 str, ddq, id, be32_to_cpu(ddq->d_id)); 1959 str, ddq, id, be32_to_cpu(ddq->d_id));
@@ -2057,9 +1966,8 @@ xfs_qm_dqcheck(
2057 be64_to_cpu(ddq->d_blk_softlimit)) { 1966 be64_to_cpu(ddq->d_blk_softlimit)) {
2058 if (!ddq->d_btimer) { 1967 if (!ddq->d_btimer) {
2059 if (flags & XFS_QMOPT_DOWARN) 1968 if (flags & XFS_QMOPT_DOWARN)
2060 cmn_err(CE_ALERT, 1969 xfs_alert(mp,
2061 "%s : Dquot ID 0x%x (0x%p) " 1970 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
2062 "BLK TIMER NOT STARTED",
2063 str, (int)be32_to_cpu(ddq->d_id), ddq); 1971 str, (int)be32_to_cpu(ddq->d_id), ddq);
2064 errs++; 1972 errs++;
2065 } 1973 }
@@ -2069,9 +1977,8 @@ xfs_qm_dqcheck(
2069 be64_to_cpu(ddq->d_ino_softlimit)) { 1977 be64_to_cpu(ddq->d_ino_softlimit)) {
2070 if (!ddq->d_itimer) { 1978 if (!ddq->d_itimer) {
2071 if (flags & XFS_QMOPT_DOWARN) 1979 if (flags & XFS_QMOPT_DOWARN)
2072 cmn_err(CE_ALERT, 1980 xfs_alert(mp,
2073 "%s : Dquot ID 0x%x (0x%p) " 1981 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
2074 "INODE TIMER NOT STARTED",
2075 str, (int)be32_to_cpu(ddq->d_id), ddq); 1982 str, (int)be32_to_cpu(ddq->d_id), ddq);
2076 errs++; 1983 errs++;
2077 } 1984 }
@@ -2081,9 +1988,8 @@ xfs_qm_dqcheck(
2081 be64_to_cpu(ddq->d_rtb_softlimit)) { 1988 be64_to_cpu(ddq->d_rtb_softlimit)) {
2082 if (!ddq->d_rtbtimer) { 1989 if (!ddq->d_rtbtimer) {
2083 if (flags & XFS_QMOPT_DOWARN) 1990 if (flags & XFS_QMOPT_DOWARN)
2084 cmn_err(CE_ALERT, 1991 xfs_alert(mp,
2085 "%s : Dquot ID 0x%x (0x%p) " 1992 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
2086 "RTBLK TIMER NOT STARTED",
2087 str, (int)be32_to_cpu(ddq->d_id), ddq); 1993 str, (int)be32_to_cpu(ddq->d_id), ddq);
2088 errs++; 1994 errs++;
2089 } 1995 }
@@ -2094,7 +2000,7 @@ xfs_qm_dqcheck(
2094 return errs; 2000 return errs;
2095 2001
2096 if (flags & XFS_QMOPT_DOWARN) 2002 if (flags & XFS_QMOPT_DOWARN)
2097 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2003 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2098 2004
2099 /* 2005 /*
2100 * Typically, a repair is only requested by quotacheck. 2006 * Typically, a repair is only requested by quotacheck.
@@ -2176,77 +2082,46 @@ xlog_recover_do_dquot_buffer(
2176 * for more details on the implementation of the table of cancel records. 2082 * for more details on the implementation of the table of cancel records.
2177 */ 2083 */
2178STATIC int 2084STATIC int
2179xlog_recover_do_buffer_trans( 2085xlog_recover_buffer_pass2(
2180 xlog_t *log, 2086 xlog_t *log,
2181 xlog_recover_item_t *item, 2087 xlog_recover_item_t *item)
2182 int pass)
2183{ 2088{
2184 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2089 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2185 xfs_mount_t *mp; 2090 xfs_mount_t *mp = log->l_mp;
2186 xfs_buf_t *bp; 2091 xfs_buf_t *bp;
2187 int error; 2092 int error;
2188 int cancel;
2189 xfs_daddr_t blkno;
2190 int len;
2191 ushort flags;
2192 uint buf_flags; 2093 uint buf_flags;
2193 2094
2194 if (pass == XLOG_RECOVER_PASS1) { 2095 /*
2195 /* 2096 * In this pass we only want to recover all the buffers which have
2196 * In this pass we're only looking for buf items 2097 * not been cancelled and are not cancellation buffers themselves.
2197 * with the XFS_BLF_CANCEL bit set. 2098 */
2198 */ 2099 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2199 xlog_recover_do_buffer_pass1(log, buf_f); 2100 buf_f->blf_len, buf_f->blf_flags)) {
2101 trace_xfs_log_recover_buf_cancel(log, buf_f);
2200 return 0; 2102 return 0;
2201 } else {
2202 /*
2203 * In this pass we want to recover all the buffers
2204 * which have not been cancelled and are not
2205 * cancellation buffers themselves. The routine
2206 * we call here will tell us whether or not to
2207 * continue with the replay of this buffer.
2208 */
2209 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2210 if (cancel) {
2211 trace_xfs_log_recover_buf_cancel(log, buf_f);
2212 return 0;
2213 }
2214 } 2103 }
2104
2215 trace_xfs_log_recover_buf_recover(log, buf_f); 2105 trace_xfs_log_recover_buf_recover(log, buf_f);
2216 switch (buf_f->blf_type) {
2217 case XFS_LI_BUF:
2218 blkno = buf_f->blf_blkno;
2219 len = buf_f->blf_len;
2220 flags = buf_f->blf_flags;
2221 break;
2222 default:
2223 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2224 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2225 buf_f->blf_type, log->l_mp->m_logname ?
2226 log->l_mp->m_logname : "internal");
2227 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2228 XFS_ERRLEVEL_LOW, log->l_mp);
2229 return XFS_ERROR(EFSCORRUPTED);
2230 }
2231 2106
2232 mp = log->l_mp;
2233 buf_flags = XBF_LOCK; 2107 buf_flags = XBF_LOCK;
2234 if (!(flags & XFS_BLF_INODE_BUF)) 2108 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2235 buf_flags |= XBF_MAPPED; 2109 buf_flags |= XBF_MAPPED;
2236 2110
2237 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2111 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2112 buf_flags);
2238 if (XFS_BUF_ISERROR(bp)) { 2113 if (XFS_BUF_ISERROR(bp)) {
2239 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2114 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2240 bp, blkno); 2115 bp, buf_f->blf_blkno);
2241 error = XFS_BUF_GETERROR(bp); 2116 error = XFS_BUF_GETERROR(bp);
2242 xfs_buf_relse(bp); 2117 xfs_buf_relse(bp);
2243 return error; 2118 return error;
2244 } 2119 }
2245 2120
2246 error = 0; 2121 error = 0;
2247 if (flags & XFS_BLF_INODE_BUF) { 2122 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2248 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2123 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2249 } else if (flags & 2124 } else if (buf_f->blf_flags &
2250 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2125 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2251 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2126 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2252 } else { 2127 } else {
@@ -2286,16 +2161,14 @@ xlog_recover_do_buffer_trans(
2286} 2161}
2287 2162
2288STATIC int 2163STATIC int
2289xlog_recover_do_inode_trans( 2164xlog_recover_inode_pass2(
2290 xlog_t *log, 2165 xlog_t *log,
2291 xlog_recover_item_t *item, 2166 xlog_recover_item_t *item)
2292 int pass)
2293{ 2167{
2294 xfs_inode_log_format_t *in_f; 2168 xfs_inode_log_format_t *in_f;
2295 xfs_mount_t *mp; 2169 xfs_mount_t *mp = log->l_mp;
2296 xfs_buf_t *bp; 2170 xfs_buf_t *bp;
2297 xfs_dinode_t *dip; 2171 xfs_dinode_t *dip;
2298 xfs_ino_t ino;
2299 int len; 2172 int len;
2300 xfs_caddr_t src; 2173 xfs_caddr_t src;
2301 xfs_caddr_t dest; 2174 xfs_caddr_t dest;
@@ -2305,10 +2178,6 @@ xlog_recover_do_inode_trans(
2305 xfs_icdinode_t *dicp; 2178 xfs_icdinode_t *dicp;
2306 int need_free = 0; 2179 int need_free = 0;
2307 2180
2308 if (pass == XLOG_RECOVER_PASS1) {
2309 return 0;
2310 }
2311
2312 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2181 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2313 in_f = item->ri_buf[0].i_addr; 2182 in_f = item->ri_buf[0].i_addr;
2314 } else { 2183 } else {
@@ -2318,8 +2187,6 @@ xlog_recover_do_inode_trans(
2318 if (error) 2187 if (error)
2319 goto error; 2188 goto error;
2320 } 2189 }
2321 ino = in_f->ilf_ino;
2322 mp = log->l_mp;
2323 2190
2324 /* 2191 /*
2325 * Inode buffers can be freed, look out for it, 2192 * Inode buffers can be freed, look out for it,
@@ -2352,10 +2219,10 @@ xlog_recover_do_inode_trans(
2352 */ 2219 */
2353 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { 2220 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2354 xfs_buf_relse(bp); 2221 xfs_buf_relse(bp);
2355 xfs_fs_cmn_err(CE_ALERT, mp, 2222 xfs_alert(mp,
2356 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2223 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2357 dip, bp, ino); 2224 __func__, dip, bp, in_f->ilf_ino);
2358 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2225 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2359 XFS_ERRLEVEL_LOW, mp); 2226 XFS_ERRLEVEL_LOW, mp);
2360 error = EFSCORRUPTED; 2227 error = EFSCORRUPTED;
2361 goto error; 2228 goto error;
@@ -2363,10 +2230,10 @@ xlog_recover_do_inode_trans(
2363 dicp = item->ri_buf[1].i_addr; 2230 dicp = item->ri_buf[1].i_addr;
2364 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2231 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2365 xfs_buf_relse(bp); 2232 xfs_buf_relse(bp);
2366 xfs_fs_cmn_err(CE_ALERT, mp, 2233 xfs_alert(mp,
2367 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2234 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2368 item, ino); 2235 __func__, item, in_f->ilf_ino);
2369 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2236 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2370 XFS_ERRLEVEL_LOW, mp); 2237 XFS_ERRLEVEL_LOW, mp);
2371 error = EFSCORRUPTED; 2238 error = EFSCORRUPTED;
2372 goto error; 2239 goto error;
@@ -2394,12 +2261,13 @@ xlog_recover_do_inode_trans(
2394 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2261 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2395 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2262 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2396 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2263 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2397 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2264 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2398 XFS_ERRLEVEL_LOW, mp, dicp); 2265 XFS_ERRLEVEL_LOW, mp, dicp);
2399 xfs_buf_relse(bp); 2266 xfs_buf_relse(bp);
2400 xfs_fs_cmn_err(CE_ALERT, mp, 2267 xfs_alert(mp,
2401 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2268 "%s: Bad regular inode log record, rec ptr 0x%p, "
2402 item, dip, bp, ino); 2269 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2270 __func__, item, dip, bp, in_f->ilf_ino);
2403 error = EFSCORRUPTED; 2271 error = EFSCORRUPTED;
2404 goto error; 2272 goto error;
2405 } 2273 }
@@ -2407,45 +2275,48 @@ xlog_recover_do_inode_trans(
2407 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2275 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2408 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2276 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2409 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2277 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2410 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2278 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2411 XFS_ERRLEVEL_LOW, mp, dicp); 2279 XFS_ERRLEVEL_LOW, mp, dicp);
2412 xfs_buf_relse(bp); 2280 xfs_buf_relse(bp);
2413 xfs_fs_cmn_err(CE_ALERT, mp, 2281 xfs_alert(mp,
2414 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2282 "%s: Bad dir inode log record, rec ptr 0x%p, "
2415 item, dip, bp, ino); 2283 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2284 __func__, item, dip, bp, in_f->ilf_ino);
2416 error = EFSCORRUPTED; 2285 error = EFSCORRUPTED;
2417 goto error; 2286 goto error;
2418 } 2287 }
2419 } 2288 }
2420 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2289 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2421 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2290 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2422 XFS_ERRLEVEL_LOW, mp, dicp); 2291 XFS_ERRLEVEL_LOW, mp, dicp);
2423 xfs_buf_relse(bp); 2292 xfs_buf_relse(bp);
2424 xfs_fs_cmn_err(CE_ALERT, mp, 2293 xfs_alert(mp,
2425 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2294 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2426 item, dip, bp, ino, 2295 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2296 __func__, item, dip, bp, in_f->ilf_ino,
2427 dicp->di_nextents + dicp->di_anextents, 2297 dicp->di_nextents + dicp->di_anextents,
2428 dicp->di_nblocks); 2298 dicp->di_nblocks);
2429 error = EFSCORRUPTED; 2299 error = EFSCORRUPTED;
2430 goto error; 2300 goto error;
2431 } 2301 }
2432 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2302 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2433 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2303 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2434 XFS_ERRLEVEL_LOW, mp, dicp); 2304 XFS_ERRLEVEL_LOW, mp, dicp);
2435 xfs_buf_relse(bp); 2305 xfs_buf_relse(bp);
2436 xfs_fs_cmn_err(CE_ALERT, mp, 2306 xfs_alert(mp,
2437 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2307 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2438 item, dip, bp, ino, dicp->di_forkoff); 2308 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2309 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2439 error = EFSCORRUPTED; 2310 error = EFSCORRUPTED;
2440 goto error; 2311 goto error;
2441 } 2312 }
2442 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2313 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2443 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2314 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2444 XFS_ERRLEVEL_LOW, mp, dicp); 2315 XFS_ERRLEVEL_LOW, mp, dicp);
2445 xfs_buf_relse(bp); 2316 xfs_buf_relse(bp);
2446 xfs_fs_cmn_err(CE_ALERT, mp, 2317 xfs_alert(mp,
2447 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", 2318 "%s: Bad inode log record length %d, rec ptr 0x%p",
2448 item->ri_buf[1].i_len, item); 2319 __func__, item->ri_buf[1].i_len, item);
2449 error = EFSCORRUPTED; 2320 error = EFSCORRUPTED;
2450 goto error; 2321 goto error;
2451 } 2322 }
@@ -2532,7 +2403,7 @@ xlog_recover_do_inode_trans(
2532 break; 2403 break;
2533 2404
2534 default: 2405 default:
2535 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2406 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2536 ASSERT(0); 2407 ASSERT(0);
2537 xfs_buf_relse(bp); 2408 xfs_buf_relse(bp);
2538 error = EIO; 2409 error = EIO;
@@ -2556,18 +2427,11 @@ error:
2556 * of that type. 2427 * of that type.
2557 */ 2428 */
2558STATIC int 2429STATIC int
2559xlog_recover_do_quotaoff_trans( 2430xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2431 xlog_t *log,
2561 xlog_recover_item_t *item, 2432 xlog_recover_item_t *item)
2562 int pass)
2563{ 2433{
2564 xfs_qoff_logformat_t *qoff_f; 2434 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2435 ASSERT(qoff_f);
2572 2436
2573 /* 2437 /*
@@ -2588,22 +2452,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2452 * Recover a dquot record
2589 */ 2453 */
2590STATIC int 2454STATIC int
2591xlog_recover_do_dquot_trans( 2455xlog_recover_dquot_pass2(
2592 xlog_t *log, 2456 xlog_t *log,
2593 xlog_recover_item_t *item, 2457 xlog_recover_item_t *item)
2594 int pass)
2595{ 2458{
2596 xfs_mount_t *mp; 2459 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2460 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2461 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2462 int error;
2600 xfs_dq_logformat_t *dq_f; 2463 xfs_dq_logformat_t *dq_f;
2601 uint type; 2464 uint type;
2602 2465
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2466
2608 /* 2467 /*
2609 * Filesystems are required to send in quota flags at mount time. 2468 * Filesystems are required to send in quota flags at mount time.
@@ -2613,13 +2472,11 @@ xlog_recover_do_dquot_trans(
2613 2472
2614 recddq = item->ri_buf[1].i_addr; 2473 recddq = item->ri_buf[1].i_addr;
2615 if (recddq == NULL) { 2474 if (recddq == NULL) {
2616 cmn_err(CE_ALERT, 2475 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2617 "XFS: NULL dquot in %s.", __func__);
2618 return XFS_ERROR(EIO); 2476 return XFS_ERROR(EIO);
2619 } 2477 }
2620 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2478 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2621 cmn_err(CE_ALERT, 2479 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2622 "XFS: dquot too small (%d) in %s.",
2623 item->ri_buf[1].i_len, __func__); 2480 item->ri_buf[1].i_len, __func__);
2624 return XFS_ERROR(EIO); 2481 return XFS_ERROR(EIO);
2625 } 2482 }
@@ -2644,12 +2501,10 @@ xlog_recover_do_dquot_trans(
2644 */ 2501 */
2645 dq_f = item->ri_buf[0].i_addr; 2502 dq_f = item->ri_buf[0].i_addr;
2646 ASSERT(dq_f); 2503 ASSERT(dq_f);
2647 if ((error = xfs_qm_dqcheck(recddq, 2504 error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2648 dq_f->qlf_id, 2505 "xlog_recover_dquot_pass2 (log copy)");
2649 0, XFS_QMOPT_DOWARN, 2506 if (error)
2650 "xlog_recover_do_dquot_trans (log copy)"))) {
2651 return XFS_ERROR(EIO); 2507 return XFS_ERROR(EIO);
2652 }
2653 ASSERT(dq_f->qlf_len == 1); 2508 ASSERT(dq_f->qlf_len == 1);
2654 2509
2655 error = xfs_read_buf(mp, mp->m_ddev_targp, 2510 error = xfs_read_buf(mp, mp->m_ddev_targp,
@@ -2669,8 +2524,9 @@ xlog_recover_do_dquot_trans(
2669 * was among a chunk of dquots created earlier, and we did some 2524 * was among a chunk of dquots created earlier, and we did some
2670 * minimal initialization then. 2525 * minimal initialization then.
2671 */ 2526 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2527 error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2528 "xlog_recover_dquot_pass2");
2529 if (error) {
2674 xfs_buf_relse(bp); 2530 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2531 return XFS_ERROR(EIO);
2676 } 2532 }
@@ -2693,38 +2549,31 @@ xlog_recover_do_dquot_trans(
2693 * LSN. 2549 * LSN.
2694 */ 2550 */
2695STATIC int 2551STATIC int
2696xlog_recover_do_efi_trans( 2552xlog_recover_efi_pass2(
2697 xlog_t *log, 2553 xlog_t *log,
2698 xlog_recover_item_t *item, 2554 xlog_recover_item_t *item,
2699 xfs_lsn_t lsn, 2555 xfs_lsn_t lsn)
2700 int pass)
2701{ 2556{
2702 int error; 2557 int error;
2703 xfs_mount_t *mp; 2558 xfs_mount_t *mp = log->l_mp;
2704 xfs_efi_log_item_t *efip; 2559 xfs_efi_log_item_t *efip;
2705 xfs_efi_log_format_t *efi_formatp; 2560 xfs_efi_log_format_t *efi_formatp;
2706 2561
2707 if (pass == XLOG_RECOVER_PASS1) {
2708 return 0;
2709 }
2710
2711 efi_formatp = item->ri_buf[0].i_addr; 2562 efi_formatp = item->ri_buf[0].i_addr;
2712 2563
2713 mp = log->l_mp;
2714 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2564 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2715 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2565 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2716 &(efip->efi_format)))) { 2566 &(efip->efi_format)))) {
2717 xfs_efi_item_free(efip); 2567 xfs_efi_item_free(efip);
2718 return error; 2568 return error;
2719 } 2569 }
2720 efip->efi_next_extent = efi_formatp->efi_nextents; 2570 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2721 efip->efi_flags |= XFS_EFI_COMMITTED;
2722 2571
2723 spin_lock(&log->l_ailp->xa_lock); 2572 spin_lock(&log->l_ailp->xa_lock);
2724 /* 2573 /*
2725 * xfs_trans_ail_update() drops the AIL lock. 2574 * xfs_trans_ail_update() drops the AIL lock.
2726 */ 2575 */
2727 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2576 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2728 return 0; 2577 return 0;
2729} 2578}
2730 2579
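[editor's note] In the EFI hunk above, the pass argument and the old XFS_EFI_COMMITTED flag disappear, the remaining-extent count becomes an atomic_t, and the item is inserted into the AIL via its embedded log item. A condensed reading of the new xlog_recover_efi_pass2() body, reassembled from the right-hand column (context such as mp = log->l_mp is assumed):

	xfs_efi_log_format_t	*efi_formatp = item->ri_buf[0].i_addr;
	xfs_efi_log_item_t	*efip;
	int			error;

	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
	error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
	if (error) {
		xfs_efi_item_free(efip);
		return error;
	}
	/* remaining extent count is now an atomic_t, not a plain integer */
	atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);

	/* queue the intent in the AIL; xfs_trans_ail_update() drops xa_lock */
	spin_lock(&log->l_ailp->xa_lock);
	xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);

The matching xlog_recover_efd_pass2() in the next hunk then walks the AIL with a cursor and removes the EFI whose efi_id matches the EFD, so only unmatched intents survive to xlog_recover_process_efis().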
@@ -2737,11 +2586,10 @@ xlog_recover_do_efi_trans(
2737 * efd format structure. If we find it, we remove the efi from the 2586 * efd format structure. If we find it, we remove the efi from the
2738 * AIL and free it. 2587 * AIL and free it.
2739 */ 2588 */
2740STATIC void 2589STATIC int
2741xlog_recover_do_efd_trans( 2590xlog_recover_efd_pass2(
2742 xlog_t *log, 2591 xlog_t *log,
2743 xlog_recover_item_t *item, 2592 xlog_recover_item_t *item)
2744 int pass)
2745{ 2593{
2746 xfs_efd_log_format_t *efd_formatp; 2594 xfs_efd_log_format_t *efd_formatp;
2747 xfs_efi_log_item_t *efip = NULL; 2595 xfs_efi_log_item_t *efip = NULL;
@@ -2750,10 +2598,6 @@ xlog_recover_do_efd_trans(
2750 struct xfs_ail_cursor cur; 2598 struct xfs_ail_cursor cur;
2751 struct xfs_ail *ailp = log->l_ailp; 2599 struct xfs_ail *ailp = log->l_ailp;
2752 2600
2753 if (pass == XLOG_RECOVER_PASS1) {
2754 return;
2755 }
2756
2757 efd_formatp = item->ri_buf[0].i_addr; 2601 efd_formatp = item->ri_buf[0].i_addr;
2758 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2602 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2759 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2603 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2629,6 @@ xlog_recover_do_efd_trans(
2785 } 2629 }
2786 xfs_trans_ail_cursor_done(ailp, &cur); 2630 xfs_trans_ail_cursor_done(ailp, &cur);
2787 spin_unlock(&ailp->xa_lock); 2631 spin_unlock(&ailp->xa_lock);
2788}
2789
2790/*
2791 * Perform the transaction
2792 *
2793 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2794 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2795 */
2796STATIC int
2797xlog_recover_do_trans(
2798 xlog_t *log,
2799 xlog_recover_t *trans,
2800 int pass)
2801{
2802 int error = 0;
2803 xlog_recover_item_t *item;
2804
2805 error = xlog_recover_reorder_trans(log, trans, pass);
2806 if (error)
2807 return error;
2808
2809 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2810 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2811 switch (ITEM_TYPE(item)) {
2812 case XFS_LI_BUF:
2813 error = xlog_recover_do_buffer_trans(log, item, pass);
2814 break;
2815 case XFS_LI_INODE:
2816 error = xlog_recover_do_inode_trans(log, item, pass);
2817 break;
2818 case XFS_LI_EFI:
2819 error = xlog_recover_do_efi_trans(log, item,
2820 trans->r_lsn, pass);
2821 break;
2822 case XFS_LI_EFD:
2823 xlog_recover_do_efd_trans(log, item, pass);
2824 error = 0;
2825 break;
2826 case XFS_LI_DQUOT:
2827 error = xlog_recover_do_dquot_trans(log, item, pass);
2828 break;
2829 case XFS_LI_QUOTAOFF:
2830 error = xlog_recover_do_quotaoff_trans(log, item,
2831 pass);
2832 break;
2833 default:
2834 xlog_warn(
2835 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2836 ASSERT(0);
2837 error = XFS_ERROR(EIO);
2838 break;
2839 }
2840
2841 if (error)
2842 return error;
2843 }
2844 2632
2845 return 0; 2633 return 0;
2846} 2634}
@@ -2852,7 +2640,7 @@ xlog_recover_do_trans(
2852 */ 2640 */
2853STATIC void 2641STATIC void
2854xlog_recover_free_trans( 2642xlog_recover_free_trans(
2855 xlog_recover_t *trans) 2643 struct xlog_recover *trans)
2856{ 2644{
2857 xlog_recover_item_t *item, *n; 2645 xlog_recover_item_t *item, *n;
2858 int i; 2646 int i;
@@ -2871,26 +2659,103 @@ xlog_recover_free_trans(
2871} 2659}
2872 2660
2873STATIC int 2661STATIC int
2662xlog_recover_commit_pass1(
2663 struct log *log,
2664 struct xlog_recover *trans,
2665 xlog_recover_item_t *item)
2666{
2667 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2668
2669 switch (ITEM_TYPE(item)) {
2670 case XFS_LI_BUF:
2671 return xlog_recover_buffer_pass1(log, item);
2672 case XFS_LI_QUOTAOFF:
2673 return xlog_recover_quotaoff_pass1(log, item);
2674 case XFS_LI_INODE:
2675 case XFS_LI_EFI:
2676 case XFS_LI_EFD:
2677 case XFS_LI_DQUOT:
2678 /* nothing to do in pass 1 */
2679 return 0;
2680 default:
2681 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2682 __func__, ITEM_TYPE(item));
2683 ASSERT(0);
2684 return XFS_ERROR(EIO);
2685 }
2686}
2687
2688STATIC int
2689xlog_recover_commit_pass2(
2690 struct log *log,
2691 struct xlog_recover *trans,
2692 xlog_recover_item_t *item)
2693{
2694 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2695
2696 switch (ITEM_TYPE(item)) {
2697 case XFS_LI_BUF:
2698 return xlog_recover_buffer_pass2(log, item);
2699 case XFS_LI_INODE:
2700 return xlog_recover_inode_pass2(log, item);
2701 case XFS_LI_EFI:
2702 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2703 case XFS_LI_EFD:
2704 return xlog_recover_efd_pass2(log, item);
2705 case XFS_LI_DQUOT:
2706 return xlog_recover_dquot_pass2(log, item);
2707 case XFS_LI_QUOTAOFF:
2708 /* nothing to do in pass2 */
2709 return 0;
2710 default:
2711 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2712 __func__, ITEM_TYPE(item));
2713 ASSERT(0);
2714 return XFS_ERROR(EIO);
2715 }
2716}
2717
2718/*
2719 * Perform the transaction.
2720 *
2721 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2722 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2723 */
2724STATIC int
2874xlog_recover_commit_trans( 2725xlog_recover_commit_trans(
2875 xlog_t *log, 2726 struct log *log,
2876 xlog_recover_t *trans, 2727 struct xlog_recover *trans,
2877 int pass) 2728 int pass)
2878{ 2729{
2879 int error; 2730 int error = 0;
2731 xlog_recover_item_t *item;
2880 2732
2881 hlist_del(&trans->r_list); 2733 hlist_del(&trans->r_list);
2882 if ((error = xlog_recover_do_trans(log, trans, pass))) 2734
2735 error = xlog_recover_reorder_trans(log, trans, pass);
2736 if (error)
2883 return error; 2737 return error;
2884 xlog_recover_free_trans(trans); /* no error */ 2738
2739 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2740 if (pass == XLOG_RECOVER_PASS1)
2741 error = xlog_recover_commit_pass1(log, trans, item);
2742 else
2743 error = xlog_recover_commit_pass2(log, trans, item);
2744 if (error)
2745 return error;
2746 }
2747
2748 xlog_recover_free_trans(trans);
2885 return 0; 2749 return 0;
2886} 2750}
2887 2751
2888STATIC int 2752STATIC int
2889xlog_recover_unmount_trans( 2753xlog_recover_unmount_trans(
2754 struct log *log,
2890 xlog_recover_t *trans) 2755 xlog_recover_t *trans)
2891{ 2756{
2892 /* Do nothing now */ 2757 /* Do nothing now */
2893 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2758 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2894 return 0; 2759 return 0;
2895} 2760}
2896 2761
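[editor's note] The structural change in the hunk above: instead of every item handler taking a pass argument and returning early for the pass it does not care about, xlog_recover_commit_trans() now dispatches each item to a pass-specific helper. Pass 1 only records the state needed to filter pass-2 replay; everything else waits. A condensed sketch of the pass-1 dispatch, paraphrasing xlog_recover_commit_pass1() as introduced here:

	/* pass 1 only records state needed to filter pass 2 replay:
	 *  - cancelled buffer log items go into l_buf_cancel_table
	 *  - quotaoff items note which quota types must not be replayed
	 * inode, EFI, EFD and dquot items are untouched until pass 2 */
	switch (ITEM_TYPE(item)) {
	case XFS_LI_BUF:
		return xlog_recover_buffer_pass1(log, item);
	case XFS_LI_QUOTAOFF:
		return xlog_recover_quotaoff_pass1(log, item);
	default:
		return 0;
	}

xlog_recover_commit_pass2() is the mirror image: it replays buffers, inodes, EFIs, EFDs and dquots, and treats quotaoff as a no-op.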
@@ -2933,8 +2798,8 @@ xlog_recover_process_data(
2933 dp += sizeof(xlog_op_header_t); 2798 dp += sizeof(xlog_op_header_t);
2934 if (ohead->oh_clientid != XFS_TRANSACTION && 2799 if (ohead->oh_clientid != XFS_TRANSACTION &&
2935 ohead->oh_clientid != XFS_LOG) { 2800 ohead->oh_clientid != XFS_LOG) {
2936 xlog_warn( 2801 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2937 "XFS: xlog_recover_process_data: bad clientid"); 2802 __func__, ohead->oh_clientid);
2938 ASSERT(0); 2803 ASSERT(0);
2939 return (XFS_ERROR(EIO)); 2804 return (XFS_ERROR(EIO));
2940 } 2805 }
@@ -2947,8 +2812,8 @@ xlog_recover_process_data(
2947 be64_to_cpu(rhead->h_lsn)); 2812 be64_to_cpu(rhead->h_lsn));
2948 } else { 2813 } else {
2949 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2814 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2950 xlog_warn( 2815 xfs_warn(log->l_mp, "%s: bad length 0x%x",
2951 "XFS: xlog_recover_process_data: bad length"); 2816 __func__, be32_to_cpu(ohead->oh_len));
2952 WARN_ON(1); 2817 WARN_ON(1);
2953 return (XFS_ERROR(EIO)); 2818 return (XFS_ERROR(EIO));
2954 } 2819 }
@@ -2961,7 +2826,7 @@ xlog_recover_process_data(
2961 trans, pass); 2826 trans, pass);
2962 break; 2827 break;
2963 case XLOG_UNMOUNT_TRANS: 2828 case XLOG_UNMOUNT_TRANS:
2964 error = xlog_recover_unmount_trans(trans); 2829 error = xlog_recover_unmount_trans(log, trans);
2965 break; 2830 break;
2966 case XLOG_WAS_CONT_TRANS: 2831 case XLOG_WAS_CONT_TRANS:
2967 error = xlog_recover_add_to_cont_trans(log, 2832 error = xlog_recover_add_to_cont_trans(log,
@@ -2969,8 +2834,8 @@ xlog_recover_process_data(
2969 be32_to_cpu(ohead->oh_len)); 2834 be32_to_cpu(ohead->oh_len));
2970 break; 2835 break;
2971 case XLOG_START_TRANS: 2836 case XLOG_START_TRANS:
2972 xlog_warn( 2837 xfs_warn(log->l_mp, "%s: bad transaction",
2973 "XFS: xlog_recover_process_data: bad transaction"); 2838 __func__);
2974 ASSERT(0); 2839 ASSERT(0);
2975 error = XFS_ERROR(EIO); 2840 error = XFS_ERROR(EIO);
2976 break; 2841 break;
@@ -2980,8 +2845,8 @@ xlog_recover_process_data(
2980 dp, be32_to_cpu(ohead->oh_len)); 2845 dp, be32_to_cpu(ohead->oh_len));
2981 break; 2846 break;
2982 default: 2847 default:
2983 xlog_warn( 2848 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2984 "XFS: xlog_recover_process_data: bad flag"); 2849 __func__, flags);
2985 ASSERT(0); 2850 ASSERT(0);
2986 error = XFS_ERROR(EIO); 2851 error = XFS_ERROR(EIO);
2987 break; 2852 break;
@@ -3011,7 +2876,7 @@ xlog_recover_process_efi(
3011 xfs_extent_t *extp; 2876 xfs_extent_t *extp;
3012 xfs_fsblock_t startblock_fsb; 2877 xfs_fsblock_t startblock_fsb;
3013 2878
3014 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2879 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3015 2880
3016 /* 2881 /*
3017 * First check the validity of the extents described by the 2882 * First check the validity of the extents described by the
@@ -3050,7 +2915,7 @@ xlog_recover_process_efi(
3050 extp->ext_len); 2915 extp->ext_len);
3051 } 2916 }
3052 2917
3053 efip->efi_flags |= XFS_EFI_RECOVERED; 2918 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3054 error = xfs_trans_commit(tp, 0); 2919 error = xfs_trans_commit(tp, 0);
3055 return error; 2920 return error;
3056 2921
@@ -3107,7 +2972,7 @@ xlog_recover_process_efis(
3107 * Skip EFIs that we've already processed. 2972 * Skip EFIs that we've already processed.
3108 */ 2973 */
3109 efip = (xfs_efi_log_item_t *)lip; 2974 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2975 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3111 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2976 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 2977 continue;
3113 } 2978 }
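[editor's note] XFS_EFI_RECOVERED moves from a plain bitmask OR/test on efi_flags to set_bit()/test_bit(), i.e. efi_flags is now treated as an unsigned long bit field updated with atomic bitops rather than under an external lock. A generic sketch of the pattern; the structure, bit number and function names below are illustrative, not taken from xfs_extfree_item.h:

	#include <linux/bitops.h>

	#define MY_ITEM_RECOVERED	1	/* a bit number, not a mask */

	struct my_item {
		unsigned long	flags;
	};

	static void mark_recovered(struct my_item *ip)
	{
		/* atomic read-modify-write, safe against concurrent updaters */
		set_bit(MY_ITEM_RECOVERED, &ip->flags);
	}

	static int is_recovered(struct my_item *ip)
	{
		return test_bit(MY_ITEM_RECOVERED, &ip->flags);
	}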
@@ -3166,8 +3031,7 @@ xlog_recover_clear_agi_bucket(
3166out_abort: 3031out_abort:
3167 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3032 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3168out_error: 3033out_error:
3169 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " 3034 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3170 "failed to clear agi %d. Continuing.", agno);
3171 return; 3035 return;
3172} 3036}
3173 3037
@@ -3418,7 +3282,7 @@ xlog_valid_rec_header(
3418 if (unlikely( 3282 if (unlikely(
3419 (!rhead->h_version || 3283 (!rhead->h_version ||
3420 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3421 xlog_warn("XFS: %s: unrecognised log version (%d).", 3285 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3422 __func__, be32_to_cpu(rhead->h_version)); 3286 __func__, be32_to_cpu(rhead->h_version));
3423 return XFS_ERROR(EIO); 3287 return XFS_ERROR(EIO);
3424 } 3288 }
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
3724 xfs_daddr_t head_blk, 3588 xfs_daddr_t head_blk,
3725 xfs_daddr_t tail_blk) 3589 xfs_daddr_t tail_blk)
3726{ 3590{
3727 int error; 3591 int error, i;
3728 3592
3729 ASSERT(head_blk != tail_blk); 3593 ASSERT(head_blk != tail_blk);
3730 3594
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
3732 * First do a pass to find all of the cancelled buf log items. 3596 * First do a pass to find all of the cancelled buf log items.
3733 * Store them in the buf_cancel_table for use in the second pass. 3597 * Store them in the buf_cancel_table for use in the second pass.
3734 */ 3598 */
3735 log->l_buf_cancel_table = 3599 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3736 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3600 sizeof(struct list_head),
3737 sizeof(xfs_buf_cancel_t*),
3738 KM_SLEEP); 3601 KM_SLEEP);
3602 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3603 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3604
3739 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3605 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3740 XLOG_RECOVER_PASS1); 3606 XLOG_RECOVER_PASS1);
3741 if (error != 0) { 3607 if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
3754 int i; 3620 int i;
3755 3621
3756 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3622 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3757 ASSERT(log->l_buf_cancel_table[i] == NULL); 3623 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3758 } 3624 }
3759#endif /* DEBUG */ 3625#endif /* DEBUG */
3760 3626
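[editor's note] l_buf_cancel_table changes from an array of xfs_buf_cancel_t pointers (hand-chained entries) to an array of struct list_head buckets, which is why the DEBUG check becomes list_empty() instead of a NULL test. A rough sketch of how such a table is typically used; the entry layout and the modulo bucket selection below are assumptions for illustration, the real definitions live in xfs_log_recover.c and xfs_log_priv.h outside this hunk:

	#include <linux/list.h>

	struct xfs_buf_cancel {			/* illustrative layout only */
		xfs_daddr_t		bc_blkno;
		uint			bc_len;
		int			bc_refcount;
		struct list_head	bc_list;
	};

	/* allocation + init, as in xlog_do_log_recovery() above */
	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
					      sizeof(struct list_head), KM_SLEEP);
	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);

	/* lookup then becomes a plain list walk over one bucket */
	struct list_head	*bucket;
	struct xfs_buf_cancel	*bcp;

	bucket = &log->l_buf_cancel_table[blkno % XLOG_BC_TABLE_SIZE];
	list_for_each_entry(bcp, bucket, bc_list)
		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
			return bcp;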
@@ -3874,10 +3740,9 @@ xlog_recover(
3874 return error; 3740 return error;
3875 } 3741 }
3876 3742
3877 cmn_err(CE_NOTE, 3743 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3878 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3744 log->l_mp->m_logname ? log->l_mp->m_logname
3879 log->l_mp->m_fsname, log->l_mp->m_logname ? 3745 : "internal");
3880 log->l_mp->m_logname : "internal");
3881 3746
3882 error = xlog_do_recover(log, head_blk, tail_blk); 3747 error = xlog_do_recover(log, head_blk, tail_blk);
3883 log->l_flags |= XLOG_RECOVERY_NEEDED; 3748 log->l_flags |= XLOG_RECOVERY_NEEDED;
@@ -3910,9 +3775,7 @@ xlog_recover_finish(
3910 int error; 3775 int error;
3911 error = xlog_recover_process_efis(log); 3776 error = xlog_recover_process_efis(log);
3912 if (error) { 3777 if (error) {
3913 cmn_err(CE_ALERT, 3778 xfs_alert(log->l_mp, "Failed to recover EFIs");
3914 "Failed to recover EFIs on filesystem: %s",
3915 log->l_mp->m_fsname);
3916 return error; 3779 return error;
3917 } 3780 }
3918 /* 3781 /*
@@ -3927,15 +3790,12 @@ xlog_recover_finish(
3927 3790
3928 xlog_recover_check_summary(log); 3791 xlog_recover_check_summary(log);
3929 3792
3930 cmn_err(CE_NOTE, 3793 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3931 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3794 log->l_mp->m_logname ? log->l_mp->m_logname
3932 log->l_mp->m_fsname, log->l_mp->m_logname ? 3795 : "internal");
3933 log->l_mp->m_logname : "internal");
3934 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3796 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3935 } else { 3797 } else {
3936 cmn_err(CE_DEBUG, 3798 xfs_info(log->l_mp, "Ending clean mount");
3937 "!Ending clean XFS mount for filesystem: %s\n",
3938 log->l_mp->m_fsname);
3939 } 3799 }
3940 return 0; 3800 return 0;
3941} 3801}
@@ -3968,10 +3828,8 @@ xlog_recover_check_summary(
3968 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3828 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3969 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3829 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3970 if (error) { 3830 if (error) {
3971 xfs_fs_cmn_err(CE_ALERT, mp, 3831 xfs_alert(mp, "%s agf read failed agno %d error %d",
3972 "xlog_recover_check_summary(agf)" 3832 __func__, agno, error);
3973 "agf read failed agno %d error %d",
3974 agno, error);
3975 } else { 3833 } else {
3976 agfp = XFS_BUF_TO_AGF(agfbp); 3834 agfp = XFS_BUF_TO_AGF(agfbp);
3977 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3835 freeblks += be32_to_cpu(agfp->agf_freeblks) +
@@ -3980,7 +3838,10 @@ xlog_recover_check_summary(
3980 } 3838 }
3981 3839
3982 error = xfs_read_agi(mp, NULL, agno, &agibp); 3840 error = xfs_read_agi(mp, NULL, agno, &agibp);
3983 if (!error) { 3841 if (error) {
3842 xfs_alert(mp, "%s agi read failed agno %d error %d",
3843 __func__, agno, error);
3844 } else {
3984 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3845 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3985 3846
3986 itotal += be32_to_cpu(agi->agi_count); 3847 itotal += be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 19e9dfa1c254..bb3f9a7b24ed 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -133,9 +133,7 @@ xfs_uuid_mount(
133 return 0; 133 return 0;
134 134
135 if (uuid_is_nil(uuid)) { 135 if (uuid_is_nil(uuid)) {
136 cmn_err(CE_WARN, 136 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
137 "XFS: Filesystem %s has nil UUID - can't mount",
138 mp->m_fsname);
139 return XFS_ERROR(EINVAL); 137 return XFS_ERROR(EINVAL);
140 } 138 }
141 139
@@ -163,8 +161,7 @@ xfs_uuid_mount(
163 161
164 out_duplicate: 162 out_duplicate:
165 mutex_unlock(&xfs_uuid_table_mutex); 163 mutex_unlock(&xfs_uuid_table_mutex);
166 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount", 164 xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
167 mp->m_fsname);
168 return XFS_ERROR(EINVAL); 165 return XFS_ERROR(EINVAL);
169} 166}
170 167
@@ -311,6 +308,8 @@ xfs_mount_validate_sb(
311 xfs_sb_t *sbp, 308 xfs_sb_t *sbp,
312 int flags) 309 int flags)
313{ 310{
311 int loud = !(flags & XFS_MFSI_QUIET);
312
314 /* 313 /*
315 * If the log device and data device have the 314 * If the log device and data device have the
316 * same device number, the log is internal. 315 * same device number, the log is internal.
@@ -319,28 +318,32 @@ xfs_mount_validate_sb(
319 * a volume filesystem in a non-volume manner. 318 * a volume filesystem in a non-volume manner.
320 */ 319 */
321 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 320 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
322 xfs_fs_mount_cmn_err(flags, "bad magic number"); 321 if (loud)
322 xfs_warn(mp, "bad magic number");
323 return XFS_ERROR(EWRONGFS); 323 return XFS_ERROR(EWRONGFS);
324 } 324 }
325 325
326 if (!xfs_sb_good_version(sbp)) { 326 if (!xfs_sb_good_version(sbp)) {
327 xfs_fs_mount_cmn_err(flags, "bad version"); 327 if (loud)
328 xfs_warn(mp, "bad version");
328 return XFS_ERROR(EWRONGFS); 329 return XFS_ERROR(EWRONGFS);
329 } 330 }
330 331
331 if (unlikely( 332 if (unlikely(
332 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 333 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
333 xfs_fs_mount_cmn_err(flags, 334 if (loud)
334 "filesystem is marked as having an external log; " 335 xfs_warn(mp,
335 "specify logdev on the\nmount command line."); 336 "filesystem is marked as having an external log; "
337 "specify logdev on the mount command line.");
336 return XFS_ERROR(EINVAL); 338 return XFS_ERROR(EINVAL);
337 } 339 }
338 340
339 if (unlikely( 341 if (unlikely(
340 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 342 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
341 xfs_fs_mount_cmn_err(flags, 343 if (loud)
342 "filesystem is marked as having an internal log; " 344 xfs_warn(mp,
343 "do not specify logdev on\nthe mount command line."); 345 "filesystem is marked as having an internal log; "
346 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 347 return XFS_ERROR(EINVAL);
345 } 348 }
346 349
@@ -369,7 +372,8 @@ xfs_mount_validate_sb(
369 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 372 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
370 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 373 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
371 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { 374 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
372 xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); 375 if (loud)
376 xfs_warn(mp, "SB sanity check 1 failed");
373 return XFS_ERROR(EFSCORRUPTED); 377 return XFS_ERROR(EFSCORRUPTED);
374 } 378 }
375 379
@@ -382,7 +386,8 @@ xfs_mount_validate_sb(
382 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || 386 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
383 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * 387 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
384 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { 388 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
385 xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); 389 if (loud)
390 xfs_warn(mp, "SB sanity check 2 failed");
386 return XFS_ERROR(EFSCORRUPTED); 391 return XFS_ERROR(EFSCORRUPTED);
387 } 392 }
388 393
@@ -390,12 +395,12 @@ xfs_mount_validate_sb(
390 * Until this is fixed only page-sized or smaller data blocks work. 395 * Until this is fixed only page-sized or smaller data blocks work.
391 */ 396 */
392 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 397 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
393 xfs_fs_mount_cmn_err(flags, 398 if (loud) {
394 "file system with blocksize %d bytes", 399 xfs_warn(mp,
395 sbp->sb_blocksize); 400 "File system with blocksize %d bytes. "
396 xfs_fs_mount_cmn_err(flags, 401 "Only pagesize (%ld) or less will currently work.",
397 "only pagesize (%ld) or less will currently work.", 402 sbp->sb_blocksize, PAGE_SIZE);
398 PAGE_SIZE); 403 }
399 return XFS_ERROR(ENOSYS); 404 return XFS_ERROR(ENOSYS);
400 } 405 }
401 406
@@ -409,21 +414,23 @@ xfs_mount_validate_sb(
409 case 2048: 414 case 2048:
410 break; 415 break;
411 default: 416 default:
412 xfs_fs_mount_cmn_err(flags, 417 if (loud)
413 "inode size of %d bytes not supported", 418 xfs_warn(mp, "inode size of %d bytes not supported",
414 sbp->sb_inodesize); 419 sbp->sb_inodesize);
415 return XFS_ERROR(ENOSYS); 420 return XFS_ERROR(ENOSYS);
416 } 421 }
417 422
418 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 423 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
419 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 424 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
420 xfs_fs_mount_cmn_err(flags, 425 if (loud)
421 "file system too large to be mounted on this system."); 426 xfs_warn(mp,
427 "file system too large to be mounted on this system.");
422 return XFS_ERROR(EFBIG); 428 return XFS_ERROR(EFBIG);
423 } 429 }
424 430
425 if (unlikely(sbp->sb_inprogress)) { 431 if (unlikely(sbp->sb_inprogress)) {
426 xfs_fs_mount_cmn_err(flags, "file system busy"); 432 if (loud)
433 xfs_warn(mp, "file system busy");
427 return XFS_ERROR(EFSCORRUPTED); 434 return XFS_ERROR(EFSCORRUPTED);
428 } 435 }
429 436
@@ -431,8 +438,9 @@ xfs_mount_validate_sb(
431 * Version 1 directory format has never worked on Linux. 438 * Version 1 directory format has never worked on Linux.
432 */ 439 */
433 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 440 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
434 xfs_fs_mount_cmn_err(flags, 441 if (loud)
435 "file system using version 1 directory format"); 442 xfs_warn(mp,
443 "file system using version 1 directory format");
436 return XFS_ERROR(ENOSYS); 444 return XFS_ERROR(ENOSYS);
437 } 445 }
438 446
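[editor's note] xfs_mount_validate_sb() and xfs_readsb() now compute loud = !(flags & XFS_MFSI_QUIET) once and guard every diagnostic with it, replacing the old xfs_fs_mount_cmn_err() wrapper. The shape of each check, lifted from the hunk above, is simply:

	int loud = !(flags & XFS_MFSI_QUIET);

	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
		if (loud)
			xfs_warn(mp, "bad magic number");
		return XFS_ERROR(EWRONGFS);
	}

Presumably the quiet path exists so that speculative superblock probes (trying XFS among several candidate filesystems) do not fill the log when the device simply is not XFS.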
@@ -472,7 +480,7 @@ xfs_initialize_perag(
472 goto out_unwind; 480 goto out_unwind;
473 pag->pag_agno = index; 481 pag->pag_agno = index;
474 pag->pag_mount = mp; 482 pag->pag_mount = mp;
475 rwlock_init(&pag->pag_ici_lock); 483 spin_lock_init(&pag->pag_ici_lock);
476 mutex_init(&pag->pag_ici_reclaim_lock); 484 mutex_init(&pag->pag_ici_reclaim_lock);
477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 485 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
478 spin_lock_init(&pag->pag_buf_lock); 486 spin_lock_init(&pag->pag_buf_lock);
@@ -673,6 +681,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
673 unsigned int sector_size; 681 unsigned int sector_size;
674 xfs_buf_t *bp; 682 xfs_buf_t *bp;
675 int error; 683 int error;
684 int loud = !(flags & XFS_MFSI_QUIET);
676 685
677 ASSERT(mp->m_sb_bp == NULL); 686 ASSERT(mp->m_sb_bp == NULL);
678 ASSERT(mp->m_ddev_targp != NULL); 687 ASSERT(mp->m_ddev_targp != NULL);
@@ -688,7 +697,8 @@ reread:
688 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 697 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
689 XFS_SB_DADDR, sector_size, 0); 698 XFS_SB_DADDR, sector_size, 0);
690 if (!bp) { 699 if (!bp) {
691 xfs_fs_mount_cmn_err(flags, "SB buffer read failed"); 700 if (loud)
701 xfs_warn(mp, "SB buffer read failed");
692 return EIO; 702 return EIO;
693 } 703 }
694 704
@@ -699,7 +709,8 @@ reread:
699 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 709 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
700 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 710 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
701 if (error) { 711 if (error) {
702 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 712 if (loud)
713 xfs_warn(mp, "SB validate failed");
703 goto release_buf; 714 goto release_buf;
704 } 715 }
705 716
@@ -707,9 +718,9 @@ reread:
707 * We must be able to do sector-sized and sector-aligned IO. 718 * We must be able to do sector-sized and sector-aligned IO.
708 */ 719 */
709 if (sector_size > mp->m_sb.sb_sectsize) { 720 if (sector_size > mp->m_sb.sb_sectsize) {
710 xfs_fs_mount_cmn_err(flags, 721 if (loud)
711 "device supports only %u byte sectors (not %u)", 722 xfs_warn(mp, "device supports %u byte sectors (not %u)",
712 sector_size, mp->m_sb.sb_sectsize); 723 sector_size, mp->m_sb.sb_sectsize);
713 error = ENOSYS; 724 error = ENOSYS;
714 goto release_buf; 725 goto release_buf;
715 } 726 }
@@ -853,8 +864,7 @@ xfs_update_alignment(xfs_mount_t *mp)
853 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 864 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
854 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 865 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
855 if (mp->m_flags & XFS_MOUNT_RETERR) { 866 if (mp->m_flags & XFS_MOUNT_RETERR) {
856 cmn_err(CE_WARN, 867 xfs_warn(mp, "alignment check 1 failed");
857 "XFS: alignment check 1 failed");
858 return XFS_ERROR(EINVAL); 868 return XFS_ERROR(EINVAL);
859 } 869 }
860 mp->m_dalign = mp->m_swidth = 0; 870 mp->m_dalign = mp->m_swidth = 0;
@@ -867,8 +877,9 @@ xfs_update_alignment(xfs_mount_t *mp)
867 if (mp->m_flags & XFS_MOUNT_RETERR) { 877 if (mp->m_flags & XFS_MOUNT_RETERR) {
868 return XFS_ERROR(EINVAL); 878 return XFS_ERROR(EINVAL);
869 } 879 }
870 xfs_fs_cmn_err(CE_WARN, mp, 880 xfs_warn(mp,
871"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", 881 "stripe alignment turned off: sunit(%d)/swidth(%d) "
882 "incompatible with agsize(%d)",
872 mp->m_dalign, mp->m_swidth, 883 mp->m_dalign, mp->m_swidth,
873 sbp->sb_agblocks); 884 sbp->sb_agblocks);
874 885
@@ -878,9 +889,9 @@ xfs_update_alignment(xfs_mount_t *mp)
878 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 889 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
879 } else { 890 } else {
880 if (mp->m_flags & XFS_MOUNT_RETERR) { 891 if (mp->m_flags & XFS_MOUNT_RETERR) {
881 xfs_fs_cmn_err(CE_WARN, mp, 892 xfs_warn(mp,
882"stripe alignment turned off: sunit(%d) less than bsize(%d)", 893 "stripe alignment turned off: sunit(%d) less than bsize(%d)",
883 mp->m_dalign, 894 mp->m_dalign,
884 mp->m_blockmask +1); 895 mp->m_blockmask +1);
885 return XFS_ERROR(EINVAL); 896 return XFS_ERROR(EINVAL);
886 } 897 }
@@ -975,6 +986,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
975} 986}
976 987
977/* 988/*
989 * precalculate the low space thresholds for dynamic speculative preallocation.
990 */
991void
992xfs_set_low_space_thresholds(
993 struct xfs_mount *mp)
994{
995 int i;
996
997 for (i = 0; i < XFS_LOWSP_MAX; i++) {
998 __uint64_t space = mp->m_sb.sb_dblocks;
999
1000 do_div(space, 100);
1001 mp->m_low_space[i] = space * (i + 1);
1002 }
1003}
1004
1005
1006/*
978 * Set whether we're using inode alignment. 1007 * Set whether we're using inode alignment.
979 */ 1008 */
980STATIC void 1009STATIC void
@@ -1008,14 +1037,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1008 1037
1009 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1038 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1010 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1039 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1011 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected"); 1040 xfs_warn(mp, "filesystem size mismatch detected");
1012 return XFS_ERROR(EFBIG); 1041 return XFS_ERROR(EFBIG);
1013 } 1042 }
1014 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 1043 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1015 d - XFS_FSS_TO_BB(mp, 1), 1044 d - XFS_FSS_TO_BB(mp, 1),
1016 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); 1045 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1017 if (!bp) { 1046 if (!bp) {
1018 cmn_err(CE_WARN, "XFS: last sector read failed"); 1047 xfs_warn(mp, "last sector read failed");
1019 return EIO; 1048 return EIO;
1020 } 1049 }
1021 xfs_buf_relse(bp); 1050 xfs_buf_relse(bp);
@@ -1023,14 +1052,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1023 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1052 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1024 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1053 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1025 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1054 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1026 cmn_err(CE_WARN, "XFS: log size mismatch detected"); 1055 xfs_warn(mp, "log size mismatch detected");
1027 return XFS_ERROR(EFBIG); 1056 return XFS_ERROR(EFBIG);
1028 } 1057 }
1029 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, 1058 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1030 d - XFS_FSB_TO_BB(mp, 1), 1059 d - XFS_FSB_TO_BB(mp, 1),
1031 XFS_FSB_TO_B(mp, 1), 0); 1060 XFS_FSB_TO_B(mp, 1), 0);
1032 if (!bp) { 1061 if (!bp) {
1033 cmn_err(CE_WARN, "XFS: log device read failed"); 1062 xfs_warn(mp, "log device read failed");
1034 return EIO; 1063 return EIO;
1035 } 1064 }
1036 xfs_buf_relse(bp); 1065 xfs_buf_relse(bp);
@@ -1068,7 +1097,7 @@ xfs_mount_reset_sbqflags(
1068 return 0; 1097 return 0;
1069 1098
1070#ifdef QUOTADEBUG 1099#ifdef QUOTADEBUG
1071 xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); 1100 xfs_notice(mp, "Writing superblock quota changes");
1072#endif 1101#endif
1073 1102
1074 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1103 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
@@ -1076,8 +1105,7 @@ xfs_mount_reset_sbqflags(
1076 XFS_DEFAULT_LOG_COUNT); 1105 XFS_DEFAULT_LOG_COUNT);
1077 if (error) { 1106 if (error) {
1078 xfs_trans_cancel(tp, 0); 1107 xfs_trans_cancel(tp, 0);
1079 xfs_fs_cmn_err(CE_ALERT, mp, 1108 xfs_alert(mp, "%s: Superblock update failed!", __func__);
1080 "xfs_mount_reset_sbqflags: Superblock update failed!");
1081 return error; 1109 return error;
1082 } 1110 }
1083 1111
@@ -1143,8 +1171,7 @@ xfs_mountfs(
1143 * transaction subsystem is online. 1171 * transaction subsystem is online.
1144 */ 1172 */
1145 if (xfs_sb_has_mismatched_features2(sbp)) { 1173 if (xfs_sb_has_mismatched_features2(sbp)) {
1146 cmn_err(CE_WARN, 1174 xfs_warn(mp, "correcting sb_features alignment problem");
1147 "XFS: correcting sb_features alignment problem");
1148 sbp->sb_features2 |= sbp->sb_bad_features2; 1175 sbp->sb_features2 |= sbp->sb_bad_features2;
1149 sbp->sb_bad_features2 = sbp->sb_features2; 1176 sbp->sb_bad_features2 = sbp->sb_features2;
1150 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; 1177 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
@@ -1196,6 +1223,9 @@ xfs_mountfs(
1196 */ 1223 */
1197 xfs_set_rw_sizes(mp); 1224 xfs_set_rw_sizes(mp);
1198 1225
1226 /* set the low space thresholds for dynamic preallocation */
1227 xfs_set_low_space_thresholds(mp);
1228
1199 /* 1229 /*
1200 * Set the inode cluster size. 1230 * Set the inode cluster size.
1201 * This may still be overridden by the file system 1231 * This may still be overridden by the file system
@@ -1220,7 +1250,7 @@ xfs_mountfs(
1220 */ 1250 */
1221 error = xfs_rtmount_init(mp); 1251 error = xfs_rtmount_init(mp);
1222 if (error) { 1252 if (error) {
1223 cmn_err(CE_WARN, "XFS: RT mount failed"); 1253 xfs_warn(mp, "RT mount failed");
1224 goto out_remove_uuid; 1254 goto out_remove_uuid;
1225 } 1255 }
1226 1256
@@ -1251,12 +1281,12 @@ xfs_mountfs(
1251 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); 1281 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1252 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1282 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1253 if (error) { 1283 if (error) {
1254 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1284 xfs_warn(mp, "Failed per-ag init: %d", error);
1255 goto out_remove_uuid; 1285 goto out_remove_uuid;
1256 } 1286 }
1257 1287
1258 if (!sbp->sb_logblocks) { 1288 if (!sbp->sb_logblocks) {
1259 cmn_err(CE_WARN, "XFS: no log defined"); 1289 xfs_warn(mp, "no log defined");
1260 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); 1290 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1261 error = XFS_ERROR(EFSCORRUPTED); 1291 error = XFS_ERROR(EFSCORRUPTED);
1262 goto out_free_perag; 1292 goto out_free_perag;
@@ -1269,7 +1299,7 @@ xfs_mountfs(
1269 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1299 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1270 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1300 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1271 if (error) { 1301 if (error) {
1272 cmn_err(CE_WARN, "XFS: log mount failed"); 1302 xfs_warn(mp, "log mount failed");
1273 goto out_free_perag; 1303 goto out_free_perag;
1274 } 1304 }
1275 1305
@@ -1306,16 +1336,14 @@ xfs_mountfs(
1306 */ 1336 */
1307 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); 1337 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1308 if (error) { 1338 if (error) {
1309 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1339 xfs_warn(mp, "failed to read root inode");
1310 goto out_log_dealloc; 1340 goto out_log_dealloc;
1311 } 1341 }
1312 1342
1313 ASSERT(rip != NULL); 1343 ASSERT(rip != NULL);
1314 1344
1315 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { 1345 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1316 cmn_err(CE_WARN, "XFS: corrupted root inode"); 1346 xfs_warn(mp, "corrupted root inode %llu: not a directory",
1317 cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
1318 XFS_BUFTARG_NAME(mp->m_ddev_targp),
1319 (unsigned long long)rip->i_ino); 1347 (unsigned long long)rip->i_ino);
1320 xfs_iunlock(rip, XFS_ILOCK_EXCL); 1348 xfs_iunlock(rip, XFS_ILOCK_EXCL);
1321 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1349 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
@@ -1335,7 +1363,7 @@ xfs_mountfs(
1335 /* 1363 /*
1336 * Free up the root inode. 1364 * Free up the root inode.
1337 */ 1365 */
1338 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1366 xfs_warn(mp, "failed to read RT inodes");
1339 goto out_rele_rip; 1367 goto out_rele_rip;
1340 } 1368 }
1341 1369
@@ -1347,7 +1375,7 @@ xfs_mountfs(
1347 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 1375 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1348 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1376 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1349 if (error) { 1377 if (error) {
1350 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1378 xfs_warn(mp, "failed to write sb changes");
1351 goto out_rtunmount; 1379 goto out_rtunmount;
1352 } 1380 }
1353 } 1381 }
@@ -1368,10 +1396,7 @@ xfs_mountfs(
1368 * quotachecked license. 1396 * quotachecked license.
1369 */ 1397 */
1370 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { 1398 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1371 cmn_err(CE_NOTE, 1399 xfs_notice(mp, "resetting quota flags");
1372 "XFS: resetting qflags for filesystem %s",
1373 mp->m_fsname);
1374
1375 error = xfs_mount_reset_sbqflags(mp); 1400 error = xfs_mount_reset_sbqflags(mp);
1376 if (error) 1401 if (error)
1377 return error; 1402 return error;
@@ -1385,7 +1410,7 @@ xfs_mountfs(
1385 */ 1410 */
1386 error = xfs_log_mount_finish(mp); 1411 error = xfs_log_mount_finish(mp);
1387 if (error) { 1412 if (error) {
1388 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1413 xfs_warn(mp, "log mount finish failed");
1389 goto out_rtunmount; 1414 goto out_rtunmount;
1390 } 1415 }
1391 1416
@@ -1414,8 +1439,8 @@ xfs_mountfs(
1414 resblks = xfs_default_resblks(mp); 1439 resblks = xfs_default_resblks(mp);
1415 error = xfs_reserve_blocks(mp, &resblks, NULL); 1440 error = xfs_reserve_blocks(mp, &resblks, NULL);
1416 if (error) 1441 if (error)
1417 cmn_err(CE_WARN, "XFS: Unable to allocate reserve " 1442 xfs_warn(mp,
1418 "blocks. Continuing without a reserve pool."); 1443 "Unable to allocate reserve blocks. Continuing without reserve pool.");
1419 } 1444 }
1420 1445
1421 return 0; 1446 return 0;
@@ -1504,12 +1529,12 @@ xfs_unmountfs(
1504 resblks = 0; 1529 resblks = 0;
1505 error = xfs_reserve_blocks(mp, &resblks, NULL); 1530 error = xfs_reserve_blocks(mp, &resblks, NULL);
1506 if (error) 1531 if (error)
1507 cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " 1532 xfs_warn(mp, "Unable to free reserved block pool. "
1508 "Freespace may not be correct on next mount."); 1533 "Freespace may not be correct on next mount.");
1509 1534
1510 error = xfs_log_sbcount(mp, 1); 1535 error = xfs_log_sbcount(mp, 1);
1511 if (error) 1536 if (error)
1512 cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " 1537 xfs_warn(mp, "Unable to update superblock counters. "
1513 "Freespace may not be correct on next mount."); 1538 "Freespace may not be correct on next mount.");
1514 xfs_unmountfs_writesb(mp); 1539 xfs_unmountfs_writesb(mp);
1515 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1540 xfs_unmountfs_wait(mp); /* wait for async bufs */
@@ -1992,10 +2017,8 @@ xfs_dev_is_read_only(
1992 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 2017 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
1993 xfs_readonly_buftarg(mp->m_logdev_targp) || 2018 xfs_readonly_buftarg(mp->m_logdev_targp) ||
1994 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 2019 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
1995 cmn_err(CE_NOTE, 2020 xfs_notice(mp, "%s required on read-only device.", message);
1996 "XFS: %s required on read-only device.", message); 2021 xfs_notice(mp, "write access unavailable, cannot proceed.");
1997 cmn_err(CE_NOTE,
1998 "XFS: write access unavailable, cannot proceed.");
1999 return EROFS; 2022 return EROFS;
2000 } 2023 }
2001 return 0; 2024 return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b4980740..19af0ab0d0c6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
103 xfs_mod_incore_sb(mp, field, delta, rsvd) 103 xfs_mod_incore_sb(mp, field, delta, rsvd)
104#endif 104#endif
105 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
106typedef struct xfs_mount { 116typedef struct xfs_mount {
107 struct super_block *m_super; 117 struct super_block *m_super;
108 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -193,15 +203,14 @@ typedef struct xfs_mount {
193 struct mutex m_icsb_mutex; /* balancer sync lock */ 203 struct mutex m_icsb_mutex; /* balancer sync lock */
194#endif 204#endif
195 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 205 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
196 struct task_struct *m_sync_task; /* generalised sync thread */ 206 struct delayed_work m_sync_work; /* background sync work */
197 xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ 207 struct delayed_work m_reclaim_work; /* background inode reclaim */
198 struct list_head m_sync_list; /* sync thread work item list */ 208 struct work_struct m_flush_work; /* background inode flush */
199 spinlock_t m_sync_lock; /* work item list lock */
200 int m_sync_seq; /* sync thread generation no. */
201 wait_queue_head_t m_wait_single_sync_task;
202 __int64_t m_update_flags; /* sb flags we need to update 209 __int64_t m_update_flags; /* sb flags we need to update
203 on the next remount,rw */ 210 on the next remount,rw */
204 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
212 int64_t m_low_space[XFS_LOWSP_MAX];
213 /* low free space thresholds */
205} xfs_mount_t; 214} xfs_mount_t;
206 215
207/* 216/*
@@ -379,6 +388,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
379 388
380extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 389extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
381 390
391extern void xfs_set_low_space_thresholds(struct xfs_mount *);
392
382#endif /* __KERNEL__ */ 393#endif /* __KERNEL__ */
383 394
384extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 395extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
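[editor's note] The new m_low_space[] array holds the free-space levels at which dynamic speculative preallocation backs off, one entry per percentage from 1% to 5% of sb_dblocks; xfs_set_low_space_thresholds() in the xfs_mount.c hunk earlier fills it with do_div(). A worked example, assuming a hypothetical filesystem of 1,000,000 data blocks:

	/* xfs_set_low_space_thresholds(), in effect:
	 *	space = sb_dblocks / 100;		 one percent of the data blocks
	 *	m_low_space[i] = space * (i + 1);	 i = 0..4  ->  1% .. 5%
	 *
	 * with sb_dblocks = 1,000,000:
	 *	m_low_space[XFS_LOWSP_1_PCNT] = 10,000 blocks
	 *	m_low_space[XFS_LOWSP_2_PCNT] = 20,000 blocks
	 *	m_low_space[XFS_LOWSP_3_PCNT] = 30,000 blocks
	 *	m_low_space[XFS_LOWSP_4_PCNT] = 40,000 blocks
	 *	m_low_space[XFS_LOWSP_5_PCNT] = 50,000 blocks
	 */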
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2b..4aff56395732 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
309 if (!xfs_mru_elem_zone) 309 if (!xfs_mru_elem_zone)
310 goto out; 310 goto out;
311 311
312 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); 312 xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
313 if (!xfs_mru_reap_wq) 313 if (!xfs_mru_reap_wq)
314 goto out_destroy_mru_elem_zone; 314 goto out_destroy_mru_elem_zone;
315 315
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
408 spin_lock(&mru->lock); 408 spin_lock(&mru->lock);
409 if (mru->queued) { 409 if (mru->queued) {
410 spin_unlock(&mru->lock); 410 spin_unlock(&mru->lock);
411 cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); 411 cancel_delayed_work_sync(&mru->work);
412 spin_lock(&mru->lock); 412 spin_lock(&mru->lock);
413 } 413 }
414 414
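[editor's note] Two workqueue API updates in xfs_mru_cache.c: create_singlethread_workqueue() becomes alloc_workqueue() with WQ_MEM_RECLAIM (presumably so the reaper can still make progress under memory pressure via a rescuer thread), and the long-deprecated cancel_rearming_delayed_workqueue() becomes cancel_delayed_work_sync(). A stand-alone sketch of the same pattern with illustrative names:

	#include <linux/workqueue.h>

	static struct workqueue_struct	*my_wq;
	static struct delayed_work	my_work;

	static void my_work_fn(struct work_struct *work)
	{
		/* reap expired entries, possibly queueing ourselves again */
	}

	static int my_setup(void)
	{
		/* max_active = 1 keeps the old single-threaded behaviour;
		 * WQ_MEM_RECLAIM gives the queue a rescuer thread */
		my_wq = alloc_workqueue("my_cache", WQ_MEM_RECLAIM, 1);
		if (!my_wq)
			return -ENOMEM;
		INIT_DELAYED_WORK(&my_work, my_work_fn);
		queue_delayed_work(my_wq, &my_work, HZ);
		return 0;
	}

	static void my_teardown(void)
	{
		/* waits for a running instance and prevents further rearming */
		cancel_delayed_work_sync(&my_work);
		destroy_workqueue(my_wq);
	}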
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 9bb6eda4cd21..a595f29567fe 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -382,7 +382,8 @@ static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
383 f | XFS_QMOPT_RES_REGBLKS) 383 f | XFS_QMOPT_RES_REGBLKS)
384 384
385extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); 385extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
386 xfs_dqid_t, uint, uint, char *);
386extern int xfs_mount_reset_sbqflags(struct xfs_mount *); 387extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
387 388
388#endif /* __KERNEL__ */ 389#endif /* __KERNEL__ */
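[editor's note] xfs_qm_dqcheck() gains a struct xfs_mount * as its first argument so its diagnostics can go through the mount-aware message helpers, and callers now capture the return value explicitly instead of testing the call inside an if condition. The converted call site in xlog_recover_dquot_pass2() above reads:

	error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
			       "xlog_recover_dquot_pass2 (log copy)");
	if (error)
		return XFS_ERROR(EIO);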
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a6..77a59891734e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
297 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
298 */ 298 */
299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
300 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
300 301
301 /* 302 /*
302 * Adjust the link count on src_dp. This is necessary when 303 * Adjust the link count on src_dp. This is necessary when
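[editor's note] The one-line xfs_rename() change: xfs_trans_ichgtime() only updates the in-core timestamp, so unless the inode core is also logged in the same transaction the ctime bump on the source inode can be lost. The pair now reads:

	/* bump ctime on the source inode ... */
	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
	/* ... and log the inode core so the timestamp change is persistent */
	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);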
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 12a191385310..8f76fdff4f46 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -76,7 +76,7 @@ xfs_growfs_rt_alloc(
76 xfs_mount_t *mp, /* file system mount point */ 76 xfs_mount_t *mp, /* file system mount point */
77 xfs_extlen_t oblocks, /* old count of blocks */ 77 xfs_extlen_t oblocks, /* old count of blocks */
78 xfs_extlen_t nblocks, /* new count of blocks */ 78 xfs_extlen_t nblocks, /* new count of blocks */
79 xfs_ino_t ino) /* inode number (bitmap/summary) */ 79 xfs_inode_t *ip) /* inode (bitmap/summary) */
80{ 80{
81 xfs_fileoff_t bno; /* block number in file */ 81 xfs_fileoff_t bno; /* block number in file */
82 xfs_buf_t *bp; /* temporary buffer for zeroing */ 82 xfs_buf_t *bp; /* temporary buffer for zeroing */
@@ -86,7 +86,6 @@ xfs_growfs_rt_alloc(
86 xfs_fsblock_t firstblock; /* first block allocated in xaction */ 86 xfs_fsblock_t firstblock; /* first block allocated in xaction */
87 xfs_bmap_free_t flist; /* list of freed blocks */ 87 xfs_bmap_free_t flist; /* list of freed blocks */
88 xfs_fsblock_t fsbno; /* filesystem block for bno */ 88 xfs_fsblock_t fsbno; /* filesystem block for bno */
89 xfs_inode_t *ip; /* pointer to incore inode */
90 xfs_bmbt_irec_t map; /* block map output */ 89 xfs_bmbt_irec_t map; /* block map output */
91 int nmap; /* number of block maps */ 90 int nmap; /* number of block maps */
92 int resblks; /* space reservation */ 91 int resblks; /* space reservation */
@@ -112,9 +111,9 @@ xfs_growfs_rt_alloc(
112 /* 111 /*
113 * Lock the inode. 112 * Lock the inode.
114 */ 113 */
115 if ((error = xfs_trans_iget(mp, tp, ino, 0, 114 xfs_ilock(ip, XFS_ILOCK_EXCL);
116 XFS_ILOCK_EXCL, &ip))) 115 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
117 goto error_cancel; 116
118 xfs_bmap_init(&flist, &firstblock); 117 xfs_bmap_init(&flist, &firstblock);
119 /* 118 /*
120 * Allocate blocks to the bitmap file. 119 * Allocate blocks to the bitmap file.
@@ -155,9 +154,8 @@ xfs_growfs_rt_alloc(
155 /* 154 /*
156 * Lock the bitmap inode. 155 * Lock the bitmap inode.
157 */ 156 */
158 if ((error = xfs_trans_iget(mp, tp, ino, 0, 157 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 XFS_ILOCK_EXCL, &ip))) 158 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
160 goto error_cancel;
161 /* 159 /*
162 * Get a buffer for the block. 160 * Get a buffer for the block.
163 */ 161 */
@@ -1854,7 +1852,6 @@ xfs_growfs_rt(
1854 xfs_rtblock_t bmbno; /* bitmap block number */ 1852 xfs_rtblock_t bmbno; /* bitmap block number */
1855 xfs_buf_t *bp; /* temporary buffer */ 1853 xfs_buf_t *bp; /* temporary buffer */
1856 int error; /* error return value */ 1854 int error; /* error return value */
1857 xfs_inode_t *ip; /* bitmap inode, used as lock */
1858 xfs_mount_t *nmp; /* new (fake) mount structure */ 1855 xfs_mount_t *nmp; /* new (fake) mount structure */
1859 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ 1856 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
1860 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ 1857 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
@@ -1918,11 +1915,11 @@ xfs_growfs_rt(
1918 /* 1915 /*
1919 * Allocate space to the bitmap and summary files, as necessary. 1916 * Allocate space to the bitmap and summary files, as necessary.
1920 */ 1917 */
1921 if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, 1918 error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
1922 mp->m_sb.sb_rbmino))) 1919 if (error)
1923 return error; 1920 return error;
1924 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, 1921 error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
1925 mp->m_sb.sb_rsumino))) 1922 if (error)
1926 return error; 1923 return error;
1927 /* 1924 /*
1928 * Allocate a new (fake) mount/sb. 1925 * Allocate a new (fake) mount/sb.
@@ -1972,10 +1969,8 @@ xfs_growfs_rt(
1972 /* 1969 /*
1973 * Lock out other callers by grabbing the bitmap inode lock. 1970 * Lock out other callers by grabbing the bitmap inode lock.
1974 */ 1971 */
1975 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1972 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
1976 XFS_ILOCK_EXCL, &ip))) 1973 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
1977 goto error_cancel;
1978 ASSERT(ip == mp->m_rbmip);
1979 /* 1974 /*
1980 * Update the bitmap inode's size. 1975 * Update the bitmap inode's size.
1981 */ 1976 */
@@ -1986,10 +1981,8 @@ xfs_growfs_rt(
1986 /* 1981 /*
1987 * Get the summary inode into the transaction. 1982 * Get the summary inode into the transaction.
1988 */ 1983 */
1989 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1984 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
1990 XFS_ILOCK_EXCL, &ip))) 1985 xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
1991 goto error_cancel;
1992 ASSERT(ip == mp->m_rsumip);
1993 /* 1986 /*
1994 * Update the summary inode's size. 1987 * Update the summary inode's size.
1995 */ 1988 */
@@ -2075,15 +2068,15 @@ xfs_rtallocate_extent(
2075 xfs_extlen_t prod, /* extent product factor */ 2068 xfs_extlen_t prod, /* extent product factor */
2076 xfs_rtblock_t *rtblock) /* out: start block allocated */ 2069 xfs_rtblock_t *rtblock) /* out: start block allocated */
2077{ 2070{
2071 xfs_mount_t *mp = tp->t_mountp;
2078 int error; /* error value */ 2072 int error; /* error value */
2079 xfs_inode_t *ip; /* inode for bitmap file */
2080 xfs_mount_t *mp; /* file system mount structure */
2081 xfs_rtblock_t r; /* result allocated block */ 2073 xfs_rtblock_t r; /* result allocated block */
2082 xfs_fsblock_t sb; /* summary file block number */ 2074 xfs_fsblock_t sb; /* summary file block number */
2083 xfs_buf_t *sumbp; /* summary file block buffer */ 2075 xfs_buf_t *sumbp; /* summary file block buffer */
2084 2076
2077 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2085 ASSERT(minlen > 0 && minlen <= maxlen); 2078 ASSERT(minlen > 0 && minlen <= maxlen);
2086 mp = tp->t_mountp; 2079
2087 /* 2080 /*
2088 * If prod is set then figure out what to do to minlen and maxlen. 2081 * If prod is set then figure out what to do to minlen and maxlen.
2089 */ 2082 */
@@ -2099,12 +2092,7 @@ xfs_rtallocate_extent(
2099 return 0; 2092 return 0;
2100 } 2093 }
2101 } 2094 }
2102 /* 2095
2103 * Lock out other callers by grabbing the bitmap inode lock.
2104 */
2105 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2106 XFS_ILOCK_EXCL, &ip)))
2107 return error;
2108 sumbp = NULL; 2096 sumbp = NULL;
2109 /* 2097 /*
2110 * Allocate by size, or near another block, or exactly at some block. 2098 * Allocate by size, or near another block, or exactly at some block.
@@ -2123,11 +2111,12 @@ xfs_rtallocate_extent(
2123 len, &sumbp, &sb, prod, &r); 2111 len, &sumbp, &sb, prod, &r);
2124 break; 2112 break;
2125 default: 2113 default:
2114 error = EIO;
2126 ASSERT(0); 2115 ASSERT(0);
2127 } 2116 }
2128 if (error) { 2117 if (error)
2129 return error; 2118 return error;
2130 } 2119
2131 /* 2120 /*
2132 * If it worked, update the superblock. 2121 * If it worked, update the superblock.
2133 */ 2122 */
@@ -2155,7 +2144,6 @@ xfs_rtfree_extent(
2155 xfs_extlen_t len) /* length of extent freed */ 2144 xfs_extlen_t len) /* length of extent freed */
2156{ 2145{
2157 int error; /* error value */ 2146 int error; /* error value */
2158 xfs_inode_t *ip; /* bitmap file inode */
2159 xfs_mount_t *mp; /* file system mount structure */ 2147 xfs_mount_t *mp; /* file system mount structure */
2160 xfs_fsblock_t sb; /* summary file block number */ 2148 xfs_fsblock_t sb; /* summary file block number */
2161 xfs_buf_t *sumbp; /* summary file block buffer */ 2149 xfs_buf_t *sumbp; /* summary file block buffer */
@@ -2164,9 +2152,9 @@ xfs_rtfree_extent(
2164 /* 2152 /*
2165 * Synchronize by locking the bitmap inode. 2153 * Synchronize by locking the bitmap inode.
2166 */ 2154 */
2167 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2155 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2168 XFS_ILOCK_EXCL, &ip))) 2156 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2169 return error; 2157
2170#if defined(__KERNEL__) && defined(DEBUG) 2158#if defined(__KERNEL__) && defined(DEBUG)
2171 /* 2159 /*
2172 * Check to see that this whole range is currently allocated. 2160 * Check to see that this whole range is currently allocated.
@@ -2199,10 +2187,10 @@ xfs_rtfree_extent(
2199 */ 2187 */
2200 if (tp->t_frextents_delta + mp->m_sb.sb_frextents == 2188 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
2201 mp->m_sb.sb_rextents) { 2189 mp->m_sb.sb_rextents) {
2202 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) 2190 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
2203 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 2191 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2204 *(__uint64_t *)&ip->i_d.di_atime = 0; 2192 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
2205 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2193 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2206 } 2194 }
2207 return 0; 2195 return 0;
2208} 2196}
@@ -2222,8 +2210,8 @@ xfs_rtmount_init(
2222 if (sbp->sb_rblocks == 0) 2210 if (sbp->sb_rblocks == 0)
2223 return 0; 2211 return 0;
2224 if (mp->m_rtdev_targp == NULL) { 2212 if (mp->m_rtdev_targp == NULL) {
2225 cmn_err(CE_WARN, 2213 xfs_warn(mp,
2226 "XFS: This filesystem has a realtime volume, use rtdev=device option"); 2214 "Filesystem has a realtime volume, use rtdev=device option");
2227 return XFS_ERROR(ENODEV); 2215 return XFS_ERROR(ENODEV);
2228 } 2216 }
2229 mp->m_rsumlevels = sbp->sb_rextslog + 1; 2217 mp->m_rsumlevels = sbp->sb_rextslog + 1;
@@ -2237,7 +2225,7 @@ xfs_rtmount_init(
2237 */ 2225 */
2238 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 2226 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
2239 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { 2227 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
2240 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2228 xfs_warn(mp, "realtime mount -- %llu != %llu",
2241 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2229 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2230 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2231 return XFS_ERROR(EFBIG);
@@ -2246,7 +2234,7 @@ xfs_rtmount_init(
2246 d - XFS_FSB_TO_BB(mp, 1), 2234 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_B(mp, 1), 0); 2235 XFS_FSB_TO_B(mp, 1), 0);
2248 if (!bp) { 2236 if (!bp) {
2249 cmn_err(CE_WARN, "XFS: realtime device size check failed"); 2237 xfs_warn(mp, "realtime device size check failed");
2250 return EIO; 2238 return EIO;
2251 } 2239 }
2252 xfs_buf_relse(bp); 2240 xfs_buf_relse(bp);
@@ -2306,20 +2294,16 @@ xfs_rtpick_extent(
2306 xfs_rtblock_t *pick) /* result rt extent */ 2294 xfs_rtblock_t *pick) /* result rt extent */
2307{ 2295{
2308 xfs_rtblock_t b; /* result block */ 2296 xfs_rtblock_t b; /* result block */
2309 int error; /* error return value */
2310 xfs_inode_t *ip; /* bitmap incore inode */
2311 int log2; /* log of sequence number */ 2297 int log2; /* log of sequence number */
2312 __uint64_t resid; /* residual after log removed */ 2298 __uint64_t resid; /* residual after log removed */
2313 __uint64_t seq; /* sequence number of file creation */ 2299 __uint64_t seq; /* sequence number of file creation */
2314 __uint64_t *seqp; /* pointer to seqno in inode */ 2300 __uint64_t *seqp; /* pointer to seqno in inode */
2315 2301
2316 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2302 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2317 XFS_ILOCK_EXCL, &ip))) 2303
2318 return error; 2304 seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
2319 ASSERT(ip == mp->m_rbmip); 2305 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2320 seqp = (__uint64_t *)&ip->i_d.di_atime; 2306 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2321 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2322 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2323 *seqp = 0; 2307 *seqp = 0;
2324 } 2308 }
2325 seq = *seqp; 2309 seq = *seqp;
@@ -2335,7 +2319,7 @@ xfs_rtpick_extent(
2335 b = mp->m_sb.sb_rextents - len; 2319 b = mp->m_sb.sb_rextents - len;
2336 } 2320 }
2337 *seqp = seq + 1; 2321 *seqp = seq + 1;
2338 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2322 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2339 *pick = b; 2323 *pick = b;
2340 return 0; 2324 return 0;
2341} 2325}
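
The xfs_rtalloc.c hunks above stop looking up the realtime bitmap inode through xfs_trans_iget() on every call and instead lock the inode already cached at mount time in mp->m_rbmip, joining it to the transaction by reference. A minimal sketch of the resulting pattern, using only names that appear in the hunks above (an illustration of the idiom, not additional kernel code):

	/* lock the cached realtime bitmap inode and attach it to the transaction */
	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);

	/* ... bitmap/summary updates ... */
	xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);

Because the inode now comes from the mount structure, the local xfs_inode_t *ip declarations and the error handling for a failed lookup can be dropped, which is what most of the removed lines above are.
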
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index ff614c29b441..09e1f4f35e97 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -154,7 +154,7 @@ xfs_rtmount_init(
154 if (mp->m_sb.sb_rblocks == 0) 154 if (mp->m_sb.sb_rblocks == 0)
155 return 0; 155 return 0;
156 156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); 157 xfs_warn(mp, "Not built with CONFIG_XFS_RT");
158 return ENOSYS; 158 return ENOSYS;
159} 159}
160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 56861d5daaef..d6d6fdfe9422 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -49,9 +49,9 @@ xfs_do_force_shutdown(
49 logerror = flags & SHUTDOWN_LOG_IO_ERROR; 49 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
50 50
51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
52 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " 52 xfs_notice(mp,
53 "line %d of file %s. Return address = 0x%p", 53 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
54 mp->m_fsname, flags, lnnum, fname, __return_address); 54 __func__, flags, lnnum, fname, __return_address);
55 } 55 }
56 /* 56 /*
57 * No need to duplicate efforts. 57 * No need to duplicate efforts.
@@ -69,30 +69,25 @@ xfs_do_force_shutdown(
69 return; 69 return;
70 70
71 if (flags & SHUTDOWN_CORRUPT_INCORE) { 71 if (flags & SHUTDOWN_CORRUPT_INCORE) {
72 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, 72 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
73 "Corruption of in-memory data detected. Shutting down filesystem: %s", 73 "Corruption of in-memory data detected. Shutting down filesystem");
74 mp->m_fsname); 74 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
75 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
76 xfs_stack_trace(); 75 xfs_stack_trace();
77 }
78 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 76 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
79 if (logerror) { 77 if (logerror) {
80 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, 78 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
81 "Log I/O Error Detected. Shutting down filesystem: %s", 79 "Log I/O Error Detected. Shutting down filesystem");
82 mp->m_fsname);
83 } else if (flags & SHUTDOWN_DEVICE_REQ) { 80 } else if (flags & SHUTDOWN_DEVICE_REQ) {
84 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 81 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
85 "All device paths lost. Shutting down filesystem: %s", 82 "All device paths lost. Shutting down filesystem");
86 mp->m_fsname);
87 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { 83 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
88 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 84 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
89 "I/O Error Detected. Shutting down filesystem: %s", 85 "I/O Error Detected. Shutting down filesystem");
90 mp->m_fsname);
91 } 86 }
92 } 87 }
93 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 88 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
94 cmn_err(CE_ALERT, "Please umount the filesystem, " 89 xfs_alert(mp,
95 "and rectify the problem(s)"); 90 "Please umount the filesystem and rectify the problem(s)");
96 } 91 }
97} 92}
98 93
@@ -106,10 +101,9 @@ xfs_ioerror_alert(
106 xfs_buf_t *bp, 101 xfs_buf_t *bp,
107 xfs_daddr_t blkno) 102 xfs_daddr_t blkno)
108{ 103{
109 cmn_err(CE_ALERT, 104 xfs_alert(mp,
110 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 105 "I/O error occurred: meta-data dev %s block 0x%llx"
111 " (\"%s\") error %d buf count %zd", 106 " (\"%s\") error %d buf count %zd",
112 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
113 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 107 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
114 (__uint64_t)blkno, func, 108 (__uint64_t)blkno, func,
115 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); 109 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
@@ -173,17 +167,9 @@ xfs_extlen_t
173xfs_get_extsz_hint( 167xfs_get_extsz_hint(
174 struct xfs_inode *ip) 168 struct xfs_inode *ip)
175{ 169{
176 xfs_extlen_t extsz; 170 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
177 171 return ip->i_d.di_extsize;
178 if (unlikely(XFS_IS_REALTIME_INODE(ip))) { 172 if (XFS_IS_REALTIME_INODE(ip))
179 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) 173 return ip->i_mount->m_sb.sb_rextsize;
180 ? ip->i_d.di_extsize 174 return 0;
181 : ip->i_mount->m_sb.sb_rextsize;
182 ASSERT(extsz);
183 } else {
184 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
185 ? ip->i_d.di_extsize : 0;
186 }
187
188 return extsz;
189} 175}
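
The xfs_rw.c hunks continue the conversion from cmn_err()/xfs_cmn_err() to the mount-aware message helpers (xfs_warn, xfs_notice, xfs_alert, xfs_alert_tag) added in fs/xfs/linux-2.6/xfs_message.[ch] per the diffstat; the helpers take the mount point as their first argument, so the "%s", mp->m_fsname boilerplate disappears from the format strings. Call style, copied from the hunks above rather than invented:

	xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
		"Corruption of in-memory data detected. Shutting down filesystem");

	xfs_alert(mp,
		"Please umount the filesystem and rectify the problem(s)");

The xfs_get_extsz_hint() rewrite at the end of the file is a pure simplification of the nested ternaries: return the inode's own hint when XFS_DIFLAG_EXTSIZE is set and the hint is non-zero, fall back to the filesystem's realtime extent size for realtime inodes, and return 0 otherwise.
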
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b7711e..76922793f64f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
1137 if (blkdelta) 1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); 1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out: 1139out:
1140 ASSERT(error = 0); 1140 ASSERT(error == 0);
1141 return; 1141 return;
1142} 1142}
1143 1143
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
1350 * they could be immediately flushed and we'd have to race with the flusher 1350 * they could be immediately flushed and we'd have to race with the flusher
1351 * trying to pull the item from the AIL as we add it. 1351 * trying to pull the item from the AIL as we add it.
1352 */ 1352 */
1353void 1353static void
1354xfs_trans_item_committed( 1354xfs_trans_item_committed(
1355 struct xfs_log_item *lip, 1355 struct xfs_log_item *lip,
1356 xfs_lsn_t commit_lsn, 1356 xfs_lsn_t commit_lsn,
@@ -1425,21 +1425,120 @@ xfs_trans_committed(
1425 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
1426} 1426}
1427 1427
1428static inline void
1429xfs_log_item_batch_insert(
1430 struct xfs_ail *ailp,
1431 struct xfs_log_item **log_items,
1432 int nr_items,
1433 xfs_lsn_t commit_lsn)
1434{
1435 int i;
1436
1437 spin_lock(&ailp->xa_lock);
1438 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1439 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1440
1441 for (i = 0; i < nr_items; i++)
1442 IOP_UNPIN(log_items[i], 0);
1443}
1444
1428/* 1445/*
1429 * Called from the trans_commit code when we notice that 1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1430 * the filesystem is in the middle of a forced shutdown. 1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic.
1449 *
1450 * If we are called with the aborted flag set, it is because a log write during
1451 * a CIL checkpoint commit has failed. In this case, all the items in the
 1452 * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which
1453 * means that checkpoint commit abort handling is treated exactly the same
1454 * as an iclog write error even though we haven't started any IO yet. Hence in
1455 * this case all we need to do is IOP_COMMITTED processing, followed by an
1456 * IOP_UNPIN(aborted) call.
1457 */
1458void
1459xfs_trans_committed_bulk(
1460 struct xfs_ail *ailp,
1461 struct xfs_log_vec *log_vector,
1462 xfs_lsn_t commit_lsn,
1463 int aborted)
1464{
1465#define LOG_ITEM_BATCH_SIZE 32
1466 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1467 struct xfs_log_vec *lv;
1468 int i = 0;
1469
1470 /* unpin all the log items */
1471 for (lv = log_vector; lv; lv = lv->lv_next ) {
1472 struct xfs_log_item *lip = lv->lv_item;
1473 xfs_lsn_t item_lsn;
1474
1475 if (aborted)
1476 lip->li_flags |= XFS_LI_ABORTED;
1477 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1478
1479 /* item_lsn of -1 means the item was freed */
1480 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1481 continue;
1482
1483 /*
1484 * if we are aborting the operation, no point in inserting the
1485 * object into the AIL as we are in a shutdown situation.
1486 */
1487 if (aborted) {
1488 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1489 IOP_UNPIN(lip, 1);
1490 continue;
1491 }
1492
1493 if (item_lsn != commit_lsn) {
1494
1495 /*
1496 * Not a bulk update option due to unusual item_lsn.
1497 * Push into AIL immediately, rechecking the lsn once
1498 * we have the ail lock. Then unpin the item.
1499 */
1500 spin_lock(&ailp->xa_lock);
1501 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1502 xfs_trans_ail_update(ailp, lip, item_lsn);
1503 else
1504 spin_unlock(&ailp->xa_lock);
1505 IOP_UNPIN(lip, 0);
1506 continue;
1507 }
1508
1509 /* Item is a candidate for bulk AIL insert. */
1510 log_items[i++] = lv->lv_item;
1511 if (i >= LOG_ITEM_BATCH_SIZE) {
1512 xfs_log_item_batch_insert(ailp, log_items,
1513 LOG_ITEM_BATCH_SIZE, commit_lsn);
1514 i = 0;
1515 }
1516 }
1517
1518 /* make sure we insert the remainder! */
1519 if (i)
1520 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1521}
1522
1523/*
1524 * Called from the trans_commit code when we notice that the filesystem is in
1525 * the middle of a forced shutdown.
1526 *
1527 * When we are called here, we have already pinned all the items in the
1528 * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
1529 * so we can simply walk the items in the transaction, unpin them with an abort
1530 * flag and then free the items. Note that unpinning the items can result in
1531 * them being freed immediately, so we need to use a safe list traversal method
1532 * here.
1431 */ 1533 */
1432STATIC void 1534STATIC void
1433xfs_trans_uncommit( 1535xfs_trans_uncommit(
1434 struct xfs_trans *tp, 1536 struct xfs_trans *tp,
1435 uint flags) 1537 uint flags)
1436{ 1538{
1437 struct xfs_log_item_desc *lidp; 1539 struct xfs_log_item_desc *lidp, *n;
1438 1540
1439 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 1541 list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
1440 /*
1441 * Unpin all but those that aren't dirty.
1442 */
1443 if (lidp->lid_flags & XFS_LID_DIRTY) 1542 if (lidp->lid_flags & XFS_LID_DIRTY)
1444 IOP_UNPIN(lidp->lid_item, 1); 1543 IOP_UNPIN(lidp->lid_item, 1);
1445 } 1544 }
@@ -1656,7 +1755,6 @@ xfs_trans_commit_cil(
1656 int flags) 1755 int flags)
1657{ 1756{
1658 struct xfs_log_vec *log_vector; 1757 struct xfs_log_vec *log_vector;
1659 int error;
1660 1758
1661 /* 1759 /*
1662 * Get each log item to allocate a vector structure for 1760 * Get each log item to allocate a vector structure for
@@ -1667,9 +1765,7 @@ xfs_trans_commit_cil(
1667 if (!log_vector) 1765 if (!log_vector)
1668 return ENOMEM; 1766 return ENOMEM;
1669 1767
1670 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); 1768 xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1671 if (error)
1672 return error;
1673 1769
1674 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1770 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1675 xfs_trans_free(tp); 1771 xfs_trans_free(tp);
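
The new xfs_trans_committed_bulk() above replaces per-item AIL insertion during checkpoint completion with batched insertion: candidate items are collected into an on-stack array and xfs_log_item_batch_insert() takes ailp->xa_lock once for up to LOG_ITEM_BATCH_SIZE items, unpinning them after the bulk update. Items with an unusual committed LSN, or items in an aborted checkpoint, are still handled one at a time. A stripped-down sketch of the batching idiom (the full function is in the hunk above; the per-item handling is elided here):

	struct xfs_log_item	*batch[LOG_ITEM_BATCH_SIZE];
	int			n = 0;

	for (lv = log_vector; lv; lv = lv->lv_next) {
		/* ... IOP_COMMITTED / abort / odd-LSN handling elided ... */
		batch[n++] = lv->lv_item;
		if (n == LOG_ITEM_BATCH_SIZE) {
			xfs_log_item_batch_insert(ailp, batch, n, commit_lsn);
			n = 0;
		}
	}
	if (n)	/* flush the partial final batch */
		xfs_log_item_batch_insert(ailp, batch, n, commit_lsn);
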
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b77a86..06a9759b6352 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
@@ -469,8 +469,6 @@ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 473void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 474void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff7..acdb92f14d51 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,74 +28,138 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 32
36#ifdef DEBUG 33#ifdef DEBUG
37STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *); 34/*
38#else 35 * Check that the list is sorted as it should be.
36 */
37STATIC void
38xfs_ail_check(
39 struct xfs_ail *ailp,
40 xfs_log_item_t *lip)
41{
42 xfs_log_item_t *prev_lip;
43
44 if (list_empty(&ailp->xa_ail))
45 return;
46
47 /*
48 * Check the next and previous entries are valid.
49 */
50 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
51 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
52 if (&prev_lip->li_ail != &ailp->xa_ail)
53 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
54
55 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
56 if (&prev_lip->li_ail != &ailp->xa_ail)
57 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
58
59
60#ifdef XFS_TRANS_DEBUG
61 /*
62 * Walk the list checking lsn ordering, and that every entry has the
63 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
64 * when specifically debugging the transaction subsystem.
65 */
66 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
67 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
68 if (&prev_lip->li_ail != &ailp->xa_ail)
69 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
70 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
71 prev_lip = lip;
72 }
73#endif /* XFS_TRANS_DEBUG */
74}
75#else /* !DEBUG */
39#define xfs_ail_check(a,l) 76#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 77#endif /* DEBUG */
41 78
79/*
80 * Return a pointer to the first item in the AIL. If the AIL is empty, then
81 * return NULL.
82 */
83static xfs_log_item_t *
84xfs_ail_min(
85 struct xfs_ail *ailp)
86{
87 if (list_empty(&ailp->xa_ail))
88 return NULL;
89
90 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
91}
92
93 /*
94 * Return a pointer to the last item in the AIL. If the AIL is empty, then
95 * return NULL.
96 */
97static xfs_log_item_t *
98xfs_ail_max(
99 struct xfs_ail *ailp)
100{
101 if (list_empty(&ailp->xa_ail))
102 return NULL;
103
104 return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
105}
106
107/*
108 * Return a pointer to the item which follows the given item in the AIL. If
109 * the given item is the last item in the list, then return NULL.
110 */
111static xfs_log_item_t *
112xfs_ail_next(
113 struct xfs_ail *ailp,
114 xfs_log_item_t *lip)
115{
116 if (lip->li_ail.next == &ailp->xa_ail)
117 return NULL;
118
119 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
120}
42 121
43/* 122/*
44 * This is called by the log manager code to determine the LSN 123 * This is called by the log manager code to determine the LSN of the tail of
45 * of the tail of the log. This is exactly the LSN of the first 124 * the log. This is exactly the LSN of the first item in the AIL. If the AIL
46 * item in the AIL. If the AIL is empty, then this function 125 * is empty, then this function returns 0.
47 * returns 0.
48 * 126 *
49 * We need the AIL lock in order to get a coherent read of the 127 * We need the AIL lock in order to get a coherent read of the lsn of the last
50 * lsn of the last item in the AIL. 128 * item in the AIL.
51 */ 129 */
52xfs_lsn_t 130xfs_lsn_t
53xfs_trans_ail_tail( 131xfs_ail_min_lsn(
54 struct xfs_ail *ailp) 132 struct xfs_ail *ailp)
55{ 133{
56 xfs_lsn_t lsn; 134 xfs_lsn_t lsn = 0;
57 xfs_log_item_t *lip; 135 xfs_log_item_t *lip;
58 136
59 spin_lock(&ailp->xa_lock); 137 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(ailp); 138 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 139 if (lip)
62 lsn = (xfs_lsn_t)0;
63 } else {
64 lsn = lip->li_lsn; 140 lsn = lip->li_lsn;
65 }
66 spin_unlock(&ailp->xa_lock); 141 spin_unlock(&ailp->xa_lock);
67 142
68 return lsn; 143 return lsn;
69} 144}
70 145
71/* 146/*
72 * xfs_trans_push_ail 147 * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
73 *
74 * This routine is called to move the tail of the AIL forward. It does this by
75 * trying to flush items in the AIL whose lsns are below the given
76 * threshold_lsn.
77 *
78 * the push is run asynchronously in a separate thread, so we return the tail
79 * of the log right now instead of the tail after the push. This means we will
80 * either continue right away, or we will sleep waiting on the async thread to
81 * do its work.
82 *
83 * We do this unlocked - we only need to know whether there is anything in the
84 * AIL at the time we are called. We don't need to access the contents of
85 * any of the objects, so the lock is not needed.
86 */ 148 */
87void 149static xfs_lsn_t
88xfs_trans_ail_push( 150xfs_ail_max_lsn(
89 struct xfs_ail *ailp, 151 struct xfs_ail *ailp)
90 xfs_lsn_t threshold_lsn)
91{ 152{
92 xfs_log_item_t *lip; 153 xfs_lsn_t lsn = 0;
154 xfs_log_item_t *lip;
93 155
94 lip = xfs_ail_min(ailp); 156 spin_lock(&ailp->xa_lock);
95 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { 157 lip = xfs_ail_max(ailp);
96 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) 158 if (lip)
97 xfsaild_wakeup(ailp, threshold_lsn); 159 lsn = lip->li_lsn;
98 } 160 spin_unlock(&ailp->xa_lock);
161
162 return lsn;
99} 163}
100 164
101/* 165/*
@@ -236,16 +300,57 @@ out:
236} 300}
237 301
238/* 302/*
239 * xfsaild_push does the work of pushing on the AIL. Returning a timeout of 303 * splice the log item list into the AIL at the given LSN.
240 * zero indicates that the caller should sleep until woken.
241 */ 304 */
242long 305static void
243xfsaild_push( 306xfs_ail_splice(
244 struct xfs_ail *ailp, 307 struct xfs_ail *ailp,
245 xfs_lsn_t *last_lsn) 308 struct list_head *list,
309 xfs_lsn_t lsn)
310{
311 xfs_log_item_t *next_lip;
312
313 /* If the list is empty, just insert the item. */
314 if (list_empty(&ailp->xa_ail)) {
315 list_splice(list, &ailp->xa_ail);
316 return;
317 }
318
319 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
320 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
321 break;
322 }
323
324 ASSERT(&next_lip->li_ail == &ailp->xa_ail ||
325 XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0);
326
327 list_splice_init(list, &next_lip->li_ail);
328}
329
330/*
331 * Delete the given item from the AIL. Return a pointer to the item.
332 */
333static void
334xfs_ail_delete(
335 struct xfs_ail *ailp,
336 xfs_log_item_t *lip)
246{ 337{
247 long tout = 0; 338 xfs_ail_check(ailp, lip);
248 xfs_lsn_t last_pushed_lsn = *last_lsn; 339 list_del(&lip->li_ail);
340 xfs_trans_ail_cursor_clear(ailp, lip);
341}
342
343/*
344 * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself
345 * to run at a later time if there is more work to do to complete the push.
346 */
347STATIC void
348xfs_ail_worker(
349 struct work_struct *work)
350{
351 struct xfs_ail *ailp = container_of(to_delayed_work(work),
352 struct xfs_ail, xa_work);
353 long tout;
249 xfs_lsn_t target = ailp->xa_target; 354 xfs_lsn_t target = ailp->xa_target;
250 xfs_lsn_t lsn; 355 xfs_lsn_t lsn;
251 xfs_log_item_t *lip; 356 xfs_log_item_t *lip;
@@ -256,15 +361,15 @@ xfsaild_push(
256 361
257 spin_lock(&ailp->xa_lock); 362 spin_lock(&ailp->xa_lock);
258 xfs_trans_ail_cursor_init(ailp, cur); 363 xfs_trans_ail_cursor_init(ailp, cur);
259 lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn); 364 lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn);
260 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 365 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
261 /* 366 /*
262 * AIL is empty or our push has reached the end. 367 * AIL is empty or our push has reached the end.
263 */ 368 */
264 xfs_trans_ail_cursor_done(ailp, cur); 369 xfs_trans_ail_cursor_done(ailp, cur);
265 spin_unlock(&ailp->xa_lock); 370 spin_unlock(&ailp->xa_lock);
266 *last_lsn = 0; 371 ailp->xa_last_pushed_lsn = 0;
267 return tout; 372 return;
268 } 373 }
269 374
270 XFS_STATS_INC(xs_push_ail); 375 XFS_STATS_INC(xs_push_ail);
@@ -301,13 +406,13 @@ xfsaild_push(
301 case XFS_ITEM_SUCCESS: 406 case XFS_ITEM_SUCCESS:
302 XFS_STATS_INC(xs_push_ail_success); 407 XFS_STATS_INC(xs_push_ail_success);
303 IOP_PUSH(lip); 408 IOP_PUSH(lip);
304 last_pushed_lsn = lsn; 409 ailp->xa_last_pushed_lsn = lsn;
305 break; 410 break;
306 411
307 case XFS_ITEM_PUSHBUF: 412 case XFS_ITEM_PUSHBUF:
308 XFS_STATS_INC(xs_push_ail_pushbuf); 413 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 414 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 415 ailp->xa_last_pushed_lsn = lsn;
311 push_xfsbufd = 1; 416 push_xfsbufd = 1;
312 break; 417 break;
313 418
@@ -319,7 +424,7 @@ xfsaild_push(
319 424
320 case XFS_ITEM_LOCKED: 425 case XFS_ITEM_LOCKED:
321 XFS_STATS_INC(xs_push_ail_locked); 426 XFS_STATS_INC(xs_push_ail_locked);
322 last_pushed_lsn = lsn; 427 ailp->xa_last_pushed_lsn = lsn;
323 stuck++; 428 stuck++;
324 break; 429 break;
325 430
@@ -374,9 +479,23 @@ xfsaild_push(
374 wake_up_process(mp->m_ddev_targp->bt_task); 479 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 480 }
376 481
482 /* assume we have more work to do in a short while */
483 tout = 10;
377 if (!count) { 484 if (!count) {
378 /* We're past our target or empty, so idle */ 485 /* We're past our target or empty, so idle */
379 last_pushed_lsn = 0; 486 ailp->xa_last_pushed_lsn = 0;
487
488 /*
489 * Check for an updated push target before clearing the
490 * XFS_AIL_PUSHING_BIT. If the target changed, we've got more
491 * work to do. Wait a bit longer before starting that work.
492 */
493 smp_rmb();
494 if (ailp->xa_target == target) {
495 clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
496 return;
497 }
498 tout = 50;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 499 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 500 /*
382 * We reached the target so wait a bit longer for I/O to 501 * We reached the target so wait a bit longer for I/O to
@@ -384,7 +503,7 @@ xfsaild_push(
384 * start the next scan from the start of the AIL. 503 * start the next scan from the start of the AIL.
385 */ 504 */
386 tout = 50; 505 tout = 50;
387 last_pushed_lsn = 0; 506 ailp->xa_last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 507 } else if ((stuck * 100) / count > 90) {
389 /* 508 /*
390 * Either there is a lot of contention on the AIL or we 509 * Either there is a lot of contention on the AIL or we
@@ -396,14 +515,61 @@ xfsaild_push(
396 * continuing from where we were. 515 * continuing from where we were.
397 */ 516 */
398 tout = 20; 517 tout = 20;
399 } else {
400 /* more to do, but wait a short while before continuing */
401 tout = 10;
402 } 518 }
403 *last_lsn = last_pushed_lsn; 519
404 return tout; 520 /* There is more to do, requeue us. */
521 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
522 msecs_to_jiffies(tout));
405} 523}
406 524
525/*
526 * This routine is called to move the tail of the AIL forward. It does this by
527 * trying to flush items in the AIL whose lsns are below the given
528 * threshold_lsn.
529 *
530 * The push is run asynchronously in a workqueue, which means the caller needs
531 * to handle waiting on the async flush for space to become available.
532 * We don't want to interrupt any push that is in progress, hence we only queue
 533 * work if we set the pushing bit appropriately.
534 *
535 * We do this unlocked - we only need to know whether there is anything in the
536 * AIL at the time we are called. We don't need to access the contents of
537 * any of the objects, so the lock is not needed.
538 */
539void
540xfs_ail_push(
541 struct xfs_ail *ailp,
542 xfs_lsn_t threshold_lsn)
543{
544 xfs_log_item_t *lip;
545
546 lip = xfs_ail_min(ailp);
547 if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) ||
548 XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0)
549 return;
550
551 /*
552 * Ensure that the new target is noticed in push code before it clears
553 * the XFS_AIL_PUSHING_BIT.
554 */
555 smp_wmb();
556 ailp->xa_target = threshold_lsn;
557 if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
558 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
559}
560
561/*
562 * Push out all items in the AIL immediately
563 */
564void
565xfs_ail_push_all(
566 struct xfs_ail *ailp)
567{
568 xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp);
569
570 if (threshold_lsn)
571 xfs_ail_push(ailp, threshold_lsn);
572}
407 573
408/* 574/*
409 * This is to be called when an item is unlocked that may have 575 * This is to be called when an item is unlocked that may have
@@ -449,129 +615,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 615 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 616} /* xfs_trans_unlocked_item */
451 617
452
453/* 618/*
454 * Update the position of the item in the AIL with the new 619 * xfs_trans_ail_update - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 620 *
456 * it to its new position by removing it and re-adding it. 621 * @xfs_trans_ail_update takes an array of log items that all need to be
622 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
623 * be added. Otherwise, it will be repositioned by removing it and re-adding
624 * it to the AIL. If we move the first item in the AIL, update the log tail to
625 * match the new minimum LSN in the AIL.
626 *
627 * This function takes the AIL lock once to execute the update operations on
628 * all the items in the array, and as such should not be called with the AIL
629 * lock held. As a result, once we have the AIL lock, we need to check each log
630 * item LSN to confirm it needs to be moved forward in the AIL.
457 * 631 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 632 * To optimise the insert operation, we delete all the items from the AIL in
459 * we move in the AIL is the minimum one, update the tail lsn in the 633 * the first pass, moving them into a temporary list, then splice the temporary
460 * log manager. 634 * list into the correct position in the AIL. This avoids needing to do an
635 * insert operation on every item.
461 * 636 *
462 * This function must be called with the AIL lock held. The lock 637 * This function must be called with the AIL lock held. The lock is dropped
463 * is dropped before returning. 638 * before returning.
464 */ 639 */
465void 640void
466xfs_trans_ail_update( 641xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 642 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 643 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 644 int nr_items,
645 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 646{
471 xfs_log_item_t *dlip = NULL; 647 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 648 xfs_lsn_t tail_lsn;
649 int mlip_changed = 0;
650 int i;
651 LIST_HEAD(tmp);
474 652
475 mlip = xfs_ail_min(ailp); 653 mlip = xfs_ail_min(ailp);
476 654
477 if (lip->li_flags & XFS_LI_IN_AIL) { 655 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 656 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 657 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 658 /* check if we really need to move the item */
481 } else { 659 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 660 continue;
661
662 xfs_ail_delete(ailp, lip);
663 if (mlip == lip)
664 mlip_changed = 1;
665 } else {
666 lip->li_flags |= XFS_LI_IN_AIL;
667 }
668 lip->li_lsn = lsn;
669 list_add(&lip->li_ail, &tmp);
483 } 670 }
484 671
485 lip->li_lsn = lsn; 672 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 673
488 if (mlip == dlip) { 674 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 675 spin_unlock(&ailp->xa_lock);
676 return;
502 } 677 }
503 678
504 679 /*
505} /* xfs_trans_update_ail */ 680 * It is not safe to access mlip after the AIL lock is dropped, so we
681 * must get a copy of li_lsn before we do so. This is especially
682 * important on 32-bit platforms where accessing and updating 64-bit
683 * values like li_lsn is not atomic.
684 */
685 mlip = xfs_ail_min(ailp);
686 tail_lsn = mlip->li_lsn;
687 spin_unlock(&ailp->xa_lock);
688 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
689}
506 690
507/* 691/*
508 * Delete the given item from the AIL. It must already be in 692 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL. 693 *
 694 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to be
 695 * removed from the AIL. The caller is already holding the AIL lock, and has done
696 * all the checks necessary to ensure the items passed in via @log_items are
697 * ready for deletion. This includes checking that the items are in the AIL.
510 * 698 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 699 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
512 * we delete in the AIL is the minimum one, update the tail lsn in the 700 * flag from the item and reset the item's lsn to 0. If we remove the first
513 * log manager. 701 * item in the AIL, update the log tail to match the new minimum LSN in the
702 * AIL.
514 * 703 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 704 * This function will not drop the AIL lock until all items are removed from
516 * bump the AIL's generation count to indicate that the tree 705 * the AIL to minimise the amount of lock traffic on the AIL. This does not
517 * has changed. 706 * greatly increase the AIL hold time, but does significantly reduce the amount
707 * of traffic on the lock, especially during IO completion.
518 * 708 *
519 * This function must be called with the AIL lock held. The lock 709 * This function must be called with the AIL lock held. The lock is dropped
520 * is dropped before returning. 710 * before returning.
521 */ 711 */
522void 712void
523xfs_trans_ail_delete( 713xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 714 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 715 struct xfs_log_item **log_items,
716 int nr_items) __releases(ailp->xa_lock)
526{ 717{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 718 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 719 xfs_lsn_t tail_lsn;
720 int mlip_changed = 0;
721 int i;
530 722
531 if (lip->li_flags & XFS_LI_IN_AIL) { 723 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536 724
725 for (i = 0; i < nr_items; i++) {
726 struct xfs_log_item *lip = log_items[i];
727 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
728 struct xfs_mount *mp = ailp->xa_mount;
537 729
538 lip->li_flags &= ~XFS_LI_IN_AIL;
539 lip->li_lsn = 0;
540
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 730 spin_unlock(&ailp->xa_lock);
731 if (!XFS_FORCED_SHUTDOWN(mp)) {
732 xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
733 "%s: attempting to delete a log item that is not in the AIL",
734 __func__);
735 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
736 }
737 return;
555 } 738 }
739
740 xfs_ail_delete(ailp, lip);
741 lip->li_flags &= ~XFS_LI_IN_AIL;
742 lip->li_lsn = 0;
743 if (mlip == lip)
744 mlip_changed = 1;
556 } 745 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 746
747 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 748 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 749 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 750 }
572}
573
574 751
752 /*
753 * It is not safe to access mlip after the AIL lock is dropped, so we
754 * must get a copy of li_lsn before we do so. This is especially
755 * important on 32-bit platforms where accessing and updating 64-bit
756 * values like li_lsn is not atomic. It is possible we've emptied the
757 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
758 */
759 mlip = xfs_ail_min(ailp);
760 tail_lsn = mlip ? mlip->li_lsn : 0;
761 spin_unlock(&ailp->xa_lock);
762 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
763}
575 764
576/* 765/*
577 * The active item list (AIL) is a doubly linked list of log 766 * The active item list (AIL) is a doubly linked list of log
@@ -592,7 +781,6 @@ xfs_trans_ail_init(
592 xfs_mount_t *mp) 781 xfs_mount_t *mp)
593{ 782{
594 struct xfs_ail *ailp; 783 struct xfs_ail *ailp;
595 int error;
596 784
597 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); 785 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
598 if (!ailp) 786 if (!ailp)
@@ -601,15 +789,9 @@ xfs_trans_ail_init(
601 ailp->xa_mount = mp; 789 ailp->xa_mount = mp;
602 INIT_LIST_HEAD(&ailp->xa_ail); 790 INIT_LIST_HEAD(&ailp->xa_ail);
603 spin_lock_init(&ailp->xa_lock); 791 spin_lock_init(&ailp->xa_lock);
604 error = xfsaild_start(ailp); 792 INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker);
605 if (error)
606 goto out_free_ailp;
607 mp->m_ail = ailp; 793 mp->m_ail = ailp;
608 return 0; 794 return 0;
609
610out_free_ailp:
611 kmem_free(ailp);
612 return error;
613} 795}
614 796
615void 797void
@@ -618,135 +800,6 @@ xfs_trans_ail_destroy(
618{ 800{
619 struct xfs_ail *ailp = mp->m_ail; 801 struct xfs_ail *ailp = mp->m_ail;
620 802
621 xfsaild_stop(ailp); 803 cancel_delayed_work_sync(&ailp->xa_work);
622 kmem_free(ailp); 804 kmem_free(ailp);
623} 805}
624
625/*
626 * Insert the given log item into the AIL.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */
631STATIC void
632xfs_ail_insert(
633 struct xfs_ail *ailp,
634 xfs_log_item_t *lip)
635/* ARGSUSED */
636{
637 xfs_log_item_t *next_lip;
638
639 /*
640 * If the list is empty, just insert the item.
641 */
642 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail);
644 return;
645 }
646
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
649 break;
650 }
651
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656
657 xfs_ail_check(ailp, lip);
658 return;
659}
660
661/*
662 * Delete the given item from the AIL. Return a pointer to the item.
663 */
664/*ARGSUSED*/
665STATIC xfs_log_item_t *
666xfs_ail_delete(
667 struct xfs_ail *ailp,
668 xfs_log_item_t *lip)
669/* ARGSUSED */
670{
671 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail);
674
675 return lip;
676}
677
678/*
679 * Return a pointer to the first item in the AIL.
680 * If the AIL is empty, then return NULL.
681 */
682STATIC xfs_log_item_t *
683xfs_ail_min(
684 struct xfs_ail *ailp)
685/* ARGSUSED */
686{
687 if (list_empty(&ailp->xa_ail))
688 return NULL;
689
690 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
691}
692
693/*
694 * Return a pointer to the item which follows
695 * the given item in the AIL. If the given item
696 * is the last item in the list, then return NULL.
697 */
698STATIC xfs_log_item_t *
699xfs_ail_next(
700 struct xfs_ail *ailp,
701 xfs_log_item_t *lip)
702/* ARGSUSED */
703{
704 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL;
706
707 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
708}
709
710#ifdef DEBUG
711/*
712 * Check that the list is sorted as it should be.
713 */
714STATIC void
715xfs_ail_check(
716 struct xfs_ail *ailp,
717 xfs_log_item_t *lip)
718{
719 xfs_log_item_t *prev_lip;
720
721 if (list_empty(&ailp->xa_ail))
722 return;
723
724 /*
725 * Check the next and previous entries are valid.
726 */
727 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
728 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
729 if (&prev_lip->li_ail != &ailp->xa_ail)
730 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
731
732 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
733 if (&prev_lip->li_ail != &ailp->xa_ail)
734 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
735
736
737#ifdef XFS_TRANS_DEBUG
738 /*
739 * Walk the list checking lsn ordering, and that every entry has the
740 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
741 * when specifically debugging the transaction subsystem.
742 */
743 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
744 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
745 if (&prev_lip->li_ail != &ailp->xa_ail)
746 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
747 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
748 prev_lip = lip;
749 }
750#endif /* XFS_TRANS_DEBUG */
751}
752#endif /* DEBUG */
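
The xfs_trans_ail.c rewrite does three related things: it replaces the dedicated xfsaild thread with a self-requeuing delayed work item (ailp->xa_work, run from xfs_syncd_wq), it turns the single-item AIL update/delete entry points into bulk operations that splice a pre-built temporary list into the AIL with xfs_ail_splice() so the lock is taken once per batch, and it moves the push target and last-pushed LSN into struct xfs_ail. The idle/wakeup handshake relies on XFS_AIL_PUSHING_BIT and the smp_wmb()/smp_rmb() pair so that a target raised by xfs_ail_push() just as the worker goes idle is not lost. A condensed sketch of that handshake, using the fields added to struct xfs_ail in the xfs_trans_priv.h hunks below:

	/* producer: publish the new target, then kick the worker if idle */
	smp_wmb();
	ailp->xa_target = threshold_lsn;
	if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
		queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);

	/* worker, after running out of work: only go idle if the target
	 * has not moved since it was sampled at the start of the pass */
	smp_rmb();
	if (ailp->xa_target == target) {
		clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
		return;
	}
	tout = 50;	/* target moved; requeue and keep pushing */
	queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, msecs_to_jiffies(tout));
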
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c47918c302a5..03b3b7f85a3b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -305,7 +305,7 @@ xfs_trans_read_buf(
305 if (xfs_error_target == target) { 305 if (xfs_error_target == target) {
306 if (((xfs_req_num++) % xfs_error_mod) == 0) { 306 if (((xfs_req_num++) % xfs_error_mod) == 0) {
307 xfs_buf_relse(bp); 307 xfs_buf_relse(bp);
308 cmn_err(CE_DEBUG, "Returning error!\n"); 308 xfs_debug(mp, "Returning error!");
309 return XFS_ERROR(EIO); 309 return XFS_ERROR(EIO);
310 } 310 }
311 } 311 }
@@ -383,7 +383,8 @@ xfs_trans_read_buf(
383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); 383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
384 if (bp == NULL) { 384 if (bp == NULL) {
385 *bpp = NULL; 385 *bpp = NULL;
386 return 0; 386 return (flags & XBF_TRYLOCK) ?
387 0 : XFS_ERROR(ENOMEM);
387 } 388 }
388 if (XFS_BUF_GETERROR(bp) != 0) { 389 if (XFS_BUF_GETERROR(bp) != 0) {
389 XFS_BUF_SUPER_STALE(bp); 390 XFS_BUF_SUPER_STALE(bp);
@@ -403,7 +404,7 @@ xfs_trans_read_buf(
403 xfs_force_shutdown(tp->t_mountp, 404 xfs_force_shutdown(tp->t_mountp,
404 SHUTDOWN_META_IO_ERROR); 405 SHUTDOWN_META_IO_ERROR);
405 xfs_buf_relse(bp); 406 xfs_buf_relse(bp);
406 cmn_err(CE_DEBUG, "Returning trans error!\n"); 407 xfs_debug(mp, "Returning trans error!");
407 return XFS_ERROR(EIO); 408 return XFS_ERROR(EIO);
408 } 409 }
409 } 410 }
@@ -427,7 +428,7 @@ shutdown_abort:
427 */ 428 */
428#if defined(DEBUG) 429#if defined(DEBUG)
429 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 430 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
430 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 431 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
431#endif 432#endif
432 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != 433 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
433 (XBF_STALE|XBF_DELWRI)); 434 (XBF_STALE|XBF_DELWRI));
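
In xfs_trans_read_buf() a NULL buffer from xfs_buf_read() is now reported as an error (ENOMEM) unless the caller asked for a non-blocking read; with XBF_TRYLOCK the old behaviour of returning 0 with a NULL buffer is kept, meaning "lock not available, try again later". A caller-side sketch; the argument list is assumed from the calls visible in this hunk rather than quoted from the header:

	error = xfs_trans_read_buf(mp, tp, target, blkno, len,
				   XBF_TRYLOCK, &bp);
	if (error)
		return error;	/* genuine I/O or allocation failure */
	if (!bp)
		return 0;	/* trylock missed; retry on a later pass */
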
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa70..f7590f5badea 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
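
The xfs_trans_log_efi_extent() change lets concurrent callers claim extent slots without serialising on a lock: efi_next_extent is treated as an atomic counter (the matching type change lives in xfs_extfree_item.h, per the diffstat), and since atomic_inc_return() yields the post-increment value, subtracting one turns it into the zero-based index of the slot just reserved. The same idiom in isolation:

	/* reserve the next slot: post-increment value minus 1 is our index */
	next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
	ASSERT(next_extent < efip->efi_format.efi_nextents);
	extp = &efip->efi_format.efi_extents[next_extent];
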
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ccb34532768b..048b0c689d3e 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -44,28 +44,6 @@ xfs_trans_inode_broot_debug(
44#endif 44#endif
45 45
46/* 46/*
47 * Get an inode and join it to the transaction.
48 */
49int
50xfs_trans_iget(
51 xfs_mount_t *mp,
52 xfs_trans_t *tp,
53 xfs_ino_t ino,
54 uint flags,
55 uint lock_flags,
56 xfs_inode_t **ipp)
57{
58 int error;
59
60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
61 if (!error && tp) {
62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
65 return error;
66}
67
68/*
69 * Add a locked inode to the transaction. 47 * Add a locked inode to the transaction.
70 * 48 *
71 * The inode must be locked, and it cannot be associated with any transaction. 49 * The inode must be locked, and it cannot be associated with any transaction.
@@ -103,7 +81,7 @@ xfs_trans_ijoin(
103 * 81 *
104 * 82 *
105 * Grabs a reference to the inode which will be dropped when the transaction 83 * Grabs a reference to the inode which will be dropped when the transaction
106 * is commited. The inode will also be unlocked at that point. The inode 84 * is committed. The inode will also be unlocked at that point. The inode
107 * must be locked, and it cannot be associated with any transaction. 85 * must be locked, and it cannot be associated with any transaction.
108 */ 86 */
109void 87void
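
With xfs_trans_iget() deleted here (and its prototype removed from xfs_trans.h above), callers perform the inode lookup and the transaction join as two explicit steps; the removed wrapper body shows what that amounts to. A minimal open-coded equivalent built from the same calls the deleted function made (the rtalloc hunks earlier avoid even the lookup by using the mount's cached bitmap inode with xfs_trans_ijoin_ref()):

	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	/* inode is returned locked; attach it to the transaction */
	xfs_trans_ijoin(tp, ip);
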
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de5..6b164e9e9a1f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -63,28 +65,52 @@ struct xfs_ail_cursor {
63struct xfs_ail { 65struct xfs_ail {
64 struct xfs_mount *xa_mount; 66 struct xfs_mount *xa_mount;
65 struct list_head xa_ail; 67 struct list_head xa_ail;
66 uint xa_gen;
67 struct task_struct *xa_task;
68 xfs_lsn_t xa_target; 68 xfs_lsn_t xa_target;
69 struct xfs_ail_cursor xa_cursors; 69 struct xfs_ail_cursor xa_cursors;
70 spinlock_t xa_lock; 70 spinlock_t xa_lock;
71 struct delayed_work xa_work;
72 xfs_lsn_t xa_last_pushed_lsn;
73 unsigned long xa_flags;
71}; 74};
72 75
76#define XFS_AIL_PUSHING_BIT 0
77
73/* 78/*
74 * From xfs_trans_ail.c 79 * From xfs_trans_ail.c
75 */ 80 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 81
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 82extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
78 __releases(ailp->xa_lock); 83
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 84void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
80 struct xfs_log_item *lip) 85 struct xfs_log_item **log_items, int nr_items,
81 __releases(ailp->xa_lock); 86 xfs_lsn_t lsn) __releases(ailp->xa_lock);
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 87static inline void
88xfs_trans_ail_update(
89 struct xfs_ail *ailp,
90 struct xfs_log_item *lip,
91 xfs_lsn_t lsn) __releases(ailp->xa_lock)
92{
93 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
94}
95
96void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
97 struct xfs_log_item **log_items, int nr_items)
98 __releases(ailp->xa_lock);
99static inline void
100xfs_trans_ail_delete(
101 struct xfs_ail *ailp,
102 xfs_log_item_t *lip) __releases(ailp->xa_lock)
103{
104 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
105}
106
107void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
108void xfs_ail_push_all(struct xfs_ail *);
109xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
110
83void xfs_trans_unlocked_item(struct xfs_ail *, 111void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 112 xfs_log_item_t *);
85 113
86xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
87
88struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, 114struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
89 struct xfs_ail_cursor *cur, 115 struct xfs_ail_cursor *cur,
90 xfs_lsn_t lsn); 116 xfs_lsn_t lsn);
@@ -93,11 +119,6 @@ struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
93void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, 119void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
94 struct xfs_ail_cursor *cur); 120 struct xfs_ail_cursor *cur);
95 121
96long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
97void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
98int xfsaild_start(struct xfs_ail *);
99void xfsaild_stop(struct xfs_ail *);
100
101#if BITS_PER_LONG != 64 122#if BITS_PER_LONG != 64
102static inline void 123static inline void
103xfs_trans_ail_copy_lsn( 124xfs_trans_ail_copy_lsn(
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4151a..b7a5fe7c52c8 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -953,7 +953,7 @@ xfs_release(
953 * If we previously truncated this file and removed old data 953 * If we previously truncated this file and removed old data
954 * in the process, we want to initiate "early" writeout on 954 * in the process, we want to initiate "early" writeout on
955 * the last close. This is an attempt to combat the notorious 955 * the last close. This is an attempt to combat the notorious
956 * NULL files problem which is particularly noticable from a 956 * NULL files problem which is particularly noticeable from a
957 * truncate down, buffered (re-)write (delalloc), followed by 957 * truncate down, buffered (re-)write (delalloc), followed by
958 * a crash. What we are effectively doing here is 958 * a crash. What we are effectively doing here is
959 * significantly reducing the time window where we'd otherwise 959 * significantly reducing the time window where we'd otherwise
@@ -964,29 +964,48 @@ xfs_release(
964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
965 } 965 }
966 966
967 if (ip->i_d.di_nlink != 0) { 967 if (ip->i_d.di_nlink == 0)
968 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 968 return 0;
969 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
970 ip->i_delayed_blks > 0)) &&
971 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
972 (!(ip->i_d.di_flags &
973 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
974 969
975 /* 970 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
976 * If we can't get the iolock just skip truncating 971 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
977 * the blocks past EOF because we could deadlock 972 ip->i_delayed_blks > 0)) &&
978 * with the mmap_sem otherwise. We'll get another 973 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
979 * chance to drop them once the last reference to 974 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
980 * the inode is dropped, so we'll never leak blocks 975
981 * permanently. 976 /*
982 */ 977 * If we can't get the iolock just skip truncating the blocks
983 error = xfs_free_eofblocks(mp, ip, 978 * past EOF because we could deadlock with the mmap_sem
984 XFS_FREE_EOF_TRYLOCK); 979 * otherwise. We'll get another chance to drop them once the
985 if (error) 980 * last reference to the inode is dropped, so we'll never leak
986 return error; 981 * blocks permanently.
987 } 982 *
988 } 983 * Further, check if the inode is being opened, written and
984 * closed frequently and we have delayed allocation blocks
985 * outstanding (e.g. streaming writes from the NFS server),
986 * truncating the blocks past EOF will cause fragmentation to
987 * occur.
988 *
989 * In this case don't do the truncation, either, but we have to
990 * be careful how we detect this case. Blocks beyond EOF show
991 * up as i_delayed_blks even when the inode is clean, so we
992 * need to truncate them away first before checking for a dirty
993 * release. Hence on the first dirty close we will still remove
994 * the speculative allocation, but after that we will leave it
995 * in place.
996 */
997 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
998 return 0;
999
1000 error = xfs_free_eofblocks(mp, ip,
1001 XFS_FREE_EOF_TRYLOCK);
1002 if (error)
1003 return error;
989 1004
1005 /* delalloc blocks after truncation means it really is dirty */
1006 if (ip->i_delayed_blks)
1007 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1008 }
990 return 0; 1009 return 0;
991} 1010}
992 1011
@@ -1170,9 +1189,8 @@ xfs_inactive(
1170 * inode might be lost for a long time or forever. 1189 * inode might be lost for a long time or forever.
1171 */ 1190 */
1172 if (!XFS_FORCED_SHUTDOWN(mp)) { 1191 if (!XFS_FORCED_SHUTDOWN(mp)) {
1173 cmn_err(CE_NOTE, 1192 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1174 "xfs_inactive: xfs_ifree() returned an error = %d on %s", 1193 __func__, error);
1175 error, mp->m_fsname);
1176 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1194 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1177 } 1195 }
1178 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 1196 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
@@ -1189,12 +1207,12 @@ xfs_inactive(
1189 */ 1207 */
1190 error = xfs_bmap_finish(&tp, &free_list, &committed); 1208 error = xfs_bmap_finish(&tp, &free_list, &committed);
1191 if (error) 1209 if (error)
1192 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1210 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1193 "xfs_bmap_finish() returned error %d", error); 1211 __func__, error);
1194 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1212 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1195 if (error) 1213 if (error)
1196 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1214 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1197 "xfs_trans_commit() returned error %d", error); 1215 __func__, error);
1198 } 1216 }
1199 1217
1200 /* 1218 /*
@@ -1291,7 +1309,7 @@ xfs_create(
1291 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 1309 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1292 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 1310 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1293 if (error) 1311 if (error)
1294 goto std_return; 1312 return error;
1295 1313
1296 if (is_dir) { 1314 if (is_dir) {
1297 rdev = 0; 1315 rdev = 0;
@@ -1371,12 +1389,6 @@ xfs_create(
1371 } 1389 }
1372 1390
1373 /* 1391 /*
1374 * At this point, we've gotten a newly allocated inode.
1375 * It is locked (and joined to the transaction).
1376 */
1377 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1378
1379 /*
1380 * Now we join the directory inode to the transaction. We do not do it 1392 * Now we join the directory inode to the transaction. We do not do it
1381 * earlier because xfs_dir_ialloc might commit the previous transaction 1393 * earlier because xfs_dir_ialloc might commit the previous transaction
1382 * (and release all the locks). An error from here on will result in 1394 * (and release all the locks). An error from here on will result in
@@ -1421,22 +1433,13 @@ xfs_create(
1421 */ 1433 */
1422 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 1434 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1423 1435
1424 /*
1425 * xfs_trans_commit normally decrements the vnode ref count
1426 * when it unlocks the inode. Since we want to return the
1427 * vnode to the caller, we bump the vnode ref count now.
1428 */
1429 IHOLD(ip);
1430
1431 error = xfs_bmap_finish(&tp, &free_list, &committed); 1436 error = xfs_bmap_finish(&tp, &free_list, &committed);
1432 if (error) 1437 if (error)
1433 goto out_abort_rele; 1438 goto out_bmap_cancel;
1434 1439
1435 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1440 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1436 if (error) { 1441 if (error)
1437 IRELE(ip); 1442 goto out_release_inode;
1438 goto out_dqrele;
1439 }
1440 1443
1441 xfs_qm_dqrele(udqp); 1444 xfs_qm_dqrele(udqp);
1442 xfs_qm_dqrele(gdqp); 1445 xfs_qm_dqrele(gdqp);
@@ -1450,27 +1453,21 @@ xfs_create(
1450 cancel_flags |= XFS_TRANS_ABORT; 1453 cancel_flags |= XFS_TRANS_ABORT;
1451 out_trans_cancel: 1454 out_trans_cancel:
1452 xfs_trans_cancel(tp, cancel_flags); 1455 xfs_trans_cancel(tp, cancel_flags);
1453 out_dqrele: 1456 out_release_inode:
1457 /*
1458 * Wait until after the current transaction is aborted to
1459 * release the inode. This prevents recursive transactions
1460 * and deadlocks from xfs_inactive.
1461 */
1462 if (ip)
1463 IRELE(ip);
1464
1454 xfs_qm_dqrele(udqp); 1465 xfs_qm_dqrele(udqp);
1455 xfs_qm_dqrele(gdqp); 1466 xfs_qm_dqrele(gdqp);
1456 1467
1457 if (unlock_dp_on_error) 1468 if (unlock_dp_on_error)
1458 xfs_iunlock(dp, XFS_ILOCK_EXCL); 1469 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1459 std_return:
1460 return error; 1470 return error;
1461
1462 out_abort_rele:
1463 /*
1464 * Wait until after the current transaction is aborted to
1465 * release the inode. This prevents recursive transactions
1466 * and deadlocks from xfs_inactive.
1467 */
1468 xfs_bmap_cancel(&free_list);
1469 cancel_flags |= XFS_TRANS_ABORT;
1470 xfs_trans_cancel(tp, cancel_flags);
1471 IRELE(ip);
1472 unlock_dp_on_error = B_FALSE;
1473 goto out_dqrele;
1474} 1471}
1475 1472
1476#ifdef DEBUG 1473#ifdef DEBUG
@@ -2095,9 +2092,8 @@ xfs_symlink(
2095 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, 2092 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2096 &first_block, resblks, mval, &nmaps, 2093 &first_block, resblks, mval, &nmaps,
2097 &free_list); 2094 &free_list);
2098 if (error) { 2095 if (error)
2099 goto error1; 2096 goto error2;
2100 }
2101 2097
2102 if (resblks) 2098 if (resblks)
2103 resblks -= fs_blocks; 2099 resblks -= fs_blocks;
@@ -2129,7 +2125,7 @@ xfs_symlink(
2129 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 2125 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2130 &first_block, &free_list, resblks); 2126 &first_block, &free_list, resblks);
2131 if (error) 2127 if (error)
2132 goto error1; 2128 goto error2;
2133 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2129 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2134 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2130 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2135 2131
@@ -2142,13 +2138,6 @@ xfs_symlink(
2142 xfs_trans_set_sync(tp); 2138 xfs_trans_set_sync(tp);
2143 } 2139 }
2144 2140
2145 /*
2146 * xfs_trans_commit normally decrements the vnode ref count
2147 * when it unlocks the inode. Since we want to return the
2148 * vnode to the caller, we bump the vnode ref count now.
2149 */
2150 IHOLD(ip);
2151
2152 error = xfs_bmap_finish(&tp, &free_list, &committed); 2141 error = xfs_bmap_finish(&tp, &free_list, &committed);
2153 if (error) { 2142 if (error) {
2154 goto error2; 2143 goto error2;
@@ -2842,7 +2831,8 @@ xfs_change_file_space(
2842 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; 2831 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
2843 2832
2844 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2833 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2845 xfs_trans_set_sync(tp); 2834 if (attr_flags & XFS_ATTR_SYNC)
2835 xfs_trans_set_sync(tp);
2846 2836
2847 error = xfs_trans_commit(tp, 0); 2837 error = xfs_trans_commit(tp, 0);
2848 2838
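
xfs_change_file_space() now forces a synchronous transaction commit only when the caller passes the new XFS_ATTR_SYNC flag (added to xfs_vnodeops.h below), instead of unconditionally. A hypothetical caller-side sketch: the flag and the xfs_change_file_space() parameters come from this diff, but the ioctl plumbing around it is assumed purely for illustration:

	int	attr_flags = 0;

	/* only force the commit to disk for sync-style opens */
	if (filp->f_flags & O_DSYNC)
		attr_flags |= XFS_ATTR_SYNC;

	error = xfs_change_file_space(ip, cmd, &bf, pos, attr_flags);
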
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index f6702927eee4..3bcd23353d6c 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
21#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
21 22
22int xfs_readlink(struct xfs_inode *ip, char *link); 23int xfs_readlink(struct xfs_inode *ip, char *link);
23int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);