author      Alex Elder <aelder@sgi.com>   2011-01-10 22:35:55 -0500
committer   Alex Elder <aelder@sgi.com>   2011-01-10 22:35:55 -0500
commit      92f1c008ae79e32b83c0607d184b194f302bb3ee (patch)
tree        070980c581ca39a050a1b86a50fe4c52437cdba1 /fs/xfs/linux-2.6
parent      e54be894eae10eca9892e965cc9532f5d5a11767 (diff)
parent      d0eb2f38b250b7d6c993adf81b0e4ded0565497e (diff)
Merge branch 'master' into for-linus-merged
This merge pulls the XFS master branch into the latest Linus master. This
results in a merge conflict whose best fix is not obvious. I manually fixed
the conflict, in "fs/xfs/xfs_iget.c".

Dave Chinner had done work that resulted in RCU freeing of inodes separate
from what Nick Piggin had done, and their results differed slightly in
xfs_inode_free(). The fix updates Nick's call_rcu() with the use of VFS_I(),
while incorporating needed updates to some XFS inode fields implemented in
Dave's series. Dave's RCU callback function has also been removed.

Signed-off-by: Alex Elder <aelder@sgi.com>
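The conflict resolution itself lives in fs/xfs/xfs_iget.c, outside the diffstat below, but the pattern the message describes is worth spelling out. A minimal sketch, assuming the usual XFS helpers (VFS_I(), XFS_I(), kmem_zone_free()) and the i_rcu head that Nick's series added to struct inode; the callback name and teardown details are illustrative and may differ from the resolved code:

/*
 * Illustrative sketch of RCU-deferred inode freeing; not the resolved hunk.
 */
STATIC void
xfs_inode_free_callback(	/* name assumed for illustration */
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);

	kmem_zone_free(xfs_inode_zone, XFS_I(inode));
}

STATIC void
xfs_inode_free_sketch(		/* hypothetical wrapper, not in the tree */
	struct xfs_inode	*ip)
{
	/* ...XFS-private teardown elided... */

	/*
	 * Defer the final free until after an RCU grace period so that
	 * lock-free lookups still walking this inode never see freed
	 * memory.  VFS_I() converts the XFS inode to the VFS inode that
	 * carries the rcu_head.
	 */
	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}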
Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r--   fs/xfs/linux-2.6/sv.h            59
-rw-r--r--   fs/xfs/linux-2.6/xfs_aops.c     425
-rw-r--r--   fs/xfs/linux-2.6/xfs_aops.h      16
-rw-r--r--   fs/xfs/linux-2.6/xfs_buf.c      235
-rw-r--r--   fs/xfs/linux-2.6/xfs_buf.h       22
-rw-r--r--   fs/xfs/linux-2.6/xfs_export.c    12
-rw-r--r--   fs/xfs/linux-2.6/xfs_linux.h      1
-rw-r--r--   fs/xfs/linux-2.6/xfs_super.c     22
-rw-r--r--   fs/xfs/linux-2.6/xfs_sync.c      92
-rw-r--r--   fs/xfs/linux-2.6/xfs_trace.h     59
10 files changed, 496 insertions, 447 deletions
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c37081..00000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
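Callers of the removed wrapper paired an sv_t with a spinlock protecting the condition being waited on. A hypothetical caller, written only against the macros above (the names and the condition flag are invented for illustration):

static spinlock_t	foo_lock;	/* protects foo_done; hypothetical */
static int		foo_done;
static sv_t		foo_sv;

static void
foo_init(void)
{
	spin_lock_init(&foo_lock);
	sv_init(&foo_sv, 0, "foo");
}

static void
foo_wait_for_done(void)
{
	spin_lock(&foo_lock);
	while (!foo_done) {
		/* sv_wait() drops foo_lock and sleeps uninterruptibly */
		sv_wait(&foo_sv, 0, &foo_lock, 0);
		spin_lock(&foo_lock);
	}
	spin_unlock(&foo_lock);
}

static void
foo_mark_done(void)
{
	spin_lock(&foo_lock);
	foo_done = 1;
	spin_unlock(&foo_lock);
	sv_signal(&foo_sv);		/* wake one exclusive waiter */
}

With the header gone, the remaining in-tree users presumably move to plain wait queues or completions directly.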
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f61223ed..ec7bbb5645b 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
327 330
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
342
343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
382 WRITE_SYNC_PLUG : WRITE, bio); 417 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 418}
386 419
387STATIC struct bio * 420STATIC struct bio *
388xfs_alloc_ioend_bio( 421xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 422 struct buffer_head *bh)
390{ 423{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 424 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 425 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 426
399 ASSERT(bio->bi_private == NULL); 427 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 428 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 429 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 430 return bio;
404} 431}
405 432
@@ -470,9 +497,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 497 /* Pass 1 - start writeback */
471 do { 498 do {
472 next = ioend->io_list; 499 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 501 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 502 } while ((ioend = next) != NULL);
477 503
478 /* Pass 2 - submit I/O */ 504 /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 626 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 627 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 628
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 629 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 630 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 631 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 632 clear_buffer_unwritten(bh);
609} 633}
610 634
611/* 635/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 636 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 637 * or delayed allocate extent.
716 */ 638 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 653 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 654 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 655 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 656 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 657 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 658 acceptable = (type == IO_OVERWRITE);
737 else 659 else
738 break; 660 break;
739 } while ((bh = bh->b_this_page) != head); 661 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
758 loff_t tindex, 680 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 681 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 682 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 683 struct writeback_control *wbc)
762 int all_bh)
763{ 684{
764 struct buffer_head *bh, *head; 685 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 686 xfs_off_t end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
814 continue; 735 continue;
815 } 736 }
816 737
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 738 if (buffer_unwritten(bh) || buffer_delay(bh) ||
739 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 740 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 741 type = IO_UNWRITTEN;
742 else if (buffer_delay(bh))
743 type = IO_DELALLOC;
820 else 744 else
821 type = IO_DELAY; 745 type = IO_OVERWRITE;
822 746
823 if (!xfs_imap_valid(inode, imap, offset)) { 747 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 748 done = 1;
825 continue; 749 continue;
826 } 750 }
827 751
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 752 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 753 if (type != IO_OVERWRITE)
830 754 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 755 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 756 ioendp, done);
834 757
835 page_dirty--; 758 page_dirty--;
836 count++; 759 count++;
837 } else { 760 } else {
838 type = IO_NEW; 761 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 762 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 763 } while (offset += len, (bh = bh->b_this_page) != head);
850 764
@@ -876,7 +790,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 790 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 791 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 792 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 793 pgoff_t tlast)
881{ 794{
882 struct pagevec pvec; 795 struct pagevec pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
891 804
892 for (i = 0; i < pagevec_count(&pvec); i++) { 805 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 806 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 807 imap, ioendp, wbc);
895 if (done) 808 if (done)
896 break; 809 break;
897 } 810 }
@@ -935,7 +848,7 @@ xfs_aops_discard_page(
935 struct buffer_head *bh, *head; 848 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 849 loff_t offset = page_offset(page);
937 850
938 if (!xfs_is_delayed_page(page, IO_DELAY)) 851 if (!xfs_is_delayed_page(page, IO_DELALLOC))
939 goto out_invalidate; 852 goto out_invalidate;
940 853
941 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 854 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1002,10 +915,10 @@ xfs_vm_writepage(
1002 unsigned int type; 915 unsigned int type;
1003 __uint64_t end_offset; 916 __uint64_t end_offset;
1004 pgoff_t end_index, last_index; 917 pgoff_t end_index, last_index;
1005 ssize_t size, len; 918 ssize_t len;
1006 int flags, err, imap_valid = 0, uptodate = 1; 919 int err, imap_valid = 0, uptodate = 1;
1007 int count = 0; 920 int count = 0;
1008 int all_bh = 0; 921 int nonblocking = 0;
1009 922
1010 trace_xfs_writepage(inode, page, 0); 923 trace_xfs_writepage(inode, page, 0);
1011 924
@@ -1056,10 +969,14 @@ xfs_vm_writepage(
1056 969
1057 bh = head = page_buffers(page); 970 bh = head = page_buffers(page);
1058 offset = page_offset(page); 971 offset = page_offset(page);
1059 flags = BMAPI_READ; 972 type = IO_OVERWRITE;
1060 type = IO_NEW; 973
974 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
975 nonblocking = 1;
1061 976
1062 do { 977 do {
978 int new_ioend = 0;
979
1063 if (offset >= end_offset) 980 if (offset >= end_offset)
1064 break; 981 break;
1065 if (!buffer_uptodate(bh)) 982 if (!buffer_uptodate(bh))
@@ -1076,90 +993,54 @@ xfs_vm_writepage(
1076 continue; 993 continue;
1077 } 994 }
1078 995
1079 if (imap_valid) 996 if (buffer_unwritten(bh)) {
1080 imap_valid = xfs_imap_valid(inode, &imap, offset); 997 if (type != IO_UNWRITTEN) {
1081
1082 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1083 int new_ioend = 0;
1084
1085 /*
1086 * Make sure we don't use a read-only iomap
1087 */
1088 if (flags == BMAPI_READ)
1089 imap_valid = 0;
1090
1091 if (buffer_unwritten(bh)) {
1092 type = IO_UNWRITTEN; 998 type = IO_UNWRITTEN;
1093 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 999 imap_valid = 0;
1094 } else if (buffer_delay(bh)) {
1095 type = IO_DELAY;
1096 flags = BMAPI_ALLOCATE;
1097
1098 if (wbc->sync_mode == WB_SYNC_NONE)
1099 flags |= BMAPI_TRYLOCK;
1100 }
1101
1102 if (!imap_valid) {
1103 /*
1104 * If we didn't have a valid mapping then we
1105 * need to ensure that we put the new mapping
1106 * in a new ioend structure. This needs to be
1107 * done to ensure that the ioends correctly
1108 * reflect the block mappings at io completion
1109 * for unwritten extent conversion.
1110 */
1111 new_ioend = 1;
1112 err = xfs_map_blocks(inode, offset, len,
1113 &imap, flags);
1114 if (err)
1115 goto error;
1116 imap_valid = xfs_imap_valid(inode, &imap,
1117 offset);
1118 } 1000 }
1119 if (imap_valid) { 1001 } else if (buffer_delay(bh)) {
1120 xfs_map_at_offset(inode, bh, &imap, offset); 1002 if (type != IO_DELALLOC) {
1121 xfs_add_to_ioend(inode, bh, offset, type, 1003 type = IO_DELALLOC;
1122 &ioend, new_ioend); 1004 imap_valid = 0;
1123 count++;
1124 } 1005 }
1125 } else if (buffer_uptodate(bh)) { 1006 } else if (buffer_uptodate(bh)) {
1126 /* 1007 if (type != IO_OVERWRITE) {
1127 * we got here because the buffer is already mapped. 1008 type = IO_OVERWRITE;
1128 * That means it must already have extents allocated 1009 imap_valid = 0;
1129 * underneath it. Map the extent by reading it.
1130 */
1131 if (!imap_valid || flags != BMAPI_READ) {
1132 flags = BMAPI_READ;
1133 size = xfs_probe_cluster(inode, page, bh, head);
1134 err = xfs_map_blocks(inode, offset, size,
1135 &imap, flags);
1136 if (err)
1137 goto error;
1138 imap_valid = xfs_imap_valid(inode, &imap,
1139 offset);
1140 } 1010 }
1011 } else {
1012 if (PageUptodate(page)) {
1013 ASSERT(buffer_mapped(bh));
1014 imap_valid = 0;
1015 }
1016 continue;
1017 }
1141 1018
1019 if (imap_valid)
1020 imap_valid = xfs_imap_valid(inode, &imap, offset);
1021 if (!imap_valid) {
1142 /* 1022 /*
1143 * We set the type to IO_NEW in case we are doing a 1023 * If we didn't have a valid mapping then we need to
1144 * small write at EOF that is extending the file but 1024 * put the new mapping into a separate ioend structure.
1145 * without needing an allocation. We need to update the 1025 * This ensures non-contiguous extents always have
1146 * file size on I/O completion in this case so it is 1026 * separate ioends, which is particularly important
1147 * the same case as having just allocated a new extent 1027 * for unwritten extent conversion at I/O completion
1148 * that we are writing into for the first time. 1028 * time.
1149 */ 1029 */
1150 type = IO_NEW; 1030 new_ioend = 1;
1151 if (trylock_buffer(bh)) { 1031 err = xfs_map_blocks(inode, offset, &imap, type,
1152 if (imap_valid) 1032 nonblocking);
1153 all_bh = 1; 1033 if (err)
1154 xfs_add_to_ioend(inode, bh, offset, type, 1034 goto error;
1155 &ioend, !imap_valid); 1035 imap_valid = xfs_imap_valid(inode, &imap, offset);
1156 count++; 1036 }
1157 } else { 1037 if (imap_valid) {
1158 imap_valid = 0; 1038 lock_buffer(bh);
1159 } 1039 if (type != IO_OVERWRITE)
1160 } else if (PageUptodate(page)) { 1040 xfs_map_at_offset(inode, bh, &imap, offset);
1161 ASSERT(buffer_mapped(bh)); 1041 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1162 imap_valid = 0; 1042 new_ioend);
1043 count++;
1163 } 1044 }
1164 1045
1165 if (!iohead) 1046 if (!iohead)
@@ -1188,7 +1069,7 @@ xfs_vm_writepage(
1188 end_index = last_index; 1069 end_index = last_index;
1189 1070
1190 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1071 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1191 wbc, all_bh, end_index); 1072 wbc, end_index);
1192 } 1073 }
1193 1074
1194 if (iohead) 1075 if (iohead)
@@ -1257,13 +1138,19 @@ __xfs_get_blocks(
1257 int create, 1138 int create,
1258 int direct) 1139 int direct)
1259{ 1140{
1260 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1141 struct xfs_inode *ip = XFS_I(inode);
1142 struct xfs_mount *mp = ip->i_mount;
1143 xfs_fileoff_t offset_fsb, end_fsb;
1144 int error = 0;
1145 int lockmode = 0;
1261 struct xfs_bmbt_irec imap; 1146 struct xfs_bmbt_irec imap;
1147 int nimaps = 1;
1262 xfs_off_t offset; 1148 xfs_off_t offset;
1263 ssize_t size; 1149 ssize_t size;
1264 int nimap = 1;
1265 int new = 0; 1150 int new = 0;
1266 int error; 1151
1152 if (XFS_FORCED_SHUTDOWN(mp))
1153 return -XFS_ERROR(EIO);
1267 1154
1268 offset = (xfs_off_t)iblock << inode->i_blkbits; 1155 offset = (xfs_off_t)iblock << inode->i_blkbits;
1269 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1156 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1159,45 @@ __xfs_get_blocks(
1272 if (!create && direct && offset >= i_size_read(inode)) 1159 if (!create && direct && offset >= i_size_read(inode))
1273 return 0; 1160 return 0;
1274 1161
1275 if (direct && create) 1162 if (create) {
1276 flags |= BMAPI_DIRECT; 1163 lockmode = XFS_ILOCK_EXCL;
1164 xfs_ilock(ip, lockmode);
1165 } else {
1166 lockmode = xfs_ilock_map_shared(ip);
1167 }
1168
1169 ASSERT(offset <= mp->m_maxioffset);
1170 if (offset + size > mp->m_maxioffset)
1171 size = mp->m_maxioffset - offset;
1172 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1173 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1277 1174
1278 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1175 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1279 &new); 1176 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1280 if (error) 1177 if (error)
1281 return -error; 1178 goto out_unlock;
1282 if (nimap == 0) 1179
1283 return 0; 1180 if (create &&
1181 (!nimaps ||
1182 (imap.br_startblock == HOLESTARTBLOCK ||
1183 imap.br_startblock == DELAYSTARTBLOCK))) {
1184 if (direct) {
1185 error = xfs_iomap_write_direct(ip, offset, size,
1186 &imap, nimaps);
1187 } else {
1188 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1189 }
1190 if (error)
1191 goto out_unlock;
1192
1193 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1194 } else if (nimaps) {
1195 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1196 } else {
1197 trace_xfs_get_blocks_notfound(ip, offset, size);
1198 goto out_unlock;
1199 }
1200 xfs_iunlock(ip, lockmode);
1284 1201
1285 if (imap.br_startblock != HOLESTARTBLOCK && 1202 if (imap.br_startblock != HOLESTARTBLOCK &&
1286 imap.br_startblock != DELAYSTARTBLOCK) { 1203 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1264,10 @@ __xfs_get_blocks(
1347 } 1264 }
1348 1265
1349 return 0; 1266 return 0;
1267
1268out_unlock:
1269 xfs_iunlock(ip, lockmode);
1270 return -error;
1350} 1271}
1351 1272
1352int 1273int
@@ -1434,7 +1355,7 @@ xfs_vm_direct_IO(
1434 ssize_t ret; 1355 ssize_t ret;
1435 1356
1436 if (rw & WRITE) { 1357 if (rw & WRITE) {
1437 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1358 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1438 1359
1439 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1360 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1440 offset, nr_segs, 1361 offset, nr_segs,
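A recurring idea in the xfs_vm_writepage() rework above is that each buffer_head is classified into a single I/O type (IO_UNWRITTEN, IO_DELALLOC or IO_OVERWRITE) and the cached mapping is invalidated whenever the type changes, so each ioend only ever covers one kind of extent. The fragment below is a standalone model of just that classification step, with invented struct and field names; it compiles on its own and is not the kernel code:

#include <stdbool.h>
#include <stdio.h>

enum io_type { IO_OVERWRITE, IO_DELALLOC, IO_UNWRITTEN };

struct bh_model {			/* invented stand-in for buffer_head state */
	bool unwritten, delay, uptodate;
};

/*
 * Classify one buffer.  Whenever the wanted type differs from the current
 * one, the cached mapping is invalidated so the next mapping lookup is made
 * for the new type and its blocks go into a new ioend.
 */
static enum io_type
classify(const struct bh_model *bh, enum io_type cur, bool *imap_valid)
{
	enum io_type want;

	if (bh->unwritten)
		want = IO_UNWRITTEN;
	else if (bh->delay)
		want = IO_DELALLOC;
	else if (bh->uptodate)
		want = IO_OVERWRITE;
	else
		return cur;		/* the real code skips such buffers */

	if (want != cur)
		*imap_valid = false;	/* force a fresh mapping lookup */
	return want;
}

int main(void)
{
	struct bh_model page[3] = {
		{ .delay = true }, { .delay = true }, { .uptodate = true },
	};
	enum io_type type = IO_OVERWRITE;
	bool imap_valid = false;

	for (int i = 0; i < 3; i++) {
		type = classify(&page[i], type, &imap_valid);
		printf("bh %d -> type %d, imap_valid %d\n", i, type, (int)imap_valid);
		imap_valid = true;	/* pretend the mapping lookup succeeded */
	}
	return 0;
}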
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237..71f721e1a71 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
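The XFS_IO_TYPES table added above is in the { value, string } form the kernel tracing macros expect (IO_DIRECT, value 0, deliberately maps to an empty string). It is presumably consumed via __print_symbolic() when trace events print the I/O type, roughly along these lines (illustrative fragment, not the in-tree event definition):

	TP_printk("dev %d:%d ino 0x%llx offset 0x%llx count 0x%zx type %s",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino, __entry->offset, __entry->count,
		  __print_symbolic(__entry->type, XFS_IO_TYPES))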
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6e9e3..92f1f2acc6a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
168} 163}
169 164
170/* 165/*
171 * Internal xfs_buf_t object manipulation 166 * xfs_buf_lru_add - add a buffer to the LRU.
167 *
168 * The LRU takes a new reference to the buffer so that it will only be freed
169 * once the shrinker takes the buffer off the LRU.
172 */ 170 */
171STATIC void
172xfs_buf_lru_add(
173 struct xfs_buf *bp)
174{
175 struct xfs_buftarg *btp = bp->b_target;
176
177 spin_lock(&btp->bt_lru_lock);
178 if (list_empty(&bp->b_lru)) {
179 atomic_inc(&bp->b_hold);
180 list_add_tail(&bp->b_lru, &btp->bt_lru);
181 btp->bt_lru_nr++;
182 }
183 spin_unlock(&btp->bt_lru_lock);
184}
185
186/*
187 * xfs_buf_lru_del - remove a buffer from the LRU
188 *
189 * The unlocked check is safe here because it only occurs when there are not
190 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
191 * to optimise the shrinker removing the buffer from the LRU and calling
192 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock.
194 */
195STATIC void
196xfs_buf_lru_del(
197 struct xfs_buf *bp)
198{
199 struct xfs_buftarg *btp = bp->b_target;
200
201 if (list_empty(&bp->b_lru))
202 return;
203
204 spin_lock(&btp->bt_lru_lock);
205 if (!list_empty(&bp->b_lru)) {
206 list_del_init(&bp->b_lru);
207 btp->bt_lru_nr--;
208 }
209 spin_unlock(&btp->bt_lru_lock);
210}
211
212/*
213 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
214 * b_lru_ref count so that the buffer is freed immediately when the buffer
215 * reference count falls to zero. If the buffer is already on the LRU, we need
216 * to remove the reference that LRU holds on the buffer.
217 *
218 * This prevents build-up of stale buffers on the LRU.
219 */
220void
221xfs_buf_stale(
222 struct xfs_buf *bp)
223{
224 bp->b_flags |= XBF_STALE;
225 atomic_set(&(bp)->b_lru_ref, 0);
226 if (!list_empty(&bp->b_lru)) {
227 struct xfs_buftarg *btp = bp->b_target;
228
229 spin_lock(&btp->bt_lru_lock);
230 if (!list_empty(&bp->b_lru)) {
231 list_del_init(&bp->b_lru);
232 btp->bt_lru_nr--;
233 atomic_dec(&bp->b_hold);
234 }
235 spin_unlock(&btp->bt_lru_lock);
236 }
237 ASSERT(atomic_read(&bp->b_hold) >= 1);
238}
173 239
174STATIC void 240STATIC void
175_xfs_buf_initialize( 241_xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
186 252
187 memset(bp, 0, sizeof(xfs_buf_t)); 253 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 254 atomic_set(&bp->b_hold, 1);
255 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 258 INIT_LIST_HEAD(&bp->b_list);
191 RB_CLEAR_NODE(&bp->b_rbnode); 259 RB_CLEAR_NODE(&bp->b_rbnode);
192 sema_init(&bp->b_sema, 0); /* held, no waiters */ 260 sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
262{ 330{
263 trace_xfs_buf_free(bp, _RET_IP_); 331 trace_xfs_buf_free(bp, _RET_IP_);
264 332
333 ASSERT(list_empty(&bp->b_lru));
334
265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
266 uint i; 336 uint i;
267 337
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
337 __func__, gfp_mask); 407 __func__, gfp_mask);
338 408
339 XFS_STATS_INC(xb_page_retries); 409 XFS_STATS_INC(xb_page_retries);
340 xfsbufd_wakeup(NULL, 0, gfp_mask);
341 congestion_wait(BLK_RW_ASYNC, HZ/50); 410 congestion_wait(BLK_RW_ASYNC, HZ/50);
342 goto retry; 411 goto retry;
343 } 412 }
@@ -828,6 +897,7 @@ xfs_buf_rele(
828 897
829 if (!pag) { 898 if (!pag) {
830 ASSERT(!bp->b_relse); 899 ASSERT(!bp->b_relse);
900 ASSERT(list_empty(&bp->b_lru));
831 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 901 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
832 if (atomic_dec_and_test(&bp->b_hold)) 902 if (atomic_dec_and_test(&bp->b_hold))
833 xfs_buf_free(bp); 903 xfs_buf_free(bp);
@@ -835,13 +905,19 @@ xfs_buf_rele(
835 } 905 }
836 906
837 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 907 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
908
838 ASSERT(atomic_read(&bp->b_hold) > 0); 909 ASSERT(atomic_read(&bp->b_hold) > 0);
839 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { 910 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
840 if (bp->b_relse) { 911 if (bp->b_relse) {
841 atomic_inc(&bp->b_hold); 912 atomic_inc(&bp->b_hold);
842 spin_unlock(&pag->pag_buf_lock); 913 spin_unlock(&pag->pag_buf_lock);
843 bp->b_relse(bp); 914 bp->b_relse(bp);
915 } else if (!(bp->b_flags & XBF_STALE) &&
916 atomic_read(&bp->b_lru_ref)) {
917 xfs_buf_lru_add(bp);
918 spin_unlock(&pag->pag_buf_lock);
844 } else { 919 } else {
920 xfs_buf_lru_del(bp);
845 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 921 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
846 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 922 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
847 spin_unlock(&pag->pag_buf_lock); 923 spin_unlock(&pag->pag_buf_lock);
@@ -1438,51 +1514,84 @@ xfs_buf_iomove(
1438 */ 1514 */
1439 1515
1440/* 1516/*
1441 * Wait for any bufs with callbacks that have been submitted but 1517 * Wait for any bufs with callbacks that have been submitted but have not yet
1442 * have not yet returned... walk the hash list for the target. 1518 * returned. These buffers will have an elevated hold count, so wait on those
1519 * while freeing all the buffers only held by the LRU.
1443 */ 1520 */
1444void 1521void
1445xfs_wait_buftarg( 1522xfs_wait_buftarg(
1446 struct xfs_buftarg *btp) 1523 struct xfs_buftarg *btp)
1447{ 1524{
1448 struct xfs_perag *pag; 1525 struct xfs_buf *bp;
1449 uint i;
1450 1526
1451 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { 1527restart:
1452 pag = xfs_perag_get(btp->bt_mount, i); 1528 spin_lock(&btp->bt_lru_lock);
1453 spin_lock(&pag->pag_buf_lock); 1529 while (!list_empty(&btp->bt_lru)) {
1454 while (rb_first(&pag->pag_buf_tree)) { 1530 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1455 spin_unlock(&pag->pag_buf_lock); 1531 if (atomic_read(&bp->b_hold) > 1) {
1532 spin_unlock(&btp->bt_lru_lock);
1456 delay(100); 1533 delay(100);
1457 spin_lock(&pag->pag_buf_lock); 1534 goto restart;
1458 } 1535 }
1459 spin_unlock(&pag->pag_buf_lock); 1536 /*
1460 xfs_perag_put(pag); 1537 * clear the LRU reference count so the bufer doesn't get
1538 * ignored in xfs_buf_rele().
1539 */
1540 atomic_set(&bp->b_lru_ref, 0);
1541 spin_unlock(&btp->bt_lru_lock);
1542 xfs_buf_rele(bp);
1543 spin_lock(&btp->bt_lru_lock);
1461 } 1544 }
1545 spin_unlock(&btp->bt_lru_lock);
1462} 1546}
1463 1547
1464/* 1548int
1465 * buftarg list for delwrite queue processing 1549xfs_buftarg_shrink(
1466 */ 1550 struct shrinker *shrink,
1467static LIST_HEAD(xfs_buftarg_list); 1551 int nr_to_scan,
1468static DEFINE_SPINLOCK(xfs_buftarg_lock); 1552 gfp_t mask)
1469
1470STATIC void
1471xfs_register_buftarg(
1472 xfs_buftarg_t *btp)
1473{ 1553{
1474 spin_lock(&xfs_buftarg_lock); 1554 struct xfs_buftarg *btp = container_of(shrink,
1475 list_add(&btp->bt_list, &xfs_buftarg_list); 1555 struct xfs_buftarg, bt_shrinker);
1476 spin_unlock(&xfs_buftarg_lock); 1556 struct xfs_buf *bp;
1477} 1557 LIST_HEAD(dispose);
1478 1558
1479STATIC void 1559 if (!nr_to_scan)
1480xfs_unregister_buftarg( 1560 return btp->bt_lru_nr;
1481 xfs_buftarg_t *btp) 1561
1482{ 1562 spin_lock(&btp->bt_lru_lock);
1483 spin_lock(&xfs_buftarg_lock); 1563 while (!list_empty(&btp->bt_lru)) {
1484 list_del(&btp->bt_list); 1564 if (nr_to_scan-- <= 0)
1485 spin_unlock(&xfs_buftarg_lock); 1565 break;
1566
1567 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1568
1569 /*
1570 * Decrement the b_lru_ref count unless the value is already
1571 * zero. If the value is already zero, we need to reclaim the
1572 * buffer, otherwise it gets another trip through the LRU.
1573 */
1574 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1575 list_move_tail(&bp->b_lru, &btp->bt_lru);
1576 continue;
1577 }
1578
1579 /*
1580 * remove the buffer from the LRU now to avoid needing another
1581 * lock round trip inside xfs_buf_rele().
1582 */
1583 list_move(&bp->b_lru, &dispose);
1584 btp->bt_lru_nr--;
1585 }
1586 spin_unlock(&btp->bt_lru_lock);
1587
1588 while (!list_empty(&dispose)) {
1589 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1590 list_del_init(&bp->b_lru);
1591 xfs_buf_rele(bp);
1592 }
1593
1594 return btp->bt_lru_nr;
1486} 1595}
1487 1596
1488void 1597void
@@ -1490,17 +1599,14 @@ xfs_free_buftarg(
1490 struct xfs_mount *mp, 1599 struct xfs_mount *mp,
1491 struct xfs_buftarg *btp) 1600 struct xfs_buftarg *btp)
1492{ 1601{
1602 unregister_shrinker(&btp->bt_shrinker);
1603
1493 xfs_flush_buftarg(btp, 1); 1604 xfs_flush_buftarg(btp, 1);
1494 if (mp->m_flags & XFS_MOUNT_BARRIER) 1605 if (mp->m_flags & XFS_MOUNT_BARRIER)
1495 xfs_blkdev_issue_flush(btp); 1606 xfs_blkdev_issue_flush(btp);
1496 iput(btp->bt_mapping->host); 1607 iput(btp->bt_mapping->host);
1497 1608
1498 /* Unregister the buftarg first so that we don't get a
1499 * wakeup finding a non-existent task
1500 */
1501 xfs_unregister_buftarg(btp);
1502 kthread_stop(btp->bt_task); 1609 kthread_stop(btp->bt_task);
1503
1504 kmem_free(btp); 1610 kmem_free(btp);
1505} 1611}
1506 1612
@@ -1597,20 +1703,13 @@ xfs_alloc_delwrite_queue(
1597 xfs_buftarg_t *btp, 1703 xfs_buftarg_t *btp,
1598 const char *fsname) 1704 const char *fsname)
1599{ 1705{
1600 int error = 0;
1601
1602 INIT_LIST_HEAD(&btp->bt_list);
1603 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1706 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1604 spin_lock_init(&btp->bt_delwrite_lock); 1707 spin_lock_init(&btp->bt_delwrite_lock);
1605 btp->bt_flags = 0; 1708 btp->bt_flags = 0;
1606 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1709 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1607 if (IS_ERR(btp->bt_task)) { 1710 if (IS_ERR(btp->bt_task))
1608 error = PTR_ERR(btp->bt_task); 1711 return PTR_ERR(btp->bt_task);
1609 goto out_error; 1712 return 0;
1610 }
1611 xfs_register_buftarg(btp);
1612out_error:
1613 return error;
1614} 1713}
1615 1714
1616xfs_buftarg_t * 1715xfs_buftarg_t *
@@ -1627,12 +1726,17 @@ xfs_alloc_buftarg(
1627 btp->bt_mount = mp; 1726 btp->bt_mount = mp;
1628 btp->bt_dev = bdev->bd_dev; 1727 btp->bt_dev = bdev->bd_dev;
1629 btp->bt_bdev = bdev; 1728 btp->bt_bdev = bdev;
1729 INIT_LIST_HEAD(&btp->bt_lru);
1730 spin_lock_init(&btp->bt_lru_lock);
1630 if (xfs_setsize_buftarg_early(btp, bdev)) 1731 if (xfs_setsize_buftarg_early(btp, bdev))
1631 goto error; 1732 goto error;
1632 if (xfs_mapping_buftarg(btp, bdev)) 1733 if (xfs_mapping_buftarg(btp, bdev))
1633 goto error; 1734 goto error;
1634 if (xfs_alloc_delwrite_queue(btp, fsname)) 1735 if (xfs_alloc_delwrite_queue(btp, fsname))
1635 goto error; 1736 goto error;
1737 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1738 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1739 register_shrinker(&btp->bt_shrinker);
1636 return btp; 1740 return btp;
1637 1741
1638error: 1742error:
@@ -1737,27 +1841,6 @@ xfs_buf_runall_queues(
1737 flush_workqueue(queue); 1841 flush_workqueue(queue);
1738} 1842}
1739 1843
1740STATIC int
1741xfsbufd_wakeup(
1742 struct shrinker *shrink,
1743 int priority,
1744 gfp_t mask)
1745{
1746 xfs_buftarg_t *btp;
1747
1748 spin_lock(&xfs_buftarg_lock);
1749 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1750 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1751 continue;
1752 if (list_empty(&btp->bt_delwrite_queue))
1753 continue;
1754 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1755 wake_up_process(btp->bt_task);
1756 }
1757 spin_unlock(&xfs_buftarg_lock);
1758 return 0;
1759}
1760
1761/* 1844/*
1762 * Move as many buffers as specified to the supplied list 1845 * Move as many buffers as specified to the supplied list
1763 * indicating if we skipped any buffers to prevent deadlocks. 1846 * indicating if we skipped any buffers to prevent deadlocks.
@@ -1952,7 +2035,6 @@ xfs_buf_init(void)
1952 if (!xfsconvertd_workqueue) 2035 if (!xfsconvertd_workqueue)
1953 goto out_destroy_xfsdatad_workqueue; 2036 goto out_destroy_xfsdatad_workqueue;
1954 2037
1955 register_shrinker(&xfs_buf_shake);
1956 return 0; 2038 return 0;
1957 2039
1958 out_destroy_xfsdatad_workqueue: 2040 out_destroy_xfsdatad_workqueue:
@@ -1968,7 +2050,6 @@ xfs_buf_init(void)
1968void 2050void
1969xfs_buf_terminate(void) 2051xfs_buf_terminate(void)
1970{ 2052{
1971 unregister_shrinker(&xfs_buf_shake);
1972 destroy_workqueue(xfsconvertd_workqueue); 2053 destroy_workqueue(xfsconvertd_workqueue);
1973 destroy_workqueue(xfsdatad_workqueue); 2054 destroy_workqueue(xfsdatad_workqueue);
1974 destroy_workqueue(xfslogd_workqueue); 2055 destroy_workqueue(xfslogd_workqueue);
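The xfs_buf.c changes above replace the global xfsbufd-wakeup shrinker with a per-buftarg LRU whose buffers carry a b_lru_ref count: according to the comments in xfs_buftarg_shrink(), a scanned buffer with references left gets its count decremented and another trip around the LRU, and only a buffer whose count has already reached zero is disposed of. The self-contained sketch below models that aging policy in plain userspace C, with invented names and no kernel APIs:

#include <stdio.h>
#include <stdlib.h>

struct buf {				/* stand-in for struct xfs_buf */
	int		lru_ref;	/* models atomic_t b_lru_ref */
	int		id;
	struct buf	*next;
};

static struct buf *lru_head;		/* singly linked LRU, head = oldest */

static void
lru_add_tail(struct buf *bp)
{
	struct buf **pp = &lru_head;

	while (*pp)
		pp = &(*pp)->next;
	bp->next = NULL;
	*pp = bp;
}

/*
 * One shrinker pass over up to nr_to_scan buffers: a buffer with a positive
 * reference count has it decremented and is rotated to the tail for another
 * trip; a buffer already at zero is reclaimed.
 */
static void
shrink_scan(int nr_to_scan)
{
	while (nr_to_scan-- > 0 && lru_head) {
		struct buf *bp = lru_head;

		lru_head = bp->next;
		if (bp->lru_ref > 0) {
			bp->lru_ref--;
			lru_add_tail(bp);	/* keep it for now */
		} else {
			printf("reclaim buf %d\n", bp->id);
			free(bp);
		}
	}
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		struct buf *bp = calloc(1, sizeof(*bp));

		bp->id = i;
		bp->lru_ref = (i == 0) ? 2 : 1;	/* buf 0 is "hotter" */
		lru_add_tail(bp);
	}
	for (int pass = 0; pass < 3; pass++) {
		printf("pass %d\n", pass);
		shrink_scan(4);
	}
	return 0;
}

Buffers that survive more scans are exactly those given a higher b_lru_ref via XFS_BUF_SET_VTYPE_REF()/xfs_buf_set_ref(), which the xfs_buf.h hunks later in this diff wire up.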
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf9..a76c2428faf 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
128 128
129 /* per device delwri queue */ 129 /* per device delwri queue */
130 struct task_struct *bt_task; 130 struct task_struct *bt_task;
131 struct list_head bt_list;
132 struct list_head bt_delwrite_queue; 131 struct list_head bt_delwrite_queue;
133 spinlock_t bt_delwrite_lock; 132 spinlock_t bt_delwrite_lock;
134 unsigned long bt_flags; 133 unsigned long bt_flags;
134
135 /* LRU control structures */
136 struct shrinker bt_shrinker;
137 struct list_head bt_lru;
138 spinlock_t bt_lru_lock;
139 unsigned int bt_lru_nr;
135} xfs_buftarg_t; 140} xfs_buftarg_t;
136 141
137/* 142/*
@@ -164,9 +169,11 @@ typedef struct xfs_buf {
164 xfs_off_t b_file_offset; /* offset in file */ 169 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */ 170 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */ 171 atomic_t b_hold; /* reference count */
172 atomic_t b_lru_ref; /* lru reclaim ref count */
167 xfs_buf_flags_t b_flags; /* status flags */ 173 xfs_buf_flags_t b_flags; /* status flags */
168 struct semaphore b_sema; /* semaphore for lockables */ 174 struct semaphore b_sema; /* semaphore for lockables */
169 175
176 struct list_head b_lru; /* lru list */
170 wait_queue_head_t b_waiters; /* unpin waiters */ 177 wait_queue_head_t b_waiters; /* unpin waiters */
171 struct list_head b_list; 178 struct list_head b_list;
172 struct xfs_perag *b_pag; /* contains rbtree root */ 179 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -264,7 +271,8 @@ extern void xfs_buf_terminate(void);
264#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 271#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
265 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 272 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
266 273
267#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 274void xfs_buf_stale(struct xfs_buf *bp);
275#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
268#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 276#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
269#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 277#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
270#define XFS_BUF_SUPER_STALE(bp) do { \ 278#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -328,9 +336,15 @@ extern void xfs_buf_terminate(void);
328#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 336#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
329#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 337#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
330 338
331#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 339static inline void
340xfs_buf_set_ref(
341 struct xfs_buf *bp,
342 int lru_ref)
343{
344 atomic_set(&bp->b_lru_ref, lru_ref);
345}
346#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
332#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 347#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
333#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
334 348
335#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 349#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
336 350
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790e..fc0114da7fd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we only allocate inodes that do not fit into 32 bits any
78 * large enough filesystem may contain them, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff7..09649499774 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h> 42#include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 064f964d4f3..c51faaa5e29 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -834,8 +834,11 @@ xfsaild_wakeup(
834 struct xfs_ail *ailp, 834 struct xfs_ail *ailp,
835 xfs_lsn_t threshold_lsn) 835 xfs_lsn_t threshold_lsn)
836{ 836{
837 ailp->xa_target = threshold_lsn; 837 /* only ever move the target forwards */
838 wake_up_process(ailp->xa_task); 838 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
839 ailp->xa_target = threshold_lsn;
840 wake_up_process(ailp->xa_task);
841 }
839} 842}
840 843
841STATIC int 844STATIC int
@@ -847,8 +850,17 @@ xfsaild(
847 long tout = 0; /* milliseconds */ 850 long tout = 0; /* milliseconds */
848 851
849 while (!kthread_should_stop()) { 852 while (!kthread_should_stop()) {
850 schedule_timeout_interruptible(tout ? 853 /*
851 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); 854 * for short sleeps indicating congestion, don't allow us to
855 * get woken early. Otherwise all we do is bang on the AIL lock
856 * without making progress.
857 */
858 if (tout && tout <= 20)
859 __set_current_state(TASK_KILLABLE);
860 else
861 __set_current_state(TASK_INTERRUPTIBLE);
862 schedule_timeout(tout ?
863 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
852 864
853 /* swsusp */ 865 /* swsusp */
854 try_to_freeze(); 866 try_to_freeze();
@@ -1118,6 +1130,8 @@ xfs_fs_evict_inode(
1118 */ 1130 */
1119 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1131 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1120 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1132 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1133 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1134 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1121 1135
1122 xfs_inactive(ip); 1136 xfs_inactive(ip);
1123} 1137}
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1..a02480de975 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
53{ 53{
54 struct inode *inode = VFS_I(ip); 54 struct inode *inode = VFS_I(ip);
55 55
56 ASSERT(rcu_read_lock_held());
57
58 /*
59 * check for stale RCU freed inode
60 *
61 * If the inode has been reallocated, it doesn't matter if it's not in
62 * the AG we are walking - we are walking for writeback, so if it
63 * passes all the "valid inode" checks and is dirty, then we'll write
64 * it back anyway. If it has been reallocated and still being
65 * initialised, the XFS_INEW check below will catch it.
66 */
67 spin_lock(&ip->i_flags_lock);
68 if (!ip->i_ino)
69 goto out_unlock_noent;
70
71 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
72 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
73 goto out_unlock_noent;
74 spin_unlock(&ip->i_flags_lock);
75
56 /* nothing to sync during shutdown */ 76 /* nothing to sync during shutdown */
57 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 77 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
58 return EFSCORRUPTED; 78 return EFSCORRUPTED;
59 79
60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
62 return ENOENT;
63
64 /* If we can't grab the inode, it must be on its way to reclaim. */ 80 /* If we can't grab the inode, it must be on its way to reclaim. */
65 if (!igrab(inode)) 81 if (!igrab(inode))
66 return ENOENT; 82 return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
72 88
73 /* inode is valid */ 89 /* inode is valid */
74 return 0; 90 return 0;
91
92out_unlock_noent:
93 spin_unlock(&ip->i_flags_lock);
94 return ENOENT;
75} 95}
76 96
77STATIC int 97STATIC int
@@ -98,12 +118,12 @@ restart:
98 int error = 0; 118 int error = 0;
99 int i; 119 int i;
100 120
101 read_lock(&pag->pag_ici_lock); 121 rcu_read_lock();
102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 122 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
103 (void **)batch, first_index, 123 (void **)batch, first_index,
104 XFS_LOOKUP_BATCH); 124 XFS_LOOKUP_BATCH);
105 if (!nr_found) { 125 if (!nr_found) {
106 read_unlock(&pag->pag_ici_lock); 126 rcu_read_unlock();
107 break; 127 break;
108 } 128 }
109 129
@@ -118,18 +138,26 @@ restart:
118 batch[i] = NULL; 138 batch[i] = NULL;
119 139
120 /* 140 /*
121 * Update the index for the next lookup. Catch overflows 141 * Update the index for the next lookup. Catch
122 * into the next AG range which can occur if we have inodes 142 * overflows into the next AG range which can occur if
123 * in the last block of the AG and we are currently 143 * we have inodes in the last block of the AG and we
124 * pointing to the last inode. 144 * are currently pointing to the last inode.
145 *
146 * Because we may see inodes that are from the wrong AG
147 * due to RCU freeing and reallocation, only update the
148 * index if it lies in this AG. It was a race that led
149 * us to see this inode, so another lookup from the
150 * same index will not find it again.
125 */ 151 */
152 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
153 continue;
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 154 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 155 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1; 156 done = 1;
129 } 157 }
130 158
131 /* unlock now we've grabbed the inodes. */ 159 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock); 160 rcu_read_unlock();
133 161
134 for (i = 0; i < nr_found; i++) { 162 for (i = 0; i < nr_found; i++) {
135 if (!batch[i]) 163 if (!batch[i])
@@ -592,12 +620,12 @@ xfs_inode_set_reclaim_tag(
592 struct xfs_perag *pag; 620 struct xfs_perag *pag;
593 621
594 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 622 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
595 write_lock(&pag->pag_ici_lock); 623 spin_lock(&pag->pag_ici_lock);
596 spin_lock(&ip->i_flags_lock); 624 spin_lock(&ip->i_flags_lock);
597 __xfs_inode_set_reclaim_tag(pag, ip); 625 __xfs_inode_set_reclaim_tag(pag, ip);
598 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 626 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
599 spin_unlock(&ip->i_flags_lock); 627 spin_unlock(&ip->i_flags_lock);
600 write_unlock(&pag->pag_ici_lock); 628 spin_unlock(&pag->pag_ici_lock);
601 xfs_perag_put(pag); 629 xfs_perag_put(pag);
602} 630}
603 631
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
639 struct xfs_inode *ip, 667 struct xfs_inode *ip,
640 int flags) 668 int flags)
641{ 669{
670 ASSERT(rcu_read_lock_held());
671
672 /* quick check for stale RCU freed inode */
673 if (!ip->i_ino)
674 return 1;
642 675
643 /* 676 /*
644 * do some unlocked checks first to avoid unnecceary lock traffic. 677 * do some unlocked checks first to avoid unnecessary lock traffic.
645 * The first is a flush lock check, the second is a already in reclaim 678 * The first is a flush lock check, the second is a already in reclaim
646 * check. Only do these checks if we are not going to block on locks. 679 * check. Only do these checks if we are not going to block on locks.
647 */ 680 */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
654 * The radix tree lock here protects a thread in xfs_iget from racing 687 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the 688 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us. 689 * XFS_IRECLAIM flag set it will not touch us.
690 *
691 * Due to RCU lookup, we may find inodes that have been freed and only
692 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
693 * aren't candidates for reclaim at all, so we must check the
694 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
657 */ 695 */
658 spin_lock(&ip->i_flags_lock); 696 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); 697 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { 698 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */ 699 /* not a reclaim candidate. */
662 spin_unlock(&ip->i_flags_lock); 700 spin_unlock(&ip->i_flags_lock);
663 return 1; 701 return 1;
664 } 702 }
@@ -795,12 +833,12 @@ reclaim:
795 * added to the tree assert that it's been there before to catch 833 * added to the tree assert that it's been there before to catch
796 * problems with the inode life time early on. 834 * problems with the inode life time early on.
797 */ 835 */
798 write_lock(&pag->pag_ici_lock); 836 spin_lock(&pag->pag_ici_lock);
799 if (!radix_tree_delete(&pag->pag_ici_root, 837 if (!radix_tree_delete(&pag->pag_ici_root,
800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 838 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
801 ASSERT(0); 839 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip); 840 __xfs_inode_clear_reclaim(pag, ip);
803 write_unlock(&pag->pag_ici_lock); 841 spin_unlock(&pag->pag_ici_lock);
804 842
805 /* 843 /*
806 * Here we do an (almost) spurious inode lock in order to coordinate 844 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +902,14 @@ restart:
864 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 902 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
865 int i; 903 int i;
866 904
867 write_lock(&pag->pag_ici_lock); 905 rcu_read_lock();
868 nr_found = radix_tree_gang_lookup_tag( 906 nr_found = radix_tree_gang_lookup_tag(
869 &pag->pag_ici_root, 907 &pag->pag_ici_root,
870 (void **)batch, first_index, 908 (void **)batch, first_index,
871 XFS_LOOKUP_BATCH, 909 XFS_LOOKUP_BATCH,
872 XFS_ICI_RECLAIM_TAG); 910 XFS_ICI_RECLAIM_TAG);
873 if (!nr_found) { 911 if (!nr_found) {
874 write_unlock(&pag->pag_ici_lock); 912 rcu_read_unlock();
875 break; 913 break;
876 } 914 }
877 915
@@ -891,14 +929,24 @@ restart:
891 * occur if we have inodes in the last block of 929 * occur if we have inodes in the last block of
892 * the AG and we are currently pointing to the 930 * the AG and we are currently pointing to the
893 * last inode. 931 * last inode.
932 *
933 * Because we may see inodes that are from the
934 * wrong AG due to RCU freeing and
935 * reallocation, only update the index if it
936 * lies in this AG. It was a race that led us
937 * to see this inode, so another lookup from
938 * the same index will not find it again.
894 */ 939 */
940 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
941 pag->pag_agno)
942 continue;
895 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 943 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
896 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 944 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
897 done = 1; 945 done = 1;
898 } 946 }
899 947
900 /* unlock now we've grabbed the inodes. */ 948 /* unlock now we've grabbed the inodes. */
901 write_unlock(&pag->pag_ici_lock); 949 rcu_read_unlock();
902 950
903 for (i = 0; i < nr_found; i++) { 951 for (i = 0; i < nr_found; i++) {
904 if (!batch[i]) 952 if (!batch[i])
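The common thread in the xfs_sync.c hunks above is that lockless RCU lookups may return inodes that have already been freed, or freed and reallocated, so every inode coming out of the radix tree must be re-validated under its i_flags_lock before it is used. Condensed into one hypothetical fragment (it folds together xfs_inode_ag_walk() and xfs_inode_ag_walk_grab() from the diff and is not itself a function in the tree):

	rcu_read_lock();
	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch,
					  first_index, XFS_LOOKUP_BATCH);
	for (i = 0; i < nr_found; i++) {
		struct xfs_inode *ip = batch[i];

		spin_lock(&ip->i_flags_lock);
		if (!ip->i_ino ||		/* zeroed i_ino marks an RCU-stale inode */
		    __xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE |
					  XFS_IRECLAIM)) {
			spin_unlock(&ip->i_flags_lock);
			batch[i] = NULL;	/* skip it */
			continue;
		}
		spin_unlock(&ip->i_flags_lock);

		if (!igrab(VFS_I(ip)))		/* already on its way to reclaim */
			batch[i] = NULL;
	}
	rcu_read_unlock();

The reclaim-side walk follows the same shape, except that it uses radix_tree_gang_lookup_tag() with XFS_ICI_RECLAIM_TAG and checks XFS_IRECLAIMABLE/XFS_IRECLAIM instead of taking a VFS reference.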
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c59..647af2a2e7a 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
766 __field(int, curr_res) 766 __field(int, curr_res)
767 __field(int, unit_res) 767 __field(int, unit_res)
768 __field(unsigned int, flags) 768 __field(unsigned int, flags)
769 __field(void *, reserve_headq) 769 __field(int, reserveq)
770 __field(void *, write_headq) 770 __field(int, writeq)
771 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
772 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
773 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 		__entry->curr_res = tic->t_curr_res;
 		__entry->unit_res = tic->t_unit_res;
 		__entry->flags = tic->t_flags;
-		__entry->reserve_headq = log->l_reserve_headq;
-		__entry->write_headq = log->l_write_headq;
-		__entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
-		__entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
-		__entry->grant_write_cycle = log->l_grant_write_cycle;
-		__entry->grant_write_bytes = log->l_grant_write_bytes;
+		__entry->reserveq = list_empty(&log->l_reserveq);
+		__entry->writeq = list_empty(&log->l_writeq);
+		xlog_crack_grant_head(&log->l_grant_reserve_head,
+				&__entry->grant_reserve_cycle,
+				&__entry->grant_reserve_bytes);
+		xlog_crack_grant_head(&log->l_grant_write_head,
+				&__entry->grant_write_cycle,
+				&__entry->grant_write_bytes);
 		__entry->curr_cycle = log->l_curr_cycle;
 		__entry->curr_block = log->l_curr_block;
-		__entry->tail_lsn = log->l_tail_lsn;
+		__entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
 	),
 	TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
-		  "t_unit_res %u t_flags %s reserve_headq 0x%p "
-		  "write_headq 0x%p grant_reserve_cycle %d "
+		  "t_unit_res %u t_flags %s reserveq %s "
+		  "writeq %s grant_reserve_cycle %d "
 		  "grant_reserve_bytes %d grant_write_cycle %d "
 		  "grant_write_bytes %d curr_cycle %d curr_block %d "
 		  "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 		  __entry->curr_res,
 		  __entry->unit_res,
 		  __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
-		  __entry->reserve_headq,
-		  __entry->write_headq,
+		  __entry->reserveq ? "empty" : "active",
+		  __entry->writeq ? "empty" : "active",
 		  __entry->grant_reserve_cycle,
 		  __entry->grant_reserve_bytes,
 		  __entry->grant_write_cycle,
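The TP_fast_assign() changes above track the log-grant rework that arrives with this merge: the raw waiter pointers become list-based queues (so list_empty() is enough to report whether anyone is waiting), both grant cycle and byte count are now read together via xlog_crack_grant_head(), and l_tail_lsn is read with atomic64_read(). A minimal userspace sketch of the pack/crack idea follows, assuming the cycle occupies the high 32 bits and the byte count the low 32 bits of one 64-bit head; the real helpers live in fs/xfs/xfs_log_priv.h, and the names below are local stand-ins:

	#include <stdint.h>
	#include <stdio.h>

	/* pack cycle (high word) and byte count (low word) into one value */
	static uint64_t pack_grant_head(int cycle, int bytes)
	{
		return ((uint64_t)cycle << 32) | (uint32_t)bytes;
	}

	/* split the combined head back into cycle and byte count */
	static void crack_grant_head(uint64_t head, int *cycle, int *bytes)
	{
		*cycle = head >> 32;
		*bytes = head & 0xffffffff;
	}

	int main(void)
	{
		int cycle, bytes;
		uint64_t head = pack_grant_head(7, 4096);

		crack_grant_head(head, &cycle, &bytes);
		printf("cycle %d bytes %d\n", cycle, bytes);	/* cycle 7 bytes 4096 */
		return 0;
	}

Because both halves live in one 64-bit value, a single atomic read yields a consistent snapshot of cycle and bytes without taking the grant lock, which is what lets the tracepoint read the heads safely.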
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
 
-DECLARE_EVENT_CLASS(xfs_iomap_class,
+DECLARE_EVENT_CLASS(xfs_imap_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-		 int flags, struct xfs_bmbt_irec *irec),
-	TP_ARGS(ip, offset, count, flags, irec),
+		 int type, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, offset, count, type, irec),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
 		__field(loff_t, new_size)
 		__field(loff_t, offset)
 		__field(size_t, count)
-		__field(int, flags)
+		__field(int, type)
 		__field(xfs_fileoff_t, startoff)
 		__field(xfs_fsblock_t, startblock)
 		__field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
 		__entry->new_size = ip->i_new_size;
 		__entry->offset = offset;
 		__entry->count = count;
-		__entry->flags = flags;
+		__entry->type = type;
 		__entry->startoff = irec ? irec->br_startoff : 0;
 		__entry->startblock = irec ? irec->br_startblock : 0;
 		__entry->blockcount = irec ? irec->br_blockcount : 0;
 	),
 	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-		  "offset 0x%llx count %zd flags %s "
+		  "offset 0x%llx count %zd type %s "
 		  "startoff 0x%llx startblock %lld blockcount 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
 		  __entry->new_size,
 		  __entry->offset,
 		  __entry->count,
-		  __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+		  __print_symbolic(__entry->type, XFS_IO_TYPES),
 		  __entry->startoff,
 		  (__int64_t)__entry->startblock,
 		  __entry->blockcount)
 )
 
 #define DEFINE_IOMAP_EVENT(name) \
-DEFINE_EVENT(xfs_iomap_class, name, \
+DEFINE_EVENT(xfs_imap_class, name, \
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-		 int flags, struct xfs_bmbt_irec *irec), \
-	TP_ARGS(ip, offset, count, flags, irec))
-DEFINE_IOMAP_EVENT(xfs_iomap_enter);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+		 int type, struct xfs_bmbt_irec *irec), \
+	TP_ARGS(ip, offset, count, type, irec))
+DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
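The rename from xfs_iomap_class to xfs_imap_class above comes with a semantic change: the event no longer records a BMAPI flags bitmask (decoded with __print_flags(), which joins every set bit's name) but a single I/O type value (decoded with __print_symbolic(), which maps one value from the XFS_IO_TYPES table to exactly one name). A small userspace analogue of that shift follows; the value/name table is a placeholder for illustration, not the real XFS_IO_TYPES entries:

	#include <stdio.h>

	/* placeholder table: one name per I/O type value */
	static const char *const io_type_names[] = {
		[0] = "direct",
		[1] = "delalloc",
		[2] = "unwritten",
		[3] = "overwrite",
	};

	/* map a single type value to a single name, like __print_symbolic() */
	static const char *io_type_name(unsigned int type)
	{
		if (type >= sizeof(io_type_names) / sizeof(io_type_names[0]))
			return "unknown";
		return io_type_names[type];
	}

	int main(void)
	{
		/* a symbolic type prints as exactly one name, not an OR-ed list */
		printf("type %s\n", io_type_name(2));	/* "type unwritten" */
		return 0;
	}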
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
 	TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
 
 
 TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
 	TP_PROTO(struct xfs_alloc_arg *args), \
 	TP_ARGS(args))
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
 DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
 DEFINE_ALLOC_EVENT(xfs_alloc_near_first);