diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-12-02 12:13:36 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-12-02 12:13:36 -0500 |
| commit | 8cb280c90f9cfaab3ba3afbace0b1711dee80d0c (patch) | |
| tree | b98d29b0159dd763afab1670d58019b6cb58cfa0 | |
| parent | 8fed709f343346a77888c2eef8f2d41bc637bef6 (diff) | |
| parent | c76febef574fd86566bbdf1a73a547a439115c25 (diff) | |
Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
* 'for-linus' of git://oss.sgi.com/xfs/xfs:
xfs: only run xfs_error_test if error injection is active
xfs: avoid moving stale inodes in the AIL
xfs: delayed alloc blocks beyond EOF are valid after writeback
xfs: push stale, pinned buffers on trylock failures
xfs: fix failed write truncation handling.
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 94 | ||||
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.c | 35 | ||||
| -rw-r--r-- | fs/xfs/xfs_bmap.c | 85 | ||||
| -rw-r--r-- | fs/xfs/xfs_bmap.h | 5 | ||||
| -rw-r--r-- | fs/xfs/xfs_dfrag.c | 13 | ||||
| -rw-r--r-- | fs/xfs/xfs_error.c | 3 | ||||
| -rw-r--r-- | fs/xfs/xfs_error.h | 5 | ||||
| -rw-r--r-- | fs/xfs/xfs_inode_item.c | 31 |
8 files changed, 188 insertions, 83 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7d287afccde5..691f61223ed6 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
| @@ -934,7 +934,6 @@ xfs_aops_discard_page( | |||
| 934 | struct xfs_inode *ip = XFS_I(inode); | 934 | struct xfs_inode *ip = XFS_I(inode); |
| 935 | struct buffer_head *bh, *head; | 935 | struct buffer_head *bh, *head; |
| 936 | loff_t offset = page_offset(page); | 936 | loff_t offset = page_offset(page); |
| 937 | ssize_t len = 1 << inode->i_blkbits; | ||
| 938 | 937 | ||
| 939 | if (!xfs_is_delayed_page(page, IO_DELAY)) | 938 | if (!xfs_is_delayed_page(page, IO_DELAY)) |
| 940 | goto out_invalidate; | 939 | goto out_invalidate; |
| @@ -949,58 +948,14 @@ xfs_aops_discard_page( | |||
| 949 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 948 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
| 950 | bh = head = page_buffers(page); | 949 | bh = head = page_buffers(page); |
| 951 | do { | 950 | do { |
| 952 | int done; | ||
| 953 | xfs_fileoff_t offset_fsb; | ||
| 954 | xfs_bmbt_irec_t imap; | ||
| 955 | int nimaps = 1; | ||
| 956 | int error; | 951 | int error; |
| 957 | xfs_fsblock_t firstblock; | 952 | xfs_fileoff_t start_fsb; |
| 958 | xfs_bmap_free_t flist; | ||
| 959 | 953 | ||
| 960 | if (!buffer_delay(bh)) | 954 | if (!buffer_delay(bh)) |
| 961 | goto next_buffer; | 955 | goto next_buffer; |
| 962 | 956 | ||
| 963 | offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); | 957 | start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); |
| 964 | 958 | error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); | |
| 965 | /* | ||
| 966 | * Map the range first and check that it is a delalloc extent | ||
| 967 | * before trying to unmap the range. Otherwise we will be | ||
| 968 | * trying to remove a real extent (which requires a | ||
| 969 | * transaction) or a hole, which is probably a bad idea... | ||
| 970 | */ | ||
| 971 | error = xfs_bmapi(NULL, ip, offset_fsb, 1, | ||
| 972 | XFS_BMAPI_ENTIRE, NULL, 0, &imap, | ||
| 973 | &nimaps, NULL); | ||
| 974 | |||
| 975 | if (error) { | ||
| 976 | /* something screwed, just bail */ | ||
| 977 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
| 978 | xfs_fs_cmn_err(CE_ALERT, ip->i_mount, | ||
| 979 | "page discard failed delalloc mapping lookup."); | ||
| 980 | } | ||
| 981 | break; | ||
| 982 | } | ||
| 983 | if (!nimaps) { | ||
| 984 | /* nothing there */ | ||
| 985 | goto next_buffer; | ||
| 986 | } | ||
| 987 | if (imap.br_startblock != DELAYSTARTBLOCK) { | ||
| 988 | /* been converted, ignore */ | ||
| 989 | goto next_buffer; | ||
| 990 | } | ||
| 991 | WARN_ON(imap.br_blockcount == 0); | ||
| 992 | |||
| 993 | /* | ||
| 994 | * Note: while we initialise the firstblock/flist pair, they | ||
| 995 | * should never be used because blocks should never be | ||
| 996 | * allocated or freed for a delalloc extent and hence we don't | ||
| 997 | * need to cancel or finish them after the xfs_bunmapi() call. | ||
| 998 | */ | ||
| 999 | xfs_bmap_init(&flist, &firstblock); | ||
| 1000 | error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, | ||
| 1001 | &flist, &done); | ||
| 1002 | |||
| 1003 | ASSERT(!flist.xbf_count && !flist.xbf_first); | ||
| 1004 | if (error) { | 959 | if (error) { |
| 1005 | /* something screwed, just bail */ | 960 | /* something screwed, just bail */ |
| 1006 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 961 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
| @@ -1010,7 +965,7 @@ xfs_aops_discard_page( | |||
| 1010 | break; | 965 | break; |
| 1011 | } | 966 | } |
| 1012 | next_buffer: | 967 | next_buffer: |
| 1013 | offset += len; | 968 | offset += 1 << inode->i_blkbits; |
| 1014 | 969 | ||
| 1015 | } while ((bh = bh->b_this_page) != head); | 970 | } while ((bh = bh->b_this_page) != head); |
| 1016 | 971 | ||
| @@ -1505,11 +1460,42 @@ xfs_vm_write_failed( | |||
| 1505 | struct inode *inode = mapping->host; | 1460 | struct inode *inode = mapping->host; |
| 1506 | 1461 | ||
| 1507 | if (to > inode->i_size) { | 1462 | if (to > inode->i_size) { |
| 1508 | struct iattr ia = { | 1463 | /* |
| 1509 | .ia_valid = ATTR_SIZE | ATTR_FORCE, | 1464 | * punch out the delalloc blocks we have already allocated. We |
| 1510 | .ia_size = inode->i_size, | 1465 | * don't call xfs_setattr() to do this as we may be in the |
| 1511 | }; | 1466 | * middle of a multi-iovec write and so the vfs inode->i_size |
| 1512 | xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); | 1467 | * will not match the xfs ip->i_size and so it will zero too |
| 1468 | * much. Hence we just truncate the page cache to zero what is | ||
| 1469 | * necessary and punch the delalloc blocks directly. | ||
| 1470 | */ | ||
| 1471 | struct xfs_inode *ip = XFS_I(inode); | ||
| 1472 | xfs_fileoff_t start_fsb; | ||
| 1473 | xfs_fileoff_t end_fsb; | ||
| 1474 | int error; | ||
| 1475 | |||
| 1476 | truncate_pagecache(inode, to, inode->i_size); | ||
| 1477 | |||
| 1478 | /* | ||
| 1479 | * Check if there are any blocks that are outside of i_size | ||
| 1480 | * that need to be trimmed back. | ||
| 1481 | */ | ||
| 1482 | start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; | ||
| 1483 | end_fsb = XFS_B_TO_FSB(ip->i_mount, to); | ||
| 1484 | if (end_fsb <= start_fsb) | ||
| 1485 | return; | ||
| 1486 | |||
| 1487 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
| 1488 | error = xfs_bmap_punch_delalloc_range(ip, start_fsb, | ||
| 1489 | end_fsb - start_fsb); | ||
| 1490 | if (error) { | ||
| 1491 | /* something screwed, just bail */ | ||
| 1492 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
| 1493 | xfs_fs_cmn_err(CE_ALERT, ip->i_mount, | ||
| 1494 | "xfs_vm_write_failed: unable to clean up ino %lld", | ||
| 1495 | ip->i_ino); | ||
| 1496 | } | ||
| 1497 | } | ||
| 1498 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
| 1513 | } | 1499 | } |
| 1514 | } | 1500 | } |
| 1515 | 1501 | ||
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index aa1d353def29..4c5deb6e9e31 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c | |||
| @@ -488,29 +488,16 @@ found: | |||
| 488 | spin_unlock(&pag->pag_buf_lock); | 488 | spin_unlock(&pag->pag_buf_lock); |
| 489 | xfs_perag_put(pag); | 489 | xfs_perag_put(pag); |
| 490 | 490 | ||
| 491 | /* Attempt to get the semaphore without sleeping, | 491 | if (xfs_buf_cond_lock(bp)) { |
| 492 | * if this does not work then we need to drop the | 492 | /* failed, so wait for the lock if requested. */ |
| 493 | * spinlock and do a hard attempt on the semaphore. | ||
| 494 | */ | ||
| 495 | if (down_trylock(&bp->b_sema)) { | ||
| 496 | if (!(flags & XBF_TRYLOCK)) { | 493 | if (!(flags & XBF_TRYLOCK)) { |
| 497 | /* wait for buffer ownership */ | ||
| 498 | xfs_buf_lock(bp); | 494 | xfs_buf_lock(bp); |
| 499 | XFS_STATS_INC(xb_get_locked_waited); | 495 | XFS_STATS_INC(xb_get_locked_waited); |
| 500 | } else { | 496 | } else { |
| 501 | /* We asked for a trylock and failed, no need | ||
| 502 | * to look at file offset and length here, we | ||
| 503 | * know that this buffer at least overlaps our | ||
| 504 | * buffer and is locked, therefore our buffer | ||
| 505 | * either does not exist, or is this buffer. | ||
| 506 | */ | ||
| 507 | xfs_buf_rele(bp); | 497 | xfs_buf_rele(bp); |
| 508 | XFS_STATS_INC(xb_busy_locked); | 498 | XFS_STATS_INC(xb_busy_locked); |
| 509 | return NULL; | 499 | return NULL; |
| 510 | } | 500 | } |
| 511 | } else { | ||
| 512 | /* trylock worked */ | ||
| 513 | XB_SET_OWNER(bp); | ||
| 514 | } | 501 | } |
| 515 | 502 | ||
| 516 | if (bp->b_flags & XBF_STALE) { | 503 | if (bp->b_flags & XBF_STALE) { |
| @@ -876,10 +863,18 @@ xfs_buf_rele( | |||
| 876 | */ | 863 | */ |
| 877 | 864 | ||
| 878 | /* | 865 | /* |
| 879 | * Locks a buffer object, if it is not already locked. | 866 | * Locks a buffer object, if it is not already locked. Note that this in |
| 880 | * Note that this in no way locks the underlying pages, so it is only | 867 | * no way locks the underlying pages, so it is only useful for |
| 881 | * useful for synchronizing concurrent use of buffer objects, not for | 868 | * synchronizing concurrent use of buffer objects, not for synchronizing |
| 882 | * synchronizing independent access to the underlying pages. | 869 | * independent access to the underlying pages. |
| 870 | * | ||
| 871 | * If we come across a stale, pinned, locked buffer, we know that we are | ||
| 872 | * being asked to lock a buffer that has been reallocated. Because it is | ||
| 873 | * pinned, we know that the log has not been pushed to disk and hence it | ||
| 874 | * will still be locked. Rather than continuing to have trylock attempts | ||
| 875 | * fail until someone else pushes the log, push it ourselves before | ||
| 876 | * returning. This means that the xfsaild will not get stuck trying | ||
| 877 | * to push on stale inode buffers. | ||
| 883 | */ | 878 | */ |
| 884 | int | 879 | int |
| 885 | xfs_buf_cond_lock( | 880 | xfs_buf_cond_lock( |
| @@ -890,6 +885,8 @@ xfs_buf_cond_lock( | |||
| 890 | locked = down_trylock(&bp->b_sema) == 0; | 885 | locked = down_trylock(&bp->b_sema) == 0; |
| 891 | if (locked) | 886 | if (locked) |
| 892 | XB_SET_OWNER(bp); | 887 | XB_SET_OWNER(bp); |
| 888 | else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) | ||
| 889 | xfs_log_force(bp->b_target->bt_mount, 0); | ||
| 893 | 890 | ||
| 894 | trace_xfs_buf_cond_lock(bp, _RET_IP_); | 891 | trace_xfs_buf_cond_lock(bp, _RET_IP_); |
| 895 | return locked ? 0 : -EBUSY; | 892 | return locked ? 0 : -EBUSY; |
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8abd12e32e13..4111cd3966c7 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
| @@ -5471,8 +5471,13 @@ xfs_getbmap( | |||
| 5471 | if (error) | 5471 | if (error) |
| 5472 | goto out_unlock_iolock; | 5472 | goto out_unlock_iolock; |
| 5473 | } | 5473 | } |
| 5474 | 5474 | /* | |
| 5475 | ASSERT(ip->i_delayed_blks == 0); | 5475 | * even after flushing the inode, there can still be delalloc |
| 5476 | * blocks on the inode beyond EOF due to speculative | ||
| 5477 | * preallocation. These are not removed until the release | ||
| 5478 | * function is called or the inode is inactivated. Hence we | ||
| 5479 | * cannot assert here that ip->i_delayed_blks == 0. | ||
| 5480 | */ | ||
| 5476 | } | 5481 | } |
| 5477 | 5482 | ||
| 5478 | lock = xfs_ilock_map_shared(ip); | 5483 | lock = xfs_ilock_map_shared(ip); |
| @@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves( | |||
| 6070 | *count += xfs_bmbt_disk_get_blockcount(frp); | 6075 | *count += xfs_bmbt_disk_get_blockcount(frp); |
| 6071 | } | 6076 | } |
| 6072 | } | 6077 | } |
| 6078 | |||
| 6079 | /* | ||
| 6080 | * dead simple method of punching delayed allocation blocks from a range in | ||
| 6081 | * the inode. Walks a block at a time so will be slow, but is only executed in | ||
| 6082 | * rare error cases so the overhead is not critical. This will always punch out | ||
| 6083 | * both the start and end blocks, even if the ranges only partially overlap | ||
| 6084 | * them, so it is up to the caller to ensure that partial blocks are not | ||
| 6085 | * passed in. | ||
| 6086 | */ | ||
| 6087 | int | ||
| 6088 | xfs_bmap_punch_delalloc_range( | ||
| 6089 | struct xfs_inode *ip, | ||
| 6090 | xfs_fileoff_t start_fsb, | ||
| 6091 | xfs_fileoff_t length) | ||
| 6092 | { | ||
| 6093 | xfs_fileoff_t remaining = length; | ||
| 6094 | int error = 0; | ||
| 6095 | |||
| 6096 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | ||
| 6097 | |||
| 6098 | do { | ||
| 6099 | int done; | ||
| 6100 | xfs_bmbt_irec_t imap; | ||
| 6101 | int nimaps = 1; | ||
| 6102 | xfs_fsblock_t firstblock; | ||
| 6103 | xfs_bmap_free_t flist; | ||
| 6104 | |||
| 6105 | /* | ||
| 6106 | * Map the range first and check that it is a delalloc extent | ||
| 6107 | * before trying to unmap the range. Otherwise we will be | ||
| 6108 | * trying to remove a real extent (which requires a | ||
| 6109 | * transaction) or a hole, which is probably a bad idea... | ||
| 6110 | */ | ||
| 6111 | error = xfs_bmapi(NULL, ip, start_fsb, 1, | ||
| 6112 | XFS_BMAPI_ENTIRE, NULL, 0, &imap, | ||
| 6113 | &nimaps, NULL); | ||
| 6114 | |||
| 6115 | if (error) { | ||
| 6116 | /* something screwed, just bail */ | ||
| 6117 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
| 6118 | xfs_fs_cmn_err(CE_ALERT, ip->i_mount, | ||
| 6119 | "Failed delalloc mapping lookup ino %lld fsb %lld.", | ||
| 6120 | ip->i_ino, start_fsb); | ||
| 6121 | } | ||
| 6122 | break; | ||
| 6123 | } | ||
| 6124 | if (!nimaps) { | ||
| 6125 | /* nothing there */ | ||
| 6126 | goto next_block; | ||
| 6127 | } | ||
| 6128 | if (imap.br_startblock != DELAYSTARTBLOCK) { | ||
| 6129 | /* been converted, ignore */ | ||
| 6130 | goto next_block; | ||
| 6131 | } | ||
| 6132 | WARN_ON(imap.br_blockcount == 0); | ||
| 6133 | |||
| 6134 | /* | ||
| 6135 | * Note: while we initialise the firstblock/flist pair, they | ||
| 6136 | * should never be used because blocks should never be | ||
| 6137 | * allocated or freed for a delalloc extent and hence we don't | ||
| 6138 | * need to cancel or finish them after the xfs_bunmapi() call. | ||
| 6139 | */ | ||
| 6140 | xfs_bmap_init(&flist, &firstblock); | ||
| 6141 | error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, | ||
| 6142 | &flist, &done); | ||
| 6143 | if (error) | ||
| 6144 | break; | ||
| 6145 | |||
| 6146 | ASSERT(!flist.xbf_count && !flist.xbf_first); | ||
| 6147 | next_block: | ||
| 6148 | start_fsb++; | ||
| 6149 | remaining--; | ||
| 6150 | } while(remaining > 0); | ||
| 6151 | |||
| 6152 | return error; | ||
| 6153 | } | ||
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 71ec9b6ecdfc..3651191daea1 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h | |||
| @@ -394,6 +394,11 @@ xfs_bmap_count_blocks( | |||
| 394 | int whichfork, | 394 | int whichfork, |
| 395 | int *count); | 395 | int *count); |
| 396 | 396 | ||
| 397 | int | ||
| 398 | xfs_bmap_punch_delalloc_range( | ||
| 399 | struct xfs_inode *ip, | ||
| 400 | xfs_fileoff_t start_fsb, | ||
| 401 | xfs_fileoff_t length); | ||
| 397 | #endif /* __KERNEL__ */ | 402 | #endif /* __KERNEL__ */ |
| 398 | 403 | ||
| 399 | #endif /* __XFS_BMAP_H__ */ | 404 | #endif /* __XFS_BMAP_H__ */ |
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 3b9582c60a22..e60490bc00a6 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c | |||
| @@ -377,6 +377,19 @@ xfs_swap_extents( | |||
| 377 | ip->i_d.di_format = tip->i_d.di_format; | 377 | ip->i_d.di_format = tip->i_d.di_format; |
| 378 | tip->i_d.di_format = tmp; | 378 | tip->i_d.di_format = tmp; |
| 379 | 379 | ||
| 380 | /* | ||
| 381 | * The extents in the source inode could still contain speculative | ||
| 382 | * preallocation beyond EOF (e.g. the file is open but not modified | ||
| 383 | * while defrag is in progress). In that case, we need to copy over the | ||
| 384 | * number of delalloc blocks the data fork in the source inode is | ||
| 385 | * tracking beyond EOF so that when the fork is truncated away when the | ||
| 386 | * temporary inode is unlinked we don't underrun the i_delayed_blks | ||
| 387 | * counter on that inode. | ||
| 388 | */ | ||
| 389 | ASSERT(tip->i_delayed_blks == 0); | ||
| 390 | tip->i_delayed_blks = ip->i_delayed_blks; | ||
| 391 | ip->i_delayed_blks = 0; | ||
| 392 | |||
| 380 | ilf_fields = XFS_ILOG_CORE; | 393 | ilf_fields = XFS_ILOG_CORE; |
| 381 | 394 | ||
| 382 | switch(ip->i_d.di_format) { | 395 | switch(ip->i_d.di_format) { |
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ed9990267661..c78cc6a3d87c 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c | |||
| @@ -58,6 +58,7 @@ xfs_error_trap(int e) | |||
| 58 | int xfs_etest[XFS_NUM_INJECT_ERROR]; | 58 | int xfs_etest[XFS_NUM_INJECT_ERROR]; |
| 59 | int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; | 59 | int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; |
| 60 | char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; | 60 | char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; |
| 61 | int xfs_error_test_active; | ||
| 61 | 62 | ||
| 62 | int | 63 | int |
| 63 | xfs_error_test(int error_tag, int *fsidp, char *expression, | 64 | xfs_error_test(int error_tag, int *fsidp, char *expression, |
| @@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) | |||
| 108 | len = strlen(mp->m_fsname); | 109 | len = strlen(mp->m_fsname); |
| 109 | xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); | 110 | xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); |
| 110 | strcpy(xfs_etest_fsname[i], mp->m_fsname); | 111 | strcpy(xfs_etest_fsname[i], mp->m_fsname); |
| 112 | xfs_error_test_active++; | ||
| 111 | return 0; | 113 | return 0; |
| 112 | } | 114 | } |
| 113 | } | 115 | } |
| @@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) | |||
| 137 | xfs_etest_fsid[i] = 0LL; | 139 | xfs_etest_fsid[i] = 0LL; |
| 138 | kmem_free(xfs_etest_fsname[i]); | 140 | kmem_free(xfs_etest_fsname[i]); |
| 139 | xfs_etest_fsname[i] = NULL; | 141 | xfs_etest_fsname[i] = NULL; |
| 142 | xfs_error_test_active--; | ||
| 140 | } | 143 | } |
| 141 | } | 144 | } |
| 142 | 145 | ||
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c2c1a072bb82..f338847f80b8 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h | |||
| @@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level, | |||
| 127 | #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT | 127 | #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT |
| 128 | 128 | ||
| 129 | #ifdef DEBUG | 129 | #ifdef DEBUG |
| 130 | extern int xfs_error_test_active; | ||
| 130 | extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); | 131 | extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); |
| 131 | 132 | ||
| 132 | #define XFS_NUM_INJECT_ERROR 10 | 133 | #define XFS_NUM_INJECT_ERROR 10 |
| 133 | #define XFS_TEST_ERROR(expr, mp, tag, rf) \ | 134 | #define XFS_TEST_ERROR(expr, mp, tag, rf) \ |
| 134 | ((expr) || \ | 135 | ((expr) || (xfs_error_test_active && \ |
| 135 | xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ | 136 | xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ |
| 136 | (rf))) | 137 | (rf)))) |
| 137 | 138 | ||
| 138 | extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); | 139 | extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); |
| 139 | extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); | 140 | extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); |
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c7ac020705df..7c8d30c453c3 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
| @@ -657,18 +657,37 @@ xfs_inode_item_unlock( | |||
| 657 | } | 657 | } |
| 658 | 658 | ||
| 659 | /* | 659 | /* |
| 660 | * This is called to find out where the oldest active copy of the | 660 | * This is called to find out where the oldest active copy of the inode log |
| 661 | * inode log item in the on disk log resides now that the last log | 661 | * item in the on disk log resides now that the last log write of it completed |
| 662 | * write of it completed at the given lsn. Since we always re-log | 662 | * at the given lsn. Since we always re-log all dirty data in an inode, the |
| 663 | * all dirty data in an inode, the latest copy in the on disk log | 663 | * latest copy in the on disk log is the only one that matters. Therefore, |
| 664 | * is the only one that matters. Therefore, simply return the | 664 | * simply return the given lsn. |
| 665 | * given lsn. | 665 | * |
| 666 | * If the inode has been marked stale because the cluster is being freed, we | ||
| 667 | * don't want to (re-)insert this inode into the AIL. There is a race condition | ||
| 668 | * where the cluster buffer may be unpinned before the inode is inserted into | ||
| 669 | * the AIL during transaction committed processing. If the buffer is unpinned | ||
| 670 | * before the inode item has been committed and inserted, then it is possible | ||
| 671 | * for the buffer to be written and IO completions before the inode is inserted | ||
| 672 | * into the AIL. In that case, we'd be inserting a clean, stale inode into the | ||
| 673 | * AIL which will never get removed. It will, however, get reclaimed which | ||
| 674 | * triggers an assert in xfs_inode_free() complaining about freeing an inode | ||
| 675 | * still in the AIL. | ||
| 676 | * | ||
| 677 | * To avoid this, return a lower LSN than the one passed in so that the | ||
| 678 | * transaction committed code will not move the inode forward in the AIL but | ||
| 679 | * will still unpin it properly. | ||
| 666 | */ | 680 | */ |
| 667 | STATIC xfs_lsn_t | 681 | STATIC xfs_lsn_t |
| 668 | xfs_inode_item_committed( | 682 | xfs_inode_item_committed( |
| 669 | struct xfs_log_item *lip, | 683 | struct xfs_log_item *lip, |
| 670 | xfs_lsn_t lsn) | 684 | xfs_lsn_t lsn) |
| 671 | { | 685 | { |
| 686 | struct xfs_inode_log_item *iip = INODE_ITEM(lip); | ||
| 687 | struct xfs_inode *ip = iip->ili_inode; | ||
| 688 | |||
| 689 | if (xfs_iflags_test(ip, XFS_ISTALE)) | ||
| 690 | return lsn - 1; | ||
| 672 | return lsn; | 691 | return lsn; |
| 673 | } | 692 | } |
| 674 | 693 | ||
