Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c   | 11
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c  | 94
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c   | 35
-rw-r--r--  fs/xfs/xfs_acl.h             |  2
-rw-r--r--  fs/xfs/xfs_bmap.c            | 85
-rw-r--r--  fs/xfs/xfs_bmap.h            |  5
-rw-r--r--  fs/xfs/xfs_dfrag.c           | 13
-rw-r--r--  fs/xfs/xfs_error.c           |  3
-rw-r--r--  fs/xfs/xfs_error.h           |  5
-rw-r--r--  fs/xfs/xfs_iget.c            | 13
-rw-r--r--  fs/xfs/xfs_inode_item.c      | 31
-rw-r--r--  fs/xfs/xfs_rename.c          |  1
12 files changed, 211 insertions(+), 87 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3..39f4f809bb6 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 
 int
-xfs_check_acl(struct inode *inode, int mask)
+xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-        struct xfs_inode *ip = XFS_I(inode);
+        struct xfs_inode *ip;
         struct posix_acl *acl;
         int error = -EAGAIN;
 
+        ip = XFS_I(inode);
         trace_xfs_check_acl(ip);
 
         /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
         if (!XFS_IFORK_Q(ip))
                 return -EAGAIN;
 
+        if (flags & IPERM_FLAG_RCU) {
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
+
         acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
         if (IS_ERR(acl))
                 return PTR_ERR(acl);
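
The IPERM_FLAG_RCU branch added above is the standard shape of an RCU-walk aware permission callback: under RCU walk the callback may not block or take references, so it must answer from cached state or return -ECHILD to make the VFS retry in ref-walk mode. A minimal sketch of that shape, for orientation only (example_check_acl is a hypothetical name; the helpers mirror the hunk above):

        static int example_check_acl(struct inode *inode, int mask, unsigned int flags)
        {
                if (flags & IPERM_FLAG_RCU) {
                        /* May not block: answer from the cache or punt to ref-walk. */
                        if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
                                return -ECHILD; /* unknown, retry with references held */
                        return -EAGAIN;         /* known to have no ACL, use mode bits */
                }

                /* Ref-walk path: safe to take locks and read the ACL from disk. */
                return -EAGAIN;
        }
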
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7d287afccde..691f61223ed 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -934,7 +934,6 @@ xfs_aops_discard_page(
         struct xfs_inode *ip = XFS_I(inode);
         struct buffer_head *bh, *head;
         loff_t offset = page_offset(page);
-        ssize_t len = 1 << inode->i_blkbits;
 
         if (!xfs_is_delayed_page(page, IO_DELAY))
                 goto out_invalidate;
@@ -949,58 +948,14 @@ xfs_aops_discard_page(
         xfs_ilock(ip, XFS_ILOCK_EXCL);
         bh = head = page_buffers(page);
         do {
-                int done;
-                xfs_fileoff_t offset_fsb;
-                xfs_bmbt_irec_t imap;
-                int nimaps = 1;
                 int error;
-                xfs_fsblock_t firstblock;
-                xfs_bmap_free_t flist;
+                xfs_fileoff_t start_fsb;
 
                 if (!buffer_delay(bh))
                         goto next_buffer;
 
-                offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
-
-                /*
-                 * Map the range first and check that it is a delalloc extent
-                 * before trying to unmap the range. Otherwise we will be
-                 * trying to remove a real extent (which requires a
-                 * transaction) or a hole, which is probably a bad idea...
-                 */
-                error = xfs_bmapi(NULL, ip, offset_fsb, 1,
-                                XFS_BMAPI_ENTIRE, NULL, 0, &imap,
-                                &nimaps, NULL);
-
-                if (error) {
-                        /* something screwed, just bail */
-                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
-                        "page discard failed delalloc mapping lookup.");
-                        }
-                        break;
-                }
-                if (!nimaps) {
-                        /* nothing there */
-                        goto next_buffer;
-                }
-                if (imap.br_startblock != DELAYSTARTBLOCK) {
-                        /* been converted, ignore */
-                        goto next_buffer;
-                }
-                WARN_ON(imap.br_blockcount == 0);
-
-                /*
-                 * Note: while we initialise the firstblock/flist pair, they
-                 * should never be used because blocks should never be
-                 * allocated or freed for a delalloc extent and hence we need
-                 * don't cancel or finish them after the xfs_bunmapi() call.
-                 */
-                xfs_bmap_init(&flist, &firstblock);
-                error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
-                                        &flist, &done);
-
-                ASSERT(!flist.xbf_count && !flist.xbf_first);
+                start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+                error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
                 if (error) {
                         /* something screwed, just bail */
                         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +965,7 @@ xfs_aops_discard_page(
                         break;
                 }
 next_buffer:
-                offset += len;
+                offset += 1 << inode->i_blkbits;
 
         } while ((bh = bh->b_this_page) != head);
 
@@ -1505,11 +1460,42 @@ xfs_vm_write_failed(
         struct inode *inode = mapping->host;
 
         if (to > inode->i_size) {
-                struct iattr ia = {
-                        .ia_valid = ATTR_SIZE | ATTR_FORCE,
-                        .ia_size = inode->i_size,
-                };
-                xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
+                /*
+                 * Punch out the delalloc blocks we have already allocated. We
+                 * don't call xfs_setattr() to do this as we may be in the
+                 * middle of a multi-iovec write and so the vfs inode->i_size
+                 * will not match the xfs ip->i_size and so it will zero too
+                 * much. Hence we just truncate the page cache to zero what is
+                 * necessary and punch the delalloc blocks directly.
+                 */
+                struct xfs_inode *ip = XFS_I(inode);
+                xfs_fileoff_t start_fsb;
+                xfs_fileoff_t end_fsb;
+                int error;
+
+                truncate_pagecache(inode, to, inode->i_size);
+
+                /*
+                 * Check if there are any blocks that are outside of i_size
+                 * that need to be trimmed back.
+                 */
+                start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+                end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+                if (end_fsb <= start_fsb)
+                        return;
+
+                xfs_ilock(ip, XFS_ILOCK_EXCL);
+                error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+                                                end_fsb - start_fsb);
+                if (error) {
+                        /* something screwed, just bail */
+                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+                        "xfs_vm_write_failed: unable to clean up ino %lld",
+                                                ip->i_ino);
+                        }
+                }
+                xfs_iunlock(ip, XFS_ILOCK_EXCL);
         }
 }
 
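To make the trimming arithmetic in xfs_vm_write_failed() concrete, here is the same calculation as a standalone, compilable sketch. The 4096-byte block size and the file sizes are hypothetical; XFS_B_TO_FSB rounds a byte count up to whole filesystem blocks, which b_to_fsb() imitates:

        #include <stdio.h>

        #define BLKSZ 4096ULL

        static unsigned long long b_to_fsb(unsigned long long bytes)
        {
                return (bytes + BLKSZ - 1) / BLKSZ;     /* round up, like XFS_B_TO_FSB */
        }

        int main(void)
        {
                unsigned long long i_size = 10000, to = 20000;  /* example values */

                unsigned long long start_fsb = b_to_fsb(i_size) + 1;    /* 4 */
                unsigned long long end_fsb = b_to_fsb(to);              /* 5 */

                if (end_fsb > start_fsb)        /* mirrors the end_fsb <= start_fsb bail-out */
                        printf("punch %llu block(s) starting at fsb %llu\n",
                               end_fsb - start_fsb, start_fsb);
                return 0;
        }

Only blocks wholly beyond the block containing i_size are punched; the block that still holds in-range data is left alone.
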
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index aa1d353def2..4c5deb6e9e3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -488,29 +488,16 @@ found:
         spin_unlock(&pag->pag_buf_lock);
         xfs_perag_put(pag);
 
-        /* Attempt to get the semaphore without sleeping,
-         * if this does not work then we need to drop the
-         * spinlock and do a hard attempt on the semaphore.
-         */
-        if (down_trylock(&bp->b_sema)) {
+        if (xfs_buf_cond_lock(bp)) {
+                /* failed, so wait for the lock if requested. */
                 if (!(flags & XBF_TRYLOCK)) {
-                        /* wait for buffer ownership */
                         xfs_buf_lock(bp);
                         XFS_STATS_INC(xb_get_locked_waited);
                 } else {
-                        /* We asked for a trylock and failed, no need
-                         * to look at file offset and length here, we
-                         * know that this buffer at least overlaps our
-                         * buffer and is locked, therefore our buffer
-                         * either does not exist, or is this buffer.
-                         */
                         xfs_buf_rele(bp);
                         XFS_STATS_INC(xb_busy_locked);
                         return NULL;
                 }
-        } else {
-                /* trylock worked */
-                XB_SET_OWNER(bp);
         }
 
         if (bp->b_flags & XBF_STALE) {
@@ -876,10 +863,18 @@ xfs_buf_rele(
  */
 
 /*
- * Locks a buffer object, if it is not already locked.
- * Note that this in no way locks the underlying pages, so it is only
- * useful for synchronizing concurrent use of buffer objects, not for
- * synchronizing independent access to the underlying pages.
+ * Locks a buffer object, if it is not already locked. Note that this in
+ * no way locks the underlying pages, so it is only useful for
+ * synchronizing concurrent use of buffer objects, not for synchronizing
+ * independent access to the underlying pages.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we are
+ * being asked to lock a buffer that has been reallocated. Because it is
+ * pinned, we know that the log has not been pushed to disk and hence it
+ * will still be locked. Rather than continuing to have trylock attempts
+ * fail until someone else pushes the log, push it ourselves before
+ * returning. This means that the xfsaild will not get stuck trying
+ * to push on stale inode buffers.
  */
 int
 xfs_buf_cond_lock(
@@ -890,6 +885,8 @@ xfs_buf_cond_lock(
         locked = down_trylock(&bp->b_sema) == 0;
         if (locked)
                 XB_SET_OWNER(bp);
+        else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+                xfs_log_force(bp->b_target->bt_mount, 0);
 
         trace_xfs_buf_cond_lock(bp, _RET_IP_);
         return locked ? 0 : -EBUSY;
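
Taken together, the two hunks above change what a failed trylock means for the caller: a stale, pinned buffer no longer leaves the caller retrying until some other thread happens to push the log, because xfs_buf_cond_lock() now issues an asynchronous log force itself. A rough sketch of the resulting caller-side behaviour, condensed from the first hunk (illustrative only):

        if (xfs_buf_cond_lock(bp)) {            /* -EBUSY; the log was kicked if stale+pinned */
                if (!(flags & XBF_TRYLOCK))
                        xfs_buf_lock(bp);       /* willing to sleep for ownership */
                else
                        xfs_buf_rele(bp);       /* give up now; a later retry can succeed */
        }
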
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d..11dd72070cb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
 #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
 
 #ifdef CONFIG_XFS_POSIX_ACL
-extern int xfs_check_acl(struct inode *inode, int mask);
+extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
 extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
 extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8abd12e32e1..4111cd3966c 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5471,8 +5471,13 @@ xfs_getbmap(
                 if (error)
                         goto out_unlock_iolock;
         }
-
-        ASSERT(ip->i_delayed_blks == 0);
+        /*
+         * Even after flushing the inode, there can still be delalloc
+         * blocks on the inode beyond EOF due to speculative
+         * preallocation. These are not removed until the release
+         * function is called or the inode is inactivated. Hence we
+         * cannot assert here that ip->i_delayed_blks == 0.
+         */
         }
 
         lock = xfs_ilock_map_shared(ip);
@@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves(
                 *count += xfs_bmbt_disk_get_blockcount(frp);
         }
 }
+
+/*
+ * Dead simple method of punching delayed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+        struct xfs_inode *ip,
+        xfs_fileoff_t start_fsb,
+        xfs_fileoff_t length)
+{
+        xfs_fileoff_t remaining = length;
+        int error = 0;
+
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+        do {
+                int done;
+                xfs_bmbt_irec_t imap;
+                int nimaps = 1;
+                xfs_fsblock_t firstblock;
+                xfs_bmap_free_t flist;
+
+                /*
+                 * Map the range first and check that it is a delalloc extent
+                 * before trying to unmap the range. Otherwise we will be
+                 * trying to remove a real extent (which requires a
+                 * transaction) or a hole, which is probably a bad idea...
+                 */
+                error = xfs_bmapi(NULL, ip, start_fsb, 1,
+                                XFS_BMAPI_ENTIRE, NULL, 0, &imap,
+                                &nimaps, NULL);
+
+                if (error) {
+                        /* something screwed, just bail */
+                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+                        "Failed delalloc mapping lookup ino %lld fsb %lld.",
+                                                ip->i_ino, start_fsb);
+                        }
+                        break;
+                }
+                if (!nimaps) {
+                        /* nothing there */
+                        goto next_block;
+                }
+                if (imap.br_startblock != DELAYSTARTBLOCK) {
+                        /* been converted, ignore */
+                        goto next_block;
+                }
+                WARN_ON(imap.br_blockcount == 0);
+
+                /*
+                 * Note: while we initialise the firstblock/flist pair, they
+                 * should never be used because blocks should never be
+                 * allocated or freed for a delalloc extent and hence we don't
+                 * need to cancel or finish them after the xfs_bunmapi() call.
+                 */
+                xfs_bmap_init(&flist, &firstblock);
+                error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+                                        &flist, &done);
+                if (error)
+                        break;
+
+                ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+                start_fsb++;
+                remaining--;
+        } while (remaining > 0);
+
+        return error;
+}
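
Both call sites this patch adds hold the inode lock exclusively around the call, matching the ASSERT inside the helper, and pass whole-block ranges as the comment above requires. The usage pattern, as seen in the xfs_vm_write_failed() hunk:

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_bmap_punch_delalloc_range(ip, start_fsb, end_fsb - start_fsb);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
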
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 71ec9b6ecdf..3651191daea 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
         int whichfork,
         int *count);
 
+int
+xfs_bmap_punch_delalloc_range(
+        struct xfs_inode *ip,
+        xfs_fileoff_t start_fsb,
+        xfs_fileoff_t length);
 #endif  /* __KERNEL__ */
 
 #endif  /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a2..e60490bc00a 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
         ip->i_d.di_format = tip->i_d.di_format;
         tip->i_d.di_format = tmp;
 
+        /*
+         * The extents in the source inode could still contain speculative
+         * preallocation beyond EOF (e.g. the file is open but not modified
+         * while defrag is in progress). In that case, we need to copy over the
+         * number of delalloc blocks the data fork in the source inode is
+         * tracking beyond EOF so that when the fork is truncated away when the
+         * temporary inode is unlinked we don't underrun the i_delayed_blks
+         * counter on that inode.
+         */
+        ASSERT(tip->i_delayed_blks == 0);
+        tip->i_delayed_blks = ip->i_delayed_blks;
+        ip->i_delayed_blks = 0;
+
         ilf_fields = XFS_ILOG_CORE;
 
         switch(ip->i_d.di_format) {
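
A hypothetical before/after makes the underrun concrete: suppose the source file carries 8 delalloc blocks of speculative preallocation beyond EOF at swap time.

        /*
         *                        ip->i_delayed_blks   tip->i_delayed_blks
         *   before the swap              8                     0
         *   after the hunk above         0                     8
         *
         * The temporary inode now owns those blocks, so it must own the
         * count as well; otherwise truncating its fork at unlink time would
         * decrement a counter that is already zero.
         */
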
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed999026766..c78cc6a3d87 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
 int     xfs_etest[XFS_NUM_INJECT_ERROR];
 int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
 char *  xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
+int     xfs_error_test_active;
 
 int
 xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
                         len = strlen(mp->m_fsname);
                         xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
                         strcpy(xfs_etest_fsname[i], mp->m_fsname);
+                        xfs_error_test_active++;
                         return 0;
                 }
         }
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
                         xfs_etest_fsid[i] = 0LL;
                         kmem_free(xfs_etest_fsname[i]);
                         xfs_etest_fsname[i] = NULL;
+                        xfs_error_test_active--;
                 }
         }
 
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb8..f338847f80b 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level,
 #define XFS_RANDOM_BMAPIFORMAT          XFS_RANDOM_DEFAULT
 
 #ifdef DEBUG
+extern int xfs_error_test_active;
 extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
 
 #define XFS_NUM_INJECT_ERROR    10
 #define XFS_TEST_ERROR(expr, mp, tag, rf)               \
-        ((expr) || \
+        ((expr) || (xfs_error_test_active && \
          xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
-                        (rf)))
+                        (rf))))
 
 extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
 extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
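
The macro change is the classic cheap-global-gate pattern: when no error tags are registered, the new xfs_error_test_active check short-circuits the && and the per-call xfs_error_test() table scan is skipped at every XFS_TEST_ERROR site. A self-contained toy version (all names here are illustrative, not kernel APIs):

        #include <stdio.h>

        static int test_active;                 /* plays the role of xfs_error_test_active */

        static int expensive_scan(void)
        {
                printf("scanning the injection table\n");
                return 0;                       /* no injected error */
        }

        #define TEST_ERROR(expr) ((expr) || (test_active && expensive_scan()))

        int main(void)
        {
                if (TEST_ERROR(0))              /* gate closed: expensive_scan() never runs */
                        return 1;
                test_active++;                  /* analogous to xfs_errortag_add() */
                if (TEST_ERROR(0))              /* gate open: the scan runs once */
                        return 1;
                return 0;
        }
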
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0cdd26932d8..d7de5a3f786 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -91,6 +91,17 @@ xfs_inode_alloc(
         return ip;
 }
 
+STATIC void
+xfs_inode_free_callback(
+        struct rcu_head *head)
+{
+        struct inode *inode = container_of(head, struct inode, i_rcu);
+        struct xfs_inode *ip = XFS_I(inode);
+
+        INIT_LIST_HEAD(&inode->i_dentry);
+        kmem_zone_free(xfs_inode_zone, ip);
+}
+
 void
 xfs_inode_free(
         struct xfs_inode *ip)
@@ -134,7 +145,7 @@ xfs_inode_free(
         ASSERT(!spin_is_locked(&ip->i_flags_lock));
         ASSERT(completion_done(&ip->i_flush));
 
-        kmem_zone_free(xfs_inode_zone, ip);
+        call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback);
 }
 
 /*
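
The switch from an immediate kmem_zone_free() to call_rcu() is the standard RCU deferred-free idiom: embed a struct rcu_head in the object and free it from a callback that runs only after a grace period, so lock-free readers (here, RCU path walk inspecting the inode) can never see the memory reused underneath them. The generic shape, as a sketch with a hypothetical struct foo:

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct foo {
                struct rcu_head rcu;            /* storage for the deferred callback */
                /* ... payload ... */
        };

        static void foo_free_rcu(struct rcu_head *head)
        {
                /* Runs after a grace period; no RCU reader still sees the object. */
                kfree(container_of(head, struct foo, rcu));
        }

        static void foo_free(struct foo *f)
        {
                call_rcu(&f->rcu, foo_free_rcu);        /* instead of an immediate kfree() */
        }
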
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c7ac020705d..7c8d30c453c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -657,18 +657,37 @@ xfs_inode_item_unlock(
 }
 
 /*
- * This is called to find out where the oldest active copy of the
- * inode log item in the on disk log resides now that the last log
- * write of it completed at the given lsn. Since we always re-log
- * all dirty data in an inode, the latest copy in the on disk log
- * is the only one that matters. Therefore, simply return the
- * given lsn.
+ * This is called to find out where the oldest active copy of the inode log
+ * item in the on disk log resides now that the last log write of it completed
+ * at the given lsn. Since we always re-log all dirty data in an inode, the
+ * latest copy in the on disk log is the only one that matters. Therefore,
+ * simply return the given lsn.
+ *
+ * If the inode has been marked stale because the cluster is being freed, we
+ * don't want to (re-)insert this inode into the AIL. There is a race condition
+ * where the cluster buffer may be unpinned before the inode is inserted into
+ * the AIL during transaction committed processing. If the buffer is unpinned
+ * before the inode item has been committed and inserted, then it is possible
+ * for the buffer to be written and the IO to complete before the inode is
+ * inserted into the AIL. In that case, we'd be inserting a clean, stale inode
+ * into the AIL which will never get removed. It will, however, get reclaimed,
+ * which triggers an assert in xfs_inode_free() complaining about freeing an
+ * inode still in the AIL.
+ *
+ * To avoid this, return a lower LSN than the one passed in so that the
+ * transaction committed code will not move the inode forward in the AIL but
+ * will still unpin it properly.
  */
 STATIC xfs_lsn_t
 xfs_inode_item_committed(
         struct xfs_log_item *lip,
         xfs_lsn_t lsn)
 {
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+        struct xfs_inode *ip = iip->ili_inode;
+
+        if (xfs_iflags_test(ip, XFS_ISTALE))
+                return lsn - 1;
         return lsn;
 }
 
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a..77a59891734 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
          * it and some incremental backup programs won't work without it.
          */
         xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+        xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
 
         /*
          * Adjust the link count on src_dp. This is necessary when