Merge branch 'v2.6.36-rc8' into for-2.6.37/barrier

Conflicts:
	block/blk-core.c
	drivers/block/loop.c
	mm/swapfile.c

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
author: Jens Axboe <jaxboe@fusionio.com> 2010-10-19 03:13:04 -0400
committer: Jens Axboe <jaxboe@fusionio.com> 2010-10-19 03:13:04 -0400
commit: fa251f89903d73989e2f63e13d0eaed1e07ce0da (patch)
tree: 3f7fe779941e3b6d67754dd7c44a32f48ea47c74 /fs/xfs
parent: dd3932eddf428571762596e17b65f5dc92ca361b (diff)
parent: cd07202cc8262e1669edff0d97715f3dd9260917 (diff)
19 files changed, 332 insertions, 239 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 15412fe15c3a..b552f816de15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -852,8 +852,8 @@ xfs_convert_page(
                SetPageUptodate(page);
        if (count) {
-                wbc->nr_to_write--;
+                if (--wbc->nr_to_write <= 0 &&
-                if (wbc->nr_to_write <= 0)
+                    wbc->sync_mode == WB_SYNC_NONE)
                        done = 1;
        }
        xfs_start_page_writeback(page, !page_dirty, count);
@@ -1068,7 +1068,7 @@ xfs_vm_writepage(
         * by themselves.
         */
        if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
-                goto out_fail;
+                goto redirty;
        /*
         * We need a transaction if there are delalloc or unwritten buffers
@@ -1080,7 +1080,7 @@ xfs_vm_writepage(
         */
        xfs_count_page_state(page, &delalloc, &unwritten);
        if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
-                goto out_fail;
+                goto redirty;
        /* Is this page beyond the end of the file? */
        offset = i_size_read(inode);
@@ -1245,12 +1245,15 @@ error:
        if (iohead)
                xfs_cancel_ioend(iohead);
+        if (err == -EAGAIN)
+                goto redirty;
        xfs_aops_discard_page(page);
        ClearPageUptodate(page);
        unlock_page(page);
        return err;
-out_fail:
+redirty:
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
        return 0;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b93ea3342281..1846a0dd7035 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -440,12 +440,7 @@ _xfs_buf_find(
                ASSERT(btp == bp->b_target);
                if (bp->b_file_offset == range_base &&
                    bp->b_buffer_length == range_length) {
-                        /*
-                         * If we look at something, bring it to the
-                         * front of the list for next time.
-                         */
                        atomic_inc(&bp->b_hold);
-                        list_move(&bp->b_hash_list, &hash->bh_list);
                        goto found;
                }
        }
@@ -1431,8 +1426,7 @@ xfs_alloc_bufhash(
 {
        unsigned int            i;
-        btp->bt_hashshift = external ? 3 : 8;   /* 8 or 256 buckets */
+        btp->bt_hashshift = external ? 3 : 12;  /* 8 or 4096 buckets */
-        btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
        btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
                                         sizeof(xfs_bufhash_t));
        for (i = 0; i < (1 << btp->bt_hashshift); i++) {
@@ -1926,7 +1920,8 @@ xfs_buf_init(void)
        if (!xfs_buf_zone)
                goto out;
-        xfslogd_workqueue = create_workqueue("xfslogd");
+        xfslogd_workqueue = alloc_workqueue("xfslogd",
+                                        WQ_RESCUER | WQ_HIGHPRI, 1);
        if (!xfslogd_workqueue)
                goto out_free_buf_zone;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d533d64e2c3e..9d021c73ea52 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,7 +128,6 @@ typedef struct xfs_buftarg {
        size_t                  bt_smask;
        /* per device buffer hash table */
-        uint                    bt_hashmask;
        uint                    bt_hashshift;
        xfs_bufhash_t           *bt_hash;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 237f5ffb2ee8..3b9e626f7cd1 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -785,6 +785,8 @@ xfs_ioc_fsgetxattr(
 {
        struct fsxattr          fa;
+        memset(&fa, 0, sizeof(struct fsxattr));
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        fa.fsx_xflags = xfs_ip2xflags(ip);
        fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
@@ -907,6 +909,13 @@ xfs_ioctl_setattr(
                return XFS_ERROR(EIO);
        /*
+         * Disallow 32bit project ids because on-disk structure
+         * is 16bit only.
+         */
+        if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
+                return XFS_ERROR(EINVAL);
+        /*
         * If disk quotas is on, we make sure that the dquots do exist on disk,
         * before we start any other transactions. Trying to do this later
         * is messy. We don't care to take a readlock to look at the ids
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 68be25dcd301..b1fc2a6bfe83 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -664,7 +664,7 @@ xfs_vn_fiemap(
                                        fieinfo->fi_extents_max + 1;
        bm.bmv_count = min_t(__s32, bm.bmv_count,
                             (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-        bm.bmv_iflags = BMV_IF_PREALLOC;
+        bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
                bm.bmv_iflags |= BMV_IF_ATTRFORK;
        if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 5fa7a30cc3f0..08fd3102128c 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1225,6 +1225,7 @@ xfs_fs_statfs(
        struct xfs_inode        *ip = XFS_I(dentry->d_inode);
        __uint64_t              fakeinos, id;
        xfs_extlen_t            lsize;
+        __int64_t               ffree;
        statp->f_type = XFS_SB_MAGIC;
        statp->f_namelen = MAXNAMELEN - 1;
@@ -1248,7 +1249,11 @@ xfs_fs_statfs(
                statp->f_files = min_t(typeof(statp->f_files),
                                        statp->f_files,
                                        mp->m_maxicount);
-        statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+        /* make sure statp->f_ffree does not underflow */
+        ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+        statp->f_ffree = max_t(__int64_t, ffree, 0);
        spin_unlock(&mp->m_sb_lock);
        if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
@@ -1401,7 +1406,7 @@ xfs_fs_freeze(
        xfs_save_resvblks(mp);
        xfs_quiesce_attr(mp);
-        return -xfs_fs_log_dummy(mp);
+        return -xfs_fs_log_dummy(mp, SYNC_WAIT);
 }
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index dfcbd98d1599..81976ffed7d6 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -34,6 +34,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
+#include "xfs_fsops.h"
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -341,38 +342,6 @@ xfs_sync_attr(
 }
 STATIC int
-xfs_commit_dummy_trans(
-        struct xfs_mount        *mp,
-        uint                    flags)
-{
-        struct xfs_inode        *ip = mp->m_rootip;
-        struct xfs_trans        *tp;
-        int                     error;
-        /*
-         * Put a dummy transaction in the log to tell recovery
-         * that all others are OK.
-         */
-        tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-        error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                return error;
-        }
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip);
-        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        error = xfs_trans_commit(tp, 0);
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        /* the log force ensures this transaction is pushed to disk */
-        xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-        return error;
-}
-STATIC int
 xfs_sync_fsdata(
        struct xfs_mount        *mp)
 {
@@ -432,7 +401,7 @@ xfs_quiesce_data(
        /* mark the log as covered if needed */
        if (xfs_log_need_covered(mp))
-                error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
+                error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
        /* flush data-only devices */
        if (mp->m_rtdev_targp)
@@ -563,7 +532,7 @@ xfs_flush_inodes(
 /*
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle.
+ * filesystem is idle and not frozen.
 */
 STATIC void
 xfs_sync_worker(
@@ -577,8 +546,9 @@ xfs_sync_worker(
                xfs_reclaim_inodes(mp, 0);
                /* dgc: errors ignored here */
                error = xfs_qm_sync(mp, SYNC_TRYLOCK);
-                if (xfs_log_need_covered(mp))
+                if (mp->m_super->s_frozen == SB_UNFROZEN &&
-                        error = xfs_commit_dummy_trans(mp, 0);
+                    xfs_log_need_covered(mp))
+                        error = xfs_fs_log_dummy(mp, 0);
        }
        mp->m_sync_seq++;
        wake_up(&mp->m_wait_single_sync_task);
@@ -698,14 +668,11 @@ xfs_inode_set_reclaim_tag(
        xfs_perag_put(pag);
 }
-void
+STATIC void
-__xfs_inode_clear_reclaim_tag(
+__xfs_inode_clear_reclaim(
-        xfs_mount_t     *mp,
        xfs_perag_t     *pag,
        xfs_inode_t     *ip)
 {
-        radix_tree_tag_clear(&pag->pag_ici_root,
-                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
        pag->pag_ici_reclaimable--;
        if (!pag->pag_ici_reclaimable) {
                /* clear the reclaim tag from the perag radix tree */
@@ -719,6 +686,17 @@ __xfs_inode_clear_reclaim_tag(
        }
 }
+void
+__xfs_inode_clear_reclaim_tag(
+        xfs_mount_t     *mp,
+        xfs_perag_t     *pag,
+        xfs_inode_t     *ip)
+{
+        radix_tree_tag_clear(&pag->pag_ici_root,
+                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+        __xfs_inode_clear_reclaim(pag, ip);
+}
 /*
 * Inodes in different states need to be treated differently, and the return
 * value of xfs_iflush is not sufficient to get this right. The following table
@@ -868,6 +846,7 @@ reclaim:
        if (!radix_tree_delete(&pag->pag_ici_root,
                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
                ASSERT(0);
+        __xfs_inode_clear_reclaim(pag, ip);
        write_unlock(&pag->pag_ici_lock);
        /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 23f14e595c18..f90dadd5a968 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5533,12 +5533,24 @@ xfs_getbmap(
                                        map[i].br_startblock))
                                goto out_free_map;
-                        nexleft--;
                        bmv->bmv_offset =
                                out[cur_ext].bmv_offset +
                                out[cur_ext].bmv_length;
                        bmv->bmv_length =
                                max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+                        /*
+                         * In case we don't want to return the hole,
+                         * don't increase cur_ext so that we can reuse
+                         * it in the next loop.
+                         */
+                        if ((iflags & BMV_IF_NO_HOLES) &&
+                            map[i].br_startblock == HOLESTARTBLOCK) {
+                                memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+                                continue;
+                        }
+                        nexleft--;
                        bmv->bmv_entries++;
                        cur_ext++;
                }
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5f..87c2e9d02288 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
 #define BMV_IF_NO_DMAPI_READ    0x2     /* Do not generate DMAPI read event  */
 #define BMV_IF_PREALLOC         0x4     /* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC         0x8     /* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES         0x10    /* Do not return holes */
 #define BMV_IF_VALID    \
-        (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
+        (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|  \
+         BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
 /*      bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC         0x1     /* segment = unwritten pre-allocation */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dbca5f5c37ba..43b1d5699335 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -604,31 +604,36 @@ out:
        return 0;
 }
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead.
+ */
 int
 xfs_fs_log_dummy(
-        xfs_mount_t     *mp)
+        xfs_mount_t     *mp,
+        int             flags)
 {
        xfs_trans_t     *tp;
-        xfs_inode_t     *ip;
        int             error;
        tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-        error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+        error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+                                        XFS_DEFAULT_LOG_COUNT);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
        }
-        ip = mp->m_rootip;
+        /* log the UUID because it is an unchanging field */
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_mod_sb(tp, XFS_SB_UUID);
+        if (flags & SYNC_WAIT)
-        xfs_trans_ijoin(tp, ip);
+                xfs_trans_set_sync(tp);
-        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+        return xfs_trans_commit(tp, 0);
-        xfs_trans_set_sync(tp);
-        error = xfs_trans_commit(tp, 0);
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        return error;
 }
 int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c9..a786c5212c1e 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
                                xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
 #endif  /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index abf80ae1e95b..5371d2dc360e 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1213,7 +1213,6 @@ xfs_imap_lookup(
        struct xfs_inobt_rec_incore rec;
        struct xfs_btree_cur    *cur;
        struct xfs_buf          *agbp;
-        xfs_agino_t             startino;
        int                     error;
        int                     i;
@@ -1227,13 +1226,13 @@ xfs_imap_lookup(
        }
        /*
-         * derive and lookup the exact inode record for the given agino. If the
+         * Lookup the inode record for the given agino. If the record cannot be
-         * record cannot be found, then it's an invalid inode number and we
+         * found, then it's an invalid inode number and we should abort. Once
-         * should abort.
+         * we have a record, we need to ensure it contains the inode number
+         * we are looking up.
         */
        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-        startino = agino & ~(XFS_IALLOC_INODES(mp) - 1);
+        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
-        error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
        if (!error) {
                if (i)
                        error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1246,6 +1245,11 @@ xfs_imap_lookup(
        if (error)
                return error;
+        /* check that the returned record contains the required inode */
+        if (rec.ir_startino > agino ||
+            rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+                return EINVAL;
        /* for untrusted inodes check it is allocated first */
        if ((flags & XFS_IGET_UNTRUSTED) &&
            (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 68415cb4f23c..34798f391c49 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1914,6 +1914,11 @@ xfs_iunlink_remove(
        return 0;
 }
+/*
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
 STATIC void
 xfs_ifree_cluster(
        xfs_inode_t     *free_ip,
@@ -1945,8 +1950,6 @@ xfs_ifree_cluster(
        }
        for (j = 0; j < nbufs; j++, inum += ninodes) {
-                int     found = 0;
                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
                                         XFS_INO_TO_AGBNO(mp, inum));
@@ -1965,7 +1968,9 @@ xfs_ifree_cluster(
                /*
                 * Walk the inodes already attached to the buffer and mark them
                 * stale. These will all have the flush locks held, so an
-                 * in-memory inode walk can't lock them.
+                 * in-memory inode walk can't lock them. By marking them all
+                 * stale first, we will not attempt to lock them in the loop
+                 * below as the XFS_ISTALE flag will be set.
                 */
                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
                while (lip) {
@@ -1977,11 +1982,11 @@ xfs_ifree_cluster(
                                                        &iip->ili_flush_lsn,
                                                        &iip->ili_item.li_lsn);
                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-                                found++;
                        }
                        lip = lip->li_bio_list;
                }
                /*
                 * For each inode in memory attempt to add it to the inode
                 * buffer and set it up for being staled on buffer IO
@@ -1993,6 +1998,7 @@ xfs_ifree_cluster(
                 * even trying to lock them.
                 */
                for (i = 0; i < ninodes; i++) {
+retry:
                        read_lock(&pag->pag_ici_lock);
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2003,38 +2009,36 @@ xfs_ifree_cluster(
                                continue;
                        }
-                        /* don't try to lock/unlock the current inode */
+                        /*
+                         * Don't try to lock/unlock the current inode, but we
+                         * _cannot_ skip the other inodes that we did not find
+                         * in the list attached to the buffer and are not
+                         * already marked stale. If we can't lock it, back off
+                         * and retry.
+                         */
                        if (ip != free_ip &&
                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
                                read_unlock(&pag->pag_ici_lock);
-                                continue;
+                                delay(1);
+                                goto retry;
                        }
                        read_unlock(&pag->pag_ici_lock);
-                        if (!xfs_iflock_nowait(ip)) {
+                        xfs_iflock(ip);
-                                if (ip != free_ip)
-                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                continue;
-                        }
                        xfs_iflags_set(ip, XFS_ISTALE);
-                        if (xfs_inode_clean(ip)) {
-                                ASSERT(ip != free_ip);
-                                xfs_ifunlock(ip);
-                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                continue;
-                        }
+                        /*
+                         * we don't need to attach clean inodes or those only
+                         * with unlogged changes (which we throw away, anyway).
+                         */
                        iip = ip->i_itemp;
-                        if (!iip) {
+                        if (!iip || xfs_inode_clean(ip)) {
-                                /* inode with unlogged changes only */
                                ASSERT(ip != free_ip);
                                ip->i_update_core = 0;
                                xfs_ifunlock(ip);
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                continue;
                        }
-                        found++;
                        iip->ili_last_fields = iip->ili_format.ilf_fields;
                        iip->ili_format.ilf_fields = 0;
@@ -2049,8 +2053,7 @@ xfs_ifree_cluster(
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                }
-                if (found)
+                xfs_trans_stale_inode_buf(tp, bp);
-                        xfs_trans_stale_inode_buf(tp, bp);
                xfs_trans_binval(tp, bp);
        }
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 430a8fc02c1f..ba8e36e0b4e7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3002,7 +3002,8 @@ _xfs_log_force(
        XFS_STATS_INC(xs_log_force);
-        xlog_cil_push(log, 1);
+        if (log->l_cilp)
+                xlog_cil_force(log);
        spin_lock(&log->l_icloglock);
@@ -3154,7 +3155,7 @@ _xfs_log_force_lsn(
        XFS_STATS_INC(xs_log_force);
        if (log->l_cilp) {
-                lsn = xlog_cil_push_lsn(log, lsn);
+                lsn = xlog_cil_force_lsn(log, lsn);
                if (lsn == NULLCOMMITLSN)
                        return 0;
        }
@@ -3711,7 +3712,7 @@ xfs_log_force_umount(
         * call below.
         */
        if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
-                xlog_cil_push(log, 1);
+                xlog_cil_force(log);
        /*
         * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 31e4ea2d19ac..7e206fc1fa36 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -68,6 +68,7 @@ xlog_cil_init(
        ctx->sequence = 1;
        ctx->cil = cil;
        cil->xc_ctx = ctx;
+        cil->xc_current_sequence = ctx->sequence;
        cil->xc_log = log;
        log->l_cilp = cil;
@@ -269,15 +270,10 @@ xlog_cil_insert(
 static void
 xlog_cil_format_items(
        struct log              *log,
-        struct xfs_log_vec      *log_vector,
+        struct xfs_log_vec      *log_vector)
-        struct xlog_ticket      *ticket,
-        xfs_lsn_t               *start_lsn)
 {
        struct xfs_log_vec *lv;
-        if (start_lsn)
-                *start_lsn = log->l_cilp->xc_ctx->sequence;
        ASSERT(log_vector);
        for (lv = log_vector; lv; lv = lv->lv_next) {
                void    *ptr;
@@ -301,9 +297,24 @@ xlog_cil_format_items(
                        ptr += vec->i_len;
                }
                ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+        }
+}
+static void
+xlog_cil_insert_items(
+        struct log              *log,
+        struct xfs_log_vec      *log_vector,
+        struct xlog_ticket      *ticket,
+        xfs_lsn_t               *start_lsn)
+{
+        struct xfs_log_vec *lv;
+        if (start_lsn)
+                *start_lsn = log->l_cilp->xc_ctx->sequence;
+        ASSERT(log_vector);
+        for (lv = log_vector; lv; lv = lv->lv_next)
                xlog_cil_insert(log, ticket, lv->lv_item, lv);
-        }
 }
 static void
@@ -321,80 +332,6 @@ xlog_cil_free_logvec(
 }
 /*
- * Commit a transaction with the given vector to the Committed Item List.
- *
- * To do this, we need to format the item, pin it in memory if required and
- * account for the space used by the transaction. Once we have done that we
- * need to release the unused reservation for the transaction, attach the
- * transaction to the checkpoint context so we carry the busy extents through
- * to checkpoint completion, and then unlock all the items in the transaction.
- *
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
- * Called with the context lock already held in read mode to lock out
- * background commit, returns without it held once background commits are
- * allowed again.
- */
-int
-xfs_log_commit_cil(
-        struct xfs_mount        *mp,
-        struct xfs_trans        *tp,
-        struct xfs_log_vec      *log_vector,
-        xfs_lsn_t               *commit_lsn,
-        int                     flags)
-{
-        struct log              *log = mp->m_log;
-        int                     log_flags = 0;
-        int                     push = 0;
-        if (flags & XFS_TRANS_RELEASE_LOG_RES)
-                log_flags = XFS_LOG_REL_PERM_RESERV;
-        if (XLOG_FORCED_SHUTDOWN(log)) {
-                xlog_cil_free_logvec(log_vector);
-                return XFS_ERROR(EIO);
-        }
-        /* lock out background commit */
-        down_read(&log->l_cilp->xc_ctx_lock);
-        xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
-        /* check we didn't blow the reservation */
-        if (tp->t_ticket->t_curr_res < 0)
-                xlog_print_tic_res(log->l_mp, tp->t_ticket);
-        /* attach the transaction to the CIL if it has any busy extents */
-        if (!list_empty(&tp->t_busy)) {
-                spin_lock(&log->l_cilp->xc_cil_lock);
-                list_splice_init(&tp->t_busy,
-                                        &log->l_cilp->xc_ctx->busy_extents);
-                spin_unlock(&log->l_cilp->xc_cil_lock);
-        }
-        tp->t_commit_lsn = *commit_lsn;
-        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-        xfs_trans_unreserve_and_mod_sb(tp);
-        /* check for background commit before unlock */
-        if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
-                push = 1;
-        up_read(&log->l_cilp->xc_ctx_lock);
-        /*
-         * We need to push CIL every so often so we don't cache more than we
-         * can fit in the log. The limit really is that a checkpoint can't be
-         * more than half the log (the current checkpoint is not allowed to
-         * overwrite the previous checkpoint), but commit latency and memory
-         * usage limit this to a smaller size in most cases.
-         */
-        if (push)
-                xlog_cil_push(log, 0);
-        return 0;
-}
-/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
@@ -427,13 +364,23 @@ xlog_cil_committed(
 }
 /*
- * Push the Committed Item List to the log. If the push_now flag is not set,
+ * Push the Committed Item List to the log. If @push_seq flag is zero, then it
- * then it is a background flush and so we can chose to ignore it.
+ * is a background flush and so we can choose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allow log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
 */
-int
+STATIC int
 xlog_cil_push(
        struct log              *log,
-        int                     push_now)
+        xfs_lsn_t               push_seq)
 {
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_log_vec      *lv;
@@ -453,12 +400,20 @@ xlog_cil_push(
        if (!cil)
                return 0;
+        ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
        new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
        new_ctx->ticket = xlog_cil_ticket_alloc(log);
-        /* lock out transaction commit, but don't block on background push */
+        /*
+         * Lock out transaction commit, but don't block for background pushes
+         * unless we are well over the CIL space limit. See the definition of
+         * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
+         * used here.
+         */
        if (!down_write_trylock(&cil->xc_ctx_lock)) {
-                if (!push_now)
+                if (!push_seq &&
+                    cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
                        goto out_free_ticket;
                down_write(&cil->xc_ctx_lock);
        }
@@ -469,7 +424,11 @@ xlog_cil_push(
                goto out_skip;
        /* check for spurious background flush */
-        if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+        if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+                goto out_skip;
+        /* check for a previously pushed sequence */
+        if (push_seq && push_seq < cil->xc_ctx->sequence)
                goto out_skip;
        /*
@@ -515,6 +474,13 @@ xlog_cil_push(
        cil->xc_ctx = new_ctx;
        /*
+         * mirror the new sequence into the cil structure so that we can do
+         * unlocked checks against the current sequence in log forces without
+         * risking dereferencing a freed context pointer.
+         */
+        cil->xc_current_sequence = new_ctx->sequence;
+        /*
         * The switch is now done, so we can drop the context lock and move out
         * of a shared context. We can't just go straight to the commit record,
         * though - we need to synchronise with previous and future commits so
@@ -626,6 +592,102 @@ out_abort:
 }
 /*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        struct xfs_log_vec      *log_vector,
+        xfs_lsn_t               *commit_lsn,
+        int                     flags)
+{
+        struct log              *log = mp->m_log;
+        int                     log_flags = 0;
+        int                     push = 0;
+        if (flags & XFS_TRANS_RELEASE_LOG_RES)
+                log_flags = XFS_LOG_REL_PERM_RESERV;
+        if (XLOG_FORCED_SHUTDOWN(log)) {
+                xlog_cil_free_logvec(log_vector);
+                return XFS_ERROR(EIO);
+        }
+        /*
+         * do all the hard work of formatting items (including memory
+         * allocation) outside the CIL context lock. This prevents stalling CIL
+         * pushes when we are low on memory and a transaction commit spends a
+         * lot of time in memory reclaim.
+         */
+        xlog_cil_format_items(log, log_vector);
+        /* lock out background commit */
+        down_read(&log->l_cilp->xc_ctx_lock);
+        xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
+        /* check we didn't blow the reservation */
+        if (tp->t_ticket->t_curr_res < 0)
+                xlog_print_tic_res(log->l_mp, tp->t_ticket);
+        /* attach the transaction to the CIL if it has any busy extents */
+        if (!list_empty(&tp->t_busy)) {
+                spin_lock(&log->l_cilp->xc_cil_lock);
+                list_splice_init(&tp->t_busy,
+                                        &log->l_cilp->xc_ctx->busy_extents);
+                spin_unlock(&log->l_cilp->xc_cil_lock);
+        }
+        tp->t_commit_lsn = *commit_lsn;
+        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+        xfs_trans_unreserve_and_mod_sb(tp);
+        /*
+         * Once all the items of the transaction have been copied to the CIL,
+         * the items can be unlocked and freed.
+         *
+         * This needs to be done before we drop the CIL context lock because we
+         * have to update state in the log items and unlock them before they go
+         * to disk. If we don't, then the CIL checkpoint can race with us and
+         * we can run checkpoint completion before we've updated and unlocked
+         * the log items. This affects (at least) processing of stale buffers,
+         * inodes and EFIs.
+         */
+        xfs_trans_free_items(tp, *commit_lsn, 0);
+        /* check for background commit before unlock */
+        if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+                push = 1;
+        up_read(&log->l_cilp->xc_ctx_lock);
+        /*
+         * We need to push CIL every so often so we don't cache more than we
+         * can fit in the log. The limit really is that a checkpoint can't be
+         * more than half the log (the current checkpoint is not allowed to
+         * overwrite the previous checkpoint), but commit latency and memory
+         * usage limit this to a smaller size in most cases.
+         */
+        if (push)
+                xlog_cil_push(log, 0);
+        return 0;
+}
+/*
 * Conditionally push the CIL based on the sequence passed in.
 *
 * We only need to push if we haven't already pushed the sequence
@@ -639,39 +701,34 @@ out_abort:
 * commit lsn is there. It'll be empty, so this is broken for now.
 */
 xfs_lsn_t
-xlog_cil_push_lsn(
+xlog_cil_force_lsn(
        struct log      *log,
-        xfs_lsn_t       push_seq)
+        xfs_lsn_t       sequence)
 {
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_cil_ctx      *ctx;
        xfs_lsn_t               commit_lsn = NULLCOMMITLSN;
-restart:
+        ASSERT(sequence <= cil->xc_current_sequence);
-        down_write(&cil->xc_ctx_lock);
-        ASSERT(push_seq <= cil->xc_ctx->sequence);
+        /*
+         * check to see if we need to force out the current context.
-        /* check to see if we need to force out the current context */
+         * xlog_cil_push() handles racing pushes for the same sequence,
-        if (push_seq == cil->xc_ctx->sequence) {
+         * so no need to deal with it here.
-                up_write(&cil->xc_ctx_lock);
+         */
-                xlog_cil_push(log, 1);
+        if (sequence == cil->xc_current_sequence)
-                goto restart;
+                xlog_cil_push(log, sequence);
-        }
        /*
         * See if we can find a previous sequence still committing.
-         * We can drop the flush lock as soon as we have the cil lock
-         * because we are now only comparing contexts protected by
-         * the cil lock.
-         *
         * We need to wait for all previous sequence commits to complete
         * before allowing the force of push_seq to go ahead. Hence block
         * on commits for those as well.
         */
+restart:
        spin_lock(&cil->xc_cil_lock);
-        up_write(&cil->xc_ctx_lock);
        list_for_each_entry(ctx, &cil->xc_committing, committing) {
-                if (ctx->sequence > push_seq)
+                if (ctx->sequence > sequence)
                        continue;
                if (!ctx->commit_lsn) {
                        /*
@@ -681,7 +738,7 @@ restart:
                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
                        goto restart;
                }
-                if (ctx->sequence != push_seq)
+                if (ctx->sequence != sequence)
                        continue;
                /* found it! */
                commit_lsn = ctx->commit_lsn;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965c..edcdfe01617f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,16 +422,17 @@ struct xfs_cil {
        struct rw_semaphore     xc_ctx_lock;
        struct list_head        xc_committing;
        sv_t                    xc_commit_wait;
+        xfs_lsn_t               xc_current_sequence;
 };
 /*
- * The amount of log space we should the CIL to aggregate is difficult to size.
+ * The amount of log space we allow the CIL to aggregate is difficult to size.
- * Whatever we chose we have to make we can get a reservation for the log space
+ * Whatever we choose, we have to make sure we can get a reservation for the
- * effectively, that it is large enough to capture sufficient relogging to
+ * log space effectively, that it is large enough to capture sufficient
- * reduce log buffer IO significantly, but it is not too large for the log or
+ * relogging to reduce log buffer IO significantly, but it is not too large for
- * induces too much latency when writing out through the iclogs. We track both
+ * the log or induces too much latency when writing out through the iclogs. We
- * space consumed and the number of vectors in the checkpoint context, so we
+ * track both space consumed and the number of vectors in the checkpoint
- * need to decide which to use for limiting.
+ * context, so we need to decide which to use for limiting.
 *
 * Every log buffer we write out during a push needs a header reserved, which
 * is at least one sector and more for v2 logs. Hence we need a reservation of
@@ -458,16 +459,21 @@ struct xfs_cil {
 * checkpoint transaction ticket is specific to the checkpoint context, rather
 * than the CIL itself.
 *
- * With dynamic reservations, we can basically make up arbitrary limits for the
+ * With dynamic reservations, we can effectively make up arbitrary limits for
- * checkpoint size so long as they don't violate any other size rules.  Hence
+ * the checkpoint size so long as they don't violate any other size rules.
- * the initial maximum size for the checkpoint transaction will be set to a
+ * Recovery imposes a rule that no transaction exceed half the log, so we are
- * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
+ * limited by that.  Furthermore, the log transaction reservation subsystem
- * right now based on the latency of writing out a large amount of data through
+ * tries to keep 25% of the log free, so we need to keep below that limit or we
- * the circular iclog buffers.
+ * risk running out of free log space to start any new transactions.
+ *
+ * In order to keep background CIL push efficient, we will set a lower
+ * threshold at which background pushing is attempted without blocking current
+ * transaction commits.  A separate, higher bound defines when CIL pushes are
+ * enforced to ensure we stay within our maximum checkpoint size bounds, yet
+ * still give us plenty of space for aggregation on large logs.
 */
+#define XLOG_CIL_SPACE_LIMIT(log)       (log->l_logsize >> 3)
-#define XLOG_CIL_SPACE_LIMIT(log)       \
+#define XLOG_CIL_HARD_SPACE_LIMIT(log)  (3 * (log->l_logsize >> 4))
-        (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
 /*
 * The reservation head lsn is not made up of a cycle number and block number.
@@ -562,8 +568,16 @@ int	xlog_cil_init(struct log *log);
 void    xlog_cil_init_post_recovery(struct log *log);
 void    xlog_cil_destroy(struct log *log);
-int     xlog_cil_push(struct log *log, int push_now);
+/*
-xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+ * CIL force routines
+ */
+xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+static inline void
+xlog_cil_force(struct log *log)
+{
+        xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+}
 /*
 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdca7416c754..1c47edaea0d2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1167,7 +1167,7 @@ xfs_trans_del_item(
 * Unlock all of the items of a transaction and free all the descriptors
 * of that transaction.
 */
-STATIC void
+void
 xfs_trans_free_items(
        struct xfs_trans        *tp,
        xfs_lsn_t               commit_lsn,
@@ -1653,9 +1653,6 @@ xfs_trans_commit_cil(
                return error;
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-        /* xfs_trans_free_items() unlocks them first */
-        xfs_trans_free_items(tp, *commit_lsn, 0);
        xfs_trans_free(tp);
        return 0;
 }
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index e2d93d8ead7b..62da86c90de5 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,7 +25,8 @@ struct xfs_trans;
 void    xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void    xfs_trans_del_item(struct xfs_log_item *);
+void    xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+                                int flags);
 void    xfs_trans_item_committed(struct xfs_log_item *lip,
                                xfs_lsn_t commit_lsn, int aborted);
 void    xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 66d585c6917c..4c7c7bfb2b2f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2299,15 +2299,22 @@ xfs_alloc_file_space(
                        e = allocatesize_fsb;
                }
+                /*
+                 * The transaction reservation is limited to a 32-bit block
+                 * count, hence we need to limit the number of blocks we are
+                 * trying to reserve to avoid an overflow. We can't allocate
+                 * more than @nimaps extents, and an extent is limited on disk
+                 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+                 */
+                resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
                if (unlikely(rt)) {
-                        resrtextents = qblocks = (uint)(e - s);
+                        resrtextents = qblocks = resblks;
                        resrtextents /= mp->m_sb.sb_rextsize;
                        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
                        quota_flag = XFS_QMOPT_RES_RTBLKS;
                } else {
                        resrtextents = 0;
-                        resblks = qblocks = \
+                        resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
-                                XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
                        quota_flag = XFS_QMOPT_RES_REGBLKS;
                }
author	Jens Axboe <jaxboe@fusionio.com>	2010-10-19 03:13:04 -0400
committer	Jens Axboe <jaxboe@fusionio.com>	2010-10-19 03:13:04 -0400
commit	fa251f89903d73989e2f63e13d0eaed1e07ce0da (patch)
tree	3f7fe779941e3b6d67754dd7c44a32f48ea47c74 /fs/xfs
parent	dd3932eddf428571762596e17b65f5dc92ca361b (diff)
parent	cd07202cc8262e1669edff0d97715f3dd9260917 (diff)