Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c     24
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h     19
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c    17
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h    8
-rw-r--r--  fs/xfs/xfs_inode.c              9
-rw-r--r--  fs/xfs/xfs_vnodeops.c         112
-rw-r--r--  fs/xfs/xfs_vnodeops.h           3
7 files changed, 98 insertions, 94 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 5105015a75ad..98e0e86093b4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -387,6 +387,8 @@ _xfs_buf_lookup_pages(
                 if (unlikely(page == NULL)) {
                         if (flags & XBF_READ_AHEAD) {
                                 bp->b_page_count = i;
+                                for (i = 0; i < bp->b_page_count; i++)
+                                        unlock_page(bp->b_pages[i]);
                                 return -ENOMEM;
                         }
 
@@ -416,17 +418,24 @@ _xfs_buf_lookup_pages(
                 ASSERT(!PagePrivate(page));
                 if (!PageUptodate(page)) {
                         page_count--;
-                        if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) {
+                        if (blocksize >= PAGE_CACHE_SIZE) {
+                                if (flags & XBF_READ)
+                                        bp->b_flags |= _XBF_PAGE_LOCKED;
+                        } else if (!PagePrivate(page)) {
                                 if (test_page_region(page, offset, nbytes))
                                         page_count++;
                         }
                 }
 
-                unlock_page(page);
                 bp->b_pages[i] = page;
                 offset = 0;
         }
 
+        if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
+                for (i = 0; i < bp->b_page_count; i++)
+                        unlock_page(bp->b_pages[i]);
+        }
+
         if (page_count == bp->b_page_count)
                 bp->b_flags |= XBF_DONE;
 
@@ -746,6 +755,7 @@ xfs_buf_associate_memory(
         bp->b_count_desired = len;
         bp->b_buffer_length = buflen;
         bp->b_flags |= XBF_MAPPED;
+        bp->b_flags &= ~_XBF_PAGE_LOCKED;
 
         return 0;
 }
@@ -1093,8 +1103,10 @@ _xfs_buf_ioend(
         xfs_buf_t               *bp,
         int                     schedule)
 {
-        if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+        if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+                bp->b_flags &= ~_XBF_PAGE_LOCKED;
                 xfs_buf_ioend(bp, schedule);
+        }
 }
 
 STATIC void
@@ -1125,6 +1137,9 @@ xfs_buf_bio_end_io(
 
                 if (--bvec >= bio->bi_io_vec)
                         prefetchw(&bvec->bv_page->flags);
+
+                if (bp->b_flags & _XBF_PAGE_LOCKED)
+                        unlock_page(page);
         } while (bvec >= bio->bi_io_vec);
 
         _xfs_buf_ioend(bp, 1);
@@ -1163,7 +1178,8 @@ _xfs_buf_ioapply(
          * filesystem block size is not smaller than the page size.
          */
         if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
-            (bp->b_flags & XBF_READ) &&
+            ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
+             (XBF_READ|_XBF_PAGE_LOCKED)) &&
             (blocksize >= PAGE_CACHE_SIZE)) {
                 bio = bio_alloc(GFP_NOIO, 1);
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 841d7883528d..f948ec7ba9a4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -66,6 +66,25 @@ typedef enum {
         _XBF_PAGES = (1 << 18),     /* backed by refcounted pages */
         _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
         _XBF_DELWRI_Q = (1 << 21),  /* buffer on delwri queue */
+
+        /*
+         * Special flag for supporting metadata blocks smaller than a FSB.
+         *
+         * In this case we can have multiple xfs_buf_t on a single page and
+         * need to lock out concurrent xfs_buf_t readers as they only
+         * serialise access to the buffer.
+         *
+         * In the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
+         * between reads of the page. Hence we can have one thread read the
+         * page and modify it, but then race with another thread that thinks
+         * the page is not up-to-date and hence reads it again.
+         *
+         * The result is that the first modification to the page is lost.
+         * This sort of AGF/AGI reading race can happen when unlinking inodes
+         * that require truncation and results in the AGI unlinked list
+         * modifications being lost.
+         */
+        _XBF_PAGE_LOCKED = (1 << 22),
 } xfs_buf_flags_t;
 
 typedef enum {
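The comment above describes the race that motivates the xfs_buf.c hunks earlier in this diff. The sketch below is a minimal userspace model of the resulting unlock policy in _xfs_buf_lookup_pages(); the simplified flag values and the keep_pages_locked() helper are assumptions made for illustration and are not the kernel API.

#include <stdio.h>

#define XBF_READ                (1 << 0)
#define _XBF_PAGE_LOCKED        (1 << 1)        /* pages stay locked until I/O completion */

/*
 * Decide whether a buffer lookup should leave its pages locked.  The kernel
 * only makes this decision for pages that are not already up to date; that
 * detail is omitted from this sketch.
 */
static int keep_pages_locked(unsigned int flags, unsigned long blocksize,
                             unsigned long page_size)
{
        /*
         * Sub-page blocks: several buffers share a page and the buffer lock
         * serialises readers, so the page can be unlocked immediately.
         * Block size >= page size: only the page lock serialises concurrent
         * readers, so hold it across the read and drop it in the bio
         * completion handler instead.
         */
        return (blocksize >= page_size) && (flags & XBF_READ);
}

int main(void)
{
        printf("4096-byte blocks, 4096-byte pages, read: %s\n",
               keep_pages_locked(XBF_READ, 4096, 4096) ?
               "set _XBF_PAGE_LOCKED, unlock at I/O completion" :
               "unlock pages after lookup");
        printf(" 512-byte blocks, 4096-byte pages, read: %s\n",
               keep_pages_locked(XBF_READ, 512, 4096) ?
               "set _XBF_PAGE_LOCKED, unlock at I/O completion" :
               "unlock pages after lookup");
        return 0;
}

With the flag held, xfs_buf_bio_end_io() in the hunks above can unlock each page as its read completes, which is what closes the AGF/AGI update race the comment describes.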
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 65e78c13d4ae..5f60363b9343 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -184,19 +184,24 @@ xfs_file_release(
         return -xfs_release(XFS_I(inode));
 }
 
+/*
+ * We ignore the datasync flag here because a datasync is effectively
+ * identical to an fsync. That is, datasync implies that we need to write
+ * only the metadata needed to be able to access the data that is written
+ * if we crash after the call completes. Hence if we are writing beyond
+ * EOF we have to log the inode size change as well, which makes it a
+ * full fsync. If we don't write beyond EOF, the inode core will be
+ * clean in memory and so we don't need to log the inode, just like
+ * fsync.
+ */
 STATIC int
 xfs_file_fsync(
         struct file             *filp,
         struct dentry           *dentry,
         int                     datasync)
 {
-        int                     flags = FSYNC_WAIT;
-
-        if (datasync)
-                flags |= FSYNC_DATA;
         xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
-        return -xfs_fsync(XFS_I(dentry->d_inode), flags,
-                        (xfs_off_t)0, (xfs_off_t)-1);
+        return -xfs_fsync(XFS_I(dentry->d_inode));
 }
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 9d73cb5c0fc7..25eb2a9e8d9b 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -230,14 +230,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
 #define ATTR_NOSIZETOK  0x400   /* Don't get the SIZE token */
 
 /*
- * Flags to vop_fsync/reclaim.
- */
-#define FSYNC_NOWAIT    0       /* asynchronous flush */
-#define FSYNC_WAIT      0x1     /* synchronous fsync or forced reclaim */
-#define FSYNC_INVAL     0x2     /* flush and invalidate cached data */
-#define FSYNC_DATA      0x4     /* synchronous fsync of data only */
-
-/*
  * Tracking vnode activity.
  */
 #if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index cf0bb9c1d621..e569bf5d6cf0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2974,6 +2974,7 @@ xfs_iflush_cluster(
         xfs_mount_t             *mp = ip->i_mount;
         xfs_perag_t             *pag = xfs_get_perag(mp, ip->i_ino);
         unsigned long           first_index, mask;
+        unsigned long           inodes_per_cluster;
         int                     ilist_size;
         xfs_inode_t             **ilist;
         xfs_inode_t             *iq;
@@ -2985,8 +2986,9 @@ xfs_iflush_cluster(
         ASSERT(pag->pagi_inodeok);
         ASSERT(pag->pag_ici_init);
 
-        ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
-        ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
+        inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
+        ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+        ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
         if (!ilist)
                 return 0;
 
@@ -2995,8 +2997,7 @@ xfs_iflush_cluster(
         read_lock(&pag->pag_ici_lock);
         /* really need a gang lookup range call here */
         nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
-                                        first_index,
-                                        XFS_INODE_CLUSTER_SIZE(mp));
+                                        first_index, inodes_per_cluster);
         if (nr_found == 0)
                 goto out_free;
 
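As a worked example of the arithmetic in the xfs_iflush_cluster() change above, assuming the common layout of 8192-byte inode clusters and 256-byte inodes (so sb_inodelog is 8), the gang lookup now asks for 32 result slots instead of 8192. The sample values below are illustrative assumptions, not fields read from a real superblock.

#include <stdio.h>

int main(void)
{
        unsigned long cluster_size = 8192;      /* stands in for XFS_INODE_CLUSTER_SIZE(mp) */
        unsigned long inodelog = 8;             /* stands in for mp->m_sb.sb_inodelog, log2(256) */
        unsigned long inodes_per_cluster = cluster_size >> inodelog;

        /* the old code sized the lookup list by bytes, the new code by inode count */
        printf("old ilist entries: %lu\n", cluster_size);              /* 8192 */
        printf("new ilist entries: %lu\n", inodes_per_cluster);        /* 32 */
        return 0;
}

The KM_NOFS addition is independent of the sizing fix; it stops the allocation from recursing into filesystem reclaim while the inode flush is in progress.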
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 70702a60b4bb..e475e3717eb3 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -856,18 +856,14 @@ xfs_readlink(
 /*
  * xfs_fsync
  *
- * This is called to sync the inode and its data out to disk.
- * We need to hold the I/O lock while flushing the data, and
- * the inode lock while flushing the inode.  The inode lock CANNOT
- * be held while flushing the data, so acquire after we're done
- * with that.
+ * This is called to sync the inode and its data out to disk.  We need to hold
+ * the I/O lock while flushing the data, and the inode lock while flushing the
+ * inode.  The inode lock CANNOT be held while flushing the data, so acquire
+ * after we're done with that.
  */
 int
 xfs_fsync(
-        xfs_inode_t     *ip,
-        int             flag,
-        xfs_off_t       start,
-        xfs_off_t       stop)
+        xfs_inode_t     *ip)
 {
         xfs_trans_t     *tp;
         int             error;
@@ -875,103 +871,79 @@ xfs_fsync(
 
         xfs_itrace_entry(ip);
 
-        ASSERT(start >= 0 && stop >= -1);
-
         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                 return XFS_ERROR(EIO);
 
-        if (flag & FSYNC_DATA)
-                filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+        /* capture size updates in I/O completion before writing the inode. */
+        error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+        if (error)
+                return XFS_ERROR(error);
 
         /*
-         * We always need to make sure that the required inode state
-         * is safe on disk.  The vnode might be clean but because
-         * of committed transactions that haven't hit the disk yet.
-         * Likewise, there could be unflushed non-transactional
-         * changes to the inode core that have to go to disk.
+         * We always need to make sure that the required inode state is safe on
+         * disk.  The vnode might be clean but we still might need to force the
+         * log because of committed transactions that haven't hit the disk yet.
+         * Likewise, there could be unflushed non-transactional changes to the
+         * inode core that have to go to disk and this requires us to issue
+         * a synchronous transaction to capture these changes correctly.
          *
-         * The following code depends on one assumption: that
-         * any transaction that changes an inode logs the core
-         * because it has to change some field in the inode core
-         * (typically nextents or nblocks).  That assumption
-         * implies that any transactions against an inode will
-         * catch any non-transactional updates.  If inode-altering
-         * transactions exist that violate this assumption, the
-         * code breaks.  Right now, it figures that if the involved
-         * update_* field is clear and the inode is unpinned, the
-         * inode is clean.  Either it's been flushed or it's been
-         * committed and the commit has hit the disk unpinning the inode.
-         * (Note that xfs_inode_item_format() called at commit clears
-         * the update_* fields.)
+         * This code relies on the assumption that if the update_* fields
+         * of the inode are clear and the inode is unpinned then it is clean
+         * and no action is required.
          */
         xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-        /* If we are flushing data then we care about update_size
-         * being set, otherwise we care about update_core
-         */
-        if ((flag & FSYNC_DATA) ?
-                        (ip->i_update_size == 0) :
-                        (ip->i_update_core == 0)) {
+        if (!(ip->i_update_size || ip->i_update_core)) {
                 /*
-                 * Timestamps/size haven't changed since last inode
-                 * flush or inode transaction commit.  That means
-                 * either nothing got written or a transaction
-                 * committed which caught the updates.  If the
-                 * latter happened and the transaction hasn't
-                 * hit the disk yet, the inode will be still
-                 * be pinned.  If it is, force the log.
+                 * Timestamps/size haven't changed since last inode flush or
+                 * inode transaction commit.  That means either nothing got
+                 * written or a transaction committed which caught the updates.
+                 * If the latter happened and the transaction hasn't hit the
+                 * disk yet, the inode will still be pinned.  If it is,
+                 * force the log.
                  */
 
                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
                 if (xfs_ipincount(ip)) {
-                        _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
-                                      XFS_LOG_FORCE |
-                                      ((flag & FSYNC_WAIT)
-                                       ? XFS_LOG_SYNC : 0),
+                        error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+                                      XFS_LOG_FORCE | XFS_LOG_SYNC,
                                       &log_flushed);
                 } else {
                         /*
-                         * If the inode is not pinned and nothing
-                         * has changed we don't need to flush the
-                         * cache.
+                         * If the inode is not pinned and nothing has changed
+                         * we don't need to flush the cache.
                          */
                         changed = 0;
                 }
-                error = 0;
         } else {
                 /*
-                 * Kick off a transaction to log the inode
-                 * core to get the updates.  Make it
-                 * sync if FSYNC_WAIT is passed in (which
-                 * is done by everybody but specfs).  The
-                 * sync transaction will also force the log.
+                 * Kick off a transaction to log the inode core to get the
+                 * updates.  The sync transaction will also force the log.
                  */
                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
-                if ((error = xfs_trans_reserve(tp, 0,
-                                XFS_FSYNC_TS_LOG_RES(ip->i_mount),
-                                0, 0, 0))) {
+                error = xfs_trans_reserve(tp, 0,
+                                XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+                if (error) {
                         xfs_trans_cancel(tp, 0);
                         return error;
                 }
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 
                 /*
-                 * Note - it's possible that we might have pushed
-                 * ourselves out of the way during trans_reserve
-                 * which would flush the inode.  But there's no
-                 * guarantee that the inode buffer has actually
-                 * gone out yet (it's delwri).  Plus the buffer
-                 * could be pinned anyway if it's part of an
-                 * inode in another recent transaction.  So we
-                 * play it safe and fire off the transaction anyway.
+                 * Note - it's possible that we might have pushed ourselves out
+                 * of the way during trans_reserve which would flush the inode.
+                 * But there's no guarantee that the inode buffer has actually
+                 * gone out yet (it's delwri).  Plus the buffer could be pinned
+                 * anyway if it's part of an inode in another recent
+                 * transaction.  So we play it safe and fire off the
+                 * transaction anyway.
                  */
                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                 xfs_trans_ihold(tp, ip);
                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-                if (flag & FSYNC_WAIT)
-                        xfs_trans_set_sync(tp);
+                xfs_trans_set_sync(tp);
                 error = _xfs_trans_commit(tp, 0, &log_flushed);
 
                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
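The rewritten xfs_fsync() above reduces to three outcomes once the FSYNC_* flags are gone: nothing to do, force the log for a pinned but otherwise clean inode, or commit a synchronous transaction for a dirty inode core. The sketch below models only that decision flow; the inode_state structure and the returned action strings are assumptions standing in for the real locking, log force and transaction calls.

#include <stdio.h>
#include <stdbool.h>

struct inode_state {
        bool update_size;       /* size/timestamp updates not yet logged */
        bool update_core;       /* other inode core changes not yet logged */
        bool pinned;            /* committed transaction not yet on disk */
};

static const char *fsync_action(const struct inode_state *ip)
{
        if (!(ip->update_size || ip->update_core)) {
                /* clean core: at most force the log if the inode is pinned */
                if (ip->pinned)
                        return "force the log (XFS_LOG_FORCE | XFS_LOG_SYNC)";
                return "nothing to do";
        }
        /* dirty core: log it with a synchronous XFS_TRANS_FSYNC_TS transaction */
        return "commit a synchronous transaction logging the inode core";
}

int main(void)
{
        struct inode_state clean  = { false, false, false };
        struct inode_state pinned = { false, false, true  };
        struct inode_state dirty  = { true,  false, false };

        printf("clean inode:  %s\n", fsync_action(&clean));
        printf("pinned inode: %s\n", fsync_action(&pinned));
        printf("dirty inode:  %s\n", fsync_action(&dirty));
        return 0;
}

Callers no longer choose between data and metadata sync; xfs_fsync() itself waits for data I/O with filemap_fdatawait() before deciding which of the three paths to take.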
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 8abe8f186e20..57335ba4ce53 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip);
 int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
                 struct cred *credp);
 int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
-                xfs_off_t stop);
+int xfs_fsync(struct xfs_inode *ip);
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,