aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@infradead.org>2011-12-20 15:08:41 -0500
committerBen Myers <bpm@sgi.com>2011-12-23 17:41:47 -0500
commitbe4f1ac828776bbc7868a68b465cd8eedb733cfd (patch)
treee1200a933beeb93702ce0c571a4ca3304f3cab72
parent0b8fd3033c308e4088760aa1d38ce77197b4e074 (diff)
xfs: log all dirty inodes in xfs_fs_sync_fs
Since Linux 2.6.36 the writeback code has introduced various measures for livelock prevention during sync(). Unfortunately some of these are actively harmful for the XFS model, where the inode gets marked dirty for metadata from the data I/O handler. The older_than_this checks are now more strictly enforced since commit "writeback: avoid livelocking WB_SYNC_ALL writeback", which only calls into __writeback_inodes_sb and thus only samples the current cut-off time once. But on a slow enough device the previous asynchronous sync pass might not have fully completed yet, and thus XFS might mark metadata dirty only after the sampling of the cut-off time for the blocking pass has already happened. I have not reproduced this myself on a real system, but by introducing an artificial delay into the XFS I/O completion workqueues it can be reproduced easily. Fix this by iterating over all XFS inodes in ->sync_fs and logging all that are dirty. This might log inodes that only got redirtied after the previous pass, but given how cheap delayed logging of inodes is, it isn't a major concern for performance. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Tested-by: Mark Tinguely <tinguely@sgi.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
-rw-r--r--fs/xfs/xfs_super.c28
-rw-r--r--fs/xfs/xfs_sync.c36
-rw-r--r--fs/xfs/xfs_sync.h2
3 files changed, 42 insertions, 24 deletions
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1add17ca3350..8a899496fd5f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -869,27 +869,6 @@ xfs_fs_dirty_inode(
869} 869}
870 870
871STATIC int 871STATIC int
872xfs_log_inode(
873 struct xfs_inode *ip)
874{
875 struct xfs_mount *mp = ip->i_mount;
876 struct xfs_trans *tp;
877 int error;
878
879 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
880 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
881 if (error) {
882 xfs_trans_cancel(tp, 0);
883 return error;
884 }
885
886 xfs_ilock(ip, XFS_ILOCK_EXCL);
887 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
888 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
889 return xfs_trans_commit(tp, 0);
890}
891
892STATIC int
893xfs_fs_write_inode( 872xfs_fs_write_inode(
894 struct inode *inode, 873 struct inode *inode,
895 struct writeback_control *wbc) 874 struct writeback_control *wbc)
@@ -902,8 +881,6 @@ xfs_fs_write_inode(
902 881
903 if (XFS_FORCED_SHUTDOWN(mp)) 882 if (XFS_FORCED_SHUTDOWN(mp))
904 return -XFS_ERROR(EIO); 883 return -XFS_ERROR(EIO);
905 if (!ip->i_update_core)
906 return 0;
907 884
908 if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { 885 if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
909 /* 886 /*
@@ -913,11 +890,14 @@ xfs_fs_write_inode(
913 * ->sync_fs call do that for us, which reduces the number 890 * ->sync_fs call do that for us, which reduces the number
914 * of synchronous log forces dramatically. 891 * of synchronous log forces dramatically.
915 */ 892 */
916 error = xfs_log_inode(ip); 893 error = xfs_log_dirty_inode(ip, NULL, 0);
917 if (error) 894 if (error)
918 goto out; 895 goto out;
919 return 0; 896 return 0;
920 } else { 897 } else {
898 if (!ip->i_update_core)
899 return 0;
900
921 /* 901 /*
922 * We make this non-blocking if the inode is contended, return 902 * We make this non-blocking if the inode is contended, return
923 * EAGAIN to indicate to the caller that they did not succeed. 903 * EAGAIN to indicate to the caller that they did not succeed.
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index be5c51d8f757..f0994aedcd15 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -336,6 +336,32 @@ xfs_sync_fsdata(
336 return error; 336 return error;
337} 337}
338 338
339int
340xfs_log_dirty_inode(
341 struct xfs_inode *ip,
342 struct xfs_perag *pag,
343 int flags)
344{
345 struct xfs_mount *mp = ip->i_mount;
346 struct xfs_trans *tp;
347 int error;
348
349 if (!ip->i_update_core)
350 return 0;
351
352 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
353 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
354 if (error) {
355 xfs_trans_cancel(tp, 0);
356 return error;
357 }
358
359 xfs_ilock(ip, XFS_ILOCK_EXCL);
360 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
361 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
362 return xfs_trans_commit(tp, 0);
363}
364
339/* 365/*
340 * When remounting a filesystem read-only or freezing the filesystem, we have 366 * When remounting a filesystem read-only or freezing the filesystem, we have
341 * two phases to execute. This first phase is syncing the data before we 367 * two phases to execute. This first phase is syncing the data before we
@@ -359,6 +385,16 @@ xfs_quiesce_data(
359{ 385{
360 int error, error2 = 0; 386 int error, error2 = 0;
361 387
388 /*
389 * Log all pending size and timestamp updates. The vfs writeback
390 * code is supposed to do this, but due to its overaggressive
391 * livelock detection it will skip inodes where appending writes
392 * were written out in the first non-blocking sync phase if their
393 * completion took long enough that it happened after taking the
394 * timestamp for the cut-off in the blocking phase.
395 */
396 xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
397
362 xfs_qm_sync(mp, SYNC_TRYLOCK); 398 xfs_qm_sync(mp, SYNC_TRYLOCK);
363 xfs_qm_sync(mp, SYNC_WAIT); 399 xfs_qm_sync(mp, SYNC_WAIT);
364 400
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e7ac6e..fa965479d788 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -34,6 +34,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
34 34
35void xfs_flush_inodes(struct xfs_inode *ip); 35void xfs_flush_inodes(struct xfs_inode *ip);
36 36
37int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
38
37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 39int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38int xfs_reclaim_inodes_count(struct xfs_mount *mp); 40int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 41void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);