author     Dave Chinner <dchinner@redhat.com>   2011-04-07 22:45:07 -0400
committer  Dave Chinner <david@fromorbit.com>   2011-04-07 22:45:07 -0400
commit     a7b339f1b8698667eada006e717cdb4523be2ed5 (patch)
tree       77c44400c32284bdcf15829e10d01eb15ddd1d41 /fs/xfs
parent     89e4cb550a492cfca038a555fcc1bdac58822ec3 (diff)
xfs: introduce background inode reclaim work
Background inode reclaim needs to run more frequently than the XFS syncd work is run, as 30s is too long between optimal reclaim runs. Add a new periodic work item to the xfs syncd workqueue to run a fast, non-blocking inode reclaim scan.

Background inode reclaim is kicked by the act of marking inodes for reclaim. When an AG is first marked as having reclaimable inodes, the background reclaim work is kicked. It will continue to run periodically until it detects that there are no more reclaimable inodes. It will be kicked again when the first inode is queued for reclaim.

To ensure shrinker-based inode reclaim throttles to the inode cleaning and reclaim rate but still reclaims inodes efficiently, make it kick the background inode reclaim so that when we are low on memory we are trying to reclaim inodes as efficiently as possible. This kick should not be necessary, but it will protect against failures to kick the background reclaim when inodes are first dirtied.

To provide the rate throttling, make the shrinker pass do synchronous inode reclaim so that it blocks on inodes under IO. This means that the shrinker will reclaim inodes rather than just skipping over them, but it does not adversely affect the rate of reclaim because most dirty inodes are already under IO due to the background reclaim work the shrinker kicked.

These two modifications solve one of the two OOM killer invocations Chris Mason reported recently when running a stress testing script. The particular workload trigger for the OOM killer invocation is where there are more threads than CPUs all unlinking files in an extremely memory constrained environment. Unlike other solutions, this one does not have a performance impact when memory is not constrained or the number of concurrent threads operating is <= the number of CPUs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
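The change is built on the standard self-rearming delayed work pattern: a kick function queues the work only when there is something to reclaim, and the worker does one fast, non-blocking pass and then re-queues itself. With the default xfs syncd period of 30s (xfs_syncd_centisecs = 3000), the expression msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10) used in the patch works out to 5000ms, i.e. the 5s period the new comment describes. Below is a minimal, standalone sketch of that pattern only; the demo_* names and the module wrapper are hypothetical and not part of this patch, whose real implementation (xfs_syncd_queue_reclaim(), xfs_reclaim_worker(), m_reclaim_work queued on xfs_syncd_wq) appears in the diff further down.

/*
 * Minimal sketch of the self-rearming delayed work pattern used by this
 * patch.  All demo_* identifiers are hypothetical stand-ins; see the diff
 * below for the real XFS code.
 */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_work;

/* stand-in for the radix tree reclaim-tag check in xfs_syncd_queue_reclaim() */
static bool demo_have_reclaimable(void)
{
	return true;
}

/* Queue a pass only when there is work to do; cheap enough to call often. */
static void demo_queue_work(void)
{
	if (demo_have_reclaimable())
		queue_delayed_work(demo_wq, &demo_work,
				   msecs_to_jiffies(5000));	/* ~5s period */
}

/* One fast, non-blocking pass, then re-arm for the next period. */
static void demo_worker(struct work_struct *work)
{
	/* stand-in for xfs_reclaim_inodes(mp, SYNC_TRYLOCK) */
	pr_info("demo: fast reclaim pass\n");
	demo_queue_work();
}

static int __init demo_init(void)
{
	demo_wq = create_workqueue("demo_reclaim");
	if (!demo_wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&demo_work, demo_worker);
	demo_queue_work();	/* also kicked whenever work first appears */
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_work);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");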
Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c   69
-rw-r--r--  fs/xfs/xfs_mount.h             1
2 files changed, 67 insertions, 3 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index af3275965c7..debe2822c93 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -461,7 +461,6 @@ xfs_sync_worker(
 			error = xfs_fs_log_dummy(mp);
 		else
 			xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, 0);
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 	}
 
@@ -470,6 +469,52 @@ xfs_sync_worker(
 }
 
 /*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_syncd_queue_reclaim(
+	struct xfs_mount	*mp)
+{
+
+	/*
+	 * We can have inodes enter reclaim after we've shut down the syncd
+	 * workqueue during unmount, so don't allow reclaim work to be queued
+	 * during unmount.
+	 */
+	if (!(mp->m_super->s_flags & MS_ACTIVE))
+		return;
+
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+STATIC void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_syncd_queue_reclaim(mp);
+}
+
+/*
  * Flush delayed allocate data, attempting to free up reserved space
  * from existing allocations. At this point a new allocation attempt
  * has failed with ENOSPC and we are in the process of scratching our
@@ -508,7 +553,10 @@ xfs_syncd_init(
 {
 	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
 	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
 	xfs_syncd_queue_sync(mp);
+	xfs_syncd_queue_reclaim(mp);
 
 	return 0;
 }
@@ -518,6 +566,7 @@ xfs_syncd_stop(
 	struct xfs_mount	*mp)
 {
 	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	cancel_work_sync(&mp->m_flush_work);
 }
 
@@ -537,6 +586,10 @@ __xfs_inode_set_reclaim_tag(
 				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 				XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_syncd_queue_reclaim(ip->i_mount);
+
 		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 							-1, _RET_IP_);
 	}
@@ -953,7 +1006,13 @@ xfs_reclaim_inodes(
 }
 
 /*
- * Shrinker infrastructure.
+ * Inode cache shrinker.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
  */
 static int
 xfs_reclaim_inode_shrink(
@@ -968,10 +1027,14 @@ xfs_reclaim_inode_shrink(
 
 	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
+		/* kick background reclaimer */
+		xfs_syncd_queue_reclaim(mp);
+
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
+		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
+					&nr_to_scan);
 		/* terminate if we don't exhaust the scan */
 		if (nr_to_scan > 0)
 			return -1;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a0ad90e9529..19af0ab0d0c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -204,6 +204,7 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;	/* per-mount filestream data */
 	struct delayed_work	m_sync_work;	/* background sync work */
+	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
 	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
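The shrinker side of the change (the last xfs_sync.c hunk above) follows a simple control flow: kick the asynchronous background pass, bail out if the allocation context cannot recurse into the filesystem, then do a synchronous scan so the caller is throttled by the real rate at which dirty inodes can be cleaned. A compact sketch of that flow follows; the demo_* helpers are hypothetical stubs standing in for xfs_syncd_queue_reclaim() and the SYNC_TRYLOCK | SYNC_WAIT reclaim walk, and the real callback is xfs_reclaim_inode_shrink() above.

/*
 * Sketch of the shrinker callback control flow introduced by this patch.
 * demo_* helpers are hypothetical stubs; see xfs_reclaim_inode_shrink()
 * in the diff above for the real code.
 */
#include <linux/gfp.h>
#include <linux/types.h>

static void demo_kick_background(void)
{
	/* stand-in for xfs_syncd_queue_reclaim(mp) */
}

static void demo_reclaim_sync(int *nr_to_scan)
{
	/* stand-in for xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, ...) */
	*nr_to_scan = 0;
}

static int demo_reclaimable_count(void)
{
	return 0;	/* stand-in for the per-AG reclaimable inode count */
}

static int demo_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		/* make sure the cheap background pass is already running */
		demo_kick_background();

		/* cannot recurse into the filesystem from this context */
		if (!(gfp_mask & __GFP_FS))
			return -1;

		/* blocking pass: waits on dirty inodes under IO */
		demo_reclaim_sync(&nr_to_scan);

		/* terminate if we don't exhaust the scan */
		if (nr_to_scan > 0)
			return -1;
	}
	return demo_reclaimable_count();
}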