aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2011-10-31 20:07:56 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-10-31 20:30:46 -0400
commit92df3a723f84cdf8133560bbff950a7a99e92bc9 (patch)
tree503efc278236d877508da66ea7ec7cbb81203d64
parentf84f6e2b0868f198f97a32ba503d6f9f319a249a (diff)
mm: vmscan: throttle reclaim if encountering too many dirty pages under writeback
Workloads that are allocating frequently and writing files place a large number of dirty pages on the LRU. With use-once logic, it is possible for them to reach the end of the LRU quickly, requiring the reclaimer to scan more to find clean pages. Ordinarily, processes that are dirtying memory will get throttled by dirty balancing, but this is a global heuristic and does not take into account that LRUs are maintained on a per-zone basis. This can lead to a situation whereby reclaim is scanning heavily, skipping over a large number of pages under writeback and recycling them around the LRU, consuming CPU. This patch checks how many of the pages isolated from the LRU were dirty and under writeback. If a sufficient percentage of them are under writeback, the process will be throttled if a backing device or the zone is congested. Note that this applies whether it is anonymous or file-backed pages that are under writeback, meaning that swapping is potentially throttled. This is intentional due to the fact that if the swap device is congested, scanning more pages and dispatching more IO is not going to help matters. The percentage that must be in writeback depends on the priority. At default priority, all of them must be under writeback. At DEF_PRIORITY-1, 50% of them must be; at DEF_PRIORITY-2, 25%; and so on. That is, as pressure increases, so does the likelihood that the process will be throttled, which allows the flusher threads to make some progress.
Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Acked-by: Johannes Weiner <jweiner@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Jan Kara <jack@suse.cz> Cc: Rik van Riel <riel@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Alex Elder <aelder@sgi.com> Cc: Theodore Ts'o <tytso@mit.edu> Cc: Chris Mason <chris.mason@oracle.com> Cc: Dave Hansen <dave@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/vmscan.c42
1 files changed, 39 insertions, 3 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 15e3a29fdb23..7b0573f33a27 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -751,7 +751,9 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
751static unsigned long shrink_page_list(struct list_head *page_list, 751static unsigned long shrink_page_list(struct list_head *page_list,
752 struct zone *zone, 752 struct zone *zone,
753 struct scan_control *sc, 753 struct scan_control *sc,
754 int priority) 754 int priority,
755 unsigned long *ret_nr_dirty,
756 unsigned long *ret_nr_writeback)
755{ 757{
756 LIST_HEAD(ret_pages); 758 LIST_HEAD(ret_pages);
757 LIST_HEAD(free_pages); 759 LIST_HEAD(free_pages);
@@ -759,6 +761,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
759 unsigned long nr_dirty = 0; 761 unsigned long nr_dirty = 0;
760 unsigned long nr_congested = 0; 762 unsigned long nr_congested = 0;
761 unsigned long nr_reclaimed = 0; 763 unsigned long nr_reclaimed = 0;
764 unsigned long nr_writeback = 0;
762 765
763 cond_resched(); 766 cond_resched();
764 767
@@ -795,6 +798,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
795 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 798 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
796 799
797 if (PageWriteback(page)) { 800 if (PageWriteback(page)) {
801 nr_writeback++;
798 /* 802 /*
799 * Synchronous reclaim cannot queue pages for 803 * Synchronous reclaim cannot queue pages for
800 * writeback due to the possibility of stack overflow 804 * writeback due to the possibility of stack overflow
@@ -1000,6 +1004,8 @@ keep_lumpy:
1000 1004
1001 list_splice(&ret_pages, page_list); 1005 list_splice(&ret_pages, page_list);
1002 count_vm_events(PGACTIVATE, pgactivate); 1006 count_vm_events(PGACTIVATE, pgactivate);
1007 *ret_nr_dirty += nr_dirty;
1008 *ret_nr_writeback += nr_writeback;
1003 return nr_reclaimed; 1009 return nr_reclaimed;
1004} 1010}
1005 1011
@@ -1460,6 +1466,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1460 unsigned long nr_taken; 1466 unsigned long nr_taken;
1461 unsigned long nr_anon; 1467 unsigned long nr_anon;
1462 unsigned long nr_file; 1468 unsigned long nr_file;
1469 unsigned long nr_dirty = 0;
1470 unsigned long nr_writeback = 0;
1463 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; 1471 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1464 1472
1465 while (unlikely(too_many_isolated(zone, file, sc))) { 1473 while (unlikely(too_many_isolated(zone, file, sc))) {
@@ -1512,12 +1520,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1512 1520
1513 spin_unlock_irq(&zone->lru_lock); 1521 spin_unlock_irq(&zone->lru_lock);
1514 1522
1515 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority); 1523 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
1524 &nr_dirty, &nr_writeback);
1516 1525
1517 /* Check if we should syncronously wait for writeback */ 1526 /* Check if we should syncronously wait for writeback */
1518 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1527 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1519 set_reclaim_mode(priority, sc, true); 1528 set_reclaim_mode(priority, sc, true);
1520 nr_reclaimed += shrink_page_list(&page_list, zone, sc, priority); 1529 nr_reclaimed += shrink_page_list(&page_list, zone, sc,
1530 priority, &nr_dirty, &nr_writeback);
1521 } 1531 }
1522 1532
1523 local_irq_disable(); 1533 local_irq_disable();
@@ -1527,6 +1537,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1527 1537
1528 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1538 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1529 1539
1540 /*
1541 * If reclaim is isolating dirty pages under writeback, it implies
1542 * that the long-lived page allocation rate is exceeding the page
1543 * laundering rate. Either the global limits are not being effective
1544 * at throttling processes due to the page distribution throughout
1545 * zones or there is heavy usage of a slow backing device. The
1546 * only option is to throttle from reclaim context which is not ideal
1547 * as there is no guarantee the dirtying process is throttled in the
1548 * same way balance_dirty_pages() manages.
1549 *
1550 * This scales the number of dirty pages that must be under writeback
1551 * before throttling depending on priority. It is a simple backoff
1552 * function that has the most effect in the range DEF_PRIORITY to
1553 * DEF_PRIORITY-2 which is the priority range at which reclaim is
1554 * considered to be in trouble.
1555 *
1556 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
1557 * DEF_PRIORITY-1 50% must be PageWriteback
1558 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
1559 * ...
1560 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1561 * isolated page is PageWriteback
1562 */
1563 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
1564 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1565
1530 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1566 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1531 zone_idx(zone), 1567 zone_idx(zone),
1532 nr_scanned, nr_reclaimed, 1568 nr_scanned, nr_reclaimed,