author	Rik van Riel <riel@redhat.com>	2009-01-06 17:40:01 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-01-06 18:59:06 -0500
commit	a79311c14eae4bb946a97af25f3e1b17d625985d (patch)
tree	f12391481dfd7a1c3ffcff75bd04d3a88957d16c
parent	ebdd4aea8d736e3b5ce27ab0a26860c9fded341b (diff)
vmscan: bail out of direct reclaim after swap_cluster_max pages
When the VM is under pressure, it can happen that several direct reclaim
processes are in the pageout code simultaneously.  It also happens that
the reclaiming processes run into mostly referenced, mapped and dirty
pages in the first round.

This results in multiple direct reclaim processes having a lower pageout
priority, which corresponds to a higher target of pages to scan.

This in turn can result in each direct reclaim process freeing many
pages.  Together, they can end up freeing way too many pages.

This kicks useful data out of memory (in some cases more than half of
all memory is swapped out).  It also impacts performance by keeping
tasks stuck in the pageout code for too long.

A 30% improvement in hackbench has been observed with this patch.

The fix is relatively simple: in shrink_zone() we can check how many
pages we have already freed; direct reclaim tasks break out of the
scanning loop if they have already freed enough pages and have reached
a lower priority level.

We do not break out of shrink_zone() when priority == DEF_PRIORITY, to
ensure that equal pressure is applied to every zone in the common case.

However, in order to do this we do need to know how many pages we have
already freed, so move nr_reclaimed into scan_control.

akpm: a historical interlude...

We tried this in 2004:

:commit e468e46a9bea3297011d5918663ce6d19094cf87
:Author: akpm <akpm>
:Date: Thu Jun 24 15:53:52 2004 +0000
:
: [PATCH] vmscan.c: dont reclaim too many pages
:
: The shrink_zone() logic can, under some circumstances, cause far too many
: pages to be reclaimed.  Say, we're scanning at high priority and suddenly hit
: a large number of reclaimable pages on the LRU.
:
: Change things so we bale out when SWAP_CLUSTER_MAX pages have been reclaimed.

And we reverted it in 2006:

:commit 210fe530305ee50cd889fe9250168228b2994f32
:Author: Andrew Morton <akpm@osdl.org>
:Date: Fri Jan 6 00:11:14 2006 -0800
:
: [PATCH] vmscan: balancing fix
:
: Revert a patch which went into 2.6.8-rc1.  The changelog for that patch was:
:
:   The shrink_zone() logic can, under some circumstances, cause far too many
:   pages to be reclaimed.  Say, we're scanning at high priority and suddenly
:   hit a large number of reclaimable pages on the LRU.
:
:   Change things so we bale out when SWAP_CLUSTER_MAX pages have been
:   reclaimed.
:
: Problem is, this change caused significant imbalance in inter-zone scan
: balancing by truncating scans of larger zones.
:
: Suppose, for example, ZONE_HIGHMEM is 10x the size of ZONE_NORMAL.  The zone
: balancing algorithm would require that if we're scanning 100 pages of
: ZONE_HIGHMEM, we should scan 10 pages of ZONE_NORMAL.  But this logic will
: cause the scanning of ZONE_HIGHMEM to bale out after only 32 pages are
: reclaimed.  Thus effectively causing smaller zones to be scanned relatively
: harder than large ones.
:
: Now I need to remember what the workload was which caused me to write this
: patch originally, then fix it up in a different way...

And we haven't demonstrated that whatever problem caused that reversion
is not being reintroduced by this change in 2008.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
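In outline, the bail-out the patch adds to shrink_zone() amounts to the
check sketched below.  This is a standalone illustration, not the kernel
code itself: DEF_PRIORITY, nr_reclaimed and swap_cluster_max mirror the
patch, while is_kswapd is a hypothetical stand-in for the kernel's
current_is_kswapd() test.

	/* Standalone sketch of the bail-out condition; illustration only. */
	#include <stdbool.h>

	#define DEF_PRIORITY 12			/* matches the kernel's value */

	struct scan_control {
		unsigned long nr_reclaimed;	/* pages freed so far */
		unsigned long swap_cluster_max;	/* reclaim batch size, usually 32 */
	};

	/* Hypothetical stand-in for the kernel's current_is_kswapd(). */
	static bool is_kswapd;

	static bool should_bail_out(const struct scan_control *sc, int priority)
	{
		/*
		 * Direct reclaimers stop once they have freed a batch of
		 * pages and have dropped below the starting priority;
		 * kswapd never bails, so equal scanning pressure is still
		 * applied to every zone in the common case.
		 */
		return sc->nr_reclaimed > sc->swap_cluster_max &&
		       priority < DEF_PRIORITY && !is_kswapd;
	}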
-rw-r--r--	mm/vmscan.c	62
1 file changed, 33 insertions(+), 29 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ef5a2eeb298..5faa7739487f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,9 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
+	/* Number of pages freed so far during a call to shrink_zones() */
+	unsigned long nr_reclaimed;
+
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -1400,12 +1403,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static unsigned long shrink_zone(int priority, struct zone *zone,
+static void shrink_zone(int priority, struct zone *zone,
 				struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
-	unsigned long nr_reclaimed = 0;
 	unsigned long percent[2];	/* anon @ 0; file @ 1 */
 	enum lru_list l;
 
@@ -1446,10 +1448,21 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 					(unsigned long)sc->swap_cluster_max);
 				nr[l] -= nr_to_scan;
 
-				nr_reclaimed += shrink_list(l, nr_to_scan,
+				sc->nr_reclaimed += shrink_list(l, nr_to_scan,
 							zone, sc, priority);
 			}
 		}
+		/*
+		 * On large memory systems, scan >> priority can become
+		 * really large. This is fine for the starting priority;
+		 * we want to put equal scanning pressure on each zone.
+		 * However, if the VM has a harder time of freeing pages,
+		 * with multiple processes reclaiming pages, the total
+		 * freeing target can get unreasonably large.
+		 */
+		if (sc->nr_reclaimed > sc->swap_cluster_max &&
+			priority < DEF_PRIORITY && !current_is_kswapd())
+			break;
 	}
 
 	/*
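To get a feel for why the new comment worries about scan >> priority,
consider a hypothetical zone holding roughly four million LRU pages
(about 16 GB of 4 KiB pages); the numbers below are assumed for
illustration and do not come from the patch:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed example: ~16 GB of 4 KiB LRU pages in one zone. */
		unsigned long zone_lru_pages = 4UL * 1024 * 1024;

		/* The per-zone scan target is roughly nr >> priority. */
		for (int priority = 12; priority >= 0; priority -= 6)
			printf("priority %2d -> scan target %lu pages\n",
			       priority, zone_lru_pages >> priority);
		return 0;
	}

At DEF_PRIORITY (12) the target is a modest 1024 pages, but a few
priority levels down each direct reclaimer is chasing tens of thousands
of pages, even though a batch of swap_cluster_max (32) pages would
already have satisfied the allocation that triggered reclaim.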
@@ -1462,7 +1475,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
 	throttle_vm_writeout(sc->gfp_mask);
-	return nr_reclaimed;
 }
 
 /*
@@ -1476,16 +1488,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
  * b) The zones may be over pages_high but they must go *over* pages_high to
  *    satisfy the `incremental min' zone defense algorithm.
  *
- * Returns the number of reclaimed pages.
- *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
-	unsigned long nr_reclaimed = 0;
 	struct zoneref *z;
 	struct zone *zone;
 
@@ -1516,10 +1525,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
 							priority);
 		}
 
-		nr_reclaimed += shrink_zone(priority, zone, sc);
+		shrink_zone(priority, zone, sc);
 	}
-
-	return nr_reclaimed;
 }
 
 /*
@@ -1544,7 +1551,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	int priority;
 	unsigned long ret = 0;
 	unsigned long total_scanned = 0;
-	unsigned long nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	struct zoneref *z;
@@ -1572,7 +1578,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zonelist, sc);
+		shrink_zones(priority, zonelist, sc);
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
@@ -1580,13 +1586,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		if (scan_global_lru(sc)) {
 			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
 			if (reclaim_state) {
-				nr_reclaimed += reclaim_state->reclaimed_slab;
+				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 				reclaim_state->reclaimed_slab = 0;
 			}
 		}
 		total_scanned += sc->nr_scanned;
-		if (nr_reclaimed >= sc->swap_cluster_max) {
-			ret = nr_reclaimed;
+		if (sc->nr_reclaimed >= sc->swap_cluster_max) {
+			ret = sc->nr_reclaimed;
 			goto out;
 		}
 
@@ -1609,7 +1615,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	}
 	/* top priority shrink_zones still had more to do? don't OOM, then */
 	if (!sc->all_unreclaimable && scan_global_lru(sc))
-		ret = nr_reclaimed;
+		ret = sc->nr_reclaimed;
 out:
 	/*
 	 * Now that we've scanned all the zones at this priority level, note
@@ -1704,7 +1710,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	int priority;
 	int i;
 	unsigned long total_scanned;
-	unsigned long nr_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -1723,7 +1728,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 
 loop_again:
 	total_scanned = 0;
-	nr_reclaimed = 0;
+	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
@@ -1809,11 +1814,11 @@ loop_again:
 			 */
 			if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
 						end_zone, 0))
-				nr_reclaimed += shrink_zone(priority, zone, &sc);
+				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
-			nr_reclaimed += reclaim_state->reclaimed_slab;
+			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 			if (zone_is_all_unreclaimable(zone))
 				continue;
@@ -1827,7 +1832,7 @@ loop_again:
 			 * even in laptop mode
 			 */
 			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
+			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
 		if (all_zones_ok)
@@ -1845,7 +1850,7 @@ loop_again:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	}
 out:
@@ -1867,7 +1872,7 @@ out:
 		goto loop_again;
 	}
 
-	return nr_reclaimed;
+	return sc.nr_reclaimed;
 }
 
 /*
@@ -2219,7 +2224,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int priority;
-	unsigned long nr_reclaimed = 0;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2252,9 +2256,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		priority = ZONE_RECLAIM_PRIORITY;
 		do {
 			note_zone_scanning_priority(zone, priority);
-			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			shrink_zone(priority, zone, &sc);
 			priority--;
-		} while (priority >= 0 && nr_reclaimed < nr_pages);
+		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
 
 	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2278,13 +2282,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * Update nr_reclaimed by the number of slab pages we
 		 * reclaimed from this zone.
 		 */
-		nr_reclaimed += slab_reclaimable -
+		sc.nr_reclaimed += slab_reclaimable -
 			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
 	}
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-	return nr_reclaimed >= nr_pages;
+	return sc.nr_reclaimed >= nr_pages;
 }
 
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)