path: root/mm/page_alloc.c
author     Mel Gorman <mel@csn.ul.ie>                        2011-01-13 18:45:41 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2011-01-13 20:32:31 -0500
commit     88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 (patch)
tree       6f39beef8cf918eb2ca9f64ae1bcd1ea79ca487a /mm/page_alloc.c
parent     43bb40c9e3aa51a3b038c9df2c9afb4d4685614d (diff)
mm: page allocator: adjust the per-cpu counter threshold when memory is low
Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To avoid synchronization overhead, these counters are maintained on a per-cpu basis and drained both periodically and when the delta is above a threshold. On large CPU systems, the difference between the estimate and the real value of NR_FREE_PAGES can be very high. The system can get into a state where pages are allocated far below the min watermark, potentially causing livelock. That commit solved the problem by taking a more accurate reading of NR_FREE_PAGES when memory was low. Unfortunately, as reported by Shaohua Li, this accurate reading can consume a large amount of CPU time on systems with many sockets due to cache line bouncing.

This patch takes a different approach. On large machines where counter drift might be unsafe, the per-cpu thresholds for the target pgdat are reduced while kswapd is awake, limiting the drift to what should be a safe level. This incurs a performance penalty under heavy memory pressure, by a factor that depends on the workload and the machine, but the machine functions correctly without accidentally exhausting all memory on a node. There is an additional cost when kswapd wakes and sleeps, but the event is not expected to be frequent; in Shaohua's test case, at least one sleep and wake event was recorded.

To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is introduced that takes a more accurate reading of NR_FREE_PAGES. It is used in wakeup_kswapd(), when deciding whether it is really safe to go back to sleep in sleeping_prematurely(), and when deciding whether a zone is really balanced in balance_pgdat(). We are still using an expensive function, but we limit how often it is called.

When the test case is reproduced, the time spent in the watermark functions is reduced. The report below gives the percentage of time cumulatively spent in the functions zone_nr_free_pages(), zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot() and zone_page_state():

vanilla              11.6615%
disable-threshold     0.2584%

David said:

: We had to pull aa454840 "mm: page allocator: calculate a better estimate
: of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36
: internally because tests showed that it would cause the machine to stall
: as the result of heavy kswapd activity. I merged it back with this fix as
: it is pending in the -mm tree and it solves the issue we were seeing, so I
: definitely think this should be pushed to -stable (and I would seriously
: consider it for 2.6.37 inclusion even at this late date).

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reported-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Tested-by: Nicolas Bareil <nico@chdir.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: <stable@kernel.org> [2.6.37.1, 2.6.36.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
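The heart of the change is paying for an accurate counter read only when the cheap read sits close enough to a watermark for per-cpu drift to change the answer. The following is a minimal, self-contained C sketch of that pattern, not kernel code: the drifty_counter type, the NR_CPUS value and the read_fast()/read_snapshot()/watermark_ok_safe() helpers are invented for illustration, loosely mirroring zone_page_state(), zone_page_state_snapshot() and zone_watermark_ok_safe().

#include <stdio.h>

#define NR_CPUS 4

/* A global counter plus per-cpu deltas that have not been drained yet. */
struct drifty_counter {
        long global;                    /* cheap, possibly stale value */
        long pcpu_delta[NR_CPUS];       /* per-cpu updates not yet folded in */
};

/* Fast read: just the global value; may drift by up to
 * NR_CPUS * per-cpu-threshold pages. */
static long read_fast(const struct drifty_counter *c)
{
        return c->global;
}

/* Accurate read: fold in every per-cpu delta, touching one cache line
 * per cpu -- the expensive part on many-socket machines. */
static long read_snapshot(const struct drifty_counter *c)
{
        long val = c->global;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                val += c->pcpu_delta[cpu];
        return val;
}

/* Safe watermark check in the style of zone_watermark_ok_safe(): take
 * the expensive snapshot only when the cheap value is below the drift
 * mark, i.e. close enough to the watermark for drift to matter. */
static int watermark_ok_safe(const struct drifty_counter *c,
                             long mark, long drift_mark)
{
        long free_pages = read_fast(c);

        if (free_pages < drift_mark)
                free_pages = read_snapshot(c);
        return free_pages > mark;
}

int main(void)
{
        struct drifty_counter c = { .global = 120,
                                    .pcpu_delta = { -30, -40, -25, -20 } };

        /* The fast reading says 120 free pages; the truth is 5. */
        printf("fast=%ld snapshot=%ld\n", read_fast(&c), read_snapshot(&c));

        /* With mark=64 and drift_mark=128, the safe check takes the
         * snapshot and correctly reports the watermark as breached. */
        printf("watermark_ok=%d\n", watermark_ok_safe(&c, 64, 128));
        return 0;
}

The drift_mark parameter plays the role of z->percpu_drift_mark: above it, even worst-case per-cpu drift cannot pull the true value below the watermark, so the stale global counter can be trusted without touching remote cache lines.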
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--    mm/page_alloc.c    35
1 file changed, 27 insertions(+), 8 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 826ba6922e84..22a1bb7723e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
+	free_pages -= (1 << order) + 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		return 0;
+		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min >>= 1;
 
 		if (free_pages <= min)
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+								free_pages);
+}
 
 #ifdef CONFIG_NUMA
@@ -2442,7 +2461,7 @@ void show_free_areas(void)
2442 " all_unreclaimable? %s" 2461 " all_unreclaimable? %s"
2443 "\n", 2462 "\n",
2444 zone->name, 2463 zone->name,
2445 K(zone_nr_free_pages(zone)), 2464 K(zone_page_state(zone, NR_FREE_PAGES)),
2446 K(min_wmark_pages(zone)), 2465 K(min_wmark_pages(zone)),
2447 K(low_wmark_pages(zone)), 2466 K(low_wmark_pages(zone)),
2448 K(high_wmark_pages(zone)), 2467 K(high_wmark_pages(zone)),
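As context for the hunks above, the order loop in __zone_watermark_ok() (which this patch does not alter beyond where free_pages comes from) discards the free pages that are too small for the request at each step while halving the reserve it demands. Below is a hedged userspace model of that loop; lowmem_reserve and the ALLOC_HIGH/ALLOC_HARDER adjustments are omitted, and the free_area layout and numbers are made up for the example.

#include <stdio.h>

#define MAX_ORDER 11

/* Simplified restatement of the __zone_watermark_ok() loop:
 * free_pages counts all free pages, free_area[o] counts free
 * blocks of size 2^o pages. */
static int watermark_ok(long free_pages, const long *free_area,
                        int order, long mark)
{
        long min = mark;
        int o;

        free_pages -= (1 << order) + 1;         /* as in the patch above */
        if (free_pages <= min)
                return 0;
        for (o = 0; o < order; o++) {
                /* Blocks of this order are useless for the request... */
                free_pages -= free_area[o] << o;
                /* ...but the reserve we demand also halves per order. */
                min >>= 1;
                if (free_pages <= min)
                        return 0;
        }
        return 1;
}

int main(void)
{
        /* 100 free pages: 60 as single pages, 20 as order-1 pairs. */
        long free_area[MAX_ORDER] = { 60, 20 };
        long free_pages = 60 * 1 + 20 * 2;

        /* An order-0 request passes easily against a mark of 32... */
        printf("order 0: %d\n", watermark_ok(free_pages, free_area, 0, 32));
        /* ...but an order-2 request must be satisfiable from blocks of
         * order >= 2 alone, and there are none, so it fails. */
        printf("order 2: %d\n", watermark_ok(free_pages, free_area, 2, 32));
        return 0;
}

Running it, the order-0 request passes while the order-2 request fails: once the order-0 and order-1 blocks are subtracted, nothing is left above the halved reserve, which is exactly the fragmentation case the per-order loop exists to catch.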