aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMel Gorman <mel@csn.ul.ie>2011-03-22 19:33:04 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-03-22 20:44:04 -0400
commit8afdcece4911e51cfff2b50a269418914cab8a3f (patch)
treefcfb966822f0f6c128c754f3876a80106c9cc654
parent7571966189e54adf0a8bc1384d6f13f44052ba63 (diff)
mm: vmscan: kswapd should not free an excessive number of pages when balancing small zones
When reclaiming for order-0 pages, kswapd requires that all zones be balanced. Each cycle through balance_pgdat() does background ageing on all zones if necessary and applies equal pressure on the inactive zone unless a lot of pages are free already. A "lot of free pages" is defined as a "balance gap" above the high watermark which is currently 7*high_watermark. Historically this was reasonable as min_free_kbytes was small. However, on systems using huge pages, it is recommended that min_free_kbytes is higher and it is tuned with hugeadm --set-recommended-min_free_kbytes. With the introduction of transparent huge page support, this recommended value is also applied. On X86-64 with 4G of memory, min_free_kbytes becomes 67584 so one would expect around 68M of memory to be free. The Normal zone is approximately 35000 pages so under even normal memory pressure such as copying a large file, it gets exhausted quickly. As it is getting exhausted, kswapd applies pressure equally to all zones, including the DMA32 zone. DMA32 is approximately 700,000 pages with a high watermark of around 23,000 pages. In this situation, kswapd will reclaim around (23000*8 where 8 is the high watermark + balance gap of 7 * high watermark) pages or 718M of pages before the zone is ignored. What the user sees is that free memory is far higher than it should be. To avoid an excessive number of pages being reclaimed from the larger zones, this patch explicitly defines the "balance gap" to be either 1% of the zone or the low watermark for the zone, whichever is smaller. While kswapd will check all zones to apply pressure, it'll ignore zones that meet the (high_wmark + balance_gap) watermark. To test this, 80G were copied from a partition and the amount of memory being used was recorded. A comparison of a patched and unpatched kernel can be seen at http://www.csn.ul.ie/~mel/postings/minfree-20110222/memory-usage-hydra.ps and shows that kswapd is not reclaiming as much memory with the patch applied. 
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Acked-by: Rik van Riel <riel@redhat.com> Cc: Shaohua Li <shaohua.li@intel.com> Cc: "Chen, Tim C" <tim.c.chen@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/swap.h9
-rw-r--r--mm/vmscan.c16
2 files changed, 22 insertions, 3 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c335055c4253..ed6ebe690f4a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -155,6 +155,15 @@ enum {
155#define SWAP_CLUSTER_MAX 32 155#define SWAP_CLUSTER_MAX 32
156#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX 156#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
157 157
158/*
159 * Ratio between the present memory in the zone and the "gap" that
160 * we're allowing kswapd to shrink in addition to the per-zone high
161 * wmark, even for zones that already have the high wmark satisfied,
162 * in order to provide better per-zone lru behavior. We are ok to
163 * spend not more than 1% of the memory for this zone balancing "gap".
164 */
165#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
166
158#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ 167#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
159#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ 168#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
160#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ 169#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 665b090b6c72..060e4c191403 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2399,6 +2399,7 @@ loop_again:
2399 for (i = 0; i <= end_zone; i++) { 2399 for (i = 0; i <= end_zone; i++) {
2400 struct zone *zone = pgdat->node_zones + i; 2400 struct zone *zone = pgdat->node_zones + i;
2401 int nr_slab; 2401 int nr_slab;
2402 unsigned long balance_gap;
2402 2403
2403 if (!populated_zone(zone)) 2404 if (!populated_zone(zone))
2404 continue; 2405 continue;
@@ -2415,11 +2416,20 @@ loop_again:
2415 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); 2416 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
2416 2417
2417 /* 2418 /*
2418 * We put equal pressure on every zone, unless one 2419 * We put equal pressure on every zone, unless
2419 * zone has way too many pages free already. 2420 * one zone has way too many pages free
2421 * already. The "too many pages" is defined
2422 * as the high wmark plus a "gap" where the
2423 * gap is either the low watermark or 1%
2424 * of the zone, whichever is smaller.
2420 */ 2425 */
2426 balance_gap = min(low_wmark_pages(zone),
2427 (zone->present_pages +
2428 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2429 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2421 if (!zone_watermark_ok_safe(zone, order, 2430 if (!zone_watermark_ok_safe(zone, order,
2422 8*high_wmark_pages(zone), end_zone, 0)) 2431 high_wmark_pages(zone) + balance_gap,
2432 end_zone, 0))
2423 shrink_zone(priority, zone, &sc); 2433 shrink_zone(priority, zone, &sc);
2424 reclaim_state->reclaimed_slab = 0; 2434 reclaim_state->reclaimed_slab = 0;
2425 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2435 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,