aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2014-06-04 19:07:35 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-06-04 19:54:01 -0400
commit675becce15f320337499bc1a9356260409a5ba29 (patch)
treecfd83d7630aff3cee016910afff4d663e8ba3c33 /mm
parentf98bafa06a28fdfdd5c49f820f4d6560f636fc46 (diff)
mm: vmscan: do not throttle based on pfmemalloc reserves if node has no ZONE_NORMAL
throttle_direct_reclaim() is meant to trigger during swap-over-network during which the min watermark is treated as a pfmemalloc reserve. It throttles on the first node in the zonelist but this is flawed. The user-visible impact is that a process running on CPU whose local memory node has no ZONE_NORMAL will stall for prolonged periods of time, possibly indefinitely. This is due to throttle_direct_reclaim thinking the pfmemalloc reserves are depleted when in fact they don't exist on that node. On a NUMA machine running a 32-bit kernel (I know) allocation requests from CPUs on node 1 would detect no pfmemalloc reserves and the process gets throttled. This patch adjusts throttling of direct reclaim to throttle based on the first node in the zonelist that has a usable ZONE_NORMAL or lower zone. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/vmscan.c43
1 file changed, 37 insertions, 6 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fbcf46076c4f..53e4534885ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2537,10 +2537,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2537 2537
2538 for (i = 0; i <= ZONE_NORMAL; i++) { 2538 for (i = 0; i <= ZONE_NORMAL; i++) {
2539 zone = &pgdat->node_zones[i]; 2539 zone = &pgdat->node_zones[i];
2540 if (!populated_zone(zone))
2541 continue;
2542
2540 pfmemalloc_reserve += min_wmark_pages(zone); 2543 pfmemalloc_reserve += min_wmark_pages(zone);
2541 free_pages += zone_page_state(zone, NR_FREE_PAGES); 2544 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2542 } 2545 }
2543 2546
2547 /* If there are no reserves (unexpected config) then do not throttle */
2548 if (!pfmemalloc_reserve)
2549 return true;
2550
2544 wmark_ok = free_pages > pfmemalloc_reserve / 2; 2551 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2545 2552
2546 /* kswapd must be awake if processes are being throttled */ 2553 /* kswapd must be awake if processes are being throttled */
@@ -2565,9 +2572,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2565static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 2572static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2566 nodemask_t *nodemask) 2573 nodemask_t *nodemask)
2567{ 2574{
2575 struct zoneref *z;
2568 struct zone *zone; 2576 struct zone *zone;
2569 int high_zoneidx = gfp_zone(gfp_mask); 2577 pg_data_t *pgdat = NULL;
2570 pg_data_t *pgdat;
2571 2578
2572 /* 2579 /*
2573 * Kernel threads should not be throttled as they may be indirectly 2580 * Kernel threads should not be throttled as they may be indirectly
@@ -2586,10 +2593,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2586 if (fatal_signal_pending(current)) 2593 if (fatal_signal_pending(current))
2587 goto out; 2594 goto out;
2588 2595
2589 /* Check if the pfmemalloc reserves are ok */ 2596 /*
2590 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); 2597 * Check if the pfmemalloc reserves are ok by finding the first node
2591 pgdat = zone->zone_pgdat; 2598 * with a usable ZONE_NORMAL or lower zone. The expectation is that
2592 if (pfmemalloc_watermark_ok(pgdat)) 2599 * GFP_KERNEL will be required for allocating network buffers when
2600 * swapping over the network so ZONE_HIGHMEM is unusable.
2601 *
2602 * Throttling is based on the first usable node and throttled processes
2603 * wait on a queue until kswapd makes progress and wakes them. There
2604 * is an affinity then between processes waking up and where reclaim
2605 * progress has been made assuming the process wakes on the same node.
2606 * More importantly, processes running on remote nodes will not compete
2607 * for remote pfmemalloc reserves and processes on different nodes
2608 * should make reasonable progress.
2609 */
2610 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2611 gfp_mask, nodemask) {
2612 if (zone_idx(zone) > ZONE_NORMAL)
2613 continue;
2614
2615 /* Throttle based on the first usable node */
2616 pgdat = zone->zone_pgdat;
2617 if (pfmemalloc_watermark_ok(pgdat))
2618 goto out;
2619 break;
2620 }
2621
2622 /* If no zone was usable by the allocation flags then do not throttle */
2623 if (!pgdat)
2593 goto out; 2624 goto out;
2594 2625
2595 /* Account for the throttling */ 2626 /* Account for the throttling */