diff options
author | Mel Gorman <mgorman@suse.de> | 2014-06-04 19:07:35 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-04 19:54:01 -0400 |
commit | 675becce15f320337499bc1a9356260409a5ba29 (patch) | |
tree | cfd83d7630aff3cee016910afff4d663e8ba3c33 /mm | |
parent | f98bafa06a28fdfdd5c49f820f4d6560f636fc46 (diff) |
mm: vmscan: do not throttle based on pfmemalloc reserves if node has no ZONE_NORMAL
throttle_direct_reclaim() is meant to trigger during swap-over-network
during which the min watermark is treated as a pfmemalloc reserve. It
throttles on the first node in the zonelist but this is flawed.
The user-visible impact is that a process running on CPU whose local
memory node has no ZONE_NORMAL will stall for prolonged periods of time,
possibly indefinitely. This is due to throttle_direct_reclaim thinking the
pfmemalloc reserves are depleted when in fact they don't exist on that
node.
On a NUMA machine running a 32-bit kernel (I know) allocation requests
from CPUs on node 1 would detect no pfmemalloc reserves and the process
gets throttled. This patch adjusts throttling of direct reclaim to
throttle based on the first node in the zonelist that has a usable
ZONE_NORMAL or lower zone.
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/vmscan.c | 43 |
1 files changed, 37 insertions, 6 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index fbcf46076c4f..53e4534885ad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2537,10 +2537,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2537 | 2537 | ||
2538 | for (i = 0; i <= ZONE_NORMAL; i++) { | 2538 | for (i = 0; i <= ZONE_NORMAL; i++) { |
2539 | zone = &pgdat->node_zones[i]; | 2539 | zone = &pgdat->node_zones[i]; |
2540 | if (!populated_zone(zone)) | ||
2541 | continue; | ||
2542 | |||
2540 | pfmemalloc_reserve += min_wmark_pages(zone); | 2543 | pfmemalloc_reserve += min_wmark_pages(zone); |
2541 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | 2544 | free_pages += zone_page_state(zone, NR_FREE_PAGES); |
2542 | } | 2545 | } |
2543 | 2546 | ||
2547 | /* If there are no reserves (unexpected config) then do not throttle */ | ||
2548 | if (!pfmemalloc_reserve) | ||
2549 | return true; | ||
2550 | |||
2544 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | 2551 | wmark_ok = free_pages > pfmemalloc_reserve / 2; |
2545 | 2552 | ||
2546 | /* kswapd must be awake if processes are being throttled */ | 2553 | /* kswapd must be awake if processes are being throttled */ |
@@ -2565,9 +2572,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2565 | static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | 2572 | static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, |
2566 | nodemask_t *nodemask) | 2573 | nodemask_t *nodemask) |
2567 | { | 2574 | { |
2575 | struct zoneref *z; | ||
2568 | struct zone *zone; | 2576 | struct zone *zone; |
2569 | int high_zoneidx = gfp_zone(gfp_mask); | 2577 | pg_data_t *pgdat = NULL; |
2570 | pg_data_t *pgdat; | ||
2571 | 2578 | ||
2572 | /* | 2579 | /* |
2573 | * Kernel threads should not be throttled as they may be indirectly | 2580 | * Kernel threads should not be throttled as they may be indirectly |
@@ -2586,10 +2593,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2586 | if (fatal_signal_pending(current)) | 2593 | if (fatal_signal_pending(current)) |
2587 | goto out; | 2594 | goto out; |
2588 | 2595 | ||
2589 | /* Check if the pfmemalloc reserves are ok */ | 2596 | /* |
2590 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | 2597 | * Check if the pfmemalloc reserves are ok by finding the first node |
2591 | pgdat = zone->zone_pgdat; | 2598 | * with a usable ZONE_NORMAL or lower zone. The expectation is that |
2592 | if (pfmemalloc_watermark_ok(pgdat)) | 2599 | * GFP_KERNEL will be required for allocating network buffers when |
2600 | * swapping over the network so ZONE_HIGHMEM is unusable. | ||
2601 | * | ||
2602 | * Throttling is based on the first usable node and throttled processes | ||
2603 | * wait on a queue until kswapd makes progress and wakes them. There | ||
2604 | * is an affinity then between processes waking up and where reclaim | ||
2605 | * progress has been made assuming the process wakes on the same node. | ||
2606 | * More importantly, processes running on remote nodes will not compete | ||
2607 | * for remote pfmemalloc reserves and processes on different nodes | ||
2608 | * should make reasonable progress. | ||
2609 | */ | ||
2610 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
2611 | gfp_mask, nodemask) { | ||
2612 | if (zone_idx(zone) > ZONE_NORMAL) | ||
2613 | continue; | ||
2614 | |||
2615 | /* Throttle based on the first usable node */ | ||
2616 | pgdat = zone->zone_pgdat; | ||
2617 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2618 | goto out; | ||
2619 | break; | ||
2620 | } | ||
2621 | |||
2622 | /* If no zone was usable by the allocation flags then do not throttle */ | ||
2623 | if (!pgdat) | ||
2593 | goto out; | 2624 | goto out; |
2594 | 2625 | ||
2595 | /* Account for the throttling */ | 2626 | /* Account for the throttling */ |