author		Mel Gorman <mgorman@techsingularity.net>	2017-05-03 17:53:45 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-03 18:52:09 -0400
commit		e716f2eb24defb33b82be763a3ed9a618a210cee (patch)
tree		623143ddb954f21419b11ad38894b196e6261dbd /mm/vmscan.c
parent		631b6e083ec328f7203f466ba839d296aee70c36 (diff)
mm, vmscan: prevent kswapd sleeping prematurely due to mismatched classzone_idx
kswapd is woken to reclaim a node based on a failed allocation request
from any eligible zone. Once reclaiming in balance_pgdat(), it will
continue reclaiming until there is an eligible zone available for the
classzone it was woken for. kswapd tracks the zone it was recently woken
for in pgdat->kswapd_classzone_idx. If it has not been woken recently,
this zone will be 0.

However, the decision on whether to sleep is made on
kswapd_classzone_idx, which is 0 when there has been no recent wakeup
request, and a classzone of 0 does not account for lowmem reserves.
This allows kswapd to sleep when a small low zone such as ZONE_DMA is
balanced for a GFP_DMA request even if a stream of allocations cannot
use that zone. While kswapd may be woken again shortly, there are two
consequences -- the pgdat bits that control congestion are cleared
prematurely, and direct reclaim is more likely as kswapd slept
prematurely.

This patch flips kswapd_classzone_idx to default to MAX_NR_ZONES (an
invalid index) when there have been no recent wakeups. If there are no
wakeups, it'll decide whether to sleep based on the highest possible
zone available (MAX_NR_ZONES - 1). It then becomes critical that the
"pgdat balanced" decisions during reclaim and when deciding to sleep
are the same. If there is a mismatch, kswapd can stay awake continually
trying to balance tiny zones.

simoop was used to evaluate it again. Two of the preparation patches
regressed the workload so they are included as the second set of
results. Otherwise this patch looks artificially excellent.

                                        4.11.0-rc1             4.11.0-rc1             4.11.0-rc1
                                           vanilla               clear-v2           keepawake-v2
Amean  p50-Read        21670074.18 (  0.00%)  19786774.76 (  8.69%)  22668332.52 ( -4.61%)
Amean  p95-Read        25456267.64 (  0.00%)  24101956.27 (  5.32%)  26738688.00 ( -5.04%)
Amean  p99-Read        29369064.73 (  0.00%)  27691872.71 (  5.71%)  30991404.52 ( -5.52%)
Amean  p50-Write           1390.30 (  0.00%)      1011.91 ( 27.22%)       924.91 ( 33.47%)
Amean  p95-Write         412901.57 (  0.00%)     34874.98 ( 91.55%)      1362.62 ( 99.67%)
Amean  p99-Write        6668722.09 (  0.00%)    575449.60 ( 91.37%)     16854.04 ( 99.75%)
Amean  p50-Allocation     78714.31 (  0.00%)     84246.26 ( -7.03%)     74729.74 (  5.06%)
Amean  p95-Allocation    175533.51 (  0.00%)    400058.43 (-127.91%)   101609.74 ( 42.11%)
Amean  p99-Allocation    247003.02 (  0.00%)  10905600.00 (-4315.17%)  125765.57 ( 49.08%)

With this patch on top, write and allocation latencies are massively
improved. The read latencies are slightly impaired but it's worth
noting that this is mostly due to the IO scheduler and not directly
related to reclaim.
The vmstats are a bit of a mix but the relevant ones are as follows;

                            4.10.0-rc7     4.10.0-rc7     4.10.0-rc7
                        mmots-20170209    clear-v1r25  keepawake-v1r25
Swap Ins                             0              0              0
Swap Outs                            0            608              0
Direct pages scanned           6910672        3132699        6357298
Kswapd pages scanned          57036946       82488665       56986286
Kswapd pages reclaimed        55993488       63474329       55939113
Direct pages reclaimed         6905990        2964843        6352115
Kswapd efficiency                  98%            76%            98%
Kswapd velocity              12494.375      17597.507      12488.065
Direct efficiency                  99%            94%            99%
Direct velocity               1513.835        668.306       1393.148
Page writes by reclaim           0.000    4410243.000          0.000
Page writes file                     0        4409635              0
Page writes anon                     0            608              0
Page reclaim immediate         1036792       14175203        1042571

                            4.11.0-rc1     4.11.0-rc1     4.11.0-rc1
                               vanilla       clear-v2   keepawake-v2
Swap Ins                             0             12              0
Swap Outs                            0            838              0
Direct pages scanned           6579706        3237270        6256811
Kswapd pages scanned          61853702       79961486       54837791
Kswapd pages reclaimed        60768764       60755788       53849586
Direct pages reclaimed         6579055        2987453        6256151
Kswapd efficiency                  98%            75%            98%
Page writes by reclaim           0.000    4389496.000          0.000
Page writes file                     0        4388658              0
Page writes anon                     0            838              0
Page reclaim immediate         1073573       14473009         982507

Swap-outs are equivalent to baseline. Direct reclaim is reduced but not
eliminated. It's worth noting that there are two periods of direct
reclaim for this workload. The first is when it switches from preparing
the files to running the actual test itself: a lot of file IO followed
by a lot of allocs that reclaims heavily for a brief window. While
direct reclaim is lower with clear-v2, that is because kswapd scans
aggressively and tries to reclaim the world, which is not the right
thing to do. With the patches applied, there is still direct reclaim,
but it happens during the phase change from "creating work files" to
starting multiple threads that allocate a lot of anonymous memory
faster than kswapd can reclaim.

Scanning/reclaim efficiency is restored by this patch. Page writes from
reclaim context are back at 0, which is ideal. The number of pages
immediately reclaimed after IO completes is slightly improved but it is
expected this will vary slightly.

On UMA, there is almost no change so this is not expected to be a
universal win.

[mgorman@suse.de: fix ->kswapd_classzone_idx initialization]
  Link: http://lkml.kernel.org/r/20170406174538.5msrznj6nt6qpbx5@suse.de
Link: http://lkml.kernel.org/r/20170309075657.25121-4-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shantanu Goel <sgoel01@yahoo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
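As a rough illustration of the premature-sleep problem described above, here is
a minimal user-space sketch. It is not kernel code: the two-zone layout, the
page counts and the simplified balance check are invented for the example,
standing in for zone_watermark_ok_safe() and the per-zone lowmem reserves in
mm/vmscan.c. It shows how a small low zone can look balanced when checked
against classzone 0 while no zone is balanced when checked against
MAX_NR_ZONES - 1, which is the check kswapd now uses when there has been no
recent wakeup.

  /*
   * Illustration only: a tiny user-space model of the sleep decision this
   * patch changes.  NOT kernel code; zone sizes and the simplified balance
   * check are invented for the example.
   */
  #include <stdbool.h>
  #include <stdio.h>

  #define MAX_NR_ZONES 2                  /* model: ZONE_DMA and ZONE_NORMAL only */

  struct zone {
          long free;                              /* free pages */
          long high_wmark;                        /* high watermark */
          long lowmem_reserve[MAX_NR_ZONES];      /* pages held back per allocation class */
  };

  /* Simplified watermark check: free pages must cover the high watermark
   * plus the reserve kept back from the requesting allocation class. */
  static bool zone_balanced(struct zone *z, int classzone_idx)
  {
          return z->free > z->high_wmark + z->lowmem_reserve[classzone_idx];
  }

  int main(void)
  {
          /* A tiny DMA zone: fine for GFP_DMA (classzone 0), but its lowmem
           * reserve makes it ineligible for ordinary (classzone 1) requests. */
          struct zone dma    = { 900, 500, { 0, 4000 } };
          struct zone normal = { 100, 500, { 0, 0 } };
          struct zone *zones[MAX_NR_ZONES] = { &dma, &normal };
          bool balanced = false;
          int i;

          /* Old behaviour: with no recent wakeup, kswapd_classzone_idx was 0,
           * so a balanced ZONE_DMA alone let kswapd go back to sleep. */
          printf("check against classzone 0: %s\n",
                 zone_balanced(zones[0], 0) ? "sleep" : "stay awake");

          /* New behaviour: with no recent wakeup, the check runs against
           * MAX_NR_ZONES - 1, so the unbalanced ZONE_NORMAL keeps kswapd awake. */
          for (i = 0; i < MAX_NR_ZONES; i++)
                  balanced |= zone_balanced(zones[i], MAX_NR_ZONES - 1);
          printf("check against classzone %d: %s\n",
                 MAX_NR_ZONES - 1, balanced ? "sleep" : "stay awake");
          return 0;
  }

With these invented numbers the first check prints "sleep" and the second
prints "stay awake", mirroring the behaviour change described in the changelog.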
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--   mm/vmscan.c   120
1 file changed, 66 insertions(+), 54 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8c553fa0d800..8ce39867140b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3049,14 +3049,36 @@ static void age_active_anon(struct pglist_data *pgdat,
         } while (memcg);
 }

-static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
+/*
+ * Returns true if there is an eligible zone balanced for the request order
+ * and classzone_idx
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
-        unsigned long mark = high_wmark_pages(zone);
+        int i;
+        unsigned long mark = -1;
+        struct zone *zone;

-        if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
-                return false;
+        for (i = 0; i <= classzone_idx; i++) {
+                zone = pgdat->node_zones + i;

-        return true;
+                if (!managed_zone(zone))
+                        continue;
+
+                mark = high_wmark_pages(zone);
+                if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+                        return true;
+        }
+
+        /*
+         * If a node has no populated zone within classzone_idx, it does not
+         * need balancing by definition. This can happen if a zone-restricted
+         * allocation tries to wake a remote kswapd.
+         */
+        if (mark == -1)
+                return true;
+
+        return false;
 }

 /* Clear pgdat state for congested, dirty or under writeback. */
@@ -3075,8 +3097,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
  */
 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 {
-        int i;
-
         /*
          * The throttled processes are normally woken up in balance_pgdat() as
          * soon as allow_direct_reclaim() is true. But there is a potential
@@ -3097,16 +3117,9 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
                 return true;

-        for (i = 0; i <= classzone_idx; i++) {
-                struct zone *zone = pgdat->node_zones + i;
-
-                if (!managed_zone(zone))
-                        continue;
-
-                if (zone_balanced(zone, order, classzone_idx)) {
-                        clear_pgdat_congested(pgdat);
-                        return true;
-                }
+        if (pgdat_balanced(pgdat, order, classzone_idx)) {
+                clear_pgdat_congested(pgdat);
+                return true;
         }

         return false;
@@ -3212,23 +3225,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
         }

         /*
-         * Only reclaim if there are no eligible zones. Check from
-         * high to low zone as allocations prefer higher zones.
-         * Scanning from low to high zone would allow congestion to be
-         * cleared during a very small window when a small low
-         * zone was balanced even under extreme pressure when the
-         * overall node may be congested. Note that sc.reclaim_idx
-         * is not used as buffer_heads_over_limit may have adjusted
-         * it.
+         * Only reclaim if there are no eligible zones. Note that
+         * sc.reclaim_idx is not used as buffer_heads_over_limit may
+         * have adjusted it.
          */
-        for (i = classzone_idx; i >= 0; i--) {
-                zone = pgdat->node_zones + i;
-                if (!managed_zone(zone))
-                        continue;
-
-                if (zone_balanced(zone, sc.order, classzone_idx))
-                        goto out;
-        }
+        if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+                goto out;

         /*
          * Do some background aging of the anon list, to give
@@ -3295,6 +3297,22 @@ out:
         return sc.order;
 }

+/*
+ * pgdat->kswapd_classzone_idx is the highest zone index that a recent
+ * allocation request woke kswapd for. When kswapd has not woken recently,
+ * the value is MAX_NR_ZONES which is not a valid index. This compares a
+ * given classzone and returns it or the highest classzone index kswapd
+ * was recently woke for.
+ */
+static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
+                                           enum zone_type classzone_idx)
+{
+        if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+                return classzone_idx;
+
+        return max(pgdat->kswapd_classzone_idx, classzone_idx);
+}
+
 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
                                 unsigned int classzone_idx)
 {
@@ -3336,7 +3354,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
          * the previous request that slept prematurely.
          */
         if (remaining) {
-                pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+                pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
                 pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
         }

@@ -3390,7 +3408,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  */
 static int kswapd(void *p)
 {
-        unsigned int alloc_order, reclaim_order, classzone_idx;
+        unsigned int alloc_order, reclaim_order;
+        unsigned int classzone_idx = MAX_NR_ZONES - 1;
         pg_data_t *pgdat = (pg_data_t*)p;
         struct task_struct *tsk = current;

@@ -3420,20 +3439,23 @@ static int kswapd(void *p)
         tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
         set_freezable();

-        pgdat->kswapd_order = alloc_order = reclaim_order = 0;
-        pgdat->kswapd_classzone_idx = classzone_idx = 0;
+        pgdat->kswapd_order = 0;
+        pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
         for ( ; ; ) {
                 bool ret;

+                alloc_order = reclaim_order = pgdat->kswapd_order;
+                classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+
 kswapd_try_sleep:
                 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                                         classzone_idx);

                 /* Read the new order and classzone_idx */
                 alloc_order = reclaim_order = pgdat->kswapd_order;
-                classzone_idx = pgdat->kswapd_classzone_idx;
+                classzone_idx = kswapd_classzone_idx(pgdat, 0);
                 pgdat->kswapd_order = 0;
-                pgdat->kswapd_classzone_idx = 0;
+                pgdat->kswapd_classzone_idx = MAX_NR_ZONES;

                 ret = try_to_freeze();
                 if (kthread_should_stop())
@@ -3459,9 +3481,6 @@ kswapd_try_sleep:
                 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
                 if (reclaim_order < alloc_order)
                         goto kswapd_try_sleep;
-
-                alloc_order = reclaim_order = pgdat->kswapd_order;
-                classzone_idx = pgdat->kswapd_classzone_idx;
         }

         tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
@@ -3477,7 +3496,6 @@ kswapd_try_sleep:
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 {
         pg_data_t *pgdat;
-        int z;

         if (!managed_zone(zone))
                 return;
@@ -3485,7 +3503,8 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
                 return;
         pgdat = zone->zone_pgdat;
-        pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+        pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
+                                                           classzone_idx);
         pgdat->kswapd_order = max(pgdat->kswapd_order, order);
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
@@ -3494,17 +3513,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
                 return;

-        /* Only wake kswapd if all zones are unbalanced */
-        for (z = 0; z <= classzone_idx; z++) {
-                zone = pgdat->node_zones + z;
-                if (!managed_zone(zone))
-                        continue;
-
-                if (zone_balanced(zone, order, classzone_idx))
-                        return;
-        }
+        if (pgdat_balanced(pgdat, order, classzone_idx))
+                return;

-        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
         wake_up_interruptible(&pgdat->kswapd_wait);
 }
