 mm/compaction.c |  10
 mm/internal.h   |   1
 mm/vmscan.c     | 147
 3 files changed, 54 insertions(+), 104 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 5b2bfbaa821a..ccf97b02b85f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1191,11 +1191,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
 
 	/*
 	 * Mark that the PG_migrate_skip information should be cleared
-	 * by kswapd when it goes to sleep. kswapd does not set the
+	 * by kswapd when it goes to sleep. kcompactd does not set the
 	 * flag itself as the decision to be clear should be directly
 	 * based on an allocation request.
 	 */
-	if (!current_is_kswapd())
+	if (cc->direct_compaction)
 		zone->compact_blockskip_flush = true;
 
 	return COMPACT_COMPLETE;
@@ -1338,10 +1338,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	/*
 	 * Clear pageblock skip if there were failures recently and compaction
-	 * is about to be retried after being deferred. kswapd does not do
-	 * this reset as it'll reset the cached information when going to sleep.
+	 * is about to be retried after being deferred.
 	 */
-	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+	if (compaction_restarting(zone, cc->order))
 		__reset_isolation_suitable(zone);
 
 	/*
@@ -1477,6 +1476,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
 		.mode = mode,
 		.alloc_flags = alloc_flags,
 		.classzone_idx = classzone_idx,
+		.direct_compaction = true,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
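
[Illustration, not part of the patch] A minimal userspace sketch of what the new cc->direct_compaction test distinguishes: direct compactors request the deferred PG_migrate_skip flush, while kcompactd-style callers do not. The struct and function names below are hypothetical stand-ins for the kernel code above, not kernel code.

/*
 * Standalone model of the check this patch changes in __compact_finished():
 * the blockskip flush is now requested only by direct compaction,
 * identified by cc->direct_compaction instead of !current_is_kswapd().
 */
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	bool compact_blockskip_flush;
};

struct compact_control_model {
	bool direct_compaction;		/* set only by the direct-compaction path */
};

static void compact_finished_model(struct zone_model *zone,
				   struct compact_control_model *cc)
{
	/* Mirrors the new "if (cc->direct_compaction)" test. */
	if (cc->direct_compaction)
		zone->compact_blockskip_flush = true;
}

int main(void)
{
	struct zone_model zone = { .compact_blockskip_flush = false };
	struct compact_control_model direct = { .direct_compaction = true };
	struct compact_control_model background = { .direct_compaction = false };

	compact_finished_model(&zone, &background);
	printf("after kcompactd-style run: flush=%d\n", zone.compact_blockskip_flush);

	compact_finished_model(&zone, &direct);
	printf("after direct compaction:   flush=%d\n", zone.compact_blockskip_flush);
	return 0;
}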
diff --git a/mm/internal.h b/mm/internal.h
index b95952c2faec..4042a8a05672 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -172,6 +172,7 @@ struct compact_control {
 	unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
 	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
+	bool direct_compaction;		/* False from kcompactd or /proc/... */
 	int order;			/* order a direct compactor needs */
 	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 	const int alloc_flags;		/* alloc flags of a direct compactor */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5dcc71140108..f87cfaa955a8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2968,18 +2968,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 	} while (memcg);
 }
 
-static bool zone_balanced(struct zone *zone, int order,
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
 			  unsigned long balance_gap, int classzone_idx)
 {
-	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-				balance_gap, classzone_idx))
-		return false;
+	unsigned long mark = high_wmark_pages(zone) + balance_gap;
 
-	if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
-				order, 0, classzone_idx) == COMPACT_SKIPPED)
-		return false;
+	/*
+	 * When checking from pgdat_balanced(), kswapd should stop and sleep
+	 * when it reaches the high order-0 watermark and let kcompactd take
+	 * over. Other callers such as wakeup_kswapd() want to determine the
+	 * true high-order watermark.
+	 */
+	if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+		mark += (1UL << order);
+		order = 0;
+	}
 
-	return true;
+	return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
 }
 
 /*
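
[Illustration, not part of the patch] The reworked zone_balanced() folds a high-order request into an order-0 check when highorder is false: the target becomes high watermark + balance_gap + 2^order pages at order 0. A hypothetical standalone model of just that watermark arithmetic (the kernel additionally requires CONFIG_COMPACTION for this folding):

/* Hypothetical model of the reworked zone_balanced() watermark math. */
#include <stdbool.h>
#include <stdio.h>

/*
 * Compute the (order, mark) pair actually handed to the watermark check.
 * With highorder == false (the kswapd/pgdat_balanced() path), the order-N
 * request becomes an order-0 check with 1 << N extra pages, so kswapd can
 * stop early and let kcompactd assemble the contiguous block.
 */
static void effective_check(int order, bool highorder,
			    unsigned long high_wmark, unsigned long balance_gap,
			    int *eff_order, unsigned long *eff_mark)
{
	unsigned long mark = high_wmark + balance_gap;

	if (!highorder) {
		mark += 1UL << order;
		order = 0;
	}
	*eff_order = order;
	*eff_mark = mark;
}

int main(void)
{
	int eff_order;
	unsigned long eff_mark;

	/* order-3 request, high watermark 128 pages, no balance gap */
	effective_check(3, false, 128, 0, &eff_order, &eff_mark);
	printf("kswapd view:        order=%d, mark=%lu pages\n", eff_order, eff_mark);

	effective_check(3, true, 128, 0, &eff_order, &eff_mark);
	printf("wakeup_kswapd view: order=%d, mark=%lu pages\n", eff_order, eff_mark);
	return 0;
}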
@@ -3029,7 +3034,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 			continue;
 		}
 
-		if (zone_balanced(zone, order, 0, i))
+		if (zone_balanced(zone, order, false, 0, i))
 			balanced_pages += zone->managed_pages;
 		else if (!order)
 			return false;
@@ -3083,10 +3088,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       int classzone_idx,
-			       struct scan_control *sc,
-			       unsigned long *nr_attempted)
+			       struct scan_control *sc)
 {
-	int testorder = sc->order;
 	unsigned long balance_gap;
 	bool lowmem_pressure;
 
@@ -3094,17 +3097,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
 
 	/*
-	 * Kswapd reclaims only single pages with compaction enabled. Trying
-	 * too hard to reclaim until contiguous free pages have become
-	 * available can hurt performance by evicting too much useful data
-	 * from memory. Do not reclaim more than needed for compaction.
-	 */
-	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
-			compaction_suitable(zone, sc->order, 0, classzone_idx)
-						!= COMPACT_SKIPPED)
-		testorder = 0;
-
-	/*
 	 * We put equal pressure on every zone, unless one zone has way too
 	 * many pages free already. The "too many pages" is defined as the
 	 * high wmark plus a "gap" where the gap is either the low
@@ -3118,15 +3110,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * reclaim is necessary
 	 */
 	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
-	if (!lowmem_pressure && zone_balanced(zone, testorder,
+	if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
 						balance_gap, classzone_idx))
 		return true;
 
 	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 
-	/* Account for the number of pages attempted to reclaim */
-	*nr_attempted += sc->nr_to_reclaim;
-
 	clear_bit(ZONE_WRITEBACK, &zone->flags);
 
 	/*
@@ -3136,7 +3125,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * waits.
 	 */
 	if (zone_reclaimable(zone) &&
-	    zone_balanced(zone, testorder, 0, classzone_idx)) {
+	    zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
 		clear_bit(ZONE_CONGESTED, &zone->flags);
 		clear_bit(ZONE_DIRTY, &zone->flags);
 	}
@@ -3148,7 +3137,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
  *
  * There is special handling here for zones which are full of pinned pages.
  * This can happen if the pages are all mlocked, or if they are all used by
@@ -3165,8 +3154,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * interoperates with the page allocator fallback scheme to ensure that aging
  * of pages is balanced across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
-							int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
@@ -3183,9 +3171,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	count_vm_event(PAGEOUTRUN);
 
 	do {
-		unsigned long nr_attempted = 0;
 		bool raise_priority = true;
-		bool pgdat_needs_compaction = (order > 0);
 
 		sc.nr_reclaimed = 0;
 
@@ -3220,7 +3206,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 				break;
 			}
 
-			if (!zone_balanced(zone, order, 0, 0)) {
+			if (!zone_balanced(zone, order, false, 0, 0)) {
 				end_zone = i;
 				break;
 			} else {
@@ -3236,24 +3222,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		if (i < 0)
 			goto out;
 
-		for (i = 0; i <= end_zone; i++) {
-			struct zone *zone = pgdat->node_zones + i;
-
-			if (!populated_zone(zone))
-				continue;
-
-			/*
-			 * If any zone is currently balanced then kswapd will
-			 * not call compaction as it is expected that the
-			 * necessary pages are already available.
-			 */
-			if (pgdat_needs_compaction &&
-					zone_watermark_ok(zone, order,
-					low_wmark_pages(zone),
-					*classzone_idx, 0))
-				pgdat_needs_compaction = false;
-		}
-
 		/*
 		 * If we're getting trouble reclaiming, start doing writepage
 		 * even in laptop mode.
@@ -3297,8 +3265,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			 * that that high watermark would be met at 100%
 			 * efficiency.
 			 */
-			if (kswapd_shrink_zone(zone, end_zone,
-					       &sc, &nr_attempted))
+			if (kswapd_shrink_zone(zone, end_zone, &sc))
 				raise_priority = false;
 		}
 
@@ -3311,49 +3278,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 				pfmemalloc_watermark_ok(pgdat))
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
-		/*
-		 * Fragmentation may mean that the system cannot be rebalanced
-		 * for high-order allocations in all zones. If twice the
-		 * allocation size has been reclaimed and the zones are still
-		 * not balanced then recheck the watermarks at order-0 to
-		 * prevent kswapd reclaiming excessively. Assume that a
-		 * process requested a high-order can direct reclaim/compact.
-		 */
-		if (order && sc.nr_reclaimed >= 2UL << order)
-			order = sc.order = 0;
-
 		/* Check if kswapd should be suspending */
 		if (try_to_freeze() || kthread_should_stop())
 			break;
 
 		/*
-		 * Compact if necessary and kswapd is reclaiming at least the
-		 * high watermark number of pages as requsted
-		 */
-		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
-			compact_pgdat(pgdat, order);
-
-		/*
 		 * Raise priority if scanning rate is too low or there was no
 		 * progress in reclaiming pages
 		 */
 		if (raise_priority || !sc.nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1 &&
-		 !pgdat_balanced(pgdat, order, *classzone_idx));
+		 !pgdat_balanced(pgdat, order, classzone_idx));
 
 out:
 	/*
-	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
-	 * makes a decision on the order we were last reclaiming at. However,
-	 * if another caller entered the allocator slow path while kswapd
-	 * was awake, order will remain at the higher level
+	 * Return the highest zone idx we were reclaiming at so
+	 * prepare_kswapd_sleep() makes the same decisions as here.
 	 */
-	*classzone_idx = end_zone;
-	return order;
+	return end_zone;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+				int classzone_idx, int balanced_classzone_idx)
 {
 	long remaining = 0;
 	DEFINE_WAIT(wait);
@@ -3364,7 +3311,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining,
+						balanced_classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3374,7 +3322,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining,
+						balanced_classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -3395,6 +3344,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 */
 		reset_isolation_suitable(pgdat);
 
+		/*
+		 * We have freed the memory, now we should compact it to make
+		 * allocation of the requested order possible.
+		 */
+		wakeup_kcompactd(pgdat, order, classzone_idx);
+
 		if (!kthread_should_stop())
 			schedule();
 
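
[Illustration, not part of the patch] A hypothetical userspace model of the new hand-off: kswapd is willing to sleep once the zones it balanced meet the folded order-0 watermarks, and just before sleeping it wakes kcompactd with the original order and classzone_idx so the high-order work happens there. All names below are stand-ins, not kernel code.

/* Standalone model of the kswapd -> kcompactd hand-off added above. */
#include <stdbool.h>
#include <stdio.h>

static bool node_balanced_for_sleep(int balanced_classzone_idx)
{
	/* Stand-in for the prepare_kswapd_sleep()/pgdat_balanced() check. */
	return balanced_classzone_idx >= 0;
}

static void wakeup_kcompactd_model(int order, int classzone_idx)
{
	printf("kcompactd woken: compact up to zone %d for order-%d\n",
	       classzone_idx, order);
}

static void kswapd_try_to_sleep_model(int order, int classzone_idx,
				      int balanced_classzone_idx)
{
	if (!node_balanced_for_sleep(balanced_classzone_idx))
		return;		/* keep reclaiming instead of sleeping */

	/*
	 * Memory has been freed; ask kcompactd to assemble the requested
	 * order before kswapd sleeps (mirrors the added wakeup_kcompactd()
	 * call in the hunk above).
	 */
	wakeup_kcompactd_model(order, classzone_idx);
	printf("kswapd sleeps\n");
}

int main(void)
{
	/* order-4 wakeup for the node, zones 0..2 balanced */
	kswapd_try_to_sleep_model(4, 2, 2);
	return 0;
}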
@@ -3424,7 +3379,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
 	unsigned long order, new_order;
-	unsigned balanced_order;
 	int classzone_idx, new_classzone_idx;
 	int balanced_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
@@ -3457,23 +3411,19 @@ static int kswapd(void *p)
 	set_freezable();
 
 	order = new_order = 0;
-	balanced_order = 0;
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
 	balanced_classzone_idx = classzone_idx;
 	for ( ; ; ) {
 		bool ret;
 
 		/*
-		 * If the last balance_pgdat was unsuccessful it's unlikely a
-		 * new request of a similar or harder type will succeed soon
-		 * so consider going to sleep on the basis we reclaimed at
+		 * While we were reclaiming, there might have been another
+		 * wakeup, so check the values.
 		 */
-		if (balanced_order == new_order) {
-			new_order = pgdat->kswapd_max_order;
-			new_classzone_idx = pgdat->classzone_idx;
-			pgdat->kswapd_max_order = 0;
-			pgdat->classzone_idx = pgdat->nr_zones - 1;
-		}
+		new_order = pgdat->kswapd_max_order;
+		new_classzone_idx = pgdat->classzone_idx;
+		pgdat->kswapd_max_order = 0;
+		pgdat->classzone_idx = pgdat->nr_zones - 1;
 
 		if (order < new_order || classzone_idx > new_classzone_idx) {
 			/*
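
[Illustration, not part of the patch] The simplified kswapd() loop now latches and resets the pending wakeup request on every iteration, instead of only when the previous balance_pgdat() satisfied the requested order. A hypothetical standalone model of that read-and-reset step:

/* Standalone model of the unconditional wakeup re-check in kswapd(). */
#include <stdio.h>

struct pgdat_model {
	int kswapd_max_order;	/* highest order requested by wakers */
	int classzone_idx;	/* lowest classzone_idx requested */
	int nr_zones;
};

static void fetch_pending_request(struct pgdat_model *pgdat,
				  int *new_order, int *new_classzone_idx)
{
	/* While we were reclaiming, there might have been another wakeup. */
	*new_order = pgdat->kswapd_max_order;
	*new_classzone_idx = pgdat->classzone_idx;
	pgdat->kswapd_max_order = 0;
	pgdat->classzone_idx = pgdat->nr_zones - 1;
}

int main(void)
{
	struct pgdat_model pgdat = {
		.kswapd_max_order = 3, .classzone_idx = 1, .nr_zones = 3,
	};
	int order, idx;

	fetch_pending_request(&pgdat, &order, &idx);
	printf("pending request: order=%d classzone_idx=%d\n", order, idx);
	fetch_pending_request(&pgdat, &order, &idx);
	printf("after reset:     order=%d classzone_idx=%d\n", order, idx);
	return 0;
}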
@@ -3483,7 +3433,7 @@ static int kswapd(void *p)
 			order = new_order;
 			classzone_idx = new_classzone_idx;
 		} else {
-			kswapd_try_to_sleep(pgdat, balanced_order,
+			kswapd_try_to_sleep(pgdat, order, classzone_idx,
 					    balanced_classzone_idx);
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
@@ -3503,9 +3453,8 @@ static int kswapd(void *p)
 		 */
 		if (!ret) {
 			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-			balanced_classzone_idx = classzone_idx;
-			balanced_order = balance_pgdat(pgdat, order,
-						&balanced_classzone_idx);
+			balanced_classzone_idx = balance_pgdat(pgdat, order,
+								classzone_idx);
 		}
 	}
 
@@ -3535,7 +3484,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	}
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
-	if (zone_balanced(zone, order, 0, 0))
+	if (zone_balanced(zone, order, true, 0, 0))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);