Diffstat (limited to 'mm/vmscan.c')
 -rw-r--r--  mm/vmscan.c  143
 1 file changed, 126 insertions(+), 17 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 24ab1f7394ab..a714c4f800e9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* e.g. boosted watermark reclaim leaves slabs alone */
+	unsigned int may_shrinkslab:1;
+
 	/*
 	 * Cgroups are not reclaimed below their configured memory.low,
 	 * unless we threaten to OOM. If any cgroups are skipped due to
@@ -1457,14 +1460,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			count_memcg_page_event(page, PGLAZYFREED);
 		} else if (!mapping || !__remove_mapping(mapping, page, true))
 			goto keep_locked;
-		/*
-		 * At this point, we have no other references and there is
-		 * no way to pick any more up (removed from LRU, removed
-		 * from pagecache). Can use non-atomic bitops now (and
-		 * we obviously don't have to worry about waking up a process
-		 * waiting on the page lock, because there are no references.
-		 */
-		__ClearPageLocked(page);
+
+		unlock_page(page);
 free_it:
 		nr_reclaimed++;
 
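
The removed comment justified a bare, non-waking __ClearPageLocked() on the grounds that nothing could possibly be sleeping on the page lock once all references were gone; switching to unlock_page() drops that shortcut and always runs the wakeup path. As a rough userspace analogy only (pthreads; struct and function names here are invented, this is not kernel code), a bare flag clear is safe only when no waiter can exist, while a waking unlock is safe either way:

	/* Illustrative userspace model, not kernel code. page_model,
	 * clear_no_wake and unlock_and_wake are invented names. */
	#include <pthread.h>
	#include <stdio.h>

	struct page_model {
		pthread_mutex_t lock;
		pthread_cond_t  waitq;	/* stands in for the page waitqueue */
		int             locked;
	};

	/* Analogue of __ClearPageLocked(): no wakeup, so only correct if no
	 * thread can possibly be waiting on the flag. */
	static void clear_no_wake(struct page_model *p)
	{
		p->locked = 0;
	}

	/* Analogue of unlock_page(): clear the flag and wake any waiter. */
	static void unlock_and_wake(struct page_model *p)
	{
		pthread_mutex_lock(&p->lock);
		p->locked = 0;
		pthread_cond_broadcast(&p->waitq);
		pthread_mutex_unlock(&p->lock);
	}

	static void *waiter(void *arg)
	{
		struct page_model *p = arg;

		pthread_mutex_lock(&p->lock);
		while (p->locked)	/* would sleep forever after clear_no_wake() */
			pthread_cond_wait(&p->waitq, &p->lock);
		pthread_mutex_unlock(&p->lock);
		puts("waiter woke up");
		return NULL;
	}

	int main(void)
	{
		struct page_model p = {
			PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1
		};
		pthread_t t;

		pthread_create(&t, NULL, waiter, &p);
		unlock_and_wake(&p);	/* safe even though a waiter exists */
		pthread_join(&t, NULL);
		(void)clear_no_wake;	/* the unsafe variant is shown but unused */
		return 0;
	}
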
@@ -2756,8 +2753,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
 			node_lru_pages += lru_pages;
 
-			shrink_slab(sc->gfp_mask, pgdat->node_id,
-					memcg, sc->priority);
+			if (sc->may_shrinkslab) {
+				shrink_slab(sc->gfp_mask, pgdat->node_id,
+					memcg, sc->priority);
+			}
 
 			/* Record the group's reclaim efficiency */
 			vmpressure(sc->gfp_mask, memcg, false,
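
The new bit simply gates the shrink_slab() call for one reclaim pass. A minimal sketch of the pattern, assuming a toy control struct (reclaim_ctl and the helper names are invented for illustration, not the kernel's types):

	/* Minimal model of gating one phase of a multi-phase pass with a
	 * single-bit flag in a control struct. */
	#include <stdio.h>

	struct reclaim_ctl {
		unsigned int may_swap:1;
		unsigned int may_shrinkslab:1;	/* mirrors the new scan_control bit */
	};

	static void shrink_page_lists(void)  { puts("shrinking LRU lists"); }
	static void shrink_slab_caches(void) { puts("shrinking slab caches"); }

	static void do_reclaim_pass(const struct reclaim_ctl *ctl)
	{
		shrink_page_lists();
		if (ctl->may_shrinkslab)	/* boosted reclaim leaves slabs alone */
			shrink_slab_caches();
	}

	int main(void)
	{
		struct reclaim_ctl normal  = { .may_swap = 1, .may_shrinkslab = 1 };
		struct reclaim_ctl boosted = { .may_swap = 0, .may_shrinkslab = 0 };

		do_reclaim_pass(&normal);	/* runs both phases */
		do_reclaim_pass(&boosted);	/* skips the slab phase */
		return 0;
	}
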
@@ -3239,6 +3238,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_shrinkslab = 1,
 	};
 
 	/*
@@ -3283,6 +3283,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
+		.may_shrinkslab = 1,
 	};
 	unsigned long lru_pages;
 
@@ -3329,6 +3330,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = may_swap,
+		.may_shrinkslab = 1,
 	};
 
 	/*
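
Each of these direct and targeted reclaim entry points has to opt in to the new bit explicitly because designated initializers zero every member that is not named, so a scan_control that omitted .may_shrinkslab would silently stop shrinking slabs. A tiny standalone C demonstration of that zeroing rule (the struct name is invented for the sketch):

	/* Members omitted from a designated initializer are zero-initialized,
	 * which is why every reclaim entry point must set the bit itself. */
	#include <assert.h>

	struct ctl {
		unsigned int may_swap:1;
		unsigned int may_shrinkslab:1;
	};

	int main(void)
	{
		struct ctl with    = { .may_swap = 1, .may_shrinkslab = 1 };
		struct ctl without = { .may_swap = 1 };	/* bit left out */

		assert(with.may_shrinkslab == 1);
		assert(without.may_shrinkslab == 0);	/* implicitly zeroed */
		return 0;
	}
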
@@ -3379,6 +3381,30 @@ static void age_active_anon(struct pglist_data *pgdat,
 	} while (memcg);
 }
 
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+{
+	int i;
+	struct zone *zone;
+
+	/*
+	 * Check for watermark boosts top-down as the higher zones
+	 * are more likely to be boosted. Both watermarks and boosts
+	 * should not be checked at the same time as reclaim would
+	 * start prematurely when there is no boosting and a lower
+	 * zone is balanced.
+	 */
+	for (i = classzone_idx; i >= 0; i--) {
+		zone = pgdat->node_zones + i;
+		if (!managed_zone(zone))
+			continue;
+
+		if (zone->watermark_boost)
+			return true;
+	}
+
+	return false;
+}
+
 /*
  * Returns true if there is an eligible zone balanced for the request order
  * and classzone_idx
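
pgdat_watermark_boosted() only asks whether any eligible zone still carries a boost; folding the boost into the balance check itself would, per the comment, trigger reclaim prematurely. A userspace model of the same top-down loop shape, assuming a toy zone array (toy_zone and node_watermark_boosted are invented names):

	/* Userspace model of the top-down boost scan; "zone" here is a toy
	 * struct carrying only the two fields the loop cares about. */
	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_ZONES 4

	struct toy_zone {
		bool          managed;		/* stands in for managed_zone() */
		unsigned long watermark_boost;
	};

	static bool node_watermark_boosted(const struct toy_zone *zones, int classzone_idx)
	{
		/* Higher zones are the likelier ones to have been boosted, so
		 * walk from classzone_idx down and stop at the first hit. */
		for (int i = classzone_idx; i >= 0; i--) {
			if (!zones[i].managed)
				continue;
			if (zones[i].watermark_boost)
				return true;
		}
		return false;
	}

	int main(void)
	{
		struct toy_zone zones[MAX_ZONES] = {
			[0] = { .managed = true, .watermark_boost = 0 },
			[1] = { .managed = false },		/* e.g. empty zone */
			[2] = { .managed = true, .watermark_boost = 1 << 10 },
		};

		printf("boosted: %d\n", node_watermark_boosted(zones, 2));	/* 1 */
		printf("boosted: %d\n", node_watermark_boosted(zones, 0));	/* 0 */
		return 0;
	}
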
@@ -3389,6 +3415,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 	unsigned long mark = -1;
 	struct zone *zone;
 
+	/*
+	 * Check watermarks bottom-up as lower zones are more likely to
+	 * meet watermarks.
+	 */
 	for (i = 0; i <= classzone_idx; i++) {
 		zone = pgdat->node_zones + i;
 
@@ -3517,14 +3547,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
 	unsigned long pflags;
+	unsigned long nr_boost_reclaim;
+	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
+	bool boosted;
 	struct zone *zone;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.order = order,
-		.priority = DEF_PRIORITY,
-		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = 1,
 	};
 
 	psi_memstall_enter(&pflags);
@@ -3532,9 +3562,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 
 	count_vm_event(PAGEOUTRUN);
 
+	/*
+	 * Account for the reclaim boost. Note that the zone boost is left in
+	 * place so that parallel allocations that are near the watermark will
+	 * stall or direct reclaim until kswapd is finished.
+	 */
+	nr_boost_reclaim = 0;
+	for (i = 0; i <= classzone_idx; i++) {
+		zone = pgdat->node_zones + i;
+		if (!managed_zone(zone))
+			continue;
+
+		nr_boost_reclaim += zone->watermark_boost;
+		zone_boosts[i] = zone->watermark_boost;
+	}
+	boosted = nr_boost_reclaim;
+
+restart:
+	sc.priority = DEF_PRIORITY;
 	do {
 		unsigned long nr_reclaimed = sc.nr_reclaimed;
 		bool raise_priority = true;
+		bool balanced;
 		bool ret;
 
 		sc.reclaim_idx = classzone_idx;
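
Before the loop, the pending boost of every eligible zone is summed into nr_boost_reclaim and remembered per zone in zone_boosts[], while the boosts themselves stay applied so that concurrent allocators keep stalling or direct-reclaiming until kswapd has made room; the restart: label plus the sc.priority reset lets the loop be re-entered later as an ordinary, non-boost pass. A sketch of just the accounting step, reusing the toy zone type from the earlier sketch (all names invented):

	/* Toy version of the pre-loop boost accounting in balance_pgdat(). */
	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_ZONES 4

	struct toy_zone {
		bool          managed;
		unsigned long watermark_boost;
	};

	int main(void)
	{
		struct toy_zone zones[MAX_ZONES] = {
			[0] = { .managed = true, .watermark_boost = 512 },
			[1] = { .managed = true, .watermark_boost = 0 },
			[2] = { .managed = true, .watermark_boost = 1024 },
		};
		unsigned long zone_boosts[MAX_ZONES] = { 0, };
		unsigned long nr_boost_reclaim = 0;
		int classzone_idx = 2;
		bool boosted;

		for (int i = 0; i <= classzone_idx; i++) {
			if (!zones[i].managed)
				continue;
			/* Remember each zone's boost; the boost itself stays in
			 * place until reclaim finishes. */
			nr_boost_reclaim += zones[i].watermark_boost;
			zone_boosts[i] = zones[i].watermark_boost;
		}
		boosted = nr_boost_reclaim;	/* any non-zero sum means "boosted" */

		printf("target: %lu pages, boosted=%d\n", nr_boost_reclaim, boosted);
		return 0;
	}
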
@@ -3561,13 +3610,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		}
 
 		/*
-		 * Only reclaim if there are no eligible zones. Note that
-		 * sc.reclaim_idx is not used as buffer_heads_over_limit may
-		 * have adjusted it.
+		 * If the pgdat is imbalanced then ignore boosting and preserve
+		 * the watermarks for a later time and restart. Note that the
+		 * zone watermarks will be still reset at the end of balancing
+		 * on the grounds that the normal reclaim should be enough to
+		 * re-evaluate if boosting is required when kswapd next wakes.
 		 */
-		if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+		balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+		if (!balanced && nr_boost_reclaim) {
+			nr_boost_reclaim = 0;
+			goto restart;
+		}
+
+		/*
+		 * If boosting is not active then only reclaim if there are no
+		 * eligible zones. Note that sc.reclaim_idx is not used as
+		 * buffer_heads_over_limit may have adjusted it.
+		 */
+		if (!nr_boost_reclaim && balanced)
 			goto out;
 
+		/* Limit the priority of boosting to avoid reclaim writeback */
+		if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
+			raise_priority = false;
+
+		/*
+		 * Do not writeback or swap pages for boosted reclaim. The
+		 * intent is to relieve pressure not issue sub-optimal IO
+		 * from reclaim context. If no pages are reclaimed, the
+		 * reclaim will be aborted.
+		 */
+		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+		sc.may_swap = !nr_boost_reclaim;
+		sc.may_shrinkslab = !nr_boost_reclaim;
+
 		/*
 		 * Do some background aging of the anon list, to give
 		 * pages a chance to be referenced before reclaiming. All
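
Inside the loop the boost state drives three decisions each iteration: an imbalanced node drops the boost target and restarts as ordinary reclaim; while only the boost is outstanding, priority is never raised past DEF_PRIORITY - 2; and writeback, swap and slab shrinking are all disabled as long as nr_boost_reclaim is non-zero. A hedged decision-logic sketch (plain C; the function name is invented and laptop_mode is folded into a constant):

	/* Models the per-iteration decisions while a boost target is
	 * outstanding. All names are invented for this sketch. */
	#include <stdbool.h>
	#include <stdio.h>

	#define DEF_PRIORITY 12
	static const bool laptop_mode = false;

	struct pass_config {
		bool may_writepage, may_swap, may_shrinkslab, raise_priority;
	};

	static struct pass_config configure_pass(unsigned long nr_boost_reclaim, int priority)
	{
		struct pass_config c = { .raise_priority = true };

		/* Keep boosted reclaim gentle: never push priority below
		 * DEF_PRIORITY - 2. */
		if (nr_boost_reclaim && priority == DEF_PRIORITY - 2)
			c.raise_priority = false;

		/* Boosted reclaim relieves pressure without issuing IO or
		 * touching slabs. */
		c.may_writepage  = !laptop_mode && !nr_boost_reclaim;
		c.may_swap       = !nr_boost_reclaim;
		c.may_shrinkslab = !nr_boost_reclaim;
		return c;
	}

	int main(void)
	{
		struct pass_config boost  = configure_pass(2048, DEF_PRIORITY);
		struct pass_config normal = configure_pass(0, DEF_PRIORITY);

		printf("boost pass:  writepage=%d swap=%d slab=%d\n",
		       boost.may_writepage, boost.may_swap, boost.may_shrinkslab);
		printf("normal pass: writepage=%d swap=%d slab=%d\n",
		       normal.may_writepage, normal.may_swap, normal.may_shrinkslab);
		return 0;
	}
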
@@ -3619,6 +3695,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * progress in reclaiming pages
 		 */
 		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+		nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
+
+		/*
+		 * If reclaim made no progress for a boost, stop reclaim as
+		 * IO cannot be queued and it could be an infinite loop in
+		 * extreme circumstances.
+		 */
+		if (nr_boost_reclaim && !nr_reclaimed)
+			break;
+
 		if (raise_priority || !nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1);
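
The boost target is reduced by whatever the pass reclaimed, and the min() keeps the unsigned counter from wrapping below zero when the pass reclaims more than the remaining target; a pass that reclaims nothing while the target is still outstanding breaks out rather than looping with IO disabled. A small sketch of the underflow-safe subtraction (min_ul is a local stand-in for the kernel's min()):

	/* Why the min() clamp matters for an unsigned counter: subtracting
	 * more than the remaining target must saturate at zero, not wrap. */
	#include <stdio.h>

	static unsigned long min_ul(unsigned long a, unsigned long b)
	{
		return a < b ? a : b;
	}

	int main(void)
	{
		unsigned long nr_boost_reclaim = 100;
		unsigned long nr_reclaimed = 250;	/* reclaimed more than needed */

		nr_boost_reclaim -= min_ul(nr_boost_reclaim, nr_reclaimed);
		printf("remaining boost target: %lu\n", nr_boost_reclaim);	/* 0 */

		/* Without the clamp, 100UL - 250UL would wrap to a huge value
		 * and an enormous boost target would appear to be pending. */
		return 0;
	}
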
@@ -3627,6 +3713,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		pgdat->kswapd_failures++;
 
 out:
+	/* If reclaim was boosted, account for the reclaim done in this pass */
+	if (boosted) {
+		unsigned long flags;
+
+		for (i = 0; i <= classzone_idx; i++) {
+			if (!zone_boosts[i])
+				continue;
+
+			/* Increments are under the zone lock */
+			zone = pgdat->node_zones + i;
+			spin_lock_irqsave(&zone->lock, flags);
+			zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
+			spin_unlock_irqrestore(&zone->lock, flags);
+		}
+
+		/*
+		 * As there is now likely space, wakeup kcompactd to defragment
+		 * pageblocks.
+		 */
+		wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+	}
+
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release();
 	psi_memstall_leave(&pflags);
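
Only at the end is the boost actually consumed: for every zone that was boosted at entry, the amount recorded in zone_boosts[] is subtracted from zone->watermark_boost under the zone lock, again clamped with min() so the subtraction cannot underflow if the stored boost has changed meanwhile, and kcompactd is woken to turn the freed pages into contiguous blocks. A userspace sketch of the locked, clamped decrement (a pthread mutex stands in for the zone spinlock; the struct is invented):

	/* Userspace model of consuming the recorded boost under a lock. */
	#include <pthread.h>
	#include <stdio.h>

	struct toy_zone {
		pthread_mutex_t lock;
		unsigned long   watermark_boost;
	};

	static unsigned long min_ul(unsigned long a, unsigned long b)
	{
		return a < b ? a : b;
	}

	static void consume_boost(struct toy_zone *zone, unsigned long recorded)
	{
		pthread_mutex_lock(&zone->lock);
		/* Only subtract what this pass accounted for, and never more
		 * than the boost currently stored in the zone. */
		zone->watermark_boost -= min_ul(zone->watermark_boost, recorded);
		pthread_mutex_unlock(&zone->lock);
	}

	int main(void)
	{
		struct toy_zone zone = { PTHREAD_MUTEX_INITIALIZER, 1536 };

		consume_boost(&zone, 1024);	/* 512 of newer boost remains */
		printf("boost left: %lu\n", zone.watermark_boost);
		return 0;
	}
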
@@ -3855,7 +3963,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
 	/* Hopeless node, leave it to direct reclaim if possible */
 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
-	    pgdat_balanced(pgdat, order, classzone_idx)) {
+	    (pgdat_balanced(pgdat, order, classzone_idx) &&
+	     !pgdat_watermark_boosted(pgdat, classzone_idx))) {
 		/*
 		 * There may be plenty of free memory available, but it's too
 		 * fragmented for high-order allocations. Wake up kcompactd
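
Previously wakeup_kswapd() took the "wake kcompactd instead" path whenever the node looked balanced; with the extra pgdat_watermark_boosted() term, a pending boost keeps kswapd eligible for a wakeup even on a balanced node, so the boost reclaim actually gets to run. A condensed sketch of the reworked early-exit test (the helper mirrors the kernel condition but takes canned values):

	/* Condensed model of the reworked early-exit test in wakeup_kswapd(). */
	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_RECLAIM_RETRIES 16

	static bool skip_kswapd_wakeup(int kswapd_failures, bool balanced, bool boosted)
	{
		/* Hopeless node, or balanced with no boost outstanding: skip. */
		return kswapd_failures >= MAX_RECLAIM_RETRIES || (balanced && !boosted);
	}

	int main(void)
	{
		printf("%d\n", skip_kswapd_wakeup(0, true, false));	/* 1: skip */
		printf("%d\n", skip_kswapd_wakeup(0, true, true));	/* 0: wake kswapd */
		return 0;
	}
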