Diffstat (limited to 'mm/vmscan.c')
 -rw-r--r--   mm/vmscan.c   143
 1 file changed, 126 insertions, 17 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 24ab1f7394ab..a714c4f800e9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* e.g. boosted watermark reclaim leaves slabs alone */
+	unsigned int may_shrinkslab:1;
+
 	/*
 	 * Cgroups are not reclaimed below their configured memory.low,
 	 * unless we threaten to OOM. If any cgroups are skipped due to
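Worth noting about the new bit (a reading of the change, not stated in the patch itself): since may_shrinkslab is a bitfield member, any scan_control built with a designated initializer that does not mention it is left at 0, so slab shrinking becomes opt-in; the later hunks set it to 1 on the paths that should keep the old behaviour. A tiny standalone C illustration of that default, using a toy struct rather than the real scan_control:

#include <assert.h>

/* Toy stand-in for scan_control's reclaim flags (not the kernel's struct). */
struct toy_scan_control {
	unsigned int may_swap:1;
	unsigned int may_shrinkslab:1;
};

int main(void)
{
	/* Fields omitted from a designated initializer are zeroed. */
	struct toy_scan_control sc = { .may_swap = 1 };

	assert(sc.may_swap == 1);
	assert(sc.may_shrinkslab == 0);	/* slab shrinking is opt-in */
	return 0;
}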
@@ -1457,14 +1460,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			count_memcg_page_event(page, PGLAZYFREED);
 		} else if (!mapping || !__remove_mapping(mapping, page, true))
 			goto keep_locked;
-		/*
-		 * At this point, we have no other references and there is
-		 * no way to pick any more up (removed from LRU, removed
-		 * from pagecache). Can use non-atomic bitops now (and
-		 * we obviously don't have to worry about waking up a process
-		 * waiting on the page lock, because there are no references.
-		 */
-		__ClearPageLocked(page);
+
+		unlock_page(page);
 free_it:
 		nr_reclaimed++;
 
@@ -2756,8 +2753,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
 			node_lru_pages += lru_pages;
 
-			shrink_slab(sc->gfp_mask, pgdat->node_id,
-				    memcg, sc->priority);
+			if (sc->may_shrinkslab) {
+				shrink_slab(sc->gfp_mask, pgdat->node_id,
+					    memcg, sc->priority);
+			}
 
 			/* Record the group's reclaim efficiency */
 			vmpressure(sc->gfp_mask, memcg, false,
@@ -3239,6 +3238,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_shrinkslab = 1,
 	};
 
 	/*
@@ -3283,6 +3283,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
+		.may_shrinkslab = 1,
 	};
 	unsigned long lru_pages;
 
@@ -3329,6 +3330,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = may_swap,
+		.may_shrinkslab = 1,
 	};
 
 	/*
@@ -3379,6 +3381,30 @@ static void age_active_anon(struct pglist_data *pgdat,
 	} while (memcg);
 }
 
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+{
+	int i;
+	struct zone *zone;
+
+	/*
+	 * Check for watermark boosts top-down as the higher zones
+	 * are more likely to be boosted. Both watermarks and boosts
+	 * should not be checked at the same time as reclaim would
+	 * start prematurely when there is no boosting and a lower
+	 * zone is balanced.
+	 */
+	for (i = classzone_idx; i >= 0; i--) {
+		zone = pgdat->node_zones + i;
+		if (!managed_zone(zone))
+			continue;
+
+		if (zone->watermark_boost)
+			return true;
+	}
+
+	return false;
+}
+
 /*
  * Returns true if there is an eligible zone balanced for the request order
  * and classzone_idx
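The new helper walks zones top-down because the higher zones are the likely boost targets, while pgdat_balanced() (touched in the next hunk) keeps walking bottom-up where watermarks are more likely to be met. A small userspace sketch of the top-down check, with made-up toy zones standing in for the kernel's pg_data_t and struct zone:

#include <stdbool.h>
#include <stdio.h>

/* Toy zone array: index 0 is the lowest zone (e.g. DMA), higher indexes
 * are the zones most likely to have received a watermark boost. */
struct toy_zone {
	bool managed;
	unsigned long watermark_boost;
};

static bool toy_watermark_boosted(const struct toy_zone *zones,
				  int classzone_idx)
{
	/* Mirror the patch: scan from the highest eligible zone downwards. */
	for (int i = classzone_idx; i >= 0; i--) {
		if (!zones[i].managed)
			continue;
		if (zones[i].watermark_boost)
			return true;
	}
	return false;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ .managed = true,  .watermark_boost = 0 },
		{ .managed = false, .watermark_boost = 0 },
		{ .managed = true,  .watermark_boost = 4096 },
	};

	printf("boosted: %d\n", toy_watermark_boosted(zones, 2));
	return 0;
}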
@@ -3389,6 +3415,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 	unsigned long mark = -1;
 	struct zone *zone;
 
+	/*
+	 * Check watermarks bottom-up as lower zones are more likely to
+	 * meet watermarks.
+	 */
 	for (i = 0; i <= classzone_idx; i++) {
 		zone = pgdat->node_zones + i;
 
@@ -3517,14 +3547,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
 	unsigned long pflags;
+	unsigned long nr_boost_reclaim;
+	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
+	bool boosted;
 	struct zone *zone;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.order = order,
-		.priority = DEF_PRIORITY,
-		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = 1,
 	};
 
 	psi_memstall_enter(&pflags);
@@ -3532,9 +3562,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 
 	count_vm_event(PAGEOUTRUN);
 
+	/*
+	 * Account for the reclaim boost. Note that the zone boost is left in
+	 * place so that parallel allocations that are near the watermark will
+	 * stall or direct reclaim until kswapd is finished.
+	 */
+	nr_boost_reclaim = 0;
+	for (i = 0; i <= classzone_idx; i++) {
+		zone = pgdat->node_zones + i;
+		if (!managed_zone(zone))
+			continue;
+
+		nr_boost_reclaim += zone->watermark_boost;
+		zone_boosts[i] = zone->watermark_boost;
+	}
+	boosted = nr_boost_reclaim;
+
+restart:
+	sc.priority = DEF_PRIORITY;
 	do {
 		unsigned long nr_reclaimed = sc.nr_reclaimed;
 		bool raise_priority = true;
+		bool balanced;
 		bool ret;
 
 		sc.reclaim_idx = classzone_idx;
@@ -3561,13 +3610,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		}
 
 		/*
-		 * Only reclaim if there are no eligible zones. Note that
-		 * sc.reclaim_idx is not used as buffer_heads_over_limit may
-		 * have adjusted it.
+		 * If the pgdat is imbalanced then ignore boosting and preserve
+		 * the watermarks for a later time and restart. Note that the
+		 * zone watermarks will be still reset at the end of balancing
+		 * on the grounds that the normal reclaim should be enough to
+		 * re-evaluate if boosting is required when kswapd next wakes.
 		 */
-		if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+		balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+		if (!balanced && nr_boost_reclaim) {
+			nr_boost_reclaim = 0;
+			goto restart;
+		}
+
+		/*
+		 * If boosting is not active then only reclaim if there are no
+		 * eligible zones. Note that sc.reclaim_idx is not used as
+		 * buffer_heads_over_limit may have adjusted it.
+		 */
+		if (!nr_boost_reclaim && balanced)
 			goto out;
 
+		/* Limit the priority of boosting to avoid reclaim writeback */
+		if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
+			raise_priority = false;
+
+		/*
+		 * Do not writeback or swap pages for boosted reclaim. The
+		 * intent is to relieve pressure not issue sub-optimal IO
+		 * from reclaim context. If no pages are reclaimed, the
+		 * reclaim will be aborted.
+		 */
+		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+		sc.may_swap = !nr_boost_reclaim;
+		sc.may_shrinkslab = !nr_boost_reclaim;
+
 		/*
 		 * Do some background aging of the anon list, to give
 		 * pages a chance to be referenced before reclaiming. All
@@ -3619,6 +3695,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * progress in reclaiming pages
 		 */
 		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+		nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
+
+		/*
+		 * If reclaim made no progress for a boost, stop reclaim as
+		 * IO cannot be queued and it could be an infinite loop in
+		 * extreme circumstances.
+		 */
+		if (nr_boost_reclaim && !nr_reclaimed)
+			break;
+
 		if (raise_priority || !nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1);
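Taken with the earlier hunks, the loop now consumes nr_boost_reclaim as pages are freed and bails out of a boosted run that makes no progress, since boosted reclaim may not queue writeback or swap. A rough, self-contained model of that accounting (the per-pass reclaim figures are invented; only the control flow mirrors the patch):

#include <stdio.h>

#define DEF_PRIORITY 12

/* Pretend reclaim: lower priority values scan harder and free more. */
static unsigned long fake_reclaim_pass(int priority)
{
	return (unsigned long)(DEF_PRIORITY - priority + 1) * 8;
}

int main(void)
{
	unsigned long nr_boost_reclaim = 64;	/* stand-in for the summed zone boosts */
	int priority = DEF_PRIORITY;

	while (priority >= 1) {
		int raise_priority = 1;
		unsigned long nr_reclaimed = fake_reclaim_pass(priority);

		/* Cap the boost priority, as the patch does at DEF_PRIORITY - 2. */
		if (nr_boost_reclaim && priority == DEF_PRIORITY - 2)
			raise_priority = 0;

		/* Consume the boost by however much this pass reclaimed. */
		nr_boost_reclaim -= nr_boost_reclaim < nr_reclaimed ?
					nr_boost_reclaim : nr_reclaimed;

		printf("prio %2d: reclaimed %3lu, boost left %3lu\n",
		       priority, nr_reclaimed, nr_boost_reclaim);

		/* Boosted reclaim aborts when a pass frees nothing. */
		if (nr_boost_reclaim && !nr_reclaimed)
			break;

		/* Boost satisfied; the real code re-checks watermarks instead. */
		if (!nr_boost_reclaim)
			break;

		if (raise_priority || !nr_reclaimed)
			priority--;
	}
	return 0;
}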
@@ -3627,6 +3713,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		pgdat->kswapd_failures++;
 
 out:
+	/* If reclaim was boosted, account for the reclaim done in this pass */
+	if (boosted) {
+		unsigned long flags;
+
+		for (i = 0; i <= classzone_idx; i++) {
+			if (!zone_boosts[i])
+				continue;
+
+			/* Increments are under the zone lock */
+			zone = pgdat->node_zones + i;
+			spin_lock_irqsave(&zone->lock, flags);
+			zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
+			spin_unlock_irqrestore(&zone->lock, flags);
+		}
+
+		/*
+		 * As there is now likely space, wakeup kcompactd to defragment
+		 * pageblocks.
+		 */
+		wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+	}
+
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release();
 	psi_memstall_leave(&pflags);
@@ -3855,7 +3963,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
 	/* Hopeless node, leave it to direct reclaim if possible */
 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
-	    pgdat_balanced(pgdat, order, classzone_idx)) {
+	    (pgdat_balanced(pgdat, order, classzone_idx) &&
+	     !pgdat_watermark_boosted(pgdat, classzone_idx))) {
 		/*
 		 * There may be plenty of free memory available, but it's too
 		 * fragmented for high-order allocations. Wake up kcompactd