Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  118
1 file changed, 69 insertions, 49 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c6d5f64feca..f3e0c69a97b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1864,14 +1864,14 @@ int move_freepages(struct zone *zone,
 #endif
 
        for (page = start_page; page <= end_page;) {
-               /* Make sure we are not inadvertently changing nodes */
-               VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
                if (!pfn_valid_within(page_to_pfn(page))) {
                        page++;
                        continue;
                }
 
+               /* Make sure we are not inadvertently changing nodes */
+               VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+
                if (!PageBuddy(page)) {
                        page++;
                        continue;
@@ -2583,30 +2583,22 @@ int __isolate_free_page(struct page *page, unsigned int order)
  * Update NUMA hit/miss statistics
  *
  * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
  */
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
-                                                               gfp_t flags)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 {
 #ifdef CONFIG_NUMA
-       int local_nid = numa_node_id();
        enum zone_stat_item local_stat = NUMA_LOCAL;
 
-       if (unlikely(flags & __GFP_OTHER_NODE)) {
+       if (z->node != numa_node_id())
                local_stat = NUMA_OTHER;
-               local_nid = preferred_zone->node;
-       }
 
-       if (z->node == local_nid) {
+       if (z->node == preferred_zone->node)
                __inc_zone_state(z, NUMA_HIT);
-               __inc_zone_state(z, local_stat);
-       } else {
+       else {
                __inc_zone_state(z, NUMA_MISS);
                __inc_zone_state(preferred_zone, NUMA_FOREIGN);
        }
+       __inc_zone_state(z, local_stat);
 #endif
 }
 
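With __GFP_OTHER_NODE gone, the hunk above makes zone_statistics() pick NUMA_LOCAL vs. NUMA_OTHER purely from whether the zone's node matches the node the caller is currently running on, while NUMA_HIT/NUMA_MISS/NUMA_FOREIGN still compare against the preferred zone. A minimal userspace sketch of that accounting follows; plain ints stand in for struct zone and a small array for the vmstat counters, and zone_statistics_model(), counters[] and the example in main() are invented for the illustration.

/* Userspace model of the reworked accounting above (not kernel code). */
#include <stdio.h>

enum { NUMA_HIT, NUMA_MISS, NUMA_FOREIGN, NUMA_LOCAL, NUMA_OTHER, NR_STATS };

static const char *name[NR_STATS] = {
        "NUMA_HIT", "NUMA_MISS", "NUMA_FOREIGN", "NUMA_LOCAL", "NUMA_OTHER"
};
static unsigned long counters[4][NR_STATS];     /* per-node counters */

/* Mirrors the post-patch logic: hit/miss vs. the preferred node,
 * local/other vs. the node the caller happens to be running on. */
static void zone_statistics_model(int preferred_node, int alloc_node,
                                  int running_node)
{
        int local_stat = (alloc_node != running_node) ? NUMA_OTHER : NUMA_LOCAL;

        if (alloc_node == preferred_node)
                counters[alloc_node][NUMA_HIT]++;
        else {
                counters[alloc_node][NUMA_MISS]++;
                counters[preferred_node][NUMA_FOREIGN]++;
        }
        counters[alloc_node][local_stat]++;
}

int main(void)
{
        /* kswapd-style case: running on node 0, preferred node 1, page came
         * from node 1 -> counted as NUMA_HIT and NUMA_OTHER on node 1. */
        zone_statistics_model(1, 1, 0);

        for (int n = 0; n < 2; n++)
                for (int s = 0; s < NR_STATS; s++)
                        if (counters[n][s])
                                printf("node %d: %s = %lu\n",
                                       n, name[s], counters[n][s]);
        return 0;
}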
@@ -2674,7 +2666,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
        }
 
        __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-       zone_statistics(preferred_zone, zone, gfp_flags);
+       zone_statistics(preferred_zone, zone);
        local_irq_restore(flags);
 
        VM_BUG_ON_PAGE(bad_range(zone, page), page);
@@ -3531,12 +3523,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        struct page *page = NULL;
        unsigned int alloc_flags;
        unsigned long did_some_progress;
-       enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
+       enum compact_priority compact_priority;
        enum compact_result compact_result;
-       int compaction_retries = 0;
-       int no_progress_loops = 0;
+       int compaction_retries;
+       int no_progress_loops;
        unsigned long alloc_start = jiffies;
        unsigned int stall_timeout = 10 * HZ;
+       unsigned int cpuset_mems_cookie;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -3557,6 +3550,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                gfp_mask &= ~__GFP_ATOMIC;
 
+retry_cpuset:
+       compaction_retries = 0;
+       no_progress_loops = 0;
+       compact_priority = DEF_COMPACT_PRIORITY;
+       cpuset_mems_cookie = read_mems_allowed_begin();
+       /*
+        * We need to recalculate the starting point for the zonelist iterator
+        * because we might have used different nodemask in the fast path, or
+        * there was a cpuset modification and we are retrying - otherwise we
+        * could end up iterating over non-eligible zones endlessly.
+        */
+       ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+                                       ac->high_zoneidx, ac->nodemask);
+       if (!ac->preferred_zoneref->zone)
+               goto nopage;
+
+
        /*
         * The fast path uses conservative alloc_flags to succeed only until
         * kswapd needs to be woken up, and to avoid the cost of setting up
@@ -3716,6 +3726,13 @@ retry:
                                 &compaction_retries))
                goto retry;
 
+       /*
+        * It's possible we raced with cpuset update so the OOM would be
+        * premature (see below the nopage: label for full explanation).
+        */
+       if (read_mems_allowed_retry(cpuset_mems_cookie))
+               goto retry_cpuset;
+
        /* Reclaim has failed us, start killing things */
        page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
        if (page)
@@ -3728,6 +3745,16 @@ retry:
        }
 
 nopage:
+       /*
+        * When updating a task's mems_allowed or mempolicy nodemask, it is
+        * possible to race with parallel threads in such a way that our
+        * allocation can fail while the mask is being updated. If we are about
+        * to fail, check if the cpuset changed during allocation and if so,
+        * retry.
+        */
+       if (read_mems_allowed_retry(cpuset_mems_cookie))
+               goto retry_cpuset;
+
        warn_alloc(gfp_mask,
                        "page allocation failure: order:%u", order);
 got_pg:
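The retry_cpuset handling added in the hunks above follows the usual read_mems_allowed_begin()/read_mems_allowed_retry() pattern: sample a cookie before walking the zonelist, and if the allocation is about to OOM or fail, re-check the cookie and restart the slowpath when a cpuset update raced with us. A self-contained userspace model of that retry shape is sketched below; the *_stub() helpers, mems_seq and try_alloc() are stand-ins invented for the sketch, not kernel interfaces.

/* Standalone model of the "sample cookie, retry on a stale mask" pattern. */
#include <stdbool.h>
#include <stdio.h>

static unsigned int mems_seq;                 /* bumped by a "cpuset update" */

static unsigned int read_mems_allowed_begin_stub(void)
{
        return mems_seq;
}

static bool read_mems_allowed_retry_stub(unsigned int cookie)
{
        return cookie != mems_seq;            /* did the mask change under us? */
}

static void *try_alloc(int attempt)
{
        if (attempt == 0) {
                mems_seq++;                   /* simulate a racing cpuset update */
                return NULL;                  /* ...which made this attempt fail */
        }
        return "page";
}

int main(void)
{
        void *page;
        int attempt = 0;
        unsigned int cookie;

retry_cpuset:
        cookie = read_mems_allowed_begin_stub();

        page = try_alloc(attempt++);
        if (!page && read_mems_allowed_retry_stub(cookie))
                goto retry_cpuset;            /* don't fail/OOM on a stale mask */

        printf("%s after %d attempt(s)\n",
               page ? "allocated" : "failed", attempt);
        return 0;
}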
@@ -3742,7 +3769,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                        struct zonelist *zonelist, nodemask_t *nodemask)
 {
        struct page *page;
-       unsigned int cpuset_mems_cookie;
        unsigned int alloc_flags = ALLOC_WMARK_LOW;
        gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
        struct alloc_context ac = {
@@ -3779,9 +3805,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
 
-retry_cpuset:
-       cpuset_mems_cookie = read_mems_allowed_begin();
-
        /* Dirty zone balancing only done in the fast path */
        ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
@@ -3792,8 +3815,13 @@ retry_cpuset:
         */
        ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
                                        ac.high_zoneidx, ac.nodemask);
-       if (!ac.preferred_zoneref) {
+       if (!ac.preferred_zoneref->zone) {
                page = NULL;
+               /*
+                * This might be due to race with cpuset_current_mems_allowed
+                * update, so make sure we retry with original nodemask in the
+                * slow path.
+                */
                goto no_zone;
        }
 
@@ -3802,6 +3830,7 @@ retry_cpuset:
        if (likely(page))
                goto out;
 
+no_zone:
        /*
         * Runtime PM, block IO and its error handling path can deadlock
         * because I/O on the device might not complete.
@@ -3813,21 +3842,10 @@ retry_cpuset:
         * Restore the original nodemask if it was potentially replaced with
         * &cpuset_current_mems_allowed to optimize the fast-path attempt.
         */
-       if (cpusets_enabled())
+       if (unlikely(ac.nodemask != nodemask))
                ac.nodemask = nodemask;
-       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
-no_zone:
-       /*
-        * When updating a task's mems_allowed, it is possible to race with
-        * parallel threads in such a way that an allocation can fail while
-        * the mask is being updated. If a page allocation is about to fail,
-        * check if the cpuset changed during allocation and if so, retry.
-        */
-       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
-               alloc_mask = gfp_mask;
-               goto retry_cpuset;
-       }
+       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
 out:
        if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
@@ -3904,8 +3922,8 @@ EXPORT_SYMBOL(free_pages);
  * drivers to provide a backing region of memory for use as either an
  * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
  */
-static struct page *__page_frag_refill(struct page_frag_cache *nc,
-                                      gfp_t gfp_mask)
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+                                            gfp_t gfp_mask)
 {
        struct page *page = NULL;
        gfp_t gfp = gfp_mask;
@@ -3925,22 +3943,23 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
        return page;
 }
 
-void __page_frag_drain(struct page *page, unsigned int order,
-                      unsigned int count)
+void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
        VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
 
        if (page_ref_sub_and_test(page, count)) {
+               unsigned int order = compound_order(page);
+
                if (order == 0)
                        free_hot_cold_page(page, false);
                else
                        __free_pages_ok(page, order);
        }
 }
-EXPORT_SYMBOL(__page_frag_drain);
+EXPORT_SYMBOL(__page_frag_cache_drain);
 
-void *__alloc_page_frag(struct page_frag_cache *nc,
-                       unsigned int fragsz, gfp_t gfp_mask)
+void *page_frag_alloc(struct page_frag_cache *nc,
+                     unsigned int fragsz, gfp_t gfp_mask)
 {
        unsigned int size = PAGE_SIZE;
        struct page *page;
@@ -3948,7 +3967,7 @@ void *__alloc_page_frag(struct page_frag_cache *nc,
 
        if (unlikely(!nc->va)) {
 refill:
-               page = __page_frag_refill(nc, gfp_mask);
+               page = __page_frag_cache_refill(nc, gfp_mask);
                if (!page)
                        return NULL;
 
@@ -3991,19 +4010,19 @@ refill:
 
        return nc->va + offset;
 }
-EXPORT_SYMBOL(__alloc_page_frag);
+EXPORT_SYMBOL(page_frag_alloc);
 
 /*
  * Frees a page fragment allocated out of either a compound or order 0 page.
  */
-void __free_page_frag(void *addr)
+void page_frag_free(void *addr)
 {
        struct page *page = virt_to_head_page(addr);
 
        if (unlikely(put_page_testzero(page)))
                __free_pages_ok(page, compound_order(page));
 }
-EXPORT_SYMBOL(__free_page_frag);
+EXPORT_SYMBOL(page_frag_free);
 
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
                                                size_t size)
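After the rename above, the page-fragment helpers are the non-underscored page_frag_alloc()/page_frag_free(), with __page_frag_cache_drain() for callers that batch reference drops. A caller-side sketch follows (kernel context assumed; the wrapper names grab_small_buffer/release_small_buffer and the static cache are made up for illustration, and real users typically keep the cache per-CPU and check for allocation failure).

#include <linux/gfp.h>
#include <linux/mm_types.h>

static struct page_frag_cache frag_cache;      /* must start zeroed */

static void *grab_small_buffer(unsigned int fragsz)
{
        /* Carves fragsz bytes out of the cache, refilling from the
         * page allocator when the current backing page is exhausted. */
        return page_frag_alloc(&frag_cache, fragsz, GFP_ATOMIC);
}

static void release_small_buffer(void *buf)
{
        /* Drops one reference on the backing (possibly compound) page. */
        page_frag_free(buf);
}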
@@ -7255,6 +7274,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
                .zone = page_zone(pfn_to_page(start)),
                .mode = MIGRATE_SYNC,
                .ignore_skip_hint = true,
+               .gfp_mask = GFP_KERNEL,
        };
        INIT_LIST_HEAD(&cc.migratepages);
 