Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 118
 1 file changed, 69 insertions(+), 49 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c6d5f64feca..f3e0c69a97b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1864,14 +1864,14 @@ int move_freepages(struct zone *zone,
 #endif
 
 	for (page = start_page; page <= end_page;) {
-		/* Make sure we are not inadvertently changing nodes */
-		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
 		if (!pfn_valid_within(page_to_pfn(page))) {
 			page++;
 			continue;
 		}
 
+		/* Make sure we are not inadvertently changing nodes */
+		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+
 		if (!PageBuddy(page)) {
 			page++;
 			continue;
@@ -2583,30 +2583,22 @@ int __isolate_free_page(struct page *page, unsigned int order)
  * Update NUMA hit/miss statistics
  *
  * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
  */
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
-								gfp_t flags)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 {
 #ifdef CONFIG_NUMA
-	int local_nid = numa_node_id();
 	enum zone_stat_item local_stat = NUMA_LOCAL;
 
-	if (unlikely(flags & __GFP_OTHER_NODE)) {
+	if (z->node != numa_node_id())
 		local_stat = NUMA_OTHER;
-		local_nid = preferred_zone->node;
-	}
 
-	if (z->node == local_nid) {
+	if (z->node == preferred_zone->node)
 		__inc_zone_state(z, NUMA_HIT);
-		__inc_zone_state(z, local_stat);
-	} else {
+	else {
 		__inc_zone_state(z, NUMA_MISS);
 		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
 	}
+	__inc_zone_state(z, local_stat);
 #endif
 }
 
@@ -2674,7 +2666,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 	}
 
 	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-	zone_statistics(preferred_zone, zone, gfp_flags);
+	zone_statistics(preferred_zone, zone);
 	local_irq_restore(flags);
 
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
@@ -3531,12 +3523,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	unsigned int alloc_flags;
 	unsigned long did_some_progress;
-	enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
+	enum compact_priority compact_priority;
 	enum compact_result compact_result;
-	int compaction_retries = 0;
-	int no_progress_loops = 0;
+	int compaction_retries;
+	int no_progress_loops;
 	unsigned long alloc_start = jiffies;
 	unsigned int stall_timeout = 10 * HZ;
+	unsigned int cpuset_mems_cookie;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3557,6 +3550,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
 		gfp_mask &= ~__GFP_ATOMIC;
 
+retry_cpuset:
+	compaction_retries = 0;
+	no_progress_loops = 0;
+	compact_priority = DEF_COMPACT_PRIORITY;
+	cpuset_mems_cookie = read_mems_allowed_begin();
+	/*
+	 * We need to recalculate the starting point for the zonelist iterator
+	 * because we might have used different nodemask in the fast path, or
+	 * there was a cpuset modification and we are retrying - otherwise we
+	 * could end up iterating over non-eligible zones endlessly.
+	 */
+	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+					ac->high_zoneidx, ac->nodemask);
+	if (!ac->preferred_zoneref->zone)
+		goto nopage;
+
+
 	/*
 	 * The fast path uses conservative alloc_flags to succeed only until
 	 * kswapd needs to be woken up, and to avoid the cost of setting up
@@ -3716,6 +3726,13 @@ retry:
 					&compaction_retries))
 		goto retry;
 
+	/*
+	 * It's possible we raced with cpuset update so the OOM would be
+	 * premature (see below the nopage: label for full explanation).
+	 */
+	if (read_mems_allowed_retry(cpuset_mems_cookie))
+		goto retry_cpuset;
+
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)
@@ -3728,6 +3745,16 @@ retry:
 	}
 
 nopage:
+	/*
+	 * When updating a task's mems_allowed or mempolicy nodemask, it is
+	 * possible to race with parallel threads in such a way that our
+	 * allocation can fail while the mask is being updated. If we are about
+	 * to fail, check if the cpuset changed during allocation and if so,
+	 * retry.
+	 */
+	if (read_mems_allowed_retry(cpuset_mems_cookie))
+		goto retry_cpuset;
+
 	warn_alloc(gfp_mask,
 			"page allocation failure: order:%u", order);
 got_pg:
@@ -3742,7 +3769,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 			struct zonelist *zonelist, nodemask_t *nodemask)
 {
 	struct page *page;
-	unsigned int cpuset_mems_cookie;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
 	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = {
@@ -3779,9 +3805,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 
-retry_cpuset:
-	cpuset_mems_cookie = read_mems_allowed_begin();
-
 	/* Dirty zone balancing only done in the fast path */
 	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
@@ -3792,8 +3815,13 @@ retry_cpuset:
 	 */
 	ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
 					ac.high_zoneidx, ac.nodemask);
-	if (!ac.preferred_zoneref) {
+	if (!ac.preferred_zoneref->zone) {
 		page = NULL;
+		/*
+		 * This might be due to race with cpuset_current_mems_allowed
+		 * update, so make sure we retry with original nodemask in the
+		 * slow path.
+		 */
 		goto no_zone;
 	}
 
@@ -3802,6 +3830,7 @@ retry_cpuset:
 	if (likely(page))
 		goto out;
 
+no_zone:
 	/*
 	 * Runtime PM, block IO and its error handling path can deadlock
 	 * because I/O on the device might not complete.
@@ -3813,21 +3842,10 @@ retry_cpuset:
 	 * Restore the original nodemask if it was potentially replaced with
 	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
 	 */
-	if (cpusets_enabled())
+	if (unlikely(ac.nodemask != nodemask))
 		ac.nodemask = nodemask;
-	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
-no_zone:
-	/*
-	 * When updating a task's mems_allowed, it is possible to race with
-	 * parallel threads in such a way that an allocation can fail while
-	 * the mask is being updated. If a page allocation is about to fail,
-	 * check if the cpuset changed during allocation and if so, retry.
-	 */
-	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
-		alloc_mask = gfp_mask;
-		goto retry_cpuset;
-	}
+	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
 out:
 	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
@@ -3904,8 +3922,8 @@ EXPORT_SYMBOL(free_pages);
  * drivers to provide a backing region of memory for use as either an
  * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
  */
-static struct page *__page_frag_refill(struct page_frag_cache *nc,
-				       gfp_t gfp_mask)
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+					     gfp_t gfp_mask)
 {
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
@@ -3925,22 +3943,23 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
 	return page;
 }
 
-void __page_frag_drain(struct page *page, unsigned int order,
-		       unsigned int count)
+void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
 	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
 
 	if (page_ref_sub_and_test(page, count)) {
+		unsigned int order = compound_order(page);
+
 		if (order == 0)
 			free_hot_cold_page(page, false);
 		else
 			__free_pages_ok(page, order);
 	}
 }
-EXPORT_SYMBOL(__page_frag_drain);
+EXPORT_SYMBOL(__page_frag_cache_drain);
 
-void *__alloc_page_frag(struct page_frag_cache *nc,
-			unsigned int fragsz, gfp_t gfp_mask)
+void *page_frag_alloc(struct page_frag_cache *nc,
+		      unsigned int fragsz, gfp_t gfp_mask)
 {
 	unsigned int size = PAGE_SIZE;
 	struct page *page;
@@ -3948,7 +3967,7 @@ void *__alloc_page_frag(struct page_frag_cache *nc,
 
 	if (unlikely(!nc->va)) {
 refill:
-		page = __page_frag_refill(nc, gfp_mask);
+		page = __page_frag_cache_refill(nc, gfp_mask);
 		if (!page)
 			return NULL;
 
@@ -3991,19 +4010,19 @@ refill:
 
 	return nc->va + offset;
 }
-EXPORT_SYMBOL(__alloc_page_frag);
+EXPORT_SYMBOL(page_frag_alloc);
 
 /*
  * Frees a page fragment allocated out of either a compound or order 0 page.
  */
-void __free_page_frag(void *addr)
+void page_frag_free(void *addr)
 {
 	struct page *page = virt_to_head_page(addr);
 
 	if (unlikely(put_page_testzero(page)))
 		__free_pages_ok(page, compound_order(page));
 }
-EXPORT_SYMBOL(__free_page_frag);
+EXPORT_SYMBOL(page_frag_free);
 
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
 			      size_t size)
@@ -7255,6 +7274,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.zone = page_zone(pfn_to_page(start)),
 		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
+		.gfp_mask = GFP_KERNEL,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
 
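
The page-fragment hunks above rename the allocator entry points: __page_frag_refill becomes __page_frag_cache_refill, __page_frag_drain becomes __page_frag_cache_drain (now taking only a reference count and deriving the order from compound_order()), __alloc_page_frag becomes page_frag_alloc, and __free_page_frag becomes page_frag_free. A minimal caller sketch against the renamed API follows; the cache variable, the 256-byte fragment size, and the GFP flags are illustrative assumptions, not part of this diff.

/* Sketch only: exercises the renamed page_frag API from this diff.
 * example_cache, the fragment size, and GFP_ATOMIC are illustrative
 * assumptions for a driver-style caller.
 */
#include <linux/gfp.h>
#include <linux/mm_types.h>

static struct page_frag_cache example_cache;	/* hypothetical cache */

static void *example_get_frag(void)
{
	/* Carve a small fragment out of the cache; page_frag_alloc()
	 * refills the backing page (via __page_frag_cache_refill())
	 * when the cache runs empty and returns NULL on failure. */
	return page_frag_alloc(&example_cache, 256, GFP_ATOMIC);
}

static void example_put_frag(void *buf)
{
	/* Drop one reference on the backing order-0 or compound page. */
	page_frag_free(buf);
}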