path: root/mm/page_alloc.c
author     Dmitry Torokhov <dmitry.torokhov@gmail.com>    2015-02-10 14:35:36 -0500
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>    2015-02-10 14:35:36 -0500
commit     4ba24fef3eb3b142197135223b90ced2f319cd53 (patch)
tree       a20c125b27740ec7b4c761b11d801108e1b316b2 /mm/page_alloc.c
parent     47c1ffb2b6b630894e9a16442611c056ab21c057 (diff)
parent     98a4a59ee31a12105a2b84f5b8b515ac2cb208ef (diff)
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.20.
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- mm/page_alloc.c | 676
1 file changed, 333 insertions(+), 343 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eee961958021..7633c503a116 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,19 +48,18 @@
48#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h> 49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h> 50#include <linux/page-isolation.h>
51#include <linux/page_cgroup.h> 51#include <linux/page_ext.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/compaction.h> 54#include <linux/compaction.h>
55#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
56#include <linux/ftrace_event.h>
57#include <linux/memcontrol.h>
58#include <linux/prefetch.h> 56#include <linux/prefetch.h>
59#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
60#include <linux/migrate.h> 58#include <linux/migrate.h>
61#include <linux/page-debug-flags.h> 59#include <linux/page_ext.h>
62#include <linux/hugetlb.h> 60#include <linux/hugetlb.h>
63#include <linux/sched/rt.h> 61#include <linux/sched/rt.h>
62#include <linux/page_owner.h>
64 63
65#include <asm/sections.h> 64#include <asm/sections.h>
66#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
@@ -85,6 +84,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node);
85 */ 84 */
86DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 85DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87EXPORT_PER_CPU_SYMBOL(_numa_mem_); 86EXPORT_PER_CPU_SYMBOL(_numa_mem_);
87int _node_numa_mem_[MAX_NUMNODES];
88#endif 88#endif
89 89
90/* 90/*
@@ -111,6 +111,7 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112unsigned long totalram_pages __read_mostly; 112unsigned long totalram_pages __read_mostly;
113unsigned long totalreserve_pages __read_mostly; 113unsigned long totalreserve_pages __read_mostly;
114unsigned long totalcma_pages __read_mostly;
114/* 115/*
115 * When calculating the number of globally allowed dirty pages, there 116 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 117 * is a certain number of per-zone reserves that should not be
@@ -426,6 +427,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order,
426 427
427#ifdef CONFIG_DEBUG_PAGEALLOC 428#ifdef CONFIG_DEBUG_PAGEALLOC
428unsigned int _debug_guardpage_minorder; 429unsigned int _debug_guardpage_minorder;
430bool _debug_pagealloc_enabled __read_mostly;
431bool _debug_guardpage_enabled __read_mostly;
432
433static int __init early_debug_pagealloc(char *buf)
434{
435 if (!buf)
436 return -EINVAL;
437
438 if (strcmp(buf, "on") == 0)
439 _debug_pagealloc_enabled = true;
440
441 return 0;
442}
443early_param("debug_pagealloc", early_debug_pagealloc);
444
445static bool need_debug_guardpage(void)
446{
447 /* If we don't use debug_pagealloc, we don't need guard page */
448 if (!debug_pagealloc_enabled())
449 return false;
450
451 return true;
452}
453
454static void init_debug_guardpage(void)
455{
456 if (!debug_pagealloc_enabled())
457 return;
458
459 _debug_guardpage_enabled = true;
460}
461
462struct page_ext_operations debug_guardpage_ops = {
463 .need = need_debug_guardpage,
464 .init = init_debug_guardpage,
465};
429 466
430static int __init debug_guardpage_minorder_setup(char *buf) 467static int __init debug_guardpage_minorder_setup(char *buf)
431{ 468{
@@ -441,18 +478,44 @@ static int __init debug_guardpage_minorder_setup(char *buf)
441} 478}
442__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 479__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
443 480
444static inline void set_page_guard_flag(struct page *page) 481static inline void set_page_guard(struct zone *zone, struct page *page,
482 unsigned int order, int migratetype)
445{ 483{
446 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 484 struct page_ext *page_ext;
485
486 if (!debug_guardpage_enabled())
487 return;
488
489 page_ext = lookup_page_ext(page);
490 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
491
492 INIT_LIST_HEAD(&page->lru);
493 set_page_private(page, order);
494 /* Guard pages are not available for any usage */
495 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
447} 496}
448 497
449static inline void clear_page_guard_flag(struct page *page) 498static inline void clear_page_guard(struct zone *zone, struct page *page,
499 unsigned int order, int migratetype)
450{ 500{
451 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 501 struct page_ext *page_ext;
502
503 if (!debug_guardpage_enabled())
504 return;
505
506 page_ext = lookup_page_ext(page);
507 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
508
509 set_page_private(page, 0);
510 if (!is_migrate_isolate(migratetype))
511 __mod_zone_freepage_state(zone, (1 << order), migratetype);
452} 512}
453#else 513#else
454static inline void set_page_guard_flag(struct page *page) { } 514struct page_ext_operations debug_guardpage_ops = { NULL, };
455static inline void clear_page_guard_flag(struct page *page) { } 515static inline void set_page_guard(struct zone *zone, struct page *page,
516 unsigned int order, int migratetype) {}
517static inline void clear_page_guard(struct zone *zone, struct page *page,
518 unsigned int order, int migratetype) {}
456#endif 519#endif
457 520
458static inline void set_page_order(struct page *page, unsigned int order) 521static inline void set_page_order(struct page *page, unsigned int order)
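
Note on the hunk above: the guard-page bookkeeping moves from a dedicated word in struct page (page->debug_flags, PAGE_DEBUG_FLAG_GUARD) to a bit in a separate per-page struct page_ext record reached through lookup_page_ext(), and the freepage accounting is folded into the new set_page_guard()/clear_page_guard() helpers. Below is a minimal userspace C sketch of that side-table pattern only; page_rec, page_ext_rec and lookup_ext() are made-up illustrative names, not kernel APIs.

#include <stdio.h>

#define NPAGES 8
#define EXT_DEBUG_GUARD 0   /* bit index, analogous to PAGE_EXT_DEBUG_GUARD */

struct page_rec     { int order; };           /* stand-in for struct page      */
struct page_ext_rec { unsigned long flags; }; /* stand-in for struct page_ext  */

static struct page_rec     pages[NPAGES];
static struct page_ext_rec page_ext[NPAGES];  /* side table, one entry per page */

/* analogous to lookup_page_ext(): map a page to its side-table entry */
static struct page_ext_rec *lookup_ext(struct page_rec *page)
{
        return &page_ext[page - pages];
}

static void set_guard(struct page_rec *page)
{
        lookup_ext(page)->flags |= 1UL << EXT_DEBUG_GUARD;
}

static void clear_guard(struct page_rec *page)
{
        lookup_ext(page)->flags &= ~(1UL << EXT_DEBUG_GUARD);
}

int main(void)
{
        set_guard(&pages[3]);
        printf("page 3 guard bit: %lu\n",
               (lookup_ext(&pages[3])->flags >> EXT_DEBUG_GUARD) & 1);
        clear_guard(&pages[3]);
        printf("page 3 guard bit: %lu\n",
               (lookup_ext(&pages[3])->flags >> EXT_DEBUG_GUARD) & 1);
        return 0;
}
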
@@ -468,29 +531,6 @@ static inline void rmv_page_order(struct page *page)
468} 531}
469 532
470/* 533/*
471 * Locate the struct page for both the matching buddy in our
472 * pair (buddy1) and the combined O(n+1) page they form (page).
473 *
474 * 1) Any buddy B1 will have an order O twin B2 which satisfies
475 * the following equation:
476 * B2 = B1 ^ (1 << O)
477 * For example, if the starting buddy (buddy2) is #8 its order
478 * 1 buddy is #10:
479 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
480 *
481 * 2) Any buddy B will have an order O+1 parent P which
482 * satisfies the following equation:
483 * P = B & ~(1 << O)
484 *
485 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
486 */
487static inline unsigned long
488__find_buddy_index(unsigned long page_idx, unsigned int order)
489{
490 return page_idx ^ (1 << order);
491}
492
493/*
494 * This function checks whether a page is free && is the buddy 534 * This function checks whether a page is free && is the buddy
495 * we can do coalesce a page and its buddy if 535 * we can do coalesce a page and its buddy if
496 * (a) the buddy is not in a hole && 536 * (a) the buddy is not in a hole &&
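
Note: the comment deleted above documents the buddy arithmetic the allocator keeps using (the calls to __find_buddy_index() further down in this diff remain), namely B2 = B1 ^ (1 << O) for the order-O buddy and P = B & ~(1 << O) for the combined order-(O+1) block. A small self-contained C check of those two identities, using plain indices:

#include <stdio.h>

/* B2 = B1 ^ (1 << O): index of the order-O buddy of a page index */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

/* P = B & ~(1 << O): index of the combined order-(O+1) block */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);
}

int main(void)
{
        /* the example from the removed comment: page #8, order 1 -> buddy #10 */
        printf("buddy of 8 at order 1:   %lu\n", find_buddy_index(8, 1));  /* 10 */
        printf("parent of 8 at order 1:  %lu\n", combined_index(8, 1));    /* 8  */
        printf("parent of 10 at order 1: %lu\n", combined_index(10, 1));   /* 8  */
        return 0;
}
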
@@ -570,6 +610,7 @@ static inline void __free_one_page(struct page *page,
570 unsigned long combined_idx; 610 unsigned long combined_idx;
571 unsigned long uninitialized_var(buddy_idx); 611 unsigned long uninitialized_var(buddy_idx);
572 struct page *buddy; 612 struct page *buddy;
613 int max_order = MAX_ORDER;
573 614
574 VM_BUG_ON(!zone_is_initialized(zone)); 615 VM_BUG_ON(!zone_is_initialized(zone));
575 616
@@ -578,13 +619,24 @@ static inline void __free_one_page(struct page *page,
578 return; 619 return;
579 620
580 VM_BUG_ON(migratetype == -1); 621 VM_BUG_ON(migratetype == -1);
622 if (is_migrate_isolate(migratetype)) {
623 /*
624 * We restrict max order of merging to prevent merge
625 * between freepages on isolate pageblock and normal
626 * pageblock. Without this, pageblock isolation
627 * could cause incorrect freepage accounting.
628 */
629 max_order = min(MAX_ORDER, pageblock_order + 1);
630 } else {
631 __mod_zone_freepage_state(zone, 1 << order, migratetype);
632 }
581 633
582 page_idx = pfn & ((1 << MAX_ORDER) - 1); 634 page_idx = pfn & ((1 << max_order) - 1);
583 635
584 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); 636 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
585 VM_BUG_ON_PAGE(bad_range(zone, page), page); 637 VM_BUG_ON_PAGE(bad_range(zone, page), page);
586 638
587 while (order < MAX_ORDER-1) { 639 while (order < max_order - 1) {
588 buddy_idx = __find_buddy_index(page_idx, order); 640 buddy_idx = __find_buddy_index(page_idx, order);
589 buddy = page + (buddy_idx - page_idx); 641 buddy = page + (buddy_idx - page_idx);
590 if (!page_is_buddy(page, buddy, order)) 642 if (!page_is_buddy(page, buddy, order))
@@ -594,10 +646,7 @@ static inline void __free_one_page(struct page *page,
594 * merge with it and move up one order. 646 * merge with it and move up one order.
595 */ 647 */
596 if (page_is_guard(buddy)) { 648 if (page_is_guard(buddy)) {
597 clear_page_guard_flag(buddy); 649 clear_page_guard(zone, buddy, order, migratetype);
598 set_page_private(page, 0);
599 __mod_zone_freepage_state(zone, 1 << order,
600 migratetype);
601 } else { 650 } else {
602 list_del(&buddy->lru); 651 list_del(&buddy->lru);
603 zone->free_area[order].nr_free--; 652 zone->free_area[order].nr_free--;
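
Note: the changes above clamp merging to max_order = min(MAX_ORDER, pageblock_order + 1) when freeing into an isolated pageblock, so a free block can never coalesce past the pageblock boundary and skew the freepage accounting. A tiny standalone illustration of how that bound cuts the merge chain short; MAX_ORDER and PAGEBLOCK_ORDER here are example values, and every buddy is assumed free:

#include <stdio.h>

#define MAX_ORDER       11  /* example value */
#define PAGEBLOCK_ORDER  9  /* example value (2MB pageblocks with 4K pages) */

static unsigned int highest_merge_order(unsigned int order, int isolated)
{
        /* max_order = min(MAX_ORDER, pageblock_order + 1) for isolated blocks */
        unsigned int max_order = MAX_ORDER;

        if (isolated)
                max_order = PAGEBLOCK_ORDER + 1 < MAX_ORDER ?
                            PAGEBLOCK_ORDER + 1 : MAX_ORDER;

        /* mirrors "while (order < max_order - 1)", assuming every buddy is free */
        while (order < max_order - 1)
                order++;
        return order;
}

int main(void)
{
        printf("normal pageblock:   merges up to order %u\n", highest_merge_order(0, 0));
        printf("isolated pageblock: merges up to order %u\n", highest_merge_order(0, 1));
        return 0;
}
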
@@ -651,8 +700,10 @@ static inline int free_pages_check(struct page *page)
651 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 700 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
652 bad_flags = PAGE_FLAGS_CHECK_AT_FREE; 701 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
653 } 702 }
654 if (unlikely(mem_cgroup_bad_page_check(page))) 703#ifdef CONFIG_MEMCG
655 bad_reason = "cgroup check failed"; 704 if (unlikely(page->mem_cgroup))
705 bad_reason = "page still charged to cgroup";
706#endif
656 if (unlikely(bad_reason)) { 707 if (unlikely(bad_reason)) {
657 bad_page(page, bad_reason, bad_flags); 708 bad_page(page, bad_reason, bad_flags);
658 return 1; 709 return 1;
@@ -716,14 +767,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
716 /* must delete as __free_one_page list manipulates */ 767 /* must delete as __free_one_page list manipulates */
717 list_del(&page->lru); 768 list_del(&page->lru);
718 mt = get_freepage_migratetype(page); 769 mt = get_freepage_migratetype(page);
770 if (unlikely(has_isolate_pageblock(zone)))
771 mt = get_pageblock_migratetype(page);
772
719 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 773 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
720 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 774 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
721 trace_mm_page_pcpu_drain(page, 0, mt); 775 trace_mm_page_pcpu_drain(page, 0, mt);
722 if (likely(!is_migrate_isolate_page(page))) {
723 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
724 if (is_migrate_cma(mt))
725 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
726 }
727 } while (--to_free && --batch_free && !list_empty(list)); 776 } while (--to_free && --batch_free && !list_empty(list));
728 } 777 }
729 spin_unlock(&zone->lock); 778 spin_unlock(&zone->lock);
@@ -740,9 +789,11 @@ static void free_one_page(struct zone *zone,
740 if (nr_scanned) 789 if (nr_scanned)
741 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); 790 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
742 791
792 if (unlikely(has_isolate_pageblock(zone) ||
793 is_migrate_isolate(migratetype))) {
794 migratetype = get_pfnblock_migratetype(page, pfn);
795 }
743 __free_one_page(page, pfn, zone, order, migratetype); 796 __free_one_page(page, pfn, zone, order, migratetype);
744 if (unlikely(!is_migrate_isolate(migratetype)))
745 __mod_zone_freepage_state(zone, 1 << order, migratetype);
746 spin_unlock(&zone->lock); 797 spin_unlock(&zone->lock);
747} 798}
748 799
@@ -751,6 +802,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
751 int i; 802 int i;
752 int bad = 0; 803 int bad = 0;
753 804
805 VM_BUG_ON_PAGE(PageTail(page), page);
806 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page);
807
754 trace_mm_page_free(page, order); 808 trace_mm_page_free(page, order);
755 kmemcheck_free_shadow(page, order); 809 kmemcheck_free_shadow(page, order);
756 810
@@ -761,6 +815,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
761 if (bad) 815 if (bad)
762 return false; 816 return false;
763 817
818 reset_page_owner(page, order);
819
764 if (!PageHighMem(page)) { 820 if (!PageHighMem(page)) {
765 debug_check_no_locks_freed(page_address(page), 821 debug_check_no_locks_freed(page_address(page),
766 PAGE_SIZE << order); 822 PAGE_SIZE << order);
@@ -867,23 +923,18 @@ static inline void expand(struct zone *zone, struct page *page,
867 size >>= 1; 923 size >>= 1;
868 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 924 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
869 925
870#ifdef CONFIG_DEBUG_PAGEALLOC 926 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
871 if (high < debug_guardpage_minorder()) { 927 debug_guardpage_enabled() &&
928 high < debug_guardpage_minorder()) {
872 /* 929 /*
873 * Mark as guard pages (or page), that will allow to 930 * Mark as guard pages (or page), that will allow to
874 * merge back to allocator when buddy will be freed. 931 * merge back to allocator when buddy will be freed.
875 * Corresponding page table entries will not be touched, 932 * Corresponding page table entries will not be touched,
876 * pages will stay not present in virtual address space 933 * pages will stay not present in virtual address space
877 */ 934 */
878 INIT_LIST_HEAD(&page[size].lru); 935 set_page_guard(zone, &page[size], high, migratetype);
879 set_page_guard_flag(&page[size]);
880 set_page_private(&page[size], high);
881 /* Guard pages are not available for any usage */
882 __mod_zone_freepage_state(zone, -(1 << high),
883 migratetype);
884 continue; 936 continue;
885 } 937 }
886#endif
887 list_add(&page[size].lru, &area->free_list[migratetype]); 938 list_add(&page[size].lru, &area->free_list[migratetype]);
888 area->nr_free++; 939 area->nr_free++;
889 set_page_order(&page[size], high); 940 set_page_order(&page[size], high);
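
Note: expand() is the buddy split, repeatedly halving a block of order high and returning each upper half to the free list of the next lower order; the hunk above only replaces the open-coded guard-page marking with set_page_guard() and a runtime debug_guardpage_enabled() check. A minimal userspace sketch of the split itself, with plain integers standing in for page indices:

#include <stdio.h>

/*
 * Split a free block of order `high` down to order `low`, printing which
 * half-block (start index, order) would be put back on each free list.
 */
static void expand_block(unsigned long start, unsigned int high, unsigned int low)
{
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("free list order %u <- block starting at page %lu\n",
                       high, start + size);
        }
        printf("allocated: block starting at page %lu, order %u\n", start, low);
}

int main(void)
{
        expand_block(0, 3, 0);  /* split an order-3 block to satisfy an order-0 request */
        return 0;
}
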
@@ -908,8 +959,10 @@ static inline int check_new_page(struct page *page)
908 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; 959 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
909 bad_flags = PAGE_FLAGS_CHECK_AT_PREP; 960 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
910 } 961 }
911 if (unlikely(mem_cgroup_bad_page_check(page))) 962#ifdef CONFIG_MEMCG
912 bad_reason = "cgroup check failed"; 963 if (unlikely(page->mem_cgroup))
964 bad_reason = "page still charged to cgroup";
965#endif
913 if (unlikely(bad_reason)) { 966 if (unlikely(bad_reason)) {
914 bad_page(page, bad_reason, bad_flags); 967 bad_page(page, bad_reason, bad_flags);
915 return 1; 968 return 1;
@@ -939,6 +992,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
939 if (order && (gfp_flags & __GFP_COMP)) 992 if (order && (gfp_flags & __GFP_COMP))
940 prep_compound_page(page, order); 993 prep_compound_page(page, order);
941 994
995 set_page_owner(page, order, gfp_flags);
996
942 return 0; 997 return 0;
943} 998}
944 999
@@ -1014,7 +1069,7 @@ int move_freepages(struct zone *zone,
1014 * Remove at a later date when no bug reports exist related to 1069 * Remove at a later date when no bug reports exist related to
1015 * grouping pages by mobility 1070 * grouping pages by mobility
1016 */ 1071 */
1017 BUG_ON(page_zone(start_page) != page_zone(end_page)); 1072 VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1018#endif 1073#endif
1019 1074
1020 for (page = start_page; page <= end_page;) { 1075 for (page = start_page; page <= end_page;) {
@@ -1277,55 +1332,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1277#endif 1332#endif
1278 1333
1279/* 1334/*
1280 * Drain pages of the indicated processor. 1335 * Drain pcplists of the indicated processor and zone.
1281 * 1336 *
1282 * The processor must either be the current processor and the 1337 * The processor must either be the current processor and the
1283 * thread pinned to the current processor or a processor that 1338 * thread pinned to the current processor or a processor that
1284 * is not online. 1339 * is not online.
1285 */ 1340 */
1286static void drain_pages(unsigned int cpu) 1341static void drain_pages_zone(unsigned int cpu, struct zone *zone)
1287{ 1342{
1288 unsigned long flags; 1343 unsigned long flags;
1289 struct zone *zone; 1344 struct per_cpu_pageset *pset;
1345 struct per_cpu_pages *pcp;
1290 1346
1291 for_each_populated_zone(zone) { 1347 local_irq_save(flags);
1292 struct per_cpu_pageset *pset; 1348 pset = per_cpu_ptr(zone->pageset, cpu);
1293 struct per_cpu_pages *pcp;
1294 1349
1295 local_irq_save(flags); 1350 pcp = &pset->pcp;
1296 pset = per_cpu_ptr(zone->pageset, cpu); 1351 if (pcp->count) {
1352 free_pcppages_bulk(zone, pcp->count, pcp);
1353 pcp->count = 0;
1354 }
1355 local_irq_restore(flags);
1356}
1297 1357
1298 pcp = &pset->pcp; 1358/*
1299 if (pcp->count) { 1359 * Drain pcplists of all zones on the indicated processor.
1300 free_pcppages_bulk(zone, pcp->count, pcp); 1360 *
1301 pcp->count = 0; 1361 * The processor must either be the current processor and the
1302 } 1362 * thread pinned to the current processor or a processor that
1303 local_irq_restore(flags); 1363 * is not online.
1364 */
1365static void drain_pages(unsigned int cpu)
1366{
1367 struct zone *zone;
1368
1369 for_each_populated_zone(zone) {
1370 drain_pages_zone(cpu, zone);
1304 } 1371 }
1305} 1372}
1306 1373
1307/* 1374/*
1308 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1375 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1376 *
1377 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
1378 * the single zone's pages.
1309 */ 1379 */
1310void drain_local_pages(void *arg) 1380void drain_local_pages(struct zone *zone)
1311{ 1381{
1312 drain_pages(smp_processor_id()); 1382 int cpu = smp_processor_id();
1383
1384 if (zone)
1385 drain_pages_zone(cpu, zone);
1386 else
1387 drain_pages(cpu);
1313} 1388}
1314 1389
1315/* 1390/*
1316 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1391 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1317 * 1392 *
1393 * When zone parameter is non-NULL, spill just the single zone's pages.
1394 *
1318 * Note that this code is protected against sending an IPI to an offline 1395 * Note that this code is protected against sending an IPI to an offline
1319 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1396 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1320 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1397 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1321 * nothing keeps CPUs from showing up after we populated the cpumask and 1398 * nothing keeps CPUs from showing up after we populated the cpumask and
1322 * before the call to on_each_cpu_mask(). 1399 * before the call to on_each_cpu_mask().
1323 */ 1400 */
1324void drain_all_pages(void) 1401void drain_all_pages(struct zone *zone)
1325{ 1402{
1326 int cpu; 1403 int cpu;
1327 struct per_cpu_pageset *pcp;
1328 struct zone *zone;
1329 1404
1330 /* 1405 /*
1331 * Allocate in the BSS so we wont require allocation in 1406 * Allocate in the BSS so we wont require allocation in
@@ -1340,20 +1415,31 @@ void drain_all_pages(void)
1340 * disables preemption as part of its processing 1415 * disables preemption as part of its processing
1341 */ 1416 */
1342 for_each_online_cpu(cpu) { 1417 for_each_online_cpu(cpu) {
1418 struct per_cpu_pageset *pcp;
1419 struct zone *z;
1343 bool has_pcps = false; 1420 bool has_pcps = false;
1344 for_each_populated_zone(zone) { 1421
1422 if (zone) {
1345 pcp = per_cpu_ptr(zone->pageset, cpu); 1423 pcp = per_cpu_ptr(zone->pageset, cpu);
1346 if (pcp->pcp.count) { 1424 if (pcp->pcp.count)
1347 has_pcps = true; 1425 has_pcps = true;
1348 break; 1426 } else {
1427 for_each_populated_zone(z) {
1428 pcp = per_cpu_ptr(z->pageset, cpu);
1429 if (pcp->pcp.count) {
1430 has_pcps = true;
1431 break;
1432 }
1349 } 1433 }
1350 } 1434 }
1435
1351 if (has_pcps) 1436 if (has_pcps)
1352 cpumask_set_cpu(cpu, &cpus_with_pcps); 1437 cpumask_set_cpu(cpu, &cpus_with_pcps);
1353 else 1438 else
1354 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1439 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1355 } 1440 }
1356 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1441 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
1442 zone, 1);
1357} 1443}
1358 1444
1359#ifdef CONFIG_HIBERNATION 1445#ifdef CONFIG_HIBERNATION
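
Note: the draining code above is reworked so drain_local_pages() and drain_all_pages() take a zone argument; passing NULL keeps the old behaviour of draining every populated zone, while a non-NULL zone drains only that zone's pcplists. A generic userspace sketch of that "NULL means all" interface (illustrative names, not the kernel API):

#include <stdio.h>
#include <stddef.h>

struct zone_sim { const char *name; int pcp_count; };

static struct zone_sim zones[] = {
        { "DMA", 3 }, { "Normal", 12 }, { "HighMem", 7 },
};
#define NZONES (sizeof(zones) / sizeof(zones[0]))

static void drain_zone(struct zone_sim *z)
{
        printf("draining %d pages from %s\n", z->pcp_count, z->name);
        z->pcp_count = 0;
}

/* NULL zone: drain everything; otherwise drain only the requested zone */
static void drain_pages_sim(struct zone_sim *zone)
{
        size_t i;

        if (zone) {
                drain_zone(zone);
                return;
        }
        for (i = 0; i < NZONES; i++)
                drain_zone(&zones[i]);
}

int main(void)
{
        drain_pages_sim(&zones[1]);  /* targeted drain of a single zone */
        drain_pages_sim(NULL);       /* drain all zones */
        return 0;
}
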
@@ -1480,12 +1566,15 @@ void split_page(struct page *page, unsigned int order)
1480 split_page(virt_to_page(page[0].shadow), order); 1566 split_page(virt_to_page(page[0].shadow), order);
1481#endif 1567#endif
1482 1568
1483 for (i = 1; i < (1 << order); i++) 1569 set_page_owner(page, 0, 0);
1570 for (i = 1; i < (1 << order); i++) {
1484 set_page_refcounted(page + i); 1571 set_page_refcounted(page + i);
1572 set_page_owner(page + i, 0, 0);
1573 }
1485} 1574}
1486EXPORT_SYMBOL_GPL(split_page); 1575EXPORT_SYMBOL_GPL(split_page);
1487 1576
1488static int __isolate_free_page(struct page *page, unsigned int order) 1577int __isolate_free_page(struct page *page, unsigned int order)
1489{ 1578{
1490 unsigned long watermark; 1579 unsigned long watermark;
1491 struct zone *zone; 1580 struct zone *zone;
@@ -1521,6 +1610,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
1521 } 1610 }
1522 } 1611 }
1523 1612
1613 set_page_owner(page, order, 0);
1524 return 1UL << order; 1614 return 1UL << order;
1525} 1615}
1526 1616
@@ -1613,8 +1703,8 @@ again:
1613 1703
1614 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1704 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1615 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && 1705 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
1616 !zone_is_fair_depleted(zone)) 1706 !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
1617 zone_set_flag(zone, ZONE_FAIR_DEPLETED); 1707 set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1618 1708
1619 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1709 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1620 zone_statistics(preferred_zone, zone, gfp_flags); 1710 zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1715,7 +1805,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1715 unsigned long mark, int classzone_idx, int alloc_flags, 1805 unsigned long mark, int classzone_idx, int alloc_flags,
1716 long free_pages) 1806 long free_pages)
1717{ 1807{
1718 /* free_pages my go negative - that's OK */ 1808 /* free_pages may go negative - that's OK */
1719 long min = mark; 1809 long min = mark;
1720 int o; 1810 int o;
1721 long free_cma = 0; 1811 long free_cma = 0;
@@ -1934,7 +2024,7 @@ static void reset_alloc_batches(struct zone *preferred_zone)
1934 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2024 mod_zone_page_state(zone, NR_ALLOC_BATCH,
1935 high_wmark_pages(zone) - low_wmark_pages(zone) - 2025 high_wmark_pages(zone) - low_wmark_pages(zone) -
1936 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2026 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
1937 zone_clear_flag(zone, ZONE_FAIR_DEPLETED); 2027 clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1938 } while (zone++ != preferred_zone); 2028 } while (zone++ != preferred_zone);
1939} 2029}
1940 2030
@@ -1963,7 +2053,7 @@ zonelist_scan:
1963 2053
1964 /* 2054 /*
1965 * Scan zonelist, looking for a zone with enough free. 2055 * Scan zonelist, looking for a zone with enough free.
1966 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 2056 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
1967 */ 2057 */
1968 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2058 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1969 high_zoneidx, nodemask) { 2059 high_zoneidx, nodemask) {
@@ -1974,7 +2064,7 @@ zonelist_scan:
1974 continue; 2064 continue;
1975 if (cpusets_enabled() && 2065 if (cpusets_enabled() &&
1976 (alloc_flags & ALLOC_CPUSET) && 2066 (alloc_flags & ALLOC_CPUSET) &&
1977 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 2067 !cpuset_zone_allowed(zone, gfp_mask))
1978 continue; 2068 continue;
1979 /* 2069 /*
1980 * Distribute pages in proportion to the individual 2070 * Distribute pages in proportion to the individual
@@ -1985,7 +2075,7 @@ zonelist_scan:
1985 if (alloc_flags & ALLOC_FAIR) { 2075 if (alloc_flags & ALLOC_FAIR) {
1986 if (!zone_local(preferred_zone, zone)) 2076 if (!zone_local(preferred_zone, zone))
1987 break; 2077 break;
1988 if (zone_is_fair_depleted(zone)) { 2078 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
1989 nr_fair_skipped++; 2079 nr_fair_skipped++;
1990 continue; 2080 continue;
1991 } 2081 }
@@ -2253,6 +2343,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2253 } 2343 }
2254 2344
2255 /* 2345 /*
2346 * PM-freezer should be notified that there might be an OOM killer on
2347 * its way to kill and wake somebody up. This is too early and we might
2348 * end up not killing anything but false positives are acceptable.
2349 * See freeze_processes.
2350 */
2351 note_oom_kill();
2352
2353 /*
2256 * Go through the zonelist yet one more time, keep very high watermark 2354 * Go through the zonelist yet one more time, keep very high watermark
2257 * here, this is only to catch a parallel oom killing, we must fail if 2355 * here, this is only to catch a parallel oom killing, we must fail if
2258 * we're still under heavy pressure. 2356 * we're still under heavy pressure.
@@ -2296,58 +2394,59 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2296 struct zonelist *zonelist, enum zone_type high_zoneidx, 2394 struct zonelist *zonelist, enum zone_type high_zoneidx,
2297 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2395 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2298 int classzone_idx, int migratetype, enum migrate_mode mode, 2396 int classzone_idx, int migratetype, enum migrate_mode mode,
2299 bool *contended_compaction, bool *deferred_compaction, 2397 int *contended_compaction, bool *deferred_compaction)
2300 unsigned long *did_some_progress)
2301{ 2398{
2302 if (!order) 2399 unsigned long compact_result;
2303 return NULL; 2400 struct page *page;
2304 2401
2305 if (compaction_deferred(preferred_zone, order)) { 2402 if (!order)
2306 *deferred_compaction = true;
2307 return NULL; 2403 return NULL;
2308 }
2309 2404
2310 current->flags |= PF_MEMALLOC; 2405 current->flags |= PF_MEMALLOC;
2311 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2406 compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
2312 nodemask, mode, 2407 nodemask, mode,
2313 contended_compaction); 2408 contended_compaction,
2409 alloc_flags, classzone_idx);
2314 current->flags &= ~PF_MEMALLOC; 2410 current->flags &= ~PF_MEMALLOC;
2315 2411
2316 if (*did_some_progress != COMPACT_SKIPPED) { 2412 switch (compact_result) {
2317 struct page *page; 2413 case COMPACT_DEFERRED:
2318 2414 *deferred_compaction = true;
2319 /* Page migration frees to the PCP lists but we want merging */ 2415 /* fall-through */
2320 drain_pages(get_cpu()); 2416 case COMPACT_SKIPPED:
2321 put_cpu(); 2417 return NULL;
2418 default:
2419 break;
2420 }
2322 2421
2323 page = get_page_from_freelist(gfp_mask, nodemask, 2422 /*
2324 order, zonelist, high_zoneidx, 2423 * At least in one zone compaction wasn't deferred or skipped, so let's
2325 alloc_flags & ~ALLOC_NO_WATERMARKS, 2424 * count a compaction stall
2326 preferred_zone, classzone_idx, migratetype); 2425 */
2327 if (page) { 2426 count_vm_event(COMPACTSTALL);
2328 preferred_zone->compact_blockskip_flush = false;
2329 compaction_defer_reset(preferred_zone, order, true);
2330 count_vm_event(COMPACTSUCCESS);
2331 return page;
2332 }
2333 2427
2334 /* 2428 page = get_page_from_freelist(gfp_mask, nodemask,
2335 * It's bad if compaction run occurs and fails. 2429 order, zonelist, high_zoneidx,
2336 * The most likely reason is that pages exist, 2430 alloc_flags & ~ALLOC_NO_WATERMARKS,
2337 * but not enough to satisfy watermarks. 2431 preferred_zone, classzone_idx, migratetype);
2338 */
2339 count_vm_event(COMPACTFAIL);
2340 2432
2341 /* 2433 if (page) {
2342 * As async compaction considers a subset of pageblocks, only 2434 struct zone *zone = page_zone(page);
2343 * defer if the failure was a sync compaction failure.
2344 */
2345 if (mode != MIGRATE_ASYNC)
2346 defer_compaction(preferred_zone, order);
2347 2435
2348 cond_resched(); 2436 zone->compact_blockskip_flush = false;
2437 compaction_defer_reset(zone, order, true);
2438 count_vm_event(COMPACTSUCCESS);
2439 return page;
2349 } 2440 }
2350 2441
2442 /*
2443 * It's bad if compaction run occurs and fails. The most likely reason
2444 * is that pages exist, but not enough to satisfy watermarks.
2445 */
2446 count_vm_event(COMPACTFAIL);
2447
2448 cond_resched();
2449
2351 return NULL; 2450 return NULL;
2352} 2451}
2353#else 2452#else
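
Note: the rewritten __alloc_pages_direct_compact() above dispatches on the try_to_compact_pages() result, with COMPACT_DEFERRED deliberately falling through to COMPACT_SKIPPED so that both return NULL before a COMPACTSTALL event is counted. A standalone C illustration of that fall-through; the enum values below are stand-ins for the kernel's compaction results:

#include <stdio.h>
#include <stdbool.h>

enum compact_result { COMPACT_DEFERRED, COMPACT_SKIPPED, COMPACT_PARTIAL };

/* Returns true if the caller should give up without counting a stall. */
static bool bail_out(enum compact_result result, bool *deferred)
{
        switch (result) {
        case COMPACT_DEFERRED:
                *deferred = true;
                /* fall-through */
        case COMPACT_SKIPPED:
                return true;
        default:
                return false;
        }
}

int main(void)
{
        bool deferred = false;

        printf("deferred -> bail=%d, deferred flag=%d\n",
               bail_out(COMPACT_DEFERRED, &deferred), deferred);
        deferred = false;
        printf("skipped  -> bail=%d, deferred flag=%d\n",
               bail_out(COMPACT_SKIPPED, &deferred), deferred);
        deferred = false;
        printf("partial  -> bail=%d, deferred flag=%d\n",
               bail_out(COMPACT_PARTIAL, &deferred), deferred);
        return 0;
}
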
@@ -2355,9 +2454,8 @@ static inline struct page *
2355__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2454__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2356 struct zonelist *zonelist, enum zone_type high_zoneidx, 2455 struct zonelist *zonelist, enum zone_type high_zoneidx,
2357 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2456 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2358 int classzone_idx, int migratetype, 2457 int classzone_idx, int migratetype, enum migrate_mode mode,
2359 enum migrate_mode mode, bool *contended_compaction, 2458 int *contended_compaction, bool *deferred_compaction)
2360 bool *deferred_compaction, unsigned long *did_some_progress)
2361{ 2459{
2362 return NULL; 2460 return NULL;
2363} 2461}
@@ -2422,7 +2520,7 @@ retry:
2422 * pages are pinned on the per-cpu lists. Drain them and try again 2520 * pages are pinned on the per-cpu lists. Drain them and try again
2423 */ 2521 */
2424 if (!page && !drained) { 2522 if (!page && !drained) {
2425 drain_all_pages(); 2523 drain_all_pages(NULL);
2426 drained = true; 2524 drained = true;
2427 goto retry; 2525 goto retry;
2428 } 2526 }
@@ -2457,12 +2555,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2457static void wake_all_kswapds(unsigned int order, 2555static void wake_all_kswapds(unsigned int order,
2458 struct zonelist *zonelist, 2556 struct zonelist *zonelist,
2459 enum zone_type high_zoneidx, 2557 enum zone_type high_zoneidx,
2460 struct zone *preferred_zone) 2558 struct zone *preferred_zone,
2559 nodemask_t *nodemask)
2461{ 2560{
2462 struct zoneref *z; 2561 struct zoneref *z;
2463 struct zone *zone; 2562 struct zone *zone;
2464 2563
2465 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2564 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2565 high_zoneidx, nodemask)
2466 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2566 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2467} 2567}
2468 2568
@@ -2492,7 +2592,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2492 alloc_flags |= ALLOC_HARDER; 2592 alloc_flags |= ALLOC_HARDER;
2493 /* 2593 /*
2494 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2594 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2495 * comment for __cpuset_node_allowed_softwall(). 2595 * comment for __cpuset_node_allowed().
2496 */ 2596 */
2497 alloc_flags &= ~ALLOC_CPUSET; 2597 alloc_flags &= ~ALLOC_CPUSET;
2498 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2598 } else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -2509,7 +2609,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2509 alloc_flags |= ALLOC_NO_WATERMARKS; 2609 alloc_flags |= ALLOC_NO_WATERMARKS;
2510 } 2610 }
2511#ifdef CONFIG_CMA 2611#ifdef CONFIG_CMA
2512 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2612 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2513 alloc_flags |= ALLOC_CMA; 2613 alloc_flags |= ALLOC_CMA;
2514#endif 2614#endif
2515 return alloc_flags; 2615 return alloc_flags;
@@ -2533,7 +2633,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2533 unsigned long did_some_progress; 2633 unsigned long did_some_progress;
2534 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2634 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2535 bool deferred_compaction = false; 2635 bool deferred_compaction = false;
2536 bool contended_compaction = false; 2636 int contended_compaction = COMPACT_CONTENDED_NONE;
2537 2637
2538 /* 2638 /*
2539 * In the slowpath, we sanity check order to avoid ever trying to 2639 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2560,7 +2660,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2560 2660
2561restart: 2661restart:
2562 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2662 if (!(gfp_mask & __GFP_NO_KSWAPD))
2563 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2663 wake_all_kswapds(order, zonelist, high_zoneidx,
2664 preferred_zone, nodemask);
2564 2665
2565 /* 2666 /*
2566 * OK, we're below the kswapd watermark and have kicked background 2667 * OK, we're below the kswapd watermark and have kicked background
@@ -2633,20 +2734,40 @@ rebalance:
2633 preferred_zone, 2734 preferred_zone,
2634 classzone_idx, migratetype, 2735 classzone_idx, migratetype,
2635 migration_mode, &contended_compaction, 2736 migration_mode, &contended_compaction,
2636 &deferred_compaction, 2737 &deferred_compaction);
2637 &did_some_progress);
2638 if (page) 2738 if (page)
2639 goto got_pg; 2739 goto got_pg;
2640 2740
2641 /* 2741 /* Checks for THP-specific high-order allocations */
2642 * If compaction is deferred for high-order allocations, it is because 2742 if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
2643 * sync compaction recently failed. In this is the case and the caller 2743 /*
2644 * requested a movable allocation that does not heavily disrupt the 2744 * If compaction is deferred for high-order allocations, it is
2645 * system then fail the allocation instead of entering direct reclaim. 2745 * because sync compaction recently failed. If this is the case
2646 */ 2746 * and the caller requested a THP allocation, we do not want
2647 if ((deferred_compaction || contended_compaction) && 2747 * to heavily disrupt the system, so we fail the allocation
2648 (gfp_mask & __GFP_NO_KSWAPD)) 2748 * instead of entering direct reclaim.
2649 goto nopage; 2749 */
2750 if (deferred_compaction)
2751 goto nopage;
2752
2753 /*
2754 * In all zones where compaction was attempted (and not
2755 * deferred or skipped), lock contention has been detected.
2756 * For THP allocation we do not want to disrupt the others
2757 * so we fallback to base pages instead.
2758 */
2759 if (contended_compaction == COMPACT_CONTENDED_LOCK)
2760 goto nopage;
2761
2762 /*
2763 * If compaction was aborted due to need_resched(), we do not
2764 * want to further increase allocation latency, unless it is
2765 * khugepaged trying to collapse.
2766 */
2767 if (contended_compaction == COMPACT_CONTENDED_SCHED
2768 && !(current->flags & PF_KTHREAD))
2769 goto nopage;
2770 }
2650 2771
2651 /* 2772 /*
2652 * It can become very expensive to allocate transparent hugepages at 2773 * It can become very expensive to allocate transparent hugepages at
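
Note: the block added above restricts the bail-out heuristics to THP-style (GFP_TRANSHUGE) allocations: give up if compaction was deferred, if every attempted zone reported lock contention, or if compaction aborted for need_resched() and the caller is not a kernel thread. A compact sketch of that decision as a standalone predicate; is_kthread stands in for current->flags & PF_KTHREAD:

#include <stdio.h>
#include <stdbool.h>

enum compact_contended {
        COMPACT_CONTENDED_NONE,
        COMPACT_CONTENDED_LOCK,   /* lock contention detected in all attempted zones */
        COMPACT_CONTENDED_SCHED,  /* compaction aborted due to need_resched() */
};

/* Should a THP allocation fall back to base pages after failed compaction? */
static bool thp_should_give_up(bool deferred, enum compact_contended contended,
                               bool is_kthread)
{
        if (deferred)
                return true;
        if (contended == COMPACT_CONTENDED_LOCK)
                return true;
        if (contended == COMPACT_CONTENDED_SCHED && !is_kthread)
                return true;
        return false;
}

int main(void)
{
        printf("%d\n", thp_should_give_up(false, COMPACT_CONTENDED_SCHED, false)); /* 1 */
        printf("%d\n", thp_should_give_up(false, COMPACT_CONTENDED_SCHED, true));  /* 0 */
        printf("%d\n", thp_should_give_up(false, COMPACT_CONTENDED_NONE, false));  /* 0 */
        return 0;
}
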
@@ -2726,8 +2847,7 @@ rebalance:
2726 preferred_zone, 2847 preferred_zone,
2727 classzone_idx, migratetype, 2848 classzone_idx, migratetype,
2728 migration_mode, &contended_compaction, 2849 migration_mode, &contended_compaction,
2729 &deferred_compaction, 2850 &deferred_compaction);
2730 &did_some_progress);
2731 if (page) 2851 if (page)
2732 goto got_pg; 2852 goto got_pg;
2733 } 2853 }
@@ -2753,7 +2873,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2753 struct zone *preferred_zone; 2873 struct zone *preferred_zone;
2754 struct zoneref *preferred_zoneref; 2874 struct zoneref *preferred_zoneref;
2755 struct page *page = NULL; 2875 struct page *page = NULL;
2756 int migratetype = allocflags_to_migratetype(gfp_mask); 2876 int migratetype = gfpflags_to_migratetype(gfp_mask);
2757 unsigned int cpuset_mems_cookie; 2877 unsigned int cpuset_mems_cookie;
2758 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2878 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2759 int classzone_idx; 2879 int classzone_idx;
@@ -2775,6 +2895,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2775 if (unlikely(!zonelist->_zonerefs->zone)) 2895 if (unlikely(!zonelist->_zonerefs->zone))
2776 return NULL; 2896 return NULL;
2777 2897
2898 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
2899 alloc_flags |= ALLOC_CMA;
2900
2778retry_cpuset: 2901retry_cpuset:
2779 cpuset_mems_cookie = read_mems_allowed_begin(); 2902 cpuset_mems_cookie = read_mems_allowed_begin();
2780 2903
@@ -2786,10 +2909,6 @@ retry_cpuset:
2786 goto out; 2909 goto out;
2787 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2910 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2788 2911
2789#ifdef CONFIG_CMA
2790 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2791 alloc_flags |= ALLOC_CMA;
2792#endif
2793 /* First allocation attempt */ 2912 /* First allocation attempt */
2794 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2913 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2795 zonelist, high_zoneidx, alloc_flags, 2914 zonelist, high_zoneidx, alloc_flags,
@@ -3579,68 +3698,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3579 zonelist->_zonerefs[pos].zone_idx = 0; 3698 zonelist->_zonerefs[pos].zone_idx = 0;
3580} 3699}
3581 3700
3701#if defined(CONFIG_64BIT)
3702/*
3703 * Devices that require DMA32/DMA are relatively rare and do not justify a
3704 * penalty to every machine in case the specialised case applies. Default
3705 * to Node-ordering on 64-bit NUMA machines
3706 */
3707static int default_zonelist_order(void)
3708{
3709 return ZONELIST_ORDER_NODE;
3710}
3711#else
3712/*
3713 * On 32-bit, the Normal zone needs to be preserved for allocations accessible
3714 * by the kernel. If processes running on node 0 deplete the low memory zone
3715 * then reclaim will occur more frequency increasing stalls and potentially
3716 * be easier to OOM if a large percentage of the zone is under writeback or
3717 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
3718 * Hence, default to zone ordering on 32-bit.
3719 */
3582static int default_zonelist_order(void) 3720static int default_zonelist_order(void)
3583{ 3721{
3584 int nid, zone_type;
3585 unsigned long low_kmem_size, total_size;
3586 struct zone *z;
3587 int average_size;
3588 /*
3589 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3590 * If they are really small and used heavily, the system can fall
3591 * into OOM very easily.
3592 * This function detect ZONE_DMA/DMA32 size and configures zone order.
3593 */
3594 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3595 low_kmem_size = 0;
3596 total_size = 0;
3597 for_each_online_node(nid) {
3598 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3599 z = &NODE_DATA(nid)->node_zones[zone_type];
3600 if (populated_zone(z)) {
3601 if (zone_type < ZONE_NORMAL)
3602 low_kmem_size += z->managed_pages;
3603 total_size += z->managed_pages;
3604 } else if (zone_type == ZONE_NORMAL) {
3605 /*
3606 * If any node has only lowmem, then node order
3607 * is preferred to allow kernel allocations
3608 * locally; otherwise, they can easily infringe
3609 * on other nodes when there is an abundance of
3610 * lowmem available to allocate from.
3611 */
3612 return ZONELIST_ORDER_NODE;
3613 }
3614 }
3615 }
3616 if (!low_kmem_size || /* there are no DMA area. */
3617 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3618 return ZONELIST_ORDER_NODE;
3619 /*
3620 * look into each node's config.
3621 * If there is a node whose DMA/DMA32 memory is very big area on
3622 * local memory, NODE_ORDER may be suitable.
3623 */
3624 average_size = total_size /
3625 (nodes_weight(node_states[N_MEMORY]) + 1);
3626 for_each_online_node(nid) {
3627 low_kmem_size = 0;
3628 total_size = 0;
3629 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3630 z = &NODE_DATA(nid)->node_zones[zone_type];
3631 if (populated_zone(z)) {
3632 if (zone_type < ZONE_NORMAL)
3633 low_kmem_size += z->present_pages;
3634 total_size += z->present_pages;
3635 }
3636 }
3637 if (low_kmem_size &&
3638 total_size > average_size && /* ignore small node */
3639 low_kmem_size > total_size * 70/100)
3640 return ZONELIST_ORDER_NODE;
3641 }
3642 return ZONELIST_ORDER_ZONE; 3722 return ZONELIST_ORDER_ZONE;
3643} 3723}
3724#endif /* CONFIG_64BIT */
3644 3725
3645static void set_zonelist_order(void) 3726static void set_zonelist_order(void)
3646{ 3727{
@@ -3899,14 +3980,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3899 else 3980 else
3900 page_group_by_mobility_disabled = 0; 3981 page_group_by_mobility_disabled = 0;
3901 3982
3902 printk("Built %i zonelists in %s order, mobility grouping %s. " 3983 pr_info("Built %i zonelists in %s order, mobility grouping %s. "
3903 "Total pages: %ld\n", 3984 "Total pages: %ld\n",
3904 nr_online_nodes, 3985 nr_online_nodes,
3905 zonelist_order_name[current_zonelist_order], 3986 zonelist_order_name[current_zonelist_order],
3906 page_group_by_mobility_disabled ? "off" : "on", 3987 page_group_by_mobility_disabled ? "off" : "on",
3907 vm_total_pages); 3988 vm_total_pages);
3908#ifdef CONFIG_NUMA 3989#ifdef CONFIG_NUMA
3909 printk("Policy zone: %s\n", zone_names[policy_zone]); 3990 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
3910#endif 3991#endif
3911} 3992}
3912 3993
@@ -4838,7 +4919,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4838#endif 4919#endif
4839 init_waitqueue_head(&pgdat->kswapd_wait); 4920 init_waitqueue_head(&pgdat->kswapd_wait);
4840 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4921 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4841 pgdat_page_cgroup_init(pgdat); 4922 pgdat_page_ext_init(pgdat);
4842 4923
4843 for (j = 0; j < MAX_NR_ZONES; j++) { 4924 for (j = 0; j < MAX_NR_ZONES; j++) {
4844 struct zone *zone = pgdat->node_zones + j; 4925 struct zone *zone = pgdat->node_zones + j;
@@ -4857,16 +4938,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4857 * and per-cpu initialisations 4938 * and per-cpu initialisations
4858 */ 4939 */
4859 memmap_pages = calc_memmap_size(size, realsize); 4940 memmap_pages = calc_memmap_size(size, realsize);
4860 if (freesize >= memmap_pages) { 4941 if (!is_highmem_idx(j)) {
4861 freesize -= memmap_pages; 4942 if (freesize >= memmap_pages) {
4862 if (memmap_pages) 4943 freesize -= memmap_pages;
4863 printk(KERN_DEBUG 4944 if (memmap_pages)
4864 " %s zone: %lu pages used for memmap\n", 4945 printk(KERN_DEBUG
4865 zone_names[j], memmap_pages); 4946 " %s zone: %lu pages used for memmap\n",
4866 } else 4947 zone_names[j], memmap_pages);
4867 printk(KERN_WARNING 4948 } else
4868 " %s zone: %lu pages exceeds freesize %lu\n", 4949 printk(KERN_WARNING
4869 zone_names[j], memmap_pages, freesize); 4950 " %s zone: %lu pages exceeds freesize %lu\n",
4951 zone_names[j], memmap_pages, freesize);
4952 }
4870 4953
4871 /* Account for reserved pages */ 4954 /* Account for reserved pages */
4872 if (j == 0 && freesize > dma_reserve) { 4955 if (j == 0 && freesize > dma_reserve) {
@@ -4976,6 +5059,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4976 pgdat->node_start_pfn = node_start_pfn; 5059 pgdat->node_start_pfn = node_start_pfn;
4977#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5060#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4978 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5061 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5062 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
5063 (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
4979#endif 5064#endif
4980 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5065 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4981 zones_size, zholes_size); 5066 zones_size, zholes_size);
@@ -5338,33 +5423,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5338 find_zone_movable_pfns_for_nodes(); 5423 find_zone_movable_pfns_for_nodes();
5339 5424
5340 /* Print out the zone ranges */ 5425 /* Print out the zone ranges */
5341 printk("Zone ranges:\n"); 5426 pr_info("Zone ranges:\n");
5342 for (i = 0; i < MAX_NR_ZONES; i++) { 5427 for (i = 0; i < MAX_NR_ZONES; i++) {
5343 if (i == ZONE_MOVABLE) 5428 if (i == ZONE_MOVABLE)
5344 continue; 5429 continue;
5345 printk(KERN_CONT " %-8s ", zone_names[i]); 5430 pr_info(" %-8s ", zone_names[i]);
5346 if (arch_zone_lowest_possible_pfn[i] == 5431 if (arch_zone_lowest_possible_pfn[i] ==
5347 arch_zone_highest_possible_pfn[i]) 5432 arch_zone_highest_possible_pfn[i])
5348 printk(KERN_CONT "empty\n"); 5433 pr_cont("empty\n");
5349 else 5434 else
5350 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5435 pr_cont("[mem %0#10lx-%0#10lx]\n",
5351 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5436 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5352 (arch_zone_highest_possible_pfn[i] 5437 (arch_zone_highest_possible_pfn[i]
5353 << PAGE_SHIFT) - 1); 5438 << PAGE_SHIFT) - 1);
5354 } 5439 }
5355 5440
5356 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5441 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5357 printk("Movable zone start for each node\n"); 5442 pr_info("Movable zone start for each node\n");
5358 for (i = 0; i < MAX_NUMNODES; i++) { 5443 for (i = 0; i < MAX_NUMNODES; i++) {
5359 if (zone_movable_pfn[i]) 5444 if (zone_movable_pfn[i])
5360 printk(" Node %d: %#010lx\n", i, 5445 pr_info(" Node %d: %#010lx\n", i,
5361 zone_movable_pfn[i] << PAGE_SHIFT); 5446 zone_movable_pfn[i] << PAGE_SHIFT);
5362 } 5447 }
5363 5448
5364 /* Print out the early node map */ 5449 /* Print out the early node map */
5365 printk("Early memory node ranges\n"); 5450 pr_info("Early memory node ranges\n");
5366 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5451 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5367 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5452 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5368 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5453 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5369 5454
5370 /* Initialise every node */ 5455 /* Initialise every node */
@@ -5500,9 +5585,9 @@ void __init mem_init_print_info(const char *str)
5500 5585
5501#undef adj_init_size 5586#undef adj_init_size
5502 5587
5503 printk("Memory: %luK/%luK available " 5588 pr_info("Memory: %luK/%luK available "
5504 "(%luK kernel code, %luK rwdata, %luK rodata, " 5589 "(%luK kernel code, %luK rwdata, %luK rodata, "
5505 "%luK init, %luK bss, %luK reserved" 5590 "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
5506#ifdef CONFIG_HIGHMEM 5591#ifdef CONFIG_HIGHMEM
5507 ", %luK highmem" 5592 ", %luK highmem"
5508#endif 5593#endif
@@ -5510,7 +5595,8 @@ void __init mem_init_print_info(const char *str)
5510 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5595 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5511 codesize >> 10, datasize >> 10, rosize >> 10, 5596 codesize >> 10, datasize >> 10, rosize >> 10,
5512 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5597 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5513 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5598 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
5599 totalcma_pages << (PAGE_SHIFT-10),
5514#ifdef CONFIG_HIGHMEM 5600#ifdef CONFIG_HIGHMEM
5515 totalhigh_pages << (PAGE_SHIFT-10), 5601 totalhigh_pages << (PAGE_SHIFT-10),
5516#endif 5602#endif
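
Note: the banner change above reports CMA memory separately, so "reserved" becomes (physpages - totalram_pages - totalcma_pages) and every page count is converted to KiB by shifting with (PAGE_SHIFT - 10). A quick standalone check of that arithmetic with made-up page counts:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages, as on x86 */

int main(void)
{
        unsigned long physpages = 262144;  /* example: 1 GiB of RAM            */
        unsigned long totalram  = 250000;  /* example: pages handed to the buddy */
        unsigned long totalcma  = 4096;    /* example: 16 MiB CMA area          */

        unsigned long reserved_kib = (physpages - totalram - totalcma) << (PAGE_SHIFT - 10);
        unsigned long cma_kib      = totalcma << (PAGE_SHIFT - 10);

        printf("reserved:     %luK\n", reserved_kib);  /* (262144-250000-4096)*4 = 32192K */
        printf("cma-reserved: %luK\n", cma_kib);       /* 4096*4 = 16384K */
        return 0;
}
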
@@ -6202,9 +6288,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6202 if (!PageLRU(page)) 6288 if (!PageLRU(page))
6203 found++; 6289 found++;
6204 /* 6290 /*
6205 * If there are RECLAIMABLE pages, we need to check it. 6291 * If there are RECLAIMABLE pages, we need to check
6206 * But now, memory offline itself doesn't call shrink_slab() 6292 * it. But now, memory offline itself doesn't call
6207 * and it still to be fixed. 6293 * shrink_node_slabs() and it still to be fixed.
6208 */ 6294 */
6209 /* 6295 /*
6210 * If the page is not RAM, page_count()should be 0. 6296 * If the page is not RAM, page_count()should be 0.
@@ -6277,8 +6363,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
6277 6363
6278 if (list_empty(&cc->migratepages)) { 6364 if (list_empty(&cc->migratepages)) {
6279 cc->nr_migratepages = 0; 6365 cc->nr_migratepages = 0;
6280 pfn = isolate_migratepages_range(cc->zone, cc, 6366 pfn = isolate_migratepages_range(cc, pfn, end);
6281 pfn, end, true);
6282 if (!pfn) { 6367 if (!pfn) {
6283 ret = -EINTR; 6368 ret = -EINTR;
6284 break; 6369 break;
@@ -6390,7 +6475,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6390 */ 6475 */
6391 6476
6392 lru_add_drain_all(); 6477 lru_add_drain_all();
6393 drain_all_pages(); 6478 drain_all_pages(cc.zone);
6394 6479
6395 order = 0; 6480 order = 0;
6396 outer_start = start; 6481 outer_start = start;
@@ -6404,13 +6489,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6404 6489
6405 /* Make sure the range is really isolated. */ 6490 /* Make sure the range is really isolated. */
6406 if (test_pages_isolated(outer_start, end, false)) { 6491 if (test_pages_isolated(outer_start, end, false)) {
6407 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6492 pr_info("%s: [%lx, %lx) PFNs busy\n",
6408 outer_start, end); 6493 __func__, outer_start, end);
6409 ret = -EBUSY; 6494 ret = -EBUSY;
6410 goto done; 6495 goto done;
6411 } 6496 }
6412 6497
6413
6414 /* Grab isolated pages from freelists. */ 6498 /* Grab isolated pages from freelists. */
6415 outer_end = isolate_freepages_range(&cc, outer_start, end); 6499 outer_end = isolate_freepages_range(&cc, outer_start, end);
6416 if (!outer_end) { 6500 if (!outer_end) {
@@ -6554,97 +6638,3 @@ bool is_free_buddy_page(struct page *page)
6554 return order < MAX_ORDER; 6638 return order < MAX_ORDER;
6555} 6639}
6556#endif 6640#endif
6557
6558static const struct trace_print_flags pageflag_names[] = {
6559 {1UL << PG_locked, "locked" },
6560 {1UL << PG_error, "error" },
6561 {1UL << PG_referenced, "referenced" },
6562 {1UL << PG_uptodate, "uptodate" },
6563 {1UL << PG_dirty, "dirty" },
6564 {1UL << PG_lru, "lru" },
6565 {1UL << PG_active, "active" },
6566 {1UL << PG_slab, "slab" },
6567 {1UL << PG_owner_priv_1, "owner_priv_1" },
6568 {1UL << PG_arch_1, "arch_1" },
6569 {1UL << PG_reserved, "reserved" },
6570 {1UL << PG_private, "private" },
6571 {1UL << PG_private_2, "private_2" },
6572 {1UL << PG_writeback, "writeback" },
6573#ifdef CONFIG_PAGEFLAGS_EXTENDED
6574 {1UL << PG_head, "head" },
6575 {1UL << PG_tail, "tail" },
6576#else
6577 {1UL << PG_compound, "compound" },
6578#endif
6579 {1UL << PG_swapcache, "swapcache" },
6580 {1UL << PG_mappedtodisk, "mappedtodisk" },
6581 {1UL << PG_reclaim, "reclaim" },
6582 {1UL << PG_swapbacked, "swapbacked" },
6583 {1UL << PG_unevictable, "unevictable" },
6584#ifdef CONFIG_MMU
6585 {1UL << PG_mlocked, "mlocked" },
6586#endif
6587#ifdef CONFIG_ARCH_USES_PG_UNCACHED
6588 {1UL << PG_uncached, "uncached" },
6589#endif
6590#ifdef CONFIG_MEMORY_FAILURE
6591 {1UL << PG_hwpoison, "hwpoison" },
6592#endif
6593#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6594 {1UL << PG_compound_lock, "compound_lock" },
6595#endif
6596};
6597
6598static void dump_page_flags(unsigned long flags)
6599{
6600 const char *delim = "";
6601 unsigned long mask;
6602 int i;
6603
6604 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6605
6606 printk(KERN_ALERT "page flags: %#lx(", flags);
6607
6608 /* remove zone id */
6609 flags &= (1UL << NR_PAGEFLAGS) - 1;
6610
6611 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6612
6613 mask = pageflag_names[i].mask;
6614 if ((flags & mask) != mask)
6615 continue;
6616
6617 flags &= ~mask;
6618 printk("%s%s", delim, pageflag_names[i].name);
6619 delim = "|";
6620 }
6621
6622 /* check for left over flags */
6623 if (flags)
6624 printk("%s%#lx", delim, flags);
6625
6626 printk(")\n");
6627}
6628
6629void dump_page_badflags(struct page *page, const char *reason,
6630 unsigned long badflags)
6631{
6632 printk(KERN_ALERT
6633 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6634 page, atomic_read(&page->_count), page_mapcount(page),
6635 page->mapping, page->index);
6636 dump_page_flags(page->flags);
6637 if (reason)
6638 pr_alert("page dumped because: %s\n", reason);
6639 if (page->flags & badflags) {
6640 pr_alert("bad because of flags:\n");
6641 dump_page_flags(page->flags & badflags);
6642 }
6643 mem_cgroup_print_bad_page(page);
6644}
6645
6646void dump_page(struct page *page, const char *reason)
6647{
6648 dump_page_badflags(page, reason, 0);
6649}
6650EXPORT_SYMBOL(dump_page);
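
Note: the pageflag_names table and the dump_page()/dump_page_badflags()/dump_page_flags() helpers removed at the end of this diff implement a straightforward "decode a bitmask through a name table" loop; the diff only drops them from page_alloc.c. A self-contained userspace version of the same decoding loop, with generic flag names rather than the kernel's page flags:

#include <stdio.h>

struct flag_name { unsigned long mask; const char *name; };

static const struct flag_name names[] = {
        { 1UL << 0, "locked"   },
        { 1UL << 1, "dirty"    },
        { 1UL << 2, "uptodate" },
        { 1UL << 3, "lru"      },
};

static void dump_flags(unsigned long flags)
{
        const char *delim = "";
        size_t i;

        printf("flags: %#lx(", flags);
        for (i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
                unsigned long mask = names[i].mask;

                if ((flags & mask) != mask)
                        continue;
                flags &= ~mask;
                printf("%s%s", delim, names[i].name);
                delim = "|";
        }
        if (flags)                       /* leftover bits with no name */
                printf("%s%#lx", delim, flags);
        printf(")\n");
}

int main(void)
{
        dump_flags((1UL << 0) | (1UL << 3) | (1UL << 7));
        return 0;
}
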