author     Linus Torvalds <torvalds@linux-foundation.org>   2013-11-11 20:20:12 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-11-11 20:20:12 -0500
commit     39cf275a1a18ba3c7eb9b986c5c9b35b57332798 (patch)
tree       40b119ca9d2fbaf8128d3fa25f4c64669002b0c0 /mm
parent     ad5d69899e52792671c1aa6c7360464c7edfe09c (diff)
parent     e5137b50a0640009fd63a3e65c14bc6e1be8796a (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The main changes in this cycle are:

   - (much) improved CONFIG_NUMA_BALANCING support from Mel Gorman, Rik
     van Riel, Peter Zijlstra et al.  Yay!

   - optimize preemption counter handling: merge the NEED_RESCHED flag
     into the preempt_count variable, by Peter Zijlstra.

   - wait.h fixes and code reorganization from Peter Zijlstra

   - cfs_bandwidth fixes from Ben Segall

   - SMP load-balancer cleanups from Peter Zijlstra

   - idle balancer improvements from Jason Low

   - other fixes and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (129 commits)
  ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
  stop_machine: Fix race between stop_two_cpus() and stop_cpus()
  sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus
  sched: Fix asymmetric scheduling for POWER7
  sched: Move completion code from core.c to completion.c
  sched: Move wait code from core.c to wait.c
  sched: Move wait.c into kernel/sched/
  sched/wait: Fix __wait_event_interruptible_lock_irq_timeout()
  sched: Avoid throttle_cfs_rq() racing with period_timer stopping
  sched: Guarantee new group-entities always have weight
  sched: Fix hrtimer_cancel()/rq->lock deadlock
  sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining
  sched: Fix race on toggling cfs_bandwidth_used
  sched: Remove extra put_online_cpus() inside sched_setaffinity()
  sched/rt: Fix task_tick_rt() comment
  sched/wait: Fix build breakage
  sched/wait: Introduce prepare_to_wait_event()
  sched/wait: Add ___wait_cond_timeout() to wait_event*_timeout() too
  sched: Remove get_online_cpus() usage
  sched: Fix race in migrate_swap_stop()
  ...
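The preempt_count item in the list above lands outside mm/, but the idea behind it is simple enough to model in user space: keep the "need to reschedule" hint in the same word as the preemption depth, so the hot-path question "may we preempt right now?" becomes a single comparison. The sketch below is a toy model of that idea only; the flag value, helper names and driver are invented here, and the real kernel encoding is more involved (x86 additionally keeps the bit inverted so the test folds into the counter decrement).

    #include <stdbool.h>
    #include <stdio.h>

    #define NEED_RESCHED_BIT 0x80000000u   /* hypothetical flag bit */

    static unsigned int pc;                /* depth in the low bits, flag in the top bit */

    static void preempt_disable(void)   { pc++; }
    static void preempt_enable(void)    { pc--; }
    static void set_need_resched(void)  { pc |= NEED_RESCHED_BIT; }

    /* With the flag folded into the counter, "depth is zero AND a resched
     * was requested" is a single equality test. */
    static bool should_preempt(void)    { return pc == NEED_RESCHED_BIT; }

    int main(void)
    {
        set_need_resched();
        preempt_disable();
        printf("inside preempt-off section: %d\n", should_preempt()); /* 0 */
        preempt_enable();
        printf("after preempt_enable:       %d\n", should_preempt()); /* 1 */
        return 0;
    }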
Diffstat (limited to 'mm')
-rw-r--r--  mm/huge_memory.c   55
-rw-r--r--  mm/memory.c       139
-rw-r--r--  mm/mempolicy.c     82
-rw-r--r--  mm/migrate.c       30
-rw-r--r--  mm/mm_init.c       18
-rw-r--r--  mm/mmzone.c        14
-rw-r--r--  mm/mprotect.c      65
-rw-r--r--  mm/page_alloc.c     4
8 files changed, 218 insertions, 189 deletions
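Most of the churn in the diffs below comes from replacing the per-page "last NUMA node" tracking with a packed "last cpupid" value that records the CPU and the low pid bits of the last task to take a hinting fault on the page. As a rough, stand-alone illustration of what such a packed value carries, here is a sketch; the bit widths, helper names and driver are invented for the example (the kernel sizes the fields from NR_CPUS and keeps them inside page->flags when they fit).

    #include <stdio.h>

    #define CPU_BITS 12                         /* illustrative width */
    #define PID_BITS 8                          /* only the low pid bits fit */
    #define CPU_MASK ((1 << CPU_BITS) - 1)
    #define PID_MASK ((1 << PID_BITS) - 1)

    static int make_cpupid(int cpu, int pid)
    {
        return ((pid & PID_MASK) << CPU_BITS) | (cpu & CPU_MASK);
    }

    static int cpupid_to_cpu(int cpupid) { return cpupid & CPU_MASK; }
    static int cpupid_to_pid(int cpupid) { return (cpupid >> CPU_BITS) & PID_MASK; }

    int main(void)
    {
        int cpupid = make_cpupid(17, 4242);

        /* The node is recovered from the stored CPU (cpu_to_node() in the
         * kernel); the pid bits let a later fault decide whether the same
         * task is touching the page again. */
        printf("cpu = %d, pid (low bits) = %d\n",
               cpupid_to_cpu(cpupid), cpupid_to_pid(cpupid));
        return 0;
    }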
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cca80d96e509..2612f60f53ee 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1282,19 +1282,32 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1282 struct page *page; 1282 struct page *page;
1283 unsigned long haddr = addr & HPAGE_PMD_MASK; 1283 unsigned long haddr = addr & HPAGE_PMD_MASK;
1284 int page_nid = -1, this_nid = numa_node_id(); 1284 int page_nid = -1, this_nid = numa_node_id();
1285 int target_nid; 1285 int target_nid, last_cpupid = -1;
1286 bool page_locked; 1286 bool page_locked;
1287 bool migrated = false; 1287 bool migrated = false;
1288 int flags = 0;
1288 1289
1289 spin_lock(&mm->page_table_lock); 1290 spin_lock(&mm->page_table_lock);
1290 if (unlikely(!pmd_same(pmd, *pmdp))) 1291 if (unlikely(!pmd_same(pmd, *pmdp)))
1291 goto out_unlock; 1292 goto out_unlock;
1292 1293
1293 page = pmd_page(pmd); 1294 page = pmd_page(pmd);
1295 BUG_ON(is_huge_zero_page(page));
1294 page_nid = page_to_nid(page); 1296 page_nid = page_to_nid(page);
1297 last_cpupid = page_cpupid_last(page);
1295 count_vm_numa_event(NUMA_HINT_FAULTS); 1298 count_vm_numa_event(NUMA_HINT_FAULTS);
1296 if (page_nid == this_nid) 1299 if (page_nid == this_nid) {
1297 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1300 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1301 flags |= TNF_FAULT_LOCAL;
1302 }
1303
1304 /*
1305 * Avoid grouping on DSO/COW pages in specific and RO pages
1306 * in general, RO pages shouldn't hurt as much anyway since
1307 * they can be in shared cache state.
1308 */
1309 if (!pmd_write(pmd))
1310 flags |= TNF_NO_GROUP;
1298 1311
1299 /* 1312 /*
1300 * Acquire the page lock to serialise THP migrations but avoid dropping 1313 * Acquire the page lock to serialise THP migrations but avoid dropping
@@ -1325,7 +1338,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1325 lock_page(page); 1338 lock_page(page);
1326 anon_vma = page_lock_anon_vma_read(page); 1339 anon_vma = page_lock_anon_vma_read(page);
1327 1340
1328 /* Confirm the PTE did not while locked */ 1341 /* Confirm the PMD did not change while page_table_lock was released */
1329 spin_lock(&mm->page_table_lock); 1342 spin_lock(&mm->page_table_lock);
1330 if (unlikely(!pmd_same(pmd, *pmdp))) { 1343 if (unlikely(!pmd_same(pmd, *pmdp))) {
1331 unlock_page(page); 1344 unlock_page(page);
@@ -1341,8 +1354,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1341 spin_unlock(&mm->page_table_lock); 1354 spin_unlock(&mm->page_table_lock);
1342 migrated = migrate_misplaced_transhuge_page(mm, vma, 1355 migrated = migrate_misplaced_transhuge_page(mm, vma,
1343 pmdp, pmd, addr, page, target_nid); 1356 pmdp, pmd, addr, page, target_nid);
1344 if (migrated) 1357 if (migrated) {
1358 flags |= TNF_MIGRATED;
1345 page_nid = target_nid; 1359 page_nid = target_nid;
1360 }
1346 1361
1347 goto out; 1362 goto out;
1348clear_pmdnuma: 1363clear_pmdnuma:
@@ -1360,7 +1375,7 @@ out:
1360 page_unlock_anon_vma_read(anon_vma); 1375 page_unlock_anon_vma_read(anon_vma);
1361 1376
1362 if (page_nid != -1) 1377 if (page_nid != -1)
1363 task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); 1378 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
1364 1379
1365 return 0; 1380 return 0;
1366} 1381}
@@ -1458,6 +1473,12 @@ out:
1458 return ret; 1473 return ret;
1459} 1474}
1460 1475
1476/*
1477 * Returns
1478 * - 0 if PMD could not be locked
1479 * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
1480 * - HPAGE_PMD_NR is protections changed and TLB flush necessary
1481 */
1461int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1482int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1462 unsigned long addr, pgprot_t newprot, int prot_numa) 1483 unsigned long addr, pgprot_t newprot, int prot_numa)
1463{ 1484{
@@ -1466,22 +1487,34 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1466 1487
1467 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1488 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1468 pmd_t entry; 1489 pmd_t entry;
1469 entry = pmdp_get_and_clear(mm, addr, pmd); 1490 ret = 1;
1470 if (!prot_numa) { 1491 if (!prot_numa) {
1492 entry = pmdp_get_and_clear(mm, addr, pmd);
1471 entry = pmd_modify(entry, newprot); 1493 entry = pmd_modify(entry, newprot);
1494 ret = HPAGE_PMD_NR;
1472 BUG_ON(pmd_write(entry)); 1495 BUG_ON(pmd_write(entry));
1473 } else { 1496 } else {
1474 struct page *page = pmd_page(*pmd); 1497 struct page *page = pmd_page(*pmd);
1475 1498
1476 /* only check non-shared pages */ 1499 /*
1477 if (page_mapcount(page) == 1 && 1500 * Do not trap faults against the zero page. The
1501 * read-only data is likely to be read-cached on the
1502 * local CPU cache and it is less useful to know about
1503 * local vs remote hits on the zero page.
1504 */
1505 if (!is_huge_zero_page(page) &&
1478 !pmd_numa(*pmd)) { 1506 !pmd_numa(*pmd)) {
1507 entry = pmdp_get_and_clear(mm, addr, pmd);
1479 entry = pmd_mknuma(entry); 1508 entry = pmd_mknuma(entry);
1509 ret = HPAGE_PMD_NR;
1480 } 1510 }
1481 } 1511 }
1482 set_pmd_at(mm, addr, pmd, entry); 1512
1513 /* Set PMD if cleared earlier */
1514 if (ret == HPAGE_PMD_NR)
1515 set_pmd_at(mm, addr, pmd, entry);
1516
1483 spin_unlock(&vma->vm_mm->page_table_lock); 1517 spin_unlock(&vma->vm_mm->page_table_lock);
1484 ret = 1;
1485 } 1518 }
1486 1519
1487 return ret; 1520 return ret;
@@ -1662,7 +1695,7 @@ static void __split_huge_page_refcount(struct page *page,
1662 page_tail->mapping = page->mapping; 1695 page_tail->mapping = page->mapping;
1663 1696
1664 page_tail->index = page->index + i; 1697 page_tail->index = page->index + i;
1665 page_nid_xchg_last(page_tail, page_nid_last(page)); 1698 page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
1666 1699
1667 BUG_ON(!PageAnon(page_tail)); 1700 BUG_ON(!PageAnon(page_tail));
1668 BUG_ON(!PageUptodate(page_tail)); 1701 BUG_ON(!PageUptodate(page_tail));
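One visible pattern change in do_huge_pmd_numa_page() above: instead of handing a bare "migrated" boolean to task_numa_fault(), the handler now accumulates a small flags word describing the fault (local, read-only, migrated, and so on). A minimal stand-alone sketch of that pattern follows; the flag names mirror the TNF_* constants used in the diff, but the values, the accounting stub and the driver are illustrative.

    #include <stdio.h>

    #define TNF_MIGRATED     0x01   /* page was moved to the faulting node */
    #define TNF_NO_GROUP     0x02   /* don't use this fault for task grouping */
    #define TNF_SHARED       0x04   /* page is mapped by several processes */
    #define TNF_FAULT_LOCAL  0x08   /* page was already on the local node */

    /* Stand-in for task_numa_fault(): consumes everything in one call. */
    static void account_numa_fault(int last_cpupid, int node, int pages, int flags)
    {
        printf("node=%d pages=%d last_cpupid=%d%s%s%s%s\n",
               node, pages, last_cpupid,
               (flags & TNF_MIGRATED)    ? " migrated" : "",
               (flags & TNF_NO_GROUP)    ? " no-group" : "",
               (flags & TNF_SHARED)      ? " shared"   : "",
               (flags & TNF_FAULT_LOCAL) ? " local"    : "");
    }

    int main(void)
    {
        int flags = 0;
        int writable = 0, migrated = 1;

        if (!writable)          /* read-only mapping: don't group on it */
            flags |= TNF_NO_GROUP;
        if (migrated)
            flags |= TNF_MIGRATED;

        account_numa_fault(/*last_cpupid=*/-1, /*node=*/1, /*pages=*/512, flags);
        return 0;
    }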
diff --git a/mm/memory.c b/mm/memory.c
index d176154c243f..1f2287eaa88e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 72#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. 73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
74#endif 74#endif
75 75
76#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -2721,6 +2721,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2721 get_page(dirty_page); 2721 get_page(dirty_page);
2722 2722
2723reuse: 2723reuse:
2724 /*
2725 * Clear the pages cpupid information as the existing
2726 * information potentially belongs to a now completely
2727 * unrelated process.
2728 */
2729 if (old_page)
2730 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2731
2724 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2732 flush_cache_page(vma, address, pte_pfn(orig_pte));
2725 entry = pte_mkyoung(orig_pte); 2733 entry = pte_mkyoung(orig_pte);
2726 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2734 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3521,13 +3529,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3521} 3529}
3522 3530
3523int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3531int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3524 unsigned long addr, int page_nid) 3532 unsigned long addr, int page_nid,
3533 int *flags)
3525{ 3534{
3526 get_page(page); 3535 get_page(page);
3527 3536
3528 count_vm_numa_event(NUMA_HINT_FAULTS); 3537 count_vm_numa_event(NUMA_HINT_FAULTS);
3529 if (page_nid == numa_node_id()) 3538 if (page_nid == numa_node_id()) {
3530 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 3539 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3540 *flags |= TNF_FAULT_LOCAL;
3541 }
3531 3542
3532 return mpol_misplaced(page, vma, addr); 3543 return mpol_misplaced(page, vma, addr);
3533} 3544}
@@ -3538,8 +3549,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3538 struct page *page = NULL; 3549 struct page *page = NULL;
3539 spinlock_t *ptl; 3550 spinlock_t *ptl;
3540 int page_nid = -1; 3551 int page_nid = -1;
3552 int last_cpupid;
3541 int target_nid; 3553 int target_nid;
3542 bool migrated = false; 3554 bool migrated = false;
3555 int flags = 0;
3543 3556
3544 /* 3557 /*
3545 * The "pte" at this point cannot be used safely without 3558 * The "pte" at this point cannot be used safely without
@@ -3566,9 +3579,26 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3566 pte_unmap_unlock(ptep, ptl); 3579 pte_unmap_unlock(ptep, ptl);
3567 return 0; 3580 return 0;
3568 } 3581 }
3582 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3569 3583
3584 /*
3585 * Avoid grouping on DSO/COW pages in specific and RO pages
3586 * in general, RO pages shouldn't hurt as much anyway since
3587 * they can be in shared cache state.
3588 */
3589 if (!pte_write(pte))
3590 flags |= TNF_NO_GROUP;
3591
3592 /*
3593 * Flag if the page is shared between multiple address spaces. This
3594 * is later used when determining whether to group tasks together
3595 */
3596 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3597 flags |= TNF_SHARED;
3598
3599 last_cpupid = page_cpupid_last(page);
3570 page_nid = page_to_nid(page); 3600 page_nid = page_to_nid(page);
3571 target_nid = numa_migrate_prep(page, vma, addr, page_nid); 3601 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
3572 pte_unmap_unlock(ptep, ptl); 3602 pte_unmap_unlock(ptep, ptl);
3573 if (target_nid == -1) { 3603 if (target_nid == -1) {
3574 put_page(page); 3604 put_page(page);
@@ -3576,102 +3606,17 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3576 } 3606 }
3577 3607
3578 /* Migrate to the requested node */ 3608 /* Migrate to the requested node */
3579 migrated = migrate_misplaced_page(page, target_nid); 3609 migrated = migrate_misplaced_page(page, vma, target_nid);
3580 if (migrated) 3610 if (migrated) {
3581 page_nid = target_nid; 3611 page_nid = target_nid;
3612 flags |= TNF_MIGRATED;
3613 }
3582 3614
3583out: 3615out:
3584 if (page_nid != -1) 3616 if (page_nid != -1)
3585 task_numa_fault(page_nid, 1, migrated); 3617 task_numa_fault(last_cpupid, page_nid, 1, flags);
3586 return 0;
3587}
3588
3589/* NUMA hinting page fault entry point for regular pmds */
3590#ifdef CONFIG_NUMA_BALANCING
3591static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3592 unsigned long addr, pmd_t *pmdp)
3593{
3594 pmd_t pmd;
3595 pte_t *pte, *orig_pte;
3596 unsigned long _addr = addr & PMD_MASK;
3597 unsigned long offset;
3598 spinlock_t *ptl;
3599 bool numa = false;
3600
3601 spin_lock(&mm->page_table_lock);
3602 pmd = *pmdp;
3603 if (pmd_numa(pmd)) {
3604 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3605 numa = true;
3606 }
3607 spin_unlock(&mm->page_table_lock);
3608
3609 if (!numa)
3610 return 0;
3611
3612 /* we're in a page fault so some vma must be in the range */
3613 BUG_ON(!vma);
3614 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3615 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3616 VM_BUG_ON(offset >= PMD_SIZE);
3617 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3618 pte += offset >> PAGE_SHIFT;
3619 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3620 pte_t pteval = *pte;
3621 struct page *page;
3622 int page_nid = -1;
3623 int target_nid;
3624 bool migrated = false;
3625
3626 if (!pte_present(pteval))
3627 continue;
3628 if (!pte_numa(pteval))
3629 continue;
3630 if (addr >= vma->vm_end) {
3631 vma = find_vma(mm, addr);
3632 /* there's a pte present so there must be a vma */
3633 BUG_ON(!vma);
3634 BUG_ON(addr < vma->vm_start);
3635 }
3636 if (pte_numa(pteval)) {
3637 pteval = pte_mknonnuma(pteval);
3638 set_pte_at(mm, addr, pte, pteval);
3639 }
3640 page = vm_normal_page(vma, addr, pteval);
3641 if (unlikely(!page))
3642 continue;
3643 /* only check non-shared pages */
3644 if (unlikely(page_mapcount(page) != 1))
3645 continue;
3646
3647 page_nid = page_to_nid(page);
3648 target_nid = numa_migrate_prep(page, vma, addr, page_nid);
3649 pte_unmap_unlock(pte, ptl);
3650 if (target_nid != -1) {
3651 migrated = migrate_misplaced_page(page, target_nid);
3652 if (migrated)
3653 page_nid = target_nid;
3654 } else {
3655 put_page(page);
3656 }
3657
3658 if (page_nid != -1)
3659 task_numa_fault(page_nid, 1, migrated);
3660
3661 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3662 }
3663 pte_unmap_unlock(orig_pte, ptl);
3664
3665 return 0;
3666}
3667#else
3668static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3669 unsigned long addr, pmd_t *pmdp)
3670{
3671 BUG();
3672 return 0; 3618 return 0;
3673} 3619}
3674#endif /* CONFIG_NUMA_BALANCING */
3675 3620
3676/* 3621/*
3677 * These routines also need to handle stuff like marking pages dirty 3622 * These routines also need to handle stuff like marking pages dirty
@@ -3811,8 +3756,8 @@ retry:
3811 } 3756 }
3812 } 3757 }
3813 3758
3814 if (pmd_numa(*pmd)) 3759 /* THP should already have been handled */
3815 return do_pmd_numa_page(mm, vma, address, pmd); 3760 BUG_ON(pmd_numa(*pmd));
3816 3761
3817 /* 3762 /*
3818 * Use __pte_alloc instead of pte_alloc_map, because we can't 3763 * Use __pte_alloc instead of pte_alloc_map, because we can't
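The last_cpupid read in do_numa_page() above is what later lets NUMA balancing tell a private fault (the same task touched the page last time) from a shared one. Here is a hedged sketch of that classification, with invented bit widths and a helper modelled loosely on cpupid_match_pid(); in the kernel only the low pid bits that fit in the cpupid field are compared.

    #include <stdbool.h>
    #include <stdio.h>

    #define CPU_BITS  12
    #define PID_MASK  0xff

    static int make_cpupid(int cpu, int pid)
    {
        return ((pid & PID_MASK) << CPU_BITS) | cpu;
    }

    static bool cpupid_match_pid(int current_pid, int cpupid)
    {
        return (current_pid & PID_MASK) == ((cpupid >> CPU_BITS) & PID_MASK);
    }

    int main(void)
    {
        int last_cpupid = make_cpupid(3, 1234);   /* recorded at the previous fault */

        /* Same task faulting again: treat the page as task-private and let
         * the scheduler bias this task towards the page's node. */
        printf("pid 1234: %s\n", cpupid_match_pid(1234, last_cpupid) ? "private" : "shared");

        /* A different task: the page looks shared, which feeds the grouping
         * logic instead of per-task placement. */
        printf("pid 5678: %s\n", cpupid_match_pid(5678, last_cpupid) ? "private" : "shared");
        return 0;
    }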
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04729647f359..71cb253368cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1679 return pol; 1679 return pol;
1680} 1680}
1681 1681
1682bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1683{
1684 struct mempolicy *pol = get_task_policy(task);
1685 if (vma) {
1686 if (vma->vm_ops && vma->vm_ops->get_policy) {
1687 bool ret = false;
1688
1689 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1690 if (pol && (pol->flags & MPOL_F_MOF))
1691 ret = true;
1692 mpol_cond_put(pol);
1693
1694 return ret;
1695 } else if (vma->vm_policy) {
1696 pol = vma->vm_policy;
1697 }
1698 }
1699
1700 if (!pol)
1701 return default_policy.flags & MPOL_F_MOF;
1702
1703 return pol->flags & MPOL_F_MOF;
1704}
1705
1682static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 1706static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1683{ 1707{
1684 enum zone_type dynamic_policy_zone = policy_zone; 1708 enum zone_type dynamic_policy_zone = policy_zone;
@@ -2277,6 +2301,35 @@ static void sp_free(struct sp_node *n)
2277 kmem_cache_free(sn_cache, n); 2301 kmem_cache_free(sn_cache, n);
2278} 2302}
2279 2303
2304#ifdef CONFIG_NUMA_BALANCING
2305static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2306{
2307 /* Never defer a private fault */
2308 if (cpupid_match_pid(p, last_cpupid))
2309 return false;
2310
2311 if (p->numa_migrate_deferred) {
2312 p->numa_migrate_deferred--;
2313 return true;
2314 }
2315 return false;
2316}
2317
2318static inline void defer_numa_migrate(struct task_struct *p)
2319{
2320 p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2321}
2322#else
2323static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2324{
2325 return false;
2326}
2327
2328static inline void defer_numa_migrate(struct task_struct *p)
2329{
2330}
2331#endif /* CONFIG_NUMA_BALANCING */
2332
2280/** 2333/**
2281 * mpol_misplaced - check whether current page node is valid in policy 2334 * mpol_misplaced - check whether current page node is valid in policy
2282 * 2335 *
@@ -2300,6 +2353,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2300 struct zone *zone; 2353 struct zone *zone;
2301 int curnid = page_to_nid(page); 2354 int curnid = page_to_nid(page);
2302 unsigned long pgoff; 2355 unsigned long pgoff;
2356 int thiscpu = raw_smp_processor_id();
2357 int thisnid = cpu_to_node(thiscpu);
2303 int polnid = -1; 2358 int polnid = -1;
2304 int ret = -1; 2359 int ret = -1;
2305 2360
@@ -2348,9 +2403,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2348 2403
2349 /* Migrate the page towards the node whose CPU is referencing it */ 2404 /* Migrate the page towards the node whose CPU is referencing it */
2350 if (pol->flags & MPOL_F_MORON) { 2405 if (pol->flags & MPOL_F_MORON) {
2351 int last_nid; 2406 int last_cpupid;
2407 int this_cpupid;
2352 2408
2353 polnid = numa_node_id(); 2409 polnid = thisnid;
2410 this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
2354 2411
2355 /* 2412 /*
2356 * Multi-stage node selection is used in conjunction 2413 * Multi-stage node selection is used in conjunction
@@ -2373,8 +2430,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2373 * it less likely we act on an unlikely task<->page 2430 * it less likely we act on an unlikely task<->page
2374 * relation. 2431 * relation.
2375 */ 2432 */
2376 last_nid = page_nid_xchg_last(page, polnid); 2433 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2377 if (last_nid != polnid) 2434 if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2435
2436 /* See sysctl_numa_balancing_migrate_deferred comment */
2437 if (!cpupid_match_pid(current, last_cpupid))
2438 defer_numa_migrate(current);
2439
2440 goto out;
2441 }
2442
2443 /*
2444 * The quadratic filter above reduces extraneous migration
2445 * of shared pages somewhat. This code reduces it even more,
2446 * reducing the overhead of page migrations of shared pages.
2447 * This makes workloads with shared pages rely more on
2448 * "move task near its memory", and less on "move memory
2449 * towards its task", which is exactly what we want.
2450 */
2451 if (numa_migrate_deferred(current, last_cpupid))
2378 goto out; 2452 goto out;
2379 } 2453 }
2380 2454
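The mpol_misplaced() hunk above stacks two throttles: the pre-existing two-stage node filter (only migrate once two consecutive hinting faults come from the same node) and the new per-task deferral of shared-page migrations. The stand-alone model below compresses both into one predicate; the structure, threshold constant and field names are simplifications for illustration, not the kernel code.

    #include <stdbool.h>
    #include <stdio.h>

    struct task {
        int pid;
        int migrate_deferred;      /* remaining shared-page migrations to skip */
    };

    #define MIGRATE_DEFER_COUNT 16 /* stand-in for the sysctl */

    static bool should_migrate(struct task *t, int this_node, int this_pid,
                               int *page_last_node, int *page_last_pid)
    {
        int prev_node = *page_last_node, prev_pid = *page_last_pid;

        /* Record who touched the page, for the next fault to look at. */
        *page_last_node = this_node;
        *page_last_pid  = this_pid;

        /* Stage one: if an earlier fault was recorded from a different node,
         * do not migrate yet; wait for a second fault from this node.  If
         * that earlier fault came from another task, arm the deferral too. */
        if (prev_pid != -1 && prev_node != this_node) {
            if (prev_pid != this_pid)
                t->migrate_deferred = MIGRATE_DEFER_COUNT;
            return false;
        }

        /* Stage two: for shared pages, skip a bounded number of migrations
         * outright and rely on moving the task near its memory instead. */
        if (prev_pid != -1 && prev_pid != this_pid && t->migrate_deferred > 0) {
            t->migrate_deferred--;
            return false;
        }

        return true;
    }

    int main(void)
    {
        struct task t = { .pid = 100, .migrate_deferred = 0 };
        int last_node = 1, last_pid = 200;   /* page last touched remotely */

        /* First fault from task 100 on node 0: filtered (prints 0). */
        printf("%d\n", should_migrate(&t, 0, 100, &last_node, &last_pid));
        /* Second fault from the same task and node: allowed (prints 1). */
        printf("%d\n", should_migrate(&t, 0, 100, &last_node, &last_pid));
        return 0;
    }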
diff --git a/mm/migrate.c b/mm/migrate.c
index c04692774e88..dfc8300ecbb2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -445,6 +445,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
445 */ 445 */
446void migrate_page_copy(struct page *newpage, struct page *page) 446void migrate_page_copy(struct page *newpage, struct page *page)
447{ 447{
448 int cpupid;
449
448 if (PageHuge(page) || PageTransHuge(page)) 450 if (PageHuge(page) || PageTransHuge(page))
449 copy_huge_page(newpage, page); 451 copy_huge_page(newpage, page);
450 else 452 else
@@ -481,6 +483,13 @@ void migrate_page_copy(struct page *newpage, struct page *page)
481 __set_page_dirty_nobuffers(newpage); 483 __set_page_dirty_nobuffers(newpage);
482 } 484 }
483 485
486 /*
487 * Copy NUMA information to the new page, to prevent over-eager
488 * future migrations of this same page.
489 */
490 cpupid = page_cpupid_xchg_last(page, -1);
491 page_cpupid_xchg_last(newpage, cpupid);
492
484 mlock_migrate_page(newpage, page); 493 mlock_migrate_page(newpage, page);
485 ksm_migrate_page(newpage, page); 494 ksm_migrate_page(newpage, page);
486 /* 495 /*
@@ -1500,7 +1509,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1500 __GFP_NOWARN) & 1509 __GFP_NOWARN) &
1501 ~GFP_IOFS, 0); 1510 ~GFP_IOFS, 0);
1502 if (newpage) 1511 if (newpage)
1503 page_nid_xchg_last(newpage, page_nid_last(page)); 1512 page_cpupid_xchg_last(newpage, page_cpupid_last(page));
1504 1513
1505 return newpage; 1514 return newpage;
1506} 1515}
@@ -1601,7 +1610,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1601 * node. Caller is expected to have an elevated reference count on 1610 * node. Caller is expected to have an elevated reference count on
1602 * the page that will be dropped by this function before returning. 1611 * the page that will be dropped by this function before returning.
1603 */ 1612 */
1604int migrate_misplaced_page(struct page *page, int node) 1613int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1614 int node)
1605{ 1615{
1606 pg_data_t *pgdat = NODE_DATA(node); 1616 pg_data_t *pgdat = NODE_DATA(node);
1607 int isolated; 1617 int isolated;
@@ -1609,10 +1619,11 @@ int migrate_misplaced_page(struct page *page, int node)
1609 LIST_HEAD(migratepages); 1619 LIST_HEAD(migratepages);
1610 1620
1611 /* 1621 /*
1612 * Don't migrate pages that are mapped in multiple processes. 1622 * Don't migrate file pages that are mapped in multiple processes
1613 * TODO: Handle false sharing detection instead of this hammer 1623 * with execute permissions as they are probably shared libraries.
1614 */ 1624 */
1615 if (page_mapcount(page) != 1) 1625 if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1626 (vma->vm_flags & VM_EXEC))
1616 goto out; 1627 goto out;
1617 1628
1618 /* 1629 /*
@@ -1663,13 +1674,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1663 int page_lru = page_is_file_cache(page); 1674 int page_lru = page_is_file_cache(page);
1664 1675
1665 /* 1676 /*
1666 * Don't migrate pages that are mapped in multiple processes.
1667 * TODO: Handle false sharing detection instead of this hammer
1668 */
1669 if (page_mapcount(page) != 1)
1670 goto out_dropref;
1671
1672 /*
1673 * Rate-limit the amount of data that is being migrated to a node. 1677 * Rate-limit the amount of data that is being migrated to a node.
1674 * Optimal placement is no good if the memory bus is saturated and 1678 * Optimal placement is no good if the memory bus is saturated and
1675 * all the time is being spent migrating! 1679 * all the time is being spent migrating!
@@ -1682,7 +1686,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1682 if (!new_page) 1686 if (!new_page)
1683 goto out_fail; 1687 goto out_fail;
1684 1688
1685 page_nid_xchg_last(new_page, page_nid_last(page)); 1689 page_cpupid_xchg_last(new_page, page_cpupid_last(page));
1686 1690
1687 isolated = numamigrate_isolate_page(pgdat, page); 1691 isolated = numamigrate_isolate_page(pgdat, page);
1688 if (!isolated) { 1692 if (!isolated) {
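migrate_misplaced_page() above relaxes the old "mapcount must be 1" rule: only file-backed pages mapped by several processes with execute permission (in practice, shared library text) are skipped now. A small sketch of that predicate, with a made-up context struct standing in for the page and VMA state the real code consults:

    #include <stdbool.h>
    #include <stdio.h>

    struct fault_ctx {
        int  mapcount;        /* number of processes mapping the page */
        bool file_backed;     /* page cache page? */
        bool vma_exec;        /* VM_EXEC set on the mapping? */
    };

    static bool may_migrate(const struct fault_ctx *c)
    {
        /* Probably shared library text: leave it where it is. */
        if (c->mapcount != 1 && c->file_backed && c->vma_exec)
            return false;
        return true;
    }

    int main(void)
    {
        struct fault_ctx libc = { .mapcount = 30, .file_backed = true,  .vma_exec = true  };
        struct fault_ctx anon = { .mapcount = 2,  .file_backed = false, .vma_exec = false };

        printf("shared library text: %s\n", may_migrate(&libc) ? "migrate" : "skip");
        printf("shared anon memory : %s\n", may_migrate(&anon) ? "migrate" : "skip");
        return 0;
    }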
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 633c08863fd8..68562e92d50c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void)
71 unsigned long or_mask, add_mask; 71 unsigned long or_mask, add_mask;
72 72
73 shift = 8 * sizeof(unsigned long); 73 shift = 8 * sizeof(unsigned long);
74 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; 74 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", 75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
76 "Section %d Node %d Zone %d Lastnid %d Flags %d\n", 76 "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
77 SECTIONS_WIDTH, 77 SECTIONS_WIDTH,
78 NODES_WIDTH, 78 NODES_WIDTH,
79 ZONES_WIDTH, 79 ZONES_WIDTH,
80 LAST_NID_WIDTH, 80 LAST_CPUPID_WIDTH,
81 NR_PAGEFLAGS); 81 NR_PAGEFLAGS);
82 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 82 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
83 "Section %d Node %d Zone %d Lastnid %d\n", 83 "Section %d Node %d Zone %d Lastcpupid %d\n",
84 SECTIONS_SHIFT, 84 SECTIONS_SHIFT,
85 NODES_SHIFT, 85 NODES_SHIFT,
86 ZONES_SHIFT, 86 ZONES_SHIFT,
87 LAST_NID_SHIFT); 87 LAST_CPUPID_SHIFT);
88 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", 88 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
89 "Section %lu Node %lu Zone %lu Lastnid %lu\n", 89 "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
90 (unsigned long)SECTIONS_PGSHIFT, 90 (unsigned long)SECTIONS_PGSHIFT,
91 (unsigned long)NODES_PGSHIFT, 91 (unsigned long)NODES_PGSHIFT,
92 (unsigned long)ZONES_PGSHIFT, 92 (unsigned long)ZONES_PGSHIFT,
93 (unsigned long)LAST_NID_PGSHIFT); 93 (unsigned long)LAST_CPUPID_PGSHIFT);
94 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", 94 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
95 "Node/Zone ID: %lu -> %lu\n", 95 "Node/Zone ID: %lu -> %lu\n",
96 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), 96 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void)
102 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 102 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
103 "Node not in page flags"); 103 "Node not in page flags");
104#endif 104#endif
105#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 105#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
106 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 106 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
107 "Last nid not in page flags"); 107 "Last cpupid not in page flags");
108#endif 108#endif
109 109
110 if (SECTIONS_WIDTH) { 110 if (SECTIONS_WIDTH) {
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 2ac0afbd68f3..bf34fb8556db 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98} 98}
99 99
100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) 100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
101int page_nid_xchg_last(struct page *page, int nid) 101int page_cpupid_xchg_last(struct page *page, int cpupid)
102{ 102{
103 unsigned long old_flags, flags; 103 unsigned long old_flags, flags;
104 int last_nid; 104 int last_cpupid;
105 105
106 do { 106 do {
107 old_flags = flags = page->flags; 107 old_flags = flags = page->flags;
108 last_nid = page_nid_last(page); 108 last_cpupid = page_cpupid_last(page);
109 109
110 flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); 110 flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
111 flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; 111 flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); 112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
113 113
114 return last_nid; 114 return last_cpupid;
115} 115}
116#endif 116#endif
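page_cpupid_xchg_last() above updates a field packed into page->flags with a lock-free cmpxchg retry loop. The user-space analogue below shows the same pattern with C11 atomics; the shift, mask and driver values are invented for the example.

    #include <stdatomic.h>
    #include <stdio.h>

    #define FIELD_SHIFT 8
    #define FIELD_MASK  0xffffUL

    static _Atomic unsigned long word = (0x1234UL << FIELD_SHIFT) | 0x5aUL;

    static unsigned long field_xchg_last(unsigned long newval)
    {
        unsigned long old, repl;

        do {
            old  = atomic_load(&word);
            repl = old & ~(FIELD_MASK << FIELD_SHIFT);       /* clear the field */
            repl |= (newval & FIELD_MASK) << FIELD_SHIFT;    /* insert new value */
            /* retry if another thread changed the word in between */
        } while (!atomic_compare_exchange_weak(&word, &old, repl));

        return (old >> FIELD_SHIFT) & FIELD_MASK;            /* previous field */
    }

    int main(void)
    {
        printf("previous field: 0x%lx\n", field_xchg_last(0xbeef));    /* 0x1234 */
        printf("low bits kept:  0x%lx\n", atomic_load(&word) & 0xff);  /* 0x5a */
        return 0;
    }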
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 412ba2b7326a..a597f2ffcd6f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
37 37
38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39 unsigned long addr, unsigned long end, pgprot_t newprot, 39 unsigned long addr, unsigned long end, pgprot_t newprot,
40 int dirty_accountable, int prot_numa, bool *ret_all_same_node) 40 int dirty_accountable, int prot_numa)
41{ 41{
42 struct mm_struct *mm = vma->vm_mm; 42 struct mm_struct *mm = vma->vm_mm;
43 pte_t *pte, oldpte; 43 pte_t *pte, oldpte;
44 spinlock_t *ptl; 44 spinlock_t *ptl;
45 unsigned long pages = 0; 45 unsigned long pages = 0;
46 bool all_same_node = true;
47 int last_nid = -1;
48 46
49 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 47 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
50 arch_enter_lazy_mmu_mode(); 48 arch_enter_lazy_mmu_mode();
@@ -63,15 +61,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
63 61
64 page = vm_normal_page(vma, addr, oldpte); 62 page = vm_normal_page(vma, addr, oldpte);
65 if (page) { 63 if (page) {
66 int this_nid = page_to_nid(page); 64 if (!pte_numa(oldpte)) {
67 if (last_nid == -1)
68 last_nid = this_nid;
69 if (last_nid != this_nid)
70 all_same_node = false;
71
72 /* only check non-shared pages */
73 if (!pte_numa(oldpte) &&
74 page_mapcount(page) == 1) {
75 ptent = pte_mknuma(ptent); 65 ptent = pte_mknuma(ptent);
76 updated = true; 66 updated = true;
77 } 67 }
@@ -104,33 +94,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
104 if (pte_swp_soft_dirty(oldpte)) 94 if (pte_swp_soft_dirty(oldpte))
105 newpte = pte_swp_mksoft_dirty(newpte); 95 newpte = pte_swp_mksoft_dirty(newpte);
106 set_pte_at(mm, addr, pte, newpte); 96 set_pte_at(mm, addr, pte, newpte);
97
98 pages++;
107 } 99 }
108 pages++;
109 } 100 }
110 } while (pte++, addr += PAGE_SIZE, addr != end); 101 } while (pte++, addr += PAGE_SIZE, addr != end);
111 arch_leave_lazy_mmu_mode(); 102 arch_leave_lazy_mmu_mode();
112 pte_unmap_unlock(pte - 1, ptl); 103 pte_unmap_unlock(pte - 1, ptl);
113 104
114 *ret_all_same_node = all_same_node;
115 return pages; 105 return pages;
116} 106}
117 107
118#ifdef CONFIG_NUMA_BALANCING
119static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
120 pmd_t *pmd)
121{
122 spin_lock(&mm->page_table_lock);
123 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
124 spin_unlock(&mm->page_table_lock);
125}
126#else
127static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
128 pmd_t *pmd)
129{
130 BUG();
131}
132#endif /* CONFIG_NUMA_BALANCING */
133
134static inline unsigned long change_pmd_range(struct vm_area_struct *vma, 108static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
135 pud_t *pud, unsigned long addr, unsigned long end, 109 pud_t *pud, unsigned long addr, unsigned long end,
136 pgprot_t newprot, int dirty_accountable, int prot_numa) 110 pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -138,34 +112,33 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
138 pmd_t *pmd; 112 pmd_t *pmd;
139 unsigned long next; 113 unsigned long next;
140 unsigned long pages = 0; 114 unsigned long pages = 0;
141 bool all_same_node;
142 115
143 pmd = pmd_offset(pud, addr); 116 pmd = pmd_offset(pud, addr);
144 do { 117 do {
118 unsigned long this_pages;
119
145 next = pmd_addr_end(addr, end); 120 next = pmd_addr_end(addr, end);
146 if (pmd_trans_huge(*pmd)) { 121 if (pmd_trans_huge(*pmd)) {
147 if (next - addr != HPAGE_PMD_SIZE) 122 if (next - addr != HPAGE_PMD_SIZE)
148 split_huge_page_pmd(vma, addr, pmd); 123 split_huge_page_pmd(vma, addr, pmd);
149 else if (change_huge_pmd(vma, pmd, addr, newprot, 124 else {
150 prot_numa)) { 125 int nr_ptes = change_huge_pmd(vma, pmd, addr,
151 pages++; 126 newprot, prot_numa);
152 continue; 127
128 if (nr_ptes) {
129 if (nr_ptes == HPAGE_PMD_NR)
130 pages++;
131
132 continue;
133 }
153 } 134 }
154 /* fall through */ 135 /* fall through */
155 } 136 }
156 if (pmd_none_or_clear_bad(pmd)) 137 if (pmd_none_or_clear_bad(pmd))
157 continue; 138 continue;
158 pages += change_pte_range(vma, pmd, addr, next, newprot, 139 this_pages = change_pte_range(vma, pmd, addr, next, newprot,
159 dirty_accountable, prot_numa, &all_same_node); 140 dirty_accountable, prot_numa);
160 141 pages += this_pages;
161 /*
162 * If we are changing protections for NUMA hinting faults then
163 * set pmd_numa if the examined pages were all on the same
164 * node. This allows a regular PMD to be handled as one fault
165 * and effectively batches the taking of the PTL
166 */
167 if (prot_numa && all_same_node)
168 change_pmd_protnuma(vma->vm_mm, addr, pmd);
169 } while (pmd++, addr = next, addr != end); 142 } while (pmd++, addr = next, addr != end);
170 143
171 return pages; 144 return pages;
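change_pmd_range() above now interprets the tri-state return of change_huge_pmd() documented earlier in this diff: 0 means the huge PMD could not be locked (fall through to the PTE walk), 1 means it was handled but nothing changed, and HPAGE_PMD_NR means the protections changed and a TLB flush is due. A condensed stand-alone model of that caller-side logic, with a stub standing in for the real callee:

    #include <stdio.h>

    #define HPAGE_PMD_NR 512

    static int change_huge_pmd_stub(int scenario)
    {
        return scenario;   /* 0, 1 or HPAGE_PMD_NR, as described above */
    }

    static unsigned long handle_one_pmd(int scenario)
    {
        unsigned long pages = 0;
        int nr_ptes = change_huge_pmd_stub(scenario);

        if (nr_ptes) {                 /* huge PMD path handled it */
            if (nr_ptes == HPAGE_PMD_NR)
                pages++;               /* protections changed, flush due */
            return pages;              /* "continue" in the real loop */
        }

        /* nr_ptes == 0: not a huge PMD after all, walk the PTEs instead
         * (omitted in this sketch). */
        return pages;
    }

    int main(void)
    {
        printf("locked+changed  : %lu counted\n", handle_one_pmd(HPAGE_PMD_NR));
        printf("locked+unchanged: %lu counted\n", handle_one_pmd(1));
        printf("not locked      : %lu counted\n", handle_one_pmd(0));
        return 0;
    }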
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd886fac451a..73d812f16dde 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page)
626 bad_page(page); 626 bad_page(page);
627 return 1; 627 return 1;
628 } 628 }
629 page_nid_reset_last(page); 629 page_cpupid_reset_last(page);
630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
632 return 0; 632 return 0;
@@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4015 mminit_verify_page_links(page, zone, nid, pfn); 4015 mminit_verify_page_links(page, zone, nid, pfn);
4016 init_page_count(page); 4016 init_page_count(page);
4017 page_mapcount_reset(page); 4017 page_mapcount_reset(page);
4018 page_nid_reset_last(page); 4018 page_cpupid_reset_last(page);
4019 SetPageReserved(page); 4019 SetPageReserved(page);
4020 /* 4020 /*
4021 * Mark the block movable so that blocks are reserved for 4021 * Mark the block movable so that blocks are reserved for