author		Mel Gorman <mgorman@suse.de>	2013-10-07 06:29:07 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-10-09 06:40:35 -0400
commit		b795854b1fa70f6aee923ae5df74ff7afeaddcaa (patch)
tree		fd109d9f3778c7bc934fedb3cda2b5bfb1293375
parent		073b5beea735c7e1970686c94ff1f3aaac790a2a (diff)
sched/numa: Set preferred NUMA node based on number of private faults
Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. If treated identically there is a risk that shared pages bounce between nodes depending on the order they are referenced by tasks. Ultimately what is desirable is that task private pages remain local to the task while shared pages are interleaved between sharing tasks running on different nodes to give good average performance. This is further complicated by THP as even applications that partition their data may not be partitioning on a huge page boundary.

To start with, this patch assumes that multi-threaded or multi-process applications partition their data and that, in the general case, the private accesses are more important for cpu->memory locality. Also, no new infrastructure is required to treat private pages properly, but interleaving for shared pages requires additional infrastructure.

To detect private accesses the pid of the last accessing task is required, but the storage requirements are high. This patch borrows heavily from Ingo Molnar's patch "numa, mm, sched: Implement last-CPU+PID hash tracking" to encode some bits from the last accessing task in the page flags alongside the node information. Collisions will occur, but it is better than depending on the node information alone. The node information is then used to determine whether a page needs to migrate; the PID information is used to detect private/shared accesses.

The preferred NUMA node is selected based on where the maximum number of approximately private faults were measured. Shared faults are not taken into consideration for a few reasons. First, if there are many tasks sharing the page then they'll all move towards the same node. The node will become compute overloaded and tasks will later be scheduled away, only to bounce back again. Alternatively the sharing tasks would just bounce around nodes because the fault information is effectively noise. Either way, accounting for shared faults the same as private faults can result in lower performance overall.

The second reason is based on a hypothetical workload that has a small number of very important, heavily accessed private pages but a large shared array. The shared array would dominate the number of faults and its node would be selected as the preferred node even though it's the wrong decision.

The third reason is that multiple threads in a process will race each other to fault the shared page, making the fault information unreliable.

Signed-off-by: Mel Gorman <mgorman@suse.de>
[ Fix compilation error when !NUMA_BALANCING. ]
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-30-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
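In outline, the patch packs the last faulting task's node id and the low bits of its pid into a single value stored in page->flags (or in a separate page field when the flags have no room), and a later hinting fault is counted as private when the truncated pid matches the current task. The following standalone C sketch mirrors the helpers added below in include/linux/mm.h and the check added in kernel/sched/fair.c; it is illustrative only, and the node-shift value used here is an assumption since NODES_SHIFT is config dependent in the kernel.

#include <stdbool.h>
#include <stdio.h>

/*
 * Mirrors the layout added in page-flags-layout.h: the low 8 bits of the
 * stored value hold a truncated pid, the bits above it hold the node id.
 * LAST__NID_SHIFT stands in for NODES_SHIFT, which is config dependent.
 */
#define LAST__PID_SHIFT	8
#define LAST__PID_MASK	((1 << LAST__PID_SHIFT) - 1)
#define LAST__NID_SHIFT	10
#define LAST__NID_MASK	((1 << LAST__NID_SHIFT) - 1)

static int nid_pid_to_nidpid(int nid, int pid)
{
	return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
}

static int nidpid_to_pid(int nidpid)
{
	return nidpid & LAST__PID_MASK;
}

static int nidpid_to_nid(int nidpid)
{
	return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK;
}

int main(void)
{
	/* Hypothetical values: node 1 last touched the page, task 4242 faults on it. */
	int last_nidpid = nid_pid_to_nidpid(1, 4242);
	int faulting_pid = 4242;

	/*
	 * The private/shared test from task_numa_fault(): only the low 8 bits
	 * of the pid are compared, so collisions are possible but tolerated.
	 */
	bool priv = (faulting_pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid);

	printf("last node %d, private fault: %d\n", nidpid_to_nid(last_nidpid), priv);
	return 0;
}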
-rw-r--r--	include/linux/mm.h			|  89
-rw-r--r--	include/linux/mm_types.h		|   4
-rw-r--r--	include/linux/page-flags-layout.h	|  28
-rw-r--r--	kernel/sched/fair.c			|  12
-rw-r--r--	mm/huge_memory.c			|   8
-rw-r--r--	mm/memory.c				|  16
-rw-r--r--	mm/mempolicy.c				|   8
-rw-r--r--	mm/migrate.c				|   4
-rw-r--r--	mm/mm_init.c				|  18
-rw-r--r--	mm/mmzone.c				|  14
-rw-r--r--	mm/mprotect.c				|  26
-rw-r--r--	mm/page_alloc.c				|   4
12 files changed, 149 insertions(+), 82 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b6e55ee8855..bb412ce2a8b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  * sets it, so none of the operations on it need to be atomic.
  */
 
-/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */
 #define SECTIONS_PGOFF	((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
 #define NODES_PGOFF	(SECTIONS_PGOFF - NODES_WIDTH)
 #define ZONES_PGOFF	(NODES_PGOFF - ZONES_WIDTH)
-#define LAST_NID_PGOFF	(ZONES_PGOFF - LAST_NID_WIDTH)
+#define LAST_NIDPID_PGOFF	(ZONES_PGOFF - LAST_NIDPID_WIDTH)
 
 /*
  * Define the bit shifts to access each section. For non-existent
@@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
 #define NODES_PGSHIFT	(NODES_PGOFF * (NODES_WIDTH != 0))
 #define ZONES_PGSHIFT	(ZONES_PGOFF * (ZONES_WIDTH != 0))
-#define LAST_NID_PGSHIFT	(LAST_NID_PGOFF * (LAST_NID_WIDTH != 0))
+#define LAST_NIDPID_PGSHIFT	(LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0))
 
 /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
 #ifdef NODE_NOT_IN_PAGE_FLAGS
@@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define ZONES_MASK	((1UL << ZONES_WIDTH) - 1)
 #define NODES_MASK	((1UL << NODES_WIDTH) - 1)
 #define SECTIONS_MASK	((1UL << SECTIONS_WIDTH) - 1)
-#define LAST_NID_MASK	((1UL << LAST_NID_WIDTH) - 1)
+#define LAST_NIDPID_MASK	((1UL << LAST_NIDPID_WIDTH) - 1)
 #define ZONEID_MASK	((1UL << ZONEID_SHIFT) - 1)
 
 static inline enum zone_type page_zonenum(const struct page *page)
@@ -661,48 +661,93 @@ static inline int page_to_nid(const struct page *page)
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-static inline int page_nid_xchg_last(struct page *page, int nid)
+static inline int nid_pid_to_nidpid(int nid, int pid)
 {
-	return xchg(&page->_last_nid, nid);
+	return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
 }
 
-static inline int page_nid_last(struct page *page)
+static inline int nidpid_to_pid(int nidpid)
 {
-	return page->_last_nid;
+	return nidpid & LAST__PID_MASK;
 }
-static inline void page_nid_reset_last(struct page *page)
+
+static inline int nidpid_to_nid(int nidpid)
+{
+	return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK;
+}
+
+static inline bool nidpid_pid_unset(int nidpid)
+{
+	return nidpid_to_pid(nidpid) == (-1 & LAST__PID_MASK);
+}
+
+static inline bool nidpid_nid_unset(int nidpid)
 {
-	page->_last_nid = -1;
+	return nidpid_to_nid(nidpid) == (-1 & LAST__NID_MASK);
+}
+
+#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
+static inline int page_nidpid_xchg_last(struct page *page, int nid)
+{
+	return xchg(&page->_last_nidpid, nid);
+}
+
+static inline int page_nidpid_last(struct page *page)
+{
+	return page->_last_nidpid;
+}
+static inline void page_nidpid_reset_last(struct page *page)
+{
+	page->_last_nidpid = -1;
 }
 #else
-static inline int page_nid_last(struct page *page)
+static inline int page_nidpid_last(struct page *page)
 {
-	return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
+	return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK;
 }
 
-extern int page_nid_xchg_last(struct page *page, int nid);
+extern int page_nidpid_xchg_last(struct page *page, int nidpid);
 
-static inline void page_nid_reset_last(struct page *page)
+static inline void page_nidpid_reset_last(struct page *page)
 {
-	int nid = (1 << LAST_NID_SHIFT) - 1;
+	int nidpid = (1 << LAST_NIDPID_SHIFT) - 1;
 
-	page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
-	page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+	page->flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT);
+	page->flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT;
 }
-#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
+#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */
 #else
-static inline int page_nid_xchg_last(struct page *page, int nid)
+static inline int page_nidpid_xchg_last(struct page *page, int nidpid)
 {
 	return page_to_nid(page);
 }
 
-static inline int page_nid_last(struct page *page)
+static inline int page_nidpid_last(struct page *page)
 {
 	return page_to_nid(page);
 }
 
-static inline void page_nid_reset_last(struct page *page)
+static inline int nidpid_to_nid(int nidpid)
+{
+	return -1;
+}
+
+static inline int nidpid_to_pid(int nidpid)
+{
+	return -1;
+}
+
+static inline int nid_pid_to_nidpid(int nid, int pid)
+{
+	return -1;
+}
+
+static inline bool nidpid_pid_unset(int nidpid)
+{
+	return 1;
+}
+
+static inline void page_nidpid_reset_last(struct page *page)
 {
 }
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b7adf1d4310c..38a902a6d1e3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -174,8 +174,8 @@ struct page {
 	void *shadow;
 #endif
 
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-	int _last_nid;
+#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
+	int _last_nidpid;
 #endif
 }
 /*
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 93506a114034..02bc9184f16b 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -38,10 +38,10 @@
  * The last is when there is insufficient space in page->flags and a separate
  * lookup is necessary.
  *
  * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE | ... | FLAGS |
- * "      plus space for last_nid:    |       NODE     | ZONE | LAST_NID ... | FLAGS |
+ * "      plus space for last_nidpid: |       NODE     | ZONE | LAST_NIDPID ... | FLAGS |
  * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
- * "      plus space for last_nid:    | SECTION | NODE | ZONE | LAST_NID ... | FLAGS |
+ * "      plus space for last_nidpid: | SECTION | NODE | ZONE | LAST_NIDPID ... | FLAGS |
  * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
  */
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -62,15 +62,21 @@
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
-#define LAST_NID_SHIFT	NODES_SHIFT
+#define LAST__PID_SHIFT	8
+#define LAST__PID_MASK	((1 << LAST__PID_SHIFT)-1)
+
+#define LAST__NID_SHIFT	NODES_SHIFT
+#define LAST__NID_MASK	((1 << LAST__NID_SHIFT)-1)
+
+#define LAST_NIDPID_SHIFT (LAST__PID_SHIFT+LAST__NID_SHIFT)
 #else
-#define LAST_NID_SHIFT	0
+#define LAST_NIDPID_SHIFT	0
 #endif
 
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
-#define LAST_NID_WIDTH	LAST_NID_SHIFT
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NIDPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#define LAST_NIDPID_WIDTH	LAST_NIDPID_SHIFT
 #else
-#define LAST_NID_WIDTH	0
+#define LAST_NIDPID_WIDTH	0
 #endif
 
 /*
@@ -81,8 +87,8 @@
 #define NODE_NOT_IN_PAGE_FLAGS
 #endif
 
-#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0
-#define LAST_NID_NOT_IN_PAGE_FLAGS
+#if defined(CONFIG_NUMA_BALANCING) && LAST_NIDPID_WIDTH == 0
+#define LAST_NIDPID_NOT_IN_PAGE_FLAGS
 #endif
 
 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 862d20d02e5c..b1de7c55e9f7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -988,7 +988,7 @@ static void task_numa_placement(struct task_struct *p)
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
-void task_numa_fault(int last_nid, int node, int pages, bool migrated)
+void task_numa_fault(int last_nidpid, int node, int pages, bool migrated)
 {
 	struct task_struct *p = current;
 	int priv;
@@ -1000,8 +1000,14 @@ void task_numa_fault(int last_nid, int node, int pages, bool migrated)
 	if (!p->mm)
 		return;
 
-	/* For now, do not attempt to detect private/shared accesses */
-	priv = 1;
+	/*
+	 * First accesses are treated as private, otherwise consider accesses
+	 * to be private if the accessing pid has not changed
+	 */
+	if (!nidpid_pid_unset(last_nidpid))
+		priv = ((p->pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid));
+	else
+		priv = 1;
 
 	/* Allocate buffer to track faults on a per-node basis */
 	if (unlikely(!p->numa_faults)) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2a28c2c6c165..0baf0e4d5203 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1282,7 +1282,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	unsigned long haddr = addr & HPAGE_PMD_MASK;
 	int page_nid = -1, this_nid = numa_node_id();
-	int target_nid, last_nid = -1;
+	int target_nid, last_nidpid = -1;
 	bool page_locked;
 	bool migrated = false;
 
@@ -1293,7 +1293,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = pmd_page(pmd);
 	BUG_ON(is_huge_zero_page(page));
 	page_nid = page_to_nid(page);
-	last_nid = page_nid_last(page);
+	last_nidpid = page_nidpid_last(page);
 	count_vm_numa_event(NUMA_HINT_FAULTS);
 	if (page_nid == this_nid)
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
@@ -1362,7 +1362,7 @@ out:
 	page_unlock_anon_vma_read(anon_vma);
 
 	if (page_nid != -1)
-		task_numa_fault(last_nid, page_nid, HPAGE_PMD_NR, migrated);
+		task_numa_fault(last_nidpid, page_nid, HPAGE_PMD_NR, migrated);
 
 	return 0;
 }
@@ -1682,7 +1682,7 @@ static void __split_huge_page_refcount(struct page *page,
 		page_tail->mapping = page->mapping;
 
 		page_tail->index = page->index + i;
-		page_nid_xchg_last(page_tail, page_nid_last(page));
+		page_nidpid_xchg_last(page_tail, page_nidpid_last(page));
 
 		BUG_ON(!PageAnon(page_tail));
 		BUG_ON(!PageUptodate(page_tail));
diff --git a/mm/memory.c b/mm/memory.c
index 3e3b4b8b6c41..cc7f20691c82 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
 
 #include "internal.h"
 
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nidpid.
 #endif
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -3536,7 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page = NULL;
 	spinlock_t *ptl;
 	int page_nid = -1;
-	int last_nid;
+	int last_nidpid;
 	int target_nid;
 	bool migrated = false;
 
@@ -3567,7 +3567,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	BUG_ON(is_zero_pfn(page_to_pfn(page)));
 
-	last_nid = page_nid_last(page);
+	last_nidpid = page_nidpid_last(page);
 	page_nid = page_to_nid(page);
 	target_nid = numa_migrate_prep(page, vma, addr, page_nid);
 	pte_unmap_unlock(ptep, ptl);
@@ -3583,7 +3583,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 out:
 	if (page_nid != -1)
-		task_numa_fault(last_nid, page_nid, 1, migrated);
+		task_numa_fault(last_nidpid, page_nid, 1, migrated);
 	return 0;
 }
 
@@ -3598,7 +3598,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long offset;
 	spinlock_t *ptl;
 	bool numa = false;
-	int last_nid;
+	int last_nidpid;
 
 	spin_lock(&mm->page_table_lock);
 	pmd = *pmdp;
@@ -3643,7 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (unlikely(!page))
 			continue;
 
-		last_nid = page_nid_last(page);
+		last_nidpid = page_nidpid_last(page);
 		page_nid = page_to_nid(page);
 		target_nid = numa_migrate_prep(page, vma, addr, page_nid);
 		pte_unmap_unlock(pte, ptl);
@@ -3656,7 +3656,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 
 		if (page_nid != -1)
-			task_numa_fault(last_nid, page_nid, 1, migrated);
+			task_numa_fault(last_nidpid, page_nid, 1, migrated);
 
 		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04729647f359..aff1f1ed3dc5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2348,9 +2348,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 
 	/* Migrate the page towards the node whose CPU is referencing it */
 	if (pol->flags & MPOL_F_MORON) {
-		int last_nid;
+		int last_nidpid;
+		int this_nidpid;
 
 		polnid = numa_node_id();
+		this_nidpid = nid_pid_to_nidpid(polnid, current->pid);
 
 		/*
 		 * Multi-stage node selection is used in conjunction
@@ -2373,8 +2375,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 * it less likely we act on an unlikely task<->page
 		 * relation.
 		 */
-		last_nid = page_nid_xchg_last(page, polnid);
-		if (last_nid != polnid)
+		last_nidpid = page_nidpid_xchg_last(page, this_nidpid);
+		if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid)
 			goto out;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index fcba2f46bb80..025d1e3d2ad2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1498,7 +1498,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 					  __GFP_NOWARN) &
 					 ~GFP_IOFS, 0);
 	if (newpage)
-		page_nid_xchg_last(newpage, page_nid_last(page));
+		page_nidpid_xchg_last(newpage, page_nidpid_last(page));
 
 	return newpage;
 }
@@ -1675,7 +1675,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	if (!new_page)
 		goto out_fail;
 
-	page_nid_xchg_last(new_page, page_nid_last(page));
+	page_nidpid_xchg_last(new_page, page_nidpid_last(page));
 
 	isolated = numamigrate_isolate_page(pgdat, page);
 	if (!isolated) {
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 633c08863fd8..467de579784b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void)
 	unsigned long or_mask, add_mask;
 
 	shift = 8 * sizeof(unsigned long);
-	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
+	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NIDPID_SHIFT;
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-		"Section %d Node %d Zone %d Lastnid %d Flags %d\n",
+		"Section %d Node %d Zone %d Lastnidpid %d Flags %d\n",
 		SECTIONS_WIDTH,
 		NODES_WIDTH,
 		ZONES_WIDTH,
-		LAST_NID_WIDTH,
+		LAST_NIDPID_WIDTH,
 		NR_PAGEFLAGS);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
-		"Section %d Node %d Zone %d Lastnid %d\n",
+		"Section %d Node %d Zone %d Lastnidpid %d\n",
 		SECTIONS_SHIFT,
 		NODES_SHIFT,
 		ZONES_SHIFT,
-		LAST_NID_SHIFT);
+		LAST_NIDPID_SHIFT);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
-		"Section %lu Node %lu Zone %lu Lastnid %lu\n",
+		"Section %lu Node %lu Zone %lu Lastnidpid %lu\n",
 		(unsigned long)SECTIONS_PGSHIFT,
 		(unsigned long)NODES_PGSHIFT,
 		(unsigned long)ZONES_PGSHIFT,
-		(unsigned long)LAST_NID_PGSHIFT);
+		(unsigned long)LAST_NIDPID_PGSHIFT);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
 		"Node/Zone ID: %lu -> %lu\n",
 		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void)
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
 		"Node not in page flags");
 #endif
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
-		"Last nid not in page flags");
+		"Last nidpid not in page flags");
 #endif
 
 	if (SECTIONS_WIDTH) {
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 2ac0afbd68f3..25bb477deb26 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
 }
 
-#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
-int page_nid_xchg_last(struct page *page, int nid)
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NIDPID_NOT_IN_PAGE_FLAGS)
+int page_nidpid_xchg_last(struct page *page, int nidpid)
 {
 	unsigned long old_flags, flags;
-	int last_nid;
+	int last_nidpid;
 
 	do {
 		old_flags = flags = page->flags;
-		last_nid = page_nid_last(page);
+		last_nidpid = page_nidpid_last(page);
 
-		flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
-		flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+		flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT);
+		flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT;
 	} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
 
-	return last_nid;
+	return last_nidpid;
 }
 #endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 41e02923fcd9..f0b087d1069c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,14 +37,15 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
-		int dirty_accountable, int prot_numa, bool *ret_all_same_node)
+		int dirty_accountable, int prot_numa, bool *ret_all_same_nidpid)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 	unsigned long pages = 0;
-	bool all_same_node = true;
+	bool all_same_nidpid = true;
 	int last_nid = -1;
+	int last_pid = -1;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -63,11 +64,18 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 			page = vm_normal_page(vma, addr, oldpte);
 			if (page) {
-				int this_nid = page_to_nid(page);
+				int nidpid = page_nidpid_last(page);
+				int this_nid = nidpid_to_nid(nidpid);
+				int this_pid = nidpid_to_pid(nidpid);
+
 				if (last_nid == -1)
 					last_nid = this_nid;
-				if (last_nid != this_nid)
-					all_same_node = false;
+				if (last_pid == -1)
+					last_pid = this_pid;
+				if (last_nid != this_nid ||
+				    last_pid != this_pid) {
+					all_same_nidpid = false;
+				}
 
 				if (!pte_numa(oldpte)) {
 					ptent = pte_mknuma(ptent);
@@ -107,7 +115,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
-	*ret_all_same_node = all_same_node;
+	*ret_all_same_nidpid = all_same_nidpid;
 	return pages;
 }
 
@@ -134,7 +142,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 	pmd_t *pmd;
 	unsigned long next;
 	unsigned long pages = 0;
-	bool all_same_node;
+	bool all_same_nidpid;
 
 	pmd = pmd_offset(pud, addr);
 	do {
@@ -158,7 +166,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
 		pages += change_pte_range(vma, pmd, addr, next, newprot,
-				 dirty_accountable, prot_numa, &all_same_node);
+				 dirty_accountable, prot_numa, &all_same_nidpid);
 
 		/*
 		 * If we are changing protections for NUMA hinting faults then
@@ -166,7 +174,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 		 * node. This allows a regular PMD to be handled as one fault
 		 * and effectively batches the taking of the PTL
 		 */
-		if (prot_numa && all_same_node)
+		if (prot_numa && all_same_nidpid)
 			change_pmd_protnuma(vma->vm_mm, addr, pmd);
 	} while (pmd++, addr = next, addr != end);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd886fac451a..89bedd0e4cad 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page)
 		bad_page(page);
 		return 1;
 	}
-	page_nid_reset_last(page);
+	page_nidpid_reset_last(page);
 	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
 		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return 0;
@@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		page_mapcount_reset(page);
-		page_nid_reset_last(page);
+		page_nidpid_reset_last(page);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for