From 6036f373ea03687d355634fa70fb04baa95ab75e Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Wed, 10 Nov 2010 10:35:54 -0800
Subject: x86, cpu: Only CPU features determine NX capabilities

Fix the NX feature boot warning when NX is missing to correctly
reflect that BIOSes cannot disable NX now.

Signed-off-by: Kees Cook <kees.cook@canonical.com>
LKML-Reference: <1289414154-7829-5-git-send-email-kees.cook@canonical.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/setup_nx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
 {
 	if (!cpu_has_nx) {
 		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
-		       "missing in CPU or disabled in BIOS!\n");
+		       "missing in CPU!\n");
 	} else {
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 		if (disable_nx) {
-- 
cgit v1.2.2


From 9223081f54e3dc5045fe41a475165d9003c9a779 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 13 Nov 2010 10:52:09 -0800
Subject: x86: Use online node real index in calculate_tlb_offset()

Found a NUMA system that doesn't have RAM installed at the first
socket which hangs while executing init scripts.

bisected it to:

 | commit 932967202182743c01a2eee4bdfa2c42697bc586
 | Author: Shaohua Li <shaohua.li@intel.com>
 | Date:   Wed Oct 20 11:07:03 2010 +0800
 |
 |     x86: Spread tlb flush vector between nodes

It turns out that when the first socket is not online, the cpus on
node 1 can get a tlb_offset that is bigger than
NUM_INVALIDATE_TLB_VECTORS.

This can also affect other layouts, e.g. a 4-socket system where
socket 2 is not populated: socket 3 will get a tlb_offset that is
too big.

Use the real index of the online node instead of the node id.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Shaohua Li <shaohua.li@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4CDEDE59.40603@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/tlb.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 12cdbb17ad18..6acc724d5d8f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 
 static void __cpuinit calculate_tlb_offset(void)
 {
-	int cpu, node, nr_node_vecs;
+	int cpu, node, nr_node_vecs, idx = 0;
 	/*
 	 * we are changing tlb_vector_offset for each CPU in runtime, but this
 	 * will not cause inconsistency, as the write is atomic under X86. we
@@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void)
 		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
 
 	for_each_online_node(node) {
-		int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+		int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
 			nr_node_vecs;
 		int cpu_offset = 0;
 		for_each_cpu(cpu, cpumask_of_node(node)) {
@@ -248,6 +248,7 @@ static void __cpuinit calculate_tlb_offset(void)
 			cpu_offset++;
 			cpu_offset = cpu_offset % nr_node_vecs;
 		}
+		idx++;
 	}
 }
 
-- 
cgit v1.2.2


From 64edc8ed5ffae999d8d413ba006850e9e34166cb Mon Sep 17 00:00:00 2001
From: matthieu castet <castet.matthieu@free.fr>
Date: Tue, 16 Nov 2010 22:30:27 +0100
Subject: x86: Fix improper large page preservation

This patch fixes a bug in try_preserve_large_page() which may
result in improper large page preservation and improper
application of page attributes to the memory area outside of the
original change request.

More specifically, the problem manifests itself when set_memory_*()
is called for several pages at the beginning of the large page and
try_preserve_large_page() erroneously concludes that the change can
be applied to whole large page.

The fix consists of 3 parts:

  1. Addition of "required" protection attributes in
     static_protections(), so .data and .bss can be guaranteed to
     stay "RW"

  2. static_protections() is now called for every small
     page within large page to determine compatibility of new
     protection attributes (instead of just small pages within the
     requested range).

  3. Large page can be preserved only if attribute change is
     large-page-aligned and covers whole large page.

 -v1: Try_preserve_large_page() patch for Linux 2.6.34-rc2
 -v2: Replaced pfn check with address check for kernel rw-data

Signed-off-by: Siarhei Liakh <sliakh.lkml@gmail.com>
Signed-off-by: Xuxian Jiang <jiang@cs.ncsu.edu>
Reviewed-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: James Morris <jmorris@namei.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Dave Jones <davej@redhat.com>
Cc: Kees Cook <kees.cook@canonical.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4CE2F7F3.8030809@free.fr>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..6f2a6b6deb6b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -255,6 +255,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 				   unsigned long pfn)
 {
 	pgprot_t forbidden = __pgprot(0);
+	pgprot_t required = __pgprot(0);
 
 	/*
 	 * The BIOS area between 640k and 1Mb needs to be executable for
@@ -278,6 +279,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
+	/*
+	 * .data and .bss should always be writable.
+	 */
+	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
+	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
+		pgprot_val(required) |= _PAGE_RW;
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
 	/*
@@ -317,6 +324,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 #endif
 
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+	prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
 
 	return prot;
 }
@@ -393,7 +401,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 {
 	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
 	pte_t new_pte, old_pte, *tmp;
-	pgprot_t old_prot, new_prot;
+	pgprot_t old_prot, new_prot, req_prot;
 	int i, do_split = 1;
 	unsigned int level;
 
@@ -438,10 +446,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * We are safe now. Check whether the new pgprot is the same:
 	 */
 	old_pte = *kpte;
-	old_prot = new_prot = pte_pgprot(old_pte);
+	old_prot = new_prot = req_prot = pte_pgprot(old_pte);
 
-	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
-	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
 
 	/*
 	 * old_pte points to the large page base address. So we need
@@ -450,17 +458,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
 	cpa->pfn = pfn;
 
-	new_prot = static_protections(new_prot, address, pfn);
+	new_prot = static_protections(req_prot, address, pfn);
 
 	/*
 	 * We need to check the full range, whether
 	 * static_protection() requires a different pgprot for one of
 	 * the pages in the range we try to preserve:
 	 */
-	addr = address + PAGE_SIZE;
-	pfn++;
-	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
-		pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+	addr = address & pmask;
+	pfn = pte_pfn(old_pte);
+	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
 
 		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
 			goto out_unlock;
@@ -483,7 +491,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * that we limited the number of possible pages already to
 	 * the number of pages in the large page.
 	 */
-	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
 		/*
 		 * The address is aligned and the number of pages
 		 * covers the full page.
-- 
cgit v1.2.2


From 5bd5a452662bc37c54fb6828db1a3faf87e6511c Mon Sep 17 00:00:00 2001
From: Matthieu Castet <castet.matthieu@free.fr>
Date: Tue, 16 Nov 2010 22:31:26 +0100
Subject: x86: Add NX protection for kernel data

This patch expands functionality of CONFIG_DEBUG_RODATA to set main
(static) kernel data area as NX.

The following steps are taken to achieve this:

 1. Linker script is adjusted so .text always starts and ends on a page boundary
 2. Linker script is adjusted so .rodata always starts and ends on a page boundary
 3. NX is set for all pages from _etext through _end in mark_rodata_ro.
 4. free_init_pages() sets released memory NX in arch/x86/mm/init.c
 5. bios rom is set to x when pcibios is used.

The results of patch application may be observed in the diff of kernel page
table dumps:

pcibios:

 -- data_nx_pt_before.txt       2009-10-13 07:48:59.000000000 -0400
 ++ data_nx_pt_after.txt        2009-10-13 07:26:46.000000000 -0400
  0x00000000-0xc0000000           3G                           pmd
  ---[ Kernel Mapping ]---
 -0xc0000000-0xc0100000           1M     RW             GLB x  pte
 +0xc0000000-0xc00a0000         640K     RW             GLB NX pte
 +0xc00a0000-0xc0100000         384K     RW             GLB x  pte
 -0xc0100000-0xc03d7000        2908K     ro             GLB x  pte
 +0xc0100000-0xc0318000        2144K     ro             GLB x  pte
 +0xc0318000-0xc03d7000         764K     ro             GLB NX pte
 -0xc03d7000-0xc0600000        2212K     RW             GLB x  pte
 +0xc03d7000-0xc0600000        2212K     RW             GLB NX pte
  0xc0600000-0xf7a00000         884M     RW         PSE GLB NX pmd
  0xf7a00000-0xf7bfe000        2040K     RW             GLB NX pte
  0xf7bfe000-0xf7c00000           8K                           pte

No pcibios:

 -- data_nx_pt_before.txt       2009-10-13 07:48:59.000000000 -0400
 ++ data_nx_pt_after.txt        2009-10-13 07:26:46.000000000 -0400
  0x00000000-0xc0000000           3G                           pmd
  ---[ Kernel Mapping ]---
 -0xc0000000-0xc0100000           1M     RW             GLB x  pte
 +0xc0000000-0xc0100000           1M     RW             GLB NX pte
 -0xc0100000-0xc03d7000        2908K     ro             GLB x  pte
 +0xc0100000-0xc0318000        2144K     ro             GLB x  pte
 +0xc0318000-0xc03d7000         764K     ro             GLB NX pte
 -0xc03d7000-0xc0600000        2212K     RW             GLB x  pte
 +0xc03d7000-0xc0600000        2212K     RW             GLB NX pte
  0xc0600000-0xf7a00000         884M     RW         PSE GLB NX pmd
  0xf7a00000-0xf7bfe000        2040K     RW             GLB NX pte
  0xf7bfe000-0xf7c00000           8K                           pte

The patch has been originally developed for Linux 2.6.34-rc2 x86 by
Siarhei Liakh <sliakh.lkml@gmail.com> and Xuxian Jiang <jiang@cs.ncsu.edu>.

 -v1:  initial patch for 2.6.30
 -v2:  patch for 2.6.31-rc7
 -v3:  moved all code into arch/x86, adjusted credits
 -v4:  fixed ifdef, removed credits from CREDITS
 -v5:  fixed an address calculation bug in mark_nxdata_nx()
 -v6:  added acked-by and PT dump diff to commit log
 -v7:  minor adjustments for -tip
 -v8:  rework with the merge of "Set first MB as RW+NX"

Signed-off-by: Siarhei Liakh <sliakh.lkml@gmail.com>
Signed-off-by: Xuxian Jiang <jiang@cs.ncsu.edu>
Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: James Morris <jmorris@namei.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Dave Jones <davej@redhat.com>
Cc: Kees Cook <kees.cook@canonical.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4CE2F82E.60601@free.fr>
[ minor cleanliness edits ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init.c     |  3 ++-
 arch/x86/mm/init_32.c  | 20 +++++++++++++++++++-
 arch/x86/mm/init_64.c  |  3 ++-
 arch/x86/mm/pageattr.c |  5 ++++-
 4 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a13de7d..947f42abe820 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	/*
 	 * We just marked the kernel text read only above, now that
 	 * we are going to free part of that, we need to make that
-	 * writeable first.
+	 * writeable and non-executable first.
 	 */
+	set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
 	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
 
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401b..f89b5bb4e93f 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 
 static inline int is_kernel_text(unsigned long addr)
 {
-	if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+	if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
 		return 1;
 	return 0;
 }
@@ -912,6 +912,23 @@ void set_kernel_text_ro(void)
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 }
 
+static void mark_nxdata_nx(void)
+{
+	/*
+	 * When this is called, init has already been executed and released,
+	 * so everything past _etext should be NX.
+	 */
+	unsigned long start = PFN_ALIGN(_etext);
+	/*
+	 * This comes from is_kernel_text upper limit. Also HPAGE where used:
+	 */
+	unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+	if (__supported_pte_mask & _PAGE_NX)
+		printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+	set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +963,7 @@ void mark_rodata_ro(void)
 	printk(KERN_INFO "Testing CPA: write protecting again\n");
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 #endif
+	mark_nxdata_nx();
 }
 #endif
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 71a59296af80..ce59c05cae12 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -788,6 +788,7 @@ void mark_rodata_ro(void)
 	unsigned long rodata_start =
 		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
 	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+	unsigned long kernel_end = (((unsigned long)&__init_end + HPAGE_SIZE) & HPAGE_MASK);
 	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
 	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
 	unsigned long data_start = (unsigned long) &_sdata;
@@ -802,7 +803,7 @@ void mark_rodata_ro(void)
 	 * The rodata section (but not the kernel text!) should also be
 	 * not-executable.
 	 */
-	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+	set_memory_nx(rodata_start, (kernel_end - rodata_start) >> PAGE_SHIFT);
 
 	rodata_test();
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6f2a6b6deb6b..8b830ca14ac4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
 #include <linux/pfn.h>
 #include <linux/percpu.h>
 #include <linux/gfp.h>
+#include <linux/pci.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
@@ -261,8 +262,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	 * The BIOS area between 640k and 1Mb needs to be executable for
 	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 	 */
-	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_NX;
+#endif
 
 	/*
 	 * The kernel text needs to be executable for obvious reasons
-- 
cgit v1.2.2


From 9c0729dc8062bed96189bd14ac6d4920f3958743 Mon Sep 17 00:00:00 2001
From: Soeren Sandmann Pedersen <sandmann@redhat.com>
Date: Fri, 5 Nov 2010 05:59:39 -0400
Subject: x86: Eliminate bp argument from the stack tracing routines

The various stack tracing routines take a 'bp' argument in which the
caller is supposed to provide the base pointer to use, or 0 if it doesn't
have one. Since bp is garbage whenever CONFIG_FRAME_POINTER is not
defined, this means all callers in principle should either always pass
0, or be conditional on CONFIG_FRAME_POINTER.

However, there are only really three use cases for stack tracing:

(a) Trace the current task, including IRQ stack if any
(b) Trace the current task, but skip IRQ stack
(c) Trace some other task

In all cases, if CONFIG_FRAME_POINTER is not defined, bp should just
be 0.  If it _is_ defined, then

- in case (a) bp should be gotten directly from the CPU's register, so
  the caller should pass NULL for regs,

- in case (b) the caller should pass the IRQ registers to
  dump_trace(),

- in case (c) bp should be gotten from the top of the task's stack, so
  the caller should pass NULL for regs.

Hence, the bp argument is not necessary because the combination of
task and regs is sufficient to determine an appropriate value for bp.

This patch introduces a new inline function stack_frame(task, regs)
that computes the desired bp. This function is then called from the
two versions of dump_stack().

Signed-off-by: Soren Sandmann <ssp@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arjan van de Ven <arjan@infradead.org>,
Cc: Frederic Weisbecker <fweisbec@gmail.com>,
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>,
LKML-Reference: <m3oc9rop28.fsf@dhcp-100-3-82.bos.redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/mm/kmemcheck/error.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
 	e->trace.entries = e->trace_entries;
 	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
 	e->trace.skip = 0;
-	save_stack_trace_bp(&e->trace, regs->bp);
+	save_stack_trace_regs(&e->trace, regs);
 
 	/* Round address down to nearest 16 bytes */
 	shadow_copy = kmemcheck_shadow_lookup(address
-- 
cgit v1.2.2


From 691513f70d3957939a318da970987b876c720861 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Mon, 22 Nov 2010 14:03:28 +0100
Subject: x86: Resume trampoline must be executable

commit 5bd5a452(x86: Add NX protection for kernel data) marked the
trampoline area NX - which unsurprisingly breaks resume and cpu
hotplug.

Revert the portion of that commit, which touches the trampoline.

Originally-from: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <1290410581.2405.24.camel@minggr.sh.intel.com>
Cc: Matthieu Castet <castet.matthieu@free.fr>
Cc: Siarhei Liakh <sliakh.lkml@gmail.com>
Cc: Xuxian Jiang <jiang@cs.ncsu.edu>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Andi Kleen <andi@firstfloor.org>
Tested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/init_64.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ce59c05cae12..71a59296af80 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -788,7 +788,6 @@ void mark_rodata_ro(void)
 	unsigned long rodata_start =
 		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
 	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
-	unsigned long kernel_end = (((unsigned long)&__init_end + HPAGE_SIZE) & HPAGE_MASK);
 	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
 	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
 	unsigned long data_start = (unsigned long) &_sdata;
@@ -803,7 +802,7 @@ void mark_rodata_ro(void)
 	 * The rodata section (but not the kernel text!) should also be
 	 * not-executable.
 	 */
-	set_memory_nx(rodata_start, (kernel_end - rodata_start) >> PAGE_SHIFT);
+	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
 
 	rodata_test();
 
-- 
cgit v1.2.2


From f1157141636848f52c5f74040bed0ba355cf59b7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 7 Dec 2010 00:55:29 -0800
Subject: x86, apic: Remove early_init_lapic_mapping()

It is almost the same as smp_register_lapic_addr(). We just need to
let smp_read_mpc() call smp_register_lapic_addr() when early==1.

Add the apic_printk to smp_register_lapic_address()

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
LKML-Reference: <4CFDF681.3030509@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/amdtopology_64.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 51fae9cfdecb..08a0069b87a5 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -66,7 +66,6 @@ static __init void early_get_boot_cpu_id(void)
 	if (smp_found_config)
 		early_get_smp_config();
 #endif
-	early_init_lapic_mapping();
 }
 
 int __init amd_get_nodes(struct bootnode *physnodes)
-- 
cgit v1.2.2


From c10d1e260f7cb6766dc76b4e36ed8f4be53f195a Mon Sep 17 00:00:00 2001
From: Andres Salomon <dilinger@queued.net>
Date: Wed, 17 Nov 2010 06:09:52 +0000
Subject: x86, olpc: Add OLPC device-tree support

Make use of PROC_DEVICETREE to export the tree, and sparc's PROMTREE code to
call into OLPC's Open Firmware to build the tree.

v5: fix buglet with root node check (introduced in v4)

v4: address some minor style issues pointed out by Grant, and explicitly cast
    negative phandle checks to s32.

v3: rename olpc_prom to olpc_dt
  - rework Kconfig entries
  - drop devtree build hook from proc, instead adding a call to x86's
    paging_init (similarly to how sparc64 does it)
  - switch allocation from using slab to alloc_bootmem.  this allows
    the DT to be built earlier during boot (during setup_arch); the
    downside is that there are some 1200 bootmem reservations that are
    done during boot.  Not ideal..
  - add a helper olpc_ofw_is_installed function to test for the
    existence and successful detection of OLPC's OFW.

Signed-off-by: Andres Salomon <dilinger@queued.net>
LKML-Reference: <20101116220952.26526a80@queued.net>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/init_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401b..8c852e4af452 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -45,6 +45,7 @@
 #include <asm/bugs.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/paravirt.h>
@@ -715,6 +716,7 @@ void __init paging_init(void)
 	/*
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
+	olpc_dt_build_devicetree();
 	sparse_init();
 	zone_sizes_init();
 }
-- 
cgit v1.2.2


From d3bd058826aa8b79590cca6c8e6d1557bf576ada Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 16 Dec 2010 19:09:58 -0800
Subject: x86, acpi: Parse all SRAT cpu entries even above the cpu number
 limitation

Recent Intel systems use a different order in the MADT: all thread0
entries are listed first, then all thread1 entries. The SRAT table
still uses the old order and lists all cpus of one socket together.

If the kernel is compiled with a limited NR_CPUS, or booted with
nr_cpus=, the apic id to node mapping of some cpus may never be put
into apicid_to_node[].

For example, a 4-socket system with 64 cpus booted with nr_cpus=32
will crash:

[    9.106288] Total of 32 processors activated (136190.88 BogoMIPS).
[    9.235021] divide error: 0000 [#1] SMP
[    9.235315] last sysfs file:
[    9.235481] CPU 1
[    9.235592] Modules linked in:
[    9.245398]
[    9.245478] Pid: 2, comm: kthreadd Not tainted 2.6.37-rc1-tip-yh-01782-ge92ef79-dirty #274      /Sun Fire x4800
[    9.265415] RIP: 0010:[<ffffffff81075a8f>]  [<ffffffff81075a8f>] select_task_rq_fair+0x4f0/0x623
...
[    9.645938] RIP  [<ffffffff81075a8f>] select_task_rq_fair+0x4f0/0x623
[    9.665356]  RSP <ffff88103f8d1c40>
[    9.665568] ---[ end trace 2296156d35fdfc87 ]---

So just parse all cpu entries in the SRAT.

Also check the apicid against MAX_LOCAL_APIC, so that we cannot index
outside the bounds of apicid_to_node[].

This fixes the following bug too:
https://bugzilla.kernel.org/show_bug.cgi?id=22662

-v2: expand to 32bit according to hpa
   need to add MAX_LOCAL_APIC for 32bit

Reported-and-Tested-by: Wu Fengguang <fengguang.wu@intel.com>
Reported-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Tested-by: Myron Stowe <myron.stowe@hp.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D0AD486.9020704@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/srat_32.c |  1 +
 arch/x86/mm/srat_64.c | 10 ++++++++++
 2 files changed, 11 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index a17dffd136c1..f16434568a51 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
 	/* mark this node as "seen" in node bitmap */
 	BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
 
+	/* don't need to check apic_id here, because it is always 8 bits */
 	apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
 
 	printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d8b060..171a0aacb99a 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 	}
 
 	apic_id = pa->apic_id;
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+		return;
+	}
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
 	else
 		apic_id = pa->apic_id;
+
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+		return;
+	}
+
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
-- 
cgit v1.2.2


From 4e76f4e67a106ed827ca721b4c8b622047cd2f6d Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 22 Dec 2010 17:23:47 -0800
Subject: x86, numa: Avoid compiling NUMA emulation functions without
 CONFIG_NUMA_EMU

Both acpi_get_nodes() and amd_get_nodes() are only necessary when
CONFIG_NUMA_EMU is enabled, so avoid compiling them when the option is
disabled.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221701210.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/amdtopology_64.c | 2 ++
 arch/x86/mm/srat_64.c        | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 51fae9cfdecb..fe050af614e2 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -69,6 +69,7 @@ static __init void early_get_boot_cpu_id(void)
 	early_init_lapic_mapping();
 }
 
+#ifdef CONFIG_NUMA_EMU
 int __init amd_get_nodes(struct bootnode *physnodes)
 {
 	int i;
@@ -81,6 +82,7 @@ int __init amd_get_nodes(struct bootnode *physnodes)
 	}
 	return ret;
 }
+#endif /* CONFIG_NUMA_EMU */
 
 int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 {
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d8b060..8241bf0f6eb2 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -339,6 +339,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 
 void __init acpi_numa_arch_fixup(void) {}
 
+#ifdef CONFIG_NUMA_EMU
 int __init acpi_get_nodes(struct bootnode *physnodes)
 {
 	int i;
@@ -351,6 +352,7 @@ int __init acpi_get_nodes(struct bootnode *physnodes)
 	}
 	return ret;
 }
+#endif /* CONFIG_NUMA_EMU */
 
 /* Use the information discovered above to actually set up the nodes. */
 int __init acpi_scan_nodes(unsigned long start, unsigned long end)
-- 
cgit v1.2.2


From f51bf3073a145a5b3263fd882c52d6ec04b687da Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 22 Dec 2010 17:23:51 -0800
Subject: x86, numa: Fake apicid and pxm mappings for NUMA emulation

This patch adds the equivalent of acpi_fake_nodes() for AMD Northbridge
platforms.  The goal is to fake the apicid-to-node mappings for NUMA
emulation so the physical topology of the machine is correctly maintained
within the kernel.

This change also fakes proximity domains for both ACPI and k8 code so the
physical distance between emulated nodes is maintained via
node_distance().  This exports the correct distances via
/sys/devices/system/node/.../distance based on the underlying topology.

A new helper function, fake_physnodes(), is introduced to correctly
invoke the correct NUMA code to fake these two mappings based on the
system type.  If there is no underlying NUMA configuration, all cpus are
mapped to node 0 for local distance.

Since acpi_fake_nodes() is no longer called with CONFIG_ACPI_NUMA, its
prototype can be removed from the header file for such a configuration.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221701360.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/amdtopology_64.c | 91 ++++++++++++++++++++++++++++++++++++--------
 arch/x86/mm/numa_64.c        | 20 +++++++++-
 arch/x86/mm/srat_64.c        |  2 -
 3 files changed, 94 insertions(+), 19 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index fe050af614e2..eb5cbb97b68d 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -27,6 +27,7 @@
 #include <asm/amd_nb.h>
 
 static struct bootnode __initdata nodes[8];
+static unsigned char __initdata nodeids[8];
 static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
 
 static __init int find_northbridge(void)
@@ -69,21 +70,6 @@ static __init void early_get_boot_cpu_id(void)
 	early_init_lapic_mapping();
 }
 
-#ifdef CONFIG_NUMA_EMU
-int __init amd_get_nodes(struct bootnode *physnodes)
-{
-	int i;
-	int ret = 0;
-
-	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
-	}
-	return ret;
-}
-#endif /* CONFIG_NUMA_EMU */
-
 int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long start = PFN_PHYS(start_pfn);
@@ -116,7 +102,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
 
-		nodeid = limit & 7;
+		nodeids[i] = nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
 				pr_info("Skipping disabled node %d\n", i);
@@ -196,6 +182,79 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 	return 0;
 }
 
+#ifdef CONFIG_NUMA_EMU
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+int __init amd_get_nodes(struct bootnode *physnodes)
+{
+	int i;
+	int ret = 0;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[ret].start = nodes[i].start;
+		physnodes[ret].end = nodes[i].end;
+		ret++;
+	}
+	return ret;
+}
+
+static int __init find_node_by_addr(unsigned long addr)
+{
+	int ret = NUMA_NO_NODE;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		if (addr >= nodes[i].start && addr < nodes[i].end) {
+			ret = i;
+			break;
+		}
+	return ret;
+}
+
+/*
+ * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
+ * setup to represent the physical topology but reflect the emulated
+ * environment.  For each emulated node, the real node which it appears on is
+ * found and a fake pxm to nid mapping is created which mirrors the actual
+ * locality.  node_distance() then represents the correct distances between
+ * emulated nodes by using the fake acpi mappings to pxms.
+ */
+void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
+{
+	unsigned int bits;
+	unsigned int cores;
+	unsigned int apicid_base = 0;
+	int i;
+
+	bits = boot_cpu_data.x86_coreid_bits;
+	cores = 1 << bits;
+	early_get_boot_cpu_id();
+	if (boot_cpu_physical_apicid > 0)
+		apicid_base = boot_cpu_physical_apicid;
+
+	for (i = 0; i < nr_nodes; i++) {
+		int index;
+		int nid;
+		int j;
+
+		nid = find_node_by_addr(nodes[i].start);
+		if (nid == NUMA_NO_NODE)
+			continue;
+
+		index = nodeids[nid] << bits;
+		if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
+			for (j = apicid_base; j < cores + apicid_base; j++)
+				fake_apicid_to_node[index + j] = i;
+#ifdef CONFIG_ACPI_NUMA
+		__acpi_map_pxm_to_node(nid, i);
+#endif
+	}
+	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+}
+#endif /* CONFIG_NUMA_EMU */
+
 int __init amd_scan_nodes(void)
 {
 	unsigned int bits;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7762a517d69d..cc390f3a1bde 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -324,6 +324,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
 	return ret;
 }
 
+static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+{
+	int i;
+
+	BUG_ON(acpi && amd);
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		acpi_fake_nodes(nodes, nr_nodes);
+#endif
+#ifdef CONFIG_AMD_NUMA
+	if (amd)
+		amd_fake_nodes(nodes, nr_nodes);
+#endif
+	if (!acpi && !amd)
+		for (i = 0; i < nr_cpu_ids; i++)
+			numa_set_node(i, 0);
+}
+
 /*
  * Setups up nid to range from addr to addr + size.  If the end
  * boundary is greater than max_addr, then max_addr is used instead.
@@ -595,7 +613,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 						nodes[i].end >> PAGE_SHIFT);
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
-	acpi_fake_nodes(nodes, num_nodes);
+	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
 	return 0;
 }
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 8241bf0f6eb2..c48b443706c5 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -497,8 +497,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 {
 	int i, j;
 
-	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
-			 "topology.\n");
 	for (i = 0; i < num_nodes; i++) {
 		int nid, pxm;
 
-- 
cgit v1.2.2


From c1c3443c9c5e9be92641029ed229a41563e44506 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 22 Dec 2010 17:23:54 -0800
Subject: x86, numa: Fake node-to-cpumask for NUMA emulation

It's necessary to fake the node-to-cpumask mapping so that an emulated
node ID returns a cpumask that includes all cpus that have affinity to
the memory it represents.

This is a little intrusive because it requires knowledge of the physical
topology of the system.  setup_physnodes() gives us that information, but
since NUMA emulation ends up altering the physnodes array, it's necessary
to reset it before cpus are brought online.

Accordingly, the physnodes array is moved out of init.data and into
cpuinit.data since it will be needed on cpuup callbacks.

This works regardless of whether numa=fake is used on the command line,
or the setup of the fake node succeeds or fails.  The physnodes array
always contains the physical topology of the machine if CONFIG_NUMA_EMU
is enabled and can be used to setup the correct node-to-cpumask mappings
in all cases since setup_physnodes() is called whenever the array needs
to be repopulated with the correct data.

To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are
rewritten for CONFIG_NUMA_EMU so that we first find the physical node to
which each cpu has local affinity, then iterate through all online nodes
to find the emulated nodes that have local affinity to that physical
node, and then finally map the cpu to each of those emulated nodes.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/numa_64.c | 99 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 79 insertions(+), 20 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index cc390f3a1bde..dd300c491f1f 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -260,7 +260,7 @@ void __init numa_init_array(void)
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
 static char *cmdline __initdata;
 
 static int __init setup_physnodes(unsigned long start, unsigned long end,
@@ -270,6 +270,7 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
 	int ret = 0;
 	int i;
 
+	memset(physnodes, 0, sizeof(physnodes));
 #ifdef CONFIG_ACPI_NUMA
 	if (acpi)
 		nr_nodes = acpi_get_nodes(physnodes);
@@ -370,8 +371,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
  * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
  * to max_addr.  The return value is the number of nodes allocated.
  */
-static int __init split_nodes_interleave(u64 addr, u64 max_addr,
-						int nr_phys_nodes, int nr_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
 	u64 size;
@@ -402,7 +402,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 		return -1;
 	}
 
-	for (i = 0; i < nr_phys_nodes; i++)
+	for (i = 0; i < MAX_NUMNODES; i++)
 		if (physnodes[i].start != physnodes[i].end)
 			node_set(i, physnode_mask);
 
@@ -571,11 +571,9 @@ static int __init numa_emulation(unsigned long start_pfn,
 {
 	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
-	int num_phys_nodes;
 	int num_nodes;
 	int i;
 
-	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
 	/*
 	 * If the numa=fake command-line contains a 'M' or 'G', it represents
 	 * the fixed node size.  Otherwise, if it is just a single number N,
@@ -590,7 +588,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 		unsigned long n;
 
 		n = simple_strtoul(cmdline, NULL, 0);
-		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
+		num_nodes = split_nodes_interleave(addr, max_addr, n);
 	}
 
 	if (num_nodes < 0)
@@ -613,6 +611,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 						nodes[i].end >> PAGE_SHIFT);
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
+	setup_physnodes(addr, max_addr, acpi, amd);
 	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
 	return 0;
@@ -628,8 +627,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
+	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+			acpi, amd);
 	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
 		return;
+	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+			acpi, amd);
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
@@ -785,6 +788,7 @@ void __cpuinit numa_clear_node(int cpu)
 
 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
 
+#ifndef CONFIG_NUMA_EMU
 void __cpuinit numa_add_cpu(int cpu)
 {
 	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -794,6 +798,51 @@ void __cpuinit numa_remove_cpu(int cpu)
 {
 	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 }
+#else
+void __cpuinit numa_add_cpu(int cpu)
+{
+	unsigned long addr;
+	u16 apicid;
+	int physnid;
+	int nid = NUMA_NO_NODE;
+
+	apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+	if (apicid != BAD_APICID)
+		nid = apicid_to_node[apicid];
+	if (nid == NUMA_NO_NODE)
+		nid = early_cpu_to_node(cpu);
+	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+	/*
+	 * Use the starting address of the emulated node to find which physical
+	 * node it is allocated on.
+	 */
+	addr = node_start_pfn(nid) << PAGE_SHIFT;
+	for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
+		if (addr >= physnodes[physnid].start &&
+		    addr < physnodes[physnid].end)
+			break;
+
+	/*
+	 * Map the cpu to each emulated node that is allocated on the physical
+	 * node of the cpu's apic id.
+	 */
+	for_each_online_node(nid) {
+		addr = node_start_pfn(nid) << PAGE_SHIFT;
+		if (addr >= physnodes[physnid].start &&
+		    addr < physnodes[physnid].end)
+			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+	}
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	int i;
+
+	for_each_online_node(i)
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#endif /* !CONFIG_NUMA_EMU */
 
 #else /* CONFIG_DEBUG_PER_CPU_MAPS */
 
@@ -805,22 +854,32 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
 	int node = early_cpu_to_node(cpu);
 	struct cpumask *mask;
 	char buf[64];
+	int i;
 
-	mask = node_to_cpumask_map[node];
-	if (mask == NULL) {
-		printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
-		dump_stack();
-		return;
-	}
+	for_each_online_node(i) {
+		unsigned long addr;
 
-	if (enable)
-		cpumask_set_cpu(cpu, mask);
-	else
-		cpumask_clear_cpu(cpu, mask);
+		addr = node_start_pfn(i) << PAGE_SHIFT;
+		if (addr < physnodes[node].start ||
+					addr >= physnodes[node].end)
+			continue;
+		mask = node_to_cpumask_map[node];
+		if (mask == NULL) {
+			pr_err("node_to_cpumask_map[%i] NULL\n", i);
+			dump_stack();
+			return;
+		}
+
+		if (enable)
+			cpumask_set_cpu(cpu, mask);
+		else
+			cpumask_clear_cpu(cpu, mask);
 
-	cpulist_scnprintf(buf, sizeof(buf), mask);
-	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+		cpulist_scnprintf(buf, sizeof(buf), mask);
+		printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+			enable ? "numa_add_cpu" : "numa_remove_cpu",
+			cpu, node, buf);
+	}
 }
 
 void __cpuinit numa_add_cpu(int cpu)
-- 
cgit v1.2.2


From a387e95a49743cf9835c5299ca549232618d8249 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 22 Dec 2010 17:23:56 -0800
Subject: x86, numa: Fix cpu to node mapping for sparse node ids

NUMA boot code assumes that physical node ids start at 0, but the DIMMs
that the apic id represents may not be reachable.  If this is the case,
node 0 is never online and cpus never end up getting appropriately
assigned to a node.  This causes the cpumask of all online nodes to be
empty and machines crash with kernel code assuming online nodes have
valid cpus.

The fix is to appropriately map all the address ranges for physical nodes
and ensure the cpu to node mapping function checks all possible nodes (up
to MAX_NUMNODES) instead of simply checking nodes 0-N, where N is the
number of physical nodes, for valid address ranges.

This requires no longer "compressing" the address ranges of nodes in the
physical node map from 0-N, but rather leave indices in physnodes[] to
represent the actual node id of the physical node.  Accordingly, the
topology exported by both amd_get_nodes() and acpi_get_nodes() no longer
must return the number of nodes to iterate through; all such iterations
will now be to MAX_NUMNODES.

This change also passes the end address of system RAM (which may be
different from normal operation if mem= is specified on the command line)
before the physnodes[] array is populated.  ACPI parsed nodes are
truncated to fit within the address range that respects the mem=
boundaries and even some physical nodes may become unreachable in such
cases.

When NUMA emulation does succeed, any apicid to node mappings that exist
for unreachable nodes are given default values so that proximity domains
can still be assigned.  This is important for node_distance() to
function as desired.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221702090.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/amdtopology_64.c |  9 +++------
 arch/x86/mm/numa_64.c        | 18 +++---------------
 arch/x86/mm/srat_64.c        | 22 ++++++++++++++++------
 3 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index eb5cbb97b68d..0df2623d1039 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -187,17 +187,14 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
-int __init amd_get_nodes(struct bootnode *physnodes)
+void __init amd_get_nodes(struct bootnode *physnodes)
 {
 	int i;
-	int ret = 0;
 
 	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
+		physnodes[i].start = nodes[i].start;
+		physnodes[i].end = nodes[i].end;
 	}
-	return ret;
 }
 
 static int __init find_node_by_addr(unsigned long addr)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index dd300c491f1f..3d73201ba347 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -266,25 +266,24 @@ static char *cmdline __initdata;
 static int __init setup_physnodes(unsigned long start, unsigned long end,
 					int acpi, int amd)
 {
-	int nr_nodes = 0;
 	int ret = 0;
 	int i;
 
 	memset(physnodes, 0, sizeof(physnodes));
 #ifdef CONFIG_ACPI_NUMA
 	if (acpi)
-		nr_nodes = acpi_get_nodes(physnodes);
+		acpi_get_nodes(physnodes, start, end);
 #endif
 #ifdef CONFIG_AMD_NUMA
 	if (amd)
-		nr_nodes = amd_get_nodes(physnodes);
+		amd_get_nodes(physnodes);
 #endif
 	/*
 	 * Basic sanity checking on the physical node map: there may be errors
 	 * if the SRAT or AMD code incorrectly reported the topology or the mem=
 	 * kernel parameter is used.
 	 */
-	for (i = 0; i < nr_nodes; i++) {
+	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (physnodes[i].start == physnodes[i].end)
 			continue;
 		if (physnodes[i].start > end) {
@@ -299,17 +298,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
 			physnodes[i].start = start;
 		if (physnodes[i].end > end)
 			physnodes[i].end = end;
-	}
-
-	/*
-	 * Remove all nodes that have no memory or were truncated because of the
-	 * limited address range.
-	 */
-	for (i = 0; i < nr_nodes; i++) {
-		if (physnodes[i].start == physnodes[i].end)
-			continue;
-		physnodes[ret].start = physnodes[i].start;
-		physnodes[ret].end = physnodes[i].end;
 		ret++;
 	}
 
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index c48b443706c5..a756bcf3fa48 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -340,17 +340,16 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 void __init acpi_numa_arch_fixup(void) {}
 
 #ifdef CONFIG_NUMA_EMU
-int __init acpi_get_nodes(struct bootnode *physnodes)
+void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+				unsigned long end)
 {
 	int i;
-	int ret = 0;
 
 	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
+		cutoff_node(i, start, end);
+		physnodes[i].start = nodes[i].start;
+		physnodes[i].end = nodes[i].end;
 	}
-	return ret;
 }
 #endif /* CONFIG_NUMA_EMU */
 
@@ -516,6 +515,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 			    fake_apicid_to_node[j] == NUMA_NO_NODE)
 				fake_apicid_to_node[j] = i;
 	}
+
+	/*
+	 * If there are apicid-to-node mappings for physical nodes that do not
+	 * have a corresponding emulated node, it should default to a guaranteed
+	 * value.
+	 */
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		if (apicid_to_node[i] != NUMA_NO_NODE &&
+		    fake_apicid_to_node[i] == NUMA_NO_NODE)
+			fake_apicid_to_node[i] = 0;
+
 	for (i = 0; i < num_nodes; i++)
 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
 	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
-- 
cgit v1.2.2


From d906f0eb2f0e6d1a24c479f69a9c39e7e45c5ae8 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 30 Dec 2010 10:54:16 -0800
Subject: x86, numa: Fix CONFIG_DEBUG_PER_CPU_MAPS without NUMA emulation

"x86, numa: Fake node-to-cpumask for NUMA emulation" broke the
build when CONFIG_DEBUG_PER_CPU_MAPS is set and CONFIG_NUMA_EMU
is not.  This is because it is possible to map a cpu to multiple
nodes when NUMA emulation is used; the patch required a physical
node address table to find those nodes that was only available
when CONFIG_NUMA_EMU was enabled.

This extracts the common debug functionality to its own function
for CONFIG_DEBUG_PER_CPU_MAPS and uses it regardless of whether
CONFIG_NUMA_EMU is set or not.

NUMA emulation will now iterate over the set of possible nodes
for each cpu and call the new debug function whereas only the
cpu's node will be used without NUMA emulation enabled.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <alpine.DEB.2.00.1012301053590.12995@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/numa_64.c | 48 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 11 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 3d73201ba347..1e72102e80c9 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -833,15 +833,48 @@ void __cpuinit numa_remove_cpu(int cpu)
 #endif /* !CONFIG_NUMA_EMU */
 
 #else /* CONFIG_DEBUG_PER_CPU_MAPS */
+static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	struct cpumask *mask;
+	char buf[64];
+
+	mask = node_to_cpumask_map[node];
+	if (!mask) {
+		pr_err("node_to_cpumask_map[%i] NULL\n", node);
+		dump_stack();
+		return NULL;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu",
+		cpu, node, buf);
+	return mask;
+}
 
 /*
  * --------- debug versions of the numa functions ---------
  */
+#ifndef CONFIG_NUMA_EMU
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	struct cpumask *mask;
+
+	mask = debug_cpumask_set_cpu(cpu, enable);
+	if (!mask)
+		return;
+
+	if (enable)
+		cpumask_set_cpu(cpu, mask);
+	else
+		cpumask_clear_cpu(cpu, mask);
+}
+#else
 static void __cpuinit numa_set_cpumask(int cpu, int enable)
 {
 	int node = early_cpu_to_node(cpu);
 	struct cpumask *mask;
-	char buf[64];
 	int i;
 
 	for_each_online_node(i) {
@@ -851,24 +884,17 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
 		if (addr < physnodes[node].start ||
 					addr >= physnodes[node].end)
 			continue;
-		mask = node_to_cpumask_map[node];
-		if (mask == NULL) {
-			pr_err("node_to_cpumask_map[%i] NULL\n", i);
-			dump_stack();
+		mask = debug_cpumask_set_cpu(cpu, enable);
+		if (!mask)
 			return;
-		}
 
 		if (enable)
 			cpumask_set_cpu(cpu, mask);
 		else
 			cpumask_clear_cpu(cpu, mask);
-
-		cpulist_scnprintf(buf, sizeof(buf), mask);
-		printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-			enable ? "numa_add_cpu" : "numa_remove_cpu",
-			cpu, node, buf);
 	}
 }
+#endif /* CONFIG_NUMA_EMU */
 
 void __cpuinit numa_add_cpu(int cpu)
 {
-- 
cgit v1.2.2


From 9180706344487700b40da9eca5dedd3d11cb33b4 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:32 -0800
Subject: thp: alter compound get_page/put_page

Alter compound get_page/put_page to keep references on subpages too, in
order to allow __split_huge_page_refcount to split a hugepage even while
subpages have been pinned by one of the get_user_pages() variants.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/gup.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..06f56fcf9a77 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -105,6 +105,16 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 	atomic_add(nr, &page->_count);
 }
 
+static inline void get_huge_page_tail(struct page *page)
+{
+	/*
+	 * __split_huge_page_refcount() cannot run
+	 * from under us.
+	 */
+	VM_BUG_ON(atomic_read(&page->_count) < 0);
+	atomic_inc(&page->_count);
+}
+
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
@@ -128,6 +138,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
+		if (PageTail(page))
+			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
-- 
cgit v1.2.2


From db3eb96f4e6281b84dd33c8980dacc27f2efe177 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:41 -0800
Subject: thp: add pmd mangling functions to x86

Add needed pmd mangling functions with symmetry with their pte
counterparts.  pmdp_splitting_flush() is the only new addition on the pmd_
methods and it's needed to serialize the VM against split_huge_page.  It
simply atomically sets the splitting bit, similar to the way
pmdp_clear_flush_young atomically clears the accessed bit.
pmdp_splitting_flush() also has to flush the tlb to make it effective
against gup_fast, but it wouldn't really require to flush the tlb too.
Just the tlb flush is the simplest operation we can invoke to serialize
pmdp_splitting_flush() against gup_fast.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/pgtable.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..65e92d58f942 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 	return changed;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp,
+			  pmd_t entry, int dirty)
+{
+	int changed = !pmd_same(*pmdp, entry);
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	if (changed && dirty) {
+		*pmdp = entry;
+		pmd_update_defer(vma->vm_mm, address, pmdp);
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	}
+
+	return changed;
+}
+#endif
+
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
 			      unsigned long addr, pte_t *ptep)
 {
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 	return ret;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long addr, pmd_t *pmdp)
+{
+	int ret = 0;
+
+	if (pmd_young(*pmdp))
+		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+					 (unsigned long *) &pmdp->pmd);
+
+	if (ret)
+		pmd_update(vma->vm_mm, addr, pmdp);
+
+	return ret;
+}
+#endif
+
 int ptep_clear_flush_young(struct vm_area_struct *vma,
 			   unsigned long address, pte_t *ptep)
 {
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
 	return young;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+			   unsigned long address, pmd_t *pmdp)
+{
+	int young;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	young = pmdp_test_and_clear_young(vma, address, pmdp);
+	if (young)
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+	return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp)
+{
+	int set;
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+				(unsigned long *)&pmdp->pmd);
+	if (set) {
+		pmd_update(vma->vm_mm, address, pmdp);
+		/* need tlb flush only to serialize against gup-fast */
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	}
+}
+#endif
+
 /**
  * reserve_top_address - reserves a hole in the top of kernel address space
  * @reserve - size of hole to reserve
-- 
cgit v1.2.2


From 64cc6ae001d70bc59e5f854e6b5678f59110df16 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:42 -0800
Subject: thp: bail out gup_fast on splitting pmd

Force gup_fast to take the slow path and block if the pmd is splitting,
not only if it's none.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/gup.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 06f56fcf9a77..269aa53932e0 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -160,7 +160,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		pmd_t pmd = *pmdp;
 
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
+		/*
+		 * The pmd_trans_splitting() check below explains why
+		 * pmdp_splitting_flush has to flush the tlb, to stop
+		 * this gup-fast code from running while we set the
+		 * splitting bit in the pmd. Returning zero will take
+		 * the slow path that will call wait_split_huge_page()
+		 * if the pmd is still in splitting state. gup-fast
+		 * can't because it has irq disabled and
+		 * wait_split_huge_page() would never return as the
+		 * tlb flush IPI wouldn't run.
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
 		if (unlikely(pmd_large(pmd))) {
 			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
-- 
cgit v1.2.2


From f2d6bfe9ff0acec30b713614260e78b03d20e909 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 13 Jan 2011 15:47:01 -0800
Subject: thp: add x86 32bit support

Add support for transparent hugepages to x86 32bit.

Share the same VM_ bitflag for VM_MAPPED_COPY.  mm/nommu.c will never
support transparent hugepages.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/pgtable.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 65e92d58f942..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -362,7 +362,7 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 
 	if (pmd_young(*pmdp))
 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
-					 (unsigned long *) &pmdp->pmd);
+					 (unsigned long *)pmdp);
 
 	if (ret)
 		pmd_update(vma->vm_mm, addr, pmdp);
@@ -404,7 +404,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
 	int set;
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 	set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
-				(unsigned long *)&pmdp->pmd);
+				(unsigned long *)pmdp);
 	if (set) {
 		pmd_update(vma->vm_mm, address, pmdp);
 		/* need tlb flush only to serialize against gup-fast */
-- 
cgit v1.2.2


From 8ee53820edfd1f3b6554c593f337148dd3d7fc91 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:47:10 -0800
Subject: thp: mmu_notifier_test_young

For GRU and EPT, we need gup-fast to set referenced bit too (this is why
it's correct to return 0 when shadow_access_mask is zero, it requires
gup-fast to set the referenced bit).  qemu-kvm access already sets the
young bit in the pte if it isn't zero-copy, if it's zero copy or a shadow
paging EPT minor fault we rely on gup-fast to signal the page is in
use...

We also need to check the young bits on the secondary pagetables for NPT
and not nested shadow mmu as the data may never get accessed again by the
primary pte.

Without this closer accuracy, we'd have to remove the heuristic that
avoids collapsing hugepages in hugepage virtual regions that have not even
a single subpage in use.

->test_young is fully backwards compatible with GRU and other usages that
don't have young bits in pagetables set by the hardware and that should
nuke the secondary mmu mappings when ->clear_flush_young runs just like
EPT does.

Removing the heuristic that checks the young bit in
khugepaged/collapse_huge_page completely isn't so bad either probably but
I thought it was worth it and this makes it reliable.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/gup.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 269aa53932e0..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #include <asm/pgtable.h>
 
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
 		get_page(page);
+		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
 
@@ -103,6 +105,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 	VM_BUG_ON(page != compound_head(page));
 	VM_BUG_ON(page_count(page) == 0);
 	atomic_add(nr, &page->_count);
+	SetPageReferenced(page);
 }
 
 static inline void get_huge_page_tail(struct page *page)
-- 
cgit v1.2.2


From 9032160275ba018003ff390835ff8ed2b5b788b8 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 19 Jan 2011 08:57:21 +0000
Subject: x86: Unify "numa=" command line option handling

In order to be able to suppress the use of SRAT tables that
32-bit Linux can't deal with (in one case known to lead to a
non-bootable system, unless disabling ACPI altogether), move the
"numa=" option handling to common code.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Renninger <trenn@suse.de>
LKML-Reference: <4D36B581020000780002D0FF@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/numa.c    | 22 ++++++++++++++++++++++
 arch/x86/mm/numa_64.c | 24 +++++-------------------
 arch/x86/mm/srat_32.c |  1 -
 3 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..ebf6d7887a38 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,28 @@
 #include <linux/topology.h>
 #include <linux/module.h>
 #include <linux/bootmem.h>
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+int __initdata numa_off;
+
+static __init int numa_setup(char *opt)
+{
+	if (!opt)
+		return -EINVAL;
+	if (!strncmp(opt, "off", 3))
+		numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+	if (!strncmp(opt, "fake=", 5))
+		numa_emu_cmdline(opt + 5);
+#endif
+#ifdef CONFIG_ACPI_NUMA
+	if (!strncmp(opt, "noacpi", 6))
+		acpi_numa = -1;
+#endif
+	return 0;
+}
+early_param("numa", numa_setup);
 
 /*
  * Which logical CPUs are on which nodes
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1e72102e80c9..95ea1551eebc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
-int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
@@ -263,6 +262,11 @@ static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
 static char *cmdline __initdata;
 
+void __init numa_emu_cmdline(char *str)
+{
+	cmdline = str;
+}
+
 static int __init setup_physnodes(unsigned long start, unsigned long end,
 					int acpi, int amd)
 {
@@ -670,24 +674,6 @@ unsigned long __init numa_free_all_bootmem(void)
 	return pages;
 }
 
-static __init int numa_setup(char *opt)
-{
-	if (!opt)
-		return -EINVAL;
-	if (!strncmp(opt, "off", 3))
-		numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
-	if (!strncmp(opt, "fake=", 5))
-		cmdline = opt + 5;
-#endif
-#ifdef CONFIG_ACPI_NUMA
-	if (!strncmp(opt, "noacpi", 6))
-		acpi_numa = -1;
-#endif
-	return 0;
-}
-early_param("numa", numa_setup);
-
 #ifdef CONFIG_NUMA
 
 static __init int find_near_online_node(int node)
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index f16434568a51..ae96e7b8051d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
 static int __initdata num_memory_chunks; /* total number of memory chunks */
 static u8 __initdata apicid_to_pxm[MAX_APICID];
 
-int numa_off __initdata;
 int acpi_numa __initdata;
 
 static __init void bad_srat(void)
-- 
cgit v1.2.2


From f12d3d04e8f6223276abb068c5d72852174b8c31 Mon Sep 17 00:00:00 2001
From: Matthieu CASTET <castet.matthieu@free.fr>
Date: Thu, 20 Jan 2011 21:11:45 +0100
Subject: x86, nx: Don't force pages RW when setting NX bits

Xen wants page table pages to be read-only.

But the initial page tables (from head_*.S) live in .data or .bss.

That was broken by 64edc8ed5ffae999d8d413ba006850e9e34166cb.  There is
absolutely no reason to force these pages RW after they have already
been marked RO.

Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>
Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/pageattr.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8b830ca14ac4..d343b3c81f3c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -256,7 +256,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 				   unsigned long pfn)
 {
 	pgprot_t forbidden = __pgprot(0);
-	pgprot_t required = __pgprot(0);
 
 	/*
 	 * The BIOS area between 640k and 1Mb needs to be executable for
@@ -282,12 +281,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
-	/*
-	 * .data and .bss should always be writable.
-	 */
-	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
-	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
-		pgprot_val(required) |= _PAGE_RW;
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
 	/*
@@ -327,7 +320,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 #endif
 
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
-	prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
 
 	return prot;
 }
-- 
cgit v1.2.2