From 6036f373ea03687d355634fa70fb04baa95ab75e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Nov 2010 10:35:54 -0800 Subject: x86, cpu: Only CPU features determine NX capabilities Fix the NX feature boot warning when NX is missing to correctly reflect that BIOSes cannot disable NX now. Signed-off-by: Kees Cook LKML-Reference: <1289414154-7829-5-git-send-email-kees.cook@canonical.com> Acked-by: Pekka Enberg Acked-by: Alan Cox Signed-off-by: H. Peter Anvin --- arch/x86/mm/setup_nx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index a3250aa34086..410531d3c292 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -41,7 +41,7 @@ void __init x86_report_nx(void) { if (!cpu_has_nx) { printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "missing in CPU or disabled in BIOS!\n"); + "missing in CPU!\n"); } else { #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) if (disable_nx) { -- cgit v1.2.2 From 9223081f54e3dc5045fe41a475165d9003c9a779 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 13 Nov 2010 10:52:09 -0800 Subject: x86: Use online node real index in calulate_tbl_offset() Found a NUMA system that doesn't have RAM installed at the first socket which hangs while executing init scripts. bisected it to: | commit 932967202182743c01a2eee4bdfa2c42697bc586 | Author: Shaohua Li | Date: Wed Oct 20 11:07:03 2010 +0800 | | x86: Spread tlb flush vector between nodes It turns out when first socket is not online it could have cpus on node1 tlb_offset set to bigger than NUM_INVALIDATE_TLB_VECTORS. That could affect systems like 4 sockets, but socket 2 doesn't have installed, sockets 3 will get too big tlb_offset. Need to use real online node idx. Signed-off-by: Yinghai Lu Acked-by: Shaohua Li Cc: Linus Torvalds LKML-Reference: <4CDEDE59.40603@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 12cdbb17ad18..6acc724d5d8f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, static void __cpuinit calculate_tlb_offset(void) { - int cpu, node, nr_node_vecs; + int cpu, node, nr_node_vecs, idx = 0; /* * we are changing tlb_vector_offset for each CPU in runtime, but this * will not cause inconsistency, as the write is atomic under X86. we @@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void) nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; for_each_online_node(node) { - int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * + int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs; int cpu_offset = 0; for_each_cpu(cpu, cpumask_of_node(node)) { @@ -248,6 +248,7 @@ static void __cpuinit calculate_tlb_offset(void) cpu_offset++; cpu_offset = cpu_offset % nr_node_vecs; } + idx++; } } -- cgit v1.2.2 From 64edc8ed5ffae999d8d413ba006850e9e34166cb Mon Sep 17 00:00:00 2001 From: matthieu castet Date: Tue, 16 Nov 2010 22:30:27 +0100 Subject: x86: Fix improper large page preservation This patch fixes a bug in try_preserve_large_page() which may result in improper large page preservation and improper application of page attributes to the memory area outside of the original change request. More specifically, the problem manifests itself when set_memory_*() is called for several pages at the beginning of the large page and try_preserve_large_page() erroneously concludes that the change can be applied to whole large page. The fix consists of 3 parts: 1. Addition of "required" protection attributes in static_protections(), so .data and .bss can be guaranteed to stay "RW" 2. static_protections() is now called for every small page within large page to determine compatibility of new protection attributes (instead of just small pages within the requested range). 3. Large page can be preserved only if attribute change is large-page-aligned and covers whole large page. -v1: Try_preserve_large_page() patch for Linux 2.6.34-rc2 -v2: Replaced pfn check with address check for kernel rw-data Signed-off-by: Siarhei Liakh Signed-off-by: Xuxian Jiang Reviewed-by: Suresh Siddha Cc: Arjan van de Ven Cc: James Morris Cc: Andi Kleen Cc: Rusty Russell Cc: Stephen Rothwell Cc: Dave Jones Cc: Kees Cook Cc: Linus Torvalds LKML-Reference: <4CE2F7F3.8030809@free.fr> Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 532e7933d606..6f2a6b6deb6b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -255,6 +255,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, unsigned long pfn) { pgprot_t forbidden = __pgprot(0); + pgprot_t required = __pgprot(0); /* * The BIOS area between 640k and 1Mb needs to be executable for @@ -278,6 +279,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; + /* + * .data and .bss should always be writable. + */ + if (within(address, (unsigned long)_sdata, (unsigned long)_edata) || + within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) + pgprot_val(required) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) /* @@ -317,6 +324,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + prot = __pgprot(pgprot_val(prot) | pgprot_val(required)); return prot; } @@ -393,7 +401,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, { unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; pte_t new_pte, old_pte, *tmp; - pgprot_t old_prot, new_prot; + pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; unsigned int level; @@ -438,10 +446,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * We are safe now. Check whether the new pgprot is the same: */ old_pte = *kpte; - old_prot = new_prot = pte_pgprot(old_pte); + old_prot = new_prot = req_prot = pte_pgprot(old_pte); - pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); - pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* * old_pte points to the large page base address. So we need @@ -450,17 +458,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; - new_prot = static_protections(new_prot, address, pfn); + new_prot = static_protections(req_prot, address, pfn); /* * We need to check the full range, whether * static_protection() requires a different pgprot for one of * the pages in the range we try to preserve: */ - addr = address + PAGE_SIZE; - pfn++; - for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { - pgprot_t chk_prot = static_protections(new_prot, addr, pfn); + addr = address & pmask; + pfn = pte_pfn(old_pte); + for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { + pgprot_t chk_prot = static_protections(req_prot, addr, pfn); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) goto out_unlock; @@ -483,7 +491,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * that we limited the number of possible pages already to * the number of pages in the large page. */ - if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { /* * The address is aligned and the number of pages * covers the full page. -- cgit v1.2.2 From 5bd5a452662bc37c54fb6828db1a3faf87e6511c Mon Sep 17 00:00:00 2001 From: Matthieu Castet Date: Tue, 16 Nov 2010 22:31:26 +0100 Subject: x86: Add NX protection for kernel data This patch expands functionality of CONFIG_DEBUG_RODATA to set main (static) kernel data area as NX. The following steps are taken to achieve this: 1. Linker script is adjusted so .text always starts and ends on a page bound 2. Linker script is adjusted so .rodata always start and end on a page boundary 3. NX is set for all pages from _etext through _end in mark_rodata_ro. 4. free_init_pages() sets released memory NX in arch/x86/mm/init.c 5. bios rom is set to x when pcibios is used. The results of patch application may be observed in the diff of kernel page table dumps: pcibios: -- data_nx_pt_before.txt 2009-10-13 07:48:59.000000000 -0400 ++ data_nx_pt_after.txt 2009-10-13 07:26:46.000000000 -0400 0x00000000-0xc0000000 3G pmd ---[ Kernel Mapping ]--- -0xc0000000-0xc0100000 1M RW GLB x pte +0xc0000000-0xc00a0000 640K RW GLB NX pte +0xc00a0000-0xc0100000 384K RW GLB x pte -0xc0100000-0xc03d7000 2908K ro GLB x pte +0xc0100000-0xc0318000 2144K ro GLB x pte +0xc0318000-0xc03d7000 764K ro GLB NX pte -0xc03d7000-0xc0600000 2212K RW GLB x pte +0xc03d7000-0xc0600000 2212K RW GLB NX pte 0xc0600000-0xf7a00000 884M RW PSE GLB NX pmd 0xf7a00000-0xf7bfe000 2040K RW GLB NX pte 0xf7bfe000-0xf7c00000 8K pte No pcibios: -- data_nx_pt_before.txt 2009-10-13 07:48:59.000000000 -0400 ++ data_nx_pt_after.txt 2009-10-13 07:26:46.000000000 -0400 0x00000000-0xc0000000 3G pmd ---[ Kernel Mapping ]--- -0xc0000000-0xc0100000 1M RW GLB x pte +0xc0000000-0xc0100000 1M RW GLB NX pte -0xc0100000-0xc03d7000 2908K ro GLB x pte +0xc0100000-0xc0318000 2144K ro GLB x pte +0xc0318000-0xc03d7000 764K ro GLB NX pte -0xc03d7000-0xc0600000 2212K RW GLB x pte +0xc03d7000-0xc0600000 2212K RW GLB NX pte 0xc0600000-0xf7a00000 884M RW PSE GLB NX pmd 0xf7a00000-0xf7bfe000 2040K RW GLB NX pte 0xf7bfe000-0xf7c00000 8K pte The patch has been originally developed for Linux 2.6.34-rc2 x86 by Siarhei Liakh and Xuxian Jiang . -v1: initial patch for 2.6.30 -v2: patch for 2.6.31-rc7 -v3: moved all code into arch/x86, adjusted credits -v4: fixed ifdef, removed credits from CREDITS -v5: fixed an address calculation bug in mark_nxdata_nx() -v6: added acked-by and PT dump diff to commit log -v7: minor adjustments for -tip -v8: rework with the merge of "Set first MB as RW+NX" Signed-off-by: Siarhei Liakh Signed-off-by: Xuxian Jiang Signed-off-by: Matthieu CASTET Cc: Arjan van de Ven Cc: James Morris Cc: Andi Kleen Cc: Rusty Russell Cc: Stephen Rothwell Cc: Dave Jones Cc: Kees Cook Cc: Linus Torvalds LKML-Reference: <4CE2F82E.60601@free.fr> [ minor cleanliness edits ] Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 3 ++- arch/x86/mm/init_32.c | 20 +++++++++++++++++++- arch/x86/mm/init_64.c | 3 ++- arch/x86/mm/pageattr.c | 5 ++++- 4 files changed, 27 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c0e28a13de7d..947f42abe820 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that - * writeable first. + * writeable and non-executable first. */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e969f9f401b..f89b5bb4e93f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) static inline int is_kernel_text(unsigned long addr) { - if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) + if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) return 1; return 0; } @@ -912,6 +912,23 @@ void set_kernel_text_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); } +static void mark_nxdata_nx(void) +{ + /* + * When this called, init has already been executed and released, + * so everything past _etext sould be NX. + */ + unsigned long start = PFN_ALIGN(_etext); + /* + * This comes from is_kernel_text upper limit. Also HPAGE where used: + */ + unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; + + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); + set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); @@ -946,6 +963,7 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif + mark_nxdata_nx(); } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 71a59296af80..ce59c05cae12 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -788,6 +788,7 @@ void mark_rodata_ro(void) unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long kernel_end = (((unsigned long)&__init_end + HPAGE_SIZE) & HPAGE_MASK); unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); unsigned long data_start = (unsigned long) &_sdata; @@ -802,7 +803,7 @@ void mark_rodata_ro(void) * The rodata section (but not the kernel text!) should also be * not-executable. */ - set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (kernel_end - rodata_start) >> PAGE_SHIFT); rodata_test(); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6f2a6b6deb6b..8b830ca14ac4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -261,8 +262,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, * The BIOS area between 640k and 1Mb needs to be executable for * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. */ - if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) +#ifdef CONFIG_PCI_BIOS + if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_NX; +#endif /* * The kernel text needs to be executable for obvious reasons -- cgit v1.2.2 From 9c0729dc8062bed96189bd14ac6d4920f3958743 Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Pedersen Date: Fri, 5 Nov 2010 05:59:39 -0400 Subject: x86: Eliminate bp argument from the stack tracing routines The various stack tracing routines take a 'bp' argument in which the caller is supposed to provide the base pointer to use, or 0 if doesn't have one. Since bp is garbage whenever CONFIG_FRAME_POINTER is not defined, this means all callers in principle should either always pass 0, or be conditional on CONFIG_FRAME_POINTER. However, there are only really three use cases for stack tracing: (a) Trace the current task, including IRQ stack if any (b) Trace the current task, but skip IRQ stack (c) Trace some other task In all cases, if CONFIG_FRAME_POINTER is not defined, bp should just be 0. If it _is_ defined, then - in case (a) bp should be gotten directly from the CPU's register, so the caller should pass NULL for regs, - in case (b) the caller should should pass the IRQ registers to dump_trace(), - in case (c) bp should be gotten from the top of the task's stack, so the caller should pass NULL for regs. Hence, the bp argument is not necessary because the combination of task and regs is sufficient to determine an appropriate value for bp. This patch introduces a new inline function stack_frame(task, regs) that computes the desired bp. This function is then called from the two versions of dump_stack(). Signed-off-by: Soren Sandmann Acked-by: Steven Rostedt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Arjan van de Ven , Cc: Frederic Weisbecker , Cc: Arnaldo Carvalho de Melo , LKML-Reference: > Signed-off-by: Frederic Weisbecker --- arch/x86/mm/kmemcheck/error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index af3b6c8a436f..704a37cedddb 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state, e->trace.entries = e->trace_entries; e->trace.max_entries = ARRAY_SIZE(e->trace_entries); e->trace.skip = 0; - save_stack_trace_bp(&e->trace, regs->bp); + save_stack_trace_regs(&e->trace, regs); /* Round address down to nearest 16 bytes */ shadow_copy = kmemcheck_shadow_lookup(address -- cgit v1.2.2 From 691513f70d3957939a318da970987b876c720861 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Mon, 22 Nov 2010 14:03:28 +0100 Subject: x86: Resume trampoline must be executable commit 5bd5a452(x86: Add NX protection for kernel data) marked the trampoline area NX - which unsurprisingly breaks resume and cpu hotplug. Revert the portion of that commit, which touches the trampoline. Originally-from: Lin Ming LKML-Reference: <1290410581.2405.24.camel@minggr.sh.intel.com> Cc: Matthieu Castet Cc: Siarhei Liakh Cc: Xuxian Jiang Cc: Ingo Molnar Cc: Arjan van de Ven Cc: Andi Kleen Tested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ce59c05cae12..71a59296af80 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -788,7 +788,6 @@ void mark_rodata_ro(void) unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; unsigned long end = (unsigned long) &__end_rodata_hpage_align; - unsigned long kernel_end = (((unsigned long)&__init_end + HPAGE_SIZE) & HPAGE_MASK); unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); unsigned long data_start = (unsigned long) &_sdata; @@ -803,7 +802,7 @@ void mark_rodata_ro(void) * The rodata section (but not the kernel text!) should also be * not-executable. */ - set_memory_nx(rodata_start, (kernel_end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); rodata_test(); -- cgit v1.2.2 From f1157141636848f52c5f74040bed0ba355cf59b7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 7 Dec 2010 00:55:29 -0800 Subject: x86, apic: Remove early_init_lapic_mapping() It is almost the same as smp_register_lapic_addr(). We just need to let smp_read_mpc() call smp_register_lapic_addr() when early==1. Add the apic_printk to smp_register_lapic_address() Signed-off-by: Yinghai Lu Cc: Suresh Siddha Cc: "Eric W. Biederman" LKML-Reference: <4CFDF681.3030509@kernel.org> Signed-off-by: Thomas Gleixner --- arch/x86/mm/amdtopology_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 51fae9cfdecb..08a0069b87a5 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -66,7 +66,6 @@ static __init void early_get_boot_cpu_id(void) if (smp_found_config) early_get_smp_config(); #endif - early_init_lapic_mapping(); } int __init amd_get_nodes(struct bootnode *physnodes) -- cgit v1.2.2 From c10d1e260f7cb6766dc76b4e36ed8f4be53f195a Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 17 Nov 2010 06:09:52 +0000 Subject: x86, olpc: Add OLPC device-tree support Make use of PROC_DEVICETREE to export the tree, and sparc's PROMTREE code to call into OLPC's Open Firmware to build the tree. v5: fix buglet with root node check (introduced in v4) v4: address some minor style issues pointed out by Grant, and explicitly cast negative phandle checks to s32. v3: rename olpc_prom to olpc_dt - rework Kconfig entries - drop devtree build hook from proc, instead adding a call to x86's paging_init (similarly to how sparc64 does it) - switch allocation from using slab to alloc_bootmem. this allows the DT to be built earlier during boot (during setup_arch); the downside is that there are some 1200 bootmem reservations that are done during boot. Not ideal.. - add a helper olpc_ofw_is_installed function to test for the existence and successful detection of OLPC's OFW. Signed-off-by: Andres Salomon LKML-Reference: <20101116220952.26526a80@queued.net> Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e969f9f401b..8c852e4af452 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -715,6 +716,7 @@ void __init paging_init(void) /* * NOTE: at this point the bootmem allocator is fully available. */ + olpc_dt_build_devicetree(); sparse_init(); zone_sizes_init(); } -- cgit v1.2.2 From d3bd058826aa8b79590cca6c8e6d1557bf576ada Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 16 Dec 2010 19:09:58 -0800 Subject: x86, acpi: Parse all SRAT cpu entries even above the cpu number limitation Recent Intel new system have different order in MADT, aka will list all thread0 at first, then all thread1. But SRAT table still old order, it will list cpus in one socket all together. If the user have compiled limited NR_CPUS or boot with nr_cpus=, could have missed to put some cpus apic id to node mapping into apicid_to_node[]. for example for 4 sockets system with 64 cpus with nr_cpus=32 will get crash... [ 9.106288] Total of 32 processors activated (136190.88 BogoMIPS). [ 9.235021] divide error: 0000 [#1] SMP [ 9.235315] last sysfs file: [ 9.235481] CPU 1 [ 9.235592] Modules linked in: [ 9.245398] [ 9.245478] Pid: 2, comm: kthreadd Not tainted 2.6.37-rc1-tip-yh-01782-ge92ef79-dirty #274 /Sun Fire x4800 [ 9.265415] RIP: 0010:[] [] select_task_rq_fair+0x4f0/0x623 ... [ 9.645938] RIP [] select_task_rq_fair+0x4f0/0x623 [ 9.665356] RSP [ 9.665568] ---[ end trace 2296156d35fdfc87 ]--- So let just parse all cpu entries in SRAT. Also add apicid checking with MAX_LOCAL_APIC, in case We could out of boundaries of apicid_to_node[]. it fixes following bug too. https://bugzilla.kernel.org/show_bug.cgi?id=22662 -v2: expand to 32bit according to hpa need to add MAX_LOCAL_APIC for 32bit Reported-and-Tested-by: Wu Fengguang Reported-by: Bjorn Helgaas Tested-by: Myron Stowe Signed-off-by: Yinghai Lu LKML-Reference: <4D0AD486.9020704@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/mm/srat_32.c | 1 + arch/x86/mm/srat_64.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index a17dffd136c1..f16434568a51 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) /* mark this node as "seen" in node bitmap */ BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); + /* don't need to check apic_id here, because it is always 8 bits */ apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a35cb9d8b060..171a0aacb99a 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } apic_id = pa->apic_id; + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; @@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; -- cgit v1.2.2 From 4e76f4e67a106ed827ca721b4c8b622047cd2f6d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Dec 2010 17:23:47 -0800 Subject: x86, numa: Avoid compiling NUMA emulation functions without CONFIG_NUMA_EMU Both acpi_get_nodes() and amd_get_nodes() are only necessary when CONFIG_NUMA_EMU is enabled, so avoid compiling them when the option is disabled. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/mm/amdtopology_64.c | 2 ++ arch/x86/mm/srat_64.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index 51fae9cfdecb..fe050af614e2 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -69,6 +69,7 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } +#ifdef CONFIG_NUMA_EMU int __init amd_get_nodes(struct bootnode *physnodes) { int i; @@ -81,6 +82,7 @@ int __init amd_get_nodes(struct bootnode *physnodes) } return ret; } +#endif /* CONFIG_NUMA_EMU */ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) { diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a35cb9d8b060..8241bf0f6eb2 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -339,6 +339,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) void __init acpi_numa_arch_fixup(void) {} +#ifdef CONFIG_NUMA_EMU int __init acpi_get_nodes(struct bootnode *physnodes) { int i; @@ -351,6 +352,7 @@ int __init acpi_get_nodes(struct bootnode *physnodes) } return ret; } +#endif /* CONFIG_NUMA_EMU */ /* Use the information discovered above to actually set up the nodes. */ int __init acpi_scan_nodes(unsigned long start, unsigned long end) -- cgit v1.2.2 From f51bf3073a145a5b3263fd882c52d6ec04b687da Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Dec 2010 17:23:51 -0800 Subject: x86, numa: Fake apicid and pxm mappings for NUMA emulation This patch adds the equivalent of acpi_fake_nodes() for AMD Northbridge platforms. The goal is to fake the apicid-to-node mappings for NUMA emulation so the physical topology of the machine is correctly maintained within the kernel. This change also fakes proximity domains for both ACPI and k8 code so the physical distance between emulated nodes is maintained via node_distance(). This exports the correct distances via /sys/devices/system/node/.../distance based on the underlying topology. A new helper function, fake_physnodes(), is introduced to correctly invoke the correct NUMA code to fake these two mappings based on the system type. If there is no underlying NUMA configuration, all cpus are mapped to node 0 for local distance. Since acpi_fake_nodes() is no longer called with CONFIG_ACPI_NUMA, it's prototype can be removed from the header file for such a configuration. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/mm/amdtopology_64.c | 91 ++++++++++++++++++++++++++++++++++++-------- arch/x86/mm/numa_64.c | 20 +++++++++- arch/x86/mm/srat_64.c | 2 - 3 files changed, 94 insertions(+), 19 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index fe050af614e2..eb5cbb97b68d 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -27,6 +27,7 @@ #include static struct bootnode __initdata nodes[8]; +static unsigned char __initdata nodeids[8]; static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; static __init int find_northbridge(void) @@ -69,21 +70,6 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } -#ifdef CONFIG_NUMA_EMU -int __init amd_get_nodes(struct bootnode *physnodes) -{ - int i; - int ret = 0; - - for_each_node_mask(i, nodes_parsed) { - physnodes[ret].start = nodes[i].start; - physnodes[ret].end = nodes[i].end; - ret++; - } - return ret; -} -#endif /* CONFIG_NUMA_EMU */ - int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) { unsigned long start = PFN_PHYS(start_pfn); @@ -116,7 +102,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); - nodeid = limit & 7; + nodeids[i] = nodeid = limit & 7; if ((base & 3) == 0) { if (i < numnodes) pr_info("Skipping disabled node %d\n", i); @@ -196,6 +182,79 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) return 0; } +#ifdef CONFIG_NUMA_EMU +static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; + +int __init amd_get_nodes(struct bootnode *physnodes) +{ + int i; + int ret = 0; + + for_each_node_mask(i, nodes_parsed) { + physnodes[ret].start = nodes[i].start; + physnodes[ret].end = nodes[i].end; + ret++; + } + return ret; +} + +static int __init find_node_by_addr(unsigned long addr) +{ + int ret = NUMA_NO_NODE; + int i; + + for (i = 0; i < 8; i++) + if (addr >= nodes[i].start && addr < nodes[i].end) { + ret = i; + break; + } + return ret; +} + +/* + * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be + * setup to represent the physical topology but reflect the emulated + * environment. For each emulated node, the real node which it appears on is + * found and a fake pxm to nid mapping is created which mirrors the actual + * locality. node_distance() then represents the correct distances between + * emulated nodes by using the fake acpi mappings to pxms. + */ +void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) +{ + unsigned int bits; + unsigned int cores; + unsigned int apicid_base = 0; + int i; + + bits = boot_cpu_data.x86_coreid_bits; + cores = 1 << bits; + early_get_boot_cpu_id(); + if (boot_cpu_physical_apicid > 0) + apicid_base = boot_cpu_physical_apicid; + + for (i = 0; i < nr_nodes; i++) { + int index; + int nid; + int j; + + nid = find_node_by_addr(nodes[i].start); + if (nid == NUMA_NO_NODE) + continue; + + index = nodeids[nid] << bits; + if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE) + for (j = apicid_base; j < cores + apicid_base; j++) + fake_apicid_to_node[index + j] = i; +#ifdef CONFIG_ACPI_NUMA + __acpi_map_pxm_to_node(nid, i); +#endif + } + memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); +} +#endif /* CONFIG_NUMA_EMU */ + int __init amd_scan_nodes(void) { unsigned int bits; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7762a517d69d..cc390f3a1bde 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -324,6 +324,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, return ret; } +static void __init fake_physnodes(int acpi, int amd, int nr_nodes) +{ + int i; + + BUG_ON(acpi && amd); +#ifdef CONFIG_ACPI_NUMA + if (acpi) + acpi_fake_nodes(nodes, nr_nodes); +#endif +#ifdef CONFIG_AMD_NUMA + if (amd) + amd_fake_nodes(nodes, nr_nodes); +#endif + if (!acpi && !amd) + for (i = 0; i < nr_cpu_ids; i++) + numa_set_node(i, 0); +} + /* * Setups up nid to range from addr to addr + size. If the end * boundary is greater than max_addr, then max_addr is used instead. @@ -595,7 +613,7 @@ static int __init numa_emulation(unsigned long start_pfn, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } - acpi_fake_nodes(nodes, num_nodes); + fake_physnodes(acpi, amd, num_nodes); numa_init_array(); return 0; } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 8241bf0f6eb2..c48b443706c5 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -497,8 +497,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) { int i, j; - printk(KERN_INFO "Faking PXM affinity for fake nodes on real " - "topology.\n"); for (i = 0; i < num_nodes; i++) { int nid, pxm; -- cgit v1.2.2 From c1c3443c9c5e9be92641029ed229a41563e44506 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Dec 2010 17:23:54 -0800 Subject: x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 99 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 79 insertions(+), 20 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index cc390f3a1bde..dd300c491f1f 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -260,7 +260,7 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode physnodes[MAX_NUMNODES] __initdata; +static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, @@ -270,6 +270,7 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, int ret = 0; int i; + memset(physnodes, 0, sizeof(physnodes)); #ifdef CONFIG_ACPI_NUMA if (acpi) nr_nodes = acpi_get_nodes(physnodes); @@ -370,8 +371,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr * to max_addr. The return value is the number of nodes allocated. */ -static int __init split_nodes_interleave(u64 addr, u64 max_addr, - int nr_phys_nodes, int nr_nodes) +static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) { nodemask_t physnode_mask = NODE_MASK_NONE; u64 size; @@ -402,7 +402,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, return -1; } - for (i = 0; i < nr_phys_nodes; i++) + for (i = 0; i < MAX_NUMNODES; i++) if (physnodes[i].start != physnodes[i].end) node_set(i, physnode_mask); @@ -571,11 +571,9 @@ static int __init numa_emulation(unsigned long start_pfn, { u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; - int num_phys_nodes; int num_nodes; int i; - num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); /* * If the numa=fake command-line contains a 'M' or 'G', it represents * the fixed node size. Otherwise, if it is just a single number N, @@ -590,7 +588,7 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long n; n = simple_strtoul(cmdline, NULL, 0); - num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); + num_nodes = split_nodes_interleave(addr, max_addr, n); } if (num_nodes < 0) @@ -613,6 +611,7 @@ static int __init numa_emulation(unsigned long start_pfn, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } + setup_physnodes(addr, max_addr, acpi, amd); fake_physnodes(acpi, amd, num_nodes); numa_init_array(); return 0; @@ -628,8 +627,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU + setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, + acpi, amd); if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) return; + setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, + acpi, amd); nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif @@ -785,6 +788,7 @@ void __cpuinit numa_clear_node(int cpu) #ifndef CONFIG_DEBUG_PER_CPU_MAPS +#ifndef CONFIG_NUMA_EMU void __cpuinit numa_add_cpu(int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); @@ -794,6 +798,51 @@ void __cpuinit numa_remove_cpu(int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } +#else +void __cpuinit numa_add_cpu(int cpu) +{ + unsigned long addr; + u16 apicid; + int physnid; + int nid = NUMA_NO_NODE; + + apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + if (apicid != BAD_APICID) + nid = apicid_to_node[apicid]; + if (nid == NUMA_NO_NODE) + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + /* + * Use the starting address of the emulated node to find which physical + * node it is allocated on. + */ + addr = node_start_pfn(nid) << PAGE_SHIFT; + for (physnid = 0; physnid < MAX_NUMNODES; physnid++) + if (addr >= physnodes[physnid].start && + addr < physnodes[physnid].end) + break; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. + */ + for_each_online_node(nid) { + addr = node_start_pfn(nid) << PAGE_SHIFT; + if (addr >= physnodes[physnid].start && + addr < physnodes[physnid].end) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); + } +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#endif /* !CONFIG_NUMA_EMU */ #else /* CONFIG_DEBUG_PER_CPU_MAPS */ @@ -805,22 +854,32 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) int node = early_cpu_to_node(cpu); struct cpumask *mask; char buf[64]; + int i; - mask = node_to_cpumask_map[node]; - if (mask == NULL) { - printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); - dump_stack(); - return; - } + for_each_online_node(i) { + unsigned long addr; - if (enable) - cpumask_set_cpu(cpu, mask); - else - cpumask_clear_cpu(cpu, mask); + addr = node_start_pfn(i) << PAGE_SHIFT; + if (addr < physnodes[node].start || + addr >= physnodes[node].end) + continue; + mask = node_to_cpumask_map[node]; + if (mask == NULL) { + pr_err("node_to_cpumask_map[%i] NULL\n", i); + dump_stack(); + return; + } + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, buf); + } } void __cpuinit numa_add_cpu(int cpu) -- cgit v1.2.2 From a387e95a49743cf9835c5299ca549232618d8249 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 22 Dec 2010 17:23:56 -0800 Subject: x86, numa: Fix cpu to node mapping for sparse node ids NUMA boot code assumes that physical node ids start at 0, but the DIMMs that the apic id represents may not be reachable. If this is the case, node 0 is never online and cpus never end up getting appropriately assigned to a node. This causes the cpumask of all online nodes to be empty and machines crash with kernel code assuming online nodes have valid cpus. The fix is to appropriately map all the address ranges for physical nodes and ensure the cpu to node mapping function checks all possible nodes (up to MAX_NUMNODES) instead of simply checking nodes 0-N, where N is the number of physical nodes, for valid address ranges. This requires no longer "compressing" the address ranges of nodes in the physical node map from 0-N, but rather leave indices in physnodes[] to represent the actual node id of the physical node. Accordingly, the topology exported by both amd_get_nodes() and acpi_get_nodes() no longer must return the number of nodes to iterate through; all such iterations will now be to MAX_NUMNODES. This change also passes the end address of system RAM (which may be different from normal operation if mem= is specified on the command line) before the physnodes[] array is populated. ACPI parsed nodes are truncated to fit within the address range that respect the mem= boundaries and even some physical nodes may become unreachable in such cases. When NUMA emulation does succeed, any apicid to node mapping that exists for unreachable nodes are given default values so that proximity domains can still be assigned. This is important for node_distance() to function as desired. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/mm/amdtopology_64.c | 9 +++------ arch/x86/mm/numa_64.c | 18 +++--------------- arch/x86/mm/srat_64.c | 22 ++++++++++++++++------ 3 files changed, 22 insertions(+), 27 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index eb5cbb97b68d..0df2623d1039 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -187,17 +187,14 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -int __init amd_get_nodes(struct bootnode *physnodes) +void __init amd_get_nodes(struct bootnode *physnodes) { int i; - int ret = 0; for_each_node_mask(i, nodes_parsed) { - physnodes[ret].start = nodes[i].start; - physnodes[ret].end = nodes[i].end; - ret++; + physnodes[i].start = nodes[i].start; + physnodes[i].end = nodes[i].end; } - return ret; } static int __init find_node_by_addr(unsigned long addr) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index dd300c491f1f..3d73201ba347 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -266,25 +266,24 @@ static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, int acpi, int amd) { - int nr_nodes = 0; int ret = 0; int i; memset(physnodes, 0, sizeof(physnodes)); #ifdef CONFIG_ACPI_NUMA if (acpi) - nr_nodes = acpi_get_nodes(physnodes); + acpi_get_nodes(physnodes, start, end); #endif #ifdef CONFIG_AMD_NUMA if (amd) - nr_nodes = amd_get_nodes(physnodes); + amd_get_nodes(physnodes); #endif /* * Basic sanity checking on the physical node map: there may be errors * if the SRAT or AMD code incorrectly reported the topology or the mem= * kernel parameter is used. */ - for (i = 0; i < nr_nodes; i++) { + for (i = 0; i < MAX_NUMNODES; i++) { if (physnodes[i].start == physnodes[i].end) continue; if (physnodes[i].start > end) { @@ -299,17 +298,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, physnodes[i].start = start; if (physnodes[i].end > end) physnodes[i].end = end; - } - - /* - * Remove all nodes that have no memory or were truncated because of the - * limited address range. - */ - for (i = 0; i < nr_nodes; i++) { - if (physnodes[i].start == physnodes[i].end) - continue; - physnodes[ret].start = physnodes[i].start; - physnodes[ret].end = physnodes[i].end; ret++; } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index c48b443706c5..a756bcf3fa48 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -340,17 +340,16 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) void __init acpi_numa_arch_fixup(void) {} #ifdef CONFIG_NUMA_EMU -int __init acpi_get_nodes(struct bootnode *physnodes) +void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, + unsigned long end) { int i; - int ret = 0; for_each_node_mask(i, nodes_parsed) { - physnodes[ret].start = nodes[i].start; - physnodes[ret].end = nodes[i].end; - ret++; + cutoff_node(i, start, end); + physnodes[i].start = nodes[i].start; + physnodes[i].end = nodes[i].end; } - return ret; } #endif /* CONFIG_NUMA_EMU */ @@ -516,6 +515,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) fake_apicid_to_node[j] == NUMA_NO_NODE) fake_apicid_to_node[j] = i; } + + /* + * If there are apicid-to-node mappings for physical nodes that do not + * have a corresponding emulated node, it should default to a guaranteed + * value. + */ + for (i = 0; i < MAX_LOCAL_APIC; i++) + if (apicid_to_node[i] != NUMA_NO_NODE && + fake_apicid_to_node[i] == NUMA_NO_NODE) + fake_apicid_to_node[i] = 0; + for (i = 0; i < num_nodes; i++) __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); -- cgit v1.2.2 From d906f0eb2f0e6d1a24c479f69a9c39e7e45c5ae8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 30 Dec 2010 10:54:16 -0800 Subject: x86, numa: Fix CONFIG_DEBUG_PER_CPU_MAPS without NUMA emulation "x86, numa: Fake node-to-cpumask for NUMA emulation" broke the build when CONFIG_DEBUG_PER_CPU_MAPS is set and CONFIG_NUMA_EMU is not. This is because it is possible to map a cpu to multiple nodes when NUMA emulation is used; the patch required a physical node address table to find those nodes that was only available when CONFIG_NUMA_EMU was enabled. This extracts the common debug functionality to its own function for CONFIG_DEBUG_PER_CPU_MAPS and uses it regardless of whether CONFIG_NUMA_EMU is set or not. NUMA emulation will now iterate over the set of possible nodes for each cpu and call the new debug function whereas only the cpu's node will be used without NUMA emulation enabled. Reported-by: Ingo Molnar Signed-off-by: David Rientjes Acked-by: Yinghai Lu LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3d73201ba347..1e72102e80c9 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -833,15 +833,48 @@ void __cpuinit numa_remove_cpu(int cpu) #endif /* !CONFIG_NUMA_EMU */ #else /* CONFIG_DEBUG_PER_CPU_MAPS */ +static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) +{ + int node = early_cpu_to_node(cpu); + struct cpumask *mask; + char buf[64]; + + mask = node_to_cpumask_map[node]; + if (!mask) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return NULL; + } + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, buf); + return mask; +} /* * --------- debug versions of the numa functions --------- */ +#ifndef CONFIG_NUMA_EMU +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + struct cpumask *mask; + + mask = debug_cpumask_set_cpu(cpu, enable); + if (!mask) + return; + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); +} +#else static void __cpuinit numa_set_cpumask(int cpu, int enable) { int node = early_cpu_to_node(cpu); struct cpumask *mask; - char buf[64]; int i; for_each_online_node(i) { @@ -851,24 +884,17 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) if (addr < physnodes[node].start || addr >= physnodes[node].end) continue; - mask = node_to_cpumask_map[node]; - if (mask == NULL) { - pr_err("node_to_cpumask_map[%i] NULL\n", i); - dump_stack(); + mask = debug_cpumask_set_cpu(cpu, enable); + if (!mask) return; - } if (enable) cpumask_set_cpu(cpu, mask); else cpumask_clear_cpu(cpu, mask); - - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? "numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); } } +#endif /* CONFIG_NUMA_EMU */ void __cpuinit numa_add_cpu(int cpu) { -- cgit v1.2.2 From 9180706344487700b40da9eca5dedd3d11cb33b4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: alter compound get_page/put_page Alter compound get_page/put_page to keep references on subpages too, in order to allow __split_huge_page_refcount to split an hugepage even while subpages have been pinned by one of the get_user_pages() variants. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e6593799d..06f56fcf9a77 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -105,6 +105,16 @@ static inline void get_head_page_multiple(struct page *page, int nr) atomic_add(nr, &page->_count); } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -128,6 +138,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; -- cgit v1.2.2 From db3eb96f4e6281b84dd33c8980dacc27f2efe177 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:41 -0800 Subject: thp: add pmd mangling functions to x86 Add needed pmd mangling functions with symmetry with their pte counterparts. pmdp_splitting_flush() is the only new addition on the pmd_ methods and it's needed to serialize the VM against split_huge_page. It simply atomically sets the splitting bit in a similar way pmdp_clear_flush_young atomically clears the accessed bit. pmdp_splitting_flush() also has to flush the tlb to make it effective against gup_fast, but it wouldn't really require to flush the tlb too. Just the tlb flush is the simplest operation we can invoke to serialize pmdp_splitting_flush() against gup_fast. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8be8c7d7bc89..65e92d58f942 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma, return changed; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (changed && dirty) { + *pmdp = entry; + pmd_update_defer(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + + return changed; +} +#endif + int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int ret = 0; + + if (pmd_young(*pmdp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &pmdp->pmd); + + if (ret) + pmd_update(vma->vm_mm, addr, pmdp); + + return ret; +} +#endif + int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { @@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, return young; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + return young; +} + +void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int set; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set = !test_and_set_bit(_PAGE_BIT_SPLITTING, + (unsigned long *)&pmdp->pmd); + if (set) { + pmd_update(vma->vm_mm, address, pmdp); + /* need tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } +} +#endif + /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve -- cgit v1.2.2 From 64cc6ae001d70bc59e5f854e6b5678f59110df16 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:42 -0800 Subject: thp: bail out gup_fast on splitting pmd Force gup_fast to take the slow path and block if the pmd is splitting, not only if it's none. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 06f56fcf9a77..269aa53932e0 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -160,7 +160,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + /* + * The pmd_trans_splitting() check below explains why + * pmdp_splitting_flush has to flush the tlb, to stop + * this gup-fast code from running while we set the + * splitting bit in the pmd. Returning zero will take + * the slow path that will call wait_split_huge_page() + * if the pmd is still in splitting state. gup-fast + * can't because it has irq disabled and + * wait_split_huge_page() would never return as the + * tlb flush IPI wouldn't run. + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) -- cgit v1.2.2 From f2d6bfe9ff0acec30b713614260e78b03d20e909 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:01 -0800 Subject: thp: add x86 32bit support Add support for transparent hugepages to x86 32bit. Share the same VM_ bitflag for VM_MAPPED_COPY. mm/nommu.c will never support transparent hugepages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 65e92d58f942..500242d3c96d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -362,7 +362,7 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, - (unsigned long *) &pmdp->pmd); + (unsigned long *)pmdp); if (ret) pmd_update(vma->vm_mm, addr, pmdp); @@ -404,7 +404,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, int set; VM_BUG_ON(address & ~HPAGE_PMD_MASK); set = !test_and_set_bit(_PAGE_BIT_SPLITTING, - (unsigned long *)&pmdp->pmd); + (unsigned long *)pmdp); if (set) { pmd_update(vma->vm_mm, address, pmdp); /* need tlb flush only to serialize against gup-fast */ -- cgit v1.2.2 From 8ee53820edfd1f3b6554c593f337148dd3d7fc91 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:10 -0800 Subject: thp: mmu_notifier_test_young For GRU and EPT, we need gup-fast to set referenced bit too (this is why it's correct to return 0 when shadow_access_mask is zero, it requires gup-fast to set the referenced bit). qemu-kvm access already sets the young bit in the pte if it isn't zero-copy, if it's zero copy or a shadow paging EPT minor fault we relay on gup-fast to signal the page is in use... We also need to check the young bits on the secondary pagetables for NPT and not nested shadow mmu as the data may never get accessed again by the primary pte. Without this closer accuracy, we'd have to remove the heuristic that avoids collapsing hugepages in hugepage virtual regions that have not even a single subpage in use. ->test_young is full backwards compatible with GRU and other usages that don't have young bits in pagetables set by the hardware and that should nuke the secondary mmu mappings when ->clear_flush_young runs just like EPT does. Removing the heuristic that checks the young bit in khugepaged/collapse_huge_page completely isn't so bad either probably but I thought it was worth it and this makes it reliable. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 269aa53932e0..dbe34b931374 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -103,6 +105,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) VM_BUG_ON(page != compound_head(page)); VM_BUG_ON(page_count(page) == 0); atomic_add(nr, &page->_count); + SetPageReferenced(page); } static inline void get_huge_page_tail(struct page *page) -- cgit v1.2.2 From 9032160275ba018003ff390835ff8ed2b5b788b8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 19 Jan 2011 08:57:21 +0000 Subject: x86: Unify "numa=" command line option handling In order to be able to suppress the use of SRAT tables that 32-bit Linux can't deal with (in one case known to lead to a non-bootable system, unless disabling ACPI altogether), move the "numa=" option handling to common code. Signed-off-by: Jan Beulich Reviewed-by: Thomas Renninger Cc: Tejun Heo Cc: Thomas Renninger LKML-Reference: <4D36B581020000780002D0FF@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 22 ++++++++++++++++++++++ arch/x86/mm/numa_64.c | 24 +++++------------------- arch/x86/mm/srat_32.c | 1 - 3 files changed, 27 insertions(+), 20 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 787c52ca49c3..ebf6d7887a38 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -2,6 +2,28 @@ #include #include #include +#include +#include + +int __initdata numa_off; + +static __init int numa_setup(char *opt) +{ + if (!opt) + return -EINVAL; + if (!strncmp(opt, "off", 3)) + numa_off = 1; +#ifdef CONFIG_NUMA_EMU + if (!strncmp(opt, "fake=", 5)) + numa_emu_cmdline(opt + 5); +#endif +#ifdef CONFIG_ACPI_NUMA + if (!strncmp(opt, "noacpi", 6)) + acpi_numa = -1; +#endif + return 0; +} +early_param("numa", numa_setup); /* * Which logical CPUs are on which nodes diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 1e72102e80c9..95ea1551eebc 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -int numa_off __initdata; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; @@ -263,6 +262,11 @@ static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; static char *cmdline __initdata; +void __init numa_emu_cmdline(char *str) +{ + cmdline = str; +} + static int __init setup_physnodes(unsigned long start, unsigned long end, int acpi, int amd) { @@ -670,24 +674,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -static __init int numa_setup(char *opt) -{ - if (!opt) - return -EINVAL; - if (!strncmp(opt, "off", 3)) - numa_off = 1; -#ifdef CONFIG_NUMA_EMU - if (!strncmp(opt, "fake=", 5)) - cmdline = opt + 5; -#endif -#ifdef CONFIG_ACPI_NUMA - if (!strncmp(opt, "noacpi", 6)) - acpi_numa = -1; -#endif - return 0; -} -early_param("numa", numa_setup); - #ifdef CONFIG_NUMA static __init int find_near_online_node(int node) diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index f16434568a51..ae96e7b8051d 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; static int __initdata num_memory_chunks; /* total number of memory chunks */ static u8 __initdata apicid_to_pxm[MAX_APICID]; -int numa_off __initdata; int acpi_numa __initdata; static __init void bad_srat(void) -- cgit v1.2.2 From f12d3d04e8f6223276abb068c5d72852174b8c31 Mon Sep 17 00:00:00 2001 From: Matthieu CASTET Date: Thu, 20 Jan 2011 21:11:45 +0100 Subject: x86, nx: Don't force pages RW when setting NX bits Xen want page table pages read only. But the initial page table (from head_*.S) live in .data or .bss. That was broken by 64edc8ed5ffae999d8d413ba006850e9e34166cb. There is absolutely no reason to force these pages RW after they have already been marked RO. Signed-off-by: Matthieu CASTET Tested-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 8b830ca14ac4..d343b3c81f3c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -256,7 +256,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, unsigned long pfn) { pgprot_t forbidden = __pgprot(0); - pgprot_t required = __pgprot(0); /* * The BIOS area between 640k and 1Mb needs to be executable for @@ -282,12 +281,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; - /* - * .data and .bss should always be writable. - */ - if (within(address, (unsigned long)_sdata, (unsigned long)_edata) || - within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) - pgprot_val(required) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) /* @@ -327,7 +320,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); - prot = __pgprot(pgprot_val(prot) | pgprot_val(required)); return prot; } -- cgit v1.2.2