author    Linus Torvalds <torvalds@linux-foundation.org>  2009-12-08 16:27:33 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2009-12-08 16:27:33 -0500
commit    e33c01972239fee4696679ae5f7d1f340f424999 (patch)
tree      bd4bc3223ba572719b3a53142fdb98910450fe64 /arch
parent    343036cea2854acf8d4b4c930c0063223bc6b8a2 (diff)
parent    ccef086454d4c97e7b722e9303390207d681cb4c (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (36 commits)
  x86, mm: Correct the implementation of is_untracked_pat_range()
  x86/pat: Trivial: don't create debugfs for memtype if pat is disabled
  x86, mtrr: Fix sorting of mtrr after subtracting
  x86: Move find_smp_config() earlier and avoid bootmem usage
  x86, platform: Change is_untracked_pat_range() to bool; cleanup init
  x86: Change is_ISA_range() into an inline function
  x86, mm: is_untracked_pat_range() takes a normal semiclosed range
  x86, mm: Call is_untracked_pat_range() rather than is_ISA_range()
  x86: UV SGI: Don't track GRU space in PAT
  x86: SGI UV: Fix BAU initialization
  x86, numa: Use near(er) online node instead of roundrobin for NUMA
  x86, numa, bootmem: Only free bootmem on NUMA failure path
  x86: Change crash kernel to reserve via reserve_early()
  x86: Eliminate redundant/contradicting cache line size config options
  x86: When cleaning MTRRs, do not fold WP into UC
  x86: remove "extern" from function prototypes in <asm/proto.h>
  x86, mm: Report state of NX protections during boot
  x86, mm: Clean up and simplify NX enablement
  x86, pageattr: Make set_memory_(x|nx) aware of NX support
  x86, sleep: Always save the value of EFER
  ...

Fix up conflicts (added both iommu_shutdown and is_untracked_pat_range
to 'struct x86_platform_ops') in:
	arch/x86/include/asm/x86_init.h
	arch/x86/kernel/x86_init.c
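A common thread in the series above is the new x86_platform.is_untracked_pat_range() hook, which takes a normal semiclosed [start, end) range. As a minimal sketch (illustrative C, not code from this merge) of how a caller uses the hook:

	/* Sketch: the hook takes a semiclosed [start, end) range, so a
	 * caller checking a region passes paddr + size, with no "- 1".
	 * The default implementation is is_ISA_range(); SGI UV replaces
	 * it with a variant that also excludes GRU space (see the
	 * x2apic_uv_x.c hunks below). */
	if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
		return 1;	/* untracked: PAT type is always WB */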
Diffstat (limited to 'arch')
-rw-r--r-- arch/x86/Kconfig.cpu | 14
-rw-r--r-- arch/x86/boot/compressed/head_64.S | 3
-rw-r--r-- arch/x86/boot/compressed/vmlinux.lds.S | 3
-rw-r--r-- arch/x86/include/asm/acpi.h | 3
-rw-r--r-- arch/x86/include/asm/cache.h | 7
-rw-r--r-- arch/x86/include/asm/cacheflush.h | 1
-rw-r--r-- arch/x86/include/asm/e820.h | 23
-rw-r--r-- arch/x86/include/asm/k8.h | 5
-rw-r--r-- arch/x86/include/asm/mpspec.h | 11
-rw-r--r-- arch/x86/include/asm/page_types.h | 3
-rw-r--r-- arch/x86/include/asm/pgtable.h | 6
-rw-r--r-- arch/x86/include/asm/proto.h | 17
-rw-r--r-- arch/x86/include/asm/sections.h | 6
-rw-r--r-- arch/x86/include/asm/x86_init.h | 4
-rw-r--r-- arch/x86/kernel/acpi/sleep.c | 24
-rw-r--r-- arch/x86/kernel/apic/numaq_32.c | 5
-rw-r--r-- arch/x86/kernel/apic/x2apic_uv_x.c | 19
-rw-r--r-- arch/x86/kernel/cpu/common.c | 2
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 6
-rw-r--r-- arch/x86/kernel/cpu/mtrr/cleanup.c | 51
-rw-r--r-- arch/x86/kernel/ftrace.c | 17
-rw-r--r-- arch/x86/kernel/head_32.S | 18
-rw-r--r-- arch/x86/kernel/head_64.S | 3
-rw-r--r-- arch/x86/kernel/machine_kexec_32.c | 6
-rw-r--r-- arch/x86/kernel/mpparse.c | 44
-rw-r--r-- arch/x86/kernel/setup.c | 106
-rw-r--r-- arch/x86/kernel/tlb_uv.c | 4
-rw-r--r-- arch/x86/kernel/visws_quirks.c | 2
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 38
-rw-r--r-- arch/x86/kernel/x86_init.c | 2
-rw-r--r-- arch/x86/mm/init.c | 4
-rw-r--r-- arch/x86/mm/init_32.c | 10
-rw-r--r-- arch/x86/mm/init_64.c | 35
-rw-r--r-- arch/x86/mm/k8topology_64.c | 101
-rw-r--r-- arch/x86/mm/numa_32.c | 4
-rw-r--r-- arch/x86/mm/numa_64.c | 252
-rw-r--r-- arch/x86/mm/pageattr.c | 22
-rw-r--r-- arch/x86/mm/pat.c | 13
-rw-r--r-- arch/x86/mm/setup_nx.c | 59
-rw-r--r-- arch/x86/mm/tlb.c | 3
-rw-r--r-- arch/x86/xen/enlighten.c | 4
42 files changed, 650 insertions(+), 339 deletions(-)
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 5e99762eb5c2..08e442bc3ab9 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -301,15 +301,11 @@ config X86_CPU
 
 #
 # Define implied options from the CPU selection here
-config X86_L1_CACHE_BYTES
+config X86_INTERNODE_CACHE_SHIFT
 	int
-	default "128" if MPSC
-	default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32
-
-config X86_INTERNODE_CACHE_BYTES
-	int
-	default "4096" if X86_VSMP
-	default X86_L1_CACHE_BYTES if !X86_VSMP
+	default "12" if X86_VSMP
+	default "7" if NUMA
+	default X86_L1_CACHE_SHIFT
 
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
@@ -317,9 +313,9 @@ config X86_CMPXCHG
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
+	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
 	default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
 	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
-	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
 
 config X86_XADD
 	def_bool y
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 077e1b69198e..faff0dc9c06a 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -107,8 +107,7 @@ ENTRY(startup_32)
 	lgdt	gdt(%ebp)
 
 	/* Enable PAE mode */
-	xorl	%eax, %eax
-	orl	$(X86_CR4_PAE), %eax
+	movl	$(X86_CR4_PAE), %eax
 	movl	%eax, %cr4
 
 	/*
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f4193bb48782..a6f1a59a5b0c 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
 
 #undef i386
 
+#include <asm/cache.h>
 #include <asm/page_types.h>
 
 #ifdef CONFIG_X86_64
@@ -46,7 +47,7 @@ SECTIONS
 		*(.data.*)
 		_edata = . ;
 	}
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	. = ALIGN(L1_CACHE_BYTES);
 	.bss : {
 		_bss = . ;
 		*(.bss)
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc500903..60d2b2db0bc5 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void);
 extern unsigned long acpi_wakeup_address;
 
 /* early initialization routine */
-extern void acpi_reserve_bootmem(void);
+extern void acpi_reserve_wakeup_memory(void);
 
 /*
  * Check if the CPU can handle C2 and deeper
@@ -158,6 +158,7 @@ struct bootnode;
 
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
+extern int acpi_get_nodes(struct bootnode *physnodes);
 extern int acpi_scan_nodes(unsigned long start, unsigned long end);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 549860d3be8f..2f9047cfaaca 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -9,12 +9,13 @@
 
 #define __read_mostly __attribute__((__section__(".data.read_mostly")))
 
+#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
+#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
+
 #ifdef CONFIG_X86_VSMP
-/* vSMP Internode cacheline shift */
-#define INTERNODE_CACHE_SHIFT (12)
 #ifdef CONFIG_SMP
 #define __cacheline_aligned_in_smp \
-	__attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
+	__attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
 	__page_aligned_data
 #endif
 #endif
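Worked example for the new derivation: on vSMP, CONFIG_X86_INTERNODE_CACHE_SHIFT is 12 (see the Kconfig.cpu hunk above), so INTERNODE_CACHE_BYTES = 1 << 12 = 4096 bytes, exactly the value the deleted X86_INTERNODE_CACHE_BYTES option hard-coded; with NUMA the shift is 7 (128 bytes), and otherwise it falls back to X86_L1_CACHE_SHIFT, so non-vSMP kernels keep ordinary L1 cacheline alignment.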
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 9076add593a8..634c40a739a6 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -177,6 +177,7 @@ void clflush_cache_range(void *addr, unsigned int size);
 #ifdef CONFIG_DEBUG_RODATA
 void mark_rodata_ro(void);
 extern const int rodata_test_data;
+extern int kernel_set_to_readonly;
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
 #else
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 40b4e614fe71..761249e396fe 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -61,6 +61,12 @@ struct e820map {
 	struct e820entry map[E820_X_MAX];
 };
 
+#define ISA_START_ADDRESS	0xa0000
+#define ISA_END_ADDRESS		0x100000
+
+#define BIOS_BEGIN		0x000a0000
+#define BIOS_END		0x00100000
+
 #ifdef __KERNEL__
 /* see comment in arch/x86/kernel/e820.c */
 extern struct e820map e820;
@@ -126,15 +132,18 @@ extern void e820_reserve_resources(void);
 extern void e820_reserve_resources_late(void);
 extern void setup_memory_map(void);
 extern char *default_machine_specific_memory_setup(void);
-#endif /* __KERNEL__ */
-#endif /* __ASSEMBLY__ */
 
-#define ISA_START_ADDRESS	0xa0000
-#define ISA_END_ADDRESS		0x100000
-#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
+/*
+ * Returns true iff the specified range [s,e) is completely contained inside
+ * the ISA region.
+ */
+static inline bool is_ISA_range(u64 s, u64 e)
+{
+	return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS;
+}
 
-#define BIOS_BEGIN		0x000a0000
-#define BIOS_END		0x00100000
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLY__ */
 
 #ifdef __KERNEL__
 #include <linux/ioport.h>
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c2d1f3b58e5f..f70e60071fe8 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -4,13 +4,16 @@
 #include <linux/pci.h>
 
 extern struct pci_device_id k8_nb_ids[];
+struct bootnode;
 
 extern int early_is_k8_nb(u32 value);
 extern struct pci_dev **k8_northbridges;
 extern int num_k8_northbridges;
 extern int cache_k8_northbridges(void);
 extern void k8_flush_garts(void);
-extern int k8_scan_nodes(unsigned long start, unsigned long end);
+extern int k8_get_nodes(struct bootnode *nodes);
+extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int k8_scan_nodes(void);
 
 #ifdef CONFIG_K8_NB
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 61d90b1331c3..d8bf23a88d05 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -71,12 +71,7 @@ static inline void early_get_smp_config(void)
 
 static inline void find_smp_config(void)
 {
-	x86_init.mpparse.find_smp_config(1);
-}
-
-static inline void early_find_smp_config(void)
-{
-	x86_init.mpparse.find_smp_config(0);
+	x86_init.mpparse.find_smp_config();
 }
 
 #ifdef CONFIG_X86_MPPARSE
@@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
 # else
 # define default_mpc_oem_bus_info NULL
 # endif
-extern void default_find_smp_config(unsigned int reserve);
+extern void default_find_smp_config(void);
 extern void default_get_smp_config(unsigned int early);
 #else
 static inline void early_reserve_e820_mpc_new(void) { }
@@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
 #define default_mpc_apic_id NULL
 #define default_smp_read_mpc_oem NULL
 #define default_mpc_oem_bus_info NULL
-#define default_find_smp_config x86_init_uint_noop
+#define default_find_smp_config x86_init_noop
 #define default_get_smp_config x86_init_uint_noop
 #endif
 
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5ccff85..642fe34b36a2 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped;
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+			 int acpi, int k8);
 extern void free_initmem(void);
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af6fd360ab35..a34c785c5a63 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -16,6 +16,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/x86_init.h>
+
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
@@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
 					 unsigned long new_flags)
 {
 	/*
-	 * PAT type is always WB for ISA. So no need to check.
+	 * PAT type is always WB for untracked ranges, so no need to check.
 	 */
-	if (is_ISA_range(paddr, paddr + size - 1))
+	if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
 		return 1;
 
 	/*
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 621f56d73121..4009f6534f52 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,18 +5,19 @@
 
 /* misc architecture specific prototypes */
 
-extern void early_idt_handler(void);
+void early_idt_handler(void);
 
-extern void system_call(void);
-extern void syscall_init(void);
+void system_call(void);
+void syscall_init(void);
 
-extern void ia32_syscall(void);
-extern void ia32_cstar_target(void);
-extern void ia32_sysenter_target(void);
+void ia32_syscall(void);
+void ia32_cstar_target(void);
+void ia32_sysenter_target(void);
 
-extern void syscall32_cpu_init(void);
+void syscall32_cpu_init(void);
 
-extern void check_efer(void);
+void x86_configure_nx(void);
+void x86_report_nx(void);
 
 extern int reboot_force;
 
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 1b7ee5d673c2..0a5242428659 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,13 @@
 #define _ASM_X86_SECTIONS_H
 
 #include <asm-generic/sections.h>
+#include <asm/uaccess.h>
 
 extern char __brk_base[], __brk_limit[];
+extern struct exception_table_entry __stop___ex_table[];
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+extern char __end_rodata_hpage_align[];
+#endif
 
 #endif	/* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index d8e71459f025..ea0e8ea15e15 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -26,7 +26,7 @@ struct x86_init_mpparse {
 	void (*smp_read_mpc_oem)(struct mpc_table *mpc);
 	void (*mpc_oem_pci_bus)(struct mpc_bus *m);
 	void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
-	void (*find_smp_config)(unsigned int reserve);
+	void (*find_smp_config)(void);
 	void (*get_smp_config)(unsigned int early);
 };
 
@@ -125,12 +125,14 @@ struct x86_cpuinit_ops {
  * @calibrate_tsc:		calibrate TSC
  * @get_wallclock:		get time from HW clock like RTC etc.
  * @set_wallclock:		set time back to HW clock
+ * @is_untracked_pat_range	exclude from PAT logic
  */
 struct x86_platform_ops {
 	unsigned long (*calibrate_tsc)(void);
 	unsigned long (*get_wallclock)(void);
 	int (*set_wallclock)(unsigned long nowtime);
 	void (*iommu_shutdown)(void);
+	bool (*is_untracked_pat_range)(u64 start, u64 end);
 };
 
 extern struct x86_init_ops x86_init;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..82e508677b91 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
 #ifndef CONFIG_64BIT
 	store_gdt((struct desc_ptr *)&header->pmode_gdt);
 
-	header->pmode_efer_low = nx_enabled;
-	if (header->pmode_efer_low & 1) {
-		/* This is strange, why not save efer, always? */
-		rdmsr(MSR_EFER, header->pmode_efer_low,
-			header->pmode_efer_high);
-	}
+	if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
+		       &header->pmode_efer_high))
+		header->pmode_efer_low = header->pmode_efer_high = 0;
 #endif /* !CONFIG_64BIT */
 
 	header->pmode_cr0 = read_cr0();
@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
 
 
 /**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
  *
  * We allocate a page from the first 1MB of memory for the wakeup
 * routine for when we come back from a sleep state. The
 * runtime allocator allows specification of <16MB pages, but not
 * <1MB pages.
 */
-void __init acpi_reserve_bootmem(void)
+void __init acpi_reserve_wakeup_memory(void)
 {
+	unsigned long mem;
+
 	if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
 		printk(KERN_ERR
 		       "ACPI: Wakeup code way too big, S3 disabled.\n");
 		return;
 	}
 
-	acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
+	mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
 
-	if (!acpi_realmode) {
+	if (mem == -1L) {
 		printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
 		return;
 	}
-
-	acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
+	acpi_realmode = (unsigned long) phys_to_virt(mem);
+	acpi_wakeup_address = mem;
+	reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
 }
 
 
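The find_e820_area()/reserve_early() pairing above replaces the bootmem allocator and recurs in the mpparse.c and setup.c hunks below. A minimal sketch of the idiom, assuming the 2.6.32-era e820 API:

	/* Sketch only: reserve a physical range before bootmem exists.
	 * find_e820_area() returns a physical address, or -1UL when no
	 * fit exists; reserve_early() records the range so that later
	 * allocators will not hand it out again. */
	u64 mem = find_e820_area(start, end, size, align);
	if (mem == -1L)
		return;					/* no suitable area */
	reserve_early(mem, mem + size, "MY REGION");	/* label is illustrative */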
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 07cdbdcd7a92..98c4665f251c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
 static __init void early_check_numaq(void)
 {
 	/*
-	 * Find possible boot-time SMP configuration:
-	 */
-	early_find_smp_config();
-
-	/*
 	 * get boot-time SMP configuration:
 	 */
 	if (smp_found_config)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 130c4b934877..b684bb303cbf 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -30,10 +30,22 @@
 #include <asm/apic.h>
 #include <asm/ipi.h>
 #include <asm/smp.h>
+#include <asm/x86_init.h>
 
 DEFINE_PER_CPU(int, x2apic_extra_bits);
 
 static enum uv_system_type uv_system_type;
+static u64 gru_start_paddr, gru_end_paddr;
+
+static inline bool is_GRU_range(u64 start, u64 end)
+{
+	return start >= gru_start_paddr && end <= gru_end_paddr;
+}
+
+static bool uv_is_untracked_pat_range(u64 start, u64 end)
+{
+	return is_ISA_range(start, end) || is_GRU_range(start, end);
+}
 
 static int early_get_nodeid(void)
 {
@@ -49,6 +61,7 @@ static int early_get_nodeid(void)
 static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strcmp(oem_id, "SGI")) {
+		x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
 		if (!strcmp(oem_table_id, "UVL"))
 			uv_system_type = UV_LEGACY_APIC;
 		else if (!strcmp(oem_table_id, "UVX"))
@@ -385,8 +398,12 @@ static __init void map_gru_high(int max_pnode)
 	int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
 
 	gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
-	if (gru.s.enable)
+	if (gru.s.enable) {
 		map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
+		gru_start_paddr = ((u64)gru.s.base << shift);
+		gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
+
+	}
 }
 
 static __init void map_mmr_high(int max_pnode)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a4ec8b647544..c1afa990a6c8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1136,7 +1136,7 @@ void __cpuinit cpu_init(void)
 	wrmsrl(MSR_KERNEL_GS_BASE, 0);
 	barrier();
 
-	check_efer();
+	x86_configure_nx();
 	if (cpu != 0)
 		enable_x2apic();
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835b35e8..c900b73f9224 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -263,8 +263,12 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 	/* Don't do the funky fallback heuristics the AMD version employs
 	   for now. */
 	node = apicid_to_node[apicid];
-	if (node == NUMA_NO_NODE || !node_online(node))
+	if (node == NUMA_NO_NODE)
 		node = first_node(node_online_map);
+	else if (!node_online(node)) {
+		/* reuse the value from init_cpu_to_node() */
+		node = cpu_to_node(cpu);
+	}
 	numa_set_node(cpu, node);
 
 	printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 73c86db5acbe..09b1698e0466 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2)
 	return start1 - start2;
 }
 
+static int __init clean_sort_range(struct res_range *range, int az)
+{
+	int i, j, k = az - 1, nr_range = 0;
+
+	for (i = 0; i < k; i++) {
+		if (range[i].end)
+			continue;
+		for (j = k; j > i; j--) {
+			if (range[j].end) {
+				k = j;
+				break;
+			}
+		}
+		if (j == i)
+			break;
+		range[i].start = range[k].start;
+		range[i].end = range[k].end;
+		range[k].start = 0;
+		range[k].end = 0;
+		k--;
+	}
+	/* count it */
+	for (i = 0; i < az; i++) {
+		if (!range[i].end) {
+			nr_range = i;
+			break;
+		}
+	}
+
+	/* sort them */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
+	return nr_range;
+}
+
 #define BIOS_BUG_MSG KERN_WARNING \
 	"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
 
@@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 		subtract_range(range, extra_remove_base,
 				 extra_remove_base + extra_remove_size - 1);
 
-	/* get new range num */
-	nr_range = 0;
-	for (i = 0; i < RANGE_NUM; i++) {
-		if (!range[i].end)
-			continue;
-		nr_range++;
-	}
 	if (debug_print) {
 		printk(KERN_DEBUG "After UC checking\n");
-		for (i = 0; i < nr_range; i++)
+		for (i = 0; i < RANGE_NUM; i++) {
+			if (!range[i].end)
+				continue;
 			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
 				 range[i].start, range[i].end + 1);
+		}
 	}
 
 	/* sort the ranges */
-	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	nr_range = clean_sort_range(range, RANGE_NUM);
 	if (debug_print) {
 		printk(KERN_DEBUG "After sorting\n");
 		for (i = 0; i < nr_range; i++)
@@ -689,8 +720,6 @@ static int __init mtrr_need_cleanup(void)
 			continue;
 		if (!size)
 			type = MTRR_NUM_TYPES;
-		if (type == MTRR_TYPE_WRPROT)
-			type = MTRR_TYPE_UNCACHABLE;
 		num[type]++;
 	}
 
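clean_sort_range() first compacts the array (entries with .end == 0 are holes and are swept to the tail), then counts the live entries and sorts only those. A hypothetical before/after, assuming the same struct res_range as above:

	struct res_range r[4] = {
		{ .start = 0x300, .end = 0x3ff },
		{ .start = 0,     .end = 0     },	/* hole */
		{ .start = 0x100, .end = 0x1ff },
		{ .start = 0,     .end = 0     },	/* hole */
	};
	int nr = clean_sort_range(r, 4);
	/* nr == 2; r[0] = 0x100-0x1ff, r[1] = 0x300-0x3ff, holes at the tail */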
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 5a1b9758fd62..309689245431 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -189,9 +189,26 @@ static void wait_for_nmi(void)
 		nmi_wait_count++;
 }
 
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+	return addr >= start && addr < end;
+}
+
 static int
 do_ftrace_mod_code(unsigned long ip, void *new_code)
 {
+	/*
+	 * On x86_64, kernel text mappings are mapped read-only with
+	 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
+	 * of the kernel text mapping to modify the kernel text.
+	 *
+	 * For 32bit kernels, these mappings are same and we can use
+	 * kernel identity mapping to modify code.
+	 */
+	if (within(ip, (unsigned long)_text, (unsigned long)_etext))
+		ip = (unsigned long)__va(__pa(ip));
+
 	mod_code_ip = (void *)ip;
 	mod_code_newcode = new_code;
 
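The comment above relies on kernel text being mapped twice on x86_64: read-only at the high kernel mapping and writable in the direct (identity) mapping. A worked example under assumed constants (PAGE_OFFSET = 0xffff880000000000, __START_KERNEL_map = 0xffffffff80000000, phys_base = 0): for ip = 0xffffffff8100f000, __pa(ip) = 0x100f000, so __va(__pa(ip)) = 0xffff88000100f000, and the patch is written through the writable alias of the same physical page.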
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 050c278481b1..7fd318bac59c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
 #include <asm/asm-offsets.h>
 #include <asm/setup.h>
 #include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeature.h>
 #include <asm/percpu.h>
 
 /* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
 	orl %edx,%eax
 	movl %eax,%cr4
 
-	btl $5, %eax		# check if PAE is enabled
-	jnc 6f
+	testb $X86_CR4_PAE, %al		# check if PAE is enabled
+	jz 6f
 
 	/* Check if extended functions are implemented */
 	movl $0x80000000, %eax
 	cpuid
-	cmpl $0x80000000, %eax
-	jbe 6f
+	/* Value must be in the range 0x80000001 to 0x8000ffff */
+	subl $0x80000001, %eax
+	cmpl $(0x8000ffff-0x80000001), %eax
+	ja 6f
 	mov $0x80000001, %eax
 	cpuid
 	/* Execute Disable bit supported? */
-	btl $20, %edx
+	btl $(X86_FEATURE_NX & 31), %edx
 	jnc 6f
 
 	/* Setup EFER (Extended Feature Enable Register) */
-	movl $0xc0000080, %ecx
+	movl $MSR_EFER, %ecx
 	rdmsr
 
-	btsl $11, %eax
+	btsl $_EFER_NX, %eax
 	/* Make changes effective */
 	wrmsr
 
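The new CPUID check is the classic unsigned subtract-and-compare range test. A C equivalent of what the assembly does (illustrative only, not part of the patch):

	/* True iff 0x80000001 <= eax <= 0x8000ffff. One subtraction folds
	 * both bounds into a single unsigned comparison, and it rejects
	 * garbage such as 0xffffffff, which the old "cmpl/jbe" test (only
	 * requiring eax > 0x80000000) would have accepted. */
	static int ext_cpuid_level_valid(unsigned int eax)
	{
		return (eax - 0x80000001u) <= (0x8000ffffu - 0x80000001u);
	}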
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 22db86a37643..2d8b5035371c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
 	.quad	x86_64_start_kernel
 	ENTRY(initial_gs)
 	.quad	INIT_PER_CPU_VAR(irq_stack_union)
-	__FINITDATA
 
 	ENTRY(stack_start)
 	.quad	init_thread_union+THREAD_SIZE-8
 	.word	0
+	__FINITDATA
 
 bad_address:
 	jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
 	i = i + 1 ;				\
 	.endr
 
+	.data
 	/*
 	 * This default setting generates an ident mapping at address 0x100000
 	 * and a mapping for the kernel that precisely maps virtual address
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c843f8406da2..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -158,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
 {
 	int error;
 
-	if (nx_enabled)
-		set_pages_x(image->control_code_page, 1);
+	set_pages_x(image->control_code_page, 1);
 	error = machine_kexec_alloc_page_tables(image);
 	if (error)
 		return error;
@@ -173,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
 */
 void machine_kexec_cleanup(struct kimage *image)
 {
-	if (nx_enabled)
-		set_pages_nx(image->control_code_page, 1);
+	set_pages_nx(image->control_code_page, 1);
 	machine_kexec_free_page_tables(image);
 }
 
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef4ffec..35a57c963df9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early)
 	 */
 }
 
-static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
+static void __init smp_reserve_memory(struct mpf_intel *mpf)
 {
 	unsigned long size = get_mpc_size(mpf->physptr);
-#ifdef CONFIG_X86_32
-	/*
-	 * We cannot access to MPC table to compute table size yet,
-	 * as only few megabytes from the bottom is mapped now.
-	 * PC-9800's MPC table places on the very last of physical
-	 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
-	 * yields BUG() in reserve_bootmem.
-	 * also need to make sure physptr is below than max_low_pfn
-	 * we don't need reserve the area above max_low_pfn
-	 */
-	unsigned long end = max_low_pfn * PAGE_SIZE;
 
-	if (mpf->physptr < end) {
-		if (mpf->physptr + size > end)
-			size = end - mpf->physptr;
-		reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
-	}
-#else
-	reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
-#endif
+	reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
 }
 
-static int __init smp_scan_config(unsigned long base, unsigned long length,
-				  unsigned reserve)
+static int __init smp_scan_config(unsigned long base, unsigned long length)
 {
 	unsigned int *bp = phys_to_virt(base);
 	struct mpf_intel *mpf;
+	unsigned long mem;
 
 	apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
 			bp, length);
@@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 			printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
 			       mpf, (u64)virt_to_phys(mpf));
 
-			if (!reserve)
-				return 1;
-			reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
-						BOOTMEM_DEFAULT);
+			mem = virt_to_phys(mpf);
+			reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
 			if (mpf->physptr)
-				smp_reserve_bootmem(mpf);
+				smp_reserve_memory(mpf);
 
 			return 1;
 		}
@@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	return 0;
 }
 
-void __init default_find_smp_config(unsigned int reserve)
+void __init default_find_smp_config(void)
 {
 	unsigned int address;
 
@@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve)
 	 * 2) Scan the top 1K of base RAM
 	 * 3) Scan the 64K of bios
 	 */
-	if (smp_scan_config(0x0, 0x400, reserve) ||
-	    smp_scan_config(639 * 0x400, 0x400, reserve) ||
-	    smp_scan_config(0xF0000, 0x10000, reserve))
+	if (smp_scan_config(0x0, 0x400) ||
+	    smp_scan_config(639 * 0x400, 0x400) ||
+	    smp_scan_config(0xF0000, 0x10000))
 		return;
 	/*
 	 * If it is an SMP machine we should know now, unless the
@@ -767,7 +747,7 @@
 
 	address = get_bios_ebda();
 	if (address)
-		smp_scan_config(address, 0x400, reserve);
+		smp_scan_config(address, 0x400);
 }
 
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 82e88cdda9bc..946a311a25c9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -106,6 +106,7 @@
 #include <asm/percpu.h>
 #include <asm/topology.h>
 #include <asm/apicdef.h>
+#include <asm/k8.h>
 #ifdef CONFIG_X86_64
 #include <asm/numa_64.h>
 #endif
@@ -487,42 +488,11 @@ static void __init reserve_early_setup_data(void)
 
 #ifdef CONFIG_KEXEC
 
-/**
- * Reserve @size bytes of crashkernel memory at any suitable offset.
- *
- * @size: Size of the crashkernel memory to reserve.
- * Returns the base address on success, and -1ULL on failure.
- */
-static
-unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
-{
-	const unsigned long long alignment = 16<<20;	/* 16M */
-	unsigned long long start = 0LL;
-
-	while (1) {
-		int ret;
-
-		start = find_e820_area(start, ULONG_MAX, size, alignment);
-		if (start == -1ULL)
-			return start;
-
-		/* try to reserve it */
-		ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
-		if (ret >= 0)
-			return start;
-
-		start += alignment;
-	}
-}
-
 static inline unsigned long long get_total_mem(void)
 {
 	unsigned long long total;
 
-	total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	total += highend_pfn - highstart_pfn;
-#endif
+	total = max_pfn - min_low_pfn;
 
 	return total << PAGE_SHIFT;
 }
@@ -542,21 +512,25 @@
 
 	/* 0 means: find the address automatically */
 	if (crash_base <= 0) {
-		crash_base = find_and_reserve_crashkernel(crash_size);
+		const unsigned long long alignment = 16<<20;	/* 16M */
+
+		crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
+				 alignment);
 		if (crash_base == -1ULL) {
-			pr_info("crashkernel reservation failed. "
-				"No suitable area found.\n");
+			pr_info("crashkernel reservation failed - No suitable area found.\n");
 			return;
 		}
 	} else {
-		ret = reserve_bootmem_generic(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE);
-		if (ret < 0) {
-			pr_info("crashkernel reservation failed - "
-				"memory is in use\n");
+		unsigned long long start;
+
+		start = find_e820_area(crash_base, ULONG_MAX, crash_size,
+				 1<<20);
+		if (start != crash_base) {
+			pr_info("crashkernel reservation failed - memory is in use.\n");
 			return;
 		}
 	}
+	reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
 
 	printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
 			"for crashkernel (System RAM: %ldMB)\n",
@@ -699,6 +673,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
 
 void __init setup_arch(char **cmdline_p)
 {
+	int acpi = 0;
+	int k8 = 0;
+
 #ifdef CONFIG_X86_32
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	visws_early_detect();
@@ -791,21 +768,18 @@ void __init setup_arch(char **cmdline_p)
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
 
-#ifdef CONFIG_X86_64
 	/*
-	 * Must call this twice: Once just to detect whether hardware doesn't
-	 * support NX (so that the early EHCI debug console setup can safely
-	 * call set_fixmap(), and then again after parsing early parameters to
-	 * honor the respective command line option.
+	 * x86_configure_nx() is called before parse_early_param() to detect
+	 * whether hardware doesn't support NX (so that the early EHCI debug
+	 * console setup can safely call set_fixmap()). It may then be called
+	 * again from within noexec_setup() during parsing early parameters
+	 * to honor the respective command line option.
	 */
-	check_efer();
-#endif
+	x86_configure_nx();
 
 	parse_early_param();
 
-#ifdef CONFIG_X86_64
-	check_efer();
-#endif
+	x86_report_nx();
 
 	/* Must be before kernel pagetables are setup */
 	vmi_activate();
@@ -901,6 +875,13 @@ void __init setup_arch(char **cmdline_p)
 
 	reserve_brk();
 
+#ifdef CONFIG_ACPI_SLEEP
+	/*
+	 * Reserve low memory region for sleep support.
+	 * even before init_memory_mapping
+	 */
+	acpi_reserve_wakeup_memory();
+#endif
 	init_gbpages();
 
 	/* max_pfn_mapped is updated here */
@@ -927,6 +908,8 @@ void __init setup_arch(char **cmdline_p)
 
 	reserve_initrd();
 
+	reserve_crashkernel();
+
 	vsmp_init();
 
 	io_delay_init();
@@ -938,27 +921,24 @@ void __init setup_arch(char **cmdline_p)
 
 	early_acpi_boot_init();
 
+	/*
+	 * Find and reserve possible boot-time SMP configuration:
+	 */
+	find_smp_config();
+
 #ifdef CONFIG_ACPI_NUMA
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi_numa_init();
+	acpi = acpi_numa_init();
 #endif
 
-	initmem_init(0, max_pfn);
-
-#ifdef CONFIG_ACPI_SLEEP
-	/*
-	 * Reserve low memory region for sleep support.
-	 */
-	acpi_reserve_bootmem();
+#ifdef CONFIG_K8_NUMA
+	if (!acpi)
+		k8 = !k8_numa_init(0, max_pfn);
 #endif
-	/*
-	 * Find and reserve possible boot-time SMP configuration:
-	 */
-	find_smp_config();
 
-	reserve_crashkernel();
+	initmem_init(0, max_pfn, acpi, k8);
 
 #ifdef CONFIG_X86_64
 	/*
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 1740c85e24bb..364d015efebc 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -817,10 +817,8 @@ static int __init uv_init_blade(int blade)
 	 */
 	apicid = blade_to_first_apicid(blade);
 	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
-	if ((pa & 0xff) != UV_BAU_MESSAGE) {
-		uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
+	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
 				((apicid << 32) | UV_BAU_MESSAGE));
-	}
 	return 0;
 }
 
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index abda6f53e71e..34a279a7471d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
 	apic_version[m->apicid] = ver;
 }
 
-static void __init visws_find_smp_config(unsigned int reserve)
+static void __init visws_find_smp_config(void)
 {
 	struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
 	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3c68fe2d46cf..f3f2104408d9 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 #endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+/*
+ * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
+ * we retain large page mappings for boundaries spanning kernel text, rodata
+ * and data sections.
+ *
+ * However, kernel identity mappings will have different RWX permissions
+ * to the pages mapping to text and to the pages padding (which are freed) the
+ * text section. Hence kernel identity mappings will be broken to smaller
+ * pages. For 64-bit, kernel text and kernel identity mappings are different,
+ * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
+ * as well as retain 2MB large page mappings for kernel text.
+ */
+#define X64_ALIGN_DEBUG_RODATA_BEGIN	. = ALIGN(HPAGE_SIZE);
+
+#define X64_ALIGN_DEBUG_RODATA_END			\
+		. = ALIGN(HPAGE_SIZE);			\
+		__end_rodata_hpage_align = .;
+
+#else
+
+#define X64_ALIGN_DEBUG_RODATA_BEGIN
+#define X64_ALIGN_DEBUG_RODATA_END
+
+#endif
+
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
 	data PT_LOAD FLAGS(7);	/* RWE */
@@ -90,7 +116,9 @@ SECTIONS
 
 	EXCEPTION_TABLE(16) :text = 0x9090
 
+	X64_ALIGN_DEBUG_RODATA_BEGIN
 	RO_DATA(PAGE_SIZE)
+	X64_ALIGN_DEBUG_RODATA_END
 
 	/* Data */
 	.data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -107,13 +135,13 @@ SECTIONS
 
 		PAGE_ALIGNED_DATA(PAGE_SIZE)
 
-		CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
+		CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
 
 		DATA_DATA
 		CONSTRUCTORS
 
 		/* rarely changed data like cpu maps */
-		READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
+		READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
 
 		/* End of data section */
 		_edata = .;
@@ -137,12 +165,12 @@ SECTIONS
 		*(.vsyscall_0)
 	} :user
 
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	. = ALIGN(L1_CACHE_BYTES);
 	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
 		*(.vsyscall_fn)
 	}
 
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	. = ALIGN(L1_CACHE_BYTES);
 	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
 		*(.vsyscall_gtod_data)
 	}
@@ -166,7 +194,7 @@ SECTIONS
 	}
 	vgetcpu_mode = VVIRT(.vgetcpu_mode);
 
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	. = ALIGN(L1_CACHE_BYTES);
 	.jiffies : AT(VLOAD(.jiffies)) {
 		*(.jiffies)
 	}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index d11c5ff7c65e..ccd179dec36e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -13,6 +13,7 @@
 #include <asm/e820.h>
 #include <asm/time.h>
 #include <asm/irq.h>
+#include <asm/pat.h>
 #include <asm/tsc.h>
 #include <asm/iommu.h>
 
@@ -80,4 +81,5 @@ struct x86_platform_ops x86_platform = {
 	.get_wallclock			= mach_get_cmos_time,
 	.set_wallclock			= mach_set_rtc_mmss,
 	.iommu_shutdown			= iommu_shutdown_noop,
+	.is_untracked_pat_range		= is_ISA_range,
 };
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd5536f62..d406c5239019 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	use_gbpages = direct_gbpages;
 #endif
 
-	set_nx();
-	if (nx_enabled)
-		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-
 	/* Enable PSE if available */
 	if (cpu_has_pse)
 		set_in_cr4(X86_CR4_PSE);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1d8d5d..c973f8e2a6cf 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;
 }
 
-static void __init add_one_highpage_init(struct page *page, int pfn)
+static void __init add_one_highpage_init(struct page *page)
 {
 	ClearPageReserved(page);
 	init_page_count(page);
@@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
 		if (!pfn_valid(node_pfn))
 			continue;
 		page = pfn_to_page(node_pfn);
-		add_one_highpage_init(page, node_pfn);
+		add_one_highpage_init(page);
 	}
 
 	return 0;
@@ -703,8 +703,8 @@ void __init find_low_pfn_range(void)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(unsigned long start_pfn,
-			 unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+			 int acpi, int k8)
 {
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
@@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void)
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
-static int kernel_set_to_readonly;
+int kernel_set_to_readonly __read_mostly;
 
 void set_kernel_text_rw(void)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a6006b..5198b9bb34ef 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start,
 }
 
 #ifndef CONFIG_NUMA
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+			 int acpi, int k8)
 {
 	unsigned long bootmap_size, bootmap;
 
@@ -694,12 +695,12 @@ void __init mem_init(void)
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
-static int kernel_set_to_readonly;
+int kernel_set_to_readonly;
 
 void set_kernel_text_rw(void)
 {
-	unsigned long start = PFN_ALIGN(_stext);
-	unsigned long end = PFN_ALIGN(__start_rodata);
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long end = PFN_ALIGN(__stop___ex_table);
 
 	if (!kernel_set_to_readonly)
 		return;
@@ -707,13 +708,18 @@ void set_kernel_text_rw(void)
 	pr_debug("Set kernel text: %lx - %lx for read write\n",
 		 start, end);
 
+	/*
+	 * Make the kernel identity mapping for text RW. Kernel text
+	 * mapping will always be RO. Refer to the comment in
+	 * static_protections() in pageattr.c
+	 */
 	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
 }
 
 void set_kernel_text_ro(void)
 {
-	unsigned long start = PFN_ALIGN(_stext);
-	unsigned long end = PFN_ALIGN(__start_rodata);
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long end = PFN_ALIGN(__stop___ex_table);
 
 	if (!kernel_set_to_readonly)
 		return;
@@ -721,14 +727,21 @@ void set_kernel_text_ro(void)
 	pr_debug("Set kernel text: %lx - %lx for read only\n",
 		 start, end);
 
+	/*
+	 * Set the kernel identity mapping for text RO.
+	 */
 	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 }
 
 void mark_rodata_ro(void)
 {
-	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+	unsigned long start = PFN_ALIGN(_text);
 	unsigned long rodata_start =
 		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
+	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
+	unsigned long data_start = (unsigned long) &_sdata;
 
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 	       (end - start) >> 10);
@@ -751,6 +764,14 @@ void mark_rodata_ro(void)
 	printk(KERN_INFO "Testing CPA: again\n");
 	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
 #endif
+
+	free_init_pages("unused kernel memory",
+			(unsigned long) page_address(virt_to_page(text_end)),
+			(unsigned long)
+			 page_address(virt_to_page(rodata_start)));
+	free_init_pages("unused kernel memory",
+			(unsigned long) page_address(virt_to_page(rodata_end)),
+			(unsigned long) page_address(virt_to_page(data_start)));
 }
 
 #endif
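With rodata now aligned to HPAGE_SIZE by the vmlinux.lds.S change above, mark_rodata_ro() can give back the alignment padding. A rough layout sketch (symbol placement assumed for illustration, not taken from a real build):

	_text ........ text_end     (PAGE_ALIGN(__stop___ex_table))  kept read-only
	text_end ..... rodata_start (2MB-alignment padding)          freed
	rodata_start . rodata_end   (PAGE_ALIGN(__end_rodata))       kept read-only
	rodata_end ... data_start   (_sdata)                         freed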
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 268f8255280f..970ed579d4e4 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h> 25#include <asm/k8.h>
26 26
27static struct bootnode __initdata nodes[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
29
27static __init int find_northbridge(void) 30static __init int find_northbridge(void)
28{ 31{
29 int num; 32 int num;
@@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void)
 54 * need to get boot_cpu_id so we can use it to create apicid_to_node 57 * need to get boot_cpu_id so we can use it to create apicid_to_node
55 * in k8_scan_nodes() 58 * in k8_scan_nodes()
56 */ 59 */
57 /*
58 * Find possible boot-time SMP configuration:
59 */
60#ifdef CONFIG_X86_MPPARSE
61 early_find_smp_config();
62#endif
63#ifdef CONFIG_ACPI
64 /*
65 * Read APIC information from ACPI tables.
66 */
67 early_acpi_boot_init();
68#endif
69#ifdef CONFIG_X86_MPPARSE 60#ifdef CONFIG_X86_MPPARSE
70 /* 61 /*
71 * get boot-time SMP configuration: 62 * get boot-time SMP configuration:
@@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void)
76 early_init_lapic_mapping(); 67 early_init_lapic_mapping();
77} 68}
78 69
79int __init k8_scan_nodes(unsigned long start, unsigned long end) 70int __init k8_get_nodes(struct bootnode *physnodes)
80{ 71{
81 unsigned numnodes, cores, bits, apicid_base; 72 int i;
73 int ret = 0;
74
75 for_each_node_mask(i, nodes_parsed) {
76 physnodes[ret].start = nodes[i].start;
77 physnodes[ret].end = nodes[i].end;
78 ret++;
79 }
80 return ret;
81}
82
83int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
84{
85 unsigned long start = PFN_PHYS(start_pfn);
86 unsigned long end = PFN_PHYS(end_pfn);
87 unsigned numnodes;
82 unsigned long prevbase; 88 unsigned long prevbase;
83 struct bootnode nodes[8]; 89 int i, nb, found = 0;
84 int i, j, nb, found = 0;
85 u32 nodeid, reg; 90 u32 nodeid, reg;
86 91
87 if (!early_pci_allowed()) 92 if (!early_pci_allowed())
@@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
91 if (nb < 0) 96 if (nb < 0)
92 return nb; 97 return nb;
93 98
94 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 99 pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
95 100
96 reg = read_pci_config(0, nb, 0, 0x60); 101 reg = read_pci_config(0, nb, 0, 0x60);
97 numnodes = ((reg >> 4) & 0xF) + 1; 102 numnodes = ((reg >> 4) & 0xF) + 1;
98 if (numnodes <= 1) 103 if (numnodes <= 1)
99 return -1; 104 return -1;
100 105
101 printk(KERN_INFO "Number of nodes %d\n", numnodes); 106 pr_info("Number of physical nodes %d\n", numnodes);
102 107
103 memset(&nodes, 0, sizeof(nodes));
104 prevbase = 0; 108 prevbase = 0;
105 for (i = 0; i < 8; i++) { 109 for (i = 0; i < 8; i++) {
106 unsigned long base, limit; 110 unsigned long base, limit;
@@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
111 nodeid = limit & 7; 115 nodeid = limit & 7;
112 if ((base & 3) == 0) { 116 if ((base & 3) == 0) {
113 if (i < numnodes) 117 if (i < numnodes)
114 printk("Skipping disabled node %d\n", i); 118 pr_info("Skipping disabled node %d\n", i);
115 continue; 119 continue;
116 } 120 }
117 if (nodeid >= numnodes) { 121 if (nodeid >= numnodes) {
118 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, 122 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
119 base, limit); 123 base, limit);
120 continue; 124 continue;
121 } 125 }
122 126
123 if (!limit) { 127 if (!limit) {
124 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", 128 pr_info("Skipping node entry %d (base %lx)\n",
125 i, base); 129 i, base);
126 continue; 130 continue;
127 } 131 }
128 if ((base >> 8) & 3 || (limit >> 8) & 3) { 132 if ((base >> 8) & 3 || (limit >> 8) & 3) {
129 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 133 pr_err("Node %d using interleaving mode %lx/%lx\n",
130 nodeid, (base>>8)&3, (limit>>8) & 3); 134 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
131 return -1; 135 return -1;
132 } 136 }
133 if (node_isset(nodeid, node_possible_map)) { 137 if (node_isset(nodeid, nodes_parsed)) {
134 printk(KERN_INFO "Node %d already present. Skipping\n", 138 pr_info("Node %d already present, skipping\n",
135 nodeid); 139 nodeid);
136 continue; 140 continue;
137 } 141 }
138 142
@@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
141 limit |= (1<<24)-1; 145 limit |= (1<<24)-1;
142 limit++; 146 limit++;
143 147
144 if (limit > max_pfn << PAGE_SHIFT) 148 if (limit > end)
145 limit = max_pfn << PAGE_SHIFT; 149 limit = end;
146 if (limit <= base) 150 if (limit <= base)
147 continue; 151 continue;
148 152
@@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
154 if (limit > end) 158 if (limit > end)
155 limit = end; 159 limit = end;
156 if (limit == base) { 160 if (limit == base) {
157 printk(KERN_ERR "Empty node %d\n", nodeid); 161 pr_err("Empty node %d\n", nodeid);
158 continue; 162 continue;
159 } 163 }
160 if (limit < base) { 164 if (limit < base) {
161 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", 165 pr_err("Node %d bogus settings %lx-%lx.\n",
162 nodeid, base, limit); 166 nodeid, base, limit);
163 continue; 167 continue;
164 } 168 }
165 169
 166 /* Could sort here, but punt for now. Should not happen anyway. */ 170 /* Could sort here, but punt for now. Should not happen anyway. */
167 if (prevbase > base) { 171 if (prevbase > base) {
168 printk(KERN_ERR "Node map not sorted %lx,%lx\n", 172 pr_err("Node map not sorted %lx,%lx\n",
169 prevbase, base); 173 prevbase, base);
170 return -1; 174 return -1;
171 } 175 }
172 176
173 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 177 pr_info("Node %d MemBase %016lx Limit %016lx\n",
174 nodeid, base, limit); 178 nodeid, base, limit);
175 179
176 found++; 180 found++;
177 181
@@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
180 184
181 prevbase = base; 185 prevbase = base;
182 186
183 node_set(nodeid, node_possible_map); 187 node_set(nodeid, nodes_parsed);
184 } 188 }
185 189
186 if (!found) 190 if (!found)
187 return -1; 191 return -1;
192 return 0;
193}
194
195int __init k8_scan_nodes(void)
196{
197 unsigned int bits;
198 unsigned int cores;
199 unsigned int apicid_base;
200 int i;
188 201
202 BUG_ON(nodes_empty(nodes_parsed));
203 node_possible_map = nodes_parsed;
189 memnode_shift = compute_hash_shift(nodes, 8, NULL); 204 memnode_shift = compute_hash_shift(nodes, 8, NULL);
190 if (memnode_shift < 0) { 205 if (memnode_shift < 0) {
191 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 206 pr_err("No NUMA node hash function found. Contact maintainer\n");
192 return -1; 207 return -1;
193 } 208 }
194 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 209 pr_info("Using node hash shift of %d\n", memnode_shift);
195 210
196 /* use the coreid bits from early_identify_cpu */ 211 /* use the coreid bits from early_identify_cpu */
197 bits = boot_cpu_data.x86_coreid_bits; 212 bits = boot_cpu_data.x86_coreid_bits;
@@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
200 /* need to get boot_cpu_id early for system with apicid lifting */ 215 /* need to get boot_cpu_id early for system with apicid lifting */
201 early_get_boot_cpu_id(); 216 early_get_boot_cpu_id();
202 if (boot_cpu_physical_apicid > 0) { 217 if (boot_cpu_physical_apicid > 0) {
203 printk(KERN_INFO "BSP APIC ID: %02x\n", 218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
204 boot_cpu_physical_apicid);
205 apicid_base = boot_cpu_physical_apicid; 219 apicid_base = boot_cpu_physical_apicid;
206 } 220 }
207 221
208 for (i = 0; i < 8; i++) { 222 for_each_node_mask(i, node_possible_map) {
209 if (nodes[i].start == nodes[i].end) 223 int j;
210 continue;
211 224
212 e820_register_active_regions(i, 225 e820_register_active_regions(i,
213 nodes[i].start >> PAGE_SHIFT, 226 nodes[i].start >> PAGE_SHIFT,
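
k8_get_nodes() above is a compacting copy out of the file-scope nodes[]/nodes_parsed state this patch introduces: walk the parsed-node mask and emit contiguous {start, end} entries. A standalone sketch of the pattern, with a plain uint64_t bitmask standing in for nodemask_t:

#include <stdio.h>
#include <stdint.h>

struct bootnode { uint64_t start, end; };

static struct bootnode nodes[8];
static uint64_t nodes_parsed; /* bit i set => nodes[i] is valid */

/* Compacting copy, mirroring k8_get_nodes() in the diff. */
static int get_nodes(struct bootnode *physnodes)
{
        int i, ret = 0;

        for (i = 0; i < 8; i++) {
                if (!(nodes_parsed & (1ULL << i)))
                        continue;
                physnodes[ret].start = nodes[i].start;
                physnodes[ret].end = nodes[i].end;
                ret++;
        }
        return ret;
}

int main(void)
{
        struct bootnode phys[8];
        int n, i;

        nodes[1] = (struct bootnode){ 0x0, 0x40000000 };        /* 0-1 GiB */
        nodes[3] = (struct bootnode){ 0x40000000, 0x80000000 }; /* 1-2 GiB */
        nodes_parsed = (1ULL << 1) | (1ULL << 3);

        n = get_nodes(phys);
        for (i = 0; i < n; i++)
                printf("phys node %d: %#llx-%#llx\n", i,
                       (unsigned long long)phys[i].start,
                       (unsigned long long)phys[i].end);
        return 0;
}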
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d2530062fe00..b20760ca7244 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
347 (ulong) node_remap_end_vaddr[nid]); 347 (ulong) node_remap_end_vaddr[nid]);
348} 348}
349 349
350void __init initmem_init(unsigned long start_pfn, 350void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
351 unsigned long end_pfn) 351 int acpi, int k8)
352{ 352{
353 int nid; 353 int nid;
354 long kva_target_pfn; 354 long kva_target_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac71..83bbc70d11bb 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 239 bootmap = early_node_mem(nodeid, bootmap_start, end,
240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
241 if (bootmap == NULL) { 241 if (bootmap == NULL) {
242 if (nodedata_phys < start || nodedata_phys >= end) 242 if (nodedata_phys < start || nodedata_phys >= end) {
243 free_bootmem(nodedata_phys, pgdat_size); 243 /*
 244 * Only free it if it was allocated from another
 245 * node's bootmem.
246 */
247 if (nid != nodeid)
248 free_bootmem(nodedata_phys, pgdat_size);
249 }
244 node_data[nodeid] = NULL; 250 node_data[nodeid] = NULL;
245 return; 251 return;
246 } 252 }
@@ -306,8 +312,71 @@ void __init numa_init_array(void)
306 312
307#ifdef CONFIG_NUMA_EMU 313#ifdef CONFIG_NUMA_EMU
308/* Numa emulation */ 314/* Numa emulation */
315static struct bootnode nodes[MAX_NUMNODES] __initdata;
316static struct bootnode physnodes[MAX_NUMNODES] __initdata;
309static char *cmdline __initdata; 317static char *cmdline __initdata;
310 318
319static int __init setup_physnodes(unsigned long start, unsigned long end,
320 int acpi, int k8)
321{
322 int nr_nodes = 0;
323 int ret = 0;
324 int i;
325
326#ifdef CONFIG_ACPI_NUMA
327 if (acpi)
328 nr_nodes = acpi_get_nodes(physnodes);
329#endif
330#ifdef CONFIG_K8_NUMA
331 if (k8)
332 nr_nodes = k8_get_nodes(physnodes);
333#endif
334 /*
335 * Basic sanity checking on the physical node map: there may be errors
336 * if the SRAT or K8 incorrectly reported the topology or the mem=
337 * kernel parameter is used.
338 */
339 for (i = 0; i < nr_nodes; i++) {
340 if (physnodes[i].start == physnodes[i].end)
341 continue;
342 if (physnodes[i].start > end) {
343 physnodes[i].end = physnodes[i].start;
344 continue;
345 }
346 if (physnodes[i].end < start) {
347 physnodes[i].start = physnodes[i].end;
348 continue;
349 }
350 if (physnodes[i].start < start)
351 physnodes[i].start = start;
352 if (physnodes[i].end > end)
353 physnodes[i].end = end;
354 }
355
356 /*
357 * Remove all nodes that have no memory or were truncated because of the
358 * limited address range.
359 */
360 for (i = 0; i < nr_nodes; i++) {
361 if (physnodes[i].start == physnodes[i].end)
362 continue;
363 physnodes[ret].start = physnodes[i].start;
364 physnodes[ret].end = physnodes[i].end;
365 ret++;
366 }
367
368 /*
369 * If no physical topology was detected, a single node is faked to cover
370 * the entire address space.
371 */
372 if (!ret) {
373 physnodes[ret].start = start;
374 physnodes[ret].end = end;
375 ret = 1;
376 }
377 return ret;
378}
379
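
setup_physnodes() above clamps each detected node to the usable range, drops the ones that end up empty, and falls back to a single covering node when no topology was found. A compilable sketch of that normalization (ACPI/K8 detection stubbed out; all values illustrative):

#include <stdio.h>
#include <stdint.h>

struct bootnode { uint64_t start, end; };

/*
 * Clamp each detected node to [start, end), drop nodes left empty, and
 * fall back to one all-covering node -- the same shape as
 * setup_physnodes() in the diff.
 */
static int setup_physnodes(struct bootnode *physnodes, int nr_nodes,
                           uint64_t start, uint64_t end)
{
        int i, ret = 0;

        for (i = 0; i < nr_nodes; i++) {
                if (physnodes[i].start > end || physnodes[i].end < start) {
                        physnodes[i].end = physnodes[i].start; /* empty */
                        continue;
                }
                if (physnodes[i].start < start)
                        physnodes[i].start = start;
                if (physnodes[i].end > end)
                        physnodes[i].end = end;
        }
        for (i = 0; i < nr_nodes; i++) {        /* compact out empties */
                if (physnodes[i].start == physnodes[i].end)
                        continue;
                physnodes[ret++] = physnodes[i];
        }
        if (!ret) {                     /* no topology: fake one node */
                physnodes[0] = (struct bootnode){ start, end };
                ret = 1;
        }
        return ret;
}

int main(void)
{
        struct bootnode n[3] = {
                { 0x00000000, 0x20000000 },     /* partly below start */
                { 0x90000000, 0xa0000000 },     /* entirely past end  */
                { 0x20000000, 0x60000000 },
        };
        int i, cnt = setup_physnodes(n, 3, 0x10000000, 0x50000000);

        for (i = 0; i < cnt; i++)
                printf("node %d: %#llx-%#llx\n", i,
                       (unsigned long long)n[i].start,
                       (unsigned long long)n[i].end);
        return 0;
}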
311/* 380/*
 312 * Sets up nid to range from addr to addr + size. If the end 381 * Sets up nid to range from addr to addr + size. If the end
313 * boundary is greater than max_addr, then max_addr is used instead. 382 * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +384,9 @@ static char *cmdline __initdata;
315 * allocation past addr and -1 otherwise. addr is adjusted to be at 384 * allocation past addr and -1 otherwise. addr is adjusted to be at
316 * the end of the node. 385 * the end of the node.
317 */ 386 */
318static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, 387static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
319 u64 size, u64 max_addr)
320{ 388{
321 int ret = 0; 389 int ret = 0;
322
323 nodes[nid].start = *addr; 390 nodes[nid].start = *addr;
324 *addr += size; 391 *addr += size;
325 if (*addr >= max_addr) { 392 if (*addr >= max_addr) {
@@ -335,12 +402,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
335} 402}
336 403
337/* 404/*
405 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
406 * to max_addr. The return value is the number of nodes allocated.
407 */
408static int __init split_nodes_interleave(u64 addr, u64 max_addr,
409 int nr_phys_nodes, int nr_nodes)
410{
411 nodemask_t physnode_mask = NODE_MASK_NONE;
412 u64 size;
413 int big;
414 int ret = 0;
415 int i;
416
417 if (nr_nodes <= 0)
418 return -1;
419 if (nr_nodes > MAX_NUMNODES) {
420 pr_info("numa=fake=%d too large, reducing to %d\n",
421 nr_nodes, MAX_NUMNODES);
422 nr_nodes = MAX_NUMNODES;
423 }
424
425 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
426 /*
427 * Calculate the number of big nodes that can be allocated as a result
428 * of consolidating the remainder.
429 */
 430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
431 FAKE_NODE_MIN_SIZE;
432
433 size &= FAKE_NODE_MIN_HASH_MASK;
434 if (!size) {
435 pr_err("Not enough memory for each node. "
436 "NUMA emulation disabled.\n");
437 return -1;
438 }
439
440 for (i = 0; i < nr_phys_nodes; i++)
441 if (physnodes[i].start != physnodes[i].end)
442 node_set(i, physnode_mask);
443
444 /*
445 * Continue to fill physical nodes with fake nodes until there is no
446 * memory left on any of them.
447 */
448 while (nodes_weight(physnode_mask)) {
449 for_each_node_mask(i, physnode_mask) {
450 u64 end = physnodes[i].start + size;
451 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
452
453 if (ret < big)
454 end += FAKE_NODE_MIN_SIZE;
455
456 /*
457 * Continue to add memory to this fake node if its
458 * non-reserved memory is less than the per-node size.
459 */
460 while (end - physnodes[i].start -
461 e820_hole_size(physnodes[i].start, end) < size) {
462 end += FAKE_NODE_MIN_SIZE;
463 if (end > physnodes[i].end) {
464 end = physnodes[i].end;
465 break;
466 }
467 }
468
469 /*
470 * If there won't be at least FAKE_NODE_MIN_SIZE of
471 * non-reserved memory in ZONE_DMA32 for the next node,
472 * this one must extend to the boundary.
473 */
474 if (end < dma32_end && dma32_end - end -
475 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
476 end = dma32_end;
477
478 /*
479 * If there won't be enough non-reserved memory for the
480 * next node, this one must extend to the end of the
481 * physical node.
482 */
483 if (physnodes[i].end - end -
484 e820_hole_size(end, physnodes[i].end) < size)
485 end = physnodes[i].end;
486
487 /*
488 * Avoid allocating more nodes than requested, which can
489 * happen as a result of rounding down each node's size
490 * to FAKE_NODE_MIN_SIZE.
491 */
492 if (nodes_weight(physnode_mask) + ret >= nr_nodes)
493 end = physnodes[i].end;
494
495 if (setup_node_range(ret++, &physnodes[i].start,
496 end - physnodes[i].start,
497 physnodes[i].end) < 0)
498 node_clear(i, physnode_mask);
499 }
500 }
501 return ret;
502}
503
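
The size/big computation in split_nodes_interleave() is the core of the interleave: each fake node gets the average non-hole size rounded down to FAKE_NODE_MIN_SIZE, and the rounding losses are consolidated into a few "big" nodes. A quick standalone check of that arithmetic, ignoring e820 holes (FAKE_NODE_MIN_SIZE matches the kernel's 64 MiB; the memory total is made up):

#include <stdio.h>
#include <stdint.h>

#define FAKE_NODE_MIN_SIZE      (64ULL << 20)   /* 64 MiB */
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1))

int main(void)
{
        uint64_t total = (9ULL << 30) + (100ULL << 20); /* 9 GiB + 100 MiB */
        int nr_nodes = 4;                               /* numa=fake=4     */

        uint64_t size = total / nr_nodes;
        /* rounding loss per node, times nr_nodes, in MIN_SIZE units */
        uint64_t big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
                        FAKE_NODE_MIN_SIZE;
        size &= FAKE_NODE_MIN_HASH_MASK;

        printf("per-node size: %llu MiB, big nodes: %llu (each +%llu MiB)\n",
               (unsigned long long)(size >> 20), (unsigned long long)big,
               (unsigned long long)(FAKE_NODE_MIN_SIZE >> 20));
        return 0;
}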
504/*
338 * Splits num_nodes nodes up equally starting at node_start. The return value 505 * Splits num_nodes nodes up equally starting at node_start. The return value
339 * is the number of nodes split up and addr is adjusted to be at the end of the 506 * is the number of nodes split up and addr is adjusted to be at the end of the
340 * last node allocated. 507 * last node allocated.
341 */ 508 */
342static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, 509static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
343 u64 max_addr, int node_start,
344 int num_nodes) 510 int num_nodes)
345{ 511{
346 unsigned int big; 512 unsigned int big;
@@ -388,7 +554,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
388 break; 554 break;
389 } 555 }
390 } 556 }
391 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) 557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
392 break; 558 break;
393 } 559 }
394 return i - node_start + 1; 560 return i - node_start + 1;
@@ -399,12 +565,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
399 * always assigned to a final node and can be asymmetric. Returns the number of 565 * always assigned to a final node and can be asymmetric. Returns the number of
400 * nodes split. 566 * nodes split.
401 */ 567 */
402static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, 568static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
403 u64 max_addr, int node_start, u64 size) 569 u64 size)
404{ 570{
405 int i = node_start; 571 int i = node_start;
406 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 572 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
407 while (!setup_node_range(i++, nodes, addr, size, max_addr)) 573 while (!setup_node_range(i++, addr, size, max_addr))
408 ; 574 ;
409 return i - node_start; 575 return i - node_start;
410} 576}
@@ -413,15 +579,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413 * Sets up the system RAM area from start_pfn to last_pfn according to the 579 * Sets up the system RAM area from start_pfn to last_pfn according to the
414 * numa=fake command-line option. 580 * numa=fake command-line option.
415 */ 581 */
416static struct bootnode nodes[MAX_NUMNODES] __initdata; 582static int __init numa_emulation(unsigned long start_pfn,
417 583 unsigned long last_pfn, int acpi, int k8)
418static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
419{ 584{
420 u64 size, addr = start_pfn << PAGE_SHIFT; 585 u64 size, addr = start_pfn << PAGE_SHIFT;
421 u64 max_addr = last_pfn << PAGE_SHIFT; 586 u64 max_addr = last_pfn << PAGE_SHIFT;
422 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 587 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
588 int num_phys_nodes;
423 589
424 memset(&nodes, 0, sizeof(nodes)); 590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
425 /* 591 /*
426 * If the numa=fake command-line is just a single number N, split the 592 * If the numa=fake command-line is just a single number N, split the
427 * system RAM into N fake nodes. 593 * system RAM into N fake nodes.
@@ -429,7 +595,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
429 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 595 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
430 long n = simple_strtol(cmdline, NULL, 0); 596 long n = simple_strtol(cmdline, NULL, 0);
431 597
432 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); 598 num_nodes = split_nodes_interleave(addr, max_addr,
599 num_phys_nodes, n);
433 if (num_nodes < 0) 600 if (num_nodes < 0)
434 return num_nodes; 601 return num_nodes;
435 goto out; 602 goto out;
@@ -456,8 +623,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
456 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; 623 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
457 if (size) 624 if (size)
458 for (i = 0; i < coeff; i++, num_nodes++) 625 for (i = 0; i < coeff; i++, num_nodes++)
459 if (setup_node_range(num_nodes, nodes, 626 if (setup_node_range(num_nodes, &addr,
460 &addr, size, max_addr) < 0) 627 size, max_addr) < 0)
461 goto done; 628 goto done;
462 if (!*cmdline) 629 if (!*cmdline)
463 break; 630 break;
@@ -473,7 +640,7 @@ done:
473 if (addr < max_addr) { 640 if (addr < max_addr) {
474 if (coeff_flag && coeff < 0) { 641 if (coeff_flag && coeff < 0) {
475 /* Split remaining nodes into num-sized chunks */ 642 /* Split remaining nodes into num-sized chunks */
476 num_nodes += split_nodes_by_size(nodes, &addr, max_addr, 643 num_nodes += split_nodes_by_size(&addr, max_addr,
477 num_nodes, num); 644 num_nodes, num);
478 goto out; 645 goto out;
479 } 646 }
@@ -482,7 +649,7 @@ done:
482 /* Split remaining nodes into coeff chunks */ 649 /* Split remaining nodes into coeff chunks */
483 if (coeff <= 0) 650 if (coeff <= 0)
484 break; 651 break;
485 num_nodes += split_nodes_equally(nodes, &addr, max_addr, 652 num_nodes += split_nodes_equally(&addr, max_addr,
486 num_nodes, coeff); 653 num_nodes, coeff);
487 break; 654 break;
488 case ',': 655 case ',':
@@ -490,8 +657,8 @@ done:
490 break; 657 break;
491 default: 658 default:
492 /* Give one final node */ 659 /* Give one final node */
493 setup_node_range(num_nodes, nodes, &addr, 660 setup_node_range(num_nodes, &addr, max_addr - addr,
494 max_addr - addr, max_addr); 661 max_addr);
495 num_nodes++; 662 num_nodes++;
496 } 663 }
497 } 664 }
@@ -505,14 +672,10 @@ out:
505 } 672 }
506 673
507 /* 674 /*
508 * We need to vacate all active ranges that may have been registered by 675 * We need to vacate all active ranges that may have been registered for
509 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns 676 * the e820 memory map.
510 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
511 */ 677 */
512 remove_all_active_ranges(); 678 remove_all_active_ranges();
513#ifdef CONFIG_ACPI_NUMA
514 acpi_numa = -1;
515#endif
516 for_each_node_mask(i, node_possible_map) { 679 for_each_node_mask(i, node_possible_map) {
517 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 680 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
518 nodes[i].end >> PAGE_SHIFT); 681 nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +687,8 @@ out:
524} 687}
525#endif /* CONFIG_NUMA_EMU */ 688#endif /* CONFIG_NUMA_EMU */
526 689
527void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) 690void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
691 int acpi, int k8)
528{ 692{
529 int i; 693 int i;
530 694
@@ -532,23 +696,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
532 nodes_clear(node_online_map); 696 nodes_clear(node_online_map);
533 697
534#ifdef CONFIG_NUMA_EMU 698#ifdef CONFIG_NUMA_EMU
535 if (cmdline && !numa_emulation(start_pfn, last_pfn)) 699 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
536 return; 700 return;
537 nodes_clear(node_possible_map); 701 nodes_clear(node_possible_map);
538 nodes_clear(node_online_map); 702 nodes_clear(node_online_map);
539#endif 703#endif
540 704
541#ifdef CONFIG_ACPI_NUMA 705#ifdef CONFIG_ACPI_NUMA
542 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 706 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
543 last_pfn << PAGE_SHIFT)) 707 last_pfn << PAGE_SHIFT))
544 return; 708 return;
545 nodes_clear(node_possible_map); 709 nodes_clear(node_possible_map);
546 nodes_clear(node_online_map); 710 nodes_clear(node_online_map);
547#endif 711#endif
548 712
549#ifdef CONFIG_K8_NUMA 713#ifdef CONFIG_K8_NUMA
550 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 714 if (!numa_off && k8 && !k8_scan_nodes())
551 last_pfn<<PAGE_SHIFT))
552 return; 715 return;
553 nodes_clear(node_possible_map); 716 nodes_clear(node_possible_map);
554 nodes_clear(node_online_map); 717 nodes_clear(node_online_map);
@@ -601,6 +764,25 @@ static __init int numa_setup(char *opt)
601early_param("numa", numa_setup); 764early_param("numa", numa_setup);
602 765
603#ifdef CONFIG_NUMA 766#ifdef CONFIG_NUMA
767
768static __init int find_near_online_node(int node)
769{
770 int n, val;
771 int min_val = INT_MAX;
772 int best_node = -1;
773
774 for_each_online_node(n) {
775 val = node_distance(node, n);
776
777 if (val < min_val) {
778 min_val = val;
779 best_node = n;
780 }
781 }
782
783 return best_node;
784}
785
604/* 786/*
605 * Setup early cpu_to_node. 787 * Setup early cpu_to_node.
606 * 788 *
@@ -632,7 +814,7 @@ void __init init_cpu_to_node(void)
632 if (node == NUMA_NO_NODE) 814 if (node == NUMA_NO_NODE)
633 continue; 815 continue;
634 if (!node_online(node)) 816 if (!node_online(node))
635 continue; 817 node = find_near_online_node(node);
636 numa_set_node(cpu, node); 818 numa_set_node(cpu, node);
637 } 819 }
638} 820}
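
find_near_online_node() above picks the online node with the smallest SLIT distance, so a CPU whose home node is offline or memoryless is bound to a nearby node rather than a round-robin one. A self-contained sketch with an illustrative distance matrix (10 = local, larger = farther):

#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* Illustrative SLIT-style distances; not from any real machine. */
static const int distance[NR_NODES][NR_NODES] = {
        { 10, 16, 16, 22 },
        { 16, 10, 22, 16 },
        { 16, 22, 10, 16 },
        { 22, 16, 16, 10 },
};
static const int online[NR_NODES] = { 1, 0, 1, 1 };  /* node 1 offline */

/* Same shape as find_near_online_node() in the diff. */
static int find_near_online_node(int node)
{
        int n, val, min_val = INT_MAX, best_node = -1;

        for (n = 0; n < NR_NODES; n++) {
                if (!online[n])
                        continue;
                val = distance[node][n];
                if (val < min_val) {
                        min_val = val;
                        best_node = n;
                }
        }
        return best_node;
}

int main(void)
{
        /* A CPU homed on offline node 1 falls back to nearby node 0. */
        printf("nearest online node to 1: %d\n", find_near_online_node(1));
        return 0;
}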
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dd38bfbefd1f..1d4eb93d333c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -279,6 +279,22 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 280 pgprot_val(forbidden) |= _PAGE_RW;
281 281
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /*
284 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
285 * kernel text mappings for the large page aligned text, rodata sections
 286 * will always be read-only. The kernel identity mappings covering
 287 * the holes caused by this alignment can be anything the user asks for.
288 *
289 * This will preserve the large page mappings for kernel text/data
290 * at no extra cost.
291 */
292 if (kernel_set_to_readonly &&
293 within(address, (unsigned long)_text,
294 (unsigned long)__end_rodata_hpage_align))
295 pgprot_val(forbidden) |= _PAGE_RW;
296#endif
297
282 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 298 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
283 299
284 return prot; 300 return prot;
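
The static_protections() addition above keeps protections uniform across the large-page-aligned kernel text/rodata region, so its 2 MiB mappings never have to be split. A minimal sketch of the filtering idea, with within() modelled on the kernel's helper and all addresses illustrative:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define _PAGE_RW (1ULL << 1)

/* Semiclosed address-in-range test, like the kernel's within(). */
static bool within(uint64_t addr, uint64_t start, uint64_t end)
{
        return addr >= start && addr < end;
}

/*
 * Once the text is read-only, any request inside
 * [_text, __end_rodata_hpage_align) has RW filtered out, keeping the
 * whole aligned region uniformly RO.
 */
static uint64_t filter_prot(uint64_t prot, uint64_t addr,
                            uint64_t text, uint64_t rodata_hpage_end,
                            bool kernel_set_to_readonly)
{
        uint64_t forbidden = 0;

        if (kernel_set_to_readonly &&
            within(addr, text, rodata_hpage_end))
                forbidden |= _PAGE_RW;
        return prot & ~forbidden;
}

int main(void)
{
        uint64_t text = 0xffffffff81000000ULL;          /* illustrative */
        uint64_t hpage_end = 0xffffffff81800000ULL;     /* 2 MiB aligned */

        printf("prot inside text:  %#llx\n", (unsigned long long)
               filter_prot(_PAGE_RW, text + 0x1000, text, hpage_end, true));
        printf("prot outside text: %#llx\n", (unsigned long long)
               filter_prot(_PAGE_RW, hpage_end + 0x1000, text, hpage_end,
                           true));
        return 0;
}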
@@ -1069,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb);
1069 1085
1070int set_memory_x(unsigned long addr, int numpages) 1086int set_memory_x(unsigned long addr, int numpages)
1071{ 1087{
1088 if (!(__supported_pte_mask & _PAGE_NX))
1089 return 0;
1090
1072 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1091 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1073} 1092}
1074EXPORT_SYMBOL(set_memory_x); 1093EXPORT_SYMBOL(set_memory_x);
1075 1094
1076int set_memory_nx(unsigned long addr, int numpages) 1095int set_memory_nx(unsigned long addr, int numpages)
1077{ 1096{
1097 if (!(__supported_pte_mask & _PAGE_NX))
1098 return 0;
1099
1078 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1100 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1079} 1101}
1080EXPORT_SYMBOL(set_memory_nx); 1102EXPORT_SYMBOL(set_memory_nx);
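
The new guards make set_memory_x()/set_memory_nx() successful no-ops when NX is not represented in __supported_pte_mask, instead of attempts to set a reserved PTE bit. A toy model of the same pattern (the PTE array and mask handling here are illustrative, not the kernel's CPA machinery):

#include <stdio.h>
#include <stdint.h>

#define _PAGE_NX (1ULL << 63)   /* bit 63 of a PTE on x86-64 */

/* Stand-in for the kernel's __supported_pte_mask. */
static uint64_t supported_pte_mask;

static int set_memory_nx(uint64_t *pte, int numpages)
{
        if (!(supported_pte_mask & _PAGE_NX))
                return 0;       /* NX unsupported: nothing to do */
        while (numpages--)
                *pte++ |= _PAGE_NX;
        return 0;
}

int main(void)
{
        uint64_t ptes[2] = { 0x1000 | 1, 0x2000 | 1 };

        supported_pte_mask = 0;                 /* e.g. EFER.NX unset */
        set_memory_nx(ptes, 2);
        printf("without NX: pte0=%#llx\n", (unsigned long long)ptes[0]);

        supported_pte_mask = _PAGE_NX;          /* NX enabled at boot */
        set_memory_nx(ptes, 2);
        printf("with NX:    pte0=%#llx\n", (unsigned long long)ptes[0]);
        return 0;
}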
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0ec2bcf..a81b7e73275d 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -20,6 +20,7 @@
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/processor.h> 21#include <asm/processor.h>
22#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
23#include <asm/x86_init.h>
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/fcntl.h> 25#include <asm/fcntl.h>
25#include <asm/e820.h> 26#include <asm/e820.h>
@@ -388,7 +389,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
388 } 389 }
389 390
390 /* Low ISA region is always mapped WB in page table. No need to track */ 391 /* Low ISA region is always mapped WB in page table. No need to track */
391 if (is_ISA_range(start, end - 1)) { 392 if (x86_platform.is_untracked_pat_range(start, end)) {
392 if (new_type) 393 if (new_type)
393 *new_type = _PAGE_CACHE_WB; 394 *new_type = _PAGE_CACHE_WB;
394 return 0; 395 return 0;
@@ -499,7 +500,7 @@ int free_memtype(u64 start, u64 end)
499 return 0; 500 return 0;
500 501
501 /* Low ISA region is always mapped WB. No need to track */ 502 /* Low ISA region is always mapped WB. No need to track */
502 if (is_ISA_range(start, end - 1)) 503 if (x86_platform.is_untracked_pat_range(start, end))
503 return 0; 504 return 0;
504 505
505 is_range_ram = pat_pagerange_is_ram(start, end); 506 is_range_ram = pat_pagerange_is_ram(start, end);
@@ -582,7 +583,7 @@ static unsigned long lookup_memtype(u64 paddr)
582 int rettype = _PAGE_CACHE_WB; 583 int rettype = _PAGE_CACHE_WB;
583 struct memtype *entry; 584 struct memtype *entry;
584 585
585 if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) 586 if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
586 return rettype; 587 return rettype;
587 588
588 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { 589 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
@@ -1018,8 +1019,10 @@ static const struct file_operations memtype_fops = {
1018 1019
1019static int __init pat_memtype_list_init(void) 1020static int __init pat_memtype_list_init(void)
1020{ 1021{
1021 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, 1022 if (pat_enabled) {
1022 NULL, &memtype_fops); 1023 debugfs_create_file("pat_memtype_list", S_IRUSR,
1024 arch_debugfs_dir, NULL, &memtype_fops);
1025 }
1023 return 0; 1026 return 0;
1024} 1027}
1025 1028
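
The PAT hunks replace the hard-coded is_ISA_range() test with a per-platform hook taking a normal semiclosed [start, end) range, so callers no longer pass "end - 1". A sketch of the hook shape, assuming the usual ISA hole constants; the SGI UV override and the kernel's exact default logic are not reproduced here:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define ISA_START_ADDRESS 0xa0000ULL
#define ISA_END_ADDRESS   0x100000ULL

/*
 * Illustrative default: the legacy ISA hole is always mapped WB in the
 * page tables, so PAT does not track it. Semiclosed [start, end).
 */
static bool default_is_untracked_pat_range(uint64_t start, uint64_t end)
{
        return start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS;
}

/* Platform ops table; UV would install its own hook to skip GRU space. */
struct platform_ops {
        bool (*is_untracked_pat_range)(uint64_t start, uint64_t end);
};

static struct platform_ops x86_platform = {
        .is_untracked_pat_range = default_is_untracked_pat_range,
};

int main(void)
{
        printf("VGA hole tracked?  %s\n",
               x86_platform.is_untracked_pat_range(0xa0000, 0xc0000)
               ? "no" : "yes");
        printf("1MiB+ RAM tracked? %s\n",
               x86_platform.is_untracked_pat_range(0x100000, 0x200000)
               ? "no" : "yes");
        return 0;
}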
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 513d8ed5d2ec..a3250aa34086 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -3,10 +3,8 @@
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5#include <asm/pgtable.h> 5#include <asm/pgtable.h>
6#include <asm/proto.h>
6 7
7int nx_enabled;
8
9#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
10static int disable_nx __cpuinitdata; 8static int disable_nx __cpuinitdata;
11 9
12/* 10/*
@@ -22,48 +20,41 @@ static int __init noexec_setup(char *str)
22 if (!str) 20 if (!str)
23 return -EINVAL; 21 return -EINVAL;
24 if (!strncmp(str, "on", 2)) { 22 if (!strncmp(str, "on", 2)) {
25 __supported_pte_mask |= _PAGE_NX;
26 disable_nx = 0; 23 disable_nx = 0;
27 } else if (!strncmp(str, "off", 3)) { 24 } else if (!strncmp(str, "off", 3)) {
28 disable_nx = 1; 25 disable_nx = 1;
29 __supported_pte_mask &= ~_PAGE_NX;
30 } 26 }
27 x86_configure_nx();
31 return 0; 28 return 0;
32} 29}
33early_param("noexec", noexec_setup); 30early_param("noexec", noexec_setup);
34#endif
35 31
36#ifdef CONFIG_X86_PAE 32void __cpuinit x86_configure_nx(void)
37void __init set_nx(void)
38{ 33{
39 unsigned int v[4], l, h; 34 if (cpu_has_nx && !disable_nx)
40 35 __supported_pte_mask |= _PAGE_NX;
41 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { 36 else
42 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); 37 __supported_pte_mask &= ~_PAGE_NX;
38}
43 39
44 if ((v[3] & (1 << 20)) && !disable_nx) { 40void __init x86_report_nx(void)
45 rdmsr(MSR_EFER, l, h); 41{
46 l |= EFER_NX; 42 if (!cpu_has_nx) {
47 wrmsr(MSR_EFER, l, h); 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
48 nx_enabled = 1; 44 "missing in CPU or disabled in BIOS!\n");
49 __supported_pte_mask |= _PAGE_NX; 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) {
48 printk(KERN_INFO "NX (Execute Disable) protection: "
49 "disabled by kernel command line option\n");
50 } else {
51 printk(KERN_INFO "NX (Execute Disable) protection: "
52 "active\n");
50 } 53 }
51 }
52}
53#else 54#else
54void set_nx(void) 55 /* 32bit non-PAE kernel, NX cannot be used */
55{ 56 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
56} 57 "cannot be enabled: non-PAE kernel!\n");
57#endif 58#endif
58 59 }
59#ifdef CONFIG_X86_64
60void __cpuinit check_efer(void)
61{
62 unsigned long efer;
63
64 rdmsrl(MSR_EFER, efer);
65 if (!(efer & EFER_NX) || disable_nx)
66 __supported_pte_mask &= ~_PAGE_NX;
67} 60}
68#endif
69
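
The rewritten setup_nx.c funnels all NX policy through one configure function and one report function, replacing the separate set_nx()/check_efer() paths. A userspace model of that split, with cpu_has_nx and disable_nx as plain booleans standing in for CPUID and the noexec= parameter:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define _PAGE_NX (1ULL << 63)

static bool cpu_has_nx = true;
static bool disable_nx;
static uint64_t supported_pte_mask;

/* One policy function, as in the new x86_configure_nx(). */
static void configure_nx(void)
{
        if (cpu_has_nx && !disable_nx)
                supported_pte_mask |= _PAGE_NX;
        else
                supported_pte_mask &= ~_PAGE_NX;
}

/* One report function, printed once at boot, as in x86_report_nx(). */
static void report_nx(void)
{
        if (!cpu_has_nx)
                puts("Notice: NX protection missing in CPU or disabled in BIOS!");
        else if (disable_nx)
                puts("NX protection: disabled by kernel command line option");
        else
                puts("NX protection: active");
}

int main(void)
{
        configure_nx();
        report_nx();

        disable_nx = true;      /* as if noexec=off had been parsed */
        configure_nx();
        report_nx();
        return 0;
}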
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 9d7ce96e5a5c..d89075489664 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
290 290
291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
292 start, end); 292 start, end);
293 e820_register_active_regions(node, start >> PAGE_SHIFT,
294 end >> PAGE_SHIFT);
295 293
296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 294 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
297 update_nodes_add(node, start, end); 295 update_nodes_add(node, start, end);
@@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
338 336
339void __init acpi_numa_arch_fixup(void) {} 337void __init acpi_numa_arch_fixup(void) {}
340 338
339int __init acpi_get_nodes(struct bootnode *physnodes)
340{
341 int i;
342 int ret = 0;
343
344 for_each_node_mask(i, nodes_parsed) {
345 physnodes[ret].start = nodes[i].start;
346 physnodes[ret].end = nodes[i].end;
347 ret++;
348 }
349 return ret;
350}
351
341/* Use the information discovered above to actually set up the nodes. */ 352/* Use the information discovered above to actually set up the nodes. */
342int __init acpi_scan_nodes(unsigned long start, unsigned long end) 353int __init acpi_scan_nodes(unsigned long start, unsigned long end)
343{ 354{
@@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
350 for (i = 0; i < MAX_NUMNODES; i++) 361 for (i = 0; i < MAX_NUMNODES; i++)
351 cutoff_node(i, start, end); 362 cutoff_node(i, start, end);
352 363
353 if (!nodes_cover_memory(nodes)) {
354 bad_srat();
355 return -1;
356 }
357
358 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, 364 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
359 memblk_nodeid); 365 memblk_nodeid);
360 if (memnode_shift < 0) { 366 if (memnode_shift < 0) {
@@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
364 return -1; 370 return -1;
365 } 371 }
366 372
373 for_each_node_mask(i, nodes_parsed)
374 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
375 nodes[i].end >> PAGE_SHIFT);
376 if (!nodes_cover_memory(nodes)) {
377 bad_srat();
378 return -1;
379 }
380
367 /* Account for nodes with cpus and no memory */ 381 /* Account for nodes with cpus and no memory */
368 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 382 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
369 383
@@ -454,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
454 for (i = 0; i < num_nodes; i++) 468 for (i = 0; i < num_nodes; i++)
455 if (fake_nodes[i].start != fake_nodes[i].end) 469 if (fake_nodes[i].start != fake_nodes[i].end)
456 node_set(i, nodes_parsed); 470 node_set(i, nodes_parsed);
457 WARN_ON(!nodes_cover_memory(fake_nodes));
458} 471}
459 472
460static int null_slit_node_compare(int a, int b) 473static int null_slit_node_compare(int a, int b)
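
The srat_64.c reordering registers the parsed nodes' e820 regions first and only then runs the nodes_cover_memory() sanity check, declaring the SRAT bad if any RAM falls outside every node. A toy version of such a coverage check over semiclosed ranges (the real code compares e820 page counts per node; this just walks intervals):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct range { uint64_t start, end; }; /* semiclosed [start, end) */

static bool nodes_cover_memory(const struct range *ram, int nr_ram,
                               const struct range *nodes, int nr_nodes)
{
        for (int i = 0; i < nr_ram; i++) {
                uint64_t pos = ram[i].start;

                while (pos < ram[i].end) {
                        bool hit = false;

                        for (int j = 0; j < nr_nodes; j++)
                                if (pos >= nodes[j].start &&
                                    pos < nodes[j].end) {
                                        pos = nodes[j].end;
                                        hit = true;
                                        break;
                                }
                        if (!hit)
                                return false;
                }
        }
        return true;
}

int main(void)
{
        struct range ram[]   = { { 0x0, 0x9f000 },
                                 { 0x100000, 0x40000000 } };
        struct range nodes[] = { { 0x0, 0x20000000 },
                                 { 0x20000000, 0x40000000 } };

        printf("SRAT covers RAM: %s\n",
               nodes_cover_memory(ram, 2, nodes, 2) ? "yes" : "bad_srat()");
        return 0;
}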
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08eeb5c3..65b58e4b0b8b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@
8 8
9#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 10#include <asm/mmu_context.h>
11#include <asm/cache.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/uv/uv.h> 13#include <asm/uv/uv.h>
13 14
@@ -43,7 +44,7 @@ union smp_flush_state {
43 spinlock_t tlbstate_lock; 44 spinlock_t tlbstate_lock;
44 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
45 }; 46 };
46 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; 47 char pad[INTERNODE_CACHE_BYTES];
47} ____cacheline_internodealigned_in_smp; 48} ____cacheline_internodealigned_in_smp;
48 49
49/* State is put into the per CPU data section, but padded 50/* State is put into the per CPU data section, but padded
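
With INTERNODE_CACHE_BYTES now defined in <asm/cache.h> rather than a config option, the padding idiom above keeps each sender's flush state on its own internode cache line to avoid false sharing. A compilable sketch, with illustrative cache-line sizes (on VSMP the internode line can be 4096 bytes; elsewhere it equals L1_CACHE_BYTES):

#include <stdio.h>

#define L1_CACHE_BYTES          64      /* illustrative */
#define INTERNODE_CACHE_BYTES   L1_CACHE_BYTES

/* Pad per-sender flush state so entries never share an internode line. */
union smp_flush_state {
        struct {
                int flush_cpumask;      /* stand-ins for the real fields */
                int tlbstate_lock;
        } s;
        char pad[INTERNODE_CACHE_BYTES];
};

int main(void)
{
        printf("sizeof(union smp_flush_state) = %zu\n",
               sizeof(union smp_flush_state));
        return 0;
}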
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index dfbf70e65860..c462cea8ef09 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1093,10 +1093,8 @@ asmlinkage void __init xen_start_kernel(void)
1093 1093
1094 __supported_pte_mask |= _PAGE_IOMAP; 1094 __supported_pte_mask |= _PAGE_IOMAP;
1095 1095
1096#ifdef CONFIG_X86_64
1097 /* Work out if we support NX */ 1096 /* Work out if we support NX */
1098 check_efer(); 1097 x86_configure_nx();
1099#endif
1100 1098
1101 xen_setup_features(); 1099 xen_setup_features();
1102 1100