path: root/arch/x86
author	Tejun Heo <tj@kernel.org>	2011-05-02 08:08:43 -0400
committer	Tejun Heo <tj@kernel.org>	2011-05-02 08:08:47 -0400
commit	aff364860aa105b2deacc6f21ec8ef524460e3fc (patch)
tree	18409ebe16b25b141598da9b6386d69416c06afa /arch/x86
parent	c7a7b814c9dca9ee01b38e63b4a46de87156d3b6 (diff)
parent	993ba1585cbb03fab012e41d1a5d24330a283b31 (diff)
Merge branch 'x86/numa' into x86-mm
Merge reason: Pick up x86-32 remap allocator cleanup changes - 14
commits, 3fe14ab541^..993ba1585c.

  3fe14ab541: x86-32, numa: Fix failure condition check in alloc_remap()
  993ba1585c: x86-32, numa: Update remap allocator comments

Scheduled NUMA init 32/64bit unification changes depend on them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/apic.h         |   1
-rw-r--r--  arch/x86/include/asm/dma.h          |   1
-rw-r--r--  arch/x86/include/asm/topology.h     |   1
-rw-r--r--  arch/x86/kernel/amd_nb.c            |   2
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c       |   1
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c     |   4
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c  |  10
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c    |   2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c     |  20
-rw-r--r--  arch/x86/kernel/irq.c               |   1
-rw-r--r--  arch/x86/kernel/microcode_core.c    |   1
-rw-r--r--  arch/x86/kernel/reboot.c            |   1
-rw-r--r--  arch/x86/mm/numa_32.c               | 268
-rw-r--r--  arch/x86/mm/srat_32.c               |   1
-rw-r--r--  arch/x86/oprofile/nmi_int.c         |   5
-rw-r--r--  arch/x86/oprofile/op_counter.h      |   1
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c       |   1
17 files changed, 149 insertions, 172 deletions
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a279d98ea95..2b7d573be54 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -2,7 +2,6 @@
 #define _ASM_X86_APIC_H
 
 #include <linux/cpumask.h>
-#include <linux/delay.h>
 #include <linux/pm.h>
 
 #include <asm/alternative.h>
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 97b6d8114a4..057099e5fab 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -10,7 +10,6 @@
 
 #include <linux/spinlock.h>	/* And spinlocks */
 #include <asm/io.h>		/* need byte IO */
-#include <linux/delay.h>
 
 #ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
 #define dma_outb	outb_p
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 910a7084f7f..8dba76972fd 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -95,7 +95,6 @@ extern void setup_node_to_cpumask_map(void);
 #ifdef CONFIG_X86_32
 extern unsigned long node_start_pfn[];
 extern unsigned long node_end_pfn[];
-extern unsigned long node_remap_size[];
 #define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
 
 # define SD_CACHE_NICE_TRIES	1
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 6801959a8b2..4c39baa8fac 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -21,7 +21,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
 EXPORT_SYMBOL(amd_nb_misc_ids);
 
 static struct pci_device_id amd_nb_link_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_LINK) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
 	{}
 };
 
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index c4e557a1ebb..5260fe91bcb 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -16,6 +16,7 @@
 #include <linux/kprobes.h>
 #include <linux/nmi.h>
 #include <linux/module.h>
+#include <linux/delay.h>
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 u64 hw_nmi_get_sample_period(void)
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 6273eee5134..0aced70815f 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -93,10 +93,6 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
 			node_end_pfn[node]);
 
 	memory_present(node, node_start_pfn[node], node_end_pfn[node]);
-
-	node_remap_size[node] = node_memmap_size_bytes(node,
-					node_start_pfn[node],
-					node_end_pfn[node]);
 }
 
 /*
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 3c289281394..33b10a0fc09 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -23,6 +23,8 @@
 #include <linux/io.h>
 #include <linux/pci.h>
 #include <linux/kdebug.h>
+#include <linux/delay.h>
+#include <linux/crash_dump.h>
 
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
@@ -34,6 +36,7 @@
 #include <asm/ipi.h>
 #include <asm/smp.h>
 #include <asm/x86_init.h>
+#include <asm/emergency-restart.h>
 
 DEFINE_PER_CPU(int, x2apic_extra_bits);
 
@@ -810,4 +813,11 @@ void __init uv_system_init(void)
 
 	/* register Legacy VGA I/O redirection handler */
 	pci_register_set_vga_state(uv_set_vga_state);
+
+	/*
+	 * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
+	 * EFI is not enabled in the kdump kernel.
+	 */
+	if (is_kdump_kernel())
+		reboot_type = BOOT_ACPI;
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a05ef63eb4..3385ea26f68 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1626,7 +1626,7 @@ out:
 static unsigned int mce_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &mce_wait, wait);
-	if (rcu_dereference_check_mce(mcelog.next))
+	if (rcu_access_index(mcelog.next))
 		return POLLIN | POLLRDNORM;
 	if (!mce_apei_read_done && apei_check_mce())
 		return POLLIN | POLLRDNORM;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 307dfbbf4a8..929739a653d 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -293,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
 
 	/*
 	 * HACK!
-	 * We use this same function to initialize the mtrrs on boot.
-	 * The state of the boot cpu's mtrrs has been saved, and we want
-	 * to replicate across all the APs.
-	 * If we're doing that @reg is set to something special...
+	 *
+	 * We use this same function to initialize the mtrrs during boot,
+	 * resume, runtime cpu online and on an explicit request to set a
+	 * specific MTRR.
+	 *
+	 * During boot or suspend, the state of the boot cpu's mtrrs has been
+	 * saved, and we want to replicate that across all the cpus that come
+	 * online (either at the end of boot or resume or during a runtime cpu
+	 * online). If we're doing that, @reg is set to something special and on
+	 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
+	 * is unnecessary if at this point we are still on the cpu that started
+	 * the boot/resume sequence. But there is no guarantee that we are still
+	 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
+	 * sure that we are in sync with everyone else.
 	 */
 	if (reg != ~0U)
 		mtrr_if->set(reg, base, size, type);
-	else if (!mtrr_aps_delayed_init)
+	else
 		mtrr_if->set_all();
 
 	/* Wait for the others */
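For illustration only, a hedged sketch (not taken from this patch) of the two invocation shapes the rewritten comment describes; the literal argument values below are assumptions:

	/* explicit request: program one specific MTRR everywhere */
	set_mtrr(reg, base, size, type);

	/*
	 * boot/resume/cpu-online replication: @reg is the special ~0U,
	 * so each cpu replays the saved state via mtrr_if->set_all()
	 */
	set_mtrr(~0U, 0, 0, 0);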
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 948a31eae75..1cb0b9fc78d 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -8,6 +8,7 @@
 #include <linux/seq_file.h>
 #include <linux/smp.h>
 #include <linux/ftrace.h>
+#include <linux/delay.h>
 
 #include <asm/apic.h>
 #include <asm/io_apic.h>
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 5ed0ab549eb..f9242800bc8 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -550,6 +550,7 @@ static void __exit microcode_exit(void)
 	microcode_dev_exit();
 
 	unregister_hotcpu_notifier(&mc_cpu_notifier);
+	unregister_syscore_ops(&mc_syscore_ops);
 
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d3ce37edb54..08c44b08bf5 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -6,6 +6,7 @@
 #include <linux/dmi.h>
 #include <linux/sched.h>
 #include <linux/tboot.h>
+#include <linux/delay.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index bde3906420d..c757c0a3b52 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -104,13 +104,9 @@ extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
 
-unsigned long node_remap_size[MAX_NUMNODES];
 static void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
-static unsigned long kva_start_pfn;
-static unsigned long kva_pages;
-
 int __cpuinit numa_cpu_node(int cpu)
 {
 	return apic->x86_32_numa_cpu_node(cpu);
@@ -129,7 +125,6 @@ int __init get_memcfg_numa_flat(void)
 	node_end_pfn[0] = max_pfn;
 	memblock_x86_register_active_regions(0, 0, max_pfn);
 	memory_present(0, 0, max_pfn);
-	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
 	/* Indicate there is one node available. */
 	nodes_clear(node_online_map);
@@ -164,9 +159,8 @@ static void __init allocate_pgdat(int nid)
 {
 	char buf[16];
 
-	if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
-		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
-	else {
+	NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE));
+	if (!NODE_DATA(nid)) {
 		unsigned long pgdat_phys;
 		pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
 				 max_pfn_mapped<<PAGE_SHIFT,
@@ -182,25 +176,38 @@ static void __init allocate_pgdat(int nid)
 }
 
 /*
- * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
- * virtual address space (KVA) is reserved and portions of nodes are mapped
- * using it. This is to allow node-local memory to be allocated for
- * structures that would normally require ZONE_NORMAL. The memory is
- * allocated with alloc_remap() and callers should be prepared to allocate
- * from the bootmem allocator instead.
+ * Remap memory allocator
  */
 static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 static void *node_remap_end_vaddr[MAX_NUMNODES];
 static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
 
+/**
+ * alloc_remap - Allocate remapped memory
+ * @nid: NUMA node to allocate memory from
+ * @size: The size of allocation
+ *
+ * Allocate @size bytes from the remap area of NUMA node @nid. The
+ * size of the remap area is predetermined by init_alloc_remap() and
+ * only the callers considered there should call this function. For
+ * more info, please read the comment on top of init_alloc_remap().
+ *
+ * The caller must be ready to handle allocation failure from this
+ * function and fall back to regular memory allocator in such cases.
+ *
+ * CONTEXT:
+ * Single CPU early boot context.
+ *
+ * RETURNS:
+ * Pointer to the allocated memory on success, %NULL on failure.
+ */
 void *alloc_remap(int nid, unsigned long size)
 {
 	void *allocation = node_remap_alloc_vaddr[nid];
 
 	size = ALIGN(size, L1_CACHE_BYTES);
 
-	if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+	if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
 		return NULL;
 
 	node_remap_alloc_vaddr[nid] += size;
@@ -209,26 +216,6 @@ void *alloc_remap(int nid, unsigned long size)
 	return allocation;
 }
 
-static void __init remap_numa_kva(void)
-{
-	void *vaddr;
-	unsigned long pfn;
-	int node;
-
-	for_each_online_node(node) {
-		printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
-		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
-			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
-			printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
-				(unsigned long)vaddr,
-				node_remap_start_pfn[node] + pfn);
-			set_pmd_pfn((ulong) vaddr,
-				node_remap_start_pfn[node] + pfn,
-				PAGE_KERNEL_LARGE);
-		}
-	}
-}
-
 #ifdef CONFIG_HIBERNATION
 /**
  * resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -240,15 +227,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
 	int node;
 
 	for_each_online_node(node) {
-		unsigned long start_va, start_pfn, size, pfn;
+		unsigned long start_va, start_pfn, nr_pages, pfn;
 
 		start_va = (unsigned long)node_remap_start_vaddr[node];
 		start_pfn = node_remap_start_pfn[node];
-		size = node_remap_size[node];
+		nr_pages = (node_remap_end_vaddr[node] -
+			    node_remap_start_vaddr[node]) >> PAGE_SHIFT;
 
 		printk(KERN_DEBUG "%s: node %d\n", __func__, node);
 
-		for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
+		for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
 			unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
 			pgd_t *pgd = pgd_base + pgd_index(vaddr);
 			pud_t *pud = pud_offset(pgd, vaddr);
@@ -264,132 +252,102 @@ void resume_map_numa_kva(pgd_t *pgd_base)
 }
 #endif
 
-static __init unsigned long calculate_numa_remap_pages(void)
+/**
+ * init_alloc_remap - Initialize remap allocator for a NUMA node
+ * @nid: NUMA node to initizlie remap allocator for
+ *
+ * NUMA nodes may end up without any lowmem. As allocating pgdat and
+ * memmap on a different node with lowmem is inefficient, a special
+ * remap allocator is implemented which can be used by alloc_remap().
+ *
+ * For each node, the amount of memory which will be necessary for
+ * pgdat and memmap is calculated and two memory areas of the size are
+ * allocated - one in the node and the other in lowmem; then, the area
+ * in the node is remapped to the lowmem area.
+ *
+ * As pgdat and memmap must be allocated in lowmem anyway, this
+ * doesn't waste lowmem address space; however, the actual lowmem
+ * which gets remapped over is wasted. The amount shouldn't be
+ * problematic on machines this feature will be used.
+ *
+ * Initialization failure isn't fatal. alloc_remap() is used
+ * opportunistically and the callers will fall back to other memory
+ * allocation mechanisms on failure.
+ */
+static __init void init_alloc_remap(int nid)
 {
-	int nid;
-	unsigned long size, reserve_pages = 0;
+	unsigned long size, pfn;
+	u64 node_pa, remap_pa;
+	void *remap_va;
 
-	for_each_online_node(nid) {
-		u64 node_kva_target;
-		u64 node_kva_final;
-
-		/*
-		 * The acpi/srat node info can show hot-add memroy zones
-		 * where memory could be added but not currently present.
-		 */
-		printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
-			nid, node_start_pfn[nid], node_end_pfn[nid]);
-		if (node_start_pfn[nid] > max_pfn)
-			continue;
-		if (!node_end_pfn[nid])
-			continue;
-		if (node_end_pfn[nid] > max_pfn)
-			node_end_pfn[nid] = max_pfn;
-
-		/* ensure the remap includes space for the pgdat. */
-		size = node_remap_size[nid] + sizeof(pg_data_t);
-
-		/* convert size to large (pmd size) pages, rounding up */
-		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
-		/* now the roundup is correct, convert to PAGE_SIZE pages */
-		size = size * PTRS_PER_PTE;
-
-		node_kva_target = round_down(node_end_pfn[nid] - size,
-						 PTRS_PER_PTE);
-		node_kva_target <<= PAGE_SHIFT;
-		do {
-			node_kva_final = memblock_find_in_range(node_kva_target,
-					((u64)node_end_pfn[nid])<<PAGE_SHIFT,
-						((u64)size)<<PAGE_SHIFT,
-						LARGE_PAGE_BYTES);
-			node_kva_target -= LARGE_PAGE_BYTES;
-		} while (node_kva_final == MEMBLOCK_ERROR &&
-			 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
-
-		if (node_kva_final == MEMBLOCK_ERROR)
-			panic("Can not get kva ram\n");
-
-		node_remap_size[nid] = size;
-		node_remap_offset[nid] = reserve_pages;
-		reserve_pages += size;
-		printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
-				  " node %d at %llx\n",
-				size, nid, node_kva_final>>PAGE_SHIFT);
-
-		/*
-		 * prevent kva address below max_low_pfn want it on system
-		 * with less memory later.
-		 * layout will be: KVA address , KVA RAM
-		 *
-		 * we are supposed to only record the one less then max_low_pfn
-		 * but we could have some hole in high memory, and it will only
-		 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
-		 * to use it as free.
-		 * So memblock_x86_reserve_range here, hope we don't run out of that array
-		 */
-		memblock_x86_reserve_range(node_kva_final,
-			      node_kva_final+(((u64)size)<<PAGE_SHIFT),
-			      "KVA RAM");
-
-		node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
-	}
-	printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
-			reserve_pages);
-	return reserve_pages;
-}
+	/*
+	 * The acpi/srat node info can show hot-add memroy zones where
+	 * memory could be added but not currently present.
+	 */
+	printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+	       nid, node_start_pfn[nid], node_end_pfn[nid]);
+	if (node_start_pfn[nid] > max_pfn)
+		return;
+	if (!node_end_pfn[nid])
+		return;
+	if (node_end_pfn[nid] > max_pfn)
+		node_end_pfn[nid] = max_pfn;
 
-static void init_remap_allocator(int nid)
-{
-	node_remap_start_vaddr[nid] = pfn_to_kaddr(
-			kva_start_pfn + node_remap_offset[nid]);
-	node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
-		(node_remap_size[nid] * PAGE_SIZE);
-	node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
-		ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
-	printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
-		(ulong) node_remap_start_vaddr[nid],
-		(ulong) node_remap_end_vaddr[nid]);
+	/* calculate the necessary space aligned to large page size */
+	size = node_memmap_size_bytes(nid, node_start_pfn[nid],
+				      min(node_end_pfn[nid], max_pfn));
+	size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+	size = ALIGN(size, LARGE_PAGE_BYTES);
+
+	/* allocate node memory and the lowmem remap area */
+	node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT,
+					 (u64)node_end_pfn[nid] << PAGE_SHIFT,
+					 size, LARGE_PAGE_BYTES);
+	if (node_pa == MEMBLOCK_ERROR) {
+		pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
+			   size, nid);
+		return;
+	}
+	memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
+
+	remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
+					  max_low_pfn << PAGE_SHIFT,
+					  size, LARGE_PAGE_BYTES);
+	if (remap_pa == MEMBLOCK_ERROR) {
+		pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
+			   size, nid);
+		memblock_x86_free_range(node_pa, node_pa + size);
+		return;
+	}
+	memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
+	remap_va = phys_to_virt(remap_pa);
+
+	/* perform actual remap */
+	for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
+		set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
+			    (node_pa >> PAGE_SHIFT) + pfn,
+			    PAGE_KERNEL_LARGE);
+
+	/* initialize remap allocator parameters */
+	node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
+	node_remap_start_vaddr[nid] = remap_va;
+	node_remap_end_vaddr[nid] = remap_va + size;
+	node_remap_alloc_vaddr[nid] = remap_va;
+
+	printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
+	       nid, node_pa, node_pa + size, remap_va, remap_va + size);
 }
 
 void __init initmem_init(void)
 {
 	int nid;
-	long kva_target_pfn;
-
-	/*
-	 * When mapping a NUMA machine we allocate the node_mem_map arrays
-	 * from node local memory. They are then mapped directly into KVA
-	 * between zone normal and vmalloc space. Calculate the size of
-	 * this space and use it to adjust the boundary between ZONE_NORMAL
-	 * and ZONE_HIGHMEM.
-	 */
 
 	get_memcfg_numa();
 	numa_init_array();
 
-	kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
-
-	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
-	do {
-		kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
-					max_low_pfn<<PAGE_SHIFT,
-					kva_pages<<PAGE_SHIFT,
-					PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
-		kva_target_pfn -= PTRS_PER_PTE;
-	} while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
-
-	if (kva_start_pfn == MEMBLOCK_ERROR)
-		panic("Can not get kva space\n");
-
-	printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
-		kva_start_pfn, max_low_pfn);
-	printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
+	for_each_online_node(nid)
+		init_alloc_remap(nid);
 
-	/* avoid clash with initrd */
-	memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
-		 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
-		 "KVA PG");
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > max_low_pfn)
@@ -409,12 +367,8 @@ void __init initmem_init(void)
 
 	printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(max_low_pfn));
-	for_each_online_node(nid) {
-		init_remap_allocator(nid);
-
+	for_each_online_node(nid)
 		allocate_pgdat(nid);
-	}
-	remap_numa_kva();
 
 	printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
 		(ulong) pfn_to_kaddr(highstart_pfn));
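For context, a minimal hypothetical sketch of the caller pattern the new alloc_remap() kernel-doc asks for: try the node's remap area first and fall back to a regular early allocator on failure. The helper name and the bootmem fallback below are illustrative assumptions, not part of this patch:

	/* hypothetical early per-node allocation helper */
	static void * __init node_alloc_early(int nid, unsigned long size)
	{
		/* may return NULL if the node has no remap area or it is full */
		void *p = alloc_remap(nid, size);

		if (p)
			return p;

		/* fall back to a regular early allocator, as the comment requires */
		return alloc_bootmem_node(NODE_DATA(nid), size);
	}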
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 48651c6f657..1b9e82c96dc 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -276,7 +276,6 @@ int __init get_memcfg_from_srat(void)
 		unsigned long end = min(node_end_pfn[nid], max_pfn);
 
 		memory_present(nid, start, end);
-		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
 	}
 	return 1;
 out_fail:
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 8dace181c88..cf9750004a0 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -49,6 +49,10 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
 	val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
 	val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
 	val |= (counter_config->unit_mask & 0xFF) << 8;
+	counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV |
+				  ARCH_PERFMON_EVENTSEL_EDGE |
+				  ARCH_PERFMON_EVENTSEL_CMASK);
+	val |= counter_config->extra;
 	event &= model->event_mask ? model->event_mask : 0xFF;
 	val |= event & 0xFF;
 	val |= (event & 0x0F00) << 24;
@@ -440,6 +444,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
 		oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
 		oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
 		oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
+		oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra);
 	}
 
 	return 0;
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index e28398df0df..0b7b7b179cb 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -22,6 +22,7 @@ struct op_counter_config {
 	unsigned long kernel;
 	unsigned long user;
 	unsigned long unit_mask;
+	unsigned long extra;
 };
 
 extern struct op_counter_config counter_config[];
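As a hedged usage sketch (not from this patch), the new per-counter "extra" field carries optional event-select bits which op_x86_get_ctrl() above masks down to INV, EDGE and CMASK before OR-ing them into the control value; the concrete value below is only an example and would normally arrive through the new oprofilefs "extra" file:

	/* e.g. request edge-triggered counting on counter 0 */
	counter_config[0].extra = ARCH_PERFMON_EVENTSEL_EDGE;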
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index a7b38d35c29..7cb6424317f 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -11,6 +11,7 @@
 #include <linux/debugfs.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/delay.h>
 
 #include <asm/mmu_context.h>
 #include <asm/uv/uv.h>