about summary refs log tree commit diff stats
path: root/arch
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2009-02-24 15:52:45 -0500
committer Ingo Molnar <mingo@elte.hu> 2009-02-24 15:52:45 -0500
commit 0edcf8d6926f4038443dbc24e319530177ca0353 (patch)
tree 6010af62f73d01ab673d5106f310eaf4f4228e32 /arch
parent 87b203079ed949de52f0d92aeae20e5e0116c12f (diff)
parent 40150d37be7f7949b2ec07d511244da856647d84 (diff)
Merge branch 'tj-percpu' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc into core/percpu
Conflicts: arch/x86/include/asm/pgtable.h
Diffstat (limited to 'arch')
-rw-r--r-- arch/alpha/mm/init.c 20
-rw-r--r-- arch/avr32/Kconfig 2
-rw-r--r-- arch/x86/Kconfig 5
-rw-r--r-- arch/x86/include/asm/mmzone_32.h 43
-rw-r--r-- arch/x86/include/asm/percpu.h 8
-rw-r--r-- arch/x86/include/asm/pgtable.h 2
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c 2
-rw-r--r-- arch/x86/kernel/irq_32.c 29
-rw-r--r-- arch/x86/kernel/setup_percpu.c 365
-rw-r--r-- arch/x86/mm/init_32.c 17
-rw-r--r-- arch/x86/mm/init_64.c 72
11 files changed, 458 insertions, 107 deletions
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 5d7a16eab312..91eddd8505df 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -189,9 +189,21 @@ callback_init(void * kernel_end)
189 189
190 if (alpha_using_srm) { 190 if (alpha_using_srm) {
191 static struct vm_struct console_remap_vm; 191 static struct vm_struct console_remap_vm;
192 unsigned long vaddr = VMALLOC_START; 192 unsigned long nr_pages = 0;
193 unsigned long vaddr;
193 unsigned long i, j; 194 unsigned long i, j;
194 195
196 /* calculate needed size */
197 for (i = 0; i < crb->map_entries; ++i)
198 nr_pages += crb->map[i].count;
199
200 /* register the vm area */
201 console_remap_vm.flags = VM_ALLOC;
202 console_remap_vm.size = nr_pages << PAGE_SHIFT;
203 vm_area_register_early(&console_remap_vm, PAGE_SIZE);
204
205 vaddr = (unsigned long)console_remap_vm.addr;
206
195 /* Set up the third level PTEs and update the virtual 207 /* Set up the third level PTEs and update the virtual
196 addresses of the CRB entries. */ 208 addresses of the CRB entries. */
197 for (i = 0; i < crb->map_entries; ++i) { 209 for (i = 0; i < crb->map_entries; ++i) {
@@ -213,12 +225,6 @@ callback_init(void * kernel_end)
213 vaddr += PAGE_SIZE; 225 vaddr += PAGE_SIZE;
214 } 226 }
215 } 227 }
216
217 /* Let vmalloc know that we've allocated some space. */
218 console_remap_vm.flags = VM_ALLOC;
219 console_remap_vm.addr = (void *) VMALLOC_START;
220 console_remap_vm.size = vaddr - VMALLOC_START;
221 vmlist = &console_remap_vm;
222 } 228 }
223 229
224 callback_init_done = 1; 230 callback_init_done = 1;
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b189680d18b0..05fe3053dcae 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
181config QUICKLIST 181config QUICKLIST
182 def_bool y 182 def_bool y
183 183
184config HAVE_ARCH_BOOTMEM_NODE 184config HAVE_ARCH_BOOTMEM
185 def_bool n 185 def_bool n
186 186
187config ARCH_HAVE_MEMORY_PRESENT 187config ARCH_HAVE_MEMORY_PRESENT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5e2919c0ff92..8015641478bd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
135config HAVE_SETUP_PER_CPU_AREA 135config HAVE_SETUP_PER_CPU_AREA
136 def_bool y 136 def_bool y
137 137
138config HAVE_DYNAMIC_PER_CPU_AREA
139 def_bool y
140
138config HAVE_CPUMASK_OF_CPU_MAP 141config HAVE_CPUMASK_OF_CPU_MAP
139 def_bool X86_64_SMP 142 def_bool X86_64_SMP
140 143
@@ -1122,7 +1125,7 @@ config NODES_SHIFT
1122 Specify the maximum number of NUMA Nodes available on the target 1125 Specify the maximum number of NUMA Nodes available on the target
1123 system. Increases memory reserved to accommodate various tables. 1126 system. Increases memory reserved to accommodate various tables.
1124 1127
1125config HAVE_ARCH_BOOTMEM_NODE 1128config HAVE_ARCH_BOOTMEM
1126 def_bool y 1129 def_bool y
1127 depends on X86_32 && NUMA 1130 depends on X86_32 && NUMA
1128 1131
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a0635..eeacf67de49e 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,45 +91,12 @@ static inline int pfn_valid(int pfn)
91#endif /* CONFIG_DISCONTIGMEM */ 91#endif /* CONFIG_DISCONTIGMEM */
92 92
93#ifdef CONFIG_NEED_MULTIPLE_NODES 93#ifdef CONFIG_NEED_MULTIPLE_NODES
94 94/* always use node 0 for bootmem on this numa platform */
95/* 95#define alloc_bootmem_core(__bdata, size, align, goal, limit) \
96 * Following are macros that are specific to this numa platform.
97 */
98#define reserve_bootmem(addr, size, flags) \
99 reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
100#define alloc_bootmem(x) \
101 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
102#define alloc_bootmem_nopanic(x) \
103 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
104 __pa(MAX_DMA_ADDRESS))
105#define alloc_bootmem_low(x) \
106 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
107#define alloc_bootmem_pages(x) \
108 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
109#define alloc_bootmem_pages_nopanic(x) \
110 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
111 __pa(MAX_DMA_ADDRESS))
112#define alloc_bootmem_low_pages(x) \
113 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
114#define alloc_bootmem_node(pgdat, x) \
115({ \
116 struct pglist_data __maybe_unused \
117 *__alloc_bootmem_node__pgdat = (pgdat); \
118 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
119 __pa(MAX_DMA_ADDRESS)); \
120})
121#define alloc_bootmem_pages_node(pgdat, x) \
122({ \
123 struct pglist_data __maybe_unused \
124 *__alloc_bootmem_node__pgdat = (pgdat); \
125 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
126 __pa(MAX_DMA_ADDRESS)); \
127})
128#define alloc_bootmem_low_pages_node(pgdat, x) \
129({ \ 96({ \
130 struct pglist_data __maybe_unused \ 97 bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata); \
131 *__alloc_bootmem_node__pgdat = (pgdat); \ 98 __alloc_bootmem_core(NODE_DATA(0)->bdata, \
132 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ 99 (size), (align), (goal), (limit)); \
133}) 100})
134#endif /* CONFIG_NEED_MULTIPLE_NODES */ 101#endif /* CONFIG_NEED_MULTIPLE_NODES */
135 102
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index aee103b26d01..8f1d2fbec1d4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -43,6 +43,14 @@
43#else /* ...!ASSEMBLY */ 43#else /* ...!ASSEMBLY */
44 44
45#include <linux/stringify.h> 45#include <linux/stringify.h>
46#include <asm/sections.h>
47
48#define __addr_to_pcpu_ptr(addr) \
49 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
50 + (unsigned long)__per_cpu_start)
51#define __pcpu_ptr_to_addr(ptr) \
52 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
53 - (unsigned long)__per_cpu_start)
46 54
47#ifdef CONFIG_SMP 55#ifdef CONFIG_SMP
48#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x 56#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a6669..d0812e155f1d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
288 return 1; 288 return 1;
289} 289}
290 290
291pmd_t *populate_extra_pmd(unsigned long vaddr);
292pte_t *populate_extra_pte(unsigned long vaddr);
291#endif /* __ASSEMBLY__ */ 293#endif /* __ASSEMBLY__ */
292 294
293#ifdef CONFIG_X86_32 295#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c3..22590cf688ae 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
601 if (!data) 601 if (!data)
602 return -ENOMEM; 602 return -ENOMEM;
603 603
604 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 604 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
605 per_cpu(drv_data, cpu) = data; 605 per_cpu(drv_data, cpu) = data;
606 606
607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9dc6b2b24275..3b09634a5153 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h>
19 20
20#include <asm/apic.h> 21#include <asm/apic.h>
21 22
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
55union irq_ctx { 56union irq_ctx {
56 struct thread_info tinfo; 57 struct thread_info tinfo;
57 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
58}; 59} __attribute__((aligned(PAGE_SIZE)));
59 60
60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
62 63
63static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
64static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
65 66
66static void call_on_stack(void *func, void *stack) 67static void call_on_stack(void *func, void *stack)
67{ 68{
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
81 u32 *isp, arg1, arg2; 82 u32 *isp, arg1, arg2;
82 83
83 curctx = (union irq_ctx *) current_thread_info(); 84 curctx = (union irq_ctx *) current_thread_info();
84 irqctx = hardirq_ctx[smp_processor_id()]; 85 irqctx = __get_cpu_var(hardirq_ctx);
85 86
86 /* 87 /*
87 * this is where we switch to the IRQ stack. However, if we are 88 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
125{ 126{
126 union irq_ctx *irqctx; 127 union irq_ctx *irqctx;
127 128
128 if (hardirq_ctx[cpu]) 129 if (per_cpu(hardirq_ctx, cpu))
129 return; 130 return;
130 131
131 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; 132 irqctx = &per_cpu(hardirq_stack, cpu);
132 irqctx->tinfo.task = NULL; 133 irqctx->tinfo.task = NULL;
133 irqctx->tinfo.exec_domain = NULL; 134 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 135 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
137 138
138 hardirq_ctx[cpu] = irqctx; 139 per_cpu(hardirq_ctx, cpu) = irqctx;
139 140
140 irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; 141 irqctx = &per_cpu(softirq_stack, cpu);
141 irqctx->tinfo.task = NULL; 142 irqctx->tinfo.task = NULL;
142 irqctx->tinfo.exec_domain = NULL; 143 irqctx->tinfo.exec_domain = NULL;
143 irqctx->tinfo.cpu = cpu; 144 irqctx->tinfo.cpu = cpu;
144 irqctx->tinfo.preempt_count = 0; 145 irqctx->tinfo.preempt_count = 0;
145 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
146 147
147 softirq_ctx[cpu] = irqctx; 148 per_cpu(softirq_ctx, cpu) = irqctx;
148 149
149 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 150 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
150 cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); 151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
151} 152}
152 153
153void irq_ctx_exit(int cpu) 154void irq_ctx_exit(int cpu)
154{ 155{
155 hardirq_ctx[cpu] = NULL; 156 per_cpu(hardirq_ctx, cpu) = NULL;
156} 157}
157 158
158asmlinkage void do_softirq(void) 159asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
169 170
170 if (local_softirq_pending()) { 171 if (local_softirq_pending()) {
171 curctx = current_thread_info(); 172 curctx = current_thread_info();
172 irqctx = softirq_ctx[smp_processor_id()]; 173 irqctx = __get_cpu_var(softirq_ctx);
173 irqctx->tinfo.task = curctx->task; 174 irqctx->tinfo.task = curctx->task;
174 irqctx->tinfo.previous_esp = current_stack_pointer; 175 irqctx->tinfo.previous_esp = current_stack_pointer;
175 176
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6cff730..2d946a8f78b9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
7#include <linux/crash_dump.h> 7#include <linux/crash_dump.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/pfn.h>
10#include <asm/sections.h> 11#include <asm/sections.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/setup.h> 13#include <asm/setup.h>
@@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
41}; 42};
42EXPORT_SYMBOL(__per_cpu_offset); 43EXPORT_SYMBOL(__per_cpu_offset);
43 44
45/**
46 * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
47 *
48 * If NUMA is not configured or there is only one NUMA node available,
49 * there is no reason to consider NUMA. This function determines
50 * whether percpu allocation should consider NUMA or not.
51 *
52 * RETURNS:
53 * true if NUMA should be considered; otherwise, false.
54 */
55static bool __init pcpu_need_numa(void)
56{
57#ifdef CONFIG_NEED_MULTIPLE_NODES
58 pg_data_t *last = NULL;
59 unsigned int cpu;
60
61 for_each_possible_cpu(cpu) {
62 int node = early_cpu_to_node(cpu);
63
64 if (node_online(node) && NODE_DATA(node) &&
65 last && last != NODE_DATA(node))
66 return true;
67
68 last = NODE_DATA(node);
69 }
70#endif
71 return false;
72}
73
74/**
75 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
76 * @cpu: cpu to allocate for
77 * @size: size of allocation in bytes
78 * @align: alignment
79 *
80 * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
81 * does the right thing for NUMA regardless of the current
82 * configuration.
83 *
84 * RETURNS:
85 * Pointer to the allocated area on success, NULL on failure.
86 */
87static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
88 unsigned long align)
89{
90 const unsigned long goal = __pa(MAX_DMA_ADDRESS);
91#ifdef CONFIG_NEED_MULTIPLE_NODES
92 int node = early_cpu_to_node(cpu);
93 void *ptr;
94
95 if (!node_online(node) || !NODE_DATA(node)) {
96 ptr = __alloc_bootmem_nopanic(size, align, goal);
97 pr_info("cpu %d has no node %d or node-local memory\n",
98 cpu, node);
99 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
100 cpu, size, __pa(ptr));
101 } else {
102 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
103 size, align, goal);
104 pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
105 "%016lx\n", cpu, size, node, __pa(ptr));
106 }
107 return ptr;
108#else
109 return __alloc_bootmem_nopanic(size, align, goal);
110#endif
111}
112
113/*
114 * Remap allocator
115 *
116 * This allocator uses PMD page as unit. A PMD page is allocated for
117 * each cpu and each is remapped into vmalloc area using PMD mapping.
118 * As PMD page is quite large, only part of it is used for the first
119 * chunk. Unused part is returned to the bootmem allocator.
120 *
121 * So, the PMD pages are mapped twice - once in the physical mapping
122 * and once in the vmalloc area for the first percpu chunk. The double
123 * mapping adds one more PMD TLB entry of pressure but is still much
124 * better than only using 4k mappings while still being NUMA friendly.
125 */
126#ifdef CONFIG_NEED_MULTIPLE_NODES
127static size_t pcpur_size __initdata;
128static void **pcpur_ptrs __initdata;
129
130static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
131{
132 size_t off = (size_t)pageno << PAGE_SHIFT;
133
134 if (off >= pcpur_size)
135 return NULL;
136
137 return virt_to_page(pcpur_ptrs[cpu] + off);
138}
139
140static ssize_t __init setup_pcpu_remap(size_t static_size)
141{
142 static struct vm_struct vm;
143 pg_data_t *last;
144 size_t ptrs_size;
145 unsigned int cpu;
146 ssize_t ret;
147
148 /*
149 * If large page isn't supported, there's no benefit in doing
150 * this. Also, on non-NUMA, embedding is better.
151 */
152 if (!cpu_has_pse || !pcpu_need_numa())
153 return -EINVAL;
154
155 last = NULL;
156 for_each_possible_cpu(cpu) {
157 int node = early_cpu_to_node(cpu);
158
159 if (node_online(node) && NODE_DATA(node) &&
160 last && last != NODE_DATA(node))
161 goto proceed;
162
163 last = NODE_DATA(node);
164 }
165 return -EINVAL;
166
167proceed:
168 /*
169 * Currently supports only single page. Supporting multiple
170 * pages won't be too difficult if it ever becomes necessary.
171 */
172 pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
173 if (pcpur_size > PMD_SIZE) {
174 pr_warning("PERCPU: static data is larger than large page, "
175 "can't use large page\n");
176 return -EINVAL;
177 }
178
179 /* allocate pointer array and alloc large pages */
180 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
181 pcpur_ptrs = alloc_bootmem(ptrs_size);
182
183 for_each_possible_cpu(cpu) {
184 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
185 if (!pcpur_ptrs[cpu])
186 goto enomem;
187
188 /*
189 * Only use pcpur_size bytes and give back the rest.
190 *
191 * Ingo: The 2MB up-rounding bootmem is needed to make
192 * sure the partial 2MB page is still fully RAM - it's
193 * not well-specified to have a PAT-incompatible area
194 * (unmapped RAM, device memory, etc.) in that hole.
195 */
196 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
197 PMD_SIZE - pcpur_size);
198
199 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
200 }
201
202 /* allocate address and map */
203 vm.flags = VM_ALLOC;
204 vm.size = num_possible_cpus() * PMD_SIZE;
205 vm_area_register_early(&vm, PMD_SIZE);
206
207 for_each_possible_cpu(cpu) {
208 pmd_t *pmd;
209
210 pmd = populate_extra_pmd((unsigned long)vm.addr
211 + cpu * PMD_SIZE);
212 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
213 PAGE_KERNEL_LARGE));
214 }
215
216 /* we're ready, commit */
217 pr_info("PERCPU: Remapped at %p with large pages, static data "
218 "%zu bytes\n", vm.addr, static_size);
219
220 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
221 pcpur_size - static_size, vm.addr, NULL);
222 goto out_free_ar;
223
224enomem:
225 for_each_possible_cpu(cpu)
226 if (pcpur_ptrs[cpu])
227 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
228 ret = -ENOMEM;
229out_free_ar:
230 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
231 return ret;
232}
233#else
234static ssize_t __init setup_pcpu_remap(size_t static_size)
235{
236 return -EINVAL;
237}
238#endif
239
240/*
241 * Embedding allocator
242 *
243 * The first chunk is sized to just contain the static area plus
244 * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
245 * bootmem allocator and used as-is without being mapped into vmalloc
246 * area. This enables the first chunk to piggy back on the linear
247 * physical PMD mapping and doesn't add any additional pressure to
248 * TLB.
249 */
250static void *pcpue_ptr __initdata;
251static size_t pcpue_unit_size __initdata;
252
253static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
254{
255 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size
256 + ((size_t)pageno << PAGE_SHIFT));
257}
258
259static ssize_t __init setup_pcpu_embed(size_t static_size)
260{
261 unsigned int cpu;
262
263 /*
264 * If large page isn't supported, there's no benefit in doing
265 * this. Also, embedding allocation doesn't play well with
266 * NUMA.
267 */
268 if (!cpu_has_pse || pcpu_need_numa())
269 return -EINVAL;
270
271 /* allocate and copy */
272 pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
273 pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
274 pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
275 PAGE_SIZE);
276 if (!pcpue_ptr)
277 return -ENOMEM;
278
279 for_each_possible_cpu(cpu)
280 memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
281 static_size);
282
283 /* we're ready, commit */
284 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
285 pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
286
287 return pcpu_setup_first_chunk(pcpue_get_page, static_size,
288 pcpue_unit_size,
289 pcpue_unit_size - static_size, pcpue_ptr,
290 NULL);
291}
292
293/*
294 * 4k page allocator
295 *
296 * This is the basic allocator. Static percpu area is allocated
297 * page-by-page and most of initialization is done by the generic
298 * setup function.
299 */
300static struct page **pcpu4k_pages __initdata;
301static int pcpu4k_nr_static_pages __initdata;
302
303static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
304{
305 if (pageno < pcpu4k_nr_static_pages)
306 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
307 return NULL;
308}
309
310static void __init pcpu4k_populate_pte(unsigned long addr)
311{
312 populate_extra_pte(addr);
313}
314
315static ssize_t __init setup_pcpu_4k(size_t static_size)
316{
317 size_t pages_size;
318 unsigned int cpu;
319 int i, j;
320 ssize_t ret;
321
322 pcpu4k_nr_static_pages = PFN_UP(static_size);
323
324 /* unaligned allocations can't be freed, round up to page size */
325 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
326 * sizeof(pcpu4k_pages[0]));
327 pcpu4k_pages = alloc_bootmem(pages_size);
328
329 /* allocate and copy */
330 j = 0;
331 for_each_possible_cpu(cpu)
332 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
333 void *ptr;
334
335 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
336 if (!ptr)
337 goto enomem;
338
339 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
340 pcpu4k_pages[j++] = virt_to_page(ptr);
341 }
342
343 /* we're ready, commit */
344 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
345 pcpu4k_nr_static_pages, static_size);
346
347 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
348 pcpu4k_populate_pte);
349 goto out_free_ar;
350
351enomem:
352 while (--j >= 0)
353 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
354 ret = -ENOMEM;
355out_free_ar:
356 free_bootmem(__pa(pcpu4k_pages), pages_size);
357 return ret;
358}
359
44static inline void setup_percpu_segment(int cpu) 360static inline void setup_percpu_segment(int cpu)
45{ 361{
46#ifdef CONFIG_X86_32 362#ifdef CONFIG_X86_32
@@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu)
61 */ 377 */
62void __init setup_per_cpu_areas(void) 378void __init setup_per_cpu_areas(void)
63{ 379{
64 ssize_t size; 380 size_t static_size = __per_cpu_end - __per_cpu_start;
65 char *ptr; 381 unsigned int cpu;
66 int cpu; 382 unsigned long delta;
67 383 size_t pcpu_unit_size;
68 /* Copy section for each CPU (we discard the original) */ 384 ssize_t ret;
69 size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
70 385
71 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 386 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
72 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 387 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
73 388
74 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); 389 /*
390 * Allocate percpu area. If PSE is supported, try to make use
391 * of large page mappings. Please read comments on top of
392 * each allocator for details.
393 */
394 ret = setup_pcpu_remap(static_size);
395 if (ret < 0)
396 ret = setup_pcpu_embed(static_size);
397 if (ret < 0)
398 ret = setup_pcpu_4k(static_size);
399 if (ret < 0)
400 panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
401 static_size, ret);
75 402
76 for_each_possible_cpu(cpu) { 403 pcpu_unit_size = ret;
77#ifndef CONFIG_NEED_MULTIPLE_NODES
78 ptr = alloc_bootmem_pages(size);
79#else
80 int node = early_cpu_to_node(cpu);
81 if (!node_online(node) || !NODE_DATA(node)) {
82 ptr = alloc_bootmem_pages(size);
83 pr_info("cpu %d has no node %d or node-local memory\n",
84 cpu, node);
85 pr_debug("per cpu data for cpu%d at %016lx\n",
86 cpu, __pa(ptr));
87 } else {
88 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
89 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
90 cpu, node, __pa(ptr));
91 }
92#endif
93 404
94 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); 405 /* alrighty, percpu areas up and running */
95 per_cpu_offset(cpu) = ptr - __per_cpu_start; 406 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
407 for_each_possible_cpu(cpu) {
408 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
96 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 409 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
97 per_cpu(cpu_number, cpu) = cpu; 410 per_cpu(cpu_number, cpu) = cpu;
98 setup_percpu_segment(cpu); 411 setup_percpu_segment(cpu);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 06708ee94aa4..ef0bb941cdf5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -137,6 +137,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
137 return pte_offset_kernel(pmd, 0); 137 return pte_offset_kernel(pmd, 0);
138} 138}
139 139
140pmd_t * __init populate_extra_pmd(unsigned long vaddr)
141{
142 int pgd_idx = pgd_index(vaddr);
143 int pmd_idx = pmd_index(vaddr);
144
145 return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
146}
147
148pte_t * __init populate_extra_pte(unsigned long vaddr)
149{
150 int pte_idx = pte_index(vaddr);
151 pmd_t *pmd;
152
153 pmd = populate_extra_pmd(vaddr);
154 return one_page_table_init(pmd) + pte_idx;
155}
156
140static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 157static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
141 unsigned long vaddr, pte_t *lastpte) 158 unsigned long vaddr, pte_t *lastpte)
142{ 159{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e6d36b490250..7d4e76da3368 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -168,34 +168,51 @@ static __ref void *spp_getpage(void)
168 return ptr; 168 return ptr;
169} 169}
170 170
171void 171static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
173{ 172{
174 pud_t *pud; 173 if (pgd_none(*pgd)) {
175 pmd_t *pmd; 174 pud_t *pud = (pud_t *)spp_getpage();
176 pte_t *pte; 175 pgd_populate(&init_mm, pgd, pud);
176 if (pud != pud_offset(pgd, 0))
177 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
178 pud, pud_offset(pgd, 0));
179 }
180 return pud_offset(pgd, vaddr);
181}
177 182
178 pud = pud_page + pud_index(vaddr); 183static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
184{
179 if (pud_none(*pud)) { 185 if (pud_none(*pud)) {
180 pmd = (pmd_t *) spp_getpage(); 186 pmd_t *pmd = (pmd_t *) spp_getpage();
181 pud_populate(&init_mm, pud, pmd); 187 pud_populate(&init_mm, pud, pmd);
182 if (pmd != pmd_offset(pud, 0)) { 188 if (pmd != pmd_offset(pud, 0))
183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 189 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
184 pmd, pmd_offset(pud, 0)); 190 pmd, pmd_offset(pud, 0));
185 return;
186 }
187 } 191 }
188 pmd = pmd_offset(pud, vaddr); 192 return pmd_offset(pud, vaddr);
193}
194
195static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr)
196{
189 if (pmd_none(*pmd)) { 197 if (pmd_none(*pmd)) {
190 pte = (pte_t *) spp_getpage(); 198 pte_t *pte = (pte_t *) spp_getpage();
191 pmd_populate_kernel(&init_mm, pmd, pte); 199 pmd_populate_kernel(&init_mm, pmd, pte);
192 if (pte != pte_offset_kernel(pmd, 0)) { 200 if (pte != pte_offset_kernel(pmd, 0))
193 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 201 printk(KERN_ERR "PAGETABLE BUG #02!\n");
194 return;
195 }
196 } 202 }
203 return pte_offset_kernel(pmd, vaddr);
204}
205
206void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
207{
208 pud_t *pud;
209 pmd_t *pmd;
210 pte_t *pte;
211
212 pud = pud_page + pud_index(vaddr);
213 pmd = fill_pmd(pud, vaddr);
214 pte = fill_pte(pmd, vaddr);
197 215
198 pte = pte_offset_kernel(pmd, vaddr);
199 set_pte(pte, new_pte); 216 set_pte(pte, new_pte);
200 217
201 /* 218 /*
@@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
205 __flush_tlb_one(vaddr); 222 __flush_tlb_one(vaddr);
206} 223}
207 224
208void 225void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
209set_pte_vaddr(unsigned long vaddr, pte_t pteval)
210{ 226{
211 pgd_t *pgd; 227 pgd_t *pgd;
212 pud_t *pud_page; 228 pud_t *pud_page;
@@ -223,6 +239,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
223 set_pte_vaddr_pud(pud_page, vaddr, pteval); 239 set_pte_vaddr_pud(pud_page, vaddr, pteval);
224} 240}
225 241
242pmd_t * __init populate_extra_pmd(unsigned long vaddr)
243{
244 pgd_t *pgd;
245 pud_t *pud;
246
247 pgd = pgd_offset_k(vaddr);
248 pud = fill_pud(pgd, vaddr);
249 return fill_pmd(pud, vaddr);
250}
251
252pte_t * __init populate_extra_pte(unsigned long vaddr)
253{
254 pmd_t *pmd;
255
256 pmd = populate_extra_pmd(vaddr);
257 return fill_pte(pmd, vaddr);
258}
259
226/* 260/*
227 * Create large page table mappings for a range of physical addresses. 261 * Create large page table mappings for a range of physical addresses.
228 */ 262 */