path: root/arch/x86/kernel/setup_percpu.c
author		Thomas Gleixner <tglx@linutronix.de>	2009-03-23 09:50:03 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2009-03-23 16:20:20 -0400
commit		80c5520811d3805adcb15c570ea5e2d489fa5d0b (patch)
tree		ae797a7f4af39f80e77526533d06ac23b439f0ab /arch/x86/kernel/setup_percpu.c
parent		b3e3b302cf6dc8d60b67f0e84d1fa5648889c038 (diff)
parent		8c083f081d0014057901c68a0a3e0f8ca7ac8d23 (diff)

Merge branch 'cpus4096' into irq/threaded

Conflicts:
	arch/parisc/kernel/irq.c
	kernel/irq/handle.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'arch/x86/kernel/setup_percpu.c')

 -rw-r--r--  arch/x86/kernel/setup_percpu.c | 704
 1 file changed, 392 insertions(+), 312 deletions(-)
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 01161077a49..efa615f2bf4 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,402 +7,482 @@
 #include <linux/crash_dump.h>
 #include <linux/smp.h>
 #include <linux/topology.h>
+#include <linux/pfn.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
 #include <asm/highmem.h>
+#include <asm/proto.h>
+#include <asm/cpumask.h>
+#include <asm/cpu.h>
+#include <asm/stackprotector.h>
 
-#ifdef CONFIG_X86_LOCAL_APIC
-unsigned int num_processors;
-unsigned disabled_cpus __cpuinitdata;
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-EXPORT_SYMBOL(boot_cpu_physical_apicid);
-unsigned int max_physical_apicid;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
 #endif
 
-/* map cpu index to physical APIC ID */
-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-#define X86_64_NUMA 1
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
 
-/* map cpu index to node index */
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+#ifdef CONFIG_X86_64
+#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
+#else
+#define BOOT_PERCPU_OFFSET 0
+#endif
 
-/* which logical CPUs are on which nodes */
-cpumask_t *node_to_cpumask_map;
-EXPORT_SYMBOL(node_to_cpumask_map);
+DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
 
-/* setup node_to_cpumask_map */
-static void __init setup_node_to_cpumask_map(void);
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+	[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
+};
+EXPORT_SYMBOL(__per_cpu_offset);
 
+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations.  Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base.  On x86_32, anything can
+ * address anywhere.  No need to reserve space in the first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
 #else
-static inline void setup_node_to_cpumask_map(void) { }
+#define PERCPU_FIRST_CHUNK_RESERVE	0
 #endif
 
-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
-/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas.  These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
+/**
+ * pcpu_need_numa - determine percpu allocation needs to consider NUMA
+ *
+ * If NUMA is not configured or there is only one NUMA node available,
+ * there is no reason to consider NUMA.  This function determines
+ * whether percpu allocation should consider NUMA or not.
+ *
+ * RETURNS:
+ * true if NUMA should be considered; otherwise, false.
  */
-static void __init setup_per_cpu_maps(void)
+static bool __init pcpu_need_numa(void)
 {
-	int cpu;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	pg_data_t *last = NULL;
+	unsigned int cpu;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(x86_cpu_to_apicid, cpu) =
-				early_per_cpu_map(x86_cpu_to_apicid, cpu);
-		per_cpu(x86_bios_cpu_apicid, cpu) =
-				early_per_cpu_map(x86_bios_cpu_apicid, cpu);
-#ifdef X86_64_NUMA
-		per_cpu(x86_cpu_to_node_map, cpu) =
-				early_per_cpu_map(x86_cpu_to_node_map, cpu);
-#endif
-	}
+		int node = early_cpu_to_node(cpu);
 
-	/* indicate the early static arrays will soon be gone */
-	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
-	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
-#ifdef X86_64_NUMA
-	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+		if (node_online(node) && NODE_DATA(node) &&
+		    last && last != NODE_DATA(node))
+			return true;
+
+		last = NODE_DATA(node);
+	}
 #endif
+	return false;
 }
 
-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
  */
-static void __init setup_cpu_pda_map(void)
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+					unsigned long align)
 {
-	char *pda;
-	struct x8664_pda **new_cpu_pda;
-	unsigned long size;
-	int cpu;
+	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	int node = early_cpu_to_node(cpu);
+	void *ptr;
 
-	size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
-	/* allocate cpu_pda array and pointer table */
-	{
-		unsigned long tsize = nr_cpu_ids * sizeof(void *);
-		unsigned long asize = size * (nr_cpu_ids - 1);
-
-		tsize = roundup(tsize, cache_line_size());
-		new_cpu_pda = alloc_bootmem(tsize + asize);
-		pda = (char *)new_cpu_pda + tsize;
+	if (!node_online(node) || !NODE_DATA(node)) {
+		ptr = __alloc_bootmem_nopanic(size, align, goal);
+		pr_info("cpu %d has no node %d or node-local memory\n",
+			cpu, node);
+		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+			 cpu, size, __pa(ptr));
+	} else {
+		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+						   size, align, goal);
+		pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
+			 "%016lx\n", cpu, size, node, __pa(ptr));
 	}
-
-	/* initialize pointer table to static pda's */
-	for_each_possible_cpu(cpu) {
-		if (cpu == 0) {
-			/* leave boot cpu pda in place */
-			new_cpu_pda[0] = cpu_pda(0);
-			continue;
-		}
-		new_cpu_pda[cpu] = (struct x8664_pda *)pda;
-		new_cpu_pda[cpu]->in_bootmem = 1;
-		pda += size;
-	}
-
-	/* point to new pointer table */
-	_cpu_pda = new_cpu_pda;
+	return ptr;
+#else
+	return __alloc_bootmem_nopanic(size, align, goal);
+#endif
 }
 
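[Editorial illustration, not part of the patch.] The pcpu_need_numa() check added above only reports "NUMA matters" when at least two distinct nodes back the possible CPUs. A minimal user-space sketch of that logic, with a made-up cpu_node[] table standing in for early_cpu_to_node()/NODE_DATA():

/* Illustrative sketch only; cpu_node[] and NR_TEST_CPUS are assumptions. */
#include <stdbool.h>
#include <stdio.h>

#define NR_TEST_CPUS 4

static const int cpu_node[NR_TEST_CPUS] = { 0, 0, 1, 1 };

static bool need_numa(void)
{
	int last = -1;	/* no node seen yet */
	int cpu;

	for (cpu = 0; cpu < NR_TEST_CPUS; cpu++) {
		int node = cpu_node[cpu];

		/* two different nodes back the possible CPUs */
		if (last >= 0 && node != last)
			return true;
		last = node;
	}
	return false;
}

int main(void)
{
	printf("need_numa: %s\n", need_numa() ? "true" : "false");
	return 0;
}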
-#endif /* CONFIG_SMP && CONFIG_X86_64 */
-
-#ifdef CONFIG_X86_64
+/*
+ * Remap allocator
+ *
+ * This allocator uses PMD page as unit.  A PMD page is allocated for
+ * each cpu and each is remapped into vmalloc area using PMD mapping.
+ * As PMD page is quite large, only part of it is used for the first
+ * chunk.  Unused part is returned to the bootmem allocator.
+ *
+ * So, the PMD pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk.  The double
+ * mapping does add one more PMD TLB entry pressure but still is much
+ * better than only using 4k mappings while still being NUMA friendly.
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static size_t pcpur_size __initdata;
+static void **pcpur_ptrs __initdata;
 
-/* correctly size the local cpu masks */
-static void __init setup_cpu_local_masks(void)
+static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
 {
-	alloc_bootmem_cpumask_var(&cpu_initialized_mask);
-	alloc_bootmem_cpumask_var(&cpu_callin_mask);
-	alloc_bootmem_cpumask_var(&cpu_callout_mask);
-	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
-}
+	size_t off = (size_t)pageno << PAGE_SHIFT;
 
-#else /* CONFIG_X86_32 */
+	if (off >= pcpur_size)
+		return NULL;
 
-static inline void setup_cpu_local_masks(void)
-{
+	return virt_to_page(pcpur_ptrs[cpu] + off);
 }
 
-#endif /* CONFIG_X86_32 */
-
-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
-void __init setup_per_cpu_areas(void)
+static ssize_t __init setup_pcpu_remap(size_t static_size)
 {
-	ssize_t size, old_size;
-	char *ptr;
-	int cpu;
-	unsigned long align = 1;
-
-	/* Setup cpu_pda map */
-	setup_cpu_pda_map();
+	static struct vm_struct vm;
+	pg_data_t *last;
+	size_t ptrs_size, dyn_size;
+	unsigned int cpu;
+	ssize_t ret;
+
+	/*
+	 * If large page isn't supported, there's no benefit in doing
+	 * this.  Also, on non-NUMA, embedding is better.
+	 */
+	if (!cpu_has_pse || pcpu_need_numa())
+		return -EINVAL;
+
+	last = NULL;
+	for_each_possible_cpu(cpu) {
+		int node = early_cpu_to_node(cpu);
 
-	/* Copy section for each CPU (we discard the original) */
-	old_size = PERCPU_ENOUGH_ROOM;
-	align = max_t(unsigned long, PAGE_SIZE, align);
-	size = roundup(old_size, align);
+		if (node_online(node) && NODE_DATA(node) &&
+		    last && last != NODE_DATA(node))
+			goto proceed;
 
-	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
-		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+		last = NODE_DATA(node);
+	}
+	return -EINVAL;
+
+proceed:
+	/*
+	 * Currently supports only single page.  Supporting multiple
+	 * pages won't be too difficult if it ever becomes necessary.
+	 */
+	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
+	if (pcpur_size > PMD_SIZE) {
+		pr_warning("PERCPU: static data is larger than large page, "
+			   "can't use large page\n");
+		return -EINVAL;
+	}
+	dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 
-	pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+	/* allocate pointer array and alloc large pages */
+	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
+	pcpur_ptrs = alloc_bootmem(ptrs_size);
 
 	for_each_possible_cpu(cpu) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = __alloc_bootmem(size, align,
-					 __pa(MAX_DMA_ADDRESS));
-#else
-		int node = early_cpu_to_node(cpu);
-		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = __alloc_bootmem(size, align,
-					 __pa(MAX_DMA_ADDRESS));
-			pr_info("cpu %d has no node %d or node-local memory\n",
-				cpu, node);
-			pr_debug("per cpu data for cpu%d at %016lx\n",
-				 cpu, __pa(ptr));
-		} else {
-			ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
-							__pa(MAX_DMA_ADDRESS));
-			pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
-				cpu, node, __pa(ptr));
-		}
-#endif
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
+		if (!pcpur_ptrs[cpu])
+			goto enomem;
+
+		/*
+		 * Only use pcpur_size bytes and give back the rest.
+		 *
+		 * Ingo: The 2MB up-rounding bootmem is needed to make
+		 * sure the partial 2MB page is still fully RAM - it's
+		 * not well-specified to have a PAT-incompatible area
+		 * (unmapped RAM, device memory, etc.) in that hole.
+		 */
+		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+			     PMD_SIZE - pcpur_size);
+
+		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
 	}
 
-	/* Setup percpu data maps */
-	setup_per_cpu_maps();
+	/* allocate address and map */
+	vm.flags = VM_ALLOC;
+	vm.size = num_possible_cpus() * PMD_SIZE;
+	vm_area_register_early(&vm, PMD_SIZE);
 
-	/* Setup node to cpumask map */
-	setup_node_to_cpumask_map();
+	for_each_possible_cpu(cpu) {
+		pmd_t *pmd;
 
-	/* Setup cpu initialized, callin, callout masks */
-	setup_cpu_local_masks();
-}
+		pmd = populate_extra_pmd((unsigned long)vm.addr
+					 + cpu * PMD_SIZE);
+		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
+				     PAGE_KERNEL_LARGE));
+	}
 
+	/* we're ready, commit */
+	pr_info("PERCPU: Remapped at %p with large pages, static data "
+		"%zu bytes\n", vm.addr, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE,
+				     PMD_SIZE, dyn_size, vm.addr, NULL);
+	goto out_free_ar;
+
+enomem:
+	for_each_possible_cpu(cpu)
+		if (pcpur_ptrs[cpu])
+			free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
+	ret = -ENOMEM;
+out_free_ar:
+	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+	return ret;
+}
+#else
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+	return -EINVAL;
+}
 #endif
 
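[Editorial illustration, not part of the patch.] The remap allocator above sizes the used part of each per-cpu PMD page as PFN_ALIGN(static + module reserve + dynamic reserve) and gives the rest back to bootmem. A standalone sketch of that arithmetic, using assumed example values for the constants (the kernel uses PAGE_SIZE, PMD_SIZE and the PERCPU_*_RESERVE definitions):

/* Illustrative sketch only; all EX_* values and static_size are assumptions. */
#include <stdio.h>

#define EX_PAGE_SIZE		4096UL
#define EX_PMD_SIZE		(2UL << 20)	/* 2MB large page, assumed */
#define EX_MODULE_RESERVE	(8UL << 10)	/* assumed */
#define EX_DYNAMIC_RESERVE	(20UL << 10)	/* assumed */

static unsigned long pfn_align(unsigned long x)
{
	return (x + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long static_size = 300UL << 10;	/* assumed static percpu size */
	unsigned long pcpur_size, dyn_size, leftover;

	pcpur_size = pfn_align(static_size + EX_MODULE_RESERVE +
			       EX_DYNAMIC_RESERVE);
	if (pcpur_size > EX_PMD_SIZE) {
		fprintf(stderr, "static data larger than large page\n");
		return 1;
	}
	/* first-chunk reserve equals the module reserve on x86_64 */
	dyn_size = pcpur_size - static_size - EX_MODULE_RESERVE;
	/* unused tail of each per-cpu PMD page goes back to bootmem */
	leftover = EX_PMD_SIZE - pcpur_size;

	printf("pcpur_size=%lu dyn_size=%lu leftover=%lu\n",
	       pcpur_size, dyn_size, leftover);
	return 0;
}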
-#ifdef X86_64_NUMA
-
 /*
- * Allocate node_to_cpumask_map based on number of available nodes
- * Requires node_possible_map to be valid.
+ * Embedding allocator
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * The first chunk is sized to just contain the static area plus
+ * module and dynamic reserves, and allocated as a contiguous area
+ * using bootmem allocator and used as-is without being mapped into
+ * vmalloc area.  This enables the first chunk to piggy back on the
+ * linear physical PMD mapping and doesn't add any additional pressure
+ * to TLB.  Note that if the needed size is smaller than the minimum
+ * unit size, the leftover is returned to the bootmem allocator.
  */
-static void __init setup_node_to_cpumask_map(void)
-{
-	unsigned int node, num = 0;
-	cpumask_t *map;
-
-	/* setup nr_node_ids if not done yet */
-	if (nr_node_ids == MAX_NUMNODES) {
-		for_each_node_mask(node, node_possible_map)
-			num = node;
-		nr_node_ids = num + 1;
-	}
+static void *pcpue_ptr __initdata;
+static size_t pcpue_size __initdata;
+static size_t pcpue_unit_size __initdata;
 
-	/* allocate the map */
-	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
+{
+	size_t off = (size_t)pageno << PAGE_SHIFT;
 
-	pr_debug("Node to cpumask map at %p for %d nodes\n",
-		 map, nr_node_ids);
+	if (off >= pcpue_size)
+		return NULL;
 
-	/* node_to_cpumask() will now work */
-	node_to_cpumask_map = map;
+	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
 }
 
-void __cpuinit numa_set_node(int cpu, int node)
-{
-	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
-
-	if (cpu_pda(cpu) && node != NUMA_NO_NODE)
-		cpu_pda(cpu)->nodenumber = node;
+static ssize_t __init setup_pcpu_embed(size_t static_size)
+{
+	unsigned int cpu;
+	size_t dyn_size;
+
+	/*
+	 * If large page isn't supported, there's no benefit in doing
+	 * this.  Also, embedding allocation doesn't play well with
+	 * NUMA.
+	 */
+	if (!cpu_has_pse || pcpu_need_numa())
+		return -EINVAL;
+
+	/* allocate and copy */
+	pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
+	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
+	dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
+
+	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
+				       PAGE_SIZE);
+	if (!pcpue_ptr)
+		return -ENOMEM;
 
-	if (cpu_to_node_map)
-		cpu_to_node_map[cpu] = node;
+	for_each_possible_cpu(cpu) {
+		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
 
-	else if (per_cpu_offset(cpu))
-		per_cpu(x86_cpu_to_node_map, cpu) = node;
+		free_bootmem(__pa(ptr + pcpue_size),
+			     pcpue_unit_size - pcpue_size);
+		memcpy(ptr, __per_cpu_load, static_size);
+	}
 
-	else
-		pr_debug("Setting node for non-present cpu %d\n", cpu);
-}
+	/* we're ready, commit */
+	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
+		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 
-void __cpuinit numa_clear_node(int cpu)
-{
-	numa_set_node(cpu, NUMA_NO_NODE);
+	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+				      PERCPU_FIRST_CHUNK_RESERVE,
+				      pcpue_unit_size, dyn_size,
+				      pcpue_ptr, NULL);
 }
 
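[Editorial illustration, not part of the patch.] The embedding allocator above lays all CPUs out in one contiguous bootmem allocation, one unit per CPU, and copies the static section to the start of each unit. A minimal user-space sketch of that layout, with malloc() standing in for bootmem and assumed unit/CPU counts:

/* Illustrative sketch only; NR_TEST_CPUS, UNIT_SIZE and static_area are assumptions. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_TEST_CPUS	4
#define UNIT_SIZE	(64UL << 10)	/* assumed unit size */

int main(void)
{
	static const char static_area[] = "per-cpu static data";	/* stand-in */
	char *base = malloc(NR_TEST_CPUS * UNIT_SIZE);
	unsigned int cpu;

	if (!base)
		return 1;

	for (cpu = 0; cpu < NR_TEST_CPUS; cpu++) {
		char *unit = base + cpu * UNIT_SIZE;

		/* each unit starts with its own copy of the static section */
		memcpy(unit, static_area, sizeof(static_area));
		printf("cpu%u unit at offset %lu\n", cpu,
		       (unsigned long)(unit - base));
	}
	free(base);
	return 0;
}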
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * 4k page allocator
+ *
+ * This is the basic allocator.  Static percpu area is allocated
+ * page-by-page and most of initialization is done by the generic
+ * setup function.
+ */
+static struct page **pcpu4k_pages __initdata;
+static int pcpu4k_nr_static_pages __initdata;
 
-void __cpuinit numa_add_cpu(int cpu)
+static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
 {
-	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+	if (pageno < pcpu4k_nr_static_pages)
+		return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
+	return NULL;
 }
 
-void __cpuinit numa_remove_cpu(int cpu)
+static void __init pcpu4k_populate_pte(unsigned long addr)
 {
-	cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
+	populate_extra_pte(addr);
 }
 
-#else /* CONFIG_DEBUG_PER_CPU_MAPS */
-
-/*
- * --------- debug versions of the numa functions ---------
- */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
+static ssize_t __init setup_pcpu_4k(size_t static_size)
 {
-	int node = cpu_to_node(cpu);
-	cpumask_t *mask;
-	char buf[64];
-
-	if (node_to_cpumask_map == NULL) {
-		printk(KERN_ERR "node_to_cpumask_map NULL\n");
-		dump_stack();
-		return;
-	}
-
-	mask = &node_to_cpumask_map[node];
-	if (enable)
-		cpu_set(cpu, *mask);
-	else
-		cpu_clear(cpu, *mask);
-
-	cpulist_scnprintf(buf, sizeof(buf), mask);
-	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
-}
+	size_t pages_size;
+	unsigned int cpu;
+	int i, j;
+	ssize_t ret;
+
+	pcpu4k_nr_static_pages = PFN_UP(static_size);
+
+	/* unaligned allocations can't be freed, round up to page size */
+	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+			       * sizeof(pcpu4k_pages[0]));
+	pcpu4k_pages = alloc_bootmem(pages_size);
+
+	/* allocate and copy */
+	j = 0;
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < pcpu4k_nr_static_pages; i++) {
+			void *ptr;
+
+			ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
+			if (!ptr)
+				goto enomem;
+
+			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+			pcpu4k_pages[j++] = virt_to_page(ptr);
+		}
 
-void __cpuinit numa_add_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, 1);
+	/* we're ready, commit */
+	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
+		pcpu4k_nr_static_pages, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL,
+				     pcpu4k_populate_pte);
+	goto out_free_ar;
+
+enomem:
+	while (--j >= 0)
+		free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
+	ret = -ENOMEM;
+out_free_ar:
+	free_bootmem(__pa(pcpu4k_pages), pages_size);
+	return ret;
 }
 
-void __cpuinit numa_remove_cpu(int cpu)
+static inline void setup_percpu_segment(int cpu)
 {
-	numa_set_cpumask(cpu, 0);
-}
+#ifdef CONFIG_X86_32
+	struct desc_struct gdt;
 
-int cpu_to_node(int cpu)
-{
-	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
-		printk(KERN_WARNING
-			"cpu_to_node(%d): usage too early!\n", cpu);
-		dump_stack();
-		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-	}
-	return per_cpu(x86_cpu_to_node_map, cpu);
+	pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
+			0x2 | DESCTYPE_S, 0x8);
+	gdt.s = 1;
+	write_gdt_entry(get_cpu_gdt_table(cpu),
+			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
+#endif
 }
-EXPORT_SYMBOL(cpu_to_node);
 
 /*
- * Same function as cpu_to_node() but used if called before the
- * per_cpu areas are setup.
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
  */
-int early_cpu_to_node(int cpu)
+void __init setup_per_cpu_areas(void)
 {
-	if (early_per_cpu_ptr(x86_cpu_to_node_map))
-		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
-	if (!per_cpu_offset(cpu)) {
-		printk(KERN_WARNING
-			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
-		dump_stack();
-		return NUMA_NO_NODE;
-	}
-	return per_cpu(x86_cpu_to_node_map, cpu);
-}
-
-
-/* empty cpumask */
-static const cpumask_t cpu_mask_none;
+	size_t static_size = __per_cpu_end - __per_cpu_start;
+	unsigned int cpu;
+	unsigned long delta;
+	size_t pcpu_unit_size;
+	ssize_t ret;
 
-/*
- * Returns a pointer to the bitmask of CPUs on Node 'node'.
- */
-const cpumask_t *cpumask_of_node(int node)
-{
-	if (node_to_cpumask_map == NULL) {
-		printk(KERN_WARNING
-			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
-			node);
-		dump_stack();
-		return (const cpumask_t *)&cpu_online_map;
-	}
-	if (node >= nr_node_ids) {
-		printk(KERN_WARNING
-			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
-			node, nr_node_ids);
-		dump_stack();
-		return &cpu_mask_none;
-	}
-	return &node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(cpumask_of_node);
+	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
-/*
- * Returns a bitmask of CPUs on Node 'node'.
- *
- * Side note: this function creates the returned cpumask on the stack
- * so with a high NR_CPUS count, excessive stack space is used.  The
- * node_to_cpumask_ptr function should be used whenever possible.
- */
-cpumask_t node_to_cpumask(int node)
-{
-	if (node_to_cpumask_map == NULL) {
-		printk(KERN_WARNING
-			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
-		dump_stack();
-		return cpu_online_map;
-	}
-	if (node >= nr_node_ids) {
-		printk(KERN_WARNING
-			"node_to_cpumask(%d): node > nr_node_ids(%d)\n",
-			node, nr_node_ids);
-		dump_stack();
-		return cpu_mask_none;
+	/*
+	 * Allocate percpu area.  If PSE is supported, try to make use
+	 * of large page mappings.  Please read comments on top of
+	 * each allocator for details.
+	 */
+	ret = setup_pcpu_remap(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_embed(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_4k(static_size);
+	if (ret < 0)
+		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
+		      static_size, ret);
+
+	pcpu_unit_size = ret;
+
+	/* alrighty, percpu areas up and running */
+	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu) {
+		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
+		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
+		per_cpu(cpu_number, cpu) = cpu;
+		setup_percpu_segment(cpu);
+		setup_stack_canary_segment(cpu);
+		/*
+		 * Copy data used in early init routines from the
+		 * initial arrays to the per cpu data areas.  These
+		 * arrays then become expendable and the *_early_ptr's
+		 * are zeroed indicating that the static arrays are
+		 * gone.
+		 */
+#ifdef CONFIG_X86_LOCAL_APIC
+		per_cpu(x86_cpu_to_apicid, cpu) =
+			early_per_cpu_map(x86_cpu_to_apicid, cpu);
+		per_cpu(x86_bios_cpu_apicid, cpu) =
+			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+#endif
+#ifdef CONFIG_X86_64
+		per_cpu(irq_stack_ptr, cpu) =
+			per_cpu(irq_stack_union.irq_stack, cpu) +
+			IRQ_STACK_SIZE - 64;
+#ifdef CONFIG_NUMA
+		per_cpu(x86_cpu_to_node_map, cpu) =
+			early_per_cpu_map(x86_cpu_to_node_map, cpu);
+#endif
+#endif
+		/*
+		 * Up to this point, the boot CPU has been using .data.init
+		 * area.  Reload any changed state for the boot CPU.
+		 */
+		if (cpu == boot_cpu_id)
+			switch_to_new_gdt(cpu);
 	}
 
-	return node_to_cpumask_map[node];
-}
-EXPORT_SYMBOL(node_to_cpumask);
-
-/*
- * --------- end of debug versions of the numa functions ---------
- */
-
-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+	/* indicate the early static arrays will soon be gone */
+#ifdef CONFIG_X86_LOCAL_APIC
+	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+#endif
 
-#endif /* X86_64_NUMA */
+	/* Setup node to cpumask map */
+	setup_node_to_cpumask_map();
 
+	/* Setup cpu initialized, callin, callout masks */
+	setup_cpu_local_masks();
+}
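[Editorial illustration, not part of the patch.] The rewritten setup_per_cpu_areas() tries the three first-chunk allocators in order and treats a negative return as "try the next one", with the non-negative result being the unit size. A minimal user-space sketch of that fallback pattern, with stand-in stub functions and an assumed static size:

/* Illustrative sketch only; the setup_* stubs and static_size are assumptions. */
#include <stdio.h>
#include <sys/types.h>

static ssize_t setup_remap(size_t static_size) { return -1; }	/* e.g. no PSE */
static ssize_t setup_embed(size_t static_size) { return -1; }	/* e.g. NUMA */
static ssize_t setup_4k(size_t static_size)
{
	return (ssize_t)((static_size + 4095) & ~4095UL);	/* page-aligned */
}

int main(void)
{
	size_t static_size = 300UL << 10;	/* assumed static percpu size */
	ssize_t ret;

	ret = setup_remap(static_size);
	if (ret < 0)
		ret = setup_embed(static_size);
	if (ret < 0)
		ret = setup_4k(static_size);
	if (ret < 0) {
		fprintf(stderr, "cannot allocate static percpu area\n");
		return 1;
	}
	printf("unit size: %zd\n", ret);
	return 0;
}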