about summary refs log tree commit diff stats
path: root/arch/x86/kernel/setup_percpu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/setup_percpu.c')
-rw-r--r-- arch/x86/kernel/setup_percpu.c 679
1 files changed, 358 insertions, 321 deletions
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 01161077a49c..400331b50a53 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,402 +7,439 @@
7#include <linux/crash_dump.h> 7#include <linux/crash_dump.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/pfn.h>
10#include <asm/sections.h> 11#include <asm/sections.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/setup.h> 13#include <asm/setup.h>
13#include <asm/mpspec.h> 14#include <asm/mpspec.h>
14#include <asm/apicdef.h> 15#include <asm/apicdef.h>
15#include <asm/highmem.h> 16#include <asm/highmem.h>
17#include <asm/proto.h>
18#include <asm/cpumask.h>
19#include <asm/cpu.h>
20#include <asm/stackprotector.h>
16 21
17#ifdef CONFIG_X86_LOCAL_APIC 22#ifdef CONFIG_DEBUG_PER_CPU_MAPS
18unsigned int num_processors; 23# define DBG(x...) printk(KERN_DEBUG x)
19unsigned disabled_cpus __cpuinitdata; 24#else
20/* Processor that is doing the boot up */ 25# define DBG(x...)
21unsigned int boot_cpu_physical_apicid = -1U;
22EXPORT_SYMBOL(boot_cpu_physical_apicid);
23unsigned int max_physical_apicid;
24
25/* Bitmask of physically existing CPUs */
26physid_mask_t phys_cpu_present_map;
27#endif 26#endif
28 27
29/* map cpu index to physical APIC ID */ 28DEFINE_PER_CPU(int, cpu_number);
30DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); 29EXPORT_PER_CPU_SYMBOL(cpu_number);
31DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
32EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
34
35#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
36#define X86_64_NUMA 1
37 30
38/* map cpu index to node index */ 31#ifdef CONFIG_X86_64
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 32#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); 33#else
34#define BOOT_PERCPU_OFFSET 0
35#endif
41 36
42/* which logical CPUs are on which nodes */ 37DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
43cpumask_t *node_to_cpumask_map; 38EXPORT_PER_CPU_SYMBOL(this_cpu_off);
44EXPORT_SYMBOL(node_to_cpumask_map);
45 39
46/* setup node_to_cpumask_map */ 40unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
47static void __init setup_node_to_cpumask_map(void); 41 [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
42};
43EXPORT_SYMBOL(__per_cpu_offset);
48 44
45/*
46 * On x86_64 symbols referenced from code should be reachable using
47 * 32bit relocations. Reserve space for static percpu variables in
48 * modules so that they are always served from the first chunk which
49 * is located at the percpu segment base. On x86_32, anything can
50 * address anywhere. No need to reserve space in the first chunk.
51 */
52#ifdef CONFIG_X86_64
53#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
49#else 54#else
50static inline void setup_node_to_cpumask_map(void) { } 55#define PERCPU_FIRST_CHUNK_RESERVE 0
51#endif 56#endif
52 57
53#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 58/**
54/* 59 * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
55 * Copy data used in early init routines from the initial arrays to the 60 *
56 * per cpu data areas. These arrays then become expendable and the 61 * If NUMA is not configured or there is only one NUMA node available,
57 * *_early_ptr's are zeroed indicating that the static arrays are gone. 62 * there is no reason to consider NUMA. This function determines
63 * whether percpu allocation should consider NUMA or not.
64 *
65 * RETURNS:
66 * true if NUMA should be considered; otherwise, false.
58 */ 67 */
59static void __init setup_per_cpu_maps(void) 68static bool __init pcpu_need_numa(void)
60{ 69{
61 int cpu; 70#ifdef CONFIG_NEED_MULTIPLE_NODES
71 pg_data_t *last = NULL;
72 unsigned int cpu;
62 73
63 for_each_possible_cpu(cpu) { 74 for_each_possible_cpu(cpu) {
64 per_cpu(x86_cpu_to_apicid, cpu) = 75 int node = early_cpu_to_node(cpu);
65 early_per_cpu_map(x86_cpu_to_apicid, cpu);
66 per_cpu(x86_bios_cpu_apicid, cpu) =
67 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
68#ifdef X86_64_NUMA
69 per_cpu(x86_cpu_to_node_map, cpu) =
70 early_per_cpu_map(x86_cpu_to_node_map, cpu);
71#endif
72 }
73 76
74 /* indicate the early static arrays will soon be gone */ 77 if (node_online(node) && NODE_DATA(node) &&
75 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 78 last && last != NODE_DATA(node))
76 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 79 return true;
77#ifdef X86_64_NUMA 80
78 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 81 last = NODE_DATA(node);
82 }
79#endif 83#endif
84 return false;
80} 85}
81 86
82#ifdef CONFIG_X86_32 87/**
83/* 88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
84 * Great future not-so-futuristic plan: make i386 and x86_64 do it 89 * @cpu: cpu to allocate for
85 * the same way 90 * @size: size of allocation in bytes
86 */ 91 * @align: alignment
87unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 92 *
88EXPORT_SYMBOL(__per_cpu_offset); 93 * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
89static inline void setup_cpu_pda_map(void) { } 94 * does the right thing for NUMA regardless of the current
90 95 * configuration.
91#elif !defined(CONFIG_SMP) 96 *
92static inline void setup_cpu_pda_map(void) { } 97 * RETURNS:
93 98 * Pointer to the allocated area on success, NULL on failure.
94#else /* CONFIG_SMP && CONFIG_X86_64 */
95
96/*
97 * Allocate cpu_pda pointer table and array via alloc_bootmem.
98 */ 99 */
99static void __init setup_cpu_pda_map(void) 100static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
101 unsigned long align)
100{ 102{
101 char *pda; 103 const unsigned long goal = __pa(MAX_DMA_ADDRESS);
102 struct x8664_pda **new_cpu_pda; 104#ifdef CONFIG_NEED_MULTIPLE_NODES
103 unsigned long size; 105 int node = early_cpu_to_node(cpu);
104 int cpu; 106 void *ptr;
105 107
106 size = roundup(sizeof(struct x8664_pda), cache_line_size()); 108 if (!node_online(node) || !NODE_DATA(node)) {
107 109 ptr = __alloc_bootmem_nopanic(size, align, goal);
108 /* allocate cpu_pda array and pointer table */ 110 pr_info("cpu %d has no node %d or node-local memory\n",
109 { 111 cpu, node);
110 unsigned long tsize = nr_cpu_ids * sizeof(void *); 112 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
111 unsigned long asize = size * (nr_cpu_ids - 1); 113 cpu, size, __pa(ptr));
112 114 } else {
113 tsize = roundup(tsize, cache_line_size()); 115 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
114 new_cpu_pda = alloc_bootmem(tsize + asize); 116 size, align, goal);
115 pda = (char *)new_cpu_pda + tsize; 117 pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
116 } 118 "%016lx\n", cpu, size, node, __pa(ptr));
117
118 /* initialize pointer table to static pda's */
119 for_each_possible_cpu(cpu) {
120 if (cpu == 0) {
121 /* leave boot cpu pda in place */
122 new_cpu_pda[0] = cpu_pda(0);
123 continue;
124 }
125 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
126 new_cpu_pda[cpu]->in_bootmem = 1;
127 pda += size;
128 } 119 }
129 120 return ptr;
130 /* point to new pointer table */ 121#else
131 _cpu_pda = new_cpu_pda; 122 return __alloc_bootmem_nopanic(size, align, goal);
123#endif
132} 124}
133 125
134#endif /* CONFIG_SMP && CONFIG_X86_64 */ 126/*
135 127 * Remap allocator
136#ifdef CONFIG_X86_64 128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and once to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES
140static size_t pcpur_size __initdata;
141static void **pcpur_ptrs __initdata;
137 142
138/* correctly size the local cpu masks */ 143static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
139static void __init setup_cpu_local_masks(void)
140{ 144{
141 alloc_bootmem_cpumask_var(&cpu_initialized_mask); 145 size_t off = (size_t)pageno << PAGE_SHIFT;
142 alloc_bootmem_cpumask_var(&cpu_callin_mask);
143 alloc_bootmem_cpumask_var(&cpu_callout_mask);
144 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
145}
146 146
147#else /* CONFIG_X86_32 */ 147 if (off >= pcpur_size)
148 return NULL;
148 149
149static inline void setup_cpu_local_masks(void) 150 return virt_to_page(pcpur_ptrs[cpu] + off);
150{
151} 151}
152 152
153#endif /* CONFIG_X86_32 */ 153static ssize_t __init setup_pcpu_remap(size_t static_size)
154
155/*
156 * Great future plan:
157 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
158 * Always point %gs to its beginning
159 */
160void __init setup_per_cpu_areas(void)
161{ 154{
162 ssize_t size, old_size; 155 static struct vm_struct vm;
163 char *ptr; 156 pg_data_t *last;
164 int cpu; 157 size_t ptrs_size, dyn_size;
165 unsigned long align = 1; 158 unsigned int cpu;
166 159 ssize_t ret;
167 /* Setup cpu_pda map */ 160
168 setup_cpu_pda_map(); 161 /*
162 * If large page isn't supported, there's no benefit in doing
163 * this. Also, on non-NUMA, embedding is better.
164 */
165 if (!cpu_has_pse || pcpu_need_numa())
166 return -EINVAL;
167
168 last = NULL;
169 for_each_possible_cpu(cpu) {
170 int node = early_cpu_to_node(cpu);
169 171
170 /* Copy section for each CPU (we discard the original) */ 172 if (node_online(node) && NODE_DATA(node) &&
171 old_size = PERCPU_ENOUGH_ROOM; 173 last && last != NODE_DATA(node))
172 align = max_t(unsigned long, PAGE_SIZE, align); 174 goto proceed;
173 size = roundup(old_size, align);
174 175
175 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 176 last = NODE_DATA(node);
176 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 177 }
178 return -EINVAL;
179
180proceed:
181 /*
182 * Currently supports only single page. Supporting multiple
183 * pages won't be too difficult if it ever becomes necessary.
184 */
185 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
186 PERCPU_DYNAMIC_RESERVE);
187 if (pcpur_size > PMD_SIZE) {
188 pr_warning("PERCPU: static data is larger than large page, "
189 "can't use large page\n");
190 return -EINVAL;
191 }
192 dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
177 193
178 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); 194 /* allocate pointer array and alloc large pages */
195 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
196 pcpur_ptrs = alloc_bootmem(ptrs_size);
179 197
180 for_each_possible_cpu(cpu) { 198 for_each_possible_cpu(cpu) {
181#ifndef CONFIG_NEED_MULTIPLE_NODES 199 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
182 ptr = __alloc_bootmem(size, align, 200 if (!pcpur_ptrs[cpu])
183 __pa(MAX_DMA_ADDRESS)); 201 goto enomem;
184#else 202
185 int node = early_cpu_to_node(cpu); 203 /*
186 if (!node_online(node) || !NODE_DATA(node)) { 204 * Only use pcpur_size bytes and give back the rest.
187 ptr = __alloc_bootmem(size, align, 205 *
188 __pa(MAX_DMA_ADDRESS)); 206 * Ingo: The 2MB up-rounding bootmem is needed to make
189 pr_info("cpu %d has no node %d or node-local memory\n", 207 * sure the partial 2MB page is still fully RAM - it's
190 cpu, node); 208 * not well-specified to have a PAT-incompatible area
191 pr_debug("per cpu data for cpu%d at %016lx\n", 209 * (unmapped RAM, device memory, etc.) in that hole.
192 cpu, __pa(ptr)); 210 */
193 } else { 211 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
194 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, 212 PMD_SIZE - pcpur_size);
195 __pa(MAX_DMA_ADDRESS)); 213
196 pr_debug("per cpu data for cpu%d on node%d at %016lx\n", 214 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
197 cpu, node, __pa(ptr));
198 }
199#endif
200 per_cpu_offset(cpu) = ptr - __per_cpu_start;
201 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
202 } 215 }
203 216
204 /* Setup percpu data maps */ 217 /* allocate address and map */
205 setup_per_cpu_maps(); 218 vm.flags = VM_ALLOC;
206 219 vm.size = num_possible_cpus() * PMD_SIZE;
207 /* Setup node to cpumask map */ 220 vm_area_register_early(&vm, PMD_SIZE);
208 setup_node_to_cpumask_map();
209
210 /* Setup cpu initialized, callin, callout masks */
211 setup_cpu_local_masks();
212}
213
214#endif
215 221
216#ifdef X86_64_NUMA 222 for_each_possible_cpu(cpu) {
223 pmd_t *pmd;
217 224
218/* 225 pmd = populate_extra_pmd((unsigned long)vm.addr
219 * Allocate node_to_cpumask_map based on number of available nodes 226 + cpu * PMD_SIZE);
220 * Requires node_possible_map to be valid. 227 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
221 * 228 PAGE_KERNEL_LARGE));
222 * Note: node_to_cpumask() is not valid until after this is done.
223 */
224static void __init setup_node_to_cpumask_map(void)
225{
226 unsigned int node, num = 0;
227 cpumask_t *map;
228
229 /* setup nr_node_ids if not done yet */
230 if (nr_node_ids == MAX_NUMNODES) {
231 for_each_node_mask(node, node_possible_map)
232 num = node;
233 nr_node_ids = num + 1;
234 } 229 }
235 230
236 /* allocate the map */ 231 /* we're ready, commit */
237 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); 232 pr_info("PERCPU: Remapped at %p with large pages, static data "
238 233 "%zu bytes\n", vm.addr, static_size);
239 pr_debug("Node to cpumask map at %p for %d nodes\n", 234
240 map, nr_node_ids); 235 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
241 236 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
242 /* node_to_cpumask() will now work */ 237 PMD_SIZE, vm.addr, NULL);
243 node_to_cpumask_map = map; 238 goto out_free_ar;
239
240enomem:
241 for_each_possible_cpu(cpu)
242 if (pcpur_ptrs[cpu])
243 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
244 ret = -ENOMEM;
245out_free_ar:
246 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
247 return ret;
244} 248}
245 249#else
246void __cpuinit numa_set_node(int cpu, int node) 250static ssize_t __init setup_pcpu_remap(size_t static_size)
247{ 251{
248 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 252 return -EINVAL;
249
250 if (cpu_pda(cpu) && node != NUMA_NO_NODE)
251 cpu_pda(cpu)->nodenumber = node;
252
253 if (cpu_to_node_map)
254 cpu_to_node_map[cpu] = node;
255
256 else if (per_cpu_offset(cpu))
257 per_cpu(x86_cpu_to_node_map, cpu) = node;
258
259 else
260 pr_debug("Setting node for non-present cpu %d\n", cpu);
261} 253}
254#endif
262 255
263void __cpuinit numa_clear_node(int cpu) 256/*
257 * Embedding allocator
258 *
259 * The first chunk is sized to just contain the static area plus
260 * module and dynamic reserves and embedded into linear physical
261 * mapping so that it can use PMD mapping without additional TLB
262 * pressure.
263 */
264static ssize_t __init setup_pcpu_embed(size_t static_size)
264{ 265{
265 numa_set_node(cpu, NUMA_NO_NODE); 266 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
267
268 /*
269 * If large page isn't supported, there's no benefit in doing
270 * this. Also, embedding allocation doesn't play well with
271 * NUMA.
272 */
273 if (!cpu_has_pse || pcpu_need_numa())
274 return -EINVAL;
275
276 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
277 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
266} 278}
267 279
268#ifndef CONFIG_DEBUG_PER_CPU_MAPS 280/*
281 * 4k page allocator
282 *
283 * This is the basic allocator. Static percpu area is allocated
284 * page-by-page and most of initialization is done by the generic
285 * setup function.
286 */
287static struct page **pcpu4k_pages __initdata;
288static int pcpu4k_nr_static_pages __initdata;
269 289
270void __cpuinit numa_add_cpu(int cpu) 290static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
271{ 291{
272 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 292 if (pageno < pcpu4k_nr_static_pages)
293 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
294 return NULL;
273} 295}
274 296
275void __cpuinit numa_remove_cpu(int cpu) 297static void __init pcpu4k_populate_pte(unsigned long addr)
276{ 298{
277 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); 299 populate_extra_pte(addr);
278} 300}
279 301
280#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 302static ssize_t __init setup_pcpu_4k(size_t static_size)
281
282/*
283 * --------- debug versions of the numa functions ---------
284 */
285static void __cpuinit numa_set_cpumask(int cpu, int enable)
286{ 303{
287 int node = cpu_to_node(cpu); 304 size_t pages_size;
288 cpumask_t *mask; 305 unsigned int cpu;
289 char buf[64]; 306 int i, j;
290 307 ssize_t ret;
291 if (node_to_cpumask_map == NULL) { 308
292 printk(KERN_ERR "node_to_cpumask_map NULL\n"); 309 pcpu4k_nr_static_pages = PFN_UP(static_size);
293 dump_stack(); 310
294 return; 311 /* unaligned allocations can't be freed, round up to page size */
295 } 312 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
296 313 * sizeof(pcpu4k_pages[0]));
297 mask = &node_to_cpumask_map[node]; 314 pcpu4k_pages = alloc_bootmem(pages_size);
298 if (enable) 315
299 cpu_set(cpu, *mask); 316 /* allocate and copy */
300 else 317 j = 0;
301 cpu_clear(cpu, *mask); 318 for_each_possible_cpu(cpu)
319 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
320 void *ptr;
321
322 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
323 if (!ptr)
324 goto enomem;
325
326 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
327 pcpu4k_pages[j++] = virt_to_page(ptr);
328 }
302 329
303 cpulist_scnprintf(buf, sizeof(buf), mask); 330 /* we're ready, commit */
304 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 331 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
305 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); 332 pcpu4k_nr_static_pages, static_size);
333
334 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
335 PERCPU_FIRST_CHUNK_RESERVE, -1,
336 -1, NULL, pcpu4k_populate_pte);
337 goto out_free_ar;
338
339enomem:
340 while (--j >= 0)
341 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
342 ret = -ENOMEM;
343out_free_ar:
344 free_bootmem(__pa(pcpu4k_pages), pages_size);
345 return ret;
306} 346}
307 347
308void __cpuinit numa_add_cpu(int cpu) 348static inline void setup_percpu_segment(int cpu)
309{ 349{
310 numa_set_cpumask(cpu, 1); 350#ifdef CONFIG_X86_32
311} 351 struct desc_struct gdt;
312
313void __cpuinit numa_remove_cpu(int cpu)
314{
315 numa_set_cpumask(cpu, 0);
316}
317 352
318int cpu_to_node(int cpu) 353 pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
319{ 354 0x2 | DESCTYPE_S, 0x8);
320 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 355 gdt.s = 1;
321 printk(KERN_WARNING 356 write_gdt_entry(get_cpu_gdt_table(cpu),
322 "cpu_to_node(%d): usage too early!\n", cpu); 357 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
323 dump_stack(); 358#endif
324 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
325 }
326 return per_cpu(x86_cpu_to_node_map, cpu);
327} 359}
328EXPORT_SYMBOL(cpu_to_node);
329 360
330/* 361/*
331 * Same function as cpu_to_node() but used if called before the 362 * Great future plan:
332 * per_cpu areas are setup. 363 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
364 * Always point %gs to its beginning
333 */ 365 */
334int early_cpu_to_node(int cpu) 366void __init setup_per_cpu_areas(void)
335{ 367{
336 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 368 size_t static_size = __per_cpu_end - __per_cpu_start;
337 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; 369 unsigned int cpu;
338 370 unsigned long delta;
339 if (!per_cpu_offset(cpu)) { 371 size_t pcpu_unit_size;
340 printk(KERN_WARNING 372 ssize_t ret;
341 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
342 dump_stack();
343 return NUMA_NO_NODE;
344 }
345 return per_cpu(x86_cpu_to_node_map, cpu);
346}
347 373
374 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
375 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
348 376
349/* empty cpumask */ 377 /*
350static const cpumask_t cpu_mask_none; 378 * Allocate percpu area. If PSE is supported, try to make use
351 379 * of large page mappings. Please read comments on top of
352/* 380 * each allocator for details.
353 * Returns a pointer to the bitmask of CPUs on Node 'node'. 381 */
354 */ 382 ret = setup_pcpu_remap(static_size);
355const cpumask_t *cpumask_of_node(int node) 383 if (ret < 0)
356{ 384 ret = setup_pcpu_embed(static_size);
357 if (node_to_cpumask_map == NULL) { 385 if (ret < 0)
358 printk(KERN_WARNING 386 ret = setup_pcpu_4k(static_size);
359 "cpumask_of_node(%d): no node_to_cpumask_map!\n", 387 if (ret < 0)
360 node); 388 panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
361 dump_stack(); 389 static_size, ret);
362 return (const cpumask_t *)&cpu_online_map; 390
363 } 391 pcpu_unit_size = ret;
364 if (node >= nr_node_ids) { 392
365 printk(KERN_WARNING 393 /* alrighty, percpu areas up and running */
366 "cpumask_of_node(%d): node > nr_node_ids(%d)\n", 394 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
367 node, nr_node_ids); 395 for_each_possible_cpu(cpu) {
368 dump_stack(); 396 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
369 return &cpu_mask_none; 397 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
370 } 398 per_cpu(cpu_number, cpu) = cpu;
371 return &node_to_cpumask_map[node]; 399 setup_percpu_segment(cpu);
372} 400 setup_stack_canary_segment(cpu);
373EXPORT_SYMBOL(cpumask_of_node); 401 /*
374 402 * Copy data used in early init routines from the
375/* 403 * initial arrays to the per cpu data areas. These
376 * Returns a bitmask of CPUs on Node 'node'. 404 * arrays then become expendable and the *_early_ptr's
377 * 405 * are zeroed indicating that the static arrays are
378 * Side note: this function creates the returned cpumask on the stack 406 * gone.
379 * so with a high NR_CPUS count, excessive stack space is used. The 407 */
380 * node_to_cpumask_ptr function should be used whenever possible. 408#ifdef CONFIG_X86_LOCAL_APIC
381 */ 409 per_cpu(x86_cpu_to_apicid, cpu) =
382cpumask_t node_to_cpumask(int node) 410 early_per_cpu_map(x86_cpu_to_apicid, cpu);
383{ 411 per_cpu(x86_bios_cpu_apicid, cpu) =
384 if (node_to_cpumask_map == NULL) { 412 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
385 printk(KERN_WARNING 413#endif
386 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); 414#ifdef CONFIG_X86_64
387 dump_stack(); 415 per_cpu(irq_stack_ptr, cpu) =
388 return cpu_online_map; 416 per_cpu(irq_stack_union.irq_stack, cpu) +
389 } 417 IRQ_STACK_SIZE - 64;
390 if (node >= nr_node_ids) { 418#ifdef CONFIG_NUMA
391 printk(KERN_WARNING 419 per_cpu(x86_cpu_to_node_map, cpu) =
392 "node_to_cpumask(%d): node > nr_node_ids(%d)\n", 420 early_per_cpu_map(x86_cpu_to_node_map, cpu);
393 node, nr_node_ids); 421#endif
394 dump_stack(); 422#endif
395 return cpu_mask_none; 423 /*
424 * Up to this point, the boot CPU has been using .data.init
425 * area. Reload any changed state for the boot CPU.
426 */
427 if (cpu == boot_cpu_id)
428 switch_to_new_gdt(cpu);
396 } 429 }
397 return node_to_cpumask_map[node];
398}
399EXPORT_SYMBOL(node_to_cpumask);
400 430
401/* 431 /* indicate the early static arrays will soon be gone */
402 * --------- end of debug versions of the numa functions --------- 432#ifdef CONFIG_X86_LOCAL_APIC
403 */ 433 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
404 434 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
405#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ 435#endif
436#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
437 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
438#endif
406 439
407#endif /* X86_64_NUMA */ 440 /* Setup node to cpumask map */
441 setup_node_to_cpumask_map();
408 442
443 /* Setup cpu initialized, callin, callout masks */
444 setup_cpu_local_masks();
445}