author		Tejun Heo <tj@kernel.org>	2009-01-13 06:41:35 -0500
committer	Ingo Molnar <mingo@elte.hu>	2009-01-16 08:19:46 -0500
commit		1a51e3a0aed18767cf2762e95456ecfeb0bca5e6 (patch)
tree		2d930218ef1072a59f7dac0f97bb03aa02796c8c
parent		c8f3329a0ddd751241e96b4100df7eda14b2cbc6 (diff)
x86: fold pda into percpu area on SMP
[ Based on original patch from Christoph Lameter and Mike Travis. ]

Currently pdas and percpu areas are allocated separately.  %gs points
to the local pda, and the percpu area can be reached using
pda->data_offset.  This patch folds the pda into the percpu area.

Due to a strange gcc requirement, the pda needs to be at the beginning
of the percpu area so that pda->stack_canary is at %gs:40.  To achieve
this, a new percpu output section macro - PERCPU_VADDR_PREALLOC() - is
added and used to reserve a pda-sized chunk at the start of the percpu
area.

After this change, for the boot cpu, %gs first points to the pda in
the data.init area and later, during setup_per_cpu_areas(), gets
updated to point to the actual pda.  This means that
setup_per_cpu_areas() needs to reload %gs for CPU0 while clearing the
pda area for the other cpus, as cpu0 has already modified its pda by
the time control reaches setup_per_cpu_areas().

This patch also removes the now-unnecessary get_local_pda() and its
call sites.

A lot of this patch is taken from Mike Travis' "x86_64: Fold pda into
per cpu area" patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
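To see why the pda must sit at the very start of the percpu area, recall that
gcc's stack protector hard-codes loads of the canary from the fixed address
%gs:40, so whatever %gs points at must keep the canary at byte offset 40.  The
following standalone sketch (not part of the patch; the struct abridges the
real x8664_pda of this era to its leading fields) checks that layout
constraint:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Abridged model of struct x8664_pda; only the fields up to the canary
 * are shown, purely to illustrate the layout constraint. */
struct x8664_pda_model {
	void *pcurrent;			/*  0: current task */
	unsigned long data_offset;	/*  8: percpu offset from linker addr */
	unsigned long kernelstack;	/* 16: top of kernel stack */
	unsigned long oldrsp;		/* 24: user rsp for system call */
	int irqcount;			/* 32: irq nesting counter */
	unsigned int cpunumber;		/* 36: logical CPU number */
	unsigned long stack_canary;	/* 40: gcc reads this at %gs:40 */
};

int main(void)
{
	/* Folding the pda into the percpu area only works if the pda sits
	 * at the very head of that area -- which is exactly the room that
	 * PERCPU_VADDR_PREALLOC() reserves. */
	assert(offsetof(struct x8664_pda_model, stack_canary) == 40);
	printf("stack_canary offset: %zu\n",
	       offsetof(struct x8664_pda_model, stack_canary));
	return 0;
}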
-rw-r--r--  arch/x86/include/asm/percpu.h     |   8
-rw-r--r--  arch/x86/include/asm/smp.h        |   2
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c  |   1
-rw-r--r--  arch/x86/kernel/cpu/common.c      |   6
-rw-r--r--  arch/x86/kernel/head64.c          |   8
-rw-r--r--  arch/x86/kernel/head_64.S         |  15
-rw-r--r--  arch/x86/kernel/setup_percpu.c    | 107
-rw-r--r--  arch/x86/kernel/smpboot.c         |  60
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S  |   6
-rw-r--r--  arch/x86/xen/smp.c                |  10
-rw-r--r--  include/asm-generic/vmlinux.lds.h |  25
11 files changed, 104 insertions(+), 144 deletions(-)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index df644f3e53e6..0ed77cf33f76 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -1,6 +1,14 @@
 #ifndef _ASM_X86_PERCPU_H
 #define _ASM_X86_PERCPU_H
 
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_X86_64
+extern void load_pda_offset(int cpu);
+#else
+static inline void load_pda_offset(int cpu) { }
+#endif
+#endif
+
 #ifdef CONFIG_X86_64
 #include <linux/compiler.h>
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index a8cea7b09434..127415402ea1 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -19,8 +19,6 @@
 #include <asm/thread_info.h>
 #include <asm/cpumask.h>
 
-extern int __cpuinit get_local_pda(int cpu);
-
 extern int smp_num_siblings;
 extern unsigned int num_processors;
 
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f1edbc..f8d1b047ef4f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -56,6 +56,7 @@ int main(void)
 	ENTRY(cpunumber);
 	ENTRY(irqstackptr);
 	ENTRY(data_offset);
+	DEFINE(pda_size, sizeof(struct x8664_pda));
 	BLANK();
 #undef ENTRY
 #ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c116c599326e..7041acdf5579 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -893,10 +893,8 @@ void __cpuinit pda_init(int cpu)
 	/* Setup up data that may be needed in __get_free_pages early */
 	loadsegment(fs, 0);
 	loadsegment(gs, 0);
-	/* Memory clobbers used to order PDA accessed */
-	mb();
-	wrmsrl(MSR_GS_BASE, pda);
-	mb();
+
+	load_pda_offset(cpu);
 
 	pda->cpunumber = cpu;
 	pda->irqcount = -1;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 462d0beccb6b..1a311293f733 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,12 +26,18 @@
 #include <asm/bios_ebda.h>
 #include <asm/trampoline.h>
 
-/* boot cpu pda, referenced by head_64.S to initialize %gs for boot CPU */
+#ifndef CONFIG_SMP
+/* boot cpu pda, referenced by head_64.S to initialize %gs on UP */
 struct x8664_pda _boot_cpu_pda;
+#endif
 
 void __init x86_64_init_pda(void)
 {
+#ifdef CONFIG_SMP
+	cpu_pda(0) = (void *)__per_cpu_load;
+#else
 	cpu_pda(0) = &_boot_cpu_pda;
+#endif
 	cpu_pda(0)->data_offset =
 		(unsigned long)(__per_cpu_load - __per_cpu_start);
 	pda_init(0);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2f0ab0089883..7a995d0e9f78 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -245,10 +245,13 @@ ENTRY(secondary_startup_64)
 
 	/* Set up %gs.
 	 *
-	 * %gs should point to the pda.  For initial boot, make %gs point
-	 * to the _boot_cpu_pda in data section.  For a secondary CPU,
-	 * initial_gs should be set to its pda address before the CPU runs
-	 * this code.
+	 * On SMP, %gs should point to the per-cpu area.  For initial
+	 * boot, make %gs point to the init data section.  For a
+	 * secondary CPU, initial_gs should be set to its pda address
+	 * before the CPU runs this code.
+	 *
+	 * On UP, initial_gs points to _boot_cpu_pda and doesn't
+	 * change.
 	 */
 	movl	$MSR_GS_BASE,%ecx
 	movq	initial_gs(%rip),%rax
@@ -278,7 +281,11 @@ ENTRY(secondary_startup_64)
 	ENTRY(initial_code)
 	.quad	x86_64_start_kernel
 	ENTRY(initial_gs)
+#ifdef CONFIG_SMP
+	.quad	__per_cpu_load
+#else
 	.quad	_boot_cpu_pda
+#endif
 	__FINITDATA
 
 	ENTRY(stack_start)
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 73ab01b297c5..63d462802272 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -13,6 +13,7 @@
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
 #include <asm/highmem.h>
+#include <asm/proto.h>
 #include <asm/cpumask.h>
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
@@ -65,6 +66,36 @@ static void __init setup_node_to_cpumask_map(void);
 static inline void setup_node_to_cpumask_map(void) { }
 #endif
 
+#ifdef CONFIG_X86_64
+void __cpuinit load_pda_offset(int cpu)
+{
+	/* Memory clobbers used to order pda/percpu accesses */
+	mb();
+	wrmsrl(MSR_GS_BASE, cpu_pda(cpu));
+	mb();
+}
+
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_X86_64
+
+/* correctly size the local cpu masks */
+static void setup_cpu_local_masks(void)
+{
+	alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+	alloc_bootmem_cpumask_var(&cpu_callin_mask);
+	alloc_bootmem_cpumask_var(&cpu_callout_mask);
+	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
+#else /* CONFIG_X86_32 */
+
+static inline void setup_cpu_local_masks(void)
+{
+}
+
+#endif /* CONFIG_X86_32 */
+
 #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
 /*
  * Copy data used in early init routines from the initial arrays to the
@@ -101,63 +132,7 @@ static void __init setup_per_cpu_maps(void)
  */
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
-	char *pda;
-	unsigned long size;
-	int cpu;
-
-	size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
-	/* allocate cpu_pda array and pointer table */
-	{
-		unsigned long asize = size * (nr_cpu_ids - 1);
-
-		pda = alloc_bootmem(asize);
-	}
-
-	/* initialize pointer table to static pda's */
-	for_each_possible_cpu(cpu) {
-		if (cpu == 0) {
-			/* leave boot cpu pda in place */
-			continue;
-		}
-		cpu_pda(cpu) = (struct x8664_pda *)pda;
-		cpu_pda(cpu)->in_bootmem = 1;
-		pda += size;
-	}
-}
-
-#endif /* CONFIG_SMP && CONFIG_X86_64 */
-
-#ifdef CONFIG_X86_64
-
-/* correctly size the local cpu masks */
-static void setup_cpu_local_masks(void)
-{
-	alloc_bootmem_cpumask_var(&cpu_initialized_mask);
-	alloc_bootmem_cpumask_var(&cpu_callin_mask);
-	alloc_bootmem_cpumask_var(&cpu_callout_mask);
-	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
-}
-
-#else /* CONFIG_X86_32 */
-
-static inline void setup_cpu_local_masks(void)
-{
-}
-
-#endif /* CONFIG_X86_32 */
-
+#endif
 
 /*
  * Great future plan:
@@ -171,9 +146,6 @@ void __init setup_per_cpu_areas(void)
 	int cpu;
 	unsigned long align = 1;
 
-	/* Setup cpu_pda map */
-	setup_cpu_pda_map();
-
 	/* Copy section for each CPU (we discard the original) */
 	old_size = PERCPU_ENOUGH_ROOM;
 	align = max_t(unsigned long, PAGE_SIZE, align);
@@ -204,8 +176,21 @@ void __init setup_per_cpu_areas(void)
 				cpu, node, __pa(ptr));
 		}
 #endif
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
+
 		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
+#ifdef CONFIG_X86_64
+		cpu_pda(cpu) = (void *)ptr;
+
+		/*
+		 * CPU0 modified pda in the init data area, reload pda
+		 * offset for CPU0 and clear the area for others.
+		 */
+		if (cpu == 0)
+			load_pda_offset(0);
+		else
+			memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu)));
+#endif
+		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 
 		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 	}
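The setup_per_cpu_areas() hunk above is the heart of the change: each possible
cpu now gets one contiguous block whose head doubles as its pda, and CPU0 is
handled asymmetrically because it has been running on the init-area pda.  A
standalone model of that loop (not kernel code; NR_CPUS, PDA_SIZE, PCPU_SIZE,
init_image and cpu_pda_ptr are invented stand-ins for the kernel's symbols):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS   4	/* stand-in for nr possible cpus */
#define PDA_SIZE  128	/* stand-in for sizeof(struct x8664_pda) */
#define PCPU_SIZE 4096	/* stand-in for PERCPU_ENOUGH_ROOM */

static char init_image[PCPU_SIZE];	/* plays the role of __per_cpu_load */
static char *cpu_pda_ptr[NR_CPUS];	/* plays the role of cpu_pda() */
static long per_cpu_offset[NR_CPUS];

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		char *ptr = malloc(PCPU_SIZE);

		if (!ptr)
			return 1;
		/* the memcpy() in the hunk: copy the percpu init image */
		memcpy(ptr, init_image, PCPU_SIZE);
		/* the fold: the pda is simply the head of the block */
		cpu_pda_ptr[cpu] = ptr;

		/*
		 * CPU0 has already dirtied the init-area pda, so the kernel
		 * repoints %gs (load_pda_offset(0)) instead of wiping the
		 * freshly copied pda; every other cpu's pda is cleared
		 * before that cpu is brought up.
		 */
		if (cpu == 0) {
			/* kernel: load_pda_offset(0) repoints %gs here */
		} else {
			memset(cpu_pda_ptr[cpu], 0, PDA_SIZE);
		}

		/* kernel: per_cpu_offset(cpu) = ptr - __per_cpu_start */
		per_cpu_offset[cpu] = (long)((intptr_t)ptr -
					     (intptr_t)init_image);
		printf("cpu%d: block %p, pda %p, offset %+ld\n", cpu,
		       (void *)ptr, (void *)cpu_pda_ptr[cpu],
		       per_cpu_offset[cpu]);
	}
	return 0;
}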
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 70d846628bbf..f2f77ca494d4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -744,52 +744,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 	complete(&c_idle->done);
 }
 
-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
-	if (!after_bootmem)
-		free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
-	struct x8664_pda *oldpda, *newpda;
-	unsigned long size = sizeof(struct x8664_pda);
-	int node = cpu_to_node(cpu);
-
-	if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
-		return 0;
-
-	oldpda = cpu_pda(cpu);
-	newpda = kmalloc_node(size, GFP_ATOMIC, node);
-	if (!newpda) {
-		printk(KERN_ERR "Could not allocate node local PDA "
-			"for CPU %d on node %d\n", cpu, node);
-
-		if (oldpda)
-			return 0;	/* have a usable pda */
-		else
-			return -1;
-	}
-
-	if (oldpda) {
-		memcpy(newpda, oldpda, size);
-		free_bootmem_pda(oldpda);
-	}
-
-	newpda->in_bootmem = 0;
-	cpu_pda(cpu) = newpda;
-	return 0;
-}
-#endif /* CONFIG_X86_64 */
-
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -807,16 +761,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	};
 	INIT_WORK(&c_idle.work, do_fork_idle);
 
-#ifdef CONFIG_X86_64
-	/* Allocate node local memory for AP pdas */
-	if (cpu > 0) {
-		boot_error = get_local_pda(cpu);
-		if (boot_error)
-			goto restore_state;
-			/* if can't get pda memory, can't start cpu */
-	}
-#endif
-
 	alternatives_smp_switch(1);
 
 	c_idle.idle = get_idle_for_cpu(cpu);
@@ -931,9 +875,7 @@ do_rest:
 			inquire_remote_apic(apicid);
 		}
 	}
-#ifdef CONFIG_X86_64
-restore_state:
-#endif
+
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
 		numa_remove_cpu(cpu);	/* was set by numa_add_cpu */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index f50280db0dfe..962f21f1d4d7 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,6 +5,7 @@
 #define LOAD_OFFSET __START_KERNEL_map
 
 #include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
 #include <asm/page.h>
 
 #undef i386 /* in case the preprocessor is a 32bit one */
@@ -215,10 +216,11 @@ SECTIONS
 	/*
 	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
 	 * output PHDR, so the next output section - __data_nosave - should
-	 * switch it back to data.init.
+	 * switch it back to data.init.  Also, pda should be at the head of
+	 * percpu area.  Preallocate it.
 	 */
 	. = ALIGN(PAGE_SIZE);
-	PERCPU_VADDR(0, :percpu)
+	PERCPU_VADDR_PREALLOC(0, :percpu, pda_size)
 #else
 	PERCPU(PAGE_SIZE)
 #endif
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c44e2069c7c7..83fa4236477d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -283,16 +283,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
 	struct task_struct *idle = idle_task(cpu);
 	int rc;
 
-#ifdef CONFIG_X86_64
-	/* Allocate node local memory for AP pdas */
-	WARN_ON(cpu == 0);
-	if (cpu > 0) {
-		rc = get_local_pda(cpu);
-		if (rc)
-			return rc;
-	}
-#endif
-
 #ifdef CONFIG_X86_32
 	init_gdt(cpu);
 	per_cpu(current_task, cpu) = idle;
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fc2f55f2dcd6..e53319cf29cb 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -441,9 +441,10 @@
 	. = __per_cpu_load + SIZEOF(.data.percpu);
 
 /**
- * PERCPU_VADDR - define output section for percpu area
+ * PERCPU_VADDR_PREALLOC - define output section for percpu area with prealloc
  * @vaddr: explicit base address (optional)
  * @phdr: destination PHDR (optional)
+ * @prealloc: the size of prealloc area
  *
  * Macro which expands to output section for percpu area.  If @vaddr
  * is not blank, it specifies explicit base address and all percpu
@@ -455,11 +456,33 @@
  * section in the linker script will go there too.  @phdr should have
  * a leading colon.
  *
+ * If @prealloc is non-zero, the specified number of bytes will be
+ * reserved at the start of percpu area.  As the prealloc area is
+ * likely to break alignment, this macro puts areas in increasing
+ * alignment order.
+ *
  * This macro defines three symbols, __per_cpu_load, __per_cpu_start
  * and __per_cpu_end.  The first one is the vaddr of loaded percpu
  * init data.  __per_cpu_start equals @vaddr and __per_cpu_end is the
  * end offset.
  */
+#define PERCPU_VADDR_PREALLOC(vaddr, segment, prealloc)		\
+	PERCPU_PROLOG(vaddr)						\
+		. += prealloc;						\
+		*(.data.percpu)						\
+		*(.data.percpu.shared_aligned)				\
+		*(.data.percpu.page_aligned)				\
+	PERCPU_EPILOG(segment)
+
+/**
+ * PERCPU_VADDR - define output section for percpu area
+ * @vaddr: explicit base address (optional)
+ * @phdr: destination PHDR (optional)
+ *
+ * Macro which expands to output section for percpu area.  Mostly
+ * identical to PERCPU_VADDR_PREALLOC(@vaddr, @phdr, 0) other than
+ * using slightly different layout.
+ */
 #define PERCPU_VADDR(vaddr, phdr)					\
 	PERCPU_PROLOG(vaddr)						\
 		*(.data.percpu.page_aligned)				\