Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--  arch/x86_64/kernel/entry.S          |  22
-rw-r--r--  arch/x86_64/kernel/genapic.c        |  33
-rw-r--r--  arch/x86_64/kernel/genapic_flat.c   | 142
-rw-r--r--  arch/x86_64/kernel/head.S           |  16
-rw-r--r--  arch/x86_64/kernel/irq.c            |  19
-rw-r--r--  arch/x86_64/kernel/machine_kexec.c  | 101
-rw-r--r--  arch/x86_64/kernel/mce.c            | 109
-rw-r--r--  arch/x86_64/kernel/mpparse.c        |  34
-rw-r--r--  arch/x86_64/kernel/setup.c          |  11
-rw-r--r--  arch/x86_64/kernel/setup64.c        |  18
-rw-r--r--  arch/x86_64/kernel/smp.c            |  66
-rw-r--r--  arch/x86_64/kernel/smpboot.c        |  94
-rw-r--r--  arch/x86_64/kernel/traps.c          |   3
13 files changed, 397 insertions(+), 271 deletions(-)
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 28817490fdc6..096d470e280f 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -76,7 +76,7 @@
 
 	.macro FAKE_STACK_FRAME child_rip
 	/* push in order ss, rsp, eflags, cs, rip */
-	xorq %rax, %rax
+	xorl %eax, %eax
 	pushq %rax /* ss */
 	CFI_ADJUST_CFA_OFFSET 8
 	pushq %rax /* rsp */
@@ -423,7 +423,7 @@ ENTRY(stub_rt_sigreturn)
 	testl $3,CS(%rdi)
 	je 1f
 	swapgs
-1:	addl $1,%gs:pda_irqcount	# RED-PEN should check preempt count
+1:	incl %gs:pda_irqcount	# RED-PEN should check preempt count
 	movq %gs:pda_irqstackptr,%rax
 	cmoveq %rax,%rsp
 	pushq %rdi	# save old stack
@@ -436,7 +436,7 @@ ENTRY(common_interrupt)
 ret_from_intr:
 	popq %rdi
 	cli
-	subl $1,%gs:pda_irqcount
+	decl %gs:pda_irqcount
 #ifdef CONFIG_DEBUG_INFO
 	movq RBP(%rdi),%rbp
 #endif
@@ -494,7 +494,7 @@ retint_signal:
 	sti
 	SAVE_REST
 	movq $-1,ORIG_RAX(%rsp)
-	xorq %rsi,%rsi		# oldset
+	xorl %esi,%esi		# oldset
 	movq %rsp,%rdi		# &pt_regs
 	call do_notify_resume
 	RESTORE_REST
@@ -752,7 +752,7 @@ child_rip:
 	movq %rsi, %rdi
 	call *%rax
 	# exit
-	xorq %rdi, %rdi
+	xorl %edi, %edi
 	call do_exit
 
 /*
@@ -918,3 +918,15 @@ ENTRY(machine_check)
 ENTRY(call_debug)
 	zeroentry do_call_debug
 
+ENTRY(call_softirq)
+	movq %gs:pda_irqstackptr,%rax
+	pushq %r15
+	movq %rsp,%r15
+	incl %gs:pda_irqcount
+	cmove %rax,%rsp
+	call __do_softirq
+	movq %r15,%rsp
+	decl %gs:pda_irqcount
+	popq %r15
+	ret
+
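
The new call_softirq stub reuses the interrupt-stack bookkeeping the irq
entry path already keeps in the PDA: pda_irqcount sits at -1 while the CPU
is not handling an interrupt, so the incl/cmove pair moves %rsp onto
pda_irqstackptr only on the outermost entry and nests cheaply otherwise.
A user-space C model of that policy (illustrative only; the names mirror
the PDA fields and are not kernel API):

	/* Mimics "incl %gs:pda_irqcount ; cmove %rax,%rsp": switch to the
	 * irq stack only when entering from process context. */
	static long irqcount = -1;	/* pda_irqcount: -1 == process context */
	static long irqstack_top;	/* stand-in for pda_irqstackptr */

	static long softirq_stack(long cur_rsp)
	{
		return (++irqcount == 0) ? irqstack_top : cur_rsp;
	}
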
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 69b9c25a8fc4..30c843a5efdd 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -31,6 +31,7 @@ u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
 
 extern struct genapic apic_cluster;
 extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
 
 struct genapic *genapic = &apic_flat;
 
@@ -44,12 +45,7 @@ void __init clustered_apic_check(void)
 	u8 clusters, max_cluster;
 	u8 id;
 	u8 cluster_cnt[NUM_APIC_CLUSTERS];
-
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
-		/* AMD always uses flat mode right now */
-		genapic = &apic_flat;
-		goto print;
-	}
+	int num_cpus = 0;
 
 #if defined(CONFIG_ACPI_BUS)
 	/*
@@ -64,15 +60,34 @@ void __init clustered_apic_check(void)
 #endif
 
 	memset(cluster_cnt, 0, sizeof(cluster_cnt));
-
 	for (i = 0; i < NR_CPUS; i++) {
 		id = bios_cpu_apicid[i];
-		if (id != BAD_APICID)
-			cluster_cnt[APIC_CLUSTERID(id)]++;
+		if (id == BAD_APICID)
+			continue;
+		num_cpus++;
+		cluster_cnt[APIC_CLUSTERID(id)]++;
 	}
 
+	/* Don't use clustered mode on AMD platforms. */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		genapic = &apic_physflat;
+#ifndef CONFIG_CPU_HOTPLUG
+		/* In the CPU hotplug case we cannot use broadcast mode
+		   because that opens a race when a CPU is removed.
+		   Stay at physflat mode in this case.
+		   It is bad to do this unconditionally though. Once
+		   we have ACPI platform support for CPU hotplug
+		   we should detect hotplug capability from ACPI tables and
+		   only do this when really needed. -AK */
+		if (num_cpus <= 8)
+			genapic = &apic_flat;
+#endif
+		goto print;
+	}
+
 	clusters = 0;
 	max_cluster = 0;
+
 	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
 		if (cluster_cnt[i] > 0) {
 			++clusters;
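
The "8 or fewer CPUs" cutoff comes from how flat mode addresses CPUs: each
CPU claims one bit of the 8-bit logical destination field in the APIC LDR
(bits 24-31), so a ninth CPU has no bit left to claim and physical
destinations must be used instead. A sketch of the per-CPU logical ID,
assuming the usual flat encoding that flat_init_apic_ldr sets up:

	/* Flat logical APIC ID: one bit per CPU in LDR bits 24-31.
	 * Only meaningful for cpu 0..7, which is exactly the flat limit. */
	static unsigned long flat_logical_id(int cpu)
	{
		return 1UL << (cpu + 24);
	}
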
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index 282846965080..adc96282a9e2 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -2,13 +2,11 @@
  * Copyright 2004 James Cleverdon, IBM.
  * Subject to the GNU Public License, v.2
  *
- * Flat APIC subarch code. Maximum 8 CPUs, logical delivery.
+ * Flat APIC subarch code.
  *
  * Hacked for x86-64 by James Cleverdon from i386 architecture code by
  * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
  * James Cleverdon.
- * Ashok Raj <ashok.raj@intel.com>
- * Removed IPI broadcast shortcut to support CPU hotplug
  */
 #include <linux/config.h>
 #include <linux/threads.h>
@@ -20,47 +18,6 @@
 #include <asm/smp.h>
 #include <asm/ipi.h>
 
-/*
- * The following permit choosing broadcast IPI shortcut v.s sending IPI only
- * to online cpus via the send_IPI_mask varient.
- * The mask version is my preferred option, since it eliminates a lot of
- * other extra code that would need to be written to cleanup intrs sent
- * to a CPU while offline.
- *
- * Sending broadcast introduces lots of trouble in CPU hotplug situations.
- * These IPI's are delivered to cpu's irrespective of their offline status
- * and could pickup stale intr data when these CPUS are turned online.
- *
- * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with
- * the idea of not using broadcast IPI's anymore. Hence the run time check
- * is introduced, on his request so we can choose an alternate mechanism.
- *
- * Initial wacky performance tests that collect cycle counts show
- * no increase in using mask v.s broadcast version. In fact they seem
- * identical in terms of cycle counts.
- *
- * if we need to use broadcast, we need to do the following.
- *
- * cli;
- * hold call_lock;
- * clear any pending IPI, just ack and clear all pending intr
- * set cpu_online_map;
- * release call_lock;
- * sti;
- *
- * The complicated dummy irq processing shown above is not required if
- * we didnt sent IPI's to wrong CPU's in the first place.
- *
- * - Ashok Raj <ashok.raj@intel.com>
- */
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI	(1)
-#else
-#define DEFAULT_SEND_IPI	(0)
-#endif
-
-static int no_broadcast=DEFAULT_SEND_IPI;
-
 static cpumask_t flat_target_cpus(void)
 {
 	return cpu_online_map;
@@ -119,37 +76,15 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
 	local_irq_restore(flags);
 }
 
-static inline void __local_flat_send_IPI_allbutself(int vector)
-{
-	if (no_broadcast) {
-		cpumask_t mask = cpu_online_map;
-		int this_cpu = get_cpu();
-
-		cpu_clear(this_cpu, mask);
-		flat_send_IPI_mask(mask, vector);
-		put_cpu();
-	}
-	else
-		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
-}
-
-static inline void __local_flat_send_IPI_all(int vector)
-{
-	if (no_broadcast)
-		flat_send_IPI_mask(cpu_online_map, vector);
-	else
-		__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
-}
-
 static void flat_send_IPI_allbutself(int vector)
 {
 	if (((num_online_cpus()) - 1) >= 1)
-		__local_flat_send_IPI_allbutself(vector);
+		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
 }
 
 static void flat_send_IPI_all(int vector)
 {
-	__local_flat_send_IPI_all(vector);
+	__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
 }
 
 static int flat_apic_id_registered(void)
@@ -170,16 +105,6 @@ static unsigned int phys_pkg_id(int index_msb)
 	return ((ebx >> 24) & 0xFF) >> index_msb;
 }
 
-static __init int no_ipi_broadcast(char *str)
-{
-	get_option(&str, &no_broadcast);
-	printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
-		"IPI Broadcast");
-	return 1;
-}
-
-__setup("no_ipi_broadcast", no_ipi_broadcast);
-
 struct genapic apic_flat = {
 	.name = "flat",
 	.int_delivery_mode = dest_LowestPrio,
@@ -195,11 +120,62 @@ struct genapic apic_flat = {
 	.phys_pkg_id = phys_pkg_id,
 };
 
-static int __init print_ipi_mode(void)
+/*
+ * Physflat mode is used when there are more than 8 CPUs on a AMD system.
+ * We cannot use logical delivery in this case because the mask
+ * overflows, so use physical mode.
+ */
+
+static cpumask_t physflat_target_cpus(void)
+{
+	return cpumask_of_cpu(0);
+}
+
+static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
+{
+	send_IPI_mask_sequence(cpumask, vector);
+}
+
+static void physflat_send_IPI_allbutself(int vector)
+{
+	cpumask_t allbutme = cpu_online_map;
+	int me = get_cpu();
+	cpu_clear(me, allbutme);
+	physflat_send_IPI_mask(allbutme, vector);
+	put_cpu();
+}
+
+static void physflat_send_IPI_all(int vector)
 {
-	printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
-		"Shortcut");
-	return 0;
+	physflat_send_IPI_mask(cpu_online_map, vector);
 }
 
-late_initcall(print_ipi_mode);
+static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+	int cpu;
+
+	/*
+	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
+	 * May as well be the first.
+	 */
+	cpu = first_cpu(cpumask);
+	if ((unsigned)cpu < NR_CPUS)
+		return x86_cpu_to_apicid[cpu];
+	else
+		return BAD_APICID;
+}
+
+struct genapic apic_physflat = {
+	.name = "physical flat",
+	.int_delivery_mode = dest_LowestPrio,
+	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+	.int_delivery_dest = (APIC_DEST_PHYSICAL | APIC_DM_LOWEST),
+	.target_cpus = physflat_target_cpus,
+	.apic_id_registered = flat_apic_id_registered,
+	.init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
+	.send_IPI_all = physflat_send_IPI_all,
+	.send_IPI_allbutself = physflat_send_IPI_allbutself,
+	.send_IPI_mask = physflat_send_IPI_mask,
+	.cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+	.phys_pkg_id = phys_pkg_id,
+};
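
Where flat mode can reach all targets with one IPI whose destination is the
logical mask, physflat has to address CPUs one at a time by physical APIC
ID; send_IPI_mask_sequence() is a loop to that effect. Roughly (a sketch,
with apic_send_ipi() as a hypothetical stand-in for the low-level ICR
write):

	int cpu;
	for_each_cpu_mask(cpu, cpumask)
		apic_send_ipi(x86_cpu_to_apicid[cpu], vector);
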
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 8d765aa77a26..98ff5eb32b9a 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -137,14 +137,14 @@ startup_64:
 	wrmsr
 
 	/* Setup cr0 */
-	xorq	%rax, %rax
-	btsq	$31, %rax	/* Enable paging */
-	btsq	$0, %rax	/* Enable protected mode */
-	btsq	$1, %rax	/* Enable MP */
-	btsq	$4, %rax	/* Enable ET */
-	btsq	$5, %rax	/* Enable NE */
-	btsq	$16, %rax	/* Enable WP */
-	btsq	$18, %rax	/* Enable AM */
+#define CR0_PM		1	/* protected mode */
+#define CR0_MP		(1<<1)
+#define CR0_ET		(1<<4)
+#define CR0_NE		(1<<5)
+#define CR0_WP		(1<<16)
+#define CR0_AM		(1<<18)
+#define CR0_PAGING	(1<<31)
+	movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
 	/* Make changes effective */
 	movq	%rax, %cr0
 
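
The #define block ORs together to the familiar CR0 boot value 0x80050033,
the same word the old eight-instruction btsq sequence built one bit at a
time. A quick host-side check of the arithmetic:

	#include <stdio.h>

	int main(void)
	{
		unsigned long cr0 = 1 | (1 << 1) | (1 << 4) | (1 << 5)
			| (1UL << 16) | (1UL << 18) | (1UL << 31);
		printf("%#lx\n", cr0);	/* prints 0x80050033 */
		return 0;
	}
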
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index cc3fb85f5145..849a20aec7ca 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -135,3 +135,22 @@ void fixup_irqs(cpumask_t map)
 	local_irq_disable();
 }
 #endif
+
+extern void call_softirq(void);
+
+asmlinkage void do_softirq(void)
+{
+	__u32 pending;
+	unsigned long flags;
+
+	if (in_interrupt())
+		return;
+
+	local_irq_save(flags);
+	pending = local_softirq_pending();
+	/* Switch to interrupt stack */
+	if (pending)
+		call_softirq();
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(do_softirq);
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 60d1eff41567..89fab51e20f4 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -8,43 +8,26 @@
 
 #include <linux/mm.h>
 #include <linux/kexec.h>
-#include <linux/delay.h>
 #include <linux/string.h>
 #include <linux/reboot.h>
-#include <asm/pda.h>
 #include <asm/pgtable.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 #include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/cpufeature.h>
-#include <asm/hw_irq.h>
-
-#define LEVEL0_SIZE (1UL << 12UL)
-#define LEVEL1_SIZE (1UL << 21UL)
-#define LEVEL2_SIZE (1UL << 30UL)
-#define LEVEL3_SIZE (1UL << 39UL)
-#define LEVEL4_SIZE (1UL << 48UL)
-
-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
-#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-
-static void init_level2_page(u64 *level2p, unsigned long addr)
+
+static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
 	unsigned long end_addr;
 
 	addr &= PAGE_MASK;
-	end_addr = addr + LEVEL2_SIZE;
+	end_addr = addr + PUD_SIZE;
 	while (addr < end_addr) {
-		*(level2p++) = addr | L1_ATTR;
-		addr += LEVEL1_SIZE;
+		set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+		addr += PMD_SIZE;
 	}
 }
 
-static int init_level3_page(struct kimage *image, u64 *level3p,
+static int init_level3_page(struct kimage *image, pud_t *level3p,
 			unsigned long addr, unsigned long last_addr)
 {
 	unsigned long end_addr;
@@ -52,32 +35,32 @@ static int init_level3_page(struct kimage *image, u64 *level3p,
 
 	result = 0;
 	addr &= PAGE_MASK;
-	end_addr = addr + LEVEL3_SIZE;
+	end_addr = addr + PGDIR_SIZE;
 	while ((addr < last_addr) && (addr < end_addr)) {
 		struct page *page;
-		u64 *level2p;
+		pmd_t *level2p;
 
 		page = kimage_alloc_control_pages(image, 0);
 		if (!page) {
 			result = -ENOMEM;
 			goto out;
 		}
-		level2p = (u64 *)page_address(page);
+		level2p = (pmd_t *)page_address(page);
 		init_level2_page(level2p, addr);
-		*(level3p++) = __pa(level2p) | L2_ATTR;
-		addr += LEVEL2_SIZE;
+		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+		addr += PUD_SIZE;
 	}
 	/* clear the unused entries */
 	while (addr < end_addr) {
-		*(level3p++) = 0;
-		addr += LEVEL2_SIZE;
+		pud_clear(level3p++);
+		addr += PUD_SIZE;
 	}
 out:
 	return result;
 }
 
 
-static int init_level4_page(struct kimage *image, u64 *level4p,
+static int init_level4_page(struct kimage *image, pgd_t *level4p,
 			unsigned long addr, unsigned long last_addr)
 {
 	unsigned long end_addr;
@@ -85,28 +68,28 @@ static int init_level4_page(struct kimage *image, u64 *level4p,
 
 	result = 0;
 	addr &= PAGE_MASK;
-	end_addr = addr + LEVEL4_SIZE;
+	end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
 	while ((addr < last_addr) && (addr < end_addr)) {
 		struct page *page;
-		u64 *level3p;
+		pud_t *level3p;
 
 		page = kimage_alloc_control_pages(image, 0);
 		if (!page) {
 			result = -ENOMEM;
 			goto out;
 		}
-		level3p = (u64 *)page_address(page);
+		level3p = (pud_t *)page_address(page);
 		result = init_level3_page(image, level3p, addr, last_addr);
 		if (result) {
 			goto out;
 		}
-		*(level4p++) = __pa(level3p) | L3_ATTR;
-		addr += LEVEL3_SIZE;
+		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+		addr += PGDIR_SIZE;
 	}
 	/* clear the unused entries */
 	while (addr < end_addr) {
-		*(level4p++) = 0;
-		addr += LEVEL3_SIZE;
+		pgd_clear(level4p++);
+		addr += PGDIR_SIZE;
 	}
 out:
 	return result;
@@ -115,52 +98,50 @@ out:
 
 static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 {
-	u64 *level4p;
-	level4p = (u64 *)__va(start_pgtable);
+	pgd_t *level4p;
+	level4p = (pgd_t *)__va(start_pgtable);
 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
 }
 
 static void set_idt(void *newidt, u16 limit)
 {
-	unsigned char curidt[10];
+	struct desc_ptr curidt;
 
 	/* x86-64 supports unaligned loads & stores */
-	(*(u16 *)(curidt)) = limit;
-	(*(u64 *)(curidt +2)) = (unsigned long)(newidt);
+	curidt.size    = limit;
+	curidt.address = (unsigned long)newidt;
 
 	__asm__ __volatile__ (
-		"lidt %0\n"
-		: "=m" (curidt)
+		"lidtq %0\n"
+		: : "m" (curidt)
 		);
 };
 
 
 static void set_gdt(void *newgdt, u16 limit)
 {
-	unsigned char curgdt[10];
+	struct desc_ptr curgdt;
 
 	/* x86-64 supports unaligned loads & stores */
-	(*(u16 *)(curgdt)) = limit;
-	(*(u64 *)(curgdt +2)) = (unsigned long)(newgdt);
+	curgdt.size    = limit;
+	curgdt.address = (unsigned long)newgdt;
 
 	__asm__ __volatile__ (
-		"lgdt %0\n"
-		: "=m" (curgdt)
+		"lgdtq %0\n"
+		: : "m" (curgdt)
 		);
 };
 
 static void load_segments(void)
 {
 	__asm__ __volatile__ (
-		"\tmovl $"STR(__KERNEL_DS)",%eax\n"
-		"\tmovl %eax,%ds\n"
-		"\tmovl %eax,%es\n"
-		"\tmovl %eax,%ss\n"
-		"\tmovl %eax,%fs\n"
-		"\tmovl %eax,%gs\n"
+		"\tmovl %0,%%ds\n"
+		"\tmovl %0,%%es\n"
+		"\tmovl %0,%%ss\n"
+		"\tmovl %0,%%fs\n"
+		"\tmovl %0,%%gs\n"
+		: : "a" (__KERNEL_DS)
 		);
-#undef STR
-#undef __STR
 }
 
 typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
@@ -178,7 +159,7 @@ int machine_kexec_prepare(struct kimage *image)
 
 	/* Calculate the offsets */
 	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
-	control_code_buffer = start_pgtable + 4096UL;
+	control_code_buffer = start_pgtable + PAGE_SIZE;
 
 	/* Setup the identity mapped 64bit page table */
 	result = init_pgtable(image, start_pgtable);
@@ -214,7 +195,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	/* Calculate the offsets */
 	page_list = image->head;
 	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
-	control_code_buffer = start_pgtable + 4096UL;
+	control_code_buffer = start_pgtable + PAGE_SIZE;
 
 	/* Set the low half of the page table to my identity mapped
 	 * page table for kexec. Leave the high half pointing at the
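
The switch from the private LEVELn_SIZE macros to PMD_SIZE, PUD_SIZE and
PGDIR_SIZE is a pure renaming: with 4KB pages and 9 bits of index per
level, the generic x86-64 page-table constants expand to the same values
the old macros hard-coded. The arithmetic, for reference:

	#define PAGE_SHIFT	12			/* 4KB, old LEVEL0_SIZE */
	#define PMD_SHIFT	(PAGE_SHIFT + 9)	/* 21: 2MB, old LEVEL1_SIZE */
	#define PUD_SHIFT	(PMD_SHIFT + 9)		/* 30: 1GB, old LEVEL2_SIZE */
	#define PGDIR_SHIFT	(PUD_SHIFT + 9)		/* 39: 512GB, old LEVEL3_SIZE */
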
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 21e70625a495..8aa56736cde3 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -15,6 +15,8 @@
 #include <linux/sysdev.h>
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -34,6 +36,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
 static unsigned long console_logged;
 static int notify_user;
 static int rip_msr;
+static int mce_bootlog;
 
 /*
  * Lockless MCE logging infrastructure.
@@ -195,10 +198,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
 		mce_get_rip(&m, regs);
-		if (error_code != -1)
+		if (error_code >= 0)
 			rdtscll(m.tsc);
 		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
-		mce_log(&m);
+		if (error_code != -2)
+			mce_log(&m);
 
 		/* Did this bank cause the exception? */
 		/* Assume that the bank with uncorrectable errors did it,
@@ -313,7 +317,7 @@ static void mce_init(void *dummy)
 
 	/* Log the machine checks left over from the previous reset.
 	   This also clears all registers */
-	do_machine_check(NULL, -1);
+	do_machine_check(NULL, mce_bootlog ? -1 : -2);
 
 	set_in_cr4(X86_CR4_MCE);
 
@@ -474,11 +478,17 @@ static int __init mcheck_disable(char *str)
 }
 
 /* mce=off disables machine check. Note you can reenable it later
-   using sysfs */
+   using sysfs.
+   mce=bootlog Log MCEs from before booting. Disabled by default to work
+   around buggy BIOS that leave bogus MCEs. */
 static int __init mcheck_enable(char *str)
 {
+	if (*str == '=')
+		str++;
 	if (!strcmp(str, "off"))
 		mce_dont_init = 1;
+	else if (!strcmp(str, "bootlog"))
+		mce_bootlog = 1;
 	else
 		printk("mce= argument %s ignored. Please use /sys", str);
 	return 0;
@@ -514,10 +524,7 @@ static struct sysdev_class mce_sysclass = {
 	set_kset_name("machinecheck"),
 };
 
-static struct sys_device device_mce = {
-	.id	= 0,
-	.cls	= &mce_sysclass,
-};
+static DEFINE_PER_CPU(struct sys_device, device_mce);
 
 /* Why are there no generic functions for this? */
 #define ACCESSOR(name, var, start) \
@@ -542,27 +549,83 @@ ACCESSOR(bank4ctl,bank[4],mce_restart())
 ACCESSOR(tolerant,tolerant,)
 ACCESSOR(check_interval,check_interval,mce_restart())
 
-static __cpuinit int mce_init_device(void)
+/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
+static __cpuinit int mce_create_device(unsigned int cpu)
 {
 	int err;
+	if (!mce_available(&cpu_data[cpu]))
+		return -EIO;
+
+	per_cpu(device_mce,cpu).id = cpu;
+	per_cpu(device_mce,cpu).cls = &mce_sysclass;
+
+	err = sysdev_register(&per_cpu(device_mce,cpu));
+
+	if (!err) {
+		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
+		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
+		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
+		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
+		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
+		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
+		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+	}
+	return err;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit void mce_remove_device(unsigned int cpu)
+{
+	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
+	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
+	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
+	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
+	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
+	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
+	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
+	sysdev_unregister(&per_cpu(device_mce,cpu));
+}
+#endif
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static __cpuinit int
+mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+		mce_create_device(cpu);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		mce_remove_device(cpu);
+		break;
+#endif
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block mce_cpu_notifier = {
+	.notifier_call = mce_cpu_callback,
+};
+
+static __init int mce_init_device(void)
+{
+	int err;
+	int i = 0;
+
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 	err = sysdev_class_register(&mce_sysclass);
-	if (!err)
-		err = sysdev_register(&device_mce);
-	if (!err) {
-		/* could create per CPU objects, but it is not worth it. */
-		sysdev_create_file(&device_mce, &attr_bank0ctl);
-		sysdev_create_file(&device_mce, &attr_bank1ctl);
-		sysdev_create_file(&device_mce, &attr_bank2ctl);
-		sysdev_create_file(&device_mce, &attr_bank3ctl);
-		sysdev_create_file(&device_mce, &attr_bank4ctl);
-		sysdev_create_file(&device_mce, &attr_tolerant);
-		sysdev_create_file(&device_mce, &attr_check_interval);
-	}
-
+
+	for_each_online_cpu(i) {
+		mce_create_device(i);
+	}
+
+	register_cpu_notifier(&mce_cpu_notifier);
 	misc_register(&mce_log_device);
 	return err;
-
 }
+
 device_initcall(mce_init_device);
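
With the sysdev registered per CPU, each processor now exports its own copy
of the bank controls instead of one shared node; given the "machinecheck"
sysdev class, the tunables should show up along these lines (paths assumed
from the class name, two-CPU example):

	/sys/devices/system/machinecheck/machinecheck0/bank0ctl
	/sys/devices/system/machinecheck/machinecheck0/tolerant
	/sys/devices/system/machinecheck/machinecheck0/check_interval
	/sys/devices/system/machinecheck/machinecheck1/bank0ctl
	...

The new boot-time logging is opt-in from the kernel command line, e.g.
"mce=bootlog" to log machine checks left over from before the reset.
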
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 9c5aa2a790c7..79c362d03e2e 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -109,7 +109,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 
 static void __init MP_processor_info (struct mpc_config_processor *m)
 {
-	int ver;
+	int ver, cpu;
 	static int found_bsp=0;
 
 	if (!(m->mpc_cpuflag & CPU_ENABLED))
@@ -131,7 +131,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 		return;
 	}
 
-	num_processors++;
+	cpu = num_processors++;
 
 	if (m->mpc_apicid > MAX_APICS) {
 		printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
@@ -155,13 +155,18 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 		 * in same order as logical cpu numbers. Hence the first
 		 * entry is BSP, and so on.
 		 */
+		cpu = 0;
+
 		bios_cpu_apicid[0] = m->mpc_apicid;
 		x86_cpu_to_apicid[0] = m->mpc_apicid;
 		found_bsp = 1;
-	} else {
-		bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid;
-		x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid;
-	}
+	} else
+		cpu = num_processors - found_bsp;
+	bios_cpu_apicid[cpu] = m->mpc_apicid;
+	x86_cpu_to_apicid[cpu] = m->mpc_apicid;
+
+	cpu_set(cpu, cpu_possible_map);
+	cpu_set(cpu, cpu_present_map);
 }
 
 static void __init MP_bus_info (struct mpc_config_bus *m)
@@ -965,8 +970,21 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
 		 * due to unused I/O APIC pins.
 		 */
 		int irq = gsi;
-		gsi = pci_irq++;
-		gsi_to_irq[irq] = gsi;
+		if (gsi < MAX_GSI_NUM) {
+			if (gsi > 15)
+				gsi = pci_irq++;
+#ifdef CONFIG_ACPI_BUS
+			/*
+			 * Don't assign IRQ used by ACPI SCI
+			 */
+			if (gsi == acpi_fadt.sci_int)
+				gsi = pci_irq++;
+#endif
+			gsi_to_irq[irq] = gsi;
+		} else {
+			printk(KERN_ERR "GSI %u is too high\n", gsi);
+			return gsi;
+		}
 	}
 
 	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 5fd03225058a..116a491e2961 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -645,15 +645,15 @@ void __init setup_arch(char **cmdline_p)
 		}
 	}
 #endif
-
-	sparse_init();
-
 #ifdef CONFIG_KEXEC
 	if (crashk_res.start != crashk_res.end) {
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1);
 	}
 #endif
+
+	sparse_init();
+
 	paging_init();
 
 	check_ioapic();
@@ -765,8 +765,6 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 	int cpu = smp_processor_id();
 	int node = 0;
 	unsigned bits;
-	if (c->x86_num_cores == 1)
-		return;
 
 	bits = 0;
 	while ((1 << bits) < c->x86_num_cores)
@@ -1081,8 +1079,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	else
 		mtrr_ap_init();
 #ifdef CONFIG_NUMA
-	if (c != &boot_cpu_data)
-		numa_add_cpu(c - cpu_data);
+	numa_add_cpu(smp_processor_id());
 #endif
 }
 
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index f2b9c6bc999a..34082c1cc41e 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/bootmem.h>
 #include <linux/bitops.h>
+#include <linux/module.h>
 #include <asm/bootsetup.h>
 #include <asm/pda.h>
 #include <asm/pgtable.h>
@@ -22,10 +23,8 @@
 #include <asm/smp.h>
 #include <asm/i387.h>
 #include <asm/percpu.h>
-#include <asm/mtrr.h>
 #include <asm/proto.h>
-#include <asm/mman.h>
-#include <asm/numa.h>
+#include <asm/sections.h>
 
 char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
 
@@ -33,11 +32,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
 struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
 
-extern struct task_struct init_task;
-
-extern unsigned char __per_cpu_start[], __per_cpu_end[];
-
-extern struct desc_ptr cpu_gdt_descr[];
 struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
 
 char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
@@ -101,7 +95,7 @@ void __init setup_per_cpu_areas(void)
 #endif
 
 	for (i = 0; i < NR_CPUS; i++) {
-		unsigned char *ptr;
+		char *ptr;
 
 		if (!NODE_DATA(cpu_to_node(i))) {
 			printk("cpu with no node %d, num_online_nodes %d\n",
@@ -190,11 +184,7 @@
  */
 void __cpuinit cpu_init (void)
 {
-#ifdef CONFIG_SMP
 	int cpu = stack_smp_processor_id();
-#else
-	int cpu = smp_processor_id();
-#endif
 	struct tss_struct *t = &per_cpu(init_tss, cpu);
 	unsigned long v;
 	char *estacks = NULL;
@@ -214,7 +204,7 @@ void __cpuinit cpu_init (void)
 
 	printk("Initializing CPU#%d\n", cpu);
 
-	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 
+	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
 
 	/*
 	 * Initialize the per-CPU GDT with the boot GDT,
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index ccae392886af..e5958220d6b8 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -129,10 +129,9 @@ asmlinkage void smp_invalidate_interrupt (void)
 		} else
 			leave_mm(cpu);
 	}
+out:
 	ack_APIC_irq();
 	cpu_clear(cpu, flush_cpumask);
-
-out:
 	put_cpu_no_resched();
 }
 
@@ -294,6 +293,69 @@ void unlock_ipi_call_lock(void)
 }
 
 /*
+ * this function sends a 'generic call function' IPI to one other CPU
+ * in the system.
+ */
+static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+				int nonatomic, int wait)
+{
+	struct call_data_struct data;
+	int cpus = 1;
+
+	data.func = func;
+	data.info = info;
+	atomic_set(&data.started, 0);
+	data.wait = wait;
+	if (wait)
+		atomic_set(&data.finished, 0);
+
+	call_data = &data;
+	wmb();
+	/* Send a message to all other CPUs and wait for them to respond */
+	send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+	/* Wait for response */
+	while (atomic_read(&data.started) != cpus)
+		cpu_relax();
+
+	if (!wait)
+		return;
+
+	while (atomic_read(&data.finished) != cpus)
+		cpu_relax();
+}
+
+/*
+ * smp_call_function_single - Run a function on another CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Currently unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * Does not return until the remote CPU is nearly ready to execute <func>
+ * or has already executed it.
+ */
+
+int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+	int nonatomic, int wait)
+{
+	/* prevent preemption and reschedule on another processor */
+	int me = get_cpu();
+	if (cpu == me) {
+		WARN_ON(1);
+		put_cpu();
+		return -EBUSY;
+	}
+	spin_lock_bh(&call_lock);
+	__smp_call_function_single(cpu, func, info, nonatomic, wait);
+	spin_unlock_bh(&call_lock);
+	put_cpu();
+	return 0;
+}
+
+/*
  * this function sends a 'generic call function' IPI to all other CPUs
  * in the system.
  */
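
A minimal caller for the new primitive (illustrative only: the target CPU
number and the helper are made up, and the caller must name a CPU other
than its own or the WARN_ON/-EBUSY path fires):

	static void read_id(void *info)
	{
		*(unsigned int *)info = smp_processor_id();
	}

	unsigned int id;
	if (smp_call_function_single(1, read_id, &id, 0, 1) == 0)
		printk(KERN_INFO "CPU 1 reports id %u\n", id);
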
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index b969ee128728..6e4807d64d46 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -113,24 +113,6 @@ struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
 #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
 
 /*
- * cpu_possible_map should be static, it cannot change as cpu's
- * are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
- * do this dynamically on cpu arrival/departure.
- * cpu_present_map on the other hand can change dynamically.
- * In case when cpu_hotplug is not compiled, then we resort to current
- * behaviour, which is cpu_possible == cpu_present.
- * If cpu-hotplug is supported, then we need to preallocate for all
- * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
- * - Ashok Raj
- */
-#ifdef CONFIG_HOTPLUG_CPU
-#define fixup_cpu_possible_map(x)	cpu_set((x), cpu_possible_map)
-#else
-#define fixup_cpu_possible_map(x)
-#endif
-
-/*
  * Currently trivial. Write the real->protected mode
  * bootstrap into the page concerned. The caller
  * has made sure it's suitably aligned.
@@ -229,9 +211,6 @@ static __cpuinit void sync_master(void *arg)
 {
 	unsigned long flags, i;
 
-	if (smp_processor_id() != boot_cpu_id)
-		return;
-
 	go[MASTER] = 0;
 
 	local_irq_save(flags);
@@ -280,12 +259,12 @@ get_delta(long *rt, long *master)
 	return tcenter - best_tm;
 }
 
-static __cpuinit void sync_tsc(void)
+static __cpuinit void sync_tsc(unsigned int master)
 {
 	int i, done = 0;
 	long delta, adj, adjust_latency = 0;
 	unsigned long flags, rt, master_time_stamp, bound;
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
 	static struct syncdebug {
 		long rt;	/* roundtrip time */
 		long master;	/* master's timestamp */
@@ -294,9 +273,17 @@ static __cpuinit void sync_tsc(void)
 	} t[NUM_ROUNDS] __cpuinitdata;
 #endif
 
+	printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n",
+		smp_processor_id(), master);
+
 	go[MASTER] = 1;
 
-	smp_call_function(sync_master, NULL, 1, 0);
+	/* It is dangerous to broadcast IPIs while cpus are coming up,
+	 * as they may not be ready to accept them. Since we only need
+	 * to send the IPI to the boot CPU, direct the message to it
+	 * and avoid the race.
+	 */
+	smp_call_function_single(master, sync_master, NULL, 1, 0);
 
 	while (go[MASTER])	/* wait for master to be ready */
 		no_cpu_relax();
@@ -321,7 +308,7 @@ static __cpuinit void sync_tsc(void)
 			rdtscll(t);
 			wrmsrl(MSR_IA32_TSC, t + adj);
 		}
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
 		t[i].rt = rt;
 		t[i].master = master_time_stamp;
 		t[i].diff = delta;
@@ -331,7 +318,7 @@ static __cpuinit void sync_tsc(void)
 	}
 	spin_unlock_irqrestore(&tsc_sync_lock, flags);
 
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
 	for (i = 0; i < NUM_ROUNDS; ++i)
 		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
 		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
@@ -340,16 +327,14 @@ static __cpuinit void sync_tsc(void)
 	printk(KERN_INFO
 	       "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, "
 	       "maxerr %lu cycles)\n",
-	       smp_processor_id(), boot_cpu_id, delta, rt);
+	       smp_processor_id(), master, delta, rt);
 }
 
 static void __cpuinit tsc_sync_wait(void)
 {
 	if (notscsync || !cpu_has_tsc)
 		return;
-	printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(),
-			boot_cpu_id);
-	sync_tsc();
+	sync_tsc(boot_cpu_id);
 }
 
 static __init int notscsync_setup(char *s)
@@ -537,7 +522,7 @@ void __cpuinit start_secondary(void)
 extern volatile unsigned long init_rsp;
 extern void (*initial_code)(void);
 
-#if APIC_DEBUG
+#ifdef APIC_DEBUG
 static void inquire_remote_apic(int apicid)
 {
 	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
@@ -773,8 +758,9 @@ do_rest:
 	initial_code = start_secondary;
 	clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
 
-	printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
-	       start_rip, init_rsp);
+	printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
+		cpus_weight(cpu_present_map),
+		apicid);
 
 	/*
 	 * This grunge runs the startup process for
@@ -841,7 +827,7 @@ do_rest:
 		else
 			/* trampoline code not run */
 			printk("Not responding.\n");
-#if APIC_DEBUG
+#ifdef APIC_DEBUG
 		inquire_remote_apic(apicid);
 #endif
 	}
@@ -924,6 +910,27 @@ static __init void enforce_max_cpus(unsigned max_cpus)
 	}
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * cpu_possible_map should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_map on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * If cpu-hotplug is supported, then we need to preallocate for all
+ * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
+ * - Ashok Raj
+ */
+static void prefill_possible_map(void)
+{
+	int i;
+	for (i = 0; i < NR_CPUS; i++)
+		cpu_set(i, cpu_possible_map);
+}
+#endif
+
 /*
  * Various sanity checks.
  */
@@ -987,25 +994,15 @@ static int __init smp_sanity_check(unsigned max_cpus)
  */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
-	int i;
-
 	nmi_watchdog_default();
 	current_cpu_data = boot_cpu_data;
 	current_thread_info()->cpu = 0;  /* needed? */
 
 	enforce_max_cpus(max_cpus);
 
-	/*
-	 * Fill in cpu_present_mask
-	 */
-	for (i = 0; i < NR_CPUS; i++) {
-		int apicid = cpu_present_to_apicid(i);
-		if (physid_isset(apicid, phys_cpu_present_map)) {
-			cpu_set(i, cpu_present_map);
-			cpu_set(i, cpu_possible_map);
-		}
-		fixup_cpu_possible_map(i);
-	}
+#ifdef CONFIG_HOTPLUG_CPU
+	prefill_possible_map();
+#endif
 
 	if (smp_sanity_check(max_cpus) < 0) {
 		printk(KERN_INFO "SMP disabled\n");
@@ -1189,8 +1186,7 @@ void __cpu_die(unsigned int cpu)
 			printk ("CPU %d is now offline\n", cpu);
 			return;
 		}
-		current->state = TASK_UNINTERRUPTIBLE;
-		schedule_timeout(HZ/10);
+		msleep(100);
 	}
 	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 102736630002..6ead433a3885 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -594,9 +594,6 @@ asmlinkage void default_do_nmi(struct pt_regs *regs)
 	if (!cpu)
 		reason = get_nmi_reason();
 
-	if (!cpu_online(cpu))
-		return;
-
 	if (!(reason & 0xc0)) {
 		if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
 								== NOTIFY_STOP)