aboutsummaryrefslogtreecommitdiffstats
path: root/arch/i386/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--arch/i386/kernel/Makefile4
-rw-r--r--arch/i386/kernel/acpi/boot.c2
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c21
-rw-r--r--arch/i386/kernel/alternative.c102
-rw-r--r--arch/i386/kernel/apic.c22
-rw-r--r--arch/i386/kernel/apm.c10
-rw-r--r--arch/i386/kernel/asm-offsets.c18
-rw-r--r--arch/i386/kernel/cpu/Makefile4
-rw-r--r--arch/i386/kernel/cpu/amd.c15
-rw-r--r--arch/i386/kernel/cpu/bugs.c191
-rw-r--r--arch/i386/kernel/cpu/centaur.c10
-rw-r--r--arch/i386/kernel/cpu/common.c217
-rw-r--r--arch/i386/kernel/cpu/cyrix.c21
-rw-r--r--arch/i386/kernel/cpu/intel.c4
-rw-r--r--arch/i386/kernel/cpu/mcheck/k7.c13
-rw-r--r--arch/i386/kernel/cpu/mcheck/mce.c3
-rw-r--r--arch/i386/kernel/cpu/mcheck/p4.c16
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c101
-rw-r--r--arch/i386/kernel/cpu/mtrr/main.c11
-rw-r--r--arch/i386/kernel/cpu/nexgen.c10
-rw-r--r--arch/i386/kernel/cpu/perfctr-watchdog.c658
-rw-r--r--arch/i386/kernel/cpu/proc.c3
-rw-r--r--arch/i386/kernel/cpu/rise.c9
-rw-r--r--arch/i386/kernel/cpu/transmeta.c10
-rw-r--r--arch/i386/kernel/cpu/umc.c10
-rw-r--r--arch/i386/kernel/doublefault.c29
-rw-r--r--arch/i386/kernel/e820.c64
-rw-r--r--arch/i386/kernel/efi.c16
-rw-r--r--arch/i386/kernel/entry.S23
-rw-r--r--arch/i386/kernel/head.S118
-rw-r--r--arch/i386/kernel/i386_ksyms.c2
-rw-r--r--arch/i386/kernel/io_apic.c38
-rw-r--r--arch/i386/kernel/ioport.c3
-rw-r--r--arch/i386/kernel/irq.c3
-rw-r--r--arch/i386/kernel/mpparse.c2
-rw-r--r--arch/i386/kernel/nmi.c832
-rw-r--r--arch/i386/kernel/paravirt.c522
-rw-r--r--arch/i386/kernel/process.c37
-rw-r--r--arch/i386/kernel/quirks.c69
-rw-r--r--arch/i386/kernel/reboot.c55
-rw-r--r--arch/i386/kernel/reboot_fixups.c2
-rw-r--r--arch/i386/kernel/smp.c304
-rw-r--r--arch/i386/kernel/smpboot.c146
-rw-r--r--arch/i386/kernel/sysenter.c269
-rw-r--r--arch/i386/kernel/time.c2
-rw-r--r--arch/i386/kernel/trampoline.S12
-rw-r--r--arch/i386/kernel/traps.c32
-rw-r--r--arch/i386/kernel/tsc.c13
-rw-r--r--arch/i386/kernel/verify_cpu.S65
-rw-r--r--arch/i386/kernel/vmi.c131
-rw-r--r--arch/i386/kernel/vmiclock.c318
-rw-r--r--arch/i386/kernel/vmitime.c482
-rw-r--r--arch/i386/kernel/vmlinux.lds.S24
-rw-r--r--arch/i386/kernel/vsyscall.lds.S4
54 files changed, 2406 insertions, 2696 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 4ae3dcf1d2f0..4f98516b9f94 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,12 +39,10 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
39obj-$(CONFIG_HPET_TIMER) += hpet.o 39obj-$(CONFIG_HPET_TIMER) += hpet.o
40obj-$(CONFIG_K8_NB) += k8.o 40obj-$(CONFIG_K8_NB) += k8.o
41 41
42obj-$(CONFIG_VMI) += vmi.o vmitime.o 42obj-$(CONFIG_VMI) += vmi.o vmiclock.o
43obj-$(CONFIG_PARAVIRT) += paravirt.o 43obj-$(CONFIG_PARAVIRT) += paravirt.o
44obj-y += pcspeaker.o 44obj-y += pcspeaker.o
45 45
46EXTRA_AFLAGS := -traditional
47
48obj-$(CONFIG_SCx200) += scx200.o 46obj-$(CONFIG_SCx200) += scx200.o
49 47
50# vsyscall.o contains the vsyscall DSO images as __initdata. 48# vsyscall.o contains the vsyscall DSO images as __initdata.
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 9ea5b8ecc7e1..280898b045b2 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -874,7 +874,7 @@ static void __init acpi_process_madt(void)
874 acpi_ioapic = 1; 874 acpi_ioapic = 1;
875 875
876 smp_found_config = 1; 876 smp_found_config = 1;
877 clustered_apic_check(); 877 setup_apic_routing();
878 } 878 }
879 } 879 }
880 if (error == -EINVAL) { 880 if (error == -EINVAL) {
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 8f7efd38254d..23f78efc577d 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,7 +10,6 @@
10#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
11#include <asm/acpi.h> 11#include <asm/acpi.h>
12#include <asm/apic.h> 12#include <asm/apic.h>
13#include <asm/irq.h>
14 13
15#ifdef CONFIG_ACPI 14#ifdef CONFIG_ACPI
16 15
@@ -48,24 +47,6 @@ static int __init check_bridge(int vendor, int device)
48 return 0; 47 return 0;
49} 48}
50 49
51static void check_intel(void)
52{
53 u16 vendor, device;
54
55 vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
56
57 if (vendor != PCI_VENDOR_ID_INTEL)
58 return;
59
60 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
61#ifdef CONFIG_SMP
62 if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
63 device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
64 device == PCI_DEVICE_ID_INTEL_E7525_MCH)
65 quirk_intel_irqbalance();
66#endif
67}
68
69void __init check_acpi_pci(void) 50void __init check_acpi_pci(void)
70{ 51{
71 int num, slot, func; 52 int num, slot, func;
@@ -77,8 +58,6 @@ void __init check_acpi_pci(void)
77 if (!early_pci_allowed()) 58 if (!early_pci_allowed())
78 return; 59 return;
79 60
80 check_intel();
81
82 /* Poor man's PCI discovery */ 61 /* Poor man's PCI discovery */
83 for (num = 0; num < 32; num++) { 62 for (num = 0; num < 32; num++) {
84 for (slot = 0; slot < 32; slot++) { 63 for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 426f59b0106b..e5cec6685cc5 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -5,6 +5,7 @@
5#include <asm/alternative.h> 5#include <asm/alternative.h>
6#include <asm/sections.h> 6#include <asm/sections.h>
7 7
8static int noreplace_smp = 0;
8static int smp_alt_once = 0; 9static int smp_alt_once = 0;
9static int debug_alternative = 0; 10static int debug_alternative = 0;
10 11
@@ -13,15 +14,33 @@ static int __init bootonly(char *str)
13 smp_alt_once = 1; 14 smp_alt_once = 1;
14 return 1; 15 return 1;
15} 16}
17__setup("smp-alt-boot", bootonly);
18
16static int __init debug_alt(char *str) 19static int __init debug_alt(char *str)
17{ 20{
18 debug_alternative = 1; 21 debug_alternative = 1;
19 return 1; 22 return 1;
20} 23}
21
22__setup("smp-alt-boot", bootonly);
23__setup("debug-alternative", debug_alt); 24__setup("debug-alternative", debug_alt);
24 25
26static int __init setup_noreplace_smp(char *str)
27{
28 noreplace_smp = 1;
29 return 1;
30}
31__setup("noreplace-smp", setup_noreplace_smp);
32
33#ifdef CONFIG_PARAVIRT
34static int noreplace_paravirt = 0;
35
36static int __init setup_noreplace_paravirt(char *str)
37{
38 noreplace_paravirt = 1;
39 return 1;
40}
41__setup("noreplace-paravirt", setup_noreplace_paravirt);
42#endif
43
25#define DPRINTK(fmt, args...) if (debug_alternative) \ 44#define DPRINTK(fmt, args...) if (debug_alternative) \
26 printk(KERN_DEBUG fmt, args) 45 printk(KERN_DEBUG fmt, args)
27 46
@@ -132,11 +151,8 @@ static void nop_out(void *insns, unsigned int len)
132} 151}
133 152
134extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 153extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
135extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
136extern u8 *__smp_locks[], *__smp_locks_end[]; 154extern u8 *__smp_locks[], *__smp_locks_end[];
137 155
138extern u8 __smp_alt_begin[], __smp_alt_end[];
139
140/* Replace instructions with better alternatives for this CPU type. 156/* Replace instructions with better alternatives for this CPU type.
141 This runs before SMP is initialized to avoid SMP problems with 157 This runs before SMP is initialized to avoid SMP problems with
142 self modifying code. This implies that assymetric systems where 158 self modifying code. This implies that assymetric systems where
@@ -171,29 +187,6 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
171 187
172#ifdef CONFIG_SMP 188#ifdef CONFIG_SMP
173 189
174static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end)
175{
176 struct alt_instr *a;
177
178 DPRINTK("%s: alt table %p-%p\n", __FUNCTION__, start, end);
179 for (a = start; a < end; a++) {
180 memcpy(a->replacement + a->replacementlen,
181 a->instr,
182 a->instrlen);
183 }
184}
185
186static void alternatives_smp_apply(struct alt_instr *start, struct alt_instr *end)
187{
188 struct alt_instr *a;
189
190 for (a = start; a < end; a++) {
191 memcpy(a->instr,
192 a->replacement + a->replacementlen,
193 a->instrlen);
194 }
195}
196
197static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) 190static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
198{ 191{
199 u8 **ptr; 192 u8 **ptr;
@@ -211,6 +204,9 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
211{ 204{
212 u8 **ptr; 205 u8 **ptr;
213 206
207 if (noreplace_smp)
208 return;
209
214 for (ptr = start; ptr < end; ptr++) { 210 for (ptr = start; ptr < end; ptr++) {
215 if (*ptr < text) 211 if (*ptr < text)
216 continue; 212 continue;
@@ -245,6 +241,9 @@ void alternatives_smp_module_add(struct module *mod, char *name,
245 struct smp_alt_module *smp; 241 struct smp_alt_module *smp;
246 unsigned long flags; 242 unsigned long flags;
247 243
244 if (noreplace_smp)
245 return;
246
248 if (smp_alt_once) { 247 if (smp_alt_once) {
249 if (boot_cpu_has(X86_FEATURE_UP)) 248 if (boot_cpu_has(X86_FEATURE_UP))
250 alternatives_smp_unlock(locks, locks_end, 249 alternatives_smp_unlock(locks, locks_end,
@@ -279,7 +278,7 @@ void alternatives_smp_module_del(struct module *mod)
279 struct smp_alt_module *item; 278 struct smp_alt_module *item;
280 unsigned long flags; 279 unsigned long flags;
281 280
282 if (smp_alt_once) 281 if (smp_alt_once || noreplace_smp)
283 return; 282 return;
284 283
285 spin_lock_irqsave(&smp_alt, flags); 284 spin_lock_irqsave(&smp_alt, flags);
@@ -310,7 +309,7 @@ void alternatives_smp_switch(int smp)
310 return; 309 return;
311#endif 310#endif
312 311
313 if (smp_alt_once) 312 if (noreplace_smp || smp_alt_once)
314 return; 313 return;
315 BUG_ON(!smp && (num_online_cpus() > 1)); 314 BUG_ON(!smp && (num_online_cpus() > 1));
316 315
@@ -319,8 +318,6 @@ void alternatives_smp_switch(int smp)
319 printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); 318 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
320 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 319 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
321 clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); 320 clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
322 alternatives_smp_apply(__smp_alt_instructions,
323 __smp_alt_instructions_end);
324 list_for_each_entry(mod, &smp_alt_modules, next) 321 list_for_each_entry(mod, &smp_alt_modules, next)
325 alternatives_smp_lock(mod->locks, mod->locks_end, 322 alternatives_smp_lock(mod->locks, mod->locks_end,
326 mod->text, mod->text_end); 323 mod->text, mod->text_end);
@@ -328,8 +325,6 @@ void alternatives_smp_switch(int smp)
328 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 325 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
329 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 326 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
330 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); 327 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
331 apply_alternatives(__smp_alt_instructions,
332 __smp_alt_instructions_end);
333 list_for_each_entry(mod, &smp_alt_modules, next) 328 list_for_each_entry(mod, &smp_alt_modules, next)
334 alternatives_smp_unlock(mod->locks, mod->locks_end, 329 alternatives_smp_unlock(mod->locks, mod->locks_end,
335 mod->text, mod->text_end); 330 mod->text, mod->text_end);
@@ -340,36 +335,31 @@ void alternatives_smp_switch(int smp)
340#endif 335#endif
341 336
342#ifdef CONFIG_PARAVIRT 337#ifdef CONFIG_PARAVIRT
343void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) 338void apply_paravirt(struct paravirt_patch_site *start,
339 struct paravirt_patch_site *end)
344{ 340{
345 struct paravirt_patch *p; 341 struct paravirt_patch_site *p;
342
343 if (noreplace_paravirt)
344 return;
346 345
347 for (p = start; p < end; p++) { 346 for (p = start; p < end; p++) {
348 unsigned int used; 347 unsigned int used;
349 348
350 used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, 349 used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
351 p->len); 350 p->len);
352#ifdef CONFIG_DEBUG_PARAVIRT 351
353 { 352 BUG_ON(used > p->len);
354 int i; 353
355 /* Deliberately clobber regs using "not %reg" to find bugs. */
356 for (i = 0; i < 3; i++) {
357 if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
358 memcpy(p->instr + used, "\xf7\xd0", 2);
359 p->instr[used+1] |= i;
360 used += 2;
361 }
362 }
363 }
364#endif
365 /* Pad the rest with nops */ 354 /* Pad the rest with nops */
366 nop_out(p->instr + used, p->len - used); 355 nop_out(p->instr + used, p->len - used);
367 } 356 }
368 357
369 /* Sync to be conservative, in case we patched following instructions */ 358 /* Sync to be conservative, in case we patched following
359 * instructions */
370 sync_core(); 360 sync_core();
371} 361}
372extern struct paravirt_patch __start_parainstructions[], 362extern struct paravirt_patch_site __start_parainstructions[],
373 __stop_parainstructions[]; 363 __stop_parainstructions[];
374#endif /* CONFIG_PARAVIRT */ 364#endif /* CONFIG_PARAVIRT */
375 365
@@ -396,23 +386,19 @@ void __init alternative_instructions(void)
396 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 386 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
397 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 387 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
398 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); 388 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
399 apply_alternatives(__smp_alt_instructions,
400 __smp_alt_instructions_end);
401 alternatives_smp_unlock(__smp_locks, __smp_locks_end, 389 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
402 _text, _etext); 390 _text, _etext);
403 } 391 }
404 free_init_pages("SMP alternatives", 392 free_init_pages("SMP alternatives",
405 (unsigned long)__smp_alt_begin, 393 __pa_symbol(&__smp_locks),
406 (unsigned long)__smp_alt_end); 394 __pa_symbol(&__smp_locks_end));
407 } else { 395 } else {
408 alternatives_smp_save(__smp_alt_instructions,
409 __smp_alt_instructions_end);
410 alternatives_smp_module_add(NULL, "core kernel", 396 alternatives_smp_module_add(NULL, "core kernel",
411 __smp_locks, __smp_locks_end, 397 __smp_locks, __smp_locks_end,
412 _text, _etext); 398 _text, _etext);
413 alternatives_smp_switch(0); 399 alternatives_smp_switch(0);
414 } 400 }
415#endif 401#endif
416 apply_paravirt(__start_parainstructions, __stop_parainstructions); 402 apply_paravirt(__parainstructions, __parainstructions_end);
417 local_irq_restore(flags); 403 local_irq_restore(flags);
418} 404}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 93aa911646ad..aca054cc0552 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -129,6 +129,28 @@ static int modern_apic(void)
129 return lapic_get_version() >= 0x14; 129 return lapic_get_version() >= 0x14;
130} 130}
131 131
132void apic_wait_icr_idle(void)
133{
134 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
135 cpu_relax();
136}
137
138unsigned long safe_apic_wait_icr_idle(void)
139{
140 unsigned long send_status;
141 int timeout;
142
143 timeout = 0;
144 do {
145 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
146 if (!send_status)
147 break;
148 udelay(100);
149 } while (timeout++ < 1000);
150
151 return send_status;
152}
153
132/** 154/**
133 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
134 */ 156 */
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 064bbf2861f4..367ff1d930cb 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -233,11 +233,10 @@
233#include <asm/desc.h> 233#include <asm/desc.h>
234#include <asm/i8253.h> 234#include <asm/i8253.h>
235#include <asm/paravirt.h> 235#include <asm/paravirt.h>
236#include <asm/reboot.h>
236 237
237#include "io_ports.h" 238#include "io_ports.h"
238 239
239extern void machine_real_restart(unsigned char *, int);
240
241#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) 240#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
242extern int (*console_blank_hook)(int); 241extern int (*console_blank_hook)(int);
243#endif 242#endif
@@ -384,13 +383,6 @@ static int ignore_sys_suspend;
384static int ignore_normal_resume; 383static int ignore_normal_resume;
385static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; 384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
386 385
387#ifdef CONFIG_APM_RTC_IS_GMT
388# define clock_cmos_diff 0
389# define got_clock_diff 1
390#else
391static long clock_cmos_diff;
392static int got_clock_diff;
393#endif
394static int debug __read_mostly; 386static int debug __read_mostly;
395static int smp __read_mostly; 387static int smp __read_mostly;
396static int apm_disabled = -1; 388static int apm_disabled = -1;
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c37535163bfc..27a776c9044d 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -11,11 +11,11 @@
11#include <linux/suspend.h> 11#include <linux/suspend.h>
12#include <asm/ucontext.h> 12#include <asm/ucontext.h>
13#include "sigframe.h" 13#include "sigframe.h"
14#include <asm/pgtable.h>
14#include <asm/fixmap.h> 15#include <asm/fixmap.h>
15#include <asm/processor.h> 16#include <asm/processor.h>
16#include <asm/thread_info.h> 17#include <asm/thread_info.h>
17#include <asm/elf.h> 18#include <asm/elf.h>
18#include <asm/pda.h>
19 19
20#define DEFINE(sym, val) \ 20#define DEFINE(sym, val) \
21 asm volatile("\n->" #sym " %0 " #val : : "i" (val)) 21 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -25,6 +25,9 @@
25#define OFFSET(sym, str, mem) \ 25#define OFFSET(sym, str, mem) \
26 DEFINE(sym, offsetof(struct str, mem)); 26 DEFINE(sym, offsetof(struct str, mem));
27 27
28/* workaround for a warning with -Wmissing-prototypes */
29void foo(void);
30
28void foo(void) 31void foo(void)
29{ 32{
30 OFFSET(SIGCONTEXT_eax, sigcontext, eax); 33 OFFSET(SIGCONTEXT_eax, sigcontext, eax);
@@ -90,17 +93,18 @@ void foo(void)
90 OFFSET(pbe_next, pbe, next); 93 OFFSET(pbe_next, pbe, next);
91 94
92 /* Offset from the sysenter stack to tss.esp0 */ 95 /* Offset from the sysenter stack to tss.esp0 */
93 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - 96 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
94 sizeof(struct tss_struct)); 97 sizeof(struct tss_struct));
95 98
96 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 99 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
97 DEFINE(VDSO_PRELINK, VDSO_PRELINK); 100 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
101 DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
102 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
103 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
98 104
99 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 105 DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
100 106
101 BLANK(); 107 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
102 OFFSET(PDA_cpu, i386_pda, cpu_number);
103 OFFSET(PDA_pcurrent, i386_pda, pcurrent);
104 108
105#ifdef CONFIG_PARAVIRT 109#ifdef CONFIG_PARAVIRT
106 BLANK(); 110 BLANK();
diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile
index 010aecfffbc1..74f27a463db0 100644
--- a/arch/i386/kernel/cpu/Makefile
+++ b/arch/i386/kernel/cpu/Makefile
@@ -2,7 +2,7 @@
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details and quirks
3# 3#
4 4
5obj-y := common.o proc.o 5obj-y := common.o proc.o bugs.o
6 6
7obj-y += amd.o 7obj-y += amd.o
8obj-y += cyrix.o 8obj-y += cyrix.o
@@ -17,3 +17,5 @@ obj-$(CONFIG_X86_MCE) += mcheck/
17 17
18obj-$(CONFIG_MTRR) += mtrr/ 18obj-$(CONFIG_MTRR) += mtrr/
19obj-$(CONFIG_CPU_FREQ) += cpufreq/ 19obj-$(CONFIG_CPU_FREQ) += cpufreq/
20
21obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 2d47db482972..4fec702afd7e 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -53,6 +53,8 @@ static __cpuinit int amd_apic_timer_broken(void)
53 return 0; 53 return 0;
54} 54}
55 55
56int force_mwait __cpuinitdata;
57
56static void __cpuinit init_amd(struct cpuinfo_x86 *c) 58static void __cpuinit init_amd(struct cpuinfo_x86 *c)
57{ 59{
58 u32 l, h; 60 u32 l, h;
@@ -275,6 +277,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
275 277
276 if (amd_apic_timer_broken()) 278 if (amd_apic_timer_broken())
277 set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); 279 set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability);
280
281 if (c->x86 == 0x10 && !force_mwait)
282 clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
278} 283}
279 284
280static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) 285static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
@@ -314,13 +319,3 @@ int __init amd_init_cpu(void)
314 cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev; 319 cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev;
315 return 0; 320 return 0;
316} 321}
317
318//early_arch_initcall(amd_init_cpu);
319
320static int __init amd_exit_cpu(void)
321{
322 cpu_devs[X86_VENDOR_AMD] = NULL;
323 return 0;
324}
325
326late_initcall(amd_exit_cpu);
diff --git a/arch/i386/kernel/cpu/bugs.c b/arch/i386/kernel/cpu/bugs.c
new file mode 100644
index 000000000000..54428a2500f3
--- /dev/null
+++ b/arch/i386/kernel/cpu/bugs.c
@@ -0,0 +1,191 @@
1/*
2 * arch/i386/cpu/bugs.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Cyrix stuff, June 1998 by:
7 * - Rafael R. Reilova (moved everything from head.S),
8 * <rreilova@ececs.uc.edu>
9 * - Channing Corn (tests & fixes),
10 * - Andrew D. Balsa (code cleanup).
11 */
12#include <linux/init.h>
13#include <linux/utsname.h>
14#include <asm/processor.h>
15#include <asm/i387.h>
16#include <asm/msr.h>
17#include <asm/paravirt.h>
18#include <asm/alternative.h>
19
20static int __init no_halt(char *s)
21{
22 boot_cpu_data.hlt_works_ok = 0;
23 return 1;
24}
25
26__setup("no-hlt", no_halt);
27
28static int __init mca_pentium(char *s)
29{
30 mca_pentium_flag = 1;
31 return 1;
32}
33
34__setup("mca-pentium", mca_pentium);
35
36static int __init no_387(char *s)
37{
38 boot_cpu_data.hard_math = 0;
39 write_cr0(0xE | read_cr0());
40 return 1;
41}
42
43__setup("no387", no_387);
44
45static double __initdata x = 4195835.0;
46static double __initdata y = 3145727.0;
47
48/*
49 * This used to check for exceptions..
50 * However, it turns out that to support that,
51 * the XMM trap handlers basically had to
52 * be buggy. So let's have a correct XMM trap
53 * handler, and forget about printing out
54 * some status at boot.
55 *
56 * We should really only care about bugs here
57 * anyway. Not features.
58 */
59static void __init check_fpu(void)
60{
61 if (!boot_cpu_data.hard_math) {
62#ifndef CONFIG_MATH_EMULATION
63 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
64 printk(KERN_EMERG "Giving up.\n");
65 for (;;) ;
66#endif
67 return;
68 }
69
70/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
71 /* Test for the divl bug.. */
72 __asm__("fninit\n\t"
73 "fldl %1\n\t"
74 "fdivl %2\n\t"
75 "fmull %2\n\t"
76 "fldl %1\n\t"
77 "fsubp %%st,%%st(1)\n\t"
78 "fistpl %0\n\t"
79 "fwait\n\t"
80 "fninit"
81 : "=m" (*&boot_cpu_data.fdiv_bug)
82 : "m" (*&x), "m" (*&y));
83 if (boot_cpu_data.fdiv_bug)
84 printk("Hmm, FPU with FDIV bug.\n");
85}
86
87static void __init check_hlt(void)
88{
89 if (paravirt_enabled())
90 return;
91
92 printk(KERN_INFO "Checking 'hlt' instruction... ");
93 if (!boot_cpu_data.hlt_works_ok) {
94 printk("disabled\n");
95 return;
96 }
97 halt();
98 halt();
99 halt();
100 halt();
101 printk("OK.\n");
102}
103
104/*
105 * Most 386 processors have a bug where a POPAD can lock the
106 * machine even from user space.
107 */
108
109static void __init check_popad(void)
110{
111#ifndef CONFIG_X86_POPAD_OK
112 int res, inp = (int) &res;
113
114 printk(KERN_INFO "Checking for popad bug... ");
115 __asm__ __volatile__(
116 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
117 : "=&a" (res)
118 : "d" (inp)
119 : "ecx", "edi" );
120 /* If this fails, it means that any user program may lock the CPU hard. Too bad. */
121 if (res != 12345678) printk( "Buggy.\n" );
122 else printk( "OK.\n" );
123#endif
124}
125
126/*
127 * Check whether we are able to run this kernel safely on SMP.
128 *
129 * - In order to run on a i386, we need to be compiled for i386
130 * (for due to lack of "invlpg" and working WP on a i386)
131 * - In order to run on anything without a TSC, we need to be
132 * compiled for a i486.
133 * - In order to support the local APIC on a buggy Pentium machine,
134 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
135 * which happens implicitly if compiled for a Pentium or lower
136 * (unless an advanced selection of CPU features is used) as an
137 * otherwise config implies a properly working local APIC without
138 * the need to do extra reads from the APIC.
139*/
140
141static void __init check_config(void)
142{
143/*
144 * We'd better not be a i386 if we're configured to use some
145 * i486+ only features! (WP works in supervisor mode and the
146 * new "invlpg" and "bswap" instructions)
147 */
148#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
149 if (boot_cpu_data.x86 == 3)
150 panic("Kernel requires i486+ for 'invlpg' and other features");
151#endif
152
153/*
154 * If we configured ourselves for a TSC, we'd better have one!
155 */
156#ifdef CONFIG_X86_TSC
157 if (!cpu_has_tsc && !tsc_disable)
158 panic("Kernel compiled for Pentium+, requires TSC feature!");
159#endif
160
161/*
162 * If we were told we had a good local APIC, check for buggy Pentia,
163 * i.e. all B steppings and the C2 stepping of P54C when using their
164 * integrated APIC (see 11AP erratum in "Pentium Processor
165 * Specification Update").
166 */
167#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
168 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
169 && cpu_has_apic
170 && boot_cpu_data.x86 == 5
171 && boot_cpu_data.x86_model == 2
172 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
173 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
174#endif
175}
176
177
178void __init check_bugs(void)
179{
180 identify_boot_cpu();
181#ifndef CONFIG_SMP
182 printk("CPU: ");
183 print_cpu_info(&boot_cpu_data);
184#endif
185 check_config();
186 check_fpu();
187 check_hlt();
188 check_popad();
189 init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
190 alternative_instructions();
191}
diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c
index 8c25047975c0..473eac883c7b 100644
--- a/arch/i386/kernel/cpu/centaur.c
+++ b/arch/i386/kernel/cpu/centaur.c
@@ -469,13 +469,3 @@ int __init centaur_init_cpu(void)
469 cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev; 469 cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev;
470 return 0; 470 return 0;
471} 471}
472
473//early_arch_initcall(centaur_init_cpu);
474
475static int __init centaur_exit_cpu(void)
476{
477 cpu_devs[X86_VENDOR_CENTAUR] = NULL;
478 return 0;
479}
480
481late_initcall(centaur_exit_cpu);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index dcbbd0a8bfc2..794d593c47eb 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,15 +18,37 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <mach_apic.h> 19#include <mach_apic.h>
20#endif 20#endif
21#include <asm/pda.h>
22 21
23#include "cpu.h" 22#include "cpu.h"
24 23
25DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); 24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
26EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); 25 [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
26 [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
28 [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
29 /*
30 * Segments used for calling PnP BIOS have byte granularity.
31 * They code segments and data segments have fixed 64k limits,
32 * the transfer segment sizes are set at run time.
33 */
34 [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
35 [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
36 [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
37 [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
38 [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
39 /*
40 * The APM segments have byte granularity and their bases
41 * are set at run time. All have 64k limits.
42 */
43 [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
44 /* 16-bit code */
45 [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
46 [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
27 47
28struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; 48 [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
29EXPORT_SYMBOL(_cpu_pda); 49 [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
50} };
51EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
30 52
31static int cachesize_override __cpuinitdata = -1; 53static int cachesize_override __cpuinitdata = -1;
32static int disable_x86_fxsr __cpuinitdata; 54static int disable_x86_fxsr __cpuinitdata;
@@ -368,7 +390,7 @@ __setup("serialnumber", x86_serial_nr_setup);
368/* 390/*
369 * This does the hard work of actually picking apart the CPU stuff... 391 * This does the hard work of actually picking apart the CPU stuff...
370 */ 392 */
371void __cpuinit identify_cpu(struct cpuinfo_x86 *c) 393static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
372{ 394{
373 int i; 395 int i;
374 396
@@ -479,15 +501,22 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
479 501
480 /* Init Machine Check Exception if available. */ 502 /* Init Machine Check Exception if available. */
481 mcheck_init(c); 503 mcheck_init(c);
504}
482 505
483 if (c == &boot_cpu_data) 506void __init identify_boot_cpu(void)
484 sysenter_setup(); 507{
508 identify_cpu(&boot_cpu_data);
509 sysenter_setup();
485 enable_sep_cpu(); 510 enable_sep_cpu();
511 mtrr_bp_init();
512}
486 513
487 if (c == &boot_cpu_data) 514void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
488 mtrr_bp_init(); 515{
489 else 516 BUG_ON(c == &boot_cpu_data);
490 mtrr_ap_init(); 517 identify_cpu(c);
518 enable_sep_cpu();
519 mtrr_ap_init();
491} 520}
492 521
493#ifdef CONFIG_X86_HT 522#ifdef CONFIG_X86_HT
@@ -601,129 +630,36 @@ void __init early_cpu_init(void)
601#endif 630#endif
602} 631}
603 632
604/* Make sure %gs is initialized properly in idle threads */ 633/* Make sure %fs is initialized properly in idle threads */
605struct pt_regs * __devinit idle_regs(struct pt_regs *regs) 634struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
606{ 635{
607 memset(regs, 0, sizeof(struct pt_regs)); 636 memset(regs, 0, sizeof(struct pt_regs));
608 regs->xfs = __KERNEL_PDA; 637 regs->xfs = __KERNEL_PERCPU;
609 return regs; 638 return regs;
610} 639}
611 640
612static __cpuinit int alloc_gdt(int cpu) 641/* Current gdt points %fs at the "master" per-cpu area: after this,
642 * it's on the real one. */
643void switch_to_new_gdt(void)
613{ 644{
614 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); 645 struct Xgt_desc_struct gdt_descr;
615 struct desc_struct *gdt;
616 struct i386_pda *pda;
617
618 gdt = (struct desc_struct *)cpu_gdt_descr->address;
619 pda = cpu_pda(cpu);
620
621 /*
622 * This is a horrible hack to allocate the GDT. The problem
623 * is that cpu_init() is called really early for the boot CPU
624 * (and hence needs bootmem) but much later for the secondary
625 * CPUs, when bootmem will have gone away
626 */
627 if (NODE_DATA(0)->bdata->node_bootmem_map) {
628 BUG_ON(gdt != NULL || pda != NULL);
629
630 gdt = alloc_bootmem_pages(PAGE_SIZE);
631 pda = alloc_bootmem(sizeof(*pda));
632 /* alloc_bootmem(_pages) panics on failure, so no check */
633
634 memset(gdt, 0, PAGE_SIZE);
635 memset(pda, 0, sizeof(*pda));
636 } else {
637 /* GDT and PDA might already have been allocated if
638 this is a CPU hotplug re-insertion. */
639 if (gdt == NULL)
640 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
641
642 if (pda == NULL)
643 pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
644
645 if (unlikely(!gdt || !pda)) {
646 free_pages((unsigned long)gdt, 0);
647 kfree(pda);
648 return 0;
649 }
650 }
651
652 cpu_gdt_descr->address = (unsigned long)gdt;
653 cpu_pda(cpu) = pda;
654
655 return 1;
656}
657 646
658/* Initial PDA used by boot CPU */ 647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
659struct i386_pda boot_pda = { 648 gdt_descr.size = GDT_SIZE - 1;
660 ._pda = &boot_pda, 649 load_gdt(&gdt_descr);
661 .cpu_number = 0, 650 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
662 .pcurrent = &init_task,
663};
664
665static inline void set_kernel_fs(void)
666{
667 /* Set %fs for this CPU's PDA. Memory clobber is to create a
668 barrier with respect to any PDA operations, so the compiler
669 doesn't move any before here. */
670 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
671} 651}
672 652
673/* Initialize the CPU's GDT and PDA. The boot CPU does this for 653/*
674 itself, but secondaries find this done for them. */ 654 * cpu_init() initializes state that is per-CPU. Some data is already
675__cpuinit int init_gdt(int cpu, struct task_struct *idle) 655 * initialized (naturally) in the bootstrap process, such as the GDT
676{ 656 * and IDT. We reload them nevertheless, this function acts as a
677 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); 657 * 'CPU state barrier', nothing should get across.
678 struct desc_struct *gdt; 658 */
679 struct i386_pda *pda; 659void __cpuinit cpu_init(void)
680
681 /* For non-boot CPUs, the GDT and PDA should already have been
682 allocated. */
683 if (!alloc_gdt(cpu)) {
684 printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
685 return 0;
686 }
687
688 gdt = (struct desc_struct *)cpu_gdt_descr->address;
689 pda = cpu_pda(cpu);
690
691 BUG_ON(gdt == NULL || pda == NULL);
692
693 /*
694 * Initialize the per-CPU GDT with the boot GDT,
695 * and set up the GDT descriptor:
696 */
697 memcpy(gdt, cpu_gdt_table, GDT_SIZE);
698 cpu_gdt_descr->size = GDT_SIZE - 1;
699
700 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
701 (u32 *)&gdt[GDT_ENTRY_PDA].b,
702 (unsigned long)pda, sizeof(*pda) - 1,
703 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
704
705 memset(pda, 0, sizeof(*pda));
706 pda->_pda = pda;
707 pda->cpu_number = cpu;
708 pda->pcurrent = idle;
709
710 return 1;
711}
712
713void __cpuinit cpu_set_gdt(int cpu)
714{
715 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
716
717 /* Reinit these anyway, even if they've already been done (on
718 the boot CPU, this will transition from the boot gdt+pda to
719 the real ones). */
720 load_gdt(cpu_gdt_descr);
721 set_kernel_fs();
722}
723
724/* Common CPU init for both boot and secondary CPUs */
725static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
726{ 660{
661 int cpu = smp_processor_id();
662 struct task_struct *curr = current;
727 struct tss_struct * t = &per_cpu(init_tss, cpu); 663 struct tss_struct * t = &per_cpu(init_tss, cpu);
728 struct thread_struct *thread = &curr->thread; 664 struct thread_struct *thread = &curr->thread;
729 665
@@ -744,6 +680,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
744 } 680 }
745 681
746 load_idt(&idt_descr); 682 load_idt(&idt_descr);
683 switch_to_new_gdt();
747 684
748 /* 685 /*
749 * Set up and load the per-CPU TSS and LDT 686 * Set up and load the per-CPU TSS and LDT
@@ -783,38 +720,6 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
783 mxcsr_feature_mask_init(); 720 mxcsr_feature_mask_init();
784} 721}
785 722
786/* Entrypoint to initialize secondary CPU */
787void __cpuinit secondary_cpu_init(void)
788{
789 int cpu = smp_processor_id();
790 struct task_struct *curr = current;
791
792 _cpu_init(cpu, curr);
793}
794
795/*
796 * cpu_init() initializes state that is per-CPU. Some data is already
797 * initialized (naturally) in the bootstrap process, such as the GDT
798 * and IDT. We reload them nevertheless, this function acts as a
799 * 'CPU state barrier', nothing should get across.
800 */
801void __cpuinit cpu_init(void)
802{
803 int cpu = smp_processor_id();
804 struct task_struct *curr = current;
805
806 /* Set up the real GDT and PDA, so we can transition from the
807 boot versions. */
808 if (!init_gdt(cpu, curr)) {
809 /* failed to allocate something; not much we can do... */
810 for (;;)
811 local_irq_enable();
812 }
813
814 cpu_set_gdt(cpu);
815 _cpu_init(cpu, curr);
816}
817
818#ifdef CONFIG_HOTPLUG_CPU 723#ifdef CONFIG_HOTPLUG_CPU
819void __cpuinit cpu_uninit(void) 724void __cpuinit cpu_uninit(void)
820{ 725{
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index de27bd07bc9c..0b8411a864fb 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
279 */ 279 */
280 if (vendor == PCI_VENDOR_ID_CYRIX && 280 if (vendor == PCI_VENDOR_ID_CYRIX &&
281 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) 281 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
282 pit_latch_buggy = 1; 282 mark_tsc_unstable("cyrix 5510/5520 detected");
283 } 283 }
284#endif 284#endif
285 c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ 285 c->x86_cache_size=16; /* Yep 16K integrated cache thats it */
@@ -448,16 +448,6 @@ int __init cyrix_init_cpu(void)
448 return 0; 448 return 0;
449} 449}
450 450
451//early_arch_initcall(cyrix_init_cpu);
452
453static int __init cyrix_exit_cpu(void)
454{
455 cpu_devs[X86_VENDOR_CYRIX] = NULL;
456 return 0;
457}
458
459late_initcall(cyrix_exit_cpu);
460
461static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 451static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
462 .c_vendor = "NSC", 452 .c_vendor = "NSC",
463 .c_ident = { "Geode by NSC" }, 453 .c_ident = { "Geode by NSC" },
@@ -470,12 +460,3 @@ int __init nsc_init_cpu(void)
470 return 0; 460 return 0;
471} 461}
472 462
473//early_arch_initcall(nsc_init_cpu);
474
475static int __init nsc_exit_cpu(void)
476{
477 cpu_devs[X86_VENDOR_NSC] = NULL;
478 return 0;
479}
480
481late_initcall(nsc_exit_cpu);
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 56fe26584957..dc4e08147b1f 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -188,8 +188,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
188 } 188 }
189#endif 189#endif
190 190
191 if (c->x86 == 15) 191 if (c->x86 == 15) {
192 set_bit(X86_FEATURE_P4, c->x86_capability); 192 set_bit(X86_FEATURE_P4, c->x86_capability);
193 set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
194 }
193 if (c->x86 == 6) 195 if (c->x86 == 6)
194 set_bit(X86_FEATURE_P3, c->x86_capability); 196 set_bit(X86_FEATURE_P3, c->x86_capability);
195 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 197 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c
index b0862af595aa..f9fa4142551e 100644
--- a/arch/i386/kernel/cpu/mcheck/k7.c
+++ b/arch/i386/kernel/cpu/mcheck/k7.c
@@ -75,6 +75,9 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
75 machine_check_vector = k7_machine_check; 75 machine_check_vector = k7_machine_check;
76 wmb(); 76 wmb();
77 77
78 if (!cpu_has(c, X86_FEATURE_MCE))
79 return;
80
78 printk (KERN_INFO "Intel machine check architecture supported.\n"); 81 printk (KERN_INFO "Intel machine check architecture supported.\n");
79 rdmsr (MSR_IA32_MCG_CAP, l, h); 82 rdmsr (MSR_IA32_MCG_CAP, l, h);
80 if (l & (1<<8)) /* Control register present ? */ 83 if (l & (1<<8)) /* Control register present ? */
@@ -82,9 +85,13 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
82 nr_mce_banks = l & 0xff; 85 nr_mce_banks = l & 0xff;
83 86
84 /* Clear status for MC index 0 separately, we don't touch CTL, 87 /* Clear status for MC index 0 separately, we don't touch CTL,
85 * as some Athlons cause spurious MCEs when its enabled. */ 88 * as some K7 Athlons cause spurious MCEs when its enabled. */
86 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); 89 if (boot_cpu_data.x86 == 6) {
87 for (i=1; i<nr_mce_banks; i++) { 90 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
91 i = 1;
92 } else
93 i = 0;
94 for (; i<nr_mce_banks; i++) {
88 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 95 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
89 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 96 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
90 } 97 }
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index 4f10c62d180c..56cd485b127c 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -38,8 +38,7 @@ void mcheck_init(struct cpuinfo_x86 *c)
38 38
39 switch (c->x86_vendor) { 39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD: 40 case X86_VENDOR_AMD:
41 if (c->x86==6 || c->x86==15) 41 amd_mcheck_init(c);
42 amd_mcheck_init(c);
43 break; 42 break;
44 43
45 case X86_VENDOR_INTEL: 44 case X86_VENDOR_INTEL:
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 504434a46011..1509edfb2313 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -124,13 +124,10 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
124 124
125 125
126/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 126/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
127static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 127static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
128{ 128{
129 u32 h; 129 u32 h;
130 130
131 if (mce_num_extended_msrs == 0)
132 goto done;
133
134 rdmsr (MSR_IA32_MCG_EAX, r->eax, h); 131 rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
135 rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); 132 rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
136 rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); 133 rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
@@ -141,12 +138,6 @@ static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
141 rdmsr (MSR_IA32_MCG_ESP, r->esp, h); 138 rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
142 rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); 139 rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
143 rdmsr (MSR_IA32_MCG_EIP, r->eip, h); 140 rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
144
145 /* can we rely on kmalloc to do a dynamic
146 * allocation for the reserved registers?
147 */
148done:
149 return mce_num_extended_msrs;
150} 141}
151 142
152static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) 143static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
@@ -155,7 +146,6 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
155 u32 alow, ahigh, high, low; 146 u32 alow, ahigh, high, low;
156 u32 mcgstl, mcgsth; 147 u32 mcgstl, mcgsth;
157 int i; 148 int i;
158 struct intel_mce_extended_msrs dbg;
159 149
160 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 150 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
161 if (mcgstl & (1<<0)) /* Recoverable ? */ 151 if (mcgstl & (1<<0)) /* Recoverable ? */
@@ -164,7 +154,9 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
164 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 154 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
165 smp_processor_id(), mcgsth, mcgstl); 155 smp_processor_id(), mcgsth, mcgstl);
166 156
167 if (intel_get_extended_msrs(&dbg)) { 157 if (mce_num_extended_msrs > 0) {
158 struct intel_mce_extended_msrs dbg;
159 intel_get_extended_msrs(&dbg);
168 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", 160 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
169 smp_processor_id(), dbg.eip, dbg.eflags); 161 smp_processor_id(), dbg.eip, dbg.eflags);
170 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", 162 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index f77fc53db654..5367e32e0403 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -20,13 +20,25 @@ struct mtrr_state {
20 mtrr_type def_type; 20 mtrr_type def_type;
21}; 21};
22 22
23struct fixed_range_block {
24 int base_msr; /* start address of an MTRR block */
25 int ranges; /* number of MTRRs in this block */
26};
27
28static struct fixed_range_block fixed_range_blocks[] = {
29 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */
30 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */
31 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */
32 {}
33};
34
23static unsigned long smp_changes_mask; 35static unsigned long smp_changes_mask;
24static struct mtrr_state mtrr_state = {}; 36static struct mtrr_state mtrr_state = {};
25 37
26#undef MODULE_PARAM_PREFIX 38#undef MODULE_PARAM_PREFIX
27#define MODULE_PARAM_PREFIX "mtrr." 39#define MODULE_PARAM_PREFIX "mtrr."
28 40
29static __initdata int mtrr_show; 41static int mtrr_show;
30module_param_named(show, mtrr_show, bool, 0); 42module_param_named(show, mtrr_show, bool, 0);
31 43
32/* Get the MSR pair relating to a var range */ 44/* Get the MSR pair relating to a var range */
@@ -37,7 +49,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
37 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); 49 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
38} 50}
39 51
40static void __init 52static void
41get_fixed_ranges(mtrr_type * frs) 53get_fixed_ranges(mtrr_type * frs)
42{ 54{
43 unsigned int *p = (unsigned int *) frs; 55 unsigned int *p = (unsigned int *) frs;
@@ -51,12 +63,18 @@ get_fixed_ranges(mtrr_type * frs)
51 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 63 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
52} 64}
53 65
54static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types) 66void mtrr_save_fixed_ranges(void *info)
67{
68 get_fixed_ranges(mtrr_state.fixed_ranges);
69}
70
71static void __cpuinit print_fixed(unsigned base, unsigned step, const mtrr_type*types)
55{ 72{
56 unsigned i; 73 unsigned i;
57 74
58 for (i = 0; i < 8; ++i, ++types, base += step) 75 for (i = 0; i < 8; ++i, ++types, base += step)
59 printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types)); 76 printk(KERN_INFO "MTRR %05X-%05X %s\n",
77 base, base + step - 1, mtrr_attrib_to_str(*types));
60} 78}
61 79
62/* Grab all of the MTRR state for this CPU into *state */ 80/* Grab all of the MTRR state for this CPU into *state */
@@ -147,6 +165,44 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
147 smp_processor_id(), msr, a, b); 165 smp_processor_id(), msr, a, b);
148} 166}
149 167
168/**
169 * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
170 * see AMD publication no. 24593, chapter 3.2.1 for more information
171 */
172static inline void k8_enable_fixed_iorrs(void)
173{
174 unsigned lo, hi;
175
176 rdmsr(MSR_K8_SYSCFG, lo, hi);
177 mtrr_wrmsr(MSR_K8_SYSCFG, lo
178 | K8_MTRRFIXRANGE_DRAM_ENABLE
179 | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
180}
181
182/**
183 * Checks and updates an fixed-range MTRR if it differs from the value it
184 * should have. If K8 extenstions are wanted, update the K8 SYSCFG MSR also.
185 * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information
186 * \param msr MSR address of the MTTR which should be checked and updated
187 * \param changed pointer which indicates whether the MTRR needed to be changed
188 * \param msrwords pointer to the MSR values which the MSR should have
189 */
190static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
191{
192 unsigned lo, hi;
193
194 rdmsr(msr, lo, hi);
195
196 if (lo != msrwords[0] || hi != msrwords[1]) {
197 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
198 boot_cpu_data.x86 == 15 &&
199 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
200 k8_enable_fixed_iorrs();
201 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
202 *changed = TRUE;
203 }
204}
205
150int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) 206int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
151/* [SUMMARY] Get a free MTRR. 207/* [SUMMARY] Get a free MTRR.
152 <base> The starting (base) address of the region. 208 <base> The starting (base) address of the region.
@@ -196,36 +252,21 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
196 *type = base_lo & 0xff; 252 *type = base_lo & 0xff;
197} 253}
198 254
255/**
256 * Checks and updates the fixed-range MTRRs if they differ from the saved set
257 * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges()
258 */
199static int set_fixed_ranges(mtrr_type * frs) 259static int set_fixed_ranges(mtrr_type * frs)
200{ 260{
201 unsigned int *p = (unsigned int *) frs; 261 unsigned long long *saved = (unsigned long long *) frs;
202 int changed = FALSE; 262 int changed = FALSE;
203 int i; 263 int block=-1, range;
204 unsigned int lo, hi;
205 264
206 rdmsr(MTRRfix64K_00000_MSR, lo, hi); 265 while (fixed_range_blocks[++block].ranges)
207 if (p[0] != lo || p[1] != hi) { 266 for (range=0; range < fixed_range_blocks[block].ranges; range++)
208 mtrr_wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 267 set_fixed_range(fixed_range_blocks[block].base_msr + range,
209 changed = TRUE; 268 &changed, (unsigned int *) saved++);
210 }
211 269
212 for (i = 0; i < 2; i++) {
213 rdmsr(MTRRfix16K_80000_MSR + i, lo, hi);
214 if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) {
215 mtrr_wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2],
216 p[3 + i * 2]);
217 changed = TRUE;
218 }
219 }
220
221 for (i = 0; i < 8; i++) {
222 rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi);
223 if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) {
224 mtrr_wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2],
225 p[7 + i * 2]);
226 changed = TRUE;
227 }
228 }
229 return changed; 270 return changed;
230} 271}
231 272
@@ -428,7 +469,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
428 } 469 }
429 } 470 }
430 471
431 if (base + size < 0x100) { 472 if (base < 0x100) {
432 printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", 473 printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n",
433 base, size); 474 base, size);
434 return -EINVAL; 475 return -EINVAL;
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 0acfb6a5a220..02a2f39e5e0a 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -729,6 +729,17 @@ void mtrr_ap_init(void)
729 local_irq_restore(flags); 729 local_irq_restore(flags);
730} 730}
731 731
732/**
733 * Save current fixed-range MTRR state of the BSP
734 */
735void mtrr_save_state(void)
736{
737 if (smp_processor_id() == 0)
738 mtrr_save_fixed_ranges(NULL);
739 else
740 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1);
741}
742
732static int __init mtrr_init_finialize(void) 743static int __init mtrr_init_finialize(void)
733{ 744{
734 if (!mtrr_if) 745 if (!mtrr_if)
diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c
index 8bf23cc80c63..961fbe1a748f 100644
--- a/arch/i386/kernel/cpu/nexgen.c
+++ b/arch/i386/kernel/cpu/nexgen.c
@@ -58,13 +58,3 @@ int __init nexgen_init_cpu(void)
58 cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev; 58 cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev;
59 return 0; 59 return 0;
60} 60}
61
62//early_arch_initcall(nexgen_init_cpu);
63
64static int __init nexgen_exit_cpu(void)
65{
66 cpu_devs[X86_VENDOR_NEXGEN] = NULL;
67 return 0;
68}
69
70late_initcall(nexgen_exit_cpu);
diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c
new file mode 100644
index 000000000000..2b04c8f1db62
--- /dev/null
+++ b/arch/i386/kernel/cpu/perfctr-watchdog.c
@@ -0,0 +1,658 @@
1/* local apic based NMI watchdog for various CPUs.
2 This file also handles reservation of performance counters for coordination
3 with other users (like oprofile).
4
5 Note that these events normally don't tick when the CPU idles. This means
6 the frequency varies with CPU load.
7
8 Original code for K7/P6 written by Keith Owens */
9
10#include <linux/percpu.h>
11#include <linux/module.h>
12#include <linux/kernel.h>
13#include <linux/bitops.h>
14#include <linux/smp.h>
15#include <linux/nmi.h>
16#include <asm/apic.h>
17#include <asm/intel_arch_perfmon.h>
18
19struct nmi_watchdog_ctlblk {
20 unsigned int cccr_msr;
21 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
22 unsigned int evntsel_msr; /* the MSR to select the events to handle */
23};
24
25/* Interface defining a CPU specific perfctr watchdog */
26struct wd_ops {
27 int (*reserve)(void);
28 void (*unreserve)(void);
29 int (*setup)(unsigned nmi_hz);
30 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
31 void (*stop)(void *);
32 unsigned perfctr;
33 unsigned evntsel;
34 u64 checkbit;
35};
36
37static struct wd_ops *wd_ops;
38
39/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
40 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
41 */
42#define NMI_MAX_COUNTER_BITS 66
43
44/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
45 * evtsel_nmi_owner tracks the ownership of the event selection
46 * - different performance counters/ event selection may be reserved for
47 * different subsystems this reservation system just tries to coordinate
48 * things a little
49 */
50static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
51static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
52
53static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
54
55/* converts an msr to an appropriate reservation bit */
56static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
57{
58 return wd_ops ? msr - wd_ops->perfctr : 0;
59}
60
61/* converts an msr to an appropriate reservation bit */
62/* returns the bit offset of the event selection register */
63static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
64{
65 return wd_ops ? msr - wd_ops->evntsel : 0;
66}
67
68/* checks for a bit availability (hack for oprofile) */
69int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
70{
71 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
72
73 return (!test_bit(counter, perfctr_nmi_owner));
74}
75
76/* checks the an msr for availability */
77int avail_to_resrv_perfctr_nmi(unsigned int msr)
78{
79 unsigned int counter;
80
81 counter = nmi_perfctr_msr_to_bit(msr);
82 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
83
84 return (!test_bit(counter, perfctr_nmi_owner));
85}
86
87int reserve_perfctr_nmi(unsigned int msr)
88{
89 unsigned int counter;
90
91 counter = nmi_perfctr_msr_to_bit(msr);
92 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
93
94 if (!test_and_set_bit(counter, perfctr_nmi_owner))
95 return 1;
96 return 0;
97}
98
99void release_perfctr_nmi(unsigned int msr)
100{
101 unsigned int counter;
102
103 counter = nmi_perfctr_msr_to_bit(msr);
104 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
105
106 clear_bit(counter, perfctr_nmi_owner);
107}
108
109int reserve_evntsel_nmi(unsigned int msr)
110{
111 unsigned int counter;
112
113 counter = nmi_evntsel_msr_to_bit(msr);
114 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
115
116 if (!test_and_set_bit(counter, evntsel_nmi_owner))
117 return 1;
118 return 0;
119}
120
121void release_evntsel_nmi(unsigned int msr)
122{
123 unsigned int counter;
124
125 counter = nmi_evntsel_msr_to_bit(msr);
126 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
127
128 clear_bit(counter, evntsel_nmi_owner);
129}
130
131EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
132EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
133EXPORT_SYMBOL(reserve_perfctr_nmi);
134EXPORT_SYMBOL(release_perfctr_nmi);
135EXPORT_SYMBOL(reserve_evntsel_nmi);
136EXPORT_SYMBOL(release_evntsel_nmi);
137
138void disable_lapic_nmi_watchdog(void)
139{
140 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
141
142 if (atomic_read(&nmi_active) <= 0)
143 return;
144
145 on_each_cpu(wd_ops->stop, NULL, 0, 1);
146 wd_ops->unreserve();
147
148 BUG_ON(atomic_read(&nmi_active) != 0);
149}
150
151void enable_lapic_nmi_watchdog(void)
152{
153 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
154
155 /* are we already enabled */
156 if (atomic_read(&nmi_active) != 0)
157 return;
158
159 /* are we lapic aware */
160 if (!wd_ops)
161 return;
162 if (!wd_ops->reserve()) {
163 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
164 return;
165 }
166
167 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
168 touch_nmi_watchdog();
169}
170
171/*
172 * Activate the NMI watchdog via the local APIC.
173 */
174
175static unsigned int adjust_for_32bit_ctr(unsigned int hz)
176{
177 u64 counter_val;
178 unsigned int retval = hz;
179
180 /*
181 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
182 * are writable, with higher bits sign extending from bit 31.
183 * So, we can only program the counter with 31 bit values and
184 * 32nd bit should be 1, for 33.. to be 1.
185 * Find the appropriate nmi_hz
186 */
187 counter_val = (u64)cpu_khz * 1000;
188 do_div(counter_val, retval);
189 if (counter_val > 0x7fffffffULL) {
190 u64 count = (u64)cpu_khz * 1000;
191 do_div(count, 0x7fffffffUL);
192 retval = count + 1;
193 }
194 return retval;
195}
196
197static void
198write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
199{
200 u64 count = (u64)cpu_khz * 1000;
201
202 do_div(count, nmi_hz);
203 if(descr)
204 Dprintk("setting %s to -0x%08Lx\n", descr, count);
205 wrmsrl(perfctr_msr, 0 - count);
206}
207
208static void write_watchdog_counter32(unsigned int perfctr_msr,
209 const char *descr, unsigned nmi_hz)
210{
211 u64 count = (u64)cpu_khz * 1000;
212
213 do_div(count, nmi_hz);
214 if(descr)
215 Dprintk("setting %s to -0x%08Lx\n", descr, count);
216 wrmsr(perfctr_msr, (u32)(-count), 0);
217}
218
219/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface
220 nicely stable so there is not much variety */
221
222#define K7_EVNTSEL_ENABLE (1 << 22)
223#define K7_EVNTSEL_INT (1 << 20)
224#define K7_EVNTSEL_OS (1 << 17)
225#define K7_EVNTSEL_USR (1 << 16)
226#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
227#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
228
229static int setup_k7_watchdog(unsigned nmi_hz)
230{
231 unsigned int perfctr_msr, evntsel_msr;
232 unsigned int evntsel;
233 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
234
235 perfctr_msr = MSR_K7_PERFCTR0;
236 evntsel_msr = MSR_K7_EVNTSEL0;
237
238 wrmsrl(perfctr_msr, 0UL);
239
240 evntsel = K7_EVNTSEL_INT
241 | K7_EVNTSEL_OS
242 | K7_EVNTSEL_USR
243 | K7_NMI_EVENT;
244
245 /* setup the timer */
246 wrmsr(evntsel_msr, evntsel, 0);
247 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
248 apic_write(APIC_LVTPC, APIC_DM_NMI);
249 evntsel |= K7_EVNTSEL_ENABLE;
250 wrmsr(evntsel_msr, evntsel, 0);
251
252 wd->perfctr_msr = perfctr_msr;
253 wd->evntsel_msr = evntsel_msr;
254 wd->cccr_msr = 0; //unused
255 return 1;
256}
257
258static void single_msr_stop_watchdog(void *arg)
259{
260 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
261
262 wrmsr(wd->evntsel_msr, 0, 0);
263}
264
265static int single_msr_reserve(void)
266{
267 if (!reserve_perfctr_nmi(wd_ops->perfctr))
268 return 0;
269
270 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
271 release_perfctr_nmi(wd_ops->perfctr);
272 return 0;
273 }
274 return 1;
275}
276
277static void single_msr_unreserve(void)
278{
279 release_evntsel_nmi(wd_ops->perfctr);
280 release_perfctr_nmi(wd_ops->evntsel);
281}
282
283static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
284{
285 /* start the cycle over again */
286 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
287}
288
289static struct wd_ops k7_wd_ops = {
290 .reserve = single_msr_reserve,
291 .unreserve = single_msr_unreserve,
292 .setup = setup_k7_watchdog,
293 .rearm = single_msr_rearm,
294 .stop = single_msr_stop_watchdog,
295 .perfctr = MSR_K7_PERFCTR0,
296 .evntsel = MSR_K7_EVNTSEL0,
297 .checkbit = 1ULL<<63,
298};
299
300/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
301
302#define P6_EVNTSEL0_ENABLE (1 << 22)
303#define P6_EVNTSEL_INT (1 << 20)
304#define P6_EVNTSEL_OS (1 << 17)
305#define P6_EVNTSEL_USR (1 << 16)
306#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
307#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
308
309static int setup_p6_watchdog(unsigned nmi_hz)
310{
311 unsigned int perfctr_msr, evntsel_msr;
312 unsigned int evntsel;
313 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
314
315 perfctr_msr = MSR_P6_PERFCTR0;
316 evntsel_msr = MSR_P6_EVNTSEL0;
317
318 wrmsrl(perfctr_msr, 0UL);
319
320 evntsel = P6_EVNTSEL_INT
321 | P6_EVNTSEL_OS
322 | P6_EVNTSEL_USR
323 | P6_NMI_EVENT;
324
325 /* setup the timer */
326 wrmsr(evntsel_msr, evntsel, 0);
327 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
328 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
329 apic_write(APIC_LVTPC, APIC_DM_NMI);
330 evntsel |= P6_EVNTSEL0_ENABLE;
331 wrmsr(evntsel_msr, evntsel, 0);
332
333 wd->perfctr_msr = perfctr_msr;
334 wd->evntsel_msr = evntsel_msr;
335 wd->cccr_msr = 0; //unused
336 return 1;
337}
338
339static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
340{
341 /* P6 based Pentium M need to re-unmask
342 * the apic vector but it doesn't hurt
343 * other P6 variant.
344 * ArchPerfom/Core Duo also needs this */
345 apic_write(APIC_LVTPC, APIC_DM_NMI);
346 /* P6/ARCH_PERFMON has 32 bit counter write */
347 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
348}
349
350static struct wd_ops p6_wd_ops = {
351 .reserve = single_msr_reserve,
352 .unreserve = single_msr_unreserve,
353 .setup = setup_p6_watchdog,
354 .rearm = p6_rearm,
355 .stop = single_msr_stop_watchdog,
356 .perfctr = MSR_P6_PERFCTR0,
357 .evntsel = MSR_P6_EVNTSEL0,
358 .checkbit = 1ULL<<39,
359};
360
361/* Intel P4 performance counters. By far the most complicated of all. */
362
363#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
364#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
365#define P4_ESCR_OS (1<<3)
366#define P4_ESCR_USR (1<<2)
367#define P4_CCCR_OVF_PMI0 (1<<26)
368#define P4_CCCR_OVF_PMI1 (1<<27)
369#define P4_CCCR_THRESHOLD(N) ((N)<<20)
370#define P4_CCCR_COMPLEMENT (1<<19)
371#define P4_CCCR_COMPARE (1<<18)
372#define P4_CCCR_REQUIRED (3<<16)
373#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
374#define P4_CCCR_ENABLE (1<<12)
375#define P4_CCCR_OVF (1<<31)
376
377/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
378 CRU_ESCR0 (with any non-null event selector) through a complemented
379 max threshold. [IA32-Vol3, Section 14.9.9] */
380
381static int setup_p4_watchdog(unsigned nmi_hz)
382{
383 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
384 unsigned int evntsel, cccr_val;
385 unsigned int misc_enable, dummy;
386 unsigned int ht_num;
387 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
388
389 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
390 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
391 return 0;
392
393#ifdef CONFIG_SMP
394 /* detect which hyperthread we are on */
395 if (smp_num_siblings == 2) {
396 unsigned int ebx, apicid;
397
398 ebx = cpuid_ebx(1);
399 apicid = (ebx >> 24) & 0xff;
400 ht_num = apicid & 1;
401 } else
402#endif
403 ht_num = 0;
404
405 /* performance counters are shared resources
406 * assign each hyperthread its own set
407 * (re-use the ESCR0 register, seems safe
408 * and keeps the cccr_val the same)
409 */
410 if (!ht_num) {
411 /* logical cpu 0 */
412 perfctr_msr = MSR_P4_IQ_PERFCTR0;
413 evntsel_msr = MSR_P4_CRU_ESCR0;
414 cccr_msr = MSR_P4_IQ_CCCR0;
415 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
416 } else {
417 /* logical cpu 1 */
418 perfctr_msr = MSR_P4_IQ_PERFCTR1;
419 evntsel_msr = MSR_P4_CRU_ESCR0;
420 cccr_msr = MSR_P4_IQ_CCCR1;
421 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
422 }
423
424 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
425 | P4_ESCR_OS
426 | P4_ESCR_USR;
427
428 cccr_val |= P4_CCCR_THRESHOLD(15)
429 | P4_CCCR_COMPLEMENT
430 | P4_CCCR_COMPARE
431 | P4_CCCR_REQUIRED;
432
433 wrmsr(evntsel_msr, evntsel, 0);
434 wrmsr(cccr_msr, cccr_val, 0);
435 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
436 apic_write(APIC_LVTPC, APIC_DM_NMI);
437 cccr_val |= P4_CCCR_ENABLE;
438 wrmsr(cccr_msr, cccr_val, 0);
439 wd->perfctr_msr = perfctr_msr;
440 wd->evntsel_msr = evntsel_msr;
441 wd->cccr_msr = cccr_msr;
442 return 1;
443}
444
445static void stop_p4_watchdog(void *arg)
446{
447 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
448 wrmsr(wd->cccr_msr, 0, 0);
449 wrmsr(wd->evntsel_msr, 0, 0);
450}
451
452static int p4_reserve(void)
453{
454 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
455 return 0;
456#ifdef CONFIG_SMP
457 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
458 goto fail1;
459#endif
460 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
461 goto fail2;
462 /* RED-PEN why is ESCR1 not reserved here? */
463 return 1;
464 fail2:
465#ifdef CONFIG_SMP
466 if (smp_num_siblings > 1)
467 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
468 fail1:
469#endif
470 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
471 return 0;
472}
473
474static void p4_unreserve(void)
475{
476#ifdef CONFIG_SMP
477 if (smp_num_siblings > 1)
478 release_evntsel_nmi(MSR_P4_IQ_PERFCTR1);
479#endif
480 release_evntsel_nmi(MSR_P4_IQ_PERFCTR0);
481 release_perfctr_nmi(MSR_P4_CRU_ESCR0);
482}
483
484static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
485{
486 unsigned dummy;
487 /*
488 * P4 quirks:
489 * - An overflown perfctr will assert its interrupt
490 * until the OVF flag in its CCCR is cleared.
491 * - LVTPC is masked on interrupt and must be
492 * unmasked by the LVTPC handler.
493 */
494 rdmsrl(wd->cccr_msr, dummy);
495 dummy &= ~P4_CCCR_OVF;
496 wrmsrl(wd->cccr_msr, dummy);
497 apic_write(APIC_LVTPC, APIC_DM_NMI);
498 /* start the cycle over again */
499 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
500}
501
502static struct wd_ops p4_wd_ops = {
503 .reserve = p4_reserve,
504 .unreserve = p4_unreserve,
505 .setup = setup_p4_watchdog,
506 .rearm = p4_rearm,
507 .stop = stop_p4_watchdog,
508 /* RED-PEN this is wrong for the other sibling */
509 .perfctr = MSR_P4_BPU_PERFCTR0,
510 .evntsel = MSR_P4_BSU_ESCR0,
511 .checkbit = 1ULL<<39,
512};
513
514/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully
515 all future Intel CPUs. */
516
517#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
518#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
519
520static int setup_intel_arch_watchdog(unsigned nmi_hz)
521{
522 unsigned int ebx;
523 union cpuid10_eax eax;
524 unsigned int unused;
525 unsigned int perfctr_msr, evntsel_msr;
526 unsigned int evntsel;
527 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
528
529 /*
530 * Check whether the Architectural PerfMon supports
531 * Unhalted Core Cycles Event or not.
532 * NOTE: Corresponding bit = 0 in ebx indicates event present.
533 */
534 cpuid(10, &(eax.full), &ebx, &unused, &unused);
535 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
536 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
537 return 0;
538
539 perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1;
540 evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1;
541
542 wrmsrl(perfctr_msr, 0UL);
543
544 evntsel = ARCH_PERFMON_EVENTSEL_INT
545 | ARCH_PERFMON_EVENTSEL_OS
546 | ARCH_PERFMON_EVENTSEL_USR
547 | ARCH_PERFMON_NMI_EVENT_SEL
548 | ARCH_PERFMON_NMI_EVENT_UMASK;
549
550 /* setup the timer */
551 wrmsr(evntsel_msr, evntsel, 0);
552 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
553 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
554 apic_write(APIC_LVTPC, APIC_DM_NMI);
555 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
556 wrmsr(evntsel_msr, evntsel, 0);
557
558 wd->perfctr_msr = perfctr_msr;
559 wd->evntsel_msr = evntsel_msr;
560 wd->cccr_msr = 0; //unused
561 wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1);
562 return 1;
563}
564
565static struct wd_ops intel_arch_wd_ops = {
566 .reserve = single_msr_reserve,
567 .unreserve = single_msr_unreserve,
568 .setup = setup_intel_arch_watchdog,
569 .rearm = p6_rearm,
570 .stop = single_msr_stop_watchdog,
571 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
572 .evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
573};
574
575static void probe_nmi_watchdog(void)
576{
577 switch (boot_cpu_data.x86_vendor) {
578 case X86_VENDOR_AMD:
579 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
580 boot_cpu_data.x86 != 16)
581 return;
582 wd_ops = &k7_wd_ops;
583 break;
584 case X86_VENDOR_INTEL:
585 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
586 wd_ops = &intel_arch_wd_ops;
587 break;
588 }
589 switch (boot_cpu_data.x86) {
590 case 6:
591 if (boot_cpu_data.x86_model > 0xd)
592 return;
593
594 wd_ops = &p6_wd_ops;
595 break;
596 case 15:
597 if (boot_cpu_data.x86_model > 0x4)
598 return;
599
600 wd_ops = &p4_wd_ops;
601 break;
602 default:
603 return;
604 }
605 break;
606 }
607}
608
609/* Interface to nmi.c */
610
611int lapic_watchdog_init(unsigned nmi_hz)
612{
613 if (!wd_ops) {
614 probe_nmi_watchdog();
615 if (!wd_ops)
616 return -1;
617 }
618
619 if (!(wd_ops->setup(nmi_hz))) {
620 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
621 raw_smp_processor_id());
622 return -1;
623 }
624
625 return 0;
626}
627
628void lapic_watchdog_stop(void)
629{
630 if (wd_ops)
631 wd_ops->stop(NULL);
632}
633
634unsigned lapic_adjust_nmi_hz(unsigned hz)
635{
636 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
637 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
638 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
639 hz = adjust_for_32bit_ctr(hz);
640 return hz;
641}
642
643int lapic_wd_event(unsigned nmi_hz)
644{
645 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
646 u64 ctr;
647 rdmsrl(wd->perfctr_msr, ctr);
648 if (ctr & wd_ops->checkbit) { /* perfctr still running? */
649 return 0;
650 }
651 wd_ops->rearm(wd, nmi_hz);
652 return 1;
653}
654
655int lapic_watchdog_ok(void)
656{
657 return wd_ops != NULL;
658}
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 47e3ebbfb28d..89d91e6cc972 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -72,8 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
72 "stc", 72 "stc",
73 "100mhzsteps", 73 "100mhzsteps",
74 "hwpstate", 74 "hwpstate",
75 NULL, 75 "", /* constant_tsc - moved to flags */
76 NULL, /* constant_tsc - moved to flags */
77 /* nothing */ 76 /* nothing */
78 }; 77 };
79 struct cpuinfo_x86 *c = v; 78 struct cpuinfo_x86 *c = v;
diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c
index 9317f7414989..50076f22e90f 100644
--- a/arch/i386/kernel/cpu/rise.c
+++ b/arch/i386/kernel/cpu/rise.c
@@ -50,12 +50,3 @@ int __init rise_init_cpu(void)
50 return 0; 50 return 0;
51} 51}
52 52
53//early_arch_initcall(rise_init_cpu);
54
55static int __init rise_exit_cpu(void)
56{
57 cpu_devs[X86_VENDOR_RISE] = NULL;
58 return 0;
59}
60
61late_initcall(rise_exit_cpu);
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 5678d46863c6..6471a5a13202 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -112,13 +112,3 @@ int __init transmeta_init_cpu(void)
112 cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev; 112 cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev;
113 return 0; 113 return 0;
114} 114}
115
116//early_arch_initcall(transmeta_init_cpu);
117
118static int __init transmeta_exit_cpu(void)
119{
120 cpu_devs[X86_VENDOR_TRANSMETA] = NULL;
121 return 0;
122}
123
124late_initcall(transmeta_exit_cpu);
diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c
index 1bf3f87e9c5b..a7a4e75bdcd7 100644
--- a/arch/i386/kernel/cpu/umc.c
+++ b/arch/i386/kernel/cpu/umc.c
@@ -24,13 +24,3 @@ int __init umc_init_cpu(void)
24 cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev; 24 cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev;
25 return 0; 25 return 0;
26} 26}
27
28//early_arch_initcall(umc_init_cpu);
29
30static int __init umc_exit_cpu(void)
31{
32 cpu_devs[X86_VENDOR_UMC] = NULL;
33 return 0;
34}
35
36late_initcall(umc_exit_cpu);
diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c
index b4d14c2eb345..265c5597efb0 100644
--- a/arch/i386/kernel/doublefault.c
+++ b/arch/i386/kernel/doublefault.c
@@ -33,7 +33,7 @@ static void doublefault_fn(void)
33 printk("double fault, tss at %08lx\n", tss); 33 printk("double fault, tss at %08lx\n", tss);
34 34
35 if (ptr_ok(tss)) { 35 if (ptr_ok(tss)) {
36 struct tss_struct *t = (struct tss_struct *)tss; 36 struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
37 37
38 printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp); 38 printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp);
39 39
@@ -49,18 +49,21 @@ static void doublefault_fn(void)
49} 49}
50 50
51struct tss_struct doublefault_tss __cacheline_aligned = { 51struct tss_struct doublefault_tss __cacheline_aligned = {
52 .esp0 = STACK_START, 52 .x86_tss = {
53 .ss0 = __KERNEL_DS, 53 .esp0 = STACK_START,
54 .ldt = 0, 54 .ss0 = __KERNEL_DS,
55 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, 55 .ldt = 0,
56 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
56 57
57 .eip = (unsigned long) doublefault_fn, 58 .eip = (unsigned long) doublefault_fn,
58 .eflags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */ 59 /* 0x2 bit is always set */
59 .esp = STACK_START, 60 .eflags = X86_EFLAGS_SF | 0x2,
60 .es = __USER_DS, 61 .esp = STACK_START,
61 .cs = __KERNEL_CS, 62 .es = __USER_DS,
62 .ss = __KERNEL_DS, 63 .cs = __KERNEL_CS,
63 .ds = __USER_DS, 64 .ss = __KERNEL_DS,
65 .ds = __USER_DS,
64 66
65 .__cr3 = __pa(swapper_pg_dir) 67 .__cr3 = __pa(swapper_pg_dir)
68 }
66}; 69};
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index 70f39560846a..9645bb51f76a 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -161,26 +161,27 @@ static struct resource standard_io_resources[] = { {
161 161
162static int __init romsignature(const unsigned char *rom) 162static int __init romsignature(const unsigned char *rom)
163{ 163{
164 const unsigned short * const ptr = (const unsigned short *)rom;
164 unsigned short sig; 165 unsigned short sig;
165 166
166 return probe_kernel_address((const unsigned short *)rom, sig) == 0 && 167 return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
167 sig == ROMSIGNATURE;
168} 168}
169 169
170static int __init romchecksum(unsigned char *rom, unsigned long length) 170static int __init romchecksum(const unsigned char *rom, unsigned long length)
171{ 171{
172 unsigned char sum; 172 unsigned char sum, c;
173 173
174 for (sum = 0; length; length--) 174 for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
175 sum += *rom++; 175 sum += c;
176 return sum == 0; 176 return !length && !sum;
177} 177}
178 178
179static void __init probe_roms(void) 179static void __init probe_roms(void)
180{ 180{
181 const unsigned char *rom;
181 unsigned long start, length, upper; 182 unsigned long start, length, upper;
182 unsigned char *rom; 183 unsigned char c;
183 int i; 184 int i;
184 185
185 /* video rom */ 186 /* video rom */
186 upper = adapter_rom_resources[0].start; 187 upper = adapter_rom_resources[0].start;
@@ -191,8 +192,11 @@ static void __init probe_roms(void)
191 192
192 video_rom_resource.start = start; 193 video_rom_resource.start = start;
193 194
195 if (probe_kernel_address(rom + 2, c) != 0)
196 continue;
197
194 /* 0 < length <= 0x7f * 512, historically */ 198 /* 0 < length <= 0x7f * 512, historically */
195 length = rom[2] * 512; 199 length = c * 512;
196 200
197 /* if checksum okay, trust length byte */ 201 /* if checksum okay, trust length byte */
198 if (length && romchecksum(rom, length)) 202 if (length && romchecksum(rom, length))
@@ -226,8 +230,11 @@ static void __init probe_roms(void)
226 if (!romsignature(rom)) 230 if (!romsignature(rom))
227 continue; 231 continue;
228 232
233 if (probe_kernel_address(rom + 2, c) != 0)
234 continue;
235
229 /* 0 < length <= 0x7f * 512, historically */ 236 /* 0 < length <= 0x7f * 512, historically */
230 length = rom[2] * 512; 237 length = c * 512;
231 238
232 /* but accept any length that fits if checksum okay */ 239 /* but accept any length that fits if checksum okay */
233 if (!length || start + length > upper || !romchecksum(rom, length)) 240 if (!length || start + length > upper || !romchecksum(rom, length))
@@ -386,10 +393,8 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
386 ____________________33__ 393 ____________________33__
387 ______________________4_ 394 ______________________4_
388 */ 395 */
389 printk("sanitize start\n");
390 /* if there's only one memory region, don't bother */ 396 /* if there's only one memory region, don't bother */
391 if (*pnr_map < 2) { 397 if (*pnr_map < 2) {
392 printk("sanitize bail 0\n");
393 return -1; 398 return -1;
394 } 399 }
395 400
@@ -398,7 +403,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
398 /* bail out if we find any unreasonable addresses in bios map */ 403 /* bail out if we find any unreasonable addresses in bios map */
399 for (i=0; i<old_nr; i++) 404 for (i=0; i<old_nr; i++)
400 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) { 405 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
401 printk("sanitize bail 1\n");
402 return -1; 406 return -1;
403 } 407 }
404 408
@@ -494,7 +498,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
494 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); 498 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
495 *pnr_map = new_nr; 499 *pnr_map = new_nr;
496 500
497 printk("sanitize end\n");
498 return 0; 501 return 0;
499} 502}
500 503
@@ -525,7 +528,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
525 unsigned long long size = biosmap->size; 528 unsigned long long size = biosmap->size;
526 unsigned long long end = start + size; 529 unsigned long long end = start + size;
527 unsigned long type = biosmap->type; 530 unsigned long type = biosmap->type;
528 printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
529 531
530 /* Overflow in 64 bits? Ignore the memory map. */ 532 /* Overflow in 64 bits? Ignore the memory map. */
531 if (start > end) 533 if (start > end)
@@ -536,17 +538,11 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
536 * Not right. Fix it up. 538 * Not right. Fix it up.
537 */ 539 */
538 if (type == E820_RAM) { 540 if (type == E820_RAM) {
539 printk("copy_e820_map() type is E820_RAM\n");
540 if (start < 0x100000ULL && end > 0xA0000ULL) { 541 if (start < 0x100000ULL && end > 0xA0000ULL) {
541 printk("copy_e820_map() lies in range...\n"); 542 if (start < 0xA0000ULL)
542 if (start < 0xA0000ULL) {
543 printk("copy_e820_map() start < 0xA0000ULL\n");
544 add_memory_region(start, 0xA0000ULL-start, type); 543 add_memory_region(start, 0xA0000ULL-start, type);
545 } 544 if (end <= 0x100000ULL)
546 if (end <= 0x100000ULL) {
547 printk("copy_e820_map() end <= 0x100000ULL\n");
548 continue; 545 continue;
549 }
550 start = 0x100000ULL; 546 start = 0x100000ULL;
551 size = end - start; 547 size = end - start;
552 } 548 }
@@ -818,6 +814,26 @@ void __init limit_regions(unsigned long long size)
818 print_memory_map("limit_regions endfunc"); 814 print_memory_map("limit_regions endfunc");
819} 815}
820 816
817/*
818 * This function checks if any part of the range <start,end> is mapped
819 * with type.
820 */
821int
822e820_any_mapped(u64 start, u64 end, unsigned type)
823{
824 int i;
825 for (i = 0; i < e820.nr_map; i++) {
826 const struct e820entry *ei = &e820.map[i];
827 if (type && ei->type != type)
828 continue;
829 if (ei->addr >= end || ei->addr + ei->size <= start)
830 continue;
831 return 1;
832 }
833 return 0;
834}
835EXPORT_SYMBOL_GPL(e820_any_mapped);
836
821 /* 837 /*
822 * This function checks if the entire range <start,end> is mapped with type. 838 * This function checks if the entire range <start,end> is mapped with type.
823 * 839 *
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8f9c624ace6f..dd9e7faafa7c 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -69,13 +69,11 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
69{ 69{
70 unsigned long cr4; 70 unsigned long cr4;
71 unsigned long temp; 71 unsigned long temp;
72 struct Xgt_desc_struct *cpu_gdt_descr; 72 struct Xgt_desc_struct gdt_descr;
73 73
74 spin_lock(&efi_rt_lock); 74 spin_lock(&efi_rt_lock);
75 local_irq_save(efi_rt_eflags); 75 local_irq_save(efi_rt_eflags);
76 76
77 cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
78
79 /* 77 /*
80 * If I don't have PSE, I should just duplicate two entries in page 78 * If I don't have PSE, I should just duplicate two entries in page
81 * directory. If I have PSE, I just need to duplicate one entry in 79 * directory. If I have PSE, I just need to duplicate one entry in
@@ -105,17 +103,19 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
105 */ 103 */
106 local_flush_tlb(); 104 local_flush_tlb();
107 105
108 cpu_gdt_descr->address = __pa(cpu_gdt_descr->address); 106 gdt_descr.address = __pa(get_cpu_gdt_table(0));
109 load_gdt(cpu_gdt_descr); 107 gdt_descr.size = GDT_SIZE - 1;
108 load_gdt(&gdt_descr);
110} 109}
111 110
112static void efi_call_phys_epilog(void) __releases(efi_rt_lock) 111static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
113{ 112{
114 unsigned long cr4; 113 unsigned long cr4;
115 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); 114 struct Xgt_desc_struct gdt_descr;
116 115
117 cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address); 116 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
118 load_gdt(cpu_gdt_descr); 117 gdt_descr.size = GDT_SIZE - 1;
118 load_gdt(&gdt_descr);
119 119
120 cr4 = read_cr4(); 120 cr4 = read_cr4();
121 121
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 18bddcb8e9e8..b1f16ee65e4d 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -15,7 +15,7 @@
15 * I changed all the .align's to 4 (16 byte alignment), as that's faster 15 * I changed all the .align's to 4 (16 byte alignment), as that's faster
16 * on a 486. 16 * on a 486.
17 * 17 *
18 * Stack layout in 'ret_from_system_call': 18 * Stack layout in 'syscall_exit':
19 * ptrace needs to have all regs on the stack. 19 * ptrace needs to have all regs on the stack.
20 * if the order here is changed, it needs to be 20 * if the order here is changed, it needs to be
21 * updated in fork.c:copy_process, signal.c:do_signal, 21 * updated in fork.c:copy_process, signal.c:do_signal,
@@ -132,7 +132,7 @@ VM_MASK = 0x00020000
132 movl $(__USER_DS), %edx; \ 132 movl $(__USER_DS), %edx; \
133 movl %edx, %ds; \ 133 movl %edx, %ds; \
134 movl %edx, %es; \ 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \ 135 movl $(__KERNEL_PERCPU), %edx; \
136 movl %edx, %fs 136 movl %edx, %fs
137 137
138#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
@@ -305,16 +305,12 @@ sysenter_past_esp:
305 pushl $(__USER_CS) 305 pushl $(__USER_CS)
306 CFI_ADJUST_CFA_OFFSET 4 306 CFI_ADJUST_CFA_OFFSET 4
307 /*CFI_REL_OFFSET cs, 0*/ 307 /*CFI_REL_OFFSET cs, 0*/
308#ifndef CONFIG_COMPAT_VDSO
309 /* 308 /*
310 * Push current_thread_info()->sysenter_return to the stack. 309 * Push current_thread_info()->sysenter_return to the stack.
311 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 310 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
312 * pushed above; +8 corresponds to copy_thread's esp0 setting. 311 * pushed above; +8 corresponds to copy_thread's esp0 setting.
313 */ 312 */
314 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) 313 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
315#else
316 pushl $SYSENTER_RETURN
317#endif
318 CFI_ADJUST_CFA_OFFSET 4 314 CFI_ADJUST_CFA_OFFSET 4
319 CFI_REL_OFFSET eip, 0 315 CFI_REL_OFFSET eip, 0
320 316
@@ -342,7 +338,7 @@ sysenter_past_esp:
342 jae syscall_badsys 338 jae syscall_badsys
343 call *sys_call_table(,%eax,4) 339 call *sys_call_table(,%eax,4)
344 movl %eax,PT_EAX(%esp) 340 movl %eax,PT_EAX(%esp)
345 DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) 341 DISABLE_INTERRUPTS(CLBR_ANY)
346 TRACE_IRQS_OFF 342 TRACE_IRQS_OFF
347 movl TI_flags(%ebp), %ecx 343 movl TI_flags(%ebp), %ecx
348 testw $_TIF_ALLWORK_MASK, %cx 344 testw $_TIF_ALLWORK_MASK, %cx
@@ -560,9 +556,7 @@ END(syscall_badsys)
560 556
561#define FIXUP_ESPFIX_STACK \ 557#define FIXUP_ESPFIX_STACK \
562 /* since we are on a wrong stack, we cant make it a C code :( */ \ 558 /* since we are on a wrong stack, we cant make it a C code :( */ \
563 movl %fs:PDA_cpu, %ebx; \ 559 PER_CPU(gdt_page, %ebx); \
564 PER_CPU(cpu_gdt_descr, %ebx); \
565 movl GDS_address(%ebx), %ebx; \
566 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ 560 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
567 addl %esp, %eax; \ 561 addl %esp, %eax; \
568 pushl $__KERNEL_DS; \ 562 pushl $__KERNEL_DS; \
@@ -635,7 +629,7 @@ ENTRY(name) \
635 SAVE_ALL; \ 629 SAVE_ALL; \
636 TRACE_IRQS_OFF \ 630 TRACE_IRQS_OFF \
637 movl %esp,%eax; \ 631 movl %esp,%eax; \
638 call smp_/**/name; \ 632 call smp_##name; \
639 jmp ret_from_intr; \ 633 jmp ret_from_intr; \
640 CFI_ENDPROC; \ 634 CFI_ENDPROC; \
641ENDPROC(name) 635ENDPROC(name)
@@ -643,11 +637,6 @@ ENDPROC(name)
643/* The include is where all of the SMP etc. interrupts come from */ 637/* The include is where all of the SMP etc. interrupts come from */
644#include "entry_arch.h" 638#include "entry_arch.h"
645 639
646/* This alternate entry is needed because we hijack the apic LVTT */
647#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
648BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
649#endif
650
651KPROBE_ENTRY(page_fault) 640KPROBE_ENTRY(page_fault)
652 RING0_EC_FRAME 641 RING0_EC_FRAME
653 pushl $do_page_fault 642 pushl $do_page_fault
@@ -686,7 +675,7 @@ error_code:
686 pushl %fs 675 pushl %fs
687 CFI_ADJUST_CFA_OFFSET 4 676 CFI_ADJUST_CFA_OFFSET 4
688 /*CFI_REL_OFFSET fs, 0*/ 677 /*CFI_REL_OFFSET fs, 0*/
689 movl $(__KERNEL_PDA), %ecx 678 movl $(__KERNEL_PERCPU), %ecx
690 movl %ecx, %fs 679 movl %ecx, %fs
691 UNWIND_ESPFIX_STACK 680 UNWIND_ESPFIX_STACK
692 popl %ecx 681 popl %ecx
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 3fa7f9389afe..9b10af65faaa 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -34,17 +34,32 @@
34 34
35/* 35/*
36 * This is how much memory *in addition to the memory covered up to 36 * This is how much memory *in addition to the memory covered up to
37 * and including _end* we need mapped initially. We need one bit for 37 * and including _end* we need mapped initially.
38 * each possible page, but only in low memory, which means 38 * We need:
39 * 2^32/4096/8 = 128K worst case (4G/4G split.) 39 * - one bit for each possible page, but only in low memory, which means
40 * 2^32/4096/8 = 128K worst case (4G/4G split.)
41 * - enough space to map all low memory, which means
42 * (2^32/4096) / 1024 pages (worst case, non PAE)
43 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
44 * - a few pages for allocator use before the kernel pagetable has
45 * been set up
40 * 46 *
41 * Modulo rounding, each megabyte assigned here requires a kilobyte of 47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
42 * memory, which is currently unreclaimed. 48 * memory, which is currently unreclaimed.
43 * 49 *
44 * This should be a multiple of a page. 50 * This should be a multiple of a page.
45 */ 51 */
46#define INIT_MAP_BEYOND_END (128*1024) 52LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
47 53
54#if PTRS_PER_PMD > 1
55PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
56#else
57PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
58#endif
59BOOTBITMAP_SIZE = LOW_PAGES / 8
60ALLOCATOR_SLOP = 4
61
62INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
48 63
49/* 64/*
50 * 32-bit kernel entrypoint; only used by the boot CPU. On entry, 65 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -147,8 +162,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
147/* 162/*
148 * Non-boot CPU entry point; entered from trampoline.S 163 * Non-boot CPU entry point; entered from trampoline.S
149 * We can't lgdt here, because lgdt itself uses a data segment, but 164 * We can't lgdt here, because lgdt itself uses a data segment, but
150 * we know the trampoline has already loaded the boot_gdt_table GDT 165 * we know the trampoline has already loaded the boot_gdt for us.
151 * for us.
152 * 166 *
153 * If cpu hotplug is not supported then this code can go in init section 167 * If cpu hotplug is not supported then this code can go in init section
154 * which will be freed later 168 * which will be freed later
@@ -318,12 +332,12 @@ is386: movl $2,%ecx # set MP
318 movl %eax,%cr0 332 movl %eax,%cr0
319 333
320 call check_x87 334 call check_x87
321 call setup_pda
322 lgdt early_gdt_descr 335 lgdt early_gdt_descr
323 lidt idt_descr 336 lidt idt_descr
324 ljmp $(__KERNEL_CS),$1f 337 ljmp $(__KERNEL_CS),$1f
3251: movl $(__KERNEL_DS),%eax # reload all the segment registers 3381: movl $(__KERNEL_DS),%eax # reload all the segment registers
326 movl %eax,%ss # after changing gdt. 339 movl %eax,%ss # after changing gdt.
340 movl %eax,%fs # gets reset once there's real percpu
327 341
328 movl $(__USER_DS),%eax # DS/ES contains default USER segment 342 movl $(__USER_DS),%eax # DS/ES contains default USER segment
329 movl %eax,%ds 343 movl %eax,%ds
@@ -333,16 +347,17 @@ is386: movl $2,%ecx # set MP
333 movl %eax,%gs 347 movl %eax,%gs
334 lldt %ax 348 lldt %ax
335 349
336 movl $(__KERNEL_PDA),%eax
337 mov %eax,%fs
338
339 cld # gcc2 wants the direction flag cleared at all times 350 cld # gcc2 wants the direction flag cleared at all times
340 pushl $0 # fake return address for unwinder 351 pushl $0 # fake return address for unwinder
341#ifdef CONFIG_SMP 352#ifdef CONFIG_SMP
342 movb ready, %cl 353 movb ready, %cl
343 movb $1, ready 354 movb $1, ready
344 cmpb $0,%cl # the first CPU calls start_kernel 355 cmpb $0,%cl # the first CPU calls start_kernel
345 jne initialize_secondary # all other CPUs call initialize_secondary 356 je 1f
357 movl $(__KERNEL_PERCPU), %eax
358 movl %eax,%fs # set this cpu's percpu
359 jmp initialize_secondary # all other CPUs call initialize_secondary
3601:
346#endif /* CONFIG_SMP */ 361#endif /* CONFIG_SMP */
347 jmp start_kernel 362 jmp start_kernel
348 363
@@ -366,23 +381,6 @@ check_x87:
366 ret 381 ret
367 382
368/* 383/*
369 * Point the GDT at this CPU's PDA. On boot this will be
370 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
371 * that CPU's GDT and PDA.
372 */
373ENTRY(setup_pda)
374 /* get the PDA pointer */
375 movl start_pda, %eax
376
377 /* slot the PDA address into the GDT */
378 mov early_gdt_descr+2, %ecx
379 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
380 shr $16, %eax
381 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
382 mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
383 ret
384
385/*
386 * setup_idt 384 * setup_idt
387 * 385 *
388 * sets up a idt with 256 entries pointing to 386 * sets up a idt with 256 entries pointing to
@@ -554,9 +552,6 @@ ENTRY(empty_zero_page)
554 * This starts the data section. 552 * This starts the data section.
555 */ 553 */
556.data 554.data
557ENTRY(start_pda)
558 .long boot_pda
559
560ENTRY(stack_start) 555ENTRY(stack_start)
561 .long init_thread_union+THREAD_SIZE 556 .long init_thread_union+THREAD_SIZE
562 .long __BOOT_DS 557 .long __BOOT_DS
@@ -588,7 +583,7 @@ fault_msg:
588 .word 0 # 32 bit align gdt_desc.address 583 .word 0 # 32 bit align gdt_desc.address
589boot_gdt_descr: 584boot_gdt_descr:
590 .word __BOOT_DS+7 585 .word __BOOT_DS+7
591 .long boot_gdt_table - __PAGE_OFFSET 586 .long boot_gdt - __PAGE_OFFSET
592 587
593 .word 0 # 32-bit align idt_desc.address 588 .word 0 # 32-bit align idt_desc.address
594idt_descr: 589idt_descr:
@@ -599,67 +594,14 @@ idt_descr:
599 .word 0 # 32 bit align gdt_desc.address 594 .word 0 # 32 bit align gdt_desc.address
600ENTRY(early_gdt_descr) 595ENTRY(early_gdt_descr)
601 .word GDT_ENTRIES*8-1 596 .word GDT_ENTRIES*8-1
602 .long cpu_gdt_table 597 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
603 598
604/* 599/*
605 * The boot_gdt_table must mirror the equivalent in setup.S and is 600 * The boot_gdt must mirror the equivalent in setup.S and is
606 * used only for booting. 601 * used only for booting.
607 */ 602 */
608 .align L1_CACHE_BYTES 603 .align L1_CACHE_BYTES
609ENTRY(boot_gdt_table) 604ENTRY(boot_gdt)
610 .fill GDT_ENTRY_BOOT_CS,8,0 605 .fill GDT_ENTRY_BOOT_CS,8,0
611 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ 606 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
612 .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ 607 .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
613
614/*
615 * The Global Descriptor Table contains 28 quadwords, per-CPU.
616 */
617 .align L1_CACHE_BYTES
618ENTRY(cpu_gdt_table)
619 .quad 0x0000000000000000 /* NULL descriptor */
620 .quad 0x0000000000000000 /* 0x0b reserved */
621 .quad 0x0000000000000000 /* 0x13 reserved */
622 .quad 0x0000000000000000 /* 0x1b reserved */
623 .quad 0x0000000000000000 /* 0x20 unused */
624 .quad 0x0000000000000000 /* 0x28 unused */
625 .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
626 .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
627 .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
628 .quad 0x0000000000000000 /* 0x4b reserved */
629 .quad 0x0000000000000000 /* 0x53 reserved */
630 .quad 0x0000000000000000 /* 0x5b reserved */
631
632 .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
633 .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
634 .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
635 .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
636
637 .quad 0x0000000000000000 /* 0x80 TSS descriptor */
638 .quad 0x0000000000000000 /* 0x88 LDT descriptor */
639
640 /*
641 * Segments used for calling PnP BIOS have byte granularity.
642 * They code segments and data segments have fixed 64k limits,
643 * the transfer segment sizes are set at run time.
644 */
645 .quad 0x00409a000000ffff /* 0x90 32-bit code */
646 .quad 0x00009a000000ffff /* 0x98 16-bit code */
647 .quad 0x000092000000ffff /* 0xa0 16-bit data */
648 .quad 0x0000920000000000 /* 0xa8 16-bit data */
649 .quad 0x0000920000000000 /* 0xb0 16-bit data */
650
651 /*
652 * The APM segments have byte granularity and their bases
653 * are set at run time. All have 64k limits.
654 */
655 .quad 0x00409a000000ffff /* 0xb8 APM CS code */
656 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */
657 .quad 0x004092000000ffff /* 0xc8 APM DS data */
658
659 .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
660 .quad 0x00cf92000000ffff /* 0xd8 - PDA */
661 .quad 0x0000000000000000 /* 0xe0 - unused */
662 .quad 0x0000000000000000 /* 0xe8 - unused */
663 .quad 0x0000000000000000 /* 0xf0 - unused */
664 .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
665
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index 4afe26e86260..e3d4b73bfdb0 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -28,5 +28,3 @@ EXPORT_SYMBOL(__read_lock_failed);
28#endif 28#endif
29 29
30EXPORT_SYMBOL(csum_partial); 30EXPORT_SYMBOL(csum_partial);
31
32EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 89d85d244926..1b623cda3a64 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -35,6 +35,7 @@
35#include <linux/msi.h> 35#include <linux/msi.h>
36#include <linux/htirq.h> 36#include <linux/htirq.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/kthread.h>
38 39
39#include <asm/io.h> 40#include <asm/io.h>
40#include <asm/smp.h> 41#include <asm/smp.h>
@@ -661,8 +662,6 @@ static int balanced_irq(void *unused)
661 unsigned long prev_balance_time = jiffies; 662 unsigned long prev_balance_time = jiffies;
662 long time_remaining = balanced_irq_interval; 663 long time_remaining = balanced_irq_interval;
663 664
664 daemonize("kirqd");
665
666 /* push everything to CPU 0 to give us a starting point. */ 665 /* push everything to CPU 0 to give us a starting point. */
667 for (i = 0 ; i < NR_IRQS ; i++) { 666 for (i = 0 ; i < NR_IRQS ; i++) {
668 irq_desc[i].pending_mask = cpumask_of_cpu(0); 667 irq_desc[i].pending_mask = cpumask_of_cpu(0);
@@ -722,10 +721,9 @@ static int __init balanced_irq_init(void)
722 } 721 }
723 722
724 printk(KERN_INFO "Starting balanced_irq\n"); 723 printk(KERN_INFO "Starting balanced_irq\n");
725 if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) 724 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
726 return 0; 725 return 0;
727 else 726 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
728 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
729failed: 727failed:
730 for_each_possible_cpu(i) { 728 for_each_possible_cpu(i) {
731 kfree(irq_cpu_data[i].irq_delta); 729 kfree(irq_cpu_data[i].irq_delta);
@@ -1403,10 +1401,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
1403 enable_8259A_irq(0); 1401 enable_8259A_irq(0);
1404} 1402}
1405 1403
1406static inline void UNEXPECTED_IO_APIC(void)
1407{
1408}
1409
1410void __init print_IO_APIC(void) 1404void __init print_IO_APIC(void)
1411{ 1405{
1412 int apic, i; 1406 int apic, i;
@@ -1446,34 +1440,12 @@ void __init print_IO_APIC(void)
1446 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1440 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1447 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1441 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1448 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); 1442 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1449 if (reg_00.bits.ID >= get_physical_broadcast())
1450 UNEXPECTED_IO_APIC();
1451 if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1452 UNEXPECTED_IO_APIC();
1453 1443
1454 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); 1444 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1455 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 1445 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1456 if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1457 (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1458 (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1459 (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1460 (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1461 (reg_01.bits.entries != 0x2E) &&
1462 (reg_01.bits.entries != 0x3F)
1463 )
1464 UNEXPECTED_IO_APIC();
1465 1446
1466 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 1447 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1467 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 1448 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1468 if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1469 (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1470 (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1471 (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1472 (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1473 )
1474 UNEXPECTED_IO_APIC();
1475 if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1476 UNEXPECTED_IO_APIC();
1477 1449
1478 /* 1450 /*
1479 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, 1451 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1483,8 +1455,6 @@ void __init print_IO_APIC(void)
1483 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { 1455 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1484 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); 1456 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1485 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); 1457 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1486 if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1487 UNEXPECTED_IO_APIC();
1488 } 1458 }
1489 1459
1490 /* 1460 /*
@@ -1496,8 +1466,6 @@ void __init print_IO_APIC(void)
1496 reg_03.raw != reg_01.raw) { 1466 reg_03.raw != reg_01.raw) {
1497 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); 1467 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1498 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); 1468 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1499 if (reg_03.bits.__reserved_1)
1500 UNEXPECTED_IO_APIC();
1501 } 1469 }
1502 1470
1503 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1471 printk(KERN_DEBUG ".... IRQ redirection table:\n");
diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c
index 498e8bc197d5..d1e42e0dbe67 100644
--- a/arch/i386/kernel/ioport.c
+++ b/arch/i386/kernel/ioport.c
@@ -16,6 +16,7 @@
16#include <linux/stddef.h> 16#include <linux/stddef.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/thread_info.h> 18#include <linux/thread_info.h>
19#include <linux/syscalls.h>
19 20
20/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 21/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
21static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) 22static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
@@ -113,7 +114,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
113 * Reset the owner so that a process switch will not set 114 * Reset the owner so that a process switch will not set
114 * tss->io_bitmap_base to IO_BITMAP_OFFSET. 115 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
115 */ 116 */
116 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; 117 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
117 tss->io_bitmap_owner = NULL; 118 tss->io_bitmap_owner = NULL;
118 119
119 put_cpu(); 120 put_cpu();
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 8db8d514c9c0..d2daf672f4a2 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -24,6 +24,9 @@
24DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; 24DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
25EXPORT_PER_CPU_SYMBOL(irq_stat); 25EXPORT_PER_CPU_SYMBOL(irq_stat);
26 26
27DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs);
29
27/* 30/*
28 * 'what should we do if we get a hw irq event on an illegal vector'. 31 * 'what should we do if we get a hw irq event on an illegal vector'.
29 * each architecture has to answer this themselves. 32 * each architecture has to answer this themselves.
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 4f5983c98669..0952eccd8f28 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
477 } 477 }
478 ++mpc_record; 478 ++mpc_record;
479 } 479 }
480 clustered_apic_check(); 480 setup_apic_routing();
481 if (!num_processors) 481 if (!num_processors)
482 printk(KERN_ERR "SMP mptable: no processors registered!\n"); 482 printk(KERN_ERR "SMP mptable: no processors registered!\n");
483 return num_processors; 483 return num_processors;
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 84c3497efb60..33cf2f3c444f 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -20,7 +20,6 @@
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/sysctl.h> 21#include <linux/sysctl.h>
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/dmi.h>
24#include <linux/kprobes.h> 23#include <linux/kprobes.h>
25#include <linux/cpumask.h> 24#include <linux/cpumask.h>
26#include <linux/kernel_stat.h> 25#include <linux/kernel_stat.h>
@@ -28,30 +27,14 @@
28#include <asm/smp.h> 27#include <asm/smp.h>
29#include <asm/nmi.h> 28#include <asm/nmi.h>
30#include <asm/kdebug.h> 29#include <asm/kdebug.h>
31#include <asm/intel_arch_perfmon.h>
32 30
33#include "mach_traps.h" 31#include "mach_traps.h"
34 32
35int unknown_nmi_panic; 33int unknown_nmi_panic;
36int nmi_watchdog_enabled; 34int nmi_watchdog_enabled;
37 35
38/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
39 * evtsel_nmi_owner tracks the ownership of the event selection
40 * - different performance counters/ event selection may be reserved for
41 * different subsystems this reservation system just tries to coordinate
42 * things a little
43 */
44
45/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
46 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
47 */
48#define NMI_MAX_COUNTER_BITS 66
49#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS)
50
51static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]);
52static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]);
53
54static cpumask_t backtrace_mask = CPU_MASK_NONE; 36static cpumask_t backtrace_mask = CPU_MASK_NONE;
37
55/* nmi_active: 38/* nmi_active:
56 * >0: the lapic NMI watchdog is active, but can be disabled 39 * >0: the lapic NMI watchdog is active, but can be disabled
57 * <0: the lapic NMI watchdog has not been set up, and cannot 40 * <0: the lapic NMI watchdog has not been set up, and cannot
@@ -63,206 +46,11 @@ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
63unsigned int nmi_watchdog = NMI_DEFAULT; 46unsigned int nmi_watchdog = NMI_DEFAULT;
64static unsigned int nmi_hz = HZ; 47static unsigned int nmi_hz = HZ;
65 48
66struct nmi_watchdog_ctlblk { 49static DEFINE_PER_CPU(short, wd_enabled);
67 int enabled;
68 u64 check_bit;
69 unsigned int cccr_msr;
70 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
71 unsigned int evntsel_msr; /* the MSR to select the events to handle */
72};
73static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
74 50
75/* local prototypes */ 51/* local prototypes */
76static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); 52static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
77 53
78extern void show_registers(struct pt_regs *regs);
79extern int unknown_nmi_panic;
80
81/* converts an msr to an appropriate reservation bit */
82static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
83{
84 /* returns the bit offset of the performance counter register */
85 switch (boot_cpu_data.x86_vendor) {
86 case X86_VENDOR_AMD:
87 return (msr - MSR_K7_PERFCTR0);
88 case X86_VENDOR_INTEL:
89 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
90 return (msr - MSR_ARCH_PERFMON_PERFCTR0);
91
92 switch (boot_cpu_data.x86) {
93 case 6:
94 return (msr - MSR_P6_PERFCTR0);
95 case 15:
96 return (msr - MSR_P4_BPU_PERFCTR0);
97 }
98 }
99 return 0;
100}
101
102/* converts an msr to an appropriate reservation bit */
103static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
104{
105 /* returns the bit offset of the event selection register */
106 switch (boot_cpu_data.x86_vendor) {
107 case X86_VENDOR_AMD:
108 return (msr - MSR_K7_EVNTSEL0);
109 case X86_VENDOR_INTEL:
110 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
111 return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
112
113 switch (boot_cpu_data.x86) {
114 case 6:
115 return (msr - MSR_P6_EVNTSEL0);
116 case 15:
117 return (msr - MSR_P4_BSU_ESCR0);
118 }
119 }
120 return 0;
121}
122
123/* checks for a bit availability (hack for oprofile) */
124int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
125{
126 int cpu;
127 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
128 for_each_possible_cpu (cpu) {
129 if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
130 return 0;
131 }
132 return 1;
133}
134
135/* checks the an msr for availability */
136int avail_to_resrv_perfctr_nmi(unsigned int msr)
137{
138 unsigned int counter;
139 int cpu;
140
141 counter = nmi_perfctr_msr_to_bit(msr);
142 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
143
144 for_each_possible_cpu (cpu) {
145 if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
146 return 0;
147 }
148 return 1;
149}
150
151static int __reserve_perfctr_nmi(int cpu, unsigned int msr)
152{
153 unsigned int counter;
154 if (cpu < 0)
155 cpu = smp_processor_id();
156
157 counter = nmi_perfctr_msr_to_bit(msr);
158 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
159
160 if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
161 return 1;
162 return 0;
163}
164
165static void __release_perfctr_nmi(int cpu, unsigned int msr)
166{
167 unsigned int counter;
168 if (cpu < 0)
169 cpu = smp_processor_id();
170
171 counter = nmi_perfctr_msr_to_bit(msr);
172 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
173
174 clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]);
175}
176
177int reserve_perfctr_nmi(unsigned int msr)
178{
179 int cpu, i;
180 for_each_possible_cpu (cpu) {
181 if (!__reserve_perfctr_nmi(cpu, msr)) {
182 for_each_possible_cpu (i) {
183 if (i >= cpu)
184 break;
185 __release_perfctr_nmi(i, msr);
186 }
187 return 0;
188 }
189 }
190 return 1;
191}
192
193void release_perfctr_nmi(unsigned int msr)
194{
195 int cpu;
196 for_each_possible_cpu (cpu) {
197 __release_perfctr_nmi(cpu, msr);
198 }
199}
200
201int __reserve_evntsel_nmi(int cpu, unsigned int msr)
202{
203 unsigned int counter;
204 if (cpu < 0)
205 cpu = smp_processor_id();
206
207 counter = nmi_evntsel_msr_to_bit(msr);
208 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
209
210 if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]))
211 return 1;
212 return 0;
213}
214
215static void __release_evntsel_nmi(int cpu, unsigned int msr)
216{
217 unsigned int counter;
218 if (cpu < 0)
219 cpu = smp_processor_id();
220
221 counter = nmi_evntsel_msr_to_bit(msr);
222 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
223
224 clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]);
225}
226
227int reserve_evntsel_nmi(unsigned int msr)
228{
229 int cpu, i;
230 for_each_possible_cpu (cpu) {
231 if (!__reserve_evntsel_nmi(cpu, msr)) {
232 for_each_possible_cpu (i) {
233 if (i >= cpu)
234 break;
235 __release_evntsel_nmi(i, msr);
236 }
237 return 0;
238 }
239 }
240 return 1;
241}
242
243void release_evntsel_nmi(unsigned int msr)
244{
245 int cpu;
246 for_each_possible_cpu (cpu) {
247 __release_evntsel_nmi(cpu, msr);
248 }
249}
250
251static __cpuinit inline int nmi_known_cpu(void)
252{
253 switch (boot_cpu_data.x86_vendor) {
254 case X86_VENDOR_AMD:
255 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
256 || (boot_cpu_data.x86 == 16));
257 case X86_VENDOR_INTEL:
258 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
259 return 1;
260 else
261 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
262 }
263 return 0;
264}
265
266static int endflag __initdata = 0; 54static int endflag __initdata = 0;
267 55
268#ifdef CONFIG_SMP 56#ifdef CONFIG_SMP
@@ -284,28 +72,6 @@ static __init void nmi_cpu_busy(void *data)
284} 72}
285#endif 73#endif
286 74
287static unsigned int adjust_for_32bit_ctr(unsigned int hz)
288{
289 u64 counter_val;
290 unsigned int retval = hz;
291
292 /*
293 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
294 * are writable, with higher bits sign extending from bit 31.
295 * So, we can only program the counter with 31 bit values and
296 * 32nd bit should be 1, for 33.. to be 1.
297 * Find the appropriate nmi_hz
298 */
299 counter_val = (u64)cpu_khz * 1000;
300 do_div(counter_val, retval);
301 if (counter_val > 0x7fffffffULL) {
302 u64 count = (u64)cpu_khz * 1000;
303 do_div(count, 0x7fffffffUL);
304 retval = count + 1;
305 }
306 return retval;
307}
308
309static int __init check_nmi_watchdog(void) 75static int __init check_nmi_watchdog(void)
310{ 76{
311 unsigned int *prev_nmi_count; 77 unsigned int *prev_nmi_count;
@@ -338,14 +104,14 @@ static int __init check_nmi_watchdog(void)
338 if (!cpu_isset(cpu, cpu_callin_map)) 104 if (!cpu_isset(cpu, cpu_callin_map))
339 continue; 105 continue;
340#endif 106#endif
341 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) 107 if (!per_cpu(wd_enabled, cpu))
342 continue; 108 continue;
343 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { 109 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
344 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", 110 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
345 cpu, 111 cpu,
346 prev_nmi_count[cpu], 112 prev_nmi_count[cpu],
347 nmi_count(cpu)); 113 nmi_count(cpu));
348 per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; 114 per_cpu(wd_enabled, cpu) = 0;
349 atomic_dec(&nmi_active); 115 atomic_dec(&nmi_active);
350 } 116 }
351 } 117 }
@@ -359,16 +125,8 @@ static int __init check_nmi_watchdog(void)
359 125
360 /* now that we know it works we can reduce NMI frequency to 126 /* now that we know it works we can reduce NMI frequency to
361 something more reasonable; makes a difference in some configs */ 127 something more reasonable; makes a difference in some configs */
362 if (nmi_watchdog == NMI_LOCAL_APIC) { 128 if (nmi_watchdog == NMI_LOCAL_APIC)
363 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 129 nmi_hz = lapic_adjust_nmi_hz(1);
364
365 nmi_hz = 1;
366
367 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
368 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
369 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
370 }
371 }
372 130
373 kfree(prev_nmi_count); 131 kfree(prev_nmi_count);
374 return 0; 132 return 0;
@@ -391,85 +149,8 @@ static int __init setup_nmi_watchdog(char *str)
391 149
392__setup("nmi_watchdog=", setup_nmi_watchdog); 150__setup("nmi_watchdog=", setup_nmi_watchdog);
393 151
394static void disable_lapic_nmi_watchdog(void)
395{
396 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
397
398 if (atomic_read(&nmi_active) <= 0)
399 return;
400
401 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
402
403 BUG_ON(atomic_read(&nmi_active) != 0);
404}
405
406static void enable_lapic_nmi_watchdog(void)
407{
408 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
409
410 /* are we already enabled */
411 if (atomic_read(&nmi_active) != 0)
412 return;
413
414 /* are we lapic aware */
415 if (nmi_known_cpu() <= 0)
416 return;
417 152
418 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); 153/* Suspend/resume support */
419 touch_nmi_watchdog();
420}
421
422void disable_timer_nmi_watchdog(void)
423{
424 BUG_ON(nmi_watchdog != NMI_IO_APIC);
425
426 if (atomic_read(&nmi_active) <= 0)
427 return;
428
429 disable_irq(0);
430 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
431
432 BUG_ON(atomic_read(&nmi_active) != 0);
433}
434
435void enable_timer_nmi_watchdog(void)
436{
437 BUG_ON(nmi_watchdog != NMI_IO_APIC);
438
439 if (atomic_read(&nmi_active) == 0) {
440 touch_nmi_watchdog();
441 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
442 enable_irq(0);
443 }
444}
445
446static void __acpi_nmi_disable(void *__unused)
447{
448 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
449}
450
451/*
452 * Disable timer based NMIs on all CPUs:
453 */
454void acpi_nmi_disable(void)
455{
456 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
457 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
458}
459
460static void __acpi_nmi_enable(void *__unused)
461{
462 apic_write_around(APIC_LVT0, APIC_DM_NMI);
463}
464
465/*
466 * Enable timer based NMIs on all CPUs:
467 */
468void acpi_nmi_enable(void)
469{
470 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
471 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
472}
473 154
474#ifdef CONFIG_PM 155#ifdef CONFIG_PM
475 156
@@ -516,7 +197,7 @@ static int __init init_lapic_nmi_sysfs(void)
516 if (nmi_watchdog != NMI_LOCAL_APIC) 197 if (nmi_watchdog != NMI_LOCAL_APIC)
517 return 0; 198 return 0;
518 199
519 if ( atomic_read(&nmi_active) < 0 ) 200 if (atomic_read(&nmi_active) < 0)
520 return 0; 201 return 0;
521 202
522 error = sysdev_class_register(&nmi_sysclass); 203 error = sysdev_class_register(&nmi_sysclass);
@@ -529,433 +210,69 @@ late_initcall(init_lapic_nmi_sysfs);
529 210
530#endif /* CONFIG_PM */ 211#endif /* CONFIG_PM */
531 212
532/* 213static void __acpi_nmi_enable(void *__unused)
533 * Activate the NMI watchdog via the local APIC.
534 * Original code written by Keith Owens.
535 */
536
537static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
538{
539 u64 count = (u64)cpu_khz * 1000;
540
541 do_div(count, nmi_hz);
542 if(descr)
543 Dprintk("setting %s to -0x%08Lx\n", descr, count);
544 wrmsrl(perfctr_msr, 0 - count);
545}
546
547static void write_watchdog_counter32(unsigned int perfctr_msr,
548 const char *descr)
549{
550 u64 count = (u64)cpu_khz * 1000;
551
552 do_div(count, nmi_hz);
553 if(descr)
554 Dprintk("setting %s to -0x%08Lx\n", descr, count);
555 wrmsr(perfctr_msr, (u32)(-count), 0);
556}
557
558/* Note that these events don't tick when the CPU idles. This means
559 the frequency varies with CPU load. */
560
561#define K7_EVNTSEL_ENABLE (1 << 22)
562#define K7_EVNTSEL_INT (1 << 20)
563#define K7_EVNTSEL_OS (1 << 17)
564#define K7_EVNTSEL_USR (1 << 16)
565#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
566#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
567
568static int setup_k7_watchdog(void)
569{
570 unsigned int perfctr_msr, evntsel_msr;
571 unsigned int evntsel;
572 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
573
574 perfctr_msr = MSR_K7_PERFCTR0;
575 evntsel_msr = MSR_K7_EVNTSEL0;
576 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
577 goto fail;
578
579 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
580 goto fail1;
581
582 wrmsrl(perfctr_msr, 0UL);
583
584 evntsel = K7_EVNTSEL_INT
585 | K7_EVNTSEL_OS
586 | K7_EVNTSEL_USR
587 | K7_NMI_EVENT;
588
589 /* setup the timer */
590 wrmsr(evntsel_msr, evntsel, 0);
591 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
592 apic_write(APIC_LVTPC, APIC_DM_NMI);
593 evntsel |= K7_EVNTSEL_ENABLE;
594 wrmsr(evntsel_msr, evntsel, 0);
595
596 wd->perfctr_msr = perfctr_msr;
597 wd->evntsel_msr = evntsel_msr;
598 wd->cccr_msr = 0; //unused
599 wd->check_bit = 1ULL<<63;
600 return 1;
601fail1:
602 __release_perfctr_nmi(-1, perfctr_msr);
603fail:
604 return 0;
605}
606
607static void stop_k7_watchdog(void)
608{
609 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
610
611 wrmsr(wd->evntsel_msr, 0, 0);
612
613 __release_evntsel_nmi(-1, wd->evntsel_msr);
614 __release_perfctr_nmi(-1, wd->perfctr_msr);
615}
616
617#define P6_EVNTSEL0_ENABLE (1 << 22)
618#define P6_EVNTSEL_INT (1 << 20)
619#define P6_EVNTSEL_OS (1 << 17)
620#define P6_EVNTSEL_USR (1 << 16)
621#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
622#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
623
624static int setup_p6_watchdog(void)
625{
626 unsigned int perfctr_msr, evntsel_msr;
627 unsigned int evntsel;
628 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
629
630 perfctr_msr = MSR_P6_PERFCTR0;
631 evntsel_msr = MSR_P6_EVNTSEL0;
632 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
633 goto fail;
634
635 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
636 goto fail1;
637
638 wrmsrl(perfctr_msr, 0UL);
639
640 evntsel = P6_EVNTSEL_INT
641 | P6_EVNTSEL_OS
642 | P6_EVNTSEL_USR
643 | P6_NMI_EVENT;
644
645 /* setup the timer */
646 wrmsr(evntsel_msr, evntsel, 0);
647 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
648 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
649 apic_write(APIC_LVTPC, APIC_DM_NMI);
650 evntsel |= P6_EVNTSEL0_ENABLE;
651 wrmsr(evntsel_msr, evntsel, 0);
652
653 wd->perfctr_msr = perfctr_msr;
654 wd->evntsel_msr = evntsel_msr;
655 wd->cccr_msr = 0; //unused
656 wd->check_bit = 1ULL<<39;
657 return 1;
658fail1:
659 __release_perfctr_nmi(-1, perfctr_msr);
660fail:
661 return 0;
662}
663
664static void stop_p6_watchdog(void)
665{
666 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
667
668 wrmsr(wd->evntsel_msr, 0, 0);
669
670 __release_evntsel_nmi(-1, wd->evntsel_msr);
671 __release_perfctr_nmi(-1, wd->perfctr_msr);
672}
673
674/* Note that these events don't tick when the CPU idles. This means
675 the frequency varies with CPU load. */
676
677#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
678#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
679#define P4_ESCR_OS (1<<3)
680#define P4_ESCR_USR (1<<2)
681#define P4_CCCR_OVF_PMI0 (1<<26)
682#define P4_CCCR_OVF_PMI1 (1<<27)
683#define P4_CCCR_THRESHOLD(N) ((N)<<20)
684#define P4_CCCR_COMPLEMENT (1<<19)
685#define P4_CCCR_COMPARE (1<<18)
686#define P4_CCCR_REQUIRED (3<<16)
687#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
688#define P4_CCCR_ENABLE (1<<12)
689#define P4_CCCR_OVF (1<<31)
690/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
691 CRU_ESCR0 (with any non-null event selector) through a complemented
692 max threshold. [IA32-Vol3, Section 14.9.9] */
693
694static int setup_p4_watchdog(void)
695{ 214{
696 unsigned int perfctr_msr, evntsel_msr, cccr_msr; 215 apic_write_around(APIC_LVT0, APIC_DM_NMI);
697 unsigned int evntsel, cccr_val;
698 unsigned int misc_enable, dummy;
699 unsigned int ht_num;
700 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
701
702 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
703 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
704 return 0;
705
706#ifdef CONFIG_SMP
707 /* detect which hyperthread we are on */
708 if (smp_num_siblings == 2) {
709 unsigned int ebx, apicid;
710
711 ebx = cpuid_ebx(1);
712 apicid = (ebx >> 24) & 0xff;
713 ht_num = apicid & 1;
714 } else
715#endif
716 ht_num = 0;
717
718 /* performance counters are shared resources
719 * assign each hyperthread its own set
720 * (re-use the ESCR0 register, seems safe
721 * and keeps the cccr_val the same)
722 */
723 if (!ht_num) {
724 /* logical cpu 0 */
725 perfctr_msr = MSR_P4_IQ_PERFCTR0;
726 evntsel_msr = MSR_P4_CRU_ESCR0;
727 cccr_msr = MSR_P4_IQ_CCCR0;
728 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
729 } else {
730 /* logical cpu 1 */
731 perfctr_msr = MSR_P4_IQ_PERFCTR1;
732 evntsel_msr = MSR_P4_CRU_ESCR0;
733 cccr_msr = MSR_P4_IQ_CCCR1;
734 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
735 }
736
737 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
738 goto fail;
739
740 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
741 goto fail1;
742
743 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
744 | P4_ESCR_OS
745 | P4_ESCR_USR;
746
747 cccr_val |= P4_CCCR_THRESHOLD(15)
748 | P4_CCCR_COMPLEMENT
749 | P4_CCCR_COMPARE
750 | P4_CCCR_REQUIRED;
751
752 wrmsr(evntsel_msr, evntsel, 0);
753 wrmsr(cccr_msr, cccr_val, 0);
754 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
755 apic_write(APIC_LVTPC, APIC_DM_NMI);
756 cccr_val |= P4_CCCR_ENABLE;
757 wrmsr(cccr_msr, cccr_val, 0);
758 wd->perfctr_msr = perfctr_msr;
759 wd->evntsel_msr = evntsel_msr;
760 wd->cccr_msr = cccr_msr;
761 wd->check_bit = 1ULL<<39;
762 return 1;
763fail1:
764 __release_perfctr_nmi(-1, perfctr_msr);
765fail:
766 return 0;
767} 216}
768 217
769static void stop_p4_watchdog(void) 218/*
219 * Enable timer based NMIs on all CPUs:
220 */
221void acpi_nmi_enable(void)
770{ 222{
771 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 223 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
772 224 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
773 wrmsr(wd->cccr_msr, 0, 0);
774 wrmsr(wd->evntsel_msr, 0, 0);
775
776 __release_evntsel_nmi(-1, wd->evntsel_msr);
777 __release_perfctr_nmi(-1, wd->perfctr_msr);
778} 225}
779 226
780#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 227static void __acpi_nmi_disable(void *__unused)
781#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
782
783static int setup_intel_arch_watchdog(void)
784{ 228{
785 unsigned int ebx; 229 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
786 union cpuid10_eax eax;
787 unsigned int unused;
788 unsigned int perfctr_msr, evntsel_msr;
789 unsigned int evntsel;
790 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
791
792 /*
793 * Check whether the Architectural PerfMon supports
794 * Unhalted Core Cycles Event or not.
795 * NOTE: Corresponding bit = 0 in ebx indicates event present.
796 */
797 cpuid(10, &(eax.full), &ebx, &unused, &unused);
798 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
799 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
800 goto fail;
801
802 perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
803 evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
804
805 if (!__reserve_perfctr_nmi(-1, perfctr_msr))
806 goto fail;
807
808 if (!__reserve_evntsel_nmi(-1, evntsel_msr))
809 goto fail1;
810
811 wrmsrl(perfctr_msr, 0UL);
812
813 evntsel = ARCH_PERFMON_EVENTSEL_INT
814 | ARCH_PERFMON_EVENTSEL_OS
815 | ARCH_PERFMON_EVENTSEL_USR
816 | ARCH_PERFMON_NMI_EVENT_SEL
817 | ARCH_PERFMON_NMI_EVENT_UMASK;
818
819 /* setup the timer */
820 wrmsr(evntsel_msr, evntsel, 0);
821 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
822 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
823 apic_write(APIC_LVTPC, APIC_DM_NMI);
824 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
825 wrmsr(evntsel_msr, evntsel, 0);
826
827 wd->perfctr_msr = perfctr_msr;
828 wd->evntsel_msr = evntsel_msr;
829 wd->cccr_msr = 0; //unused
830 wd->check_bit = 1ULL << (eax.split.bit_width - 1);
831 return 1;
832fail1:
833 __release_perfctr_nmi(-1, perfctr_msr);
834fail:
835 return 0;
836} 230}
837 231
838static void stop_intel_arch_watchdog(void) 232/*
233 * Disable timer based NMIs on all CPUs:
234 */
235void acpi_nmi_disable(void)
839{ 236{
840 unsigned int ebx; 237 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
841 union cpuid10_eax eax; 238 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
842 unsigned int unused;
843 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
844
845 /*
846 * Check whether the Architectural PerfMon supports
847 * Unhalted Core Cycles Event or not.
848 * NOTE: Corresponding bit = 0 in ebx indicates event present.
849 */
850 cpuid(10, &(eax.full), &ebx, &unused, &unused);
851 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
852 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
853 return;
854
855 wrmsr(wd->evntsel_msr, 0, 0);
856 __release_evntsel_nmi(-1, wd->evntsel_msr);
857 __release_perfctr_nmi(-1, wd->perfctr_msr);
858} 239}
859 240
860void setup_apic_nmi_watchdog (void *unused) 241void setup_apic_nmi_watchdog (void *unused)
861{ 242{
862 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 243 if (__get_cpu_var(wd_enabled))
863 244 return;
864 /* only support LOCAL and IO APICs for now */
865 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
866 (nmi_watchdog != NMI_IO_APIC))
867 return;
868
869 if (wd->enabled == 1)
870 return;
871 245
872 /* cheap hack to support suspend/resume */ 246 /* cheap hack to support suspend/resume */
873 /* if cpu0 is not active neither should the other cpus */ 247 /* if cpu0 is not active neither should the other cpus */
874 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) 248 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
875 return; 249 return;
876 250
877 if (nmi_watchdog == NMI_LOCAL_APIC) { 251 switch (nmi_watchdog) {
878 switch (boot_cpu_data.x86_vendor) { 252 case NMI_LOCAL_APIC:
879 case X86_VENDOR_AMD: 253 __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
880 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 254 if (lapic_watchdog_init(nmi_hz) < 0) {
881 boot_cpu_data.x86 != 16) 255 __get_cpu_var(wd_enabled) = 0;
882 return;
883 if (!setup_k7_watchdog())
884 return;
885 break;
886 case X86_VENDOR_INTEL:
887 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
888 if (!setup_intel_arch_watchdog())
889 return;
890 break;
891 }
892 switch (boot_cpu_data.x86) {
893 case 6:
894 if (boot_cpu_data.x86_model > 0xd)
895 return;
896
897 if (!setup_p6_watchdog())
898 return;
899 break;
900 case 15:
901 if (boot_cpu_data.x86_model > 0x4)
902 return;
903
904 if (!setup_p4_watchdog())
905 return;
906 break;
907 default:
908 return;
909 }
910 break;
911 default:
912 return; 256 return;
913 } 257 }
258 /* FALL THROUGH */
259 case NMI_IO_APIC:
260 __get_cpu_var(wd_enabled) = 1;
261 atomic_inc(&nmi_active);
914 } 262 }
915 wd->enabled = 1;
916 atomic_inc(&nmi_active);
917} 263}
918 264
919void stop_apic_nmi_watchdog(void *unused) 265void stop_apic_nmi_watchdog(void *unused)
920{ 266{
921 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
922
923 /* only support LOCAL and IO APICs for now */ 267 /* only support LOCAL and IO APICs for now */
924 if ((nmi_watchdog != NMI_LOCAL_APIC) && 268 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
925 (nmi_watchdog != NMI_IO_APIC)) 269 (nmi_watchdog != NMI_IO_APIC))
926 return; 270 return;
927 271 if (__get_cpu_var(wd_enabled) == 0)
928 if (wd->enabled == 0)
929 return; 272 return;
930 273 if (nmi_watchdog == NMI_LOCAL_APIC)
931 if (nmi_watchdog == NMI_LOCAL_APIC) { 274 lapic_watchdog_stop();
932 switch (boot_cpu_data.x86_vendor) { 275 __get_cpu_var(wd_enabled) = 0;
933 case X86_VENDOR_AMD:
934 stop_k7_watchdog();
935 break;
936 case X86_VENDOR_INTEL:
937 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
938 stop_intel_arch_watchdog();
939 break;
940 }
941 switch (boot_cpu_data.x86) {
942 case 6:
943 if (boot_cpu_data.x86_model > 0xd)
944 break;
945 stop_p6_watchdog();
946 break;
947 case 15:
948 if (boot_cpu_data.x86_model > 0x4)
949 break;
950 stop_p4_watchdog();
951 break;
952 }
953 break;
954 default:
955 return;
956 }
957 }
958 wd->enabled = 0;
959 atomic_dec(&nmi_active); 276 atomic_dec(&nmi_active);
960} 277}
961 278
@@ -1011,8 +328,6 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
1011 unsigned int sum; 328 unsigned int sum;
1012 int touched = 0; 329 int touched = 0;
1013 int cpu = smp_processor_id(); 330 int cpu = smp_processor_id();
1014 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
1015 u64 dummy;
1016 int rc=0; 331 int rc=0;
1017 332
1018 /* check for other users first */ 333 /* check for other users first */
@@ -1055,53 +370,20 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
1055 alert_counter[cpu] = 0; 370 alert_counter[cpu] = 0;
1056 } 371 }
1057 /* see if the nmi watchdog went off */ 372 /* see if the nmi watchdog went off */
1058 if (wd->enabled) { 373 if (!__get_cpu_var(wd_enabled))
1059 if (nmi_watchdog == NMI_LOCAL_APIC) { 374 return rc;
1060 rdmsrl(wd->perfctr_msr, dummy); 375 switch (nmi_watchdog) {
1061 if (dummy & wd->check_bit){ 376 case NMI_LOCAL_APIC:
1062 /* this wasn't a watchdog timer interrupt */ 377 rc |= lapic_wd_event(nmi_hz);
1063 goto done; 378 break;
1064 } 379 case NMI_IO_APIC:
1065 380 /* don't know how to accurately check for this.
1066 /* only Intel P4 uses the cccr msr */ 381 * just assume it was a watchdog timer interrupt
1067 if (wd->cccr_msr != 0) { 382 * This matches the old behaviour.
1068 /* 383 */
1069 * P4 quirks: 384 rc = 1;
1070 * - An overflown perfctr will assert its interrupt 385 break;
1071 * until the OVF flag in its CCCR is cleared.
1072 * - LVTPC is masked on interrupt and must be
1073 * unmasked by the LVTPC handler.
1074 */
1075 rdmsrl(wd->cccr_msr, dummy);
1076 dummy &= ~P4_CCCR_OVF;
1077 wrmsrl(wd->cccr_msr, dummy);
1078 apic_write(APIC_LVTPC, APIC_DM_NMI);
1079 /* start the cycle over again */
1080 write_watchdog_counter(wd->perfctr_msr, NULL);
1081 }
1082 else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
1083 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
1084 /* P6 based Pentium M need to re-unmask
1085 * the apic vector but it doesn't hurt
1086 * other P6 variant.
1087 * ArchPerfom/Core Duo also needs this */
1088 apic_write(APIC_LVTPC, APIC_DM_NMI);
1089 /* P6/ARCH_PERFMON has 32 bit counter write */
1090 write_watchdog_counter32(wd->perfctr_msr, NULL);
1091 } else {
1092 /* start the cycle over again */
1093 write_watchdog_counter(wd->perfctr_msr, NULL);
1094 }
1095 rc = 1;
1096 } else if (nmi_watchdog == NMI_IO_APIC) {
1097 /* don't know how to accurately check for this.
1098 * just assume it was a watchdog timer interrupt
1099 * This matches the old behaviour.
1100 */
1101 rc = 1;
1102 }
1103 } 386 }
1104done:
1105 return rc; 387 return rc;
1106} 388}
1107 389
@@ -1146,7 +428,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1146 } 428 }
1147 429
1148 if (nmi_watchdog == NMI_DEFAULT) { 430 if (nmi_watchdog == NMI_DEFAULT) {
1149 if (nmi_known_cpu() > 0) 431 if (lapic_watchdog_ok())
1150 nmi_watchdog = NMI_LOCAL_APIC; 432 nmi_watchdog = NMI_LOCAL_APIC;
1151 else 433 else
1152 nmi_watchdog = NMI_IO_APIC; 434 nmi_watchdog = NMI_IO_APIC;
@@ -1182,11 +464,3 @@ void __trigger_all_cpu_backtrace(void)
1182 464
1183EXPORT_SYMBOL(nmi_active); 465EXPORT_SYMBOL(nmi_active);
1184EXPORT_SYMBOL(nmi_watchdog); 466EXPORT_SYMBOL(nmi_watchdog);
1185EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
1186EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
1187EXPORT_SYMBOL(reserve_perfctr_nmi);
1188EXPORT_SYMBOL(release_perfctr_nmi);
1189EXPORT_SYMBOL(reserve_evntsel_nmi);
1190EXPORT_SYMBOL(release_evntsel_nmi);
1191EXPORT_SYMBOL(disable_timer_nmi_watchdog);
1192EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 2ec331e03fa9..5c10f376bce1 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -20,6 +20,7 @@
20#include <linux/efi.h> 20#include <linux/efi.h>
21#include <linux/bcd.h> 21#include <linux/bcd.h>
22#include <linux/start_kernel.h> 22#include <linux/start_kernel.h>
23#include <linux/highmem.h>
23 24
24#include <asm/bug.h> 25#include <asm/bug.h>
25#include <asm/paravirt.h> 26#include <asm/paravirt.h>
@@ -35,7 +36,7 @@
35#include <asm/timer.h> 36#include <asm/timer.h>
36 37
37/* nop stub */ 38/* nop stub */
38static void native_nop(void) 39void _paravirt_nop(void)
39{ 40{
40} 41}
41 42
@@ -54,331 +55,148 @@ char *memory_setup(void)
54#define DEF_NATIVE(name, code) \ 55#define DEF_NATIVE(name, code) \
55 extern const char start_##name[], end_##name[]; \ 56 extern const char start_##name[], end_##name[]; \
56 asm("start_" #name ": " code "; end_" #name ":") 57 asm("start_" #name ": " code "; end_" #name ":")
57DEF_NATIVE(cli, "cli"); 58
58DEF_NATIVE(sti, "sti"); 59DEF_NATIVE(irq_disable, "cli");
59DEF_NATIVE(popf, "push %eax; popf"); 60DEF_NATIVE(irq_enable, "sti");
60DEF_NATIVE(pushf, "pushf; pop %eax"); 61DEF_NATIVE(restore_fl, "push %eax; popf");
61DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli"); 62DEF_NATIVE(save_fl, "pushf; pop %eax");
62DEF_NATIVE(iret, "iret"); 63DEF_NATIVE(iret, "iret");
63DEF_NATIVE(sti_sysexit, "sti; sysexit"); 64DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
65DEF_NATIVE(read_cr2, "mov %cr2, %eax");
66DEF_NATIVE(write_cr3, "mov %eax, %cr3");
67DEF_NATIVE(read_cr3, "mov %cr3, %eax");
68DEF_NATIVE(clts, "clts");
69DEF_NATIVE(read_tsc, "rdtsc");
64 70
65static const struct native_insns 71DEF_NATIVE(ud2a, "ud2a");
66{
67 const char *start, *end;
68} native_insns[] = {
69 [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
70 [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
71 [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
72 [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
73 [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
74 [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
75 [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
76};
77 72
78static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) 73static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
79{ 74{
80 unsigned int insn_len; 75 const unsigned char *start, *end;
81 76 unsigned ret;
82 /* Don't touch it if we don't have a replacement */ 77
83 if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start) 78 switch(type) {
84 return len; 79#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
85 80 SITE(irq_disable);
86 insn_len = native_insns[type].end - native_insns[type].start; 81 SITE(irq_enable);
87 82 SITE(restore_fl);
88 /* Similarly if we can't fit replacement. */ 83 SITE(save_fl);
89 if (len < insn_len) 84 SITE(iret);
90 return len; 85 SITE(irq_enable_sysexit);
86 SITE(read_cr2);
87 SITE(read_cr3);
88 SITE(write_cr3);
89 SITE(clts);
90 SITE(read_tsc);
91#undef SITE
92
93 patch_site:
94 ret = paravirt_patch_insns(insns, len, start, end);
95 break;
91 96
92 memcpy(insns, native_insns[type].start, insn_len); 97 case PARAVIRT_PATCH(make_pgd):
93 return insn_len; 98 case PARAVIRT_PATCH(make_pte):
94} 99 case PARAVIRT_PATCH(pgd_val):
100 case PARAVIRT_PATCH(pte_val):
101#ifdef CONFIG_X86_PAE
102 case PARAVIRT_PATCH(make_pmd):
103 case PARAVIRT_PATCH(pmd_val):
104#endif
105 /* These functions end up returning exactly what
106 they're passed, in the same registers. */
107 ret = paravirt_patch_nop();
108 break;
95 109
96static unsigned long native_get_debugreg(int regno)
97{
98 unsigned long val = 0; /* Damn you, gcc! */
99
100 switch (regno) {
101 case 0:
102 asm("movl %%db0, %0" :"=r" (val)); break;
103 case 1:
104 asm("movl %%db1, %0" :"=r" (val)); break;
105 case 2:
106 asm("movl %%db2, %0" :"=r" (val)); break;
107 case 3:
108 asm("movl %%db3, %0" :"=r" (val)); break;
109 case 6:
110 asm("movl %%db6, %0" :"=r" (val)); break;
111 case 7:
112 asm("movl %%db7, %0" :"=r" (val)); break;
113 default: 110 default:
114 BUG(); 111 ret = paravirt_patch_default(type, clobbers, insns, len);
115 }
116 return val;
117}
118
119static void native_set_debugreg(int regno, unsigned long value)
120{
121 switch (regno) {
122 case 0:
123 asm("movl %0,%%db0" : /* no output */ :"r" (value));
124 break;
125 case 1:
126 asm("movl %0,%%db1" : /* no output */ :"r" (value));
127 break;
128 case 2:
129 asm("movl %0,%%db2" : /* no output */ :"r" (value));
130 break; 112 break;
131 case 3:
132 asm("movl %0,%%db3" : /* no output */ :"r" (value));
133 break;
134 case 6:
135 asm("movl %0,%%db6" : /* no output */ :"r" (value));
136 break;
137 case 7:
138 asm("movl %0,%%db7" : /* no output */ :"r" (value));
139 break;
140 default:
141 BUG();
142 } 113 }
143}
144
145void init_IRQ(void)
146{
147 paravirt_ops.init_IRQ();
148}
149
150static void native_clts(void)
151{
152 asm volatile ("clts");
153}
154
155static unsigned long native_read_cr0(void)
156{
157 unsigned long val;
158 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
159 return val;
160}
161
162static void native_write_cr0(unsigned long val)
163{
164 asm volatile("movl %0,%%cr0": :"r" (val));
165}
166
167static unsigned long native_read_cr2(void)
168{
169 unsigned long val;
170 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
171 return val;
172}
173
174static void native_write_cr2(unsigned long val)
175{
176 asm volatile("movl %0,%%cr2": :"r" (val));
177}
178
179static unsigned long native_read_cr3(void)
180{
181 unsigned long val;
182 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
183 return val;
184}
185
186static void native_write_cr3(unsigned long val)
187{
188 asm volatile("movl %0,%%cr3": :"r" (val));
189}
190
191static unsigned long native_read_cr4(void)
192{
193 unsigned long val;
194 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
195 return val;
196}
197
198static unsigned long native_read_cr4_safe(void)
199{
200 unsigned long val;
201 /* This could fault if %cr4 does not exist */
202 asm("1: movl %%cr4, %0 \n"
203 "2: \n"
204 ".section __ex_table,\"a\" \n"
205 ".long 1b,2b \n"
206 ".previous \n"
207 : "=r" (val): "0" (0));
208 return val;
209}
210
211static void native_write_cr4(unsigned long val)
212{
213 asm volatile("movl %0,%%cr4": :"r" (val));
214}
215
216static unsigned long native_save_fl(void)
217{
218 unsigned long f;
219 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
220 return f;
221}
222
223static void native_restore_fl(unsigned long f)
224{
225 asm volatile("pushl %0 ; popfl": /* no output */
226 :"g" (f)
227 :"memory", "cc");
228}
229
230static void native_irq_disable(void)
231{
232 asm volatile("cli": : :"memory");
233}
234
235static void native_irq_enable(void)
236{
237 asm volatile("sti": : :"memory");
238}
239
240static void native_safe_halt(void)
241{
242 asm volatile("sti; hlt": : :"memory");
243}
244 114
245static void native_halt(void) 115 return ret;
246{
247 asm volatile("hlt": : :"memory");
248} 116}
249 117
250static void native_wbinvd(void) 118unsigned paravirt_patch_nop(void)
251{ 119{
252 asm volatile("wbinvd": : :"memory"); 120 return 0;
253} 121}
254 122
255static unsigned long long native_read_msr(unsigned int msr, int *err) 123unsigned paravirt_patch_ignore(unsigned len)
256{ 124{
257 unsigned long long val; 125 return len;
258
259 asm volatile("2: rdmsr ; xorl %0,%0\n"
260 "1:\n\t"
261 ".section .fixup,\"ax\"\n\t"
262 "3: movl %3,%0 ; jmp 1b\n\t"
263 ".previous\n\t"
264 ".section __ex_table,\"a\"\n"
265 " .align 4\n\t"
266 " .long 2b,3b\n\t"
267 ".previous"
268 : "=r" (*err), "=A" (val)
269 : "c" (msr), "i" (-EFAULT));
270
271 return val;
272} 126}
273 127
274static int native_write_msr(unsigned int msr, unsigned long long val) 128unsigned paravirt_patch_call(void *target, u16 tgt_clobbers,
129 void *site, u16 site_clobbers,
130 unsigned len)
275{ 131{
276 int err; 132 unsigned char *call = site;
277 asm volatile("2: wrmsr ; xorl %0,%0\n" 133 unsigned long delta = (unsigned long)target - (unsigned long)(call+5);
278 "1:\n\t"
279 ".section .fixup,\"ax\"\n\t"
280 "3: movl %4,%0 ; jmp 1b\n\t"
281 ".previous\n\t"
282 ".section __ex_table,\"a\"\n"
283 " .align 4\n\t"
284 " .long 2b,3b\n\t"
285 ".previous"
286 : "=a" (err)
287 : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
288 "i" (-EFAULT));
289 return err;
290}
291 134
292static unsigned long long native_read_tsc(void) 135 if (tgt_clobbers & ~site_clobbers)
293{ 136 return len; /* target would clobber too much for this site */
294 unsigned long long val; 137 if (len < 5)
295 asm volatile("rdtsc" : "=A" (val)); 138 return len; /* call too long for patch site */
296 return val;
297}
298 139
299static unsigned long long native_read_pmc(void) 140 *call++ = 0xe8; /* call */
300{ 141 *(unsigned long *)call = delta;
301 unsigned long long val;
302 asm volatile("rdpmc" : "=A" (val));
303 return val;
304}
305 142
306static void native_load_tr_desc(void) 143 return 5;
307{
308 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
309} 144}
310 145
311static void native_load_gdt(const struct Xgt_desc_struct *dtr) 146unsigned paravirt_patch_jmp(void *target, void *site, unsigned len)
312{ 147{
313 asm volatile("lgdt %0"::"m" (*dtr)); 148 unsigned char *jmp = site;
314} 149 unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5);
315 150
316static void native_load_idt(const struct Xgt_desc_struct *dtr) 151 if (len < 5)
317{ 152 return len; /* call too long for patch site */
318 asm volatile("lidt %0"::"m" (*dtr));
319}
320 153
321static void native_store_gdt(struct Xgt_desc_struct *dtr) 154 *jmp++ = 0xe9; /* jmp */
322{ 155 *(unsigned long *)jmp = delta;
323 asm ("sgdt %0":"=m" (*dtr));
324}
325 156
326static void native_store_idt(struct Xgt_desc_struct *dtr) 157 return 5;
327{
328 asm ("sidt %0":"=m" (*dtr));
329} 158}
330 159
331static unsigned long native_store_tr(void) 160unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
332{ 161{
333 unsigned long tr; 162 void *opfunc = *((void **)&paravirt_ops + type);
334 asm ("str %0":"=r" (tr)); 163 unsigned ret;
335 return tr;
336}
337 164
338static void native_load_tls(struct thread_struct *t, unsigned int cpu) 165 if (opfunc == NULL)
339{ 166 /* If there's no function, patch it with a ud2a (BUG) */
340#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] 167 ret = paravirt_patch_insns(site, len, start_ud2a, end_ud2a);
341 C(0); C(1); C(2); 168 else if (opfunc == paravirt_nop)
342#undef C 169 /* If the operation is a nop, then nop the callsite */
343} 170 ret = paravirt_patch_nop();
171 else if (type == PARAVIRT_PATCH(iret) ||
172 type == PARAVIRT_PATCH(irq_enable_sysexit))
173 /* If operation requires a jmp, then jmp */
174 ret = paravirt_patch_jmp(opfunc, site, len);
175 else
176 /* Otherwise call the function; assume target could
177 clobber any caller-save reg */
178 ret = paravirt_patch_call(opfunc, CLBR_ANY,
179 site, clobbers, len);
344 180
345static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) 181 return ret;
346{
347 u32 *lp = (u32 *)((char *)dt + entry*8);
348 lp[0] = entry_low;
349 lp[1] = entry_high;
350} 182}
351 183
352static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) 184unsigned paravirt_patch_insns(void *site, unsigned len,
185 const char *start, const char *end)
353{ 186{
354 native_write_dt_entry(dt, entrynum, low, high); 187 unsigned insn_len = end - start;
355}
356 188
357static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) 189 if (insn_len > len || start == NULL)
358{ 190 insn_len = len;
359 native_write_dt_entry(dt, entrynum, low, high); 191 else
360} 192 memcpy(site, start, insn_len);
361
362static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
363{
364 native_write_dt_entry(dt, entrynum, low, high);
365}
366 193
367static void native_load_esp0(struct tss_struct *tss, 194 return insn_len;
368 struct thread_struct *thread)
369{
370 tss->esp0 = thread->esp0;
371
372 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
373 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
374 tss->ss1 = thread->sysenter_cs;
375 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
376 }
377} 195}
378 196
379static void native_io_delay(void) 197void init_IRQ(void)
380{ 198{
381 asm volatile("outb %al,$0x80"); 199 paravirt_ops.init_IRQ();
382} 200}
383 201
384static void native_flush_tlb(void) 202static void native_flush_tlb(void)
@@ -395,83 +213,11 @@ static void native_flush_tlb_global(void)
395 __native_flush_tlb_global(); 213 __native_flush_tlb_global();
396} 214}
397 215
398static void native_flush_tlb_single(u32 addr) 216static void native_flush_tlb_single(unsigned long addr)
399{ 217{
400 __native_flush_tlb_single(addr); 218 __native_flush_tlb_single(addr);
401} 219}
402 220
403#ifndef CONFIG_X86_PAE
404static void native_set_pte(pte_t *ptep, pte_t pteval)
405{
406 *ptep = pteval;
407}
408
409static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
410{
411 *ptep = pteval;
412}
413
414static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
415{
416 *pmdp = pmdval;
417}
418
419#else /* CONFIG_X86_PAE */
420
421static void native_set_pte(pte_t *ptep, pte_t pte)
422{
423 ptep->pte_high = pte.pte_high;
424 smp_wmb();
425 ptep->pte_low = pte.pte_low;
426}
427
428static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
429{
430 ptep->pte_high = pte.pte_high;
431 smp_wmb();
432 ptep->pte_low = pte.pte_low;
433}
434
435static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
436{
437 ptep->pte_low = 0;
438 smp_wmb();
439 ptep->pte_high = pte.pte_high;
440 smp_wmb();
441 ptep->pte_low = pte.pte_low;
442}
443
444static void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
445{
446 set_64bit((unsigned long long *)ptep,pte_val(pteval));
447}
448
449static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
450{
451 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
452}
453
454static void native_set_pud(pud_t *pudp, pud_t pudval)
455{
456 *pudp = pudval;
457}
458
459static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
460{
461 ptep->pte_low = 0;
462 smp_wmb();
463 ptep->pte_high = 0;
464}
465
466static void native_pmd_clear(pmd_t *pmd)
467{
468 u32 *tmp = (u32 *)pmd;
469 *tmp = 0;
470 smp_wmb();
471 *(tmp + 1) = 0;
472}
473#endif /* CONFIG_X86_PAE */
474
475/* These are in entry.S */ 221/* These are in entry.S */
476extern void native_iret(void); 222extern void native_iret(void);
477extern void native_irq_enable_sysexit(void); 223extern void native_irq_enable_sysexit(void);
@@ -487,10 +233,11 @@ struct paravirt_ops paravirt_ops = {
487 .name = "bare hardware", 233 .name = "bare hardware",
488 .paravirt_enabled = 0, 234 .paravirt_enabled = 0,
489 .kernel_rpl = 0, 235 .kernel_rpl = 0,
236 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
490 237
491 .patch = native_patch, 238 .patch = native_patch,
492 .banner = default_banner, 239 .banner = default_banner,
493 .arch_setup = native_nop, 240 .arch_setup = paravirt_nop,
494 .memory_setup = machine_specific_memory_setup, 241 .memory_setup = machine_specific_memory_setup,
495 .get_wallclock = native_get_wallclock, 242 .get_wallclock = native_get_wallclock,
496 .set_wallclock = native_set_wallclock, 243 .set_wallclock = native_set_wallclock,
@@ -517,8 +264,8 @@ struct paravirt_ops paravirt_ops = {
517 .safe_halt = native_safe_halt, 264 .safe_halt = native_safe_halt,
518 .halt = native_halt, 265 .halt = native_halt,
519 .wbinvd = native_wbinvd, 266 .wbinvd = native_wbinvd,
520 .read_msr = native_read_msr, 267 .read_msr = native_read_msr_safe,
521 .write_msr = native_write_msr, 268 .write_msr = native_write_msr_safe,
522 .read_tsc = native_read_tsc, 269 .read_tsc = native_read_tsc,
523 .read_pmc = native_read_pmc, 270 .read_pmc = native_read_pmc,
524 .get_scheduled_cycles = native_read_tsc, 271 .get_scheduled_cycles = native_read_tsc,
@@ -531,9 +278,9 @@ struct paravirt_ops paravirt_ops = {
531 .store_idt = native_store_idt, 278 .store_idt = native_store_idt,
532 .store_tr = native_store_tr, 279 .store_tr = native_store_tr,
533 .load_tls = native_load_tls, 280 .load_tls = native_load_tls,
534 .write_ldt_entry = native_write_ldt_entry, 281 .write_ldt_entry = write_dt_entry,
535 .write_gdt_entry = native_write_gdt_entry, 282 .write_gdt_entry = write_dt_entry,
536 .write_idt_entry = native_write_idt_entry, 283 .write_idt_entry = write_dt_entry,
537 .load_esp0 = native_load_esp0, 284 .load_esp0 = native_load_esp0,
538 285
539 .set_iopl_mask = native_set_iopl_mask, 286 .set_iopl_mask = native_set_iopl_mask,
@@ -545,44 +292,57 @@ struct paravirt_ops paravirt_ops = {
545 .apic_read = native_apic_read, 292 .apic_read = native_apic_read,
546 .setup_boot_clock = setup_boot_APIC_clock, 293 .setup_boot_clock = setup_boot_APIC_clock,
547 .setup_secondary_clock = setup_secondary_APIC_clock, 294 .setup_secondary_clock = setup_secondary_APIC_clock,
295 .startup_ipi_hook = paravirt_nop,
548#endif 296#endif
549 .set_lazy_mode = (void *)native_nop, 297 .set_lazy_mode = paravirt_nop,
298
299 .pagetable_setup_start = native_pagetable_setup_start,
300 .pagetable_setup_done = native_pagetable_setup_done,
550 301
551 .flush_tlb_user = native_flush_tlb, 302 .flush_tlb_user = native_flush_tlb,
552 .flush_tlb_kernel = native_flush_tlb_global, 303 .flush_tlb_kernel = native_flush_tlb_global,
553 .flush_tlb_single = native_flush_tlb_single, 304 .flush_tlb_single = native_flush_tlb_single,
305 .flush_tlb_others = native_flush_tlb_others,
554 306
555 .map_pt_hook = (void *)native_nop, 307 .alloc_pt = paravirt_nop,
556 308 .alloc_pd = paravirt_nop,
557 .alloc_pt = (void *)native_nop, 309 .alloc_pd_clone = paravirt_nop,
558 .alloc_pd = (void *)native_nop, 310 .release_pt = paravirt_nop,
559 .alloc_pd_clone = (void *)native_nop, 311 .release_pd = paravirt_nop,
560 .release_pt = (void *)native_nop,
561 .release_pd = (void *)native_nop,
562 312
563 .set_pte = native_set_pte, 313 .set_pte = native_set_pte,
564 .set_pte_at = native_set_pte_at, 314 .set_pte_at = native_set_pte_at,
565 .set_pmd = native_set_pmd, 315 .set_pmd = native_set_pmd,
566 .pte_update = (void *)native_nop, 316 .pte_update = paravirt_nop,
567 .pte_update_defer = (void *)native_nop, 317 .pte_update_defer = paravirt_nop,
318
319#ifdef CONFIG_HIGHPTE
320 .kmap_atomic_pte = kmap_atomic,
321#endif
322
568#ifdef CONFIG_X86_PAE 323#ifdef CONFIG_X86_PAE
569 .set_pte_atomic = native_set_pte_atomic, 324 .set_pte_atomic = native_set_pte_atomic,
570 .set_pte_present = native_set_pte_present, 325 .set_pte_present = native_set_pte_present,
571 .set_pud = native_set_pud, 326 .set_pud = native_set_pud,
572 .pte_clear = native_pte_clear, 327 .pte_clear = native_pte_clear,
573 .pmd_clear = native_pmd_clear, 328 .pmd_clear = native_pmd_clear,
329
330 .pmd_val = native_pmd_val,
331 .make_pmd = native_make_pmd,
574#endif 332#endif
575 333
334 .pte_val = native_pte_val,
335 .pgd_val = native_pgd_val,
336
337 .make_pte = native_make_pte,
338 .make_pgd = native_make_pgd,
339
576 .irq_enable_sysexit = native_irq_enable_sysexit, 340 .irq_enable_sysexit = native_irq_enable_sysexit,
577 .iret = native_iret, 341 .iret = native_iret,
578 342
579 .startup_ipi_hook = (void *)native_nop, 343 .dup_mmap = paravirt_nop,
344 .exit_mmap = paravirt_nop,
345 .activate_mm = paravirt_nop,
580}; 346};
581 347
582/* 348EXPORT_SYMBOL(paravirt_ops);
583 * NOTE: CONFIG_PARAVIRT is experimental and the paravirt_ops
584 * semantics are subject to change. Hence we only do this
585 * internal-only export of this, until it gets sorted out and
586 * all lowlevel CPU ops used by modules are separately exported.
587 */
588EXPORT_SYMBOL_GPL(paravirt_ops);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 393a67d5d943..61999479b7a4 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -39,6 +39,7 @@
39#include <linux/random.h> 39#include <linux/random.h>
40#include <linux/personality.h> 40#include <linux/personality.h>
41#include <linux/tick.h> 41#include <linux/tick.h>
42#include <linux/percpu.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/pgtable.h> 45#include <asm/pgtable.h>
@@ -57,7 +58,6 @@
57 58
58#include <asm/tlbflush.h> 59#include <asm/tlbflush.h>
59#include <asm/cpu.h> 60#include <asm/cpu.h>
60#include <asm/pda.h>
61 61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 63
@@ -66,6 +66,12 @@ static int hlt_counter;
66unsigned long boot_option_idle_override = 0; 66unsigned long boot_option_idle_override = 0;
67EXPORT_SYMBOL(boot_option_idle_override); 67EXPORT_SYMBOL(boot_option_idle_override);
68 68
69DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
70EXPORT_PER_CPU_SYMBOL(current_task);
71
72DEFINE_PER_CPU(int, cpu_number);
73EXPORT_PER_CPU_SYMBOL(cpu_number);
74
69/* 75/*
70 * Return saved PC of a blocked thread. 76 * Return saved PC of a blocked thread.
71 */ 77 */
@@ -272,25 +278,24 @@ void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
272 } 278 }
273} 279}
274 280
275static int __init idle_setup (char *str) 281static int __init idle_setup(char *str)
276{ 282{
277 if (!strncmp(str, "poll", 4)) { 283 if (!strcmp(str, "poll")) {
278 printk("using polling idle threads.\n"); 284 printk("using polling idle threads.\n");
279 pm_idle = poll_idle; 285 pm_idle = poll_idle;
280#ifdef CONFIG_X86_SMP 286#ifdef CONFIG_X86_SMP
281 if (smp_num_siblings > 1) 287 if (smp_num_siblings > 1)
282 printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); 288 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
283#endif 289#endif
284 } else if (!strncmp(str, "halt", 4)) { 290 } else if (!strcmp(str, "mwait"))
285 printk("using halt in idle threads.\n"); 291 force_mwait = 1;
286 pm_idle = default_idle; 292 else
287 } 293 return -1;
288 294
289 boot_option_idle_override = 1; 295 boot_option_idle_override = 1;
290 return 1; 296 return 0;
291} 297}
292 298early_param("idle", idle_setup);
293__setup("idle=", idle_setup);
294 299
295void show_regs(struct pt_regs * regs) 300void show_regs(struct pt_regs * regs)
296{ 301{
@@ -343,7 +348,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
343 348
344 regs.xds = __USER_DS; 349 regs.xds = __USER_DS;
345 regs.xes = __USER_DS; 350 regs.xes = __USER_DS;
346 regs.xfs = __KERNEL_PDA; 351 regs.xfs = __KERNEL_PERCPU;
347 regs.orig_eax = -1; 352 regs.orig_eax = -1;
348 regs.eip = (unsigned long) kernel_thread_helper; 353 regs.eip = (unsigned long) kernel_thread_helper;
349 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 354 regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -376,7 +381,7 @@ void exit_thread(void)
376 t->io_bitmap_max = 0; 381 t->io_bitmap_max = 0;
377 tss->io_bitmap_owner = NULL; 382 tss->io_bitmap_owner = NULL;
378 tss->io_bitmap_max = 0; 383 tss->io_bitmap_max = 0;
379 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 384 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
380 put_cpu(); 385 put_cpu();
381 } 386 }
382} 387}
@@ -555,7 +560,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
555 * Disable the bitmap via an invalid offset. We still cache 560 * Disable the bitmap via an invalid offset. We still cache
556 * the previous bitmap owner and the IO bitmap contents: 561 * the previous bitmap owner and the IO bitmap contents:
557 */ 562 */
558 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 563 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
559 return; 564 return;
560 } 565 }
561 566
@@ -565,7 +570,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
565 * matches the next task, we dont have to do anything but 570 * matches the next task, we dont have to do anything but
566 * to set a valid offset in the TSS: 571 * to set a valid offset in the TSS:
567 */ 572 */
568 tss->io_bitmap_base = IO_BITMAP_OFFSET; 573 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
569 return; 574 return;
570 } 575 }
571 /* 576 /*
@@ -577,7 +582,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
577 * redundant copies when the currently switched task does not 582 * redundant copies when the currently switched task does not
578 * perform any I/O during its timeslice. 583 * perform any I/O during its timeslice.
579 */ 584 */
580 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; 585 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
581} 586}
582 587
583/* 588/*
@@ -712,7 +717,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
712 if (prev->gs | next->gs) 717 if (prev->gs | next->gs)
713 loadsegment(gs, next->gs); 718 loadsegment(gs, next->gs);
714 719
715 write_pda(pcurrent, next_p); 720 x86_write_percpu(current_task, next_p);
716 721
717 return prev_p; 722 return prev_p;
718} 723}
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 34874c398b44..9f6ab1789bb0 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,12 +3,10 @@
3 */ 3 */
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <linux/irq.h> 5#include <linux/irq.h>
6#include <asm/pci-direct.h>
7#include <asm/genapic.h>
8#include <asm/cpu.h>
9 6
10#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
11static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) 8
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
12{ 10{
13 u8 config, rev; 11 u8 config, rev;
14 u32 word; 12 u32 word;
@@ -16,12 +14,14 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
16 /* BIOS may enable hardware IRQ balancing for 14 /* BIOS may enable hardware IRQ balancing for
17 * E7520/E7320/E7525(revision ID 0x9 and below) 15 * E7520/E7320/E7525(revision ID 0x9 and below)
18 * based platforms. 16 * based platforms.
19 * For those platforms, make sure that the genapic is set to 'flat' 17 * Disable SW irqbalance/affinity on those platforms.
20 */ 18 */
21 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
22 if (rev > 0x9) 20 if (rev > 0x9)
23 return; 21 return;
24 22
23 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
24
25 /* enable access to config space*/ 25 /* enable access to config space*/
26 pci_read_config_byte(dev, 0xf4, &config); 26 pci_read_config_byte(dev, 0xf4, &config);
27 pci_write_config_byte(dev, 0xf4, config|0x2); 27 pci_write_config_byte(dev, 0xf4, config|0x2);
@@ -30,44 +30,6 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); 30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
31 31
32 if (!(word & (1 << 13))) { 32 if (!(word & (1 << 13))) {
33#ifdef CONFIG_X86_64
34 if (genapic != &apic_flat)
35 panic("APIC mode must be flat on this system\n");
36#elif defined(CONFIG_X86_GENERICARCH)
37 if (genapic != &apic_default)
38 panic("APIC mode must be default(flat) on this system. Use apic=default\n");
39#endif
40 }
41
42 /* put back the original value for config space*/
43 if (!(config & 0x2))
44 pci_write_config_byte(dev, 0xf4, config);
45}
46
47void __init quirk_intel_irqbalance(void)
48{
49 u8 config, rev;
50 u32 word;
51
52 /* BIOS may enable hardware IRQ balancing for
53 * E7520/E7320/E7525(revision ID 0x9 and below)
54 * based platforms.
55 * Disable SW irqbalance/affinity on those platforms.
56 */
57 rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
58 if (rev > 0x9)
59 return;
60
61 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
62
63 /* enable access to config space */
64 config = read_pci_config_byte(0, 0, 0, 0xf4);
65 write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
66
67 /* read xTPR register */
68 word = read_pci_config_16(0, 0, 0x40, 0x4c);
69
70 if (!(word & (1 << 13))) {
71 printk(KERN_INFO "Disabling irq balancing and affinity\n"); 33 printk(KERN_INFO "Disabling irq balancing and affinity\n");
72#ifdef CONFIG_IRQBALANCE 34#ifdef CONFIG_IRQBALANCE
73 irqbalance_disable(""); 35 irqbalance_disable("");
@@ -76,24 +38,13 @@ void __init quirk_intel_irqbalance(void)
76#ifdef CONFIG_PROC_FS 38#ifdef CONFIG_PROC_FS
77 no_irq_affinity = 1; 39 no_irq_affinity = 1;
78#endif 40#endif
79#ifdef CONFIG_HOTPLUG_CPU
80 printk(KERN_INFO "Disabling cpu hotplug control\n");
81 enable_cpu_hotplug = 0;
82#endif
83#ifdef CONFIG_X86_64
84 /* force the genapic selection to flat mode so that
85 * interrupts can be redirected to more than one CPU.
86 */
87 genapic_force = &apic_flat;
88#endif
89 } 41 }
90 42
91 /* put back the original value for config space */ 43 /* put back the original value for config space*/
92 if (!(config & 0x2)) 44 if (!(config & 0x2))
93 write_pci_config_byte(0, 0, 0, 0xf4, config); 45 pci_write_config_byte(dev, 0xf4, config);
94} 46}
95DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); 47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
96DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); 48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
97DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); 49DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
98
99#endif 50#endif
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 3514b4153f7f..50dfc65319cd 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -17,7 +17,8 @@
17#include <asm/apic.h> 17#include <asm/apic.h>
18#include <asm/desc.h> 18#include <asm/desc.h>
19#include "mach_reboot.h" 19#include "mach_reboot.h"
20#include <linux/reboot_fixups.h> 20#include <asm/reboot_fixups.h>
21#include <asm/reboot.h>
21 22
22/* 23/*
23 * Power off function, if any 24 * Power off function, if any
@@ -197,8 +198,6 @@ static unsigned char jump_to_bios [] =
197 */ 198 */
198void machine_real_restart(unsigned char *code, int length) 199void machine_real_restart(unsigned char *code, int length)
199{ 200{
200 unsigned long flags;
201
202 local_irq_disable(); 201 local_irq_disable();
203 202
204 /* Write zero to CMOS register number 0x0f, which the BIOS POST 203 /* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -211,9 +210,9 @@ void machine_real_restart(unsigned char *code, int length)
211 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) 210 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
212 */ 211 */
213 212
214 spin_lock_irqsave(&rtc_lock, flags); 213 spin_lock(&rtc_lock);
215 CMOS_WRITE(0x00, 0x8f); 214 CMOS_WRITE(0x00, 0x8f);
216 spin_unlock_irqrestore(&rtc_lock, flags); 215 spin_unlock(&rtc_lock);
217 216
218 /* Remap the kernel at virtual address zero, as well as offset zero 217 /* Remap the kernel at virtual address zero, as well as offset zero
219 from the kernel segment. This assumes the kernel segment starts at 218 from the kernel segment. This assumes the kernel segment starts at
@@ -280,7 +279,7 @@ void machine_real_restart(unsigned char *code, int length)
280EXPORT_SYMBOL(machine_real_restart); 279EXPORT_SYMBOL(machine_real_restart);
281#endif 280#endif
282 281
283void machine_shutdown(void) 282static void native_machine_shutdown(void)
284{ 283{
285#ifdef CONFIG_SMP 284#ifdef CONFIG_SMP
286 int reboot_cpu_id; 285 int reboot_cpu_id;
@@ -316,7 +315,11 @@ void machine_shutdown(void)
316#endif 315#endif
317} 316}
318 317
319void machine_emergency_restart(void) 318void __attribute__((weak)) mach_reboot_fixups(void)
319{
320}
321
322static void native_machine_emergency_restart(void)
320{ 323{
321 if (!reboot_thru_bios) { 324 if (!reboot_thru_bios) {
322 if (efi_enabled) { 325 if (efi_enabled) {
@@ -340,17 +343,17 @@ void machine_emergency_restart(void)
340 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 343 machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
341} 344}
342 345
343void machine_restart(char * __unused) 346static void native_machine_restart(char * __unused)
344{ 347{
345 machine_shutdown(); 348 machine_shutdown();
346 machine_emergency_restart(); 349 machine_emergency_restart();
347} 350}
348 351
349void machine_halt(void) 352static void native_machine_halt(void)
350{ 353{
351} 354}
352 355
353void machine_power_off(void) 356static void native_machine_power_off(void)
354{ 357{
355 if (pm_power_off) { 358 if (pm_power_off) {
356 machine_shutdown(); 359 machine_shutdown();
@@ -359,3 +362,35 @@ void machine_power_off(void)
359} 362}
360 363
361 364
365struct machine_ops machine_ops = {
366 .power_off = native_machine_power_off,
367 .shutdown = native_machine_shutdown,
368 .emergency_restart = native_machine_emergency_restart,
369 .restart = native_machine_restart,
370 .halt = native_machine_halt,
371};
372
373void machine_power_off(void)
374{
375 machine_ops.power_off();
376}
377
378void machine_shutdown(void)
379{
380 machine_ops.shutdown();
381}
382
383void machine_emergency_restart(void)
384{
385 machine_ops.emergency_restart();
386}
387
388void machine_restart(char *cmd)
389{
390 machine_ops.restart(cmd);
391}
392
393void machine_halt(void)
394{
395 machine_ops.halt();
396}
diff --git a/arch/i386/kernel/reboot_fixups.c b/arch/i386/kernel/reboot_fixups.c
index 99aab41a05b0..2d78d918340f 100644
--- a/arch/i386/kernel/reboot_fixups.c
+++ b/arch/i386/kernel/reboot_fixups.c
@@ -10,7 +10,7 @@
10 10
11#include <asm/delay.h> 11#include <asm/delay.h>
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14 14
15static void cs5530a_warm_reset(struct pci_dev *dev) 15static void cs5530a_warm_reset(struct pci_dev *dev)
16{ 16{
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 0e8977871b1f..89a45a9ddcd4 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -165,20 +165,20 @@ void fastcall send_IPI_self(int vector)
165} 165}
166 166
167/* 167/*
168 * This is only used on smaller machines. 168 * This is used to send an IPI with no shorthand notation (the destination is
169 * specified in bits 56 to 63 of the ICR).
169 */ 170 */
170void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) 171static inline void __send_IPI_dest_field(unsigned long mask, int vector)
171{ 172{
172 unsigned long mask = cpus_addr(cpumask)[0];
173 unsigned long cfg; 173 unsigned long cfg;
174 unsigned long flags;
175 174
176 local_irq_save(flags);
177 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
178 /* 175 /*
179 * Wait for idle. 176 * Wait for idle.
180 */ 177 */
181 apic_wait_icr_idle(); 178 if (unlikely(vector == NMI_VECTOR))
179 safe_apic_wait_icr_idle();
180 else
181 apic_wait_icr_idle();
182 182
183 /* 183 /*
184 * prepare target chip field 184 * prepare target chip field
@@ -195,13 +195,25 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
195 * Send the IPI. The write to APIC_ICR fires this off. 195 * Send the IPI. The write to APIC_ICR fires this off.
196 */ 196 */
197 apic_write_around(APIC_ICR, cfg); 197 apic_write_around(APIC_ICR, cfg);
198}
199
200/*
201 * This is only used on smaller machines.
202 */
203void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
204{
205 unsigned long mask = cpus_addr(cpumask)[0];
206 unsigned long flags;
198 207
208 local_irq_save(flags);
209 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
210 __send_IPI_dest_field(mask, vector);
199 local_irq_restore(flags); 211 local_irq_restore(flags);
200} 212}
201 213
202void send_IPI_mask_sequence(cpumask_t mask, int vector) 214void send_IPI_mask_sequence(cpumask_t mask, int vector)
203{ 215{
204 unsigned long cfg, flags; 216 unsigned long flags;
205 unsigned int query_cpu; 217 unsigned int query_cpu;
206 218
207 /* 219 /*
@@ -211,30 +223,10 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
211 */ 223 */
212 224
213 local_irq_save(flags); 225 local_irq_save(flags);
214
215 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { 226 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
216 if (cpu_isset(query_cpu, mask)) { 227 if (cpu_isset(query_cpu, mask)) {
217 228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
218 /* 229 vector);
219 * Wait for idle.
220 */
221 apic_wait_icr_idle();
222
223 /*
224 * prepare target chip field
225 */
226 cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
227 apic_write_around(APIC_ICR2, cfg);
228
229 /*
230 * program the ICR
231 */
232 cfg = __prepare_ICR(0, vector);
233
234 /*
235 * Send the IPI. The write to APIC_ICR fires this off.
236 */
237 apic_write_around(APIC_ICR, cfg);
238 } 230 }
239 } 231 }
240 local_irq_restore(flags); 232 local_irq_restore(flags);
@@ -256,7 +248,6 @@ static cpumask_t flush_cpumask;
256static struct mm_struct * flush_mm; 248static struct mm_struct * flush_mm;
257static unsigned long flush_va; 249static unsigned long flush_va;
258static DEFINE_SPINLOCK(tlbstate_lock); 250static DEFINE_SPINLOCK(tlbstate_lock);
259#define FLUSH_ALL 0xffffffff
260 251
261/* 252/*
262 * We cannot call mmdrop() because we are in interrupt context, 253 * We cannot call mmdrop() because we are in interrupt context,
@@ -338,7 +329,7 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
338 329
339 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { 330 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
340 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { 331 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
341 if (flush_va == FLUSH_ALL) 332 if (flush_va == TLB_FLUSH_ALL)
342 local_flush_tlb(); 333 local_flush_tlb();
343 else 334 else
344 __flush_tlb_one(flush_va); 335 __flush_tlb_one(flush_va);
@@ -353,9 +344,11 @@ out:
353 put_cpu_no_resched(); 344 put_cpu_no_resched();
354} 345}
355 346
356static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 347void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
357 unsigned long va) 348 unsigned long va)
358{ 349{
350 cpumask_t cpumask = *cpumaskp;
351
359 /* 352 /*
360 * A couple of (to be removed) sanity checks: 353 * A couple of (to be removed) sanity checks:
361 * 354 *
@@ -366,10 +359,12 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
366 BUG_ON(cpu_isset(smp_processor_id(), cpumask)); 359 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
367 BUG_ON(!mm); 360 BUG_ON(!mm);
368 361
362#ifdef CONFIG_HOTPLUG_CPU
369 /* If a CPU which we ran on has gone down, OK. */ 363 /* If a CPU which we ran on has gone down, OK. */
370 cpus_and(cpumask, cpumask, cpu_online_map); 364 cpus_and(cpumask, cpumask, cpu_online_map);
371 if (cpus_empty(cpumask)) 365 if (unlikely(cpus_empty(cpumask)))
372 return; 366 return;
367#endif
373 368
374 /* 369 /*
375 * i'm not happy about this global shared spinlock in the 370 * i'm not happy about this global shared spinlock in the
@@ -380,17 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
380 375
381 flush_mm = mm; 376 flush_mm = mm;
382 flush_va = va; 377 flush_va = va;
383#if NR_CPUS <= BITS_PER_LONG 378 cpus_or(flush_cpumask, cpumask, flush_cpumask);
384 atomic_set_mask(cpumask, &flush_cpumask);
385#else
386 {
387 int k;
388 unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
389 unsigned long *cpu_mask = (unsigned long *)&cpumask;
390 for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
391 atomic_set_mask(cpu_mask[k], &flush_mask[k]);
392 }
393#endif
394 /* 379 /*
395 * We have to send the IPI only to 380 * We have to send the IPI only to
396 * CPUs affected. 381 * CPUs affected.
@@ -417,7 +402,7 @@ void flush_tlb_current_task(void)
417 402
418 local_flush_tlb(); 403 local_flush_tlb();
419 if (!cpus_empty(cpu_mask)) 404 if (!cpus_empty(cpu_mask))
420 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 405 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
421 preempt_enable(); 406 preempt_enable();
422} 407}
423 408
@@ -436,7 +421,7 @@ void flush_tlb_mm (struct mm_struct * mm)
436 leave_mm(smp_processor_id()); 421 leave_mm(smp_processor_id());
437 } 422 }
438 if (!cpus_empty(cpu_mask)) 423 if (!cpus_empty(cpu_mask))
439 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 424 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
440 425
441 preempt_enable(); 426 preempt_enable();
442} 427}
@@ -483,7 +468,7 @@ void flush_tlb_all(void)
483 * it goes straight through and wastes no time serializing 468 * it goes straight through and wastes no time serializing
484 * anything. Worst case is that we lose a reschedule ... 469 * anything. Worst case is that we lose a reschedule ...
485 */ 470 */
486void smp_send_reschedule(int cpu) 471void native_smp_send_reschedule(int cpu)
487{ 472{
488 WARN_ON(cpu_is_offline(cpu)); 473 WARN_ON(cpu_is_offline(cpu));
489 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); 474 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
@@ -515,36 +500,78 @@ void unlock_ipi_call_lock(void)
515 500
516static struct call_data_struct *call_data; 501static struct call_data_struct *call_data;
517 502
503static void __smp_call_function(void (*func) (void *info), void *info,
504 int nonatomic, int wait)
505{
506 struct call_data_struct data;
507 int cpus = num_online_cpus() - 1;
508
509 if (!cpus)
510 return;
511
512 data.func = func;
513 data.info = info;
514 atomic_set(&data.started, 0);
515 data.wait = wait;
516 if (wait)
517 atomic_set(&data.finished, 0);
518
519 call_data = &data;
520 mb();
521
522 /* Send a message to all other CPUs and wait for them to respond */
523 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
524
525 /* Wait for response */
526 while (atomic_read(&data.started) != cpus)
527 cpu_relax();
528
529 if (wait)
530 while (atomic_read(&data.finished) != cpus)
531 cpu_relax();
532}
533
534
518/** 535/**
519 * smp_call_function(): Run a function on all other CPUs. 536 * smp_call_function_mask(): Run a function on a set of other CPUs.
537 * @mask: The set of cpus to run on. Must not include the current cpu.
520 * @func: The function to run. This must be fast and non-blocking. 538 * @func: The function to run. This must be fast and non-blocking.
521 * @info: An arbitrary pointer to pass to the function. 539 * @info: An arbitrary pointer to pass to the function.
522 * @nonatomic: currently unused.
523 * @wait: If true, wait (atomically) until function has completed on other CPUs. 540 * @wait: If true, wait (atomically) until function has completed on other CPUs.
524 * 541 *
525 * Returns 0 on success, else a negative status code. Does not return until 542 * Returns 0 on success, else a negative status code.
526 * remote CPUs are nearly ready to execute <<func>> or are or have executed. 543 *
544 * If @wait is true, then returns once @func has returned; otherwise
545 * it returns just before the target cpu calls @func.
527 * 546 *
528 * You must not call this function with disabled interrupts or from a 547 * You must not call this function with disabled interrupts or from a
529 * hardware interrupt handler or from a bottom half handler. 548 * hardware interrupt handler or from a bottom half handler.
530 */ 549 */
531int smp_call_function (void (*func) (void *info), void *info, int nonatomic, 550int native_smp_call_function_mask(cpumask_t mask,
532 int wait) 551 void (*func)(void *), void *info,
552 int wait)
533{ 553{
534 struct call_data_struct data; 554 struct call_data_struct data;
555 cpumask_t allbutself;
535 int cpus; 556 int cpus;
536 557
558 /* Can deadlock when called with interrupts disabled */
559 WARN_ON(irqs_disabled());
560
537 /* Holding any lock stops cpus from going down. */ 561 /* Holding any lock stops cpus from going down. */
538 spin_lock(&call_lock); 562 spin_lock(&call_lock);
539 cpus = num_online_cpus() - 1; 563
564 allbutself = cpu_online_map;
565 cpu_clear(smp_processor_id(), allbutself);
566
567 cpus_and(mask, mask, allbutself);
568 cpus = cpus_weight(mask);
569
540 if (!cpus) { 570 if (!cpus) {
541 spin_unlock(&call_lock); 571 spin_unlock(&call_lock);
542 return 0; 572 return 0;
543 } 573 }
544 574
545 /* Can deadlock when called with interrupts disabled */
546 WARN_ON(irqs_disabled());
547
548 data.func = func; 575 data.func = func;
549 data.info = info; 576 data.info = info;
550 atomic_set(&data.started, 0); 577 atomic_set(&data.started, 0);
@@ -554,9 +581,12 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
554 581
555 call_data = &data; 582 call_data = &data;
556 mb(); 583 mb();
557 584
558 /* Send a message to all other CPUs and wait for them to respond */ 585 /* Send a message to other CPUs */
559 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 586 if (cpus_equal(mask, allbutself))
587 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
588 else
589 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
560 590
561 /* Wait for response */ 591 /* Wait for response */
562 while (atomic_read(&data.started) != cpus) 592 while (atomic_read(&data.started) != cpus)
@@ -569,15 +599,68 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
569 599
570 return 0; 600 return 0;
571} 601}
602
603/**
604 * smp_call_function(): Run a function on all other CPUs.
605 * @func: The function to run. This must be fast and non-blocking.
606 * @info: An arbitrary pointer to pass to the function.
607 * @nonatomic: Unused.
608 * @wait: If true, wait (atomically) until function has completed on other CPUs.
609 *
610 * Returns 0 on success, else a negative status code.
611 *
612 * If @wait is true, then returns once @func has returned; otherwise
613 * it returns just before the target cpu calls @func.
614 *
615 * You must not call this function with disabled interrupts or from a
616 * hardware interrupt handler or from a bottom half handler.
617 */
618int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
619 int wait)
620{
621 return smp_call_function_mask(cpu_online_map, func, info, wait);
622}
572EXPORT_SYMBOL(smp_call_function); 623EXPORT_SYMBOL(smp_call_function);
573 624
625/**
626 * smp_call_function_single - Run a function on another CPU
627 * @cpu: The target CPU. Cannot be the calling CPU.
628 * @func: The function to run. This must be fast and non-blocking.
629 * @info: An arbitrary pointer to pass to the function.
630 * @nonatomic: Unused.
631 * @wait: If true, wait until function has completed on other CPUs.
632 *
633 * Returns 0 on success, else a negative status code.
634 *
635 * If @wait is true, then returns once @func has returned; otherwise
636 * it returns just before the target cpu calls @func.
637 */
638int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
639 int nonatomic, int wait)
640{
641 /* prevent preemption and reschedule on another processor */
642 int ret;
643 int me = get_cpu();
644 if (cpu == me) {
645 WARN_ON(1);
646 put_cpu();
647 return -EBUSY;
648 }
649
650 ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
651
652 put_cpu();
653 return ret;
654}
655EXPORT_SYMBOL(smp_call_function_single);
656
574static void stop_this_cpu (void * dummy) 657static void stop_this_cpu (void * dummy)
575{ 658{
659 local_irq_disable();
576 /* 660 /*
577 * Remove this CPU: 661 * Remove this CPU:
578 */ 662 */
579 cpu_clear(smp_processor_id(), cpu_online_map); 663 cpu_clear(smp_processor_id(), cpu_online_map);
580 local_irq_disable();
581 disable_local_APIC(); 664 disable_local_APIC();
582 if (cpu_data[smp_processor_id()].hlt_works_ok) 665 if (cpu_data[smp_processor_id()].hlt_works_ok)
583 for(;;) halt(); 666 for(;;) halt();
@@ -588,13 +671,18 @@ static void stop_this_cpu (void * dummy)
588 * this function calls the 'stop' function on all other CPUs in the system. 671 * this function calls the 'stop' function on all other CPUs in the system.
589 */ 672 */
590 673
591void smp_send_stop(void) 674void native_smp_send_stop(void)
592{ 675{
593 smp_call_function(stop_this_cpu, NULL, 1, 0); 676 /* Don't deadlock on the call lock in panic */
677 int nolock = !spin_trylock(&call_lock);
678 unsigned long flags;
594 679
595 local_irq_disable(); 680 local_irq_save(flags);
681 __smp_call_function(stop_this_cpu, NULL, 0, 0);
682 if (!nolock)
683 spin_unlock(&call_lock);
596 disable_local_APIC(); 684 disable_local_APIC();
597 local_irq_enable(); 685 local_irq_restore(flags);
598} 686}
599 687
600/* 688/*
@@ -633,77 +721,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
633 } 721 }
634} 722}
635 723
636/*
637 * this function sends a 'generic call function' IPI to one other CPU
638 * in the system.
639 *
640 * cpu is a standard Linux logical CPU number.
641 */
642static void
643__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
644 int nonatomic, int wait)
645{
646 struct call_data_struct data;
647 int cpus = 1;
648
649 data.func = func;
650 data.info = info;
651 atomic_set(&data.started, 0);
652 data.wait = wait;
653 if (wait)
654 atomic_set(&data.finished, 0);
655
656 call_data = &data;
657 wmb();
658 /* Send a message to all other CPUs and wait for them to respond */
659 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
660
661 /* Wait for response */
662 while (atomic_read(&data.started) != cpus)
663 cpu_relax();
664
665 if (!wait)
666 return;
667
668 while (atomic_read(&data.finished) != cpus)
669 cpu_relax();
670}
671
672/*
673 * smp_call_function_single - Run a function on another CPU
674 * @func: The function to run. This must be fast and non-blocking.
675 * @info: An arbitrary pointer to pass to the function.
676 * @nonatomic: Currently unused.
677 * @wait: If true, wait until function has completed on other CPUs.
678 *
679 * Retrurns 0 on success, else a negative status code.
680 *
681 * Does not return until the remote CPU is nearly ready to execute <func>
682 * or is or has executed.
683 */
684
685int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
686 int nonatomic, int wait)
687{
688 /* prevent preemption and reschedule on another processor */
689 int me = get_cpu();
690 if (cpu == me) {
691 WARN_ON(1);
692 put_cpu();
693 return -EBUSY;
694 }
695
696 /* Can deadlock when called with interrupts disabled */
697 WARN_ON(irqs_disabled());
698
699 spin_lock_bh(&call_lock);
700 __smp_call_function_single(cpu, func, info, nonatomic, wait);
701 spin_unlock_bh(&call_lock);
702 put_cpu();
703 return 0;
704}
705EXPORT_SYMBOL(smp_call_function_single);
706
707static int convert_apicid_to_cpu(int apic_id) 724static int convert_apicid_to_cpu(int apic_id)
708{ 725{
709 int i; 726 int i;
@@ -730,3 +747,14 @@ int safe_smp_processor_id(void)
730 747
731 return cpuid >= 0 ? cpuid : 0; 748 return cpuid >= 0 ? cpuid : 0;
732} 749}
750
751struct smp_ops smp_ops = {
752 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
753 .smp_prepare_cpus = native_smp_prepare_cpus,
754 .cpu_up = native_cpu_up,
755 .smp_cpus_done = native_smp_cpus_done,
756
757 .smp_send_stop = native_smp_send_stop,
758 .smp_send_reschedule = native_smp_send_reschedule,
759 .smp_call_function_mask = native_smp_call_function_mask,
760};
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4ff55e675576..a4b7ad283f49 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -53,13 +53,12 @@
53#include <asm/desc.h> 53#include <asm/desc.h>
54#include <asm/arch_hooks.h> 54#include <asm/arch_hooks.h>
55#include <asm/nmi.h> 55#include <asm/nmi.h>
56#include <asm/pda.h>
57#include <asm/genapic.h>
58 56
59#include <mach_apic.h> 57#include <mach_apic.h>
60#include <mach_wakecpu.h> 58#include <mach_wakecpu.h>
61#include <smpboot_hooks.h> 59#include <smpboot_hooks.h>
62#include <asm/vmi.h> 60#include <asm/vmi.h>
61#include <asm/mtrr.h>
63 62
64/* Set if we find a B stepping CPU */ 63/* Set if we find a B stepping CPU */
65static int __devinitdata smp_b_stepping; 64static int __devinitdata smp_b_stepping;
@@ -100,6 +99,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid);
100 99
101u8 apicid_2_node[MAX_APICID]; 100u8 apicid_2_node[MAX_APICID];
102 101
102DEFINE_PER_CPU(unsigned long, this_cpu_off);
103EXPORT_PER_CPU_SYMBOL(this_cpu_off);
104
103/* 105/*
104 * Trampoline 80x86 program as an array. 106 * Trampoline 80x86 program as an array.
105 */ 107 */
@@ -156,7 +158,7 @@ static void __cpuinit smp_store_cpu_info(int id)
156 158
157 *c = boot_cpu_data; 159 *c = boot_cpu_data;
158 if (id!=0) 160 if (id!=0)
159 identify_cpu(c); 161 identify_secondary_cpu(c);
160 /* 162 /*
161 * Mask B, Pentium, but not Pentium MMX 163 * Mask B, Pentium, but not Pentium MMX
162 */ 164 */
@@ -379,14 +381,14 @@ set_cpu_sibling_map(int cpu)
379static void __cpuinit start_secondary(void *unused) 381static void __cpuinit start_secondary(void *unused)
380{ 382{
381 /* 383 /*
382 * Don't put *anything* before secondary_cpu_init(), SMP 384 * Don't put *anything* before cpu_init(), SMP booting is too
383 * booting is too fragile that we want to limit the 385 * fragile that we want to limit the things done here to the
384 * things done here to the most necessary things. 386 * most necessary things.
385 */ 387 */
386#ifdef CONFIG_VMI 388#ifdef CONFIG_VMI
387 vmi_bringup(); 389 vmi_bringup();
388#endif 390#endif
389 secondary_cpu_init(); 391 cpu_init();
390 preempt_disable(); 392 preempt_disable();
391 smp_callin(); 393 smp_callin();
392 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 394 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -441,12 +443,6 @@ static void __cpuinit start_secondary(void *unused)
441void __devinit initialize_secondary(void) 443void __devinit initialize_secondary(void)
442{ 444{
443 /* 445 /*
444 * switch to the per CPU GDT we already set up
445 * in do_boot_cpu()
446 */
447 cpu_set_gdt(current_thread_info()->cpu);
448
449 /*
450 * We don't actually need to load the full TSS, 446 * We don't actually need to load the full TSS,
451 * basically just the stack pointer and the eip. 447 * basically just the stack pointer and the eip.
452 */ 448 */
@@ -463,7 +459,6 @@ extern struct {
463 void * esp; 459 void * esp;
464 unsigned short ss; 460 unsigned short ss;
465} stack_start; 461} stack_start;
466extern struct i386_pda *start_pda;
467 462
468#ifdef CONFIG_NUMA 463#ifdef CONFIG_NUMA
469 464
@@ -521,12 +516,12 @@ static void unmap_cpu_to_logical_apicid(int cpu)
521 unmap_cpu_to_node(cpu); 516 unmap_cpu_to_node(cpu);
522} 517}
523 518
524#if APIC_DEBUG
525static inline void __inquire_remote_apic(int apicid) 519static inline void __inquire_remote_apic(int apicid)
526{ 520{
527 int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 521 int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
528 char *names[] = { "ID", "VERSION", "SPIV" }; 522 char *names[] = { "ID", "VERSION", "SPIV" };
529 int timeout, status; 523 int timeout;
524 unsigned long status;
530 525
531 printk("Inquiring remote APIC #%d...\n", apicid); 526 printk("Inquiring remote APIC #%d...\n", apicid);
532 527
@@ -536,7 +531,9 @@ static inline void __inquire_remote_apic(int apicid)
536 /* 531 /*
537 * Wait for idle. 532 * Wait for idle.
538 */ 533 */
539 apic_wait_icr_idle(); 534 status = safe_apic_wait_icr_idle();
535 if (status)
536 printk("a previous APIC delivery may have failed\n");
540 537
541 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 538 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
542 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); 539 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -550,14 +547,13 @@ static inline void __inquire_remote_apic(int apicid)
550 switch (status) { 547 switch (status) {
551 case APIC_ICR_RR_VALID: 548 case APIC_ICR_RR_VALID:
552 status = apic_read(APIC_RRR); 549 status = apic_read(APIC_RRR);
553 printk("%08x\n", status); 550 printk("%lx\n", status);
554 break; 551 break;
555 default: 552 default:
556 printk("failed\n"); 553 printk("failed\n");
557 } 554 }
558 } 555 }
559} 556}
560#endif
561 557
562#ifdef WAKE_SECONDARY_VIA_NMI 558#ifdef WAKE_SECONDARY_VIA_NMI
563/* 559/*
@@ -568,8 +564,8 @@ static inline void __inquire_remote_apic(int apicid)
568static int __devinit 564static int __devinit
569wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) 565wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
570{ 566{
571 unsigned long send_status = 0, accept_status = 0; 567 unsigned long send_status, accept_status = 0;
572 int timeout, maxlvt; 568 int maxlvt;
573 569
574 /* Target chip */ 570 /* Target chip */
575 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); 571 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
@@ -579,12 +575,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 575 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
580 576
581 Dprintk("Waiting for send to finish...\n"); 577 Dprintk("Waiting for send to finish...\n");
582 timeout = 0; 578 send_status = safe_apic_wait_icr_idle();
583 do {
584 Dprintk("+");
585 udelay(100);
586 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
587 } while (send_status && (timeout++ < 1000));
588 579
589 /* 580 /*
590 * Give the other CPU some time to accept the IPI. 581 * Give the other CPU some time to accept the IPI.
@@ -614,8 +605,8 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
614static int __devinit 605static int __devinit
615wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) 606wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
616{ 607{
617 unsigned long send_status = 0, accept_status = 0; 608 unsigned long send_status, accept_status = 0;
618 int maxlvt, timeout, num_starts, j; 609 int maxlvt, num_starts, j;
619 610
620 /* 611 /*
621 * Be paranoid about clearing APIC errors. 612 * Be paranoid about clearing APIC errors.
@@ -640,12 +631,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
640 | APIC_DM_INIT); 631 | APIC_DM_INIT);
641 632
642 Dprintk("Waiting for send to finish...\n"); 633 Dprintk("Waiting for send to finish...\n");
643 timeout = 0; 634 send_status = safe_apic_wait_icr_idle();
644 do {
645 Dprintk("+");
646 udelay(100);
647 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
648 } while (send_status && (timeout++ < 1000));
649 635
650 mdelay(10); 636 mdelay(10);
651 637
@@ -658,12 +644,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
658 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 644 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
659 645
660 Dprintk("Waiting for send to finish...\n"); 646 Dprintk("Waiting for send to finish...\n");
661 timeout = 0; 647 send_status = safe_apic_wait_icr_idle();
662 do {
663 Dprintk("+");
664 udelay(100);
665 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
666 } while (send_status && (timeout++ < 1000));
667 648
668 atomic_set(&init_deasserted, 1); 649 atomic_set(&init_deasserted, 1);
669 650
@@ -719,12 +700,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
719 Dprintk("Startup point 1.\n"); 700 Dprintk("Startup point 1.\n");
720 701
721 Dprintk("Waiting for send to finish...\n"); 702 Dprintk("Waiting for send to finish...\n");
722 timeout = 0; 703 send_status = safe_apic_wait_icr_idle();
723 do {
724 Dprintk("+");
725 udelay(100);
726 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
727 } while (send_status && (timeout++ < 1000));
728 704
729 /* 705 /*
730 * Give the other CPU some time to accept the IPI. 706 * Give the other CPU some time to accept the IPI.
@@ -788,6 +764,25 @@ static inline struct task_struct * alloc_idle_task(int cpu)
788#define alloc_idle_task(cpu) fork_idle(cpu) 764#define alloc_idle_task(cpu) fork_idle(cpu)
789#endif 765#endif
790 766
767/* Initialize the CPU's GDT. This is either the boot CPU doing itself
768 (still using the master per-cpu area), or a CPU doing it for a
769 secondary which will soon come up. */
770static __cpuinit void init_gdt(int cpu)
771{
772 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
773
774 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
775 (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
776 __per_cpu_offset[cpu], 0xFFFFF,
777 0x80 | DESCTYPE_S | 0x2, 0x8);
778
779 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
780 per_cpu(cpu_number, cpu) = cpu;
781}
782
783/* Defined in head.S */
784extern struct Xgt_desc_struct early_gdt_descr;
785
791static int __cpuinit do_boot_cpu(int apicid, int cpu) 786static int __cpuinit do_boot_cpu(int apicid, int cpu)
792/* 787/*
793 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 788 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -802,6 +797,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
802 unsigned short nmi_high = 0, nmi_low = 0; 797 unsigned short nmi_high = 0, nmi_low = 0;
803 798
804 /* 799 /*
800 * Save current MTRR state in case it was changed since early boot
801 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
802 */
803 mtrr_save_state();
804
805 /*
805 * We can't use kernel_thread since we must avoid to 806 * We can't use kernel_thread since we must avoid to
806 * reschedule the child. 807 * reschedule the child.
807 */ 808 */
@@ -809,13 +810,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
809 if (IS_ERR(idle)) 810 if (IS_ERR(idle))
810 panic("failed fork for CPU %d", cpu); 811 panic("failed fork for CPU %d", cpu);
811 812
812 /* Pre-allocate and initialize the CPU's GDT and PDA so it 813 init_gdt(cpu);
813 doesn't have to do any memory allocation during the 814 per_cpu(current_task, cpu) = idle;
814 delicate CPU-bringup phase. */ 815 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
815 if (!init_gdt(cpu, idle)) {
816 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
817 return -1; /* ? */
818 }
819 816
820 idle->thread.eip = (unsigned long) start_secondary; 817 idle->thread.eip = (unsigned long) start_secondary;
821 /* start_eip had better be page-aligned! */ 818 /* start_eip had better be page-aligned! */
@@ -941,7 +938,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
941 DECLARE_COMPLETION_ONSTACK(done); 938 DECLARE_COMPLETION_ONSTACK(done);
942 struct warm_boot_cpu_info info; 939 struct warm_boot_cpu_info info;
943 int apicid, ret; 940 int apicid, ret;
944 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
945 941
946 apicid = x86_cpu_to_apicid[cpu]; 942 apicid = x86_cpu_to_apicid[cpu];
947 if (apicid == BAD_APICID) { 943 if (apicid == BAD_APICID) {
@@ -949,18 +945,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
949 goto exit; 945 goto exit;
950 } 946 }
951 947
952 /*
953 * the CPU isn't initialized at boot time, allocate gdt table here.
954 * cpu_init will initialize it
955 */
956 if (!cpu_gdt_descr->address) {
957 cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
958 if (!cpu_gdt_descr->address)
959 printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
960 ret = -ENOMEM;
961 goto exit;
962 }
963
964 info.complete = &done; 948 info.complete = &done;
965 info.apicid = apicid; 949 info.apicid = apicid;
966 info.cpu = cpu; 950 info.cpu = cpu;
@@ -1173,7 +1157,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1173 1157
1174/* These are wrappers to interface to the new boot process. Someone 1158/* These are wrappers to interface to the new boot process. Someone
1175 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ 1159 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
1176void __init smp_prepare_cpus(unsigned int max_cpus) 1160void __init native_smp_prepare_cpus(unsigned int max_cpus)
1177{ 1161{
1178 smp_commenced_mask = cpumask_of_cpu(0); 1162 smp_commenced_mask = cpumask_of_cpu(0);
1179 cpu_callin_map = cpumask_of_cpu(0); 1163 cpu_callin_map = cpumask_of_cpu(0);
@@ -1181,13 +1165,18 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
1181 smp_boot_cpus(max_cpus); 1165 smp_boot_cpus(max_cpus);
1182} 1166}
1183 1167
1184void __devinit smp_prepare_boot_cpu(void) 1168void __init native_smp_prepare_boot_cpu(void)
1185{ 1169{
1186 cpu_set(smp_processor_id(), cpu_online_map); 1170 unsigned int cpu = smp_processor_id();
1187 cpu_set(smp_processor_id(), cpu_callout_map); 1171
1188 cpu_set(smp_processor_id(), cpu_present_map); 1172 init_gdt(cpu);
1189 cpu_set(smp_processor_id(), cpu_possible_map); 1173 switch_to_new_gdt();
1190 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 1174
1175 cpu_set(cpu, cpu_online_map);
1176 cpu_set(cpu, cpu_callout_map);
1177 cpu_set(cpu, cpu_present_map);
1178 cpu_set(cpu, cpu_possible_map);
1179 __get_cpu_var(cpu_state) = CPU_ONLINE;
1191} 1180}
1192 1181
1193#ifdef CONFIG_HOTPLUG_CPU 1182#ifdef CONFIG_HOTPLUG_CPU
@@ -1277,7 +1266,7 @@ void __cpu_die(unsigned int cpu)
1277} 1266}
1278#endif /* CONFIG_HOTPLUG_CPU */ 1267#endif /* CONFIG_HOTPLUG_CPU */
1279 1268
1280int __cpuinit __cpu_up(unsigned int cpu) 1269int __cpuinit native_cpu_up(unsigned int cpu)
1281{ 1270{
1282 unsigned long flags; 1271 unsigned long flags;
1283#ifdef CONFIG_HOTPLUG_CPU 1272#ifdef CONFIG_HOTPLUG_CPU
@@ -1319,15 +1308,10 @@ int __cpuinit __cpu_up(unsigned int cpu)
1319 touch_nmi_watchdog(); 1308 touch_nmi_watchdog();
1320 } 1309 }
1321 1310
1322#ifdef CONFIG_X86_GENERICARCH
1323 if (num_online_cpus() > 8 && genapic == &apic_default)
1324 panic("Default flat APIC routing can't be used with > 8 cpus\n");
1325#endif
1326
1327 return 0; 1311 return 0;
1328} 1312}
1329 1313
1330void __init smp_cpus_done(unsigned int max_cpus) 1314void __init native_smp_cpus_done(unsigned int max_cpus)
1331{ 1315{
1332#ifdef CONFIG_X86_IO_APIC 1316#ifdef CONFIG_X86_IO_APIC
1333 setup_ioapic_dest(); 1317 setup_ioapic_dest();
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 13ca54a85a1c..ff4ee6f3326b 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -22,16 +22,26 @@
22#include <asm/msr.h> 22#include <asm/msr.h>
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/unistd.h> 24#include <asm/unistd.h>
25#include <asm/elf.h>
26#include <asm/tlbflush.h>
27
28enum {
29 VDSO_DISABLED = 0,
30 VDSO_ENABLED = 1,
31 VDSO_COMPAT = 2,
32};
33
34#ifdef CONFIG_COMPAT_VDSO
35#define VDSO_DEFAULT VDSO_COMPAT
36#else
37#define VDSO_DEFAULT VDSO_ENABLED
38#endif
25 39
26/* 40/*
27 * Should the kernel map a VDSO page into processes and pass its 41 * Should the kernel map a VDSO page into processes and pass its
28 * address down to glibc upon exec()? 42 * address down to glibc upon exec()?
29 */ 43 */
30#ifdef CONFIG_PARAVIRT 44unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
31unsigned int __read_mostly vdso_enabled = 0;
32#else
33unsigned int __read_mostly vdso_enabled = 1;
34#endif
35 45
36EXPORT_SYMBOL_GPL(vdso_enabled); 46EXPORT_SYMBOL_GPL(vdso_enabled);
37 47
@@ -46,6 +56,123 @@ __setup("vdso=", vdso_setup);
46 56
47extern asmlinkage void sysenter_entry(void); 57extern asmlinkage void sysenter_entry(void);
48 58
59static __init void reloc_symtab(Elf32_Ehdr *ehdr,
60 unsigned offset, unsigned size)
61{
62 Elf32_Sym *sym = (void *)ehdr + offset;
63 unsigned nsym = size / sizeof(*sym);
64 unsigned i;
65
66 for(i = 0; i < nsym; i++, sym++) {
67 if (sym->st_shndx == SHN_UNDEF ||
68 sym->st_shndx == SHN_ABS)
69 continue; /* skip */
70
71 if (sym->st_shndx > SHN_LORESERVE) {
72 printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
73 sym->st_shndx);
74 continue;
75 }
76
77 switch(ELF_ST_TYPE(sym->st_info)) {
78 case STT_OBJECT:
79 case STT_FUNC:
80 case STT_SECTION:
81 case STT_FILE:
82 sym->st_value += VDSO_HIGH_BASE;
83 }
84 }
85}
86
87static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
88{
89 Elf32_Dyn *dyn = (void *)ehdr + offset;
90
91 for(; dyn->d_tag != DT_NULL; dyn++)
92 switch(dyn->d_tag) {
93 case DT_PLTGOT:
94 case DT_HASH:
95 case DT_STRTAB:
96 case DT_SYMTAB:
97 case DT_RELA:
98 case DT_INIT:
99 case DT_FINI:
100 case DT_REL:
101 case DT_DEBUG:
102 case DT_JMPREL:
103 case DT_VERSYM:
104 case DT_VERDEF:
105 case DT_VERNEED:
106 case DT_ADDRRNGLO ... DT_ADDRRNGHI:
107 /* definitely pointers needing relocation */
108 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
109 break;
110
111 case DT_ENCODING ... OLD_DT_LOOS-1:
112 case DT_LOOS ... DT_HIOS-1:
113 /* Tags above DT_ENCODING are pointers if
114 they're even */
115 if (dyn->d_tag >= DT_ENCODING &&
116 (dyn->d_tag & 1) == 0)
117 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
118 break;
119
120 case DT_VERDEFNUM:
121 case DT_VERNEEDNUM:
122 case DT_FLAGS_1:
123 case DT_RELACOUNT:
124 case DT_RELCOUNT:
125 case DT_VALRNGLO ... DT_VALRNGHI:
126 /* definitely not pointers */
127 break;
128
129 case OLD_DT_LOOS ... DT_LOOS-1:
130 case DT_HIOS ... DT_VALRNGLO-1:
131 default:
132 if (dyn->d_tag > DT_ENCODING)
133 printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
134 dyn->d_tag);
135 break;
136 }
137}
138
139static __init void relocate_vdso(Elf32_Ehdr *ehdr)
140{
141 Elf32_Phdr *phdr;
142 Elf32_Shdr *shdr;
143 int i;
144
145 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
146 !elf_check_arch(ehdr) ||
147 ehdr->e_type != ET_DYN);
148
149 ehdr->e_entry += VDSO_HIGH_BASE;
150
151 /* rebase phdrs */
152 phdr = (void *)ehdr + ehdr->e_phoff;
153 for (i = 0; i < ehdr->e_phnum; i++) {
154 phdr[i].p_vaddr += VDSO_HIGH_BASE;
155
156 /* relocate dynamic stuff */
157 if (phdr[i].p_type == PT_DYNAMIC)
158 reloc_dyn(ehdr, phdr[i].p_offset);
159 }
160
161 /* rebase sections */
162 shdr = (void *)ehdr + ehdr->e_shoff;
163 for(i = 0; i < ehdr->e_shnum; i++) {
164 if (!(shdr[i].sh_flags & SHF_ALLOC))
165 continue;
166
167 shdr[i].sh_addr += VDSO_HIGH_BASE;
168
169 if (shdr[i].sh_type == SHT_SYMTAB ||
170 shdr[i].sh_type == SHT_DYNSYM)
171 reloc_symtab(ehdr, shdr[i].sh_offset,
172 shdr[i].sh_size);
173 }
174}
175
49void enable_sep_cpu(void) 176void enable_sep_cpu(void)
50{ 177{
51 int cpu = get_cpu(); 178 int cpu = get_cpu();
@@ -56,14 +183,33 @@ void enable_sep_cpu(void)
56 return; 183 return;
57 } 184 }
58 185
59 tss->ss1 = __KERNEL_CS; 186 tss->x86_tss.ss1 = __KERNEL_CS;
60 tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; 187 tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
61 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 188 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
62 wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0); 189 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
63 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); 190 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
64 put_cpu(); 191 put_cpu();
65} 192}
66 193
194static struct vm_area_struct gate_vma;
195
196static int __init gate_vma_init(void)
197{
198 gate_vma.vm_mm = NULL;
199 gate_vma.vm_start = FIXADDR_USER_START;
200 gate_vma.vm_end = FIXADDR_USER_END;
201 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
202 gate_vma.vm_page_prot = __P101;
203 /*
204 * Make sure the vDSO gets into every core dump.
205 * Dumping its contents makes post-mortem fully interpretable later
206 * without matching up the same kernel and hardware config to see
207 * what PC values meant.
208 */
209 gate_vma.vm_flags |= VM_ALWAYSDUMP;
210 return 0;
211}
212
67/* 213/*
68 * These symbols are defined by vsyscall.o to mark the bounds 214 * These symbols are defined by vsyscall.o to mark the bounds
69 * of the ELF DSO images included therein. 215 * of the ELF DSO images included therein.
@@ -72,31 +218,48 @@ extern const char vsyscall_int80_start, vsyscall_int80_end;
72extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; 218extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
73static struct page *syscall_pages[1]; 219static struct page *syscall_pages[1];
74 220
221static void map_compat_vdso(int map)
222{
223 static int vdso_mapped;
224
225 if (map == vdso_mapped)
226 return;
227
228 vdso_mapped = map;
229
230 __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
231 map ? PAGE_READONLY_EXEC : PAGE_NONE);
232
233 /* flush stray tlbs */
234 flush_tlb_all();
235}
236
75int __init sysenter_setup(void) 237int __init sysenter_setup(void)
76{ 238{
77 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); 239 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
240 const void *vsyscall;
241 size_t vsyscall_len;
242
78 syscall_pages[0] = virt_to_page(syscall_page); 243 syscall_pages[0] = virt_to_page(syscall_page);
79 244
80#ifdef CONFIG_COMPAT_VDSO 245 gate_vma_init();
81 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC); 246
82 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); 247 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
83#endif
84 248
85 if (!boot_cpu_has(X86_FEATURE_SEP)) { 249 if (!boot_cpu_has(X86_FEATURE_SEP)) {
86 memcpy(syscall_page, 250 vsyscall = &vsyscall_int80_start;
87 &vsyscall_int80_start, 251 vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
88 &vsyscall_int80_end - &vsyscall_int80_start); 252 } else {
89 return 0; 253 vsyscall = &vsyscall_sysenter_start;
254 vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
90 } 255 }
91 256
92 memcpy(syscall_page, 257 memcpy(syscall_page, vsyscall, vsyscall_len);
93 &vsyscall_sysenter_start, 258 relocate_vdso(syscall_page);
94 &vsyscall_sysenter_end - &vsyscall_sysenter_start);
95 259
96 return 0; 260 return 0;
97} 261}
98 262
99#ifndef CONFIG_COMPAT_VDSO
100/* Defined in vsyscall-sysenter.S */ 263/* Defined in vsyscall-sysenter.S */
101extern void SYSENTER_RETURN; 264extern void SYSENTER_RETURN;
102 265
@@ -105,36 +268,52 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
105{ 268{
106 struct mm_struct *mm = current->mm; 269 struct mm_struct *mm = current->mm;
107 unsigned long addr; 270 unsigned long addr;
108 int ret; 271 int ret = 0;
272 bool compat;
109 273
110 down_write(&mm->mmap_sem); 274 down_write(&mm->mmap_sem);
111 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
112 if (IS_ERR_VALUE(addr)) {
113 ret = addr;
114 goto up_fail;
115 }
116 275
117 /* 276 /* Test compat mode once here, in case someone
118 * MAYWRITE to allow gdb to COW and set breakpoints 277 changes it via sysctl */
119 * 278 compat = (vdso_enabled == VDSO_COMPAT);
120 * Make sure the vDSO gets into every core dump. 279
121 * Dumping its contents makes post-mortem fully interpretable later 280 map_compat_vdso(compat);
122 * without matching up the same kernel and hardware config to see 281
123 * what PC values meant. 282 if (compat)
124 */ 283 addr = VDSO_HIGH_BASE;
125 ret = install_special_mapping(mm, addr, PAGE_SIZE, 284 else {
126 VM_READ|VM_EXEC| 285 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
127 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 286 if (IS_ERR_VALUE(addr)) {
128 VM_ALWAYSDUMP, 287 ret = addr;
129 syscall_pages); 288 goto up_fail;
130 if (ret) 289 }
131 goto up_fail; 290
291 /*
292 * MAYWRITE to allow gdb to COW and set breakpoints
293 *
294 * Make sure the vDSO gets into every core dump.
295 * Dumping its contents makes post-mortem fully
296 * interpretable later without matching up the same
297 * kernel and hardware config to see what PC values
298 * meant.
299 */
300 ret = install_special_mapping(mm, addr, PAGE_SIZE,
301 VM_READ|VM_EXEC|
302 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
303 VM_ALWAYSDUMP,
304 syscall_pages);
305
306 if (ret)
307 goto up_fail;
308 }
132 309
133 current->mm->context.vdso = (void *)addr; 310 current->mm->context.vdso = (void *)addr;
134 current_thread_info()->sysenter_return = 311 current_thread_info()->sysenter_return =
135 (void *)VDSO_SYM(&SYSENTER_RETURN); 312 (void *)VDSO_SYM(&SYSENTER_RETURN);
136up_fail: 313
314 up_fail:
137 up_write(&mm->mmap_sem); 315 up_write(&mm->mmap_sem);
316
138 return ret; 317 return ret;
139} 318}
140 319
@@ -147,6 +326,11 @@ const char *arch_vma_name(struct vm_area_struct *vma)
147 326
148struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 327struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
149{ 328{
329 struct mm_struct *mm = tsk->mm;
330
331 /* Check to see if this task was created in compat vdso mode */
332 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
333 return &gate_vma;
150 return NULL; 334 return NULL;
151} 335}
152 336
@@ -159,4 +343,3 @@ int in_gate_area_no_task(unsigned long addr)
159{ 343{
160 return 0; 344 return 0;
161} 345}
162#endif
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 94e5cb091104..a665df61f08c 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -70,8 +70,6 @@
70 70
71#include <asm/i8259.h> 71#include <asm/i8259.h>
72 72
73int pit_latch_buggy; /* extern */
74
75#include "do_timer.h" 73#include "do_timer.h"
76 74
77unsigned int cpu_khz; /* Detected as we calibrate the TSC */ 75unsigned int cpu_khz; /* Detected as we calibrate the TSC */
diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S
index 2f1814c5cfd7..f62815f8d06a 100644
--- a/arch/i386/kernel/trampoline.S
+++ b/arch/i386/kernel/trampoline.S
@@ -29,7 +29,7 @@
29 * 29 *
30 * TYPE VALUE 30 * TYPE VALUE
31 * R_386_32 startup_32_smp 31 * R_386_32 startup_32_smp
32 * R_386_32 boot_gdt_table 32 * R_386_32 boot_gdt
33 */ 33 */
34 34
35#include <linux/linkage.h> 35#include <linux/linkage.h>
@@ -62,8 +62,8 @@ r_base = .
62 * to 32 bit. 62 * to 32 bit.
63 */ 63 */
64 64
65 lidtl boot_idt - r_base # load idt with 0, 0 65 lidtl boot_idt_descr - r_base # load idt with 0, 0
66 lgdtl boot_gdt - r_base # load gdt with whatever is appropriate 66 lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
67 67
68 xor %ax, %ax 68 xor %ax, %ax
69 inc %ax # protected mode (PE) bit 69 inc %ax # protected mode (PE) bit
@@ -73,11 +73,11 @@ r_base = .
73 73
74 # These need to be in the same 64K segment as the above; 74 # These need to be in the same 64K segment as the above;
75 # hence we don't use the boot_gdt_descr defined in head.S 75 # hence we don't use the boot_gdt_descr defined in head.S
76boot_gdt: 76boot_gdt_descr:
77 .word __BOOT_DS + 7 # gdt limit 77 .word __BOOT_DS + 7 # gdt limit
78 .long boot_gdt_table-__PAGE_OFFSET # gdt base 78 .long boot_gdt - __PAGE_OFFSET # gdt base
79 79
80boot_idt: 80boot_idt_descr:
81 .word 0 # idt limit = 0 81 .word 0 # idt limit = 0
82 .long 0 # idt base = 0L 82 .long 0 # idt base = 0L
83 83
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index af0d3f70a817..f21b41e7770c 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -476,8 +476,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
476 siginfo_t *info) 476 siginfo_t *info)
477{ 477{
478 struct task_struct *tsk = current; 478 struct task_struct *tsk = current;
479 tsk->thread.error_code = error_code;
480 tsk->thread.trap_no = trapnr;
481 479
482 if (regs->eflags & VM_MASK) { 480 if (regs->eflags & VM_MASK) {
483 if (vm86) 481 if (vm86)
@@ -489,6 +487,18 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
489 goto kernel_trap; 487 goto kernel_trap;
490 488
491 trap_signal: { 489 trap_signal: {
490 /*
491 * We want error_code and trap_no set for userspace faults and
492 * kernelspace faults which result in die(), but not
493 * kernelspace faults which are fixed up. die() gives the
494 * process no chance to handle the signal and notice the
495 * kernel fault information, so that won't result in polluting
496 * the information about previously queued, but not yet
497 * delivered, faults. See also do_general_protection below.
498 */
499 tsk->thread.error_code = error_code;
500 tsk->thread.trap_no = trapnr;
501
492 if (info) 502 if (info)
493 force_sig_info(signr, info, tsk); 503 force_sig_info(signr, info, tsk);
494 else 504 else
@@ -497,8 +507,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
497 } 507 }
498 508
499 kernel_trap: { 509 kernel_trap: {
500 if (!fixup_exception(regs)) 510 if (!fixup_exception(regs)) {
511 tsk->thread.error_code = error_code;
512 tsk->thread.trap_no = trapnr;
501 die(str, regs, error_code); 513 die(str, regs, error_code);
514 }
502 return; 515 return;
503 } 516 }
504 517
@@ -583,7 +596,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
583 * and we set the offset field correctly. Then we let the CPU to 596 * and we set the offset field correctly. Then we let the CPU to
584 * restart the faulting instruction. 597 * restart the faulting instruction.
585 */ 598 */
586 if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && 599 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
587 thread->io_bitmap_ptr) { 600 thread->io_bitmap_ptr) {
588 memcpy(tss->io_bitmap, thread->io_bitmap_ptr, 601 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
589 thread->io_bitmap_max); 602 thread->io_bitmap_max);
@@ -596,16 +609,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
596 thread->io_bitmap_max, 0xff, 609 thread->io_bitmap_max, 0xff,
597 tss->io_bitmap_max - thread->io_bitmap_max); 610 tss->io_bitmap_max - thread->io_bitmap_max);
598 tss->io_bitmap_max = thread->io_bitmap_max; 611 tss->io_bitmap_max = thread->io_bitmap_max;
599 tss->io_bitmap_base = IO_BITMAP_OFFSET; 612 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
600 tss->io_bitmap_owner = thread; 613 tss->io_bitmap_owner = thread;
601 put_cpu(); 614 put_cpu();
602 return; 615 return;
603 } 616 }
604 put_cpu(); 617 put_cpu();
605 618
606 current->thread.error_code = error_code;
607 current->thread.trap_no = 13;
608
609 if (regs->eflags & VM_MASK) 619 if (regs->eflags & VM_MASK)
610 goto gp_in_vm86; 620 goto gp_in_vm86;
611 621
@@ -624,6 +634,8 @@ gp_in_vm86:
624 634
625gp_in_kernel: 635gp_in_kernel:
626 if (!fixup_exception(regs)) { 636 if (!fixup_exception(regs)) {
637 current->thread.error_code = error_code;
638 current->thread.trap_no = 13;
627 if (notify_die(DIE_GPF, "general protection fault", regs, 639 if (notify_die(DIE_GPF, "general protection fault", regs,
628 error_code, 13, SIGSEGV) == NOTIFY_STOP) 640 error_code, 13, SIGSEGV) == NOTIFY_STOP)
629 return; 641 return;
@@ -1018,9 +1030,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1018fastcall unsigned long patch_espfix_desc(unsigned long uesp, 1030fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1019 unsigned long kesp) 1031 unsigned long kesp)
1020{ 1032{
1021 int cpu = smp_processor_id(); 1033 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
1022 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1023 struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
1024 unsigned long base = (kesp - uesp) & -THREAD_SIZE; 1034 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1025 unsigned long new_kesp = kesp - base; 1035 unsigned long new_kesp = kesp - base;
1026 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; 1036 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 6cb8f5336732..f64b81f3033b 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -200,13 +200,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
200{ 200{
201 struct cpufreq_freqs *freq = data; 201 struct cpufreq_freqs *freq = data;
202 202
203 if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
204 write_seqlock_irq(&xtime_lock);
205
206 if (!ref_freq) { 203 if (!ref_freq) {
207 if (!freq->old){ 204 if (!freq->old){
208 ref_freq = freq->new; 205 ref_freq = freq->new;
209 goto end; 206 return 0;
210 } 207 }
211 ref_freq = freq->old; 208 ref_freq = freq->old;
212 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; 209 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
@@ -233,13 +230,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
233 * TSC based sched_clock turns 230 * TSC based sched_clock turns
234 * to junk w/ cpufreq 231 * to junk w/ cpufreq
235 */ 232 */
236 mark_tsc_unstable(); 233 mark_tsc_unstable("cpufreq changes");
237 } 234 }
238 } 235 }
239 } 236 }
240end:
241 if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
242 write_sequnlock_irq(&xtime_lock);
243 237
244 return 0; 238 return 0;
245} 239}
@@ -281,11 +275,12 @@ static struct clocksource clocksource_tsc = {
281 CLOCK_SOURCE_MUST_VERIFY, 275 CLOCK_SOURCE_MUST_VERIFY,
282}; 276};
283 277
284void mark_tsc_unstable(void) 278void mark_tsc_unstable(char *reason)
285{ 279{
286 if (!tsc_unstable) { 280 if (!tsc_unstable) {
287 tsc_unstable = 1; 281 tsc_unstable = 1;
288 tsc_enabled = 0; 282 tsc_enabled = 0;
283 printk("Marking TSC unstable due to: %s.\n", reason);
289 /* Can be called before registration */ 284 /* Can be called before registration */
290 if (clocksource_tsc.mult) 285 if (clocksource_tsc.mult)
291 clocksource_change_rating(&clocksource_tsc, 0); 286 clocksource_change_rating(&clocksource_tsc, 0);
diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S
new file mode 100644
index 000000000000..e51a8695d54e
--- /dev/null
+++ b/arch/i386/kernel/verify_cpu.S
@@ -0,0 +1,65 @@
1/* Check if CPU has some minimum CPUID bits
2 This runs in 16bit mode so that the caller can still use the BIOS
3 to output errors on the screen */
4#include <asm/cpufeature.h>
5
6verify_cpu:
7 pushfl # Save caller passed flags
8 pushl $0 # Kill any dangerous flags
9 popfl
10
11#if CONFIG_X86_MINIMUM_CPU_MODEL >= 4
12 pushfl
13 orl $(1<<18),(%esp) # try setting AC
14 popfl
15 pushfl
16 popl %eax
17 testl $(1<<18),%eax
18 jz bad
19#endif
20#if REQUIRED_MASK1 != 0
21 pushfl # standard way to check for cpuid
22 popl %eax
23 movl %eax,%ebx
24 xorl $0x200000,%eax
25 pushl %eax
26 popfl
27 pushfl
28 popl %eax
29 cmpl %eax,%ebx
30 pushfl # standard way to check for cpuid
31 popl %eax
32 movl %eax,%ebx
33 xorl $0x200000,%eax
34 pushl %eax
35 popfl
36 pushfl
37 popl %eax
38 cmpl %eax,%ebx
39 jz bad # REQUIRED_MASK1 != 0 requires CPUID
40
41 movl $0x0,%eax # See if cpuid 1 is implemented
42 cpuid
43 cmpl $0x1,%eax
44 jb bad # no cpuid 1
45
46 movl $0x1,%eax # Does the cpu have what it takes
47 cpuid
48
49#if CONFIG_X86_MINIMUM_CPU_MODEL > 4
50#error add proper model checking here
51#endif
52
53 andl $REQUIRED_MASK1,%edx
54 xorl $REQUIRED_MASK1,%edx
55 jnz bad
56#endif /* REQUIRED_MASK1 */
57
58 popfl
59 xor %eax,%eax
60 ret
61
62bad:
63 popfl
64 movl $1,%eax
65 ret
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index 697a70e8c0c9..c8726c424b35 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -26,6 +26,7 @@
26#include <linux/cpu.h> 26#include <linux/cpu.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/highmem.h>
29#include <asm/vmi.h> 30#include <asm/vmi.h>
30#include <asm/io.h> 31#include <asm/io.h>
31#include <asm/fixmap.h> 32#include <asm/fixmap.h>
@@ -56,7 +57,7 @@ static int disable_noidle;
56static int disable_vmi_timer; 57static int disable_vmi_timer;
57 58
58/* Cached VMI operations */ 59/* Cached VMI operations */
59struct { 60static struct {
60 void (*cpuid)(void /* non-c */); 61 void (*cpuid)(void /* non-c */);
61 void (*_set_ldt)(u32 selector); 62 void (*_set_ldt)(u32 selector);
62 void (*set_tr)(u32 selector); 63 void (*set_tr)(u32 selector);
@@ -65,16 +66,15 @@ struct {
65 void (*release_page)(u32, u32); 66 void (*release_page)(u32, u32);
66 void (*set_pte)(pte_t, pte_t *, unsigned); 67 void (*set_pte)(pte_t, pte_t *, unsigned);
67 void (*update_pte)(pte_t *, unsigned); 68 void (*update_pte)(pte_t *, unsigned);
68 void (*set_linear_mapping)(int, u32, u32, u32); 69 void (*set_linear_mapping)(int, void *, u32, u32);
69 void (*flush_tlb)(int); 70 void (*_flush_tlb)(int);
70 void (*set_initial_ap_state)(int, int); 71 void (*set_initial_ap_state)(int, int);
71 void (*halt)(void); 72 void (*halt)(void);
72 void (*set_lazy_mode)(int mode); 73 void (*set_lazy_mode)(int mode);
73} vmi_ops; 74} vmi_ops;
74 75
75/* XXX move this to alternative.h */ 76/* Cached VMI operations */
76extern struct paravirt_patch __start_parainstructions[], 77struct vmi_timer_ops vmi_timer_ops;
77 __stop_parainstructions[];
78 78
79/* 79/*
80 * VMI patching routines. 80 * VMI patching routines.
@@ -83,11 +83,6 @@ extern struct paravirt_patch __start_parainstructions[],
83#define MNEM_JMP 0xe9 83#define MNEM_JMP 0xe9
84#define MNEM_RET 0xc3 84#define MNEM_RET 0xc3
85 85
86static char irq_save_disable_callout[] = {
87 MNEM_CALL, 0, 0, 0, 0,
88 MNEM_CALL, 0, 0, 0, 0,
89 MNEM_RET
90};
91#define IRQ_PATCH_INT_MASK 0 86#define IRQ_PATCH_INT_MASK 0
92#define IRQ_PATCH_DISABLE 5 87#define IRQ_PATCH_DISABLE 5
93 88
@@ -135,33 +130,17 @@ static unsigned patch_internal(int call, unsigned len, void *insns)
135static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len) 130static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
136{ 131{
137 switch (type) { 132 switch (type) {
138 case PARAVIRT_IRQ_DISABLE: 133 case PARAVIRT_PATCH(irq_disable):
139 return patch_internal(VMI_CALL_DisableInterrupts, len, insns); 134 return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
140 case PARAVIRT_IRQ_ENABLE: 135 case PARAVIRT_PATCH(irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len, insns); 136 return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
142 case PARAVIRT_RESTORE_FLAGS: 137 case PARAVIRT_PATCH(restore_fl):
143 return patch_internal(VMI_CALL_SetInterruptMask, len, insns); 138 return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
144 case PARAVIRT_SAVE_FLAGS: 139 case PARAVIRT_PATCH(save_fl):
145 return patch_internal(VMI_CALL_GetInterruptMask, len, insns); 140 return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
146 case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE: 141 case PARAVIRT_PATCH(iret):
147 if (len >= 10) {
148 patch_internal(VMI_CALL_GetInterruptMask, len, insns);
149 patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
150 return 10;
151 } else {
152 /*
153 * You bastards didn't leave enough room to
154 * patch save_flags_irq_disable inline. Patch
155 * to a helper
156 */
157 BUG_ON(len < 5);
158 *(char *)insns = MNEM_CALL;
159 patch_offset(insns, irq_save_disable_callout);
160 return 5;
161 }
162 case PARAVIRT_INTERRUPT_RETURN:
163 return patch_internal(VMI_CALL_IRET, len, insns); 142 return patch_internal(VMI_CALL_IRET, len, insns);
164 case PARAVIRT_STI_SYSEXIT: 143 case PARAVIRT_PATCH(irq_enable_sysexit):
165 return patch_internal(VMI_CALL_SYSEXIT, len, insns); 144 return patch_internal(VMI_CALL_SYSEXIT, len, insns);
166 default: 145 default:
167 break; 146 break;
@@ -230,24 +209,24 @@ static void vmi_set_tr(void)
230static void vmi_load_esp0(struct tss_struct *tss, 209static void vmi_load_esp0(struct tss_struct *tss,
231 struct thread_struct *thread) 210 struct thread_struct *thread)
232{ 211{
233 tss->esp0 = thread->esp0; 212 tss->x86_tss.esp0 = thread->esp0;
234 213
235 /* This can only happen when SEP is enabled, no need to test "SEP"arately */ 214 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
236 if (unlikely(tss->ss1 != thread->sysenter_cs)) { 215 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
237 tss->ss1 = thread->sysenter_cs; 216 tss->x86_tss.ss1 = thread->sysenter_cs;
238 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); 217 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
239 } 218 }
240 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0); 219 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
241} 220}
242 221
243static void vmi_flush_tlb_user(void) 222static void vmi_flush_tlb_user(void)
244{ 223{
245 vmi_ops.flush_tlb(VMI_FLUSH_TLB); 224 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
246} 225}
247 226
248static void vmi_flush_tlb_kernel(void) 227static void vmi_flush_tlb_kernel(void)
249{ 228{
250 vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); 229 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
251} 230}
252 231
253/* Stub to do nothing at all; used for delays and unimplemented calls */ 232/* Stub to do nothing at all; used for delays and unimplemented calls */
@@ -255,18 +234,6 @@ static void vmi_nop(void)
255{ 234{
256} 235}
257 236
258/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
259static fastcall void vmi_safe_halt(void)
260{
261 int idle = vmi_stop_hz_timer();
262 vmi_ops.halt();
263 if (idle) {
264 local_irq_disable();
265 vmi_account_time_restart_hz_timer();
266 local_irq_enable();
267 }
268}
269
270#ifdef CONFIG_DEBUG_PAGE_TYPE 237#ifdef CONFIG_DEBUG_PAGE_TYPE
271 238
272#ifdef CONFIG_X86_PAE 239#ifdef CONFIG_X86_PAE
@@ -370,8 +337,11 @@ static void vmi_check_page_type(u32 pfn, int type)
370#define vmi_check_page_type(p,t) do { } while (0) 337#define vmi_check_page_type(p,t) do { } while (0)
371#endif 338#endif
372 339
373static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn) 340#ifdef CONFIG_HIGHPTE
341static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
374{ 342{
343 void *va = kmap_atomic(page, type);
344
375 /* 345 /*
376 * Internally, the VMI ROM must map virtual addresses to physical 346 * Internally, the VMI ROM must map virtual addresses to physical
377 * addresses for processing MMU updates. By the time MMU updates 347 * addresses for processing MMU updates. By the time MMU updates
@@ -385,8 +355,11 @@ static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn)
385 * args: SLOT VA COUNT PFN 355 * args: SLOT VA COUNT PFN
386 */ 356 */
387 BUG_ON(type != KM_PTE0 && type != KM_PTE1); 357 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
388 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, (u32)va, 1, pfn); 358 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
359
360 return va;
389} 361}
362#endif
390 363
391static void vmi_allocate_pt(u32 pfn) 364static void vmi_allocate_pt(u32 pfn)
392{ 365{
@@ -443,13 +416,13 @@ static void vmi_release_pd(u32 pfn)
443 ((level) | (is_current_as(mm, user) ? \ 416 ((level) | (is_current_as(mm, user) ? \
444 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) 417 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
445 418
446static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep) 419static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
447{ 420{
448 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); 421 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
449 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 422 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
450} 423}
451 424
452static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) 425static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
453{ 426{
454 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); 427 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
455 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); 428 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
@@ -462,7 +435,7 @@ static void vmi_set_pte(pte_t *ptep, pte_t pte)
462 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); 435 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
463} 436}
464 437
465static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) 438static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
466{ 439{
467 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); 440 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
468 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 441 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
@@ -516,7 +489,7 @@ static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
516 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 489 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
517} 490}
518 491
519void vmi_pmd_clear(pmd_t *pmd) 492static void vmi_pmd_clear(pmd_t *pmd)
520{ 493{
521 const pte_t pte = { 0 }; 494 const pte_t pte = { 0 };
522 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); 495 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
@@ -525,8 +498,6 @@ void vmi_pmd_clear(pmd_t *pmd)
525#endif 498#endif
526 499
527#ifdef CONFIG_SMP 500#ifdef CONFIG_SMP
528extern void setup_pda(void);
529
530static void __devinit 501static void __devinit
531vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, 502vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
532 unsigned long start_esp) 503 unsigned long start_esp)
@@ -551,13 +522,11 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
551 522
552 ap.ds = __USER_DS; 523 ap.ds = __USER_DS;
553 ap.es = __USER_DS; 524 ap.es = __USER_DS;
554 ap.fs = __KERNEL_PDA; 525 ap.fs = __KERNEL_PERCPU;
555 ap.gs = 0; 526 ap.gs = 0;
556 527
557 ap.eflags = 0; 528 ap.eflags = 0;
558 529
559 setup_pda();
560
561#ifdef CONFIG_X86_PAE 530#ifdef CONFIG_X86_PAE
562 /* efer should match BSP efer. */ 531 /* efer should match BSP efer. */
563 if (cpu_has_nx) { 532 if (cpu_has_nx) {
@@ -575,9 +544,9 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
575} 544}
576#endif 545#endif
577 546
578static void vmi_set_lazy_mode(int mode) 547static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
579{ 548{
580 static DEFINE_PER_CPU(int, lazy_mode); 549 static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
581 550
582 if (!vmi_ops.set_lazy_mode) 551 if (!vmi_ops.set_lazy_mode)
583 return; 552 return;
@@ -685,7 +654,7 @@ void vmi_bringup(void)
685{ 654{
686 /* We must establish the lowmem mapping for MMU ops to work */ 655 /* We must establish the lowmem mapping for MMU ops to work */
687 if (vmi_ops.set_linear_mapping) 656 if (vmi_ops.set_linear_mapping)
688 vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0); 657 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
689} 658}
690 659
691/* 660/*
@@ -740,7 +709,6 @@ do { \
740 } \ 709 } \
741} while (0) 710} while (0)
742 711
743
744/* 712/*
745 * Activate the VMI interface and switch into paravirtualized mode 713 * Activate the VMI interface and switch into paravirtualized mode
746 */ 714 */
@@ -796,12 +764,6 @@ static inline int __init activate_vmi(void)
796 para_fill(irq_disable, DisableInterrupts); 764 para_fill(irq_disable, DisableInterrupts);
797 para_fill(irq_enable, EnableInterrupts); 765 para_fill(irq_enable, EnableInterrupts);
798 766
799 /* irq_save_disable !!! sheer pain */
800 patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
801 (char *)paravirt_ops.save_fl);
802 patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
803 (char *)paravirt_ops.irq_disable);
804
805 para_fill(wbinvd, WBINVD); 767 para_fill(wbinvd, WBINVD);
806 para_fill(read_tsc, RDTSC); 768 para_fill(read_tsc, RDTSC);
807 769
@@ -831,8 +793,8 @@ static inline int __init activate_vmi(void)
831 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); 793 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
832 794
833 /* user and kernel flush are just handled with different flags to FlushTLB */ 795 /* user and kernel flush are just handled with different flags to FlushTLB */
834 para_wrap(flush_tlb_user, vmi_flush_tlb_user, flush_tlb, FlushTLB); 796 para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
835 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, flush_tlb, FlushTLB); 797 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
836 para_fill(flush_tlb_single, InvalPage); 798 para_fill(flush_tlb_single, InvalPage);
837 799
838 /* 800 /*
@@ -878,8 +840,13 @@ static inline int __init activate_vmi(void)
878 paravirt_ops.release_pt = vmi_release_pt; 840 paravirt_ops.release_pt = vmi_release_pt;
879 paravirt_ops.release_pd = vmi_release_pd; 841 paravirt_ops.release_pd = vmi_release_pd;
880 } 842 }
881 para_wrap(map_pt_hook, vmi_map_pt_hook, set_linear_mapping, 843
882 SetLinearMapping); 844 /* Set linear is needed in all cases */
845 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
846#ifdef CONFIG_HIGHPTE
847 if (vmi_ops.set_linear_mapping)
848 paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
849#endif
883 850
884 /* 851 /*
885 * These MUST always be patched. Don't support indirect jumps 852 * These MUST always be patched. Don't support indirect jumps
@@ -920,8 +887,8 @@ static inline int __init activate_vmi(void)
920 paravirt_ops.get_wallclock = vmi_get_wallclock; 887 paravirt_ops.get_wallclock = vmi_get_wallclock;
921 paravirt_ops.set_wallclock = vmi_set_wallclock; 888 paravirt_ops.set_wallclock = vmi_set_wallclock;
922#ifdef CONFIG_X86_LOCAL_APIC 889#ifdef CONFIG_X86_LOCAL_APIC
923 paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm; 890 paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
924 paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm; 891 paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
925#endif 892#endif
926 paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; 893 paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
927 paravirt_ops.get_cpu_khz = vmi_cpu_khz; 894 paravirt_ops.get_cpu_khz = vmi_cpu_khz;
@@ -933,11 +900,7 @@ static inline int __init activate_vmi(void)
933 disable_vmi_timer = 1; 900 disable_vmi_timer = 1;
934 } 901 }
935 902
936 /* No idle HZ mode only works if VMI timer and no idle is enabled */ 903 para_fill(safe_halt, Halt);
937 if (disable_noidle || disable_vmi_timer)
938 para_fill(safe_halt, Halt);
939 else
940 para_wrap(safe_halt, vmi_safe_halt, halt, Halt);
941 904
942 /* 905 /*
943 * Alternative instruction rewriting doesn't happen soon enough 906 * Alternative instruction rewriting doesn't happen soon enough
@@ -945,7 +908,7 @@ static inline int __init activate_vmi(void)
945 * to do this before IRQs get reenabled. Fortunately, it is 908 * to do this before IRQs get reenabled. Fortunately, it is
946 * idempotent. 909 * idempotent.
947 */ 910 */
948 apply_paravirt(__start_parainstructions, __stop_parainstructions); 911 apply_paravirt(__parainstructions, __parainstructions_end);
949 912
950 vmi_bringup(); 913 vmi_bringup();
951 914
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
new file mode 100644
index 000000000000..26a37f8a8762
--- /dev/null
+++ b/arch/i386/kernel/vmiclock.c
@@ -0,0 +1,318 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/arch_hooks.h>
32#include <asm/apicdef.h>
33#include <asm/apic.h>
34#include <asm/timer.h>
35
36#include <irq_vectors.h>
37#include "io_ports.h"
38
39#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
40#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
41
42static DEFINE_PER_CPU(struct clock_event_device, local_events);
43
44static inline u32 vmi_counter(u32 flags)
45{
46 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
47 * cycle counter. */
48 return flags & VMI_ALARM_COUNTER_MASK;
49}
50
51/* paravirt_ops.get_wallclock = vmi_get_wallclock */
52unsigned long vmi_get_wallclock(void)
53{
54 unsigned long long wallclock;
55 wallclock = vmi_timer_ops.get_wallclock(); // nsec
56 (void)do_div(wallclock, 1000000000); // sec
57
58 return wallclock;
59}
60
61/* paravirt_ops.set_wallclock = vmi_set_wallclock */
62int vmi_set_wallclock(unsigned long now)
63{
64 return 0;
65}
66
67/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
68unsigned long long vmi_get_sched_cycles(void)
69{
70 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
71}
72
73/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
74unsigned long vmi_cpu_khz(void)
75{
76 unsigned long long khz;
77 khz = vmi_timer_ops.get_cycle_frequency();
78 (void)do_div(khz, 1000);
79 return khz;
80}
81
82static inline unsigned int vmi_get_timer_vector(void)
83{
84#ifdef CONFIG_X86_IO_APIC
85 return FIRST_DEVICE_VECTOR;
86#else
87 return FIRST_EXTERNAL_VECTOR;
88#endif
89}
90
91/** vmi clockchip */
92#ifdef CONFIG_X86_LOCAL_APIC
93static unsigned int startup_timer_irq(unsigned int irq)
94{
95 unsigned long val = apic_read(APIC_LVTT);
96 apic_write(APIC_LVTT, vmi_get_timer_vector());
97
98 return (val & APIC_SEND_PENDING);
99}
100
101static void mask_timer_irq(unsigned int irq)
102{
103 unsigned long val = apic_read(APIC_LVTT);
104 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
105}
106
107static void unmask_timer_irq(unsigned int irq)
108{
109 unsigned long val = apic_read(APIC_LVTT);
110 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
111}
112
113static void ack_timer_irq(unsigned int irq)
114{
115 ack_APIC_irq();
116}
117
118static struct irq_chip vmi_chip __read_mostly = {
119 .name = "VMI-LOCAL",
120 .startup = startup_timer_irq,
121 .mask = mask_timer_irq,
122 .unmask = unmask_timer_irq,
123 .ack = ack_timer_irq
124};
125#endif
126
127/** vmi clockevent */
128#define VMI_ALARM_WIRED_IRQ0 0x00000000
129#define VMI_ALARM_WIRED_LVTT 0x00010000
130static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
131
132static inline int vmi_get_alarm_wiring(void)
133{
134 return vmi_wiring;
135}
136
137static void vmi_timer_set_mode(enum clock_event_mode mode,
138 struct clock_event_device *evt)
139{
140 cycle_t now, cycles_per_hz;
141 BUG_ON(!irqs_disabled());
142
143 switch (mode) {
144 case CLOCK_EVT_MODE_ONESHOT:
145 break;
146 case CLOCK_EVT_MODE_PERIODIC:
147 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
148 (void)do_div(cycles_per_hz, HZ);
149 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
150 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
151 break;
152 case CLOCK_EVT_MODE_UNUSED:
153 case CLOCK_EVT_MODE_SHUTDOWN:
154 switch (evt->mode) {
155 case CLOCK_EVT_MODE_ONESHOT:
156 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
157 break;
158 case CLOCK_EVT_MODE_PERIODIC:
159 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
160 break;
161 default:
162 break;
163 }
164 break;
165 default:
166 break;
167 }
168}
169
170static int vmi_timer_next_event(unsigned long delta,
171 struct clock_event_device *evt)
172{
173 /* Unfortunately, set_next_event interface only passes relative
174 * expiry, but we want absolute expiry. It'd be better if were
175 * were passed an aboslute expiry, since a bunch of time may
176 * have been stolen between the time the delta is computed and
177 * when we set the alarm below. */
178 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
179
180 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
181 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
182 return 0;
183}
184
185static struct clock_event_device vmi_clockevent = {
186 .name = "vmi-timer",
187 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
188 .shift = 22,
189 .set_mode = vmi_timer_set_mode,
190 .set_next_event = vmi_timer_next_event,
191 .rating = 1000,
192 .irq = 0,
193};
194
195static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
196{
197 struct clock_event_device *evt = &__get_cpu_var(local_events);
198 evt->event_handler(evt);
199 return IRQ_HANDLED;
200}
201
202static struct irqaction vmi_clock_action = {
203 .name = "vmi-timer",
204 .handler = vmi_timer_interrupt,
205 .flags = IRQF_DISABLED | IRQF_NOBALANCING,
206 .mask = CPU_MASK_ALL,
207};
208
209static void __devinit vmi_time_init_clockevent(void)
210{
211 cycle_t cycles_per_msec;
212 struct clock_event_device *evt;
213
214 int cpu = smp_processor_id();
215 evt = &__get_cpu_var(local_events);
216
217 /* Use cycles_per_msec since div_sc params are 32-bits. */
218 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
219 (void)do_div(cycles_per_msec, 1000);
220
221 memcpy(evt, &vmi_clockevent, sizeof(*evt));
222 /* Must pick .shift such that .mult fits in 32-bits. Choosing
223 * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
224 * before overflow. */
225 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
226 /* Upper bound is clockevent's use of ulong for cycle deltas. */
227 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
228 evt->min_delta_ns = clockevent_delta2ns(1, evt);
229 evt->cpumask = cpumask_of_cpu(cpu);
230
231 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
232 evt->name, evt->mult, evt->shift);
233 clockevents_register_device(evt);
234}
235
236void __init vmi_time_init(void)
237{
238 /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
239 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
240
241 vmi_time_init_clockevent();
242 setup_irq(0, &vmi_clock_action);
243}
244
245#ifdef CONFIG_X86_LOCAL_APIC
246void __devinit vmi_time_bsp_init(void)
247{
248 /*
249 * On APIC systems, we want local timers to fire on each cpu. We do
250 * this by programming LVTT to deliver timer events to the IRQ handler
251 * for IRQ-0, since we can't re-use the APIC local timer handler
252 * without interfering with that code.
253 */
254 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
255 local_irq_disable();
256#ifdef CONFIG_X86_SMP
257 /*
258 * XXX handle_percpu_irq only defined for SMP; we need to switch over
259 * to using it, since this is a local interrupt, which each CPU must
260 * handle individually without locking out or dropping simultaneous
261 * local timers on other CPUs. We also don't want to trigger the
262 * quirk workaround code for interrupts which gets invoked from
263 * handle_percpu_irq via eoi, so we use our own IRQ chip.
264 */
265 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
266#else
267 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
268#endif
269 vmi_wiring = VMI_ALARM_WIRED_LVTT;
270 apic_write(APIC_LVTT, vmi_get_timer_vector());
271 local_irq_enable();
272 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
273}
274
275void __devinit vmi_time_ap_init(void)
276{
277 vmi_time_init_clockevent();
278 apic_write(APIC_LVTT, vmi_get_timer_vector());
279}
280#endif
281
282/** vmi clocksource */
283
284static cycle_t read_real_cycles(void)
285{
286 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
287}
288
289static struct clocksource clocksource_vmi = {
290 .name = "vmi-timer",
291 .rating = 450,
292 .read = read_real_cycles,
293 .mask = CLOCKSOURCE_MASK(64),
294 .mult = 0, /* to be set */
295 .shift = 22,
296 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
297};
298
299static int __init init_vmi_clocksource(void)
300{
301 cycle_t cycles_per_msec;
302
303 if (!vmi_timer_ops.get_cycle_frequency)
304 return 0;
305 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
306 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
307 (void)do_div(cycles_per_msec, 1000);
308
309 /* Note that clocksource.{mult, shift} converts in the opposite direction
310 * as clockevents. */
311 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
312 clocksource_vmi.shift);
313
314 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
315 return clocksource_register(&clocksource_vmi);
316
317}
318module_init(init_vmi_clocksource);
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
deleted file mode 100644
index 9dfb17739b67..000000000000
--- a/arch/i386/kernel/vmitime.c
+++ /dev/null
@@ -1,482 +0,0 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25/*
26 * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
27 * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
28 * See comments there for proper credits.
29 */
30
31#include <linux/spinlock.h>
32#include <linux/init.h>
33#include <linux/errno.h>
34#include <linux/jiffies.h>
35#include <linux/interrupt.h>
36#include <linux/kernel_stat.h>
37#include <linux/rcupdate.h>
38#include <linux/clocksource.h>
39
40#include <asm/timer.h>
41#include <asm/io.h>
42#include <asm/apic.h>
43#include <asm/div64.h>
44#include <asm/timer.h>
45#include <asm/desc.h>
46
47#include <asm/vmi.h>
48#include <asm/vmi_time.h>
49
50#include <mach_timer.h>
51#include <io_ports.h>
52
53#ifdef CONFIG_X86_LOCAL_APIC
54#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
55#else
56#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
57#endif
58
59/* Cached VMI operations */
60struct vmi_timer_ops vmi_timer_ops;
61
62#ifdef CONFIG_NO_IDLE_HZ
63
64/* /proc/sys/kernel/hz_timer state. */
65int sysctl_hz_timer;
66
67/* Some stats */
68static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
69static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
70static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
71
72#endif /* CONFIG_NO_IDLE_HZ */
73
74/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */
75static int alarm_hz = CONFIG_VMI_ALARM_HZ;
76
77/* Cache of the value get_cycle_frequency / HZ. */
78static signed long long cycles_per_jiffy;
79
80/* Cache of the value get_cycle_frequency / alarm_hz. */
81static signed long long cycles_per_alarm;
82
83/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
84 * Protected by xtime_lock. */
85static unsigned long long real_cycles_accounted_system;
86
87/* The number of cycles accounted for by update_process_times(), per cpu. */
88static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
89
90/* The number of stolen cycles accounted, per cpu. */
91static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
92
93/* Clock source. */
94static cycle_t read_real_cycles(void)
95{
96 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
97}
98
99static cycle_t read_available_cycles(void)
100{
101 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
102}
103
104#if 0
105static cycle_t read_stolen_cycles(void)
106{
107 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
108}
109#endif /* 0 */
110
111static struct clocksource clocksource_vmi = {
112 .name = "vmi-timer",
113 .rating = 450,
114 .read = read_real_cycles,
115 .mask = CLOCKSOURCE_MASK(64),
116 .mult = 0, /* to be set */
117 .shift = 22,
118 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
119};
120
121
122/* Timer interrupt handler. */
123static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
124
125static struct irqaction vmi_timer_irq = {
126 .handler = vmi_timer_interrupt,
127 .flags = IRQF_DISABLED,
128 .mask = CPU_MASK_NONE,
129 .name = "VMI-alarm",
130};
131
132/* Alarm rate */
133static int __init vmi_timer_alarm_rate_setup(char* str)
134{
135 int alarm_rate;
136 if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
137 alarm_hz = alarm_rate;
138 printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
139 }
140 return 1;
141}
142__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
143
144
145/* Initialization */
146static void vmi_get_wallclock_ts(struct timespec *ts)
147{
148 unsigned long long wallclock;
149 wallclock = vmi_timer_ops.get_wallclock(); // nsec units
150 ts->tv_nsec = do_div(wallclock, 1000000000);
151 ts->tv_sec = wallclock;
152}
153
154unsigned long vmi_get_wallclock(void)
155{
156 struct timespec ts;
157 vmi_get_wallclock_ts(&ts);
158 return ts.tv_sec;
159}
160
161int vmi_set_wallclock(unsigned long now)
162{
163 return -1;
164}
165
166unsigned long long vmi_get_sched_cycles(void)
167{
168 return read_available_cycles();
169}
170
171unsigned long vmi_cpu_khz(void)
172{
173 unsigned long long khz;
174
175 khz = vmi_timer_ops.get_cycle_frequency();
176 (void)do_div(khz, 1000);
177 return khz;
178}
179
180void __init vmi_time_init(void)
181{
182 unsigned long long cycles_per_sec, cycles_per_msec;
183 unsigned long flags;
184
185 local_irq_save(flags);
186 setup_irq(0, &vmi_timer_irq);
187#ifdef CONFIG_X86_LOCAL_APIC
188 set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
189#endif
190
191 real_cycles_accounted_system = read_real_cycles();
192 per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
193
194 cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
195 cycles_per_jiffy = cycles_per_sec;
196 (void)do_div(cycles_per_jiffy, HZ);
197 cycles_per_alarm = cycles_per_sec;
198 (void)do_div(cycles_per_alarm, alarm_hz);
199 cycles_per_msec = cycles_per_sec;
200 (void)do_div(cycles_per_msec, 1000);
201
202 printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;"
203 "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
204 cycles_per_alarm);
205
206 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
207 clocksource_vmi.shift);
208 if (clocksource_register(&clocksource_vmi))
209 printk(KERN_WARNING "Error registering VMITIME clocksource.");
210
211 /* Disable PIT. */
212 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
213
214 /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu
215 * reduce the latency calling update_process_times. */
216 vmi_timer_ops.set_alarm(
217 VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
218 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
219 cycles_per_alarm);
220
221 local_irq_restore(flags);
222}
223
224#ifdef CONFIG_X86_LOCAL_APIC
225
226void __init vmi_timer_setup_boot_alarm(void)
227{
228 local_irq_disable();
229
230 /* Route the interrupt to the correct vector. */
231 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
232
233 /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
234 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
235 vmi_timer_ops.set_alarm(
236 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
237 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
238 cycles_per_alarm);
239 local_irq_enable();
240}
241
242/* Initialize the time accounting variables for an AP on an SMP system.
243 * Also, set the local alarm for the AP. */
244void __devinit vmi_timer_setup_secondary_alarm(void)
245{
246 int cpu = smp_processor_id();
247
248 /* Route the interrupt to the correct vector. */
249 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
250
251 per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
252
253 vmi_timer_ops.set_alarm(
254 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
255 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
256 cycles_per_alarm);
257}
258
259#endif
260
261/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
262static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
263{
264 long long cycles_not_accounted;
265
266 write_seqlock(&xtime_lock);
267
268 cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
269 while (cycles_not_accounted >= cycles_per_jiffy) {
270 /* systems wide jiffies. */
271 do_timer(1);
272
273 cycles_not_accounted -= cycles_per_jiffy;
274 real_cycles_accounted_system += cycles_per_jiffy;
275 }
276
277 write_sequnlock(&xtime_lock);
278}
279
280/* Update per-cpu process times. */
281static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
282 unsigned long long cur_process_times_cycles)
283{
284 long long cycles_not_accounted;
285 cycles_not_accounted = cur_process_times_cycles -
286 per_cpu(process_times_cycles_accounted_cpu, cpu);
287
288 while (cycles_not_accounted >= cycles_per_jiffy) {
289 /* Account time to the current process. This includes
290 * calling into the scheduler to decrement the timeslice
291 * and possibly reschedule.*/
292 update_process_times(user_mode(regs));
293 /* XXX handle /proc/profile multiplier. */
294 profile_tick(CPU_PROFILING);
295
296 cycles_not_accounted -= cycles_per_jiffy;
297 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
298 }
299}
300
301#ifdef CONFIG_NO_IDLE_HZ
302/* Update per-cpu idle times. Used when a no-hz halt is ended. */
303static void vmi_account_no_hz_idle_cycles(int cpu,
304 unsigned long long cur_process_times_cycles)
305{
306 long long cycles_not_accounted;
307 unsigned long no_idle_hz_jiffies = 0;
308
309 cycles_not_accounted = cur_process_times_cycles -
310 per_cpu(process_times_cycles_accounted_cpu, cpu);
311
312 while (cycles_not_accounted >= cycles_per_jiffy) {
313 no_idle_hz_jiffies++;
314 cycles_not_accounted -= cycles_per_jiffy;
315 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
316 }
317 /* Account time to the idle process. */
318 account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
319}
320#endif
321
322/* Update per-cpu stolen time. */
323static void vmi_account_stolen_cycles(int cpu,
324 unsigned long long cur_real_cycles,
325 unsigned long long cur_avail_cycles)
326{
327 long long stolen_cycles_not_accounted;
328 unsigned long stolen_jiffies = 0;
329
330 if (cur_real_cycles < cur_avail_cycles)
331 return;
332
333 stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
334 per_cpu(stolen_cycles_accounted_cpu, cpu);
335
336 while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
337 stolen_jiffies++;
338 stolen_cycles_not_accounted -= cycles_per_jiffy;
339 per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
340 }
341 /* HACK: pass NULL to force time onto cpustat->steal. */
342 account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
343}
344
345/* Body of either IRQ0 interrupt handler (UP no local-APIC) or
346 * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */
347static void vmi_local_timer_interrupt(int cpu)
348{
349 unsigned long long cur_real_cycles, cur_process_times_cycles;
350
351 cur_real_cycles = read_real_cycles();
352 cur_process_times_cycles = read_available_cycles();
353 /* Update system wide (real) time state (xtime, jiffies). */
354 vmi_account_real_cycles(cur_real_cycles);
355 /* Update per-cpu process times. */
356 vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles);
357 /* Update time stolen from this cpu by the hypervisor. */
358 vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
359}
360
361#ifdef CONFIG_NO_IDLE_HZ
362
363/* Must be called only from idle loop, with interrupts disabled. */
364int vmi_stop_hz_timer(void)
365{
366 /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
367
368 unsigned long seq, next;
369 unsigned long long real_cycles_expiry;
370 int cpu = smp_processor_id();
371
372 BUG_ON(!irqs_disabled());
373 if (sysctl_hz_timer != 0)
374 return 0;
375
376 cpu_set(cpu, nohz_cpu_mask);
377 smp_mb();
378
379 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
380 (next = next_timer_interrupt(),
381 time_before_eq(next, jiffies + HZ/CONFIG_VMI_ALARM_HZ))) {
382 cpu_clear(cpu, nohz_cpu_mask);
383 return 0;
384 }
385
386 /* Convert jiffies to the real cycle counter. */
387 do {
388 seq = read_seqbegin(&xtime_lock);
389 real_cycles_expiry = real_cycles_accounted_system +
390 (long)(next - jiffies) * cycles_per_jiffy;
391 } while (read_seqretry(&xtime_lock, seq));
392
393 /* This cpu is going idle. Disable the periodic alarm. */
394 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
395 per_cpu(idle_start_jiffies, cpu) = jiffies;
396 /* Set the real time alarm to expire at the next event. */
397 vmi_timer_ops.set_alarm(
398 VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
399 real_cycles_expiry, 0);
400 return 1;
401}
402
403static void vmi_reenable_hz_timer(int cpu)
404{
405 /* For /proc/vmi/info idle_hz stat. */
406 per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
407 per_cpu(vmi_idle_no_hz_irqs, cpu)++;
408
409 /* Don't bother explicitly cancelling the one-shot alarm -- at
410 * worse we will receive a spurious timer interrupt. */
411 vmi_timer_ops.set_alarm(
412 VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
413 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
414 cycles_per_alarm);
415 /* Indicate this cpu is no longer nohz idle. */
416 cpu_clear(cpu, nohz_cpu_mask);
417}
418
419/* Called from interrupt handlers when (local) HZ timer is disabled. */
420void vmi_account_time_restart_hz_timer(void)
421{
422 unsigned long long cur_real_cycles, cur_process_times_cycles;
423 int cpu = smp_processor_id();
424
425 BUG_ON(!irqs_disabled());
426 /* Account the time during which the HZ timer was disabled. */
427 cur_real_cycles = read_real_cycles();
428 cur_process_times_cycles = read_available_cycles();
429 /* Update system wide (real) time state (xtime, jiffies). */
430 vmi_account_real_cycles(cur_real_cycles);
431 /* Update per-cpu idle times. */
432 vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles);
433 /* Update time stolen from this cpu by the hypervisor. */
434 vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
435 /* Reenable the hz timer. */
436 vmi_reenable_hz_timer(cpu);
437}
438
439#endif /* CONFIG_NO_IDLE_HZ */
440
441/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
442 * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after
443 * APIC setup and setup_boot_vmi_alarm() is called. */
444static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
445{
446 vmi_local_timer_interrupt(smp_processor_id());
447 return IRQ_HANDLED;
448}
449
450#ifdef CONFIG_X86_LOCAL_APIC
451
452/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
453 * Also used in UP when CONFIG_X86_LOCAL_APIC.
454 * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
455void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
456{
457 struct pt_regs *old_regs = set_irq_regs(regs);
458 int cpu = smp_processor_id();
459
460 /*
461 * the NMI deadlock-detector uses this.
462 */
463 per_cpu(irq_stat,cpu).apic_timer_irqs++;
464
465 /*
466 * NOTE! We'd better ACK the irq immediately,
467 * because timer handling can be slow.
468 */
469 ack_APIC_irq();
470
471 /*
472 * update_process_times() expects us to have done irq_enter().
473 * Besides, if we don't timer interrupts ignore the global
474 * interrupt lock, which is the WrongThing (tm) to do.
475 */
476 irq_enter();
477 vmi_local_timer_interrupt(cpu);
478 irq_exit();
479 set_irq_regs(old_regs);
480}
481
482#endif /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 6f38f818380b..23e8614edeee 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -26,12 +26,11 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
26OUTPUT_ARCH(i386) 26OUTPUT_ARCH(i386)
27ENTRY(phys_startup_32) 27ENTRY(phys_startup_32)
28jiffies = jiffies_64; 28jiffies = jiffies_64;
29_proxy_pda = 1;
30 29
31PHDRS { 30PHDRS {
32 text PT_LOAD FLAGS(5); /* R_E */ 31 text PT_LOAD FLAGS(5); /* R_E */
33 data PT_LOAD FLAGS(7); /* RWE */ 32 data PT_LOAD FLAGS(7); /* RWE */
34 note PT_NOTE FLAGS(4); /* R__ */ 33 note PT_NOTE FLAGS(0); /* ___ */
35} 34}
36SECTIONS 35SECTIONS
37{ 36{
@@ -61,8 +60,6 @@ SECTIONS
61 __stop___ex_table = .; 60 __stop___ex_table = .;
62 } 61 }
63 62
64 RODATA
65
66 BUG_TABLE 63 BUG_TABLE
67 64
68 . = ALIGN(4); 65 . = ALIGN(4);
@@ -72,6 +69,8 @@ SECTIONS
72 __tracedata_end = .; 69 __tracedata_end = .;
73 } 70 }
74 71
72 RODATA
73
75 /* writeable */ 74 /* writeable */
76 . = ALIGN(4096); 75 . = ALIGN(4096);
77 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ 76 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
@@ -117,22 +116,11 @@ SECTIONS
117 116
118 /* might get freed after init */ 117 /* might get freed after init */
119 . = ALIGN(4096); 118 . = ALIGN(4096);
120 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
121 __smp_alt_begin = .;
122 __smp_alt_instructions = .;
123 *(.smp_altinstructions)
124 __smp_alt_instructions_end = .;
125 }
126 . = ALIGN(4);
127 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 119 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
128 __smp_locks = .; 120 __smp_locks = .;
129 *(.smp_locks) 121 *(.smp_locks)
130 __smp_locks_end = .; 122 __smp_locks_end = .;
131 } 123 }
132 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
133 *(.smp_altinstr_replacement)
134 __smp_alt_end = .;
135 }
136 /* will be freed after init 124 /* will be freed after init
137 * Following ALIGN() is required to make sure no other data falls on the 125 * Following ALIGN() is required to make sure no other data falls on the
138 * same page where __smp_alt_end is pointing as that page might be freed 126 * same page where __smp_alt_end is pointing as that page might be freed
@@ -178,9 +166,9 @@ SECTIONS
178 } 166 }
179 . = ALIGN(4); 167 . = ALIGN(4);
180 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 168 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
181 __start_parainstructions = .; 169 __parainstructions = .;
182 *(.parainstructions) 170 *(.parainstructions)
183 __stop_parainstructions = .; 171 __parainstructions_end = .;
184 } 172 }
185 /* .exit.text is discard at runtime, not link time, to deal with references 173 /* .exit.text is discard at runtime, not link time, to deal with references
186 from .altinstructions and .eh_frame */ 174 from .altinstructions and .eh_frame */
@@ -194,7 +182,7 @@ SECTIONS
194 __initramfs_end = .; 182 __initramfs_end = .;
195 } 183 }
196#endif 184#endif
197 . = ALIGN(L1_CACHE_BYTES); 185 . = ALIGN(4096);
198 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 186 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
199 __per_cpu_start = .; 187 __per_cpu_start = .;
200 *(.data.percpu) 188 *(.data.percpu)
diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S
index f66cd11adb72..4a8b0ed9b8fb 100644
--- a/arch/i386/kernel/vsyscall.lds.S
+++ b/arch/i386/kernel/vsyscall.lds.S
@@ -7,7 +7,7 @@
7 7
8SECTIONS 8SECTIONS
9{ 9{
10 . = VDSO_PRELINK + SIZEOF_HEADERS; 10 . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
11 11
12 .hash : { *(.hash) } :text 12 .hash : { *(.hash) } :text
13 .gnu.hash : { *(.gnu.hash) } 13 .gnu.hash : { *(.gnu.hash) }
@@ -21,7 +21,7 @@ SECTIONS
21 For the layouts to match, we need to skip more than enough 21 For the layouts to match, we need to skip more than enough
22 space for the dynamic symbol table et al. If this amount 22 space for the dynamic symbol table et al. If this amount
23 is insufficient, ld -shared will barf. Just increase it here. */ 23 is insufficient, ld -shared will barf. Just increase it here. */
24 . = VDSO_PRELINK + 0x400; 24 . = VDSO_PRELINK_asm + 0x400;
25 25
26 .text : { *(.text) } :text =0x90909090 26 .text : { *(.text) } :text =0x90909090
27 .note : { *(.note.*) } :text :note 27 .note : { *(.note.*) } :text :note