author     Steven Whitehouse <swhiteho@redhat.com>  2006-09-28 08:29:59 -0400
committer  Steven Whitehouse <swhiteho@redhat.com>  2006-09-28 08:29:59 -0400
commit     185a257f2f73bcd89050ad02da5bedbc28fc43fa (patch)
tree       5e32586114534ed3f2165614cba3d578f5d87307 /arch/x86_64/kernel
parent     3f1a9aaeffd8d1cbc5ab9776c45cbd66af1c9699 (diff)
parent     a77c64c1a641950626181b4857abb701d8f38ccc (diff)

Merge branch 'master' into gfs2
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--  arch/x86_64/kernel/Makefile          |   9
-rw-r--r--  arch/x86_64/kernel/aperture.c        |  25
-rw-r--r--  arch/x86_64/kernel/apic.c            | 229
-rw-r--r--  arch/x86_64/kernel/crash.c           |  26
-rw-r--r--  arch/x86_64/kernel/e820.c            | 291
-rw-r--r--  arch/x86_64/kernel/early-quirks.c    | 122
-rw-r--r--  arch/x86_64/kernel/early_printk.c    |  20
-rw-r--r--  arch/x86_64/kernel/entry.S           |  63
-rw-r--r--  arch/x86_64/kernel/genapic_cluster.c |   1
-rw-r--r--  arch/x86_64/kernel/genapic_flat.c    |   5
-rw-r--r--  arch/x86_64/kernel/head.S            |  15
-rw-r--r--  arch/x86_64/kernel/head64.c          |  44
-rw-r--r--  arch/x86_64/kernel/i8259.c           |  15
-rw-r--r--  arch/x86_64/kernel/io_apic.c         | 482
-rw-r--r--  arch/x86_64/kernel/ioport.c          |   1
-rw-r--r--  arch/x86_64/kernel/irq.c             |  12
-rw-r--r--  arch/x86_64/kernel/machine_kexec.c   |  99
-rw-r--r--  arch/x86_64/kernel/mce.c             |  29
-rw-r--r--  arch/x86_64/kernel/mce_intel.c       |  30
-rw-r--r--  arch/x86_64/kernel/mpparse.c         | 238
-rw-r--r--  arch/x86_64/kernel/nmi.c             | 840
-rw-r--r--  arch/x86_64/kernel/pci-calgary.c     | 142
-rw-r--r--  arch/x86_64/kernel/pci-dma.c         |   7
-rw-r--r--  arch/x86_64/kernel/pci-gart.c        |   3
-rw-r--r--  arch/x86_64/kernel/pci-nommu.c       |   1
-rw-r--r--  arch/x86_64/kernel/process.c         | 110
-rw-r--r--  arch/x86_64/kernel/ptrace.c          |  29
-rw-r--r--  arch/x86_64/kernel/relocate_kernel.S | 171
-rw-r--r--  arch/x86_64/kernel/setup.c           | 249
-rw-r--r--  arch/x86_64/kernel/setup64.c         |  45
-rw-r--r--  arch/x86_64/kernel/signal.c          |  87
-rw-r--r--  arch/x86_64/kernel/smp.c             |  23
-rw-r--r--  arch/x86_64/kernel/smpboot.c         |  17
-rw-r--r--  arch/x86_64/kernel/stacktrace.c      | 220
-rw-r--r--  arch/x86_64/kernel/suspend_asm.S     |   2
-rw-r--r--  arch/x86_64/kernel/tce.c             |  12
-rw-r--r--  arch/x86_64/kernel/time.c            | 131
-rw-r--r--  arch/x86_64/kernel/trampoline.S      |   2
-rw-r--r--  arch/x86_64/kernel/traps.c           | 204
-rw-r--r--  arch/x86_64/kernel/vmlinux.lds.S     |  25
-rw-r--r--  arch/x86_64/kernel/vsmp.c            |   3
-rw-r--r--  arch/x86_64/kernel/vsyscall.c        |  98
-rw-r--r--  arch/x86_64/kernel/x8664_ksyms.c     |   1
43 files changed, 2084 insertions, 2094 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index b5aaeafc1cd3..3c7cbff04d3d 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
11 pci-dma.o pci-nommu.o alternative.o 11 pci-dma.o pci-nommu.o alternative.o
12 12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o 13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-$(CONFIG_X86_MCE) += mce.o 14obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o
15obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o 15obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
16obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o 16obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
17obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ 17obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
@@ -20,8 +20,8 @@ obj-$(CONFIG_X86_MSR) += msr.o
20obj-$(CONFIG_MICROCODE) += microcode.o 20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_X86_CPUID) += cpuid.o 21obj-$(CONFIG_X86_CPUID) += cpuid.o
22obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o 22obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
23obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o 23obj-y += apic.o nmi.o
24obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ 24obj-y += io_apic.o mpparse.o \
25 genapic.o genapic_cluster.o genapic_flat.o 25 genapic.o genapic_cluster.o genapic_flat.o
26obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o 26obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
27obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 27obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
@@ -39,12 +39,14 @@ obj-$(CONFIG_K8_NB) += k8.o
39obj-$(CONFIG_AUDIT) += audit.o 39obj-$(CONFIG_AUDIT) += audit.o
40 40
41obj-$(CONFIG_MODULES) += module.o 41obj-$(CONFIG_MODULES) += module.o
42obj-$(CONFIG_PCI) += early-quirks.o
42 43
43obj-y += topology.o 44obj-y += topology.o
44obj-y += intel_cacheinfo.o 45obj-y += intel_cacheinfo.o
45 46
46CFLAGS_vsyscall.o := $(PROFILING) -g0 47CFLAGS_vsyscall.o := $(PROFILING) -g0
47 48
49therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o
48bootflag-y += ../../i386/kernel/bootflag.o 50bootflag-y += ../../i386/kernel/bootflag.o
49cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o 51cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
50topology-y += ../../i386/kernel/topology.o 52topology-y += ../../i386/kernel/topology.o
@@ -54,4 +56,3 @@ quirks-y += ../../i386/kernel/quirks.o
54i8237-y += ../../i386/kernel/i8237.o 56i8237-y += ../../i386/kernel/i8237.o
55msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o 57msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
56alternative-y += ../../i386/kernel/alternative.o 58alternative-y += ../../i386/kernel/alternative.o
57
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 58af8e73738b..b487396c4c5b 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -17,6 +17,7 @@
17#include <linux/pci_ids.h> 17#include <linux/pci_ids.h>
18#include <linux/pci.h> 18#include <linux/pci.h>
19#include <linux/bitops.h> 19#include <linux/bitops.h>
20#include <linux/ioport.h>
20#include <asm/e820.h> 21#include <asm/e820.h>
21#include <asm/io.h> 22#include <asm/io.h>
22#include <asm/proto.h> 23#include <asm/proto.h>
@@ -33,6 +34,18 @@ int fallback_aper_force __initdata = 0;
33 34
34int fix_aperture __initdata = 1; 35int fix_aperture __initdata = 1;
35 36
37static struct resource gart_resource = {
38 .name = "GART",
39 .flags = IORESOURCE_MEM,
40};
41
42static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
43{
44 gart_resource.start = aper_base;
45 gart_resource.end = aper_base + aper_size - 1;
46 insert_resource(&iomem_resource, &gart_resource);
47}
48
36/* This code runs before the PCI subsystem is initialized, so just 49/* This code runs before the PCI subsystem is initialized, so just
37 access the northbridge directly. */ 50 access the northbridge directly. */
38 51
@@ -48,7 +61,7 @@ static u32 __init allocate_aperture(void)
48 61
49 /* 62 /*
50 * Aperture has to be naturally aligned. This means an 2GB aperture won't 63 * Aperture has to be naturally aligned. This means an 2GB aperture won't
51 * have much chances to find a place in the lower 4GB of memory. 64 * have much chance of finding a place in the lower 4GB of memory.
52 * Unfortunately we cannot move it up because that would make the 65 * Unfortunately we cannot move it up because that would make the
53 * IOMMU useless. 66 * IOMMU useless.
54 */ 67 */
@@ -62,6 +75,7 @@ static u32 __init allocate_aperture(void)
62 } 75 }
63 printk("Mapping aperture over %d KB of RAM @ %lx\n", 76 printk("Mapping aperture over %d KB of RAM @ %lx\n",
64 aper_size >> 10, __pa(p)); 77 aper_size >> 10, __pa(p));
78 insert_aperture_resource((u32)__pa(p), aper_size);
65 return (u32)__pa(p); 79 return (u32)__pa(p);
66} 80}
67 81
@@ -198,7 +212,7 @@ void __init iommu_hole_init(void)
198 u64 aper_base, last_aper_base = 0; 212 u64 aper_base, last_aper_base = 0;
199 int valid_agp = 0; 213 int valid_agp = 0;
200 214
201 if (iommu_aperture_disabled || !fix_aperture) 215 if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
202 return; 216 return;
203 217
204 printk("Checking aperture...\n"); 218 printk("Checking aperture...\n");
@@ -233,8 +247,13 @@ void __init iommu_hole_init(void)
233 last_aper_base = aper_base; 247 last_aper_base = aper_base;
234 } 248 }
235 249
236 if (!fix && !fallback_aper_force) 250 if (!fix && !fallback_aper_force) {
251 if (last_aper_base) {
252 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
253 insert_aperture_resource((u32)last_aper_base, n);
254 }
237 return; 255 return;
256 }
238 257
239 if (!fallback_aper_force) 258 if (!fallback_aper_force)
240 aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 259 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 2b8cef037a65..135ff25e6b44 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -25,6 +25,7 @@
25#include <linux/kernel_stat.h> 25#include <linux/kernel_stat.h>
26#include <linux/sysdev.h> 26#include <linux/sysdev.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/ioport.h>
28 29
29#include <asm/atomic.h> 30#include <asm/atomic.h>
30#include <asm/smp.h> 31#include <asm/smp.h>
@@ -36,13 +37,20 @@
36#include <asm/idle.h> 37#include <asm/idle.h>
37#include <asm/proto.h> 38#include <asm/proto.h>
38#include <asm/timex.h> 39#include <asm/timex.h>
40#include <asm/apic.h>
39 41
42int apic_mapped;
40int apic_verbosity; 43int apic_verbosity;
41int apic_runs_main_timer; 44int apic_runs_main_timer;
42int apic_calibrate_pmtmr __initdata; 45int apic_calibrate_pmtmr __initdata;
43 46
44int disable_apic_timer __initdata; 47int disable_apic_timer __initdata;
45 48
49static struct resource lapic_resource = {
50 .name = "Local APIC",
51 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
52};
53
46/* 54/*
47 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as 55 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
48 * IPIs in place of local APIC timers 56 * IPIs in place of local APIC timers
@@ -136,72 +144,40 @@ void clear_local_APIC(void)
136 apic_read(APIC_ESR); 144 apic_read(APIC_ESR);
137} 145}
138 146
139void __init connect_bsp_APIC(void)
140{
141 if (pic_mode) {
142 /*
143 * Do not trust the local APIC being empty at bootup.
144 */
145 clear_local_APIC();
146 /*
147 * PIC mode, enable APIC mode in the IMCR, i.e.
148 * connect BSP's local APIC to INT and NMI lines.
149 */
150 apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n");
151 outb(0x70, 0x22);
152 outb(0x01, 0x23);
153 }
154}
155
156void disconnect_bsp_APIC(int virt_wire_setup) 147void disconnect_bsp_APIC(int virt_wire_setup)
157{ 148{
158 if (pic_mode) { 149 /* Go back to Virtual Wire compatibility mode */
159 /* 150 unsigned long value;
160 * Put the board back into PIC mode (has an effect 151
161 * only on certain older boards). Note that APIC 152 /* For the spurious interrupt use vector F, and enable it */
162 * interrupts, including IPIs, won't work beyond 153 value = apic_read(APIC_SPIV);
163 * this point! The only exception are INIT IPIs. 154 value &= ~APIC_VECTOR_MASK;
164 */ 155 value |= APIC_SPIV_APIC_ENABLED;
165 apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); 156 value |= 0xf;
166 outb(0x70, 0x22); 157 apic_write(APIC_SPIV, value);
167 outb(0x00, 0x23);
168 }
169 else {
170 /* Go back to Virtual Wire compatibility mode */
171 unsigned long value;
172
173 /* For the spurious interrupt use vector F, and enable it */
174 value = apic_read(APIC_SPIV);
175 value &= ~APIC_VECTOR_MASK;
176 value |= APIC_SPIV_APIC_ENABLED;
177 value |= 0xf;
178 apic_write(APIC_SPIV, value);
179
180 if (!virt_wire_setup) {
181 /* For LVT0 make it edge triggered, active high, external and enabled */
182 value = apic_read(APIC_LVT0);
183 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
184 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
185 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
186 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
187 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
188 apic_write(APIC_LVT0, value);
189 }
190 else {
191 /* Disable LVT0 */
192 apic_write(APIC_LVT0, APIC_LVT_MASKED);
193 }
194 158
195 /* For LVT1 make it edge triggered, active high, nmi and enabled */ 159 if (!virt_wire_setup) {
196 value = apic_read(APIC_LVT1); 160 /* For LVT0 make it edge triggered, active high, external and enabled */
197 value &= ~( 161 value = apic_read(APIC_LVT0);
198 APIC_MODE_MASK | APIC_SEND_PENDING | 162 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
199 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 163 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
200 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 164 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
201 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 165 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
202 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 166 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
203 apic_write(APIC_LVT1, value); 167 apic_write(APIC_LVT0, value);
168 } else {
169 /* Disable LVT0 */
170 apic_write(APIC_LVT0, APIC_LVT_MASKED);
204 } 171 }
172
173 /* For LVT1 make it edge triggered, active high, nmi and enabled */
174 value = apic_read(APIC_LVT1);
175 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
176 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
177 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
178 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
179 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
180 apic_write(APIC_LVT1, value);
205} 181}
206 182
207void disable_local_APIC(void) 183void disable_local_APIC(void)
@@ -297,8 +273,6 @@ void __init sync_Arb_IDs(void)
297 | APIC_DM_INIT); 273 | APIC_DM_INIT);
298} 274}
299 275
300extern void __error_in_apic_c (void);
301
302/* 276/*
303 * An initial setup of the virtual wire mode. 277 * An initial setup of the virtual wire mode.
304 */ 278 */
@@ -345,8 +319,7 @@ void __cpuinit setup_local_APIC (void)
345 319
346 value = apic_read(APIC_LVR); 320 value = apic_read(APIC_LVR);
347 321
348 if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) 322 BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
349 __error_in_apic_c();
350 323
351 /* 324 /*
352 * Double-check whether this APIC is really registered. 325 * Double-check whether this APIC is really registered.
@@ -399,32 +372,8 @@ void __cpuinit setup_local_APIC (void)
399 */ 372 */
400 value |= APIC_SPIV_APIC_ENABLED; 373 value |= APIC_SPIV_APIC_ENABLED;
401 374
402 /* 375 /* We always use processor focus */
403 * Some unknown Intel IO/APIC (or APIC) errata is biting us with 376
404 * certain networking cards. If high frequency interrupts are
405 * happening on a particular IOAPIC pin, plus the IOAPIC routing
406 * entry is masked/unmasked at a high rate as well then sooner or
407 * later IOAPIC line gets 'stuck', no more interrupts are received
408 * from the device. If focus CPU is disabled then the hang goes
409 * away, oh well :-(
410 *
411 * [ This bug can be reproduced easily with a level-triggered
412 * PCI Ne2000 networking cards and PII/PIII processors, dual
413 * BX chipset. ]
414 */
415 /*
416 * Actually disabling the focus CPU check just makes the hang less
417 * frequent as it makes the interrupt distributon model be more
418 * like LRU than MRU (the short-term load is more even across CPUs).
419 * See also the comment in end_level_ioapic_irq(). --macro
420 */
421#if 1
422 /* Enable focus processor (bit==0) */
423 value &= ~APIC_SPIV_FOCUS_DISABLED;
424#else
425 /* Disable focus processor (bit==1) */
426 value |= APIC_SPIV_FOCUS_DISABLED;
427#endif
428 /* 377 /*
429 * Set spurious IRQ vector 378 * Set spurious IRQ vector
430 */ 379 */
@@ -442,7 +391,7 @@ void __cpuinit setup_local_APIC (void)
442 * TODO: set up through-local-APIC from through-I/O-APIC? --macro 391 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
443 */ 392 */
444 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; 393 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
445 if (!smp_processor_id() && (pic_mode || !value)) { 394 if (!smp_processor_id() && !value) {
446 value = APIC_DM_EXTINT; 395 value = APIC_DM_EXTINT;
447 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); 396 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
448 } else { 397 } else {
@@ -479,8 +428,7 @@ void __cpuinit setup_local_APIC (void)
479 } 428 }
480 429
481 nmi_watchdog_default(); 430 nmi_watchdog_default();
482 if (nmi_watchdog == NMI_LOCAL_APIC) 431 setup_apic_nmi_watchdog(NULL);
483 setup_apic_nmi_watchdog();
484 apic_pm_activate(); 432 apic_pm_activate();
485} 433}
486 434
@@ -527,8 +475,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
527 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 475 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
528 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 476 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
529 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 477 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
530 local_save_flags(flags); 478 local_irq_save(flags);
531 local_irq_disable();
532 disable_local_APIC(); 479 disable_local_APIC();
533 local_irq_restore(flags); 480 local_irq_restore(flags);
534 return 0; 481 return 0;
@@ -606,18 +553,24 @@ static void apic_pm_activate(void) { }
606 553
607static int __init apic_set_verbosity(char *str) 554static int __init apic_set_verbosity(char *str)
608{ 555{
556 if (str == NULL) {
557 skip_ioapic_setup = 0;
558 ioapic_force = 1;
559 return 0;
560 }
609 if (strcmp("debug", str) == 0) 561 if (strcmp("debug", str) == 0)
610 apic_verbosity = APIC_DEBUG; 562 apic_verbosity = APIC_DEBUG;
611 else if (strcmp("verbose", str) == 0) 563 else if (strcmp("verbose", str) == 0)
612 apic_verbosity = APIC_VERBOSE; 564 apic_verbosity = APIC_VERBOSE;
613 else 565 else {
614 printk(KERN_WARNING "APIC Verbosity level %s not recognised" 566 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
615 " use apic=verbose or apic=debug", str); 567 " use apic=verbose or apic=debug\n", str);
568 return -EINVAL;
569 }
616 570
617 return 1; 571 return 0;
618} 572}
619 573early_param("apic", apic_set_verbosity);
620__setup("apic=", apic_set_verbosity);
621 574
622/* 575/*
623 * Detect and enable local APICs on non-SMP boards. 576 * Detect and enable local APICs on non-SMP boards.
@@ -638,6 +591,40 @@ static int __init detect_init_APIC (void)
638 return 0; 591 return 0;
639} 592}
640 593
594#ifdef CONFIG_X86_IO_APIC
595static struct resource * __init ioapic_setup_resources(void)
596{
597#define IOAPIC_RESOURCE_NAME_SIZE 11
598 unsigned long n;
599 struct resource *res;
600 char *mem;
601 int i;
602
603 if (nr_ioapics <= 0)
604 return NULL;
605
606 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
607 n *= nr_ioapics;
608
609 res = alloc_bootmem(n);
610
611 if (!res)
612 return NULL;
613
614 memset(res, 0, n);
615 mem = (void *)&res[nr_ioapics];
616
617 for (i = 0; i < nr_ioapics; i++) {
618 res[i].name = mem;
619 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
620 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
621 mem += IOAPIC_RESOURCE_NAME_SIZE;
622 }
623
624 return res;
625}
626#endif
627
641void __init init_apic_mappings(void) 628void __init init_apic_mappings(void)
642{ 629{
643 unsigned long apic_phys; 630 unsigned long apic_phys;
@@ -654,19 +641,26 @@ void __init init_apic_mappings(void)
654 apic_phys = mp_lapic_addr; 641 apic_phys = mp_lapic_addr;
655 642
656 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 643 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
644 apic_mapped = 1;
657 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); 645 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
658 646
647 /* Put local APIC into the resource map. */
648 lapic_resource.start = apic_phys;
649 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
650 insert_resource(&iomem_resource, &lapic_resource);
651
659 /* 652 /*
660 * Fetch the APIC ID of the BSP in case we have a 653 * Fetch the APIC ID of the BSP in case we have a
661 * default configuration (or the MP table is broken). 654 * default configuration (or the MP table is broken).
662 */ 655 */
663 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); 656 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
664 657
665#ifdef CONFIG_X86_IO_APIC
666 { 658 {
667 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; 659 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
668 int i; 660 int i;
661 struct resource *ioapic_res;
669 662
663 ioapic_res = ioapic_setup_resources();
670 for (i = 0; i < nr_ioapics; i++) { 664 for (i = 0; i < nr_ioapics; i++) {
671 if (smp_found_config) { 665 if (smp_found_config) {
672 ioapic_phys = mp_ioapics[i].mpc_apicaddr; 666 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
@@ -678,9 +672,15 @@ void __init init_apic_mappings(void)
678 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", 672 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
679 __fix_to_virt(idx), ioapic_phys); 673 __fix_to_virt(idx), ioapic_phys);
680 idx++; 674 idx++;
675
676 if (ioapic_res) {
677 ioapic_res->start = ioapic_phys;
678 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
679 insert_resource(&iomem_resource, ioapic_res);
680 ioapic_res++;
681 }
681 } 682 }
682 } 683 }
683#endif
684} 684}
685 685
686/* 686/*
@@ -951,7 +951,7 @@ void smp_local_timer_interrupt(struct pt_regs *regs)
951 * We take the 'long' return path, and there every subsystem 951 * We take the 'long' return path, and there every subsystem
952 * grabs the appropriate locks (kernel lock/ irq lock). 952 * grabs the appropriate locks (kernel lock/ irq lock).
953 * 953 *
954 * we might want to decouple profiling from the 'long path', 954 * We might want to decouple profiling from the 'long path',
955 * and do the profiling totally in assembly. 955 * and do the profiling totally in assembly.
956 * 956 *
957 * Currently this isn't too much of an issue (performance wise), 957 * Currently this isn't too much of an issue (performance wise),
@@ -1123,19 +1123,15 @@ int __init APIC_init_uniprocessor (void)
1123 1123
1124 verify_local_APIC(); 1124 verify_local_APIC();
1125 1125
1126 connect_bsp_APIC();
1127
1128 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); 1126 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
1129 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); 1127 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
1130 1128
1131 setup_local_APIC(); 1129 setup_local_APIC();
1132 1130
1133#ifdef CONFIG_X86_IO_APIC
1134 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 1131 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1135 setup_IO_APIC(); 1132 setup_IO_APIC();
1136 else 1133 else
1137 nr_ioapics = 0; 1134 nr_ioapics = 0;
1138#endif
1139 setup_boot_APIC_clock(); 1135 setup_boot_APIC_clock();
1140 check_nmi_watchdog(); 1136 check_nmi_watchdog();
1141 return 0; 1137 return 0;
@@ -1144,14 +1140,17 @@ int __init APIC_init_uniprocessor (void)
1144static __init int setup_disableapic(char *str) 1140static __init int setup_disableapic(char *str)
1145{ 1141{
1146 disable_apic = 1; 1142 disable_apic = 1;
1147 return 1; 1143 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1148} 1144 return 0;
1145}
1146early_param("disableapic", setup_disableapic);
1149 1147
1148/* same as disableapic, for compatibility */
1150static __init int setup_nolapic(char *str) 1149static __init int setup_nolapic(char *str)
1151{ 1150{
1152 disable_apic = 1; 1151 return setup_disableapic(str);
1153 return 1;
1154} 1152}
1153early_param("nolapic", setup_nolapic);
1155 1154
1156static __init int setup_noapictimer(char *str) 1155static __init int setup_noapictimer(char *str)
1157{ 1156{
@@ -1184,11 +1183,5 @@ static __init int setup_apicpmtimer(char *s)
1184} 1183}
1185__setup("apicpmtimer", setup_apicpmtimer); 1184__setup("apicpmtimer", setup_apicpmtimer);
1186 1185
1187/* dummy parsing: see setup.c */
1188
1189__setup("disableapic", setup_disableapic);
1190__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
1191
1192__setup("noapictimer", setup_noapictimer); 1186__setup("noapictimer", setup_noapictimer);
1193 1187
1194/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index d8d5750d6106..3525f884af82 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -23,6 +23,7 @@
23#include <asm/nmi.h> 23#include <asm/nmi.h>
24#include <asm/hw_irq.h> 24#include <asm/hw_irq.h>
25#include <asm/mach_apic.h> 25#include <asm/mach_apic.h>
26#include <asm/kdebug.h>
26 27
27/* This keeps a track of which one is crashing cpu. */ 28/* This keeps a track of which one is crashing cpu. */
28static int crashing_cpu; 29static int crashing_cpu;
@@ -68,7 +69,7 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
68 * for the data I pass, and I need tags 69 * for the data I pass, and I need tags
69 * on the data to indicate what information I have 70 * on the data to indicate what information I have
70 * squirrelled away. ELF notes happen to provide 71 * squirrelled away. ELF notes happen to provide
71 * all of that that no need to invent something new. 72 * all of that, no need to invent something new.
72 */ 73 */
73 74
74 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 75 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
@@ -95,15 +96,25 @@ static void crash_save_self(struct pt_regs *regs)
95#ifdef CONFIG_SMP 96#ifdef CONFIG_SMP
96static atomic_t waiting_for_crash_ipi; 97static atomic_t waiting_for_crash_ipi;
97 98
98static int crash_nmi_callback(struct pt_regs *regs, int cpu) 99static int crash_nmi_callback(struct notifier_block *self,
100 unsigned long val, void *data)
99{ 101{
102 struct pt_regs *regs;
103 int cpu;
104
105 if (val != DIE_NMI_IPI)
106 return NOTIFY_OK;
107
108 regs = ((struct die_args *)data)->regs;
109 cpu = raw_smp_processor_id();
110
100 /* 111 /*
101 * Don't do anything if this handler is invoked on crashing cpu. 112 * Don't do anything if this handler is invoked on crashing cpu.
102 * Otherwise, system will completely hang. Crashing cpu can get 113 * Otherwise, system will completely hang. Crashing cpu can get
103 * an NMI if system was initially booted with nmi_watchdog parameter. 114 * an NMI if system was initially booted with nmi_watchdog parameter.
104 */ 115 */
105 if (cpu == crashing_cpu) 116 if (cpu == crashing_cpu)
106 return 1; 117 return NOTIFY_STOP;
107 local_irq_disable(); 118 local_irq_disable();
108 119
109 crash_save_this_cpu(regs, cpu); 120 crash_save_this_cpu(regs, cpu);
@@ -127,12 +138,17 @@ static void smp_send_nmi_allbutself(void)
127 * cpu hotplug shouldn't matter. 138 * cpu hotplug shouldn't matter.
128 */ 139 */
129 140
141static struct notifier_block crash_nmi_nb = {
142 .notifier_call = crash_nmi_callback,
143};
144
130static void nmi_shootdown_cpus(void) 145static void nmi_shootdown_cpus(void)
131{ 146{
132 unsigned long msecs; 147 unsigned long msecs;
133 148
134 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); 149 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
135 set_nmi_callback(crash_nmi_callback); 150 if (register_die_notifier(&crash_nmi_nb))
151 return; /* return what? */
136 152
137 /* 153 /*
138 * Ensure the new callback function is set before sending 154 * Ensure the new callback function is set before sending
@@ -178,9 +194,7 @@ void machine_crash_shutdown(struct pt_regs *regs)
178 if(cpu_has_apic) 194 if(cpu_has_apic)
179 disable_local_APIC(); 195 disable_local_APIC();
180 196
181#if defined(CONFIG_X86_IO_APIC)
182 disable_IO_APIC(); 197 disable_IO_APIC();
183#endif
184 198
185 crash_save_self(regs); 199 crash_save_self(regs);
186} 200}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index d6d7f731f6f0..b3f0908668ec 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -16,6 +16,7 @@
16#include <linux/string.h> 16#include <linux/string.h>
17#include <linux/kexec.h> 17#include <linux/kexec.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/mm.h>
19 20
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21#include <asm/page.h> 22#include <asm/page.h>
@@ -24,6 +25,8 @@
24#include <asm/bootsetup.h> 25#include <asm/bootsetup.h>
25#include <asm/sections.h> 26#include <asm/sections.h>
26 27
28struct e820map e820 __initdata;
29
27/* 30/*
28 * PFN of last memory page. 31 * PFN of last memory page.
29 */ 32 */
@@ -40,7 +43,7 @@ unsigned long end_pfn_map;
40/* 43/*
41 * Last pfn which the user wants to use. 44 * Last pfn which the user wants to use.
42 */ 45 */
43unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; 46static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
44 47
45extern struct resource code_resource, data_resource; 48extern struct resource code_resource, data_resource;
46 49
@@ -69,12 +72,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
69 return 1; 72 return 1;
70 } 73 }
71#endif 74#endif
72 /* kernel code + 640k memory hole (later should not be needed, but 75 /* kernel code */
73 be paranoid for now) */
74 if (last >= 640*1024 && addr < 1024*1024) {
75 *addrp = 1024*1024;
76 return 1;
77 }
78 if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { 76 if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
79 *addrp = __pa_symbol(&_end); 77 *addrp = __pa_symbol(&_end);
80 return 1; 78 return 1;
@@ -164,59 +162,14 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi
164 return -1UL; 162 return -1UL;
165} 163}
166 164
167/*
168 * Free bootmem based on the e820 table for a node.
169 */
170void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
171{
172 int i;
173 for (i = 0; i < e820.nr_map; i++) {
174 struct e820entry *ei = &e820.map[i];
175 unsigned long last, addr;
176
177 if (ei->type != E820_RAM ||
178 ei->addr+ei->size <= start ||
179 ei->addr >= end)
180 continue;
181
182 addr = round_up(ei->addr, PAGE_SIZE);
183 if (addr < start)
184 addr = start;
185
186 last = round_down(ei->addr + ei->size, PAGE_SIZE);
187 if (last >= end)
188 last = end;
189
190 if (last > addr && last-addr >= PAGE_SIZE)
191 free_bootmem_node(pgdat, addr, last-addr);
192 }
193}
194
195/* 165/*
196 * Find the highest page frame number we have available 166 * Find the highest page frame number we have available
197 */ 167 */
198unsigned long __init e820_end_of_ram(void) 168unsigned long __init e820_end_of_ram(void)
199{ 169{
200 int i;
201 unsigned long end_pfn = 0; 170 unsigned long end_pfn = 0;
171 end_pfn = find_max_pfn_with_active_regions();
202 172
203 for (i = 0; i < e820.nr_map; i++) {
204 struct e820entry *ei = &e820.map[i];
205 unsigned long start, end;
206
207 start = round_up(ei->addr, PAGE_SIZE);
208 end = round_down(ei->addr + ei->size, PAGE_SIZE);
209 if (start >= end)
210 continue;
211 if (ei->type == E820_RAM) {
212 if (end > end_pfn<<PAGE_SHIFT)
213 end_pfn = end>>PAGE_SHIFT;
214 } else {
215 if (end > end_pfn_map<<PAGE_SHIFT)
216 end_pfn_map = end>>PAGE_SHIFT;
217 }
218 }
219
220 if (end_pfn > end_pfn_map) 173 if (end_pfn > end_pfn_map)
221 end_pfn_map = end_pfn; 174 end_pfn_map = end_pfn;
222 if (end_pfn_map > MAXMEM>>PAGE_SHIFT) 175 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
@@ -226,43 +179,10 @@ unsigned long __init e820_end_of_ram(void)
226 if (end_pfn > end_pfn_map) 179 if (end_pfn > end_pfn_map)
227 end_pfn = end_pfn_map; 180 end_pfn = end_pfn_map;
228 181
182 printk("end_pfn_map = %lu\n", end_pfn_map);
229 return end_pfn; 183 return end_pfn;
230} 184}
231 185
232/*
233 * Compute how much memory is missing in a range.
234 * Unlike the other functions in this file the arguments are in page numbers.
235 */
236unsigned long __init
237e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
238{
239 unsigned long ram = 0;
240 unsigned long start = start_pfn << PAGE_SHIFT;
241 unsigned long end = end_pfn << PAGE_SHIFT;
242 int i;
243 for (i = 0; i < e820.nr_map; i++) {
244 struct e820entry *ei = &e820.map[i];
245 unsigned long last, addr;
246
247 if (ei->type != E820_RAM ||
248 ei->addr+ei->size <= start ||
249 ei->addr >= end)
250 continue;
251
252 addr = round_up(ei->addr, PAGE_SIZE);
253 if (addr < start)
254 addr = start;
255
256 last = round_down(ei->addr + ei->size, PAGE_SIZE);
257 if (last >= end)
258 last = end;
259
260 if (last > addr)
261 ram += last - addr;
262 }
263 return ((end - start) - ram) >> PAGE_SHIFT;
264}
265
266/* 186/*
267 * Mark e820 reserved areas as busy for the resource manager. 187 * Mark e820 reserved areas as busy for the resource manager.
268 */ 188 */
@@ -297,6 +217,96 @@ void __init e820_reserve_resources(void)
297 } 217 }
298} 218}
299 219
220/* Mark pages corresponding to given address range as nosave */
221static void __init
222e820_mark_nosave_range(unsigned long start, unsigned long end)
223{
224 unsigned long pfn, max_pfn;
225
226 if (start >= end)
227 return;
228
229 printk("Nosave address range: %016lx - %016lx\n", start, end);
230 max_pfn = end >> PAGE_SHIFT;
231 for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
232 if (pfn_valid(pfn))
233 SetPageNosave(pfn_to_page(pfn));
234}
235
236/*
237 * Find the ranges of physical addresses that do not correspond to
238 * e820 RAM areas and mark the corresponding pages as nosave for software
239 * suspend and suspend to RAM.
240 *
241 * This function requires the e820 map to be sorted and without any
242 * overlapping entries and assumes the first e820 area to be RAM.
243 */
244void __init e820_mark_nosave_regions(void)
245{
246 int i;
247 unsigned long paddr;
248
249 paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
250 for (i = 1; i < e820.nr_map; i++) {
251 struct e820entry *ei = &e820.map[i];
252
253 if (paddr < ei->addr)
254 e820_mark_nosave_range(paddr,
255 round_up(ei->addr, PAGE_SIZE));
256
257 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
258 if (ei->type != E820_RAM)
259 e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
260 paddr);
261
262 if (paddr >= (end_pfn << PAGE_SHIFT))
263 break;
264 }
265}
266
267/* Walk the e820 map and register active regions within a node */
268void __init
269e820_register_active_regions(int nid, unsigned long start_pfn,
270 unsigned long end_pfn)
271{
272 int i;
273 unsigned long ei_startpfn, ei_endpfn;
274 for (i = 0; i < e820.nr_map; i++) {
275 struct e820entry *ei = &e820.map[i];
276 ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
277 ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
278 >> PAGE_SHIFT;
279
280 /* Skip map entries smaller than a page */
281 if (ei_startpfn > ei_endpfn)
282 continue;
283
284 /* Check if end_pfn_map should be updated */
285 if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
286 end_pfn_map = ei_endpfn;
287
288 /* Skip if map is outside the node */
289 if (ei->type != E820_RAM ||
290 ei_endpfn <= start_pfn ||
291 ei_startpfn >= end_pfn)
292 continue;
293
294 /* Check for overlaps */
295 if (ei_startpfn < start_pfn)
296 ei_startpfn = start_pfn;
297 if (ei_endpfn > end_pfn)
298 ei_endpfn = end_pfn;
299
300 /* Obey end_user_pfn to save on memmap */
301 if (ei_startpfn >= end_user_pfn)
302 continue;
303 if (ei_endpfn > end_user_pfn)
304 ei_endpfn = end_user_pfn;
305
306 add_active_range(nid, ei_startpfn, ei_endpfn);
307 }
308}
309
300/* 310/*
301 * Add a memory region to the kernel e820 map. 311 * Add a memory region to the kernel e820 map.
302 */ 312 */
@@ -517,13 +527,6 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
517 * If we're lucky and live on a modern system, the setup code 527 * If we're lucky and live on a modern system, the setup code
518 * will have given us a memory map that we can use to properly 528 * will have given us a memory map that we can use to properly
519 * set up memory. If we aren't, we'll fake a memory map. 529 * set up memory. If we aren't, we'll fake a memory map.
520 *
521 * We check to see that the memory map contains at least 2 elements
522 * before we'll use it, because the detection code in setup.S may
523 * not be perfect and most every PC known to man has two memory
524 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
525 * thinkpad 560x, for example, does not cooperate with the memory
526 * detection code.)
527 */ 530 */
528static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) 531static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
529{ 532{
@@ -541,34 +544,19 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
541 if (start > end) 544 if (start > end)
542 return -1; 545 return -1;
543 546
544 /*
545 * Some BIOSes claim RAM in the 640k - 1M region.
546 * Not right. Fix it up.
547 *
548 * This should be removed on Hammer which is supposed to not
549 * have non e820 covered ISA mappings there, but I had some strange
550 * problems so it stays for now. -AK
551 */
552 if (type == E820_RAM) {
553 if (start < 0x100000ULL && end > 0xA0000ULL) {
554 if (start < 0xA0000ULL)
555 add_memory_region(start, 0xA0000ULL-start, type);
556 if (end <= 0x100000ULL)
557 continue;
558 start = 0x100000ULL;
559 size = end - start;
560 }
561 }
562
563 add_memory_region(start, size, type); 547 add_memory_region(start, size, type);
564 } while (biosmap++,--nr_map); 548 } while (biosmap++,--nr_map);
565 return 0; 549 return 0;
566} 550}
567 551
568void __init setup_memory_region(void) 552void early_panic(char *msg)
569{ 553{
570 char *who = "BIOS-e820"; 554 early_printk(msg);
555 panic(msg);
556}
571 557
558void __init setup_memory_region(void)
559{
572 /* 560 /*
573 * Try to copy the BIOS-supplied E820-map. 561 * Try to copy the BIOS-supplied E820-map.
574 * 562 *
@@ -576,51 +564,70 @@ void __init setup_memory_region(void)
576 * the next section from 1mb->appropriate_mem_k 564 * the next section from 1mb->appropriate_mem_k
577 */ 565 */
578 sanitize_e820_map(E820_MAP, &E820_MAP_NR); 566 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
579 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { 567 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
580 unsigned long mem_size; 568 early_panic("Cannot find a valid memory map");
581
582 /* compare results from other methods and take the greater */
583 if (ALT_MEM_K < EXT_MEM_K) {
584 mem_size = EXT_MEM_K;
585 who = "BIOS-88";
586 } else {
587 mem_size = ALT_MEM_K;
588 who = "BIOS-e801";
589 }
590
591 e820.nr_map = 0;
592 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
593 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
594 }
595 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 569 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
596 e820_print_map(who); 570 e820_print_map("BIOS-e820");
597} 571}
598 572
599void __init parse_memopt(char *p, char **from) 573static int __init parse_memopt(char *p)
600{ 574{
601 end_user_pfn = memparse(p, from); 575 if (!p)
576 return -EINVAL;
577 end_user_pfn = memparse(p, &p);
602 end_user_pfn >>= PAGE_SHIFT; 578 end_user_pfn >>= PAGE_SHIFT;
579 return 0;
603} 580}
581early_param("mem", parse_memopt);
582
583static int userdef __initdata;
604 584
605void __init parse_memmapopt(char *p, char **from) 585static int __init parse_memmap_opt(char *p)
606{ 586{
587 char *oldp;
607 unsigned long long start_at, mem_size; 588 unsigned long long start_at, mem_size;
608 589
609 mem_size = memparse(p, from); 590 if (!strcmp(p, "exactmap")) {
610 p = *from; 591#ifdef CONFIG_CRASH_DUMP
592 /* If we are doing a crash dump, we
593 * still need to know the real mem
594 * size before original memory map is
595 * reset.
596 */
597 saved_max_pfn = e820_end_of_ram();
598#endif
599 end_pfn_map = 0;
600 e820.nr_map = 0;
601 userdef = 1;
602 return 0;
603 }
604
605 oldp = p;
606 mem_size = memparse(p, &p);
607 if (p == oldp)
608 return -EINVAL;
611 if (*p == '@') { 609 if (*p == '@') {
612 start_at = memparse(p+1, from); 610 start_at = memparse(p+1, &p);
613 add_memory_region(start_at, mem_size, E820_RAM); 611 add_memory_region(start_at, mem_size, E820_RAM);
614 } else if (*p == '#') { 612 } else if (*p == '#') {
615 start_at = memparse(p+1, from); 613 start_at = memparse(p+1, &p);
616 add_memory_region(start_at, mem_size, E820_ACPI); 614 add_memory_region(start_at, mem_size, E820_ACPI);
617 } else if (*p == '$') { 615 } else if (*p == '$') {
618 start_at = memparse(p+1, from); 616 start_at = memparse(p+1, &p);
619 add_memory_region(start_at, mem_size, E820_RESERVED); 617 add_memory_region(start_at, mem_size, E820_RESERVED);
620 } else { 618 } else {
621 end_user_pfn = (mem_size >> PAGE_SHIFT); 619 end_user_pfn = (mem_size >> PAGE_SHIFT);
622 } 620 }
623 p = *from; 621 return *p == '\0' ? 0 : -EINVAL;
622}
623early_param("memmap", parse_memmap_opt);
624
625void finish_e820_parsing(void)
626{
627 if (userdef) {
628 printk(KERN_INFO "user-defined physical RAM map:\n");
629 e820_print_map("user");
630 }
624} 631}
625 632
626unsigned long pci_mem_start = 0xaeedbabe; 633unsigned long pci_mem_start = 0xaeedbabe;
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
new file mode 100644
index 000000000000..208e38a372c1
--- /dev/null
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -0,0 +1,122 @@
1/* Various workarounds for chipset bugs.
2 This code runs very early and can't use the regular PCI subsystem
3 The entries are keyed to PCI bridges which usually identify chipsets
4 uniquely.
5 This is only for whole classes of chipsets with specific problems which
6 need early invasive action (e.g. before the timers are initialized).
7 Most PCI device specific workarounds can be done later and should be
8 in standard PCI quirks
9 Mainboard specific bugs should be handled by DMI entries.
10 CPU specific bugs in setup.c */
11
12#include <linux/pci.h>
13#include <linux/acpi.h>
14#include <linux/pci_ids.h>
15#include <asm/pci-direct.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18
19static void via_bugs(void)
20{
21#ifdef CONFIG_IOMMU
22 if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
23 !iommu_aperture_allowed) {
24 printk(KERN_INFO
25 "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
26 iommu_aperture_disabled = 1;
27 }
28#endif
29}
30
31#ifdef CONFIG_ACPI
32
33static int nvidia_hpet_detected __initdata;
34
35static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
36{
37 nvidia_hpet_detected = 1;
38 return 0;
39}
40#endif
41
42static void nvidia_bugs(void)
43{
44#ifdef CONFIG_ACPI
45 /*
46 * All timer overrides on Nvidia are
47 * wrong unless HPET is enabled.
48 */
49 nvidia_hpet_detected = 0;
50 acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
51 if (nvidia_hpet_detected == 0) {
52 acpi_skip_timer_override = 1;
53 printk(KERN_INFO "Nvidia board "
54 "detected. Ignoring ACPI "
55 "timer override.\n");
56 }
57#endif
58 /* RED-PEN skip them on mptables too? */
59
60}
61
62static void ati_bugs(void)
63{
64#if 1 /* for testing */
65 printk("ATI board detected\n");
66#endif
67 /* No bugs right now */
68}
69
70struct chipset {
71 u16 vendor;
72 void (*f)(void);
73};
74
75static struct chipset early_qrk[] = {
76 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
77 { PCI_VENDOR_ID_VIA, via_bugs },
78 { PCI_VENDOR_ID_ATI, ati_bugs },
79 {}
80};
81
82void __init early_quirks(void)
83{
84 int num, slot, func;
85
86 if (!early_pci_allowed())
87 return;
88
89 /* Poor man's PCI discovery */
90 for (num = 0; num < 32; num++) {
91 for (slot = 0; slot < 32; slot++) {
92 for (func = 0; func < 8; func++) {
93 u32 class;
94 u32 vendor;
95 u8 type;
96 int i;
97 class = read_pci_config(num,slot,func,
98 PCI_CLASS_REVISION);
99 if (class == 0xffffffff)
100 break;
101
102 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
103 continue;
104
105 vendor = read_pci_config(num, slot, func,
106 PCI_VENDOR_ID);
107 vendor &= 0xffff;
108
109 for (i = 0; early_qrk[i].f; i++)
110 if (early_qrk[i].vendor == vendor) {
111 early_qrk[i].f();
112 return;
113 }
114
115 type = read_pci_config_byte(num, slot, func,
116 PCI_HEADER_TYPE);
117 if (!(type & 0x80))
118 break;
119 }
120 }
121 }
122}
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 140051e07fa6..e22ecd54870d 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -215,20 +215,16 @@ void early_printk(const char *fmt, ...)
215 215
216static int __initdata keep_early; 216static int __initdata keep_early;
217 217
218int __init setup_early_printk(char *opt) 218static int __init setup_early_printk(char *buf)
219{ 219{
220 char *space; 220 if (!buf)
221 char buf[256]; 221 return 0;
222 222
223 if (early_console_initialized) 223 if (early_console_initialized)
224 return 1; 224 return 0;
225 225 early_console_initialized = 1;
226 strlcpy(buf,opt,sizeof(buf));
227 space = strchr(buf, ' ');
228 if (space)
229 *space = 0;
230 226
231 if (strstr(buf,"keep")) 227 if (!strcmp(buf,"keep"))
232 keep_early = 1; 228 keep_early = 1;
233 229
234 if (!strncmp(buf, "serial", 6)) { 230 if (!strncmp(buf, "serial", 6)) {
@@ -248,11 +244,12 @@ int __init setup_early_printk(char *opt)
248 early_console = &simnow_console; 244 early_console = &simnow_console;
249 keep_early = 1; 245 keep_early = 1;
250 } 246 }
251 early_console_initialized = 1;
252 register_console(early_console); 247 register_console(early_console);
253 return 0; 248 return 0;
254} 249}
255 250
251early_param("earlyprintk", setup_early_printk);
252
256void __init disable_early_printk(void) 253void __init disable_early_printk(void)
257{ 254{
258 if (!early_console_initialized || !early_console) 255 if (!early_console_initialized || !early_console)
@@ -266,4 +263,3 @@ void __init disable_early_printk(void)
266 } 263 }
267} 264}
268 265
269__setup("earlyprintk=", setup_early_printk);
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index aa8d8939abc1..2802524104f3 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -4,8 +4,6 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs 5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 *
8 * $Id$
9 */ 7 */
10 8
11/* 9/*
@@ -22,15 +20,25 @@
22 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers upto R11.
24 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
25 * 23 *
26 * TODO: 24 * Some macro usage:
27 * - schedule it carefully for the final hardware. 25 * - CFI macros are used to generate dwarf2 unwind information for better
26 * backtraces. They don't change any code.
27 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
28 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
29 * There are unfortunately lots of special cases where some registers
30 * not touched. The macro is a big mess that should be cleaned up.
31 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
32 * Gives a full stack frame.
33 * - ENTRY/END Define functions in the symbol table.
34 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
35 * frame that is otherwise undefined after a SYSCALL
36 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
37 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
28 */ 38 */
29 39
30#define ASSEMBLY 1
31#include <linux/linkage.h> 40#include <linux/linkage.h>
32#include <asm/segment.h> 41#include <asm/segment.h>
33#include <asm/smp.h>
34#include <asm/cache.h> 42#include <asm/cache.h>
35#include <asm/errno.h> 43#include <asm/errno.h>
36#include <asm/dwarf2.h> 44#include <asm/dwarf2.h>
@@ -115,6 +123,7 @@
115 .macro CFI_DEFAULT_STACK start=1 123 .macro CFI_DEFAULT_STACK start=1
116 .if \start 124 .if \start
117 CFI_STARTPROC simple 125 CFI_STARTPROC simple
126 CFI_SIGNAL_FRAME
118 CFI_DEF_CFA rsp,SS+8 127 CFI_DEF_CFA rsp,SS+8
119 .else 128 .else
120 CFI_DEF_CFA_OFFSET SS+8 129 CFI_DEF_CFA_OFFSET SS+8
@@ -146,6 +155,10 @@
146/* rdi: prev */ 155/* rdi: prev */
147ENTRY(ret_from_fork) 156ENTRY(ret_from_fork)
148 CFI_DEFAULT_STACK 157 CFI_DEFAULT_STACK
158 push kernel_eflags(%rip)
159 CFI_ADJUST_CFA_OFFSET 4
160 popf # reset kernel eflags
161 CFI_ADJUST_CFA_OFFSET -4
149 call schedule_tail 162 call schedule_tail
150 GET_THREAD_INFO(%rcx) 163 GET_THREAD_INFO(%rcx)
151 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) 164 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
@@ -199,6 +212,7 @@ END(ret_from_fork)
199 212
200ENTRY(system_call) 213ENTRY(system_call)
201 CFI_STARTPROC simple 214 CFI_STARTPROC simple
215 CFI_SIGNAL_FRAME
202 CFI_DEF_CFA rsp,PDA_STACKOFFSET 216 CFI_DEF_CFA rsp,PDA_STACKOFFSET
203 CFI_REGISTER rip,rcx 217 CFI_REGISTER rip,rcx
204 /*CFI_REGISTER rflags,r11*/ 218 /*CFI_REGISTER rflags,r11*/
@@ -316,6 +330,7 @@ END(system_call)
316 */ 330 */
317ENTRY(int_ret_from_sys_call) 331ENTRY(int_ret_from_sys_call)
318 CFI_STARTPROC simple 332 CFI_STARTPROC simple
333 CFI_SIGNAL_FRAME
319 CFI_DEF_CFA rsp,SS+8-ARGOFFSET 334 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
320 /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ 335 /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
321 CFI_REL_OFFSET rsp,RSP-ARGOFFSET 336 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
@@ -476,6 +491,7 @@ END(stub_rt_sigreturn)
476 */ 491 */
477 .macro _frame ref 492 .macro _frame ref
478 CFI_STARTPROC simple 493 CFI_STARTPROC simple
494 CFI_SIGNAL_FRAME
479 CFI_DEF_CFA rsp,SS+8-\ref 495 CFI_DEF_CFA rsp,SS+8-\ref
480 /*CFI_REL_OFFSET ss,SS-\ref*/ 496 /*CFI_REL_OFFSET ss,SS-\ref*/
481 CFI_REL_OFFSET rsp,RSP-\ref 497 CFI_REL_OFFSET rsp,RSP-\ref
@@ -511,7 +527,12 @@ END(stub_rt_sigreturn)
511 testl $3,CS(%rdi) 527 testl $3,CS(%rdi)
512 je 1f 528 je 1f
513 swapgs 529 swapgs
5141: incl %gs:pda_irqcount # RED-PEN should check preempt count 530 /* irqcount is used to check if a CPU is already on an interrupt
531 stack or not. While this is essentially redundant with preempt_count
532 it is a little cheaper to use a separate counter in the PDA
533 (short of moving irq_enter into assembly, which would be too
534 much work) */
5351: incl %gs:pda_irqcount
515 cmoveq %gs:pda_irqstackptr,%rsp 536 cmoveq %gs:pda_irqstackptr,%rsp
516 push %rbp # backlink for old unwinder 537 push %rbp # backlink for old unwinder
517 /* 538 /*
@@ -619,8 +640,7 @@ retint_signal:
619#ifdef CONFIG_PREEMPT 640#ifdef CONFIG_PREEMPT
620 /* Returning to kernel space. Check if we need preemption */ 641 /* Returning to kernel space. Check if we need preemption */
621 /* rcx: threadinfo. interrupts off. */ 642 /* rcx: threadinfo. interrupts off. */
622 .p2align 643ENTRY(retint_kernel)
623retint_kernel:
624 cmpl $0,threadinfo_preempt_count(%rcx) 644 cmpl $0,threadinfo_preempt_count(%rcx)
625 jnz retint_restore_args 645 jnz retint_restore_args
626 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) 646 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
@@ -679,7 +699,6 @@ ENTRY(call_function_interrupt)
679END(call_function_interrupt) 699END(call_function_interrupt)
680#endif 700#endif
681 701
682#ifdef CONFIG_X86_LOCAL_APIC
683ENTRY(apic_timer_interrupt) 702ENTRY(apic_timer_interrupt)
684 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt 703 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
685END(apic_timer_interrupt) 704END(apic_timer_interrupt)
@@ -691,7 +710,6 @@ END(error_interrupt)
691ENTRY(spurious_interrupt) 710ENTRY(spurious_interrupt)
692 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt 711 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
693END(spurious_interrupt) 712END(spurious_interrupt)
694#endif
695 713
696/* 714/*
697 * Exception entry points. 715 * Exception entry points.
@@ -768,7 +786,9 @@ paranoid_exit\trace:
768 testl $3,CS(%rsp) 786 testl $3,CS(%rsp)
769 jnz paranoid_userspace\trace 787 jnz paranoid_userspace\trace
770paranoid_swapgs\trace: 788paranoid_swapgs\trace:
789 .if \trace
771 TRACE_IRQS_IRETQ 0 790 TRACE_IRQS_IRETQ 0
791 .endif
772 swapgs 792 swapgs
773paranoid_restore\trace: 793paranoid_restore\trace:
774 RESTORE_ALL 8 794 RESTORE_ALL 8
@@ -814,7 +834,7 @@ paranoid_schedule\trace:
814 * Exception entry point. This expects an error code/orig_rax on the stack 834 * Exception entry point. This expects an error code/orig_rax on the stack
815 * and the exception handler in %rax. 835 * and the exception handler in %rax.
816 */ 836 */
817ENTRY(error_entry) 837KPROBE_ENTRY(error_entry)
818 _frame RDI 838 _frame RDI
819 /* rdi slot contains rax, oldrax contains error code */ 839 /* rdi slot contains rax, oldrax contains error code */
820 cld 840 cld
@@ -898,7 +918,7 @@ error_kernelspace:
898 cmpq $gs_change,RIP(%rsp) 918 cmpq $gs_change,RIP(%rsp)
899 je error_swapgs 919 je error_swapgs
900 jmp error_sti 920 jmp error_sti
901END(error_entry) 921KPROBE_END(error_entry)
902 922
903 /* Reload gs selector with exception handling */ 923 /* Reload gs selector with exception handling */
904 /* edi: new selector */ 924 /* edi: new selector */
@@ -1020,8 +1040,7 @@ ENDPROC(execve)
1020 1040
1021KPROBE_ENTRY(page_fault) 1041KPROBE_ENTRY(page_fault)
1022 errorentry do_page_fault 1042 errorentry do_page_fault
1023END(page_fault) 1043KPROBE_END(page_fault)
1024 .previous .text
1025 1044
1026ENTRY(coprocessor_error) 1045ENTRY(coprocessor_error)
1027 zeroentry do_coprocessor_error 1046 zeroentry do_coprocessor_error
@@ -1042,8 +1061,7 @@ KPROBE_ENTRY(debug)
1042 CFI_ADJUST_CFA_OFFSET 8 1061 CFI_ADJUST_CFA_OFFSET 8
1043 paranoidentry do_debug, DEBUG_STACK 1062 paranoidentry do_debug, DEBUG_STACK
1044 paranoidexit 1063 paranoidexit
1045END(debug) 1064KPROBE_END(debug)
1046 .previous .text
1047 1065
1048 /* runs on exception stack */ 1066 /* runs on exception stack */
1049KPROBE_ENTRY(nmi) 1067KPROBE_ENTRY(nmi)
@@ -1057,8 +1075,7 @@ KPROBE_ENTRY(nmi)
1057 jmp paranoid_exit1 1075 jmp paranoid_exit1
1058 CFI_ENDPROC 1076 CFI_ENDPROC
1059#endif 1077#endif
1060END(nmi) 1078KPROBE_END(nmi)
1061 .previous .text
1062 1079
1063KPROBE_ENTRY(int3) 1080KPROBE_ENTRY(int3)
1064 INTR_FRAME 1081 INTR_FRAME
@@ -1067,8 +1084,7 @@ KPROBE_ENTRY(int3)
1067 paranoidentry do_int3, DEBUG_STACK 1084 paranoidentry do_int3, DEBUG_STACK
1068 jmp paranoid_exit1 1085 jmp paranoid_exit1
1069 CFI_ENDPROC 1086 CFI_ENDPROC
1070END(int3) 1087KPROBE_END(int3)
1071 .previous .text
1072 1088
1073ENTRY(overflow) 1089ENTRY(overflow)
1074 zeroentry do_overflow 1090 zeroentry do_overflow
@@ -1116,8 +1132,7 @@ END(stack_segment)
1116 1132
1117KPROBE_ENTRY(general_protection) 1133KPROBE_ENTRY(general_protection)
1118 errorentry do_general_protection 1134 errorentry do_general_protection
1119END(general_protection) 1135KPROBE_END(general_protection)
1120 .previous .text
1121 1136
1122ENTRY(alignment_check) 1137ENTRY(alignment_check)
1123 errorentry do_alignment_check 1138 errorentry do_alignment_check
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
index 3020917546de..cdb90e671b88 100644
--- a/arch/x86_64/kernel/genapic_cluster.c
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -118,7 +118,6 @@ struct genapic apic_cluster = {
118 .name = "clustered", 118 .name = "clustered",
119 .int_delivery_mode = dest_Fixed, 119 .int_delivery_mode = dest_Fixed,
120 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 120 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
121 .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
122 .target_cpus = cluster_target_cpus, 121 .target_cpus = cluster_target_cpus,
123 .apic_id_registered = cluster_apic_id_registered, 122 .apic_id_registered = cluster_apic_id_registered,
124 .init_apic_ldr = cluster_init_apic_ldr, 123 .init_apic_ldr = cluster_init_apic_ldr,
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index eb86d374813a..50ad153eaac4 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -49,8 +49,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
49 unsigned long cfg; 49 unsigned long cfg;
50 unsigned long flags; 50 unsigned long flags;
51 51
52 local_save_flags(flags); 52 local_irq_save(flags);
53 local_irq_disable();
54 53
55 /* 54 /*
56 * Wait for idle. 55 * Wait for idle.
@@ -121,7 +120,6 @@ struct genapic apic_flat = {
121 .name = "flat", 120 .name = "flat",
122 .int_delivery_mode = dest_LowestPrio, 121 .int_delivery_mode = dest_LowestPrio,
123 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 122 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
124 .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
125 .target_cpus = flat_target_cpus, 123 .target_cpus = flat_target_cpus,
126 .apic_id_registered = flat_apic_id_registered, 124 .apic_id_registered = flat_apic_id_registered,
127 .init_apic_ldr = flat_init_apic_ldr, 125 .init_apic_ldr = flat_init_apic_ldr,
@@ -180,7 +178,6 @@ struct genapic apic_physflat = {
180 .name = "physical flat", 178 .name = "physical flat",
181 .int_delivery_mode = dest_Fixed, 179 .int_delivery_mode = dest_Fixed,
182 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 180 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
183 .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
184 .target_cpus = physflat_target_cpus, 181 .target_cpus = physflat_target_cpus,
185 .apic_id_registered = flat_apic_id_registered, 182 .apic_id_registered = flat_apic_id_registered,
186 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ 183 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index c9739ca81d06..1e6f80870679 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,8 +5,6 @@
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> 5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> 6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> 7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
8 *
9 * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
10 */ 8 */
11 9
12 10
@@ -187,12 +185,15 @@ startup_64:
187 185
188 /* Finally jump to run C code and to be on real kernel address 186 /* Finally jump to run C code and to be on real kernel address
189 * Since we are running on identity-mapped space we have to jump 187 * Since we are running on identity-mapped space we have to jump
190 * to the full 64bit address , this is only possible as indirect 188 * to the full 64bit address, this is only possible as indirect
191 * jump 189 * jump. In addition we need to ensure %cs is set so we make this
190 * a far return.
192 */ 191 */
193 movq initial_code(%rip),%rax 192 movq initial_code(%rip),%rax
194 pushq $0 # fake return address 193 pushq $0 # fake return address to stop unwinder
195 jmp *%rax 194 pushq $__KERNEL_CS # set correct cs
195 pushq %rax # target address in negative space
196 lretq
196 197
197 /* SMP bootup changes these two */ 198 /* SMP bootup changes these two */
198 .align 8 199 .align 8
@@ -371,7 +372,7 @@ ENTRY(cpu_gdt_table)
371 .quad 0,0 /* TSS */ 372 .quad 0,0 /* TSS */
372 .quad 0,0 /* LDT */ 373 .quad 0,0 /* LDT */
373 .quad 0,0,0 /* three TLS descriptors */ 374 .quad 0,0,0 /* three TLS descriptors */
374 .quad 0 /* unused */ 375 .quad 0x0000f40000000000 /* node/CPU stored in limit */
375gdt_end: 376gdt_end:
376 /* asm/segment.h:GDT_ENTRIES must match this */ 377 /* asm/segment.h:GDT_ENTRIES must match this */
377 /* This should be a multiple of the cache line size */ 378 /* This should be a multiple of the cache line size */
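
The startup_64 change above replaces the plain indirect jmp with a far return so that %cs is reloaded with __KERNEL_CS at the same time as the new %rip. A minimal userspace illustration of the lretq idiom follows; it is illustrative inline asm in an ordinary C program, not the kernel's entry path, and the red-zone adjustment is only needed because of that context:

#include <stdio.h>

/* Far return: lretq pops the target %rip and then a %cs selector, so one
 * instruction transfers control and reloads the code segment.  Here we
 * "return" to the local label 1 using the current %cs. */
static void far_return_demo(void)
{
	asm volatile(
		"subq   $128, %%rsp\n\t"        /* stay clear of the red zone */
		"xorq   %%rax, %%rax\n\t"
		"movw   %%cs, %%ax\n\t"
		"pushq  %%rax\n\t"              /* selector to load into %cs */
		"leaq   1f(%%rip), %%rax\n\t"
		"pushq  %%rax\n\t"              /* target address */
		"lretq\n"
		"1:\n\t"
		"addq   $128, %%rsp"
		::: "rax", "memory", "cc");
}

int main(void)
{
	far_return_demo();
	puts("came back through lretq");
	return 0;
}
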
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 36647ce6aecb..9561eb3c5b5c 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -45,38 +45,16 @@ static void __init copy_bootdata(char *real_mode_data)
45 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); 45 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
46 if (!new_data) { 46 if (!new_data) {
47 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { 47 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
48 printk("so old bootloader that it does not support commandline?!\n");
49 return; 48 return;
50 } 49 }
51 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; 50 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
52 printk("old bootloader convention, maybe loadlin?\n");
53 } 51 }
54 command_line = (char *) ((u64)(new_data)); 52 command_line = (char *) ((u64)(new_data));
55 memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); 53 memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
56 printk("Bootdata ok (command line is %s)\n", saved_command_line);
57}
58
59static void __init setup_boot_cpu_data(void)
60{
61 unsigned int dummy, eax;
62
63 /* get vendor info */
64 cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
65 (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
66 (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
67 (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
68
69 /* get cpu type */
70 cpuid(1, &eax, &dummy, &dummy,
71 (unsigned int *) &boot_cpu_data.x86_capability);
72 boot_cpu_data.x86 = (eax >> 8) & 0xf;
73 boot_cpu_data.x86_model = (eax >> 4) & 0xf;
74 boot_cpu_data.x86_mask = eax & 0xf;
75} 54}
76 55
77void __init x86_64_start_kernel(char * real_mode_data) 56void __init x86_64_start_kernel(char * real_mode_data)
78{ 57{
79 char *s;
80 int i; 58 int i;
81 59
82 for (i = 0; i < 256; i++) 60 for (i = 0; i < 256; i++)
@@ -84,10 +62,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
84 asm volatile("lidt %0" :: "m" (idt_descr)); 62 asm volatile("lidt %0" :: "m" (idt_descr));
85 clear_bss(); 63 clear_bss();
86 64
87 /* 65 early_printk("Kernel alive\n");
88 * This must be called really, really early:
89 */
90 lockdep_init();
91 66
92 /* 67 /*
93 * switch to init_level4_pgt from boot_level4_pgt 68 * switch to init_level4_pgt from boot_level4_pgt
@@ -103,22 +78,5 @@ void __init x86_64_start_kernel(char * real_mode_data)
103#ifdef CONFIG_SMP 78#ifdef CONFIG_SMP
104 cpu_set(0, cpu_online_map); 79 cpu_set(0, cpu_online_map);
105#endif 80#endif
106 s = strstr(saved_command_line, "earlyprintk=");
107 if (s != NULL)
108 setup_early_printk(strchr(s, '=') + 1);
109#ifdef CONFIG_NUMA
110 s = strstr(saved_command_line, "numa=");
111 if (s != NULL)
112 numa_setup(s+5);
113#endif
114#ifdef CONFIG_X86_IO_APIC
115 if (strstr(saved_command_line, "disableapic"))
116 disable_apic = 1;
117#endif
118 /* You need early console to see that */
119 if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
120 panic("Kernel too big for kernel mapping\n");
121
122 setup_boot_cpu_data();
123 start_kernel(); 81 start_kernel();
124} 82}
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 0434b1f8e3dd..2dd51f364ea2 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -55,7 +55,6 @@
55 */ 55 */
56BUILD_16_IRQS(0x0) 56BUILD_16_IRQS(0x0)
57 57
58#ifdef CONFIG_X86_LOCAL_APIC
59/* 58/*
60 * The IO-APIC gives us many more interrupt sources. Most of these 59 * The IO-APIC gives us many more interrupt sources. Most of these
61 * are unused but an SMP system is supposed to have enough memory ... 60 * are unused but an SMP system is supposed to have enough memory ...
@@ -75,8 +74,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
75 BUILD_15_IRQS(0xe) 74 BUILD_15_IRQS(0xe)
76#endif 75#endif
77 76
78#endif
79
80#undef BUILD_16_IRQS 77#undef BUILD_16_IRQS
81#undef BUILD_15_IRQS 78#undef BUILD_15_IRQS
82#undef BI 79#undef BI
@@ -100,7 +97,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
100void (*interrupt[NR_IRQS])(void) = { 97void (*interrupt[NR_IRQS])(void) = {
101 IRQLIST_16(0x0), 98 IRQLIST_16(0x0),
102 99
103#ifdef CONFIG_X86_IO_APIC
104 IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), 100 IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
105 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), 101 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
106 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), 102 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
@@ -110,7 +106,6 @@ void (*interrupt[NR_IRQS])(void) = {
110 , IRQLIST_15(0xe) 106 , IRQLIST_15(0xe)
111#endif 107#endif
112 108
113#endif
114}; 109};
115 110
116#undef IRQ 111#undef IRQ
@@ -128,6 +123,8 @@ void (*interrupt[NR_IRQS])(void) = {
128 123
129DEFINE_SPINLOCK(i8259A_lock); 124DEFINE_SPINLOCK(i8259A_lock);
130 125
126static int i8259A_auto_eoi;
127
131static void end_8259A_irq (unsigned int irq) 128static void end_8259A_irq (unsigned int irq)
132{ 129{
133 if (irq > 256) { 130 if (irq > 256) {
@@ -341,6 +338,8 @@ void init_8259A(int auto_eoi)
341{ 338{
342 unsigned long flags; 339 unsigned long flags;
343 340
341 i8259A_auto_eoi = auto_eoi;
342
344 spin_lock_irqsave(&i8259A_lock, flags); 343 spin_lock_irqsave(&i8259A_lock, flags);
345 344
346 outb(0xff, 0x21); /* mask all of 8259A-1 */ 345 outb(0xff, 0x21); /* mask all of 8259A-1 */
@@ -399,7 +398,7 @@ static void save_ELCR(char *trigger)
399 398
400static int i8259A_resume(struct sys_device *dev) 399static int i8259A_resume(struct sys_device *dev)
401{ 400{
402 init_8259A(0); 401 init_8259A(i8259A_auto_eoi);
403 restore_ELCR(irq_trigger); 402 restore_ELCR(irq_trigger);
404 return 0; 403 return 0;
405} 404}
@@ -453,9 +452,7 @@ void __init init_ISA_irqs (void)
453{ 452{
454 int i; 453 int i;
455 454
456#ifdef CONFIG_X86_LOCAL_APIC
457 init_bsp_APIC(); 455 init_bsp_APIC();
458#endif
459 init_8259A(0); 456 init_8259A(0);
460 457
461 for (i = 0; i < NR_IRQS; i++) { 458 for (i = 0; i < NR_IRQS; i++) {
@@ -581,14 +578,12 @@ void __init init_IRQ(void)
581 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 578 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
582 set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 579 set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
583 580
584#ifdef CONFIG_X86_LOCAL_APIC
585 /* self generated IPI for local APIC timer */ 581 /* self generated IPI for local APIC timer */
586 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 582 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
587 583
588 /* IPI vectors for APIC spurious and error interrupts */ 584 /* IPI vectors for APIC spurious and error interrupts */
589 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 585 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
590 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 586 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
591#endif
592 587
593 /* 588 /*
594 * Set the clock to HZ Hz, we already have a valid 589 * Set the clock to HZ Hz, we already have a valid
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 924a4a332954..0491019d4c8d 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -48,7 +48,7 @@ int sis_apic_bug; /* not actually supported, dummy for compile */
48 48
49static int no_timer_check; 49static int no_timer_check;
50 50
51int disable_timer_pin_1 __initdata; 51static int disable_timer_pin_1 __initdata;
52 52
53int timer_over_8254 __initdata = 0; 53int timer_over_8254 __initdata = 0;
54 54
@@ -111,6 +111,33 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
111 FINAL; \ 111 FINAL; \
112} 112}
113 113
114union entry_union {
115 struct { u32 w1, w2; };
116 struct IO_APIC_route_entry entry;
117};
118
119static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
120{
121 union entry_union eu;
122 unsigned long flags;
123 spin_lock_irqsave(&ioapic_lock, flags);
124 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
125 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
126 spin_unlock_irqrestore(&ioapic_lock, flags);
127 return eu.entry;
128}
129
130static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
131{
132 unsigned long flags;
133 union entry_union eu;
134 eu.entry = e;
135 spin_lock_irqsave(&ioapic_lock, flags);
136 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
137 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
138 spin_unlock_irqrestore(&ioapic_lock, flags);
139}
140
114#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
115static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 142static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
116{ 143{
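
The new ioapic_read_entry()/ioapic_write_entry() helpers centralise what used to be open-coded at every call site: the 64-bit redirection-table entry is transferred as two 32-bit words, with ioapic_lock held across the pair of accesses. A self-contained sketch of the same union layout is below; the field names and the fake reg32[] window are illustrative stand-ins, not the kernel's struct IO_APIC_route_entry, and the locking is omitted:

#include <stdint.h>
#include <stdio.h>

/* A 64-bit route entry viewed either as fields or as two 32-bit words,
 * because the (pretend) hardware window only allows 32-bit accesses. */
struct route_entry {
	uint32_t vector   : 8;
	uint32_t delivery : 3;
	uint32_t mask     : 1;
	uint32_t reserved : 20;
	uint32_t dest;
};

union entry_union {
	struct { uint32_t w1, w2; };
	struct route_entry entry;
};

static uint32_t reg32[2];		/* stand-in for the two register words */

static struct route_entry read_entry(void)
{
	union entry_union eu;

	eu.w1 = reg32[0];
	eu.w2 = reg32[1];
	return eu.entry;
}

static void write_entry(struct route_entry e)
{
	union entry_union eu = { .entry = e };

	reg32[0] = eu.w1;
	reg32[1] = eu.w2;
}

int main(void)
{
	struct route_entry e = { .vector = 0x31, .mask = 1, .dest = 3 };

	write_entry(e);
	printf("masked=%u vector=%#x\n", read_entry().mask, read_entry().vector);
	return 0;
}
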
@@ -196,13 +223,9 @@ static void unmask_IO_APIC_irq (unsigned int irq)
196static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 223static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
197{ 224{
198 struct IO_APIC_route_entry entry; 225 struct IO_APIC_route_entry entry;
199 unsigned long flags;
200 226
201 /* Check delivery_mode to be sure we're not clearing an SMI pin */ 227 /* Check delivery_mode to be sure we're not clearing an SMI pin */
202 spin_lock_irqsave(&ioapic_lock, flags); 228 entry = ioapic_read_entry(apic, pin);
203 *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
204 *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
205 spin_unlock_irqrestore(&ioapic_lock, flags);
206 if (entry.delivery_mode == dest_SMI) 229 if (entry.delivery_mode == dest_SMI)
207 return; 230 return;
208 /* 231 /*
@@ -210,10 +233,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
210 */ 233 */
211 memset(&entry, 0, sizeof(entry)); 234 memset(&entry, 0, sizeof(entry));
212 entry.mask = 1; 235 entry.mask = 1;
213 spin_lock_irqsave(&ioapic_lock, flags); 236 ioapic_write_entry(apic, pin, entry);
214 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
215 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
216 spin_unlock_irqrestore(&ioapic_lock, flags);
217} 237}
218 238
219static void clear_IO_APIC (void) 239static void clear_IO_APIC (void)
@@ -225,14 +245,6 @@ static void clear_IO_APIC (void)
225 clear_IO_APIC_pin(apic, pin); 245 clear_IO_APIC_pin(apic, pin);
226} 246}
227 247
228/*
229 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
230 * specific CPU-side IRQs.
231 */
232
233#define MAX_PIRQS 8
234static int pirq_entries [MAX_PIRQS];
235static int pirqs_enabled;
236int skip_ioapic_setup; 248int skip_ioapic_setup;
237int ioapic_force; 249int ioapic_force;
238 250
@@ -241,18 +253,17 @@ int ioapic_force;
241static int __init disable_ioapic_setup(char *str) 253static int __init disable_ioapic_setup(char *str)
242{ 254{
243 skip_ioapic_setup = 1; 255 skip_ioapic_setup = 1;
244 return 1; 256 return 0;
245} 257}
258early_param("noapic", disable_ioapic_setup);
246 259
247static int __init enable_ioapic_setup(char *str) 260/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
261static int __init disable_timer_pin_setup(char *arg)
248{ 262{
249 ioapic_force = 1; 263 disable_timer_pin_1 = 1;
250 skip_ioapic_setup = 0;
251 return 1; 264 return 1;
252} 265}
253 266__setup("disable_timer_pin_1", disable_timer_pin_setup);
254__setup("noapic", disable_ioapic_setup);
255__setup("apic", enable_ioapic_setup);
256 267
257static int __init setup_disable_8254_timer(char *s) 268static int __init setup_disable_8254_timer(char *s)
258{ 269{
@@ -268,135 +279,6 @@ static int __init setup_enable_8254_timer(char *s)
268__setup("disable_8254_timer", setup_disable_8254_timer); 279__setup("disable_8254_timer", setup_disable_8254_timer);
269__setup("enable_8254_timer", setup_enable_8254_timer); 280__setup("enable_8254_timer", setup_enable_8254_timer);
270 281
271#include <asm/pci-direct.h>
272#include <linux/pci_ids.h>
273#include <linux/pci.h>
274
275
276#ifdef CONFIG_ACPI
277
278static int nvidia_hpet_detected __initdata;
279
280static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
281{
282 nvidia_hpet_detected = 1;
283 return 0;
284}
285#endif
286
287/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
288 off. Check for an Nvidia or VIA PCI bridge and turn it off.
289 Use pci direct infrastructure because this runs before the PCI subsystem.
290
291 Can be overwritten with "apic"
292
293 And another hack to disable the IOMMU on VIA chipsets.
294
295 ... and others. Really should move this somewhere else.
296
297 Kludge-O-Rama. */
298void __init check_ioapic(void)
299{
300 int num,slot,func;
301 /* Poor man's PCI discovery */
302 for (num = 0; num < 32; num++) {
303 for (slot = 0; slot < 32; slot++) {
304 for (func = 0; func < 8; func++) {
305 u32 class;
306 u32 vendor;
307 u8 type;
308 class = read_pci_config(num,slot,func,
309 PCI_CLASS_REVISION);
310 if (class == 0xffffffff)
311 break;
312
313 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
314 continue;
315
316 vendor = read_pci_config(num, slot, func,
317 PCI_VENDOR_ID);
318 vendor &= 0xffff;
319 switch (vendor) {
320 case PCI_VENDOR_ID_VIA:
321#ifdef CONFIG_IOMMU
322 if ((end_pfn > MAX_DMA32_PFN ||
323 force_iommu) &&
324 !iommu_aperture_allowed) {
325 printk(KERN_INFO
326 "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
327 iommu_aperture_disabled = 1;
328 }
329#endif
330 return;
331 case PCI_VENDOR_ID_NVIDIA:
332#ifdef CONFIG_ACPI
333 /*
334 * All timer overrides on Nvidia are
335 * wrong unless HPET is enabled.
336 */
337 nvidia_hpet_detected = 0;
338 acpi_table_parse(ACPI_HPET,
339 nvidia_hpet_check);
340 if (nvidia_hpet_detected == 0) {
341 acpi_skip_timer_override = 1;
342 printk(KERN_INFO "Nvidia board "
343 "detected. Ignoring ACPI "
344 "timer override.\n");
345 }
346#endif
347 /* RED-PEN skip them on mptables too? */
348 return;
349
350 /* This should be actually default, but
351 for 2.6.16 let's do it for ATI only where
352 it's really needed. */
353 case PCI_VENDOR_ID_ATI:
354 if (timer_over_8254 == 1) {
355 timer_over_8254 = 0;
356 printk(KERN_INFO
357 "ATI board detected. Disabling timer routing over 8254.\n");
358 }
359 return;
360 }
361
362
363 /* No multi-function device? */
364 type = read_pci_config_byte(num,slot,func,
365 PCI_HEADER_TYPE);
366 if (!(type & 0x80))
367 break;
368 }
369 }
370 }
371}
372
373static int __init ioapic_pirq_setup(char *str)
374{
375 int i, max;
376 int ints[MAX_PIRQS+1];
377
378 get_options(str, ARRAY_SIZE(ints), ints);
379
380 for (i = 0; i < MAX_PIRQS; i++)
381 pirq_entries[i] = -1;
382
383 pirqs_enabled = 1;
384 apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
385 max = MAX_PIRQS;
386 if (ints[0] < MAX_PIRQS)
387 max = ints[0];
388
389 for (i = 0; i < max; i++) {
390 apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
391 /*
392 * PIRQs are mapped upside down, usually.
393 */
394 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
395 }
396 return 1;
397}
398
399__setup("pirq=", ioapic_pirq_setup);
400 282
401/* 283/*
402 * Find the IRQ entry number of a certain pin. 284 * Find the IRQ entry number of a certain pin.
@@ -425,9 +307,7 @@ static int __init find_isa_irq_pin(int irq, int type)
425 for (i = 0; i < mp_irq_entries; i++) { 307 for (i = 0; i < mp_irq_entries; i++) {
426 int lbus = mp_irqs[i].mpc_srcbus; 308 int lbus = mp_irqs[i].mpc_srcbus;
427 309
428 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 310 if (test_bit(lbus, mp_bus_not_pci) &&
429 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
430 mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
431 (mp_irqs[i].mpc_irqtype == type) && 311 (mp_irqs[i].mpc_irqtype == type) &&
432 (mp_irqs[i].mpc_srcbusirq == irq)) 312 (mp_irqs[i].mpc_srcbusirq == irq))
433 313
@@ -443,9 +323,7 @@ static int __init find_isa_irq_apic(int irq, int type)
443 for (i = 0; i < mp_irq_entries; i++) { 323 for (i = 0; i < mp_irq_entries; i++) {
444 int lbus = mp_irqs[i].mpc_srcbus; 324 int lbus = mp_irqs[i].mpc_srcbus;
445 325
446 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 326 if (test_bit(lbus, mp_bus_not_pci) &&
447 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
448 mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
449 (mp_irqs[i].mpc_irqtype == type) && 327 (mp_irqs[i].mpc_irqtype == type) &&
450 (mp_irqs[i].mpc_srcbusirq == irq)) 328 (mp_irqs[i].mpc_srcbusirq == irq))
451 break; 329 break;
@@ -485,7 +363,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
485 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) 363 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
486 break; 364 break;
487 365
488 if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && 366 if (!test_bit(lbus, mp_bus_not_pci) &&
489 !mp_irqs[i].mpc_irqtype && 367 !mp_irqs[i].mpc_irqtype &&
490 (bus == lbus) && 368 (bus == lbus) &&
491 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { 369 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
@@ -508,27 +386,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
508 return best_guess; 386 return best_guess;
509} 387}
510 388
511/*
512 * EISA Edge/Level control register, ELCR
513 */
514static int EISA_ELCR(unsigned int irq)
515{
516 if (irq < 16) {
517 unsigned int port = 0x4d0 + (irq >> 3);
518 return (inb(port) >> (irq & 7)) & 1;
519 }
520 apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
521 return 0;
522}
523
524/* EISA interrupts are always polarity zero and can be edge or level
525 * trigger depending on the ELCR value. If an interrupt is listed as
526 * EISA conforming in the MP table, that means its trigger type must
527 * be read in from the ELCR */
528
529#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
530#define default_EISA_polarity(idx) (0)
531
532/* ISA interrupts are always polarity zero edge triggered, 389/* ISA interrupts are always polarity zero edge triggered,
533 * when listed as conforming in the MP table. */ 390 * when listed as conforming in the MP table. */
534 391
@@ -541,12 +398,6 @@ static int EISA_ELCR(unsigned int irq)
541#define default_PCI_trigger(idx) (1) 398#define default_PCI_trigger(idx) (1)
542#define default_PCI_polarity(idx) (1) 399#define default_PCI_polarity(idx) (1)
543 400
544/* MCA interrupts are always polarity zero level triggered,
545 * when listed as conforming in the MP table. */
546
547#define default_MCA_trigger(idx) (1)
548#define default_MCA_polarity(idx) (0)
549
550static int __init MPBIOS_polarity(int idx) 401static int __init MPBIOS_polarity(int idx)
551{ 402{
552 int bus = mp_irqs[idx].mpc_srcbus; 403 int bus = mp_irqs[idx].mpc_srcbus;
@@ -558,38 +409,11 @@ static int __init MPBIOS_polarity(int idx)
558 switch (mp_irqs[idx].mpc_irqflag & 3) 409 switch (mp_irqs[idx].mpc_irqflag & 3)
559 { 410 {
560 case 0: /* conforms, ie. bus-type dependent polarity */ 411 case 0: /* conforms, ie. bus-type dependent polarity */
561 { 412 if (test_bit(bus, mp_bus_not_pci))
562 switch (mp_bus_id_to_type[bus]) 413 polarity = default_ISA_polarity(idx);
563 { 414 else
564 case MP_BUS_ISA: /* ISA pin */ 415 polarity = default_PCI_polarity(idx);
565 {
566 polarity = default_ISA_polarity(idx);
567 break;
568 }
569 case MP_BUS_EISA: /* EISA pin */
570 {
571 polarity = default_EISA_polarity(idx);
572 break;
573 }
574 case MP_BUS_PCI: /* PCI pin */
575 {
576 polarity = default_PCI_polarity(idx);
577 break;
578 }
579 case MP_BUS_MCA: /* MCA pin */
580 {
581 polarity = default_MCA_polarity(idx);
582 break;
583 }
584 default:
585 {
586 printk(KERN_WARNING "broken BIOS!!\n");
587 polarity = 1;
588 break;
589 }
590 }
591 break; 416 break;
592 }
593 case 1: /* high active */ 417 case 1: /* high active */
594 { 418 {
595 polarity = 0; 419 polarity = 0;
@@ -627,38 +451,11 @@ static int MPBIOS_trigger(int idx)
627 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) 451 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
628 { 452 {
629 case 0: /* conforms, ie. bus-type dependent */ 453 case 0: /* conforms, ie. bus-type dependent */
630 { 454 if (test_bit(bus, mp_bus_not_pci))
631 switch (mp_bus_id_to_type[bus]) 455 trigger = default_ISA_trigger(idx);
632 { 456 else
633 case MP_BUS_ISA: /* ISA pin */ 457 trigger = default_PCI_trigger(idx);
634 {
635 trigger = default_ISA_trigger(idx);
636 break;
637 }
638 case MP_BUS_EISA: /* EISA pin */
639 {
640 trigger = default_EISA_trigger(idx);
641 break;
642 }
643 case MP_BUS_PCI: /* PCI pin */
644 {
645 trigger = default_PCI_trigger(idx);
646 break;
647 }
648 case MP_BUS_MCA: /* MCA pin */
649 {
650 trigger = default_MCA_trigger(idx);
651 break;
652 }
653 default:
654 {
655 printk(KERN_WARNING "broken BIOS!!\n");
656 trigger = 1;
657 break;
658 }
659 }
660 break; 458 break;
661 }
662 case 1: /* edge */ 459 case 1: /* edge */
663 { 460 {
664 trigger = 0; 461 trigger = 0;
@@ -764,49 +561,17 @@ static int pin_2_irq(int idx, int apic, int pin)
764 if (mp_irqs[idx].mpc_dstirq != pin) 561 if (mp_irqs[idx].mpc_dstirq != pin)
765 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 562 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
766 563
767 switch (mp_bus_id_to_type[bus]) 564 if (test_bit(bus, mp_bus_not_pci)) {
768 { 565 irq = mp_irqs[idx].mpc_srcbusirq;
769 case MP_BUS_ISA: /* ISA pin */ 566 } else {
770 case MP_BUS_EISA: 567 /*
771 case MP_BUS_MCA: 568 * PCI IRQs are mapped in order
772 { 569 */
773 irq = mp_irqs[idx].mpc_srcbusirq; 570 i = irq = 0;
774 break; 571 while (i < apic)
775 } 572 irq += nr_ioapic_registers[i++];
776 case MP_BUS_PCI: /* PCI pin */ 573 irq += pin;
777 { 574 irq = gsi_irq_sharing(irq);
778 /*
779 * PCI IRQs are mapped in order
780 */
781 i = irq = 0;
782 while (i < apic)
783 irq += nr_ioapic_registers[i++];
784 irq += pin;
785 irq = gsi_irq_sharing(irq);
786 break;
787 }
788 default:
789 {
790 printk(KERN_ERR "unknown bus type %d.\n",bus);
791 irq = 0;
792 break;
793 }
794 }
795 BUG_ON(irq >= NR_IRQS);
796
797 /*
798 * PCI IRQ command line redirection. Yes, limits are hardcoded.
799 */
800 if ((pin >= 16) && (pin <= 23)) {
801 if (pirq_entries[pin-16] != -1) {
802 if (!pirq_entries[pin-16]) {
803 apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
804 } else {
805 irq = pirq_entries[pin-16];
806 apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
807 pin-16, irq);
808 }
809 }
810 } 575 }
811 BUG_ON(irq >= NR_IRQS); 576 BUG_ON(irq >= NR_IRQS);
812 return irq; 577 return irq;
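
With the PIRQ command-line override removed, the PCI branch of pin_2_irq() is just the ordered mapping: a pin on I/O APIC n gets the IRQ numbers that follow all the pins of the earlier APICs. A small sketch of that computation; the pin counts are example values and gsi_irq_sharing() is left out:

#include <stdio.h>

/* Global IRQ = sum of the pin counts of all earlier I/O APICs, plus the
 * pin index on this one.  The counts below are made up, not read from
 * hardware. */
static int nr_ioapic_pins[] = { 24, 24, 16 };

static int pin_to_irq(int apic, int pin)
{
	int i, irq = 0;

	for (i = 0; i < apic; i++)
		irq += nr_ioapic_pins[i];
	return irq + pin;
}

int main(void)
{
	printf("apic 1, pin 3 -> IRQ %d\n", pin_to_irq(1, 3));	/* 24 + 3 = 27 */
	return 0;
}
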
@@ -943,9 +708,9 @@ static void __init setup_IO_APIC_irqs(void)
943 if (!apic && (irq < 16)) 708 if (!apic && (irq < 16))
944 disable_8259A_irq(irq); 709 disable_8259A_irq(irq);
945 } 710 }
711 ioapic_write_entry(apic, pin, entry);
712
946 spin_lock_irqsave(&ioapic_lock, flags); 713 spin_lock_irqsave(&ioapic_lock, flags);
947 io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
948 io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
949 set_native_irq_info(irq, TARGET_CPUS); 714 set_native_irq_info(irq, TARGET_CPUS);
950 spin_unlock_irqrestore(&ioapic_lock, flags); 715 spin_unlock_irqrestore(&ioapic_lock, flags);
951 } 716 }
@@ -1083,10 +848,7 @@ void __apicdebuginit print_IO_APIC(void)
1083 for (i = 0; i <= reg_01.bits.entries; i++) { 848 for (i = 0; i <= reg_01.bits.entries; i++) {
1084 struct IO_APIC_route_entry entry; 849 struct IO_APIC_route_entry entry;
1085 850
1086 spin_lock_irqsave(&ioapic_lock, flags); 851 entry = ioapic_read_entry(apic, i);
1087 *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
1088 *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
1089 spin_unlock_irqrestore(&ioapic_lock, flags);
1090 852
1091 printk(KERN_DEBUG " %02x %03X %02X ", 853 printk(KERN_DEBUG " %02x %03X %02X ",
1092 i, 854 i,
@@ -1281,9 +1043,6 @@ static void __init enable_IO_APIC(void)
1281 irq_2_pin[i].pin = -1; 1043 irq_2_pin[i].pin = -1;
1282 irq_2_pin[i].next = 0; 1044 irq_2_pin[i].next = 0;
1283 } 1045 }
1284 if (!pirqs_enabled)
1285 for (i = 0; i < MAX_PIRQS; i++)
1286 pirq_entries[i] = -1;
1287 1046
1288 /* 1047 /*
1289 * The number of IO-APIC IRQ registers (== #pins): 1048 * The number of IO-APIC IRQ registers (== #pins):
@@ -1299,11 +1058,7 @@ static void __init enable_IO_APIC(void)
1299 /* See if any of the pins is in ExtINT mode */ 1058 /* See if any of the pins is in ExtINT mode */
1300 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1059 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1301 struct IO_APIC_route_entry entry; 1060 struct IO_APIC_route_entry entry;
1302 spin_lock_irqsave(&ioapic_lock, flags); 1061 entry = ioapic_read_entry(apic, pin);
1303 *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
1304 *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
1305 spin_unlock_irqrestore(&ioapic_lock, flags);
1306
1307 1062
1308 /* If the interrupt line is enabled and in ExtInt mode 1063 /* If the interrupt line is enabled and in ExtInt mode
1309 * I have found the pin where the i8259 is connected. 1064 * I have found the pin where the i8259 is connected.
@@ -1355,7 +1110,6 @@ void disable_IO_APIC(void)
1355 */ 1110 */
1356 if (ioapic_i8259.pin != -1) { 1111 if (ioapic_i8259.pin != -1) {
1357 struct IO_APIC_route_entry entry; 1112 struct IO_APIC_route_entry entry;
1358 unsigned long flags;
1359 1113
1360 memset(&entry, 0, sizeof(entry)); 1114 memset(&entry, 0, sizeof(entry));
1361 entry.mask = 0; /* Enabled */ 1115 entry.mask = 0; /* Enabled */
@@ -1372,84 +1126,13 @@ void disable_IO_APIC(void)
1372 /* 1126 /*
1373 * Add it to the IO-APIC irq-routing table: 1127 * Add it to the IO-APIC irq-routing table:
1374 */ 1128 */
1375 spin_lock_irqsave(&ioapic_lock, flags); 1129 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1376 io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
1377 *(((int *)&entry)+1));
1378 io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
1379 *(((int *)&entry)+0));
1380 spin_unlock_irqrestore(&ioapic_lock, flags);
1381 } 1130 }
1382 1131
1383 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 1132 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1384} 1133}
1385 1134
1386/* 1135/*
1387 * function to set the IO-APIC physical IDs based on the
1388 * values stored in the MPC table.
1389 *
1390 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1391 */
1392
1393static void __init setup_ioapic_ids_from_mpc (void)
1394{
1395 union IO_APIC_reg_00 reg_00;
1396 int apic;
1397 int i;
1398 unsigned char old_id;
1399 unsigned long flags;
1400
1401 /*
1402 * Set the IOAPIC ID to the value stored in the MPC table.
1403 */
1404 for (apic = 0; apic < nr_ioapics; apic++) {
1405
1406 /* Read the register 0 value */
1407 spin_lock_irqsave(&ioapic_lock, flags);
1408 reg_00.raw = io_apic_read(apic, 0);
1409 spin_unlock_irqrestore(&ioapic_lock, flags);
1410
1411 old_id = mp_ioapics[apic].mpc_apicid;
1412
1413
1414 printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
1415
1416
1417 /*
1418 * We need to adjust the IRQ routing table
1419 * if the ID changed.
1420 */
1421 if (old_id != mp_ioapics[apic].mpc_apicid)
1422 for (i = 0; i < mp_irq_entries; i++)
1423 if (mp_irqs[i].mpc_dstapic == old_id)
1424 mp_irqs[i].mpc_dstapic
1425 = mp_ioapics[apic].mpc_apicid;
1426
1427 /*
1428 * Read the right value from the MPC table and
1429 * write it into the ID register.
1430 */
1431 apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
1432 mp_ioapics[apic].mpc_apicid);
1433
1434 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
1435 spin_lock_irqsave(&ioapic_lock, flags);
1436 io_apic_write(apic, 0, reg_00.raw);
1437 spin_unlock_irqrestore(&ioapic_lock, flags);
1438
1439 /*
1440 * Sanity check
1441 */
1442 spin_lock_irqsave(&ioapic_lock, flags);
1443 reg_00.raw = io_apic_read(apic, 0);
1444 spin_unlock_irqrestore(&ioapic_lock, flags);
1445 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
1446 printk("could not set ID!\n");
1447 else
1448 apic_printk(APIC_VERBOSE," ok.\n");
1449 }
1450}
1451
1452/*
1453 * There is a nasty bug in some older SMP boards, their mptable lies 1136 * There is a nasty bug in some older SMP boards, their mptable lies
1454 * about the timer IRQ. We do the following to work around the situation: 1137 * about the timer IRQ. We do the following to work around the situation:
1455 * 1138 *
@@ -1964,11 +1647,6 @@ void __init setup_IO_APIC(void)
1964 1647
1965 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 1648 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1966 1649
1967 /*
1968 * Set up the IO-APIC IRQ routing table.
1969 */
1970 if (!acpi_ioapic)
1971 setup_ioapic_ids_from_mpc();
1972 sync_Arb_IDs(); 1650 sync_Arb_IDs();
1973 setup_IO_APIC_irqs(); 1651 setup_IO_APIC_irqs();
1974 init_IO_APIC_traps(); 1652 init_IO_APIC_traps();
@@ -1987,17 +1665,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
1987{ 1665{
1988 struct IO_APIC_route_entry *entry; 1666 struct IO_APIC_route_entry *entry;
1989 struct sysfs_ioapic_data *data; 1667 struct sysfs_ioapic_data *data;
1990 unsigned long flags;
1991 int i; 1668 int i;
1992 1669
1993 data = container_of(dev, struct sysfs_ioapic_data, dev); 1670 data = container_of(dev, struct sysfs_ioapic_data, dev);
1994 entry = data->entry; 1671 entry = data->entry;
1995 spin_lock_irqsave(&ioapic_lock, flags); 1672 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
1996 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { 1673 *entry = ioapic_read_entry(dev->id, i);
1997 *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
1998 *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
1999 }
2000 spin_unlock_irqrestore(&ioapic_lock, flags);
2001 1674
2002 return 0; 1675 return 0;
2003} 1676}
@@ -2019,11 +1692,9 @@ static int ioapic_resume(struct sys_device *dev)
2019 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; 1692 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
2020 io_apic_write(dev->id, 0, reg_00.raw); 1693 io_apic_write(dev->id, 0, reg_00.raw);
2021 } 1694 }
2022 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2023 io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
2024 io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
2025 }
2026 spin_unlock_irqrestore(&ioapic_lock, flags); 1695 spin_unlock_irqrestore(&ioapic_lock, flags);
1696 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
1697 ioapic_write_entry(dev->id, i, entry[i]);
2027 1698
2028 return 0; 1699 return 0;
2029} 1700}
@@ -2077,19 +1748,6 @@ device_initcall(ioapic_init_sysfs);
2077 1748
2078#define IO_APIC_MAX_ID 0xFE 1749#define IO_APIC_MAX_ID 0xFE
2079 1750
2080int __init io_apic_get_version (int ioapic)
2081{
2082 union IO_APIC_reg_01 reg_01;
2083 unsigned long flags;
2084
2085 spin_lock_irqsave(&ioapic_lock, flags);
2086 reg_01.raw = io_apic_read(ioapic, 1);
2087 spin_unlock_irqrestore(&ioapic_lock, flags);
2088
2089 return reg_01.bits.version;
2090}
2091
2092
2093int __init io_apic_get_redir_entries (int ioapic) 1751int __init io_apic_get_redir_entries (int ioapic)
2094{ 1752{
2095 union IO_APIC_reg_01 reg_01; 1753 union IO_APIC_reg_01 reg_01;
@@ -2148,10 +1806,10 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
2148 if (!ioapic && (irq < 16)) 1806 if (!ioapic && (irq < 16))
2149 disable_8259A_irq(irq); 1807 disable_8259A_irq(irq);
2150 1808
1809 ioapic_write_entry(ioapic, pin, entry);
1810
2151 spin_lock_irqsave(&ioapic_lock, flags); 1811 spin_lock_irqsave(&ioapic_lock, flags);
2152 io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); 1812 set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
2153 io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
2154 set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
2155 spin_unlock_irqrestore(&ioapic_lock, flags); 1813 spin_unlock_irqrestore(&ioapic_lock, flags);
2156 1814
2157 return 0; 1815 return 0;
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index b81614970ecc..fe063d3cfe42 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -56,6 +56,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
56 56
57 memset(bitmap, 0xff, IO_BITMAP_BYTES); 57 memset(bitmap, 0xff, IO_BITMAP_BYTES);
58 t->io_bitmap_ptr = bitmap; 58 t->io_bitmap_ptr = bitmap;
59 set_thread_flag(TIF_IO_BITMAP);
59 } 60 }
60 61
61 /* 62 /*
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 5221a53e90c1..b3677e6ccc6e 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -20,11 +20,6 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21 21
22atomic_t irq_err_count; 22atomic_t irq_err_count;
23#ifdef CONFIG_X86_IO_APIC
24#ifdef APIC_MISMATCH_DEBUG
25atomic_t irq_mis_count;
26#endif
27#endif
28 23
29#ifdef CONFIG_DEBUG_STACKOVERFLOW 24#ifdef CONFIG_DEBUG_STACKOVERFLOW
30/* 25/*
@@ -92,18 +87,11 @@ skip:
92 for_each_online_cpu(j) 87 for_each_online_cpu(j)
93 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); 88 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
94 seq_putc(p, '\n'); 89 seq_putc(p, '\n');
95#ifdef CONFIG_X86_LOCAL_APIC
96 seq_printf(p, "LOC: "); 90 seq_printf(p, "LOC: ");
97 for_each_online_cpu(j) 91 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); 92 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
99 seq_putc(p, '\n'); 93 seq_putc(p, '\n');
100#endif
101 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); 94 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
102#ifdef CONFIG_X86_IO_APIC
103#ifdef APIC_MISMATCH_DEBUG
104 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
105#endif
106#endif
107 } 95 }
108 return 0; 96 return 0;
109} 97}
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 106076b370fc..0497e3bd5bff 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -15,6 +15,15 @@
15#include <asm/mmu_context.h> 15#include <asm/mmu_context.h>
16#include <asm/io.h> 16#include <asm/io.h>
17 17
18#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
19static u64 kexec_pgd[512] PAGE_ALIGNED;
20static u64 kexec_pud0[512] PAGE_ALIGNED;
21static u64 kexec_pmd0[512] PAGE_ALIGNED;
22static u64 kexec_pte0[512] PAGE_ALIGNED;
23static u64 kexec_pud1[512] PAGE_ALIGNED;
24static u64 kexec_pmd1[512] PAGE_ALIGNED;
25static u64 kexec_pte1[512] PAGE_ALIGNED;
26
18static void init_level2_page(pmd_t *level2p, unsigned long addr) 27static void init_level2_page(pmd_t *level2p, unsigned long addr)
19{ 28{
20 unsigned long end_addr; 29 unsigned long end_addr;
@@ -144,32 +153,19 @@ static void load_segments(void)
144 ); 153 );
145} 154}
146 155
147typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
148 unsigned long control_code_buffer,
149 unsigned long start_address,
150 unsigned long pgtable) ATTRIB_NORET;
151
152extern const unsigned char relocate_new_kernel[];
153extern const unsigned long relocate_new_kernel_size;
154
155int machine_kexec_prepare(struct kimage *image) 156int machine_kexec_prepare(struct kimage *image)
156{ 157{
157 unsigned long start_pgtable, control_code_buffer; 158 unsigned long start_pgtable;
158 int result; 159 int result;
159 160
160 /* Calculate the offsets */ 161 /* Calculate the offsets */
161 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; 162 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
162 control_code_buffer = start_pgtable + PAGE_SIZE;
163 163
164 /* Setup the identity mapped 64bit page table */ 164 /* Setup the identity mapped 64bit page table */
165 result = init_pgtable(image, start_pgtable); 165 result = init_pgtable(image, start_pgtable);
166 if (result) 166 if (result)
167 return result; 167 return result;
168 168
169 /* Place the code in the reboot code buffer */
170 memcpy(__va(control_code_buffer), relocate_new_kernel,
171 relocate_new_kernel_size);
172
173 return 0; 169 return 0;
174} 170}
175 171
@@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage *image)
184 */ 180 */
185NORET_TYPE void machine_kexec(struct kimage *image) 181NORET_TYPE void machine_kexec(struct kimage *image)
186{ 182{
187 unsigned long page_list; 183 unsigned long page_list[PAGES_NR];
188 unsigned long control_code_buffer; 184 void *control_page;
189 unsigned long start_pgtable;
190 relocate_new_kernel_t rnk;
191 185
192 /* Interrupts aren't acceptable while we reboot */ 186 /* Interrupts aren't acceptable while we reboot */
193 local_irq_disable(); 187 local_irq_disable();
194 188
195 /* Calculate the offsets */ 189 control_page = page_address(image->control_code_page) + PAGE_SIZE;
196 page_list = image->head; 190 memcpy(control_page, relocate_kernel, PAGE_SIZE);
197 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; 191
198 control_code_buffer = start_pgtable + PAGE_SIZE; 192 page_list[PA_CONTROL_PAGE] = __pa(control_page);
199 193 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
200 /* Set the low half of the page table to my identity mapped 194 page_list[PA_PGD] = __pa(kexec_pgd);
201 * page table for kexec. Leave the high half pointing at the 195 page_list[VA_PGD] = (unsigned long)kexec_pgd;
202 * kernel pages. Don't bother to flush the global pages 196 page_list[PA_PUD_0] = __pa(kexec_pud0);
203 * as that will happen when I fully switch to my identity mapped 197 page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
204 * page table anyway. 198 page_list[PA_PMD_0] = __pa(kexec_pmd0);
205 */ 199 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
206 memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); 200 page_list[PA_PTE_0] = __pa(kexec_pte0);
207 __flush_tlb(); 201 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
208 202 page_list[PA_PUD_1] = __pa(kexec_pud1);
203 page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
204 page_list[PA_PMD_1] = __pa(kexec_pmd1);
205 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
206 page_list[PA_PTE_1] = __pa(kexec_pte1);
207 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
208
209 page_list[PA_TABLE_PAGE] =
210 (unsigned long)__pa(page_address(image->control_code_page));
209 211
210 /* The segment registers are funny things, they have both a 212 /* The segment registers are funny things, they have both a
211 * visible and an invisible part. Whenever the visible part is 213 * visible and an invisible part. Whenever the visible part is
@@ -222,7 +224,36 @@ NORET_TYPE void machine_kexec(struct kimage *image)
222 */ 224 */
223 set_gdt(phys_to_virt(0),0); 225 set_gdt(phys_to_virt(0),0);
224 set_idt(phys_to_virt(0),0); 226 set_idt(phys_to_virt(0),0);
227
225 /* now call it */ 228 /* now call it */
226 rnk = (relocate_new_kernel_t) control_code_buffer; 229 relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
227 (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); 230 image->start);
228} 231}
232
233/* crashkernel=size@addr specifies the location to reserve for
234 * a crash kernel. By reserving this memory we guarantee
235 * that linux never set's it up as a DMA target.
236 * Useful for holding code to do something appropriate
237 * after a kernel panic.
238 */
239static int __init setup_crashkernel(char *arg)
240{
241 unsigned long size, base;
242 char *p;
243 if (!arg)
244 return -EINVAL;
245 size = memparse(arg, &p);
246 if (arg == p)
247 return -EINVAL;
248 if (*p == '@') {
249 base = memparse(p+1, &p);
250 /* FIXME: Do I want a sanity check to validate the
251 * memory range? Yes you do, but it's too early for
252 * e820 -AK */
253 crashk_res.start = base;
254 crashk_res.end = base + size - 1;
255 }
256 return 0;
257}
258early_param("crashkernel", setup_crashkernel);
259
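
setup_crashkernel() above parses a crashkernel=size@addr argument with memparse(), which accepts the usual K/M/G suffixes. A userspace equivalent of that parsing, where parse_size() plays the role of memparse() and every name is illustrative:

#include <stdio.h>
#include <stdlib.h>

/* Parse a number with an optional K/M/G suffix, like memparse(). */
static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10; (*end)++; break;
	}
	return v;
}

/* "size@addr": size is mandatory, the base address is optional. */
static int parse_crashkernel(const char *arg,
			     unsigned long long *size, unsigned long long *base)
{
	char *p;

	*size = parse_size(arg, &p);
	if (p == arg)
		return -1;		/* no size given */
	if (*p == '@')
		*base = parse_size(p + 1, &p);
	return 0;
}

int main(void)
{
	unsigned long long size, base = 0;

	if (parse_crashkernel("64M@16M", &size, &base) == 0)
		printf("reserve %llu bytes at %#llx\n", size, base);
	return 0;
}
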
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 4e017fb30fb3..bbea88801d88 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -182,7 +182,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
182 goto out2; 182 goto out2;
183 183
184 memset(&m, 0, sizeof(struct mce)); 184 memset(&m, 0, sizeof(struct mce));
185 m.cpu = safe_smp_processor_id(); 185 m.cpu = smp_processor_id();
186 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 186 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
187 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 187 if (!(m.mcgstatus & MCG_STATUS_RIPV))
188 kill_it = 1; 188 kill_it = 1;
@@ -274,6 +274,33 @@ void do_machine_check(struct pt_regs * regs, long error_code)
274 atomic_dec(&mce_entry); 274 atomic_dec(&mce_entry);
275} 275}
276 276
277#ifdef CONFIG_X86_MCE_INTEL
278/***
279 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
280 * @cpu: The CPU on which the event occured.
281 * @status: Event status information
282 *
283 * This function should be called by the thermal interrupt after the
284 * event has been processed and the decision was made to log the event
285 * further.
286 *
287 * The status parameter will be saved to the 'status' field of 'struct mce'
288 * and historically has been the register value of the
289 * MSR_IA32_THERMAL_STATUS (Intel) msr.
290 */
291void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
292{
293 struct mce m;
294
295 memset(&m, 0, sizeof(m));
296 m.cpu = cpu;
297 m.bank = MCE_THERMAL_BANK;
298 m.status = status;
299 rdtscll(m.tsc);
300 mce_log(&m);
301}
302#endif /* CONFIG_X86_MCE_INTEL */
303
277/* 304/*
278 * Periodic polling timer for "silent" machine check errors. 305 * Periodic polling timer for "silent" machine check errors.
279 */ 306 */
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 8f533d2c40cb..6551505d8a2c 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -11,36 +11,21 @@
11#include <asm/mce.h> 11#include <asm/mce.h>
12#include <asm/hw_irq.h> 12#include <asm/hw_irq.h>
13#include <asm/idle.h> 13#include <asm/idle.h>
14 14#include <asm/therm_throt.h>
15static DEFINE_PER_CPU(unsigned long, next_check);
16 15
17asmlinkage void smp_thermal_interrupt(void) 16asmlinkage void smp_thermal_interrupt(void)
18{ 17{
19 struct mce m; 18 __u64 msr_val;
20 19
21 ack_APIC_irq(); 20 ack_APIC_irq();
22 21
23 exit_idle(); 22 exit_idle();
24 irq_enter(); 23 irq_enter();
25 if (time_before(jiffies, __get_cpu_var(next_check)))
26 goto done;
27
28 __get_cpu_var(next_check) = jiffies + HZ*300;
29 memset(&m, 0, sizeof(m));
30 m.cpu = smp_processor_id();
31 m.bank = MCE_THERMAL_BANK;
32 rdtscll(m.tsc);
33 rdmsrl(MSR_IA32_THERM_STATUS, m.status);
34 if (m.status & 0x1) {
35 printk(KERN_EMERG
36 "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
37 add_taint(TAINT_MACHINE_CHECK);
38 } else {
39 printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
40 }
41 24
42 mce_log(&m); 25 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
43done: 26 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val);
28
44 irq_exit(); 29 irq_exit();
45} 30}
46 31
@@ -92,6 +77,9 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
92 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 77 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
93 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 78 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
94 cpu, tm2 ? "TM2" : "TM1"); 79 cpu, tm2 ? "TM2" : "TM1");
80
81 /* enable thermal throttle processing */
82 atomic_set(&therm_throt_en, 1);
95 return; 83 return;
96} 84}
97 85
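
In the new smp_thermal_interrupt() the policy decision moves out of the interrupt handler: it only reads MSR_IA32_THERM_STATUS and lets therm_throt_process() decide whether the event is worth passing to mce_log_therm_throt_event(). The sketch below is only a guess at what such a tracker might do (report on a state change, otherwise rate-limit); it illustrates the call site's contract, not the kernel's therm_throt implementation:

#include <stdio.h>
#include <time.h>

static int last_state;
static time_t last_report;

/* Return nonzero when the caller should log the event. */
static int throttle_process(int curr_state)
{
	time_t now = time(NULL);

	if (curr_state == last_state && now - last_report < 300)
		return 0;		/* nothing new to say */
	last_state = curr_state;
	last_report = now;
	return 1;
}

int main(void)
{
	printf("log? %d\n", throttle_process(1));	/* first event: log it */
	printf("log? %d\n", throttle_process(1));	/* same state, too soon: skip */
	return 0;
}
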
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index a1ab4197f8a1..20e88f4b564b 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -41,8 +41,7 @@ int acpi_found_madt;
41 * Various Linux-internal data structures created from the 41 * Various Linux-internal data structures created from the
42 * MP-table. 42 * MP-table.
43 */ 43 */
44unsigned char apic_version [MAX_APICS]; 44DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
45unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
46int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; 45int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
47 46
48static int mp_current_pci_id = 0; 47static int mp_current_pci_id = 0;
@@ -56,7 +55,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
56int mp_irq_entries; 55int mp_irq_entries;
57 56
58int nr_ioapics; 57int nr_ioapics;
59int pic_mode;
60unsigned long mp_lapic_addr = 0; 58unsigned long mp_lapic_addr = 0;
61 59
62 60
@@ -71,19 +69,6 @@ unsigned disabled_cpus __initdata;
71/* Bitmask of physically existing CPUs */ 69/* Bitmask of physically existing CPUs */
72physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; 70physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
73 71
74/* ACPI MADT entry parsing functions */
75#ifdef CONFIG_ACPI
76extern struct acpi_boot_flags acpi_boot;
77#ifdef CONFIG_X86_LOCAL_APIC
78extern int acpi_parse_lapic (acpi_table_entry_header *header);
79extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
80extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
81#endif /*CONFIG_X86_LOCAL_APIC*/
82#ifdef CONFIG_X86_IO_APIC
83extern int acpi_parse_ioapic (acpi_table_entry_header *header);
84#endif /*CONFIG_X86_IO_APIC*/
85#endif /*CONFIG_ACPI*/
86
87u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; 72u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
88 73
89 74
@@ -108,24 +93,20 @@ static int __init mpf_checksum(unsigned char *mp, int len)
108static void __cpuinit MP_processor_info (struct mpc_config_processor *m) 93static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
109{ 94{
110 int cpu; 95 int cpu;
111 unsigned char ver;
112 cpumask_t tmp_map; 96 cpumask_t tmp_map;
97 char *bootup_cpu = "";
113 98
114 if (!(m->mpc_cpuflag & CPU_ENABLED)) { 99 if (!(m->mpc_cpuflag & CPU_ENABLED)) {
115 disabled_cpus++; 100 disabled_cpus++;
116 return; 101 return;
117 } 102 }
118
119 printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
120 m->mpc_apicid,
121 (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
122 (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
123 m->mpc_apicver);
124
125 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 103 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
126 Dprintk(" Bootup CPU\n"); 104 bootup_cpu = " (Bootup-CPU)";
127 boot_cpu_id = m->mpc_apicid; 105 boot_cpu_id = m->mpc_apicid;
128 } 106 }
107
108 printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
109
129 if (num_processors >= NR_CPUS) { 110 if (num_processors >= NR_CPUS) {
130 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 111 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
131 " Processor ignored.\n", NR_CPUS); 112 " Processor ignored.\n", NR_CPUS);
@@ -136,24 +117,7 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
136 cpus_complement(tmp_map, cpu_present_map); 117 cpus_complement(tmp_map, cpu_present_map);
137 cpu = first_cpu(tmp_map); 118 cpu = first_cpu(tmp_map);
138 119
139#if MAX_APICS < 255
140 if ((int)m->mpc_apicid > MAX_APICS) {
141 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
142 m->mpc_apicid, MAX_APICS);
143 return;
144 }
145#endif
146 ver = m->mpc_apicver;
147
148 physid_set(m->mpc_apicid, phys_cpu_present_map); 120 physid_set(m->mpc_apicid, phys_cpu_present_map);
149 /*
150 * Validate version
151 */
152 if (ver == 0x0) {
153 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
154 ver = 0x10;
155 }
156 apic_version[m->mpc_apicid] = ver;
157 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 121 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
158 /* 122 /*
159 * bios_cpu_apicid is required to have processors listed 123 * bios_cpu_apicid is required to have processors listed
@@ -178,15 +142,11 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
178 Dprintk("Bus #%d is %s\n", m->mpc_busid, str); 142 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
179 143
180 if (strncmp(str, "ISA", 3) == 0) { 144 if (strncmp(str, "ISA", 3) == 0) {
181 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 145 set_bit(m->mpc_busid, mp_bus_not_pci);
182 } else if (strncmp(str, "EISA", 4) == 0) {
183 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
184 } else if (strncmp(str, "PCI", 3) == 0) { 146 } else if (strncmp(str, "PCI", 3) == 0) {
185 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 147 clear_bit(m->mpc_busid, mp_bus_not_pci);
186 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; 148 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
187 mp_current_pci_id++; 149 mp_current_pci_id++;
188 } else if (strncmp(str, "MCA", 3) == 0) {
189 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
190 } else { 150 } else {
191 printk(KERN_ERR "Unknown bustype %s\n", str); 151 printk(KERN_ERR "Unknown bustype %s\n", str);
192 } 152 }
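
MP_bus_info() now collapses the old per-bus type array into the single mp_bus_not_pci bitmap: ISA (and formerly EISA/MCA) buses set their bit, PCI buses clear it, and every consumer simply tests that one bit. A standalone sketch of the same bookkeeping, where MAX_BUSSES and the bit helpers stand in for the kernel's DECLARE_BITMAP()/set_bit()/clear_bit()/test_bit():

#include <limits.h>
#include <stdio.h>
#include <string.h>

#define MAX_BUSSES	256
#define BITS_PER_LONG	(CHAR_BIT * sizeof(long))

static unsigned long bus_not_pci[MAX_BUSSES / BITS_PER_LONG];

static void set_bus_bit(int bus)   { bus_not_pci[bus / BITS_PER_LONG] |=  1UL << (bus % BITS_PER_LONG); }
static void clear_bus_bit(int bus) { bus_not_pci[bus / BITS_PER_LONG] &= ~(1UL << (bus % BITS_PER_LONG)); }
static int  test_bus_bit(int bus)  { return (bus_not_pci[bus / BITS_PER_LONG] >> (bus % BITS_PER_LONG)) & 1; }

static void bus_info(int busid, const char *type)
{
	if (strncmp(type, "PCI", 3) == 0)
		clear_bus_bit(busid);
	else				/* ISA, EISA, MCA, ... all just mean "not PCI" */
		set_bus_bit(busid);
}

int main(void)
{
	bus_info(0, "PCI");
	bus_info(1, "ISA");
	printf("bus 0 not-pci=%d, bus 1 not-pci=%d\n",
	       test_bus_bit(0), test_bus_bit(1));
	return 0;
}
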
@@ -197,8 +157,8 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
197 if (!(m->mpc_flags & MPC_APIC_USABLE)) 157 if (!(m->mpc_flags & MPC_APIC_USABLE))
198 return; 158 return;
199 159
200 printk("I/O APIC #%d Version %d at 0x%X.\n", 160 printk("I/O APIC #%d at 0x%X.\n",
201 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); 161 m->mpc_apicid, m->mpc_apicaddr);
202 if (nr_ioapics >= MAX_IO_APICS) { 162 if (nr_ioapics >= MAX_IO_APICS) {
203 printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", 163 printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
204 MAX_IO_APICS, nr_ioapics); 164 MAX_IO_APICS, nr_ioapics);
@@ -232,19 +192,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
232 m->mpc_irqtype, m->mpc_irqflag & 3, 192 m->mpc_irqtype, m->mpc_irqflag & 3,
233 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, 193 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
234 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 194 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
235 /*
236 * Well it seems all SMP boards in existence
237 * use ExtINT/LVT1 == LINT0 and
238 * NMI/LVT2 == LINT1 - the following check
239 * will show us if this assumptions is false.
240 * Until then we do not have to add baggage.
241 */
242 if ((m->mpc_irqtype == mp_ExtINT) &&
243 (m->mpc_destapiclint != 0))
244 BUG();
245 if ((m->mpc_irqtype == mp_NMI) &&
246 (m->mpc_destapiclint != 1))
247 BUG();
248} 195}
249 196
250/* 197/*
@@ -258,7 +205,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
258 unsigned char *mpt=((unsigned char *)mpc)+count; 205 unsigned char *mpt=((unsigned char *)mpc)+count;
259 206
260 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { 207 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
261 printk("SMP mptable: bad signature [%c%c%c%c]!\n", 208 printk("MPTABLE: bad signature [%c%c%c%c]!\n",
262 mpc->mpc_signature[0], 209 mpc->mpc_signature[0],
263 mpc->mpc_signature[1], 210 mpc->mpc_signature[1],
264 mpc->mpc_signature[2], 211 mpc->mpc_signature[2],
@@ -266,31 +213,31 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
266 return 0; 213 return 0;
267 } 214 }
268 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { 215 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
269 printk("SMP mptable: checksum error!\n"); 216 printk("MPTABLE: checksum error!\n");
270 return 0; 217 return 0;
271 } 218 }
272 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { 219 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
273 printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", 220 printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
274 mpc->mpc_spec); 221 mpc->mpc_spec);
275 return 0; 222 return 0;
276 } 223 }
277 if (!mpc->mpc_lapic) { 224 if (!mpc->mpc_lapic) {
278 printk(KERN_ERR "SMP mptable: null local APIC address!\n"); 225 printk(KERN_ERR "MPTABLE: null local APIC address!\n");
279 return 0; 226 return 0;
280 } 227 }
281 memcpy(str,mpc->mpc_oem,8); 228 memcpy(str,mpc->mpc_oem,8);
282 str[8]=0; 229 str[8] = 0;
283 printk(KERN_INFO "OEM ID: %s ",str); 230 printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
284 231
285 memcpy(str,mpc->mpc_productid,12); 232 memcpy(str,mpc->mpc_productid,12);
286 str[12]=0; 233 str[12] = 0;
287 printk("Product ID: %s ",str); 234 printk("MPTABLE: Product ID: %s ",str);
288 235
289 printk("APIC at: 0x%X\n",mpc->mpc_lapic); 236 printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
290 237
291 /* save the local APIC address, it might be non-default */ 238 /* save the local APIC address, it might be non-default */
292 if (!acpi_lapic) 239 if (!acpi_lapic)
293 mp_lapic_addr = mpc->mpc_lapic; 240 mp_lapic_addr = mpc->mpc_lapic;
294 241
295 /* 242 /*
296 * Now process the configuration blocks. 243 * Now process the configuration blocks.
@@ -302,7 +249,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
302 struct mpc_config_processor *m= 249 struct mpc_config_processor *m=
303 (struct mpc_config_processor *)mpt; 250 (struct mpc_config_processor *)mpt;
304 if (!acpi_lapic) 251 if (!acpi_lapic)
305 MP_processor_info(m); 252 MP_processor_info(m);
306 mpt += sizeof(*m); 253 mpt += sizeof(*m);
307 count += sizeof(*m); 254 count += sizeof(*m);
308 break; 255 break;
@@ -321,8 +268,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
321 struct mpc_config_ioapic *m= 268 struct mpc_config_ioapic *m=
322 (struct mpc_config_ioapic *)mpt; 269 (struct mpc_config_ioapic *)mpt;
323 MP_ioapic_info(m); 270 MP_ioapic_info(m);
324 mpt+=sizeof(*m); 271 mpt += sizeof(*m);
325 count+=sizeof(*m); 272 count += sizeof(*m);
326 break; 273 break;
327 } 274 }
328 case MP_INTSRC: 275 case MP_INTSRC:
@@ -331,8 +278,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
331 (struct mpc_config_intsrc *)mpt; 278 (struct mpc_config_intsrc *)mpt;
332 279
333 MP_intsrc_info(m); 280 MP_intsrc_info(m);
334 mpt+=sizeof(*m); 281 mpt += sizeof(*m);
335 count+=sizeof(*m); 282 count += sizeof(*m);
336 break; 283 break;
337 } 284 }
338 case MP_LINTSRC: 285 case MP_LINTSRC:
@@ -340,15 +287,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
340 struct mpc_config_lintsrc *m= 287 struct mpc_config_lintsrc *m=
341 (struct mpc_config_lintsrc *)mpt; 288 (struct mpc_config_lintsrc *)mpt;
342 MP_lintsrc_info(m); 289 MP_lintsrc_info(m);
343 mpt+=sizeof(*m); 290 mpt += sizeof(*m);
344 count+=sizeof(*m); 291 count += sizeof(*m);
345 break; 292 break;
346 } 293 }
347 } 294 }
348 } 295 }
349 clustered_apic_check(); 296 clustered_apic_check();
350 if (!num_processors) 297 if (!num_processors)
351 printk(KERN_ERR "SMP mptable: no processors registered!\n"); 298 printk(KERN_ERR "MPTABLE: no processors registered!\n");
352 return num_processors; 299 return num_processors;
353} 300}
354 301
@@ -444,13 +391,10 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
444 * 2 CPUs, numbered 0 & 1. 391 * 2 CPUs, numbered 0 & 1.
445 */ 392 */
446 processor.mpc_type = MP_PROCESSOR; 393 processor.mpc_type = MP_PROCESSOR;
447 /* Either an integrated APIC or a discrete 82489DX. */ 394 processor.mpc_apicver = 0;
448 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
449 processor.mpc_cpuflag = CPU_ENABLED; 395 processor.mpc_cpuflag = CPU_ENABLED;
450 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 396 processor.mpc_cpufeature = 0;
451 (boot_cpu_data.x86_model << 4) | 397 processor.mpc_featureflag = 0;
452 boot_cpu_data.x86_mask;
453 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
454 processor.mpc_reserved[0] = 0; 398 processor.mpc_reserved[0] = 0;
455 processor.mpc_reserved[1] = 0; 399 processor.mpc_reserved[1] = 0;
456 for (i = 0; i < 2; i++) { 400 for (i = 0; i < 2; i++) {
@@ -469,14 +413,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
469 case 5: 413 case 5:
470 memcpy(bus.mpc_bustype, "ISA ", 6); 414 memcpy(bus.mpc_bustype, "ISA ", 6);
471 break; 415 break;
472 case 2:
473 case 6:
474 case 3:
475 memcpy(bus.mpc_bustype, "EISA ", 6);
476 break;
477 case 4:
478 case 7:
479 memcpy(bus.mpc_bustype, "MCA ", 6);
480 } 416 }
481 MP_bus_info(&bus); 417 MP_bus_info(&bus);
482 if (mpc_default_type > 4) { 418 if (mpc_default_type > 4) {
@@ -487,7 +423,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
487 423
488 ioapic.mpc_type = MP_IOAPIC; 424 ioapic.mpc_type = MP_IOAPIC;
489 ioapic.mpc_apicid = 2; 425 ioapic.mpc_apicid = 2;
490 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; 426 ioapic.mpc_apicver = 0;
491 ioapic.mpc_flags = MPC_APIC_USABLE; 427 ioapic.mpc_flags = MPC_APIC_USABLE;
492 ioapic.mpc_apicaddr = 0xFEC00000; 428 ioapic.mpc_apicaddr = 0xFEC00000;
493 MP_ioapic_info(&ioapic); 429 MP_ioapic_info(&ioapic);
@@ -530,13 +466,6 @@ void __init get_smp_config (void)
530 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); 466 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
531 467
532 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); 468 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
533 if (mpf->mpf_feature2 & (1<<7)) {
534 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
535 pic_mode = 1;
536 } else {
537 printk(KERN_INFO " Virtual Wire compatibility mode.\n");
538 pic_mode = 0;
539 }
540 469
541 /* 470 /*
542 * Now see if we need to read further. 471 * Now see if we need to read further.
@@ -616,7 +545,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
616 return 0; 545 return 0;
617} 546}
618 547
619void __init find_intel_smp (void) 548void __init find_smp_config(void)
620{ 549{
621 unsigned int address; 550 unsigned int address;
622 551
@@ -633,9 +562,7 @@ void __init find_intel_smp (void)
633 smp_scan_config(0xF0000,0x10000)) 562 smp_scan_config(0xF0000,0x10000))
634 return; 563 return;
635 /* 564 /*
636 * If it is an SMP machine we should know now, unless the 565 * If it is an SMP machine we should know now.
637 * configuration is in an EISA/MCA bus machine with an
638 * extended bios data area.
639 * 566 *
640 * there is a real-mode segmented pointer pointing to the 567 * there is a real-mode segmented pointer pointing to the
641 * 4K EBDA area at 0x40E, calculate and scan it here. 568 * 4K EBDA area at 0x40E, calculate and scan it here.
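As an aside on the EBDA scan described above (an illustrative sketch, not code from this patch): the 16-bit word at physical address 0x40E in the BIOS data area holds the real-mode segment of the EBDA, so the base address is that segment shifted left by four bits before the area is scanned, roughly:

	/* Illustrative only: recover the EBDA base from the BDA and scan it. */
	unsigned int  ebda_seg  = *(unsigned short *)phys_to_virt(0x40E); /* e.g. 0x9FC0  */
	unsigned long ebda_base = (unsigned long)ebda_seg << 4;           /* -> 0x9FC00   */
	smp_scan_config(ebda_base, 0x1000);                               /* 4K EBDA area */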
@@ -656,69 +583,41 @@ void __init find_intel_smp (void)
656 printk(KERN_INFO "No mptable found.\n"); 583 printk(KERN_INFO "No mptable found.\n");
657} 584}
658 585
659/*
660 * - Intel MP Configuration Table
661 */
662void __init find_smp_config (void)
663{
664#ifdef CONFIG_X86_LOCAL_APIC
665 find_intel_smp();
666#endif
667}
668
669
670/* -------------------------------------------------------------------------- 586/* --------------------------------------------------------------------------
671 ACPI-based MP Configuration 587 ACPI-based MP Configuration
672 -------------------------------------------------------------------------- */ 588 -------------------------------------------------------------------------- */
673 589
674#ifdef CONFIG_ACPI 590#ifdef CONFIG_ACPI
675 591
676void __init mp_register_lapic_address ( 592void __init mp_register_lapic_address(u64 address)
677 u64 address)
678{ 593{
679 mp_lapic_addr = (unsigned long) address; 594 mp_lapic_addr = (unsigned long) address;
680
681 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); 595 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
682
683 if (boot_cpu_id == -1U) 596 if (boot_cpu_id == -1U)
684 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); 597 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
685
686 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
687} 598}
688 599
689 600void __cpuinit mp_register_lapic (u8 id, u8 enabled)
690void __cpuinit mp_register_lapic (
691 u8 id,
692 u8 enabled)
693{ 601{
694 struct mpc_config_processor processor; 602 struct mpc_config_processor processor;
695 int boot_cpu = 0; 603 int boot_cpu = 0;
696 604
697 if (id >= MAX_APICS) { 605 if (id == boot_cpu_id)
698 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
699 id, MAX_APICS);
700 return;
701 }
702
703 if (id == boot_cpu_physical_apicid)
704 boot_cpu = 1; 606 boot_cpu = 1;
705 607
706 processor.mpc_type = MP_PROCESSOR; 608 processor.mpc_type = MP_PROCESSOR;
707 processor.mpc_apicid = id; 609 processor.mpc_apicid = id;
708 processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); 610 processor.mpc_apicver = 0;
709 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); 611 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
710 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); 612 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
711 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 613 processor.mpc_cpufeature = 0;
712 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; 614 processor.mpc_featureflag = 0;
713 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
714 processor.mpc_reserved[0] = 0; 615 processor.mpc_reserved[0] = 0;
715 processor.mpc_reserved[1] = 0; 616 processor.mpc_reserved[1] = 0;
716 617
717 MP_processor_info(&processor); 618 MP_processor_info(&processor);
718} 619}
719 620
720#ifdef CONFIG_X86_IO_APIC
721
722#define MP_ISA_BUS 0 621#define MP_ISA_BUS 0
723#define MP_MAX_IOAPIC_PIN 127 622#define MP_MAX_IOAPIC_PIN 127
724 623
@@ -729,11 +628,9 @@ static struct mp_ioapic_routing {
729 u32 pin_programmed[4]; 628 u32 pin_programmed[4];
730} mp_ioapic_routing[MAX_IO_APICS]; 629} mp_ioapic_routing[MAX_IO_APICS];
731 630
732 631static int mp_find_ioapic(int gsi)
733static int mp_find_ioapic (
734 int gsi)
735{ 632{
736 int i = 0; 633 int i = 0;
737 634
738 /* Find the IOAPIC that manages this GSI. */ 635 /* Find the IOAPIC that manages this GSI. */
739 for (i = 0; i < nr_ioapics; i++) { 636 for (i = 0; i < nr_ioapics; i++) {
@@ -743,17 +640,12 @@ static int mp_find_ioapic (
743 } 640 }
744 641
745 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); 642 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
746
747 return -1; 643 return -1;
748} 644}
749
750 645
751void __init mp_register_ioapic ( 646void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
752 u8 id,
753 u32 address,
754 u32 gsi_base)
755{ 647{
756 int idx = 0; 648 int idx = 0;
757 649
758 if (nr_ioapics >= MAX_IO_APICS) { 650 if (nr_ioapics >= MAX_IO_APICS) {
759 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " 651 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
@@ -774,7 +666,7 @@ void __init mp_register_ioapic (
774 666
775 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 667 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
776 mp_ioapics[idx].mpc_apicid = id; 668 mp_ioapics[idx].mpc_apicid = id;
777 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); 669 mp_ioapics[idx].mpc_apicver = 0;
778 670
779 /* 671 /*
780 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups 672 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
@@ -785,21 +677,15 @@ void __init mp_register_ioapic (
785 mp_ioapic_routing[idx].gsi_end = gsi_base + 677 mp_ioapic_routing[idx].gsi_end = gsi_base +
786 io_apic_get_redir_entries(idx); 678 io_apic_get_redir_entries(idx);
787 679
788 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 680 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
789 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 681 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
790 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, 682 mp_ioapics[idx].mpc_apicaddr,
791 mp_ioapic_routing[idx].gsi_start, 683 mp_ioapic_routing[idx].gsi_start,
792 mp_ioapic_routing[idx].gsi_end); 684 mp_ioapic_routing[idx].gsi_end);
793
794 return;
795} 685}
796 686
797 687void __init
798void __init mp_override_legacy_irq ( 688mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
799 u8 bus_irq,
800 u8 polarity,
801 u8 trigger,
802 u32 gsi)
803{ 689{
804 struct mpc_config_intsrc intsrc; 690 struct mpc_config_intsrc intsrc;
805 int ioapic = -1; 691 int ioapic = -1;
@@ -837,22 +723,18 @@ void __init mp_override_legacy_irq (
837 mp_irqs[mp_irq_entries] = intsrc; 723 mp_irqs[mp_irq_entries] = intsrc;
838 if (++mp_irq_entries == MAX_IRQ_SOURCES) 724 if (++mp_irq_entries == MAX_IRQ_SOURCES)
839 panic("Max # of irq sources exceeded!\n"); 725 panic("Max # of irq sources exceeded!\n");
840
841 return;
842} 726}
843 727
844 728void __init mp_config_acpi_legacy_irqs(void)
845void __init mp_config_acpi_legacy_irqs (void)
846{ 729{
847 struct mpc_config_intsrc intsrc; 730 struct mpc_config_intsrc intsrc;
848 int i = 0; 731 int i = 0;
849 int ioapic = -1; 732 int ioapic = -1;
850 733
851 /* 734 /*
852 * Fabricate the legacy ISA bus (bus #31). 735 * Fabricate the legacy ISA bus (bus #31).
853 */ 736 */
854 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; 737 set_bit(MP_ISA_BUS, mp_bus_not_pci);
855 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
856 738
857 /* 739 /*
858 * Locate the IOAPIC that manages the ISA IRQs (0-15). 740 * Locate the IOAPIC that manages the ISA IRQs (0-15).
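One sketch of what the data-structure change above means for consumers (illustrative, not taken from this patch): instead of looking up a bus type in mp_bus_id_to_type[], routing code only needs to know whether a bus is non-PCI, which is now a single bitmap test:

	if (test_bit(bus, mp_bus_not_pci))
		/* legacy (ISA-style) interrupt routing applies to this bus */;
	else
		/* normal PCI interrupt routing */;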
@@ -905,24 +787,22 @@ void __init mp_config_acpi_legacy_irqs (void)
905 if (++mp_irq_entries == MAX_IRQ_SOURCES) 787 if (++mp_irq_entries == MAX_IRQ_SOURCES)
906 panic("Max # of irq sources exceeded!\n"); 788 panic("Max # of irq sources exceeded!\n");
907 } 789 }
908
909 return;
910} 790}
911 791
912#define MAX_GSI_NUM 4096 792#define MAX_GSI_NUM 4096
913 793
914int mp_register_gsi(u32 gsi, int triggering, int polarity) 794int mp_register_gsi(u32 gsi, int triggering, int polarity)
915{ 795{
916 int ioapic = -1; 796 int ioapic = -1;
917 int ioapic_pin = 0; 797 int ioapic_pin = 0;
918 int idx, bit = 0; 798 int idx, bit = 0;
919 static int pci_irq = 16; 799 static int pci_irq = 16;
920 /* 800 /*
921 * Mapping between Global System Interrupts, which 801 * Mapping between Global System Interrupts, which
922 * represent all possible interrupts, to the IRQs 802 * represent all possible interrupts, to the IRQs
923 * assigned to actual devices. 803 * assigned to actual devices.
924 */ 804 */
925 static int gsi_to_irq[MAX_GSI_NUM]; 805 static int gsi_to_irq[MAX_GSI_NUM];
926 806
927 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) 807 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
928 return gsi; 808 return gsi;
@@ -996,6 +876,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
996 polarity == ACPI_ACTIVE_HIGH ? 0 : 1); 876 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
997 return gsi; 877 return gsi;
998} 878}
999
1000#endif /*CONFIG_X86_IO_APIC*/
1001#endif /*CONFIG_ACPI*/ 879#endif /*CONFIG_ACPI*/
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 5baa0c726e97..4d6fb047952e 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -28,71 +28,138 @@
28#include <asm/mce.h> 28#include <asm/mce.h>
29#include <asm/intel_arch_perfmon.h> 29#include <asm/intel_arch_perfmon.h>
30 30
31/* 31/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
32 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: 32 * evtsel_nmi_owner tracks the ownership of the event selection
33 * - it may be reserved by some other driver, or not 33 * - different performance counters/ event selection may be reserved for
34 * - when not reserved by some other driver, it may be used for 34 * different subsystems; this reservation system just tries to coordinate
35 * the NMI watchdog, or not 35 * things a little
36 *
37 * This is maintained separately from nmi_active because the NMI
38 * watchdog may also be driven from the I/O APIC timer.
39 */ 36 */
40static DEFINE_SPINLOCK(lapic_nmi_owner_lock); 37static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner);
41static unsigned int lapic_nmi_owner; 38static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]);
42#define LAPIC_NMI_WATCHDOG (1<<0) 39
43#define LAPIC_NMI_RESERVED (1<<1) 40/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
41 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
42 */
43#define NMI_MAX_COUNTER_BITS 66
44 44
45/* nmi_active: 45/* nmi_active:
46 * +1: the lapic NMI watchdog is active, but can be disabled 46 * >0: the lapic NMI watchdog is active, but can be disabled
47 * 0: the lapic NMI watchdog has not been set up, and cannot 47 * <0: the lapic NMI watchdog has not been set up, and cannot
48 * be enabled 48 * be enabled
49 * -1: the lapic NMI watchdog is disabled, but can be enabled 49 * 0: the lapic NMI watchdog is disabled, but can be enabled
50 */ 50 */
51int nmi_active; /* oprofile uses this */ 51atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
52int panic_on_timeout; 52int panic_on_timeout;
53 53
54unsigned int nmi_watchdog = NMI_DEFAULT; 54unsigned int nmi_watchdog = NMI_DEFAULT;
55static unsigned int nmi_hz = HZ; 55static unsigned int nmi_hz = HZ;
56static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
57static unsigned int nmi_p4_cccr_val;
58 56
59/* Note that these events don't tick when the CPU idles. This means 57struct nmi_watchdog_ctlblk {
60 the frequency varies with CPU load. */ 58 int enabled;
59 u64 check_bit;
60 unsigned int cccr_msr;
61 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
62 unsigned int evntsel_msr; /* the MSR to select the events to handle */
63};
64static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
61 65
62#define K7_EVNTSEL_ENABLE (1 << 22) 66/* local prototypes */
63#define K7_EVNTSEL_INT (1 << 20) 67static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
64#define K7_EVNTSEL_OS (1 << 17)
65#define K7_EVNTSEL_USR (1 << 16)
66#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
67#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
68 68
69#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 69/* converts an msr to an appropriate reservation bit */
70#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK 70static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
71{
72 /* returns the bit offset of the performance counter register */
73 switch (boot_cpu_data.x86_vendor) {
74 case X86_VENDOR_AMD:
75 return (msr - MSR_K7_PERFCTR0);
76 case X86_VENDOR_INTEL:
77 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
78 return (msr - MSR_ARCH_PERFMON_PERFCTR0);
79 else
80 return (msr - MSR_P4_BPU_PERFCTR0);
81 }
82 return 0;
83}
71 84
72#define MSR_P4_MISC_ENABLE 0x1A0 85/* converts an msr to an appropriate reservation bit */
73#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) 86static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
74#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) 87{
75#define MSR_P4_PERFCTR0 0x300 88 /* returns the bit offset of the event selection register */
76#define MSR_P4_CCCR0 0x360 89 switch (boot_cpu_data.x86_vendor) {
77#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) 90 case X86_VENDOR_AMD:
78#define P4_ESCR_OS (1<<3) 91 return (msr - MSR_K7_EVNTSEL0);
79#define P4_ESCR_USR (1<<2) 92 case X86_VENDOR_INTEL:
80#define P4_CCCR_OVF_PMI0 (1<<26) 93 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
81#define P4_CCCR_OVF_PMI1 (1<<27) 94 return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
82#define P4_CCCR_THRESHOLD(N) ((N)<<20) 95 else
83#define P4_CCCR_COMPLEMENT (1<<19) 96 return (msr - MSR_P4_BSU_ESCR0);
84#define P4_CCCR_COMPARE (1<<18) 97 }
85#define P4_CCCR_REQUIRED (3<<16) 98 return 0;
86#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) 99}
87#define P4_CCCR_ENABLE (1<<12) 100
88/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 101/* checks for bit availability (hack for oprofile) */
89 CRU_ESCR0 (with any non-null event selector) through a complemented 102int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
90 max threshold. [IA32-Vol3, Section 14.9.9] */ 103{
91#define MSR_P4_IQ_COUNTER0 0x30C 104 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
92#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) 105
93#define P4_NMI_IQ_CCCR0 \ 106 return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
94 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ 107}
95 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) 108
109/* checks an msr for availability */
110int avail_to_resrv_perfctr_nmi(unsigned int msr)
111{
112 unsigned int counter;
113
114 counter = nmi_perfctr_msr_to_bit(msr);
115 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
116
117 return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
118}
119
120int reserve_perfctr_nmi(unsigned int msr)
121{
122 unsigned int counter;
123
124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127 if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
128 return 1;
129 return 0;
130}
131
132void release_perfctr_nmi(unsigned int msr)
133{
134 unsigned int counter;
135
136 counter = nmi_perfctr_msr_to_bit(msr);
137 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
138
139 clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
140}
141
142int reserve_evntsel_nmi(unsigned int msr)
143{
144 unsigned int counter;
145
146 counter = nmi_evntsel_msr_to_bit(msr);
147 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
148
149 if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)))
150 return 1;
151 return 0;
152}
153
154void release_evntsel_nmi(unsigned int msr)
155{
156 unsigned int counter;
157
158 counter = nmi_evntsel_msr_to_bit(msr);
159 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
160
161 clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner));
162}
96 163
97static __cpuinit inline int nmi_known_cpu(void) 164static __cpuinit inline int nmi_known_cpu(void)
98{ 165{
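A minimal sketch of how a subsystem is expected to use the reservation helpers introduced above (the helper names are from this patch, the surrounding caller is hypothetical; it mirrors setup_k7_watchdog() further down):

	if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
		return 0;				/* counter already claimed elsewhere */
	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
		release_perfctr_nmi(MSR_K7_PERFCTR0);	/* back out the first claim */
		return 0;
	}
	/* ... program MSR_K7_EVNTSEL0 / MSR_K7_PERFCTR0 ... */
	release_evntsel_nmi(MSR_K7_EVNTSEL0);		/* teardown, in reverse order */
	release_perfctr_nmi(MSR_K7_PERFCTR0);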
@@ -109,7 +176,7 @@ static __cpuinit inline int nmi_known_cpu(void)
109} 176}
110 177
111/* Run after command line and cpu_init init, but before all other checks */ 178/* Run after command line and cpu_init init, but before all other checks */
112void __cpuinit nmi_watchdog_default(void) 179void nmi_watchdog_default(void)
113{ 180{
114 if (nmi_watchdog != NMI_DEFAULT) 181 if (nmi_watchdog != NMI_DEFAULT)
115 return; 182 return;
@@ -145,6 +212,12 @@ int __init check_nmi_watchdog (void)
145 int *counts; 212 int *counts;
146 int cpu; 213 int cpu;
147 214
215 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
216 return 0;
217
218 if (!atomic_read(&nmi_active))
219 return 0;
220
148 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 221 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
149 if (!counts) 222 if (!counts)
150 return -1; 223 return -1;
@@ -162,26 +235,43 @@ int __init check_nmi_watchdog (void)
162 mdelay((10*1000)/nmi_hz); // wait 10 ticks 235 mdelay((10*1000)/nmi_hz); // wait 10 ticks
163 236
164 for_each_online_cpu(cpu) { 237 for_each_online_cpu(cpu) {
238 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
239 continue;
165 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { 240 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
166 endflag = 1;
167 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", 241 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
168 cpu, 242 cpu,
169 counts[cpu], 243 counts[cpu],
170 cpu_pda(cpu)->__nmi_count); 244 cpu_pda(cpu)->__nmi_count);
171 nmi_active = 0; 245 per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
172 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; 246 atomic_dec(&nmi_active);
173 nmi_perfctr_msr = 0;
174 kfree(counts);
175 return -1;
176 } 247 }
177 } 248 }
249 if (!atomic_read(&nmi_active)) {
250 kfree(counts);
251 atomic_set(&nmi_active, -1);
252 return -1;
253 }
178 endflag = 1; 254 endflag = 1;
179 printk("OK.\n"); 255 printk("OK.\n");
180 256
181 /* now that we know it works we can reduce NMI frequency to 257 /* now that we know it works we can reduce NMI frequency to
182 something more reasonable; makes a difference in some configs */ 258 something more reasonable; makes a difference in some configs */
183 if (nmi_watchdog == NMI_LOCAL_APIC) 259 if (nmi_watchdog == NMI_LOCAL_APIC) {
260 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
261
184 nmi_hz = 1; 262 nmi_hz = 1;
263 /*
264 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
265 * are writable, with higher bits sign extending from bit 31.
266 * So, we can only program the counter with 31 bit values and
267 * 32nd bit should be 1, for 33.. to be 1.
268 * Find the appropriate nmi_hz
269 */
270 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
271 ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
272 nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
273 }
274 }
185 275
186 kfree(counts); 276 kfree(counts);
187 return 0; 277 return 0;
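A worked example of the nmi_hz clamp above, with an assumed clock speed: on a hypothetical 2.333 GHz CPU, cpu_khz * 1000 = 2,333,000,000, which exceeds 0x7fffffff (2,147,483,647), so nmi_hz becomes 2,333,000,000 / 2,147,483,647 + 1 = 2; the per-period reload -(cpu_khz * 1000 / nmi_hz) is then -1,166,500,000, whose magnitude fits in the 31 writable counter bits.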
@@ -201,91 +291,65 @@ int __init setup_nmi_watchdog(char *str)
201 291
202 get_option(&str, &nmi); 292 get_option(&str, &nmi);
203 293
204 if (nmi >= NMI_INVALID) 294 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
205 return 0; 295 return 0;
296
297 if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
298 return 0; /* no lapic support */
206 nmi_watchdog = nmi; 299 nmi_watchdog = nmi;
207 return 1; 300 return 1;
208} 301}
209 302
210__setup("nmi_watchdog=", setup_nmi_watchdog); 303__setup("nmi_watchdog=", setup_nmi_watchdog);
211 304
212static void disable_intel_arch_watchdog(void);
213
214static void disable_lapic_nmi_watchdog(void) 305static void disable_lapic_nmi_watchdog(void)
215{ 306{
216 if (nmi_active <= 0) 307 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
308
309 if (atomic_read(&nmi_active) <= 0)
217 return; 310 return;
218 switch (boot_cpu_data.x86_vendor) {
219 case X86_VENDOR_AMD:
220 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
221 break;
222 case X86_VENDOR_INTEL:
223 if (boot_cpu_data.x86 == 15) {
224 wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
225 wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
226 } else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
227 disable_intel_arch_watchdog();
228 }
229 break;
230 }
231 nmi_active = -1;
232 /* tell do_nmi() and others that we're not active any more */
233 nmi_watchdog = 0;
234}
235 311
236static void enable_lapic_nmi_watchdog(void) 312 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
237{ 313
238 if (nmi_active < 0) { 314 BUG_ON(atomic_read(&nmi_active) != 0);
239 nmi_watchdog = NMI_LOCAL_APIC;
240 touch_nmi_watchdog();
241 setup_apic_nmi_watchdog();
242 }
243} 315}
244 316
245int reserve_lapic_nmi(void) 317static void enable_lapic_nmi_watchdog(void)
246{ 318{
247 unsigned int old_owner; 319 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
248 320
249 spin_lock(&lapic_nmi_owner_lock); 321 /* are we already enabled */
250 old_owner = lapic_nmi_owner; 322 if (atomic_read(&nmi_active) != 0)
251 lapic_nmi_owner |= LAPIC_NMI_RESERVED; 323 return;
252 spin_unlock(&lapic_nmi_owner_lock);
253 if (old_owner & LAPIC_NMI_RESERVED)
254 return -EBUSY;
255 if (old_owner & LAPIC_NMI_WATCHDOG)
256 disable_lapic_nmi_watchdog();
257 return 0;
258}
259 324
260void release_lapic_nmi(void) 325 /* are we lapic aware */
261{ 326 if (nmi_known_cpu() <= 0)
262 unsigned int new_owner; 327 return;
263 328
264 spin_lock(&lapic_nmi_owner_lock); 329 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
265 new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; 330 touch_nmi_watchdog();
266 lapic_nmi_owner = new_owner;
267 spin_unlock(&lapic_nmi_owner_lock);
268 if (new_owner & LAPIC_NMI_WATCHDOG)
269 enable_lapic_nmi_watchdog();
270} 331}
271 332
272void disable_timer_nmi_watchdog(void) 333void disable_timer_nmi_watchdog(void)
273{ 334{
274 if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) 335 BUG_ON(nmi_watchdog != NMI_IO_APIC);
336
337 if (atomic_read(&nmi_active) <= 0)
275 return; 338 return;
276 339
277 disable_irq(0); 340 disable_irq(0);
278 unset_nmi_callback(); 341 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
279 nmi_active = -1; 342
280 nmi_watchdog = NMI_NONE; 343 BUG_ON(atomic_read(&nmi_active) != 0);
281} 344}
282 345
283void enable_timer_nmi_watchdog(void) 346void enable_timer_nmi_watchdog(void)
284{ 347{
285 if (nmi_active < 0) { 348 BUG_ON(nmi_watchdog != NMI_IO_APIC);
286 nmi_watchdog = NMI_IO_APIC; 349
350 if (atomic_read(&nmi_active) == 0) {
287 touch_nmi_watchdog(); 351 touch_nmi_watchdog();
288 nmi_active = 1; 352 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
289 enable_irq(0); 353 enable_irq(0);
290 } 354 }
291} 355}
@@ -296,15 +360,20 @@ static int nmi_pm_active; /* nmi_active before suspend */
296 360
297static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) 361static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
298{ 362{
299 nmi_pm_active = nmi_active; 363 /* only CPU0 goes here, other CPUs should be offline */
300 disable_lapic_nmi_watchdog(); 364 nmi_pm_active = atomic_read(&nmi_active);
365 stop_apic_nmi_watchdog(NULL);
366 BUG_ON(atomic_read(&nmi_active) != 0);
301 return 0; 367 return 0;
302} 368}
303 369
304static int lapic_nmi_resume(struct sys_device *dev) 370static int lapic_nmi_resume(struct sys_device *dev)
305{ 371{
306 if (nmi_pm_active > 0) 372 /* only CPU0 goes here, other CPUs should be offline */
307 enable_lapic_nmi_watchdog(); 373 if (nmi_pm_active > 0) {
374 setup_apic_nmi_watchdog(NULL);
375 touch_nmi_watchdog();
376 }
308 return 0; 377 return 0;
309} 378}
310 379
@@ -323,7 +392,13 @@ static int __init init_lapic_nmi_sysfs(void)
323{ 392{
324 int error; 393 int error;
325 394
326 if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) 395 /* should really be a BUG_ON but b/c this is an
396 * init call, it just doesn't work. -dcz
397 */
398 if (nmi_watchdog != NMI_LOCAL_APIC)
399 return 0;
400
401 if ( atomic_read(&nmi_active) < 0 )
327 return 0; 402 return 0;
328 403
329 error = sysdev_class_register(&nmi_sysclass); 404 error = sysdev_class_register(&nmi_sysclass);
@@ -341,74 +416,209 @@ late_initcall(init_lapic_nmi_sysfs);
341 * Original code written by Keith Owens. 416 * Original code written by Keith Owens.
342 */ 417 */
343 418
344static void clear_msr_range(unsigned int base, unsigned int n) 419/* Note that these events don't tick when the CPU idles. This means
345{ 420 the frequency varies with CPU load. */
346 unsigned int i;
347 421
348 for(i = 0; i < n; ++i) 422#define K7_EVNTSEL_ENABLE (1 << 22)
349 wrmsr(base+i, 0, 0); 423#define K7_EVNTSEL_INT (1 << 20)
350} 424#define K7_EVNTSEL_OS (1 << 17)
425#define K7_EVNTSEL_USR (1 << 16)
426#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
427#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
351 428
352static void setup_k7_watchdog(void) 429static int setup_k7_watchdog(void)
353{ 430{
354 int i; 431 unsigned int perfctr_msr, evntsel_msr;
355 unsigned int evntsel; 432 unsigned int evntsel;
433 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
356 434
357 nmi_perfctr_msr = MSR_K7_PERFCTR0; 435 perfctr_msr = MSR_K7_PERFCTR0;
436 evntsel_msr = MSR_K7_EVNTSEL0;
437 if (!reserve_perfctr_nmi(perfctr_msr))
438 goto fail;
358 439
359 for(i = 0; i < 4; ++i) { 440 if (!reserve_evntsel_nmi(evntsel_msr))
360 /* Simulator may not support it */ 441 goto fail1;
361 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { 442
362 nmi_perfctr_msr = 0; 443 /* Simulator may not support it */
363 return; 444 if (checking_wrmsrl(evntsel_msr, 0UL))
364 } 445 goto fail2;
365 wrmsrl(MSR_K7_PERFCTR0+i, 0UL); 446 wrmsrl(perfctr_msr, 0UL);
366 }
367 447
368 evntsel = K7_EVNTSEL_INT 448 evntsel = K7_EVNTSEL_INT
369 | K7_EVNTSEL_OS 449 | K7_EVNTSEL_OS
370 | K7_EVNTSEL_USR 450 | K7_EVNTSEL_USR
371 | K7_NMI_EVENT; 451 | K7_NMI_EVENT;
372 452
373 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 453 /* setup the timer */
374 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); 454 wrmsr(evntsel_msr, evntsel, 0);
455 wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
375 apic_write(APIC_LVTPC, APIC_DM_NMI); 456 apic_write(APIC_LVTPC, APIC_DM_NMI);
376 evntsel |= K7_EVNTSEL_ENABLE; 457 evntsel |= K7_EVNTSEL_ENABLE;
377 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 458 wrmsr(evntsel_msr, evntsel, 0);
459
460 wd->perfctr_msr = perfctr_msr;
461 wd->evntsel_msr = evntsel_msr;
462 wd->cccr_msr = 0; //unused
463 wd->check_bit = 1ULL<<63;
464 return 1;
465fail2:
466 release_evntsel_nmi(evntsel_msr);
467fail1:
468 release_perfctr_nmi(perfctr_msr);
469fail:
470 return 0;
378} 471}
379 472
380static void disable_intel_arch_watchdog(void) 473static void stop_k7_watchdog(void)
381{ 474{
382 unsigned ebx; 475 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
383 476
384 /* 477 wrmsr(wd->evntsel_msr, 0, 0);
385 * Check whether the Architectural PerfMon supports 478
386 * Unhalted Core Cycles Event or not. 479 release_evntsel_nmi(wd->evntsel_msr);
387 * NOTE: Corresponding bit = 0 in ebp indicates event present. 480 release_perfctr_nmi(wd->perfctr_msr);
481}
482
483/* Note that these events don't tick when the CPU idles. This means
484 the frequency varies with CPU load. */
485
486#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
487#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
488#define P4_ESCR_OS (1<<3)
489#define P4_ESCR_USR (1<<2)
490#define P4_CCCR_OVF_PMI0 (1<<26)
491#define P4_CCCR_OVF_PMI1 (1<<27)
492#define P4_CCCR_THRESHOLD(N) ((N)<<20)
493#define P4_CCCR_COMPLEMENT (1<<19)
494#define P4_CCCR_COMPARE (1<<18)
495#define P4_CCCR_REQUIRED (3<<16)
496#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
497#define P4_CCCR_ENABLE (1<<12)
498#define P4_CCCR_OVF (1<<31)
499/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
500 CRU_ESCR0 (with any non-null event selector) through a complemented
501 max threshold. [IA32-Vol3, Section 14.9.9] */
502
503static int setup_p4_watchdog(void)
504{
505 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
506 unsigned int evntsel, cccr_val;
507 unsigned int misc_enable, dummy;
508 unsigned int ht_num;
509 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
510
511 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
512 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
513 return 0;
514
515#ifdef CONFIG_SMP
516 /* detect which hyperthread we are on */
517 if (smp_num_siblings == 2) {
518 unsigned int ebx, apicid;
519
520 ebx = cpuid_ebx(1);
521 apicid = (ebx >> 24) & 0xff;
522 ht_num = apicid & 1;
523 } else
524#endif
525 ht_num = 0;
526
527 /* performance counters are shared resources
528 * assign each hyperthread its own set
529 * (re-use the ESCR0 register, seems safe
530 * and keeps the cccr_val the same)
388 */ 531 */
389 ebx = cpuid_ebx(10); 532 if (!ht_num) {
390 if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) 533 /* logical cpu 0 */
391 wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); 534 perfctr_msr = MSR_P4_IQ_PERFCTR0;
535 evntsel_msr = MSR_P4_CRU_ESCR0;
536 cccr_msr = MSR_P4_IQ_CCCR0;
537 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
538 } else {
539 /* logical cpu 1 */
540 perfctr_msr = MSR_P4_IQ_PERFCTR1;
541 evntsel_msr = MSR_P4_CRU_ESCR0;
542 cccr_msr = MSR_P4_IQ_CCCR1;
543 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
544 }
545
546 if (!reserve_perfctr_nmi(perfctr_msr))
547 goto fail;
548
549 if (!reserve_evntsel_nmi(evntsel_msr))
550 goto fail1;
551
552 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
553 | P4_ESCR_OS
554 | P4_ESCR_USR;
555
556 cccr_val |= P4_CCCR_THRESHOLD(15)
557 | P4_CCCR_COMPLEMENT
558 | P4_CCCR_COMPARE
559 | P4_CCCR_REQUIRED;
560
561 wrmsr(evntsel_msr, evntsel, 0);
562 wrmsr(cccr_msr, cccr_val, 0);
563 wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
564 apic_write(APIC_LVTPC, APIC_DM_NMI);
565 cccr_val |= P4_CCCR_ENABLE;
566 wrmsr(cccr_msr, cccr_val, 0);
567
568 wd->perfctr_msr = perfctr_msr;
569 wd->evntsel_msr = evntsel_msr;
570 wd->cccr_msr = cccr_msr;
571 wd->check_bit = 1ULL<<39;
572 return 1;
573fail1:
574 release_perfctr_nmi(perfctr_msr);
575fail:
576 return 0;
577}
578
579static void stop_p4_watchdog(void)
580{
581 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
582
583 wrmsr(wd->cccr_msr, 0, 0);
584 wrmsr(wd->evntsel_msr, 0, 0);
585
586 release_evntsel_nmi(wd->evntsel_msr);
587 release_perfctr_nmi(wd->perfctr_msr);
392} 588}
393 589
590#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
591#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
592
394static int setup_intel_arch_watchdog(void) 593static int setup_intel_arch_watchdog(void)
395{ 594{
595 unsigned int ebx;
596 union cpuid10_eax eax;
597 unsigned int unused;
598 unsigned int perfctr_msr, evntsel_msr;
396 unsigned int evntsel; 599 unsigned int evntsel;
397 unsigned ebx; 600 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
398 601
399 /* 602 /*
400 * Check whether the Architectural PerfMon supports 603 * Check whether the Architectural PerfMon supports
401 * Unhalted Core Cycles Event or not. 604 * Unhalted Core Cycles Event or not.
402 * NOTE: Corresponding bit = 0 in ebp indicates event present. 605 * NOTE: Corresponding bit = 0 in ebx indicates event present.
403 */ 606 */
404 ebx = cpuid_ebx(10); 607 cpuid(10, &(eax.full), &ebx, &unused, &unused);
405 if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) 608 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
406 return 0; 609 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
610 goto fail;
611
612 perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
613 evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
407 614
408 nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; 615 if (!reserve_perfctr_nmi(perfctr_msr))
616 goto fail;
409 617
410 clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); 618 if (!reserve_evntsel_nmi(evntsel_msr))
411 clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); 619 goto fail1;
620
621 wrmsrl(perfctr_msr, 0UL);
412 622
413 evntsel = ARCH_PERFMON_EVENTSEL_INT 623 evntsel = ARCH_PERFMON_EVENTSEL_INT
414 | ARCH_PERFMON_EVENTSEL_OS 624 | ARCH_PERFMON_EVENTSEL_OS
@@ -416,84 +626,122 @@ static int setup_intel_arch_watchdog(void)
416 | ARCH_PERFMON_NMI_EVENT_SEL 626 | ARCH_PERFMON_NMI_EVENT_SEL
417 | ARCH_PERFMON_NMI_EVENT_UMASK; 627 | ARCH_PERFMON_NMI_EVENT_UMASK;
418 628
419 wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); 629 /* setup the timer */
420 wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); 630 wrmsr(evntsel_msr, evntsel, 0);
631 wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
632
421 apic_write(APIC_LVTPC, APIC_DM_NMI); 633 apic_write(APIC_LVTPC, APIC_DM_NMI);
422 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 634 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
423 wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); 635 wrmsr(evntsel_msr, evntsel, 0);
636
637 wd->perfctr_msr = perfctr_msr;
638 wd->evntsel_msr = evntsel_msr;
639 wd->cccr_msr = 0; //unused
640 wd->check_bit = 1ULL << (eax.split.bit_width - 1);
424 return 1; 641 return 1;
642fail1:
643 release_perfctr_nmi(perfctr_msr);
644fail:
645 return 0;
425} 646}
426 647
427 648static void stop_intel_arch_watchdog(void)
428static int setup_p4_watchdog(void)
429{ 649{
430 unsigned int misc_enable, dummy; 650 unsigned int ebx;
651 union cpuid10_eax eax;
652 unsigned int unused;
653 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
431 654
432 rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); 655 /*
433 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) 656 * Check whether the Architectural PerfMon supports
434 return 0; 657 * Unhalted Core Cycles Event or not.
658 * NOTE: Corresponding bit = 0 in ebx indicates event present.
659 */
660 cpuid(10, &(eax.full), &ebx, &unused, &unused);
661 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
662 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
663 return;
435 664
436 nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; 665 wrmsr(wd->evntsel_msr, 0, 0);
437 nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
438#ifdef CONFIG_SMP
439 if (smp_num_siblings == 2)
440 nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
441#endif
442 666
443 if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) 667 release_evntsel_nmi(wd->evntsel_msr);
444 clear_msr_range(0x3F1, 2); 668 release_perfctr_nmi(wd->perfctr_msr);
445 /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
446 docs doesn't fully define it, so leave it alone for now. */
447 if (boot_cpu_data.x86_model >= 0x3) {
448 /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
449 clear_msr_range(0x3A0, 26);
450 clear_msr_range(0x3BC, 3);
451 } else {
452 clear_msr_range(0x3A0, 31);
453 }
454 clear_msr_range(0x3C0, 6);
455 clear_msr_range(0x3C8, 6);
456 clear_msr_range(0x3E0, 2);
457 clear_msr_range(MSR_P4_CCCR0, 18);
458 clear_msr_range(MSR_P4_PERFCTR0, 18);
459
460 wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
461 wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
462 Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz));
463 wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz));
464 apic_write(APIC_LVTPC, APIC_DM_NMI);
465 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
466 return 1;
467} 669}
468 670
469void setup_apic_nmi_watchdog(void) 671void setup_apic_nmi_watchdog(void *unused)
470{ 672{
471 switch (boot_cpu_data.x86_vendor) { 673 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
472 case X86_VENDOR_AMD: 674
473 if (boot_cpu_data.x86 != 15) 675 /* only support LOCAL and IO APICs for now */
474 return; 676 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
475 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) 677 (nmi_watchdog != NMI_IO_APIC))
476 return; 678 return;
477 setup_k7_watchdog(); 679
478 break; 680 if (wd->enabled == 1)
479 case X86_VENDOR_INTEL: 681 return;
480 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 682
481 if (!setup_intel_arch_watchdog()) 683 /* cheap hack to support suspend/resume */
684 /* if cpu0 is not active neither should the other cpus */
685 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
686 return;
687
688 if (nmi_watchdog == NMI_LOCAL_APIC) {
689 switch (boot_cpu_data.x86_vendor) {
690 case X86_VENDOR_AMD:
691 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
482 return; 692 return;
483 } else if (boot_cpu_data.x86 == 15) { 693 if (!setup_k7_watchdog())
694 return;
695 break;
696 case X86_VENDOR_INTEL:
697 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
698 if (!setup_intel_arch_watchdog())
699 return;
700 break;
701 }
484 if (!setup_p4_watchdog()) 702 if (!setup_p4_watchdog())
485 return; 703 return;
486 } else { 704 break;
705 default:
487 return; 706 return;
488 } 707 }
708 }
709 wd->enabled = 1;
710 atomic_inc(&nmi_active);
711}
712
713void stop_apic_nmi_watchdog(void *unused)
714{
715 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
489 716
490 break; 717 /* only support LOCAL and IO APICs for now */
718 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
719 (nmi_watchdog != NMI_IO_APIC))
720 return;
491 721
492 default: 722 if (wd->enabled == 0)
493 return; 723 return;
724
725 if (nmi_watchdog == NMI_LOCAL_APIC) {
726 switch (boot_cpu_data.x86_vendor) {
727 case X86_VENDOR_AMD:
728 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
729 return;
730 stop_k7_watchdog();
731 break;
732 case X86_VENDOR_INTEL:
733 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
734 stop_intel_arch_watchdog();
735 break;
736 }
737 stop_p4_watchdog();
738 break;
739 default:
740 return;
741 }
494 } 742 }
495 lapic_nmi_owner = LAPIC_NMI_WATCHDOG; 743 wd->enabled = 0;
496 nmi_active = 1; 744 atomic_dec(&nmi_active);
497} 745}
498 746
499/* 747/*
@@ -526,93 +774,109 @@ void touch_nmi_watchdog (void)
526 touch_softlockup_watchdog(); 774 touch_softlockup_watchdog();
527} 775}
528 776
529void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) 777int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
530{ 778{
531 int sum; 779 int sum;
532 int touched = 0; 780 int touched = 0;
781 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
782 u64 dummy;
783 int rc=0;
784
785 /* check for other users first */
786 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
787 == NOTIFY_STOP) {
788 rc = 1;
789 touched = 1;
790 }
533 791
534 sum = read_pda(apic_timer_irqs); 792 sum = read_pda(apic_timer_irqs);
535 if (__get_cpu_var(nmi_touch)) { 793 if (__get_cpu_var(nmi_touch)) {
536 __get_cpu_var(nmi_touch) = 0; 794 __get_cpu_var(nmi_touch) = 0;
537 touched = 1; 795 touched = 1;
538 } 796 }
797
539#ifdef CONFIG_X86_MCE 798#ifdef CONFIG_X86_MCE
540 /* Could check oops_in_progress here too, but it's safer 799 /* Could check oops_in_progress here too, but it's safer
541 not to */ 800 not to */
542 if (atomic_read(&mce_entry) > 0) 801 if (atomic_read(&mce_entry) > 0)
543 touched = 1; 802 touched = 1;
544#endif 803#endif
804 /* if the apic timer isn't firing, this cpu isn't doing much */
545 if (!touched && __get_cpu_var(last_irq_sum) == sum) { 805 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
546 /* 806 /*
547 * Ayiee, looks like this CPU is stuck ... 807 * Ayiee, looks like this CPU is stuck ...
548 * wait a few IRQs (5 seconds) before doing the oops ... 808 * wait a few IRQs (5 seconds) before doing the oops ...
549 */ 809 */
550 local_inc(&__get_cpu_var(alert_counter)); 810 local_inc(&__get_cpu_var(alert_counter));
551 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { 811 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
552 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 812 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
553 == NOTIFY_STOP) { 813 panic_on_timeout);
554 local_set(&__get_cpu_var(alert_counter), 0);
555 return;
556 }
557 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs);
558 }
559 } else { 814 } else {
560 __get_cpu_var(last_irq_sum) = sum; 815 __get_cpu_var(last_irq_sum) = sum;
561 local_set(&__get_cpu_var(alert_counter), 0); 816 local_set(&__get_cpu_var(alert_counter), 0);
562 } 817 }
563 if (nmi_perfctr_msr) { 818
564 if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { 819 /* see if the nmi watchdog went off */
565 /* 820 if (wd->enabled) {
566 * P4 quirks: 821 if (nmi_watchdog == NMI_LOCAL_APIC) {
567 * - An overflown perfctr will assert its interrupt 822 rdmsrl(wd->perfctr_msr, dummy);
568 * until the OVF flag in its CCCR is cleared. 823 if (dummy & wd->check_bit){
569 * - LVTPC is masked on interrupt and must be 824 /* this wasn't a watchdog timer interrupt */
570 * unmasked by the LVTPC handler. 825 goto done;
571 */ 826 }
572 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); 827
573 apic_write(APIC_LVTPC, APIC_DM_NMI); 828 /* only Intel uses the cccr msr */
574 } else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { 829 if (wd->cccr_msr != 0) {
575 /* 830 /*
576 * For Intel based architectural perfmon 831 * P4 quirks:
577 * - LVTPC is masked on interrupt and must be 832 * - An overflown perfctr will assert its interrupt
578 * unmasked by the LVTPC handler. 833 * until the OVF flag in its CCCR is cleared.
834 * - LVTPC is masked on interrupt and must be
835 * unmasked by the LVTPC handler.
836 */
837 rdmsrl(wd->cccr_msr, dummy);
838 dummy &= ~P4_CCCR_OVF;
839 wrmsrl(wd->cccr_msr, dummy);
840 apic_write(APIC_LVTPC, APIC_DM_NMI);
841 } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
842 /*
843 * ArchPerfom/Core Duo needs to re-unmask
844 * the apic vector
845 */
846 apic_write(APIC_LVTPC, APIC_DM_NMI);
847 }
848 /* start the cycle over again */
849 wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
850 rc = 1;
851 } else if (nmi_watchdog == NMI_IO_APIC) {
852 /* don't know how to accurately check for this.
853 * just assume it was a watchdog timer interrupt
854 * This matches the old behaviour.
579 */ 855 */
580 apic_write(APIC_LVTPC, APIC_DM_NMI); 856 rc = 1;
581 } 857 } else
582 wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); 858 printk(KERN_WARNING "Unknown enabled NMI hardware?!\n");
583 } 859 }
860done:
861 return rc;
584} 862}
585 863
586static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu)
587{
588 return 0;
589}
590
591static nmi_callback_t nmi_callback = dummy_nmi_callback;
592
593asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) 864asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
594{ 865{
595 int cpu = safe_smp_processor_id();
596
597 nmi_enter(); 866 nmi_enter();
598 add_pda(__nmi_count,1); 867 add_pda(__nmi_count,1);
599 if (!rcu_dereference(nmi_callback)(regs, cpu)) 868 default_do_nmi(regs);
600 default_do_nmi(regs);
601 nmi_exit(); 869 nmi_exit();
602} 870}
603 871
604void set_nmi_callback(nmi_callback_t callback) 872int do_nmi_callback(struct pt_regs * regs, int cpu)
605{ 873{
606 vmalloc_sync_all(); 874#ifdef CONFIG_SYSCTL
607 rcu_assign_pointer(nmi_callback, callback); 875 if (unknown_nmi_panic)
608} 876 return unknown_nmi_panic_callback(regs, cpu);
609EXPORT_SYMBOL_GPL(set_nmi_callback); 877#endif
610 878 return 0;
611void unset_nmi_callback(void)
612{
613 nmi_callback = dummy_nmi_callback;
614} 879}
615EXPORT_SYMBOL_GPL(unset_nmi_callback);
616 880
617#ifdef CONFIG_SYSCTL 881#ifdef CONFIG_SYSCTL
618 882
@@ -621,36 +885,42 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
621 unsigned char reason = get_nmi_reason(); 885 unsigned char reason = get_nmi_reason();
622 char buf[64]; 886 char buf[64];
623 887
624 if (!(reason & 0xc0)) { 888 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
625 sprintf(buf, "NMI received for unknown reason %02x\n", reason); 889 die_nmi(buf, regs, 1); /* Always panic here */
626 die_nmi(buf,regs);
627 }
628 return 0; 890 return 0;
629} 891}
630 892
631/* 893/*
632 * proc handler for /proc/sys/kernel/unknown_nmi_panic 894 * proc handler for /proc/sys/kernel/nmi
633 */ 895 */
634int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, 896int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
635 void __user *buffer, size_t *length, loff_t *ppos) 897 void __user *buffer, size_t *length, loff_t *ppos)
636{ 898{
637 int old_state; 899 int old_state;
638 900
639 old_state = unknown_nmi_panic; 901 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
902 old_state = nmi_watchdog_enabled;
640 proc_dointvec(table, write, file, buffer, length, ppos); 903 proc_dointvec(table, write, file, buffer, length, ppos);
641 if (!!old_state == !!unknown_nmi_panic) 904 if (!!old_state == !!nmi_watchdog_enabled)
642 return 0; 905 return 0;
643 906
644 if (unknown_nmi_panic) { 907 if (atomic_read(&nmi_active) < 0) {
645 if (reserve_lapic_nmi() < 0) { 908 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
646 unknown_nmi_panic = 0; 909 return -EIO;
647 return -EBUSY; 910 }
648 } else { 911
649 set_nmi_callback(unknown_nmi_panic_callback); 912 /* if nmi_watchdog is not set yet, then set it */
650 } 913 nmi_watchdog_default();
914
915 if (nmi_watchdog == NMI_LOCAL_APIC) {
916 if (nmi_watchdog_enabled)
917 enable_lapic_nmi_watchdog();
918 else
919 disable_lapic_nmi_watchdog();
651 } else { 920 } else {
652 release_lapic_nmi(); 921 printk( KERN_WARNING
653 unset_nmi_callback(); 922 "NMI watchdog doesn't know what hardware to touch\n");
923 return -EIO;
654 } 924 }
655 return 0; 925 return 0;
656} 926}
@@ -659,8 +929,12 @@ int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file
659 929
660EXPORT_SYMBOL(nmi_active); 930EXPORT_SYMBOL(nmi_active);
661EXPORT_SYMBOL(nmi_watchdog); 931EXPORT_SYMBOL(nmi_watchdog);
662EXPORT_SYMBOL(reserve_lapic_nmi); 932EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
663EXPORT_SYMBOL(release_lapic_nmi); 933EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
934EXPORT_SYMBOL(reserve_perfctr_nmi);
935EXPORT_SYMBOL(release_perfctr_nmi);
936EXPORT_SYMBOL(reserve_evntsel_nmi);
937EXPORT_SYMBOL(release_evntsel_nmi);
664EXPORT_SYMBOL(disable_timer_nmi_watchdog); 938EXPORT_SYMBOL(disable_timer_nmi_watchdog);
665EXPORT_SYMBOL(enable_timer_nmi_watchdog); 939EXPORT_SYMBOL(enable_timer_nmi_watchdog);
666EXPORT_SYMBOL(touch_nmi_watchdog); 940EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 146924ba5df5..cfb09b07ae99 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -86,7 +86,8 @@
86 86
87#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ 87#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
88#define MAX_NUM_CHASSIS 8 /* max number of chassis */ 88#define MAX_NUM_CHASSIS 8 /* max number of chassis */
89#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ 89/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
90#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
90#define PHBS_PER_CALGARY 4 91#define PHBS_PER_CALGARY 4
91 92
92/* register offsets in Calgary's internal register space */ 93/* register offsets in Calgary's internal register space */
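For scale, the constant above works out to MAX_PHB_BUS_NUM = 8 PHBs * 8 chassis * 2 = 128, so the bus_info[] table introduced in the next hunk is a flat 128-entry array indexed directly by dev->bus->number.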
@@ -111,31 +112,49 @@ static const unsigned long phb_offsets[] = {
111 0xB000 /* PHB3 */ 112 0xB000 /* PHB3 */
112}; 113};
113 114
114static char bus_to_phb[MAX_PHB_BUS_NUM];
115void* tce_table_kva[MAX_PHB_BUS_NUM];
116unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; 115unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
117static int translate_empty_slots __read_mostly = 0; 116static int translate_empty_slots __read_mostly = 0;
118static int calgary_detected __read_mostly = 0; 117static int calgary_detected __read_mostly = 0;
119 118
120/* 119struct calgary_bus_info {
121 * the bitmap of PHBs the user requested that we disable 120 void *tce_space;
122 * translation on. 121 unsigned char translation_disabled;
123 */ 122 signed char phbid;
124static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); 123};
124
125static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
125 126
126static void tce_cache_blast(struct iommu_table *tbl); 127static void tce_cache_blast(struct iommu_table *tbl);
127 128
128/* enable this to stress test the chip's TCE cache */ 129/* enable this to stress test the chip's TCE cache */
129#ifdef CONFIG_IOMMU_DEBUG 130#ifdef CONFIG_IOMMU_DEBUG
130static inline void tce_cache_blast_stress(struct iommu_table *tbl) 131int debugging __read_mostly = 1;
132
133static inline unsigned long verify_bit_range(unsigned long* bitmap,
134 int expected, unsigned long start, unsigned long end)
131{ 135{
132 tce_cache_blast(tbl); 136 unsigned long idx = start;
137
138 BUG_ON(start >= end);
139
140 while (idx < end) {
141 if (!!test_bit(idx, bitmap) != expected)
142 return idx;
143 ++idx;
144 }
145
146 /* all bits have the expected value */
147 return ~0UL;
133} 148}
134#else 149#else /* debugging is disabled */
135static inline void tce_cache_blast_stress(struct iommu_table *tbl) 150int debugging __read_mostly = 0;
151
152static inline unsigned long verify_bit_range(unsigned long* bitmap,
153 int expected, unsigned long start, unsigned long end)
136{ 154{
155 return ~0UL;
137} 156}
138#endif /* BLAST_TCE_CACHE_ON_UNMAP */ 157#endif /* CONFIG_IOMMU_DEBUG */
139 158
140static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) 159static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
141{ 160{
@@ -149,7 +168,7 @@ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
149 168
150static inline int translate_phb(struct pci_dev* dev) 169static inline int translate_phb(struct pci_dev* dev)
151{ 170{
152 int disabled = test_bit(dev->bus->number, translation_disabled); 171 int disabled = bus_info[dev->bus->number].translation_disabled;
153 return !disabled; 172 return !disabled;
154} 173}
155 174
@@ -158,6 +177,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
158{ 177{
159 unsigned long index; 178 unsigned long index;
160 unsigned long end; 179 unsigned long end;
180 unsigned long badbit;
161 181
162 index = start_addr >> PAGE_SHIFT; 182 index = start_addr >> PAGE_SHIFT;
163 183
@@ -169,14 +189,15 @@ static void iommu_range_reserve(struct iommu_table *tbl,
169 if (end > tbl->it_size) /* don't go off the table */ 189 if (end > tbl->it_size) /* don't go off the table */
170 end = tbl->it_size; 190 end = tbl->it_size;
171 191
172 while (index < end) { 192 badbit = verify_bit_range(tbl->it_map, 0, index, end);
173 if (test_bit(index, tbl->it_map)) 193 if (badbit != ~0UL) {
194 if (printk_ratelimit())
174 printk(KERN_ERR "Calgary: entry already allocated at " 195 printk(KERN_ERR "Calgary: entry already allocated at "
175 "0x%lx tbl %p dma 0x%lx npages %u\n", 196 "0x%lx tbl %p dma 0x%lx npages %u\n",
176 index, tbl, start_addr, npages); 197 badbit, tbl, start_addr, npages);
177 ++index;
178 } 198 }
179 set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages); 199
200 set_bit_string(tbl->it_map, index, npages);
180} 201}
181 202
182static unsigned long iommu_range_alloc(struct iommu_table *tbl, 203static unsigned long iommu_range_alloc(struct iommu_table *tbl,
@@ -243,7 +264,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
243 unsigned int npages) 264 unsigned int npages)
244{ 265{
245 unsigned long entry; 266 unsigned long entry;
246 unsigned long i; 267 unsigned long badbit;
247 268
248 entry = dma_addr >> PAGE_SHIFT; 269 entry = dma_addr >> PAGE_SHIFT;
249 270
@@ -251,16 +272,15 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
251 272
252 tce_free(tbl, entry, npages); 273 tce_free(tbl, entry, npages);
253 274
254 for (i = 0; i < npages; ++i) { 275 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
255 if (!test_bit(entry + i, tbl->it_map)) 276 if (badbit != ~0UL) {
277 if (printk_ratelimit())
256 printk(KERN_ERR "Calgary: bit is off at 0x%lx " 278 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
257 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", 279 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
258 entry + i, tbl, dma_addr, entry, npages); 280 badbit, tbl, dma_addr, entry, npages);
259 } 281 }
260 282
261 __clear_bit_string(tbl->it_map, entry, npages); 283 __clear_bit_string(tbl->it_map, entry, npages);
262
263 tce_cache_blast_stress(tbl);
264} 284}
265 285
266static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 286static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -454,7 +474,7 @@ static struct dma_mapping_ops calgary_dma_ops = {
454 474
455static inline int busno_to_phbid(unsigned char num) 475static inline int busno_to_phbid(unsigned char num)
456{ 476{
457 return bus_to_phb[num]; 477 return bus_info[num].phbid;
458} 478}
459 479
460static inline unsigned long split_queue_offset(unsigned char num) 480static inline unsigned long split_queue_offset(unsigned char num)
@@ -631,6 +651,10 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
631 if (ret) 651 if (ret)
632 return ret; 652 return ret;
633 653
654 tbl = dev->sysdata;
655 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
656 tce_free(tbl, 0, tbl->it_size);
657
634 calgary_reserve_regions(dev); 658 calgary_reserve_regions(dev);
635 659
636 /* set TARs for each PHB */ 660 /* set TARs for each PHB */
@@ -654,11 +678,12 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
654 return 0; 678 return 0;
655} 679}
656 680
657static void __init calgary_free_tar(struct pci_dev *dev) 681static void __init calgary_free_bus(struct pci_dev *dev)
658{ 682{
659 u64 val64; 683 u64 val64;
660 struct iommu_table *tbl = dev->sysdata; 684 struct iommu_table *tbl = dev->sysdata;
661 void __iomem *target; 685 void __iomem *target;
686 unsigned int bitmapsz;
662 687
663 target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); 688 target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
664 val64 = be64_to_cpu(readq(target)); 689 val64 = be64_to_cpu(readq(target));
@@ -666,8 +691,15 @@ static void __init calgary_free_tar(struct pci_dev *dev)
666 writeq(cpu_to_be64(val64), target); 691 writeq(cpu_to_be64(val64), target);
667 readq(target); /* flush */ 692 readq(target); /* flush */
668 693
694 bitmapsz = tbl->it_size / BITS_PER_BYTE;
695 free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
696 tbl->it_map = NULL;
697
669 kfree(tbl); 698 kfree(tbl);
670 dev->sysdata = NULL; 699 dev->sysdata = NULL;
700
701 /* Can't free bootmem allocated memory after system is up :-( */
702 bus_info[dev->bus->number].tce_space = NULL;
671} 703}
672 704
673static void calgary_watchdog(unsigned long data) 705static void calgary_watchdog(unsigned long data)
@@ -772,12 +804,11 @@ static inline unsigned int __init locate_register_space(struct pci_dev *dev)
772 return address; 804 return address;
773} 805}
774 806
775static int __init calgary_init_one_nontraslated(struct pci_dev *dev) 807static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
776{ 808{
809 pci_dev_get(dev);
777 dev->sysdata = NULL; 810 dev->sysdata = NULL;
778 dev->bus->self = dev; 811 dev->bus->self = dev;
779
780 return 0;
781} 812}
782 813
783static int __init calgary_init_one(struct pci_dev *dev) 814static int __init calgary_init_one(struct pci_dev *dev)
@@ -798,6 +829,7 @@ static int __init calgary_init_one(struct pci_dev *dev)
798 if (ret) 829 if (ret)
799 goto iounmap; 830 goto iounmap;
800 831
832 pci_dev_get(dev);
801 dev->bus->self = dev; 833 dev->bus->self = dev;
802 calgary_enable_translation(dev); 834 calgary_enable_translation(dev);
803 835
@@ -824,10 +856,9 @@ static int __init calgary_init(void)
824 calgary_init_one_nontraslated(dev); 856 calgary_init_one_nontraslated(dev);
825 continue; 857 continue;
826 } 858 }
827 if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { 859 if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
828 pci_dev_put(dev);
829 continue; 860 continue;
830 } 861
831 ret = calgary_init_one(dev); 862 ret = calgary_init_one(dev);
832 if (ret) 863 if (ret)
833 goto error; 864 goto error;
@@ -840,15 +871,18 @@ error:
840 dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, 871 dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM,
841 PCI_DEVICE_ID_IBM_CALGARY, 872 PCI_DEVICE_ID_IBM_CALGARY,
842 dev); 873 dev);
874 if (!dev)
875 break;
843 if (!translate_phb(dev)) { 876 if (!translate_phb(dev)) {
844 pci_dev_put(dev); 877 pci_dev_put(dev);
845 continue; 878 continue;
846 } 879 }
847 if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) 880 if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
848 continue; 881 continue;
882
849 calgary_disable_translation(dev); 883 calgary_disable_translation(dev);
850 calgary_free_tar(dev); 884 calgary_free_bus(dev);
851 pci_dev_put(dev); 885 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
852 } 886 }
853 887
854 return ret; 888 return ret;
@@ -890,13 +924,15 @@ void __init detect_calgary(void)
890 if (swiotlb || no_iommu || iommu_detected) 924 if (swiotlb || no_iommu || iommu_detected)
891 return; 925 return;
892 926
927 if (!early_pci_allowed())
928 return;
929
893 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); 930 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
894 931
895 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 932 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
896 int dev; 933 int dev;
897 934 struct calgary_bus_info *info = &bus_info[bus];
898 tce_table_kva[bus] = NULL; 935 info->phbid = -1;
899 bus_to_phb[bus] = -1;
900 936
901 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) 937 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
902 continue; 938 continue;
@@ -907,12 +943,9 @@ void __init detect_calgary(void)
907 */ 943 */
908 phb = (phb + 1) % PHBS_PER_CALGARY; 944 phb = (phb + 1) % PHBS_PER_CALGARY;
909 945
910 if (test_bit(bus, translation_disabled)) { 946 if (info->translation_disabled)
911 printk(KERN_INFO "Calgary: translation is disabled for "
912 "PHB 0x%x\n", bus);
913 /* skip this phb, don't allocate a tbl for it */
914 continue; 947 continue;
915 } 948
916 /* 949 /*
917 * Scan the slots of the PCI bus to see if there is a device present. 950 * Scan the slots of the PCI bus to see if there is a device present.
918 * The parent bus will be the zero-ith device, so start at 1. 951 * The parent bus will be the zero-ith device, so start at 1.
@@ -923,8 +956,8 @@ void __init detect_calgary(void)
923 tbl = alloc_tce_table(); 956 tbl = alloc_tce_table();
924 if (!tbl) 957 if (!tbl)
925 goto cleanup; 958 goto cleanup;
926 tce_table_kva[bus] = tbl; 959 info->tce_space = tbl;
927 bus_to_phb[bus] = phb; 960 info->phbid = phb;
928 calgary_found = 1; 961 calgary_found = 1;
929 break; 962 break;
930 } 963 }
@@ -934,15 +967,20 @@ void __init detect_calgary(void)
934 if (calgary_found) { 967 if (calgary_found) {
935 iommu_detected = 1; 968 iommu_detected = 1;
936 calgary_detected = 1; 969 calgary_detected = 1;
937 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " 970 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
938 "TCE table spec is %d.\n", specified_table_size); 971 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
972 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
973 debugging ? "enabled" : "disabled");
939 } 974 }
940 return; 975 return;
941 976
942cleanup: 977cleanup:
943 for (--bus; bus >= 0; --bus) 978 for (--bus; bus >= 0; --bus) {
944 if (tce_table_kva[bus]) 979 struct calgary_bus_info *info = &bus_info[bus];
945 free_tce_table(tce_table_kva[bus]); 980
981 if (info->tce_space)
982 free_tce_table(info->tce_space);
983 }
946} 984}
947 985
948int __init calgary_iommu_init(void) 986int __init calgary_iommu_init(void)
@@ -1016,7 +1054,7 @@ static int __init calgary_parse_options(char *p)
1016 if (bridge < MAX_PHB_BUS_NUM) { 1054 if (bridge < MAX_PHB_BUS_NUM) {
1017 printk(KERN_INFO "Calgary: disabling " 1055 printk(KERN_INFO "Calgary: disabling "
1018 "translation for PHB 0x%x\n", bridge); 1056 "translation for PHB 0x%x\n", bridge);
1019 set_bit(bridge, translation_disabled); 1057 bus_info[bridge].translation_disabled = 1;
1020 } 1058 }
1021 } 1059 }
1022 1060
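
The pci-calgary.c changes above fold the old per-bus arrays (tce_table_kva[], bus_to_phb[] and the translation_disabled bitmap) into a single bus_info[] array of struct calgary_bus_info, and route the allocation-bitmap sanity checks through verify_bit_range(), whose callers treat ~0UL as "every bit matched". A minimal sketch of those two pieces; the struct fields are the ones used in the diff, but the helper body below is written from its observed usage, not copied from the kernel:

	struct calgary_bus_info {
		void *tce_space;		/* bootmem TCE table for this bus, or NULL */
		unsigned char translation_disabled;
		signed char phbid;		/* PHB index on the chip, -1 if unknown */
	};

	/* Assumed semantics: return the first bit in [start, end) whose value
	 * differs from 'expected', or ~0UL when the whole range is consistent. */
	static unsigned long verify_bit_range(unsigned long *bitmap, int expected,
					      unsigned long start, unsigned long end)
	{
		unsigned long idx;

		for (idx = start; idx < end; idx++)
			if (!!test_bit(idx, bitmap) != expected)
				return idx;
		return ~0UL;			/* all bits had the expected value */
	}
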
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 9c44f4f2433d..4dcb671bd19f 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -236,6 +236,9 @@ __init int iommu_setup(char *p)
236{ 236{
237 iommu_merge = 1; 237 iommu_merge = 1;
238 238
239 if (!p)
240 return -EINVAL;
241
239 while (*p) { 242 while (*p) {
240 if (!strncmp(p,"off",3)) 243 if (!strncmp(p,"off",3))
241 no_iommu = 1; 244 no_iommu = 1;
@@ -278,9 +281,9 @@ __init int iommu_setup(char *p)
278 if (*p == ',') 281 if (*p == ',')
279 ++p; 282 ++p;
280 } 283 }
281 return 1; 284 return 0;
282} 285}
283__setup("iommu=", iommu_setup); 286early_param("iommu", iommu_setup);
284 287
285void __init pci_iommu_alloc(void) 288void __init pci_iommu_alloc(void)
286{ 289{
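
The iommu= conversion above is the pattern repeated throughout this series: __setup() handlers run late and return 1 to claim an option, while early_param() handlers are invoked from parse_early_param() before most of setup_arch(), may be handed a NULL value, and return 0 on success (non-zero is treated as a parse failure). A minimal sketch of the convention, using a hypothetical "widget=" option and state variable:

	static int widget_mode __initdata;		/* hypothetical option state */

	static int __init widget_setup(char *str)
	{
		if (!str)				/* "widget" given with no value */
			return -EINVAL;
		if (!strcmp(str, "off"))
			widget_mode = 0;
		else if (!strcmp(str, "force"))
			widget_mode = 2;
		return 0;				/* 0 = parsed successfully */
	}
	early_param("widget", widget_setup);
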
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 6d3e61baf7a0..16261a8a3303 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -239,8 +239,6 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
239{ 239{
240 unsigned long phys_mem, bus; 240 unsigned long phys_mem, bus;
241 241
242 BUG_ON(dir == DMA_NONE);
243
244 if (!dev) 242 if (!dev)
245 dev = &fallback_dev; 243 dev = &fallback_dev;
246 244
@@ -383,7 +381,6 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
383 unsigned long pages = 0; 381 unsigned long pages = 0;
384 int need = 0, nextneed; 382 int need = 0, nextneed;
385 383
386 BUG_ON(dir == DMA_NONE);
387 if (nents == 0) 384 if (nents == 0)
388 return 0; 385 return 0;
389 386
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index aad7609d8e92..df09ab05a1bd 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -59,7 +59,6 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
59{ 59{
60 int i; 60 int i;
61 61
62 BUG_ON(direction == DMA_NONE);
63 for (i = 0; i < nents; i++ ) { 62 for (i = 0; i < nents; i++ ) {
64 struct scatterlist *s = &sg[i]; 63 struct scatterlist *s = &sg[i];
65 BUG_ON(!s->page); 64 BUG_ON(!s->page);
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index bb6745d13b8f..458006ae19f3 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -80,25 +80,25 @@ void idle_notifier_unregister(struct notifier_block *n)
80} 80}
81EXPORT_SYMBOL(idle_notifier_unregister); 81EXPORT_SYMBOL(idle_notifier_unregister);
82 82
83enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
84static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
85
86void enter_idle(void) 83void enter_idle(void)
87{ 84{
88 __get_cpu_var(idle_state) = CPU_IDLE; 85 write_pda(isidle, 1);
89 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 86 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
90} 87}
91 88
92static void __exit_idle(void) 89static void __exit_idle(void)
93{ 90{
94 __get_cpu_var(idle_state) = CPU_NOT_IDLE; 91 if (read_pda(isidle) == 0)
92 return;
93 write_pda(isidle, 0);
95 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 94 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
96} 95}
97 96
98/* Called from interrupts to signify idle end */ 97/* Called from interrupts to signify idle end */
99void exit_idle(void) 98void exit_idle(void)
100{ 99{
101 if (current->pid | read_pda(irqcount)) 100 /* idle loop has pid 0 */
101 if (current->pid)
102 return; 102 return;
103 __exit_idle(); 103 __exit_idle();
104} 104}
@@ -220,6 +220,9 @@ void cpu_idle (void)
220 play_dead(); 220 play_dead();
221 enter_idle(); 221 enter_idle();
222 idle(); 222 idle();
223 /* In many cases the interrupt that ended idle
224 has already called exit_idle. But some idle
225 loops can be woken up without interrupt. */
223 __exit_idle(); 226 __exit_idle();
224 } 227 }
225 228
@@ -350,6 +353,7 @@ void exit_thread(void)
350 353
351 kfree(t->io_bitmap_ptr); 354 kfree(t->io_bitmap_ptr);
352 t->io_bitmap_ptr = NULL; 355 t->io_bitmap_ptr = NULL;
356 clear_thread_flag(TIF_IO_BITMAP);
353 /* 357 /*
354 * Careful, clear this in the TSS too: 358 * Careful, clear this in the TSS too:
355 */ 359 */
@@ -369,6 +373,7 @@ void flush_thread(void)
369 if (t->flags & _TIF_IA32) 373 if (t->flags & _TIF_IA32)
370 current_thread_info()->status |= TS_COMPAT; 374 current_thread_info()->status |= TS_COMPAT;
371 } 375 }
376 t->flags &= ~_TIF_DEBUG;
372 377
373 tsk->thread.debugreg0 = 0; 378 tsk->thread.debugreg0 = 0;
374 tsk->thread.debugreg1 = 0; 379 tsk->thread.debugreg1 = 0;
@@ -461,7 +466,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
461 asm("mov %%es,%0" : "=m" (p->thread.es)); 466 asm("mov %%es,%0" : "=m" (p->thread.es));
462 asm("mov %%ds,%0" : "=m" (p->thread.ds)); 467 asm("mov %%ds,%0" : "=m" (p->thread.ds));
463 468
464 if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 469 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
465 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 470 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
466 if (!p->thread.io_bitmap_ptr) { 471 if (!p->thread.io_bitmap_ptr) {
467 p->thread.io_bitmap_max = 0; 472 p->thread.io_bitmap_max = 0;
@@ -469,6 +474,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
469 } 474 }
470 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, 475 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
471 IO_BITMAP_BYTES); 476 IO_BITMAP_BYTES);
477 set_tsk_thread_flag(p, TIF_IO_BITMAP);
472 } 478 }
473 479
474 /* 480 /*
@@ -498,6 +504,40 @@ out:
498 */ 504 */
499#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) 505#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
500 506
507static inline void __switch_to_xtra(struct task_struct *prev_p,
508 struct task_struct *next_p,
509 struct tss_struct *tss)
510{
511 struct thread_struct *prev, *next;
512
513 prev = &prev_p->thread,
514 next = &next_p->thread;
515
516 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
517 loaddebug(next, 0);
518 loaddebug(next, 1);
519 loaddebug(next, 2);
520 loaddebug(next, 3);
521 /* no 4 and 5 */
522 loaddebug(next, 6);
523 loaddebug(next, 7);
524 }
525
526 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
527 /*
528 * Copy the relevant range of the IO bitmap.
529 * Normally this is 128 bytes or less:
530 */
531 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
532 max(prev->io_bitmap_max, next->io_bitmap_max));
533 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
534 /*
535 * Clear any possible leftover bits:
536 */
537 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
538 }
539}
540
501/* 541/*
502 * switch_to(x,y) should switch tasks from x to y. 542 * switch_to(x,y) should switch tasks from x to y.
503 * 543 *
@@ -515,6 +555,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
515 int cpu = smp_processor_id(); 555 int cpu = smp_processor_id();
516 struct tss_struct *tss = &per_cpu(init_tss, cpu); 556 struct tss_struct *tss = &per_cpu(init_tss, cpu);
517 557
558 /* we're going to use this soon, after a few expensive things */
559 if (next_p->fpu_counter>5)
560 prefetch(&next->i387.fxsave);
561
518 /* 562 /*
519 * Reload esp0, LDT and the page table pointer: 563 * Reload esp0, LDT and the page table pointer:
520 */ 564 */
@@ -583,41 +627,29 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
583 And the AMD workaround requires it to be after DS reload. */ 627 And the AMD workaround requires it to be after DS reload. */
584 unlazy_fpu(prev_p); 628 unlazy_fpu(prev_p);
585 write_pda(kernelstack, 629 write_pda(kernelstack,
586 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
587 631#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary);
588 /* 633 /*
589 * Now maybe reload the debug registers 634 * Build time only check to make sure the stack_canary is at
635 * offset 40 in the pda; this is a gcc ABI requirement
590 */ 636 */
591 if (unlikely(next->debugreg7)) { 637 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
592 loaddebug(next, 0); 638#endif
593 loaddebug(next, 1);
594 loaddebug(next, 2);
595 loaddebug(next, 3);
596 /* no 4 and 5 */
597 loaddebug(next, 6);
598 loaddebug(next, 7);
599 }
600
601 639
602 /* 640 /*
603 * Handle the IO bitmap 641 * Now maybe reload the debug registers and handle I/O bitmaps
604 */ 642 */
605 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { 643 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
606 if (next->io_bitmap_ptr) 644 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
607 /* 645 __switch_to_xtra(prev_p, next_p, tss);
608 * Copy the relevant range of the IO bitmap.
609 * Normally this is 128 bytes or less:
610 */
611 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
612 max(prev->io_bitmap_max, next->io_bitmap_max));
613 else {
614 /*
615 * Clear any possible leftover bits:
616 */
617 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
618 }
619 }
620 646
647 /* If the task has used fpu the last 5 timeslices, just do a full
648 * restore of the math state immediately to avoid the trap; the
649 * chances of needing FPU soon are obviously high now
650 */
651 if (next_p->fpu_counter>5)
652 math_state_restore();
621 return prev_p; 653 return prev_p;
622} 654}
623 655
@@ -834,7 +866,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
834 866
835unsigned long arch_align_stack(unsigned long sp) 867unsigned long arch_align_stack(unsigned long sp)
836{ 868{
837 if (randomize_va_space) 869 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
838 sp -= get_random_int() % 8192; 870 sp -= get_random_int() % 8192;
839 return sp & ~0xf; 871 return sp & ~0xf;
840} 872}
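
The process.c rework above keeps slow-path state out of the common context-switch path: debug registers and the I/O permission bitmap are now handled in __switch_to_xtra(), which only runs when the relevant thread flags are set, and those flags are maintained where the state actually changes (ptrace for TIF_DEBUG, ioperm/copy_thread/exit_thread for TIF_IO_BITMAP). A condensed sketch of the gate; the _TIF_WORK_CTXSW definition is an assumption here, since it lives in thread_info.h outside this diff:

	/* Assumed to collect the flags needing switch-time work. */
	#define _TIF_WORK_CTXSW (_TIF_DEBUG | _TIF_IO_BITMAP)

		/* Fast path only reads the flag words; prev's TIF_IO_BITMAP is
		 * checked too so leftover bits get cleared out of the TSS. */
		if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) ||
			     test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)))
			__switch_to_xtra(prev_p, next_p, tss);
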
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 2d50024c9f30..addc14af0c56 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -116,17 +116,17 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r
116 return addr; 116 return addr;
117} 117}
118 118
119static int is_at_popf(struct task_struct *child, struct pt_regs *regs) 119static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
120{ 120{
121 int i, copied; 121 int i, copied;
122 unsigned char opcode[16]; 122 unsigned char opcode[15];
123 unsigned long addr = convert_rip_to_linear(child, regs); 123 unsigned long addr = convert_rip_to_linear(child, regs);
124 124
125 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); 125 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
126 for (i = 0; i < copied; i++) { 126 for (i = 0; i < copied; i++) {
127 switch (opcode[i]) { 127 switch (opcode[i]) {
128 /* popf */ 128 /* popf and iret */
129 case 0x9d: 129 case 0x9d: case 0xcf:
130 return 1; 130 return 1;
131 131
132 /* CHECKME: 64 65 */ 132 /* CHECKME: 64 65 */
@@ -138,14 +138,17 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs)
138 case 0x26: case 0x2e: 138 case 0x26: case 0x2e:
139 case 0x36: case 0x3e: 139 case 0x36: case 0x3e:
140 case 0x64: case 0x65: 140 case 0x64: case 0x65:
141 case 0xf0: case 0xf2: case 0xf3: 141 case 0xf2: case 0xf3:
142 continue; 142 continue;
143 143
144 /* REX prefixes */
145 case 0x40 ... 0x4f: 144 case 0x40 ... 0x4f:
145 if (regs->cs != __USER_CS)
146 /* 32-bit mode: register increment */
147 return 0;
148 /* 64-bit mode: REX prefix */
146 continue; 149 continue;
147 150
148 /* CHECKME: f0, f2, f3 */ 151 /* CHECKME: f2, f3 */
149 152
150 /* 153 /*
151 * pushf: NOTE! We should probably not let 154 * pushf: NOTE! We should probably not let
@@ -186,10 +189,8 @@ static void set_singlestep(struct task_struct *child)
186 * ..but if TF is changed by the instruction we will trace, 189 * ..but if TF is changed by the instruction we will trace,
187 * don't mark it as being "us" that set it, so that we 190 * don't mark it as being "us" that set it, so that we
188 * won't clear it by hand later. 191 * won't clear it by hand later.
189 *
190 * AK: this is not enough, LAHF and IRET can change TF in user space too.
191 */ 192 */
192 if (is_at_popf(child, regs)) 193 if (is_setting_trap_flag(child, regs))
193 return; 194 return;
194 195
195 child->ptrace |= PT_DTRACE; 196 child->ptrace |= PT_DTRACE;
@@ -420,9 +421,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
420 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) 421 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
421 break; 422 break;
422 if (i == 4) { 423 if (i == 4) {
423 child->thread.debugreg7 = data; 424 child->thread.debugreg7 = data;
425 if (data)
426 set_tsk_thread_flag(child, TIF_DEBUG);
427 else
428 clear_tsk_thread_flag(child, TIF_DEBUG);
424 ret = 0; 429 ret = 0;
425 } 430 }
426 break; 431 break;
427 } 432 }
428 break; 433 break;
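
is_setting_trap_flag() above scans up to 15 opcode bytes at the tracee's RIP and decides whether the next instruction can change EFLAGS.TF on its own (popf, iret), skipping legacy prefixes; bytes 0x40-0x4f count as REX prefixes only when the tracee runs in 64-bit mode, because in compatibility mode they are inc/dec instructions. A compressed, hypothetical helper illustrating that decode loop (segment prefixes, handled by the real function as shown in the diff, are omitted here):

	/* Sketch: classify the instruction in 'opcode' (bytes already copied
	 * from user space with access_process_vm()). */
	static int sets_tf(unsigned char *opcode, int len, int user_cs_is_64bit)
	{
		int i;

		for (i = 0; i < len; i++) {
			switch (opcode[i]) {
			case 0x9d:			/* popf */
			case 0xcf:			/* iret */
				return 1;
			case 0x66: case 0x67:		/* operand/address size */
			case 0xf2: case 0xf3:		/* rep prefixes */
				continue;
			case 0x40 ... 0x4f:		/* REX, 64-bit mode only */
				if (!user_cs_is_64bit)
					return 0;	/* 32-bit: inc/dec insn */
				continue;
			default:
				return 0;
			}
		}
		return 0;
	}
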
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
index d24fa9b72a2b..14e95872c6a3 100644
--- a/arch/x86_64/kernel/relocate_kernel.S
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -7,31 +7,169 @@
7 */ 7 */
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/page.h>
11#include <asm/kexec.h>
10 12
11 /* 13/*
12 * Must be relocatable PIC code callable as a C function, that once 14 * Must be relocatable PIC code callable as a C function
13 * it starts can not use the previous processes stack. 15 */
14 */ 16
15 .globl relocate_new_kernel 17#define PTR(x) (x << 3)
18#define PAGE_ALIGNED (1 << PAGE_SHIFT)
19#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
20
21 .text
22 .align PAGE_ALIGNED
16 .code64 23 .code64
24 .globl relocate_kernel
25relocate_kernel:
26 /* %rdi indirection_page
27 * %rsi page_list
28 * %rdx start address
29 */
30
31 /* map the control page at its virtual address */
32
33 movq $0x0000ff8000000000, %r10 /* mask */
34 mov $(39 - 3), %cl /* bits to shift */
35 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
36
37 movq %r11, %r9
38 andq %r10, %r9
39 shrq %cl, %r9
40
41 movq PTR(VA_PGD)(%rsi), %r8
42 addq %r8, %r9
43 movq PTR(PA_PUD_0)(%rsi), %r8
44 orq $PAGE_ATTR, %r8
45 movq %r8, (%r9)
46
47 shrq $9, %r10
48 sub $9, %cl
49
50 movq %r11, %r9
51 andq %r10, %r9
52 shrq %cl, %r9
53
54 movq PTR(VA_PUD_0)(%rsi), %r8
55 addq %r8, %r9
56 movq PTR(PA_PMD_0)(%rsi), %r8
57 orq $PAGE_ATTR, %r8
58 movq %r8, (%r9)
59
60 shrq $9, %r10
61 sub $9, %cl
62
63 movq %r11, %r9
64 andq %r10, %r9
65 shrq %cl, %r9
66
67 movq PTR(VA_PMD_0)(%rsi), %r8
68 addq %r8, %r9
69 movq PTR(PA_PTE_0)(%rsi), %r8
70 orq $PAGE_ATTR, %r8
71 movq %r8, (%r9)
72
73 shrq $9, %r10
74 sub $9, %cl
75
76 movq %r11, %r9
77 andq %r10, %r9
78 shrq %cl, %r9
79
80 movq PTR(VA_PTE_0)(%rsi), %r8
81 addq %r8, %r9
82 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
83 orq $PAGE_ATTR, %r8
84 movq %r8, (%r9)
85
86 /* identity map the control page at its physical address */
87
88 movq $0x0000ff8000000000, %r10 /* mask */
89 mov $(39 - 3), %cl /* bits to shift */
90 movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
91
92 movq %r11, %r9
93 andq %r10, %r9
94 shrq %cl, %r9
95
96 movq PTR(VA_PGD)(%rsi), %r8
97 addq %r8, %r9
98 movq PTR(PA_PUD_1)(%rsi), %r8
99 orq $PAGE_ATTR, %r8
100 movq %r8, (%r9)
101
102 shrq $9, %r10
103 sub $9, %cl
104
105 movq %r11, %r9
106 andq %r10, %r9
107 shrq %cl, %r9
108
109 movq PTR(VA_PUD_1)(%rsi), %r8
110 addq %r8, %r9
111 movq PTR(PA_PMD_1)(%rsi), %r8
112 orq $PAGE_ATTR, %r8
113 movq %r8, (%r9)
114
115 shrq $9, %r10
116 sub $9, %cl
117
118 movq %r11, %r9
119 andq %r10, %r9
120 shrq %cl, %r9
121
122 movq PTR(VA_PMD_1)(%rsi), %r8
123 addq %r8, %r9
124 movq PTR(PA_PTE_1)(%rsi), %r8
125 orq $PAGE_ATTR, %r8
126 movq %r8, (%r9)
127
128 shrq $9, %r10
129 sub $9, %cl
130
131 movq %r11, %r9
132 andq %r10, %r9
133 shrq %cl, %r9
134
135 movq PTR(VA_PTE_1)(%rsi), %r8
136 addq %r8, %r9
137 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
138 orq $PAGE_ATTR, %r8
139 movq %r8, (%r9)
140
17relocate_new_kernel: 141relocate_new_kernel:
18 /* %rdi page_list 142 /* %rdi indirection_page
19 * %rsi reboot_code_buffer 143 * %rsi page_list
20 * %rdx start address 144 * %rdx start address
21 * %rcx page_table
22 * %r8 arg5
23 * %r9 arg6
24 */ 145 */
25 146
26 /* zero out flags, and disable interrupts */ 147 /* zero out flags, and disable interrupts */
27 pushq $0 148 pushq $0
28 popfq 149 popfq
29 150
30 /* set a new stack at the bottom of our page... */ 151 /* get physical address of control page now */
31 lea 4096(%rsi), %rsp 152 /* this is impossible after page table switch */
153 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
154
155 /* get physical address of page table now too */
156 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
32 157
33 /* store the parameters back on the stack */ 158 /* switch to new set of page tables */
34 pushq %rdx /* store the start address */ 159 movq PTR(PA_PGD)(%rsi), %r9
160 movq %r9, %cr3
161
162 /* setup a new stack at the end of the physical control page */
163 lea 4096(%r8), %rsp
164
165 /* jump to identity mapped page */
166 addq $(identity_mapped - relocate_kernel), %r8
167 pushq %r8
168 ret
169
170identity_mapped:
171 /* store the start address on the stack */
172 pushq %rdx
35 173
36 /* Set cr0 to a known state: 174 /* Set cr0 to a known state:
37 * 31 1 == Paging enabled 175 * 31 1 == Paging enabled
@@ -136,8 +274,3 @@ relocate_new_kernel:
136 xorq %r15, %r15 274 xorq %r15, %r15
137 275
138 ret 276 ret
139relocate_new_kernel_end:
140
141 .globl relocate_new_kernel_size
142relocate_new_kernel_size:
143 .quad relocate_new_kernel_end - relocate_new_kernel
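
The rewritten relocate_kernel above builds a small four-level page table by hand before jumping to the identity-mapped control page. Each repeated block computes (vaddr & mask) >> (shift - 3): with shift = 39 for the PGD, the result is the 9-bit table index already scaled by the 8-byte entry size, and every deeper level shifts the mask right by 9 and uses shift - 9 (30, 21, 12 for PUD, PMD, PTE). The entry written is the next level's physical address OR'ed with PAGE_ATTR (0x63 = present | rw | accessed | dirty). The same step expressed in C, purely as an illustration of what the assembly computes:

	#define PAGE_ATTR 0x63UL	/* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */

	/* Sketch: write one entry of the hand-built walk; 'shift' is 39, 30,
	 * 21 or 12 depending on the level being filled in. */
	static void set_table_entry(unsigned long *table, unsigned long vaddr,
				    unsigned long next_level_pa, int shift)
	{
		unsigned long idx = (vaddr >> shift) & 0x1ff;	/* 9 bits per level */

		table[idx] = next_level_pa | PAGE_ATTR;
	}
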
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 34afad704824..0b00bb2ea576 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -74,16 +74,6 @@ EXPORT_SYMBOL(boot_cpu_data);
74 74
75unsigned long mmu_cr4_features; 75unsigned long mmu_cr4_features;
76 76
77int acpi_disabled;
78EXPORT_SYMBOL(acpi_disabled);
79#ifdef CONFIG_ACPI
80extern int __initdata acpi_ht;
81extern acpi_interrupt_flags acpi_sci_flags;
82int __initdata acpi_force = 0;
83#endif
84
85int acpi_numa __initdata;
86
87/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 77/* Boot loader ID as an integer, for the benefit of proc_dointvec */
88int bootloader_type; 78int bootloader_type;
89 79
@@ -107,7 +97,6 @@ struct sys_desc_table_struct {
107 97
108struct edid_info edid_info; 98struct edid_info edid_info;
109EXPORT_SYMBOL_GPL(edid_info); 99EXPORT_SYMBOL_GPL(edid_info);
110struct e820map e820;
111 100
112extern int root_mountflags; 101extern int root_mountflags;
113 102
@@ -276,185 +265,22 @@ static void __init probe_roms(void)
276 } 265 }
277} 266}
278 267
279/* Check for full argument with no trailing characters */ 268#ifdef CONFIG_PROC_VMCORE
280static int fullarg(char *p, char *arg) 269/* elfcorehdr= specifies the location of elf core header
270 * stored by the crashed kernel. This option will be passed
271 * by kexec loader to the capture kernel.
272 */
273static int __init setup_elfcorehdr(char *arg)
281{ 274{
282 int l = strlen(arg); 275 char *end;
283 return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l])); 276 if (!arg)
277 return -EINVAL;
278 elfcorehdr_addr = memparse(arg, &end);
279 return end > arg ? 0 : -EINVAL;
284} 280}
285 281early_param("elfcorehdr", setup_elfcorehdr);
286static __init void parse_cmdline_early (char ** cmdline_p)
287{
288 char c = ' ', *to = command_line, *from = COMMAND_LINE;
289 int len = 0;
290 int userdef = 0;
291
292 for (;;) {
293 if (c != ' ')
294 goto next_char;
295
296#ifdef CONFIG_SMP
297 /*
298 * If the BIOS enumerates physical processors before logical,
299 * maxcpus=N at enumeration-time can be used to disable HT.
300 */
301 else if (!memcmp(from, "maxcpus=", 8)) {
302 extern unsigned int maxcpus;
303
304 maxcpus = simple_strtoul(from + 8, NULL, 0);
305 }
306#endif
307#ifdef CONFIG_ACPI
308 /* "acpi=off" disables both ACPI table parsing and interpreter init */
309 if (fullarg(from,"acpi=off"))
310 disable_acpi();
311
312 if (fullarg(from, "acpi=force")) {
313 /* add later when we do DMI horrors: */
314 acpi_force = 1;
315 acpi_disabled = 0;
316 }
317
318 /* acpi=ht just means: do ACPI MADT parsing
319 at bootup, but don't enable the full ACPI interpreter */
320 if (fullarg(from, "acpi=ht")) {
321 if (!acpi_force)
322 disable_acpi();
323 acpi_ht = 1;
324 }
325 else if (fullarg(from, "pci=noacpi"))
326 acpi_disable_pci();
327 else if (fullarg(from, "acpi=noirq"))
328 acpi_noirq_set();
329
330 else if (fullarg(from, "acpi_sci=edge"))
331 acpi_sci_flags.trigger = 1;
332 else if (fullarg(from, "acpi_sci=level"))
333 acpi_sci_flags.trigger = 3;
334 else if (fullarg(from, "acpi_sci=high"))
335 acpi_sci_flags.polarity = 1;
336 else if (fullarg(from, "acpi_sci=low"))
337 acpi_sci_flags.polarity = 3;
338
339 /* acpi=strict disables out-of-spec workarounds */
340 else if (fullarg(from, "acpi=strict")) {
341 acpi_strict = 1;
342 }
343#ifdef CONFIG_X86_IO_APIC
344 else if (fullarg(from, "acpi_skip_timer_override"))
345 acpi_skip_timer_override = 1;
346#endif
347#endif
348
349 if (fullarg(from, "disable_timer_pin_1"))
350 disable_timer_pin_1 = 1;
351 if (fullarg(from, "enable_timer_pin_1"))
352 disable_timer_pin_1 = -1;
353
354 if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
355 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
356 disable_apic = 1;
357 }
358
359 if (fullarg(from, "noapic"))
360 skip_ioapic_setup = 1;
361
362 if (fullarg(from,"apic")) {
363 skip_ioapic_setup = 0;
364 ioapic_force = 1;
365 }
366
367 if (!memcmp(from, "mem=", 4))
368 parse_memopt(from+4, &from);
369
370 if (!memcmp(from, "memmap=", 7)) {
371 /* exactmap option is for used defined memory */
372 if (!memcmp(from+7, "exactmap", 8)) {
373#ifdef CONFIG_CRASH_DUMP
374 /* If we are doing a crash dump, we
375 * still need to know the real mem
376 * size before original memory map is
377 * reset.
378 */
379 saved_max_pfn = e820_end_of_ram();
380#endif
381 from += 8+7;
382 end_pfn_map = 0;
383 e820.nr_map = 0;
384 userdef = 1;
385 }
386 else {
387 parse_memmapopt(from+7, &from);
388 userdef = 1;
389 }
390 }
391
392#ifdef CONFIG_NUMA
393 if (!memcmp(from, "numa=", 5))
394 numa_setup(from+5);
395#endif 282#endif
396 283
397 if (!memcmp(from,"iommu=",6)) {
398 iommu_setup(from+6);
399 }
400
401 if (fullarg(from,"oops=panic"))
402 panic_on_oops = 1;
403
404 if (!memcmp(from, "noexec=", 7))
405 nonx_setup(from + 7);
406
407#ifdef CONFIG_KEXEC
408 /* crashkernel=size@addr specifies the location to reserve for
409 * a crash kernel. By reserving this memory we guarantee
410 * that linux never set's it up as a DMA target.
411 * Useful for holding code to do something appropriate
412 * after a kernel panic.
413 */
414 else if (!memcmp(from, "crashkernel=", 12)) {
415 unsigned long size, base;
416 size = memparse(from+12, &from);
417 if (*from == '@') {
418 base = memparse(from+1, &from);
419 /* FIXME: Do I want a sanity check
420 * to validate the memory range?
421 */
422 crashk_res.start = base;
423 crashk_res.end = base + size - 1;
424 }
425 }
426#endif
427
428#ifdef CONFIG_PROC_VMCORE
429 /* elfcorehdr= specifies the location of elf core header
430 * stored by the crashed kernel. This option will be passed
431 * by kexec loader to the capture kernel.
432 */
433 else if(!memcmp(from, "elfcorehdr=", 11))
434 elfcorehdr_addr = memparse(from+11, &from);
435#endif
436
437#ifdef CONFIG_HOTPLUG_CPU
438 else if (!memcmp(from, "additional_cpus=", 16))
439 setup_additional_cpus(from+16);
440#endif
441
442 next_char:
443 c = *(from++);
444 if (!c)
445 break;
446 if (COMMAND_LINE_SIZE <= ++len)
447 break;
448 *(to++) = c;
449 }
450 if (userdef) {
451 printk(KERN_INFO "user-defined physical RAM map:\n");
452 e820_print_map("user");
453 }
454 *to = '\0';
455 *cmdline_p = command_line;
456}
457
458#ifndef CONFIG_NUMA 284#ifndef CONFIG_NUMA
459static void __init 285static void __init
460contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 286contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
@@ -466,7 +292,8 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
466 if (bootmap == -1L) 292 if (bootmap == -1L)
467 panic("Cannot find bootmem map of size %ld\n",bootmap_size); 293 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
468 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); 294 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
469 e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); 295 e820_register_active_regions(0, start_pfn, end_pfn);
296 free_bootmem_with_active_regions(0, end_pfn);
470 reserve_bootmem(bootmap, bootmap_size); 297 reserve_bootmem(bootmap, bootmap_size);
471} 298}
472#endif 299#endif
@@ -521,6 +348,8 @@ static void discover_ebda(void)
521 348
522void __init setup_arch(char **cmdline_p) 349void __init setup_arch(char **cmdline_p)
523{ 350{
351 printk(KERN_INFO "Command line: %s\n", saved_command_line);
352
524 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); 353 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
525 screen_info = SCREEN_INFO; 354 screen_info = SCREEN_INFO;
526 edid_info = EDID_INFO; 355 edid_info = EDID_INFO;
@@ -547,16 +376,22 @@ void __init setup_arch(char **cmdline_p)
547 data_resource.start = virt_to_phys(&_etext); 376 data_resource.start = virt_to_phys(&_etext);
548 data_resource.end = virt_to_phys(&_edata)-1; 377 data_resource.end = virt_to_phys(&_edata)-1;
549 378
550 parse_cmdline_early(cmdline_p);
551
552 early_identify_cpu(&boot_cpu_data); 379 early_identify_cpu(&boot_cpu_data);
553 380
381 strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
382 *cmdline_p = command_line;
383
384 parse_early_param();
385
386 finish_e820_parsing();
387
388 e820_register_active_regions(0, 0, -1UL);
554 /* 389 /*
555 * partially used pages are not usable - thus 390 * partially used pages are not usable - thus
556 * we are rounding upwards: 391 * we are rounding upwards:
557 */ 392 */
558 end_pfn = e820_end_of_ram(); 393 end_pfn = e820_end_of_ram();
559 num_physpages = end_pfn; /* for pfn_valid */ 394 num_physpages = end_pfn;
560 395
561 check_efer(); 396 check_efer();
562 397
@@ -576,6 +411,14 @@ void __init setup_arch(char **cmdline_p)
576 acpi_boot_table_init(); 411 acpi_boot_table_init();
577#endif 412#endif
578 413
414 /* How many end-of-memory variables you have, grandma! */
415 max_low_pfn = end_pfn;
416 max_pfn = end_pfn;
417 high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
418
419 /* Remove active ranges so rediscovery with NUMA-awareness happens */
420 remove_all_active_ranges();
421
579#ifdef CONFIG_ACPI_NUMA 422#ifdef CONFIG_ACPI_NUMA
580 /* 423 /*
581 * Parse SRAT to discover nodes. 424 * Parse SRAT to discover nodes.
@@ -625,12 +468,10 @@ void __init setup_arch(char **cmdline_p)
625 */ 468 */
626 acpi_reserve_bootmem(); 469 acpi_reserve_bootmem();
627#endif 470#endif
628#ifdef CONFIG_X86_LOCAL_APIC
629 /* 471 /*
630 * Find and reserve possible boot-time SMP configuration: 472 * Find and reserve possible boot-time SMP configuration:
631 */ 473 */
632 find_smp_config(); 474 find_smp_config();
633#endif
634#ifdef CONFIG_BLK_DEV_INITRD 475#ifdef CONFIG_BLK_DEV_INITRD
635 if (LOADER_TYPE && INITRD_START) { 476 if (LOADER_TYPE && INITRD_START) {
636 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { 477 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
@@ -657,7 +498,9 @@ void __init setup_arch(char **cmdline_p)
657 498
658 paging_init(); 499 paging_init();
659 500
660 check_ioapic(); 501#ifdef CONFIG_PCI
502 early_quirks();
503#endif
661 504
662 /* 505 /*
663 * set this early, so we dont allocate cpu0 506 * set this early, so we dont allocate cpu0
@@ -674,14 +517,12 @@ void __init setup_arch(char **cmdline_p)
674 517
675 init_cpu_to_node(); 518 init_cpu_to_node();
676 519
677#ifdef CONFIG_X86_LOCAL_APIC
678 /* 520 /*
679 * get boot-time SMP configuration: 521 * get boot-time SMP configuration:
680 */ 522 */
681 if (smp_found_config) 523 if (smp_found_config)
682 get_smp_config(); 524 get_smp_config();
683 init_apic_mappings(); 525 init_apic_mappings();
684#endif
685 526
686 /* 527 /*
687 * Request address space for all standard RAM and ROM resources 528 * Request address space for all standard RAM and ROM resources
@@ -689,6 +530,7 @@ void __init setup_arch(char **cmdline_p)
689 */ 530 */
690 probe_roms(); 531 probe_roms();
691 e820_reserve_resources(); 532 e820_reserve_resources();
533 e820_mark_nosave_regions();
692 534
693 request_resource(&iomem_resource, &video_ram_resource); 535 request_resource(&iomem_resource, &video_ram_resource);
694 536
@@ -838,7 +680,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
838#endif 680#endif
839} 681}
840 682
841static void __init init_amd(struct cpuinfo_x86 *c) 683static void __cpuinit init_amd(struct cpuinfo_x86 *c)
842{ 684{
843 unsigned level; 685 unsigned level;
844 686
@@ -894,6 +736,12 @@ static void __init init_amd(struct cpuinfo_x86 *c)
894 736
895 /* Fix cpuid4 emulation for more */ 737 /* Fix cpuid4 emulation for more */
896 num_cache_leaves = 3; 738 num_cache_leaves = 3;
739
740 /* When there is only one core no need to synchronize RDTSC */
741 if (num_possible_cpus() == 1)
742 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
743 else
744 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
897} 745}
898 746
899static void __cpuinit detect_ht(struct cpuinfo_x86 *c) 747static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -975,8 +823,7 @@ static void srat_detect_node(void)
975 node = first_node(node_online_map); 823 node = first_node(node_online_map);
976 numa_set_node(cpu, node); 824 numa_set_node(cpu, node);
977 825
978 if (acpi_numa > 0) 826 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
979 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
980#endif 827#endif
981} 828}
982 829
@@ -1010,6 +857,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
1010 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 857 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
1011 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 858 (c->x86 == 0x6 && c->x86_model >= 0x0e))
1012 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); 859 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
860 if (c->x86 == 6)
861 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
1013 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); 862 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
1014 c->x86_max_cores = intel_num_cpu_cores(c); 863 c->x86_max_cores = intel_num_cpu_cores(c);
1015 864
@@ -1228,8 +1077,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1228 1077
1229 /* Intel-defined (#2) */ 1078 /* Intel-defined (#2) */
1230 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", 1079 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
1231 "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, 1080 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
1232 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1081 NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
1233 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1082 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1234 1083
1235 /* VIA/Cyrix/Centaur-defined */ 1084 /* VIA/Cyrix/Centaur-defined */
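
The setup.c hunk above retires the hand-rolled parse_cmdline_early() scanner: setup_arch() now copies saved_command_line and calls parse_early_param(), and each option becomes its own early_param() handler like setup_elfcorehdr(). Options that take sizes go through memparse(), which accepts an optional K/M/G suffix and advances the end pointer past what it consumed. A minimal sketch of a size@addr style handler in that form; the option name and the reservation step are hypothetical, but the removed crashkernel= code followed the same shape:

	static int __init demo_region_setup(char *arg)
	{
		unsigned long long size, base = 0;
		char *p;

		if (!arg)
			return -EINVAL;
		size = memparse(arg, &p);	/* "64M", "1G", or plain bytes */
		if (*p == '@')
			base = memparse(p + 1, &p);
		/* a real handler would record or reserve [base, base + size) here */
		return (size && *p == '\0') ? 0 : -EINVAL;
	}
	early_param("demo_region", demo_region_setup);
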
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 417de564456e..8c4b80fe71a1 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -24,7 +24,7 @@
24#include <asm/proto.h> 24#include <asm/proto.h>
25#include <asm/sections.h> 25#include <asm/sections.h>
26 26
27char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; 27char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
28 28
29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30 30
@@ -46,8 +46,10 @@ Control non executable mappings for 64bit processes.
46on Enable(default) 46on Enable(default)
47off Disable 47off Disable
48*/ 48*/
49int __init nonx_setup(char *str) 49static int __init nonx_setup(char *str)
50{ 50{
51 if (!str)
52 return -EINVAL;
51 if (!strncmp(str, "on", 2)) { 53 if (!strncmp(str, "on", 2)) {
52 __supported_pte_mask |= _PAGE_NX; 54 __supported_pte_mask |= _PAGE_NX;
53 do_not_nx = 0; 55 do_not_nx = 0;
@@ -55,9 +57,9 @@ int __init nonx_setup(char *str)
55 do_not_nx = 1; 57 do_not_nx = 1;
56 __supported_pte_mask &= ~_PAGE_NX; 58 __supported_pte_mask &= ~_PAGE_NX;
57 } 59 }
58 return 1; 60 return 0;
59} 61}
60__setup("noexec=", nonx_setup); /* parsed early actually */ 62early_param("noexec", nonx_setup);
61 63
62int force_personality32 = 0; 64int force_personality32 = 0;
63 65
@@ -93,12 +95,9 @@ void __init setup_per_cpu_areas(void)
93#endif 95#endif
94 96
95 /* Copy section for each CPU (we discard the original) */ 97 /* Copy section for each CPU (we discard the original) */
96 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); 98 size = PERCPU_ENOUGH_ROOM;
97#ifdef CONFIG_MODULES
98 if (size < PERCPU_ENOUGH_ROOM)
99 size = PERCPU_ENOUGH_ROOM;
100#endif
101 99
100 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
102 for_each_cpu_mask (i, cpu_possible_map) { 101 for_each_cpu_mask (i, cpu_possible_map) {
103 char *ptr; 102 char *ptr;
104 103
@@ -122,7 +121,10 @@ void pda_init(int cpu)
122 121
123 /* Setup up data that may be needed in __get_free_pages early */ 122 /* Setup up data that may be needed in __get_free_pages early */
124 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 123 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
124 /* Memory clobbers used to order PDA accessed */
125 mb();
125 wrmsrl(MSR_GS_BASE, pda); 126 wrmsrl(MSR_GS_BASE, pda);
127 mb();
126 128
127 pda->cpunumber = cpu; 129 pda->cpunumber = cpu;
128 pda->irqcount = -1; 130 pda->irqcount = -1;
@@ -178,6 +180,8 @@ void __cpuinit check_efer(void)
178 } 180 }
179} 181}
180 182
183unsigned long kernel_eflags;
184
181/* 185/*
182 * cpu_init() initializes state that is per-CPU. Some data is already 186 * cpu_init() initializes state that is per-CPU. Some data is already
183 * initialized (naturally) in the bootstrap process, such as the GDT 187 * initialized (naturally) in the bootstrap process, such as the GDT
@@ -235,28 +239,17 @@ void __cpuinit cpu_init (void)
235 * set up and load the per-CPU TSS 239 * set up and load the per-CPU TSS
236 */ 240 */
237 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 241 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
242 static const unsigned int order[N_EXCEPTION_STACKS] = {
243 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
244 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
245 };
238 if (cpu) { 246 if (cpu) {
239 static const unsigned int order[N_EXCEPTION_STACKS] = {
240 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
241 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
242 };
243
244 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); 247 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
245 if (!estacks) 248 if (!estacks)
246 panic("Cannot allocate exception stack %ld %d\n", 249 panic("Cannot allocate exception stack %ld %d\n",
247 v, cpu); 250 v, cpu);
248 } 251 }
249 switch (v + 1) { 252 estacks += PAGE_SIZE << order[v];
250#if DEBUG_STKSZ > EXCEPTION_STKSZ
251 case DEBUG_STACK:
252 cpu_pda(cpu)->debugstack = (unsigned long)estacks;
253 estacks += DEBUG_STKSZ;
254 break;
255#endif
256 default:
257 estacks += EXCEPTION_STKSZ;
258 break;
259 }
260 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; 253 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
261 } 254 }
262 255
@@ -290,4 +283,6 @@ void __cpuinit cpu_init (void)
290 set_debugreg(0UL, 7); 283 set_debugreg(0UL, 7);
291 284
292 fpu_init(); 285 fpu_init();
286
287 raw_local_save_flags(kernel_eflags);
293} 288}
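
Two details in the setup64.c hunks above are easy to miss in this layout: the mb() pair around wrmsrl(MSR_GS_BASE, pda) keeps PDA accesses from being reordered across the GS base switch, and the exception-stack loop now computes every IST entry the same way, pointing it at the top of the allocation because x86 stacks grow downward. A sketch of the per-stack computation for the non-boot-CPU case, using the names from the diff:

	/* Sketch: order[] gives each IST stack its allocation size; the value
	 * stored in the TSS is the high end of the region (stacks grow down). */
	static const unsigned int order[N_EXCEPTION_STACKS] = {
		[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
		[DEBUG_STACK - 1] = DEBUG_STACK_ORDER,
	};

		estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
		estacks += PAGE_SIZE << order[v];	/* top of this stack     */
		t->ist[v] = (unsigned long)estacks;	/* t is the per-cpu TSS  */
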
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 28161170fb0a..49ec324cd141 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -38,37 +38,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
38 sigset_t *set, struct pt_regs * regs); 38 sigset_t *set, struct pt_regs * regs);
39 39
40asmlinkage long 40asmlinkage long
41sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs)
42{
43 sigset_t saveset, newset;
44
45 /* XXX: Don't preclude handling different sized sigset_t's. */
46 if (sigsetsize != sizeof(sigset_t))
47 return -EINVAL;
48
49 if (copy_from_user(&newset, unewset, sizeof(newset)))
50 return -EFAULT;
51 sigdelsetmask(&newset, ~_BLOCKABLE);
52
53 spin_lock_irq(&current->sighand->siglock);
54 saveset = current->blocked;
55 current->blocked = newset;
56 recalc_sigpending();
57 spin_unlock_irq(&current->sighand->siglock);
58#ifdef DEBUG_SIG
59 printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n",
60 saveset, newset, regs, regs->rip);
61#endif
62 regs->rax = -EINTR;
63 while (1) {
64 current->state = TASK_INTERRUPTIBLE;
65 schedule();
66 if (do_signal(regs, &saveset))
67 return -EINTR;
68 }
69}
70
71asmlinkage long
72sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 41sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
73 struct pt_regs *regs) 42 struct pt_regs *regs)
74{ 43{
@@ -308,11 +277,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
308#endif 277#endif
309 278
310 /* Set up registers for signal handler */ 279 /* Set up registers for signal handler */
311 {
312 struct exec_domain *ed = current_thread_info()->exec_domain;
313 if (unlikely(ed && ed->signal_invmap && sig < 32))
314 sig = ed->signal_invmap[sig];
315 }
316 regs->rdi = sig; 280 regs->rdi = sig;
317 /* In case the signal handler was declared without prototypes */ 281 /* In case the signal handler was declared without prototypes */
318 regs->rax = 0; 282 regs->rax = 0;
@@ -341,11 +305,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
341 current->comm, current->pid, frame, regs->rip, frame->pretcode); 305 current->comm, current->pid, frame, regs->rip, frame->pretcode);
342#endif 306#endif
343 307
344 return 1; 308 return 0;
345 309
346give_sigsegv: 310give_sigsegv:
347 force_sigsegv(sig, current); 311 force_sigsegv(sig, current);
348 return 0; 312 return -EFAULT;
349} 313}
350 314
351/* 315/*
@@ -408,7 +372,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
408#endif 372#endif
409 ret = setup_rt_frame(sig, ka, info, oldset, regs); 373 ret = setup_rt_frame(sig, ka, info, oldset, regs);
410 374
411 if (ret) { 375 if (ret == 0) {
412 spin_lock_irq(&current->sighand->siglock); 376 spin_lock_irq(&current->sighand->siglock);
413 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask); 377 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
414 if (!(ka->sa.sa_flags & SA_NODEFER)) 378 if (!(ka->sa.sa_flags & SA_NODEFER))
@@ -425,11 +389,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
425 * want to handle. Thus you cannot kill init even with a SIGKILL even by 389 * want to handle. Thus you cannot kill init even with a SIGKILL even by
426 * mistake. 390 * mistake.
427 */ 391 */
428int do_signal(struct pt_regs *regs, sigset_t *oldset) 392static void do_signal(struct pt_regs *regs)
429{ 393{
430 struct k_sigaction ka; 394 struct k_sigaction ka;
431 siginfo_t info; 395 siginfo_t info;
432 int signr; 396 int signr;
397 sigset_t *oldset;
433 398
434 /* 399 /*
435 * We want the common case to go fast, which 400 * We want the common case to go fast, which
@@ -438,9 +403,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
438 * if so. 403 * if so.
439 */ 404 */
440 if (!user_mode(regs)) 405 if (!user_mode(regs))
441 return 1; 406 return;
442 407
443 if (!oldset) 408 if (test_thread_flag(TIF_RESTORE_SIGMASK))
409 oldset = &current->saved_sigmask;
410 else
444 oldset = &current->blocked; 411 oldset = &current->blocked;
445 412
446 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 413 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
@@ -454,30 +421,46 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
454 set_debugreg(current->thread.debugreg7, 7); 421 set_debugreg(current->thread.debugreg7, 7);
455 422
456 /* Whee! Actually deliver the signal. */ 423 /* Whee! Actually deliver the signal. */
457 return handle_signal(signr, &info, &ka, oldset, regs); 424 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
425 /* a signal was successfully delivered; the saved
426 * sigmask will have been stored in the signal frame,
427 * and will be restored by sigreturn, so we can simply
428 * clear the TIF_RESTORE_SIGMASK flag */
429 clear_thread_flag(TIF_RESTORE_SIGMASK);
430 }
431 return;
458 } 432 }
459 433
460 /* Did we come from a system call? */ 434 /* Did we come from a system call? */
461 if ((long)regs->orig_rax >= 0) { 435 if ((long)regs->orig_rax >= 0) {
462 /* Restart the system call - no handlers present */ 436 /* Restart the system call - no handlers present */
463 long res = regs->rax; 437 long res = regs->rax;
464 if (res == -ERESTARTNOHAND || 438 switch (res) {
465 res == -ERESTARTSYS || 439 case -ERESTARTNOHAND:
466 res == -ERESTARTNOINTR) { 440 case -ERESTARTSYS:
441 case -ERESTARTNOINTR:
467 regs->rax = regs->orig_rax; 442 regs->rax = regs->orig_rax;
468 regs->rip -= 2; 443 regs->rip -= 2;
469 } 444 break;
470 if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { 445 case -ERESTART_RESTARTBLOCK:
471 regs->rax = test_thread_flag(TIF_IA32) ? 446 regs->rax = test_thread_flag(TIF_IA32) ?
472 __NR_ia32_restart_syscall : 447 __NR_ia32_restart_syscall :
473 __NR_restart_syscall; 448 __NR_restart_syscall;
474 regs->rip -= 2; 449 regs->rip -= 2;
450 break;
475 } 451 }
476 } 452 }
477 return 0; 453
454 /* if there's no signal to deliver, we just put the saved sigmask
455 back. */
456 if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
457 clear_thread_flag(TIF_RESTORE_SIGMASK);
458 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
459 }
478} 460}
479 461
480void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) 462void
463do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
481{ 464{
482#ifdef DEBUG_SIG 465#ifdef DEBUG_SIG
483 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", 466 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n",
@@ -491,8 +474,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_
491 } 474 }
492 475
493 /* deal with pending signal delivery */ 476 /* deal with pending signal delivery */
494 if (thread_info_flags & _TIF_SIGPENDING) 477 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
495 do_signal(regs,oldset); 478 do_signal(regs);
496} 479}
497 480
498void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 481void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
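
The signal.c changes above drop the architecture's private sys_rt_sigsuspend() in favour of the generic TIF_RESTORE_SIGMASK protocol: whoever suspends saves the old mask in current->saved_sigmask and sets the flag, and do_signal() then either lets the signal frame carry that saved mask (clearing the flag once a handler is set up) or, when nothing is delivered, restores it with sigprocmask(). A sketch of the suspending side of that contract; this is the generic pattern, not code taken from this diff:

	/* Sketch: sigsuspend-style sleep under TIF_RESTORE_SIGMASK. */
		current->saved_sigmask = current->blocked;	/* what do_signal() restores */
		sigprocmask(SIG_SETMASK, &newset, NULL);	/* temporary blocked set     */

		current->state = TASK_INTERRUPTIBLE;
		schedule();					/* sleep until a signal      */

		set_thread_flag(TIF_RESTORE_SIGMASK);		/* hand the undo to do_signal */
		return -ERESTARTNOHAND;				/* restart if no handler ran  */
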
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 06af6ca60129..4f67697f5036 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -522,26 +522,3 @@ asmlinkage void smp_call_function_interrupt(void)
522 } 522 }
523} 523}
524 524
525int safe_smp_processor_id(void)
526{
527 unsigned apicid, i;
528
529 if (disable_apic)
530 return 0;
531
532 apicid = hard_smp_processor_id();
533 if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
534 return apicid;
535
536 for (i = 0; i < NR_CPUS; ++i) {
537 if (x86_cpu_to_apicid[i] == apicid)
538 return i;
539 }
540
541 /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
542 * or called too early. Either way, we must be CPU 0. */
543 if (x86_cpu_to_apicid[0] == BAD_APICID)
544 return 0;
545
546 return 0; /* Should not happen */
547}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 975380207b46..7b7a6870288a 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -46,9 +46,10 @@
46#include <linux/bootmem.h> 46#include <linux/bootmem.h>
47#include <linux/thread_info.h> 47#include <linux/thread_info.h>
48#include <linux/module.h> 48#include <linux/module.h>
49
50#include <linux/delay.h> 49#include <linux/delay.h>
51#include <linux/mc146818rtc.h> 50#include <linux/mc146818rtc.h>
51#include <linux/smp.h>
52
52#include <asm/mtrr.h> 53#include <asm/mtrr.h>
53#include <asm/pgalloc.h> 54#include <asm/pgalloc.h>
54#include <asm/desc.h> 55#include <asm/desc.h>
@@ -1090,7 +1091,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
1090 /* 1091 /*
1091 * Switch from PIC to APIC mode. 1092 * Switch from PIC to APIC mode.
1092 */ 1093 */
1093 connect_bsp_APIC();
1094 setup_local_APIC(); 1094 setup_local_APIC();
1095 1095
1096 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { 1096 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
@@ -1175,12 +1175,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
1175void __init smp_cpus_done(unsigned int max_cpus) 1175void __init smp_cpus_done(unsigned int max_cpus)
1176{ 1176{
1177 smp_cleanup_boot(); 1177 smp_cleanup_boot();
1178
1179#ifdef CONFIG_X86_IO_APIC
1180 setup_ioapic_dest(); 1178 setup_ioapic_dest();
1181#endif
1182
1183 check_nmi_watchdog(); 1179 check_nmi_watchdog();
1180 time_init_gtod();
1184} 1181}
1185 1182
1186#ifdef CONFIG_HOTPLUG_CPU 1183#ifdef CONFIG_HOTPLUG_CPU
@@ -1233,6 +1230,8 @@ int __cpu_disable(void)
1233 if (cpu == 0) 1230 if (cpu == 0)
1234 return -EBUSY; 1231 return -EBUSY;
1235 1232
1233 if (nmi_watchdog == NMI_LOCAL_APIC)
1234 stop_apic_nmi_watchdog(NULL);
1236 clear_local_APIC(); 1235 clear_local_APIC();
1237 1236
1238 /* 1237 /*
@@ -1272,11 +1271,11 @@ void __cpu_die(unsigned int cpu)
1272 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1271 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1273} 1272}
1274 1273
1275__init int setup_additional_cpus(char *s) 1274static __init int setup_additional_cpus(char *s)
1276{ 1275{
1277 return get_option(&s, &additional_cpus); 1276 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1278} 1277}
1279__setup("additional_cpus=", setup_additional_cpus); 1278early_param("additional_cpus", setup_additional_cpus);
1280 1279
1281#else /* ... !CONFIG_HOTPLUG_CPU */ 1280#else /* ... !CONFIG_HOTPLUG_CPU */
1282 1281
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
index 32cf55eb9af8..6026b31d037e 100644
--- a/arch/x86_64/kernel/stacktrace.c
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -7,215 +7,49 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/stacktrace.h> 9#include <linux/stacktrace.h>
10#include <linux/module.h>
11#include <asm/stacktrace.h>
10 12
11#include <asm/smp.h> 13static void save_stack_warning(void *data, char *msg)
12
13static inline int
14in_range(unsigned long start, unsigned long addr, unsigned long end)
15{ 14{
16 return addr >= start && addr <= end;
17} 15}
18 16
19static unsigned long 17static void
20get_stack_end(struct task_struct *task, unsigned long stack) 18save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
21{ 19{
22 unsigned long stack_start, stack_end, flags;
23 int i, cpu;
24
25 /*
26 * The most common case is that we are in the task stack:
27 */
28 stack_start = (unsigned long)task->thread_info;
29 stack_end = stack_start + THREAD_SIZE;
30
31 if (in_range(stack_start, stack, stack_end))
32 return stack_end;
33
34 /*
35 * We are in an interrupt if irqstackptr is set:
36 */
37 raw_local_irq_save(flags);
38 cpu = safe_smp_processor_id();
39 stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr;
40
41 if (stack_end) {
42 stack_start = stack_end & ~(IRQSTACKSIZE-1);
43 if (in_range(stack_start, stack, stack_end))
44 goto out_restore;
45 /*
46 * We get here if we are in an IRQ context but we
47 * are also in an exception stack.
48 */
49 }
50
51 /*
52 * Iterate over all exception stacks, and figure out whether
53 * 'stack' is in one of them:
54 */
55 for (i = 0; i < N_EXCEPTION_STACKS; i++) {
56 /*
57 * set 'end' to the end of the exception stack.
58 */
59 stack_end = per_cpu(init_tss, cpu).ist[i];
60 stack_start = stack_end - EXCEPTION_STKSZ;
61
62 /*
63 * Is 'stack' above this exception frame's end?
64 * If yes then skip to the next frame.
65 */
66 if (stack >= stack_end)
67 continue;
68 /*
69 * Is 'stack' above this exception frame's start address?
70 * If yes then we found the right frame.
71 */
72 if (stack >= stack_start)
73 goto out_restore;
74
75 /*
76 * If this is a debug stack, and if it has a larger size than
77 * the usual exception stacks, then 'stack' might still
78 * be within the lower portion of the debug stack:
79 */
80#if DEBUG_STKSZ > EXCEPTION_STKSZ
81 if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) {
82 /*
83 * Black magic. A large debug stack is composed of
84 * multiple exception stack entries, which we
85 * iterate through now. Dont look:
86 */
87 do {
88 stack_end -= EXCEPTION_STKSZ;
89 stack_start -= EXCEPTION_STKSZ;
90 } while (stack < stack_start);
91
92 goto out_restore;
93 }
94#endif
95 }
96 /*
97 * Ok, 'stack' is not pointing to any of the system stacks.
98 */
99 stack_end = 0;
100
101out_restore:
102 raw_local_irq_restore(flags);
103
104 return stack_end;
105} 20}
106 21
107 22static int save_stack_stack(void *data, char *name)
108/*
109 * Save stack-backtrace addresses into a stack_trace buffer:
110 */
111static inline unsigned long
112save_context_stack(struct stack_trace *trace, unsigned int skip,
113 unsigned long stack, unsigned long stack_end)
114{ 23{
115 unsigned long addr; 24 struct stack_trace *trace = (struct stack_trace *)data;
116 25 return trace->all_contexts ? 0 : -1;
117#ifdef CONFIG_FRAME_POINTER 26}
118 unsigned long prev_stack = 0;
119 27
120 while (in_range(prev_stack, stack, stack_end)) { 28static void save_stack_address(void *data, unsigned long addr)
121 pr_debug("stack: %p\n", (void *)stack); 29{
122 addr = (unsigned long)(((unsigned long *)stack)[1]); 30 struct stack_trace *trace = (struct stack_trace *)data;
123 pr_debug("addr: %p\n", (void *)addr); 31 if (trace->skip > 0) {
124 if (!skip) 32 trace->skip--;
125 trace->entries[trace->nr_entries++] = addr-1; 33 return;
126 else
127 skip--;
128 if (trace->nr_entries >= trace->max_entries)
129 break;
130 if (!addr)
131 return 0;
132 /*
133 * Stack frames must go forwards (otherwise a loop could
134 * happen if the stackframe is corrupted), so we move
135 * prev_stack forwards:
136 */
137 prev_stack = stack;
138 stack = (unsigned long)(((unsigned long *)stack)[0]);
139 }
140 pr_debug("invalid: %p\n", (void *)stack);
141#else
142 while (stack < stack_end) {
143 addr = ((unsigned long *)stack)[0];
144 stack += sizeof(long);
145 if (__kernel_text_address(addr)) {
146 if (!skip)
147 trace->entries[trace->nr_entries++] = addr-1;
148 else
149 skip--;
150 if (trace->nr_entries >= trace->max_entries)
151 break;
152 }
153 } 34 }
154#endif 35 if (trace->nr_entries < trace->max_entries - 1)
155 return stack; 36 trace->entries[trace->nr_entries++] = addr;
156} 37}
157 38
158#define MAX_STACKS 10 39static struct stacktrace_ops save_stack_ops = {
40 .warning = save_stack_warning,
41 .warning_symbol = save_stack_warning_symbol,
42 .stack = save_stack_stack,
43 .address = save_stack_address,
44};
159 45
160/* 46/*
161 * Save stack-backtrace addresses into a stack_trace buffer. 47 * Save stack-backtrace addresses into a stack_trace buffer.
162 * If all_contexts is set, all contexts (hardirq, softirq and process)
163 * are saved. If not set then only the current context is saved.
164 */ 48 */
165void save_stack_trace(struct stack_trace *trace, 49void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
166 struct task_struct *task, int all_contexts,
167 unsigned int skip)
168{ 50{
169 unsigned long stack = (unsigned long)&stack; 51 dump_trace(task, NULL, NULL, &save_stack_ops, trace);
170 int i, nr_stacks = 0, stacks_done[MAX_STACKS]; 52 trace->entries[trace->nr_entries++] = ULONG_MAX;
171
172 WARN_ON(trace->nr_entries || !trace->max_entries);
173
174 if (!task)
175 task = current;
176
177 pr_debug("task: %p, ti: %p\n", task, task->thread_info);
178
179 if (!task || task == current) {
180 /* Grab rbp right from our regs: */
181 asm ("mov %%rbp, %0" : "=r" (stack));
182 pr_debug("rbp: %p\n", (void *)stack);
183 } else {
184 /* rbp is the last reg pushed by switch_to(): */
185 stack = task->thread.rsp;
186 pr_debug("other task rsp: %p\n", (void *)stack);
187 stack = (unsigned long)(((unsigned long *)stack)[0]);
188 pr_debug("other task rbp: %p\n", (void *)stack);
189 }
190
191 while (1) {
192 unsigned long stack_end = get_stack_end(task, stack);
193
194 pr_debug("stack: %p\n", (void *)stack);
195 pr_debug("stack end: %p\n", (void *)stack_end);
196
197 /*
198 * Invalid stack address?
199 */
200 if (!stack_end)
201 return;
202 /*
203 * Were we in this stack already? (recursion)
204 */
205 for (i = 0; i < nr_stacks; i++)
206 if (stacks_done[i] == stack_end)
207 return;
208 stacks_done[nr_stacks] = stack_end;
209
210 stack = save_context_stack(trace, skip, stack, stack_end);
211 if (!all_contexts || !stack ||
212 trace->nr_entries >= trace->max_entries)
213 return;
214 trace->entries[trace->nr_entries++] = ULONG_MAX;
215 if (trace->nr_entries >= trace->max_entries)
216 return;
217 if (++nr_stacks >= MAX_STACKS)
218 return;
219 }
220} 53}
54EXPORT_SYMBOL(save_stack_trace);
221 55
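
The stacktrace.c change above stops open-coding the stack walk and instead registers a set of callbacks (save_stack_ops) with the shared dump_trace() walker: the walker reports each new stack via .stack() and each return address via .address(), and the collector fills the stack_trace buffer. The sketch below is a simplified, userspace-only model of that callback split, using hypothetical frame data in place of a real walk; it is not the kernel code itself.

/* Simplified, compile-anywhere model of the stacktrace_ops callback split
 * used above; the kernel types and the real stack walk are not reproduced. */
#include <stdio.h>

struct trace_ops {
	int  (*stack)(void *data, const char *name);     /* entered a new stack */
	void (*address)(void *data, unsigned long addr); /* one return address */
};

struct trace_buf {
	unsigned long entries[8];
	unsigned int nr, max, skip;
};

static int buf_stack(void *data, const char *name)
{
	(void)data; (void)name;
	return 0;                       /* 0: keep walking, <0: stop */
}

static void buf_address(void *data, unsigned long addr)
{
	struct trace_buf *t = data;

	if (t->skip) {                  /* mirror the skip handling above */
		t->skip--;
		return;
	}
	if (t->nr < t->max - 1)
		t->entries[t->nr++] = addr;
}

/* Stand-in for dump_trace(): feeds made-up addresses through the callbacks. */
static void walk(const struct trace_ops *ops, void *data)
{
	static const unsigned long fake[] = { 0x401234UL, 0x405678UL };

	if (ops->stack(data, "IRQ") < 0)
		return;
	for (unsigned int i = 0; i < 2; i++)
		ops->address(data, fake[i]);
}

int main(void)
{
	struct trace_buf t = { .max = 8, .skip = 1 };
	const struct trace_ops ops = { .stack = buf_stack, .address = buf_address };

	walk(&ops, &t);
	for (unsigned int i = 0; i < t.nr; i++)
		printf("%#lx\n", t.entries[i]);
	return 0;
}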
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 320b6fb00cca..bfbe00763c68 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -54,7 +54,7 @@ ENTRY(restore_image)
54 movq %rcx, %cr3; 54 movq %rcx, %cr3;
55 movq %rax, %cr4; # turn PGE back on 55 movq %rax, %cr4; # turn PGE back on
56 56
57 movq pagedir_nosave(%rip), %rdx 57 movq restore_pblist(%rip), %rdx
58loop: 58loop:
59 testq %rdx, %rdx 59 testq %rdx, %rdx
60 jz done 60 jz done
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
index 5530dda3f27a..cbabfdf78e06 100644
--- a/arch/x86_64/kernel/tce.c
+++ b/arch/x86_64/kernel/tce.c
@@ -1,4 +1,6 @@
1/* 1/*
2 * This file manages the translation entries for the IBM Calgary IOMMU.
3 *
2 * Derived from arch/powerpc/platforms/pseries/iommu.c 4 * Derived from arch/powerpc/platforms/pseries/iommu.c
3 * 5 *
4 * Copyright (C) IBM Corporation, 2006 6 * Copyright (C) IBM Corporation, 2006
@@ -104,14 +106,6 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
104 /* set the tce table size - measured in entries */ 106 /* set the tce table size - measured in entries */
105 tbl->it_size = table_size_to_number_of_entries(specified_table_size); 107 tbl->it_size = table_size_to_number_of_entries(specified_table_size);
106 108
107 tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number];
108 if (!tbl->it_base) {
109 printk(KERN_ERR "Calgary: iommu_table_setparms: "
110 "no table allocated?!\n");
111 ret = -ENOMEM;
112 goto done;
113 }
114
115 /* 109 /*
116 * number of bytes needed for the bitmap size in number of 110 * number of bytes needed for the bitmap size in number of
117 * entries; we need one bit per entry 111 * entries; we need one bit per entry
@@ -160,8 +154,6 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar)
160 if (ret) 154 if (ret)
161 goto free_tbl; 155 goto free_tbl;
162 156
163 tce_free(tbl, 0, tbl->it_size);
164
165 tbl->bbar = bbar; 157 tbl->bbar = bbar;
166 158
167 /* 159 /*
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 7a9b18224182..1c255ee76e7c 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -24,6 +24,8 @@
24#include <linux/device.h> 24#include <linux/device.h>
25#include <linux/sysdev.h> 25#include <linux/sysdev.h>
26#include <linux/bcd.h> 26#include <linux/bcd.h>
27#include <linux/notifier.h>
28#include <linux/cpu.h>
27#include <linux/kallsyms.h> 29#include <linux/kallsyms.h>
28#include <linux/acpi.h> 30#include <linux/acpi.h>
29#ifdef CONFIG_ACPI 31#ifdef CONFIG_ACPI
@@ -39,9 +41,7 @@
39#include <asm/sections.h> 41#include <asm/sections.h>
40#include <linux/cpufreq.h> 42#include <linux/cpufreq.h>
41#include <linux/hpet.h> 43#include <linux/hpet.h>
42#ifdef CONFIG_X86_LOCAL_APIC
43#include <asm/apic.h> 44#include <asm/apic.h>
44#endif
45 45
46#ifdef CONFIG_CPU_FREQ 46#ifdef CONFIG_CPU_FREQ
47static void cpufreq_delayed_get(void); 47static void cpufreq_delayed_get(void);
@@ -49,7 +49,7 @@ static void cpufreq_delayed_get(void);
49extern void i8254_timer_resume(void); 49extern void i8254_timer_resume(void);
50extern int using_apic_timer; 50extern int using_apic_timer;
51 51
52static char *time_init_gtod(void); 52static char *timename = NULL;
53 53
54DEFINE_SPINLOCK(rtc_lock); 54DEFINE_SPINLOCK(rtc_lock);
55EXPORT_SYMBOL(rtc_lock); 55EXPORT_SYMBOL(rtc_lock);
@@ -187,20 +187,15 @@ unsigned long profile_pc(struct pt_regs *regs)
187{ 187{
188 unsigned long pc = instruction_pointer(regs); 188 unsigned long pc = instruction_pointer(regs);
189 189
190 /* Assume the lock function has either no stack frame or only a single 190 /* Assume the lock function has either no stack frame or a copy
191 word. This checks if the address on the stack looks like a kernel 191 of eflags from PUSHF
192 text address. 192 Eflags always has bits 22 and up cleared unlike kernel addresses. */
193 There is a small window for false hits, but in that case the tick
194 is just accounted to the spinlock function.
195 Better would be to write these functions in assembler again
196 and check exactly. */
197 if (!user_mode(regs) && in_lock_functions(pc)) { 193 if (!user_mode(regs) && in_lock_functions(pc)) {
198 char *v = *(char **)regs->rsp; 194 unsigned long *sp = (unsigned long *)regs->rsp;
199 if ((v >= _stext && v <= _etext) || 195 if (sp[0] >> 22)
200 (v >= _sinittext && v <= _einittext) || 196 return sp[0];
201 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) 197 if (sp[1] >> 22)
202 return (unsigned long)v; 198 return sp[1];
203 return ((unsigned long *)regs->rsp)[1];
204 } 199 }
205 return pc; 200 return pc;
206} 201}
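
The rewritten profile_pc() above relies on a simple property: a saved EFLAGS image from PUSHF never has bits 22 and up set, while an x86-64 kernel text address always does, so testing word >> 22 tells a saved flags word apart from a return address on the lock function's stack. A standalone illustration of that test (the sample values are hypothetical):

/* Illustrative only: why (word >> 22) separates kernel text addresses
 * from saved EFLAGS values on x86-64. */
#include <stdio.h>

static int looks_like_text_address(unsigned long long word)
{
	return (word >> 22) != 0;       /* EFLAGS never has bits 22+ set */
}

int main(void)
{
	unsigned long long kernel_text = 0xffffffff8010abcdULL; /* hypothetical */
	unsigned long long saved_eflags = 0x246ULL;             /* IF|ZF|PF */

	printf("%d %d\n", looks_like_text_address(kernel_text),
			  looks_like_text_address(saved_eflags));
	return 0;
}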
@@ -281,6 +276,7 @@ static void set_rtc_mmss(unsigned long nowtime)
281 * Note: This function is required to return accurate 276 * Note: This function is required to return accurate
282 * time even in the absence of multiple timer ticks. 277 * time even in the absence of multiple timer ticks.
283 */ 278 */
279static inline unsigned long long cycles_2_ns(unsigned long long cyc);
284unsigned long long monotonic_clock(void) 280unsigned long long monotonic_clock(void)
285{ 281{
286 unsigned long seq; 282 unsigned long seq;
@@ -305,8 +301,7 @@ unsigned long long monotonic_clock(void)
305 base = monotonic_base; 301 base = monotonic_base;
306 } while (read_seqretry(&xtime_lock, seq)); 302 } while (read_seqretry(&xtime_lock, seq));
307 this_offset = get_cycles_sync(); 303 this_offset = get_cycles_sync();
308 /* FIXME: 1000 or 1000000? */ 304 offset = cycles_2_ns(this_offset - last_offset);
309 offset = (this_offset - last_offset)*1000 / cpu_khz;
310 } 305 }
311 return base + offset; 306 return base + offset;
312} 307}
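
monotonic_clock() and main_timer_handler() now convert TSC deltas through cycles_2_ns() instead of the FIXME'd `delta * 1000 / cpu_khz` expression. Helpers of this kind are usually fixed-point: derive a scale from cpu_khz once in set_cyc2ns_scale(), then every conversion is a multiply and a shift. The sketch below shows that scheme; the shift width and exact scale formula are assumptions for illustration, not necessarily the kernel's values.

/* Fixed-point cycles-to-nanoseconds sketch: scale = 10^6 * 2^NS_SCALE / cpu_khz,
 * so ns = (cycles * scale) >> NS_SCALE.  NS_SCALE = 10 is an assumed constant. */
#include <stdio.h>

#define NS_SCALE 10

static unsigned long long cyc2ns_scale;

static void set_cyc2ns_scale(unsigned long cpu_khz)
{
	cyc2ns_scale = (1000000ULL << NS_SCALE) / cpu_khz;
}

static unsigned long long cycles_2_ns(unsigned long long cyc)
{
	return (cyc * cyc2ns_scale) >> NS_SCALE;
}

int main(void)
{
	set_cyc2ns_scale(2400000);      /* 2.4 GHz CPU, i.e. cpu_khz = 2400000 */
	/* 2.4e9 cycles should come out close to one second: */
	printf("%llu ns\n", cycles_2_ns(2400000000ULL));
	return 0;
}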
@@ -410,8 +405,7 @@ void main_timer_handler(struct pt_regs *regs)
410 offset %= USEC_PER_TICK; 405 offset %= USEC_PER_TICK;
411 } 406 }
412 407
413 /* FIXME: 1000 or 1000000? */ 408 monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc);
414 monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz;
415 409
416 vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; 410 vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
417 411
@@ -441,12 +435,8 @@ void main_timer_handler(struct pt_regs *regs)
441 * have to call the local interrupt handler. 435 * have to call the local interrupt handler.
442 */ 436 */
443 437
444#ifndef CONFIG_X86_LOCAL_APIC
445 profile_tick(CPU_PROFILING, regs);
446#else
447 if (!using_apic_timer) 438 if (!using_apic_timer)
448 smp_local_timer_interrupt(regs); 439 smp_local_timer_interrupt(regs);
449#endif
450 440
451/* 441/*
452 * If we have an externally synchronized Linux clock, then update CMOS clock 442 * If we have an externally synchronized Linux clock, then update CMOS clock
@@ -470,10 +460,8 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
470 if (apic_runs_main_timer > 1) 460 if (apic_runs_main_timer > 1)
471 return IRQ_HANDLED; 461 return IRQ_HANDLED;
472 main_timer_handler(regs); 462 main_timer_handler(regs);
473#ifdef CONFIG_X86_LOCAL_APIC
474 if (using_apic_timer) 463 if (using_apic_timer)
475 smp_send_timer_broadcast_ipi(); 464 smp_send_timer_broadcast_ipi();
476#endif
477 return IRQ_HANDLED; 465 return IRQ_HANDLED;
478} 466}
479 467
@@ -893,11 +881,17 @@ static struct irqaction irq0 = {
893 timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL 881 timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
894}; 882};
895 883
896void __init time_init(void) 884static int __cpuinit
885time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu)
897{ 886{
898 char *timename; 887 unsigned cpu = (unsigned long) hcpu;
899 char *gtod; 888 if (action == CPU_ONLINE)
889 vsyscall_set_cpu(cpu);
890 return NOTIFY_DONE;
891}
900 892
893void __init time_init(void)
894{
901 if (nohpet) 895 if (nohpet)
902 vxtime.hpet_address = 0; 896 vxtime.hpet_address = 0;
903 897
@@ -931,18 +925,17 @@ void __init time_init(void)
931 } 925 }
932 926
933 vxtime.mode = VXTIME_TSC; 927 vxtime.mode = VXTIME_TSC;
934 gtod = time_init_gtod();
935
936 printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
937 vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod);
938 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
939 cpu_khz / 1000, cpu_khz % 1000);
940 vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; 928 vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
941 vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; 929 vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
942 vxtime.last_tsc = get_cycles_sync(); 930 vxtime.last_tsc = get_cycles_sync();
931 set_cyc2ns_scale(cpu_khz);
943 setup_irq(0, &irq0); 932 setup_irq(0, &irq0);
933 hotcpu_notifier(time_cpu_notifier, 0);
934 time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id());
944 935
945 set_cyc2ns_scale(cpu_khz); 936#ifndef CONFIG_SMP
937 time_init_gtod();
938#endif
946} 939}
947 940
948/* 941/*
@@ -973,12 +966,18 @@ __cpuinit int unsynchronized_tsc(void)
973/* 966/*
974 * Decide what mode gettimeofday should use. 967 * Decide what mode gettimeofday should use.
975 */ 968 */
976__init static char *time_init_gtod(void) 969void time_init_gtod(void)
977{ 970{
978 char *timetype; 971 char *timetype;
979 972
980 if (unsynchronized_tsc()) 973 if (unsynchronized_tsc())
981 notsc = 1; 974 notsc = 1;
975
976 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
977 vgetcpu_mode = VGETCPU_RDTSCP;
978 else
979 vgetcpu_mode = VGETCPU_LSL;
980
982 if (vxtime.hpet_address && notsc) { 981 if (vxtime.hpet_address && notsc) {
983 timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; 982 timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
984 if (hpet_use_timer) 983 if (hpet_use_timer)
@@ -1001,7 +1000,16 @@ __init static char *time_init_gtod(void)
1001 timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; 1000 timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
1002 vxtime.mode = VXTIME_TSC; 1001 vxtime.mode = VXTIME_TSC;
1003 } 1002 }
1004 return timetype; 1003
1004 printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
1005 vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype);
1006 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
1007 cpu_khz / 1000, cpu_khz % 1000);
1008 vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
1009 vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
1010 vxtime.last_tsc = get_cycles_sync();
1011
1012 set_cyc2ns_scale(cpu_khz);
1005} 1013}
1006 1014
1007__setup("report_lost_ticks", time_setup); 1015__setup("report_lost_ticks", time_setup);
@@ -1031,8 +1039,16 @@ static int timer_resume(struct sys_device *dev)
1031 unsigned long flags; 1039 unsigned long flags;
1032 unsigned long sec; 1040 unsigned long sec;
1033 unsigned long ctime = get_cmos_time(); 1041 unsigned long ctime = get_cmos_time();
1034 unsigned long sleep_length = (ctime - sleep_start) * HZ; 1042 long sleep_length = (ctime - sleep_start) * HZ;
1035 1043
1044 if (sleep_length < 0) {
1045 printk(KERN_WARNING "Time skew detected in timer resume!\n");
1046 /* The time after the resume must not be earlier than the time
1047 * before the suspend or some nasty things will happen
1048 */
1049 sleep_length = 0;
1050 ctime = sleep_start;
1051 }
1036 if (vxtime.hpet_address) 1052 if (vxtime.hpet_address)
1037 hpet_reenable(); 1053 hpet_reenable();
1038 else 1054 else
@@ -1148,23 +1164,25 @@ int hpet_rtc_timer_init(void)
1148 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; 1164 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1149 1165
1150 local_irq_save(flags); 1166 local_irq_save(flags);
1167
1151 cnt = hpet_readl(HPET_COUNTER); 1168 cnt = hpet_readl(HPET_COUNTER);
1152 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); 1169 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
1153 hpet_writel(cnt, HPET_T1_CMP); 1170 hpet_writel(cnt, HPET_T1_CMP);
1154 hpet_t1_cmp = cnt; 1171 hpet_t1_cmp = cnt;
1155 local_irq_restore(flags);
1156 1172
1157 cfg = hpet_readl(HPET_T1_CFG); 1173 cfg = hpet_readl(HPET_T1_CFG);
1158 cfg &= ~HPET_TN_PERIODIC; 1174 cfg &= ~HPET_TN_PERIODIC;
1159 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; 1175 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
1160 hpet_writel(cfg, HPET_T1_CFG); 1176 hpet_writel(cfg, HPET_T1_CFG);
1161 1177
1178 local_irq_restore(flags);
1179
1162 return 1; 1180 return 1;
1163} 1181}
1164 1182
1165static void hpet_rtc_timer_reinit(void) 1183static void hpet_rtc_timer_reinit(void)
1166{ 1184{
1167 unsigned int cfg, cnt; 1185 unsigned int cfg, cnt, ticks_per_int, lost_ints;
1168 1186
1169 if (unlikely(!(PIE_on | AIE_on | UIE_on))) { 1187 if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
1170 cfg = hpet_readl(HPET_T1_CFG); 1188 cfg = hpet_readl(HPET_T1_CFG);
@@ -1179,10 +1197,33 @@ static void hpet_rtc_timer_reinit(void)
1179 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; 1197 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1180 1198
1181 /* It is more accurate to use the comparator value than current count.*/ 1199 /* It is more accurate to use the comparator value than current count.*/
1182 cnt = hpet_t1_cmp; 1200 ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
1183 cnt += hpet_tick*HZ/hpet_rtc_int_freq; 1201 hpet_t1_cmp += ticks_per_int;
1184 hpet_writel(cnt, HPET_T1_CMP); 1202 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
1185 hpet_t1_cmp = cnt; 1203
1204 /*
1205 * If the interrupt handler was delayed too long, the write above tries
1206 * to schedule the next interrupt in the past and the hardware would
1207 * not interrupt until the counter had wrapped around.
1208 * So we have to check that the comparator wasn't set to a past time.
1209 */
1210 cnt = hpet_readl(HPET_COUNTER);
1211 if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
1212 lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
1213 /* Make sure that, even with the time needed to execute
1214 * this code, the next scheduled interrupt has been moved
1215 * back to the future: */
1216 lost_ints++;
1217
1218 hpet_t1_cmp += lost_ints * ticks_per_int;
1219 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
1220
1221 if (PIE_on)
1222 PIE_count += lost_ints;
1223
1224 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
1225 hpet_rtc_int_freq);
1226 }
1186} 1227}
1187 1228
1188/* 1229/*
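
The hpet_rtc_timer_reinit() hunk above copes with a delayed RTC-emulation interrupt: if the freshly programmed comparator already lies behind the HPET counter, it computes how many periods were missed, advances the comparator past the current count, and credits the missed ticks to PIE_count. The catch-up arithmetic can be checked in isolation with plain numbers (all values below are made up):

/* Standalone check of the comparator catch-up arithmetic used above:
 * given a counter that has already passed the comparator, advance the
 * comparator by enough whole periods to land back in the future. */
#include <stdio.h>

int main(void)
{
	unsigned int ticks_per_int = 1000;  /* hypothetical HPET ticks per RTC irq */
	unsigned int t1_cmp = 5000;         /* comparator we just programmed */
	unsigned int counter = 8400;        /* counter has already run past it */
	unsigned int lost_ints;

	if ((int)(counter - t1_cmp) > 0) {
		lost_ints = (counter - t1_cmp) / ticks_per_int + 1;
		lost_ints++;                /* margin for the time spent here */
		t1_cmp += lost_ints * ticks_per_int;
	}
	/* prints 10000, safely ahead of the counter value 8400 */
	printf("new comparator: %u\n", t1_cmp);
	return 0;
}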
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
index 23a03eb91fc7..c79b99a9e2f6 100644
--- a/arch/x86_64/kernel/trampoline.S
+++ b/arch/x86_64/kernel/trampoline.S
@@ -64,7 +64,7 @@ idt_48:
64 .word 0, 0 # idt base = 0L 64 .word 0, 0 # idt base = 0L
65 65
66gdt_48: 66gdt_48:
67 .short __KERNEL32_CS + 7 # gdt limit 67 .short GDT_ENTRIES*8 - 1 # gdt limit
68 .long cpu_gdt_table-__START_KERNEL_map 68 .long cpu_gdt_table-__START_KERNEL_map
69 69
70.globl trampoline_end 70.globl trampoline_end
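
The trampoline change swaps a magic GDT limit for `GDT_ENTRIES*8 - 1`: each descriptor is 8 bytes and the architectural limit field holds the offset of the last valid byte, hence the minus one. The same computation, spelled out in C with a stand-in entry count (not the kernel's actual constant):

/* GDT limit = total descriptor bytes minus one (offset of the last valid byte). */
#include <stdio.h>

#define GDT_ENTRIES 16                  /* stand-in value for illustration */

int main(void)
{
	unsigned short gdt_limit = GDT_ENTRIES * 8 - 1; /* 127 for 16 entries */

	printf("limit = %u\n", (unsigned int)gdt_limit);
	return 0;
}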
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index b1249774d1e8..01f2a8d254c2 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -45,6 +45,7 @@
45#include <asm/pda.h> 45#include <asm/pda.h>
46#include <asm/proto.h> 46#include <asm/proto.h>
47#include <asm/nmi.h> 47#include <asm/nmi.h>
48#include <asm/stacktrace.h>
48 49
49asmlinkage void divide_error(void); 50asmlinkage void divide_error(void);
50asmlinkage void debug(void); 51asmlinkage void debug(void);
@@ -142,7 +143,7 @@ void printk_address(unsigned long address)
142#endif 143#endif
143 144
144static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 145static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
145 unsigned *usedp, const char **idp) 146 unsigned *usedp, char **idp)
146{ 147{
147 static char ids[][8] = { 148 static char ids[][8] = {
148 [DEBUG_STACK - 1] = "#DB", 149 [DEBUG_STACK - 1] = "#DB",
@@ -161,26 +162,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
161 * 'stack' is in one of them: 162 * 'stack' is in one of them:
162 */ 163 */
163 for (k = 0; k < N_EXCEPTION_STACKS; k++) { 164 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
164 unsigned long end; 165 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
165
166 /*
167 * set 'end' to the end of the exception stack.
168 */
169 switch (k + 1) {
170 /*
171 * TODO: this block is not needed i think, because
172 * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
173 * properly too.
174 */
175#if DEBUG_STKSZ > EXCEPTION_STKSZ
176 case DEBUG_STACK:
177 end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
178 break;
179#endif
180 default:
181 end = per_cpu(orig_ist, cpu).ist[k];
182 break;
183 }
184 /* 166 /*
185 * Is 'stack' above this exception frame's end? 167 * Is 'stack' above this exception frame's end?
186 * If yes then skip to the next frame. 168 * If yes then skip to the next frame.
@@ -234,13 +216,19 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
234 return NULL; 216 return NULL;
235} 217}
236 218
237static int show_trace_unwind(struct unwind_frame_info *info, void *context) 219struct ops_and_data {
220 struct stacktrace_ops *ops;
221 void *data;
222};
223
224static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
238{ 225{
226 struct ops_and_data *oad = (struct ops_and_data *)context;
239 int n = 0; 227 int n = 0;
240 228
241 while (unwind(info) == 0 && UNW_PC(info)) { 229 while (unwind(info) == 0 && UNW_PC(info)) {
242 n++; 230 n++;
243 printk_address(UNW_PC(info)); 231 oad->ops->address(oad->data, UNW_PC(info));
244 if (arch_unw_user_mode(info)) 232 if (arch_unw_user_mode(info))
245 break; 233 break;
246 } 234 }
@@ -254,45 +242,53 @@ static int show_trace_unwind(struct unwind_frame_info *info, void *context)
254 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 242 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
255 */ 243 */
256 244
257void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) 245void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
246 struct stacktrace_ops *ops, void *data)
258{ 247{
259 const unsigned cpu = safe_smp_processor_id(); 248 const unsigned cpu = smp_processor_id();
260 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 249 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
261 unsigned used = 0; 250 unsigned used = 0;
262 251
263 printk("\nCall Trace:\n");
264
265 if (!tsk) 252 if (!tsk)
266 tsk = current; 253 tsk = current;
267 254
268 if (call_trace >= 0) { 255 if (call_trace >= 0) {
269 int unw_ret = 0; 256 int unw_ret = 0;
270 struct unwind_frame_info info; 257 struct unwind_frame_info info;
258 struct ops_and_data oad = { .ops = ops, .data = data };
271 259
272 if (regs) { 260 if (regs) {
273 if (unwind_init_frame_info(&info, tsk, regs) == 0) 261 if (unwind_init_frame_info(&info, tsk, regs) == 0)
274 unw_ret = show_trace_unwind(&info, NULL); 262 unw_ret = dump_trace_unwind(&info, &oad);
275 } else if (tsk == current) 263 } else if (tsk == current)
276 unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); 264 unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
277 else { 265 else {
278 if (unwind_init_blocked(&info, tsk) == 0) 266 if (unwind_init_blocked(&info, tsk) == 0)
279 unw_ret = show_trace_unwind(&info, NULL); 267 unw_ret = dump_trace_unwind(&info, &oad);
280 } 268 }
281 if (unw_ret > 0) { 269 if (unw_ret > 0) {
282 if (call_trace == 1 && !arch_unw_user_mode(&info)) { 270 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
283 print_symbol("DWARF2 unwinder stuck at %s\n", 271 ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
284 UNW_PC(&info)); 272 UNW_PC(&info));
285 if ((long)UNW_SP(&info) < 0) { 273 if ((long)UNW_SP(&info) < 0) {
286 printk("Leftover inexact backtrace:\n"); 274 ops->warning(data, "Leftover inexact backtrace:\n");
287 stack = (unsigned long *)UNW_SP(&info); 275 stack = (unsigned long *)UNW_SP(&info);
276 if (!stack)
277 return;
288 } else 278 } else
289 printk("Full inexact backtrace again:\n"); 279 ops->warning(data, "Full inexact backtrace again:\n");
290 } else if (call_trace >= 1) 280 } else if (call_trace >= 1)
291 return; 281 return;
292 else 282 else
293 printk("Full inexact backtrace again:\n"); 283 ops->warning(data, "Full inexact backtrace again:\n");
294 } else 284 } else
295 printk("Inexact backtrace:\n"); 285 ops->warning(data, "Inexact backtrace:\n");
286 }
287 if (!stack) {
288 unsigned long dummy;
289 stack = &dummy;
290 if (tsk && tsk != current)
291 stack = (unsigned long *)tsk->thread.rsp;
296 } 292 }
297 293
298 /* 294 /*
@@ -303,7 +299,9 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
303#define HANDLE_STACK(cond) \ 299#define HANDLE_STACK(cond) \
304 do while (cond) { \ 300 do while (cond) { \
305 unsigned long addr = *stack++; \ 301 unsigned long addr = *stack++; \
306 if (kernel_text_address(addr)) { \ 302 if (oops_in_progress ? \
303 __kernel_text_address(addr) : \
304 kernel_text_address(addr)) { \
307 /* \ 305 /* \
308 * If the address is either in the text segment of the \ 306 * If the address is either in the text segment of the \
309 * kernel, or in the region which contains vmalloc'ed \ 307 * kernel, or in the region which contains vmalloc'ed \
@@ -312,7 +310,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
312 * down the cause of the crash will be able to figure \ 310 * down the cause of the crash will be able to figure \
313 * out the call path that was taken. \ 311 * out the call path that was taken. \
314 */ \ 312 */ \
315 printk_address(addr); \ 313 ops->address(data, addr); \
316 } \ 314 } \
317 } while (0) 315 } while (0)
318 316
@@ -321,16 +319,17 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
321 * current stack address. If the stacks consist of nested 319 * current stack address. If the stacks consist of nested
322 * exceptions 320 * exceptions
323 */ 321 */
324 for ( ; ; ) { 322 for (;;) {
325 const char *id; 323 char *id;
326 unsigned long *estack_end; 324 unsigned long *estack_end;
327 estack_end = in_exception_stack(cpu, (unsigned long)stack, 325 estack_end = in_exception_stack(cpu, (unsigned long)stack,
328 &used, &id); 326 &used, &id);
329 327
330 if (estack_end) { 328 if (estack_end) {
331 printk(" <%s>", id); 329 if (ops->stack(data, id) < 0)
330 break;
332 HANDLE_STACK (stack < estack_end); 331 HANDLE_STACK (stack < estack_end);
333 printk(" <EOE>"); 332 ops->stack(data, "<EOE>");
334 /* 333 /*
335 * We link to the next stack via the 334 * We link to the next stack via the
336 * second-to-last pointer (index -2 to end) in the 335 * second-to-last pointer (index -2 to end) in the
@@ -345,7 +344,8 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
345 (IRQSTACKSIZE - 64) / sizeof(*irqstack); 344 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
346 345
347 if (stack >= irqstack && stack < irqstack_end) { 346 if (stack >= irqstack && stack < irqstack_end) {
348 printk(" <IRQ>"); 347 if (ops->stack(data, "IRQ") < 0)
348 break;
349 HANDLE_STACK (stack < irqstack_end); 349 HANDLE_STACK (stack < irqstack_end);
350 /* 350 /*
351 * We link to the next stack (which would be 351 * We link to the next stack (which would be
@@ -354,7 +354,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
354 */ 354 */
355 stack = (unsigned long *) (irqstack_end[-1]); 355 stack = (unsigned long *) (irqstack_end[-1]);
356 irqstack_end = NULL; 356 irqstack_end = NULL;
357 printk(" <EOI>"); 357 ops->stack(data, "EOI");
358 continue; 358 continue;
359 } 359 }
360 } 360 }
@@ -362,19 +362,57 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
362 } 362 }
363 363
364 /* 364 /*
365 * This prints the process stack: 365 * This handles the process stack:
366 */ 366 */
367 HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); 367 HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
368#undef HANDLE_STACK 368#undef HANDLE_STACK
369}
370EXPORT_SYMBOL(dump_trace);
371
372static void
373print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
374{
375 print_symbol(msg, symbol);
376 printk("\n");
377}
378
379static void print_trace_warning(void *data, char *msg)
380{
381 printk("%s\n", msg);
382}
383
384static int print_trace_stack(void *data, char *name)
385{
386 printk(" <%s> ", name);
387 return 0;
388}
389
390static void print_trace_address(void *data, unsigned long addr)
391{
392 printk_address(addr);
393}
394
395static struct stacktrace_ops print_trace_ops = {
396 .warning = print_trace_warning,
397 .warning_symbol = print_trace_warning_symbol,
398 .stack = print_trace_stack,
399 .address = print_trace_address,
400};
369 401
402void
403show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
404{
405 printk("\nCall Trace:\n");
406 dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
370 printk("\n"); 407 printk("\n");
371} 408}
372 409
373static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) 410static void
411_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
374{ 412{
375 unsigned long *stack; 413 unsigned long *stack;
376 int i; 414 int i;
377 const int cpu = safe_smp_processor_id(); 415 const int cpu = smp_processor_id();
378 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); 416 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
379 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 417 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
380 418
@@ -428,7 +466,7 @@ void show_registers(struct pt_regs *regs)
428 int i; 466 int i;
429 int in_kernel = !user_mode(regs); 467 int in_kernel = !user_mode(regs);
430 unsigned long rsp; 468 unsigned long rsp;
431 const int cpu = safe_smp_processor_id(); 469 const int cpu = smp_processor_id();
432 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 470 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
433 471
434 rsp = regs->rsp; 472 rsp = regs->rsp;
@@ -503,9 +541,11 @@ static unsigned int die_nest_count;
503 541
504unsigned __kprobes long oops_begin(void) 542unsigned __kprobes long oops_begin(void)
505{ 543{
506 int cpu = safe_smp_processor_id(); 544 int cpu = smp_processor_id();
507 unsigned long flags; 545 unsigned long flags;
508 546
547 oops_enter();
548
509 /* racy, but better than risking deadlock. */ 549 /* racy, but better than risking deadlock. */
510 local_irq_save(flags); 550 local_irq_save(flags);
511 if (!spin_trylock(&die_lock)) { 551 if (!spin_trylock(&die_lock)) {
@@ -534,6 +574,7 @@ void __kprobes oops_end(unsigned long flags)
534 spin_unlock_irqrestore(&die_lock, flags); 574 spin_unlock_irqrestore(&die_lock, flags);
535 if (panic_on_oops) 575 if (panic_on_oops)
536 panic("Fatal exception"); 576 panic("Fatal exception");
577 oops_exit();
537} 578}
538 579
539void __kprobes __die(const char * str, struct pt_regs * regs, long err) 580void __kprobes __die(const char * str, struct pt_regs * regs, long err)
@@ -570,7 +611,7 @@ void die(const char * str, struct pt_regs * regs, long err)
570 do_exit(SIGSEGV); 611 do_exit(SIGSEGV);
571} 612}
572 613
573void __kprobes die_nmi(char *str, struct pt_regs *regs) 614void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
574{ 615{
575 unsigned long flags = oops_begin(); 616 unsigned long flags = oops_begin();
576 617
@@ -578,13 +619,12 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs)
578 * We are in trouble anyway, let's at least try 619 * We are in trouble anyway, let's at least try
579 * to get a message out. 620 * to get a message out.
580 */ 621 */
581 printk(str, safe_smp_processor_id()); 622 printk(str, smp_processor_id());
582 show_registers(regs); 623 show_registers(regs);
583 if (kexec_should_crash(current)) 624 if (kexec_should_crash(current))
584 crash_kexec(regs); 625 crash_kexec(regs);
585 if (panic_on_timeout || panic_on_oops) 626 if (do_panic || panic_on_oops)
586 panic("nmi watchdog"); 627 panic("Non maskable interrupt");
587 printk("console shuts up ...\n");
588 oops_end(flags); 628 oops_end(flags);
589 nmi_exit(); 629 nmi_exit();
590 local_irq_enable(); 630 local_irq_enable();
@@ -730,8 +770,15 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
730static __kprobes void 770static __kprobes void
731mem_parity_error(unsigned char reason, struct pt_regs * regs) 771mem_parity_error(unsigned char reason, struct pt_regs * regs)
732{ 772{
733 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); 773 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
734 printk("You probably have a hardware problem with your RAM chips\n"); 774 reason);
775 printk(KERN_EMERG "You probably have a hardware problem with your "
776 "RAM chips\n");
777
778 if (panic_on_unrecovered_nmi)
779 panic("NMI: Not continuing");
780
781 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
735 782
736 /* Clear and disable the memory parity error line. */ 783 /* Clear and disable the memory parity error line. */
737 reason = (reason & 0xf) | 4; 784 reason = (reason & 0xf) | 4;
@@ -754,9 +801,15 @@ io_check_error(unsigned char reason, struct pt_regs * regs)
754 801
755static __kprobes void 802static __kprobes void
756unknown_nmi_error(unsigned char reason, struct pt_regs * regs) 803unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
757{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); 804{
758 printk("Dazed and confused, but trying to continue\n"); 805 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
759 printk("Do you have a strange power saving mode enabled?\n"); 806 reason);
807 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
808
809 if (panic_on_unrecovered_nmi)
810 panic("NMI: Not continuing");
811
812 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
760} 813}
761 814
762/* Runs on IST stack. This code must keep interrupts off all the time. 815/* Runs on IST stack. This code must keep interrupts off all the time.
@@ -776,17 +829,15 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
776 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 829 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
777 == NOTIFY_STOP) 830 == NOTIFY_STOP)
778 return; 831 return;
779#ifdef CONFIG_X86_LOCAL_APIC
780 /* 832 /*
781 * Ok, so this is none of the documented NMI sources, 833 * Ok, so this is none of the documented NMI sources,
782 * so it must be the NMI watchdog. 834 * so it must be the NMI watchdog.
783 */ 835 */
784 if (nmi_watchdog > 0) { 836 if (nmi_watchdog_tick(regs,reason))
785 nmi_watchdog_tick(regs,reason);
786 return; 837 return;
787 } 838 if (!do_nmi_callback(regs,cpu))
788#endif 839 unknown_nmi_error(reason, regs);
789 unknown_nmi_error(reason, regs); 840
790 return; 841 return;
791 } 842 }
792 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 843 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -1071,6 +1122,7 @@ asmlinkage void math_state_restore(void)
1071 init_fpu(me); 1122 init_fpu(me);
1072 restore_fpu_checking(&me->thread.i387.fxsave); 1123 restore_fpu_checking(&me->thread.i387.fxsave);
1073 task_thread_info(me)->status |= TS_USEDFPU; 1124 task_thread_info(me)->status |= TS_USEDFPU;
1125 me->fpu_counter++;
1074} 1126}
1075 1127
1076void __init trap_init(void) 1128void __init trap_init(void)
@@ -1109,24 +1161,30 @@ void __init trap_init(void)
1109} 1161}
1110 1162
1111 1163
1112/* Actual parsing is done early in setup.c. */ 1164static int __init oops_setup(char *s)
1113static int __init oops_dummy(char *s)
1114{ 1165{
1115 panic_on_oops = 1; 1166 if (!s)
1116 return 1; 1167 return -EINVAL;
1168 if (!strcmp(s, "panic"))
1169 panic_on_oops = 1;
1170 return 0;
1117} 1171}
1118__setup("oops=", oops_dummy); 1172early_param("oops", oops_setup);
1119 1173
1120static int __init kstack_setup(char *s) 1174static int __init kstack_setup(char *s)
1121{ 1175{
1176 if (!s)
1177 return -EINVAL;
1122 kstack_depth_to_print = simple_strtoul(s,NULL,0); 1178 kstack_depth_to_print = simple_strtoul(s,NULL,0);
1123 return 1; 1179 return 0;
1124} 1180}
1125__setup("kstack=", kstack_setup); 1181early_param("kstack", kstack_setup);
1126 1182
1127#ifdef CONFIG_STACK_UNWIND 1183#ifdef CONFIG_STACK_UNWIND
1128static int __init call_trace_setup(char *s) 1184static int __init call_trace_setup(char *s)
1129{ 1185{
1186 if (!s)
1187 return -EINVAL;
1130 if (strcmp(s, "old") == 0) 1188 if (strcmp(s, "old") == 0)
1131 call_trace = -1; 1189 call_trace = -1;
1132 else if (strcmp(s, "both") == 0) 1190 else if (strcmp(s, "both") == 0)
@@ -1135,7 +1193,7 @@ static int __init call_trace_setup(char *s)
1135 call_trace = 1; 1193 call_trace = 1;
1136 else if (strcmp(s, "new") == 0) 1194 else if (strcmp(s, "new") == 0)
1137 call_trace = 2; 1195 call_trace = 2;
1138 return 1; 1196 return 0;
1139} 1197}
1140__setup("call_trace=", call_trace_setup); 1198early_param("call_trace", call_trace_setup);
1141#endif 1199#endif
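
The final traps.c hunks convert the oops=, kstack= and call_trace= options from __setup() to early_param(), which changes the handler convention: a NULL argument is rejected with -EINVAL and success is reported as 0 rather than 1. A userspace-only model of the oops= handler following that convention (the kernel's option registration machinery is not reproduced):

/* Userspace model of an early_param-style handler: NULL argument is an
 * error, 0 means "handled", matching the convention adopted above. */
#include <stdio.h>
#include <string.h>
#include <errno.h>

static int panic_on_oops;

static int oops_setup(const char *s)
{
	if (!s)
		return -EINVAL;
	if (!strcmp(s, "panic"))
		panic_on_oops = 1;
	return 0;
}

int main(void)
{
	printf("ret=%d panic_on_oops=%d\n", oops_setup("panic"), panic_on_oops);
	printf("ret=%d (NULL argument)\n", oops_setup(NULL));
	return 0;
}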
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 7c4de31471d4..d0564f1bcb0b 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64) 13OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64) 14ENTRY(phys_startup_64)
15jiffies_64 = jiffies; 15jiffies_64 = jiffies;
16PHDRS {
17 text PT_LOAD FLAGS(5); /* R_E */
18 data PT_LOAD FLAGS(7); /* RWE */
19 user PT_LOAD FLAGS(7); /* RWE */
20 note PT_NOTE FLAGS(4); /* R__ */
21}
16SECTIONS 22SECTIONS
17{ 23{
18 . = __START_KERNEL; 24 . = __START_KERNEL;
@@ -31,7 +37,7 @@ SECTIONS
31 KPROBES_TEXT 37 KPROBES_TEXT
32 *(.fixup) 38 *(.fixup)
33 *(.gnu.warning) 39 *(.gnu.warning)
34 } = 0x9090 40 } :text = 0x9090
35 /* out-of-line lock text */ 41 /* out-of-line lock text */
36 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } 42 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
37 43
@@ -57,7 +63,7 @@ SECTIONS
57 .data : AT(ADDR(.data) - LOAD_OFFSET) { 63 .data : AT(ADDR(.data) - LOAD_OFFSET) {
58 *(.data) 64 *(.data)
59 CONSTRUCTORS 65 CONSTRUCTORS
60 } 66 } :data
61 67
62 _edata = .; /* End of data section */ 68 _edata = .; /* End of data section */
63 69
@@ -89,7 +95,7 @@ SECTIONS
89#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 95#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
90 96
91 . = VSYSCALL_ADDR; 97 . = VSYSCALL_ADDR;
92 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } 98 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
93 __vsyscall_0 = VSYSCALL_VIRT_ADDR; 99 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
94 100
95 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 101 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
@@ -99,6 +105,9 @@ SECTIONS
99 .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } 105 .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
100 vxtime = VVIRT(.vxtime); 106 vxtime = VVIRT(.vxtime);
101 107
108 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
109 vgetcpu_mode = VVIRT(.vgetcpu_mode);
110
102 .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } 111 .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
103 wall_jiffies = VVIRT(.wall_jiffies); 112 wall_jiffies = VVIRT(.wall_jiffies);
104 113
@@ -132,7 +141,7 @@ SECTIONS
132 . = ALIGN(8192); /* init_task */ 141 . = ALIGN(8192); /* init_task */
133 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 142 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
134 *(.data.init_task) 143 *(.data.init_task)
135 } 144 } :data
136 145
137 . = ALIGN(4096); 146 . = ALIGN(4096);
138 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 147 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -207,14 +216,12 @@ SECTIONS
207 __initramfs_start = .; 216 __initramfs_start = .;
208 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 217 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
209 __initramfs_end = .; 218 __initramfs_end = .;
210 /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+ 219 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
211 complain */
212 . = ALIGN(4096);
213 __init_end = .;
214 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
215 __per_cpu_start = .; 220 __per_cpu_start = .;
216 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } 221 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
217 __per_cpu_end = .; 222 __per_cpu_end = .;
223 . = ALIGN(4096);
224 __init_end = .;
218 225
219 . = ALIGN(4096); 226 . = ALIGN(4096);
220 __nosave_begin = .; 227 __nosave_begin = .;
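
The new PHDRS block assigns the kernel image explicit ELF program headers, and the FLAGS() values are the standard ELF p_flags bits (PF_X=1, PF_W=2, PF_R=4): 5 is read+execute for text, 7 is read/write/execute for data, 4 is read-only for the note segment. A quick check of the encoding:

/* The FLAGS() values in the PHDRS block are ELF p_flags bit masks. */
#include <stdio.h>

#define PF_X 0x1
#define PF_W 0x2
#define PF_R 0x4

int main(void)
{
	printf("text: %d\n", PF_R | PF_X);         /* 5, R_E */
	printf("data: %d\n", PF_R | PF_W | PF_X);  /* 7, RWE */
	printf("note: %d\n", PF_R);                /* 4, R__ */
	return 0;
}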
diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c
index 92f70c74965f..044e852bd25e 100644
--- a/arch/x86_64/kernel/vsmp.c
+++ b/arch/x86_64/kernel/vsmp.c
@@ -20,6 +20,9 @@ static int __init vsmp_init(void)
20 void *address; 20 void *address;
21 unsigned int cap, ctl; 21 unsigned int cap, ctl;
22 22
23 if (!early_pci_allowed())
24 return 0;
25
23 /* Check if we are running on a ScaleMP vSMP box */ 26 /* Check if we are running on a ScaleMP vSMP box */
24 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || 27 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
25 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) 28 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index f603037df162..ac48c3857ddb 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -26,6 +26,7 @@
26#include <linux/seqlock.h> 26#include <linux/seqlock.h>
27#include <linux/jiffies.h> 27#include <linux/jiffies.h>
28#include <linux/sysctl.h> 28#include <linux/sysctl.h>
29#include <linux/getcpu.h>
29 30
30#include <asm/vsyscall.h> 31#include <asm/vsyscall.h>
31#include <asm/pgtable.h> 32#include <asm/pgtable.h>
@@ -33,11 +34,15 @@
33#include <asm/fixmap.h> 34#include <asm/fixmap.h>
34#include <asm/errno.h> 35#include <asm/errno.h>
35#include <asm/io.h> 36#include <asm/io.h>
37#include <asm/segment.h>
38#include <asm/desc.h>
39#include <asm/topology.h>
36 40
37#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 41#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
38 42
39int __sysctl_vsyscall __section_sysctl_vsyscall = 1; 43int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
40seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; 44seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
45int __vgetcpu_mode __section_vgetcpu_mode;
41 46
42#include <asm/unistd.h> 47#include <asm/unistd.h>
43 48
@@ -72,7 +77,8 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
72 __vxtime.tsc_quot) >> 32; 77 __vxtime.tsc_quot) >> 32;
73 /* See comment in x86_64 do_gettimeofday. */ 78 /* See comment in x86_64 do_gettimeofday. */
74 } else { 79 } else {
75 usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - 80 usec += ((readl((void __iomem *)
81 fix_to_virt(VSYSCALL_HPET) + 0xf0) -
76 __vxtime.last) * __vxtime.quot) >> 32; 82 __vxtime.last) * __vxtime.quot) >> 32;
77 } 83 }
78 } while (read_seqretry(&__xtime_lock, sequence)); 84 } while (read_seqretry(&__xtime_lock, sequence));
@@ -127,9 +133,46 @@ time_t __vsyscall(1) vtime(time_t *t)
127 return __xtime.tv_sec; 133 return __xtime.tv_sec;
128} 134}
129 135
130long __vsyscall(2) venosys_0(void) 136/* Fast way to get current CPU and node.
137 This helps to do per node and per CPU caches in user space.
138 The result is not guaranteed without CPU affinity, but usually
139 works out because the scheduler tries to keep a thread on the same
140 CPU.
141
142 tcache must point to a two element sized long array.
143 All arguments can be NULL. */
144long __vsyscall(2)
145vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
131{ 146{
132 return -ENOSYS; 147 unsigned int dummy, p;
148 unsigned long j = 0;
149
150 /* Fast cache - only recompute value once per jiffies and avoid
151 relatively costly rdtscp/cpuid otherwise.
152 This works because the scheduler usually keeps the process
153 on the same CPU and this syscall doesn't guarantee its
154 results anyways.
155 We do this here because otherwise user space would do it on
156 its own in a likely inferior way (no access to jiffies).
157 If you don't like it pass NULL. */
158 if (tcache && tcache->t0 == (j = __jiffies)) {
159 p = tcache->t1;
160 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
161 /* Load per CPU data from RDTSCP */
162 rdtscp(dummy, dummy, p);
163 } else {
164 /* Load per CPU data from GDT */
165 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
166 }
167 if (tcache) {
168 tcache->t0 = j;
169 tcache->t1 = p;
170 }
171 if (cpu)
172 *cpu = p & 0xfff;
173 if (node)
174 *node = p >> 12;
175 return 0;
133} 176}
134 177
135long __vsyscall(3) venosys_1(void) 178long __vsyscall(3) venosys_1(void)
@@ -149,7 +192,8 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
149 void __user *buffer, size_t *lenp, loff_t *ppos) 192 void __user *buffer, size_t *lenp, loff_t *ppos)
150{ 193{
151 extern u16 vsysc1, vsysc2; 194 extern u16 vsysc1, vsysc2;
152 u16 *map1, *map2; 195 u16 __iomem *map1;
196 u16 __iomem *map2;
153 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); 197 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
154 if (!write) 198 if (!write)
155 return ret; 199 return ret;
@@ -164,11 +208,11 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
164 goto out; 208 goto out;
165 } 209 }
166 if (!sysctl_vsyscall) { 210 if (!sysctl_vsyscall) {
167 *map1 = SYSCALL; 211 writew(SYSCALL, map1);
168 *map2 = SYSCALL; 212 writew(SYSCALL, map2);
169 } else { 213 } else {
170 *map1 = NOP2; 214 writew(NOP2, map1);
171 *map2 = NOP2; 215 writew(NOP2, map2);
172 } 216 }
173 iounmap(map2); 217 iounmap(map2);
174out: 218out:
@@ -200,6 +244,43 @@ static ctl_table kernel_root_table2[] = {
200 244
201#endif 245#endif
202 246
247static void __cpuinit write_rdtscp_cb(void *info)
248{
249 write_rdtscp_aux((unsigned long)info);
250}
251
252void __cpuinit vsyscall_set_cpu(int cpu)
253{
254 unsigned long *d;
255 unsigned long node = 0;
256#ifdef CONFIG_NUMA
257 node = cpu_to_node[cpu];
258#endif
259 if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) {
260 void *info = (void *)((node << 12) | cpu);
261 /* Can happen on preemptive kernel */
262 if (get_cpu() == cpu)
263 write_rdtscp_cb(info);
264#ifdef CONFIG_SMP
265 else {
266 /* the notifier is unfortunately not executed on the
267 target CPU */
268 smp_call_function_single(cpu,write_rdtscp_cb,info,0,1);
269 }
270#endif
271 put_cpu();
272 }
273
274 /* Store cpu number in limit so that it can be loaded quickly
275 in user space in vgetcpu.
276 12 bits for the CPU and 8 bits for the node. */
277 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
278 *d = 0x0f40000000000ULL;
279 *d |= cpu;
280 *d |= (node & 0xf) << 12;
281 *d |= (node >> 4) << 48;
282}
283
203static void __init map_vsyscall(void) 284static void __init map_vsyscall(void)
204{ 285{
205 extern char __vsyscall_0; 286 extern char __vsyscall_0;
@@ -214,6 +295,7 @@ static int __init vsyscall_init(void)
214 VSYSCALL_ADDR(__NR_vgettimeofday))); 295 VSYSCALL_ADDR(__NR_vgettimeofday)));
215 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); 296 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
216 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); 297 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
298 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
217 map_vsyscall(); 299 map_vsyscall();
218#ifdef CONFIG_SYSCTL 300#ifdef CONFIG_SYSCTL
219 register_sysctl_table(kernel_root_table2, 0); 301 register_sysctl_table(kernel_root_table2, 0);
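
vsyscall_set_cpu() above stashes the CPU and node numbers in the limit field of a per-CPU GDT descriptor (and in TSC_AUX where RDTSCP exists) so that vgetcpu() can recover them with a single LSL or RDTSCP: the CPU sits in the low 12 bits and the node in the bits above. The packing and unpacking can be exercised on their own:

/* Pack/unpack check for the cpu/node encoding used by vgetcpu() above:
 * the low 12 bits carry the CPU number, the bits above carry the node. */
#include <stdio.h>

int main(void)
{
	unsigned int cpu = 5, node = 3;
	unsigned int p = (node << 12) | cpu;   /* what the GDT limit / TSC_AUX holds */

	printf("cpu=%u node=%u\n", p & 0xfff, p >> 12);
	return 0;
}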
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 370952c4ff22..c3454af5e3a2 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -29,6 +29,7 @@ EXPORT_SYMBOL(__put_user_8);
29EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic);
30EXPORT_SYMBOL(copy_from_user); 30EXPORT_SYMBOL(copy_from_user);
31EXPORT_SYMBOL(copy_to_user); 31EXPORT_SYMBOL(copy_to_user);
32EXPORT_SYMBOL(__copy_from_user_inatomic);
32 33
33EXPORT_SYMBOL(copy_page); 34EXPORT_SYMBOL(copy_page);
34EXPORT_SYMBOL(clear_page); 35EXPORT_SYMBOL(clear_page);