Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--  arch/i386/kernel/Makefile | 6
-rw-r--r--  arch/i386/kernel/acpi/boot.c | 25
-rw-r--r--  arch/i386/kernel/apic.c | 1631
-rw-r--r--  arch/i386/kernel/apm.c | 70
-rw-r--r--  arch/i386/kernel/asm-offsets.c | 2
-rw-r--r--  arch/i386/kernel/cpu/common.c | 14
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/Kconfig | 9
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/Makefile | 1
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/e_powersaver.c | 334
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/longhaul.c | 359
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/longhaul.h | 153
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/powernow-k8.c | 6
-rw-r--r--  arch/i386/kernel/cpu/cyrix.c | 52
-rw-r--r--  arch/i386/kernel/cpu/mcheck/mce.c | 1
-rw-r--r--  arch/i386/kernel/cpu/mcheck/mce.h | 2
-rw-r--r--  arch/i386/kernel/cpu/mcheck/p4.c | 2
-rw-r--r--  arch/i386/kernel/cpu/mtrr/if.c | 30
-rw-r--r--  arch/i386/kernel/cpu/mtrr/main.c | 6
-rw-r--r--  arch/i386/kernel/cpu/mtrr/mtrr.h | 2
-rw-r--r--  arch/i386/kernel/cpu/proc.c | 14
-rw-r--r--  arch/i386/kernel/cpu/transmeta.c | 5
-rw-r--r--  arch/i386/kernel/cpuid.c | 7
-rw-r--r--  arch/i386/kernel/e820.c | 18
-rw-r--r--  arch/i386/kernel/entry.S | 78
-rw-r--r--  arch/i386/kernel/head.S | 38
-rw-r--r--  arch/i386/kernel/hpet.c | 498
-rw-r--r--  arch/i386/kernel/i8253.c | 96
-rw-r--r--  arch/i386/kernel/i8259.c | 7
-rw-r--r--  arch/i386/kernel/io_apic.c | 14
-rw-r--r--  arch/i386/kernel/irq.c | 25
-rw-r--r--  arch/i386/kernel/kprobes.c | 6
-rw-r--r--  arch/i386/kernel/microcode.c | 2
-rw-r--r--  arch/i386/kernel/msr.c | 13
-rw-r--r--  arch/i386/kernel/nmi.c | 107
-rw-r--r--  arch/i386/kernel/paravirt.c | 116
-rw-r--r--  arch/i386/kernel/pcspeaker.c | 20
-rw-r--r--  arch/i386/kernel/process.c | 102
-rw-r--r--  arch/i386/kernel/ptrace.c | 16
-rw-r--r--  arch/i386/kernel/setup.c | 35
-rw-r--r--  arch/i386/kernel/signal.c | 16
-rw-r--r--  arch/i386/kernel/smp.c | 7
-rw-r--r--  arch/i386/kernel/smpboot.c | 203
-rw-r--r--  arch/i386/kernel/sysenter.c | 2
-rw-r--r--  arch/i386/kernel/time.c | 138
-rw-r--r--  arch/i386/kernel/time_hpet.c | 497
-rw-r--r--  arch/i386/kernel/traps.c | 27
-rw-r--r--  arch/i386/kernel/tsc.c | 195
-rw-r--r--  arch/i386/kernel/tsc_sync.c | 1
-rw-r--r--  arch/i386/kernel/vm86.c | 33
-rw-r--r--  arch/i386/kernel/vmi.c | 949
-rw-r--r--  arch/i386/kernel/vmitime.c | 499
-rw-r--r--  arch/i386/kernel/vmlinux.lds.S | 7
52 files changed, 4141 insertions(+), 2355 deletions(-)
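
The bulk of this diff converts the i386 local APIC timer to the generic clockevents framework (the apic.c hunks below). As an orientation sketch only, not part of the patch itself: the new per-CPU clock event device is declared and registered roughly as follows. The struct fields and helper names (lapic_timer_setup, lapic_next_event, lapic_timer_broadcast, setup_APIC_timer) are taken from the lapic_clockevent code added in apic.c; treat this as a simplified excerpt under those assumptions, not stand-alone code.

/*
 * Sketch (simplified excerpt of the apic.c changes below): the local APIC
 * timer becomes a per-CPU clock_event_device registered with the
 * clockevents core.
 */
#include <linux/clockchips.h>
#include <linux/percpu.h>

static int  lapic_next_event(unsigned long delta, struct clock_event_device *evt);
static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt);
static void lapic_timer_broadcast(cpumask_t mask);

static struct clock_event_device lapic_clockevent = {
	.name		= "lapic",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
			  | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
	.shift		= 32,
	.set_mode	= lapic_timer_setup,	/* periodic / oneshot / shutdown */
	.set_next_event	= lapic_next_event,	/* program APIC_TMICT for oneshot mode */
	.broadcast	= lapic_timer_broadcast,
	.rating		= 100,
	.irq		= -1,
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);

/* Each CPU copies the template, binds it to itself and registers it. */
static void setup_APIC_timer(void)
{
	struct clock_event_device *levt = &__get_cpu_var(lapic_events);

	memcpy(levt, &lapic_clockevent, sizeof(*levt));
	levt->cpumask = cpumask_of_cpu(smp_processor_id());
	clockevents_register_device(levt);
}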
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1e8988e558c5..4ae3dcf1d2f0 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_X86_MSR) += msr.o
 obj-$(CONFIG_X86_CPUID)	+= cpuid.o
 obj-$(CONFIG_MICROCODE)	+= microcode.o
 obj-$(CONFIG_APM)		+= apm.o
-obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o
+obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
@@ -32,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-y				+= sysenter.o vsyscall.o
 obj-$(CONFIG_ACPI_SRAT)	+= srat.o
-obj-$(CONFIG_HPET_TIMER)	+= time_hpet.o
 obj-$(CONFIG_EFI)		+= efi.o efi_stub.o
 obj-$(CONFIG_DOUBLEFAULT)	+= doublefault.o
 obj-$(CONFIG_VM86)		+= vm86.o
@@ -40,8 +39,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-$(CONFIG_HPET_TIMER)	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 
-# Make sure this is linked after any other paravirt_ops structs: see head.S
+obj-$(CONFIG_VMI)		+= vmi.o vmitime.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
+obj-y				+= pcspeaker.o
 
 EXTRA_AFLAGS   := -traditional
 
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index e94aff6888ca..fb3e72328a5a 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -25,6 +25,7 @@
 
 #include <linux/init.h>
 #include <linux/acpi.h>
+#include <linux/acpi_pmtmr.h>
 #include <linux/efi.h>
 #include <linux/cpumask.h>
 #include <linux/module.h>
@@ -615,6 +616,7 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table)
 }
 
 #ifdef CONFIG_HPET_TIMER
+#include <asm/hpet.h>
 
 static int __init acpi_parse_hpet(struct acpi_table_header *table)
 {
@@ -645,24 +647,11 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
 		hpet_res->end = (1 * 1024) - 1;
 	}
 
-#ifdef	CONFIG_X86_64
-	vxtime.hpet_address = hpet_tbl->address.address;
-
+	hpet_address = hpet_tbl->address.address;
 	printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
-		hpet_tbl->id, vxtime.hpet_address);
-
-	res_start = vxtime.hpet_address;
-#else	/* X86 */
-	{
-		extern unsigned long hpet_address;
-
-		hpet_address = hpet_tbl->address.address;
-		printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
-			hpet_tbl->id, hpet_address);
+		hpet_tbl->id, hpet_address);
 
 	res_start = hpet_address;
-	}
-#endif	/* X86 */
 
 	if (hpet_res) {
 		hpet_res->start = res_start;
@@ -676,10 +665,6 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
 #define	acpi_parse_hpet	NULL
 #endif
 
-#ifdef CONFIG_X86_PM_TIMER
-extern u32 pmtmr_ioport;
-#endif
-
 static int __init acpi_parse_fadt(struct acpi_table_header *table)
 {
 
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 776d9be26af9..9655c233e6f1 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -25,6 +25,8 @@
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
 #include <linux/cpu.h>
+#include <linux/clockchips.h>
+#include <linux/acpi_pmtmr.h>
 #include <linux/module.h>
 
 #include <asm/atomic.h>
@@ -36,6 +38,7 @@
 #include <asm/hpet.h>
 #include <asm/i8253.h>
 #include <asm/nmi.h>
+#include <asm/idle.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
@@ -44,128 +47,549 @@
44#include "io_ports.h" 47#include "io_ports.h"
45 48
46/* 49/*
47 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as 50 * Sanity check
48 * IPIs in place of local APIC timers
49 */ 51 */
50static cpumask_t timer_bcast_ipi; 52#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
53# error SPURIOUS_APIC_VECTOR definition error
54#endif
51 55
52/* 56/*
53 * Knob to control our willingness to enable the local APIC. 57 * Knob to control our willingness to enable the local APIC.
58 *
59 * -1=force-disable, +1=force-enable
54 */ 60 */
55static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ 61static int enable_local_apic __initdata = 0;
56
57static inline void lapic_disable(void)
58{
59 enable_local_apic = -1;
60 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
61}
62 62
63static inline void lapic_enable(void) 63/* Local APIC timer verification ok */
64{ 64static int local_apic_timer_verify_ok;
65 enable_local_apic = 1;
66}
67 65
68/* 66/*
69 * Debug level 67 * Debug level, exported for io_apic.c
70 */ 68 */
71int apic_verbosity; 69int apic_verbosity;
72 70
71static unsigned int calibration_result;
73 72
73static int lapic_next_event(unsigned long delta,
74 struct clock_event_device *evt);
75static void lapic_timer_setup(enum clock_event_mode mode,
76 struct clock_event_device *evt);
77static void lapic_timer_broadcast(cpumask_t mask);
74static void apic_pm_activate(void); 78static void apic_pm_activate(void);
75 79
80/*
81 * The local apic timer can be used for any function which is CPU local.
82 */
83static struct clock_event_device lapic_clockevent = {
84 .name = "lapic",
85 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
86 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
87 .shift = 32,
88 .set_mode = lapic_timer_setup,
89 .set_next_event = lapic_next_event,
90 .broadcast = lapic_timer_broadcast,
91 .rating = 100,
92 .irq = -1,
93};
94static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
95
96/* Local APIC was disabled by the BIOS and enabled by the kernel */
97static int enabled_via_apicbase;
98
99/*
100 * Get the LAPIC version
101 */
102static inline int lapic_get_version(void)
103{
104 return GET_APIC_VERSION(apic_read(APIC_LVR));
105}
106
107/*
108 * Check, if the APIC is integrated or a seperate chip
109 */
110static inline int lapic_is_integrated(void)
111{
112 return APIC_INTEGRATED(lapic_get_version());
113}
114
115/*
116 * Check, whether this is a modern or a first generation APIC
117 */
76static int modern_apic(void) 118static int modern_apic(void)
77{ 119{
78 unsigned int lvr, version;
79 /* AMD systems use old APIC versions, so check the CPU */ 120 /* AMD systems use old APIC versions, so check the CPU */
80 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 121 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
81 boot_cpu_data.x86 >= 0xf) 122 boot_cpu_data.x86 >= 0xf)
82 return 1; 123 return 1;
83 lvr = apic_read(APIC_LVR); 124 return lapic_get_version() >= 0x14;
84 version = GET_APIC_VERSION(lvr);
85 return version >= 0x14;
86} 125}
87 126
127/**
128 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
129 */
130void enable_NMI_through_LVT0 (void * dummy)
131{
132 unsigned int v = APIC_DM_NMI;
133
134 /* Level triggered for 82489DX */
135 if (!lapic_is_integrated())
136 v |= APIC_LVT_LEVEL_TRIGGER;
137 apic_write_around(APIC_LVT0, v);
138}
139
140/**
141 * get_physical_broadcast - Get number of physical broadcast IDs
142 */
143int get_physical_broadcast(void)
144{
145 return modern_apic() ? 0xff : 0xf;
146}
147
148/**
149 * lapic_get_maxlvt - get the maximum number of local vector table entries
150 */
151int lapic_get_maxlvt(void)
152{
153 unsigned int v = apic_read(APIC_LVR);
154
155 /* 82489DXs do not report # of LVT entries. */
156 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
157}
158
159/*
160 * Local APIC timer
161 */
162
163/* Clock divisor is set to 16 */
164#define APIC_DIVISOR 16
165
88/* 166/*
89 * 'what should we do if we get a hw irq event on an illegal vector'. 167 * This function sets up the local APIC timer, with a timeout of
90 * each architecture has to answer this themselves. 168 * 'clocks' APIC bus clock. During calibration we actually call
169 * this function twice on the boot CPU, once with a bogus timeout
170 * value, second time for real. The other (noncalibrating) CPUs
171 * call this function only once, with the real, calibrated value.
172 *
173 * We do reads before writes even if unnecessary, to get around the
174 * P5 APIC double write bug.
91 */ 175 */
92void ack_bad_irq(unsigned int irq) 176static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
93{ 177{
94 printk("unexpected IRQ trap at vector %02x\n", irq); 178 unsigned int lvtt_value, tmp_value;
179
180 lvtt_value = LOCAL_TIMER_VECTOR;
181 if (!oneshot)
182 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
183 if (!lapic_is_integrated())
184 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
185
186 if (!irqen)
187 lvtt_value |= APIC_LVT_MASKED;
188
189 apic_write_around(APIC_LVTT, lvtt_value);
190
95 /* 191 /*
96 * Currently unexpected vectors happen only on SMP and APIC. 192 * Divide PICLK by 16
97 * We _must_ ack these because every local APIC has only N
98 * irq slots per priority level, and a 'hanging, unacked' IRQ
99 * holds up an irq slot - in excessive cases (when multiple
100 * unexpected vectors occur) that might lock up the APIC
101 * completely.
102 * But only ack when the APIC is enabled -AK
103 */ 193 */
104 if (cpu_has_apic) 194 tmp_value = apic_read(APIC_TDCR);
105 ack_APIC_irq(); 195 apic_write_around(APIC_TDCR, (tmp_value
196 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
197 | APIC_TDR_DIV_16);
198
199 if (!oneshot)
200 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
106} 201}
107 202
108void __init apic_intr_init(void) 203/*
204 * Program the next event, relative to now
205 */
206static int lapic_next_event(unsigned long delta,
207 struct clock_event_device *evt)
208{
209 apic_write_around(APIC_TMICT, delta);
210 return 0;
211}
212
213/*
214 * Setup the lapic timer in periodic or oneshot mode
215 */
216static void lapic_timer_setup(enum clock_event_mode mode,
217 struct clock_event_device *evt)
218{
219 unsigned long flags;
220 unsigned int v;
221
222 /* Lapic used for broadcast ? */
223 if (!local_apic_timer_verify_ok)
224 return;
225
226 local_irq_save(flags);
227
228 switch (mode) {
229 case CLOCK_EVT_MODE_PERIODIC:
230 case CLOCK_EVT_MODE_ONESHOT:
231 __setup_APIC_LVTT(calibration_result,
232 mode != CLOCK_EVT_MODE_PERIODIC, 1);
233 break;
234 case CLOCK_EVT_MODE_UNUSED:
235 case CLOCK_EVT_MODE_SHUTDOWN:
236 v = apic_read(APIC_LVTT);
237 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
238 apic_write_around(APIC_LVTT, v);
239 break;
240 }
241
242 local_irq_restore(flags);
243}
244
245/*
246 * Local APIC timer broadcast function
247 */
248static void lapic_timer_broadcast(cpumask_t mask)
109{ 249{
110#ifdef CONFIG_SMP 250#ifdef CONFIG_SMP
111 smp_intr_init(); 251 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
112#endif 252#endif
113 /* self generated IPI for local APIC timer */ 253}
114 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
115 254
116 /* IPI vectors for APIC spurious and error interrupts */ 255/*
117 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 256 * Setup the local APIC timer for this CPU. Copy the initilized values
118 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 257 * of the boot CPU and register the clock event in the framework.
258 */
259static void __devinit setup_APIC_timer(void)
260{
261 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
119 262
120 /* thermal monitor LVT interrupt */ 263 memcpy(levt, &lapic_clockevent, sizeof(*levt));
121#ifdef CONFIG_X86_MCE_P4THERMAL 264 levt->cpumask = cpumask_of_cpu(smp_processor_id());
122 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 265
123#endif 266 clockevents_register_device(levt);
124} 267}
125 268
126/* Using APIC to generate smp_local_timer_interrupt? */ 269/*
127int using_apic_timer __read_mostly = 0; 270 * In this functions we calibrate APIC bus clocks to the external timer.
271 *
272 * We want to do the calibration only once since we want to have local timer
273 * irqs syncron. CPUs connected by the same APIC bus have the very same bus
274 * frequency.
275 *
276 * This was previously done by reading the PIT/HPET and waiting for a wrap
277 * around to find out, that a tick has elapsed. I have a box, where the PIT
278 * readout is broken, so it never gets out of the wait loop again. This was
279 * also reported by others.
280 *
281 * Monitoring the jiffies value is inaccurate and the clockevents
282 * infrastructure allows us to do a simple substitution of the interrupt
283 * handler.
284 *
285 * The calibration routine also uses the pm_timer when possible, as the PIT
286 * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
287 * back to normal later in the boot process).
288 */
289
290#define LAPIC_CAL_LOOPS (HZ/10)
128 291
129static int enabled_via_apicbase; 292static __initdata volatile int lapic_cal_loops = -1;
293static __initdata long lapic_cal_t1, lapic_cal_t2;
294static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
295static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
296static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
130 297
131void enable_NMI_through_LVT0 (void * dummy) 298/*
299 * Temporary interrupt handler.
300 */
301static void __init lapic_cal_handler(struct clock_event_device *dev)
132{ 302{
133 unsigned int v, ver; 303 unsigned long long tsc = 0;
304 long tapic = apic_read(APIC_TMCCT);
305 unsigned long pm = acpi_pm_read_early();
134 306
135 ver = apic_read(APIC_LVR); 307 if (cpu_has_tsc)
136 ver = GET_APIC_VERSION(ver); 308 rdtscll(tsc);
137 v = APIC_DM_NMI; /* unmask and set to NMI */ 309
138 if (!APIC_INTEGRATED(ver)) /* 82489DX */ 310 switch (lapic_cal_loops++) {
139 v |= APIC_LVT_LEVEL_TRIGGER; 311 case 0:
140 apic_write_around(APIC_LVT0, v); 312 lapic_cal_t1 = tapic;
313 lapic_cal_tsc1 = tsc;
314 lapic_cal_pm1 = pm;
315 lapic_cal_j1 = jiffies;
316 break;
317
318 case LAPIC_CAL_LOOPS:
319 lapic_cal_t2 = tapic;
320 lapic_cal_tsc2 = tsc;
321 if (pm < lapic_cal_pm1)
322 pm += ACPI_PM_OVRRUN;
323 lapic_cal_pm2 = pm;
324 lapic_cal_j2 = jiffies;
325 break;
326 }
141} 327}
142 328
143int get_physical_broadcast(void) 329/*
330 * Setup the boot APIC
331 *
332 * Calibrate and verify the result.
333 */
334void __init setup_boot_APIC_clock(void)
144{ 335{
145 if (modern_apic()) 336 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
146 return 0xff; 337 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
147 else 338 const long pm_thresh = pm_100ms/100;
148 return 0xf; 339 void (*real_handler)(struct clock_event_device *dev);
340 unsigned long deltaj;
341 long delta, deltapm;
342
343 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
344 "calibrating APIC timer ...\n");
345
346 local_irq_disable();
347
348 /* Replace the global interrupt handler */
349 real_handler = global_clock_event->event_handler;
350 global_clock_event->event_handler = lapic_cal_handler;
351
352 /*
353 * Setup the APIC counter to 1e9. There is no way the lapic
354 * can underflow in the 100ms detection time frame
355 */
356 __setup_APIC_LVTT(1000000000, 0, 0);
357
358 /* Let the interrupts run */
359 local_irq_enable();
360
361 while(lapic_cal_loops <= LAPIC_CAL_LOOPS);
362
363 local_irq_disable();
364
365 /* Restore the real event handler */
366 global_clock_event->event_handler = real_handler;
367
368 /* Build delta t1-t2 as apic timer counts down */
369 delta = lapic_cal_t1 - lapic_cal_t2;
370 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
371
372 /* Check, if the PM timer is available */
373 deltapm = lapic_cal_pm2 - lapic_cal_pm1;
374 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
375
376 if (deltapm) {
377 unsigned long mult;
378 u64 res;
379
380 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
381
382 if (deltapm > (pm_100ms - pm_thresh) &&
383 deltapm < (pm_100ms + pm_thresh)) {
384 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
385 } else {
386 res = (((u64) deltapm) * mult) >> 22;
387 do_div(res, 1000000);
388 printk(KERN_WARNING "APIC calibration not consistent "
389 "with PM Timer: %ldms instead of 100ms\n",
390 (long)res);
391 /* Correct the lapic counter value */
392 res = (((u64) delta ) * pm_100ms);
393 do_div(res, deltapm);
394 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
395 "%lu (%ld)\n", (unsigned long) res, delta);
396 delta = (long) res;
397 }
398 }
399
400 /* Calculate the scaled math multiplication factor */
401 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
402 lapic_clockevent.max_delta_ns =
403 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
404 lapic_clockevent.min_delta_ns =
405 clockevent_delta2ns(0xF, &lapic_clockevent);
406
407 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
408
409 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
410 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
411 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
412 calibration_result);
413
414 if (cpu_has_tsc) {
415 delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
416 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
417 "%ld.%04ld MHz.\n",
418 (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
419 (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
420 }
421
422 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
423 "%u.%04u MHz.\n",
424 calibration_result / (1000000 / HZ),
425 calibration_result % (1000000 / HZ));
426
427
428 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
429
430 /*
431 * Setup the apic timer manually
432 */
433 local_apic_timer_verify_ok = 1;
434 levt->event_handler = lapic_cal_handler;
435 lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
436 lapic_cal_loops = -1;
437
438 /* Let the interrupts run */
439 local_irq_enable();
440
441 while(lapic_cal_loops <= LAPIC_CAL_LOOPS);
442
443 local_irq_disable();
444
445 /* Stop the lapic timer */
446 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
447
448 local_irq_enable();
449
450 /* Jiffies delta */
451 deltaj = lapic_cal_j2 - lapic_cal_j1;
452 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
453
454 /* Check, if the PM timer is available */
455 deltapm = lapic_cal_pm2 - lapic_cal_pm1;
456 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
457
458 local_apic_timer_verify_ok = 0;
459
460 if (deltapm) {
461 if (deltapm > (pm_100ms - pm_thresh) &&
462 deltapm < (pm_100ms + pm_thresh)) {
463 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
464 /* Check, if the jiffies result is consistent */
465 if (deltaj < LAPIC_CAL_LOOPS-2 ||
466 deltaj > LAPIC_CAL_LOOPS+2) {
467 /*
468 * Not sure, what we can do about this one.
469 * When high resultion timers are active
470 * and the lapic timer does not stop in C3
471 * we are fine. Otherwise more trouble might
472 * be waiting. -- tglx
473 */
474 printk(KERN_WARNING "Global event device %s "
475 "has wrong frequency "
476 "(%lu ticks instead of %d)\n",
477 global_clock_event->name, deltaj,
478 LAPIC_CAL_LOOPS);
479 }
480 local_apic_timer_verify_ok = 1;
481 }
482 } else {
483 /* Check, if the jiffies result is consistent */
484 if (deltaj >= LAPIC_CAL_LOOPS-2 &&
485 deltaj <= LAPIC_CAL_LOOPS+2) {
486 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
487 local_apic_timer_verify_ok = 1;
488 }
489 }
490
491 if (!local_apic_timer_verify_ok) {
492 printk(KERN_WARNING
493 "APIC timer disabled due to verification failure.\n");
494 /* No broadcast on UP ! */
495 if (num_possible_cpus() == 1)
496 return;
497 } else
498 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
499
500 /* Setup the lapic or request the broadcast */
501 setup_APIC_timer();
502}
503
504void __devinit setup_secondary_APIC_clock(void)
505{
506 setup_APIC_timer();
149} 507}
150 508
151int get_maxlvt(void) 509/*
510 * The guts of the apic timer interrupt
511 */
512static void local_apic_timer_interrupt(void)
152{ 513{
153 unsigned int v, ver, maxlvt; 514 int cpu = smp_processor_id();
515 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
154 516
155 v = apic_read(APIC_LVR); 517 /*
156 ver = GET_APIC_VERSION(v); 518 * Normally we should not be here till LAPIC has been initialized but
157 /* 82489DXs do not report # of LVT entries. */ 519 * in some cases like kdump, its possible that there is a pending LAPIC
158 maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2; 520 * timer interrupt from previous kernel's context and is delivered in
159 return maxlvt; 521 * new kernel the moment interrupts are enabled.
522 *
523 * Interrupts are enabled early and LAPIC is setup much later, hence
524 * its possible that when we get here evt->event_handler is NULL.
525 * Check for event_handler being NULL and discard the interrupt as
526 * spurious.
527 */
528 if (!evt->event_handler) {
529 printk(KERN_WARNING
530 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
531 /* Switch it off */
532 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
533 return;
534 }
535
536 per_cpu(irq_stat, cpu).apic_timer_irqs++;
537
538 evt->event_handler(evt);
160} 539}
161 540
541/*
542 * Local APIC timer interrupt. This is the most natural way for doing
543 * local interrupts, but local timer interrupts can be emulated by
544 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
545 *
546 * [ if a single-CPU system runs an SMP kernel then we call the local
547 * interrupt as well. Thus we cannot inline the local irq ... ]
548 */
549
550void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
551{
552 struct pt_regs *old_regs = set_irq_regs(regs);
553
554 /*
555 * NOTE! We'd better ACK the irq immediately,
556 * because timer handling can be slow.
557 */
558 ack_APIC_irq();
559 /*
560 * update_process_times() expects us to have done irq_enter().
561 * Besides, if we don't timer interrupts ignore the global
562 * interrupt lock, which is the WrongThing (tm) to do.
563 */
564 exit_idle();
565 irq_enter();
566 local_apic_timer_interrupt();
567 irq_exit();
568
569 set_irq_regs(old_regs);
570}
571
572int setup_profiling_timer(unsigned int multiplier)
573{
574 return -EINVAL;
575}
576
577/*
578 * Local APIC start and shutdown
579 */
580
581/**
582 * clear_local_APIC - shutdown the local APIC
583 *
584 * This is called, when a CPU is disabled and before rebooting, so the state of
585 * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
586 * leftovers during boot.
587 */
162void clear_local_APIC(void) 588void clear_local_APIC(void)
163{ 589{
164 int maxlvt; 590 int maxlvt = lapic_get_maxlvt();
165 unsigned long v; 591 unsigned long v;
166 592
167 maxlvt = get_maxlvt();
168
169 /* 593 /*
170 * Masking an LVT entry can trigger a local APIC error 594 * Masking an LVT entry can trigger a local APIC error
171 * if the vector is zero. Mask LVTERR first to prevent this. 595 * if the vector is zero. Mask LVTERR first to prevent this.
@@ -189,7 +613,7 @@ void clear_local_APIC(void)
 		apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
 	}
 
-/* lets not touch this if we didn't frob it */
+	/* lets not touch this if we didn't frob it */
 #ifdef CONFIG_X86_MCE_P4THERMAL
 	if (maxlvt >= 5) {
 		v = apic_read(APIC_LVTTHMR);
@@ -211,85 +635,18 @@ void clear_local_APIC(void)
 	if (maxlvt >= 5)
 		apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
 #endif
-	v = GET_APIC_VERSION(apic_read(APIC_LVR));
-	if (APIC_INTEGRATED(v)) {	/* !82489DX */
-		if (maxlvt > 3)		/* Due to Pentium errata 3AP and 11AP. */
+	/* Integrated APIC (!82489DX) ? */
+	if (lapic_is_integrated()) {
+		if (maxlvt > 3)
+			/* Clear ESR due to Pentium errata 3AP and 11AP */
 			apic_write(APIC_ESR, 0);
 		apic_read(APIC_ESR);
 	}
 }
 
-void __init connect_bsp_APIC(void)
-{
-	if (pic_mode) {
-		/*
-		 * Do not trust the local APIC being empty at bootup.
-		 */
-		clear_local_APIC();
-		/*
-		 * PIC mode, enable APIC mode in the IMCR, i.e.
-		 * connect BSP's local APIC to INT and NMI lines.
-		 */
-		apic_printk(APIC_VERBOSE, "leaving PIC mode, "
-				"enabling APIC mode.\n");
-		outb(0x70, 0x22);
-		outb(0x01, 0x23);
-	}
-	enable_apic_mode();
-}
-
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
-	if (pic_mode) {
-		/*
-		 * Put the board back into PIC mode (has an effect
-		 * only on certain older boards). Note that APIC
-		 * interrupts, including IPIs, won't work beyond
-		 * this point! The only exception are INIT IPIs.
-		 */
-		apic_printk(APIC_VERBOSE, "disabling APIC mode, "
-				"entering PIC mode.\n");
-		outb(0x70, 0x22);
-		outb(0x00, 0x23);
-	}
-	else {
-		/* Go back to Virtual Wire compatibility mode */
-		unsigned long value;
-
-		/* For the spurious interrupt use vector F, and enable it */
-		value = apic_read(APIC_SPIV);
-		value &= ~APIC_VECTOR_MASK;
-		value |= APIC_SPIV_APIC_ENABLED;
-		value |= 0xf;
-		apic_write_around(APIC_SPIV, value);
-
-		if (!virt_wire_setup) {
-			/* For LVT0 make it edge triggered, active high, external and enabled */
-			value = apic_read(APIC_LVT0);
-			value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-				APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-				APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
-			value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-			value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-			apic_write_around(APIC_LVT0, value);
-		}
-		else {
-			/* Disable LVT0 */
-			apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-		}
-
-		/* For LVT1 make it edge triggered, active high, nmi and enabled */
-		value = apic_read(APIC_LVT1);
-		value &= ~(
-			APIC_MODE_MASK | APIC_SEND_PENDING |
-			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
-		value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-		apic_write_around(APIC_LVT1, value);
-	}
-}
-
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
 void disable_local_APIC(void)
 {
 	unsigned long value;
@@ -304,8 +661,13 @@ void disable_local_APIC(void)
 	value &= ~APIC_SPIV_APIC_ENABLED;
 	apic_write_around(APIC_SPIV, value);
 
+	/*
+	 * When LAPIC was disabled by the BIOS and enabled by the kernel,
+	 * restore the disabled state.
+	 */
 	if (enabled_via_apicbase) {
 		unsigned int l, h;
+
 		rdmsr(MSR_IA32_APICBASE, l, h);
 		l &= ~MSR_IA32_APICBASE_ENABLE;
 		wrmsr(MSR_IA32_APICBASE, l, h);
@@ -313,6 +675,28 @@ void disable_local_APIC(void)
 }
 
 /*
+ * If Linux enabled the LAPIC against the BIOS default disable it down before
+ * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
+ * not power-off. Additionally clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
+ */
+void lapic_shutdown(void)
+{
+	unsigned long flags;
+
+	if (!cpu_has_apic)
+		return;
+
+	local_irq_save(flags);
+	clear_local_APIC();
+
+	if (enabled_via_apicbase)
+		disable_local_APIC();
+
+	local_irq_restore(flags);
+}
+
+/*
  * This is to verify that we're looking at a real local APIC.
  * Check these against your board if the CPUs aren't getting
  * started for no apparent reason.
@@ -344,7 +728,7 @@ int __init verify_local_APIC(void)
 	reg1 = GET_APIC_VERSION(reg0);
 	if (reg1 == 0x00 || reg1 == 0xff)
 		return 0;
-	reg1 = get_maxlvt();
+	reg1 = lapic_get_maxlvt();
 	if (reg1 < 0x02 || reg1 == 0xff)
 		return 0;
 
@@ -367,10 +751,15 @@ int __init verify_local_APIC(void)
 	return 1;
 }
 
+/**
+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs
+ */
 void __init sync_Arb_IDs(void)
 {
-	/* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1
-	   And not needed on AMD */
+	/*
+	 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
+	 * needed on AMD.
+	 */
 	if (modern_apic())
 		return;
 	/*
@@ -383,14 +772,12 @@ void __init sync_Arb_IDs(void)
 				| APIC_DM_INIT);
 }
 
-extern void __error_in_apic_c (void);
-
 /*
  * An initial setup of the virtual wire mode.
  */
 void __init init_bsp_APIC(void)
 {
-	unsigned long value, ver;
+	unsigned long value;
 
 	/*
 	 * Don't do the setup now if we have a SMP BIOS as the
@@ -399,9 +786,6 @@ void __init init_bsp_APIC(void)
 	if (smp_found_config || !cpu_has_apic)
 		return;
 
-	value = apic_read(APIC_LVR);
-	ver = GET_APIC_VERSION(value);
-
 	/*
 	 * Do not trust the local APIC being empty at bootup.
 	 */
@@ -413,9 +797,10 @@ void __init init_bsp_APIC(void)
 	value = apic_read(APIC_SPIV);
 	value &= ~APIC_VECTOR_MASK;
 	value |= APIC_SPIV_APIC_ENABLED;
 
 	/* This bit is reserved on P4/Xeon and should be cleared */
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15))
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    (boot_cpu_data.x86 == 15))
 		value &= ~APIC_SPIV_FOCUS_DISABLED;
 	else
 		value |= APIC_SPIV_FOCUS_DISABLED;
@@ -427,14 +812,17 @@ void __init init_bsp_APIC(void)
 	 */
 	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
 	value = APIC_DM_NMI;
-	if (!APIC_INTEGRATED(ver))		/* 82489DX */
+	if (!lapic_is_integrated())		/* 82489DX */
 		value |= APIC_LVT_LEVEL_TRIGGER;
 	apic_write_around(APIC_LVT1, value);
 }
 
+/**
+ * setup_local_APIC - setup the local APIC
+ */
 void __devinit setup_local_APIC(void)
 {
-	unsigned long oldvalue, value, ver, maxlvt;
+	unsigned long oldvalue, value, maxlvt, integrated;
 	int i, j;
 
 	/* Pound the ESR really hard over the head with a big hammer - mbligh */
@@ -445,11 +833,7 @@ void __devinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 
-	value = apic_read(APIC_LVR);
-	ver = GET_APIC_VERSION(value);
-
-	if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
-		__error_in_apic_c();
+	integrated = lapic_is_integrated();
 
 	/*
 	 * Double-check whether this APIC is really registered.
@@ -520,13 +904,10 @@ void __devinit setup_local_APIC(void)
 	 * like LRU than MRU (the short-term load is more even across CPUs).
 	 * See also the comment in end_level_ioapic_irq(). --macro
 	 */
-#if 1
+
 	/* Enable focus processor (bit==0) */
 	value &= ~APIC_SPIV_FOCUS_DISABLED;
-#else
-	/* Disable focus processor (bit==1) */
-	value |= APIC_SPIV_FOCUS_DISABLED;
-#endif
+
 	/*
 	 * Set spurious IRQ vector
 	 */
@@ -562,17 +943,18 @@ void __devinit setup_local_APIC(void)
 		value = APIC_DM_NMI;
 	else
 		value = APIC_DM_NMI | APIC_LVT_MASKED;
-	if (!APIC_INTEGRATED(ver))		/* 82489DX */
+	if (!integrated)			/* 82489DX */
 		value |= APIC_LVT_LEVEL_TRIGGER;
 	apic_write_around(APIC_LVT1, value);
 
-	if (APIC_INTEGRATED(ver) && !esr_disable) {		/* !82489DX */
-		maxlvt = get_maxlvt();
+	if (integrated && !esr_disable) {		/* !82489DX */
+		maxlvt = lapic_get_maxlvt();
 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
 			apic_write(APIC_ESR, 0);
 		oldvalue = apic_read(APIC_ESR);
 
-		value = ERROR_APIC_VECTOR;      // enables sending errors
+		/* enables sending errors */
+		value = ERROR_APIC_VECTOR;
 		apic_write_around(APIC_LVTERR, value);
 		/*
 		 * spec says clear errors after enabling vector.
@@ -585,207 +967,30 @@ void __devinit setup_local_APIC(void)
585 "vector: 0x%08lx after: 0x%08lx\n", 967 "vector: 0x%08lx after: 0x%08lx\n",
586 oldvalue, value); 968 oldvalue, value);
587 } else { 969 } else {
588 if (esr_disable) 970 if (esr_disable)
589 /* 971 /*
590 * Something untraceble is creating bad interrupts on 972 * Something untraceble is creating bad interrupts on
591 * secondary quads ... for the moment, just leave the 973 * secondary quads ... for the moment, just leave the
592 * ESR disabled - we can't do anything useful with the 974 * ESR disabled - we can't do anything useful with the
593 * errors anyway - mbligh 975 * errors anyway - mbligh
594 */ 976 */
595 printk("Leaving ESR disabled.\n"); 977 printk(KERN_INFO "Leaving ESR disabled.\n");
596 else 978 else
597 printk("No ESR for 82489DX.\n"); 979 printk(KERN_INFO "No ESR for 82489DX.\n");
598 } 980 }
599 981
982 /* Disable the local apic timer */
983 value = apic_read(APIC_LVTT);
984 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
985 apic_write_around(APIC_LVTT, value);
986
600 setup_apic_nmi_watchdog(NULL); 987 setup_apic_nmi_watchdog(NULL);
601 apic_pm_activate(); 988 apic_pm_activate();
602} 989}
603 990
604/* 991/*
605 * If Linux enabled the LAPIC against the BIOS default 992 * Detect and initialize APIC
606 * disable it down before re-entering the BIOS on shutdown.
607 * Otherwise the BIOS may get confused and not power-off.
608 * Additionally clear all LVT entries before disable_local_APIC
609 * for the case where Linux didn't enable the LAPIC.
610 */ 993 */
611void lapic_shutdown(void)
612{
613 unsigned long flags;
614
615 if (!cpu_has_apic)
616 return;
617
618 local_irq_save(flags);
619 clear_local_APIC();
620
621 if (enabled_via_apicbase)
622 disable_local_APIC();
623
624 local_irq_restore(flags);
625}
626
627#ifdef CONFIG_PM
628
629static struct {
630 int active;
631 /* r/w apic fields */
632 unsigned int apic_id;
633 unsigned int apic_taskpri;
634 unsigned int apic_ldr;
635 unsigned int apic_dfr;
636 unsigned int apic_spiv;
637 unsigned int apic_lvtt;
638 unsigned int apic_lvtpc;
639 unsigned int apic_lvt0;
640 unsigned int apic_lvt1;
641 unsigned int apic_lvterr;
642 unsigned int apic_tmict;
643 unsigned int apic_tdcr;
644 unsigned int apic_thmr;
645} apic_pm_state;
646
647static int lapic_suspend(struct sys_device *dev, pm_message_t state)
648{
649 unsigned long flags;
650 int maxlvt;
651
652 if (!apic_pm_state.active)
653 return 0;
654
655 maxlvt = get_maxlvt();
656
657 apic_pm_state.apic_id = apic_read(APIC_ID);
658 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
659 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
660 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
661 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
662 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
663 if (maxlvt >= 4)
664 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
665 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
666 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
667 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
668 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
669 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
670#ifdef CONFIG_X86_MCE_P4THERMAL
671 if (maxlvt >= 5)
672 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
673#endif
674
675 local_irq_save(flags);
676 disable_local_APIC();
677 local_irq_restore(flags);
678 return 0;
679}
680
681static int lapic_resume(struct sys_device *dev)
682{
683 unsigned int l, h;
684 unsigned long flags;
685 int maxlvt;
686
687 if (!apic_pm_state.active)
688 return 0;
689
690 maxlvt = get_maxlvt();
691
692 local_irq_save(flags);
693
694 /*
695 * Make sure the APICBASE points to the right address
696 *
697 * FIXME! This will be wrong if we ever support suspend on
698 * SMP! We'll need to do this as part of the CPU restore!
699 */
700 rdmsr(MSR_IA32_APICBASE, l, h);
701 l &= ~MSR_IA32_APICBASE_BASE;
702 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
703 wrmsr(MSR_IA32_APICBASE, l, h);
704
705 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
706 apic_write(APIC_ID, apic_pm_state.apic_id);
707 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
708 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
709 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
710 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
711 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
712 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
713#ifdef CONFIG_X86_MCE_P4THERMAL
714 if (maxlvt >= 5)
715 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
716#endif
717 if (maxlvt >= 4)
718 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
719 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
720 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
721 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
722 apic_write(APIC_ESR, 0);
723 apic_read(APIC_ESR);
724 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
725 apic_write(APIC_ESR, 0);
726 apic_read(APIC_ESR);
727 local_irq_restore(flags);
728 return 0;
729}
730
731/*
732 * This device has no shutdown method - fully functioning local APICs
733 * are needed on every CPU up until machine_halt/restart/poweroff.
734 */
735
736static struct sysdev_class lapic_sysclass = {
737 set_kset_name("lapic"),
738 .resume = lapic_resume,
739 .suspend = lapic_suspend,
740};
741
742static struct sys_device device_lapic = {
743 .id = 0,
744 .cls = &lapic_sysclass,
745};
746
747static void __devinit apic_pm_activate(void)
748{
749 apic_pm_state.active = 1;
750}
751
752static int __init init_lapic_sysfs(void)
753{
754 int error;
755
756 if (!cpu_has_apic)
757 return 0;
758 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
759
760 error = sysdev_class_register(&lapic_sysclass);
761 if (!error)
762 error = sysdev_register(&device_lapic);
763 return error;
764}
765device_initcall(init_lapic_sysfs);
766
767#else /* CONFIG_PM */
768
769static void apic_pm_activate(void) { }
770
771#endif /* CONFIG_PM */
772
773/*
774 * Detect and enable local APICs on non-SMP boards.
775 * Original code written by Keir Fraser.
776 */
777
778static int __init apic_set_verbosity(char *str)
779{
780 if (strcmp("debug", str) == 0)
781 apic_verbosity = APIC_DEBUG;
782 else if (strcmp("verbose", str) == 0)
783 apic_verbosity = APIC_VERBOSE;
784 return 1;
785}
786
787__setup("apic=", apic_set_verbosity);
788
789static int __init detect_init_APIC (void) 994static int __init detect_init_APIC (void)
790{ 995{
791 u32 h, l, features; 996 u32 h, l, features;
@@ -797,7 +1002,7 @@ static int __init detect_init_APIC (void)
797 switch (boot_cpu_data.x86_vendor) { 1002 switch (boot_cpu_data.x86_vendor) {
798 case X86_VENDOR_AMD: 1003 case X86_VENDOR_AMD:
799 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || 1004 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
800 (boot_cpu_data.x86 == 15)) 1005 (boot_cpu_data.x86 == 15))
801 break; 1006 break;
802 goto no_apic; 1007 goto no_apic;
803 case X86_VENDOR_INTEL: 1008 case X86_VENDOR_INTEL:
@@ -811,23 +1016,23 @@ static int __init detect_init_APIC (void)
811 1016
812 if (!cpu_has_apic) { 1017 if (!cpu_has_apic) {
813 /* 1018 /*
814 * Over-ride BIOS and try to enable the local 1019 * Over-ride BIOS and try to enable the local APIC only if
815 * APIC only if "lapic" specified. 1020 * "lapic" specified.
816 */ 1021 */
817 if (enable_local_apic <= 0) { 1022 if (enable_local_apic <= 0) {
818 printk("Local APIC disabled by BIOS -- " 1023 printk(KERN_INFO "Local APIC disabled by BIOS -- "
819 "you can enable it with \"lapic\"\n"); 1024 "you can enable it with \"lapic\"\n");
820 return -1; 1025 return -1;
821 } 1026 }
822 /* 1027 /*
823 * Some BIOSes disable the local APIC in the 1028 * Some BIOSes disable the local APIC in the APIC_BASE
824 * APIC_BASE MSR. This can only be done in 1029 * MSR. This can only be done in software for Intel P6 or later
825 * software for Intel P6 or later and AMD K7 1030 * and AMD K7 (Model > 1) or later.
826 * (Model > 1) or later.
827 */ 1031 */
828 rdmsr(MSR_IA32_APICBASE, l, h); 1032 rdmsr(MSR_IA32_APICBASE, l, h);
829 if (!(l & MSR_IA32_APICBASE_ENABLE)) { 1033 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
830 printk("Local APIC disabled by BIOS -- reenabling.\n"); 1034 printk(KERN_INFO
1035 "Local APIC disabled by BIOS -- reenabling.\n");
831 l &= ~MSR_IA32_APICBASE_BASE; 1036 l &= ~MSR_IA32_APICBASE_BASE;
832 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; 1037 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
833 wrmsr(MSR_IA32_APICBASE, l, h); 1038 wrmsr(MSR_IA32_APICBASE, l, h);
@@ -840,7 +1045,7 @@ static int __init detect_init_APIC (void)
840 */ 1045 */
841 features = cpuid_edx(1); 1046 features = cpuid_edx(1);
842 if (!(features & (1 << X86_FEATURE_APIC))) { 1047 if (!(features & (1 << X86_FEATURE_APIC))) {
843 printk("Could not enable APIC!\n"); 1048 printk(KERN_WARNING "Could not enable APIC!\n");
844 return -1; 1049 return -1;
845 } 1050 }
846 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1051 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
@@ -854,17 +1059,20 @@ static int __init detect_init_APIC (void)
854 if (nmi_watchdog != NMI_NONE) 1059 if (nmi_watchdog != NMI_NONE)
855 nmi_watchdog = NMI_LOCAL_APIC; 1060 nmi_watchdog = NMI_LOCAL_APIC;
856 1061
857 printk("Found and enabled local APIC!\n"); 1062 printk(KERN_INFO "Found and enabled local APIC!\n");
858 1063
859 apic_pm_activate(); 1064 apic_pm_activate();
860 1065
861 return 0; 1066 return 0;
862 1067
863no_apic: 1068no_apic:
864 printk("No local APIC present or hardware disabled\n"); 1069 printk(KERN_INFO "No local APIC present or hardware disabled\n");
865 return -1; 1070 return -1;
866} 1071}
867 1072
1073/**
1074 * init_apic_mappings - initialize APIC mappings
1075 */
868void __init init_apic_mappings(void) 1076void __init init_apic_mappings(void)
869{ 1077{
870 unsigned long apic_phys; 1078 unsigned long apic_phys;
@@ -924,387 +1132,96 @@ fake_ioapic_page:
924} 1132}
925 1133
926/* 1134/*
927 * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts 1135 * This initializes the IO-APIC and APIC hardware if this is
928 * per second. We assume that the caller has already set up the local 1136 * a UP kernel.
929 * APIC.
930 *
931 * The APIC timer is not exactly sync with the external timer chip, it
932 * closely follows bus clocks.
933 */
934
935/*
936 * The timer chip is already set up at HZ interrupts per second here,
937 * but we do not accept timer interrupts yet. We only allow the BP
938 * to calibrate.
939 */
940static unsigned int __devinit get_8254_timer_count(void)
941{
942 unsigned long flags;
943
944 unsigned int count;
945
946 spin_lock_irqsave(&i8253_lock, flags);
947
948 outb_p(0x00, PIT_MODE);
949 count = inb_p(PIT_CH0);
950 count |= inb_p(PIT_CH0) << 8;
951
952 spin_unlock_irqrestore(&i8253_lock, flags);
953
954 return count;
955}
956
957/* next tick in 8254 can be caught by catching timer wraparound */
958static void __devinit wait_8254_wraparound(void)
959{
960 unsigned int curr_count, prev_count;
961
962 curr_count = get_8254_timer_count();
963 do {
964 prev_count = curr_count;
965 curr_count = get_8254_timer_count();
966
967 /* workaround for broken Mercury/Neptune */
968 if (prev_count >= curr_count + 0x100)
969 curr_count = get_8254_timer_count();
970
971 } while (prev_count >= curr_count);
972}
973
974/*
975 * Default initialization for 8254 timers. If we use other timers like HPET,
976 * we override this later
977 */
978void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound;
979
980/*
981 * This function sets up the local APIC timer, with a timeout of
982 * 'clocks' APIC bus clock. During calibration we actually call
983 * this function twice on the boot CPU, once with a bogus timeout
984 * value, second time for real. The other (noncalibrating) CPUs
985 * call this function only once, with the real, calibrated value.
986 *
987 * We do reads before writes even if unnecessary, to get around the
988 * P5 APIC double write bug.
989 */ 1137 */
990 1138int __init APIC_init_uniprocessor (void)
991#define APIC_DIVISOR 16
992
993static void __setup_APIC_LVTT(unsigned int clocks)
994{ 1139{
995 unsigned int lvtt_value, tmp_value, ver; 1140 if (enable_local_apic < 0)
996 int cpu = smp_processor_id(); 1141 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
997
998 ver = GET_APIC_VERSION(apic_read(APIC_LVR));
999 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
1000 if (!APIC_INTEGRATED(ver))
1001 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
1002
1003 if (cpu_isset(cpu, timer_bcast_ipi))
1004 lvtt_value |= APIC_LVT_MASKED;
1005 1142
1006 apic_write_around(APIC_LVTT, lvtt_value); 1143 if (!smp_found_config && !cpu_has_apic)
1144 return -1;
1007 1145
1008 /* 1146 /*
1009 * Divide PICLK by 16 1147 * Complain if the BIOS pretends there is one.
1010 */ 1148 */
1011 tmp_value = apic_read(APIC_TDCR); 1149 if (!cpu_has_apic &&
1012 apic_write_around(APIC_TDCR, (tmp_value 1150 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1013 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 1151 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1014 | APIC_TDR_DIV_16); 1152 boot_cpu_physical_apicid);
1015 1153 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1016 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 1154 return -1;
1017} 1155 }
1018 1156
1019static void __devinit setup_APIC_timer(unsigned int clocks) 1157 verify_local_APIC();
1020{
1021 unsigned long flags;
1022 1158
1023 local_irq_save(flags); 1159 connect_bsp_APIC();
1024 1160
1025 /* 1161 /*
1026 * Wait for IRQ0's slice: 1162 * Hack: In case of kdump, after a crash, kernel might be booting
1163 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1164 * might be zero if read from MP tables. Get it from LAPIC.
1027 */ 1165 */
1028 wait_timer_tick(); 1166#ifdef CONFIG_CRASH_DUMP
1167 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
1168#endif
1169 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
1029 1170
1030 __setup_APIC_LVTT(clocks); 1171 setup_local_APIC();
1031 1172
1032 local_irq_restore(flags); 1173#ifdef CONFIG_X86_IO_APIC
1174 if (smp_found_config)
1175 if (!skip_ioapic_setup && nr_ioapics)
1176 setup_IO_APIC();
1177#endif
1178 setup_boot_clock();
1179
1180 return 0;
1033} 1181}
1034 1182
1035/* 1183/*
1036 * In this function we calibrate APIC bus clocks to the external 1184 * APIC command line parameters
1037 * timer. Unfortunately we cannot use jiffies and the timer irq
1038 * to calibrate, since some later bootup code depends on getting
1039 * the first irq? Ugh.
1040 *
1041 * We want to do the calibration only once since we
1042 * want to have local timer irqs syncron. CPUs connected
1043 * by the same APIC bus have the very same bus frequency.
1044 * And we want to have irqs off anyways, no accidental
1045 * APIC irq that way.
1046 */ 1185 */
1047 1186static int __init parse_lapic(char *arg)
1048static int __init calibrate_APIC_clock(void)
1049{
1050 unsigned long long t1 = 0, t2 = 0;
1051 long tt1, tt2;
1052 long result;
1053 int i;
1054 const int LOOPS = HZ/10;
1055
1056 apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n");
1057
1058 /*
1059 * Put whatever arbitrary (but long enough) timeout
1060 * value into the APIC clock, we just want to get the
1061 * counter running for calibration.
1062 */
1063 __setup_APIC_LVTT(1000000000);
1064
1065 /*
1066 * The timer chip counts down to zero. Let's wait
1067 * for a wraparound to start exact measurement:
1068 * (the current tick might have been already half done)
1069 */
1070
1071 wait_timer_tick();
1072
1073 /*
1074 * We wrapped around just now. Let's start:
1075 */
1076 if (cpu_has_tsc)
1077 rdtscll(t1);
1078 tt1 = apic_read(APIC_TMCCT);
1079
1080 /*
1081 * Let's wait LOOPS wraprounds:
1082 */
1083 for (i = 0; i < LOOPS; i++)
1084 wait_timer_tick();
1085
1086 tt2 = apic_read(APIC_TMCCT);
1087 if (cpu_has_tsc)
1088 rdtscll(t2);
1089
1090 /*
1091 * The APIC bus clock counter is 32 bits only, it
1092 * might have overflown, but note that we use signed
1093 * longs, thus no extra care needed.
1094 *
1095 * underflown to be exact, as the timer counts down ;)
1096 */
1097
1098 result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
1099
1100 if (cpu_has_tsc)
1101 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
1102 "%ld.%04ld MHz.\n",
1103 ((long)(t2-t1)/LOOPS)/(1000000/HZ),
1104 ((long)(t2-t1)/LOOPS)%(1000000/HZ));
1105
1106 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
1107 "%ld.%04ld MHz.\n",
1108 result/(1000000/HZ),
1109 result%(1000000/HZ));
1110
1111 return result;
1112}
1113
1114static unsigned int calibration_result;
1115
1116void __init setup_boot_APIC_clock(void)
1117{
1118 unsigned long flags;
1119 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
1120 using_apic_timer = 1;
1121
1122 local_irq_save(flags);
1123
1124 calibration_result = calibrate_APIC_clock();
1125 /*
1126 * Now set up the timer for real.
1127 */
1128 setup_APIC_timer(calibration_result);
1129
1130 local_irq_restore(flags);
1131}
1132
1133void __devinit setup_secondary_APIC_clock(void)
1134{
1135 setup_APIC_timer(calibration_result);
1136}
1137
1138void disable_APIC_timer(void)
1139{
1140 if (using_apic_timer) {
1141 unsigned long v;
1142
1143 v = apic_read(APIC_LVTT);
1144 /*
1145 * When an illegal vector value (0-15) is written to an LVT
1146 * entry and delivery mode is Fixed, the APIC may signal an
1147 * illegal vector error, with out regard to whether the mask
1148 * bit is set or whether an interrupt is actually seen on input.
1149 *
1150 * Boot sequence might call this function when the LVTT has
1151 * '0' vector value. So make sure vector field is set to
1152 * valid value.
1153 */
1154 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1155 apic_write_around(APIC_LVTT, v);
1156 }
1157}
1158
1159void enable_APIC_timer(void)
1160{ 1187{
1161 int cpu = smp_processor_id(); 1188 enable_local_apic = 1;
1162 1189 return 0;
1163 if (using_apic_timer &&
1164 !cpu_isset(cpu, timer_bcast_ipi)) {
1165 unsigned long v;
1166
1167 v = apic_read(APIC_LVTT);
1168 apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
1169 }
1170} 1190}
1191early_param("lapic", parse_lapic);
1171 1192
1172void switch_APIC_timer_to_ipi(void *cpumask) 1193static int __init parse_nolapic(char *arg)
1173{ 1194{
1174 cpumask_t mask = *(cpumask_t *)cpumask; 1195 enable_local_apic = -1;
1175 int cpu = smp_processor_id(); 1196 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1176 1197 return 0;
1177 if (cpu_isset(cpu, mask) &&
1178 !cpu_isset(cpu, timer_bcast_ipi)) {
1179 disable_APIC_timer();
1180 cpu_set(cpu, timer_bcast_ipi);
1181 }
1182} 1198}
1183EXPORT_SYMBOL(switch_APIC_timer_to_ipi); 1199early_param("nolapic", parse_nolapic);
1184 1200
1185void switch_ipi_to_APIC_timer(void *cpumask) 1201static int __init apic_set_verbosity(char *str)
1186{ 1202{
1187 cpumask_t mask = *(cpumask_t *)cpumask; 1203 if (strcmp("debug", str) == 0)
1188 int cpu = smp_processor_id(); 1204 apic_verbosity = APIC_DEBUG;
1189 1205 else if (strcmp("verbose", str) == 0)
1190 if (cpu_isset(cpu, mask) && 1206 apic_verbosity = APIC_VERBOSE;
1191 cpu_isset(cpu, timer_bcast_ipi)) { 1207 return 1;
1192 cpu_clear(cpu, timer_bcast_ipi);
1193 enable_APIC_timer();
1194 }
1195} 1208}
1196EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
1197
1198#undef APIC_DIVISOR
1199 1209
1200/* 1210__setup("apic=", apic_set_verbosity);
1201 * Local timer interrupt handler. It does both profiling and
1202 * process statistics/rescheduling.
1203 *
1204 * We do profiling in every local tick, statistics/rescheduling
1205 * happen only every 'profiling multiplier' ticks. The default
1206 * multiplier is 1 and it can be changed by writing the new multiplier
1207 * value into /proc/profile.
1208 */
1209
1210inline void smp_local_timer_interrupt(void)
1211{
1212 profile_tick(CPU_PROFILING);
1213#ifdef CONFIG_SMP
1214 update_process_times(user_mode_vm(get_irq_regs()));
1215#endif
1216 1211
1217 /*
1218 * We take the 'long' return path, and there every subsystem
 1219 * grabs the appropriate locks (kernel lock / irq lock).
1220 *
1221 * we might want to decouple profiling from the 'long path',
1222 * and do the profiling totally in assembly.
1223 *
1224 * Currently this isn't too much of an issue (performance wise),
1225 * we can take more than 100K local irqs per second on a 100 MHz P5.
1226 */
1227}
1228 1212
1229/* 1213/*
1230 * Local APIC timer interrupt. This is the most natural way for doing 1214 * Local APIC interrupts
1231 * local interrupts, but local timer interrupts can be emulated by
1232 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
1233 *
1234 * [ if a single-CPU system runs an SMP kernel then we call the local
1235 * interrupt as well. Thus we cannot inline the local irq ... ]
1236 */ 1215 */
1237 1216
1238fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
1239{
1240 struct pt_regs *old_regs = set_irq_regs(regs);
1241 int cpu = smp_processor_id();
1242
1243 /*
1244 * the NMI deadlock-detector uses this.
1245 */
1246 per_cpu(irq_stat, cpu).apic_timer_irqs++;
1247
1248 /*
1249 * NOTE! We'd better ACK the irq immediately,
1250 * because timer handling can be slow.
1251 */
1252 ack_APIC_irq();
1253 /*
1254 * update_process_times() expects us to have done irq_enter().
 1255 * Besides, if we don't, timer interrupts ignore the global
 1256 * interrupt lock, which is the WrongThing (tm) to do.
1257 */
1258 irq_enter();
1259 smp_local_timer_interrupt();
1260 irq_exit();
1261 set_irq_regs(old_regs);
1262}
1263
1264#ifndef CONFIG_SMP
1265static void up_apic_timer_interrupt_call(void)
1266{
1267 int cpu = smp_processor_id();
1268
1269 /*
1270 * the NMI deadlock-detector uses this.
1271 */
1272 per_cpu(irq_stat, cpu).apic_timer_irqs++;
1273
1274 smp_local_timer_interrupt();
1275}
1276#endif
1277
1278void smp_send_timer_broadcast_ipi(void)
1279{
1280 cpumask_t mask;
1281
1282 cpus_and(mask, cpu_online_map, timer_bcast_ipi);
1283 if (!cpus_empty(mask)) {
1284#ifdef CONFIG_SMP
1285 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
1286#else
1287 /*
1288 * We can directly call the apic timer interrupt handler
 1289 * in the UP case, minus all the irq-related functions
1290 */
1291 up_apic_timer_interrupt_call();
1292#endif
1293 }
1294}
1295
1296int setup_profiling_timer(unsigned int multiplier)
1297{
1298 return -EINVAL;
1299}
1300
1301/* 1217/*
1302 * This interrupt should _never_ happen with our APIC/SMP architecture 1218 * This interrupt should _never_ happen with our APIC/SMP architecture
1303 */ 1219 */
1304fastcall void smp_spurious_interrupt(struct pt_regs *regs) 1220void smp_spurious_interrupt(struct pt_regs *regs)
1305{ 1221{
1306 unsigned long v; 1222 unsigned long v;
1307 1223
1224 exit_idle();
1308 irq_enter(); 1225 irq_enter();
1309 /* 1226 /*
1310 * Check if this really is a spurious interrupt and ACK it 1227 * Check if this really is a spurious interrupt and ACK it
@@ -1316,19 +1233,19 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs)
1316 ack_APIC_irq(); 1233 ack_APIC_irq();
1317 1234
1318 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 1235 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1319 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", 1236 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
1320 smp_processor_id()); 1237 "should never happen.\n", smp_processor_id());
1321 irq_exit(); 1238 irq_exit();
1322} 1239}
1323 1240
1324/* 1241/*
1325 * This interrupt should never happen with our APIC/SMP architecture 1242 * This interrupt should never happen with our APIC/SMP architecture
1326 */ 1243 */
1327 1244void smp_error_interrupt(struct pt_regs *regs)
1328fastcall void smp_error_interrupt(struct pt_regs *regs)
1329{ 1245{
1330 unsigned long v, v1; 1246 unsigned long v, v1;
1331 1247
1248 exit_idle();
1332 irq_enter(); 1249 irq_enter();
1333 /* First tickle the hardware, only then report what went on. -- REW */ 1250 /* First tickle the hardware, only then report what went on. -- REW */
1334 v = apic_read(APIC_ESR); 1251 v = apic_read(APIC_ESR);
@@ -1348,69 +1265,261 @@ fastcall void smp_error_interrupt(struct pt_regs *regs)
1348 7: Illegal register address 1265 7: Illegal register address
1349 */ 1266 */
1350 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", 1267 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
1351 smp_processor_id(), v , v1); 1268 smp_processor_id(), v , v1);
1352 irq_exit(); 1269 irq_exit();
1353} 1270}
1354 1271
1355/* 1272/*
1356 * This initializes the IO-APIC and APIC hardware if this is 1273 * Initialize APIC interrupts
1357 * a UP kernel.
1358 */ 1274 */
1359int __init APIC_init_uniprocessor (void) 1275void __init apic_intr_init(void)
1360{ 1276{
1361 if (enable_local_apic < 0) 1277#ifdef CONFIG_SMP
1362 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1278 smp_intr_init();
1279#endif
1280 /* self generated IPI for local APIC timer */
1281 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1363 1282
1364 if (!smp_found_config && !cpu_has_apic) 1283 /* IPI vectors for APIC spurious and error interrupts */
1365 return -1; 1284 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1285 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1366 1286
1367 /* 1287 /* thermal monitor LVT interrupt */
1368 * Complain if the BIOS pretends there is one. 1288#ifdef CONFIG_X86_MCE_P4THERMAL
1369 */ 1289 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1370 if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1290#endif
1371 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1291}
1372 boot_cpu_physical_apicid); 1292
1373 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1293/**
1374 return -1; 1294 * connect_bsp_APIC - attach the APIC to the interrupt system
1295 */
1296void __init connect_bsp_APIC(void)
1297{
1298 if (pic_mode) {
1299 /*
1300 * Do not trust the local APIC being empty at bootup.
1301 */
1302 clear_local_APIC();
1303 /*
1304 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1305 * local APIC to INT and NMI lines.
1306 */
1307 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1308 "enabling APIC mode.\n");
1309 outb(0x70, 0x22);
1310 outb(0x01, 0x23);
1375 } 1311 }
1312 enable_apic_mode();
1313}
1376 1314
1377 verify_local_APIC(); 1315/**
1316 * disconnect_bsp_APIC - detach the APIC from the interrupt system
 1317 * @virt_wire_setup: indicates whether virtual wire mode is selected
1318 *
1319 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1320 * APIC is disabled.
1321 */
1322void disconnect_bsp_APIC(int virt_wire_setup)
1323{
1324 if (pic_mode) {
1325 /*
1326 * Put the board back into PIC mode (has an effect only on
1327 * certain older boards). Note that APIC interrupts, including
1328 * IPIs, won't work beyond this point! The only exception are
1329 * INIT IPIs.
1330 */
1331 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1332 "entering PIC mode.\n");
1333 outb(0x70, 0x22);
1334 outb(0x00, 0x23);
1335 } else {
1336 /* Go back to Virtual Wire compatibility mode */
1337 unsigned long value;
1378 1338
1379 connect_bsp_APIC(); 1339 /* For the spurious interrupt use vector F, and enable it */
1340 value = apic_read(APIC_SPIV);
1341 value &= ~APIC_VECTOR_MASK;
1342 value |= APIC_SPIV_APIC_ENABLED;
1343 value |= 0xf;
1344 apic_write_around(APIC_SPIV, value);
1380 1345
1381 /* 1346 if (!virt_wire_setup) {
1382 * Hack: In case of kdump, after a crash, kernel might be booting 1347 /*
1383 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid 1348 * For LVT0 make it edge triggered, active high,
1384 * might be zero if read from MP tables. Get it from LAPIC. 1349 * external and enabled
1385 */ 1350 */
1386#ifdef CONFIG_CRASH_DUMP 1351 value = apic_read(APIC_LVT0);
1387 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); 1352 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1388#endif 1353 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1389 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 1354 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
1355 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1356 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1357 apic_write_around(APIC_LVT0, value);
1358 } else {
1359 /* Disable LVT0 */
1360 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
1361 }
1390 1362
1391 setup_local_APIC(); 1363 /*
1364 * For LVT1 make it edge triggered, active high, nmi and
1365 * enabled
1366 */
1367 value = apic_read(APIC_LVT1);
1368 value &= ~(
1369 APIC_MODE_MASK | APIC_SEND_PENDING |
1370 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1371 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1372 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1373 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1374 apic_write_around(APIC_LVT1, value);
1375 }
1376}
1392 1377
1393#ifdef CONFIG_X86_IO_APIC 1378/*
1394 if (smp_found_config) 1379 * Power management
1395 if (!skip_ioapic_setup && nr_ioapics) 1380 */
1396 setup_IO_APIC(); 1381#ifdef CONFIG_PM
1382
1383static struct {
1384 int active;
1385 /* r/w apic fields */
1386 unsigned int apic_id;
1387 unsigned int apic_taskpri;
1388 unsigned int apic_ldr;
1389 unsigned int apic_dfr;
1390 unsigned int apic_spiv;
1391 unsigned int apic_lvtt;
1392 unsigned int apic_lvtpc;
1393 unsigned int apic_lvt0;
1394 unsigned int apic_lvt1;
1395 unsigned int apic_lvterr;
1396 unsigned int apic_tmict;
1397 unsigned int apic_tdcr;
1398 unsigned int apic_thmr;
1399} apic_pm_state;
1400
1401static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1402{
1403 unsigned long flags;
1404 int maxlvt;
1405
1406 if (!apic_pm_state.active)
1407 return 0;
1408
1409 maxlvt = lapic_get_maxlvt();
1410
1411 apic_pm_state.apic_id = apic_read(APIC_ID);
1412 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1413 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1414 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
1415 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
1416 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
1417 if (maxlvt >= 4)
1418 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
1419 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
1420 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
1421 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1422 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1423 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1424#ifdef CONFIG_X86_MCE_P4THERMAL
1425 if (maxlvt >= 5)
1426 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1397#endif 1427#endif
1398 setup_boot_APIC_clock();
1399 1428
1429 local_irq_save(flags);
1430 disable_local_APIC();
1431 local_irq_restore(flags);
1400 return 0; 1432 return 0;
1401} 1433}
1402 1434
1403static int __init parse_lapic(char *arg) 1435static int lapic_resume(struct sys_device *dev)
1404{ 1436{
1405 lapic_enable(); 1437 unsigned int l, h;
1438 unsigned long flags;
1439 int maxlvt;
1440
1441 if (!apic_pm_state.active)
1442 return 0;
1443
1444 maxlvt = lapic_get_maxlvt();
1445
1446 local_irq_save(flags);
1447
1448 /*
1449 * Make sure the APICBASE points to the right address
1450 *
1451 * FIXME! This will be wrong if we ever support suspend on
1452 * SMP! We'll need to do this as part of the CPU restore!
1453 */
1454 rdmsr(MSR_IA32_APICBASE, l, h);
1455 l &= ~MSR_IA32_APICBASE_BASE;
1456 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1457 wrmsr(MSR_IA32_APICBASE, l, h);
1458
1459 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1460 apic_write(APIC_ID, apic_pm_state.apic_id);
1461 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
1462 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
1463 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
1464 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1465 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1466 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1467#ifdef CONFIG_X86_MCE_P4THERMAL
1468 if (maxlvt >= 5)
1469 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1470#endif
1471 if (maxlvt >= 4)
1472 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
1473 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
1474 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
1475 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
1476 apic_write(APIC_ESR, 0);
1477 apic_read(APIC_ESR);
1478 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1479 apic_write(APIC_ESR, 0);
1480 apic_read(APIC_ESR);
1481 local_irq_restore(flags);
1406 return 0; 1482 return 0;
1407} 1483}
1408early_param("lapic", parse_lapic);
1409 1484
1410static int __init parse_nolapic(char *arg) 1485/*
1486 * This device has no shutdown method - fully functioning local APICs
1487 * are needed on every CPU up until machine_halt/restart/poweroff.
1488 */
1489
1490static struct sysdev_class lapic_sysclass = {
1491 set_kset_name("lapic"),
1492 .resume = lapic_resume,
1493 .suspend = lapic_suspend,
1494};
1495
1496static struct sys_device device_lapic = {
1497 .id = 0,
1498 .cls = &lapic_sysclass,
1499};
1500
1501static void __devinit apic_pm_activate(void)
1411{ 1502{
1412 lapic_disable(); 1503 apic_pm_state.active = 1;
1413 return 0;
1414} 1504}
1415early_param("nolapic", parse_nolapic);
1416 1505
1506static int __init init_lapic_sysfs(void)
1507{
1508 int error;
1509
1510 if (!cpu_has_apic)
1511 return 0;
1512 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1513
1514 error = sysdev_class_register(&lapic_sysclass);
1515 if (!error)
1516 error = sysdev_register(&device_lapic);
1517 return error;
1518}
1519device_initcall(init_lapic_sysfs);
1520
1521#else /* CONFIG_PM */
1522
1523static void apic_pm_activate(void) { }
1524
1525#endif /* CONFIG_PM */
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index db99a8948dae..064bbf2861f4 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -211,6 +211,7 @@
211#include <linux/slab.h> 211#include <linux/slab.h>
212#include <linux/stat.h> 212#include <linux/stat.h>
213#include <linux/proc_fs.h> 213#include <linux/proc_fs.h>
214#include <linux/seq_file.h>
214#include <linux/miscdevice.h> 215#include <linux/miscdevice.h>
215#include <linux/apm_bios.h> 216#include <linux/apm_bios.h>
216#include <linux/init.h> 217#include <linux/init.h>
@@ -235,7 +236,6 @@
235 236
236#include "io_ports.h" 237#include "io_ports.h"
237 238
238extern unsigned long get_cmos_time(void);
239extern void machine_real_restart(unsigned char *, int); 239extern void machine_real_restart(unsigned char *, int);
240 240
241#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) 241#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
@@ -1175,28 +1175,6 @@ out:
1175 spin_unlock(&user_list_lock); 1175 spin_unlock(&user_list_lock);
1176} 1176}
1177 1177
1178static void set_time(void)
1179{
1180 struct timespec ts;
1181 if (got_clock_diff) { /* Must know time zone in order to set clock */
1182 ts.tv_sec = get_cmos_time() + clock_cmos_diff;
1183 ts.tv_nsec = 0;
1184 do_settimeofday(&ts);
1185 }
1186}
1187
1188static void get_time_diff(void)
1189{
1190#ifndef CONFIG_APM_RTC_IS_GMT
1191 /*
1192 * Estimate time zone so that set_time can update the clock
1193 */
1194 clock_cmos_diff = -get_cmos_time();
1195 clock_cmos_diff += get_seconds();
1196 got_clock_diff = 1;
1197#endif
1198}
1199
1200static void reinit_timer(void) 1178static void reinit_timer(void)
1201{ 1179{
1202#ifdef INIT_TIMER_AFTER_SUSPEND 1180#ifdef INIT_TIMER_AFTER_SUSPEND
@@ -1236,19 +1214,6 @@ static int suspend(int vetoable)
1236 local_irq_disable(); 1214 local_irq_disable();
1237 device_power_down(PMSG_SUSPEND); 1215 device_power_down(PMSG_SUSPEND);
1238 1216
1239 /* serialize with the timer interrupt */
1240 write_seqlock(&xtime_lock);
1241
1242 /* protect against access to timer chip registers */
1243 spin_lock(&i8253_lock);
1244
1245 get_time_diff();
1246 /*
1247 * Irq spinlock must be dropped around set_system_power_state.
1248 * We'll undo any timer changes due to interrupts below.
1249 */
1250 spin_unlock(&i8253_lock);
1251 write_sequnlock(&xtime_lock);
1252 local_irq_enable(); 1217 local_irq_enable();
1253 1218
1254 save_processor_state(); 1219 save_processor_state();
@@ -1257,7 +1222,6 @@ static int suspend(int vetoable)
1257 restore_processor_state(); 1222 restore_processor_state();
1258 1223
1259 local_irq_disable(); 1224 local_irq_disable();
1260 set_time();
1261 reinit_timer(); 1225 reinit_timer();
1262 1226
1263 if (err == APM_NO_ERROR) 1227 if (err == APM_NO_ERROR)
@@ -1287,11 +1251,6 @@ static void standby(void)
1287 1251
1288 local_irq_disable(); 1252 local_irq_disable();
1289 device_power_down(PMSG_SUSPEND); 1253 device_power_down(PMSG_SUSPEND);
1290 /* serialize with the timer interrupt */
1291 write_seqlock(&xtime_lock);
1292 /* If needed, notify drivers here */
1293 get_time_diff();
1294 write_sequnlock(&xtime_lock);
1295 local_irq_enable(); 1254 local_irq_enable();
1296 1255
1297 err = set_system_power_state(APM_STATE_STANDBY); 1256 err = set_system_power_state(APM_STATE_STANDBY);
@@ -1385,7 +1344,6 @@ static void check_events(void)
1385 ignore_bounce = 1; 1344 ignore_bounce = 1;
1386 if ((event != APM_NORMAL_RESUME) 1345 if ((event != APM_NORMAL_RESUME)
1387 || (ignore_normal_resume == 0)) { 1346 || (ignore_normal_resume == 0)) {
1388 set_time();
1389 device_resume(); 1347 device_resume();
1390 pm_send_all(PM_RESUME, (void *)0); 1348 pm_send_all(PM_RESUME, (void *)0);
1391 queue_event(event, NULL); 1349 queue_event(event, NULL);
@@ -1401,7 +1359,6 @@ static void check_events(void)
1401 break; 1359 break;
1402 1360
1403 case APM_UPDATE_TIME: 1361 case APM_UPDATE_TIME:
1404 set_time();
1405 break; 1362 break;
1406 1363
1407 case APM_CRITICAL_SUSPEND: 1364 case APM_CRITICAL_SUSPEND:
@@ -1636,9 +1593,8 @@ static int do_open(struct inode * inode, struct file * filp)
1636 return 0; 1593 return 0;
1637} 1594}
1638 1595
1639static int apm_get_info(char *buf, char **start, off_t fpos, int length) 1596static int proc_apm_show(struct seq_file *m, void *v)
1640{ 1597{
1641 char * p;
1642 unsigned short bx; 1598 unsigned short bx;
1643 unsigned short cx; 1599 unsigned short cx;
1644 unsigned short dx; 1600 unsigned short dx;
@@ -1650,8 +1606,6 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
1650 int time_units = -1; 1606 int time_units = -1;
1651 char *units = "?"; 1607 char *units = "?";
1652 1608
1653 p = buf;
1654
1655 if ((num_online_cpus() == 1) && 1609 if ((num_online_cpus() == 1) &&
1656 !(error = apm_get_power_status(&bx, &cx, &dx))) { 1610 !(error = apm_get_power_status(&bx, &cx, &dx))) {
1657 ac_line_status = (bx >> 8) & 0xff; 1611 ac_line_status = (bx >> 8) & 0xff;
@@ -1705,7 +1659,7 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
1705 -1: Unknown 1659 -1: Unknown
1706 8) min = minutes; sec = seconds */ 1660 8) min = minutes; sec = seconds */
1707 1661
1708 p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", 1662 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
1709 driver_version, 1663 driver_version,
1710 (apm_info.bios.version >> 8) & 0xff, 1664 (apm_info.bios.version >> 8) & 0xff,
1711 apm_info.bios.version & 0xff, 1665 apm_info.bios.version & 0xff,
@@ -1716,10 +1670,22 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
1716 percentage, 1670 percentage,
1717 time_units, 1671 time_units,
1718 units); 1672 units);
1673 return 0;
1674}
1719 1675
1720 return p - buf; 1676static int proc_apm_open(struct inode *inode, struct file *file)
1677{
1678 return single_open(file, proc_apm_show, NULL);
1721} 1679}
1722 1680
1681static const struct file_operations apm_file_ops = {
1682 .owner = THIS_MODULE,
1683 .open = proc_apm_open,
1684 .read = seq_read,
1685 .llseek = seq_lseek,
1686 .release = single_release,
1687};
1688
1723static int apm(void *unused) 1689static int apm(void *unused)
1724{ 1690{
1725 unsigned short bx; 1691 unsigned short bx;
@@ -2341,9 +2307,9 @@ static int __init apm_init(void)
2341 set_base(gdt[APM_DS >> 3], 2307 set_base(gdt[APM_DS >> 3],
2342 __va((unsigned long)apm_info.bios.dseg << 4)); 2308 __va((unsigned long)apm_info.bios.dseg << 4));
2343 2309
2344 apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info); 2310 apm_proc = create_proc_entry("apm", 0, NULL);
2345 if (apm_proc) 2311 if (apm_proc)
2346 apm_proc->owner = THIS_MODULE; 2312 apm_proc->proc_fops = &apm_file_ops;
2347 2313
2348 kapmd_task = kthread_create(apm, NULL, "kapmd"); 2314 kapmd_task = kthread_create(apm, NULL, "kapmd");
2349 if (IS_ERR(kapmd_task)) { 2315 if (IS_ERR(kapmd_task)) {
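The apm.c hunks above replace the old create_proc_info_entry() callback with the seq_file single_open() pattern. A minimal, self-contained sketch of that pattern for a 2.6-era module follows; the /proc entry name and identifiers are hypothetical and not part of this patch.

/* Minimal sketch of the seq_file pattern used above.  "example" is an
 * invented /proc entry, not something this patch creates. */
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	/* one seq_printf() per line of output; no buffer management needed */
	seq_printf(m, "hello from /proc/example\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
	.owner   = THIS_MODULE,
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init example_init(void)
{
	struct proc_dir_entry *e = create_proc_entry("example", 0, NULL);
	if (!e)
		return -ENOMEM;
	e->proc_fops = &example_fops;
	return 0;
}

static void __exit example_exit(void)
{
	remove_proc_entry("example", NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");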
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 1b2f3cd33270..c37535163bfc 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -72,7 +72,7 @@ void foo(void)
72 OFFSET(PT_EAX, pt_regs, eax); 72 OFFSET(PT_EAX, pt_regs, eax);
73 OFFSET(PT_DS, pt_regs, xds); 73 OFFSET(PT_DS, pt_regs, xds);
74 OFFSET(PT_ES, pt_regs, xes); 74 OFFSET(PT_ES, pt_regs, xes);
75 OFFSET(PT_GS, pt_regs, xgs); 75 OFFSET(PT_FS, pt_regs, xfs);
76 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); 76 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
77 OFFSET(PT_EIP, pt_regs, eip); 77 OFFSET(PT_EIP, pt_regs, eip);
78 OFFSET(PT_CS, pt_regs, xcs); 78 OFFSET(PT_CS, pt_regs, xcs);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 8a8bbdaaf38a..dcbbd0a8bfc2 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -605,7 +605,7 @@ void __init early_cpu_init(void)
605struct pt_regs * __devinit idle_regs(struct pt_regs *regs) 605struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
606{ 606{
607 memset(regs, 0, sizeof(struct pt_regs)); 607 memset(regs, 0, sizeof(struct pt_regs));
608 regs->xgs = __KERNEL_PDA; 608 regs->xfs = __KERNEL_PDA;
609 return regs; 609 return regs;
610} 610}
611 611
@@ -662,12 +662,12 @@ struct i386_pda boot_pda = {
662 .pcurrent = &init_task, 662 .pcurrent = &init_task,
663}; 663};
664 664
665static inline void set_kernel_gs(void) 665static inline void set_kernel_fs(void)
666{ 666{
667 /* Set %gs for this CPU's PDA. Memory clobber is to create a 667 /* Set %fs for this CPU's PDA. Memory clobber is to create a
668 barrier with respect to any PDA operations, so the compiler 668 barrier with respect to any PDA operations, so the compiler
669 doesn't move any before here. */ 669 doesn't move any before here. */
670 asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); 670 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
671} 671}
672 672
673/* Initialize the CPU's GDT and PDA. The boot CPU does this for 673/* Initialize the CPU's GDT and PDA. The boot CPU does this for
@@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu)
718 the boot CPU, this will transition from the boot gdt+pda to 718 the boot CPU, this will transition from the boot gdt+pda to
719 the real ones). */ 719 the real ones). */
720 load_gdt(cpu_gdt_descr); 720 load_gdt(cpu_gdt_descr);
721 set_kernel_gs(); 721 set_kernel_fs();
722} 722}
723 723
724/* Common CPU init for both boot and secondary CPUs */ 724/* Common CPU init for both boot and secondary CPUs */
@@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
764 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 764 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
765#endif 765#endif
766 766
767 /* Clear %fs. */ 767 /* Clear %gs. */
768 asm volatile ("mov %0, %%fs" : : "r" (0)); 768 asm volatile ("mov %0, %%gs" : : "r" (0));
769 769
770 /* Clear all 6 debug registers: */ 770 /* Clear all 6 debug registers: */
771 set_debugreg(0, 0); 771 set_debugreg(0, 0);
diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig
index 5299c5bf4454..6c52182ca323 100644
--- a/arch/i386/kernel/cpu/cpufreq/Kconfig
+++ b/arch/i386/kernel/cpu/cpufreq/Kconfig
@@ -217,6 +217,15 @@ config X86_LONGHAUL
217 217
218 If in doubt, say N. 218 If in doubt, say N.
219 219
220config X86_E_POWERSAVER
221 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)"
222 select CPU_FREQ_TABLE
223 depends on EXPERIMENTAL
224 help
225 This adds the CPUFreq driver for VIA C7 processors.
226
227 If in doubt, say N.
228
220comment "shared options" 229comment "shared options"
221 230
222config X86_ACPI_CPUFREQ_PROC_INTF 231config X86_ACPI_CPUFREQ_PROC_INTF
diff --git a/arch/i386/kernel/cpu/cpufreq/Makefile b/arch/i386/kernel/cpu/cpufreq/Makefile
index 8de3abe322a9..560f7760dae5 100644
--- a/arch/i386/kernel/cpu/cpufreq/Makefile
+++ b/arch/i386/kernel/cpu/cpufreq/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
2obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 2obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
3obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 3obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
4obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 4obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
5obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
5obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o 6obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
6obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o 7obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
7obj-$(CONFIG_X86_LONGRUN) += longrun.o 8obj-$(CONFIG_X86_LONGRUN) += longrun.o
diff --git a/arch/i386/kernel/cpu/cpufreq/e_powersaver.c b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c
new file mode 100644
index 000000000000..f43d98e11cc7
--- /dev/null
+++ b/arch/i386/kernel/cpu/cpufreq/e_powersaver.c
@@ -0,0 +1,334 @@
1/*
2 * Based on documentation provided by Dave Jones. Thanks!
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/ioport.h>
14#include <linux/slab.h>
15
16#include <asm/msr.h>
17#include <asm/tsc.h>
18#include <asm/timex.h>
19#include <asm/io.h>
20#include <asm/delay.h>
21
22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1
24#define EPS_BRAND_EDEN 2
25#define EPS_BRAND_C3 3
26
27struct eps_cpu_data {
28 u32 fsb;
29 struct cpufreq_frequency_table freq_table[];
30};
31
32static struct eps_cpu_data *eps_cpu[NR_CPUS];
33
34
35static unsigned int eps_get(unsigned int cpu)
36{
37 struct eps_cpu_data *centaur;
38 u32 lo, hi;
39
40 if (cpu)
41 return 0;
42 centaur = eps_cpu[cpu];
43 if (centaur == NULL)
44 return 0;
45
46 /* Return current frequency */
47 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
48 return centaur->fsb * ((lo >> 8) & 0xff);
49}
50
51static int eps_set_state(struct eps_cpu_data *centaur,
52 unsigned int cpu,
53 u32 dest_state)
54{
55 struct cpufreq_freqs freqs;
56 u32 lo, hi;
57 int err = 0;
58 int i;
59
60 freqs.old = eps_get(cpu);
61 freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
62 freqs.cpu = cpu;
63 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
64
65 /* Wait while CPU is busy */
66 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
67 i = 0;
68 while (lo & ((1 << 16) | (1 << 17))) {
69 udelay(16);
70 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
71 i++;
72 if (unlikely(i > 64)) {
73 err = -ENODEV;
74 goto postchange;
75 }
76 }
77 /* Set new multiplier and voltage */
78 wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
79 /* Wait until transition end */
80 i = 0;
81 do {
82 udelay(16);
83 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
84 i++;
85 if (unlikely(i > 64)) {
86 err = -ENODEV;
87 goto postchange;
88 }
89 } while (lo & ((1 << 16) | (1 << 17)));
90
91 /* Return current frequency */
92postchange:
93 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
94 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
95
96 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
97 return err;
98}
99
100static int eps_target(struct cpufreq_policy *policy,
101 unsigned int target_freq,
102 unsigned int relation)
103{
104 struct eps_cpu_data *centaur;
105 unsigned int newstate = 0;
106 unsigned int cpu = policy->cpu;
107 unsigned int dest_state;
108 int ret;
109
110 if (unlikely(eps_cpu[cpu] == NULL))
111 return -ENODEV;
112 centaur = eps_cpu[cpu];
113
114 if (unlikely(cpufreq_frequency_table_target(policy,
115 &eps_cpu[cpu]->freq_table[0],
116 target_freq,
117 relation,
118 &newstate))) {
119 return -EINVAL;
120 }
121
122 /* Make frequency transition */
123 dest_state = centaur->freq_table[newstate].index & 0xffff;
124 ret = eps_set_state(centaur, cpu, dest_state);
125 if (ret)
126 printk(KERN_ERR "eps: Timeout!\n");
127 return ret;
128}
129
130static int eps_verify(struct cpufreq_policy *policy)
131{
132 return cpufreq_frequency_table_verify(policy,
133 &eps_cpu[policy->cpu]->freq_table[0]);
134}
135
136static int eps_cpu_init(struct cpufreq_policy *policy)
137{
138 unsigned int i;
139 u32 lo, hi;
140 u64 val;
141 u8 current_multiplier, current_voltage;
142 u8 max_multiplier, max_voltage;
143 u8 min_multiplier, min_voltage;
144 u8 brand;
145 u32 fsb;
146 struct eps_cpu_data *centaur;
147 struct cpufreq_frequency_table *f_table;
148 int k, step, voltage;
149 int ret;
150 int states;
151
152 if (policy->cpu != 0)
153 return -ENODEV;
154
155 /* Check brand */
156 printk("eps: Detected VIA ");
157 rdmsr(0x1153, lo, hi);
158 brand = (((lo >> 2) ^ lo) >> 18) & 3;
159 switch(brand) {
160 case EPS_BRAND_C7M:
161 printk("C7-M\n");
162 break;
163 case EPS_BRAND_C7:
164 printk("C7\n");
165 break;
166 case EPS_BRAND_EDEN:
167 printk("Eden\n");
168 break;
169 case EPS_BRAND_C3:
170 printk("C3\n");
171 return -ENODEV;
172 break;
173 }
174 /* Enable Enhanced PowerSaver */
175 rdmsrl(MSR_IA32_MISC_ENABLE, val);
176 if (!(val & 1 << 16)) {
177 val |= 1 << 16;
178 wrmsrl(MSR_IA32_MISC_ENABLE, val);
179 /* Can be locked at 0 */
180 rdmsrl(MSR_IA32_MISC_ENABLE, val);
181 if (!(val & 1 << 16)) {
182 printk("eps: Can't enable Enhanced PowerSaver\n");
183 return -ENODEV;
184 }
185 }
186
187 /* Print voltage and multiplier */
188 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
189 current_voltage = lo & 0xff;
190 printk("eps: Current voltage = %dmV\n", current_voltage * 16 + 700);
191 current_multiplier = (lo >> 8) & 0xff;
192 printk("eps: Current multiplier = %d\n", current_multiplier);
193
194 /* Print limits */
195 max_voltage = hi & 0xff;
196 printk("eps: Highest voltage = %dmV\n", max_voltage * 16 + 700);
197 max_multiplier = (hi >> 8) & 0xff;
198 printk("eps: Highest multiplier = %d\n", max_multiplier);
199 min_voltage = (hi >> 16) & 0xff;
200 printk("eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700);
201 min_multiplier = (hi >> 24) & 0xff;
202 printk("eps: Lowest multiplier = %d\n", min_multiplier);
203
204 /* Sanity checks */
205 if (current_multiplier == 0 || max_multiplier == 0
206 || min_multiplier == 0)
207 return -EINVAL;
208 if (current_multiplier > max_multiplier
209 || max_multiplier <= min_multiplier)
210 return -EINVAL;
211 if (current_voltage > 0x1c || max_voltage > 0x1c)
212 return -EINVAL;
213 if (max_voltage < min_voltage)
214 return -EINVAL;
215
216 /* Calc FSB speed */
217 fsb = cpu_khz / current_multiplier;
218 /* Calc number of p-states supported */
219 if (brand == EPS_BRAND_C7M)
220 states = max_multiplier - min_multiplier + 1;
221 else
222 states = 2;
223
224 /* Allocate private data and frequency table for current cpu */
225 centaur = kzalloc(sizeof(struct eps_cpu_data)
226 + (states + 1) * sizeof(struct cpufreq_frequency_table),
227 GFP_KERNEL);
228 if (!centaur)
229 return -ENOMEM;
230 eps_cpu[0] = centaur;
231
232 /* Copy basic values */
233 centaur->fsb = fsb;
234
235 /* Fill frequency and MSR value table */
236 f_table = &centaur->freq_table[0];
237 if (brand != EPS_BRAND_C7M) {
238 f_table[0].frequency = fsb * min_multiplier;
239 f_table[0].index = (min_multiplier << 8) | min_voltage;
240 f_table[1].frequency = fsb * max_multiplier;
241 f_table[1].index = (max_multiplier << 8) | max_voltage;
242 f_table[2].frequency = CPUFREQ_TABLE_END;
243 } else {
244 k = 0;
245 step = ((max_voltage - min_voltage) * 256)
246 / (max_multiplier - min_multiplier);
247 for (i = min_multiplier; i <= max_multiplier; i++) {
248 voltage = (k * step) / 256 + min_voltage;
249 f_table[k].frequency = fsb * i;
250 f_table[k].index = (i << 8) | voltage;
251 k++;
252 }
253 f_table[k].frequency = CPUFREQ_TABLE_END;
254 }
255
256 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
257 policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
258 policy->cur = fsb * current_multiplier;
259
260 ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
261 if (ret) {
262 kfree(centaur);
263 return ret;
264 }
265
266 cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
267 return 0;
268}
269
270static int eps_cpu_exit(struct cpufreq_policy *policy)
271{
272 unsigned int cpu = policy->cpu;
273 struct eps_cpu_data *centaur;
274 u32 lo, hi;
275
276 if (eps_cpu[cpu] == NULL)
277 return -ENODEV;
278 centaur = eps_cpu[cpu];
279
280 /* Get max frequency */
281 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
282 /* Set max frequency */
283 eps_set_state(centaur, cpu, hi & 0xffff);
284 /* Bye */
285 cpufreq_frequency_table_put_attr(policy->cpu);
286 kfree(eps_cpu[cpu]);
287 eps_cpu[cpu] = NULL;
288 return 0;
289}
290
291static struct freq_attr* eps_attr[] = {
292 &cpufreq_freq_attr_scaling_available_freqs,
293 NULL,
294};
295
296static struct cpufreq_driver eps_driver = {
297 .verify = eps_verify,
298 .target = eps_target,
299 .init = eps_cpu_init,
300 .exit = eps_cpu_exit,
301 .get = eps_get,
302 .name = "e_powersaver",
303 .owner = THIS_MODULE,
304 .attr = eps_attr,
305};
306
307static int __init eps_init(void)
308{
309 struct cpuinfo_x86 *c = cpu_data;
310
311 /* This driver will work only on Centaur C7 processors with
312 * Enhanced SpeedStep/PowerSaver registers */
313 if (c->x86_vendor != X86_VENDOR_CENTAUR
314 || c->x86 != 6 || c->x86_model != 10)
315 return -ENODEV;
316 if (!cpu_has(c, X86_FEATURE_EST))
317 return -ENODEV;
318
319 if (cpufreq_register_driver(&eps_driver))
320 return -EINVAL;
321 return 0;
322}
323
324static void __exit eps_exit(void)
325{
326 cpufreq_unregister_driver(&eps_driver);
327}
328
329MODULE_AUTHOR("Rafał Bilski <rafalbilski@interia.pl>");
330MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPUs.");
331MODULE_LICENSE("GPL");
332
333module_init(eps_init);
334module_exit(eps_exit);
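e_powersaver reads everything it needs from MSR_IA32_PERF_STATUS: the low word carries the current voltage ID and multiplier, the high word the maximum and minimum limits, and the FSB is inferred from cpu_khz divided by the current multiplier. A small sketch of that decoding, using invented register values, follows.

/* Sketch of the MSR_IA32_PERF_STATUS decoding performed by the driver
 * above.  The raw values and cpu_khz are invented for illustration. */
#include <stdio.h>

int main(void)
{
	unsigned int lo = 0x00000a08;	/* hypothetical: multiplier 10, VID 8 */
	unsigned int hi = 0x0402100c;	/* hypothetical min/max limits        */
	unsigned int cpu_khz = 1000000;	/* hypothetical TSC-derived CPU clock */

	unsigned int cur_mult = (lo >> 8) & 0xff;	/* current multiplier  */
	unsigned int cur_mv   = (lo & 0xff) * 16 + 700;	/* current voltage, mV */
	unsigned int max_mult = (hi >> 8) & 0xff;	/* highest multiplier  */
	unsigned int min_mult = (hi >> 24) & 0xff;	/* lowest multiplier   */
	unsigned int fsb      = cpu_khz / cur_mult;	/* FSB in kHz          */

	printf("mult %u (%u..%u), %u mV, FSB %u kHz, f = %u kHz\n",
	       cur_mult, min_mult, max_mult, cur_mv, fsb, fsb * cur_mult);
	return 0;
}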
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c
index a3db9332d652..b59878a0d9b3 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c
@@ -8,12 +8,11 @@
8 * VIA have currently 3 different versions of Longhaul. 8 * VIA have currently 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. 9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. 10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is the same as v1, but adds voltage scaling. 11 * Version 2 of longhaul is backward compatible with v1, but adds
12 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C) 12 * the LONGHAUL MSR for both frequency and voltage scaling.
13 * voltage scaling support has currently been disabled in this driver 13 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
14 * until we have code that gets it right.
15 * Version 3 of longhaul got renamed to Powersaver and redesigned 14 * Version 3 of longhaul got renamed to Powersaver and redesigned
16 * to use the POWERSAVER MSR at 0x110a. 15 * to use only the POWERSAVER MSR at 0x110a.
17 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. 16 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
18 * It's pretty much the same, feature-wise, as longhaul v2, though 17 * It's pretty much the same, feature-wise, as longhaul v2, though
19 * there is provision for scaling FSB too, but this doesn't work 18 * there is provision for scaling FSB too, but this doesn't work
@@ -51,10 +50,12 @@
51#define CPU_EZRA 3 50#define CPU_EZRA 3
52#define CPU_EZRA_T 4 51#define CPU_EZRA_T 4
53#define CPU_NEHEMIAH 5 52#define CPU_NEHEMIAH 5
53#define CPU_NEHEMIAH_C 6
54 54
55/* Flags */ 55/* Flags */
56#define USE_ACPI_C3 (1 << 1) 56#define USE_ACPI_C3 (1 << 1)
57#define USE_NORTHBRIDGE (1 << 2) 57#define USE_NORTHBRIDGE (1 << 2)
58#define USE_VT8235 (1 << 3)
58 59
59static int cpu_model; 60static int cpu_model;
60static unsigned int numscales=16; 61static unsigned int numscales=16;
@@ -63,7 +64,8 @@ static unsigned int fsb;
63static struct mV_pos *vrm_mV_table; 64static struct mV_pos *vrm_mV_table;
64static unsigned char *mV_vrm_table; 65static unsigned char *mV_vrm_table;
65struct f_msr { 66struct f_msr {
66 unsigned char vrm; 67 u8 vrm;
68 u8 pos;
67}; 69};
68static struct f_msr f_msr_table[32]; 70static struct f_msr f_msr_table[32];
69 71
@@ -73,10 +75,10 @@ static int can_scale_voltage;
73static struct acpi_processor *pr = NULL; 75static struct acpi_processor *pr = NULL;
74static struct acpi_processor_cx *cx = NULL; 76static struct acpi_processor_cx *cx = NULL;
75static u8 longhaul_flags; 77static u8 longhaul_flags;
78static u8 longhaul_pos;
76 79
77/* Module parameters */ 80/* Module parameters */
78static int scale_voltage; 81static int scale_voltage;
79static int ignore_latency;
80 82
81#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) 83#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
82 84
@@ -164,26 +166,47 @@ static void do_longhaul1(unsigned int clock_ratio_index)
164static void do_powersaver(int cx_address, unsigned int clock_ratio_index) 166static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
165{ 167{
166 union msr_longhaul longhaul; 168 union msr_longhaul longhaul;
169 u8 dest_pos;
167 u32 t; 170 u32 t;
168 171
172 dest_pos = f_msr_table[clock_ratio_index].pos;
173
169 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); 174 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
175 /* Setup new frequency */
170 longhaul.bits.RevisionKey = longhaul.bits.RevisionID; 176 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
171 longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf; 177 longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf;
172 longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4; 178 longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
173 longhaul.bits.EnableSoftBusRatio = 1; 179 /* Setup new voltage */
174 180 if (can_scale_voltage)
175 if (can_scale_voltage) {
176 longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm; 181 longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm;
182 /* Sync to timer tick */
183 safe_halt();
184 /* Raise voltage if necessary */
185 if (can_scale_voltage && longhaul_pos < dest_pos) {
177 longhaul.bits.EnableSoftVID = 1; 186 longhaul.bits.EnableSoftVID = 1;
187 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
188 /* Change voltage */
189 if (!cx_address) {
190 ACPI_FLUSH_CPU_CACHE();
191 halt();
192 } else {
193 ACPI_FLUSH_CPU_CACHE();
194 /* Invoke C3 */
195 inb(cx_address);
196 /* Dummy op - must do something useless after P_LVL3
197 * read */
198 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
199 }
200 longhaul.bits.EnableSoftVID = 0;
201 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
202 longhaul_pos = dest_pos;
178 } 203 }
179 204
180 /* Sync to timer tick */
181 safe_halt();
182 /* Change frequency on next halt or sleep */ 205 /* Change frequency on next halt or sleep */
206 longhaul.bits.EnableSoftBusRatio = 1;
183 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); 207 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
184 if (!cx_address) { 208 if (!cx_address) {
185 ACPI_FLUSH_CPU_CACHE(); 209 ACPI_FLUSH_CPU_CACHE();
186 /* Invoke C1 */
187 halt(); 210 halt();
188 } else { 211 } else {
189 ACPI_FLUSH_CPU_CACHE(); 212 ACPI_FLUSH_CPU_CACHE();
@@ -193,12 +216,29 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
193 t = inl(acpi_gbl_FADT.xpm_timer_block.address); 216 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
194 } 217 }
195 /* Disable bus ratio bit */ 218 /* Disable bus ratio bit */
196 local_irq_disable();
197 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
198 longhaul.bits.EnableSoftBusRatio = 0; 219 longhaul.bits.EnableSoftBusRatio = 0;
199 longhaul.bits.EnableSoftBSEL = 0;
200 longhaul.bits.EnableSoftVID = 0;
201 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); 220 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
221
222 /* Reduce voltage if necessary */
223 if (can_scale_voltage && longhaul_pos > dest_pos) {
224 longhaul.bits.EnableSoftVID = 1;
225 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
226 /* Change voltage */
227 if (!cx_address) {
228 ACPI_FLUSH_CPU_CACHE();
229 halt();
230 } else {
231 ACPI_FLUSH_CPU_CACHE();
232 /* Invoke C3 */
233 inb(cx_address);
234 /* Dummy op - must do something useless after P_LVL3
235 * read */
236 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
237 }
238 longhaul.bits.EnableSoftVID = 0;
239 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
240 longhaul_pos = dest_pos;
241 }
202} 242}
203 243
204/** 244/**
@@ -257,26 +297,19 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
257 /* 297 /*
258 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) 298 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
259 * Software controlled multipliers only. 299 * Software controlled multipliers only.
260 *
261 * *NB* Until we get voltage scaling working v1 & v2 are the same code.
262 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5b] and Ezra [C5C]
263 */ 300 */
264 case TYPE_LONGHAUL_V1: 301 case TYPE_LONGHAUL_V1:
265 case TYPE_LONGHAUL_V2:
266 do_longhaul1(clock_ratio_index); 302 do_longhaul1(clock_ratio_index);
267 break; 303 break;
268 304
269 /* 305 /*
306 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
307 *
270 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) 308 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
271 * We can scale voltage with this too, but that's currently
272 * disabled until we come up with a decent 'match freq to voltage'
273 * algorithm.
274 * When we add voltage scaling, we will also need to do the
275 * voltage/freq setting in order depending on the direction
276 * of scaling (like we do in powernow-k7.c)
277 * Nehemiah can do FSB scaling too, but this has never been proven 309 * Nehemiah can do FSB scaling too, but this has never been proven
278 * to work in practice. 310 * to work in practice.
279 */ 311 */
312 case TYPE_LONGHAUL_V2:
280 case TYPE_POWERSAVER: 313 case TYPE_POWERSAVER:
281 if (longhaul_flags & USE_ACPI_C3) { 314 if (longhaul_flags & USE_ACPI_C3) {
282 /* Don't allow wakeup */ 315 /* Don't allow wakeup */
@@ -301,6 +334,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
301 local_irq_restore(flags); 334 local_irq_restore(flags);
302 preempt_enable(); 335 preempt_enable();
303 336
337 freqs.new = calc_speed(longhaul_get_cpu_mult());
304 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 338 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
305} 339}
306 340
@@ -315,31 +349,19 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
315 349
316#define ROUNDING 0xf 350#define ROUNDING 0xf
317 351
318static int _guess(int guess, int mult)
319{
320 int target;
321
322 target = ((mult/10)*guess);
323 if (mult%10 != 0)
324 target += (guess/2);
325 target += ROUNDING/2;
326 target &= ~ROUNDING;
327 return target;
328}
329
330
331static int guess_fsb(int mult) 352static int guess_fsb(int mult)
332{ 353{
333 int speed = (cpu_khz/1000); 354 int speed = cpu_khz / 1000;
334 int i; 355 int i;
335 int speeds[] = { 66, 100, 133, 200 }; 356 int speeds[] = { 666, 1000, 1333, 2000 };
336 357 int f_max, f_min;
337 speed += ROUNDING/2; 358
338 speed &= ~ROUNDING; 359 for (i = 0; i < 4; i++) {
339 360 f_max = ((speeds[i] * mult) + 50) / 100;
340 for (i=0; i<4; i++) { 361 f_max += (ROUNDING / 2);
341 if (_guess(speeds[i], mult) == speed) 362 f_min = f_max - ROUNDING;
342 return speeds[i]; 363 if ((speed <= f_max) && (speed >= f_min))
364 return speeds[i] / 10;
343 } 365 }
344 return 0; 366 return 0;
345} 367}
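A quick way to see what the reworked guess_fsb() does: for each candidate FSB it predicts the CPU clock from the multiplier and accepts the candidate if the measured speed falls inside a small rounding window. A worked example with invented inputs (the function body is copied from the hunk above):

/* Worked example of the reworked guess_fsb() above: same arithmetic,
 * invented inputs. */
#include <stdio.h>

#define ROUNDING 0xf

static int guess_fsb(int speed, int mult)	/* speed in MHz, mult = multiplier * 10 */
{
	int speeds[] = { 666, 1000, 1333, 2000 };	/* FSB * 10 */
	int i, f_max, f_min;

	for (i = 0; i < 4; i++) {
		f_max = ((speeds[i] * mult) + 50) / 100;
		f_max += (ROUNDING / 2);
		f_min = f_max - ROUNDING;
		if ((speed <= f_max) && (speed >= f_min))
			return speeds[i] / 10;
	}
	return 0;
}

int main(void)
{
	/* a 10.0x multiplier at ~1333 MHz resolves to a 133 MHz FSB */
	printf("guessed FSB: %d MHz\n", guess_fsb(1333, 100));
	return 0;
}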
@@ -347,67 +369,40 @@ static int guess_fsb(int mult)
347 369
348static int __init longhaul_get_ranges(void) 370static int __init longhaul_get_ranges(void)
349{ 371{
350 unsigned long invalue;
351 unsigned int ezra_t_multipliers[32]= {
352 90, 30, 40, 100, 55, 35, 45, 95,
353 50, 70, 80, 60, 120, 75, 85, 65,
354 -1, 110, 120, -1, 135, 115, 125, 105,
355 130, 150, 160, 140, -1, 155, -1, 145 };
356 unsigned int j, k = 0; 372 unsigned int j, k = 0;
357 union msr_longhaul longhaul; 373 int mult;
358 int mult = 0;
359 374
360 switch (longhaul_version) { 375 /* Get current frequency */
361 case TYPE_LONGHAUL_V1: 376 mult = longhaul_get_cpu_mult();
362 case TYPE_LONGHAUL_V2: 377 if (mult == -1) {
363 /* Ugh, Longhaul v1 didn't have the min/max MSRs. 378 printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
364 Assume min=3.0x & max = whatever we booted at. */ 379 return -EINVAL;
380 }
381 fsb = guess_fsb(mult);
382 if (fsb == 0) {
383 printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
384 return -EINVAL;
385 }
386 /* Get max multiplier - as we always did.
 387 * Longhaul MSR is useful only when voltage scaling is enabled.
388 * C3 is booting at max anyway. */
389 maxmult = mult;
390 /* Get min multiplier */
391 switch (cpu_model) {
392 case CPU_NEHEMIAH:
393 minmult = 50;
394 break;
395 case CPU_NEHEMIAH_C:
396 minmult = 40;
397 break;
398 default:
365 minmult = 30; 399 minmult = 30;
366 maxmult = mult = longhaul_get_cpu_mult();
367 break; 400 break;
368
369 case TYPE_POWERSAVER:
370 /* Ezra-T */
371 if (cpu_model==CPU_EZRA_T) {
372 minmult = 30;
373 rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
374 invalue = longhaul.bits.MaxMHzBR;
375 if (longhaul.bits.MaxMHzBR4)
376 invalue += 16;
377 maxmult = mult = ezra_t_multipliers[invalue];
378 break;
379 }
380
381 /* Nehemiah */
382 if (cpu_model==CPU_NEHEMIAH) {
383 rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
384
385 /*
386 * TODO: This code works, but raises a lot of questions.
387 * - Some Nehemiah's seem to have broken Min/MaxMHzBR's.
388 * We get around this by using a hardcoded multiplier of 4.0x
 389 * for the minimum speed, and the speed we booted up at for the max.
390 * This is done in longhaul_get_cpu_mult() by reading the EBLCR register.
391 * - According to some VIA documentation EBLCR is only
392 * in pre-Nehemiah C3s. How this still works is a mystery.
393 * We're possibly using something undocumented and unsupported,
394 * But it works, so we don't grumble.
395 */
396 minmult=40;
397 maxmult = mult = longhaul_get_cpu_mult();
398 break;
399 }
400 } 401 }
401 fsb = guess_fsb(mult);
402 402
403 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n", 403 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n",
404 minmult/10, minmult%10, maxmult/10, maxmult%10); 404 minmult/10, minmult%10, maxmult/10, maxmult%10);
405 405
406 if (fsb == 0) {
407 printk (KERN_INFO PFX "Invalid (reserved) FSB!\n");
408 return -EINVAL;
409 }
410
411 highest_speed = calc_speed(maxmult); 406 highest_speed = calc_speed(maxmult);
412 lowest_speed = calc_speed(minmult); 407 lowest_speed = calc_speed(minmult);
413 dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, 408 dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
@@ -455,6 +450,7 @@ static void __init longhaul_setup_voltagescaling(void)
455 union msr_longhaul longhaul; 450 union msr_longhaul longhaul;
456 struct mV_pos minvid, maxvid; 451 struct mV_pos minvid, maxvid;
457 unsigned int j, speed, pos, kHz_step, numvscales; 452 unsigned int j, speed, pos, kHz_step, numvscales;
453 int min_vid_speed;
458 454
459 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); 455 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
460 if (!(longhaul.bits.RevisionID & 1)) { 456 if (!(longhaul.bits.RevisionID & 1)) {
@@ -468,14 +464,14 @@ static void __init longhaul_setup_voltagescaling(void)
468 mV_vrm_table = &mV_vrm85[0]; 464 mV_vrm_table = &mV_vrm85[0];
469 } else { 465 } else {
470 printk (KERN_INFO PFX "Mobile VRM\n"); 466 printk (KERN_INFO PFX "Mobile VRM\n");
467 if (cpu_model < CPU_NEHEMIAH)
468 return;
471 vrm_mV_table = &mobilevrm_mV[0]; 469 vrm_mV_table = &mobilevrm_mV[0];
472 mV_vrm_table = &mV_mobilevrm[0]; 470 mV_vrm_table = &mV_mobilevrm[0];
473 } 471 }
474 472
475 minvid = vrm_mV_table[longhaul.bits.MinimumVID]; 473 minvid = vrm_mV_table[longhaul.bits.MinimumVID];
476 maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; 474 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
477 numvscales = maxvid.pos - minvid.pos + 1;
478 kHz_step = (highest_speed - lowest_speed) / numvscales;
479 475
480 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { 476 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
481 printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " 477 printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
@@ -491,20 +487,59 @@ static void __init longhaul_setup_voltagescaling(void)
491 return; 487 return;
492 } 488 }
493 489
494 printk(KERN_INFO PFX "Max VID=%d.%03d Min VID=%d.%03d, %d possible voltage scales\n", 490 /* How many voltage steps */
491 numvscales = maxvid.pos - minvid.pos + 1;
492 printk(KERN_INFO PFX
493 "Max VID=%d.%03d "
494 "Min VID=%d.%03d, "
495 "%d possible voltage scales\n",
495 maxvid.mV/1000, maxvid.mV%1000, 496 maxvid.mV/1000, maxvid.mV%1000,
496 minvid.mV/1000, minvid.mV%1000, 497 minvid.mV/1000, minvid.mV%1000,
497 numvscales); 498 numvscales);
498 499
500 /* Calculate max frequency at min voltage */
501 j = longhaul.bits.MinMHzBR;
502 if (longhaul.bits.MinMHzBR4)
503 j += 16;
504 min_vid_speed = eblcr_table[j];
505 if (min_vid_speed == -1)
506 return;
507 switch (longhaul.bits.MinMHzFSB) {
508 case 0:
509 min_vid_speed *= 13333;
510 break;
511 case 1:
512 min_vid_speed *= 10000;
513 break;
514 case 3:
515 min_vid_speed *= 6666;
516 break;
517 default:
518 return;
519 break;
520 }
521 if (min_vid_speed >= highest_speed)
522 return;
523 /* Calculate kHz for one voltage step */
524 kHz_step = (highest_speed - min_vid_speed) / numvscales;
525
526
499 j = 0; 527 j = 0;
500 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { 528 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
501 speed = longhaul_table[j].frequency; 529 speed = longhaul_table[j].frequency;
502 pos = (speed - lowest_speed) / kHz_step + minvid.pos; 530 if (speed > min_vid_speed)
531 pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
532 else
533 pos = minvid.pos;
503 f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos]; 534 f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos];
535 f_msr_table[longhaul_table[j].index].pos = pos;
504 j++; 536 j++;
505 } 537 }
506 538
539 longhaul_pos = maxvid.pos;
507 can_scale_voltage = 1; 540 can_scale_voltage = 1;
541 printk(KERN_INFO PFX "Voltage scaling enabled. "
542 "Use of \"conservative\" governor is highly recommended.\n");
508} 543}
509 544
510 545
@@ -573,20 +608,51 @@ static int enable_arbiter_disable(void)
573 if (dev != NULL) { 608 if (dev != NULL) {
574 /* Enable access to port 0x22 */ 609 /* Enable access to port 0x22 */
575 pci_read_config_byte(dev, reg, &pci_cmd); 610 pci_read_config_byte(dev, reg, &pci_cmd);
576 if ( !(pci_cmd & 1<<7) ) { 611 if (!(pci_cmd & 1<<7)) {
577 pci_cmd |= 1<<7; 612 pci_cmd |= 1<<7;
578 pci_write_config_byte(dev, reg, pci_cmd); 613 pci_write_config_byte(dev, reg, pci_cmd);
614 pci_read_config_byte(dev, reg, &pci_cmd);
615 if (!(pci_cmd & 1<<7)) {
616 printk(KERN_ERR PFX
617 "Can't enable access to port 0x22.\n");
618 return 0;
619 }
579 } 620 }
580 return 1; 621 return 1;
581 } 622 }
582 return 0; 623 return 0;
583} 624}
584 625
626static int longhaul_setup_vt8235(void)
627{
628 struct pci_dev *dev;
629 u8 pci_cmd;
630
631 /* Find VT8235 southbridge */
632 dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
633 if (dev != NULL) {
634 /* Set transition time to max */
635 pci_read_config_byte(dev, 0xec, &pci_cmd);
636 pci_cmd &= ~(1 << 2);
637 pci_write_config_byte(dev, 0xec, pci_cmd);
638 pci_read_config_byte(dev, 0xe4, &pci_cmd);
639 pci_cmd &= ~(1 << 7);
640 pci_write_config_byte(dev, 0xe4, pci_cmd);
641 pci_read_config_byte(dev, 0xe5, &pci_cmd);
642 pci_cmd |= 1 << 7;
643 pci_write_config_byte(dev, 0xe5, pci_cmd);
644 return 1;
645 }
646 return 0;
647}
648
585static int __init longhaul_cpu_init(struct cpufreq_policy *policy) 649static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
586{ 650{
587 struct cpuinfo_x86 *c = cpu_data; 651 struct cpuinfo_x86 *c = cpu_data;
588 char *cpuname=NULL; 652 char *cpuname=NULL;
589 int ret; 653 int ret;
654 u32 lo, hi;
655 int vt8235_present;
590 656
591 /* Check what we have on this motherboard */ 657 /* Check what we have on this motherboard */
592 switch (c->x86_model) { 658 switch (c->x86_model) {
@@ -599,16 +665,20 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
599 break; 665 break;
600 666
601 case 7: 667 case 7:
602 longhaul_version = TYPE_LONGHAUL_V1;
603 switch (c->x86_mask) { 668 switch (c->x86_mask) {
604 case 0: 669 case 0:
670 longhaul_version = TYPE_LONGHAUL_V1;
605 cpu_model = CPU_SAMUEL2; 671 cpu_model = CPU_SAMUEL2;
606 cpuname = "C3 'Samuel 2' [C5B]"; 672 cpuname = "C3 'Samuel 2' [C5B]";
607 /* Note, this is not a typo, early Samuel2's had Samuel1 ratios. */ 673 /* Note, this is not a typo, early Samuel2's had
608 memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio)); 674 * Samuel1 ratios. */
609 memcpy (eblcr_table, samuel2_eblcr, sizeof(samuel2_eblcr)); 675 memcpy(clock_ratio, samuel1_clock_ratio,
676 sizeof(samuel1_clock_ratio));
677 memcpy(eblcr_table, samuel2_eblcr,
678 sizeof(samuel2_eblcr));
610 break; 679 break;
611 case 1 ... 15: 680 case 1 ... 15:
681 longhaul_version = TYPE_LONGHAUL_V2;
612 if (c->x86_mask < 8) { 682 if (c->x86_mask < 8) {
613 cpu_model = CPU_SAMUEL2; 683 cpu_model = CPU_SAMUEL2;
614 cpuname = "C3 'Samuel 2' [C5B]"; 684 cpuname = "C3 'Samuel 2' [C5B]";
@@ -616,8 +686,10 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
616 cpu_model = CPU_EZRA; 686 cpu_model = CPU_EZRA;
617 cpuname = "C3 'Ezra' [C5C]"; 687 cpuname = "C3 'Ezra' [C5C]";
618 } 688 }
619 memcpy (clock_ratio, ezra_clock_ratio, sizeof(ezra_clock_ratio)); 689 memcpy(clock_ratio, ezra_clock_ratio,
620 memcpy (eblcr_table, ezra_eblcr, sizeof(ezra_eblcr)); 690 sizeof(ezra_clock_ratio));
691 memcpy(eblcr_table, ezra_eblcr,
692 sizeof(ezra_eblcr));
621 break; 693 break;
622 } 694 }
623 break; 695 break;
@@ -632,24 +704,24 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
632 break; 704 break;
633 705
634 case 9: 706 case 9:
635 cpu_model = CPU_NEHEMIAH;
636 longhaul_version = TYPE_POWERSAVER; 707 longhaul_version = TYPE_POWERSAVER;
637 numscales=32; 708 numscales = 32;
709 memcpy(clock_ratio,
710 nehemiah_clock_ratio,
711 sizeof(nehemiah_clock_ratio));
712 memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr));
638 switch (c->x86_mask) { 713 switch (c->x86_mask) {
639 case 0 ... 1: 714 case 0 ... 1:
640 cpuname = "C3 'Nehemiah A' [C5N]"; 715 cpu_model = CPU_NEHEMIAH;
641 memcpy (clock_ratio, nehemiah_a_clock_ratio, sizeof(nehemiah_a_clock_ratio)); 716 cpuname = "C3 'Nehemiah A' [C5XLOE]";
642 memcpy (eblcr_table, nehemiah_a_eblcr, sizeof(nehemiah_a_eblcr));
643 break; 717 break;
644 case 2 ... 4: 718 case 2 ... 4:
645 cpuname = "C3 'Nehemiah B' [C5N]"; 719 cpu_model = CPU_NEHEMIAH;
646 memcpy (clock_ratio, nehemiah_b_clock_ratio, sizeof(nehemiah_b_clock_ratio)); 720 cpuname = "C3 'Nehemiah B' [C5XLOH]";
647 memcpy (eblcr_table, nehemiah_b_eblcr, sizeof(nehemiah_b_eblcr));
648 break; 721 break;
649 case 5 ... 15: 722 case 5 ... 15:
650 cpuname = "C3 'Nehemiah C' [C5N]"; 723 cpu_model = CPU_NEHEMIAH_C;
651 memcpy (clock_ratio, nehemiah_c_clock_ratio, sizeof(nehemiah_c_clock_ratio)); 724 cpuname = "C3 'Nehemiah C' [C5P]";
652 memcpy (eblcr_table, nehemiah_c_eblcr, sizeof(nehemiah_c_eblcr));
653 break; 725 break;
654 } 726 }
655 break; 727 break;
@@ -658,6 +730,13 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
658 cpuname = "Unknown"; 730 cpuname = "Unknown";
659 break; 731 break;
660 } 732 }
733 /* Check Longhaul ver. 2 */
734 if (longhaul_version == TYPE_LONGHAUL_V2) {
735 rdmsr(MSR_VIA_LONGHAUL, lo, hi);
736 if (lo == 0 && hi == 0)
737 /* Looks like MSR isn't present */
738 longhaul_version = TYPE_LONGHAUL_V1;
739 }
661 740
662 printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname); 741 printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
663 switch (longhaul_version) { 742 switch (longhaul_version) {
@@ -670,15 +749,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
670 break; 749 break;
671 }; 750 };
672 751
752 /* Doesn't hurt */
753 vt8235_present = longhaul_setup_vt8235();
754
673 /* Find ACPI data for processor */ 755 /* Find ACPI data for processor */
674 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, 756 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
675 &longhaul_walk_callback, NULL, (void *)&pr); 757 ACPI_UINT32_MAX, &longhaul_walk_callback,
758 NULL, (void *)&pr);
676 759
677 /* Check ACPI support for C3 state */ 760 /* Check ACPI support for C3 state */
678 if ((pr != NULL) && (longhaul_version == TYPE_POWERSAVER)) { 761 if (pr != NULL && longhaul_version != TYPE_LONGHAUL_V1) {
679 cx = &pr->power.states[ACPI_STATE_C3]; 762 cx = &pr->power.states[ACPI_STATE_C3];
680 if (cx->address > 0 && 763 if (cx->address > 0 && cx->latency <= 1000) {
681 (cx->latency <= 1000 || ignore_latency != 0) ) {
682 longhaul_flags |= USE_ACPI_C3; 764 longhaul_flags |= USE_ACPI_C3;
683 goto print_support_type; 765 goto print_support_type;
684 } 766 }
@@ -688,8 +770,11 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
688 longhaul_flags |= USE_NORTHBRIDGE; 770 longhaul_flags |= USE_NORTHBRIDGE;
689 goto print_support_type; 771 goto print_support_type;
690 } 772 }
691 773 /* Use VT8235 southbridge if present */
692 /* No ACPI C3 or we can't use it */ 774 if (longhaul_version == TYPE_POWERSAVER && vt8235_present) {
775 longhaul_flags |= USE_VT8235;
776 goto print_support_type;
777 }
693 /* Check ACPI support for bus master arbiter disable */ 778 /* Check ACPI support for bus master arbiter disable */
694 if ((pr == NULL) || !(pr->flags.bm_control)) { 779 if ((pr == NULL) || !(pr->flags.bm_control)) {
695 printk(KERN_ERR PFX 780 printk(KERN_ERR PFX
@@ -698,18 +783,18 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
698 } 783 }
699 784
700print_support_type: 785print_support_type:
701 if (!(longhaul_flags & USE_NORTHBRIDGE)) { 786 if (longhaul_flags & USE_NORTHBRIDGE)
702 printk (KERN_INFO PFX "Using ACPI support.\n");
703 } else {
704 printk (KERN_INFO PFX "Using northbridge support.\n"); 787 printk (KERN_INFO PFX "Using northbridge support.\n");
705 } 788 else if (longhaul_flags & USE_VT8235)
789 printk (KERN_INFO PFX "Using VT8235 support.\n");
790 else
791 printk (KERN_INFO PFX "Using ACPI support.\n");
706 792
707 ret = longhaul_get_ranges(); 793 ret = longhaul_get_ranges();
708 if (ret != 0) 794 if (ret != 0)
709 return ret; 795 return ret;
710 796
711 if ((longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) && 797 if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
712 (scale_voltage != 0))
713 longhaul_setup_voltagescaling(); 798 longhaul_setup_voltagescaling();
714 799
715 policy->governor = CPUFREQ_DEFAULT_GOVERNOR; 800 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
@@ -797,8 +882,6 @@ static void __exit longhaul_exit(void)
797 882
798module_param (scale_voltage, int, 0644); 883module_param (scale_voltage, int, 0644);
799MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); 884MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
800module_param(ignore_latency, int, 0644);
801MODULE_PARM_DESC(ignore_latency, "Skip ACPI C3 latency test");
802 885
803MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 886MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
804MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); 887MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
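The ratio tables this hunk touches store the CPU multiplier times ten (95 means 9.5x, 120 means 12.0x) and use -1 for reserved encodings, as the table comments in the header show. A minimal user-space sketch of how such an entry would translate into a target frequency, assuming a known front-side bus speed; khz_from_entry() and the 133 MHz FSB value are made up for illustration and are not part of the driver.

#include <stdio.h>

/* Sketch only: clock_ratio entries are multipliers x10, -1 is reserved. */
static unsigned int khz_from_entry(int ratio, unsigned int fsb_khz)
{
	if (ratio == -1)
		return 0;		/* reserved encoding, no valid frequency */
	/* e.g. 95 means 9.5x, so kHz = fsb_khz * 95 / 10 */
	return fsb_khz * (unsigned int)ratio / 10;
}

int main(void)
{
	unsigned int fsb_khz = 133333;	/* illustrative 133 MHz bus */

	printf("ratio 95  -> %u kHz\n", khz_from_entry(95, fsb_khz));
	printf("ratio 120 -> %u kHz\n", khz_from_entry(120, fsb_khz));
	printf("ratio -1  -> %u kHz (reserved)\n", khz_from_entry(-1, fsb_khz));
	return 0;
}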
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h
index bc4682aad69b..bb0a04b1d1ab 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h
@@ -235,84 +235,14 @@ static int __initdata ezrat_eblcr[32] = {
235/* 235/*
236 * VIA C3 Nehemiah */ 236 * VIA C3 Nehemiah */
237 237
238static int __initdata nehemiah_a_clock_ratio[32] = { 238static int __initdata nehemiah_clock_ratio[32] = {
239 100, /* 0000 -> 10.0x */ 239 100, /* 0000 -> 10.0x */
240 160, /* 0001 -> 16.0x */ 240 160, /* 0001 -> 16.0x */
241 -1, /* 0010 -> RESERVED */ 241 40, /* 0010 -> 4.0x */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 -1, /* 0110 -> RESERVED */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 100, /* 0000 -> 10.0x */
256 -1, /* 0001 -> RESERVED */
257 120, /* 0010 -> 12.0x */
258 90, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 120, /* 1111 -> 12.0x */
271};
272
273static int __initdata nehemiah_b_clock_ratio[32] = {
274 100, /* 0000 -> 10.0x */
275 160, /* 0001 -> 16.0x */
276 -1, /* 0010 -> RESERVED */
277 90, /* 0011 -> 9.0x */
278 95, /* 0100 -> 9.5x */
279 -1, /* 0101 -> RESERVED */
280 -1, /* 0110 -> RESERVED */
281 55, /* 0111 -> 5.5x */
282 60, /* 1000 -> 6.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 50, /* 1011 -> 5.0x */
286 65, /* 1100 -> 6.5x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 120, /* 1111 -> 12.0x */
290 100, /* 0000 -> 10.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 90, /* 0011 -> 9.0x */
294 105, /* 0100 -> 10.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 135, /* 0111 -> 13.5x */
298 140, /* 1000 -> 14.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 130, /* 1011 -> 13.0x */
302 145, /* 1100 -> 14.5x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 120, /* 1111 -> 12.0x */
306};
307
308static int __initdata nehemiah_c_clock_ratio[32] = {
309 100, /* 0000 -> 10.0x */
310 160, /* 0001 -> 16.0x */
311 40, /* 0010 -> RESERVED */
312 90, /* 0011 -> 9.0x */ 242 90, /* 0011 -> 9.0x */
313 95, /* 0100 -> 9.5x */ 243 95, /* 0100 -> 9.5x */
314 -1, /* 0101 -> RESERVED */ 244 -1, /* 0101 -> RESERVED */
315 45, /* 0110 -> RESERVED */ 245 45, /* 0110 -> 4.5x */
316 55, /* 0111 -> 5.5x */ 246 55, /* 0111 -> 5.5x */
317 60, /* 1000 -> 6.0x */ 247 60, /* 1000 -> 6.0x */
318 70, /* 1001 -> 7.0x */ 248 70, /* 1001 -> 7.0x */
@@ -340,84 +270,14 @@ static int __initdata nehemiah_c_clock_ratio[32] = {
340 120, /* 1111 -> 12.0x */ 270 120, /* 1111 -> 12.0x */
341}; 271};
342 272
343static int __initdata nehemiah_a_eblcr[32] = { 273static int __initdata nehemiah_eblcr[32] = {
344 50, /* 0000 -> 5.0x */
345 160, /* 0001 -> 16.0x */
346 -1, /* 0010 -> RESERVED */
347 100, /* 0011 -> 10.0x */
348 55, /* 0100 -> 5.5x */
349 -1, /* 0101 -> RESERVED */
350 -1, /* 0110 -> RESERVED */
351 95, /* 0111 -> 9.5x */
352 90, /* 1000 -> 9.0x */
353 70, /* 1001 -> 7.0x */
354 80, /* 1010 -> 8.0x */
355 60, /* 1011 -> 6.0x */
356 120, /* 1100 -> 12.0x */
357 75, /* 1101 -> 7.5x */
358 85, /* 1110 -> 8.5x */
359 65, /* 1111 -> 6.5x */
360 90, /* 0000 -> 9.0x */
361 -1, /* 0001 -> RESERVED */
362 120, /* 0010 -> 12.0x */
363 100, /* 0011 -> 10.0x */
364 135, /* 0100 -> 13.5x */
365 115, /* 0101 -> 11.5x */
366 125, /* 0110 -> 12.5x */
367 105, /* 0111 -> 10.5x */
368 130, /* 1000 -> 13.0x */
369 150, /* 1001 -> 15.0x */
370 160, /* 1010 -> 16.0x */
371 140, /* 1011 -> 14.0x */
372 120, /* 1100 -> 12.0x */
373 155, /* 1101 -> 15.5x */
374 -1, /* 1110 -> RESERVED (13.0x) */
375 145 /* 1111 -> 14.5x */
376 /* end of table */
377};
378static int __initdata nehemiah_b_eblcr[32] = {
379 50, /* 0000 -> 5.0x */
380 160, /* 0001 -> 16.0x */
381 -1, /* 0010 -> RESERVED */
382 100, /* 0011 -> 10.0x */
383 55, /* 0100 -> 5.5x */
384 -1, /* 0101 -> RESERVED */
385 -1, /* 0110 -> RESERVED */
386 95, /* 0111 -> 9.5x */
387 90, /* 1000 -> 9.0x */
388 70, /* 1001 -> 7.0x */
389 80, /* 1010 -> 8.0x */
390 60, /* 1011 -> 6.0x */
391 120, /* 1100 -> 12.0x */
392 75, /* 1101 -> 7.5x */
393 85, /* 1110 -> 8.5x */
394 65, /* 1111 -> 6.5x */
395 90, /* 0000 -> 9.0x */
396 110, /* 0001 -> 11.0x */
397 120, /* 0010 -> 12.0x */
398 100, /* 0011 -> 10.0x */
399 135, /* 0100 -> 13.5x */
400 115, /* 0101 -> 11.5x */
401 125, /* 0110 -> 12.5x */
402 105, /* 0111 -> 10.5x */
403 130, /* 1000 -> 13.0x */
404 150, /* 1001 -> 15.0x */
405 160, /* 1010 -> 16.0x */
406 140, /* 1011 -> 14.0x */
407 120, /* 1100 -> 12.0x */
408 155, /* 1101 -> 15.5x */
409 -1, /* 1110 -> RESERVED (13.0x) */
410 145 /* 1111 -> 14.5x */
411 /* end of table */
412};
413static int __initdata nehemiah_c_eblcr[32] = {
414 50, /* 0000 -> 5.0x */ 274 50, /* 0000 -> 5.0x */
415 160, /* 0001 -> 16.0x */ 275 160, /* 0001 -> 16.0x */
416 40, /* 0010 -> RESERVED */ 276 40, /* 0010 -> 4.0x */
417 100, /* 0011 -> 10.0x */ 277 100, /* 0011 -> 10.0x */
418 55, /* 0100 -> 5.5x */ 278 55, /* 0100 -> 5.5x */
419 -1, /* 0101 -> RESERVED */ 279 -1, /* 0101 -> RESERVED */
420 45, /* 0110 -> RESERVED */ 280 45, /* 0110 -> 4.5x */
421 95, /* 0111 -> 9.5x */ 281 95, /* 0111 -> 9.5x */
422 90, /* 1000 -> 9.0x */ 282 90, /* 1000 -> 9.0x */
423 70, /* 1001 -> 7.0x */ 283 70, /* 1001 -> 7.0x */
@@ -443,7 +303,6 @@ static int __initdata nehemiah_c_eblcr[32] = {
443 155, /* 1101 -> 15.5x */ 303 155, /* 1101 -> 15.5x */
444 -1, /* 1110 -> RESERVED (13.0x) */ 304 -1, /* 1110 -> RESERVED (13.0x) */
445 145 /* 1111 -> 14.5x */ 305 145 /* 1111 -> 14.5x */
446 /* end of table */
447}; 306};
448 307
449/* 308/*
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
index 2d6491672559..fe3b67005ebb 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
@@ -1289,7 +1289,11 @@ static unsigned int powernowk8_get (unsigned int cpu)
1289 if (query_current_values_with_pending_wait(data)) 1289 if (query_current_values_with_pending_wait(data))
1290 goto out; 1290 goto out;
1291 1291
1292 khz = find_khz_freq_from_fid(data->currfid); 1292 if (cpu_family == CPU_HW_PSTATE)
1293 khz = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1294 else
1295 khz = find_khz_freq_from_fid(data->currfid);
1296
1293 1297
1294out: 1298out:
1295 set_cpus_allowed(current, oldmask); 1299 set_cpus_allowed(current, oldmask);
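The powernow-k8 hunk makes powernowk8_get() pick the right conversion for hardware-P-state parts instead of always using the FID-only helper. As a rough sketch of what the FID path computes: on pre-hardware-P-state K8 parts the FID is conventionally a multiplier in 100 MHz steps above an 800 MHz base. The helper below follows that convention for illustration only; it is not copied from the driver's find_khz_freq_from_fid().

#include <stdio.h>

/* Sketch, assuming the usual K8 FID convention (800 MHz base, 100 MHz steps). */
static unsigned int khz_from_fid(unsigned int fid)
{
	return (800 + fid * 100) * 1000;
}

int main(void)
{
	printf("fid 0x0 -> %u kHz\n", khz_from_fid(0x0));	/* 800000 */
	printf("fid 0xa -> %u kHz\n", khz_from_fid(0xa));	/* 1800000 */
	return 0;
}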
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index c0c3b59de32c..de27bd07bc9c 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -6,6 +6,7 @@
6#include <asm/io.h> 6#include <asm/io.h>
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/timer.h> 8#include <asm/timer.h>
9#include <asm/pci-direct.h>
9 10
10#include "cpu.h" 11#include "cpu.h"
11 12
@@ -161,19 +162,19 @@ static void __cpuinit set_cx86_inc(void)
161static void __cpuinit geode_configure(void) 162static void __cpuinit geode_configure(void)
162{ 163{
163 unsigned long flags; 164 unsigned long flags;
164 u8 ccr3, ccr4; 165 u8 ccr3;
165 local_irq_save(flags); 166 local_irq_save(flags);
166 167
167 /* Suspend on halt power saving and enable #SUSP pin */ 168 /* Suspend on halt power saving and enable #SUSP pin */
168 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); 169 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
169 170
170 ccr3 = getCx86(CX86_CCR3); 171 ccr3 = getCx86(CX86_CCR3);
171 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* Enable */ 172 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
172
173 ccr4 = getCx86(CX86_CCR4);
174 ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */
175 173
176 setCx86(CX86_CCR3, ccr3); 174
175 /* FPU fast, DTE cache, Mem bypass */
176 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
177 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
177 178
178 set_cx86_memwb(); 179 set_cx86_memwb();
179 set_cx86_reorder(); 180 set_cx86_reorder();
@@ -183,14 +184,6 @@ static void __cpuinit geode_configure(void)
183} 184}
184 185
185 186
186#ifdef CONFIG_PCI
187static struct pci_device_id __cpuinitdata cyrix_55x0[] = {
188 { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) },
189 { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) },
190 { },
191};
192#endif
193
194static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) 187static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
195{ 188{
196 unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; 189 unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
@@ -258,6 +251,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
258 251
259 case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ 252 case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
260#ifdef CONFIG_PCI 253#ifdef CONFIG_PCI
254 {
255 u32 vendor, device;
261 /* It isn't really a PCI quirk directly, but the cure is the 256 /* It isn't really a PCI quirk directly, but the cure is the
262 same. The MediaGX has deep magic SMM stuff that handles the 257 same. The MediaGX has deep magic SMM stuff that handles the
263 SB emulation. It throws away the fifo on disable_dma() which 258 SB emulation. It throws away the fifo on disable_dma() which
@@ -273,22 +268,34 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
273 printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); 268 printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
274 isa_dma_bridge_buggy = 2; 269 isa_dma_bridge_buggy = 2;
275 270
271 /* We do this before the PCI layer is running. However we
272 are safe here as we know the bridge must be a Cyrix
273 companion and must be present */
274 vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
275 device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
276 276
277 /* 277 /*
278 * The 5510/5520 companion chips have a funky PIT. 278 * The 5510/5520 companion chips have a funky PIT.
279 */ 279 */
280 if (pci_dev_present(cyrix_55x0)) 280 if (vendor == PCI_VENDOR_ID_CYRIX &&
281 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
281 pit_latch_buggy = 1; 282 pit_latch_buggy = 1;
283 }
282#endif 284#endif
283 c->x86_cache_size=16; /* Yep 16K integrated cache that's it */ 285 c->x86_cache_size=16; /* Yep 16K integrated cache that's it */
284 286
285 /* GXm supports extended cpuid levels 'ala' AMD */ 287 /* GXm supports extended cpuid levels 'ala' AMD */
286 if (c->cpuid_level == 2) { 288 if (c->cpuid_level == 2) {
287 /* Enable cxMMX extensions (GX1 Datasheet 54) */ 289 /* Enable cxMMX extensions (GX1 Datasheet 54) */
288 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); 290 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
289 291
290 /* GXlv/GXm/GX1 */ 292 /*
291 if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63) 293 * GXm : 0x30 ... 0x5f GXm datasheet 51
294 * GXlv: 0x6x GXlv datasheet 54
295 * ? : 0x7x
296 * GX1 : 0x8x GX1 datasheet 56
297 */
298 if((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <=dir1 && dir1 <= 0x8f))
292 geode_configure(); 299 geode_configure();
293 get_model_name(c); /* get CPU marketing name */ 300 get_model_name(c); /* get CPU marketing name */
294 return; 301 return;
@@ -415,15 +422,14 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
415 422
416 if (dir0 == 5 || dir0 == 3) 423 if (dir0 == 5 || dir0 == 3)
417 { 424 {
418 unsigned char ccr3, ccr4; 425 unsigned char ccr3;
419 unsigned long flags; 426 unsigned long flags;
420 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); 427 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
421 local_irq_save(flags); 428 local_irq_save(flags);
422 ccr3 = getCx86(CX86_CCR3); 429 ccr3 = getCx86(CX86_CCR3);
423 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
424 ccr4 = getCx86(CX86_CCR4); 431 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */
425 setCx86(CX86_CCR4, ccr4 | 0x80); /* enable cpuid */ 432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
426 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
427 local_irq_restore(flags); 433 local_irq_restore(flags);
428 } 434 }
429 } 435 }
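The MediaGX hunk replaces the pci_dev_present() lookup with direct type-1 config reads via asm/pci-direct.h, because this code runs before the PCI core is initialised. A short sketch of the same early probe wrapped into a helper; has_cyrix_55x0() is a made-up name, while the read_pci_config_16() call and the fixed config address are exactly those used in the hunk.

#include <asm/pci-direct.h>
#include <linux/pci_ids.h>

/* Illustrative helper, not part of the commit: detect the Cyrix 5510/5520
 * companion with direct config cycles, before the PCI layer is running. */
static int __init has_cyrix_55x0(void)
{
	u32 vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
	u32 device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);

	return vendor == PCI_VENDOR_ID_CYRIX &&
	       (device == PCI_DEVICE_ID_CYRIX_5510 ||
		device == PCI_DEVICE_ID_CYRIX_5520);
}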
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index d555bec0db99..4f10c62d180c 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -12,6 +12,7 @@
12 12
13#include <asm/processor.h> 13#include <asm/processor.h>
14#include <asm/system.h> 14#include <asm/system.h>
15#include <asm/mce.h>
15 16
16#include "mce.h" 17#include "mce.h"
17 18
diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h
index 84fd4cf7d0fb..81fb6e2d35f3 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.h
+++ b/arch/i386/kernel/cpu/mcheck/mce.h
@@ -1,4 +1,5 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <asm/mce.h>
2 3
3void amd_mcheck_init(struct cpuinfo_x86 *c); 4void amd_mcheck_init(struct cpuinfo_x86 *c);
4void intel_p4_mcheck_init(struct cpuinfo_x86 *c); 5void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
@@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c);
9/* Call the installed machine check handler for this CPU setup. */ 10/* Call the installed machine check handler for this CPU setup. */
10extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); 11extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
11 12
12extern int mce_disabled;
13extern int nr_mce_banks; 13extern int nr_mce_banks;
14 14
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 504434a46011..8359c19d3a23 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -12,6 +12,7 @@
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/idle.h>
15 16
16#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
17 18
@@ -59,6 +60,7 @@ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_therm
59 60
60fastcall void smp_thermal_interrupt(struct pt_regs *regs) 61fastcall void smp_thermal_interrupt(struct pt_regs *regs)
61{ 62{
63 exit_idle();
62 irq_enter(); 64 irq_enter();
63 vendor_thermal_interrupt(regs); 65 vendor_thermal_interrupt(regs);
64 irq_exit(); 66 irq_exit();
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index ee771f305f96..c7d8f1756745 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -211,6 +211,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
211 default: 211 default:
212 return -ENOTTY; 212 return -ENOTTY;
213 case MTRRIOC_ADD_ENTRY: 213 case MTRRIOC_ADD_ENTRY:
214#ifdef CONFIG_COMPAT
215 case MTRRIOC32_ADD_ENTRY:
216#endif
214 if (!capable(CAP_SYS_ADMIN)) 217 if (!capable(CAP_SYS_ADMIN))
215 return -EPERM; 218 return -EPERM;
216 err = 219 err =
@@ -218,21 +221,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
218 file, 0); 221 file, 0);
219 break; 222 break;
220 case MTRRIOC_SET_ENTRY: 223 case MTRRIOC_SET_ENTRY:
224#ifdef CONFIG_COMPAT
225 case MTRRIOC32_SET_ENTRY:
226#endif
221 if (!capable(CAP_SYS_ADMIN)) 227 if (!capable(CAP_SYS_ADMIN))
222 return -EPERM; 228 return -EPERM;
223 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); 229 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
224 break; 230 break;
225 case MTRRIOC_DEL_ENTRY: 231 case MTRRIOC_DEL_ENTRY:
232#ifdef CONFIG_COMPAT
233 case MTRRIOC32_DEL_ENTRY:
234#endif
226 if (!capable(CAP_SYS_ADMIN)) 235 if (!capable(CAP_SYS_ADMIN))
227 return -EPERM; 236 return -EPERM;
228 err = mtrr_file_del(sentry.base, sentry.size, file, 0); 237 err = mtrr_file_del(sentry.base, sentry.size, file, 0);
229 break; 238 break;
230 case MTRRIOC_KILL_ENTRY: 239 case MTRRIOC_KILL_ENTRY:
240#ifdef CONFIG_COMPAT
241 case MTRRIOC32_KILL_ENTRY:
242#endif
231 if (!capable(CAP_SYS_ADMIN)) 243 if (!capable(CAP_SYS_ADMIN))
232 return -EPERM; 244 return -EPERM;
233 err = mtrr_del(-1, sentry.base, sentry.size); 245 err = mtrr_del(-1, sentry.base, sentry.size);
234 break; 246 break;
235 case MTRRIOC_GET_ENTRY: 247 case MTRRIOC_GET_ENTRY:
248#ifdef CONFIG_COMPAT
249 case MTRRIOC32_GET_ENTRY:
250#endif
236 if (gentry.regnum >= num_var_ranges) 251 if (gentry.regnum >= num_var_ranges)
237 return -EINVAL; 252 return -EINVAL;
238 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); 253 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
@@ -249,6 +264,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
249 264
250 break; 265 break;
251 case MTRRIOC_ADD_PAGE_ENTRY: 266 case MTRRIOC_ADD_PAGE_ENTRY:
267#ifdef CONFIG_COMPAT
268 case MTRRIOC32_ADD_PAGE_ENTRY:
269#endif
252 if (!capable(CAP_SYS_ADMIN)) 270 if (!capable(CAP_SYS_ADMIN))
253 return -EPERM; 271 return -EPERM;
254 err = 272 err =
@@ -256,21 +274,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
256 file, 1); 274 file, 1);
257 break; 275 break;
258 case MTRRIOC_SET_PAGE_ENTRY: 276 case MTRRIOC_SET_PAGE_ENTRY:
277#ifdef CONFIG_COMPAT
278 case MTRRIOC32_SET_PAGE_ENTRY:
279#endif
259 if (!capable(CAP_SYS_ADMIN)) 280 if (!capable(CAP_SYS_ADMIN))
260 return -EPERM; 281 return -EPERM;
261 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); 282 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
262 break; 283 break;
263 case MTRRIOC_DEL_PAGE_ENTRY: 284 case MTRRIOC_DEL_PAGE_ENTRY:
285#ifdef CONFIG_COMPAT
286 case MTRRIOC32_DEL_PAGE_ENTRY:
287#endif
264 if (!capable(CAP_SYS_ADMIN)) 288 if (!capable(CAP_SYS_ADMIN))
265 return -EPERM; 289 return -EPERM;
266 err = mtrr_file_del(sentry.base, sentry.size, file, 1); 290 err = mtrr_file_del(sentry.base, sentry.size, file, 1);
267 break; 291 break;
268 case MTRRIOC_KILL_PAGE_ENTRY: 292 case MTRRIOC_KILL_PAGE_ENTRY:
293#ifdef CONFIG_COMPAT
294 case MTRRIOC32_KILL_PAGE_ENTRY:
295#endif
269 if (!capable(CAP_SYS_ADMIN)) 296 if (!capable(CAP_SYS_ADMIN))
270 return -EPERM; 297 return -EPERM;
271 err = mtrr_del_page(-1, sentry.base, sentry.size); 298 err = mtrr_del_page(-1, sentry.base, sentry.size);
272 break; 299 break;
273 case MTRRIOC_GET_PAGE_ENTRY: 300 case MTRRIOC_GET_PAGE_ENTRY:
301#ifdef CONFIG_COMPAT
302 case MTRRIOC32_GET_PAGE_ENTRY:
303#endif
274 if (gentry.regnum >= num_var_ranges) 304 if (gentry.regnum >= num_var_ranges)
275 return -EINVAL; 305 return -EINVAL;
276 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); 306 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 16bb7ea87145..0acfb6a5a220 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -50,7 +50,7 @@ u32 num_var_ranges = 0;
50unsigned int *usage_table; 50unsigned int *usage_table;
51static DEFINE_MUTEX(mtrr_mutex); 51static DEFINE_MUTEX(mtrr_mutex);
52 52
53u32 size_or_mask, size_and_mask; 53u64 size_or_mask, size_and_mask;
54 54
55static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; 55static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
56 56
@@ -662,8 +662,8 @@ void __init mtrr_bp_init(void)
662 boot_cpu_data.x86_mask == 0x4)) 662 boot_cpu_data.x86_mask == 0x4))
663 phys_addr = 36; 663 phys_addr = 36;
664 664
665 size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); 665 size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
666 size_and_mask = ~size_or_mask & 0xfff00000; 666 size_and_mask = ~size_or_mask & 0xfffff00000ULL;
667 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && 667 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
668 boot_cpu_data.x86 == 6) { 668 boot_cpu_data.x86 == 6) {
669 /* VIA C* family have Intel style MTRRs, but 669 /* VIA C* family have Intel style MTRRs, but
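The masks become u64 because `1 << (phys_addr - PAGE_SHIFT)` stops fitting a 32-bit type once the CPU reports 44 or more physical address bits (the shift count reaches 32 with 4 KB pages). A worked example of the new 64-bit arithmetic; the 0xfffff00000ULL constant is the one used in the hunk, everything else is illustration.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* With PAGE_SHIFT = 12, phys_addr >= 44 needs a shift of 32 or more,
	 * which the old 32-bit size_or_mask/size_and_mask could not express. */
	for (unsigned int phys_addr = 36; phys_addr <= 48; phys_addr += 4) {
		uint64_t or_mask  = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
		uint64_t and_mask = ~or_mask & 0xfffff00000ULL;

		printf("phys_addr %2u: size_or_mask=%#018llx size_and_mask=%#llx\n",
		       phys_addr,
		       (unsigned long long)or_mask,
		       (unsigned long long)and_mask);
	}
	return 0;
}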
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index d61ea9db6cfe..289dfe6030e3 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -84,7 +84,7 @@ void get_mtrr_state(void);
84 84
85extern void set_mtrr_ops(struct mtrr_ops * ops); 85extern void set_mtrr_ops(struct mtrr_ops * ops);
86 86
87extern u32 size_or_mask, size_and_mask; 87extern u64 size_or_mask, size_and_mask;
88extern struct mtrr_ops * mtrr_if; 88extern struct mtrr_ops * mtrr_if;
89 89
90#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 90#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 6624d8583c42..47e3ebbfb28d 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, 30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, 31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
32 NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", 32 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow",
33 33
34 /* Transmeta-defined */ 34 /* Transmeta-defined */
35 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, 35 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
47 /* Intel-defined (#2) */ 47 /* Intel-defined (#2) */
48 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", 48 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
49 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, 49 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
50 NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, 50 NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
51 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 51 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
52 52
53 /* VIA/Cyrix/Centaur-defined */ 53 /* VIA/Cyrix/Centaur-defined */
@@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
57 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 57 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
58 58
59 /* AMD-defined (#2) */ 59 /* AMD-defined (#2) */
60 "lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL, 60 "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm",
61 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 61 "sse4a", "misalignsse",
62 "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 64 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64 }; 65 };
@@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
69 "ttp", /* thermal trip */ 70 "ttp", /* thermal trip */
70 "tm", 71 "tm",
71 "stc", 72 "stc",
73 "100mhzsteps",
74 "hwpstate",
72 NULL, 75 NULL,
73 /* nothing */ /* constant_tsc - moved to flags */ 76 NULL, /* constant_tsc - moved to flags */
77 /* nothing */
74 }; 78 };
75 struct cpuinfo_x86 *c = v; 79 struct cpuinfo_x86 *c = v;
76 int i, n = c - cpu_data; 80 int i, n = c - cpu_data;
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 4056fb7d2cdf..5678d46863c6 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -9,7 +9,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 9{
10 unsigned int cap_mask, uk, max, dummy; 10 unsigned int cap_mask, uk, max, dummy;
11 unsigned int cms_rev1, cms_rev2; 11 unsigned int cms_rev1, cms_rev2;
12 unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; 12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 13 char cpu_info[65];
14 14
15 get_model_name(c); /* Same as AMD/Cyrix */ 15 get_model_name(c); /* Same as AMD/Cyrix */
@@ -72,6 +72,9 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
72 wrmsr(0x80860004, ~0, uk); 72 wrmsr(0x80860004, ~0, uk);
73 c->x86_capability[0] = cpuid_edx(0x00000001); 73 c->x86_capability[0] = cpuid_edx(0x00000001);
74 wrmsr(0x80860004, cap_mask, uk); 74 wrmsr(0x80860004, cap_mask, uk);
75
76 /* All Transmeta CPUs have a constant TSC */
77 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
75 78
76 /* If we can run i686 user-space code, call us an i686 */ 79 /* If we can run i686 user-space code, call us an i686 */
77#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV) 80#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV)
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 4da75fa3208d..eeae0d992337 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -48,7 +48,6 @@ static struct class *cpuid_class;
48#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
49 49
50struct cpuid_command { 50struct cpuid_command {
51 int cpu;
52 u32 reg; 51 u32 reg;
53 u32 *data; 52 u32 *data;
54}; 53};
@@ -57,8 +56,7 @@ static void cpuid_smp_cpuid(void *cmd_block)
57{ 56{
58 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; 57 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
59 58
60 if (cmd->cpu == smp_processor_id()) 59 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
61 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
62 &cmd->data[3]); 60 &cmd->data[3]);
63} 61}
64 62
@@ -70,11 +68,10 @@ static inline void do_cpuid(int cpu, u32 reg, u32 * data)
70 if (cpu == smp_processor_id()) { 68 if (cpu == smp_processor_id()) {
71 cpuid(reg, &data[0], &data[1], &data[2], &data[3]); 69 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
72 } else { 70 } else {
73 cmd.cpu = cpu;
74 cmd.reg = reg; 71 cmd.reg = reg;
75 cmd.data = data; 72 cmd.data = data;
76 73
77 smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1); 74 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
78 } 75 }
79 preempt_enable(); 76 preempt_enable();
80} 77}
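With this change the driver no longer broadcasts cpuid_smp_cpuid() to every CPU and has each one check whether it was the target; smp_call_function_single() runs it on the requested CPU only. From user space the result is still read through /dev/cpu/N/cpuid. A usage sketch, assuming the device keeps its usual convention that the file offset selects the CPUID leaf and each read returns the 16 bytes of EAX..EDX (that convention is not restated in the hunk itself).

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* offset 0 = CPUID leaf 0; one read returns EAX, EBX, ECX, EDX */
	if (pread(fd, regs, sizeof(regs), 0) != sizeof(regs)) {
		perror("pread");
		close(fd);
		return 1;
	}
	printf("max leaf %u, vendor regs %08x %08x %08x\n",
	       regs[0], regs[1], regs[3], regs[2]);
	close(fd);
	return 0;
}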
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index f391abcf7da9..70f39560846a 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -14,6 +14,7 @@
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/page.h> 15#include <asm/page.h>
16#include <asm/e820.h> 16#include <asm/e820.h>
17#include <asm/setup.h>
17 18
18#ifdef CONFIG_EFI 19#ifdef CONFIG_EFI
19int efi_enabled = 0; 20int efi_enabled = 0;
@@ -156,21 +157,22 @@ static struct resource standard_io_resources[] = { {
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO 157 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} }; 158} };
158 159
159static int romsignature(const unsigned char *x) 160#define ROMSIGNATURE 0xaa55
161
162static int __init romsignature(const unsigned char *rom)
160{ 163{
161 unsigned short sig; 164 unsigned short sig;
162 int ret = 0; 165
163 if (probe_kernel_address((const unsigned short *)x, sig) == 0) 166 return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
164 ret = (sig == 0xaa55); 167 sig == ROMSIGNATURE;
165 return ret;
166} 168}
167 169
168static int __init romchecksum(unsigned char *rom, unsigned long length) 170static int __init romchecksum(unsigned char *rom, unsigned long length)
169{ 171{
170 unsigned char *p, sum = 0; 172 unsigned char sum;
171 173
172 for (p = rom; p < rom + length; p++) 174 for (sum = 0; length; length--)
173 sum += *p; 175 sum += *rom++;
174 return sum == 0; 176 return sum == 0;
175} 177}
176 178
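The cleaned-up romsignature() now folds the probe_kernel_address() result and the 0xaa55 comparison into one expression, and romchecksum() sums bytes with a counted loop. A sketch of how the two checks would typically cooperate when scanning a legacy option-ROM window, applied to an in-memory buffer so it runs in user space; the 2 KB scan step and the "length byte at offset 2, in 512-byte units" rule are conventional ROM-header facts assumed here, not something shown in the hunk.

#include <stddef.h>
#include <stdio.h>

#define ROMSIGNATURE 0xaa55

static int romsignature(const unsigned char *rom)
{
	return rom[0] == (ROMSIGNATURE & 0xff) && rom[1] == (ROMSIGNATURE >> 8);
}

static int romchecksum(const unsigned char *rom, unsigned long length)
{
	unsigned char sum;

	for (sum = 0; length; length--)
		sum += *rom++;
	return sum == 0;
}

static void scan_roms(const unsigned char *window, size_t size)
{
	for (size_t off = 0; off + 512 <= size; off += 2048) {
		const unsigned char *rom = window + off;
		unsigned long length = rom[2] * 512;	/* header length byte */

		if (romsignature(rom) && length && length <= size - off &&
		    romchecksum(rom, length))
			printf("option ROM at offset %#zx, %lu bytes\n",
			       off, length);
	}
}

int main(void)
{
	/* 0x55 0xaa signature, length byte 1 => 512 bytes; the header bytes
	 * already sum to 0 mod 256, so the checksum check passes. */
	unsigned char dummy[4096] = { 0x55, 0xaa, 1 };

	scan_roms(dummy, sizeof(dummy));
	return 0;
}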
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5e47683fc63a..18bddcb8e9e8 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,7 +30,7 @@
30 * 18(%esp) - %eax 30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds 31 * 1C(%esp) - %ds
32 * 20(%esp) - %es 32 * 20(%esp) - %es
33 * 24(%esp) - %gs 33 * 24(%esp) - %fs
34 * 28(%esp) - orig_eax 34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %eip 35 * 2C(%esp) - %eip
36 * 30(%esp) - %cs 36 * 30(%esp) - %cs
@@ -99,9 +99,9 @@ VM_MASK = 0x00020000
99 99
100#define SAVE_ALL \ 100#define SAVE_ALL \
101 cld; \ 101 cld; \
102 pushl %gs; \ 102 pushl %fs; \
103 CFI_ADJUST_CFA_OFFSET 4;\ 103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET gs, 0;*/\ 104 /*CFI_REL_OFFSET fs, 0;*/\
105 pushl %es; \ 105 pushl %es; \
106 CFI_ADJUST_CFA_OFFSET 4;\ 106 CFI_ADJUST_CFA_OFFSET 4;\
107 /*CFI_REL_OFFSET es, 0;*/\ 107 /*CFI_REL_OFFSET es, 0;*/\
@@ -133,7 +133,7 @@ VM_MASK = 0x00020000
133 movl %edx, %ds; \ 133 movl %edx, %ds; \
134 movl %edx, %es; \ 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \ 135 movl $(__KERNEL_PDA), %edx; \
136 movl %edx, %gs 136 movl %edx, %fs
137 137
138#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
139 popl %ebx; \ 139 popl %ebx; \
@@ -166,9 +166,9 @@ VM_MASK = 0x00020000
1662: popl %es; \ 1662: popl %es; \
167 CFI_ADJUST_CFA_OFFSET -4;\ 167 CFI_ADJUST_CFA_OFFSET -4;\
168 /*CFI_RESTORE es;*/\ 168 /*CFI_RESTORE es;*/\
1693: popl %gs; \ 1693: popl %fs; \
170 CFI_ADJUST_CFA_OFFSET -4;\ 170 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE gs;*/\ 171 /*CFI_RESTORE fs;*/\
172.pushsection .fixup,"ax"; \ 172.pushsection .fixup,"ax"; \
1734: movl $0,(%esp); \ 1734: movl $0,(%esp); \
174 jmp 1b; \ 174 jmp 1b; \
@@ -227,6 +227,7 @@ ENTRY(ret_from_fork)
227 CFI_ADJUST_CFA_OFFSET -4 227 CFI_ADJUST_CFA_OFFSET -4
228 jmp syscall_exit 228 jmp syscall_exit
229 CFI_ENDPROC 229 CFI_ENDPROC
230END(ret_from_fork)
230 231
231/* 232/*
232 * Return to user mode is not as complex as all this looks, 233 * Return to user mode is not as complex as all this looks,
@@ -258,6 +259,7 @@ ENTRY(resume_userspace)
258 # int/exception return? 259 # int/exception return?
259 jne work_pending 260 jne work_pending
260 jmp restore_all 261 jmp restore_all
262END(ret_from_exception)
261 263
262#ifdef CONFIG_PREEMPT 264#ifdef CONFIG_PREEMPT
263ENTRY(resume_kernel) 265ENTRY(resume_kernel)
@@ -272,6 +274,7 @@ need_resched:
272 jz restore_all 274 jz restore_all
273 call preempt_schedule_irq 275 call preempt_schedule_irq
274 jmp need_resched 276 jmp need_resched
277END(resume_kernel)
275#endif 278#endif
276 CFI_ENDPROC 279 CFI_ENDPROC
277 280
@@ -349,16 +352,17 @@ sysenter_past_esp:
349 movl PT_OLDESP(%esp), %ecx 352 movl PT_OLDESP(%esp), %ecx
350 xorl %ebp,%ebp 353 xorl %ebp,%ebp
351 TRACE_IRQS_ON 354 TRACE_IRQS_ON
3521: mov PT_GS(%esp), %gs 3551: mov PT_FS(%esp), %fs
353 ENABLE_INTERRUPTS_SYSEXIT 356 ENABLE_INTERRUPTS_SYSEXIT
354 CFI_ENDPROC 357 CFI_ENDPROC
355.pushsection .fixup,"ax" 358.pushsection .fixup,"ax"
3562: movl $0,PT_GS(%esp) 3592: movl $0,PT_FS(%esp)
357 jmp 1b 360 jmp 1b
358.section __ex_table,"a" 361.section __ex_table,"a"
359 .align 4 362 .align 4
360 .long 1b,2b 363 .long 1b,2b
361.popsection 364.popsection
365ENDPROC(sysenter_entry)
362 366
363 # system call handler stub 367 # system call handler stub
364ENTRY(system_call) 368ENTRY(system_call)
@@ -459,6 +463,7 @@ ldt_ss:
459 CFI_ADJUST_CFA_OFFSET -8 463 CFI_ADJUST_CFA_OFFSET -8
460 jmp restore_nocheck 464 jmp restore_nocheck
461 CFI_ENDPROC 465 CFI_ENDPROC
466ENDPROC(system_call)
462 467
463 # perform work that needs to be done immediately before resumption 468 # perform work that needs to be done immediately before resumption
464 ALIGN 469 ALIGN
@@ -504,6 +509,7 @@ work_notifysig_v86:
504 xorl %edx, %edx 509 xorl %edx, %edx
505 call do_notify_resume 510 call do_notify_resume
506 jmp resume_userspace_sig 511 jmp resume_userspace_sig
512END(work_pending)
507 513
508 # perform syscall exit tracing 514 # perform syscall exit tracing
509 ALIGN 515 ALIGN
@@ -519,6 +525,7 @@ syscall_trace_entry:
519 cmpl $(nr_syscalls), %eax 525 cmpl $(nr_syscalls), %eax
520 jnae syscall_call 526 jnae syscall_call
521 jmp syscall_exit 527 jmp syscall_exit
528END(syscall_trace_entry)
522 529
523 # perform syscall exit tracing 530 # perform syscall exit tracing
524 ALIGN 531 ALIGN
@@ -532,6 +539,7 @@ syscall_exit_work:
532 movl $1, %edx 539 movl $1, %edx
533 call do_syscall_trace 540 call do_syscall_trace
534 jmp resume_userspace 541 jmp resume_userspace
542END(syscall_exit_work)
535 CFI_ENDPROC 543 CFI_ENDPROC
536 544
537 RING0_INT_FRAME # can't unwind into user space anyway 545 RING0_INT_FRAME # can't unwind into user space anyway
@@ -542,15 +550,17 @@ syscall_fault:
542 GET_THREAD_INFO(%ebp) 550 GET_THREAD_INFO(%ebp)
543 movl $-EFAULT,PT_EAX(%esp) 551 movl $-EFAULT,PT_EAX(%esp)
544 jmp resume_userspace 552 jmp resume_userspace
553END(syscall_fault)
545 554
546syscall_badsys: 555syscall_badsys:
547 movl $-ENOSYS,PT_EAX(%esp) 556 movl $-ENOSYS,PT_EAX(%esp)
548 jmp resume_userspace 557 jmp resume_userspace
558END(syscall_badsys)
549 CFI_ENDPROC 559 CFI_ENDPROC
550 560
551#define FIXUP_ESPFIX_STACK \ 561#define FIXUP_ESPFIX_STACK \
552 /* since we are on a wrong stack, we cant make it a C code :( */ \ 562 /* since we are on a wrong stack, we cant make it a C code :( */ \
553 movl %gs:PDA_cpu, %ebx; \ 563 movl %fs:PDA_cpu, %ebx; \
554 PER_CPU(cpu_gdt_descr, %ebx); \ 564 PER_CPU(cpu_gdt_descr, %ebx); \
555 movl GDS_address(%ebx), %ebx; \ 565 movl GDS_address(%ebx), %ebx; \
556 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ 566 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
@@ -581,9 +591,9 @@ syscall_badsys:
581ENTRY(interrupt) 591ENTRY(interrupt)
582.text 592.text
583 593
584vector=0
585ENTRY(irq_entries_start) 594ENTRY(irq_entries_start)
586 RING0_INT_FRAME 595 RING0_INT_FRAME
596vector=0
587.rept NR_IRQS 597.rept NR_IRQS
588 ALIGN 598 ALIGN
589 .if vector 599 .if vector
@@ -592,11 +602,16 @@ ENTRY(irq_entries_start)
5921: pushl $~(vector) 6021: pushl $~(vector)
593 CFI_ADJUST_CFA_OFFSET 4 603 CFI_ADJUST_CFA_OFFSET 4
594 jmp common_interrupt 604 jmp common_interrupt
595.data 605 .previous
596 .long 1b 606 .long 1b
597.text 607 .text
598vector=vector+1 608vector=vector+1
599.endr 609.endr
610END(irq_entries_start)
611
612.previous
613END(interrupt)
614.previous
600 615
601/* 616/*
602 * the CPU automatically disables interrupts when executing an IRQ vector, 617 * the CPU automatically disables interrupts when executing an IRQ vector,
@@ -609,6 +624,7 @@ common_interrupt:
609 movl %esp,%eax 624 movl %esp,%eax
610 call do_IRQ 625 call do_IRQ
611 jmp ret_from_intr 626 jmp ret_from_intr
627ENDPROC(common_interrupt)
612 CFI_ENDPROC 628 CFI_ENDPROC
613 629
614#define BUILD_INTERRUPT(name, nr) \ 630#define BUILD_INTERRUPT(name, nr) \
@@ -621,18 +637,24 @@ ENTRY(name) \
621 movl %esp,%eax; \ 637 movl %esp,%eax; \
622 call smp_/**/name; \ 638 call smp_/**/name; \
623 jmp ret_from_intr; \ 639 jmp ret_from_intr; \
624 CFI_ENDPROC 640 CFI_ENDPROC; \
641ENDPROC(name)
625 642
626/* The include is where all of the SMP etc. interrupts come from */ 643/* The include is where all of the SMP etc. interrupts come from */
627#include "entry_arch.h" 644#include "entry_arch.h"
628 645
646/* This alternate entry is needed because we hijack the apic LVTT */
647#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
648BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
649#endif
650
629KPROBE_ENTRY(page_fault) 651KPROBE_ENTRY(page_fault)
630 RING0_EC_FRAME 652 RING0_EC_FRAME
631 pushl $do_page_fault 653 pushl $do_page_fault
632 CFI_ADJUST_CFA_OFFSET 4 654 CFI_ADJUST_CFA_OFFSET 4
633 ALIGN 655 ALIGN
634error_code: 656error_code:
635 /* the function address is in %gs's slot on the stack */ 657 /* the function address is in %fs's slot on the stack */
636 pushl %es 658 pushl %es
637 CFI_ADJUST_CFA_OFFSET 4 659 CFI_ADJUST_CFA_OFFSET 4
638 /*CFI_REL_OFFSET es, 0*/ 660 /*CFI_REL_OFFSET es, 0*/
@@ -661,20 +683,20 @@ error_code:
661 CFI_ADJUST_CFA_OFFSET 4 683 CFI_ADJUST_CFA_OFFSET 4
662 CFI_REL_OFFSET ebx, 0 684 CFI_REL_OFFSET ebx, 0
663 cld 685 cld
664 pushl %gs 686 pushl %fs
665 CFI_ADJUST_CFA_OFFSET 4 687 CFI_ADJUST_CFA_OFFSET 4
666 /*CFI_REL_OFFSET gs, 0*/ 688 /*CFI_REL_OFFSET fs, 0*/
667 movl $(__KERNEL_PDA), %ecx 689 movl $(__KERNEL_PDA), %ecx
668 movl %ecx, %gs 690 movl %ecx, %fs
669 UNWIND_ESPFIX_STACK 691 UNWIND_ESPFIX_STACK
670 popl %ecx 692 popl %ecx
671 CFI_ADJUST_CFA_OFFSET -4 693 CFI_ADJUST_CFA_OFFSET -4
672 /*CFI_REGISTER es, ecx*/ 694 /*CFI_REGISTER es, ecx*/
673 movl PT_GS(%esp), %edi # get the function address 695 movl PT_FS(%esp), %edi # get the function address
674 movl PT_ORIG_EAX(%esp), %edx # get the error code 696 movl PT_ORIG_EAX(%esp), %edx # get the error code
675 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart 697 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
676 mov %ecx, PT_GS(%esp) 698 mov %ecx, PT_FS(%esp)
677 /*CFI_REL_OFFSET gs, ES*/ 699 /*CFI_REL_OFFSET fs, ES*/
678 movl $(__USER_DS), %ecx 700 movl $(__USER_DS), %ecx
679 movl %ecx, %ds 701 movl %ecx, %ds
680 movl %ecx, %es 702 movl %ecx, %es
@@ -692,6 +714,7 @@ ENTRY(coprocessor_error)
692 CFI_ADJUST_CFA_OFFSET 4 714 CFI_ADJUST_CFA_OFFSET 4
693 jmp error_code 715 jmp error_code
694 CFI_ENDPROC 716 CFI_ENDPROC
717END(coprocessor_error)
695 718
696ENTRY(simd_coprocessor_error) 719ENTRY(simd_coprocessor_error)
697 RING0_INT_FRAME 720 RING0_INT_FRAME
@@ -701,6 +724,7 @@ ENTRY(simd_coprocessor_error)
701 CFI_ADJUST_CFA_OFFSET 4 724 CFI_ADJUST_CFA_OFFSET 4
702 jmp error_code 725 jmp error_code
703 CFI_ENDPROC 726 CFI_ENDPROC
727END(simd_coprocessor_error)
704 728
705ENTRY(device_not_available) 729ENTRY(device_not_available)
706 RING0_INT_FRAME 730 RING0_INT_FRAME
@@ -721,6 +745,7 @@ device_not_available_emulate:
721 CFI_ADJUST_CFA_OFFSET -4 745 CFI_ADJUST_CFA_OFFSET -4
722 jmp ret_from_exception 746 jmp ret_from_exception
723 CFI_ENDPROC 747 CFI_ENDPROC
748END(device_not_available)
724 749
725/* 750/*
726 * Debug traps and NMI can happen at the one SYSENTER instruction 751 * Debug traps and NMI can happen at the one SYSENTER instruction
@@ -864,10 +889,12 @@ ENTRY(native_iret)
864 .align 4 889 .align 4
865 .long 1b,iret_exc 890 .long 1b,iret_exc
866.previous 891.previous
892END(native_iret)
867 893
868ENTRY(native_irq_enable_sysexit) 894ENTRY(native_irq_enable_sysexit)
869 sti 895 sti
870 sysexit 896 sysexit
897END(native_irq_enable_sysexit)
871#endif 898#endif
872 899
873KPROBE_ENTRY(int3) 900KPROBE_ENTRY(int3)
@@ -890,6 +917,7 @@ ENTRY(overflow)
890 CFI_ADJUST_CFA_OFFSET 4 917 CFI_ADJUST_CFA_OFFSET 4
891 jmp error_code 918 jmp error_code
892 CFI_ENDPROC 919 CFI_ENDPROC
920END(overflow)
893 921
894ENTRY(bounds) 922ENTRY(bounds)
895 RING0_INT_FRAME 923 RING0_INT_FRAME
@@ -899,6 +927,7 @@ ENTRY(bounds)
899 CFI_ADJUST_CFA_OFFSET 4 927 CFI_ADJUST_CFA_OFFSET 4
900 jmp error_code 928 jmp error_code
901 CFI_ENDPROC 929 CFI_ENDPROC
930END(bounds)
902 931
903ENTRY(invalid_op) 932ENTRY(invalid_op)
904 RING0_INT_FRAME 933 RING0_INT_FRAME
@@ -908,6 +937,7 @@ ENTRY(invalid_op)
908 CFI_ADJUST_CFA_OFFSET 4 937 CFI_ADJUST_CFA_OFFSET 4
909 jmp error_code 938 jmp error_code
910 CFI_ENDPROC 939 CFI_ENDPROC
940END(invalid_op)
911 941
912ENTRY(coprocessor_segment_overrun) 942ENTRY(coprocessor_segment_overrun)
913 RING0_INT_FRAME 943 RING0_INT_FRAME
@@ -917,6 +947,7 @@ ENTRY(coprocessor_segment_overrun)
917 CFI_ADJUST_CFA_OFFSET 4 947 CFI_ADJUST_CFA_OFFSET 4
918 jmp error_code 948 jmp error_code
919 CFI_ENDPROC 949 CFI_ENDPROC
950END(coprocessor_segment_overrun)
920 951
921ENTRY(invalid_TSS) 952ENTRY(invalid_TSS)
922 RING0_EC_FRAME 953 RING0_EC_FRAME
@@ -924,6 +955,7 @@ ENTRY(invalid_TSS)
924 CFI_ADJUST_CFA_OFFSET 4 955 CFI_ADJUST_CFA_OFFSET 4
925 jmp error_code 956 jmp error_code
926 CFI_ENDPROC 957 CFI_ENDPROC
958END(invalid_TSS)
927 959
928ENTRY(segment_not_present) 960ENTRY(segment_not_present)
929 RING0_EC_FRAME 961 RING0_EC_FRAME
@@ -931,6 +963,7 @@ ENTRY(segment_not_present)
931 CFI_ADJUST_CFA_OFFSET 4 963 CFI_ADJUST_CFA_OFFSET 4
932 jmp error_code 964 jmp error_code
933 CFI_ENDPROC 965 CFI_ENDPROC
966END(segment_not_present)
934 967
935ENTRY(stack_segment) 968ENTRY(stack_segment)
936 RING0_EC_FRAME 969 RING0_EC_FRAME
@@ -938,6 +971,7 @@ ENTRY(stack_segment)
938 CFI_ADJUST_CFA_OFFSET 4 971 CFI_ADJUST_CFA_OFFSET 4
939 jmp error_code 972 jmp error_code
940 CFI_ENDPROC 973 CFI_ENDPROC
974END(stack_segment)
941 975
942KPROBE_ENTRY(general_protection) 976KPROBE_ENTRY(general_protection)
943 RING0_EC_FRAME 977 RING0_EC_FRAME
@@ -953,6 +987,7 @@ ENTRY(alignment_check)
953 CFI_ADJUST_CFA_OFFSET 4 987 CFI_ADJUST_CFA_OFFSET 4
954 jmp error_code 988 jmp error_code
955 CFI_ENDPROC 989 CFI_ENDPROC
990END(alignment_check)
956 991
957ENTRY(divide_error) 992ENTRY(divide_error)
958 RING0_INT_FRAME 993 RING0_INT_FRAME
@@ -962,6 +997,7 @@ ENTRY(divide_error)
962 CFI_ADJUST_CFA_OFFSET 4 997 CFI_ADJUST_CFA_OFFSET 4
963 jmp error_code 998 jmp error_code
964 CFI_ENDPROC 999 CFI_ENDPROC
1000END(divide_error)
965 1001
966#ifdef CONFIG_X86_MCE 1002#ifdef CONFIG_X86_MCE
967ENTRY(machine_check) 1003ENTRY(machine_check)
@@ -972,6 +1008,7 @@ ENTRY(machine_check)
972 CFI_ADJUST_CFA_OFFSET 4 1008 CFI_ADJUST_CFA_OFFSET 4
973 jmp error_code 1009 jmp error_code
974 CFI_ENDPROC 1010 CFI_ENDPROC
1011END(machine_check)
975#endif 1012#endif
976 1013
977ENTRY(spurious_interrupt_bug) 1014ENTRY(spurious_interrupt_bug)
@@ -982,6 +1019,7 @@ ENTRY(spurious_interrupt_bug)
982 CFI_ADJUST_CFA_OFFSET 4 1019 CFI_ADJUST_CFA_OFFSET 4
983 jmp error_code 1020 jmp error_code
984 CFI_ENDPROC 1021 CFI_ENDPROC
1022END(spurious_interrupt_bug)
985 1023
986ENTRY(kernel_thread_helper) 1024ENTRY(kernel_thread_helper)
987 pushl $0 # fake return address for unwinder 1025 pushl $0 # fake return address for unwinder
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index cb9abdfced9b..3fa7f9389afe 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -53,6 +53,7 @@
53 * any particular GDT layout, because we load our own as soon as we 53 * any particular GDT layout, because we load our own as soon as we
54 * can. 54 * can.
55 */ 55 */
56.section .text.head,"ax",@progbits
56ENTRY(startup_32) 57ENTRY(startup_32)
57 58
58#ifdef CONFIG_PARAVIRT 59#ifdef CONFIG_PARAVIRT
@@ -141,16 +142,25 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
141 jb 10b 142 jb 10b
142 movl %edi,(init_pg_tables_end - __PAGE_OFFSET) 143 movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
143 144
144#ifdef CONFIG_SMP
145 xorl %ebx,%ebx /* This is the boot CPU (BSP) */ 145 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
146 jmp 3f 146 jmp 3f
147
148/* 147/*
149 * Non-boot CPU entry point; entered from trampoline.S 148 * Non-boot CPU entry point; entered from trampoline.S
150 * We can't lgdt here, because lgdt itself uses a data segment, but 149 * We can't lgdt here, because lgdt itself uses a data segment, but
151 * we know the trampoline has already loaded the boot_gdt_table GDT 150 * we know the trampoline has already loaded the boot_gdt_table GDT
152 * for us. 151 * for us.
152 *
153 * If cpu hotplug is not supported then this code can go in init section
154 * which will be freed later
153 */ 155 */
156
157#ifdef CONFIG_HOTPLUG_CPU
158.section .text,"ax",@progbits
159#else
160.section .init.text,"ax",@progbits
161#endif
162
163#ifdef CONFIG_SMP
154ENTRY(startup_32_smp) 164ENTRY(startup_32_smp)
155 cld 165 cld
156 movl $(__BOOT_DS),%eax 166 movl $(__BOOT_DS),%eax
@@ -208,8 +218,8 @@ ENTRY(startup_32_smp)
208 xorl %ebx,%ebx 218 xorl %ebx,%ebx
209 incl %ebx 219 incl %ebx
210 220
2113:
212#endif /* CONFIG_SMP */ 221#endif /* CONFIG_SMP */
2223:
213 223
214/* 224/*
215 * Enable paging 225 * Enable paging
@@ -309,7 +319,7 @@ is386: movl $2,%ecx # set MP
309 319
310 call check_x87 320 call check_x87
311 call setup_pda 321 call setup_pda
312 lgdt cpu_gdt_descr 322 lgdt early_gdt_descr
313 lidt idt_descr 323 lidt idt_descr
314 ljmp $(__KERNEL_CS),$1f 324 ljmp $(__KERNEL_CS),$1f
3151: movl $(__KERNEL_DS),%eax # reload all the segment registers 3251: movl $(__KERNEL_DS),%eax # reload all the segment registers
@@ -319,12 +329,12 @@ is386: movl $2,%ecx # set MP
319 movl %eax,%ds 329 movl %eax,%ds
320 movl %eax,%es 330 movl %eax,%es
321 331
322 xorl %eax,%eax # Clear FS and LDT 332 xorl %eax,%eax # Clear GS and LDT
323 movl %eax,%fs 333 movl %eax,%gs
324 lldt %ax 334 lldt %ax
325 335
326 movl $(__KERNEL_PDA),%eax 336 movl $(__KERNEL_PDA),%eax
327 mov %eax,%gs 337 mov %eax,%fs
328 338
329 cld # gcc2 wants the direction flag cleared at all times 339 cld # gcc2 wants the direction flag cleared at all times
330 pushl $0 # fake return address for unwinder 340 pushl $0 # fake return address for unwinder
@@ -360,12 +370,12 @@ check_x87:
360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be 370 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
361 * that CPU's GDT and PDA. 371 * that CPU's GDT and PDA.
362 */ 372 */
363setup_pda: 373ENTRY(setup_pda)
364 /* get the PDA pointer */ 374 /* get the PDA pointer */
365 movl start_pda, %eax 375 movl start_pda, %eax
366 376
367 /* slot the PDA address into the GDT */ 377 /* slot the PDA address into the GDT */
368 mov cpu_gdt_descr+2, %ecx 378 mov early_gdt_descr+2, %ecx
369 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ 379 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
370 shr $16, %eax 380 shr $16, %eax
371 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ 381 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
@@ -492,6 +502,7 @@ ignore_int:
492#endif 502#endif
493 iret 503 iret
494 504
505.section .text
495#ifdef CONFIG_PARAVIRT 506#ifdef CONFIG_PARAVIRT
496startup_paravirt: 507startup_paravirt:
497 cld 508 cld
@@ -502,10 +513,11 @@ startup_paravirt:
502 pushl %ecx 513 pushl %ecx
503 pushl %eax 514 pushl %eax
504 515
505 /* paravirt.o is last in link, and that probe fn never returns */
506 pushl $__start_paravirtprobe 516 pushl $__start_paravirtprobe
5071: 5171:
508 movl 0(%esp), %eax 518 movl 0(%esp), %eax
519 cmpl $__stop_paravirtprobe, %eax
520 je unhandled_paravirt
509 pushl (%eax) 521 pushl (%eax)
510 movl 8(%esp), %eax 522 movl 8(%esp), %eax
511 call *(%esp) 523 call *(%esp)
@@ -517,6 +529,10 @@ startup_paravirt:
517 529
518 addl $4, (%esp) 530 addl $4, (%esp)
519 jmp 1b 531 jmp 1b
532
533unhandled_paravirt:
534 /* Nothing wanted us: we're screwed. */
535 ud2
520#endif 536#endif
521 537
522/* 538/*
@@ -581,7 +597,7 @@ idt_descr:
581 597
582# boot GDT descriptor (later on used by CPU#0): 598# boot GDT descriptor (later on used by CPU#0):
583 .word 0 # 32 bit align gdt_desc.address 599 .word 0 # 32 bit align gdt_desc.address
584ENTRY(cpu_gdt_descr) 600ENTRY(early_gdt_descr)
585 .word GDT_ENTRIES*8-1 601 .word GDT_ENTRIES*8-1
586 .long cpu_gdt_table 602 .long cpu_gdt_table
587 603
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 0b29d41322a2..e1006b7acc9e 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -1,4 +1,5 @@
1#include <linux/clocksource.h> 1#include <linux/clocksource.h>
2#include <linux/clockchips.h>
2#include <linux/errno.h> 3#include <linux/errno.h>
3#include <linux/hpet.h> 4#include <linux/hpet.h>
4#include <linux/init.h> 5#include <linux/init.h>
@@ -6,17 +7,278 @@
6#include <asm/hpet.h> 7#include <asm/hpet.h>
7#include <asm/io.h> 8#include <asm/io.h>
8 9
10extern struct clock_event_device *global_clock_event;
11
9#define HPET_MASK CLOCKSOURCE_MASK(32) 12#define HPET_MASK CLOCKSOURCE_MASK(32)
10#define HPET_SHIFT 22 13#define HPET_SHIFT 22
11 14
12/* FSEC = 10^-15 NSEC = 10^-9 */ 15/* FSEC = 10^-15 NSEC = 10^-9 */
13#define FSEC_PER_NSEC 1000000 16#define FSEC_PER_NSEC 1000000
14 17
15static void __iomem *hpet_ptr; 18/*
19 * HPET address is set in acpi/boot.c, when an ACPI entry exists
20 */
21unsigned long hpet_address;
22static void __iomem * hpet_virt_address;
23
24static inline unsigned long hpet_readl(unsigned long a)
25{
26 return readl(hpet_virt_address + a);
27}
28
29static inline void hpet_writel(unsigned long d, unsigned long a)
30{
31 writel(d, hpet_virt_address + a);
32}
33
34/*
35 * HPET command line enable / disable
36 */
37static int boot_hpet_disable;
38
39static int __init hpet_setup(char* str)
40{
41 if (str) {
42 if (!strncmp("disable", str, 7))
43 boot_hpet_disable = 1;
44 }
45 return 1;
46}
47__setup("hpet=", hpet_setup);
48
49static inline int is_hpet_capable(void)
50{
51 return (!boot_hpet_disable && hpet_address);
52}
53
54/*
55 * HPET timer interrupt enable / disable
56 */
57static int hpet_legacy_int_enabled;
58
59/**
60 * is_hpet_enabled - check whether the hpet timer interrupt is enabled
61 */
62int is_hpet_enabled(void)
63{
64 return is_hpet_capable() && hpet_legacy_int_enabled;
65}
66
67/*
68 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
69 * timer 0 and timer 1 in case of RTC emulation.
70 */
71#ifdef CONFIG_HPET
72static void hpet_reserve_platform_timers(unsigned long id)
73{
74 struct hpet __iomem *hpet = hpet_virt_address;
75 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
76 unsigned int nrtimers, i;
77 struct hpet_data hd;
78
79 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
80
81 memset(&hd, 0, sizeof (hd));
82 hd.hd_phys_address = hpet_address;
83 hd.hd_address = hpet_virt_address;
84 hd.hd_nirqs = nrtimers;
85 hd.hd_flags = HPET_DATA_PLATFORM;
86 hpet_reserve_timer(&hd, 0);
87
88#ifdef CONFIG_HPET_EMULATE_RTC
89 hpet_reserve_timer(&hd, 1);
90#endif
91
92 hd.hd_irq[0] = HPET_LEGACY_8254;
93 hd.hd_irq[1] = HPET_LEGACY_RTC;
94
95 for (i = 2; i < nrtimers; timer++, i++)
96 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
97 Tn_INT_ROUTE_CNF_SHIFT;
98
99 hpet_alloc(&hd);
100
101}
102#else
103static void hpet_reserve_platform_timers(unsigned long id) { }
104#endif
105
106/*
107 * Common hpet info
108 */
109static unsigned long hpet_period;
110
111static void hpet_set_mode(enum clock_event_mode mode,
112 struct clock_event_device *evt);
113static int hpet_next_event(unsigned long delta,
114 struct clock_event_device *evt);
115
116/*
117 * The hpet clock event device
118 */
119static struct clock_event_device hpet_clockevent = {
120 .name = "hpet",
121 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
122 .set_mode = hpet_set_mode,
123 .set_next_event = hpet_next_event,
124 .shift = 32,
125 .irq = 0,
126};
127
128static void hpet_start_counter(void)
129{
130 unsigned long cfg = hpet_readl(HPET_CFG);
131
132 cfg &= ~HPET_CFG_ENABLE;
133 hpet_writel(cfg, HPET_CFG);
134 hpet_writel(0, HPET_COUNTER);
135 hpet_writel(0, HPET_COUNTER + 4);
136 cfg |= HPET_CFG_ENABLE;
137 hpet_writel(cfg, HPET_CFG);
138}
139
140static void hpet_enable_int(void)
141{
142 unsigned long cfg = hpet_readl(HPET_CFG);
143
144 cfg |= HPET_CFG_LEGACY;
145 hpet_writel(cfg, HPET_CFG);
146 hpet_legacy_int_enabled = 1;
147}
148
149static void hpet_set_mode(enum clock_event_mode mode,
150 struct clock_event_device *evt)
151{
152 unsigned long cfg, cmp, now;
153 uint64_t delta;
154
155 switch(mode) {
156 case CLOCK_EVT_MODE_PERIODIC:
157 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
158 delta >>= hpet_clockevent.shift;
159 now = hpet_readl(HPET_COUNTER);
160 cmp = now + (unsigned long) delta;
161 cfg = hpet_readl(HPET_T0_CFG);
162 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
163 HPET_TN_SETVAL | HPET_TN_32BIT;
164 hpet_writel(cfg, HPET_T0_CFG);
165 /*
166 * The first write after writing TN_SETVAL to the
167 * config register sets the counter value, the second
168 * write sets the period.
169 */
170 hpet_writel(cmp, HPET_T0_CMP);
171 udelay(1);
172 hpet_writel((unsigned long) delta, HPET_T0_CMP);
173 break;
174
175 case CLOCK_EVT_MODE_ONESHOT:
176 cfg = hpet_readl(HPET_T0_CFG);
177 cfg &= ~HPET_TN_PERIODIC;
178 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
179 hpet_writel(cfg, HPET_T0_CFG);
180 break;
181
182 case CLOCK_EVT_MODE_UNUSED:
183 case CLOCK_EVT_MODE_SHUTDOWN:
184 cfg = hpet_readl(HPET_T0_CFG);
185 cfg &= ~HPET_TN_ENABLE;
186 hpet_writel(cfg, HPET_T0_CFG);
187 break;
188 }
189}
190
191static int hpet_next_event(unsigned long delta,
192 struct clock_event_device *evt)
193{
194 unsigned long cnt;
195
196 cnt = hpet_readl(HPET_COUNTER);
197 cnt += delta;
198 hpet_writel(cnt, HPET_T0_CMP);
199
200 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0);
201}
202
203/*
204 * Try to setup the HPET timer
205 */
206int __init hpet_enable(void)
207{
208 unsigned long id;
209 uint64_t hpet_freq;
210
211 if (!is_hpet_capable())
212 return 0;
213
214 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
215
216 /*
217 * Read the period and check for a sane value:
218 */
219 hpet_period = hpet_readl(HPET_PERIOD);
220 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
221 goto out_nohpet;
222
223 /*
224 * The period is a femto seconds value. We need to calculate the
225 * scaled math multiplication factor for nanosecond to hpet tick
226 * conversion.
227 */
228 hpet_freq = 1000000000000000ULL;
229 do_div(hpet_freq, hpet_period);
230 hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
231 NSEC_PER_SEC, 32);
232 /* Calculate the min / max delta */
233 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
234 &hpet_clockevent);
235 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
236 &hpet_clockevent);
237
238 /*
239 * Read the HPET ID register to retrieve the IRQ routing
240 * information and the number of channels
241 */
242 id = hpet_readl(HPET_ID);
243
244#ifdef CONFIG_HPET_EMULATE_RTC
245 /*
246 * The legacy routing mode needs at least two channels: the tick timer
247 * and the rtc emulation channel.
248 */
249 if (!(id & HPET_ID_NUMBER))
250 goto out_nohpet;
251#endif
252
253 /* Start the counter */
254 hpet_start_counter();
255
256 if (id & HPET_ID_LEGSUP) {
257 hpet_enable_int();
258 hpet_reserve_platform_timers(id);
259 /*
260 * Start hpet with the boot cpu mask and make it
261 * global after the IO_APIC has been initialized.
262 */
263 hpet_clockevent.cpumask = cpumask_of_cpu(0);
264 clockevents_register_device(&hpet_clockevent);
265 global_clock_event = &hpet_clockevent;
266 return 1;
267 }
268 return 0;
16 269
270out_nohpet:
271 iounmap(hpet_virt_address);
272 hpet_virt_address = NULL;
273 return 0;
274}
275
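The mult/shift arithmetic used by hpet_enable() above can be checked in isolation. The sketch below redoes the core of div_sc() and clockevent_delta2ns() for a hypothetical 14.318180 MHz HPET; the real helpers do additional range handling, so this is only the scaled-math idea:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        uint64_t freq = 14318180;       /* hypothetical HPET frequency, Hz */
        unsigned int shift = 32;

        /* core of div_sc(): mult = freq * 2^shift / NSEC_PER_SEC */
        uint64_t mult = (freq << shift) / NSEC_PER_SEC;

        /* ns -> ticks, as the PERIODIC branch of hpet_set_mode() does */
        uint64_t tick_ns = 4000000;     /* e.g. one HZ=250 tick, 4 ms */
        uint64_t ticks = (tick_ns * mult) >> shift;

        /* ticks -> ns, the clockevent_delta2ns() direction */
        uint64_t max_ns = ((uint64_t)0x7FFFFFFF << shift) / mult;

        printf("mult = %llu\n", (unsigned long long)mult);
        printf("%llu ns -> %llu ticks\n", (unsigned long long)tick_ns,
               (unsigned long long)ticks);
        printf("max_delta_ns = %llu\n", (unsigned long long)max_ns);
        return 0;
}
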
276/*
277 * Clock source related code
278 */
17static cycle_t read_hpet(void) 279static cycle_t read_hpet(void)
18{ 280{
19 return (cycle_t)readl(hpet_ptr); 281 return (cycle_t)hpet_readl(HPET_COUNTER);
20} 282}
21 283
22static struct clocksource clocksource_hpet = { 284static struct clocksource clocksource_hpet = {
@@ -24,28 +286,17 @@ static struct clocksource clocksource_hpet = {
24 .rating = 250, 286 .rating = 250,
25 .read = read_hpet, 287 .read = read_hpet,
26 .mask = HPET_MASK, 288 .mask = HPET_MASK,
27 .mult = 0, /* set below */
28 .shift = HPET_SHIFT, 289 .shift = HPET_SHIFT,
29 .is_continuous = 1, 290 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
30}; 291};
31 292
32static int __init init_hpet_clocksource(void) 293static int __init init_hpet_clocksource(void)
33{ 294{
34 unsigned long hpet_period;
35 void __iomem* hpet_base;
36 u64 tmp; 295 u64 tmp;
37 int err;
38 296
39 if (!is_hpet_enabled()) 297 if (!hpet_virt_address)
40 return -ENODEV; 298 return -ENODEV;
41 299
42 /* calculate the hpet address: */
43 hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
44 hpet_ptr = hpet_base + HPET_COUNTER;
45
46 /* calculate the frequency: */
47 hpet_period = readl(hpet_base + HPET_PERIOD);
48
49 /* 300 /*
50 * hpet period is in femto seconds per cycle 301 * hpet period is in femto seconds per cycle
51 * so we need to convert this to ns/cyc units 302 * so we need to convert this to ns/cyc units
@@ -61,11 +312,218 @@ static int __init init_hpet_clocksource(void)
61 do_div(tmp, FSEC_PER_NSEC); 312 do_div(tmp, FSEC_PER_NSEC);
62 clocksource_hpet.mult = (u32)tmp; 313 clocksource_hpet.mult = (u32)tmp;
63 314
64 err = clocksource_register(&clocksource_hpet); 315 return clocksource_register(&clocksource_hpet);
65 if (err)
66 iounmap(hpet_base);
67
68 return err;
69} 316}
70 317
71module_init(init_hpet_clocksource); 318module_init(init_hpet_clocksource);
319
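For the clocksource direction the conversion runs the other way: the femtosecond period is scaled up by 2^shift and divided by 10^6 to get a cycles-to-nanoseconds multiplier. A standalone sketch with an assumed period and an illustrative shift value standing in for HPET_SHIFT:

#include <stdio.h>
#include <stdint.h>

#define FSEC_PER_NSEC 1000000ULL        /* 10^6 femtoseconds per nanosecond */

int main(void)
{
        uint64_t hpet_period = 69841279;        /* assumed period in fs (~14.318 MHz) */
        unsigned int shift = 22;                /* illustrative stand-in for HPET_SHIFT */

        /* init_hpet_clocksource() above: mult = (period << shift) / 10^6 */
        uint64_t mult = (hpet_period << shift) / FSEC_PER_NSEC;

        /* clocksource conversion: elapsed ns = cycles * mult >> shift */
        uint64_t cycles = 14318180;             /* roughly one second of ticks */
        uint64_t ns = (cycles * mult) >> shift;

        printf("mult = %llu, %llu cycles ~ %llu ns\n",
               (unsigned long long)mult, (unsigned long long)cycles,
               (unsigned long long)ns);
        return 0;
}
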
320#ifdef CONFIG_HPET_EMULATE_RTC
321
322/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
323 * is enabled, we support RTC interrupt functionality in software.
324 * RTC has 3 kinds of interrupts:
325 * 1) Update Interrupt - generate an interrupt every second, when the RTC clock
326 * is updated
327 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
328 * 3) Periodic Interrupt - generate a periodic interrupt, with frequencies
329 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
330 * (1) and (2) above are implemented using polling at a frequency of
331 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
332 * overhead. (DEFAULT_RTC_INT_FREQ)
333 * For (3), we use interrupts at 64Hz or user specified periodic
334 * frequency, whichever is higher.
335 */
336#include <linux/mc146818rtc.h>
337#include <linux/rtc.h>
338
339#define DEFAULT_RTC_INT_FREQ 64
340#define DEFAULT_RTC_SHIFT 6
341#define RTC_NUM_INTS 1
342
343static unsigned long hpet_rtc_flags;
344static unsigned long hpet_prev_update_sec;
345static struct rtc_time hpet_alarm_time;
346static unsigned long hpet_pie_count;
347static unsigned long hpet_t1_cmp;
348static unsigned long hpet_default_delta;
349static unsigned long hpet_pie_delta;
350static unsigned long hpet_pie_limit;
351
352/*
353 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
354 * is not supported by all HPET implementations for timer 1.
355 *
356 * hpet_rtc_timer_init() is called when the rtc is initialized.
357 */
358int hpet_rtc_timer_init(void)
359{
360 unsigned long cfg, cnt, delta, flags;
361
362 if (!is_hpet_enabled())
363 return 0;
364
365 if (!hpet_default_delta) {
366 uint64_t clc;
367
368 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
369 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
370 hpet_default_delta = (unsigned long) clc;
371 }
372
373 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
374 delta = hpet_default_delta;
375 else
376 delta = hpet_pie_delta;
377
378 local_irq_save(flags);
379
380 cnt = delta + hpet_readl(HPET_COUNTER);
381 hpet_writel(cnt, HPET_T1_CMP);
382 hpet_t1_cmp = cnt;
383
384 cfg = hpet_readl(HPET_T1_CFG);
385 cfg &= ~HPET_TN_PERIODIC;
386 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
387 hpet_writel(cfg, HPET_T1_CFG);
388
389 local_irq_restore(flags);
390
391 return 1;
392}
393
394/*
395 * The functions below are called from rtc driver.
396 * Return 0 if HPET is not being used.
397 * Otherwise do the necessary changes and return 1.
398 */
399int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
400{
401 if (!is_hpet_enabled())
402 return 0;
403
404 hpet_rtc_flags &= ~bit_mask;
405 return 1;
406}
407
408int hpet_set_rtc_irq_bit(unsigned long bit_mask)
409{
410 unsigned long oldbits = hpet_rtc_flags;
411
412 if (!is_hpet_enabled())
413 return 0;
414
415 hpet_rtc_flags |= bit_mask;
416
417 if (!oldbits)
418 hpet_rtc_timer_init();
419
420 return 1;
421}
422
423int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
424 unsigned char sec)
425{
426 if (!is_hpet_enabled())
427 return 0;
428
429 hpet_alarm_time.tm_hour = hrs;
430 hpet_alarm_time.tm_min = min;
431 hpet_alarm_time.tm_sec = sec;
432
433 return 1;
434}
435
436int hpet_set_periodic_freq(unsigned long freq)
437{
438 uint64_t clc;
439
440 if (!is_hpet_enabled())
441 return 0;
442
443 if (freq <= DEFAULT_RTC_INT_FREQ)
444 hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
445 else {
446 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
447 do_div(clc, freq);
448 clc >>= hpet_clockevent.shift;
449 hpet_pie_delta = (unsigned long) clc;
450 }
451 return 1;
452}
453
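hpet_set_periodic_freq() above takes two paths: rates at or below 64 Hz are produced by skipping interrupts of the 64 Hz poll (hpet_pie_limit), while faster rates reprogram timer 1 directly (hpet_pie_delta). A standalone sketch of both computations, assuming a 14.318180 MHz HPET:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC            1000000000ULL
#define DEFAULT_RTC_INT_FREQ    64

int main(void)
{
        /* assumed HPET frequency and the matching clockevent scale factors */
        uint64_t hpet_freq = 14318180;
        unsigned int shift = 32;
        uint64_t mult = (hpet_freq << shift) / NSEC_PER_SEC;

        unsigned long freqs[] = { 2, 64, 1024, 8192 };
        unsigned int i;

        for (i = 0; i < 4; i++) {
                unsigned long freq = freqs[i];

                if (freq <= DEFAULT_RTC_INT_FREQ) {
                        /* slow rates: keep polling at 64 Hz, deliver every Nth tick */
                        printf("%4lu Hz: pie_limit = %lu\n", freq,
                               (unsigned long)(DEFAULT_RTC_INT_FREQ / freq));
                } else {
                        /* fast rates: program timer 1 with the real period */
                        uint64_t clc = mult * NSEC_PER_SEC / freq;
                        clc >>= shift;
                        printf("%4lu Hz: pie_delta = %llu ticks\n", freq,
                               (unsigned long long)clc);
                }
        }
        return 0;
}
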
454int hpet_rtc_dropped_irq(void)
455{
456 return is_hpet_enabled();
457}
458
459static void hpet_rtc_timer_reinit(void)
460{
461 unsigned long cfg, delta;
462 int lost_ints = -1;
463
464 if (unlikely(!hpet_rtc_flags)) {
465 cfg = hpet_readl(HPET_T1_CFG);
466 cfg &= ~HPET_TN_ENABLE;
467 hpet_writel(cfg, HPET_T1_CFG);
468 return;
469 }
470
471 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
472 delta = hpet_default_delta;
473 else
474 delta = hpet_pie_delta;
475
476 /*
477 * Increment the comparator value until we are ahead of the
478 * current count.
479 */
480 do {
481 hpet_t1_cmp += delta;
482 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
483 lost_ints++;
484 } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
485
486 if (lost_ints) {
487 if (hpet_rtc_flags & RTC_PIE)
488 hpet_pie_count += lost_ints;
489 if (printk_ratelimit())
490 printk(KERN_WARNING "rtc: lost %d interrupts\n",
491 lost_ints);
492 }
493}
494
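The catch-up loop in hpet_rtc_timer_reinit() above advances the comparator one period at a time until it is ahead of the counter again, counting how many periods were skipped. The same idiom in a self-contained form, with made-up counter values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* simulated free-running counter and a comparator that fell behind
         * because interrupts were held off for a while */
        uint32_t counter = 100000;
        uint32_t t1_cmp = 64000;
        uint32_t delta = 10000;         /* ticks per emulated RTC period */
        int lost_ints = -1;

        /* same catch-up idiom as hpet_rtc_timer_reinit(): bump the
         * comparator until it is ahead of the counter again */
        do {
                t1_cmp += delta;
                lost_ints++;
        } while ((int32_t)(counter - t1_cmp) > 0);

        printf("new comparator = %u, lost interrupts = %d\n", t1_cmp, lost_ints);
        return 0;
}
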
495irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
496{
497 struct rtc_time curr_time;
498 unsigned long rtc_int_flag = 0;
499
500 hpet_rtc_timer_reinit();
501
502 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
503 rtc_get_rtc_time(&curr_time);
504
505 if (hpet_rtc_flags & RTC_UIE &&
506 curr_time.tm_sec != hpet_prev_update_sec) {
507 rtc_int_flag = RTC_UF;
508 hpet_prev_update_sec = curr_time.tm_sec;
509 }
510
511 if (hpet_rtc_flags & RTC_PIE &&
512 ++hpet_pie_count >= hpet_pie_limit) {
513 rtc_int_flag |= RTC_PF;
514 hpet_pie_count = 0;
515 }
516
517 if (hpet_rtc_flags & RTC_PIE &&
518 (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
519 (curr_time.tm_min == hpet_alarm_time.tm_min) &&
520 (curr_time.tm_hour == hpet_alarm_time.tm_hour))
521 rtc_int_flag |= RTC_AF;
522
523 if (rtc_int_flag) {
524 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
525 rtc_interrupt(rtc_int_flag, dev_id);
526 }
527 return IRQ_HANDLED;
528}
529#endif
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index 9a0060b92e32..a6bc7bb38834 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -2,7 +2,7 @@
2 * i8253.c 8253/PIT functions 2 * i8253.c 8253/PIT functions
3 * 3 *
4 */ 4 */
5#include <linux/clocksource.h> 5#include <linux/clockchips.h>
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/jiffies.h> 7#include <linux/jiffies.h>
8#include <linux/sysdev.h> 8#include <linux/sysdev.h>
@@ -19,17 +19,97 @@
19DEFINE_SPINLOCK(i8253_lock); 19DEFINE_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock); 20EXPORT_SYMBOL(i8253_lock);
21 21
22void setup_pit_timer(void) 22/*
 23 * The HPET replaces the PIT when enabled, so we need to know which of
 24 * the two timers is in use
25 */
26struct clock_event_device *global_clock_event;
27
28/*
29 * Initialize the PIT timer.
30 *
31 * This is also called after resume to bring the PIT into operation again.
32 */
33static void init_pit_timer(enum clock_event_mode mode,
34 struct clock_event_device *evt)
35{
36 unsigned long flags;
37
38 spin_lock_irqsave(&i8253_lock, flags);
39
40 switch(mode) {
41 case CLOCK_EVT_MODE_PERIODIC:
42 /* binary, mode 2, LSB/MSB, ch 0 */
43 outb_p(0x34, PIT_MODE);
44 udelay(10);
45 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
46 udelay(10);
47 outb(LATCH >> 8 , PIT_CH0); /* MSB */
48 break;
49
50 case CLOCK_EVT_MODE_ONESHOT:
51 case CLOCK_EVT_MODE_SHUTDOWN:
52 case CLOCK_EVT_MODE_UNUSED:
53 /* One shot setup */
54 outb_p(0x38, PIT_MODE);
55 udelay(10);
56 break;
57 }
58 spin_unlock_irqrestore(&i8253_lock, flags);
59}
60
61/*
62 * Program the next event in oneshot mode
63 *
64 * Delta is given in PIT ticks
65 */
66static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
23{ 67{
24 unsigned long flags; 68 unsigned long flags;
25 69
26 spin_lock_irqsave(&i8253_lock, flags); 70 spin_lock_irqsave(&i8253_lock, flags);
27 outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 71 outb_p(delta & 0xff , PIT_CH0); /* LSB */
28 udelay(10); 72 outb(delta >> 8 , PIT_CH0); /* MSB */
29 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
30 udelay(10);
31 outb(LATCH >> 8 , PIT_CH0); /* MSB */
32 spin_unlock_irqrestore(&i8253_lock, flags); 73 spin_unlock_irqrestore(&i8253_lock, flags);
74
75 return 0;
76}
77
78/*
79 * On UP the PIT can serve all of the possible timer functions. On SMP systems
80 * it can be solely used for the global tick.
81 *
 82 * The profiling and update capabilities are switched off once the local apic is
83 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
84 * !using_apic_timer decisions in do_timer_interrupt_hook()
85 */
86struct clock_event_device pit_clockevent = {
87 .name = "pit",
88 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
89 .set_mode = init_pit_timer,
90 .set_next_event = pit_next_event,
91 .shift = 32,
92 .irq = 0,
93};
94
95/*
96 * Initialize the conversion factor and the min/max deltas of the clock event
97 * structure and register the clock event source with the framework.
98 */
99void __init setup_pit_timer(void)
100{
101 /*
102 * Start pit with the boot cpu mask and make it global after the
103 * IO_APIC has been initialized.
104 */
105 pit_clockevent.cpumask = cpumask_of_cpu(0);
106 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
107 pit_clockevent.max_delta_ns =
108 clockevent_delta2ns(0x7FFF, &pit_clockevent);
109 pit_clockevent.min_delta_ns =
110 clockevent_delta2ns(0xF, &pit_clockevent);
111 clockevents_register_device(&pit_clockevent);
112 global_clock_event = &pit_clockevent;
33} 113}
34 114
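The numbers chosen in setup_pit_timer() above follow directly from the 1193182 Hz PIT input clock (used below as a stand-in for CLOCK_TICK_RATE) and the programmable 16-bit counter. A rough standalone calculation of the resulting mult and min/max deltas:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC    1000000000ULL
#define PIT_TICK_RATE   1193182ULL      /* classic i8254 input clock, Hz */

int main(void)
{
        unsigned int shift = 32;

        /* setup_pit_timer(): mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32) */
        uint64_t mult = (PIT_TICK_RATE << shift) / NSEC_PER_SEC;

        /* clockevent_delta2ns(): longest/shortest programmable delta in ns */
        uint64_t max_ns = ((uint64_t)0x7FFF << shift) / mult;
        uint64_t min_ns = ((uint64_t)0xF << shift) / mult;

        printf("mult = %llu\n", (unsigned long long)mult);
        printf("max_delta_ns = %llu (~%llu ms)\n",
               (unsigned long long)max_ns, (unsigned long long)(max_ns / 1000000));
        printf("min_delta_ns = %llu (~%llu us)\n",
               (unsigned long long)min_ns, (unsigned long long)(min_ns / 1000));
        return 0;
}
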
35/* 115/*
@@ -46,7 +126,7 @@ static cycle_t pit_read(void)
46 static u32 old_jifs; 126 static u32 old_jifs;
47 127
48 spin_lock_irqsave(&i8253_lock, flags); 128 spin_lock_irqsave(&i8253_lock, flags);
49 /* 129 /*
50 * Although our caller may have the read side of xtime_lock, 130 * Although our caller may have the read side of xtime_lock,
51 * this is now a seqlock, and we are cheating in this routine 131 * this is now a seqlock, and we are cheating in this routine
52 * by having side effects on state that we cannot undo if 132 * by having side effects on state that we cannot undo if
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index c8d45821c788..03abfdb1a6e4 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -41,6 +41,7 @@ static void mask_and_ack_8259A(unsigned int);
41static struct irq_chip i8259A_chip = { 41static struct irq_chip i8259A_chip = {
42 .name = "XT-PIC", 42 .name = "XT-PIC",
43 .mask = disable_8259A_irq, 43 .mask = disable_8259A_irq,
44 .disable = disable_8259A_irq,
44 .unmask = enable_8259A_irq, 45 .unmask = enable_8259A_irq,
45 .mask_ack = mask_and_ack_8259A, 46 .mask_ack = mask_and_ack_8259A,
46}; 47};
@@ -410,12 +411,6 @@ void __init native_init_IRQ(void)
410 intr_init_hook(); 411 intr_init_hook();
411 412
412 /* 413 /*
413 * Set the clock to HZ Hz, we already have a valid
414 * vector now:
415 */
416 setup_pit_timer();
417
418 /*
419 * External FPU? Set up irq13 if so, for 414 * External FPU? Set up irq13 if so, for
420 * original braindamaged IBM FERR coupling. 415 * original braindamaged IBM FERR coupling.
421 */ 416 */
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index ba8d302a0b72..4ccebd454e25 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -482,8 +482,8 @@ static void do_irq_balance(void)
482 package_index = CPU_TO_PACKAGEINDEX(i); 482 package_index = CPU_TO_PACKAGEINDEX(i);
483 for (j = 0; j < NR_IRQS; j++) { 483 for (j = 0; j < NR_IRQS; j++) {
484 unsigned long value_now, delta; 484 unsigned long value_now, delta;
485 /* Is this an active IRQ? */ 485 /* Is this an active IRQ or balancing disabled ? */
486 if (!irq_desc[j].action) 486 if (!irq_desc[j].action || irq_balancing_disabled(j))
487 continue; 487 continue;
488 if ( package_index == i ) 488 if ( package_index == i )
489 IRQ_DELTA(package_index,j) = 0; 489 IRQ_DELTA(package_index,j) = 0;
@@ -1281,11 +1281,9 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1281 trigger == IOAPIC_LEVEL) 1281 trigger == IOAPIC_LEVEL)
1282 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1282 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1283 handle_fasteoi_irq, "fasteoi"); 1283 handle_fasteoi_irq, "fasteoi");
1284 else { 1284 else
1285 irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
1286 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1285 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1287 handle_edge_irq, "edge"); 1286 handle_edge_irq, "edge");
1288 }
1289 set_intr_gate(vector, interrupt[irq]); 1287 set_intr_gate(vector, interrupt[irq]);
1290} 1288}
1291 1289
@@ -1588,7 +1586,7 @@ void /*__init*/ print_local_APIC(void * dummy)
1588 v = apic_read(APIC_LVR); 1586 v = apic_read(APIC_LVR);
1589 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1587 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1590 ver = GET_APIC_VERSION(v); 1588 ver = GET_APIC_VERSION(v);
1591 maxlvt = get_maxlvt(); 1589 maxlvt = lapic_get_maxlvt();
1592 1590
1593 v = apic_read(APIC_TASKPRI); 1591 v = apic_read(APIC_TASKPRI);
1594 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); 1592 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
@@ -1920,7 +1918,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
1920static void __init setup_ioapic_ids_from_mpc(void) { } 1918static void __init setup_ioapic_ids_from_mpc(void) { }
1921#endif 1919#endif
1922 1920
1923static int no_timer_check __initdata; 1921int no_timer_check __initdata;
1924 1922
1925static int __init notimercheck(char *s) 1923static int __init notimercheck(char *s)
1926{ 1924{
@@ -2310,7 +2308,7 @@ static inline void __init check_timer(void)
2310 2308
2311 disable_8259A_irq(0); 2309 disable_8259A_irq(0);
2312 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, 2310 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2313 "fasteio"); 2311 "fasteoi");
2314 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2312 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2315 enable_8259A_irq(0); 2313 enable_8259A_irq(0);
2316 2314
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 3201d421090a..0f2ca590bf23 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -10,7 +10,6 @@
10 * io_apic.c.) 10 * io_apic.c.)
11 */ 11 */
12 12
13#include <asm/uaccess.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/seq_file.h> 14#include <linux/seq_file.h>
16#include <linux/interrupt.h> 15#include <linux/interrupt.h>
@@ -19,19 +18,36 @@
19#include <linux/cpu.h> 18#include <linux/cpu.h>
20#include <linux/delay.h> 19#include <linux/delay.h>
21 20
21#include <asm/idle.h>
22
23#include <asm/apic.h>
24#include <asm/uaccess.h>
25
22DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; 26DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
23EXPORT_PER_CPU_SYMBOL(irq_stat); 27EXPORT_PER_CPU_SYMBOL(irq_stat);
24 28
25#ifndef CONFIG_X86_LOCAL_APIC
26/* 29/*
27 * 'what should we do if we get a hw irq event on an illegal vector'. 30 * 'what should we do if we get a hw irq event on an illegal vector'.
28 * each architecture has to answer this themselves. 31 * each architecture has to answer this themselves.
29 */ 32 */
30void ack_bad_irq(unsigned int irq) 33void ack_bad_irq(unsigned int irq)
31{ 34{
32 printk("unexpected IRQ trap at vector %02x\n", irq); 35 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
33} 36
37#ifdef CONFIG_X86_LOCAL_APIC
38 /*
39 * Currently unexpected vectors happen only on SMP and APIC.
40 * We _must_ ack these because every local APIC has only N
41 * irq slots per priority level, and a 'hanging, unacked' IRQ
42 * holds up an irq slot - in excessive cases (when multiple
43 * unexpected vectors occur) that might lock up the APIC
44 * completely.
45 * But only ack when the APIC is enabled -AK
46 */
47 if (cpu_has_apic)
48 ack_APIC_irq();
34#endif 49#endif
50}
35 51
36#ifdef CONFIG_4KSTACKS 52#ifdef CONFIG_4KSTACKS
37/* 53/*
@@ -61,6 +77,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
61 union irq_ctx *curctx, *irqctx; 77 union irq_ctx *curctx, *irqctx;
62 u32 *isp; 78 u32 *isp;
63#endif 79#endif
80 exit_idle();
64 81
65 if (unlikely((unsigned)irq >= NR_IRQS)) { 82 if (unlikely((unsigned)irq >= NR_IRQS)) {
66 printk(KERN_EMERG "%s: cannot handle IRQ %d\n", 83 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index af1d53344993..b545bc746fce 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -363,7 +363,7 @@ no_kprobe:
363 " pushf\n" 363 " pushf\n"
364 /* skip cs, eip, orig_eax */ 364 /* skip cs, eip, orig_eax */
365 " subl $12, %esp\n" 365 " subl $12, %esp\n"
366 " pushl %gs\n" 366 " pushl %fs\n"
367 " pushl %ds\n" 367 " pushl %ds\n"
368 " pushl %es\n" 368 " pushl %es\n"
369 " pushl %eax\n" 369 " pushl %eax\n"
@@ -387,7 +387,7 @@ no_kprobe:
387 " popl %edi\n" 387 " popl %edi\n"
388 " popl %ebp\n" 388 " popl %ebp\n"
389 " popl %eax\n" 389 " popl %eax\n"
390 /* skip eip, orig_eax, es, ds, gs */ 390 /* skip eip, orig_eax, es, ds, fs */
391 " addl $20, %esp\n" 391 " addl $20, %esp\n"
392 " popf\n" 392 " popf\n"
393 " ret\n"); 393 " ret\n");
@@ -408,7 +408,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
408 spin_lock_irqsave(&kretprobe_lock, flags); 408 spin_lock_irqsave(&kretprobe_lock, flags);
409 head = kretprobe_inst_table_head(current); 409 head = kretprobe_inst_table_head(current);
410 /* fixup registers */ 410 /* fixup registers */
411 regs->xcs = __KERNEL_CS; 411 regs->xcs = __KERNEL_CS | get_kernel_rpl();
412 regs->eip = trampoline_address; 412 regs->eip = trampoline_address;
413 regs->orig_eax = 0xffffffff; 413 regs->orig_eax = 0xffffffff;
414 414
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 381252bae3d8..b8f16633a6ec 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -384,7 +384,7 @@ static int do_microcode_update (void)
384{ 384{
385 long cursor = 0; 385 long cursor = 0;
386 int error = 0; 386 int error = 0;
387 void *new_mc; 387 void *new_mc = NULL;
388 int cpu; 388 int cpu;
389 cpumask_t old; 389 cpumask_t old;
390 390
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 4e14264f392a..bcaa6e9b6197 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -68,7 +68,6 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
68#ifdef CONFIG_SMP 68#ifdef CONFIG_SMP
69 69
70struct msr_command { 70struct msr_command {
71 int cpu;
72 int err; 71 int err;
73 u32 reg; 72 u32 reg;
74 u32 data[2]; 73 u32 data[2];
@@ -78,16 +77,14 @@ static void msr_smp_wrmsr(void *cmd_block)
78{ 77{
79 struct msr_command *cmd = (struct msr_command *)cmd_block; 78 struct msr_command *cmd = (struct msr_command *)cmd_block;
80 79
81 if (cmd->cpu == smp_processor_id()) 80 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
82 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
83} 81}
84 82
85static void msr_smp_rdmsr(void *cmd_block) 83static void msr_smp_rdmsr(void *cmd_block)
86{ 84{
87 struct msr_command *cmd = (struct msr_command *)cmd_block; 85 struct msr_command *cmd = (struct msr_command *)cmd_block;
88 86
89 if (cmd->cpu == smp_processor_id()) 87 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
90 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
91} 88}
92 89
93static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) 90static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
@@ -99,12 +96,11 @@ static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
99 if (cpu == smp_processor_id()) { 96 if (cpu == smp_processor_id()) {
100 ret = wrmsr_eio(reg, eax, edx); 97 ret = wrmsr_eio(reg, eax, edx);
101 } else { 98 } else {
102 cmd.cpu = cpu;
103 cmd.reg = reg; 99 cmd.reg = reg;
104 cmd.data[0] = eax; 100 cmd.data[0] = eax;
105 cmd.data[1] = edx; 101 cmd.data[1] = edx;
106 102
107 smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); 103 smp_call_function_single(cpu, msr_smp_wrmsr, &cmd, 1, 1);
108 ret = cmd.err; 104 ret = cmd.err;
109 } 105 }
110 preempt_enable(); 106 preempt_enable();
@@ -120,10 +116,9 @@ static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
120 if (cpu == smp_processor_id()) { 116 if (cpu == smp_processor_id()) {
121 ret = rdmsr_eio(reg, eax, edx); 117 ret = rdmsr_eio(reg, eax, edx);
122 } else { 118 } else {
123 cmd.cpu = cpu;
124 cmd.reg = reg; 119 cmd.reg = reg;
125 120
126 smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); 121 smp_call_function_single(cpu, msr_smp_rdmsr, &cmd, 1, 1);
127 122
128 *eax = cmd.data[0]; 123 *eax = cmd.data[0];
129 *edx = cmd.data[1]; 124 *edx = cmd.data[1];
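The msr driver patched above backs the /dev/cpu/N/msr character devices, where the file offset selects the MSR number and each read returns the 64-bit value. A small userspace example, assuming the device nodes exist (CONFIG_X86_MSR) and root privileges:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        const char *dev = "/dev/cpu/0/msr";     /* CPU 0 */
        uint32_t msr = 0x10;                    /* IA32_TIME_STAMP_COUNTER */
        uint64_t val;
        int fd = open(dev, O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* the file offset is the MSR number; reads are 8 bytes */
        if (pread(fd, &val, sizeof(val), msr) != sizeof(val)) {
                perror("pread");
                close(fd);
                return 1;
        }
        printf("MSR 0x%x on cpu0 = 0x%016llx\n", (unsigned)msr,
               (unsigned long long)val);
        close(fd);
        return 0;
}
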
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 1a6f8bb8881c..821df34d2b3a 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -23,6 +23,7 @@
23#include <linux/dmi.h> 23#include <linux/dmi.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/kernel_stat.h>
26 27
27#include <asm/smp.h> 28#include <asm/smp.h>
28#include <asm/nmi.h> 29#include <asm/nmi.h>
@@ -185,7 +186,8 @@ static __cpuinit inline int nmi_known_cpu(void)
185{ 186{
186 switch (boot_cpu_data.x86_vendor) { 187 switch (boot_cpu_data.x86_vendor) {
187 case X86_VENDOR_AMD: 188 case X86_VENDOR_AMD:
188 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); 189 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
190 || (boot_cpu_data.x86 == 16));
189 case X86_VENDOR_INTEL: 191 case X86_VENDOR_INTEL:
190 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 192 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
191 return 1; 193 return 1;
@@ -216,6 +218,28 @@ static __init void nmi_cpu_busy(void *data)
216} 218}
217#endif 219#endif
218 220
221static unsigned int adjust_for_32bit_ctr(unsigned int hz)
222{
223 u64 counter_val;
224 unsigned int retval = hz;
225
226 /*
227 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
228 * are writable, with higher bits sign extending from bit 31.
 229 * So we can only program the counter with 31-bit values, and the
 230 * 32nd bit should be 1 so that bits 33 and above sign-extend to 1.
 231 * Find the appropriate nmi_hz.
232 */
233 counter_val = (u64)cpu_khz * 1000;
234 do_div(counter_val, retval);
235 if (counter_val > 0x7fffffffULL) {
236 u64 count = (u64)cpu_khz * 1000;
237 do_div(count, 0x7fffffffUL);
238 retval = count + 1;
239 }
240 return retval;
241}
242
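adjust_for_32bit_ctr() above raises the watchdog frequency until the per-period cycle count fits into the 31 bits that P6/ARCH_PERFMON counters accept. A standalone rendition of the same calculation, with two hypothetical CPU speeds:

#include <stdio.h>
#include <stdint.h>

/* Raise hz until cpu_khz * 1000 / hz fits in 31 bits, mirroring the
 * adjust_for_32bit_ctr() logic above. */
static unsigned int adjust_hz(uint64_t cpu_khz, unsigned int hz)
{
        uint64_t counter_val = cpu_khz * 1000 / hz;

        if (counter_val > 0x7fffffffULL)
                hz = (unsigned int)(cpu_khz * 1000 / 0x7fffffffULL + 1);
        return hz;
}

int main(void)
{
        /* 2.4 GHz: one NMI per second needs a 2.4e9 count, too big for 31 bits */
        printf("2.4 GHz: nmi_hz 1 -> %u\n", adjust_hz(2400000, 1));
        /* 1.0 GHz: the count fits, so the requested rate is kept */
        printf("1.0 GHz: nmi_hz 1 -> %u\n", adjust_hz(1000000, 1));
        return 0;
}
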
219static int __init check_nmi_watchdog(void) 243static int __init check_nmi_watchdog(void)
220{ 244{
221 unsigned int *prev_nmi_count; 245 unsigned int *prev_nmi_count;
@@ -281,18 +305,10 @@ static int __init check_nmi_watchdog(void)
281 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 305 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
282 306
283 nmi_hz = 1; 307 nmi_hz = 1;
284 /* 308
285 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter 309 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
286 * are writable, with higher bits sign extending from bit 31. 310 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
287 * So, we can only program the counter with 31 bit values and 311 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
288 * 32nd bit should be 1, for 33.. to be 1.
289 * Find the appropriate nmi_hz
290 */
291 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
292 ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
293 u64 count = (u64)cpu_khz * 1000;
294 do_div(count, 0x7fffffffUL);
295 nmi_hz = count + 1;
296 } 312 }
297 } 313 }
298 314
@@ -369,6 +385,34 @@ void enable_timer_nmi_watchdog(void)
369 } 385 }
370} 386}
371 387
388static void __acpi_nmi_disable(void *__unused)
389{
390 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
391}
392
393/*
394 * Disable timer based NMIs on all CPUs:
395 */
396void acpi_nmi_disable(void)
397{
398 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
399 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
400}
401
402static void __acpi_nmi_enable(void *__unused)
403{
404 apic_write_around(APIC_LVT0, APIC_DM_NMI);
405}
406
407/*
408 * Enable timer based NMIs on all CPUs:
409 */
410void acpi_nmi_enable(void)
411{
412 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
413 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
414}
415
372#ifdef CONFIG_PM 416#ifdef CONFIG_PM
373 417
374static int nmi_pm_active; /* nmi_active before suspend */ 418static int nmi_pm_active; /* nmi_active before suspend */
@@ -442,6 +486,17 @@ static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
442 wrmsrl(perfctr_msr, 0 - count); 486 wrmsrl(perfctr_msr, 0 - count);
443} 487}
444 488
489static void write_watchdog_counter32(unsigned int perfctr_msr,
490 const char *descr)
491{
492 u64 count = (u64)cpu_khz * 1000;
493
494 do_div(count, nmi_hz);
 495 if (descr)
496 Dprintk("setting %s to -0x%08Lx\n", descr, count);
497 wrmsr(perfctr_msr, (u32)(-count), 0);
498}
499
445/* Note that these events don't tick when the CPU idles. This means 500/* Note that these events don't tick when the CPU idles. This means
446 the frequency varies with CPU load. */ 501 the frequency varies with CPU load. */
447 502
@@ -531,7 +586,8 @@ static int setup_p6_watchdog(void)
531 586
532 /* setup the timer */ 587 /* setup the timer */
533 wrmsr(evntsel_msr, evntsel, 0); 588 wrmsr(evntsel_msr, evntsel, 0);
534 write_watchdog_counter(perfctr_msr, "P6_PERFCTR0"); 589 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
590 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
535 apic_write(APIC_LVTPC, APIC_DM_NMI); 591 apic_write(APIC_LVTPC, APIC_DM_NMI);
536 evntsel |= P6_EVNTSEL0_ENABLE; 592 evntsel |= P6_EVNTSEL0_ENABLE;
537 wrmsr(evntsel_msr, evntsel, 0); 593 wrmsr(evntsel_msr, evntsel, 0);
@@ -704,7 +760,8 @@ static int setup_intel_arch_watchdog(void)
704 760
705 /* setup the timer */ 761 /* setup the timer */
706 wrmsr(evntsel_msr, evntsel, 0); 762 wrmsr(evntsel_msr, evntsel, 0);
707 write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); 763 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
764 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
708 apic_write(APIC_LVTPC, APIC_DM_NMI); 765 apic_write(APIC_LVTPC, APIC_DM_NMI);
709 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 766 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
710 wrmsr(evntsel_msr, evntsel, 0); 767 wrmsr(evntsel_msr, evntsel, 0);
@@ -762,7 +819,8 @@ void setup_apic_nmi_watchdog (void *unused)
762 if (nmi_watchdog == NMI_LOCAL_APIC) { 819 if (nmi_watchdog == NMI_LOCAL_APIC) {
763 switch (boot_cpu_data.x86_vendor) { 820 switch (boot_cpu_data.x86_vendor) {
764 case X86_VENDOR_AMD: 821 case X86_VENDOR_AMD:
765 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) 822 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
823 boot_cpu_data.x86 != 16)
766 return; 824 return;
767 if (!setup_k7_watchdog()) 825 if (!setup_k7_watchdog())
768 return; 826 return;
@@ -916,9 +974,13 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
916 cpu_clear(cpu, backtrace_mask); 974 cpu_clear(cpu, backtrace_mask);
917 } 975 }
918 976
919 sum = per_cpu(irq_stat, cpu).apic_timer_irqs; 977 /*
978 * Take the local apic timer and PIT/HPET into account. We don't
 979 * know which one is active when highres/dyntick is on
980 */
981 sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);
920 982
 921 /* if the apic timer isn't firing, this cpu isn't doing much */ 983 /* if none of the timers is firing, this cpu isn't doing much */
922 if (!touched && last_irq_sums[cpu] == sum) { 984 if (!touched && last_irq_sums[cpu] == sum) {
923 /* 985 /*
924 * Ayiee, looks like this CPU is stuck ... 986 * Ayiee, looks like this CPU is stuck ...
@@ -956,6 +1018,8 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
956 dummy &= ~P4_CCCR_OVF; 1018 dummy &= ~P4_CCCR_OVF;
957 wrmsrl(wd->cccr_msr, dummy); 1019 wrmsrl(wd->cccr_msr, dummy);
958 apic_write(APIC_LVTPC, APIC_DM_NMI); 1020 apic_write(APIC_LVTPC, APIC_DM_NMI);
1021 /* start the cycle over again */
1022 write_watchdog_counter(wd->perfctr_msr, NULL);
959 } 1023 }
960 else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || 1024 else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
961 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { 1025 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
@@ -964,9 +1028,12 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
964 * other P6 variant. 1028 * other P6 variant.
965 * ArchPerfom/Core Duo also needs this */ 1029 * ArchPerfom/Core Duo also needs this */
966 apic_write(APIC_LVTPC, APIC_DM_NMI); 1030 apic_write(APIC_LVTPC, APIC_DM_NMI);
1031 /* P6/ARCH_PERFMON has 32 bit counter write */
1032 write_watchdog_counter32(wd->perfctr_msr, NULL);
1033 } else {
1034 /* start the cycle over again */
1035 write_watchdog_counter(wd->perfctr_msr, NULL);
967 } 1036 }
968 /* start the cycle over again */
969 write_watchdog_counter(wd->perfctr_msr, NULL);
970 rc = 1; 1037 rc = 1;
971 } else if (nmi_watchdog == NMI_IO_APIC) { 1038 } else if (nmi_watchdog == NMI_IO_APIC) {
972 /* don't know how to accurately check for this. 1039 /* don't know how to accurately check for this.
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index e55fd05da0f5..c156ecfa3872 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -92,7 +92,7 @@ static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
92 return insn_len; 92 return insn_len;
93} 93}
94 94
95static fastcall unsigned long native_get_debugreg(int regno) 95static unsigned long native_get_debugreg(int regno)
96{ 96{
97 unsigned long val = 0; /* Damn you, gcc! */ 97 unsigned long val = 0; /* Damn you, gcc! */
98 98
@@ -115,7 +115,7 @@ static fastcall unsigned long native_get_debugreg(int regno)
115 return val; 115 return val;
116} 116}
117 117
118static fastcall void native_set_debugreg(int regno, unsigned long value) 118static void native_set_debugreg(int regno, unsigned long value)
119{ 119{
120 switch (regno) { 120 switch (regno) {
121 case 0: 121 case 0:
@@ -146,55 +146,55 @@ void init_IRQ(void)
146 paravirt_ops.init_IRQ(); 146 paravirt_ops.init_IRQ();
147} 147}
148 148
149static fastcall void native_clts(void) 149static void native_clts(void)
150{ 150{
151 asm volatile ("clts"); 151 asm volatile ("clts");
152} 152}
153 153
154static fastcall unsigned long native_read_cr0(void) 154static unsigned long native_read_cr0(void)
155{ 155{
156 unsigned long val; 156 unsigned long val;
157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); 157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
158 return val; 158 return val;
159} 159}
160 160
161static fastcall void native_write_cr0(unsigned long val) 161static void native_write_cr0(unsigned long val)
162{ 162{
163 asm volatile("movl %0,%%cr0": :"r" (val)); 163 asm volatile("movl %0,%%cr0": :"r" (val));
164} 164}
165 165
166static fastcall unsigned long native_read_cr2(void) 166static unsigned long native_read_cr2(void)
167{ 167{
168 unsigned long val; 168 unsigned long val;
169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); 169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
170 return val; 170 return val;
171} 171}
172 172
173static fastcall void native_write_cr2(unsigned long val) 173static void native_write_cr2(unsigned long val)
174{ 174{
175 asm volatile("movl %0,%%cr2": :"r" (val)); 175 asm volatile("movl %0,%%cr2": :"r" (val));
176} 176}
177 177
178static fastcall unsigned long native_read_cr3(void) 178static unsigned long native_read_cr3(void)
179{ 179{
180 unsigned long val; 180 unsigned long val;
181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); 181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
182 return val; 182 return val;
183} 183}
184 184
185static fastcall void native_write_cr3(unsigned long val) 185static void native_write_cr3(unsigned long val)
186{ 186{
187 asm volatile("movl %0,%%cr3": :"r" (val)); 187 asm volatile("movl %0,%%cr3": :"r" (val));
188} 188}
189 189
190static fastcall unsigned long native_read_cr4(void) 190static unsigned long native_read_cr4(void)
191{ 191{
192 unsigned long val; 192 unsigned long val;
193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); 193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
194 return val; 194 return val;
195} 195}
196 196
197static fastcall unsigned long native_read_cr4_safe(void) 197static unsigned long native_read_cr4_safe(void)
198{ 198{
199 unsigned long val; 199 unsigned long val;
200 /* This could fault if %cr4 does not exist */ 200 /* This could fault if %cr4 does not exist */
@@ -207,51 +207,51 @@ static fastcall unsigned long native_read_cr4_safe(void)
207 return val; 207 return val;
208} 208}
209 209
210static fastcall void native_write_cr4(unsigned long val) 210static void native_write_cr4(unsigned long val)
211{ 211{
212 asm volatile("movl %0,%%cr4": :"r" (val)); 212 asm volatile("movl %0,%%cr4": :"r" (val));
213} 213}
214 214
215static fastcall unsigned long native_save_fl(void) 215static unsigned long native_save_fl(void)
216{ 216{
217 unsigned long f; 217 unsigned long f;
218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); 218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
219 return f; 219 return f;
220} 220}
221 221
222static fastcall void native_restore_fl(unsigned long f) 222static void native_restore_fl(unsigned long f)
223{ 223{
224 asm volatile("pushl %0 ; popfl": /* no output */ 224 asm volatile("pushl %0 ; popfl": /* no output */
225 :"g" (f) 225 :"g" (f)
226 :"memory", "cc"); 226 :"memory", "cc");
227} 227}
228 228
229static fastcall void native_irq_disable(void) 229static void native_irq_disable(void)
230{ 230{
231 asm volatile("cli": : :"memory"); 231 asm volatile("cli": : :"memory");
232} 232}
233 233
234static fastcall void native_irq_enable(void) 234static void native_irq_enable(void)
235{ 235{
236 asm volatile("sti": : :"memory"); 236 asm volatile("sti": : :"memory");
237} 237}
238 238
239static fastcall void native_safe_halt(void) 239static void native_safe_halt(void)
240{ 240{
241 asm volatile("sti; hlt": : :"memory"); 241 asm volatile("sti; hlt": : :"memory");
242} 242}
243 243
244static fastcall void native_halt(void) 244static void native_halt(void)
245{ 245{
246 asm volatile("hlt": : :"memory"); 246 asm volatile("hlt": : :"memory");
247} 247}
248 248
249static fastcall void native_wbinvd(void) 249static void native_wbinvd(void)
250{ 250{
251 asm volatile("wbinvd": : :"memory"); 251 asm volatile("wbinvd": : :"memory");
252} 252}
253 253
254static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) 254static unsigned long long native_read_msr(unsigned int msr, int *err)
255{ 255{
256 unsigned long long val; 256 unsigned long long val;
257 257
@@ -270,7 +270,7 @@ static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
270 return val; 270 return val;
271} 271}
272 272
273static fastcall int native_write_msr(unsigned int msr, unsigned long long val) 273static int native_write_msr(unsigned int msr, unsigned long long val)
274{ 274{
275 int err; 275 int err;
276 asm volatile("2: wrmsr ; xorl %0,%0\n" 276 asm volatile("2: wrmsr ; xorl %0,%0\n"
@@ -288,53 +288,53 @@ static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
288 return err; 288 return err;
289} 289}
290 290
291static fastcall unsigned long long native_read_tsc(void) 291static unsigned long long native_read_tsc(void)
292{ 292{
293 unsigned long long val; 293 unsigned long long val;
294 asm volatile("rdtsc" : "=A" (val)); 294 asm volatile("rdtsc" : "=A" (val));
295 return val; 295 return val;
296} 296}
297 297
298static fastcall unsigned long long native_read_pmc(void) 298static unsigned long long native_read_pmc(void)
299{ 299{
300 unsigned long long val; 300 unsigned long long val;
301 asm volatile("rdpmc" : "=A" (val)); 301 asm volatile("rdpmc" : "=A" (val));
302 return val; 302 return val;
303} 303}
304 304
305static fastcall void native_load_tr_desc(void) 305static void native_load_tr_desc(void)
306{ 306{
307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); 307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
308} 308}
309 309
310static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) 310static void native_load_gdt(const struct Xgt_desc_struct *dtr)
311{ 311{
312 asm volatile("lgdt %0"::"m" (*dtr)); 312 asm volatile("lgdt %0"::"m" (*dtr));
313} 313}
314 314
315static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) 315static void native_load_idt(const struct Xgt_desc_struct *dtr)
316{ 316{
317 asm volatile("lidt %0"::"m" (*dtr)); 317 asm volatile("lidt %0"::"m" (*dtr));
318} 318}
319 319
320static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) 320static void native_store_gdt(struct Xgt_desc_struct *dtr)
321{ 321{
322 asm ("sgdt %0":"=m" (*dtr)); 322 asm ("sgdt %0":"=m" (*dtr));
323} 323}
324 324
325static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) 325static void native_store_idt(struct Xgt_desc_struct *dtr)
326{ 326{
327 asm ("sidt %0":"=m" (*dtr)); 327 asm ("sidt %0":"=m" (*dtr));
328} 328}
329 329
330static fastcall unsigned long native_store_tr(void) 330static unsigned long native_store_tr(void)
331{ 331{
332 unsigned long tr; 332 unsigned long tr;
333 asm ("str %0":"=r" (tr)); 333 asm ("str %0":"=r" (tr));
334 return tr; 334 return tr;
335} 335}
336 336
337static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) 337static void native_load_tls(struct thread_struct *t, unsigned int cpu)
338{ 338{
339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] 339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
340 C(0); C(1); C(2); 340 C(0); C(1); C(2);
@@ -348,22 +348,22 @@ static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32
348 lp[1] = entry_high; 348 lp[1] = entry_high;
349} 349}
350 350
351static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) 351static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
352{ 352{
353 native_write_dt_entry(dt, entrynum, low, high); 353 native_write_dt_entry(dt, entrynum, low, high);
354} 354}
355 355
356static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) 356static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
357{ 357{
358 native_write_dt_entry(dt, entrynum, low, high); 358 native_write_dt_entry(dt, entrynum, low, high);
359} 359}
360 360
361static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) 361static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
362{ 362{
363 native_write_dt_entry(dt, entrynum, low, high); 363 native_write_dt_entry(dt, entrynum, low, high);
364} 364}
365 365
366static fastcall void native_load_esp0(struct tss_struct *tss, 366static void native_load_esp0(struct tss_struct *tss,
367 struct thread_struct *thread) 367 struct thread_struct *thread)
368{ 368{
369 tss->esp0 = thread->esp0; 369 tss->esp0 = thread->esp0;
@@ -375,12 +375,12 @@ static fastcall void native_load_esp0(struct tss_struct *tss,
375 } 375 }
376} 376}
377 377
378static fastcall void native_io_delay(void) 378static void native_io_delay(void)
379{ 379{
380 asm volatile("outb %al,$0x80"); 380 asm volatile("outb %al,$0x80");
381} 381}
382 382
383static fastcall void native_flush_tlb(void) 383static void native_flush_tlb(void)
384{ 384{
385 __native_flush_tlb(); 385 __native_flush_tlb();
386} 386}
@@ -389,49 +389,49 @@ static fastcall void native_flush_tlb(void)
389 * Global pages have to be flushed a bit differently. Not a real 389 * Global pages have to be flushed a bit differently. Not a real
390 * performance problem because this does not happen often. 390 * performance problem because this does not happen often.
391 */ 391 */
392static fastcall void native_flush_tlb_global(void) 392static void native_flush_tlb_global(void)
393{ 393{
394 __native_flush_tlb_global(); 394 __native_flush_tlb_global();
395} 395}
396 396
397static fastcall void native_flush_tlb_single(u32 addr) 397static void native_flush_tlb_single(u32 addr)
398{ 398{
399 __native_flush_tlb_single(addr); 399 __native_flush_tlb_single(addr);
400} 400}
401 401
402#ifndef CONFIG_X86_PAE 402#ifndef CONFIG_X86_PAE
403static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) 403static void native_set_pte(pte_t *ptep, pte_t pteval)
404{ 404{
405 *ptep = pteval; 405 *ptep = pteval;
406} 406}
407 407
408static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) 408static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
409{ 409{
410 *ptep = pteval; 410 *ptep = pteval;
411} 411}
412 412
413static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) 413static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
414{ 414{
415 *pmdp = pmdval; 415 *pmdp = pmdval;
416} 416}
417 417
418#else /* CONFIG_X86_PAE */ 418#else /* CONFIG_X86_PAE */
419 419
420static fastcall void native_set_pte(pte_t *ptep, pte_t pte) 420static void native_set_pte(pte_t *ptep, pte_t pte)
421{ 421{
422 ptep->pte_high = pte.pte_high; 422 ptep->pte_high = pte.pte_high;
423 smp_wmb(); 423 smp_wmb();
424 ptep->pte_low = pte.pte_low; 424 ptep->pte_low = pte.pte_low;
425} 425}
426 426
427static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) 427static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
428{ 428{
429 ptep->pte_high = pte.pte_high; 429 ptep->pte_high = pte.pte_high;
430 smp_wmb(); 430 smp_wmb();
431 ptep->pte_low = pte.pte_low; 431 ptep->pte_low = pte.pte_low;
432} 432}
433 433
434static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 434static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
435{ 435{
436 ptep->pte_low = 0; 436 ptep->pte_low = 0;
437 smp_wmb(); 437 smp_wmb();
@@ -440,29 +440,29 @@ static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long
440 ptep->pte_low = pte.pte_low; 440 ptep->pte_low = pte.pte_low;
441} 441}
442 442
443static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) 443static void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
444{ 444{
445 set_64bit((unsigned long long *)ptep,pte_val(pteval)); 445 set_64bit((unsigned long long *)ptep,pte_val(pteval));
446} 446}
447 447
448static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) 448static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
449{ 449{
450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); 450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
451} 451}
452 452
453static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) 453static void native_set_pud(pud_t *pudp, pud_t pudval)
454{ 454{
455 *pudp = pudval; 455 *pudp = pudval;
456} 456}
457 457
458static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 458static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
459{ 459{
460 ptep->pte_low = 0; 460 ptep->pte_low = 0;
461 smp_wmb(); 461 smp_wmb();
462 ptep->pte_high = 0; 462 ptep->pte_high = 0;
463} 463}
464 464
465static fastcall void native_pmd_clear(pmd_t *pmd) 465static void native_pmd_clear(pmd_t *pmd)
466{ 466{
467 u32 *tmp = (u32 *)pmd; 467 u32 *tmp = (u32 *)pmd;
468 *tmp = 0; 468 *tmp = 0;
@@ -472,8 +472,8 @@ static fastcall void native_pmd_clear(pmd_t *pmd)
472#endif /* CONFIG_X86_PAE */ 472#endif /* CONFIG_X86_PAE */
473 473
474/* These are in entry.S */ 474/* These are in entry.S */
475extern fastcall void native_iret(void); 475extern void native_iret(void);
476extern fastcall void native_irq_enable_sysexit(void); 476extern void native_irq_enable_sysexit(void);
477 477
478static int __init print_banner(void) 478static int __init print_banner(void)
479{ 479{
@@ -482,9 +482,6 @@ static int __init print_banner(void)
482} 482}
483core_initcall(print_banner); 483core_initcall(print_banner);
484 484
485/* We simply declare start_kernel to be the paravirt probe of last resort. */
486paravirt_probe(start_kernel);
487
488struct paravirt_ops paravirt_ops = { 485struct paravirt_ops paravirt_ops = {
489 .name = "bare hardware", 486 .name = "bare hardware",
490 .paravirt_enabled = 0, 487 .paravirt_enabled = 0,
@@ -544,12 +541,21 @@ struct paravirt_ops paravirt_ops = {
544 .apic_write = native_apic_write, 541 .apic_write = native_apic_write,
545 .apic_write_atomic = native_apic_write_atomic, 542 .apic_write_atomic = native_apic_write_atomic,
546 .apic_read = native_apic_read, 543 .apic_read = native_apic_read,
544 .setup_boot_clock = setup_boot_APIC_clock,
545 .setup_secondary_clock = setup_secondary_APIC_clock,
547#endif 546#endif
547 .set_lazy_mode = (void *)native_nop,
548 548
549 .flush_tlb_user = native_flush_tlb, 549 .flush_tlb_user = native_flush_tlb,
550 .flush_tlb_kernel = native_flush_tlb_global, 550 .flush_tlb_kernel = native_flush_tlb_global,
551 .flush_tlb_single = native_flush_tlb_single, 551 .flush_tlb_single = native_flush_tlb_single,
552 552
553 .alloc_pt = (void *)native_nop,
554 .alloc_pd = (void *)native_nop,
555 .alloc_pd_clone = (void *)native_nop,
556 .release_pt = (void *)native_nop,
557 .release_pd = (void *)native_nop,
558
553 .set_pte = native_set_pte, 559 .set_pte = native_set_pte,
554 .set_pte_at = native_set_pte_at, 560 .set_pte_at = native_set_pte_at,
555 .set_pmd = native_set_pmd, 561 .set_pmd = native_set_pmd,
@@ -565,6 +571,8 @@ struct paravirt_ops paravirt_ops = {
565 571
566 .irq_enable_sysexit = native_irq_enable_sysexit, 572 .irq_enable_sysexit = native_irq_enable_sysexit,
567 .iret = native_iret, 573 .iret = native_iret,
574
575 .startup_ipi_hook = (void *)native_nop,
568}; 576};
569 577
570/* 578/*
diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c
new file mode 100644
index 000000000000..bc1f2d3ea277
--- /dev/null
+++ b/arch/i386/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
1#include <linux/platform_device.h>
2#include <linux/errno.h>
3#include <linux/init.h>
4
5static __init int add_pcspkr(void)
6{
7 struct platform_device *pd;
8 int ret;
9
10 pd = platform_device_alloc("pcspkr", -1);
11 if (!pd)
12 return -ENOMEM;
13
14 ret = platform_device_add(pd);
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19}
20device_initcall(add_pcspkr);
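For reference, the alloc/add/put sequence above could also be written with the one-call helper platform_device_register_simple(); a hedged sketch (same effect, assuming the helper's usual ERR_PTR return convention):

#include <linux/platform_device.h>
#include <linux/err.h>
#include <linux/init.h>

/* One-call variant: platform_device_register_simple() does the alloc/add
 * pair and unwinds itself on failure. */
static __init int add_pcspkr_simple(void)
{
        struct platform_device *pd;

        pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
        return IS_ERR(pd) ? PTR_ERR(pd) : 0;
}
device_initcall(add_pcspkr_simple);
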
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index c641056233a6..bea304d48cdb 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -38,6 +38,7 @@
38#include <linux/ptrace.h> 38#include <linux/ptrace.h>
39#include <linux/random.h> 39#include <linux/random.h>
40#include <linux/personality.h> 40#include <linux/personality.h>
41#include <linux/tick.h>
41 42
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
@@ -48,6 +49,7 @@
48#include <asm/i387.h> 49#include <asm/i387.h>
49#include <asm/desc.h> 50#include <asm/desc.h>
50#include <asm/vm86.h> 51#include <asm/vm86.h>
52#include <asm/idle.h>
51#ifdef CONFIG_MATH_EMULATION 53#ifdef CONFIG_MATH_EMULATION
52#include <asm/math_emu.h> 54#include <asm/math_emu.h>
53#endif 55#endif
@@ -80,6 +82,42 @@ void (*pm_idle)(void);
80EXPORT_SYMBOL(pm_idle); 82EXPORT_SYMBOL(pm_idle);
81static DEFINE_PER_CPU(unsigned int, cpu_idle_state); 83static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
82 84
85static ATOMIC_NOTIFIER_HEAD(idle_notifier);
86
87void idle_notifier_register(struct notifier_block *n)
88{
89 atomic_notifier_chain_register(&idle_notifier, n);
90}
91
92void idle_notifier_unregister(struct notifier_block *n)
93{
94 atomic_notifier_chain_unregister(&idle_notifier, n);
95}
96
97static DEFINE_PER_CPU(volatile unsigned long, idle_state);
98
99void enter_idle(void)
100{
101 /* needs to be atomic w.r.t. interrupts, not against other CPUs */
102 __set_bit(0, &__get_cpu_var(idle_state));
103 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
104}
105
106static void __exit_idle(void)
107{
108 /* needs to be atomic w.r.t. interrupts, not against other CPUs */
109 if (__test_and_clear_bit(0, &__get_cpu_var(idle_state)) == 0)
110 return;
111 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
112}
113
114void exit_idle(void)
115{
116 if (current->pid)
117 return;
118 __exit_idle();
119}
120
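enter_idle()/exit_idle() above drive an atomic notifier chain, so other built-in code can react when a CPU enters or leaves idle. A sketch of such a consumer, assuming idle_notifier_register() and the IDLE_START/IDLE_END events are declared in <asm/idle.h> as this patch introduces them; the callback runs with interrupts disabled and must not sleep:

#include <linux/notifier.h>
#include <linux/init.h>
#include <asm/idle.h>

static int idle_event(struct notifier_block *nb, unsigned long action,
                      void *data)
{
        switch (action) {
        case IDLE_START:
                /* e.g. stop per-cpu housekeeping before the CPU halts */
                break;
        case IDLE_END:
                /* e.g. kick deferred work now that the CPU is running again */
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block idle_event_nb = {
        .notifier_call = idle_event,
};

static int __init idle_event_init(void)
{
        idle_notifier_register(&idle_event_nb);
        return 0;
}
device_initcall(idle_event_init);
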
83void disable_hlt(void) 121void disable_hlt(void)
84{ 122{
85 hlt_counter++; 123 hlt_counter++;
@@ -130,6 +168,7 @@ EXPORT_SYMBOL(default_idle);
130 */ 168 */
131static void poll_idle (void) 169static void poll_idle (void)
132{ 170{
171 local_irq_enable();
133 cpu_relax(); 172 cpu_relax();
134} 173}
135 174
@@ -173,6 +212,7 @@ void cpu_idle(void)
173 212
174 /* endless idle loop with no priority at all */ 213 /* endless idle loop with no priority at all */
175 while (1) { 214 while (1) {
215 tick_nohz_stop_sched_tick();
176 while (!need_resched()) { 216 while (!need_resched()) {
177 void (*idle)(void); 217 void (*idle)(void);
178 218
@@ -189,8 +229,18 @@ void cpu_idle(void)
189 play_dead(); 229 play_dead();
190 230
191 __get_cpu_var(irq_stat).idle_timestamp = jiffies; 231 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
232
233 /*
234 * Idle routines should keep interrupts disabled
235 * from here on, until they go to idle.
236 * Otherwise, idle callbacks can misfire.
237 */
238 local_irq_disable();
239 enter_idle();
192 idle(); 240 idle();
241 __exit_idle();
193 } 242 }
243 tick_nohz_restart_sched_tick();
194 preempt_enable_no_resched(); 244 preempt_enable_no_resched();
195 schedule(); 245 schedule();
196 preempt_disable(); 246 preempt_disable();
@@ -243,7 +293,11 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
243 __monitor((void *)&current_thread_info()->flags, 0, 0); 293 __monitor((void *)&current_thread_info()->flags, 0, 0);
244 smp_mb(); 294 smp_mb();
245 if (!need_resched()) 295 if (!need_resched())
246 __mwait(eax, ecx); 296 __sti_mwait(eax, ecx);
297 else
298 local_irq_enable();
299 } else {
300 local_irq_enable();
247 } 301 }
248} 302}
249 303
@@ -308,8 +362,8 @@ void show_regs(struct pt_regs * regs)
308 regs->eax,regs->ebx,regs->ecx,regs->edx); 362 regs->eax,regs->ebx,regs->ecx,regs->edx);
309 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 363 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
310 regs->esi, regs->edi, regs->ebp); 364 regs->esi, regs->edi, regs->ebp);
311 printk(" DS: %04x ES: %04x GS: %04x\n", 365 printk(" DS: %04x ES: %04x FS: %04x\n",
312 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); 366 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
313 367
314 cr0 = read_cr0(); 368 cr0 = read_cr0();
315 cr2 = read_cr2(); 369 cr2 = read_cr2();
@@ -340,7 +394,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
340 394
341 regs.xds = __USER_DS; 395 regs.xds = __USER_DS;
342 regs.xes = __USER_DS; 396 regs.xes = __USER_DS;
343 regs.xgs = __KERNEL_PDA; 397 regs.xfs = __KERNEL_PDA;
344 regs.orig_eax = -1; 398 regs.orig_eax = -1;
345 regs.eip = (unsigned long) kernel_thread_helper; 399 regs.eip = (unsigned long) kernel_thread_helper;
346 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 400 regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -425,7 +479,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
425 479
426 p->thread.eip = (unsigned long) ret_from_fork; 480 p->thread.eip = (unsigned long) ret_from_fork;
427 481
428 savesegment(fs,p->thread.fs); 482 savesegment(gs,p->thread.gs);
429 483
430 tsk = current; 484 tsk = current;
431 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 485 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -501,8 +555,8 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
501 dump->regs.eax = regs->eax; 555 dump->regs.eax = regs->eax;
502 dump->regs.ds = regs->xds; 556 dump->regs.ds = regs->xds;
503 dump->regs.es = regs->xes; 557 dump->regs.es = regs->xes;
504 savesegment(fs,dump->regs.fs); 558 dump->regs.fs = regs->xfs;
505 dump->regs.gs = regs->xgs; 559 savesegment(gs,dump->regs.gs);
506 dump->regs.orig_eax = regs->orig_eax; 560 dump->regs.orig_eax = regs->orig_eax;
507 dump->regs.eip = regs->eip; 561 dump->regs.eip = regs->eip;
508 dump->regs.cs = regs->xcs; 562 dump->regs.cs = regs->xcs;
@@ -653,7 +707,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
653 load_esp0(tss, next); 707 load_esp0(tss, next);
654 708
655 /* 709 /*
656 * Save away %fs. No need to save %gs, as it was saved on the 710 * Save away %gs. No need to save %fs, as it was saved on the
657 * stack on entry. No need to save %es and %ds, as those are 711 * stack on entry. No need to save %es and %ds, as those are
658 * always kernel segments while inside the kernel. Doing this 712 * always kernel segments while inside the kernel. Doing this
659 * before setting the new TLS descriptors avoids the situation 713 * before setting the new TLS descriptors avoids the situation
@@ -662,7 +716,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
662 * used %fs or %gs (it does not today), or if the kernel is 716 * used %fs or %gs (it does not today), or if the kernel is
663 * running inside of a hypervisor layer. 717 * running inside of a hypervisor layer.
664 */ 718 */
665 savesegment(fs, prev->fs); 719 savesegment(gs, prev->gs);
666 720
667 /* 721 /*
668 * Load the per-thread Thread-Local Storage descriptor. 722 * Load the per-thread Thread-Local Storage descriptor.
@@ -670,14 +724,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
670 load_TLS(next, cpu); 724 load_TLS(next, cpu);
671 725
672 /* 726 /*
673 * Restore %fs if needed. 727 * Restore IOPL if needed. In normal use, the flags restore
674 * 728 * in the switch assembly will handle this. But if the kernel
675 * Glibc normally makes %fs be zero. 729 * is running virtualized at a non-zero CPL, the popf will
730 * not restore flags, so it must be done in a separate step.
676 */ 731 */
677 if (unlikely(prev->fs | next->fs)) 732 if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
678 loadsegment(fs, next->fs); 733 set_iopl_mask(next->iopl);
679
680 write_pda(pcurrent, next_p);
681 734
682 /* 735 /*
683 * Now maybe handle debug registers and/or IO bitmaps 736 * Now maybe handle debug registers and/or IO bitmaps
@@ -688,6 +741,15 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
688 741
689 disable_tsc(prev_p, next_p); 742 disable_tsc(prev_p, next_p);
690 743
744 /*
745 * Leave lazy mode, flushing any hypercalls made here.
746 * This must be done before restoring TLS segments so
747 * the GDT and LDT are properly updated, and must be
748 * done before math_state_restore, so the TS bit is up
749 * to date.
750 */
751 arch_leave_lazy_cpu_mode();
752
691 /* If the task has used fpu the last 5 timeslices, just do a full 753 /* If the task has used fpu the last 5 timeslices, just do a full
692 * restore of the math state immediately to avoid the trap; the 754 * restore of the math state immediately to avoid the trap; the
693 * chances of needing FPU soon are obviously high now 755 * chances of needing FPU soon are obviously high now
@@ -695,6 +757,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
695 if (next_p->fpu_counter > 5) 757 if (next_p->fpu_counter > 5)
696 math_state_restore(); 758 math_state_restore();
697 759
760 /*
761 * Restore %gs if needed (which is common)
762 */
763 if (prev->gs | next->gs)
764 loadsegment(gs, next->gs);
765
766 write_pda(pcurrent, next_p);
767
698 return prev_p; 768 return prev_p;
699} 769}
700 770
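
The process.c changes above complete the %fs/%gs role swap: %fs now carries the kernel PDA selector and is saved by the entry code, while the user's %gs is handled lazily here. The IOPL hunk is a related virtualization fix: when the kernel runs at a non-zero ring under a hypervisor, popf will not restore IOPL, so it is reloaded explicitly. A condensed sketch of the resulting __switch_to() ordering (paraphrased from the hunks above, not a drop-in replacement; prev/next are the thread structs):

        savesegment(gs, prev->gs);      /* %fs was already saved on kernel entry */
        load_TLS(next, cpu);            /* rewrite the per-thread GDT descriptors */
        if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
                set_iopl_mask(next->iopl);
        /* ... debug register and I/O bitmap handling elided ... */
        arch_leave_lazy_cpu_mode();     /* flush queued hypercalls before %gs is reloaded */
        if (next_p->fpu_counter > 5)
                math_state_restore();
        if (prev->gs | next->gs)        /* skip the segment reload when both are zero */
                loadsegment(gs, next->gs);
        write_pda(pcurrent, next_p);
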
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index af8aabe85800..4a8f8a259723 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -89,14 +89,14 @@ static int putreg(struct task_struct *child,
89 unsigned long regno, unsigned long value) 89 unsigned long regno, unsigned long value)
90{ 90{
91 switch (regno >> 2) { 91 switch (regno >> 2) {
92 case FS: 92 case GS:
93 if (value && (value & 3) != 3) 93 if (value && (value & 3) != 3)
94 return -EIO; 94 return -EIO;
95 child->thread.fs = value; 95 child->thread.gs = value;
96 return 0; 96 return 0;
97 case DS: 97 case DS:
98 case ES: 98 case ES:
99 case GS: 99 case FS:
100 if (value && (value & 3) != 3) 100 if (value && (value & 3) != 3)
101 return -EIO; 101 return -EIO;
102 value &= 0xffff; 102 value &= 0xffff;
@@ -112,7 +112,7 @@ static int putreg(struct task_struct *child,
112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; 112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
113 break; 113 break;
114 } 114 }
115 if (regno > ES*4) 115 if (regno > FS*4)
116 regno -= 1*4; 116 regno -= 1*4;
117 put_stack_long(child, regno, value); 117 put_stack_long(child, regno, value);
118 return 0; 118 return 0;
@@ -124,18 +124,18 @@ static unsigned long getreg(struct task_struct *child,
124 unsigned long retval = ~0UL; 124 unsigned long retval = ~0UL;
125 125
126 switch (regno >> 2) { 126 switch (regno >> 2) {
127 case FS: 127 case GS:
128 retval = child->thread.fs; 128 retval = child->thread.gs;
129 break; 129 break;
130 case DS: 130 case DS:
131 case ES: 131 case ES:
132 case GS: 132 case FS:
133 case SS: 133 case SS:
134 case CS: 134 case CS:
135 retval = 0xffff; 135 retval = 0xffff;
136 /* fall through */ 136 /* fall through */
137 default: 137 default:
138 if (regno > ES*4) 138 if (regno > FS*4)
139 regno -= 1*4; 139 regno -= 1*4;
140 retval &= get_stack_long(child, regno); 140 retval &= get_stack_long(child, regno);
141 } 141 }
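
The ptrace changes mirror the new pt_regs layout: %fs now has a slot in pt_regs (it is saved on the kernel stack), while %gs lives only in the thread struct, so the GS case is the one that reads/writes thread.gs and the "skip one word" adjustment moves up from ES to FS. Roughly, with the register indices being the ptrace/user_regs_struct ones (the layout shown is an assumption to be checked against asm/ptrace.h):

        /*
         * user_regs_struct: ... ds, es, fs, gs, orig_eax, eip, cs, eflags, esp, ss
         * pt_regs (new):    ... ds, es, fs,     orig_eax, eip, cs, eflags, esp, ss
         *
         * %gs has no slot in pt_regs, so every ptrace offset beyond FS must be
         * shifted down one word before indexing the saved kernel stack frame:
         */
        if (regno > FS*4)
                regno -= 1*4;
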
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 4694ac980cd2..122623dcc6e1 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -33,7 +33,6 @@
33#include <linux/initrd.h> 33#include <linux/initrd.h>
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/platform_device.h>
37#include <linux/console.h> 36#include <linux/console.h>
38#include <linux/mca.h> 37#include <linux/mca.h>
39#include <linux/root_dev.h> 38#include <linux/root_dev.h>
@@ -60,6 +59,7 @@
60#include <asm/io_apic.h> 59#include <asm/io_apic.h>
61#include <asm/ist.h> 60#include <asm/ist.h>
62#include <asm/io.h> 61#include <asm/io.h>
62#include <asm/vmi.h>
63#include <setup_arch.h> 63#include <setup_arch.h>
64#include <bios_ebda.h> 64#include <bios_ebda.h>
65 65
@@ -581,6 +581,14 @@ void __init setup_arch(char **cmdline_p)
581 581
582 max_low_pfn = setup_memory(); 582 max_low_pfn = setup_memory();
583 583
584#ifdef CONFIG_VMI
585 /*
586 * Must be after max_low_pfn is determined, and before kernel
587 * pagetables are setup.
588 */
589 vmi_init();
590#endif
591
584 /* 592 /*
585 * NOTE: before this point _nobody_ is allowed to allocate 593 * NOTE: before this point _nobody_ is allowed to allocate
586 * any memory using the bootmem allocator. Although the 594 * any memory using the bootmem allocator. Although the
@@ -651,28 +659,3 @@ void __init setup_arch(char **cmdline_p)
651#endif 659#endif
652 tsc_init(); 660 tsc_init();
653} 661}
654
655static __init int add_pcspkr(void)
656{
657 struct platform_device *pd;
658 int ret;
659
660 pd = platform_device_alloc("pcspkr", -1);
661 if (!pd)
662 return -ENOMEM;
663
664 ret = platform_device_add(pd);
665 if (ret)
666 platform_device_put(pd);
667
668 return ret;
669}
670device_initcall(add_pcspkr);
671
672/*
673 * Local Variables:
674 * mode:c
675 * c-file-style:"k&r"
676 * c-basic-offset:8
677 * End:
678 */
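
Two things happen in setup.c: vmi_init() is probed early (after max_low_pfn is known but before the kernel page tables are built, as the new comment says), and the PC-speaker platform device registration is evicted from setup_arch()'s file, presumably into a small dedicated file. The relocated initcall would look essentially like the deleted lines, e.g. (a sketch reusing the removed function):

        static __init int add_pcspkr(void)
        {
                struct platform_device *pd;
                int ret;

                pd = platform_device_alloc("pcspkr", -1);
                if (!pd)
                        return -ENOMEM;
                ret = platform_device_add(pd);
                if (ret)
                        platform_device_put(pd);
                return ret;
        }
        device_initcall(add_pcspkr);
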
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 65d7620eaa09..4f99e870c986 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/elf.h> 23#include <linux/elf.h>
24#include <linux/binfmts.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/ucontext.h> 26#include <asm/ucontext.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
@@ -128,8 +129,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ 129 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) 130 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130 131
131 COPY_SEG(gs); 132 GET_SEG(gs);
132 GET_SEG(fs); 133 COPY_SEG(fs);
133 COPY_SEG(es); 134 COPY_SEG(es);
134 COPY_SEG(ds); 135 COPY_SEG(ds);
135 COPY(edi); 136 COPY(edi);
@@ -244,9 +245,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
244{ 245{
245 int tmp, err = 0; 246 int tmp, err = 0;
246 247
247 err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs); 248 err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
248 savesegment(fs, tmp); 249 savesegment(gs, tmp);
249 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 250 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
250 251
251 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); 252 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
252 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); 253 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
@@ -349,7 +350,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
349 goto give_sigsegv; 350 goto give_sigsegv;
350 } 351 }
351 352
352 restorer = (void *)VDSO_SYM(&__kernel_sigreturn); 353 if (current->binfmt->hasvdso)
354 restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
355 else
356 restorer = (void *)&frame->retcode;
353 if (ka->sa.sa_flags & SA_RESTORER) 357 if (ka->sa.sa_flags & SA_RESTORER)
354 restorer = ka->sa.sa_restorer; 358 restorer = ka->sa.sa_restorer;
355 359
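
The signal.c hunks track the same fs/gs swap (the sigcontext %fs now comes straight from pt_regs and %gs is read live with savesegment), and setup_frame() stops assuming a vDSO is always mapped. The restorer selection, condensed from the hunk (frame->retcode is the on-stack sigreturn trampoline that setup_frame() fills in a few lines further down):

        restorer = (void *)&frame->retcode;             /* fallback when there is no vDSO */
        if (current->binfmt->hasvdso)
                restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
        if (ka->sa.sa_flags & SA_RESTORER)
                restorer = ka->sa.sa_restorer;          /* an explicit SA_RESTORER still wins */
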
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 5285aff8367f..9bd9637ae692 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -23,6 +23,7 @@
23 23
24#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
26#include <asm/idle.h>
26#include <mach_apic.h> 27#include <mach_apic.h>
27 28
28/* 29/*
@@ -374,8 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
374 /* 375 /*
375 * i'm not happy about this global shared spinlock in the 376 * i'm not happy about this global shared spinlock in the
376 * MM hot path, but we'll see how contended it is. 377 * MM hot path, but we'll see how contended it is.
377 * Temporarily this turns IRQs off, so that lockups are 378 * AK: x86-64 has a faster method that could be ported.
378 * detected by the NMI watchdog.
379 */ 379 */
380 spin_lock(&tlbstate_lock); 380 spin_lock(&tlbstate_lock);
381 381
@@ -400,7 +400,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
400 400
401 while (!cpus_empty(flush_cpumask)) 401 while (!cpus_empty(flush_cpumask))
402 /* nothing. lockup detection does not belong here */ 402 /* nothing. lockup detection does not belong here */
403 mb(); 403 cpu_relax();
404 404
405 flush_mm = NULL; 405 flush_mm = NULL;
406 flush_va = 0; 406 flush_va = 0;
@@ -624,6 +624,7 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
624 /* 624 /*
625 * At this point the info structure may be out of scope unless wait==1 625 * At this point the info structure may be out of scope unless wait==1
626 */ 626 */
627 exit_idle();
627 irq_enter(); 628 irq_enter();
628 (*func)(info); 629 (*func)(info);
629 irq_exit(); 630 irq_exit();
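
Two small smp.c cleanups: exit_idle() is called before irq_enter() so the idle-notifier machinery (see the new <asm/idle.h> include) sees the CPU leave idle, and the busy-wait on flush_cpumask spins with cpu_relax() instead of a full mb(). On i386 cpu_relax() is the PAUSE instruction, which throttles the spin loop and is friendlier to hyper-threaded siblings; roughly (the real definition lives in asm/processor.h):

        static inline void rep_nop(void)
        {
                __asm__ __volatile__("rep; nop" ::: "memory");  /* PAUSE */
        }
        #define cpu_relax()     rep_nop()
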
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 8c6c8c52b95c..48bfcaa13ecc 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -63,6 +63,7 @@
63#include <mach_apic.h> 63#include <mach_apic.h>
64#include <mach_wakecpu.h> 64#include <mach_wakecpu.h>
65#include <smpboot_hooks.h> 65#include <smpboot_hooks.h>
66#include <asm/vmi.h>
66 67
67/* Set if we find a B stepping CPU */ 68/* Set if we find a B stepping CPU */
68static int __devinitdata smp_b_stepping; 69static int __devinitdata smp_b_stepping;
@@ -93,12 +94,6 @@ cpumask_t cpu_possible_map;
93EXPORT_SYMBOL(cpu_possible_map); 94EXPORT_SYMBOL(cpu_possible_map);
94static cpumask_t smp_commenced_mask; 95static cpumask_t smp_commenced_mask;
95 96
96/* TSC's upper 32 bits can't be written on earlier CPUs (before Prescott), there
97 * is no way to resync one AP against BP. TBD: for prescott and above, we
98 * should use IA64's algorithm
99 */
100static int __devinitdata tsc_sync_disabled;
101
102/* Per CPU bogomips and other parameters */ 97/* Per CPU bogomips and other parameters */
103struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; 98struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
104EXPORT_SYMBOL(cpu_data); 99EXPORT_SYMBOL(cpu_data);
@@ -215,151 +210,6 @@ valid_k7:
215 ; 210 ;
216} 211}
217 212
218/*
219 * TSC synchronization.
220 *
221 * We first check whether all CPUs have their TSC's synchronized,
222 * then we print a warning if not, and always resync.
223 */
224
225static struct {
226 atomic_t start_flag;
227 atomic_t count_start;
228 atomic_t count_stop;
229 unsigned long long values[NR_CPUS];
230} tsc __cpuinitdata = {
231 .start_flag = ATOMIC_INIT(0),
232 .count_start = ATOMIC_INIT(0),
233 .count_stop = ATOMIC_INIT(0),
234};
235
236#define NR_LOOPS 5
237
238static void __init synchronize_tsc_bp(void)
239{
240 int i;
241 unsigned long long t0;
242 unsigned long long sum, avg;
243 long long delta;
244 unsigned int one_usec;
245 int buggy = 0;
246
247 printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
248
249 /* convert from kcyc/sec to cyc/usec */
250 one_usec = cpu_khz / 1000;
251
252 atomic_set(&tsc.start_flag, 1);
253 wmb();
254
255 /*
256 * We loop a few times to get a primed instruction cache,
257 * then the last pass is more or less synchronized and
258 * the BP and APs set their cycle counters to zero all at
259 * once. This reduces the chance of having random offsets
260 * between the processors, and guarantees that the maximum
261 * delay between the cycle counters is never bigger than
262 * the latency of information-passing (cachelines) between
263 * two CPUs.
264 */
265 for (i = 0; i < NR_LOOPS; i++) {
266 /*
267 * all APs synchronize but they loop on '== num_cpus'
268 */
269 while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
270 cpu_relax();
271 atomic_set(&tsc.count_stop, 0);
272 wmb();
273 /*
274 * this lets the APs save their current TSC:
275 */
276 atomic_inc(&tsc.count_start);
277
278 rdtscll(tsc.values[smp_processor_id()]);
279 /*
280 * We clear the TSC in the last loop:
281 */
282 if (i == NR_LOOPS-1)
283 write_tsc(0, 0);
284
285 /*
286 * Wait for all APs to leave the synchronization point:
287 */
288 while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
289 cpu_relax();
290 atomic_set(&tsc.count_start, 0);
291 wmb();
292 atomic_inc(&tsc.count_stop);
293 }
294
295 sum = 0;
296 for (i = 0; i < NR_CPUS; i++) {
297 if (cpu_isset(i, cpu_callout_map)) {
298 t0 = tsc.values[i];
299 sum += t0;
300 }
301 }
302 avg = sum;
303 do_div(avg, num_booting_cpus());
304
305 for (i = 0; i < NR_CPUS; i++) {
306 if (!cpu_isset(i, cpu_callout_map))
307 continue;
308 delta = tsc.values[i] - avg;
309 if (delta < 0)
310 delta = -delta;
311 /*
312 * We report bigger than 2 microseconds clock differences.
313 */
314 if (delta > 2*one_usec) {
315 long long realdelta;
316
317 if (!buggy) {
318 buggy = 1;
319 printk("\n");
320 }
321 realdelta = delta;
322 do_div(realdelta, one_usec);
323 if (tsc.values[i] < avg)
324 realdelta = -realdelta;
325
326 if (realdelta)
327 printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
328 "skew, fixed it up.\n", i, realdelta);
329 }
330 }
331 if (!buggy)
332 printk("passed.\n");
333}
334
335static void __cpuinit synchronize_tsc_ap(void)
336{
337 int i;
338
339 /*
340 * Not every cpu is online at the time
341 * this gets called, so we first wait for the BP to
342 * finish SMP initialization:
343 */
344 while (!atomic_read(&tsc.start_flag))
345 cpu_relax();
346
347 for (i = 0; i < NR_LOOPS; i++) {
348 atomic_inc(&tsc.count_start);
349 while (atomic_read(&tsc.count_start) != num_booting_cpus())
350 cpu_relax();
351
352 rdtscll(tsc.values[smp_processor_id()]);
353 if (i == NR_LOOPS-1)
354 write_tsc(0, 0);
355
356 atomic_inc(&tsc.count_stop);
357 while (atomic_read(&tsc.count_stop) != num_booting_cpus())
358 cpu_relax();
359 }
360}
361#undef NR_LOOPS
362
363extern void calibrate_delay(void); 213extern void calibrate_delay(void);
364 214
365static atomic_t init_deasserted; 215static atomic_t init_deasserted;
@@ -437,20 +287,12 @@ static void __cpuinit smp_callin(void)
437 /* 287 /*
438 * Save our processor parameters 288 * Save our processor parameters
439 */ 289 */
440 smp_store_cpu_info(cpuid); 290 smp_store_cpu_info(cpuid);
441
442 disable_APIC_timer();
443 291
444 /* 292 /*
445 * Allow the master to continue. 293 * Allow the master to continue.
446 */ 294 */
447 cpu_set(cpuid, cpu_callin_map); 295 cpu_set(cpuid, cpu_callin_map);
448
449 /*
450 * Synchronize the TSC with the BP
451 */
452 if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
453 synchronize_tsc_ap();
454} 296}
455 297
456static int cpucount; 298static int cpucount;
@@ -545,18 +387,25 @@ static void __cpuinit start_secondary(void *unused)
545 * booting is too fragile that we want to limit the 387 * booting is too fragile that we want to limit the
546 * things done here to the most necessary things. 388 * things done here to the most necessary things.
547 */ 389 */
390#ifdef CONFIG_VMI
391 vmi_bringup();
392#endif
548 secondary_cpu_init(); 393 secondary_cpu_init();
549 preempt_disable(); 394 preempt_disable();
550 smp_callin(); 395 smp_callin();
551 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 396 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
552 rep_nop(); 397 rep_nop();
553 setup_secondary_APIC_clock(); 398 /*
399 * Check TSC synchronization with the BP:
400 */
401 check_tsc_sync_target();
402
403 setup_secondary_clock();
554 if (nmi_watchdog == NMI_IO_APIC) { 404 if (nmi_watchdog == NMI_IO_APIC) {
555 disable_8259A_irq(0); 405 disable_8259A_irq(0);
556 enable_NMI_through_LVT0(NULL); 406 enable_NMI_through_LVT0(NULL);
557 enable_8259A_irq(0); 407 enable_8259A_irq(0);
558 } 408 }
559 enable_APIC_timer();
560 /* 409 /*
561 * low-memory mappings have been cleared, flush them from 410 * low-memory mappings have been cleared, flush them from
562 * the local TLBs too. 411 * the local TLBs too.
@@ -619,7 +468,6 @@ extern struct {
619 unsigned short ss; 468 unsigned short ss;
620} stack_start; 469} stack_start;
621extern struct i386_pda *start_pda; 470extern struct i386_pda *start_pda;
622extern struct Xgt_desc_struct cpu_gdt_descr;
623 471
624#ifdef CONFIG_NUMA 472#ifdef CONFIG_NUMA
625 473
@@ -749,7 +597,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
749 /* 597 /*
750 * Due to the Pentium erratum 3AP. 598 * Due to the Pentium erratum 3AP.
751 */ 599 */
752 maxlvt = get_maxlvt(); 600 maxlvt = lapic_get_maxlvt();
753 if (maxlvt > 3) { 601 if (maxlvt > 3) {
754 apic_read_around(APIC_SPIV); 602 apic_read_around(APIC_SPIV);
755 apic_write(APIC_ESR, 0); 603 apic_write(APIC_ESR, 0);
@@ -835,11 +683,18 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
835 num_starts = 0; 683 num_starts = 0;
836 684
837 /* 685 /*
686 * Paravirt / VMI wants a startup IPI hook here to set up the
687 * target processor state.
688 */
689 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
690 (unsigned long) stack_start.esp);
691
692 /*
838 * Run STARTUP IPI loop. 693 * Run STARTUP IPI loop.
839 */ 694 */
840 Dprintk("#startup loops: %d.\n", num_starts); 695 Dprintk("#startup loops: %d.\n", num_starts);
841 696
842 maxlvt = get_maxlvt(); 697 maxlvt = lapic_get_maxlvt();
843 698
844 for (j = 1; j <= num_starts; j++) { 699 for (j = 1; j <= num_starts; j++) {
845 Dprintk("Sending STARTUP #%d.\n",j); 700 Dprintk("Sending STARTUP #%d.\n",j);
@@ -1115,8 +970,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1115 info.cpu = cpu; 970 info.cpu = cpu;
1116 INIT_WORK(&info.task, do_warm_boot_cpu); 971 INIT_WORK(&info.task, do_warm_boot_cpu);
1117 972
1118 tsc_sync_disabled = 1;
1119
1120 /* init low mem mapping */ 973 /* init low mem mapping */
1121 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 974 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
1122 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); 975 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
@@ -1124,7 +977,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1124 schedule_work(&info.task); 977 schedule_work(&info.task);
1125 wait_for_completion(&done); 978 wait_for_completion(&done);
1126 979
1127 tsc_sync_disabled = 0;
1128 zap_low_mappings(); 980 zap_low_mappings();
1129 ret = 0; 981 ret = 0;
1130exit: 982exit:
@@ -1320,13 +1172,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1320 1172
1321 smpboot_setup_io_apic(); 1173 smpboot_setup_io_apic();
1322 1174
1323 setup_boot_APIC_clock(); 1175 setup_boot_clock();
1324
1325 /*
1326 * Synchronize the TSC with the AP
1327 */
1328 if (cpu_has_tsc && cpucount && cpu_khz)
1329 synchronize_tsc_bp();
1330} 1176}
1331 1177
1332/* These are wrappers to interface to the new boot process. Someone 1178/* These are wrappers to interface to the new boot process. Someone
@@ -1461,9 +1307,16 @@ int __cpuinit __cpu_up(unsigned int cpu)
1461 } 1307 }
1462 1308
1463 local_irq_enable(); 1309 local_irq_enable();
1310
1464 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 1311 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
1465 /* Unleash the CPU! */ 1312 /* Unleash the CPU! */
1466 cpu_set(cpu, smp_commenced_mask); 1313 cpu_set(cpu, smp_commenced_mask);
1314
1315 /*
1316 * Check TSC synchronization with the AP:
1317 */
1318 check_tsc_sync_source(cpu);
1319
1467 while (!cpu_isset(cpu, cpu_online_map)) 1320 while (!cpu_isset(cpu, cpu_online_map))
1468 cpu_relax(); 1321 cpu_relax();
1469 1322
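
The per-arch "write both TSCs to zero" loop deleted above is replaced by a pure check: nothing is rewritten any more, and if the pair is found out of sync the TSC is simply marked unstable rather than "fixed up". The call pairing, lifted from the hunks above:

        /* AP side, in start_secondary(), after smp_commenced_mask is set */
        check_tsc_sync_target();

        /* BP side, in __cpu_up(), right after the AP is unleashed */
        check_tsc_sync_source(cpu);
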
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index bc882a2b1db6..13ca54a85a1c 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -78,7 +78,7 @@ int __init sysenter_setup(void)
78 syscall_pages[0] = virt_to_page(syscall_page); 78 syscall_pages[0] = virt_to_page(syscall_page);
79 79
80#ifdef CONFIG_COMPAT_VDSO 80#ifdef CONFIG_COMPAT_VDSO
81 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); 81 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC);
82 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); 82 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
83#endif 83#endif
84 84
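
The compat vDSO fixmap now uses an executable protection: user space jumps into this page for the sysenter return path and the sigreturn trampoline, so once NX is honoured the mapping must not carry the no-execute bit. In effect (illustrative only; the real pgprot definitions are in asm/pgtable.h):

        /* PAGE_READONLY picks up _PAGE_NX when NX is supported; the _EXEC variant does not */
        __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC);
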
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index c505b16c0990..a5350059557a 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -131,15 +131,13 @@ unsigned long profile_pc(struct pt_regs *regs)
131 unsigned long pc = instruction_pointer(regs); 131 unsigned long pc = instruction_pointer(regs);
132 132
133#ifdef CONFIG_SMP 133#ifdef CONFIG_SMP
134 if (!user_mode_vm(regs) && in_lock_functions(pc)) { 134 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
135 in_lock_functions(pc)) {
135#ifdef CONFIG_FRAME_POINTER 136#ifdef CONFIG_FRAME_POINTER
136 return *(unsigned long *)(regs->ebp + 4); 137 return *(unsigned long *)(regs->ebp + 4);
137#else 138#else
138 unsigned long *sp; 139 unsigned long *sp = (unsigned long *)&regs->esp;
139 if ((regs->xcs & 3) == 0) 140
140 sp = (unsigned long *)&regs->esp;
141 else
142 sp = (unsigned long *)regs->esp;
143 /* Return address is either directly at stack pointer 141 /* Return address is either directly at stack pointer
144 or above a saved eflags. Eflags has bits 22-31 zero, 142 or above a saved eflags. Eflags has bits 22-31 zero,
145 kernel addresses don't. */ 143 kernel addresses don't. */
@@ -161,15 +159,6 @@ EXPORT_SYMBOL(profile_pc);
161 */ 159 */
162irqreturn_t timer_interrupt(int irq, void *dev_id) 160irqreturn_t timer_interrupt(int irq, void *dev_id)
163{ 161{
164 /*
165 * Here we are in the timer irq handler. We just have irqs locally
166 * disabled but we don't know if the timer_bh is running on the other
167 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
168 * the irq version of write_lock because as just said we have irq
169 * locally disabled. -arca
170 */
171 write_seqlock(&xtime_lock);
172
173#ifdef CONFIG_X86_IO_APIC 162#ifdef CONFIG_X86_IO_APIC
174 if (timer_ack) { 163 if (timer_ack) {
175 /* 164 /*
@@ -188,7 +177,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
188 177
189 do_timer_interrupt_hook(); 178 do_timer_interrupt_hook();
190 179
191
192 if (MCA_bus) { 180 if (MCA_bus) {
193 /* The PS/2 uses level-triggered interrupts. You can't 181 /* The PS/2 uses level-triggered interrupts. You can't
194 turn them off, nor would you want to (any attempt to 182 turn them off, nor would you want to (any attempt to
@@ -203,18 +191,11 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
203 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ 191 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
204 } 192 }
205 193
206 write_sequnlock(&xtime_lock);
207
208#ifdef CONFIG_X86_LOCAL_APIC
209 if (using_apic_timer)
210 smp_send_timer_broadcast_ipi();
211#endif
212
213 return IRQ_HANDLED; 194 return IRQ_HANDLED;
214} 195}
215 196
216/* not static: needed by APM */ 197/* not static: needed by APM */
217unsigned long get_cmos_time(void) 198unsigned long read_persistent_clock(void)
218{ 199{
219 unsigned long retval; 200 unsigned long retval;
220 unsigned long flags; 201 unsigned long flags;
@@ -227,11 +208,11 @@ unsigned long get_cmos_time(void)
227 208
228 return retval; 209 return retval;
229} 210}
230EXPORT_SYMBOL(get_cmos_time);
231 211
232static void sync_cmos_clock(unsigned long dummy); 212static void sync_cmos_clock(unsigned long dummy);
233 213
234static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); 214static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
215int no_sync_cmos_clock;
235 216
236static void sync_cmos_clock(unsigned long dummy) 217static void sync_cmos_clock(unsigned long dummy)
237{ 218{
@@ -275,117 +256,20 @@ static void sync_cmos_clock(unsigned long dummy)
275 256
276void notify_arch_cmos_timer(void) 257void notify_arch_cmos_timer(void)
277{ 258{
278 mod_timer(&sync_cmos_timer, jiffies + 1); 259 if (!no_sync_cmos_clock)
279} 260 mod_timer(&sync_cmos_timer, jiffies + 1);
280
281static long clock_cmos_diff;
282static unsigned long sleep_start;
283
284static int timer_suspend(struct sys_device *dev, pm_message_t state)
285{
286 /*
287 * Estimate time zone so that set_time can update the clock
288 */
289 unsigned long ctime = get_cmos_time();
290
291 clock_cmos_diff = -ctime;
292 clock_cmos_diff += get_seconds();
293 sleep_start = ctime;
294 return 0;
295}
296
297static int timer_resume(struct sys_device *dev)
298{
299 unsigned long flags;
300 unsigned long sec;
301 unsigned long ctime = get_cmos_time();
302 long sleep_length = (ctime - sleep_start) * HZ;
303 struct timespec ts;
304
305 if (sleep_length < 0) {
306 printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
307 /* The time after the resume must not be earlier than the time
308 * before the suspend or some nasty things will happen
309 */
310 sleep_length = 0;
311 ctime = sleep_start;
312 }
313#ifdef CONFIG_HPET_TIMER
314 if (is_hpet_enabled())
315 hpet_reenable();
316#endif
317 setup_pit_timer();
318
319 sec = ctime + clock_cmos_diff;
320 ts.tv_sec = sec;
321 ts.tv_nsec = 0;
322 do_settimeofday(&ts);
323 write_seqlock_irqsave(&xtime_lock, flags);
324 jiffies_64 += sleep_length;
325 write_sequnlock_irqrestore(&xtime_lock, flags);
326 touch_softlockup_watchdog();
327 return 0;
328}
329
330static struct sysdev_class timer_sysclass = {
331 .resume = timer_resume,
332 .suspend = timer_suspend,
333 set_kset_name("timer"),
334};
335
336
337/* XXX this driverfs stuff should probably go elsewhere later -john */
338static struct sys_device device_timer = {
339 .id = 0,
340 .cls = &timer_sysclass,
341};
342
343static int time_init_device(void)
344{
345 int error = sysdev_class_register(&timer_sysclass);
346 if (!error)
347 error = sysdev_register(&device_timer);
348 return error;
349} 261}
350 262
351device_initcall(time_init_device);
352
353#ifdef CONFIG_HPET_TIMER
354extern void (*late_time_init)(void); 263extern void (*late_time_init)(void);
355/* Duplicate of time_init() below, with hpet_enable part added */ 264/* Duplicate of time_init() below, with hpet_enable part added */
356static void __init hpet_time_init(void) 265static void __init hpet_time_init(void)
357{ 266{
358 struct timespec ts; 267 if (!hpet_enable())
359 ts.tv_sec = get_cmos_time(); 268 setup_pit_timer();
360 ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
361
362 do_settimeofday(&ts);
363
364 if ((hpet_enable() >= 0) && hpet_use_timer) {
365 printk("Using HPET for base-timer\n");
366 }
367
368 do_time_init(); 269 do_time_init();
369} 270}
370#endif
371 271
372void __init time_init(void) 272void __init time_init(void)
373{ 273{
374 struct timespec ts; 274 late_time_init = hpet_time_init;
375#ifdef CONFIG_HPET_TIMER
376 if (is_hpet_capable()) {
377 /*
378 * HPET initialization needs to do memory-mapped io. So, let
379 * us do a late initialization after mem_init().
380 */
381 late_time_init = hpet_time_init;
382 return;
383 }
384#endif
385 ts.tv_sec = get_cmos_time();
386 ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
387
388 do_settimeofday(&ts);
389
390 do_time_init();
391} 275}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
deleted file mode 100644
index 1e4702dfcd01..000000000000
--- a/arch/i386/kernel/time_hpet.c
+++ /dev/null
@@ -1,497 +0,0 @@
1/*
2 * linux/arch/i386/kernel/time_hpet.c
3 * This code largely copied from arch/x86_64/kernel/time.c
4 * See that file for credits.
5 *
6 * 2003-06-30 Venkatesh Pallipadi - Additional changes for HPET support
7 */
8
9#include <linux/errno.h>
10#include <linux/kernel.h>
11#include <linux/param.h>
12#include <linux/string.h>
13#include <linux/init.h>
14#include <linux/smp.h>
15
16#include <asm/timer.h>
17#include <asm/fixmap.h>
18#include <asm/apic.h>
19
20#include <linux/timex.h>
21
22#include <asm/hpet.h>
23#include <linux/hpet.h>
24
25static unsigned long hpet_period; /* fsecs / HPET clock */
26unsigned long hpet_tick; /* hpet clks count per tick */
27unsigned long hpet_address; /* hpet memory map physical address */
28int hpet_use_timer;
29
30static int use_hpet; /* can be used for runtime check of hpet */
31static int boot_hpet_disable; /* boottime override for HPET timer */
32static void __iomem * hpet_virt_address; /* hpet kernel virtual address */
33
34#define FSEC_TO_USEC (1000000000UL)
35
36int hpet_readl(unsigned long a)
37{
38 return readl(hpet_virt_address + a);
39}
40
41static void hpet_writel(unsigned long d, unsigned long a)
42{
43 writel(d, hpet_virt_address + a);
44}
45
46#ifdef CONFIG_X86_LOCAL_APIC
47/*
48 * HPET counters don't wrap around on every tick. They just change the
49 * comparator value and continue. Next tick can be caught by checking
50 * for a change in the comparator value. Used in apic.c.
51 */
52static void __devinit wait_hpet_tick(void)
53{
54 unsigned int start_cmp_val, end_cmp_val;
55
56 start_cmp_val = hpet_readl(HPET_T0_CMP);
57 do {
58 end_cmp_val = hpet_readl(HPET_T0_CMP);
59 } while (start_cmp_val == end_cmp_val);
60}
61#endif
62
63static int hpet_timer_stop_set_go(unsigned long tick)
64{
65 unsigned int cfg;
66
67 /*
68 * Stop the timers and reset the main counter.
69 */
70 cfg = hpet_readl(HPET_CFG);
71 cfg &= ~HPET_CFG_ENABLE;
72 hpet_writel(cfg, HPET_CFG);
73 hpet_writel(0, HPET_COUNTER);
74 hpet_writel(0, HPET_COUNTER + 4);
75
76 if (hpet_use_timer) {
77 /*
78 * Set up timer 0, as periodic with first interrupt to happen at
79 * hpet_tick, and period also hpet_tick.
80 */
81 cfg = hpet_readl(HPET_T0_CFG);
82 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
83 HPET_TN_SETVAL | HPET_TN_32BIT;
84 hpet_writel(cfg, HPET_T0_CFG);
85
86 /*
87 * The first write after writing TN_SETVAL to the config register sets
88 * the counter value, the second write sets the threshold.
89 */
90 hpet_writel(tick, HPET_T0_CMP);
91 hpet_writel(tick, HPET_T0_CMP);
92 }
93 /*
94 * Go!
95 */
96 cfg = hpet_readl(HPET_CFG);
97 if (hpet_use_timer)
98 cfg |= HPET_CFG_LEGACY;
99 cfg |= HPET_CFG_ENABLE;
100 hpet_writel(cfg, HPET_CFG);
101
102 return 0;
103}
104
105/*
106 * Check whether HPET was found by ACPI boot parse. If yes setup HPET
107 * counter 0 for kernel base timer.
108 */
109int __init hpet_enable(void)
110{
111 unsigned int id;
112 unsigned long tick_fsec_low, tick_fsec_high; /* tick in femto sec */
113 unsigned long hpet_tick_rem;
114
115 if (boot_hpet_disable)
116 return -1;
117
118 if (!hpet_address) {
119 return -1;
120 }
121 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
122 /*
123 * Read the period, compute tick and quotient.
124 */
125 id = hpet_readl(HPET_ID);
126
127 /*
128 * We are checking for value '1' or more in number field if
129 * CONFIG_HPET_EMULATE_RTC is set because we will need an
130 * additional timer for RTC emulation.
131 * However, we can do with one timer otherwise using the
132 * the single HPET timer for system time.
133 */
134#ifdef CONFIG_HPET_EMULATE_RTC
135 if (!(id & HPET_ID_NUMBER)) {
136 iounmap(hpet_virt_address);
137 hpet_virt_address = NULL;
138 return -1;
139 }
140#endif
141
142
143 hpet_period = hpet_readl(HPET_PERIOD);
144 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
145 iounmap(hpet_virt_address);
146 hpet_virt_address = NULL;
147 return -1;
148 }
149
150 /*
151 * 64 bit math
152 * First changing tick into fsec
153 * Then 64 bit div to find number of hpet clk per tick
154 */
155 ASM_MUL64_REG(tick_fsec_low, tick_fsec_high,
156 KERNEL_TICK_USEC, FSEC_TO_USEC);
157 ASM_DIV64_REG(hpet_tick, hpet_tick_rem,
158 hpet_period, tick_fsec_low, tick_fsec_high);
159
160 if (hpet_tick_rem > (hpet_period >> 1))
161 hpet_tick++; /* rounding the result */
162
163 hpet_use_timer = id & HPET_ID_LEGSUP;
164
165 if (hpet_timer_stop_set_go(hpet_tick)) {
166 iounmap(hpet_virt_address);
167 hpet_virt_address = NULL;
168 return -1;
169 }
170
171 use_hpet = 1;
172
173#ifdef CONFIG_HPET
174 {
175 struct hpet_data hd;
176 unsigned int ntimer;
177
178 memset(&hd, 0, sizeof (hd));
179
180 ntimer = hpet_readl(HPET_ID);
181 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
182 ntimer++;
183
184 /*
185 * Register with driver.
186 * Timer0 and Timer1 is used by platform.
187 */
188 hd.hd_phys_address = hpet_address;
189 hd.hd_address = hpet_virt_address;
190 hd.hd_nirqs = ntimer;
191 hd.hd_flags = HPET_DATA_PLATFORM;
192 hpet_reserve_timer(&hd, 0);
193#ifdef CONFIG_HPET_EMULATE_RTC
194 hpet_reserve_timer(&hd, 1);
195#endif
196 hd.hd_irq[0] = HPET_LEGACY_8254;
197 hd.hd_irq[1] = HPET_LEGACY_RTC;
198 if (ntimer > 2) {
199 struct hpet __iomem *hpet;
200 struct hpet_timer __iomem *timer;
201 int i;
202
203 hpet = hpet_virt_address;
204
205 for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
206 timer++, i++)
207 hd.hd_irq[i] = (timer->hpet_config &
208 Tn_INT_ROUTE_CNF_MASK) >>
209 Tn_INT_ROUTE_CNF_SHIFT;
210
211 }
212
213 hpet_alloc(&hd);
214 }
215#endif
216
217#ifdef CONFIG_X86_LOCAL_APIC
218 if (hpet_use_timer)
219 wait_timer_tick = wait_hpet_tick;
220#endif
221 return 0;
222}
223
224int hpet_reenable(void)
225{
226 return hpet_timer_stop_set_go(hpet_tick);
227}
228
229int is_hpet_enabled(void)
230{
231 return use_hpet;
232}
233
234int is_hpet_capable(void)
235{
236 if (!boot_hpet_disable && hpet_address)
237 return 1;
238 return 0;
239}
240
241static int __init hpet_setup(char* str)
242{
243 if (str) {
244 if (!strncmp("disable", str, 7))
245 boot_hpet_disable = 1;
246 }
247 return 1;
248}
249
250__setup("hpet=", hpet_setup);
251
252#ifdef CONFIG_HPET_EMULATE_RTC
253/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
254 * is enabled, we support RTC interrupt functionality in software.
255 * RTC has 3 kinds of interrupts:
256 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
257 * is updated
258 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
259 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
260 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
261 * (1) and (2) above are implemented using polling at a frequency of
262 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
263 * overhead. (DEFAULT_RTC_INT_FREQ)
264 * For (3), we use interrupts at 64Hz or user specified periodic
265 * frequency, whichever is higher.
266 */
267#include <linux/mc146818rtc.h>
268#include <linux/rtc.h>
269
270#define DEFAULT_RTC_INT_FREQ 64
271#define RTC_NUM_INTS 1
272
273static unsigned long UIE_on;
274static unsigned long prev_update_sec;
275
276static unsigned long AIE_on;
277static struct rtc_time alarm_time;
278
279static unsigned long PIE_on;
280static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
281static unsigned long PIE_count;
282
283static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
284static unsigned int hpet_t1_cmp; /* cached comparator register */
285
286/*
287 * Timer 1 for RTC, we do not use periodic interrupt feature,
288 * even if HPET supports periodic interrupts on Timer 1.
289 * The reason being, to set up a periodic interrupt in HPET, we need to
290 * stop the main counter. And if we do that every time someone disables/enables
291 * RTC, we will have adverse effect on main kernel timer running on Timer 0.
292 * So, for the time being, simulate the periodic interrupt in software.
293 *
294 * hpet_rtc_timer_init() is called for the first time and during subsequent
295 * interrupts reinit happens through hpet_rtc_timer_reinit().
296 */
297int hpet_rtc_timer_init(void)
298{
299 unsigned int cfg, cnt;
300 unsigned long flags;
301
302 if (!is_hpet_enabled())
303 return 0;
304 /*
305 * Set the counter 1 and enable the interrupts.
306 */
307 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
308 hpet_rtc_int_freq = PIE_freq;
309 else
310 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
311
312 local_irq_save(flags);
313
314 cnt = hpet_readl(HPET_COUNTER);
315 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
316 hpet_writel(cnt, HPET_T1_CMP);
317 hpet_t1_cmp = cnt;
318
319 cfg = hpet_readl(HPET_T1_CFG);
320 cfg &= ~HPET_TN_PERIODIC;
321 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
322 hpet_writel(cfg, HPET_T1_CFG);
323
324 local_irq_restore(flags);
325
326 return 1;
327}
328
329static void hpet_rtc_timer_reinit(void)
330{
331 unsigned int cfg, cnt, ticks_per_int, lost_ints;
332
333 if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
334 cfg = hpet_readl(HPET_T1_CFG);
335 cfg &= ~HPET_TN_ENABLE;
336 hpet_writel(cfg, HPET_T1_CFG);
337 return;
338 }
339
340 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
341 hpet_rtc_int_freq = PIE_freq;
342 else
343 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
344
345 /* It is more accurate to use the comparator value than current count.*/
346 ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
347 hpet_t1_cmp += ticks_per_int;
348 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
349
350 /*
351 * If the interrupt handler was delayed too long, the write above tries
352 * to schedule the next interrupt in the past and the hardware would
353 * not interrupt until the counter had wrapped around.
354 * So we have to check that the comparator wasn't set to a past time.
355 */
356 cnt = hpet_readl(HPET_COUNTER);
357 if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
358 lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
359 /* Make sure that, even with the time needed to execute
360 * this code, the next scheduled interrupt has been moved
361 * back to the future: */
362 lost_ints++;
363
364 hpet_t1_cmp += lost_ints * ticks_per_int;
365 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
366
367 if (PIE_on)
368 PIE_count += lost_ints;
369
370 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
371 hpet_rtc_int_freq);
372 }
373}
374
375/*
376 * The functions below are called from rtc driver.
377 * Return 0 if HPET is not being used.
378 * Otherwise do the necessary changes and return 1.
379 */
380int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
381{
382 if (!is_hpet_enabled())
383 return 0;
384
385 if (bit_mask & RTC_UIE)
386 UIE_on = 0;
387 if (bit_mask & RTC_PIE)
388 PIE_on = 0;
389 if (bit_mask & RTC_AIE)
390 AIE_on = 0;
391
392 return 1;
393}
394
395int hpet_set_rtc_irq_bit(unsigned long bit_mask)
396{
397 int timer_init_reqd = 0;
398
399 if (!is_hpet_enabled())
400 return 0;
401
402 if (!(PIE_on | AIE_on | UIE_on))
403 timer_init_reqd = 1;
404
405 if (bit_mask & RTC_UIE) {
406 UIE_on = 1;
407 }
408 if (bit_mask & RTC_PIE) {
409 PIE_on = 1;
410 PIE_count = 0;
411 }
412 if (bit_mask & RTC_AIE) {
413 AIE_on = 1;
414 }
415
416 if (timer_init_reqd)
417 hpet_rtc_timer_init();
418
419 return 1;
420}
421
422int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
423{
424 if (!is_hpet_enabled())
425 return 0;
426
427 alarm_time.tm_hour = hrs;
428 alarm_time.tm_min = min;
429 alarm_time.tm_sec = sec;
430
431 return 1;
432}
433
434int hpet_set_periodic_freq(unsigned long freq)
435{
436 if (!is_hpet_enabled())
437 return 0;
438
439 PIE_freq = freq;
440 PIE_count = 0;
441
442 return 1;
443}
444
445int hpet_rtc_dropped_irq(void)
446{
447 if (!is_hpet_enabled())
448 return 0;
449
450 return 1;
451}
452
453irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
454{
455 struct rtc_time curr_time;
456 unsigned long rtc_int_flag = 0;
457 int call_rtc_interrupt = 0;
458
459 hpet_rtc_timer_reinit();
460
461 if (UIE_on | AIE_on) {
462 rtc_get_rtc_time(&curr_time);
463 }
464 if (UIE_on) {
465 if (curr_time.tm_sec != prev_update_sec) {
466 /* Set update int info, call real rtc int routine */
467 call_rtc_interrupt = 1;
468 rtc_int_flag = RTC_UF;
469 prev_update_sec = curr_time.tm_sec;
470 }
471 }
472 if (PIE_on) {
473 PIE_count++;
474 if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
475 /* Set periodic int info, call real rtc int routine */
476 call_rtc_interrupt = 1;
477 rtc_int_flag |= RTC_PF;
478 PIE_count = 0;
479 }
480 }
481 if (AIE_on) {
482 if ((curr_time.tm_sec == alarm_time.tm_sec) &&
483 (curr_time.tm_min == alarm_time.tm_min) &&
484 (curr_time.tm_hour == alarm_time.tm_hour)) {
485 /* Set alarm int info, call real rtc int routine */
486 call_rtc_interrupt = 1;
487 rtc_int_flag |= RTC_AF;
488 }
489 }
490 if (call_rtc_interrupt) {
491 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
492 rtc_interrupt(rtc_int_flag, dev_id);
493 }
494 return IRQ_HANDLED;
495}
496#endif
497
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 0efad8aeb41a..af0d3f70a817 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -94,6 +94,7 @@ asmlinkage void spurious_interrupt_bug(void);
94asmlinkage void machine_check(void); 94asmlinkage void machine_check(void);
95 95
96int kstack_depth_to_print = 24; 96int kstack_depth_to_print = 24;
97static unsigned int code_bytes = 64;
97ATOMIC_NOTIFIER_HEAD(i386die_chain); 98ATOMIC_NOTIFIER_HEAD(i386die_chain);
98 99
99int register_die_notifier(struct notifier_block *nb) 100int register_die_notifier(struct notifier_block *nb)
@@ -291,10 +292,11 @@ void show_registers(struct pt_regs *regs)
291 int i; 292 int i;
292 int in_kernel = 1; 293 int in_kernel = 1;
293 unsigned long esp; 294 unsigned long esp;
294 unsigned short ss; 295 unsigned short ss, gs;
295 296
296 esp = (unsigned long) (&regs->esp); 297 esp = (unsigned long) (&regs->esp);
297 savesegment(ss, ss); 298 savesegment(ss, ss);
299 savesegment(gs, gs);
298 if (user_mode_vm(regs)) { 300 if (user_mode_vm(regs)) {
299 in_kernel = 0; 301 in_kernel = 0;
300 esp = regs->esp; 302 esp = regs->esp;
@@ -313,8 +315,8 @@ void show_registers(struct pt_regs *regs)
313 regs->eax, regs->ebx, regs->ecx, regs->edx); 315 regs->eax, regs->ebx, regs->ecx, regs->edx);
314 printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", 316 printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
315 regs->esi, regs->edi, regs->ebp, esp); 317 regs->esi, regs->edi, regs->ebp, esp);
316 printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", 318 printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
317 regs->xds & 0xffff, regs->xes & 0xffff, ss); 319 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
318 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", 320 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
319 TASK_COMM_LEN, current->comm, current->pid, 321 TASK_COMM_LEN, current->comm, current->pid,
320 current_thread_info(), current, current->thread_info); 322 current_thread_info(), current, current->thread_info);
@@ -324,7 +326,8 @@ void show_registers(struct pt_regs *regs)
324 */ 326 */
325 if (in_kernel) { 327 if (in_kernel) {
326 u8 *eip; 328 u8 *eip;
327 int code_bytes = 64; 329 unsigned int code_prologue = code_bytes * 43 / 64;
330 unsigned int code_len = code_bytes;
328 unsigned char c; 331 unsigned char c;
329 332
330 printk("\n" KERN_EMERG "Stack: "); 333 printk("\n" KERN_EMERG "Stack: ");
@@ -332,14 +335,14 @@ void show_registers(struct pt_regs *regs)
332 335
333 printk(KERN_EMERG "Code: "); 336 printk(KERN_EMERG "Code: ");
334 337
335 eip = (u8 *)regs->eip - 43; 338 eip = (u8 *)regs->eip - code_prologue;
336 if (eip < (u8 *)PAGE_OFFSET || 339 if (eip < (u8 *)PAGE_OFFSET ||
337 probe_kernel_address(eip, c)) { 340 probe_kernel_address(eip, c)) {
338 /* try starting at EIP */ 341 /* try starting at EIP */
339 eip = (u8 *)regs->eip; 342 eip = (u8 *)regs->eip;
340 code_bytes = 32; 343 code_len = code_len - code_prologue + 1;
341 } 344 }
342 for (i = 0; i < code_bytes; i++, eip++) { 345 for (i = 0; i < code_len; i++, eip++) {
343 if (eip < (u8 *)PAGE_OFFSET || 346 if (eip < (u8 *)PAGE_OFFSET ||
344 probe_kernel_address(eip, c)) { 347 probe_kernel_address(eip, c)) {
345 printk(" Bad EIP value."); 348 printk(" Bad EIP value.");
@@ -1191,3 +1194,13 @@ static int __init kstack_setup(char *s)
1191 return 1; 1194 return 1;
1192} 1195}
1193__setup("kstack=", kstack_setup); 1196__setup("kstack=", kstack_setup);
1197
1198static int __init code_bytes_setup(char *s)
1199{
1200 code_bytes = simple_strtoul(s, NULL, 0);
1201 if (code_bytes > 8192)
1202 code_bytes = 8192;
1203
1204 return 1;
1205}
1206__setup("code_bytes=", code_bytes_setup);
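
traps.c gains %fs/%gs in register dumps and makes the oops code dump size tunable: code_bytes (default 64, clamped to 8192 in code_bytes_setup()) replaces the hard-coded 43-bytes-before/21-bytes-after window, and the prologue is kept at the same 43/64 ratio of whatever size is chosen. To get a wider window around the faulting EIP one would boot with a kernel command-line entry such as:

        code_bytes=128
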
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 2cfc7b09b925..3082a418635c 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -23,6 +23,7 @@
23 * an extra value to store the TSC freq 23 * an extra value to store the TSC freq
24 */ 24 */
25unsigned int tsc_khz; 25unsigned int tsc_khz;
26unsigned long long (*custom_sched_clock)(void);
26 27
27int tsc_disable; 28int tsc_disable;
28 29
@@ -59,12 +60,6 @@ static inline int check_tsc_unstable(void)
59 return tsc_unstable; 60 return tsc_unstable;
60} 61}
61 62
62void mark_tsc_unstable(void)
63{
64 tsc_unstable = 1;
65}
66EXPORT_SYMBOL_GPL(mark_tsc_unstable);
67
68/* Accellerators for sched_clock() 63/* Accellerators for sched_clock()
69 * convert from cycles(64bits) => nanoseconds (64bits) 64 * convert from cycles(64bits) => nanoseconds (64bits)
70 * basic equation: 65 * basic equation:
@@ -107,14 +102,14 @@ unsigned long long sched_clock(void)
107{ 102{
108 unsigned long long this_offset; 103 unsigned long long this_offset;
109 104
105 if (unlikely(custom_sched_clock))
106 return (*custom_sched_clock)();
107
110 /* 108 /*
111 * in the NUMA case we dont use the TSC as they are not 109 * Fall back to jiffies if there's no TSC available:
112 * synchronized across all CPUs.
113 */ 110 */
114#ifndef CONFIG_NUMA 111 if (unlikely(tsc_disable))
115 if (!cpu_khz || check_tsc_unstable()) 112 /* No locking but a rare wrong value is not a big deal: */
116#endif
117 /* no locking but a rare wrong value is not a big deal */
118 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 113 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
119 114
120 /* read the Time Stamp Counter: */ 115 /* read the Time Stamp Counter: */
@@ -194,13 +189,13 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
194void __init tsc_init(void) 189void __init tsc_init(void)
195{ 190{
196 if (!cpu_has_tsc || tsc_disable) 191 if (!cpu_has_tsc || tsc_disable)
197 return; 192 goto out_no_tsc;
198 193
199 cpu_khz = calculate_cpu_khz(); 194 cpu_khz = calculate_cpu_khz();
200 tsc_khz = cpu_khz; 195 tsc_khz = cpu_khz;
201 196
202 if (!cpu_khz) 197 if (!cpu_khz)
203 return; 198 goto out_no_tsc;
204 199
205 printk("Detected %lu.%03lu MHz processor.\n", 200 printk("Detected %lu.%03lu MHz processor.\n",
206 (unsigned long)cpu_khz / 1000, 201 (unsigned long)cpu_khz / 1000,
@@ -208,37 +203,18 @@ void __init tsc_init(void)
208 203
209 set_cyc2ns_scale(cpu_khz); 204 set_cyc2ns_scale(cpu_khz);
210 use_tsc_delay(); 205 use_tsc_delay();
211} 206 return;
212 207
213#ifdef CONFIG_CPU_FREQ 208out_no_tsc:
214 209 /*
215static unsigned int cpufreq_delayed_issched = 0; 210 * Set the tsc_disable flag if there's no TSC support, this
216static unsigned int cpufreq_init = 0; 211 * makes it a fast flag for the kernel to see whether it
217static struct work_struct cpufreq_delayed_get_work; 212 * should be using the TSC.
218 213 */
219static void handle_cpufreq_delayed_get(struct work_struct *work) 214 tsc_disable = 1;
220{
221 unsigned int cpu;
222
223 for_each_online_cpu(cpu)
224 cpufreq_get(cpu);
225
226 cpufreq_delayed_issched = 0;
227} 215}
228 216
229/* 217#ifdef CONFIG_CPU_FREQ
230 * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries
231 * to verify the CPU frequency the timing core thinks the CPU is running
232 * at is still correct.
233 */
234static inline void cpufreq_delayed_get(void)
235{
236 if (cpufreq_init && !cpufreq_delayed_issched) {
237 cpufreq_delayed_issched = 1;
238 printk(KERN_DEBUG "Checking if CPU frequency changed.\n");
239 schedule_work(&cpufreq_delayed_get_work);
240 }
241}
242 218
243/* 219/*
244 * if the CPU frequency is scaled, TSC-based delays will need a different 220 * if the CPU frequency is scaled, TSC-based delays will need a different
@@ -303,17 +279,9 @@ static struct notifier_block time_cpufreq_notifier_block = {
303 279
304static int __init cpufreq_tsc(void) 280static int __init cpufreq_tsc(void)
305{ 281{
306 int ret; 282 return cpufreq_register_notifier(&time_cpufreq_notifier_block,
307 283 CPUFREQ_TRANSITION_NOTIFIER);
308 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
309 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
310 CPUFREQ_TRANSITION_NOTIFIER);
311 if (!ret)
312 cpufreq_init = 1;
313
314 return ret;
315} 284}
316
317core_initcall(cpufreq_tsc); 285core_initcall(cpufreq_tsc);
318 286
319#endif 287#endif
@@ -321,7 +289,6 @@ core_initcall(cpufreq_tsc);
321/* clock source code */ 289/* clock source code */
322 290
323static unsigned long current_tsc_khz = 0; 291static unsigned long current_tsc_khz = 0;
324static int tsc_update_callback(void);
325 292
326static cycle_t read_tsc(void) 293static cycle_t read_tsc(void)
327{ 294{
@@ -339,37 +306,28 @@ static struct clocksource clocksource_tsc = {
339 .mask = CLOCKSOURCE_MASK(64), 306 .mask = CLOCKSOURCE_MASK(64),
340 .mult = 0, /* to be set */ 307 .mult = 0, /* to be set */
341 .shift = 22, 308 .shift = 22,
342 .update_callback = tsc_update_callback, 309 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
343 .is_continuous = 1, 310 CLOCK_SOURCE_MUST_VERIFY,
344}; 311};
345 312
346static int tsc_update_callback(void) 313void mark_tsc_unstable(void)
347{ 314{
348 int change = 0; 315 if (!tsc_unstable) {
349 316 tsc_unstable = 1;
350 /* check to see if we should switch to the safe clocksource: */ 317 /* Can be called before registration */
351 if (clocksource_tsc.rating != 0 && check_tsc_unstable()) { 318 if (clocksource_tsc.mult)
352 clocksource_tsc.rating = 0; 319 clocksource_change_rating(&clocksource_tsc, 0);
353 clocksource_reselect(); 320 else
354 change = 1; 321 clocksource_tsc.rating = 0;
355 }
356
357 /* only update if tsc_khz has changed: */
358 if (current_tsc_khz != tsc_khz) {
359 current_tsc_khz = tsc_khz;
360 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
361 clocksource_tsc.shift);
362 change = 1;
363 } 322 }
364
365 return change;
366} 323}
324EXPORT_SYMBOL_GPL(mark_tsc_unstable);
367 325
368static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) 326static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
369{ 327{
370 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", 328 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
371 d->ident); 329 d->ident);
372 mark_tsc_unstable(); 330 tsc_unstable = 1;
373 return 0; 331 return 0;
374} 332}
375 333
@@ -386,65 +344,44 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
386 {} 344 {}
387}; 345};
388 346
389#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */
390static struct timer_list verify_tsc_freq_timer;
391
392/* XXX - Probably should add locking */
393static void verify_tsc_freq(unsigned long unused)
394{
395 static u64 last_tsc;
396 static unsigned long last_jiffies;
397
398 u64 now_tsc, interval_tsc;
399 unsigned long now_jiffies, interval_jiffies;
400
401
402 if (check_tsc_unstable())
403 return;
404
405 rdtscll(now_tsc);
406 now_jiffies = jiffies;
407
408 if (!last_jiffies) {
409 goto out;
410 }
411
412 interval_jiffies = now_jiffies - last_jiffies;
413 interval_tsc = now_tsc - last_tsc;
414 interval_tsc *= HZ;
415 do_div(interval_tsc, cpu_khz*1000);
416
417 if (interval_tsc < (interval_jiffies * 3 / 4)) {
418 printk("TSC appears to be running slowly. "
419 "Marking it as unstable\n");
420 mark_tsc_unstable();
421 return;
422 }
423
424out:
425 last_tsc = now_tsc;
426 last_jiffies = now_jiffies;
427 /* set us up to go off on the next interval: */
428 mod_timer(&verify_tsc_freq_timer,
429 jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL));
430}
431
432/* 347/*
433 * Make an educated guess if the TSC is trustworthy and synchronized 348 * Make an educated guess if the TSC is trustworthy and synchronized
434 * over all CPUs. 349 * over all CPUs.
435 */ 350 */
436static __init int unsynchronized_tsc(void) 351__cpuinit int unsynchronized_tsc(void)
437{ 352{
353 if (!cpu_has_tsc || tsc_unstable)
354 return 1;
438 /* 355 /*
439 * Intel systems are normally all synchronized. 356 * Intel systems are normally all synchronized.
440 * Exceptions must mark TSC as unstable: 357 * Exceptions must mark TSC as unstable:
441 */ 358 */
442 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) 359 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
443 return 0; 360 /* assume multi socket systems are not synchronized: */
361 if (num_possible_cpus() > 1)
362 tsc_unstable = 1;
363 }
364 return tsc_unstable;
365}
366
367/*
368 * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
369 */
370#ifdef CONFIG_MGEODE_LX
371/* RTSC counts during suspend */
372#define RTSC_SUSP 0x100
373
374static void __init check_geode_tsc_reliable(void)
375{
376 unsigned long val;
444 377
445 /* assume multi socket systems are not synchronized: */ 378 rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
446 return num_possible_cpus() > 1; 379 if ((val & RTSC_SUSP))
380 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
447} 381}
382#else
383static inline void check_geode_tsc_reliable(void) { }
384#endif
448 385
449static int __init init_tsc_clocksource(void) 386static int __init init_tsc_clocksource(void)
450{ 387{
@@ -453,20 +390,16 @@ static int __init init_tsc_clocksource(void)
453 /* check blacklist */ 390 /* check blacklist */
454 dmi_check_system(bad_tsc_dmi_table); 391 dmi_check_system(bad_tsc_dmi_table);
455 392
456 if (unsynchronized_tsc()) /* mark unstable if unsynced */ 393 unsynchronized_tsc();
457 mark_tsc_unstable(); 394 check_geode_tsc_reliable();
458 current_tsc_khz = tsc_khz; 395 current_tsc_khz = tsc_khz;
459 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, 396 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
460 clocksource_tsc.shift); 397 clocksource_tsc.shift);
461 /* lower the rating if we already know its unstable: */ 398 /* lower the rating if we already know its unstable: */
462 if (check_tsc_unstable()) 399 if (check_tsc_unstable()) {
463 clocksource_tsc.rating = 0; 400 clocksource_tsc.rating = 0;
464 401 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
465 init_timer(&verify_tsc_freq_timer); 402 }
466 verify_tsc_freq_timer.function = verify_tsc_freq;
467 verify_tsc_freq_timer.expires =
468 jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL);
469 add_timer(&verify_tsc_freq_timer);
470 403
471 return clocksource_register(&clocksource_tsc); 404 return clocksource_register(&clocksource_tsc);
472 } 405 }
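
The TSC clocksource stops policing itself with the old 10-second verify_tsc_freq timer. It is now registered with CLOCK_SOURCE_MUST_VERIFY so the generic clocksource watchdog can demote it, mark_tsc_unstable() drops its rating through clocksource_change_rating() (or directly when called before registration), and the new custom_sched_clock hook lets a paravirt backend such as VMI substitute its own sched_clock(). The Geode-LX special case clears MUST_VERIFY because that CPU's TSC keeps counting across suspend. The unstable path, condensed from the new side of the hunk:

        void mark_tsc_unstable(void)
        {
                if (!tsc_unstable) {
                        tsc_unstable = 1;
                        if (clocksource_tsc.mult)       /* already registered */
                                clocksource_change_rating(&clocksource_tsc, 0);
                        else
                                clocksource_tsc.rating = 0;
                }
        }
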
diff --git a/arch/i386/kernel/tsc_sync.c b/arch/i386/kernel/tsc_sync.c
new file mode 100644
index 000000000000..12424629af87
--- /dev/null
+++ b/arch/i386/kernel/tsc_sync.c
@@ -0,0 +1 @@
#include "../../x86_64/kernel/tsc_sync.c"
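
The i386 side is just this one-line include of the x86_64 implementation, so both architectures share a single TSC synchronization check. Conceptually the shared code has the boot CPU and the incoming AP read their TSCs in lockstep for a short interval and looks for "warps", i.e. one CPU reading a value behind the last timestamp the other published. A heavily simplified sketch of the idea (names here are illustrative, not the real ones):

        /* each CPU runs this loop for a short, fixed time */
        __raw_spin_lock(&sync_lock);
        now = get_cycles();
        if ((long long)(now - last_tsc) < 0)    /* behind the other CPU: a warp */
                nr_warps++;
        last_tsc = now;
        __raw_spin_unlock(&sync_lock);

        /* afterwards, the source CPU decides */
        if (nr_warps)
                mark_tsc_unstable();
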
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index be2f96e67f78..d1b8f2b7aea6 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
96{ 96{
97 int ret = 0; 97 int ret = 0;
98 98
99 /* kernel_vm86_regs is missing xfs, so copy everything up to 99 /* kernel_vm86_regs is missing xgs, so copy everything up to
100 (but not including) xgs, and then rest after xgs. */ 100 (but not including) orig_eax, and then rest including orig_eax. */
101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs)); 101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
102 ret += copy_to_user(&user->__null_gs, &regs->pt.xgs, 102 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
103 sizeof(struct kernel_vm86_regs) - 103 sizeof(struct kernel_vm86_regs) -
104 offsetof(struct kernel_vm86_regs, pt.xgs)); 104 offsetof(struct kernel_vm86_regs, pt.orig_eax));
105 105
106 return ret; 106 return ret;
107} 107}
@@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
113{ 113{
114 int ret = 0; 114 int ret = 0;
115 115
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs)); 116 /* copy eax-xfs inclusive */
117 ret += copy_from_user(&regs->pt.xgs, &user->__null_gs, 117 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
118 /* copy orig_eax-__gsh+extra */
119 ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
118 sizeof(struct kernel_vm86_regs) - 120 sizeof(struct kernel_vm86_regs) -
119 offsetof(struct kernel_vm86_regs, pt.xgs) + 121 offsetof(struct kernel_vm86_regs, pt.orig_eax) +
120 extra); 122 extra);
121
122 return ret; 123 return ret;
123} 124}
124 125
@@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
157 158
158 ret = KVM86->regs32; 159 ret = KVM86->regs32;
159 160
160 loadsegment(fs, current->thread.saved_fs); 161 ret->xfs = current->thread.saved_fs;
161 ret->xgs = current->thread.saved_gs; 162 loadsegment(gs, current->thread.saved_gs);
162 163
163 return ret; 164 return ret;
164} 165}
@@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
285 */ 286 */
286 info->regs.pt.xds = 0; 287 info->regs.pt.xds = 0;
287 info->regs.pt.xes = 0; 288 info->regs.pt.xes = 0;
288 info->regs.pt.xgs = 0; 289 info->regs.pt.xfs = 0;
289 290
290/* we are clearing fs later just before "jmp resume_userspace", 291/* we are clearing gs later just before "jmp resume_userspace",
291 * because it is not saved/restored. 292 * because it is not saved/restored.
292 */ 293 */
293 294
@@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
321 */ 322 */
322 info->regs32->eax = 0; 323 info->regs32->eax = 0;
323 tsk->thread.saved_esp0 = tsk->thread.esp0; 324 tsk->thread.saved_esp0 = tsk->thread.esp0;
324 savesegment(fs, tsk->thread.saved_fs); 325 tsk->thread.saved_fs = info->regs32->xfs;
325 tsk->thread.saved_gs = info->regs32->xgs; 326 savesegment(gs, tsk->thread.saved_gs);
326 327
327 tss = &per_cpu(init_tss, get_cpu()); 328 tss = &per_cpu(init_tss, get_cpu());
328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 329 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
342 __asm__ __volatile__( 343 __asm__ __volatile__(
343 "movl %0,%%esp\n\t" 344 "movl %0,%%esp\n\t"
344 "movl %1,%%ebp\n\t" 345 "movl %1,%%ebp\n\t"
345 "mov %2, %%fs\n\t" 346 "mov %2, %%gs\n\t"
346 "jmp resume_userspace" 347 "jmp resume_userspace"
347 : /* no outputs */ 348 : /* no outputs */
348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 349 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
new file mode 100644
index 000000000000..bb5a7abf949c
--- /dev/null
+++ b/arch/i386/kernel/vmi.c
@@ -0,0 +1,949 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/license.h>
27#include <linux/cpu.h>
28#include <linux/bootmem.h>
29#include <linux/mm.h>
30#include <asm/vmi.h>
31#include <asm/io.h>
32#include <asm/fixmap.h>
33#include <asm/apicdef.h>
34#include <asm/apic.h>
35#include <asm/processor.h>
36#include <asm/timer.h>
37#include <asm/vmi_time.h>
38
39/* Convenient for calling VMI functions indirectly in the ROM */
40typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
41typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
42
43#define call_vrom_func(rom,func) \
44 (((VROMFUNC *)(rom->func))())
45
46#define call_vrom_long_func(rom,func,arg) \
47 (((VROMLONGFUNC *)(rom->func)) (arg))
48
49static struct vrom_header *vmi_rom;
50static int license_gplok;
51static int disable_nodelay;
52static int disable_pge;
53static int disable_pse;
54static int disable_sep;
55static int disable_tsc;
56static int disable_mtrr;
57
58/* Cached VMI operations */
59struct {
60 void (*cpuid)(void /* non-c */);
61 void (*_set_ldt)(u32 selector);
62 void (*set_tr)(u32 selector);
63 void (*set_kernel_stack)(u32 selector, u32 esp0);
64 void (*allocate_page)(u32, u32, u32, u32, u32);
65 void (*release_page)(u32, u32);
66 void (*set_pte)(pte_t, pte_t *, unsigned);
67 void (*update_pte)(pte_t *, unsigned);
68 void (*set_linear_mapping)(int, u32, u32, u32);
69 void (*flush_tlb)(int);
70 void (*set_initial_ap_state)(int, int);
71 void (*halt)(void);
72} vmi_ops;
73
74/* XXX move this to alternative.h */
75extern struct paravirt_patch __start_parainstructions[],
76 __stop_parainstructions[];
77
78/*
79 * VMI patching routines.
80 */
81#define MNEM_CALL 0xe8
82#define MNEM_JMP 0xe9
83#define MNEM_RET 0xc3
84
85static char irq_save_disable_callout[] = {
86 MNEM_CALL, 0, 0, 0, 0,
87 MNEM_CALL, 0, 0, 0, 0,
88 MNEM_RET
89};
90#define IRQ_PATCH_INT_MASK 0
91#define IRQ_PATCH_DISABLE 5
92
93static inline void patch_offset(unsigned char *eip, unsigned char *dest)
94{
95 *(unsigned long *)(eip+1) = dest-eip-5;
96}
97
98static unsigned patch_internal(int call, unsigned len, void *insns)
99{
100 u64 reloc;
101 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
102 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
103 switch(rel->type) {
104 case VMI_RELOCATION_CALL_REL:
105 BUG_ON(len < 5);
106 *(char *)insns = MNEM_CALL;
107 patch_offset(insns, rel->eip);
108 return 5;
109
110 case VMI_RELOCATION_JUMP_REL:
111 BUG_ON(len < 5);
112 *(char *)insns = MNEM_JMP;
113 patch_offset(insns, rel->eip);
114 return 5;
115
116 case VMI_RELOCATION_NOP:
117 /* obliterate the whole thing */
118 return 0;
119
120 case VMI_RELOCATION_NONE:
121 /* leave native code in place */
122 break;
123
124 default:
125 BUG();
126 }
127 return len;
128}
129
130/*
131 * Apply patch if appropriate, return length of new instruction
132 * sequence. The callee does nop padding for us.
133 */
134static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
135{
136 switch (type) {
137 case PARAVIRT_IRQ_DISABLE:
138 return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
139 case PARAVIRT_IRQ_ENABLE:
140 return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
141 case PARAVIRT_RESTORE_FLAGS:
142 return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
143 case PARAVIRT_SAVE_FLAGS:
144 return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
145 case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE:
146 if (len >= 10) {
147 patch_internal(VMI_CALL_GetInterruptMask, len, insns);
148 patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
149 return 10;
150 } else {
151 /*
152 * You bastards didn't leave enough room to
153 * patch save_flags_irq_disable inline. Patch
154 * to a helper
155 */
156 BUG_ON(len < 5);
157 *(char *)insns = MNEM_CALL;
158 patch_offset(insns, irq_save_disable_callout);
159 return 5;
160 }
161 case PARAVIRT_INTERRUPT_RETURN:
162 return patch_internal(VMI_CALL_IRET, len, insns);
163 case PARAVIRT_STI_SYSEXIT:
164 return patch_internal(VMI_CALL_SYSEXIT, len, insns);
165 default:
166 break;
167 }
168 return len;
169}
170
171/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
172static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
173 unsigned int *ecx, unsigned int *edx)
174{
175 int override = 0;
176 if (*eax == 1)
177 override = 1;
178 asm volatile ("call *%6"
179 : "=a" (*eax),
180 "=b" (*ebx),
181 "=c" (*ecx),
182 "=d" (*edx)
183 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
184 if (override) {
185 if (disable_pse)
186 *edx &= ~X86_FEATURE_PSE;
187 if (disable_pge)
188 *edx &= ~X86_FEATURE_PGE;
189 if (disable_sep)
190 *edx &= ~X86_FEATURE_SEP;
191 if (disable_tsc)
192 *edx &= ~X86_FEATURE_TSC;
193 if (disable_mtrr)
194 *edx &= ~X86_FEATURE_MTRR;
195 }
196}
197
198static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
199{
200 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
201 write_gdt_entry(gdt, nr, new->a, new->b);
202}
203
204static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
205{
206 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
207 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
208 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
209 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
210}
211
212static void vmi_set_ldt(const void *addr, unsigned entries)
213{
214 unsigned cpu = smp_processor_id();
215 u32 low, high;
216
217 pack_descriptor(&low, &high, (unsigned long)addr,
218 entries * sizeof(struct desc_struct) - 1,
219 DESCTYPE_LDT, 0);
220 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
221 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
222}
223
224static void vmi_set_tr(void)
225{
226 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
227}
228
229static void vmi_load_esp0(struct tss_struct *tss,
230 struct thread_struct *thread)
231{
232 tss->esp0 = thread->esp0;
233
234 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
235 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
236 tss->ss1 = thread->sysenter_cs;
237 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
238 }
239 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0);
240}
241
242static void vmi_flush_tlb_user(void)
243{
244 vmi_ops.flush_tlb(VMI_FLUSH_TLB);
245}
246
247static void vmi_flush_tlb_kernel(void)
248{
249 vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
250}
251
252/* Stub to do nothing at all; used for delays and unimplemented calls */
253static void vmi_nop(void)
254{
255}
256
257/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
258#ifdef CONFIG_NO_IDLE_HZ
259static fastcall void vmi_safe_halt(void)
260{
261 int idle = vmi_stop_hz_timer();
262 vmi_ops.halt();
263 if (idle) {
264 local_irq_disable();
265 vmi_account_time_restart_hz_timer();
266 local_irq_enable();
267 }
268}
269#endif
270
271#ifdef CONFIG_DEBUG_PAGE_TYPE
272
273#ifdef CONFIG_X86_PAE
274#define MAX_BOOT_PTS (2048+4+1)
275#else
276#define MAX_BOOT_PTS (1024+1)
277#endif
278
279/*
280 * During boot, mem_map is not yet available in paging_init, so stash
281 * all the boot page allocations here.
282 */
283static struct {
284 u32 pfn;
285 int type;
286} boot_page_allocations[MAX_BOOT_PTS];
287static int num_boot_page_allocations;
288static int boot_allocations_applied;
289
290void vmi_apply_boot_page_allocations(void)
291{
292 int i;
293 BUG_ON(!mem_map);
294 for (i = 0; i < num_boot_page_allocations; i++) {
295 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
296 page->type = boot_page_allocations[i].type;
297 page->type = boot_page_allocations[i].type &
298 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
299 }
300 boot_allocations_applied = 1;
301}
302
303static void record_page_type(u32 pfn, int type)
304{
305 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
306 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
307 boot_page_allocations[num_boot_page_allocations].type = type;
308 num_boot_page_allocations++;
309}
310
311static void check_zeroed_page(u32 pfn, int type, struct page *page)
312{
313 u32 *ptr;
314 int i;
315 int limit = PAGE_SIZE / sizeof(int);
316
317 if (page_address(page))
318 ptr = (u32 *)page_address(page);
319 else
320 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
321 /*
322 * When cloning the root in non-PAE mode, only the userspace
323 * pdes need to be zeroed.
324 */
325 if (type & VMI_PAGE_CLONE)
326 limit = USER_PTRS_PER_PGD;
327 for (i = 0; i < limit; i++)
328 BUG_ON(ptr[i]);
329}
330
331/*
332 * We stash the page type into struct page so we can verify the page
333 * types are used properly.
334 */
335static void vmi_set_page_type(u32 pfn, int type)
336{
337 /* PAE can have multiple roots per page - don't track */
338 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
339 return;
340
341 if (boot_allocations_applied) {
342 struct page *page = pfn_to_page(pfn);
343 if (type != VMI_PAGE_NORMAL)
344 BUG_ON(page->type);
345 else
346 BUG_ON(page->type == VMI_PAGE_NORMAL);
347 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
348 if (type & VMI_PAGE_ZEROED)
349 check_zeroed_page(pfn, type, page);
350 } else {
351 record_page_type(pfn, type);
352 }
353}
354
355static void vmi_check_page_type(u32 pfn, int type)
356{
357 /* PAE can have multiple roots per page - skip checks */
358 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
359 return;
360
361 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
362 if (boot_allocations_applied) {
363 struct page *page = pfn_to_page(pfn);
364 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
365 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
366 BUG_ON((type & page->type) == 0);
367 }
368}
369#else
370#define vmi_set_page_type(p,t) do { } while (0)
371#define vmi_check_page_type(p,t) do { } while (0)
372#endif
373
374static void vmi_allocate_pt(u32 pfn)
375{
376 vmi_set_page_type(pfn, VMI_PAGE_L1);
377 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
378}
379
380static void vmi_allocate_pd(u32 pfn)
381{
382 /*
383 * This call comes in very early, before mem_map is setup.
384 * It is called only for swapper_pg_dir, which already has
385 * data on it.
386 */
387 vmi_set_page_type(pfn, VMI_PAGE_L2);
388 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
389}
390
391static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
392{
393 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
394 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
395 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
396}
397
398static void vmi_release_pt(u32 pfn)
399{
400 vmi_ops.release_page(pfn, VMI_PAGE_L1);
401 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
402}
403
404static void vmi_release_pd(u32 pfn)
405{
406 vmi_ops.release_page(pfn, VMI_PAGE_L2);
407 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
408}
409
410/*
411 * Helper macros for MMU update flags. We can defer updates until a flush
412 * or page invalidation only if the update is to the current address space
413 * (otherwise, there is no flush). We must check against init_mm, since
414 * this could be a kernel update, which usually passes init_mm, although
415 * sometimes this check can be skipped if we know the particular function
416 * is only called on user mode PTEs. We could change the kernel to pass
417 * current->active_mm here, but in particular, I was unsure if changing
418 * mm/highmem.c to do this would still be correct on other architectures.
419 */
420#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
421 (!mustbeuser && (mm) == &init_mm))
422#define vmi_flags_addr(mm, addr, level, user) \
423 ((level) | (is_current_as(mm, user) ? \
424 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
425#define vmi_flags_addr_defer(mm, addr, level, user) \
426 ((level) | (is_current_as(mm, user) ? \
427 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
428
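/*
 * Worked example of the macros above (illustrative values only): for a
 * user PTE update at 'addr' in the currently loaded address space,
 * vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0) evaluates to
 *     VMI_PAGE_PT | VMI_PAGE_CURRENT_AS | (addr & VMI_PAGE_VA_MASK)
 * telling the hypervisor which live virtual address is affected; for a
 * foreign mm the conditional collapses to plain VMI_PAGE_PT. The _defer
 * variant additionally ORs in VMI_PAGE_DEFER in the current-AS case so
 * the update can be batched until the next flush.
 */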
429static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep)
430{
431 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
432 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
433}
434
435static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
436{
437 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
438 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
439}
440
441static void vmi_set_pte(pte_t *ptep, pte_t pte)
442{
443 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
444 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
445 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
446}
447
448static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
449{
450 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
451 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
452}
453
454static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
455{
456#ifdef CONFIG_X86_PAE
457 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
458 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
459#else
460 const pte_t pte = { pmdval.pud.pgd.pgd };
461 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
462#endif
463 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
464}
465
466#ifdef CONFIG_X86_PAE
467
468static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
469{
470 /*
471 * XXX This is called from set_pmd_pte, but at both PT
472 * and PD layers so the VMI_PAGE_PT flag is wrong. But
473 * it is only called for large page mapping changes,
474 * the Xen backend, doesn't support large pages, and the
475 * ESX backend doesn't depend on the flag.
476 */
477 set_64bit((unsigned long long *)ptep,pte_val(pteval));
478 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
479}
480
481static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
482{
483 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
484 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
485}
486
487static void vmi_set_pud(pud_t *pudp, pud_t pudval)
488{
489 /* Um, eww */
490 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
491 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
492 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
493}
494
495static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
496{
497 const pte_t pte = { 0 };
498 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
499 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
500}
501
502void vmi_pmd_clear(pmd_t *pmd)
503{
504 const pte_t pte = { 0 };
505 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
506 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
507}
508#endif
509
510#ifdef CONFIG_SMP
511struct vmi_ap_state ap;
512extern void setup_pda(void);
513
514static void __init /* XXX cpu hotplug */
515vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
516 unsigned long start_esp)
517{
518 /* Default everything to zero. This is fine for most GPRs. */
519 memset(&ap, 0, sizeof(struct vmi_ap_state));
520
521 ap.gdtr_limit = GDT_SIZE - 1;
522 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
523
524 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
525 ap.idtr_base = (unsigned long) idt_table;
526
527 ap.ldtr = 0;
528
529 ap.cs = __KERNEL_CS;
530 ap.eip = (unsigned long) start_eip;
531 ap.ss = __KERNEL_DS;
532 ap.esp = (unsigned long) start_esp;
533
534 ap.ds = __USER_DS;
535 ap.es = __USER_DS;
536 ap.fs = __KERNEL_PDA;
537 ap.gs = 0;
538
539 ap.eflags = 0;
540
541 setup_pda();
542
543#ifdef CONFIG_X86_PAE
544 /* efer should match BSP efer. */
545 if (cpu_has_nx) {
546 unsigned l, h;
547 rdmsr(MSR_EFER, l, h);
548 ap.efer = (unsigned long long) h << 32 | l;
549 }
550#endif
551
552 ap.cr3 = __pa(swapper_pg_dir);
553 /* Protected mode, paging, AM, WP, NE, MP. */
554 ap.cr0 = 0x80050023;
555 ap.cr4 = mmu_cr4_features;
556 vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid);
557}
558#endif
559
560static inline int __init check_vmi_rom(struct vrom_header *rom)
561{
562 struct pci_header *pci;
563 struct pnp_header *pnp;
564 const char *manufacturer = "UNKNOWN";
565 const char *product = "UNKNOWN";
566 const char *license = "unspecified";
567
568 if (rom->rom_signature != 0xaa55)
569 return 0;
570 if (rom->vrom_signature != VMI_SIGNATURE)
571 return 0;
572 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
573 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
574 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
575 rom->api_version_maj,
576 rom->api_version_min);
577 return 0;
578 }
579
580 /*
581 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
582 * the PCI header and device type to make sure this is really a
583 * VMI device.
584 */
585 if (!rom->pci_header_offs) {
586 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
587 return 0;
588 }
589
590 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
591 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
592 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
593 /* Allow it to run... anyways, but warn */
594 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
595 }
596
597 if (rom->pnp_header_offs) {
598 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
599 if (pnp->manufacturer_offset)
600 manufacturer = (const char *)rom+pnp->manufacturer_offset;
601 if (pnp->product_offset)
602 product = (const char *)rom+pnp->product_offset;
603 }
604
605 if (rom->license_offs)
606 license = (char *)rom+rom->license_offs;
607
608 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
609 manufacturer, product,
610 rom->api_version_maj, rom->api_version_min,
611 pci->rom_version_maj, pci->rom_version_min);
612
613 license_gplok = license_is_gpl_compatible(license);
614 if (!license_gplok) {
615 printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... "
616 "inlining disabled\n",
617 license);
618 add_taint(TAINT_PROPRIETARY_MODULE);
619 }
620 return 1;
621}
622
623/*
624 * Probe for the VMI option ROM
625 */
626static inline int __init probe_vmi_rom(void)
627{
628 unsigned long base;
629
630 /* VMI ROM is in option ROM area, check signature */
631 for (base = 0xC0000; base < 0xE0000; base += 2048) {
632 struct vrom_header *romstart;
633 romstart = (struct vrom_header *)isa_bus_to_virt(base);
634 if (check_vmi_rom(romstart)) {
635 vmi_rom = romstart;
636 return 1;
637 }
638 }
639 return 0;
640}
641
642/*
643 * VMI setup common to all processors
644 */
645void vmi_bringup(void)
646{
647 /* We must establish the lowmem mapping for MMU ops to work */
648 if (vmi_rom)
649 vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0);
650}
651
652/*
653 * Return a pointer to the VMI function or a NOP stub
654 */
655static void *vmi_get_function(int vmicall)
656{
657 u64 reloc;
658 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
659 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
660 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
661 if (rel->type == VMI_RELOCATION_CALL_REL)
662 return (void *)rel->eip;
663 else
664 return (void *)vmi_nop;
665}
666
667/*
668 * Helper macro for making the VMI paravirt-ops fill code readable.
669 * For unimplemented operations, fall back to default.
670 */
671#define para_fill(opname, vmicall) \
672do { \
673 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
674 VMI_CALL_##vmicall); \
675 if (rel->type != VMI_RELOCATION_NONE) { \
676 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); \
677 paravirt_ops.opname = (void *)rel->eip; \
678 } \
679} while (0)
680
681/*
682 * Activate the VMI interface and switch into paravirtualized mode
683 */
684static inline int __init activate_vmi(void)
685{
686 short kernel_cs;
687 u64 reloc;
688 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
689
690 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
691 printk(KERN_ERR "VMI ROM failed to initialize!\n");
692 return 0;
693 }
694 savesegment(cs, kernel_cs);
695
696 paravirt_ops.paravirt_enabled = 1;
697 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
698
699 paravirt_ops.patch = vmi_patch;
700 paravirt_ops.name = "vmi";
701
702 /*
703 * Many of these operations are ABI compatible with VMI.
704 * This means we can fill in the paravirt-ops with direct
705 * pointers into the VMI ROM. If the calling convention for
706 * these operations changes, this code needs to be updated.
707 *
708 * Exceptions
709 * CPUID paravirt-op uses pointers, not the native ISA
710 * halt has no VMI equivalent; all VMI halts are "safe"
711 * no MSR support yet - just trap and emulate. VMI uses the
712 * same ABI as the native ISA, but Linux wants exceptions
713 * from bogus MSR read / write handled
714 * rdpmc is not yet used in Linux
715 */
716
717 /* CPUID is special, so very special */
718 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CPUID);
719 if (rel->type != VMI_RELOCATION_NONE) {
720 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
721 vmi_ops.cpuid = (void *)rel->eip;
722 paravirt_ops.cpuid = vmi_cpuid;
723 }
724
725 para_fill(clts, CLTS);
726 para_fill(get_debugreg, GetDR);
727 para_fill(set_debugreg, SetDR);
728 para_fill(read_cr0, GetCR0);
729 para_fill(read_cr2, GetCR2);
730 para_fill(read_cr3, GetCR3);
731 para_fill(read_cr4, GetCR4);
732 para_fill(write_cr0, SetCR0);
733 para_fill(write_cr2, SetCR2);
734 para_fill(write_cr3, SetCR3);
735 para_fill(write_cr4, SetCR4);
736 para_fill(save_fl, GetInterruptMask);
737 para_fill(restore_fl, SetInterruptMask);
738 para_fill(irq_disable, DisableInterrupts);
739 para_fill(irq_enable, EnableInterrupts);
740 /* irq_save_disable !!! sheer pain */
741 patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
742 (char *)paravirt_ops.save_fl);
743 patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
744 (char *)paravirt_ops.irq_disable);
745#ifndef CONFIG_NO_IDLE_HZ
746 para_fill(safe_halt, Halt);
747#else
748 vmi_ops.halt = vmi_get_function(VMI_CALL_Halt);
749 paravirt_ops.safe_halt = vmi_safe_halt;
750#endif
751 para_fill(wbinvd, WBINVD);
752 /* paravirt_ops.read_msr = vmi_rdmsr */
753 /* paravirt_ops.write_msr = vmi_wrmsr */
754 para_fill(read_tsc, RDTSC);
755 /* paravirt_ops.rdpmc = vmi_rdpmc */
756
757 /* TR interface doesn't pass TR value */
758 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetTR);
759 if (rel->type != VMI_RELOCATION_NONE) {
760 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
761 vmi_ops.set_tr = (void *)rel->eip;
762 paravirt_ops.load_tr_desc = vmi_set_tr;
763 }
764
765 /* LDT is special, too */
766 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetLDT);
767 if (rel->type != VMI_RELOCATION_NONE) {
768 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
769 vmi_ops._set_ldt = (void *)rel->eip;
770 paravirt_ops.set_ldt = vmi_set_ldt;
771 }
772
773 para_fill(load_gdt, SetGDT);
774 para_fill(load_idt, SetIDT);
775 para_fill(store_gdt, GetGDT);
776 para_fill(store_idt, GetIDT);
777 para_fill(store_tr, GetTR);
778 paravirt_ops.load_tls = vmi_load_tls;
779 para_fill(write_ldt_entry, WriteLDTEntry);
780 para_fill(write_gdt_entry, WriteGDTEntry);
781 para_fill(write_idt_entry, WriteIDTEntry);
782 reloc = call_vrom_long_func(vmi_rom, get_reloc,
783 VMI_CALL_UpdateKernelStack);
784 if (rel->type != VMI_RELOCATION_NONE) {
785 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
786 vmi_ops.set_kernel_stack = (void *)rel->eip;
787 paravirt_ops.load_esp0 = vmi_load_esp0;
788 }
789
790 para_fill(set_iopl_mask, SetIOPLMask);
791 paravirt_ops.io_delay = (void *)vmi_nop;
792 if (!disable_nodelay) {
793 paravirt_ops.const_udelay = (void *)vmi_nop;
794 }
795
796 para_fill(set_lazy_mode, SetLazyMode);
797
798 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB);
799 if (rel->type != VMI_RELOCATION_NONE) {
800 vmi_ops.flush_tlb = (void *)rel->eip;
801 paravirt_ops.flush_tlb_user = vmi_flush_tlb_user;
802 paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel;
803 }
804 para_fill(flush_tlb_single, InvalPage);
805
806 /*
807 * Until a standard flag format can be agreed on, we need to
808 * implement these as wrappers in Linux. Get the VMI ROM
809 * function pointers for the two backend calls.
810 */
811#ifdef CONFIG_X86_PAE
812 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
813 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
814#else
815 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
816 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
817#endif
818 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
819 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
820 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
821
822 paravirt_ops.alloc_pt = vmi_allocate_pt;
823 paravirt_ops.alloc_pd = vmi_allocate_pd;
824 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
825 paravirt_ops.release_pt = vmi_release_pt;
826 paravirt_ops.release_pd = vmi_release_pd;
827 paravirt_ops.set_pte = vmi_set_pte;
828 paravirt_ops.set_pte_at = vmi_set_pte_at;
829 paravirt_ops.set_pmd = vmi_set_pmd;
830 paravirt_ops.pte_update = vmi_update_pte;
831 paravirt_ops.pte_update_defer = vmi_update_pte_defer;
832#ifdef CONFIG_X86_PAE
833 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
834 paravirt_ops.set_pte_present = vmi_set_pte_present;
835 paravirt_ops.set_pud = vmi_set_pud;
836 paravirt_ops.pte_clear = vmi_pte_clear;
837 paravirt_ops.pmd_clear = vmi_pmd_clear;
838#endif
839 /*
840 * These MUST always be patched. Don't support indirect jumps
841 * through these operations, as the VMI interface may use either
842 * a jump or a call to get to these operations, depending on
843 * the backend. They are performance critical anyway, so requiring
844 * a patch is not a big problem.
845 */
846 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
847 paravirt_ops.iret = (void *)0xbadbab0;
848
849#ifdef CONFIG_SMP
850 paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook;
851 vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState);
852#endif
853
854#ifdef CONFIG_X86_LOCAL_APIC
855 paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead);
856 paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite);
857 paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite);
858#endif
859
860 /*
861 * Check for VMI timer functionality by probing for a cycle frequency method
862 */
863 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
864 if (rel->type != VMI_RELOCATION_NONE) {
865 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
866 vmi_timer_ops.get_cycle_counter =
867 vmi_get_function(VMI_CALL_GetCycleCounter);
868 vmi_timer_ops.get_wallclock =
869 vmi_get_function(VMI_CALL_GetWallclockTime);
870 vmi_timer_ops.wallclock_updated =
871 vmi_get_function(VMI_CALL_WallclockUpdated);
872 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
873 vmi_timer_ops.cancel_alarm =
874 vmi_get_function(VMI_CALL_CancelAlarm);
875 paravirt_ops.time_init = vmi_time_init;
876 paravirt_ops.get_wallclock = vmi_get_wallclock;
877 paravirt_ops.set_wallclock = vmi_set_wallclock;
878#ifdef CONFIG_X86_LOCAL_APIC
879 paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
880 paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
881#endif
882 custom_sched_clock = vmi_sched_clock;
883 }
884
885 /*
886 * Alternative instruction rewriting doesn't happen soon enough
887 * to convert VMI_IRET to a call instead of a jump; so we have
888 * to do this before IRQs get reenabled. Fortunately, it is
889 * idempotent.
890 */
891 apply_paravirt(__start_parainstructions, __stop_parainstructions);
892
893 vmi_bringup();
894
895 return 1;
896}
897
898#undef para_fill
899
900void __init vmi_init(void)
901{
902 unsigned long flags;
903
904 if (!vmi_rom)
905 probe_vmi_rom();
906 else
907 check_vmi_rom(vmi_rom);
908
909 /* In case probing for or validating the ROM failed, bail out */
910 if (!vmi_rom)
911 return;
912
913 reserve_top_address(-vmi_rom->virtual_top);
914
915 local_irq_save(flags);
916 activate_vmi();
917#ifdef CONFIG_SMP
918 no_timer_check = 1;
919#endif
920 local_irq_restore(flags & X86_EFLAGS_IF);
921}
922
923static int __init parse_vmi(char *arg)
924{
925 if (!arg)
926 return -EINVAL;
927
928 if (!strcmp(arg, "disable_nodelay"))
929 disable_nodelay = 1;
930 else if (!strcmp(arg, "disable_pge")) {
931 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
932 disable_pge = 1;
933 } else if (!strcmp(arg, "disable_pse")) {
934 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
935 disable_pse = 1;
936 } else if (!strcmp(arg, "disable_sep")) {
937 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
938 disable_sep = 1;
939 } else if (!strcmp(arg, "disable_tsc")) {
940 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
941 disable_tsc = 1;
942 } else if (!strcmp(arg, "disable_mtrr")) {
943 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
944 disable_mtrr = 1;
945 }
946 return 0;
947}
948
949early_param("vmi", parse_vmi);
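A compact, self-contained sketch of the call-site patching that patch_offset() and patch_internal() perform above: a 5-byte relative CALL (or JMP) encodes its target as a displacement from the end of the instruction, i.e. dest - eip - 5. The emit_call name is hypothetical:

#include <stdint.h>
#include <string.h>

#define MNEM_CALL 0xe8

/* Overwrite a 5-byte patch site with "call target" (little-endian rel32). */
static void emit_call(uint8_t *site, const uint8_t *target)
{
	int32_t rel = (int32_t)(target - site - 5);	/* dest - eip - 5 */

	site[0] = MNEM_CALL;
	memcpy(site + 1, &rel, sizeof(rel));
}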
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
new file mode 100644
index 000000000000..76d2adcae5a3
--- /dev/null
+++ b/arch/i386/kernel/vmitime.c
@@ -0,0 +1,499 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25/*
26 * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
27 * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
28 * See comments there for proper credits.
29 */
30
31#include <linux/spinlock.h>
32#include <linux/init.h>
33#include <linux/errno.h>
34#include <linux/jiffies.h>
35#include <linux/interrupt.h>
36#include <linux/kernel_stat.h>
37#include <linux/rcupdate.h>
38#include <linux/clocksource.h>
39
40#include <asm/timer.h>
41#include <asm/io.h>
42#include <asm/apic.h>
43#include <asm/div64.h>
44#include <asm/timer.h>
45#include <asm/desc.h>
46
47#include <asm/vmi.h>
48#include <asm/vmi_time.h>
49
50#include <mach_timer.h>
51#include <io_ports.h>
52
53#ifdef CONFIG_X86_LOCAL_APIC
54#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
55#else
56#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
57#endif
58
59/* Cached VMI operations */
60struct vmi_timer_ops vmi_timer_ops;
61
62#ifdef CONFIG_NO_IDLE_HZ
63
64/* /proc/sys/kernel/hz_timer state. */
65int sysctl_hz_timer;
66
67/* Some stats */
68static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
69static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
70static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
71
72#endif /* CONFIG_NO_IDLE_HZ */
73
74/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */
75static int alarm_hz = CONFIG_VMI_ALARM_HZ;
76
77/* Cache of the value get_cycle_frequency / HZ. */
78static signed long long cycles_per_jiffy;
79
80/* Cache of the value get_cycle_frequency / alarm_hz. */
81static signed long long cycles_per_alarm;
82
83/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
84 * Protected by xtime_lock. */
85static unsigned long long real_cycles_accounted_system;
86
87/* The number of cycles accounted for by update_process_times(), per cpu. */
88static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
89
90/* The number of stolen cycles accounted, per cpu. */
91static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
92
93/* Clock source. */
94static cycle_t read_real_cycles(void)
95{
96 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
97}
98
99static cycle_t read_available_cycles(void)
100{
101 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
102}
103
104#if 0
105static cycle_t read_stolen_cycles(void)
106{
107 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
108}
109#endif /* 0 */
110
111static struct clocksource clocksource_vmi = {
112 .name = "vmi-timer",
113 .rating = 450,
114 .read = read_real_cycles,
115 .mask = CLOCKSOURCE_MASK(64),
116 .mult = 0, /* to be set */
117 .shift = 22,
118 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
119};
120
121
122/* Timer interrupt handler. */
123static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
124
125static struct irqaction vmi_timer_irq = {
126	.handler = vmi_timer_interrupt,
127	.flags   = SA_INTERRUPT,
128	.mask    = CPU_MASK_NONE,
129	.name    = "VMI-alarm",
130	.dev_id  = NULL,
131	.next    = NULL
132};
133
134/* Alarm rate */
135static int __init vmi_timer_alarm_rate_setup(char* str)
136{
137 int alarm_rate;
138 if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
139 alarm_hz = alarm_rate;
140 printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
141 }
142 return 1;
143}
144__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
145
146
147/* Initialization */
148static void vmi_get_wallclock_ts(struct timespec *ts)
149{
150 unsigned long long wallclock;
151 wallclock = vmi_timer_ops.get_wallclock(); /* nsec units */
152 ts->tv_nsec = do_div(wallclock, 1000000000);
153 ts->tv_sec = wallclock;
154}
155
156static void update_xtime_from_wallclock(void)
157{
158 struct timespec ts;
159 vmi_get_wallclock_ts(&ts);
160 do_settimeofday(&ts);
161}
162
163unsigned long vmi_get_wallclock(void)
164{
165 struct timespec ts;
166 vmi_get_wallclock_ts(&ts);
167 return ts.tv_sec;
168}
169
170int vmi_set_wallclock(unsigned long now)
171{
172 return -1;
173}
174
175unsigned long long vmi_sched_clock(void)
176{
177 return read_available_cycles();
178}
179
180void __init vmi_time_init(void)
181{
182 unsigned long long cycles_per_sec, cycles_per_msec;
183 unsigned long flags;
184
185 local_irq_save(flags);
186 setup_irq(0, &vmi_timer_irq);
187#ifdef CONFIG_X86_LOCAL_APIC
188 set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
189#endif
190
191 no_sync_cmos_clock = 1;
192
193 vmi_get_wallclock_ts(&xtime);
194 set_normalized_timespec(&wall_to_monotonic,
195 -xtime.tv_sec, -xtime.tv_nsec);
196
197 real_cycles_accounted_system = read_real_cycles();
198 update_xtime_from_wallclock();
199 per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
200
201 cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
202
203 cycles_per_jiffy = cycles_per_sec;
204 (void)do_div(cycles_per_jiffy, HZ);
205 cycles_per_alarm = cycles_per_sec;
206 (void)do_div(cycles_per_alarm, alarm_hz);
207 cycles_per_msec = cycles_per_sec;
208 (void)do_div(cycles_per_msec, 1000);
209 cpu_khz = cycles_per_msec;
210
211 printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ; "
212 "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
213 cycles_per_alarm);
214
215 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
216 clocksource_vmi.shift);
217 if (clocksource_register(&clocksource_vmi))
218 printk(KERN_WARNING "Error registering VMITIME clocksource.");
219
220 /* Disable PIT. */
221 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
222
223 /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu
224 * to reduce the latency of calling update_process_times. */
225 vmi_timer_ops.set_alarm(
226 VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
227 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
228 cycles_per_alarm);
229
230 local_irq_restore(flags);
231}
232
233#ifdef CONFIG_X86_LOCAL_APIC
234
235void __init vmi_timer_setup_boot_alarm(void)
236{
237 local_irq_disable();
238
239 /* Route the interrupt to the correct vector. */
240 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
241
242 /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
243 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
244 vmi_timer_ops.set_alarm(
245 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
246 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
247 cycles_per_alarm);
248 local_irq_enable();
249}
250
251/* Initialize the time accounting variables for an AP on an SMP system.
252 * Also, set the local alarm for the AP. */
253void __init vmi_timer_setup_secondary_alarm(void)
254{
255 int cpu = smp_processor_id();
256
257 /* Route the interrupt to the correct vector. */
258 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
259
260 per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
261
262 vmi_timer_ops.set_alarm(
263 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
264 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
265 cycles_per_alarm);
266}
267
268#endif
269
270/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
271static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
272{
273 long long cycles_not_accounted;
274
275 write_seqlock(&xtime_lock);
276
277 cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
278 while (cycles_not_accounted >= cycles_per_jiffy) {
279 /* system-wide jiffies and wallclock. */
280 do_timer(1);
281
282 cycles_not_accounted -= cycles_per_jiffy;
283 real_cycles_accounted_system += cycles_per_jiffy;
284 }
285
286 if (vmi_timer_ops.wallclock_updated())
287 update_xtime_from_wallclock();
288
289 write_sequnlock(&xtime_lock);
290}
291
292/* Update per-cpu process times. */
293static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
294 unsigned long long cur_process_times_cycles)
295{
296 long long cycles_not_accounted;
297 cycles_not_accounted = cur_process_times_cycles -
298 per_cpu(process_times_cycles_accounted_cpu, cpu);
299
300 while (cycles_not_accounted >= cycles_per_jiffy) {
301 /* Account time to the current process. This includes
302 * calling into the scheduler to decrement the timeslice
303 * and possibly reschedule.*/
304 update_process_times(user_mode(regs));
305 /* XXX handle /proc/profile multiplier. */
306 profile_tick(CPU_PROFILING);
307
308 cycles_not_accounted -= cycles_per_jiffy;
309 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
310 }
311}
312
313#ifdef CONFIG_NO_IDLE_HZ
314/* Update per-cpu idle times. Used when a no-hz halt is ended. */
315static void vmi_account_no_hz_idle_cycles(int cpu,
316 unsigned long long cur_process_times_cycles)
317{
318 long long cycles_not_accounted;
319 unsigned long no_idle_hz_jiffies = 0;
320
321 cycles_not_accounted = cur_process_times_cycles -
322 per_cpu(process_times_cycles_accounted_cpu, cpu);
323
324 while (cycles_not_accounted >= cycles_per_jiffy) {
325 no_idle_hz_jiffies++;
326 cycles_not_accounted -= cycles_per_jiffy;
327 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
328 }
329 /* Account time to the idle process. */
330 account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
331}
332#endif
333
334/* Update per-cpu stolen time. */
335static void vmi_account_stolen_cycles(int cpu,
336 unsigned long long cur_real_cycles,
337 unsigned long long cur_avail_cycles)
338{
339 long long stolen_cycles_not_accounted;
340 unsigned long stolen_jiffies = 0;
341
342 if (cur_real_cycles < cur_avail_cycles)
343 return;
344
345 stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
346 per_cpu(stolen_cycles_accounted_cpu, cpu);
347
348 while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
349 stolen_jiffies++;
350 stolen_cycles_not_accounted -= cycles_per_jiffy;
351 per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
352 }
353 /* HACK: pass NULL to force time onto cpustat->steal. */
354 account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
355}
356
357/* Body of either IRQ0 interrupt handler (UP no local-APIC) or
358 * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */
359static void vmi_local_timer_interrupt(int cpu)
360{
361 unsigned long long cur_real_cycles, cur_process_times_cycles;
362
363 cur_real_cycles = read_real_cycles();
364 cur_process_times_cycles = read_available_cycles();
365 /* Update system wide (real) time state (xtime, jiffies). */
366 vmi_account_real_cycles(cur_real_cycles);
367 /* Update per-cpu process times. */
368 vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles);
369 /* Update time stolen from this cpu by the hypervisor. */
370 vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
371}
372
373#ifdef CONFIG_NO_IDLE_HZ
374
375/* Must be called only from idle loop, with interrupts disabled. */
376int vmi_stop_hz_timer(void)
377{
378 /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
379
380 unsigned long seq, next;
381 unsigned long long real_cycles_expiry;
382 int cpu = smp_processor_id();
383 int idle;
384
385 BUG_ON(!irqs_disabled());
386 if (sysctl_hz_timer != 0)
387 return 0;
388
389 cpu_set(cpu, nohz_cpu_mask);
390 smp_mb();
391 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
392 (next = next_timer_interrupt(), time_before_eq(next, jiffies))) {
393 cpu_clear(cpu, nohz_cpu_mask);
394 next = jiffies;
395 idle = 0;
396 } else
397 idle = 1;
398
399 /* Convert jiffies to the real cycle counter. */
400 do {
401 seq = read_seqbegin(&xtime_lock);
402 real_cycles_expiry = real_cycles_accounted_system +
403 (long)(next - jiffies) * cycles_per_jiffy;
404 } while (read_seqretry(&xtime_lock, seq));
405
406 /* This cpu is going idle. Disable the periodic alarm. */
407 if (idle) {
408 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
409 per_cpu(idle_start_jiffies, cpu) = jiffies;
410 }
411
412 /* Set the real time alarm to expire at the next event. */
413 vmi_timer_ops.set_alarm(
414 VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
415 real_cycles_expiry, 0);
416
417 return idle;
418}
419
420static void vmi_reenable_hz_timer(int cpu)
421{
422 /* For /proc/vmi/info idle_hz stat. */
423 per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
424 per_cpu(vmi_idle_no_hz_irqs, cpu)++;
425
426 /* Don't bother explicitly cancelling the one-shot alarm -- at
427 * worst we will receive a spurious timer interrupt. */
428 vmi_timer_ops.set_alarm(
429 VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
430 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
431 cycles_per_alarm);
432 /* Indicate this cpu is no longer nohz idle. */
433 cpu_clear(cpu, nohz_cpu_mask);
434}
435
436/* Called from interrupt handlers when (local) HZ timer is disabled. */
437void vmi_account_time_restart_hz_timer(void)
438{
439 unsigned long long cur_real_cycles, cur_process_times_cycles;
440 int cpu = smp_processor_id();
441
442 BUG_ON(!irqs_disabled());
443 /* Account the time during which the HZ timer was disabled. */
444 cur_real_cycles = read_real_cycles();
445 cur_process_times_cycles = read_available_cycles();
446 /* Update system wide (real) time state (xtime, jiffies). */
447 vmi_account_real_cycles(cur_real_cycles);
448 /* Update per-cpu idle times. */
449 vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles);
450 /* Update time stolen from this cpu by the hypervisor. */
451 vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
452 /* Reenable the hz timer. */
453 vmi_reenable_hz_timer(cpu);
454}
455
456#endif /* CONFIG_NO_IDLE_HZ */
457
458/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
459 * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after
460 * APIC setup and vmi_timer_setup_boot_alarm() is called. */
461static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
462{
463 vmi_local_timer_interrupt(smp_processor_id());
464 return IRQ_HANDLED;
465}
466
467#ifdef CONFIG_X86_LOCAL_APIC
468
469/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
470 * Also used in UP when CONFIG_X86_LOCAL_APIC.
471 * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
472void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
473{
474 struct pt_regs *old_regs = set_irq_regs(regs);
475 int cpu = smp_processor_id();
476
477 /*
478 * the NMI deadlock-detector uses this.
479 */
480 per_cpu(irq_stat,cpu).apic_timer_irqs++;
481
482 /*
483 * NOTE! We'd better ACK the irq immediately,
484 * because timer handling can be slow.
485 */
486 ack_APIC_irq();
487
488 /*
489 * update_process_times() expects us to have done irq_enter().
490 * Besides, if we don't, timer interrupts ignore the global
491 * interrupt lock, which is the WrongThing (tm) to do.
492 */
493 irq_enter();
494 vmi_local_timer_interrupt(cpu);
495 irq_exit();
496 set_irq_regs(old_regs);
497}
498
499#endif /* CONFIG_X86_LOCAL_APIC */
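The accounting helpers in vmitime.c all follow one pattern: compare a running cycle counter against the cycles already credited and fold whole ticks' worth into the relevant statistic, carrying the remainder forward. A minimal standalone sketch of that loop; fold_ticks and the cycles-per-tick value are illustrative, not taken from the file:

#include <stdint.h>

#define CYCLES_PER_TICK 2000000ULL		/* e.g. a 2 GHz counter at HZ=1000 */

static uint64_t cycles_accounted;		/* cycles already credited */

/* Return how many whole ticks elapsed since the last call. */
static unsigned int fold_ticks(uint64_t now)
{
	unsigned int ticks = 0;

	while (now - cycles_accounted >= CYCLES_PER_TICK) {
		cycles_accounted += CYCLES_PER_TICK;
		ticks++;		/* caller would feed this to do_timer() etc. */
	}
	return ticks;
}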
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 5038a73d554e..ca51610955df 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -37,9 +37,14 @@ SECTIONS
37{ 37{
38 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; 38 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
39 phys_startup_32 = startup_32 - LOAD_OFFSET; 39 phys_startup_32 = startup_32 - LOAD_OFFSET;
40
41 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
42 _text = .; /* Text and read-only data */
43 *(.text.head)
44 } :text = 0x9090
45
40 /* read-only */ 46 /* read-only */
41 .text : AT(ADDR(.text) - LOAD_OFFSET) { 47 .text : AT(ADDR(.text) - LOAD_OFFSET) {
42 _text = .; /* Text and read-only data */
43 *(.text) 48 *(.text)
44 SCHED_TEXT 49 SCHED_TEXT
45 LOCK_TEXT 50 LOCK_TEXT