aboutsummaryrefslogtreecommitdiffstats
path: root/arch/i386/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--arch/i386/kernel/Makefile8
-rw-r--r--arch/i386/kernel/alternative.c118
-rw-r--r--arch/i386/kernel/apic.c16
-rw-r--r--arch/i386/kernel/apm.c6
-rw-r--r--arch/i386/kernel/asm-offsets.c7
-rw-r--r--arch/i386/kernel/cpu/amd.c22
-rw-r--r--arch/i386/kernel/cpu/common.c25
-rw-r--r--arch/i386/kernel/cpu/cyrix.c2
-rw-r--r--arch/i386/kernel/cpu/intel.c6
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c123
-rw-r--r--arch/i386/kernel/cpu/proc.c8
-rw-r--r--arch/i386/kernel/cpuid.c2
-rw-r--r--arch/i386/kernel/crash.c9
-rw-r--r--arch/i386/kernel/entry.S285
-rw-r--r--arch/i386/kernel/hpet.c67
-rw-r--r--arch/i386/kernel/i8253.c118
-rw-r--r--arch/i386/kernel/i8259.c2
-rw-r--r--arch/i386/kernel/io_apic.c49
-rw-r--r--arch/i386/kernel/irq.c10
-rw-r--r--arch/i386/kernel/kprobes.c95
-rw-r--r--arch/i386/kernel/machine_kexec.c4
-rw-r--r--arch/i386/kernel/msr.c2
-rw-r--r--arch/i386/kernel/nmi.c72
-rw-r--r--arch/i386/kernel/numaq.c10
-rw-r--r--arch/i386/kernel/process.c8
-rw-r--r--arch/i386/kernel/scx200.c66
-rw-r--r--arch/i386/kernel/setup.c1
-rw-r--r--arch/i386/kernel/signal.c4
-rw-r--r--arch/i386/kernel/smp.c12
-rw-r--r--arch/i386/kernel/smpboot.c38
-rw-r--r--arch/i386/kernel/sysenter.c128
-rw-r--r--arch/i386/kernel/time.c157
-rw-r--r--arch/i386/kernel/timers/Makefile9
-rw-r--r--arch/i386/kernel/timers/common.c172
-rw-r--r--arch/i386/kernel/timers/timer.c75
-rw-r--r--arch/i386/kernel/timers/timer_cyclone.c259
-rw-r--r--arch/i386/kernel/timers/timer_hpet.c217
-rw-r--r--arch/i386/kernel/timers/timer_none.c39
-rw-r--r--arch/i386/kernel/timers/timer_pit.c177
-rw-r--r--arch/i386/kernel/timers/timer_pm.c342
-rw-r--r--arch/i386/kernel/timers/timer_tsc.c617
-rw-r--r--arch/i386/kernel/topology.c28
-rw-r--r--arch/i386/kernel/traps.c70
-rw-r--r--arch/i386/kernel/tsc.c478
-rw-r--r--arch/i386/kernel/vmlinux.lds.S9
-rw-r--r--arch/i386/kernel/vsyscall-sysenter.S4
-rw-r--r--arch/i386/kernel/vsyscall.lds.S4
47 files changed, 1647 insertions, 2333 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 96fb8a020af2..5e70c2fb273a 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -7,10 +7,9 @@ extra-y := head.o init_task.o vmlinux.lds
7obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ 7obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ 8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
9 pci-dma.o i386_ksyms.o i387.o bootflag.o \ 9 pci-dma.o i386_ksyms.o i387.o bootflag.o \
10 quirks.o i8237.o topology.o alternative.o 10 quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
11 11
12obj-y += cpu/ 12obj-y += cpu/
13obj-y += timers/
14obj-y += acpi/ 13obj-y += acpi/
15obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o 14obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
16obj-$(CONFIG_MCA) += mca.o 15obj-$(CONFIG_MCA) += mca.o
@@ -37,6 +36,8 @@ obj-$(CONFIG_EFI) += efi.o efi_stub.o
37obj-$(CONFIG_DOUBLEFAULT) += doublefault.o 36obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
38obj-$(CONFIG_VM86) += vm86.o 37obj-$(CONFIG_VM86) += vm86.o
39obj-$(CONFIG_EARLY_PRINTK) += early_printk.o 38obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
39obj-$(CONFIG_HPET_TIMER) += hpet.o
40obj-$(CONFIG_K8_NB) += k8.o
40 41
41EXTRA_AFLAGS := -traditional 42EXTRA_AFLAGS := -traditional
42 43
@@ -76,3 +77,6 @@ SYSCFLAGS_vsyscall-syms.o = -r
76$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ 77$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
77 $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE 78 $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
78 $(call if_changed,syscall) 79 $(call if_changed,syscall)
80
81k8-y += ../../x86_64/kernel/k8.o
82
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 5cbd6f99fb2a..50eb0e03777e 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -4,27 +4,41 @@
4#include <asm/alternative.h> 4#include <asm/alternative.h>
5#include <asm/sections.h> 5#include <asm/sections.h>
6 6
7#define DEBUG 0 7static int no_replacement = 0;
8#if DEBUG 8static int smp_alt_once = 0;
9# define DPRINTK(fmt, args...) printk(fmt, args) 9static int debug_alternative = 0;
10#else 10
11# define DPRINTK(fmt, args...) 11static int __init noreplacement_setup(char *s)
12#endif 12{
13 no_replacement = 1;
14 return 1;
15}
16static int __init bootonly(char *str)
17{
18 smp_alt_once = 1;
19 return 1;
20}
21static int __init debug_alt(char *str)
22{
23 debug_alternative = 1;
24 return 1;
25}
13 26
27__setup("noreplacement", noreplacement_setup);
28__setup("smp-alt-boot", bootonly);
29__setup("debug-alternative", debug_alt);
30
31#define DPRINTK(fmt, args...) if (debug_alternative) \
32 printk(KERN_DEBUG fmt, args)
33
34#ifdef GENERIC_NOP1
14/* Use inline assembly to define this because the nops are defined 35/* Use inline assembly to define this because the nops are defined
15 as inline assembly strings in the include files and we cannot 36 as inline assembly strings in the include files and we cannot
16 get them easily into strings. */ 37 get them easily into strings. */
17asm("\t.data\nintelnops: " 38asm("\t.data\nintelnops: "
18 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 39 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
19 GENERIC_NOP7 GENERIC_NOP8); 40 GENERIC_NOP7 GENERIC_NOP8);
20asm("\t.data\nk8nops: " 41extern unsigned char intelnops[];
21 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
22 K8_NOP7 K8_NOP8);
23asm("\t.data\nk7nops: "
24 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
25 K7_NOP7 K7_NOP8);
26
27extern unsigned char intelnops[], k8nops[], k7nops[];
28static unsigned char *intel_nops[ASM_NOP_MAX+1] = { 42static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
29 NULL, 43 NULL,
30 intelnops, 44 intelnops,
@@ -36,6 +50,13 @@ static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
36 intelnops + 1 + 2 + 3 + 4 + 5 + 6, 50 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
37 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 51 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
38}; 52};
53#endif
54
55#ifdef K8_NOP1
56asm("\t.data\nk8nops: "
57 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
58 K8_NOP7 K8_NOP8);
59extern unsigned char k8nops[];
39static unsigned char *k8_nops[ASM_NOP_MAX+1] = { 60static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
40 NULL, 61 NULL,
41 k8nops, 62 k8nops,
@@ -47,6 +68,13 @@ static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
47 k8nops + 1 + 2 + 3 + 4 + 5 + 6, 68 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
48 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 69 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
49}; 70};
71#endif
72
73#ifdef K7_NOP1
74asm("\t.data\nk7nops: "
75 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
76 K7_NOP7 K7_NOP8);
77extern unsigned char k7nops[];
50static unsigned char *k7_nops[ASM_NOP_MAX+1] = { 78static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
51 NULL, 79 NULL,
52 k7nops, 80 k7nops,
@@ -58,6 +86,18 @@ static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
58 k7nops + 1 + 2 + 3 + 4 + 5 + 6, 86 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
59 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 87 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
60}; 88};
89#endif
90
91#ifdef CONFIG_X86_64
92
93extern char __vsyscall_0;
94static inline unsigned char** find_nop_table(void)
95{
96 return k8_nops;
97}
98
99#else /* CONFIG_X86_64 */
100
61static struct nop { 101static struct nop {
62 int cpuid; 102 int cpuid;
63 unsigned char **noptable; 103 unsigned char **noptable;
@@ -67,14 +107,6 @@ static struct nop {
67 { -1, NULL } 107 { -1, NULL }
68}; 108};
69 109
70
71extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
72extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
73extern u8 *__smp_locks[], *__smp_locks_end[];
74
75extern u8 __smp_alt_begin[], __smp_alt_end[];
76
77
78static unsigned char** find_nop_table(void) 110static unsigned char** find_nop_table(void)
79{ 111{
80 unsigned char **noptable = intel_nops; 112 unsigned char **noptable = intel_nops;
@@ -89,6 +121,14 @@ static unsigned char** find_nop_table(void)
89 return noptable; 121 return noptable;
90} 122}
91 123
124#endif /* CONFIG_X86_64 */
125
126extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
127extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
128extern u8 *__smp_locks[], *__smp_locks_end[];
129
130extern u8 __smp_alt_begin[], __smp_alt_end[];
131
92/* Replace instructions with better alternatives for this CPU type. 132/* Replace instructions with better alternatives for this CPU type.
93 This runs before SMP is initialized to avoid SMP problems with 133 This runs before SMP is initialized to avoid SMP problems with
94 self modifying code. This implies that assymetric systems where 134 self modifying code. This implies that assymetric systems where
@@ -99,6 +139,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
99{ 139{
100 unsigned char **noptable = find_nop_table(); 140 unsigned char **noptable = find_nop_table();
101 struct alt_instr *a; 141 struct alt_instr *a;
142 u8 *instr;
102 int diff, i, k; 143 int diff, i, k;
103 144
104 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); 145 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
@@ -106,7 +147,16 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
106 BUG_ON(a->replacementlen > a->instrlen); 147 BUG_ON(a->replacementlen > a->instrlen);
107 if (!boot_cpu_has(a->cpuid)) 148 if (!boot_cpu_has(a->cpuid))
108 continue; 149 continue;
109 memcpy(a->instr, a->replacement, a->replacementlen); 150 instr = a->instr;
151#ifdef CONFIG_X86_64
152 /* vsyscall code is not mapped yet. resolve it manually. */
153 if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
154 instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
155 DPRINTK("%s: vsyscall fixup: %p => %p\n",
156 __FUNCTION__, a->instr, instr);
157 }
158#endif
159 memcpy(instr, a->replacement, a->replacementlen);
110 diff = a->instrlen - a->replacementlen; 160 diff = a->instrlen - a->replacementlen;
111 /* Pad the rest with nops */ 161 /* Pad the rest with nops */
112 for (i = a->replacementlen; diff > 0; diff -= k, i += k) { 162 for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
@@ -186,14 +236,6 @@ struct smp_alt_module {
186static LIST_HEAD(smp_alt_modules); 236static LIST_HEAD(smp_alt_modules);
187static DEFINE_SPINLOCK(smp_alt); 237static DEFINE_SPINLOCK(smp_alt);
188 238
189static int smp_alt_once = 0;
190static int __init bootonly(char *str)
191{
192 smp_alt_once = 1;
193 return 1;
194}
195__setup("smp-alt-boot", bootonly);
196
197void alternatives_smp_module_add(struct module *mod, char *name, 239void alternatives_smp_module_add(struct module *mod, char *name,
198 void *locks, void *locks_end, 240 void *locks, void *locks_end,
199 void *text, void *text_end) 241 void *text, void *text_end)
@@ -201,6 +243,9 @@ void alternatives_smp_module_add(struct module *mod, char *name,
201 struct smp_alt_module *smp; 243 struct smp_alt_module *smp;
202 unsigned long flags; 244 unsigned long flags;
203 245
246 if (no_replacement)
247 return;
248
204 if (smp_alt_once) { 249 if (smp_alt_once) {
205 if (boot_cpu_has(X86_FEATURE_UP)) 250 if (boot_cpu_has(X86_FEATURE_UP))
206 alternatives_smp_unlock(locks, locks_end, 251 alternatives_smp_unlock(locks, locks_end,
@@ -235,7 +280,7 @@ void alternatives_smp_module_del(struct module *mod)
235 struct smp_alt_module *item; 280 struct smp_alt_module *item;
236 unsigned long flags; 281 unsigned long flags;
237 282
238 if (smp_alt_once) 283 if (no_replacement || smp_alt_once)
239 return; 284 return;
240 285
241 spin_lock_irqsave(&smp_alt, flags); 286 spin_lock_irqsave(&smp_alt, flags);
@@ -256,7 +301,7 @@ void alternatives_smp_switch(int smp)
256 struct smp_alt_module *mod; 301 struct smp_alt_module *mod;
257 unsigned long flags; 302 unsigned long flags;
258 303
259 if (smp_alt_once) 304 if (no_replacement || smp_alt_once)
260 return; 305 return;
261 BUG_ON(!smp && (num_online_cpus() > 1)); 306 BUG_ON(!smp && (num_online_cpus() > 1));
262 307
@@ -285,6 +330,13 @@ void alternatives_smp_switch(int smp)
285 330
286void __init alternative_instructions(void) 331void __init alternative_instructions(void)
287{ 332{
333 if (no_replacement) {
334 printk(KERN_INFO "(SMP-)alternatives turned off\n");
335 free_init_pages("SMP alternatives",
336 (unsigned long)__smp_alt_begin,
337 (unsigned long)__smp_alt_end);
338 return;
339 }
288 apply_alternatives(__alt_instructions, __alt_instructions_end); 340 apply_alternatives(__alt_instructions, __alt_instructions_end);
289 341
290 /* switch to patch-once-at-boottime-only mode and free the 342 /* switch to patch-once-at-boottime-only mode and free the
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 5ab59c12335b..7ce09492fc0c 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -36,6 +36,7 @@
36#include <asm/arch_hooks.h> 36#include <asm/arch_hooks.h>
37#include <asm/hpet.h> 37#include <asm/hpet.h>
38#include <asm/i8253.h> 38#include <asm/i8253.h>
39#include <asm/nmi.h>
39 40
40#include <mach_apic.h> 41#include <mach_apic.h>
41#include <mach_apicdef.h> 42#include <mach_apicdef.h>
@@ -156,7 +157,7 @@ void clear_local_APIC(void)
156 maxlvt = get_maxlvt(); 157 maxlvt = get_maxlvt();
157 158
158 /* 159 /*
159 * Masking an LVT entry on a P6 can trigger a local APIC error 160 * Masking an LVT entry can trigger a local APIC error
160 * if the vector is zero. Mask LVTERR first to prevent this. 161 * if the vector is zero. Mask LVTERR first to prevent this.
161 */ 162 */
162 if (maxlvt >= 3) { 163 if (maxlvt >= 3) {
@@ -1117,7 +1118,18 @@ void disable_APIC_timer(void)
1117 unsigned long v; 1118 unsigned long v;
1118 1119
1119 v = apic_read(APIC_LVTT); 1120 v = apic_read(APIC_LVTT);
1120 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 1121 /*
1122 * When an illegal vector value (0-15) is written to an LVT
1123 * entry and delivery mode is Fixed, the APIC may signal an
1124 * illegal vector error, with out regard to whether the mask
1125 * bit is set or whether an interrupt is actually seen on input.
1126 *
1127 * Boot sequence might call this function when the LVTT has
1128 * '0' vector value. So make sure vector field is set to
1129 * valid value.
1130 */
1131 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1132 apic_write_around(APIC_LVTT, v);
1121 } 1133 }
1122} 1134}
1123 1135
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 9e819eb68229..7c5729d1fd06 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -764,9 +764,9 @@ static int apm_do_idle(void)
764 int idled = 0; 764 int idled = 0;
765 int polling; 765 int polling;
766 766
767 polling = test_thread_flag(TIF_POLLING_NRFLAG); 767 polling = !!(current_thread_info()->status & TS_POLLING);
768 if (polling) { 768 if (polling) {
769 clear_thread_flag(TIF_POLLING_NRFLAG); 769 current_thread_info()->status &= ~TS_POLLING;
770 smp_mb__after_clear_bit(); 770 smp_mb__after_clear_bit();
771 } 771 }
772 if (!need_resched()) { 772 if (!need_resched()) {
@@ -774,7 +774,7 @@ static int apm_do_idle(void)
774 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); 774 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
775 } 775 }
776 if (polling) 776 if (polling)
777 set_thread_flag(TIF_POLLING_NRFLAG); 777 current_thread_info()->status |= TS_POLLING;
778 778
779 if (!idled) 779 if (!idled)
780 return 0; 780 return 0;
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 36d66e2077d0..c80271f8f084 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -4,6 +4,7 @@
4 * to extract and format the required data. 4 * to extract and format the required data.
5 */ 5 */
6 6
7#include <linux/crypto.h>
7#include <linux/sched.h> 8#include <linux/sched.h>
8#include <linux/signal.h> 9#include <linux/signal.h>
9#include <linux/personality.h> 10#include <linux/personality.h>
@@ -13,6 +14,7 @@
13#include <asm/fixmap.h> 14#include <asm/fixmap.h>
14#include <asm/processor.h> 15#include <asm/processor.h>
15#include <asm/thread_info.h> 16#include <asm/thread_info.h>
17#include <asm/elf.h>
16 18
17#define DEFINE(sym, val) \ 19#define DEFINE(sym, val) \
18 asm volatile("\n->" #sym " %0 " #val : : "i" (val)) 20 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -53,6 +55,7 @@ void foo(void)
53 OFFSET(TI_preempt_count, thread_info, preempt_count); 55 OFFSET(TI_preempt_count, thread_info, preempt_count);
54 OFFSET(TI_addr_limit, thread_info, addr_limit); 56 OFFSET(TI_addr_limit, thread_info, addr_limit);
55 OFFSET(TI_restart_block, thread_info, restart_block); 57 OFFSET(TI_restart_block, thread_info, restart_block);
58 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
56 BLANK(); 59 BLANK();
57 60
58 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); 61 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
@@ -68,5 +71,7 @@ void foo(void)
68 sizeof(struct tss_struct)); 71 sizeof(struct tss_struct));
69 72
70 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 73 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
71 DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL)); 74 DEFINE(VDSO_PRELINK, VDSO_PRELINK);
75
76 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
72} 77}
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 786d1a57048b..e6a2d6b80cda 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -224,22 +224,26 @@ static void __init init_amd(struct cpuinfo_x86 *c)
224 224
225#ifdef CONFIG_X86_HT 225#ifdef CONFIG_X86_HT
226 /* 226 /*
227 * On a AMD dual core setup the lower bits of the APIC id 227 * On a AMD multi core setup the lower bits of the APIC id
228 * distingush the cores. Assumes number of cores is a power 228 * distingush the cores.
229 * of two.
230 */ 229 */
231 if (c->x86_max_cores > 1) { 230 if (c->x86_max_cores > 1) {
232 int cpu = smp_processor_id(); 231 int cpu = smp_processor_id();
233 unsigned bits = 0; 232 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
234 while ((1 << bits) < c->x86_max_cores) 233
235 bits++; 234 if (bits == 0) {
236 cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1); 235 while ((1 << bits) < c->x86_max_cores)
237 phys_proc_id[cpu] >>= bits; 236 bits++;
237 }
238 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
239 c->phys_proc_id >>= bits;
238 printk(KERN_INFO "CPU %d(%d) -> Core %d\n", 240 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
239 cpu, c->x86_max_cores, cpu_core_id[cpu]); 241 cpu, c->x86_max_cores, c->cpu_core_id);
240 } 242 }
241#endif 243#endif
242 244
245 if (cpuid_eax(0x80000000) >= 0x80000006)
246 num_cache_leaves = 3;
243} 247}
244 248
245static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) 249static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 44f2c5f2dda1..70c87de582c7 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -294,7 +294,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
294 if (c->x86 >= 0x6) 294 if (c->x86 >= 0x6)
295 c->x86_model += ((tfms >> 16) & 0xF) << 4; 295 c->x86_model += ((tfms >> 16) & 0xF) << 4;
296 c->x86_mask = tfms & 15; 296 c->x86_mask = tfms & 15;
297#ifdef CONFIG_SMP 297#ifdef CONFIG_X86_HT
298 c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); 298 c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
299#else 299#else
300 c->apicid = (ebx >> 24) & 0xFF; 300 c->apicid = (ebx >> 24) & 0xFF;
@@ -319,7 +319,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
319 early_intel_workaround(c); 319 early_intel_workaround(c);
320 320
321#ifdef CONFIG_X86_HT 321#ifdef CONFIG_X86_HT
322 phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; 322 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
323#endif 323#endif
324} 324}
325 325
@@ -477,11 +477,9 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
477{ 477{
478 u32 eax, ebx, ecx, edx; 478 u32 eax, ebx, ecx, edx;
479 int index_msb, core_bits; 479 int index_msb, core_bits;
480 int cpu = smp_processor_id();
481 480
482 cpuid(1, &eax, &ebx, &ecx, &edx); 481 cpuid(1, &eax, &ebx, &ecx, &edx);
483 482
484
485 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) 483 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
486 return; 484 return;
487 485
@@ -492,16 +490,17 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
492 } else if (smp_num_siblings > 1 ) { 490 } else if (smp_num_siblings > 1 ) {
493 491
494 if (smp_num_siblings > NR_CPUS) { 492 if (smp_num_siblings > NR_CPUS) {
495 printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); 493 printk(KERN_WARNING "CPU: Unsupported number of the "
494 "siblings %d", smp_num_siblings);
496 smp_num_siblings = 1; 495 smp_num_siblings = 1;
497 return; 496 return;
498 } 497 }
499 498
500 index_msb = get_count_order(smp_num_siblings); 499 index_msb = get_count_order(smp_num_siblings);
501 phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); 500 c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
502 501
503 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 502 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
504 phys_proc_id[cpu]); 503 c->phys_proc_id);
505 504
506 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 505 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
507 506
@@ -509,12 +508,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
509 508
510 core_bits = get_count_order(c->x86_max_cores); 509 core_bits = get_count_order(c->x86_max_cores);
511 510
512 cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & 511 c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
513 ((1 << core_bits) - 1); 512 ((1 << core_bits) - 1);
514 513
515 if (c->x86_max_cores > 1) 514 if (c->x86_max_cores > 1)
516 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 515 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
517 cpu_core_id[cpu]); 516 c->cpu_core_id);
518 } 517 }
519} 518}
520#endif 519#endif
@@ -613,6 +612,12 @@ void __cpuinit cpu_init(void)
613 set_in_cr4(X86_CR4_TSD); 612 set_in_cr4(X86_CR4_TSD);
614 } 613 }
615 614
615 /* The CPU hotplug case */
616 if (cpu_gdt_descr->address) {
617 gdt = (struct desc_struct *)cpu_gdt_descr->address;
618 memset(gdt, 0, PAGE_SIZE);
619 goto old_gdt;
620 }
616 /* 621 /*
617 * This is a horrible hack to allocate the GDT. The problem 622 * This is a horrible hack to allocate the GDT. The problem
618 * is that cpu_init() is called really early for the boot CPU 623 * is that cpu_init() is called really early for the boot CPU
@@ -631,7 +636,7 @@ void __cpuinit cpu_init(void)
631 local_irq_enable(); 636 local_irq_enable();
632 } 637 }
633 } 638 }
634 639old_gdt:
635 /* 640 /*
636 * Initialize the per-CPU GDT with the boot GDT, 641 * Initialize the per-CPU GDT with the boot GDT,
637 * and set up the GDT descriptor: 642 * and set up the GDT descriptor:
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index fc32c8028e24..f03b7f94c304 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -354,7 +354,7 @@ static void __init init_nsc(struct cpuinfo_x86 *c)
354 * This function only handles the GX processor, and kicks every 354 * This function only handles the GX processor, and kicks every
355 * thing else to the Cyrix init function above - that should 355 * thing else to the Cyrix init function above - that should
356 * cover any processors that might have been branded differently 356 * cover any processors that might have been branded differently
357 * after NSC aquired Cyrix. 357 * after NSC acquired Cyrix.
358 * 358 *
359 * If this breaks your GX1 horribly, please e-mail 359 * If this breaks your GX1 horribly, please e-mail
360 * info-linux@ldcmail.amd.com to tell us. 360 * info-linux@ldcmail.amd.com to tell us.
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 5386b29bb5a5..10afc645c540 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -122,6 +122,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
122 122
123 select_idle_routine(c); 123 select_idle_routine(c);
124 l2 = init_intel_cacheinfo(c); 124 l2 = init_intel_cacheinfo(c);
125 if (c->cpuid_level > 9 ) {
126 unsigned eax = cpuid_eax(10);
127 /* Check for version and the number of counters */
128 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
129 set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability);
130 }
125 131
126 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 132 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
127 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 133 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index c8547a6fa7e6..e9f0b928b0a9 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -4,6 +4,7 @@
4 * Changes: 4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4) 5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. 6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen : CPUID4 emulation on AMD.
7 */ 8 */
8 9
9#include <linux/init.h> 10#include <linux/init.h>
@@ -130,25 +131,111 @@ struct _cpuid4_info {
130 cpumask_t shared_cpu_map; 131 cpumask_t shared_cpu_map;
131}; 132};
132 133
133static unsigned short num_cache_leaves; 134unsigned short num_cache_leaves;
135
136/* AMD doesn't have CPUID4. Emulate it here to report the same
137 information to the user. This makes some assumptions about the machine:
138 No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs.
139
140 In theory the TLBs could be reported as fake type (they are in "dummy").
141 Maybe later */
142union l1_cache {
143 struct {
144 unsigned line_size : 8;
145 unsigned lines_per_tag : 8;
146 unsigned assoc : 8;
147 unsigned size_in_kb : 8;
148 };
149 unsigned val;
150};
151
152union l2_cache {
153 struct {
154 unsigned line_size : 8;
155 unsigned lines_per_tag : 4;
156 unsigned assoc : 4;
157 unsigned size_in_kb : 16;
158 };
159 unsigned val;
160};
161
162static const unsigned short assocs[] = {
163 [1] = 1, [2] = 2, [4] = 4, [6] = 8,
164 [8] = 16,
165 [0xf] = 0xffff // ??
166 };
167static const unsigned char levels[] = { 1, 1, 2 };
168static const unsigned char types[] = { 1, 2, 3 };
169
170static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
171 union _cpuid4_leaf_ebx *ebx,
172 union _cpuid4_leaf_ecx *ecx)
173{
174 unsigned dummy;
175 unsigned line_size, lines_per_tag, assoc, size_in_kb;
176 union l1_cache l1i, l1d;
177 union l2_cache l2;
178
179 eax->full = 0;
180 ebx->full = 0;
181 ecx->full = 0;
182
183 cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
184 cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy);
185
186 if (leaf > 2 || !l1d.val || !l1i.val || !l2.val)
187 return;
188
189 eax->split.is_self_initializing = 1;
190 eax->split.type = types[leaf];
191 eax->split.level = levels[leaf];
192 eax->split.num_threads_sharing = 0;
193 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
194
195 if (leaf <= 1) {
196 union l1_cache *l1 = leaf == 0 ? &l1d : &l1i;
197 assoc = l1->assoc;
198 line_size = l1->line_size;
199 lines_per_tag = l1->lines_per_tag;
200 size_in_kb = l1->size_in_kb;
201 } else {
202 assoc = l2.assoc;
203 line_size = l2.line_size;
204 lines_per_tag = l2.lines_per_tag;
205 /* cpu_data has errata corrections for K7 applied */
206 size_in_kb = current_cpu_data.x86_cache_size;
207 }
208
209 if (assoc == 0xf)
210 eax->split.is_fully_associative = 1;
211 ebx->split.coherency_line_size = line_size - 1;
212 ebx->split.ways_of_associativity = assocs[assoc] - 1;
213 ebx->split.physical_line_partition = lines_per_tag - 1;
214 ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
215 (ebx->split.ways_of_associativity + 1) - 1;
216}
134 217
135static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 218static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
136{ 219{
137 unsigned int eax, ebx, ecx, edx; 220 union _cpuid4_leaf_eax eax;
138 union _cpuid4_leaf_eax cache_eax; 221 union _cpuid4_leaf_ebx ebx;
222 union _cpuid4_leaf_ecx ecx;
223 unsigned edx;
139 224
140 cpuid_count(4, index, &eax, &ebx, &ecx, &edx); 225 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
141 cache_eax.full = eax; 226 amd_cpuid4(index, &eax, &ebx, &ecx);
142 if (cache_eax.split.type == CACHE_TYPE_NULL) 227 else
228 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
229 if (eax.split.type == CACHE_TYPE_NULL)
143 return -EIO; /* better error ? */ 230 return -EIO; /* better error ? */
144 231
145 this_leaf->eax.full = eax; 232 this_leaf->eax = eax;
146 this_leaf->ebx.full = ebx; 233 this_leaf->ebx = ebx;
147 this_leaf->ecx.full = ecx; 234 this_leaf->ecx = ecx;
148 this_leaf->size = (this_leaf->ecx.split.number_of_sets + 1) * 235 this_leaf->size = (ecx.split.number_of_sets + 1) *
149 (this_leaf->ebx.split.coherency_line_size + 1) * 236 (ebx.split.coherency_line_size + 1) *
150 (this_leaf->ebx.split.physical_line_partition + 1) * 237 (ebx.split.physical_line_partition + 1) *
151 (this_leaf->ebx.split.ways_of_associativity + 1); 238 (ebx.split.ways_of_associativity + 1);
152 return 0; 239 return 0;
153} 240}
154 241
@@ -174,7 +261,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
174 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ 261 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
175 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ 262 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
176 unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; 263 unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
177#ifdef CONFIG_SMP 264#ifdef CONFIG_X86_HT
178 unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data); 265 unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
179#endif 266#endif
180 267
@@ -296,14 +383,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
296 383
297 if (new_l2) { 384 if (new_l2) {
298 l2 = new_l2; 385 l2 = new_l2;
299#ifdef CONFIG_SMP 386#ifdef CONFIG_X86_HT
300 cpu_llc_id[cpu] = l2_id; 387 cpu_llc_id[cpu] = l2_id;
301#endif 388#endif
302 } 389 }
303 390
304 if (new_l3) { 391 if (new_l3) {
305 l3 = new_l3; 392 l3 = new_l3;
306#ifdef CONFIG_SMP 393#ifdef CONFIG_X86_HT
307 cpu_llc_id[cpu] = l3_id; 394 cpu_llc_id[cpu] = l3_id;
308#endif 395#endif
309 } 396 }
@@ -642,7 +729,7 @@ static void __cpuexit cache_remove_dev(struct sys_device * sys_dev)
642 return; 729 return;
643} 730}
644 731
645static int cacheinfo_cpu_callback(struct notifier_block *nfb, 732static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
646 unsigned long action, void *hcpu) 733 unsigned long action, void *hcpu)
647{ 734{
648 unsigned int cpu = (unsigned long)hcpu; 735 unsigned int cpu = (unsigned long)hcpu;
@@ -660,7 +747,7 @@ static int cacheinfo_cpu_callback(struct notifier_block *nfb,
660 return NOTIFY_OK; 747 return NOTIFY_OK;
661} 748}
662 749
663static struct notifier_block cacheinfo_cpu_notifier = 750static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier =
664{ 751{
665 .notifier_call = cacheinfo_cpu_callback, 752 .notifier_call = cacheinfo_cpu_callback,
666}; 753};
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index a19fcb262dbb..f54a15268ed7 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -18,7 +18,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
18 * applications want to get the raw CPUID data, they should access 18 * applications want to get the raw CPUID data, they should access
19 * /dev/cpu/<cpu_nr>/cpuid instead. 19 * /dev/cpu/<cpu_nr>/cpuid instead.
20 */ 20 */
21 static char *x86_cap_flags[] = { 21 static const char * const x86_cap_flags[] = {
22 /* Intel-defined */ 22 /* Intel-defined */
23 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", 23 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
24 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", 24 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
@@ -62,7 +62,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64 }; 64 };
65 static char *x86_power_flags[] = { 65 static const char * const x86_power_flags[] = {
66 "ts", /* temperature sensor */ 66 "ts", /* temperature sensor */
67 "fid", /* frequency id control */ 67 "fid", /* frequency id control */
68 "vid", /* voltage id control */ 68 "vid", /* voltage id control */
@@ -109,9 +109,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
109 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); 109 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
110#ifdef CONFIG_X86_HT 110#ifdef CONFIG_X86_HT
111 if (c->x86_max_cores * smp_num_siblings > 1) { 111 if (c->x86_max_cores * smp_num_siblings > 1) {
112 seq_printf(m, "physical id\t: %d\n", phys_proc_id[n]); 112 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
113 seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n])); 113 seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n]));
114 seq_printf(m, "core id\t\t: %d\n", cpu_core_id[n]); 114 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
115 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); 115 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
116 } 116 }
117#endif 117#endif
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 1d9a4abcdfc7..f6dfa9fb675c 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -183,7 +183,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long ac
183 return NOTIFY_OK; 183 return NOTIFY_OK;
184} 184}
185 185
186static struct notifier_block cpuid_class_cpu_notifier = 186static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
187{ 187{
188 .notifier_call = cpuid_class_cpu_callback, 188 .notifier_call = cpuid_class_cpu_callback,
189}; 189};
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 21dc1bbb8067..48f0f62f781c 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -120,14 +120,9 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
120 return 1; 120 return 1;
121} 121}
122 122
123/*
124 * By using the NMI code instead of a vector we just sneak thru the
125 * word generator coming out with just what we want. AND it does
126 * not matter if clustered_apic_mode is set or not.
127 */
128static void smp_send_nmi_allbutself(void) 123static void smp_send_nmi_allbutself(void)
129{ 124{
130 send_IPI_allbutself(APIC_DM_NMI); 125 send_IPI_allbutself(NMI_VECTOR);
131} 126}
132 127
133static void nmi_shootdown_cpus(void) 128static void nmi_shootdown_cpus(void)
@@ -163,7 +158,7 @@ static void nmi_shootdown_cpus(void)
163void machine_crash_shutdown(struct pt_regs *regs) 158void machine_crash_shutdown(struct pt_regs *regs)
164{ 159{
165 /* This function is only called after the system 160 /* This function is only called after the system
166 * has paniced or is otherwise in a critical state. 161 * has panicked or is otherwise in a critical state.
167 * The minimum amount of code to allow a kexec'd kernel 162 * The minimum amount of code to allow a kexec'd kernel
168 * to run successfully needs to happen here. 163 * to run successfully needs to happen here.
169 * 164 *
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index cfc683f153b9..fbdb933251b6 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -48,6 +48,7 @@
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/page.h> 49#include <asm/page.h>
50#include <asm/desc.h> 50#include <asm/desc.h>
51#include <asm/dwarf2.h>
51#include "irq_vectors.h" 52#include "irq_vectors.h"
52 53
53#define nr_syscalls ((syscall_table_size)/4) 54#define nr_syscalls ((syscall_table_size)/4)
@@ -82,34 +83,76 @@ VM_MASK = 0x00020000
82#define resume_kernel restore_nocheck 83#define resume_kernel restore_nocheck
83#endif 84#endif
84 85
86#ifdef CONFIG_VM86
87#define resume_userspace_sig check_userspace
88#else
89#define resume_userspace_sig resume_userspace
90#endif
91
85#define SAVE_ALL \ 92#define SAVE_ALL \
86 cld; \ 93 cld; \
87 pushl %es; \ 94 pushl %es; \
95 CFI_ADJUST_CFA_OFFSET 4;\
96 /*CFI_REL_OFFSET es, 0;*/\
88 pushl %ds; \ 97 pushl %ds; \
98 CFI_ADJUST_CFA_OFFSET 4;\
99 /*CFI_REL_OFFSET ds, 0;*/\
89 pushl %eax; \ 100 pushl %eax; \
101 CFI_ADJUST_CFA_OFFSET 4;\
102 CFI_REL_OFFSET eax, 0;\
90 pushl %ebp; \ 103 pushl %ebp; \
104 CFI_ADJUST_CFA_OFFSET 4;\
105 CFI_REL_OFFSET ebp, 0;\
91 pushl %edi; \ 106 pushl %edi; \
107 CFI_ADJUST_CFA_OFFSET 4;\
108 CFI_REL_OFFSET edi, 0;\
92 pushl %esi; \ 109 pushl %esi; \
110 CFI_ADJUST_CFA_OFFSET 4;\
111 CFI_REL_OFFSET esi, 0;\
93 pushl %edx; \ 112 pushl %edx; \
113 CFI_ADJUST_CFA_OFFSET 4;\
114 CFI_REL_OFFSET edx, 0;\
94 pushl %ecx; \ 115 pushl %ecx; \
116 CFI_ADJUST_CFA_OFFSET 4;\
117 CFI_REL_OFFSET ecx, 0;\
95 pushl %ebx; \ 118 pushl %ebx; \
119 CFI_ADJUST_CFA_OFFSET 4;\
120 CFI_REL_OFFSET ebx, 0;\
96 movl $(__USER_DS), %edx; \ 121 movl $(__USER_DS), %edx; \
97 movl %edx, %ds; \ 122 movl %edx, %ds; \
98 movl %edx, %es; 123 movl %edx, %es;
99 124
100#define RESTORE_INT_REGS \ 125#define RESTORE_INT_REGS \
101 popl %ebx; \ 126 popl %ebx; \
127 CFI_ADJUST_CFA_OFFSET -4;\
128 CFI_RESTORE ebx;\
102 popl %ecx; \ 129 popl %ecx; \
130 CFI_ADJUST_CFA_OFFSET -4;\
131 CFI_RESTORE ecx;\
103 popl %edx; \ 132 popl %edx; \
133 CFI_ADJUST_CFA_OFFSET -4;\
134 CFI_RESTORE edx;\
104 popl %esi; \ 135 popl %esi; \
136 CFI_ADJUST_CFA_OFFSET -4;\
137 CFI_RESTORE esi;\
105 popl %edi; \ 138 popl %edi; \
139 CFI_ADJUST_CFA_OFFSET -4;\
140 CFI_RESTORE edi;\
106 popl %ebp; \ 141 popl %ebp; \
107 popl %eax 142 CFI_ADJUST_CFA_OFFSET -4;\
143 CFI_RESTORE ebp;\
144 popl %eax; \
145 CFI_ADJUST_CFA_OFFSET -4;\
146 CFI_RESTORE eax
108 147
109#define RESTORE_REGS \ 148#define RESTORE_REGS \
110 RESTORE_INT_REGS; \ 149 RESTORE_INT_REGS; \
1111: popl %ds; \ 1501: popl %ds; \
151 CFI_ADJUST_CFA_OFFSET -4;\
152 /*CFI_RESTORE ds;*/\
1122: popl %es; \ 1532: popl %es; \
154 CFI_ADJUST_CFA_OFFSET -4;\
155 /*CFI_RESTORE es;*/\
113.section .fixup,"ax"; \ 156.section .fixup,"ax"; \
1143: movl $0,(%esp); \ 1573: movl $0,(%esp); \
115 jmp 1b; \ 158 jmp 1b; \
@@ -122,13 +165,43 @@ VM_MASK = 0x00020000
122 .long 2b,4b; \ 165 .long 2b,4b; \
123.previous 166.previous
124 167
168#define RING0_INT_FRAME \
169 CFI_STARTPROC simple;\
170 CFI_DEF_CFA esp, 3*4;\
171 /*CFI_OFFSET cs, -2*4;*/\
172 CFI_OFFSET eip, -3*4
173
174#define RING0_EC_FRAME \
175 CFI_STARTPROC simple;\
176 CFI_DEF_CFA esp, 4*4;\
177 /*CFI_OFFSET cs, -2*4;*/\
178 CFI_OFFSET eip, -3*4
179
180#define RING0_PTREGS_FRAME \
181 CFI_STARTPROC simple;\
182 CFI_DEF_CFA esp, OLDESP-EBX;\
183 /*CFI_OFFSET cs, CS-OLDESP;*/\
184 CFI_OFFSET eip, EIP-OLDESP;\
185 /*CFI_OFFSET es, ES-OLDESP;*/\
186 /*CFI_OFFSET ds, DS-OLDESP;*/\
187 CFI_OFFSET eax, EAX-OLDESP;\
188 CFI_OFFSET ebp, EBP-OLDESP;\
189 CFI_OFFSET edi, EDI-OLDESP;\
190 CFI_OFFSET esi, ESI-OLDESP;\
191 CFI_OFFSET edx, EDX-OLDESP;\
192 CFI_OFFSET ecx, ECX-OLDESP;\
193 CFI_OFFSET ebx, EBX-OLDESP
125 194
126ENTRY(ret_from_fork) 195ENTRY(ret_from_fork)
196 CFI_STARTPROC
127 pushl %eax 197 pushl %eax
198 CFI_ADJUST_CFA_OFFSET -4
128 call schedule_tail 199 call schedule_tail
129 GET_THREAD_INFO(%ebp) 200 GET_THREAD_INFO(%ebp)
130 popl %eax 201 popl %eax
202 CFI_ADJUST_CFA_OFFSET -4
131 jmp syscall_exit 203 jmp syscall_exit
204 CFI_ENDPROC
132 205
133/* 206/*
134 * Return to user mode is not as complex as all this looks, 207 * Return to user mode is not as complex as all this looks,
@@ -139,10 +212,12 @@ ENTRY(ret_from_fork)
139 212
140 # userspace resumption stub bypassing syscall exit tracing 213 # userspace resumption stub bypassing syscall exit tracing
141 ALIGN 214 ALIGN
215 RING0_PTREGS_FRAME
142ret_from_exception: 216ret_from_exception:
143 preempt_stop 217 preempt_stop
144ret_from_intr: 218ret_from_intr:
145 GET_THREAD_INFO(%ebp) 219 GET_THREAD_INFO(%ebp)
220check_userspace:
146 movl EFLAGS(%esp), %eax # mix EFLAGS and CS 221 movl EFLAGS(%esp), %eax # mix EFLAGS and CS
147 movb CS(%esp), %al 222 movb CS(%esp), %al
148 testl $(VM_MASK | 3), %eax 223 testl $(VM_MASK | 3), %eax
@@ -171,20 +246,38 @@ need_resched:
171 call preempt_schedule_irq 246 call preempt_schedule_irq
172 jmp need_resched 247 jmp need_resched
173#endif 248#endif
249 CFI_ENDPROC
174 250
175/* SYSENTER_RETURN points to after the "sysenter" instruction in 251/* SYSENTER_RETURN points to after the "sysenter" instruction in
176 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 252 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
177 253
178 # sysenter call handler stub 254 # sysenter call handler stub
179ENTRY(sysenter_entry) 255ENTRY(sysenter_entry)
256 CFI_STARTPROC simple
257 CFI_DEF_CFA esp, 0
258 CFI_REGISTER esp, ebp
180 movl TSS_sysenter_esp0(%esp),%esp 259 movl TSS_sysenter_esp0(%esp),%esp
181sysenter_past_esp: 260sysenter_past_esp:
182 sti 261 sti
183 pushl $(__USER_DS) 262 pushl $(__USER_DS)
263 CFI_ADJUST_CFA_OFFSET 4
264 /*CFI_REL_OFFSET ss, 0*/
184 pushl %ebp 265 pushl %ebp
266 CFI_ADJUST_CFA_OFFSET 4
267 CFI_REL_OFFSET esp, 0
185 pushfl 268 pushfl
269 CFI_ADJUST_CFA_OFFSET 4
186 pushl $(__USER_CS) 270 pushl $(__USER_CS)
187 pushl $SYSENTER_RETURN 271 CFI_ADJUST_CFA_OFFSET 4
272 /*CFI_REL_OFFSET cs, 0*/
273 /*
274 * Push current_thread_info()->sysenter_return to the stack.
275 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
276 * pushed above; +8 corresponds to copy_thread's esp0 setting.
277 */
278 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
279 CFI_ADJUST_CFA_OFFSET 4
280 CFI_REL_OFFSET eip, 0
188 281
189/* 282/*
190 * Load the potential sixth argument from user stack. 283 * Load the potential sixth argument from user stack.
@@ -199,6 +292,7 @@ sysenter_past_esp:
199.previous 292.previous
200 293
201 pushl %eax 294 pushl %eax
295 CFI_ADJUST_CFA_OFFSET 4
202 SAVE_ALL 296 SAVE_ALL
203 GET_THREAD_INFO(%ebp) 297 GET_THREAD_INFO(%ebp)
204 298
@@ -219,11 +313,14 @@ sysenter_past_esp:
219 xorl %ebp,%ebp 313 xorl %ebp,%ebp
220 sti 314 sti
221 sysexit 315 sysexit
316 CFI_ENDPROC
222 317
223 318
224 # system call handler stub 319 # system call handler stub
225ENTRY(system_call) 320ENTRY(system_call)
321 RING0_INT_FRAME # can't unwind into user space anyway
226 pushl %eax # save orig_eax 322 pushl %eax # save orig_eax
323 CFI_ADJUST_CFA_OFFSET 4
227 SAVE_ALL 324 SAVE_ALL
228 GET_THREAD_INFO(%ebp) 325 GET_THREAD_INFO(%ebp)
229 testl $TF_MASK,EFLAGS(%esp) 326 testl $TF_MASK,EFLAGS(%esp)
@@ -256,10 +353,12 @@ restore_all:
256 movb CS(%esp), %al 353 movb CS(%esp), %al
257 andl $(VM_MASK | (4 << 8) | 3), %eax 354 andl $(VM_MASK | (4 << 8) | 3), %eax
258 cmpl $((4 << 8) | 3), %eax 355 cmpl $((4 << 8) | 3), %eax
356 CFI_REMEMBER_STATE
259 je ldt_ss # returning to user-space with LDT SS 357 je ldt_ss # returning to user-space with LDT SS
260restore_nocheck: 358restore_nocheck:
261 RESTORE_REGS 359 RESTORE_REGS
262 addl $4, %esp 360 addl $4, %esp
361 CFI_ADJUST_CFA_OFFSET -4
2631: iret 3621: iret
264.section .fixup,"ax" 363.section .fixup,"ax"
265iret_exc: 364iret_exc:
@@ -273,6 +372,7 @@ iret_exc:
273 .long 1b,iret_exc 372 .long 1b,iret_exc
274.previous 373.previous
275 374
375 CFI_RESTORE_STATE
276ldt_ss: 376ldt_ss:
277 larl OLDSS(%esp), %eax 377 larl OLDSS(%esp), %eax
278 jnz restore_nocheck 378 jnz restore_nocheck
@@ -285,11 +385,13 @@ ldt_ss:
285 * CPUs, which we can try to work around to make 385 * CPUs, which we can try to work around to make
286 * dosemu and wine happy. */ 386 * dosemu and wine happy. */
287 subl $8, %esp # reserve space for switch16 pointer 387 subl $8, %esp # reserve space for switch16 pointer
388 CFI_ADJUST_CFA_OFFSET 8
288 cli 389 cli
289 movl %esp, %eax 390 movl %esp, %eax
290 /* Set up the 16bit stack frame with switch32 pointer on top, 391 /* Set up the 16bit stack frame with switch32 pointer on top,
291 * and a switch16 pointer on top of the current frame. */ 392 * and a switch16 pointer on top of the current frame. */
292 call setup_x86_bogus_stack 393 call setup_x86_bogus_stack
394 CFI_ADJUST_CFA_OFFSET -8 # frame has moved
293 RESTORE_REGS 395 RESTORE_REGS
294 lss 20+4(%esp), %esp # switch to 16bit stack 396 lss 20+4(%esp), %esp # switch to 16bit stack
2951: iret 3971: iret
@@ -297,9 +399,11 @@ ldt_ss:
297 .align 4 399 .align 4
298 .long 1b,iret_exc 400 .long 1b,iret_exc
299.previous 401.previous
402 CFI_ENDPROC
300 403
301 # perform work that needs to be done immediately before resumption 404 # perform work that needs to be done immediately before resumption
302 ALIGN 405 ALIGN
406 RING0_PTREGS_FRAME # can't unwind into user space anyway
303work_pending: 407work_pending:
304 testb $_TIF_NEED_RESCHED, %cl 408 testb $_TIF_NEED_RESCHED, %cl
305 jz work_notifysig 409 jz work_notifysig
@@ -323,18 +427,20 @@ work_notifysig: # deal with pending signals and
323 # vm86-space 427 # vm86-space
324 xorl %edx, %edx 428 xorl %edx, %edx
325 call do_notify_resume 429 call do_notify_resume
326 jmp resume_userspace 430 jmp resume_userspace_sig
327 431
328 ALIGN 432 ALIGN
329work_notifysig_v86: 433work_notifysig_v86:
330#ifdef CONFIG_VM86 434#ifdef CONFIG_VM86
331 pushl %ecx # save ti_flags for do_notify_resume 435 pushl %ecx # save ti_flags for do_notify_resume
436 CFI_ADJUST_CFA_OFFSET 4
332 call save_v86_state # %eax contains pt_regs pointer 437 call save_v86_state # %eax contains pt_regs pointer
333 popl %ecx 438 popl %ecx
439 CFI_ADJUST_CFA_OFFSET -4
334 movl %eax, %esp 440 movl %eax, %esp
335 xorl %edx, %edx 441 xorl %edx, %edx
336 call do_notify_resume 442 call do_notify_resume
337 jmp resume_userspace 443 jmp resume_userspace_sig
338#endif 444#endif
339 445
340 # perform syscall exit tracing 446 # perform syscall exit tracing
@@ -363,19 +469,21 @@ syscall_exit_work:
363 movl $1, %edx 469 movl $1, %edx
364 call do_syscall_trace 470 call do_syscall_trace
365 jmp resume_userspace 471 jmp resume_userspace
472 CFI_ENDPROC
366 473
367 ALIGN 474 RING0_INT_FRAME # can't unwind into user space anyway
368syscall_fault: 475syscall_fault:
369 pushl %eax # save orig_eax 476 pushl %eax # save orig_eax
477 CFI_ADJUST_CFA_OFFSET 4
370 SAVE_ALL 478 SAVE_ALL
371 GET_THREAD_INFO(%ebp) 479 GET_THREAD_INFO(%ebp)
372 movl $-EFAULT,EAX(%esp) 480 movl $-EFAULT,EAX(%esp)
373 jmp resume_userspace 481 jmp resume_userspace
374 482
375 ALIGN
376syscall_badsys: 483syscall_badsys:
377 movl $-ENOSYS,EAX(%esp) 484 movl $-ENOSYS,EAX(%esp)
378 jmp resume_userspace 485 jmp resume_userspace
486 CFI_ENDPROC
379 487
380#define FIXUP_ESPFIX_STACK \ 488#define FIXUP_ESPFIX_STACK \
381 movl %esp, %eax; \ 489 movl %esp, %eax; \
@@ -387,16 +495,21 @@ syscall_badsys:
387 movl %eax, %esp; 495 movl %eax, %esp;
388#define UNWIND_ESPFIX_STACK \ 496#define UNWIND_ESPFIX_STACK \
389 pushl %eax; \ 497 pushl %eax; \
498 CFI_ADJUST_CFA_OFFSET 4; \
390 movl %ss, %eax; \ 499 movl %ss, %eax; \
391 /* see if on 16bit stack */ \ 500 /* see if on 16bit stack */ \
392 cmpw $__ESPFIX_SS, %ax; \ 501 cmpw $__ESPFIX_SS, %ax; \
393 jne 28f; \ 502 je 28f; \
394 movl $__KERNEL_DS, %edx; \ 50327: popl %eax; \
395 movl %edx, %ds; \ 504 CFI_ADJUST_CFA_OFFSET -4; \
396 movl %edx, %es; \ 505.section .fixup,"ax"; \
50628: movl $__KERNEL_DS, %eax; \
507 movl %eax, %ds; \
508 movl %eax, %es; \
397 /* switch to 32bit stack */ \ 509 /* switch to 32bit stack */ \
398 FIXUP_ESPFIX_STACK \ 510 FIXUP_ESPFIX_STACK; \
39928: popl %eax; 511 jmp 27b; \
512.previous
400 513
401/* 514/*
402 * Build the entry stubs and pointer table with 515 * Build the entry stubs and pointer table with
@@ -408,9 +521,14 @@ ENTRY(interrupt)
408 521
409vector=0 522vector=0
410ENTRY(irq_entries_start) 523ENTRY(irq_entries_start)
524 RING0_INT_FRAME
411.rept NR_IRQS 525.rept NR_IRQS
412 ALIGN 526 ALIGN
4131: pushl $vector-256 527 .if vector
528 CFI_ADJUST_CFA_OFFSET -4
529 .endif
5301: pushl $~(vector)
531 CFI_ADJUST_CFA_OFFSET 4
414 jmp common_interrupt 532 jmp common_interrupt
415.data 533.data
416 .long 1b 534 .long 1b
@@ -424,60 +542,99 @@ common_interrupt:
424 movl %esp,%eax 542 movl %esp,%eax
425 call do_IRQ 543 call do_IRQ
426 jmp ret_from_intr 544 jmp ret_from_intr
545 CFI_ENDPROC
427 546
428#define BUILD_INTERRUPT(name, nr) \ 547#define BUILD_INTERRUPT(name, nr) \
429ENTRY(name) \ 548ENTRY(name) \
430 pushl $nr-256; \ 549 RING0_INT_FRAME; \
431 SAVE_ALL \ 550 pushl $~(nr); \
551 CFI_ADJUST_CFA_OFFSET 4; \
552 SAVE_ALL; \
432 movl %esp,%eax; \ 553 movl %esp,%eax; \
433 call smp_/**/name; \ 554 call smp_/**/name; \
434 jmp ret_from_intr; 555 jmp ret_from_intr; \
556 CFI_ENDPROC
435 557
436/* The include is where all of the SMP etc. interrupts come from */ 558/* The include is where all of the SMP etc. interrupts come from */
437#include "entry_arch.h" 559#include "entry_arch.h"
438 560
439ENTRY(divide_error) 561ENTRY(divide_error)
562 RING0_INT_FRAME
440 pushl $0 # no error code 563 pushl $0 # no error code
564 CFI_ADJUST_CFA_OFFSET 4
441 pushl $do_divide_error 565 pushl $do_divide_error
566 CFI_ADJUST_CFA_OFFSET 4
442 ALIGN 567 ALIGN
443error_code: 568error_code:
444 pushl %ds 569 pushl %ds
570 CFI_ADJUST_CFA_OFFSET 4
571 /*CFI_REL_OFFSET ds, 0*/
445 pushl %eax 572 pushl %eax
573 CFI_ADJUST_CFA_OFFSET 4
574 CFI_REL_OFFSET eax, 0
446 xorl %eax, %eax 575 xorl %eax, %eax
447 pushl %ebp 576 pushl %ebp
577 CFI_ADJUST_CFA_OFFSET 4
578 CFI_REL_OFFSET ebp, 0
448 pushl %edi 579 pushl %edi
580 CFI_ADJUST_CFA_OFFSET 4
581 CFI_REL_OFFSET edi, 0
449 pushl %esi 582 pushl %esi
583 CFI_ADJUST_CFA_OFFSET 4
584 CFI_REL_OFFSET esi, 0
450 pushl %edx 585 pushl %edx
586 CFI_ADJUST_CFA_OFFSET 4
587 CFI_REL_OFFSET edx, 0
451 decl %eax # eax = -1 588 decl %eax # eax = -1
452 pushl %ecx 589 pushl %ecx
590 CFI_ADJUST_CFA_OFFSET 4
591 CFI_REL_OFFSET ecx, 0
453 pushl %ebx 592 pushl %ebx
593 CFI_ADJUST_CFA_OFFSET 4
594 CFI_REL_OFFSET ebx, 0
454 cld 595 cld
455 pushl %es 596 pushl %es
597 CFI_ADJUST_CFA_OFFSET 4
598 /*CFI_REL_OFFSET es, 0*/
456 UNWIND_ESPFIX_STACK 599 UNWIND_ESPFIX_STACK
457 popl %ecx 600 popl %ecx
601 CFI_ADJUST_CFA_OFFSET -4
602 /*CFI_REGISTER es, ecx*/
458 movl ES(%esp), %edi # get the function address 603 movl ES(%esp), %edi # get the function address
459 movl ORIG_EAX(%esp), %edx # get the error code 604 movl ORIG_EAX(%esp), %edx # get the error code
460 movl %eax, ORIG_EAX(%esp) 605 movl %eax, ORIG_EAX(%esp)
461 movl %ecx, ES(%esp) 606 movl %ecx, ES(%esp)
607 /*CFI_REL_OFFSET es, ES*/
462 movl $(__USER_DS), %ecx 608 movl $(__USER_DS), %ecx
463 movl %ecx, %ds 609 movl %ecx, %ds
464 movl %ecx, %es 610 movl %ecx, %es
465 movl %esp,%eax # pt_regs pointer 611 movl %esp,%eax # pt_regs pointer
466 call *%edi 612 call *%edi
467 jmp ret_from_exception 613 jmp ret_from_exception
614 CFI_ENDPROC
468 615
469ENTRY(coprocessor_error) 616ENTRY(coprocessor_error)
617 RING0_INT_FRAME
470 pushl $0 618 pushl $0
619 CFI_ADJUST_CFA_OFFSET 4
471 pushl $do_coprocessor_error 620 pushl $do_coprocessor_error
621 CFI_ADJUST_CFA_OFFSET 4
472 jmp error_code 622 jmp error_code
623 CFI_ENDPROC
473 624
474ENTRY(simd_coprocessor_error) 625ENTRY(simd_coprocessor_error)
626 RING0_INT_FRAME
475 pushl $0 627 pushl $0
628 CFI_ADJUST_CFA_OFFSET 4
476 pushl $do_simd_coprocessor_error 629 pushl $do_simd_coprocessor_error
630 CFI_ADJUST_CFA_OFFSET 4
477 jmp error_code 631 jmp error_code
632 CFI_ENDPROC
478 633
479ENTRY(device_not_available) 634ENTRY(device_not_available)
635 RING0_INT_FRAME
480 pushl $-1 # mark this as an int 636 pushl $-1 # mark this as an int
637 CFI_ADJUST_CFA_OFFSET 4
481 SAVE_ALL 638 SAVE_ALL
482 movl %cr0, %eax 639 movl %cr0, %eax
483 testl $0x4, %eax # EM (math emulation bit) 640 testl $0x4, %eax # EM (math emulation bit)
@@ -487,9 +644,12 @@ ENTRY(device_not_available)
487 jmp ret_from_exception 644 jmp ret_from_exception
488device_not_available_emulate: 645device_not_available_emulate:
489 pushl $0 # temporary storage for ORIG_EIP 646 pushl $0 # temporary storage for ORIG_EIP
647 CFI_ADJUST_CFA_OFFSET 4
490 call math_emulate 648 call math_emulate
491 addl $4, %esp 649 addl $4, %esp
650 CFI_ADJUST_CFA_OFFSET -4
492 jmp ret_from_exception 651 jmp ret_from_exception
652 CFI_ENDPROC
493 653
494/* 654/*
495 * Debug traps and NMI can happen at the one SYSENTER instruction 655 * Debug traps and NMI can happen at the one SYSENTER instruction
@@ -514,16 +674,19 @@ label: \
514 pushl $sysenter_past_esp 674 pushl $sysenter_past_esp
515 675
516KPROBE_ENTRY(debug) 676KPROBE_ENTRY(debug)
677 RING0_INT_FRAME
517 cmpl $sysenter_entry,(%esp) 678 cmpl $sysenter_entry,(%esp)
518 jne debug_stack_correct 679 jne debug_stack_correct
519 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) 680 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
520debug_stack_correct: 681debug_stack_correct:
521 pushl $-1 # mark this as an int 682 pushl $-1 # mark this as an int
683 CFI_ADJUST_CFA_OFFSET 4
522 SAVE_ALL 684 SAVE_ALL
523 xorl %edx,%edx # error code 0 685 xorl %edx,%edx # error code 0
524 movl %esp,%eax # pt_regs pointer 686 movl %esp,%eax # pt_regs pointer
525 call do_debug 687 call do_debug
526 jmp ret_from_exception 688 jmp ret_from_exception
689 CFI_ENDPROC
527 .previous .text 690 .previous .text
528/* 691/*
529 * NMI is doubly nasty. It can happen _while_ we're handling 692 * NMI is doubly nasty. It can happen _while_ we're handling
@@ -534,14 +697,18 @@ debug_stack_correct:
534 * fault happened on the sysenter path. 697 * fault happened on the sysenter path.
535 */ 698 */
536ENTRY(nmi) 699ENTRY(nmi)
700 RING0_INT_FRAME
537 pushl %eax 701 pushl %eax
702 CFI_ADJUST_CFA_OFFSET 4
538 movl %ss, %eax 703 movl %ss, %eax
539 cmpw $__ESPFIX_SS, %ax 704 cmpw $__ESPFIX_SS, %ax
540 popl %eax 705 popl %eax
706 CFI_ADJUST_CFA_OFFSET -4
541 je nmi_16bit_stack 707 je nmi_16bit_stack
542 cmpl $sysenter_entry,(%esp) 708 cmpl $sysenter_entry,(%esp)
543 je nmi_stack_fixup 709 je nmi_stack_fixup
544 pushl %eax 710 pushl %eax
711 CFI_ADJUST_CFA_OFFSET 4
545 movl %esp,%eax 712 movl %esp,%eax
546 /* Do not access memory above the end of our stack page, 713 /* Do not access memory above the end of our stack page,
547 * it might not exist. 714 * it might not exist.
@@ -549,16 +716,19 @@ ENTRY(nmi)
549 andl $(THREAD_SIZE-1),%eax 716 andl $(THREAD_SIZE-1),%eax
550 cmpl $(THREAD_SIZE-20),%eax 717 cmpl $(THREAD_SIZE-20),%eax
551 popl %eax 718 popl %eax
719 CFI_ADJUST_CFA_OFFSET -4
552 jae nmi_stack_correct 720 jae nmi_stack_correct
553 cmpl $sysenter_entry,12(%esp) 721 cmpl $sysenter_entry,12(%esp)
554 je nmi_debug_stack_check 722 je nmi_debug_stack_check
555nmi_stack_correct: 723nmi_stack_correct:
556 pushl %eax 724 pushl %eax
725 CFI_ADJUST_CFA_OFFSET 4
557 SAVE_ALL 726 SAVE_ALL
558 xorl %edx,%edx # zero error code 727 xorl %edx,%edx # zero error code
559 movl %esp,%eax # pt_regs pointer 728 movl %esp,%eax # pt_regs pointer
560 call do_nmi 729 call do_nmi
561 jmp restore_all 730 jmp restore_all
731 CFI_ENDPROC
562 732
563nmi_stack_fixup: 733nmi_stack_fixup:
564 FIX_STACK(12,nmi_stack_correct, 1) 734 FIX_STACK(12,nmi_stack_correct, 1)
@@ -574,94 +744,177 @@ nmi_debug_stack_check:
574 jmp nmi_stack_correct 744 jmp nmi_stack_correct
575 745
576nmi_16bit_stack: 746nmi_16bit_stack:
747 RING0_INT_FRAME
577 /* create the pointer to lss back */ 748 /* create the pointer to lss back */
578 pushl %ss 749 pushl %ss
750 CFI_ADJUST_CFA_OFFSET 4
579 pushl %esp 751 pushl %esp
752 CFI_ADJUST_CFA_OFFSET 4
580 movzwl %sp, %esp 753 movzwl %sp, %esp
581 addw $4, (%esp) 754 addw $4, (%esp)
582 /* copy the iret frame of 12 bytes */ 755 /* copy the iret frame of 12 bytes */
583 .rept 3 756 .rept 3
584 pushl 16(%esp) 757 pushl 16(%esp)
758 CFI_ADJUST_CFA_OFFSET 4
585 .endr 759 .endr
586 pushl %eax 760 pushl %eax
761 CFI_ADJUST_CFA_OFFSET 4
587 SAVE_ALL 762 SAVE_ALL
588 FIXUP_ESPFIX_STACK # %eax == %esp 763 FIXUP_ESPFIX_STACK # %eax == %esp
764 CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
589 xorl %edx,%edx # zero error code 765 xorl %edx,%edx # zero error code
590 call do_nmi 766 call do_nmi
591 RESTORE_REGS 767 RESTORE_REGS
592 lss 12+4(%esp), %esp # back to 16bit stack 768 lss 12+4(%esp), %esp # back to 16bit stack
5931: iret 7691: iret
770 CFI_ENDPROC
594.section __ex_table,"a" 771.section __ex_table,"a"
595 .align 4 772 .align 4
596 .long 1b,iret_exc 773 .long 1b,iret_exc
597.previous 774.previous
598 775
599KPROBE_ENTRY(int3) 776KPROBE_ENTRY(int3)
777 RING0_INT_FRAME
600 pushl $-1 # mark this as an int 778 pushl $-1 # mark this as an int
779 CFI_ADJUST_CFA_OFFSET 4
601 SAVE_ALL 780 SAVE_ALL
602 xorl %edx,%edx # zero error code 781 xorl %edx,%edx # zero error code
603 movl %esp,%eax # pt_regs pointer 782 movl %esp,%eax # pt_regs pointer
604 call do_int3 783 call do_int3
605 jmp ret_from_exception 784 jmp ret_from_exception
785 CFI_ENDPROC
606 .previous .text 786 .previous .text
607 787
608ENTRY(overflow) 788ENTRY(overflow)
789 RING0_INT_FRAME
609 pushl $0 790 pushl $0
791 CFI_ADJUST_CFA_OFFSET 4
610 pushl $do_overflow 792 pushl $do_overflow
793 CFI_ADJUST_CFA_OFFSET 4
611 jmp error_code 794 jmp error_code
795 CFI_ENDPROC
612 796
613ENTRY(bounds) 797ENTRY(bounds)
798 RING0_INT_FRAME
614 pushl $0 799 pushl $0
800 CFI_ADJUST_CFA_OFFSET 4
615 pushl $do_bounds 801 pushl $do_bounds
802 CFI_ADJUST_CFA_OFFSET 4
616 jmp error_code 803 jmp error_code
804 CFI_ENDPROC
617 805
618ENTRY(invalid_op) 806ENTRY(invalid_op)
807 RING0_INT_FRAME
619 pushl $0 808 pushl $0
809 CFI_ADJUST_CFA_OFFSET 4
620 pushl $do_invalid_op 810 pushl $do_invalid_op
811 CFI_ADJUST_CFA_OFFSET 4
621 jmp error_code 812 jmp error_code
813 CFI_ENDPROC
622 814
623ENTRY(coprocessor_segment_overrun) 815ENTRY(coprocessor_segment_overrun)
816 RING0_INT_FRAME
624 pushl $0 817 pushl $0
818 CFI_ADJUST_CFA_OFFSET 4
625 pushl $do_coprocessor_segment_overrun 819 pushl $do_coprocessor_segment_overrun
820 CFI_ADJUST_CFA_OFFSET 4
626 jmp error_code 821 jmp error_code
822 CFI_ENDPROC
627 823
628ENTRY(invalid_TSS) 824ENTRY(invalid_TSS)
825 RING0_EC_FRAME
629 pushl $do_invalid_TSS 826 pushl $do_invalid_TSS
827 CFI_ADJUST_CFA_OFFSET 4
630 jmp error_code 828 jmp error_code
829 CFI_ENDPROC
631 830
632ENTRY(segment_not_present) 831ENTRY(segment_not_present)
832 RING0_EC_FRAME
633 pushl $do_segment_not_present 833 pushl $do_segment_not_present
834 CFI_ADJUST_CFA_OFFSET 4
634 jmp error_code 835 jmp error_code
836 CFI_ENDPROC
635 837
636ENTRY(stack_segment) 838ENTRY(stack_segment)
839 RING0_EC_FRAME
637 pushl $do_stack_segment 840 pushl $do_stack_segment
841 CFI_ADJUST_CFA_OFFSET 4
638 jmp error_code 842 jmp error_code
843 CFI_ENDPROC
639 844
640KPROBE_ENTRY(general_protection) 845KPROBE_ENTRY(general_protection)
846 RING0_EC_FRAME
641 pushl $do_general_protection 847 pushl $do_general_protection
848 CFI_ADJUST_CFA_OFFSET 4
642 jmp error_code 849 jmp error_code
850 CFI_ENDPROC
643 .previous .text 851 .previous .text
644 852
645ENTRY(alignment_check) 853ENTRY(alignment_check)
854 RING0_EC_FRAME
646 pushl $do_alignment_check 855 pushl $do_alignment_check
856 CFI_ADJUST_CFA_OFFSET 4
647 jmp error_code 857 jmp error_code
858 CFI_ENDPROC
648 859
649KPROBE_ENTRY(page_fault) 860KPROBE_ENTRY(page_fault)
861 RING0_EC_FRAME
650 pushl $do_page_fault 862 pushl $do_page_fault
863 CFI_ADJUST_CFA_OFFSET 4
651 jmp error_code 864 jmp error_code
865 CFI_ENDPROC
652 .previous .text 866 .previous .text
653 867
654#ifdef CONFIG_X86_MCE 868#ifdef CONFIG_X86_MCE
655ENTRY(machine_check) 869ENTRY(machine_check)
870 RING0_INT_FRAME
656 pushl $0 871 pushl $0
872 CFI_ADJUST_CFA_OFFSET 4
657 pushl machine_check_vector 873 pushl machine_check_vector
874 CFI_ADJUST_CFA_OFFSET 4
658 jmp error_code 875 jmp error_code
876 CFI_ENDPROC
659#endif 877#endif
660 878
661ENTRY(spurious_interrupt_bug) 879ENTRY(spurious_interrupt_bug)
880 RING0_INT_FRAME
662 pushl $0 881 pushl $0
882 CFI_ADJUST_CFA_OFFSET 4
663 pushl $do_spurious_interrupt_bug 883 pushl $do_spurious_interrupt_bug
884 CFI_ADJUST_CFA_OFFSET 4
664 jmp error_code 885 jmp error_code
886 CFI_ENDPROC
887
888#ifdef CONFIG_STACK_UNWIND
889ENTRY(arch_unwind_init_running)
890 CFI_STARTPROC
891 movl 4(%esp), %edx
892 movl (%esp), %ecx
893 leal 4(%esp), %eax
894 movl %ebx, EBX(%edx)
895 xorl %ebx, %ebx
896 movl %ebx, ECX(%edx)
897 movl %ebx, EDX(%edx)
898 movl %esi, ESI(%edx)
899 movl %edi, EDI(%edx)
900 movl %ebp, EBP(%edx)
901 movl %ebx, EAX(%edx)
902 movl $__USER_DS, DS(%edx)
903 movl $__USER_DS, ES(%edx)
904 movl %ebx, ORIG_EAX(%edx)
905 movl %ecx, EIP(%edx)
906 movl 12(%esp), %ecx
907 movl $__KERNEL_CS, CS(%edx)
908 movl %ebx, EFLAGS(%edx)
909 movl %eax, OLDESP(%edx)
910 movl 8(%esp), %eax
911 movl %ecx, 8(%esp)
912 movl EBX(%edx), %ebx
913 movl $__KERNEL_DS, OLDSS(%edx)
914 jmpl *%eax
915 CFI_ENDPROC
916ENDPROC(arch_unwind_init_running)
917#endif
665 918
666.section .rodata,"a" 919.section .rodata,"a"
667#include "syscall_table.S" 920#include "syscall_table.S"
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
new file mode 100644
index 000000000000..c6737c35815d
--- /dev/null
+++ b/arch/i386/kernel/hpet.c
@@ -0,0 +1,67 @@
1#include <linux/clocksource.h>
2#include <linux/errno.h>
3#include <linux/hpet.h>
4#include <linux/init.h>
5
6#include <asm/hpet.h>
7#include <asm/io.h>
8
9#define HPET_MASK CLOCKSOURCE_MASK(32)
10#define HPET_SHIFT 22
11
12/* FSEC = 10^-15 NSEC = 10^-9 */
13#define FSEC_PER_NSEC 1000000
14
15static void *hpet_ptr;
16
17static cycle_t read_hpet(void)
18{
19 return (cycle_t)readl(hpet_ptr);
20}
21
22static struct clocksource clocksource_hpet = {
23 .name = "hpet",
24 .rating = 250,
25 .read = read_hpet,
26 .mask = HPET_MASK,
27 .mult = 0, /* set below */
28 .shift = HPET_SHIFT,
29 .is_continuous = 1,
30};
31
32static int __init init_hpet_clocksource(void)
33{
34 unsigned long hpet_period;
35 void __iomem* hpet_base;
36 u64 tmp;
37
38 if (!hpet_address)
39 return -ENODEV;
40
41 /* calculate the hpet address: */
42 hpet_base =
43 (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
44 hpet_ptr = hpet_base + HPET_COUNTER;
45
46 /* calculate the frequency: */
47 hpet_period = readl(hpet_base + HPET_PERIOD);
48
49 /*
50 * hpet period is in femto seconds per cycle
51 * so we need to convert this to ns/cyc units
52 * aproximated by mult/2^shift
53 *
54 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
55 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
56 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
57 * (fsec/cyc << shift)/1000000 = mult
58 * (hpet_period << shift)/FSEC_PER_NSEC = mult
59 */
60 tmp = (u64)hpet_period << HPET_SHIFT;
61 do_div(tmp, FSEC_PER_NSEC);
62 clocksource_hpet.mult = (u32)tmp;
63
64 return clocksource_register(&clocksource_hpet);
65}
66
67module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
new file mode 100644
index 000000000000..477b24daff53
--- /dev/null
+++ b/arch/i386/kernel/i8253.c
@@ -0,0 +1,118 @@
1/*
2 * i8253.c 8253/PIT functions
3 *
4 */
5#include <linux/clocksource.h>
6#include <linux/spinlock.h>
7#include <linux/jiffies.h>
8#include <linux/sysdev.h>
9#include <linux/module.h>
10#include <linux/init.h>
11
12#include <asm/smp.h>
13#include <asm/delay.h>
14#include <asm/i8253.h>
15#include <asm/io.h>
16
17#include "io_ports.h"
18
19DEFINE_SPINLOCK(i8253_lock);
20EXPORT_SYMBOL(i8253_lock);
21
22void setup_pit_timer(void)
23{
24 unsigned long flags;
25
26 spin_lock_irqsave(&i8253_lock, flags);
27 outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
28 udelay(10);
29 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
30 udelay(10);
31 outb(LATCH >> 8 , PIT_CH0); /* MSB */
32 spin_unlock_irqrestore(&i8253_lock, flags);
33}
34
35/*
36 * Since the PIT overflows every tick, its not very useful
37 * to just read by itself. So use jiffies to emulate a free
38 * running counter:
39 */
40static cycle_t pit_read(void)
41{
42 unsigned long flags;
43 int count;
44 u32 jifs;
45 static int old_count;
46 static u32 old_jifs;
47
48 spin_lock_irqsave(&i8253_lock, flags);
49 /*
50 * Although our caller may have the read side of xtime_lock,
51 * this is now a seqlock, and we are cheating in this routine
52 * by having side effects on state that we cannot undo if
53 * there is a collision on the seqlock and our caller has to
54 * retry. (Namely, old_jifs and old_count.) So we must treat
55 * jiffies as volatile despite the lock. We read jiffies
56 * before latching the timer count to guarantee that although
57 * the jiffies value might be older than the count (that is,
58 * the counter may underflow between the last point where
59 * jiffies was incremented and the point where we latch the
60 * count), it cannot be newer.
61 */
62 jifs = jiffies;
63 outb_p(0x00, PIT_MODE); /* latch the count ASAP */
64 count = inb_p(PIT_CH0); /* read the latched count */
65 count |= inb_p(PIT_CH0) << 8;
66
67 /* VIA686a test code... reset the latch if count > max + 1 */
68 if (count > LATCH) {
69 outb_p(0x34, PIT_MODE);
70 outb_p(LATCH & 0xff, PIT_CH0);
71 outb(LATCH >> 8, PIT_CH0);
72 count = LATCH - 1;
73 }
74
75 /*
76 * It's possible for count to appear to go the wrong way for a
77 * couple of reasons:
78 *
79 * 1. The timer counter underflows, but we haven't handled the
80 * resulting interrupt and incremented jiffies yet.
81 * 2. Hardware problem with the timer, not giving us continuous time,
82 * the counter does small "jumps" upwards on some Pentium systems,
83 * (see c't 95/10 page 335 for Neptun bug.)
84 *
85 * Previous attempts to handle these cases intelligently were
86 * buggy, so we just do the simple thing now.
87 */
88 if (count > old_count && jifs == old_jifs) {
89 count = old_count;
90 }
91 old_count = count;
92 old_jifs = jifs;
93
94 spin_unlock_irqrestore(&i8253_lock, flags);
95
96 count = (LATCH - 1) - count;
97
98 return (cycle_t)(jifs * LATCH) + count;
99}
100
101static struct clocksource clocksource_pit = {
102 .name = "pit",
103 .rating = 110,
104 .read = pit_read,
105 .mask = CLOCKSOURCE_MASK(32),
106 .mult = 0,
107 .shift = 20,
108};
109
110static int __init init_pit_clocksource(void)
111{
112 if (num_possible_cpus() > 4) /* PIT does not scale! */
113 return 0;
114
115 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
116 return clocksource_register(&clocksource_pit);
117}
118module_init(init_pit_clocksource);
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index b7636b96e104..c1a42feba286 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -175,7 +175,7 @@ static void mask_and_ack_8259A(unsigned int irq)
175 * Lightweight spurious IRQ detection. We do not want 175 * Lightweight spurious IRQ detection. We do not want
176 * to overdo spurious IRQ handling - it's usually a sign 176 * to overdo spurious IRQ handling - it's usually a sign
177 * of hardware problems, so we only do the checks we can 177 * of hardware problems, so we only do the checks we can
178 * do without slowing down good hardware unnecesserily. 178 * do without slowing down good hardware unnecessarily.
179 * 179 *
180 * Note that IRQ7 and IRQ15 (the two spurious IRQs 180 * Note that IRQ7 and IRQ15 (the two spurious IRQs
181 * usually resulting from the 8259A-1|2 PICs) occur 181 * usually resulting from the 8259A-1|2 PICs) occur
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index a62df3e764c5..72ae414e4d49 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -38,6 +38,7 @@
38#include <asm/desc.h> 38#include <asm/desc.h>
39#include <asm/timer.h> 39#include <asm/timer.h>
40#include <asm/i8259.h> 40#include <asm/i8259.h>
41#include <asm/nmi.h>
41 42
42#include <mach_apic.h> 43#include <mach_apic.h>
43 44
@@ -50,6 +51,7 @@ atomic_t irq_mis_count;
50static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; 51static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
51 52
52static DEFINE_SPINLOCK(ioapic_lock); 53static DEFINE_SPINLOCK(ioapic_lock);
54static DEFINE_SPINLOCK(vector_lock);
53 55
54int timer_over_8254 __initdata = 1; 56int timer_over_8254 __initdata = 1;
55 57
@@ -1161,10 +1163,17 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1161int assign_irq_vector(int irq) 1163int assign_irq_vector(int irq)
1162{ 1164{
1163 static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; 1165 static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
1166 unsigned long flags;
1167 int vector;
1168
1169 BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
1164 1170
1165 BUG_ON(irq >= NR_IRQ_VECTORS); 1171 spin_lock_irqsave(&vector_lock, flags);
1166 if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) 1172
1173 if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
1174 spin_unlock_irqrestore(&vector_lock, flags);
1167 return IO_APIC_VECTOR(irq); 1175 return IO_APIC_VECTOR(irq);
1176 }
1168next: 1177next:
1169 current_vector += 8; 1178 current_vector += 8;
1170 if (current_vector == SYSCALL_VECTOR) 1179 if (current_vector == SYSCALL_VECTOR)
@@ -1172,16 +1181,21 @@ next:
1172 1181
1173 if (current_vector >= FIRST_SYSTEM_VECTOR) { 1182 if (current_vector >= FIRST_SYSTEM_VECTOR) {
1174 offset++; 1183 offset++;
1175 if (!(offset%8)) 1184 if (!(offset%8)) {
1185 spin_unlock_irqrestore(&vector_lock, flags);
1176 return -ENOSPC; 1186 return -ENOSPC;
1187 }
1177 current_vector = FIRST_DEVICE_VECTOR + offset; 1188 current_vector = FIRST_DEVICE_VECTOR + offset;
1178 } 1189 }
1179 1190
1180 vector_irq[current_vector] = irq; 1191 vector = current_vector;
1192 vector_irq[vector] = irq;
1181 if (irq != AUTO_ASSIGN) 1193 if (irq != AUTO_ASSIGN)
1182 IO_APIC_VECTOR(irq) = current_vector; 1194 IO_APIC_VECTOR(irq) = vector;
1183 1195
1184 return current_vector; 1196 spin_unlock_irqrestore(&vector_lock, flags);
1197
1198 return vector;
1185} 1199}
1186 1200
1187static struct hw_interrupt_type ioapic_level_type; 1201static struct hw_interrupt_type ioapic_level_type;
@@ -1193,21 +1207,14 @@ static struct hw_interrupt_type ioapic_edge_type;
1193 1207
1194static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) 1208static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1195{ 1209{
1196 if (use_pci_vector() && !platform_legacy_irq(irq)) { 1210 unsigned idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
1197 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1211
1198 trigger == IOAPIC_LEVEL) 1212 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1199 irq_desc[vector].handler = &ioapic_level_type; 1213 trigger == IOAPIC_LEVEL)
1200 else 1214 irq_desc[idx].handler = &ioapic_level_type;
1201 irq_desc[vector].handler = &ioapic_edge_type; 1215 else
1202 set_intr_gate(vector, interrupt[vector]); 1216 irq_desc[idx].handler = &ioapic_edge_type;
1203 } else { 1217 set_intr_gate(vector, interrupt[idx]);
1204 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1205 trigger == IOAPIC_LEVEL)
1206 irq_desc[irq].handler = &ioapic_level_type;
1207 else
1208 irq_desc[irq].handler = &ioapic_edge_type;
1209 set_intr_gate(vector, interrupt[irq]);
1210 }
1211} 1218}
1212 1219
1213static void __init setup_IO_APIC_irqs(void) 1220static void __init setup_IO_APIC_irqs(void)
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 49ce4c31b713..c703bc7b0880 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -53,8 +53,8 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
53 */ 53 */
54fastcall unsigned int do_IRQ(struct pt_regs *regs) 54fastcall unsigned int do_IRQ(struct pt_regs *regs)
55{ 55{
56 /* high bits used in ret_from_ code */ 56 /* high bit used in ret_from_ code */
57 int irq = regs->orig_eax & 0xff; 57 int irq = ~regs->orig_eax;
58#ifdef CONFIG_4KSTACKS 58#ifdef CONFIG_4KSTACKS
59 union irq_ctx *curctx, *irqctx; 59 union irq_ctx *curctx, *irqctx;
60 u32 *isp; 60 u32 *isp;
@@ -100,8 +100,8 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
100 * softirq checks work in the hardirq context. 100 * softirq checks work in the hardirq context.
101 */ 101 */
102 irqctx->tinfo.preempt_count = 102 irqctx->tinfo.preempt_count =
103 irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK | 103 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
104 curctx->tinfo.preempt_count & SOFTIRQ_MASK; 104 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
105 105
106 asm volatile( 106 asm volatile(
107 " xchgl %%ebx,%%esp \n" 107 " xchgl %%ebx,%%esp \n"
@@ -227,7 +227,7 @@ int show_interrupts(struct seq_file *p, void *v)
227 if (i == 0) { 227 if (i == 0) {
228 seq_printf(p, " "); 228 seq_printf(p, " ");
229 for_each_online_cpu(j) 229 for_each_online_cpu(j)
230 seq_printf(p, "CPU%d ",j); 230 seq_printf(p, "CPU%-8d",j);
231 seq_putc(p, '\n'); 231 seq_putc(p, '\n');
232 } 232 }
233 233
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 395a9a6dff88..727e419ad78a 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -57,34 +57,85 @@ static __always_inline void set_jmp_op(void *from, void *to)
57/* 57/*
58 * returns non-zero if opcodes can be boosted. 58 * returns non-zero if opcodes can be boosted.
59 */ 59 */
60static __always_inline int can_boost(kprobe_opcode_t opcode) 60static __always_inline int can_boost(kprobe_opcode_t *opcodes)
61{ 61{
62 switch (opcode & 0xf0 ) { 62#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
63 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
64 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
65 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
66 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
67 << (row % 32))
68 /*
69 * Undefined/reserved opcodes, conditional jump, Opcode Extension
70 * Groups, and some special opcodes can not be boost.
71 */
72 static const unsigned long twobyte_is_boostable[256 / 32] = {
73 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
74 /* ------------------------------- */
75 W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
76 W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
77 W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
78 W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
79 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
80 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
81 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
82 W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
83 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
84 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
85 W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
86 W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
87 W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
88 W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
89 W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
90 W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
91 /* ------------------------------- */
92 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
93 };
94#undef W
95 kprobe_opcode_t opcode;
96 kprobe_opcode_t *orig_opcodes = opcodes;
97retry:
98 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
99 return 0;
100 opcode = *(opcodes++);
101
102 /* 2nd-byte opcode */
103 if (opcode == 0x0f) {
104 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
105 return 0;
106 return test_bit(*opcodes, twobyte_is_boostable);
107 }
108
109 switch (opcode & 0xf0) {
110 case 0x60:
111 if (0x63 < opcode && opcode < 0x67)
112 goto retry; /* prefixes */
113 /* can't boost Address-size override and bound */
114 return (opcode != 0x62 && opcode != 0x67);
63 case 0x70: 115 case 0x70:
64 return 0; /* can't boost conditional jump */ 116 return 0; /* can't boost conditional jump */
65 case 0x90:
66 /* can't boost call and pushf */
67 return opcode != 0x9a && opcode != 0x9c;
68 case 0xc0: 117 case 0xc0:
69 /* can't boost undefined opcodes and soft-interruptions */ 118 /* can't boost software-interruptions */
70 return (0xc1 < opcode && opcode < 0xc6) || 119 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
71 (0xc7 < opcode && opcode < 0xcc) || opcode == 0xcf;
72 case 0xd0: 120 case 0xd0:
73 /* can boost AA* and XLAT */ 121 /* can boost AA* and XLAT */
74 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); 122 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
75 case 0xe0: 123 case 0xe0:
76 /* can boost in/out and (may be) jmps */ 124 /* can boost in/out and absolute jmps */
77 return (0xe3 < opcode && opcode != 0xe8); 125 return ((opcode & 0x04) || opcode == 0xea);
78 case 0xf0: 126 case 0xf0:
127 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
128 goto retry; /* lock/rep(ne) prefix */
79 /* clear and set flags can be boost */ 129 /* clear and set flags can be boost */
80 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); 130 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
81 default: 131 default:
82 /* currently, can't boost 2 bytes opcodes */ 132 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
83 return opcode != 0x0f; 133 goto retry; /* prefixes */
134 /* can't boost CS override and call */
135 return (opcode != 0x2e && opcode != 0x9a);
84 } 136 }
85} 137}
86 138
87
88/* 139/*
89 * returns non-zero if opcode modifies the interrupt flag. 140 * returns non-zero if opcode modifies the interrupt flag.
90 */ 141 */
@@ -109,7 +160,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
109 160
110 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 161 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
111 p->opcode = *p->addr; 162 p->opcode = *p->addr;
112 if (can_boost(p->opcode)) { 163 if (can_boost(p->addr)) {
113 p->ainsn.boostable = 0; 164 p->ainsn.boostable = 0;
114 } else { 165 } else {
115 p->ainsn.boostable = -1; 166 p->ainsn.boostable = -1;
@@ -208,7 +259,9 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
208 struct kprobe_ctlblk *kcb; 259 struct kprobe_ctlblk *kcb;
209#ifdef CONFIG_PREEMPT 260#ifdef CONFIG_PREEMPT
210 unsigned pre_preempt_count = preempt_count(); 261 unsigned pre_preempt_count = preempt_count();
211#endif /* CONFIG_PREEMPT */ 262#else
263 unsigned pre_preempt_count = 1;
264#endif
212 265
213 addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); 266 addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
214 267
@@ -285,22 +338,14 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
285 /* handler has already set things up, so skip ss setup */ 338 /* handler has already set things up, so skip ss setup */
286 return 1; 339 return 1;
287 340
288 if (p->ainsn.boostable == 1 && 341ss_probe:
289#ifdef CONFIG_PREEMPT 342 if (pre_preempt_count && p->ainsn.boostable == 1 && !p->post_handler){
290 !(pre_preempt_count) && /*
291 * This enables booster when the direct
292 * execution path aren't preempted.
293 */
294#endif /* CONFIG_PREEMPT */
295 !p->post_handler && !p->break_handler ) {
296 /* Boost up -- we can execute copied instructions directly */ 343 /* Boost up -- we can execute copied instructions directly */
297 reset_current_kprobe(); 344 reset_current_kprobe();
298 regs->eip = (unsigned long)p->ainsn.insn; 345 regs->eip = (unsigned long)p->ainsn.insn;
299 preempt_enable_no_resched(); 346 preempt_enable_no_resched();
300 return 1; 347 return 1;
301 } 348 }
302
303ss_probe:
304 prepare_singlestep(p, regs); 349 prepare_singlestep(p, regs);
305 kcb->kprobe_status = KPROBE_HIT_SS; 350 kcb->kprobe_status = KPROBE_HIT_SS;
306 return 1; 351 return 1;
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
index f73d7374a2ba..511abe52a94e 100644
--- a/arch/i386/kernel/machine_kexec.c
+++ b/arch/i386/kernel/machine_kexec.c
@@ -133,9 +133,9 @@ typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
133 unsigned long start_address, 133 unsigned long start_address,
134 unsigned int has_pae) ATTRIB_NORET; 134 unsigned int has_pae) ATTRIB_NORET;
135 135
136const extern unsigned char relocate_new_kernel[]; 136extern const unsigned char relocate_new_kernel[];
137extern void relocate_new_kernel_end(void); 137extern void relocate_new_kernel_end(void);
138const extern unsigned int relocate_new_kernel_size; 138extern const unsigned int relocate_new_kernel_size;
139 139
140/* 140/*
141 * A architecture hook called to validate the 141 * A architecture hook called to validate the
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 7a328230e540..d022cb8fd725 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -266,7 +266,7 @@ static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long acti
266 return NOTIFY_OK; 266 return NOTIFY_OK;
267} 267}
268 268
269static struct notifier_block msr_class_cpu_notifier = 269static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
270{ 270{
271 .notifier_call = msr_class_cpu_callback, 271 .notifier_call = msr_class_cpu_callback,
272}; 272};
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index d43b498ec745..a76e93146585 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -14,21 +14,17 @@
14 */ 14 */
15 15
16#include <linux/config.h> 16#include <linux/config.h>
17#include <linux/mm.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/bootmem.h>
20#include <linux/smp_lock.h>
21#include <linux/interrupt.h> 18#include <linux/interrupt.h>
22#include <linux/mc146818rtc.h>
23#include <linux/kernel_stat.h>
24#include <linux/module.h> 19#include <linux/module.h>
25#include <linux/nmi.h> 20#include <linux/nmi.h>
26#include <linux/sysdev.h> 21#include <linux/sysdev.h>
27#include <linux/sysctl.h> 22#include <linux/sysctl.h>
23#include <linux/percpu.h>
28 24
29#include <asm/smp.h> 25#include <asm/smp.h>
30#include <asm/div64.h>
31#include <asm/nmi.h> 26#include <asm/nmi.h>
27#include <asm/intel_arch_perfmon.h>
32 28
33#include "mach_traps.h" 29#include "mach_traps.h"
34 30
@@ -100,6 +96,9 @@ int nmi_active;
100 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ 96 (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
101 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) 97 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
102 98
99#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
100#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
101
103#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
104/* The performance counters used by NMI_LOCAL_APIC don't trigger when 103/* The performance counters used by NMI_LOCAL_APIC don't trigger when
105 * the CPU is idle. To make sure the NMI watchdog really ticks on all 104 * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -212,6 +211,8 @@ static int __init setup_nmi_watchdog(char *str)
212 211
213__setup("nmi_watchdog=", setup_nmi_watchdog); 212__setup("nmi_watchdog=", setup_nmi_watchdog);
214 213
214static void disable_intel_arch_watchdog(void);
215
215static void disable_lapic_nmi_watchdog(void) 216static void disable_lapic_nmi_watchdog(void)
216{ 217{
217 if (nmi_active <= 0) 218 if (nmi_active <= 0)
@@ -221,6 +222,10 @@ static void disable_lapic_nmi_watchdog(void)
221 wrmsr(MSR_K7_EVNTSEL0, 0, 0); 222 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
222 break; 223 break;
223 case X86_VENDOR_INTEL: 224 case X86_VENDOR_INTEL:
225 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
226 disable_intel_arch_watchdog();
227 break;
228 }
224 switch (boot_cpu_data.x86) { 229 switch (boot_cpu_data.x86) {
225 case 6: 230 case 6:
226 if (boot_cpu_data.x86_model > 0xd) 231 if (boot_cpu_data.x86_model > 0xd)
@@ -449,6 +454,53 @@ static int setup_p4_watchdog(void)
449 return 1; 454 return 1;
450} 455}
451 456
457static void disable_intel_arch_watchdog(void)
458{
459 unsigned ebx;
460
461 /*
462 * Check whether the Architectural PerfMon supports
463 * Unhalted Core Cycles Event or not.
464 * NOTE: Corresponding bit = 0 in ebp indicates event present.
465 */
466 ebx = cpuid_ebx(10);
467 if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
468 wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
469}
470
471static int setup_intel_arch_watchdog(void)
472{
473 unsigned int evntsel;
474 unsigned ebx;
475
476 /*
477 * Check whether the Architectural PerfMon supports
478 * Unhalted Core Cycles Event or not.
479 * NOTE: Corresponding bit = 0 in ebp indicates event present.
480 */
481 ebx = cpuid_ebx(10);
482 if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
483 return 0;
484
485 nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
486
487 clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2);
488 clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2);
489
490 evntsel = ARCH_PERFMON_EVENTSEL_INT
491 | ARCH_PERFMON_EVENTSEL_OS
492 | ARCH_PERFMON_EVENTSEL_USR
493 | ARCH_PERFMON_NMI_EVENT_SEL
494 | ARCH_PERFMON_NMI_EVENT_UMASK;
495
496 wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
497 write_watchdog_counter("INTEL_ARCH_PERFCTR0");
498 apic_write(APIC_LVTPC, APIC_DM_NMI);
499 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
500 wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
501 return 1;
502}
503
452void setup_apic_nmi_watchdog (void) 504void setup_apic_nmi_watchdog (void)
453{ 505{
454 switch (boot_cpu_data.x86_vendor) { 506 switch (boot_cpu_data.x86_vendor) {
@@ -458,6 +510,11 @@ void setup_apic_nmi_watchdog (void)
458 setup_k7_watchdog(); 510 setup_k7_watchdog();
459 break; 511 break;
460 case X86_VENDOR_INTEL: 512 case X86_VENDOR_INTEL:
513 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
514 if (!setup_intel_arch_watchdog())
515 return;
516 break;
517 }
461 switch (boot_cpu_data.x86) { 518 switch (boot_cpu_data.x86) {
462 case 6: 519 case 6:
463 if (boot_cpu_data.x86_model > 0xd) 520 if (boot_cpu_data.x86_model > 0xd)
@@ -561,7 +618,8 @@ void nmi_watchdog_tick (struct pt_regs * regs)
561 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); 618 wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
562 apic_write(APIC_LVTPC, APIC_DM_NMI); 619 apic_write(APIC_LVTPC, APIC_DM_NMI);
563 } 620 }
564 else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) { 621 else if (nmi_perfctr_msr == MSR_P6_PERFCTR0 ||
622 nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
565 /* Only P6 based Pentium M need to re-unmask 623 /* Only P6 based Pentium M need to re-unmask
566 * the apic vector but it doesn't hurt 624 * the apic vector but it doesn't hurt
567 * other P6 variant */ 625 * other P6 variant */
diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c
index 5f5b075f860a..0caf14652bad 100644
--- a/arch/i386/kernel/numaq.c
+++ b/arch/i386/kernel/numaq.c
@@ -79,10 +79,12 @@ int __init get_memcfg_numaq(void)
79 return 1; 79 return 1;
80} 80}
81 81
82static int __init numaq_dsc_disable(void) 82static int __init numaq_tsc_disable(void)
83{ 83{
84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); 84 if (num_online_nodes() > 1) {
85 tsc_disable = 1; 85 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
86 tsc_disable = 1;
87 }
86 return 0; 88 return 0;
87} 89}
88core_initcall(numaq_dsc_disable); 90arch_initcall(numaq_tsc_disable);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 6259afea46d1..6946b06e2784 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -102,7 +102,7 @@ void default_idle(void)
102 local_irq_enable(); 102 local_irq_enable();
103 103
104 if (!hlt_counter && boot_cpu_data.hlt_works_ok) { 104 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
105 clear_thread_flag(TIF_POLLING_NRFLAG); 105 current_thread_info()->status &= ~TS_POLLING;
106 smp_mb__after_clear_bit(); 106 smp_mb__after_clear_bit();
107 while (!need_resched()) { 107 while (!need_resched()) {
108 local_irq_disable(); 108 local_irq_disable();
@@ -111,7 +111,7 @@ void default_idle(void)
111 else 111 else
112 local_irq_enable(); 112 local_irq_enable();
113 } 113 }
114 set_thread_flag(TIF_POLLING_NRFLAG); 114 current_thread_info()->status |= TS_POLLING;
115 } else { 115 } else {
116 while (!need_resched()) 116 while (!need_resched())
117 cpu_relax(); 117 cpu_relax();
@@ -174,7 +174,7 @@ void cpu_idle(void)
174{ 174{
175 int cpu = smp_processor_id(); 175 int cpu = smp_processor_id();
176 176
177 set_thread_flag(TIF_POLLING_NRFLAG); 177 current_thread_info()->status |= TS_POLLING;
178 178
179 /* endless idle loop with no priority at all */ 179 /* endless idle loop with no priority at all */
180 while (1) { 180 while (1) {
@@ -312,7 +312,7 @@ void show_regs(struct pt_regs * regs)
312 cr3 = read_cr3(); 312 cr3 = read_cr3();
313 cr4 = read_cr4_safe(); 313 cr4 = read_cr4_safe();
314 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); 314 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
315 show_trace(NULL, &regs->esp); 315 show_trace(NULL, regs, &regs->esp);
316} 316}
317 317
318/* 318/*
diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c
index 321f5fd26e75..9bf590cefc7d 100644
--- a/arch/i386/kernel/scx200.c
+++ b/arch/i386/kernel/scx200.c
@@ -9,6 +9,7 @@
9#include <linux/errno.h> 9#include <linux/errno.h>
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/mutex.h>
12#include <linux/pci.h> 13#include <linux/pci.h>
13 14
14#include <linux/scx200.h> 15#include <linux/scx200.h>
@@ -45,11 +46,19 @@ static struct pci_driver scx200_pci_driver = {
45 .probe = scx200_probe, 46 .probe = scx200_probe,
46}; 47};
47 48
48static DEFINE_SPINLOCK(scx200_gpio_config_lock); 49static DEFINE_MUTEX(scx200_gpio_config_lock);
49 50
50static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) 51static void __devinit scx200_init_shadow(void)
51{ 52{
52 int bank; 53 int bank;
54
55 /* read the current values driven on the GPIO signals */
56 for (bank = 0; bank < 2; ++bank)
57 scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
58}
59
60static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
61{
53 unsigned base; 62 unsigned base;
54 63
55 if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || 64 if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
@@ -63,10 +72,7 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
63 } 72 }
64 73
65 scx200_gpio_base = base; 74 scx200_gpio_base = base;
66 75 scx200_init_shadow();
67 /* read the current values driven on the GPIO signals */
68 for (bank = 0; bank < 2; ++bank)
69 scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
70 76
71 } else { 77 } else {
72 /* find the base of the Configuration Block */ 78 /* find the base of the Configuration Block */
@@ -87,12 +93,11 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
87 return 0; 93 return 0;
88} 94}
89 95
90u32 scx200_gpio_configure(int index, u32 mask, u32 bits) 96u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
91{ 97{
92 u32 config, new_config; 98 u32 config, new_config;
93 unsigned long flags;
94 99
95 spin_lock_irqsave(&scx200_gpio_config_lock, flags); 100 mutex_lock(&scx200_gpio_config_lock);
96 101
97 outl(index, scx200_gpio_base + 0x20); 102 outl(index, scx200_gpio_base + 0x20);
98 config = inl(scx200_gpio_base + 0x24); 103 config = inl(scx200_gpio_base + 0x24);
@@ -100,45 +105,11 @@ u32 scx200_gpio_configure(int index, u32 mask, u32 bits)
100 new_config = (config & mask) | bits; 105 new_config = (config & mask) | bits;
101 outl(new_config, scx200_gpio_base + 0x24); 106 outl(new_config, scx200_gpio_base + 0x24);
102 107
103 spin_unlock_irqrestore(&scx200_gpio_config_lock, flags); 108 mutex_unlock(&scx200_gpio_config_lock);
104 109
105 return config; 110 return config;
106} 111}
107 112
108#if 0
109void scx200_gpio_dump(unsigned index)
110{
111 u32 config = scx200_gpio_configure(index, ~0, 0);
112 printk(KERN_DEBUG "GPIO%02u: 0x%08lx", index, (unsigned long)config);
113
114 if (config & 1)
115 printk(" OE"); /* output enabled */
116 else
117 printk(" TS"); /* tristate */
118 if (config & 2)
119 printk(" PP"); /* push pull */
120 else
121 printk(" OD"); /* open drain */
122 if (config & 4)
123 printk(" PUE"); /* pull up enabled */
124 else
125 printk(" PUD"); /* pull up disabled */
126 if (config & 8)
127 printk(" LOCKED"); /* locked */
128 if (config & 16)
129 printk(" LEVEL"); /* level input */
130 else
131 printk(" EDGE"); /* edge input */
132 if (config & 32)
133 printk(" HI"); /* trigger on rising edge */
134 else
135 printk(" LO"); /* trigger on falling edge */
136 if (config & 64)
137 printk(" DEBOUNCE"); /* debounce */
138 printk("\n");
139}
140#endif /* 0 */
141
142static int __init scx200_init(void) 113static int __init scx200_init(void)
143{ 114{
144 printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); 115 printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
@@ -159,10 +130,3 @@ EXPORT_SYMBOL(scx200_gpio_base);
159EXPORT_SYMBOL(scx200_gpio_shadow); 130EXPORT_SYMBOL(scx200_gpio_shadow);
160EXPORT_SYMBOL(scx200_gpio_configure); 131EXPORT_SYMBOL(scx200_gpio_configure);
161EXPORT_SYMBOL(scx200_cb_base); 132EXPORT_SYMBOL(scx200_cb_base);
162
163/*
164 Local variables:
165 compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules"
166 c-basic-offset: 8
167 End:
168*/
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 6bef9273733e..4a65040cc624 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1575,6 +1575,7 @@ void __init setup_arch(char **cmdline_p)
1575 conswitchp = &dummy_con; 1575 conswitchp = &dummy_con;
1576#endif 1576#endif
1577#endif 1577#endif
1578 tsc_init();
1578} 1579}
1579 1580
1580static __init int add_pcspkr(void) 1581static __init int add_pcspkr(void)
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 5c352c3a9e7f..43002cfb40c4 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -351,7 +351,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
351 goto give_sigsegv; 351 goto give_sigsegv;
352 } 352 }
353 353
354 restorer = &__kernel_sigreturn; 354 restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
355 if (ka->sa.sa_flags & SA_RESTORER) 355 if (ka->sa.sa_flags & SA_RESTORER)
356 restorer = ka->sa.sa_restorer; 356 restorer = ka->sa.sa_restorer;
357 357
@@ -447,7 +447,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
447 goto give_sigsegv; 447 goto give_sigsegv;
448 448
449 /* Set up to return from userspace. */ 449 /* Set up to return from userspace. */
450 restorer = &__kernel_rt_sigreturn; 450 restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
451 if (ka->sa.sa_flags & SA_RESTORER) 451 if (ka->sa.sa_flags & SA_RESTORER)
452 restorer = ka->sa.sa_restorer; 452 restorer = ka->sa.sa_restorer;
453 err |= __put_user(restorer, &frame->pretcode); 453 err |= __put_user(restorer, &frame->pretcode);
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index d134e9643a58..c10789d7a9d3 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -114,7 +114,17 @@ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_m
114 114
115static inline int __prepare_ICR (unsigned int shortcut, int vector) 115static inline int __prepare_ICR (unsigned int shortcut, int vector)
116{ 116{
117 return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL; 117 unsigned int icr = shortcut | APIC_DEST_LOGICAL;
118
119 switch (vector) {
120 default:
121 icr |= APIC_DM_FIXED | vector;
122 break;
123 case NMI_VECTOR:
124 icr |= APIC_DM_NMI;
125 break;
126 }
127 return icr;
118} 128}
119 129
120static inline int __prepare_ICR2 (unsigned int mask) 130static inline int __prepare_ICR2 (unsigned int mask)
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index bd0ca5c9f053..89e7315e539c 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -52,6 +52,7 @@
52#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
53#include <asm/desc.h> 53#include <asm/desc.h>
54#include <asm/arch_hooks.h> 54#include <asm/arch_hooks.h>
55#include <asm/nmi.h>
55 56
56#include <mach_apic.h> 57#include <mach_apic.h>
57#include <mach_wakecpu.h> 58#include <mach_wakecpu.h>
@@ -66,12 +67,6 @@ int smp_num_siblings = 1;
66EXPORT_SYMBOL(smp_num_siblings); 67EXPORT_SYMBOL(smp_num_siblings);
67#endif 68#endif
68 69
69/* Package ID of each logical CPU */
70int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
71
72/* Core ID of each logical CPU */
73int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
74
75/* Last level cache ID of each logical CPU */ 70/* Last level cache ID of each logical CPU */
76int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; 71int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
77 72
@@ -453,10 +448,12 @@ cpumask_t cpu_coregroup_map(int cpu)
453 struct cpuinfo_x86 *c = cpu_data + cpu; 448 struct cpuinfo_x86 *c = cpu_data + cpu;
454 /* 449 /*
455 * For perf, we return last level cache shared map. 450 * For perf, we return last level cache shared map.
456 * TBD: when power saving sched policy is added, we will return 451 * And for power savings, we return cpu_core_map
457 * cpu_core_map when power saving policy is enabled
458 */ 452 */
459 return c->llc_shared_map; 453 if (sched_mc_power_savings || sched_smt_power_savings)
454 return cpu_core_map[cpu];
455 else
456 return c->llc_shared_map;
460} 457}
461 458
462/* representing cpus for which sibling maps can be computed */ 459/* representing cpus for which sibling maps can be computed */
@@ -472,8 +469,8 @@ set_cpu_sibling_map(int cpu)
472 469
473 if (smp_num_siblings > 1) { 470 if (smp_num_siblings > 1) {
474 for_each_cpu_mask(i, cpu_sibling_setup_map) { 471 for_each_cpu_mask(i, cpu_sibling_setup_map) {
475 if (phys_proc_id[cpu] == phys_proc_id[i] && 472 if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
476 cpu_core_id[cpu] == cpu_core_id[i]) { 473 c[cpu].cpu_core_id == c[i].cpu_core_id) {
477 cpu_set(i, cpu_sibling_map[cpu]); 474 cpu_set(i, cpu_sibling_map[cpu]);
478 cpu_set(cpu, cpu_sibling_map[i]); 475 cpu_set(cpu, cpu_sibling_map[i]);
479 cpu_set(i, cpu_core_map[cpu]); 476 cpu_set(i, cpu_core_map[cpu]);
@@ -500,7 +497,7 @@ set_cpu_sibling_map(int cpu)
500 cpu_set(i, c[cpu].llc_shared_map); 497 cpu_set(i, c[cpu].llc_shared_map);
501 cpu_set(cpu, c[i].llc_shared_map); 498 cpu_set(cpu, c[i].llc_shared_map);
502 } 499 }
503 if (phys_proc_id[cpu] == phys_proc_id[i]) { 500 if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
504 cpu_set(i, cpu_core_map[cpu]); 501 cpu_set(i, cpu_core_map[cpu]);
505 cpu_set(cpu, cpu_core_map[i]); 502 cpu_set(cpu, cpu_core_map[i]);
506 /* 503 /*
@@ -1055,6 +1052,7 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1055 struct warm_boot_cpu_info info; 1052 struct warm_boot_cpu_info info;
1056 struct work_struct task; 1053 struct work_struct task;
1057 int apicid, ret; 1054 int apicid, ret;
1055 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1058 1056
1059 apicid = x86_cpu_to_apicid[cpu]; 1057 apicid = x86_cpu_to_apicid[cpu];
1060 if (apicid == BAD_APICID) { 1058 if (apicid == BAD_APICID) {
@@ -1062,6 +1060,18 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1062 goto exit; 1060 goto exit;
1063 } 1061 }
1064 1062
1063 /*
1064 * the CPU isn't initialized at boot time, allocate gdt table here.
1065 * cpu_init will initialize it
1066 */
1067 if (!cpu_gdt_descr->address) {
1068 cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
1069 if (!cpu_gdt_descr->address)
1070 printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
1071 ret = -ENOMEM;
1072 goto exit;
1073 }
1074
1065 info.complete = &done; 1075 info.complete = &done;
1066 info.apicid = apicid; 1076 info.apicid = apicid;
1067 info.cpu = cpu; 1077 info.cpu = cpu;
@@ -1339,8 +1349,8 @@ remove_siblinginfo(int cpu)
1339 cpu_clear(cpu, cpu_sibling_map[sibling]); 1349 cpu_clear(cpu, cpu_sibling_map[sibling]);
1340 cpus_clear(cpu_sibling_map[cpu]); 1350 cpus_clear(cpu_sibling_map[cpu]);
1341 cpus_clear(cpu_core_map[cpu]); 1351 cpus_clear(cpu_core_map[cpu]);
1342 phys_proc_id[cpu] = BAD_APICID; 1352 c[cpu].phys_proc_id = 0;
1343 cpu_core_id[cpu] = BAD_APICID; 1353 c[cpu].cpu_core_id = 0;
1344 cpu_clear(cpu, cpu_sibling_setup_map); 1354 cpu_clear(cpu, cpu_sibling_setup_map);
1345} 1355}
1346 1356
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 0bada1870bdf..c60419dee018 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -2,6 +2,8 @@
2 * linux/arch/i386/kernel/sysenter.c 2 * linux/arch/i386/kernel/sysenter.c
3 * 3 *
4 * (C) Copyright 2002 Linus Torvalds 4 * (C) Copyright 2002 Linus Torvalds
5 * Portions based on the vdso-randomization code from exec-shield:
6 * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
5 * 7 *
6 * This file contains the needed initializations to support sysenter. 8 * This file contains the needed initializations to support sysenter.
7 */ 9 */
@@ -13,12 +15,31 @@
13#include <linux/gfp.h> 15#include <linux/gfp.h>
14#include <linux/string.h> 16#include <linux/string.h>
15#include <linux/elf.h> 17#include <linux/elf.h>
18#include <linux/mm.h>
19#include <linux/module.h>
16 20
17#include <asm/cpufeature.h> 21#include <asm/cpufeature.h>
18#include <asm/msr.h> 22#include <asm/msr.h>
19#include <asm/pgtable.h> 23#include <asm/pgtable.h>
20#include <asm/unistd.h> 24#include <asm/unistd.h>
21 25
26/*
27 * Should the kernel map a VDSO page into processes and pass its
28 * address down to glibc upon exec()?
29 */
30unsigned int __read_mostly vdso_enabled = 1;
31
32EXPORT_SYMBOL_GPL(vdso_enabled);
33
34static int __init vdso_setup(char *s)
35{
36 vdso_enabled = simple_strtoul(s, NULL, 0);
37
38 return 1;
39}
40
41__setup("vdso=", vdso_setup);
42
22extern asmlinkage void sysenter_entry(void); 43extern asmlinkage void sysenter_entry(void);
23 44
24void enable_sep_cpu(void) 45void enable_sep_cpu(void)
@@ -45,23 +66,122 @@ void enable_sep_cpu(void)
45 */ 66 */
46extern const char vsyscall_int80_start, vsyscall_int80_end; 67extern const char vsyscall_int80_start, vsyscall_int80_end;
47extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; 68extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
69static void *syscall_page;
48 70
49int __init sysenter_setup(void) 71int __init sysenter_setup(void)
50{ 72{
51 void *page = (void *)get_zeroed_page(GFP_ATOMIC); 73 syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
52 74
53 __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC); 75#ifdef CONFIG_COMPAT_VDSO
76 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY);
77 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
78#else
79 /*
80 * In the non-compat case the ELF coredumping code needs the fixmap:
81 */
82 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO);
83#endif
54 84
55 if (!boot_cpu_has(X86_FEATURE_SEP)) { 85 if (!boot_cpu_has(X86_FEATURE_SEP)) {
56 memcpy(page, 86 memcpy(syscall_page,
57 &vsyscall_int80_start, 87 &vsyscall_int80_start,
58 &vsyscall_int80_end - &vsyscall_int80_start); 88 &vsyscall_int80_end - &vsyscall_int80_start);
59 return 0; 89 return 0;
60 } 90 }
61 91
62 memcpy(page, 92 memcpy(syscall_page,
63 &vsyscall_sysenter_start, 93 &vsyscall_sysenter_start,
64 &vsyscall_sysenter_end - &vsyscall_sysenter_start); 94 &vsyscall_sysenter_end - &vsyscall_sysenter_start);
65 95
66 return 0; 96 return 0;
67} 97}
98
99static struct page *syscall_nopage(struct vm_area_struct *vma,
100 unsigned long adr, int *type)
101{
102 struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
103 get_page(p);
104 return p;
105}
106
107/* Prevent VMA merging */
108static void syscall_vma_close(struct vm_area_struct *vma)
109{
110}
111
112static struct vm_operations_struct syscall_vm_ops = {
113 .close = syscall_vma_close,
114 .nopage = syscall_nopage,
115};
116
117/* Defined in vsyscall-sysenter.S */
118extern void SYSENTER_RETURN;
119
120/* Setup a VMA at program startup for the vsyscall page */
121int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
122{
123 struct vm_area_struct *vma;
124 struct mm_struct *mm = current->mm;
125 unsigned long addr;
126 int ret;
127
128 down_write(&mm->mmap_sem);
129 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
130 if (IS_ERR_VALUE(addr)) {
131 ret = addr;
132 goto up_fail;
133 }
134
135 vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL);
136 if (!vma) {
137 ret = -ENOMEM;
138 goto up_fail;
139 }
140
141 vma->vm_start = addr;
142 vma->vm_end = addr + PAGE_SIZE;
143 /* MAYWRITE to allow gdb to COW and set breakpoints */
144 vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
145 vma->vm_flags |= mm->def_flags;
146 vma->vm_page_prot = protection_map[vma->vm_flags & 7];
147 vma->vm_ops = &syscall_vm_ops;
148 vma->vm_mm = mm;
149
150 ret = insert_vm_struct(mm, vma);
151 if (ret)
152 goto free_vma;
153
154 current->mm->context.vdso = (void *)addr;
155 current_thread_info()->sysenter_return =
156 (void *)VDSO_SYM(&SYSENTER_RETURN);
157 mm->total_vm++;
158up_fail:
159 up_write(&mm->mmap_sem);
160 return ret;
161
162free_vma:
163 kmem_cache_free(vm_area_cachep, vma);
164 return ret;
165}
166
167const char *arch_vma_name(struct vm_area_struct *vma)
168{
169 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
170 return "[vdso]";
171 return NULL;
172}
173
174struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
175{
176 return NULL;
177}
178
179int in_gate_area(struct task_struct *task, unsigned long addr)
180{
181 return 0;
182}
183
184int in_gate_area_no_task(unsigned long addr)
185{
186 return 0;
187}
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 9d3074759856..5f43d0410122 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -82,13 +82,6 @@ extern unsigned long wall_jiffies;
82DEFINE_SPINLOCK(rtc_lock); 82DEFINE_SPINLOCK(rtc_lock);
83EXPORT_SYMBOL(rtc_lock); 83EXPORT_SYMBOL(rtc_lock);
84 84
85#include <asm/i8253.h>
86
87DEFINE_SPINLOCK(i8253_lock);
88EXPORT_SYMBOL(i8253_lock);
89
90struct timer_opts *cur_timer __read_mostly = &timer_none;
91
92/* 85/*
93 * This is a special lock that is owned by the CPU and holds the index 86 * This is a special lock that is owned by the CPU and holds the index
94 * register we are working with. It is required for NMI access to the 87 * register we are working with. It is required for NMI access to the
@@ -118,99 +111,19 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
118} 111}
119EXPORT_SYMBOL(rtc_cmos_write); 112EXPORT_SYMBOL(rtc_cmos_write);
120 113
121/*
122 * This version of gettimeofday has microsecond resolution
123 * and better than microsecond precision on fast x86 machines with TSC.
124 */
125void do_gettimeofday(struct timeval *tv)
126{
127 unsigned long seq;
128 unsigned long usec, sec;
129 unsigned long max_ntp_tick;
130
131 do {
132 unsigned long lost;
133
134 seq = read_seqbegin(&xtime_lock);
135
136 usec = cur_timer->get_offset();
137 lost = jiffies - wall_jiffies;
138
139 /*
140 * If time_adjust is negative then NTP is slowing the clock
141 * so make sure not to go into next possible interval.
142 * Better to lose some accuracy than have time go backwards..
143 */
144 if (unlikely(time_adjust < 0)) {
145 max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
146 usec = min(usec, max_ntp_tick);
147
148 if (lost)
149 usec += lost * max_ntp_tick;
150 }
151 else if (unlikely(lost))
152 usec += lost * (USEC_PER_SEC / HZ);
153
154 sec = xtime.tv_sec;
155 usec += (xtime.tv_nsec / 1000);
156 } while (read_seqretry(&xtime_lock, seq));
157
158 while (usec >= 1000000) {
159 usec -= 1000000;
160 sec++;
161 }
162
163 tv->tv_sec = sec;
164 tv->tv_usec = usec;
165}
166
167EXPORT_SYMBOL(do_gettimeofday);
168
169int do_settimeofday(struct timespec *tv)
170{
171 time_t wtm_sec, sec = tv->tv_sec;
172 long wtm_nsec, nsec = tv->tv_nsec;
173
174 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
175 return -EINVAL;
176
177 write_seqlock_irq(&xtime_lock);
178 /*
179 * This is revolting. We need to set "xtime" correctly. However, the
180 * value in this location is the value at the most recent update of
181 * wall time. Discover what correction gettimeofday() would have
182 * made, and then undo it!
183 */
184 nsec -= cur_timer->get_offset() * NSEC_PER_USEC;
185 nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
186
187 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
188 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
189
190 set_normalized_timespec(&xtime, sec, nsec);
191 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
192
193 ntp_clear();
194 write_sequnlock_irq(&xtime_lock);
195 clock_was_set();
196 return 0;
197}
198
199EXPORT_SYMBOL(do_settimeofday);
200
201static int set_rtc_mmss(unsigned long nowtime) 114static int set_rtc_mmss(unsigned long nowtime)
202{ 115{
203 int retval; 116 int retval;
204 117 unsigned long flags;
205 WARN_ON(irqs_disabled());
206 118
207 /* gets recalled with irq locally disabled */ 119 /* gets recalled with irq locally disabled */
208 spin_lock_irq(&rtc_lock); 120 /* XXX - does irqsave resolve this? -johnstul */
121 spin_lock_irqsave(&rtc_lock, flags);
209 if (efi_enabled) 122 if (efi_enabled)
210 retval = efi_set_rtc_mmss(nowtime); 123 retval = efi_set_rtc_mmss(nowtime);
211 else 124 else
212 retval = mach_set_rtc_mmss(nowtime); 125 retval = mach_set_rtc_mmss(nowtime);
213 spin_unlock_irq(&rtc_lock); 126 spin_unlock_irqrestore(&rtc_lock, flags);
214 127
215 return retval; 128 return retval;
216} 129}
@@ -218,16 +131,6 @@ static int set_rtc_mmss(unsigned long nowtime)
218 131
219int timer_ack; 132int timer_ack;
220 133
221/* monotonic_clock(): returns # of nanoseconds passed since time_init()
222 * Note: This function is required to return accurate
223 * time even in the absence of multiple timer ticks.
224 */
225unsigned long long monotonic_clock(void)
226{
227 return cur_timer->monotonic_clock();
228}
229EXPORT_SYMBOL(monotonic_clock);
230
231#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) 134#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
232unsigned long profile_pc(struct pt_regs *regs) 135unsigned long profile_pc(struct pt_regs *regs)
233{ 136{
@@ -242,11 +145,21 @@ EXPORT_SYMBOL(profile_pc);
242#endif 145#endif
243 146
244/* 147/*
245 * timer_interrupt() needs to keep up the real-time clock, 148 * This is the same as the above, except we _also_ save the current
246 * as well as call the "do_timer()" routine every clocktick 149 * Time Stamp Counter value at the time of the timer interrupt, so that
150 * we later on can estimate the time of day more exactly.
247 */ 151 */
248static inline void do_timer_interrupt(int irq, struct pt_regs *regs) 152irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
249{ 153{
154 /*
155 * Here we are in the timer irq handler. We just have irqs locally
156 * disabled but we don't know if the timer_bh is running on the other
157 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
158 * the irq version of write_lock because as just said we have irq
159 * locally disabled. -arca
160 */
161 write_seqlock(&xtime_lock);
162
250#ifdef CONFIG_X86_IO_APIC 163#ifdef CONFIG_X86_IO_APIC
251 if (timer_ack) { 164 if (timer_ack) {
252 /* 165 /*
@@ -279,27 +192,6 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
279 irq = inb_p( 0x61 ); /* read the current state */ 192 irq = inb_p( 0x61 ); /* read the current state */
280 outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ 193 outb_p( irq|0x80, 0x61 ); /* reset the IRQ */
281 } 194 }
282}
283
284/*
285 * This is the same as the above, except we _also_ save the current
286 * Time Stamp Counter value at the time of the timer interrupt, so that
287 * we later on can estimate the time of day more exactly.
288 */
289irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
290{
291 /*
292 * Here we are in the timer irq handler. We just have irqs locally
293 * disabled but we don't know if the timer_bh is running on the other
294 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
295 * the irq version of write_lock because as just said we have irq
296 * locally disabled. -arca
297 */
298 write_seqlock(&xtime_lock);
299
300 cur_timer->mark_offset();
301
302 do_timer_interrupt(irq, regs);
303 195
304 write_sequnlock(&xtime_lock); 196 write_sequnlock(&xtime_lock);
305 197
@@ -380,7 +272,6 @@ void notify_arch_cmos_timer(void)
380 272
381static long clock_cmos_diff, sleep_start; 273static long clock_cmos_diff, sleep_start;
382 274
383static struct timer_opts *last_timer;
384static int timer_suspend(struct sys_device *dev, pm_message_t state) 275static int timer_suspend(struct sys_device *dev, pm_message_t state)
385{ 276{
386 /* 277 /*
@@ -389,10 +280,6 @@ static int timer_suspend(struct sys_device *dev, pm_message_t state)
389 clock_cmos_diff = -get_cmos_time(); 280 clock_cmos_diff = -get_cmos_time();
390 clock_cmos_diff += get_seconds(); 281 clock_cmos_diff += get_seconds();
391 sleep_start = get_cmos_time(); 282 sleep_start = get_cmos_time();
392 last_timer = cur_timer;
393 cur_timer = &timer_none;
394 if (last_timer->suspend)
395 last_timer->suspend(state);
396 return 0; 283 return 0;
397} 284}
398 285
@@ -415,10 +302,6 @@ static int timer_resume(struct sys_device *dev)
415 jiffies_64 += sleep_length; 302 jiffies_64 += sleep_length;
416 wall_jiffies += sleep_length; 303 wall_jiffies += sleep_length;
417 write_sequnlock_irqrestore(&xtime_lock, flags); 304 write_sequnlock_irqrestore(&xtime_lock, flags);
418 if (last_timer->resume)
419 last_timer->resume();
420 cur_timer = last_timer;
421 last_timer = NULL;
422 touch_softlockup_watchdog(); 305 touch_softlockup_watchdog();
423 return 0; 306 return 0;
424} 307}
@@ -460,9 +343,6 @@ static void __init hpet_time_init(void)
460 printk("Using HPET for base-timer\n"); 343 printk("Using HPET for base-timer\n");
461 } 344 }
462 345
463 cur_timer = select_timer();
464 printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
465
466 time_init_hook(); 346 time_init_hook();
467} 347}
468#endif 348#endif
@@ -484,8 +364,5 @@ void __init time_init(void)
484 set_normalized_timespec(&wall_to_monotonic, 364 set_normalized_timespec(&wall_to_monotonic,
485 -xtime.tv_sec, -xtime.tv_nsec); 365 -xtime.tv_sec, -xtime.tv_nsec);
486 366
487 cur_timer = select_timer();
488 printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
489
490 time_init_hook(); 367 time_init_hook();
491} 368}
diff --git a/arch/i386/kernel/timers/Makefile b/arch/i386/kernel/timers/Makefile
deleted file mode 100644
index 8fa12be658dd..000000000000
--- a/arch/i386/kernel/timers/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
1#
2# Makefile for x86 timers
3#
4
5obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o
6
7obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o
8obj-$(CONFIG_HPET_TIMER) += timer_hpet.o
9obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o
diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c
deleted file mode 100644
index 8163fe0cf1f0..000000000000
--- a/arch/i386/kernel/timers/common.c
+++ /dev/null
@@ -1,172 +0,0 @@
1/*
2 * Common functions used across the timers go here
3 */
4
5#include <linux/init.h>
6#include <linux/timex.h>
7#include <linux/errno.h>
8#include <linux/jiffies.h>
9#include <linux/module.h>
10
11#include <asm/io.h>
12#include <asm/timer.h>
13#include <asm/hpet.h>
14
15#include "mach_timer.h"
16
17/* ------ Calibrate the TSC -------
18 * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
19 * Too much 64-bit arithmetic here to do this cleanly in C, and for
20 * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
21 * output busy loop as low as possible. We avoid reading the CTC registers
22 * directly because of the awkward 8-bit access mechanism of the 82C54
23 * device.
24 */
25
26#define CALIBRATE_TIME (5 * 1000020/HZ)
27
28unsigned long calibrate_tsc(void)
29{
30 mach_prepare_counter();
31
32 {
33 unsigned long startlow, starthigh;
34 unsigned long endlow, endhigh;
35 unsigned long count;
36
37 rdtsc(startlow,starthigh);
38 mach_countup(&count);
39 rdtsc(endlow,endhigh);
40
41
42 /* Error: ECTCNEVERSET */
43 if (count <= 1)
44 goto bad_ctc;
45
46 /* 64-bit subtract - gcc just messes up with long longs */
47 __asm__("subl %2,%0\n\t"
48 "sbbl %3,%1"
49 :"=a" (endlow), "=d" (endhigh)
50 :"g" (startlow), "g" (starthigh),
51 "0" (endlow), "1" (endhigh));
52
53 /* Error: ECPUTOOFAST */
54 if (endhigh)
55 goto bad_ctc;
56
57 /* Error: ECPUTOOSLOW */
58 if (endlow <= CALIBRATE_TIME)
59 goto bad_ctc;
60
61 __asm__("divl %2"
62 :"=a" (endlow), "=d" (endhigh)
63 :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME));
64
65 return endlow;
66 }
67
68 /*
69 * The CTC wasn't reliable: we got a hit on the very first read,
70 * or the CPU was so fast/slow that the quotient wouldn't fit in
71 * 32 bits..
72 */
73bad_ctc:
74 return 0;
75}
76
77#ifdef CONFIG_HPET_TIMER
78/* ------ Calibrate the TSC using HPET -------
79 * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq.
80 * Second output is parameter 1 (when non NULL)
81 * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet().
82 * calibrate_tsc() calibrates the processor TSC by comparing
83 * it to the HPET timer of known frequency.
84 * Too much 64-bit arithmetic here to do this cleanly in C
85 */
86#define CALIBRATE_CNT_HPET (5 * hpet_tick)
87#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC)
88
89unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr)
90{
91 unsigned long tsc_startlow, tsc_starthigh;
92 unsigned long tsc_endlow, tsc_endhigh;
93 unsigned long hpet_start, hpet_end;
94 unsigned long result, remain;
95
96 hpet_start = hpet_readl(HPET_COUNTER);
97 rdtsc(tsc_startlow, tsc_starthigh);
98 do {
99 hpet_end = hpet_readl(HPET_COUNTER);
100 } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET);
101 rdtsc(tsc_endlow, tsc_endhigh);
102
103 /* 64-bit subtract - gcc just messes up with long longs */
104 __asm__("subl %2,%0\n\t"
105 "sbbl %3,%1"
106 :"=a" (tsc_endlow), "=d" (tsc_endhigh)
107 :"g" (tsc_startlow), "g" (tsc_starthigh),
108 "0" (tsc_endlow), "1" (tsc_endhigh));
109
110 /* Error: ECPUTOOFAST */
111 if (tsc_endhigh)
112 goto bad_calibration;
113
114 /* Error: ECPUTOOSLOW */
115 if (tsc_endlow <= CALIBRATE_TIME_HPET)
116 goto bad_calibration;
117
118 ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET);
119 if (remain > (tsc_endlow >> 1))
120 result++; /* rounding the result */
121
122 if (tsc_hpet_quotient_ptr) {
123 unsigned long tsc_hpet_quotient;
124
125 ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0,
126 CALIBRATE_CNT_HPET);
127 if (remain > (tsc_endlow >> 1))
128 tsc_hpet_quotient++; /* rounding the result */
129 *tsc_hpet_quotient_ptr = tsc_hpet_quotient;
130 }
131
132 return result;
133bad_calibration:
134 /*
135 * the CPU was so fast/slow that the quotient wouldn't fit in
136 * 32 bits..
137 */
138 return 0;
139}
140#endif
141
142
143unsigned long read_timer_tsc(void)
144{
145 unsigned long retval;
146 rdtscl(retval);
147 return retval;
148}
149
150
151/* calculate cpu_khz */
152void init_cpu_khz(void)
153{
154 if (cpu_has_tsc) {
155 unsigned long tsc_quotient = calibrate_tsc();
156 if (tsc_quotient) {
157 /* report CPU clock rate in Hz.
158 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
159 * clock/second. Our precision is about 100 ppm.
160 */
161 { unsigned long eax=0, edx=1000;
162 __asm__("divl %2"
163 :"=a" (cpu_khz), "=d" (edx)
164 :"r" (tsc_quotient),
165 "0" (eax), "1" (edx));
166 printk("Detected %u.%03u MHz processor.\n",
167 cpu_khz / 1000, cpu_khz % 1000);
168 }
169 }
170 }
171}
172
diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c
deleted file mode 100644
index 7e39ed8e33f8..000000000000
--- a/arch/i386/kernel/timers/timer.c
+++ /dev/null
@@ -1,75 +0,0 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/string.h>
4#include <asm/timer.h>
5
6#ifdef CONFIG_HPET_TIMER
7/*
8 * HPET memory read is slower than tsc reads, but is more dependable as it
9 * always runs at constant frequency and reduces complexity due to
10 * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use
11 * timer_pit when HPET is active. So, we default to timer_tsc.
12 */
13#endif
14/* list of timers, ordered by preference, NULL terminated */
15static struct init_timer_opts* __initdata timers[] = {
16#ifdef CONFIG_X86_CYCLONE_TIMER
17 &timer_cyclone_init,
18#endif
19#ifdef CONFIG_HPET_TIMER
20 &timer_hpet_init,
21#endif
22#ifdef CONFIG_X86_PM_TIMER
23 &timer_pmtmr_init,
24#endif
25 &timer_tsc_init,
26 &timer_pit_init,
27 NULL,
28};
29
30static char clock_override[10] __initdata;
31
32static int __init clock_setup(char* str)
33{
34 if (str)
35 strlcpy(clock_override, str, sizeof(clock_override));
36 return 1;
37}
38__setup("clock=", clock_setup);
39
40
41/* The chosen timesource has been found to be bad.
42 * Fall back to a known good timesource (the PIT)
43 */
44void clock_fallback(void)
45{
46 cur_timer = &timer_pit;
47}
48
49/* iterates through the list of timers, returning the first
50 * one that initializes successfully.
51 */
52struct timer_opts* __init select_timer(void)
53{
54 int i = 0;
55
56 /* find most preferred working timer */
57 while (timers[i]) {
58 if (timers[i]->init)
59 if (timers[i]->init(clock_override) == 0)
60 return timers[i]->opts;
61 ++i;
62 }
63
64 panic("select_timer: Cannot find a suitable timer\n");
65 return NULL;
66}
67
68int read_current_timer(unsigned long *timer_val)
69{
70 if (cur_timer->read_timer) {
71 *timer_val = cur_timer->read_timer();
72 return 0;
73 }
74 return -1;
75}
diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c
deleted file mode 100644
index 13892a65c941..000000000000
--- a/arch/i386/kernel/timers/timer_cyclone.c
+++ /dev/null
@@ -1,259 +0,0 @@
1/* Cyclone-timer:
2 * This code implements timer_ops for the cyclone counter found
3 * on IBM x440, x360, and other Summit based systems.
4 *
5 * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com)
6 */
7
8
9#include <linux/spinlock.h>
10#include <linux/init.h>
11#include <linux/timex.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/jiffies.h>
15
16#include <asm/timer.h>
17#include <asm/io.h>
18#include <asm/pgtable.h>
19#include <asm/fixmap.h>
20#include <asm/i8253.h>
21
22#include "io_ports.h"
23
24/* Number of usecs that the last interrupt was delayed */
25static int delay_at_last_interrupt;
26
27#define CYCLONE_CBAR_ADDR 0xFEB00CD0
28#define CYCLONE_PMCC_OFFSET 0x51A0
29#define CYCLONE_MPMC_OFFSET 0x51D0
30#define CYCLONE_MPCS_OFFSET 0x51A8
31#define CYCLONE_TIMER_FREQ 100000000
32#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */
33int use_cyclone = 0;
34
35static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */
36static u32 last_cyclone_low;
37static u32 last_cyclone_high;
38static unsigned long long monotonic_base;
39static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
40
41/* helper macro to atomically read both cyclone counter registers */
42#define read_cyclone_counter(low,high) \
43 do{ \
44 high = cyclone_timer[1]; low = cyclone_timer[0]; \
45 } while (high != cyclone_timer[1]);
46
47
48static void mark_offset_cyclone(void)
49{
50 unsigned long lost, delay;
51 unsigned long delta = last_cyclone_low;
52 int count;
53 unsigned long long this_offset, last_offset;
54
55 write_seqlock(&monotonic_lock);
56 last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
57
58 spin_lock(&i8253_lock);
59 read_cyclone_counter(last_cyclone_low,last_cyclone_high);
60
61 /* read values for delay_at_last_interrupt */
62 outb_p(0x00, 0x43); /* latch the count ASAP */
63
64 count = inb_p(0x40); /* read the latched count */
65 count |= inb(0x40) << 8;
66
67 /*
68 * VIA686a test code... reset the latch if count > max + 1
69 * from timer_pit.c - cjb
70 */
71 if (count > LATCH) {
72 outb_p(0x34, PIT_MODE);
73 outb_p(LATCH & 0xff, PIT_CH0);
74 outb(LATCH >> 8, PIT_CH0);
75 count = LATCH - 1;
76 }
77 spin_unlock(&i8253_lock);
78
79 /* lost tick compensation */
80 delta = last_cyclone_low - delta;
81 delta /= (CYCLONE_TIMER_FREQ/1000000);
82 delta += delay_at_last_interrupt;
83 lost = delta/(1000000/HZ);
84 delay = delta%(1000000/HZ);
85 if (lost >= 2)
86 jiffies_64 += lost-1;
87
88 /* update the monotonic base value */
89 this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
90 monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK;
91 write_sequnlock(&monotonic_lock);
92
93 /* calculate delay_at_last_interrupt */
94 count = ((LATCH-1) - count) * TICK_SIZE;
95 delay_at_last_interrupt = (count + LATCH/2) / LATCH;
96
97
98 /* catch corner case where tick rollover occured
99 * between cyclone and pit reads (as noted when
100 * usec delta is > 90% # of usecs/tick)
101 */
102 if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
103 jiffies_64++;
104}
105
106static unsigned long get_offset_cyclone(void)
107{
108 u32 offset;
109
110 if(!cyclone_timer)
111 return delay_at_last_interrupt;
112
113 /* Read the cyclone timer */
114 offset = cyclone_timer[0];
115
116 /* .. relative to previous jiffy */
117 offset = offset - last_cyclone_low;
118
119 /* convert cyclone ticks to microseconds */
120 /* XXX slow, can we speed this up? */
121 offset = offset/(CYCLONE_TIMER_FREQ/1000000);
122
123 /* our adjusted time offset in microseconds */
124 return delay_at_last_interrupt + offset;
125}
126
127static unsigned long long monotonic_clock_cyclone(void)
128{
129 u32 now_low, now_high;
130 unsigned long long last_offset, this_offset, base;
131 unsigned long long ret;
132 unsigned seq;
133
134 /* atomically read monotonic base & last_offset */
135 do {
136 seq = read_seqbegin(&monotonic_lock);
137 last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
138 base = monotonic_base;
139 } while (read_seqretry(&monotonic_lock, seq));
140
141
142 /* Read the cyclone counter */
143 read_cyclone_counter(now_low,now_high);
144 this_offset = ((unsigned long long)now_high<<32)|now_low;
145
146 /* convert to nanoseconds */
147 ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK);
148 return ret * (1000000000 / CYCLONE_TIMER_FREQ);
149}
150
151static int __init init_cyclone(char* override)
152{
153 u32* reg;
154 u32 base; /* saved cyclone base address */
155 u32 pageaddr; /* page that contains cyclone_timer register */
156 u32 offset; /* offset from pageaddr to cyclone_timer register */
157 int i;
158
159 /* check clock override */
160 if (override[0] && strncmp(override,"cyclone",7))
161 return -ENODEV;
162
163 /*make sure we're on a summit box*/
164 if(!use_cyclone) return -ENODEV;
165
166 printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
167
168 /* find base address */
169 pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK;
170 offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK);
171 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
172 reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
173 if(!reg){
174 printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
175 return -ENODEV;
176 }
177 base = *reg;
178 if(!base){
179 printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
180 return -ENODEV;
181 }
182
183 /* setup PMCC */
184 pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK;
185 offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK);
186 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
187 reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
188 if(!reg){
189 printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
190 return -ENODEV;
191 }
192 reg[0] = 0x00000001;
193
194 /* setup MPCS */
195 pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK;
196 offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK);
197 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
198 reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
199 if(!reg){
200 printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
201 return -ENODEV;
202 }
203 reg[0] = 0x00000001;
204
205 /* map in cyclone_timer */
206 pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK;
207 offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK);
208 set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
209 cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
210 if(!cyclone_timer){
211 printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
212 return -ENODEV;
213 }
214
215 /*quick test to make sure its ticking*/
216 for(i=0; i<3; i++){
217 u32 old = cyclone_timer[0];
218 int stall = 100;
219 while(stall--) barrier();
220 if(cyclone_timer[0] == old){
221 printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
222 cyclone_timer = 0;
223 return -ENODEV;
224 }
225 }
226
227 init_cpu_khz();
228
229 /* Everything looks good! */
230 return 0;
231}
232
233
234static void delay_cyclone(unsigned long loops)
235{
236 unsigned long bclock, now;
237 if(!cyclone_timer)
238 return;
239 bclock = cyclone_timer[0];
240 do {
241 rep_nop();
242 now = cyclone_timer[0];
243 } while ((now-bclock) < loops);
244}
245/************************************************************/
246
247/* cyclone timer_opts struct */
248static struct timer_opts timer_cyclone = {
249 .name = "cyclone",
250 .mark_offset = mark_offset_cyclone,
251 .get_offset = get_offset_cyclone,
252 .monotonic_clock = monotonic_clock_cyclone,
253 .delay = delay_cyclone,
254};
255
256struct init_timer_opts __initdata timer_cyclone_init = {
257 .init = init_cyclone,
258 .opts = &timer_cyclone,
259};
diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
deleted file mode 100644
index 17a6fe7166e7..000000000000
--- a/arch/i386/kernel/timers/timer_hpet.c
+++ /dev/null
@@ -1,217 +0,0 @@
1/*
2 * This code largely moved from arch/i386/kernel/time.c.
3 * See comments there for proper credits.
4 */
5
6#include <linux/spinlock.h>
7#include <linux/init.h>
8#include <linux/timex.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/jiffies.h>
12
13#include <asm/timer.h>
14#include <asm/io.h>
15#include <asm/processor.h>
16
17#include "io_ports.h"
18#include "mach_timer.h"
19#include <asm/hpet.h>
20
21static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */
22static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */
23static unsigned long hpet_last; /* hpet counter value at last tick*/
24static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
25static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
26static unsigned long long monotonic_base;
27static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
28
29/* convert from cycles(64bits) => nanoseconds (64bits)
30 * basic equation:
31 * ns = cycles / (freq / ns_per_sec)
32 * ns = cycles * (ns_per_sec / freq)
33 * ns = cycles * (10^9 / (cpu_khz * 10^3))
34 * ns = cycles * (10^6 / cpu_khz)
35 *
36 * Then we use scaling math (suggested by george@mvista.com) to get:
37 * ns = cycles * (10^6 * SC / cpu_khz) / SC
38 * ns = cycles * cyc2ns_scale / SC
39 *
40 * And since SC is a constant power of two, we can convert the div
41 * into a shift.
42 *
43 * We can use khz divisor instead of mhz to keep a better percision, since
44 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
45 * (mathieu.desnoyers@polymtl.ca)
46 *
47 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
48 */
49static unsigned long cyc2ns_scale __read_mostly;
50#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
51
52static inline void set_cyc2ns_scale(unsigned long cpu_khz)
53{
54 cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
55}
56
57static inline unsigned long long cycles_2_ns(unsigned long long cyc)
58{
59 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
60}
61
62static unsigned long long monotonic_clock_hpet(void)
63{
64 unsigned long long last_offset, this_offset, base;
65 unsigned seq;
66
67 /* atomically read monotonic base & last_offset */
68 do {
69 seq = read_seqbegin(&monotonic_lock);
70 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
71 base = monotonic_base;
72 } while (read_seqretry(&monotonic_lock, seq));
73
74 /* Read the Time Stamp Counter */
75 rdtscll(this_offset);
76
77 /* return the value in ns */
78 return base + cycles_2_ns(this_offset - last_offset);
79}
80
81static unsigned long get_offset_hpet(void)
82{
83 register unsigned long eax, edx;
84
85 eax = hpet_readl(HPET_COUNTER);
86 eax -= hpet_last; /* hpet delta */
87 eax = min(hpet_tick, eax);
88 /*
89 * Time offset = (hpet delta) * ( usecs per HPET clock )
90 * = (hpet delta) * ( usecs per tick / HPET clocks per tick)
91 * = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
92 *
93 * Where,
94 * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
95 *
96 * Using a mull instead of a divl saves some cycles in critical path.
97 */
98 ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax);
99
100 /* our adjusted time offset in microseconds */
101 return edx;
102}
103
104static void mark_offset_hpet(void)
105{
106 unsigned long long this_offset, last_offset;
107 unsigned long offset;
108
109 write_seqlock(&monotonic_lock);
110 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
111 rdtsc(last_tsc_low, last_tsc_high);
112
113 if (hpet_use_timer)
114 offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
115 else
116 offset = hpet_readl(HPET_COUNTER);
117 if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) {
118 int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1;
119 jiffies_64 += lost_ticks;
120 }
121 hpet_last = offset;
122
123 /* update the monotonic base value */
124 this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
125 monotonic_base += cycles_2_ns(this_offset - last_offset);
126 write_sequnlock(&monotonic_lock);
127}
128
129static void delay_hpet(unsigned long loops)
130{
131 unsigned long hpet_start, hpet_end;
132 unsigned long eax;
133
134 /* loops is the number of cpu cycles. Convert it to hpet clocks */
135 ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops);
136
137 hpet_start = hpet_readl(HPET_COUNTER);
138 do {
139 rep_nop();
140 hpet_end = hpet_readl(HPET_COUNTER);
141 } while ((hpet_end - hpet_start) < (loops));
142}
143
144static struct timer_opts timer_hpet;
145
146static int __init init_hpet(char* override)
147{
148 unsigned long result, remain;
149
150 /* check clock override */
151 if (override[0] && strncmp(override,"hpet",4))
152 return -ENODEV;
153
154 if (!is_hpet_enabled())
155 return -ENODEV;
156
157 printk("Using HPET for gettimeofday\n");
158 if (cpu_has_tsc) {
159 unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient);
160 if (tsc_quotient) {
161 /* report CPU clock rate in Hz.
162 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
163 * clock/second. Our precision is about 100 ppm.
164 */
165 { unsigned long eax=0, edx=1000;
166 ASM_DIV64_REG(cpu_khz, edx, tsc_quotient,
167 eax, edx);
168 printk("Detected %u.%03u MHz processor.\n",
169 cpu_khz / 1000, cpu_khz % 1000);
170 }
171 set_cyc2ns_scale(cpu_khz);
172 }
173 /* set this only when cpu_has_tsc */
174 timer_hpet.read_timer = read_timer_tsc;
175 }
176
177 /*
178 * Math to calculate hpet to usec multiplier
179 * Look for the comments at get_offset_hpet()
180 */
181 ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC);
182 if (remain > (hpet_tick >> 1))
183 result++; /* rounding the result */
184 hpet_usec_quotient = result;
185
186 return 0;
187}
188
189static int hpet_resume(void)
190{
191 write_seqlock(&monotonic_lock);
192 /* Assume this is the last mark offset time */
193 rdtsc(last_tsc_low, last_tsc_high);
194
195 if (hpet_use_timer)
196 hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick;
197 else
198 hpet_last = hpet_readl(HPET_COUNTER);
199 write_sequnlock(&monotonic_lock);
200 return 0;
201}
202/************************************************************/
203
204/* tsc timer_opts struct */
205static struct timer_opts timer_hpet __read_mostly = {
206 .name = "hpet",
207 .mark_offset = mark_offset_hpet,
208 .get_offset = get_offset_hpet,
209 .monotonic_clock = monotonic_clock_hpet,
210 .delay = delay_hpet,
211 .resume = hpet_resume,
212};
213
214struct init_timer_opts __initdata timer_hpet_init = {
215 .init = init_hpet,
216 .opts = &timer_hpet,
217};
diff --git a/arch/i386/kernel/timers/timer_none.c b/arch/i386/kernel/timers/timer_none.c
deleted file mode 100644
index 4ea2f414dbbd..000000000000
--- a/arch/i386/kernel/timers/timer_none.c
+++ /dev/null
@@ -1,39 +0,0 @@
1#include <linux/init.h>
2#include <asm/timer.h>
3
4static void mark_offset_none(void)
5{
6 /* nothing needed */
7}
8
9static unsigned long get_offset_none(void)
10{
11 return 0;
12}
13
14static unsigned long long monotonic_clock_none(void)
15{
16 return 0;
17}
18
19static void delay_none(unsigned long loops)
20{
21 int d0;
22 __asm__ __volatile__(
23 "\tjmp 1f\n"
24 ".align 16\n"
25 "1:\tjmp 2f\n"
26 ".align 16\n"
27 "2:\tdecl %0\n\tjns 2b"
28 :"=&a" (d0)
29 :"0" (loops));
30}
31
32/* none timer_opts struct */
33struct timer_opts timer_none = {
34 .name = "none",
35 .mark_offset = mark_offset_none,
36 .get_offset = get_offset_none,
37 .monotonic_clock = monotonic_clock_none,
38 .delay = delay_none,
39};
diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c
deleted file mode 100644
index b9b6bd56b9ba..000000000000
--- a/arch/i386/kernel/timers/timer_pit.c
+++ /dev/null
@@ -1,177 +0,0 @@
1/*
2 * This code largely moved from arch/i386/kernel/time.c.
3 * See comments there for proper credits.
4 */
5
6#include <linux/spinlock.h>
7#include <linux/module.h>
8#include <linux/device.h>
9#include <linux/sysdev.h>
10#include <linux/timex.h>
11#include <asm/delay.h>
12#include <asm/mpspec.h>
13#include <asm/timer.h>
14#include <asm/smp.h>
15#include <asm/io.h>
16#include <asm/arch_hooks.h>
17#include <asm/i8253.h>
18
19#include "do_timer.h"
20#include "io_ports.h"
21
22static int count_p; /* counter in get_offset_pit() */
23
24static int __init init_pit(char* override)
25{
26 /* check clock override */
27 if (override[0] && strncmp(override,"pit",3))
28 printk(KERN_ERR "Warning: clock= override failed. Defaulting "
29 "to PIT\n");
30 init_cpu_khz();
31 count_p = LATCH;
32 return 0;
33}
34
35static void mark_offset_pit(void)
36{
37 /* nothing needed */
38}
39
40static unsigned long long monotonic_clock_pit(void)
41{
42 return 0;
43}
44
45static void delay_pit(unsigned long loops)
46{
47 int d0;
48 __asm__ __volatile__(
49 "\tjmp 1f\n"
50 ".align 16\n"
51 "1:\tjmp 2f\n"
52 ".align 16\n"
53 "2:\tdecl %0\n\tjns 2b"
54 :"=&a" (d0)
55 :"0" (loops));
56}
57
58
59/* This function must be called with xtime_lock held.
60 * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs
61 *
62 * However, the pc-audio speaker driver changes the divisor so that
63 * it gets interrupted rather more often - it loads 64 into the
64 * counter rather than 11932! This has an adverse impact on
65 * do_gettimeoffset() -- it stops working! What is also not
66 * good is that the interval that our timer function gets called
67 * is no longer 10.0002 ms, but 9.9767 ms. To get around this
68 * would require using a different timing source. Maybe someone
69 * could use the RTC - I know that this can interrupt at frequencies
70 * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix
71 * it so that at startup, the timer code in sched.c would select
72 * using either the RTC or the 8253 timer. The decision would be
73 * based on whether there was any other device around that needed
74 * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz,
75 * and then do some jiggery to have a version of do_timer that
76 * advanced the clock by 1/1024 s. Every time that reached over 1/100
77 * of a second, then do all the old code. If the time was kept correct
78 * then do_gettimeoffset could just return 0 - there is no low order
79 * divider that can be accessed.
80 *
81 * Ideally, you would be able to use the RTC for the speaker driver,
82 * but it appears that the speaker driver really needs interrupt more
83 * often than every 120 us or so.
84 *
85 * Anyway, this needs more thought.... pjsg (1993-08-28)
86 *
87 * If you are really that interested, you should be reading
88 * comp.protocols.time.ntp!
89 */
90
91static unsigned long get_offset_pit(void)
92{
93 int count;
94 unsigned long flags;
95 static unsigned long jiffies_p = 0;
96
97 /*
98 * cache volatile jiffies temporarily; we have xtime_lock.
99 */
100 unsigned long jiffies_t;
101
102 spin_lock_irqsave(&i8253_lock, flags);
103 /* timer count may underflow right here */
104 outb_p(0x00, PIT_MODE); /* latch the count ASAP */
105
106 count = inb_p(PIT_CH0); /* read the latched count */
107
108 /*
109 * We do this guaranteed double memory access instead of a _p
110 * postfix in the previous port access. Wheee, hackady hack
111 */
112 jiffies_t = jiffies;
113
114 count |= inb_p(PIT_CH0) << 8;
115
116 /* VIA686a test code... reset the latch if count > max + 1 */
117 if (count > LATCH) {
118 outb_p(0x34, PIT_MODE);
119 outb_p(LATCH & 0xff, PIT_CH0);
120 outb(LATCH >> 8, PIT_CH0);
121 count = LATCH - 1;
122 }
123
124 /*
125 * avoiding timer inconsistencies (they are rare, but they happen)...
126 * there are two kinds of problems that must be avoided here:
127 * 1. the timer counter underflows
128 * 2. hardware problem with the timer, not giving us continuous time,
129 * the counter does small "jumps" upwards on some Pentium systems,
130 * (see c't 95/10 page 335 for Neptun bug.)
131 */
132
133 if( jiffies_t == jiffies_p ) {
134 if( count > count_p ) {
135 /* the nutcase */
136 count = do_timer_overflow(count);
137 }
138 } else
139 jiffies_p = jiffies_t;
140
141 count_p = count;
142
143 spin_unlock_irqrestore(&i8253_lock, flags);
144
145 count = ((LATCH-1) - count) * TICK_SIZE;
146 count = (count + LATCH/2) / LATCH;
147
148 return count;
149}
150
151
152/* tsc timer_opts struct */
153struct timer_opts timer_pit = {
154 .name = "pit",
155 .mark_offset = mark_offset_pit,
156 .get_offset = get_offset_pit,
157 .monotonic_clock = monotonic_clock_pit,
158 .delay = delay_pit,
159};
160
161struct init_timer_opts __initdata timer_pit_init = {
162 .init = init_pit,
163 .opts = &timer_pit,
164};
165
166void setup_pit_timer(void)
167{
168 unsigned long flags;
169
170 spin_lock_irqsave(&i8253_lock, flags);
171 outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
172 udelay(10);
173 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
174 udelay(10);
175 outb(LATCH >> 8 , PIT_CH0); /* MSB */
176 spin_unlock_irqrestore(&i8253_lock, flags);
177}
diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c
deleted file mode 100644
index 144e94a04933..000000000000
--- a/arch/i386/kernel/timers/timer_pm.c
+++ /dev/null
@@ -1,342 +0,0 @@
1/*
2 * (C) Dominik Brodowski <linux@brodo.de> 2003
3 *
4 * Driver to use the Power Management Timer (PMTMR) available in some
5 * southbridges as primary timing source for the Linux kernel.
6 *
7 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
8 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
9 *
10 * This file is licensed under the GPL v2.
11 */
12
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/device.h>
17#include <linux/init.h>
18#include <linux/pci.h>
19#include <asm/types.h>
20#include <asm/timer.h>
21#include <asm/smp.h>
22#include <asm/io.h>
23#include <asm/arch_hooks.h>
24
25#include <linux/timex.h>
26#include "mach_timer.h"
27
28/* Number of PMTMR ticks expected during calibration run */
29#define PMTMR_TICKS_PER_SEC 3579545
30#define PMTMR_EXPECTED_RATE \
31 ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10))
32
33
34/* The I/O port the PMTMR resides at.
35 * The location is detected during setup_arch(),
36 * in arch/i386/acpi/boot.c */
37u32 pmtmr_ioport = 0;
38
39
40/* value of the Power timer at last timer interrupt */
41static u32 offset_tick;
42static u32 offset_delay;
43
44static unsigned long long monotonic_base;
45static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
46
47#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
48
49static int pmtmr_need_workaround __read_mostly = 1;
50
51/*helper function to safely read acpi pm timesource*/
52static inline u32 read_pmtmr(void)
53{
54 if (pmtmr_need_workaround) {
55 u32 v1, v2, v3;
56
57 /* It has been reported that because of various broken
58 * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time
59 * source is not latched, so you must read it multiple
60 * times to insure a safe value is read.
61 */
62 do {
63 v1 = inl(pmtmr_ioport);
64 v2 = inl(pmtmr_ioport);
65 v3 = inl(pmtmr_ioport);
66 } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
67 || (v3 > v1 && v3 < v2));
68
69 /* mask the output to 24 bits */
70 return v2 & ACPI_PM_MASK;
71 }
72
73 return inl(pmtmr_ioport) & ACPI_PM_MASK;
74}
75
76
77/*
78 * Some boards have the PMTMR running way too fast. We check
79 * the PMTMR rate against PIT channel 2 to catch these cases.
80 */
81static int verify_pmtmr_rate(void)
82{
83 u32 value1, value2;
84 unsigned long count, delta;
85
86 mach_prepare_counter();
87 value1 = read_pmtmr();
88 mach_countup(&count);
89 value2 = read_pmtmr();
90 delta = (value2 - value1) & ACPI_PM_MASK;
91
92 /* Check that the PMTMR delta is within 5% of what we expect */
93 if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 ||
94 delta > (PMTMR_EXPECTED_RATE * 21) / 20) {
95 printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE);
96 return -1;
97 }
98
99 return 0;
100}
101
102
103static int init_pmtmr(char* override)
104{
105 u32 value1, value2;
106 unsigned int i;
107
108 if (override[0] && strncmp(override,"pmtmr",5))
109 return -ENODEV;
110
111 if (!pmtmr_ioport)
112 return -ENODEV;
113
114 /* we use the TSC for delay_pmtmr, so make sure it exists */
115 if (!cpu_has_tsc)
116 return -ENODEV;
117
118 /* "verify" this timing source */
119 value1 = read_pmtmr();
120 for (i = 0; i < 10000; i++) {
121 value2 = read_pmtmr();
122 if (value2 == value1)
123 continue;
124 if (value2 > value1)
125 goto pm_good;
126 if ((value2 < value1) && ((value2) < 0xFFF))
127 goto pm_good;
128 printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2);
129 return -EINVAL;
130 }
131 printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1);
132 return -ENODEV;
133
134pm_good:
135 if (verify_pmtmr_rate() != 0)
136 return -ENODEV;
137
138 init_cpu_khz();
139 return 0;
140}
141
142static inline u32 cyc2us(u32 cycles)
143{
144 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
145 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
146 *
147 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
148 * easily be multiplied with 286 (=0x11E) without having to fear
149 * u32 overflows.
150 */
151 cycles *= 286;
152 return (cycles >> 10);
153}
154
155/*
156 * this gets called during each timer interrupt
157 * - Called while holding the writer xtime_lock
158 */
159static void mark_offset_pmtmr(void)
160{
161 u32 lost, delta, last_offset;
162 static int first_run = 1;
163 last_offset = offset_tick;
164
165 write_seqlock(&monotonic_lock);
166
167 offset_tick = read_pmtmr();
168
169 /* calculate tick interval */
170 delta = (offset_tick - last_offset) & ACPI_PM_MASK;
171
172 /* convert to usecs */
173 delta = cyc2us(delta);
174
175 /* update the monotonic base value */
176 monotonic_base += delta * NSEC_PER_USEC;
177 write_sequnlock(&monotonic_lock);
178
179 /* convert to ticks */
180 delta += offset_delay;
181 lost = delta / (USEC_PER_SEC / HZ);
182 offset_delay = delta % (USEC_PER_SEC / HZ);
183
184
185 /* compensate for lost ticks */
186 if (lost >= 2)
187 jiffies_64 += lost - 1;
188
189 /* don't calculate delay for first run,
190 or if we've got less then a tick */
191 if (first_run || (lost < 1)) {
192 first_run = 0;
193 offset_delay = 0;
194 }
195}
196
197static int pmtmr_resume(void)
198{
199 write_seqlock(&monotonic_lock);
200 /* Assume this is the last mark offset time */
201 offset_tick = read_pmtmr();
202 write_sequnlock(&monotonic_lock);
203 return 0;
204}
205
206static unsigned long long monotonic_clock_pmtmr(void)
207{
208 u32 last_offset, this_offset;
209 unsigned long long base, ret;
210 unsigned seq;
211
212
213 /* atomically read monotonic base & last_offset */
214 do {
215 seq = read_seqbegin(&monotonic_lock);
216 last_offset = offset_tick;
217 base = monotonic_base;
218 } while (read_seqretry(&monotonic_lock, seq));
219
220 /* Read the pmtmr */
221 this_offset = read_pmtmr();
222
223 /* convert to nanoseconds */
224 ret = (this_offset - last_offset) & ACPI_PM_MASK;
225 ret = base + (cyc2us(ret) * NSEC_PER_USEC);
226 return ret;
227}
228
229static void delay_pmtmr(unsigned long loops)
230{
231 unsigned long bclock, now;
232
233 rdtscl(bclock);
234 do
235 {
236 rep_nop();
237 rdtscl(now);
238 } while ((now-bclock) < loops);
239}
240
241
242/*
243 * get the offset (in microseconds) from the last call to mark_offset()
244 * - Called holding a reader xtime_lock
245 */
246static unsigned long get_offset_pmtmr(void)
247{
248 u32 now, offset, delta = 0;
249
250 offset = offset_tick;
251 now = read_pmtmr();
252 delta = (now - offset)&ACPI_PM_MASK;
253
254 return (unsigned long) offset_delay + cyc2us(delta);
255}
256
257
258/* acpi timer_opts struct */
259static struct timer_opts timer_pmtmr = {
260 .name = "pmtmr",
261 .mark_offset = mark_offset_pmtmr,
262 .get_offset = get_offset_pmtmr,
263 .monotonic_clock = monotonic_clock_pmtmr,
264 .delay = delay_pmtmr,
265 .read_timer = read_timer_tsc,
266 .resume = pmtmr_resume,
267};
268
269struct init_timer_opts __initdata timer_pmtmr_init = {
270 .init = init_pmtmr,
271 .opts = &timer_pmtmr,
272};
273
274#ifdef CONFIG_PCI
275/*
276 * PIIX4 Errata:
277 *
278 * The power management timer may return improper results when read.
279 * Although the timer value settles properly after incrementing,
280 * while incrementing there is a 3 ns window every 69.8 ns where the
281 * timer value is indeterminate (a 4.2% chance that the data will be
282 * incorrect when read). As a result, the ACPI free running count up
283 * timer specification is violated due to erroneous reads.
284 */
285static int __init pmtmr_bug_check(void)
286{
287 static struct pci_device_id gray_list[] __initdata = {
288 /* these chipsets may have bug. */
289 { PCI_DEVICE(PCI_VENDOR_ID_INTEL,
290 PCI_DEVICE_ID_INTEL_82801DB_0) },
291 { },
292 };
293 struct pci_dev *dev;
294 int pmtmr_has_bug = 0;
295 u8 rev;
296
297 if (cur_timer != &timer_pmtmr || !pmtmr_need_workaround)
298 return 0;
299
300 dev = pci_get_device(PCI_VENDOR_ID_INTEL,
301 PCI_DEVICE_ID_INTEL_82371AB_3, NULL);
302 if (dev) {
303 pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
304 /* the bug has been fixed in PIIX4M */
305 if (rev < 3) {
306 printk(KERN_WARNING "* Found PM-Timer Bug on this "
307 "chipset. Due to workarounds for a bug,\n"
308 "* this time source is slow. Consider trying "
309 "other time sources (clock=)\n");
310 pmtmr_has_bug = 1;
311 }
312 pci_dev_put(dev);
313 }
314
315 if (pci_dev_present(gray_list)) {
316 printk(KERN_WARNING "* This chipset may have PM-Timer Bug. Due"
317 " to workarounds for a bug,\n"
318 "* this time source is slow. If you are sure your timer"
319 " does not have\n"
320 "* this bug, please use \"pmtmr_good\" to disable the "
321 "workaround\n");
322 pmtmr_has_bug = 1;
323 }
324
325 if (!pmtmr_has_bug)
326 pmtmr_need_workaround = 0;
327
328 return 0;
329}
330device_initcall(pmtmr_bug_check);
331#endif
332
333static int __init pmtr_good_setup(char *__str)
334{
335 pmtmr_need_workaround = 0;
336 return 1;
337}
338__setup("pmtmr_good", pmtr_good_setup);
339
340MODULE_LICENSE("GPL");
341MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
342MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86");
diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
deleted file mode 100644
index f1187ddb0d0f..000000000000
--- a/arch/i386/kernel/timers/timer_tsc.c
+++ /dev/null
@@ -1,617 +0,0 @@
1/*
2 * This code largely moved from arch/i386/kernel/time.c.
3 * See comments there for proper credits.
4 *
5 * 2004-06-25 Jesper Juhl
6 * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4
7 * failing to inline.
8 */
9
10#include <linux/spinlock.h>
11#include <linux/init.h>
12#include <linux/timex.h>
13#include <linux/errno.h>
14#include <linux/cpufreq.h>
15#include <linux/string.h>
16#include <linux/jiffies.h>
17
18#include <asm/timer.h>
19#include <asm/io.h>
20/* processor.h for distable_tsc flag */
21#include <asm/processor.h>
22
23#include "io_ports.h"
24#include "mach_timer.h"
25
26#include <asm/hpet.h>
27#include <asm/i8253.h>
28
29#ifdef CONFIG_HPET_TIMER
30static unsigned long hpet_usec_quotient;
31static unsigned long hpet_last;
32static struct timer_opts timer_tsc;
33#endif
34
35static inline void cpufreq_delayed_get(void);
36
37int tsc_disable __devinitdata = 0;
38
39static int use_tsc;
40/* Number of usecs that the last interrupt was delayed */
41static int delay_at_last_interrupt;
42
43static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
44static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
45static unsigned long long monotonic_base;
46static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
47
48/* Avoid compensating for lost ticks before TSCs are synched */
49static int detect_lost_ticks;
50static int __init start_lost_tick_compensation(void)
51{
52 detect_lost_ticks = 1;
53 return 0;
54}
55late_initcall(start_lost_tick_compensation);
56
57/* convert from cycles(64bits) => nanoseconds (64bits)
58 * basic equation:
59 * ns = cycles / (freq / ns_per_sec)
60 * ns = cycles * (ns_per_sec / freq)
61 * ns = cycles * (10^9 / (cpu_khz * 10^3))
62 * ns = cycles * (10^6 / cpu_khz)
63 *
64 * Then we use scaling math (suggested by george@mvista.com) to get:
65 * ns = cycles * (10^6 * SC / cpu_khz) / SC
66 * ns = cycles * cyc2ns_scale / SC
67 *
68 * And since SC is a constant power of two, we can convert the div
69 * into a shift.
70 *
71 * We can use khz divisor instead of mhz to keep a better percision, since
72 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
73 * (mathieu.desnoyers@polymtl.ca)
74 *
75 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
76 */
77static unsigned long cyc2ns_scale __read_mostly;
78#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
79
80static inline void set_cyc2ns_scale(unsigned long cpu_khz)
81{
82 cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
83}
84
85static inline unsigned long long cycles_2_ns(unsigned long long cyc)
86{
87 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
88}
89
90static int count2; /* counter for mark_offset_tsc() */
91
92/* Cached *multiplier* to convert TSC counts to microseconds.
93 * (see the equation below).
94 * Equal to 2^32 * (1 / (clocks per usec) ).
95 * Initialized in time_init.
96 */
97static unsigned long fast_gettimeoffset_quotient;
98
99static unsigned long get_offset_tsc(void)
100{
101 register unsigned long eax, edx;
102
103 /* Read the Time Stamp Counter */
104
105 rdtsc(eax,edx);
106
107 /* .. relative to previous jiffy (32 bits is enough) */
108 eax -= last_tsc_low; /* tsc_low delta */
109
110 /*
111 * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
112 * = (tsc_low delta) * (usecs_per_clock)
113 * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
114 *
115 * Using a mull instead of a divl saves up to 31 clock cycles
116 * in the critical path.
117 */
118
119 __asm__("mull %2"
120 :"=a" (eax), "=d" (edx)
121 :"rm" (fast_gettimeoffset_quotient),
122 "0" (eax));
123
124 /* our adjusted time offset in microseconds */
125 return delay_at_last_interrupt + edx;
126}
127
128static unsigned long long monotonic_clock_tsc(void)
129{
130 unsigned long long last_offset, this_offset, base;
131 unsigned seq;
132
133 /* atomically read monotonic base & last_offset */
134 do {
135 seq = read_seqbegin(&monotonic_lock);
136 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
137 base = monotonic_base;
138 } while (read_seqretry(&monotonic_lock, seq));
139
140 /* Read the Time Stamp Counter */
141 rdtscll(this_offset);
142
143 /* return the value in ns */
144 return base + cycles_2_ns(this_offset - last_offset);
145}
146
147/*
148 * Scheduler clock - returns current time in nanosec units.
149 */
150unsigned long long sched_clock(void)
151{
152 unsigned long long this_offset;
153
154 /*
155 * In the NUMA case we dont use the TSC as they are not
156 * synchronized across all CPUs.
157 */
158#ifndef CONFIG_NUMA
159 if (!use_tsc)
160#endif
161 /* no locking but a rare wrong value is not a big deal */
162 return jiffies_64 * (1000000000 / HZ);
163
164 /* Read the Time Stamp Counter */
165 rdtscll(this_offset);
166
167 /* return the value in ns */
168 return cycles_2_ns(this_offset);
169}
170
171static void delay_tsc(unsigned long loops)
172{
173 unsigned long bclock, now;
174
175 rdtscl(bclock);
176 do
177 {
178 rep_nop();
179 rdtscl(now);
180 } while ((now-bclock) < loops);
181}
182
183#ifdef CONFIG_HPET_TIMER
184static void mark_offset_tsc_hpet(void)
185{
186 unsigned long long this_offset, last_offset;
187 unsigned long offset, temp, hpet_current;
188
189 write_seqlock(&monotonic_lock);
190 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
191 /*
192 * It is important that these two operations happen almost at
193 * the same time. We do the RDTSC stuff first, since it's
194 * faster. To avoid any inconsistencies, we need interrupts
195 * disabled locally.
196 */
197 /*
198 * Interrupts are just disabled locally since the timer irq
199 * has the SA_INTERRUPT flag set. -arca
200 */
201 /* read Pentium cycle counter */
202
203 hpet_current = hpet_readl(HPET_COUNTER);
204 rdtsc(last_tsc_low, last_tsc_high);
205
206 /* lost tick compensation */
207 offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
208 if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))
209 && detect_lost_ticks) {
210 int lost_ticks = (offset - hpet_last) / hpet_tick;
211 jiffies_64 += lost_ticks;
212 }
213 hpet_last = hpet_current;
214
215 /* update the monotonic base value */
216 this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
217 monotonic_base += cycles_2_ns(this_offset - last_offset);
218 write_sequnlock(&monotonic_lock);
219
220 /* calculate delay_at_last_interrupt */
221 /*
222 * Time offset = (hpet delta) * ( usecs per HPET clock )
223 * = (hpet delta) * ( usecs per tick / HPET clocks per tick)
224 * = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
225 * Where,
226 * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
227 */
228 delay_at_last_interrupt = hpet_current - offset;
229 ASM_MUL64_REG(temp, delay_at_last_interrupt,
230 hpet_usec_quotient, delay_at_last_interrupt);
231}
232#endif
233
234
235#ifdef CONFIG_CPU_FREQ
236#include <linux/workqueue.h>
237
238static unsigned int cpufreq_delayed_issched = 0;
239static unsigned int cpufreq_init = 0;
240static struct work_struct cpufreq_delayed_get_work;
241
242static void handle_cpufreq_delayed_get(void *v)
243{
244 unsigned int cpu;
245 for_each_online_cpu(cpu) {
246 cpufreq_get(cpu);
247 }
248 cpufreq_delayed_issched = 0;
249}
250
251/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries
252 * to verify the CPU frequency the timing core thinks the CPU is running
253 * at is still correct.
254 */
255static inline void cpufreq_delayed_get(void)
256{
257 if (cpufreq_init && !cpufreq_delayed_issched) {
258 cpufreq_delayed_issched = 1;
259 printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
260 schedule_work(&cpufreq_delayed_get_work);
261 }
262}
263
264/* If the CPU frequency is scaled, TSC-based delays will need a different
265 * loops_per_jiffy value to function properly.
266 */
267
268static unsigned int ref_freq = 0;
269static unsigned long loops_per_jiffy_ref = 0;
270
271#ifndef CONFIG_SMP
272static unsigned long fast_gettimeoffset_ref = 0;
273static unsigned int cpu_khz_ref = 0;
274#endif
275
276static int
277time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
278 void *data)
279{
280 struct cpufreq_freqs *freq = data;
281
282 if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
283 write_seqlock_irq(&xtime_lock);
284 if (!ref_freq) {
285 if (!freq->old){
286 ref_freq = freq->new;
287 goto end;
288 }
289 ref_freq = freq->old;
290 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
291#ifndef CONFIG_SMP
292 fast_gettimeoffset_ref = fast_gettimeoffset_quotient;
293 cpu_khz_ref = cpu_khz;
294#endif
295 }
296
297 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
298 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
299 (val == CPUFREQ_RESUMECHANGE)) {
300 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
301 cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
302#ifndef CONFIG_SMP
303 if (cpu_khz)
304 cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
305 if (use_tsc) {
306 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
307 fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq);
308 set_cyc2ns_scale(cpu_khz);
309 }
310 }
311#endif
312 }
313
314end:
315 if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
316 write_sequnlock_irq(&xtime_lock);
317
318 return 0;
319}
320
321static struct notifier_block time_cpufreq_notifier_block = {
322 .notifier_call = time_cpufreq_notifier
323};
324
325
326static int __init cpufreq_tsc(void)
327{
328 int ret;
329 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
330 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
331 CPUFREQ_TRANSITION_NOTIFIER);
332 if (!ret)
333 cpufreq_init = 1;
334 return ret;
335}
336core_initcall(cpufreq_tsc);
337
338#else /* CONFIG_CPU_FREQ */
339static inline void cpufreq_delayed_get(void) { return; }
340#endif
341
342int recalibrate_cpu_khz(void)
343{
344#ifndef CONFIG_SMP
345 unsigned int cpu_khz_old = cpu_khz;
346
347 if (cpu_has_tsc) {
348 local_irq_disable();
349 init_cpu_khz();
350 local_irq_enable();
351 cpu_data[0].loops_per_jiffy =
352 cpufreq_scale(cpu_data[0].loops_per_jiffy,
353 cpu_khz_old,
354 cpu_khz);
355 return 0;
356 } else
357 return -ENODEV;
358#else
359 return -ENODEV;
360#endif
361}
362EXPORT_SYMBOL(recalibrate_cpu_khz);
363
364static void mark_offset_tsc(void)
365{
366 unsigned long lost,delay;
367 unsigned long delta = last_tsc_low;
368 int count;
369 int countmp;
370 static int count1 = 0;
371 unsigned long long this_offset, last_offset;
372 static int lost_count = 0;
373
374 write_seqlock(&monotonic_lock);
375 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
376 /*
377 * It is important that these two operations happen almost at
378 * the same time. We do the RDTSC stuff first, since it's
379 * faster. To avoid any inconsistencies, we need interrupts
380 * disabled locally.
381 */
382
383 /*
384 * Interrupts are just disabled locally since the timer irq
385 * has the SA_INTERRUPT flag set. -arca
386 */
387
388 /* read Pentium cycle counter */
389
390 rdtsc(last_tsc_low, last_tsc_high);
391
392 spin_lock(&i8253_lock);
393 outb_p(0x00, PIT_MODE); /* latch the count ASAP */
394
395 count = inb_p(PIT_CH0); /* read the latched count */
396 count |= inb(PIT_CH0) << 8;
397
398 /*
399 * VIA686a test code... reset the latch if count > max + 1
400 * from timer_pit.c - cjb
401 */
402 if (count > LATCH) {
403 outb_p(0x34, PIT_MODE);
404 outb_p(LATCH & 0xff, PIT_CH0);
405 outb(LATCH >> 8, PIT_CH0);
406 count = LATCH - 1;
407 }
408
409 spin_unlock(&i8253_lock);
410
411 if (pit_latch_buggy) {
412 /* get center value of last 3 time lutch */
413 if ((count2 >= count && count >= count1)
414 || (count1 >= count && count >= count2)) {
415 count2 = count1; count1 = count;
416 } else if ((count1 >= count2 && count2 >= count)
417 || (count >= count2 && count2 >= count1)) {
418 countmp = count;count = count2;
419 count2 = count1;count1 = countmp;
420 } else {
421 count2 = count1; count1 = count; count = count1;
422 }
423 }
424
425 /* lost tick compensation */
426 delta = last_tsc_low - delta;
427 {
428 register unsigned long eax, edx;
429 eax = delta;
430 __asm__("mull %2"
431 :"=a" (eax), "=d" (edx)
432 :"rm" (fast_gettimeoffset_quotient),
433 "0" (eax));
434 delta = edx;
435 }
436 delta += delay_at_last_interrupt;
437 lost = delta/(1000000/HZ);
438 delay = delta%(1000000/HZ);
439 if (lost >= 2 && detect_lost_ticks) {
440 jiffies_64 += lost-1;
441
442 /* sanity check to ensure we're not always losing ticks */
443 if (lost_count++ > 100) {
444 printk(KERN_WARNING "Losing too many ticks!\n");
445 printk(KERN_WARNING "TSC cannot be used as a timesource. \n");
446 printk(KERN_WARNING "Possible reasons for this are:\n");
447 printk(KERN_WARNING " You're running with Speedstep,\n");
448 printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n");
449 printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n");
450 printk(KERN_WARNING "Falling back to a sane timesource now.\n");
451
452 clock_fallback();
453 }
454 /* ... but give the TSC a fair chance */
455 if (lost_count > 25)
456 cpufreq_delayed_get();
457 } else
458 lost_count = 0;
459 /* update the monotonic base value */
460 this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
461 monotonic_base += cycles_2_ns(this_offset - last_offset);
462 write_sequnlock(&monotonic_lock);
463
464 /* calculate delay_at_last_interrupt */
465 count = ((LATCH-1) - count) * TICK_SIZE;
466 delay_at_last_interrupt = (count + LATCH/2) / LATCH;
467
468 /* catch corner case where tick rollover occured
469 * between tsc and pit reads (as noted when
470 * usec delta is > 90% # of usecs/tick)
471 */
472 if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
473 jiffies_64++;
474}
475
476static int __init init_tsc(char* override)
477{
478
479 /* check clock override */
480 if (override[0] && strncmp(override,"tsc",3)) {
481#ifdef CONFIG_HPET_TIMER
482 if (is_hpet_enabled()) {
483 printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n");
484 } else
485#endif
486 {
487 return -ENODEV;
488 }
489 }
490
491 /*
492 * If we have APM enabled or the CPU clock speed is variable
493 * (CPU stops clock on HLT or slows clock to save power)
494 * then the TSC timestamps may diverge by up to 1 jiffy from
495 * 'real time' but nothing will break.
496 * The most frequent case is that the CPU is "woken" from a halt
497 * state by the timer interrupt itself, so we get 0 error. In the
498 * rare cases where a driver would "wake" the CPU and request a
499 * timestamp, the maximum error is < 1 jiffy. But timestamps are
500 * still perfectly ordered.
501 * Note that the TSC counter will be reset if APM suspends
502 * to disk; this won't break the kernel, though, 'cuz we're
503 * smart. See arch/i386/kernel/apm.c.
504 */
505 /*
506 * Firstly we have to do a CPU check for chips with
507 * a potentially buggy TSC. At this point we haven't run
508 * the ident/bugs checks so we must run this hook as it
509 * may turn off the TSC flag.
510 *
511 * NOTE: this doesn't yet handle SMP 486 machines where only
512 * some CPU's have a TSC. Thats never worked and nobody has
513 * moaned if you have the only one in the world - you fix it!
514 */
515
516 count2 = LATCH; /* initialize counter for mark_offset_tsc() */
517
518 if (cpu_has_tsc) {
519 unsigned long tsc_quotient;
520#ifdef CONFIG_HPET_TIMER
521 if (is_hpet_enabled() && hpet_use_timer) {
522 unsigned long result, remain;
523 printk("Using TSC for gettimeofday\n");
524 tsc_quotient = calibrate_tsc_hpet(NULL);
525 timer_tsc.mark_offset = &mark_offset_tsc_hpet;
526 /*
527 * Math to calculate hpet to usec multiplier
528 * Look for the comments at get_offset_tsc_hpet()
529 */
530 ASM_DIV64_REG(result, remain, hpet_tick,
531 0, KERNEL_TICK_USEC);
532 if (remain > (hpet_tick >> 1))
533 result++; /* rounding the result */
534
535 hpet_usec_quotient = result;
536 } else
537#endif
538 {
539 tsc_quotient = calibrate_tsc();
540 }
541
542 if (tsc_quotient) {
543 fast_gettimeoffset_quotient = tsc_quotient;
544 use_tsc = 1;
545 /*
546 * We could be more selective here I suspect
547 * and just enable this for the next intel chips ?
548 */
549 /* report CPU clock rate in Hz.
550 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
551 * clock/second. Our precision is about 100 ppm.
552 */
553 { unsigned long eax=0, edx=1000;
554 __asm__("divl %2"
555 :"=a" (cpu_khz), "=d" (edx)
556 :"r" (tsc_quotient),
557 "0" (eax), "1" (edx));
558 printk("Detected %u.%03u MHz processor.\n",
559 cpu_khz / 1000, cpu_khz % 1000);
560 }
561 set_cyc2ns_scale(cpu_khz);
562 return 0;
563 }
564 }
565 return -ENODEV;
566}
567
568static int tsc_resume(void)
569{
570 write_seqlock(&monotonic_lock);
571 /* Assume this is the last mark offset time */
572 rdtsc(last_tsc_low, last_tsc_high);
573#ifdef CONFIG_HPET_TIMER
574 if (is_hpet_enabled() && hpet_use_timer)
575 hpet_last = hpet_readl(HPET_COUNTER);
576#endif
577 write_sequnlock(&monotonic_lock);
578 return 0;
579}
580
581#ifndef CONFIG_X86_TSC
582/* disable flag for tsc. Takes effect by clearing the TSC cpu flag
583 * in cpu/common.c */
584static int __init tsc_setup(char *str)
585{
586 tsc_disable = 1;
587 return 1;
588}
589#else
590static int __init tsc_setup(char *str)
591{
592 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
593 "cannot disable TSC.\n");
594 return 1;
595}
596#endif
597__setup("notsc", tsc_setup);
598
599
600
601/************************************************************/
602
603/* tsc timer_opts struct */
604static struct timer_opts timer_tsc = {
605 .name = "tsc",
606 .mark_offset = mark_offset_tsc,
607 .get_offset = get_offset_tsc,
608 .monotonic_clock = monotonic_clock_tsc,
609 .delay = delay_tsc,
610 .read_timer = read_timer_tsc,
611 .resume = tsc_resume,
612};
613
614struct init_timer_opts __initdata timer_tsc_init = {
615 .init = init_tsc,
616 .opts = &timer_tsc,
617};
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 296355292c7c..e2e281d4bcc8 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -32,15 +32,8 @@
32 32
33static struct i386_cpu cpu_devices[NR_CPUS]; 33static struct i386_cpu cpu_devices[NR_CPUS];
34 34
35int arch_register_cpu(int num){ 35int arch_register_cpu(int num)
36 struct node *parent = NULL; 36{
37
38#ifdef CONFIG_NUMA
39 int node = cpu_to_node(num);
40 if (node_online(node))
41 parent = &node_devices[node].node;
42#endif /* CONFIG_NUMA */
43
44 /* 37 /*
45 * CPU0 cannot be offlined due to several 38 * CPU0 cannot be offlined due to several
46 * restrictions and assumptions in kernel. This basically 39 * restrictions and assumptions in kernel. This basically
@@ -50,21 +43,13 @@ int arch_register_cpu(int num){
50 if (!num) 43 if (!num)
51 cpu_devices[num].cpu.no_control = 1; 44 cpu_devices[num].cpu.no_control = 1;
52 45
53 return register_cpu(&cpu_devices[num].cpu, num, parent); 46 return register_cpu(&cpu_devices[num].cpu, num);
54} 47}
55 48
56#ifdef CONFIG_HOTPLUG_CPU 49#ifdef CONFIG_HOTPLUG_CPU
57 50
58void arch_unregister_cpu(int num) { 51void arch_unregister_cpu(int num) {
59 struct node *parent = NULL; 52 return unregister_cpu(&cpu_devices[num].cpu);
60
61#ifdef CONFIG_NUMA
62 int node = cpu_to_node(num);
63 if (node_online(node))
64 parent = &node_devices[node].node;
65#endif /* CONFIG_NUMA */
66
67 return unregister_cpu(&cpu_devices[num].cpu, parent);
68} 53}
69EXPORT_SYMBOL(arch_register_cpu); 54EXPORT_SYMBOL(arch_register_cpu);
70EXPORT_SYMBOL(arch_unregister_cpu); 55EXPORT_SYMBOL(arch_unregister_cpu);
@@ -74,16 +59,13 @@ EXPORT_SYMBOL(arch_unregister_cpu);
74 59
75#ifdef CONFIG_NUMA 60#ifdef CONFIG_NUMA
76#include <linux/mmzone.h> 61#include <linux/mmzone.h>
77#include <asm/node.h>
78
79struct i386_node node_devices[MAX_NUMNODES];
80 62
81static int __init topology_init(void) 63static int __init topology_init(void)
82{ 64{
83 int i; 65 int i;
84 66
85 for_each_online_node(i) 67 for_each_online_node(i)
86 arch_register_node(i); 68 register_one_node(i);
87 69
88 for_each_present_cpu(i) 70 for_each_present_cpu(i)
89 arch_register_cpu(i); 71 arch_register_cpu(i);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index dcc14477af1f..78464097470a 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -28,6 +28,7 @@
28#include <linux/utsname.h> 28#include <linux/utsname.h>
29#include <linux/kprobes.h> 29#include <linux/kprobes.h>
30#include <linux/kexec.h> 30#include <linux/kexec.h>
31#include <linux/unwind.h>
31 32
32#ifdef CONFIG_EISA 33#ifdef CONFIG_EISA
33#include <linux/ioport.h> 34#include <linux/ioport.h>
@@ -47,7 +48,7 @@
47#include <asm/desc.h> 48#include <asm/desc.h>
48#include <asm/i387.h> 49#include <asm/i387.h>
49#include <asm/nmi.h> 50#include <asm/nmi.h>
50 51#include <asm/unwind.h>
51#include <asm/smp.h> 52#include <asm/smp.h>
52#include <asm/arch_hooks.h> 53#include <asm/arch_hooks.h>
53#include <asm/kdebug.h> 54#include <asm/kdebug.h>
@@ -92,6 +93,7 @@ asmlinkage void spurious_interrupt_bug(void);
92asmlinkage void machine_check(void); 93asmlinkage void machine_check(void);
93 94
94static int kstack_depth_to_print = 24; 95static int kstack_depth_to_print = 24;
96static int call_trace = 1;
95ATOMIC_NOTIFIER_HEAD(i386die_chain); 97ATOMIC_NOTIFIER_HEAD(i386die_chain);
96 98
97int register_die_notifier(struct notifier_block *nb) 99int register_die_notifier(struct notifier_block *nb)
@@ -170,7 +172,23 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
170 return ebp; 172 return ebp;
171} 173}
172 174
173static void show_trace_log_lvl(struct task_struct *task, 175static asmlinkage int show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
176{
177 int n = 0;
178 int printed = 0; /* nr of entries already printed on current line */
179
180 while (unwind(info) == 0 && UNW_PC(info)) {
181 ++n;
182 printed = print_addr_and_symbol(UNW_PC(info), log_lvl, printed);
183 if (arch_unw_user_mode(info))
184 break;
185 }
186 if (printed)
187 printk("\n");
188 return n;
189}
190
191static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *stack, char *log_lvl) 192 unsigned long *stack, char *log_lvl)
175{ 193{
176 unsigned long ebp; 194 unsigned long ebp;
@@ -178,6 +196,26 @@ static void show_trace_log_lvl(struct task_struct *task,
178 if (!task) 196 if (!task)
179 task = current; 197 task = current;
180 198
199 if (call_trace >= 0) {
200 int unw_ret = 0;
201 struct unwind_frame_info info;
202
203 if (regs) {
204 if (unwind_init_frame_info(&info, task, regs) == 0)
205 unw_ret = show_trace_unwind(&info, log_lvl);
206 } else if (task == current)
207 unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
208 else {
209 if (unwind_init_blocked(&info, task) == 0)
210 unw_ret = show_trace_unwind(&info, log_lvl);
211 }
212 if (unw_ret > 0) {
213 if (call_trace > 0)
214 return;
215 printk("%sLegacy call trace:\n", log_lvl);
216 }
217 }
218
181 if (task == current) { 219 if (task == current) {
182 /* Grab ebp right from our regs */ 220 /* Grab ebp right from our regs */
183 asm ("movl %%ebp, %0" : "=r" (ebp) : ); 221 asm ("movl %%ebp, %0" : "=r" (ebp) : );
@@ -198,13 +236,13 @@ static void show_trace_log_lvl(struct task_struct *task,
198 } 236 }
199} 237}
200 238
201void show_trace(struct task_struct *task, unsigned long * stack) 239void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
202{ 240{
203 show_trace_log_lvl(task, stack, ""); 241 show_trace_log_lvl(task, regs, stack, "");
204} 242}
205 243
206static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp, 244static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
207 char *log_lvl) 245 unsigned long *esp, char *log_lvl)
208{ 246{
209 unsigned long *stack; 247 unsigned long *stack;
210 int i; 248 int i;
@@ -225,13 +263,13 @@ static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
225 printk("%08lx ", *stack++); 263 printk("%08lx ", *stack++);
226 } 264 }
227 printk("\n%sCall Trace:\n", log_lvl); 265 printk("\n%sCall Trace:\n", log_lvl);
228 show_trace_log_lvl(task, esp, log_lvl); 266 show_trace_log_lvl(task, regs, esp, log_lvl);
229} 267}
230 268
231void show_stack(struct task_struct *task, unsigned long *esp) 269void show_stack(struct task_struct *task, unsigned long *esp)
232{ 270{
233 printk(" "); 271 printk(" ");
234 show_stack_log_lvl(task, esp, ""); 272 show_stack_log_lvl(task, NULL, esp, "");
235} 273}
236 274
237/* 275/*
@@ -241,7 +279,7 @@ void dump_stack(void)
241{ 279{
242 unsigned long stack; 280 unsigned long stack;
243 281
244 show_trace(current, &stack); 282 show_trace(current, NULL, &stack);
245} 283}
246 284
247EXPORT_SYMBOL(dump_stack); 285EXPORT_SYMBOL(dump_stack);
@@ -285,7 +323,7 @@ void show_registers(struct pt_regs *regs)
285 u8 __user *eip; 323 u8 __user *eip;
286 324
287 printk("\n" KERN_EMERG "Stack: "); 325 printk("\n" KERN_EMERG "Stack: ");
288 show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG); 326 show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
289 327
290 printk(KERN_EMERG "Code: "); 328 printk(KERN_EMERG "Code: ");
291 329
@@ -1215,3 +1253,15 @@ static int __init kstack_setup(char *s)
1215 return 1; 1253 return 1;
1216} 1254}
1217__setup("kstack=", kstack_setup); 1255__setup("kstack=", kstack_setup);
1256
1257static int __init call_trace_setup(char *s)
1258{
1259 if (strcmp(s, "old") == 0)
1260 call_trace = -1;
1261 else if (strcmp(s, "both") == 0)
1262 call_trace = 0;
1263 else if (strcmp(s, "new") == 0)
1264 call_trace = 1;
1265 return 1;
1266}
1267__setup("call_trace=", call_trace_setup);
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
new file mode 100644
index 000000000000..7e0d8dab2075
--- /dev/null
+++ b/arch/i386/kernel/tsc.c
@@ -0,0 +1,478 @@
1/*
2 * This code largely moved from arch/i386/kernel/timer/timer_tsc.c
3 * which was originally moved from arch/i386/kernel/time.c.
4 * See comments there for proper credits.
5 */
6
7#include <linux/clocksource.h>
8#include <linux/workqueue.h>
9#include <linux/cpufreq.h>
10#include <linux/jiffies.h>
11#include <linux/init.h>
12#include <linux/dmi.h>
13
14#include <asm/delay.h>
15#include <asm/tsc.h>
16#include <asm/delay.h>
17#include <asm/io.h>
18
19#include "mach_timer.h"
20
21/*
22 * On some systems the TSC frequency does not
23 * change with the cpu frequency. So we need
24 * an extra value to store the TSC freq
25 */
26unsigned int tsc_khz;
27
28int tsc_disable __cpuinitdata = 0;
29
30#ifdef CONFIG_X86_TSC
31static int __init tsc_setup(char *str)
32{
33 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
34 "cannot disable TSC.\n");
35 return 1;
36}
37#else
38/*
39 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
40 * in cpu/common.c
41 */
42static int __init tsc_setup(char *str)
43{
44 tsc_disable = 1;
45
46 return 1;
47}
48#endif
49
50__setup("notsc", tsc_setup);
51
52/*
53 * code to mark and check if the TSC is unstable
54 * due to cpufreq or due to unsynced TSCs
55 */
56static int tsc_unstable;
57
58static inline int check_tsc_unstable(void)
59{
60 return tsc_unstable;
61}
62
63void mark_tsc_unstable(void)
64{
65 tsc_unstable = 1;
66}
67EXPORT_SYMBOL_GPL(mark_tsc_unstable);
68
69/* Accellerators for sched_clock()
70 * convert from cycles(64bits) => nanoseconds (64bits)
71 * basic equation:
72 * ns = cycles / (freq / ns_per_sec)
73 * ns = cycles * (ns_per_sec / freq)
74 * ns = cycles * (10^9 / (cpu_khz * 10^3))
75 * ns = cycles * (10^6 / cpu_khz)
76 *
77 * Then we use scaling math (suggested by george@mvista.com) to get:
78 * ns = cycles * (10^6 * SC / cpu_khz) / SC
79 * ns = cycles * cyc2ns_scale / SC
80 *
81 * And since SC is a constant power of two, we can convert the div
82 * into a shift.
83 *
84 * We can use khz divisor instead of mhz to keep a better percision, since
85 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
86 * (mathieu.desnoyers@polymtl.ca)
87 *
88 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
89 */
90static unsigned long cyc2ns_scale __read_mostly;
91
92#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
93
94static inline void set_cyc2ns_scale(unsigned long cpu_khz)
95{
96 cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
97}
98
99static inline unsigned long long cycles_2_ns(unsigned long long cyc)
100{
101 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
102}
103
104/*
105 * Scheduler clock - returns current time in nanosec units.
106 */
107unsigned long long sched_clock(void)
108{
109 unsigned long long this_offset;
110
111 /*
112 * in the NUMA case we dont use the TSC as they are not
113 * synchronized across all CPUs.
114 */
115#ifndef CONFIG_NUMA
116 if (!cpu_khz || check_tsc_unstable())
117#endif
118 /* no locking but a rare wrong value is not a big deal */
119 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
120
121 /* read the Time Stamp Counter: */
122 rdtscll(this_offset);
123
124 /* return the value in ns */
125 return cycles_2_ns(this_offset);
126}
127
128static unsigned long calculate_cpu_khz(void)
129{
130 unsigned long long start, end;
131 unsigned long count;
132 u64 delta64;
133 int i;
134 unsigned long flags;
135
136 local_irq_save(flags);
137
138 /* run 3 times to ensure the cache is warm */
139 for (i = 0; i < 3; i++) {
140 mach_prepare_counter();
141 rdtscll(start);
142 mach_countup(&count);
143 rdtscll(end);
144 }
145 /*
146 * Error: ECTCNEVERSET
147 * The CTC wasn't reliable: we got a hit on the very first read,
148 * or the CPU was so fast/slow that the quotient wouldn't fit in
149 * 32 bits..
150 */
151 if (count <= 1)
152 goto err;
153
154 delta64 = end - start;
155
156 /* cpu freq too fast: */
157 if (delta64 > (1ULL<<32))
158 goto err;
159
160 /* cpu freq too slow: */
161 if (delta64 <= CALIBRATE_TIME_MSEC)
162 goto err;
163
164 delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
165 do_div(delta64,CALIBRATE_TIME_MSEC);
166
167 local_irq_restore(flags);
168 return (unsigned long)delta64;
169err:
170 local_irq_restore(flags);
171 return 0;
172}
173
174int recalibrate_cpu_khz(void)
175{
176#ifndef CONFIG_SMP
177 unsigned long cpu_khz_old = cpu_khz;
178
179 if (cpu_has_tsc) {
180 cpu_khz = calculate_cpu_khz();
181 tsc_khz = cpu_khz;
182 cpu_data[0].loops_per_jiffy =
183 cpufreq_scale(cpu_data[0].loops_per_jiffy,
184 cpu_khz_old, cpu_khz);
185 return 0;
186 } else
187 return -ENODEV;
188#else
189 return -ENODEV;
190#endif
191}
192
193EXPORT_SYMBOL(recalibrate_cpu_khz);
194
195void tsc_init(void)
196{
197 if (!cpu_has_tsc || tsc_disable)
198 return;
199
200 cpu_khz = calculate_cpu_khz();
201 tsc_khz = cpu_khz;
202
203 if (!cpu_khz)
204 return;
205
206 printk("Detected %lu.%03lu MHz processor.\n",
207 (unsigned long)cpu_khz / 1000,
208 (unsigned long)cpu_khz % 1000);
209
210 set_cyc2ns_scale(cpu_khz);
211 use_tsc_delay();
212}
213
214#ifdef CONFIG_CPU_FREQ
215
216static unsigned int cpufreq_delayed_issched = 0;
217static unsigned int cpufreq_init = 0;
218static struct work_struct cpufreq_delayed_get_work;
219
220static void handle_cpufreq_delayed_get(void *v)
221{
222 unsigned int cpu;
223
224 for_each_online_cpu(cpu)
225 cpufreq_get(cpu);
226
227 cpufreq_delayed_issched = 0;
228}
229
230/*
231 * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries
232 * to verify the CPU frequency the timing core thinks the CPU is running
233 * at is still correct.
234 */
235static inline void cpufreq_delayed_get(void)
236{
237 if (cpufreq_init && !cpufreq_delayed_issched) {
238 cpufreq_delayed_issched = 1;
239 printk(KERN_DEBUG "Checking if CPU frequency changed.\n");
240 schedule_work(&cpufreq_delayed_get_work);
241 }
242}
243
244/*
245 * if the CPU frequency is scaled, TSC-based delays will need a different
246 * loops_per_jiffy value to function properly.
247 */
248static unsigned int ref_freq = 0;
249static unsigned long loops_per_jiffy_ref = 0;
250static unsigned long cpu_khz_ref = 0;
251
252static int
253time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
254{
255 struct cpufreq_freqs *freq = data;
256
257 if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
258 write_seqlock_irq(&xtime_lock);
259
260 if (!ref_freq) {
261 if (!freq->old){
262 ref_freq = freq->new;
263 goto end;
264 }
265 ref_freq = freq->old;
266 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
267 cpu_khz_ref = cpu_khz;
268 }
269
270 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
271 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
272 (val == CPUFREQ_RESUMECHANGE)) {
273 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
274 cpu_data[freq->cpu].loops_per_jiffy =
275 cpufreq_scale(loops_per_jiffy_ref,
276 ref_freq, freq->new);
277
278 if (cpu_khz) {
279
280 if (num_online_cpus() == 1)
281 cpu_khz = cpufreq_scale(cpu_khz_ref,
282 ref_freq, freq->new);
283 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
284 tsc_khz = cpu_khz;
285 set_cyc2ns_scale(cpu_khz);
286 /*
287 * TSC based sched_clock turns
288 * to junk w/ cpufreq
289 */
290 mark_tsc_unstable();
291 }
292 }
293 }
294end:
295 if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
296 write_sequnlock_irq(&xtime_lock);
297
298 return 0;
299}
300
301static struct notifier_block time_cpufreq_notifier_block = {
302 .notifier_call = time_cpufreq_notifier
303};
304
305static int __init cpufreq_tsc(void)
306{
307 int ret;
308
309 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
310 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
311 CPUFREQ_TRANSITION_NOTIFIER);
312 if (!ret)
313 cpufreq_init = 1;
314
315 return ret;
316}
317
318core_initcall(cpufreq_tsc);
319
320#endif
321
322/* clock source code */
323
324static unsigned long current_tsc_khz = 0;
325static int tsc_update_callback(void);
326
327static cycle_t read_tsc(void)
328{
329 cycle_t ret;
330
331 rdtscll(ret);
332
333 return ret;
334}
335
336static struct clocksource clocksource_tsc = {
337 .name = "tsc",
338 .rating = 300,
339 .read = read_tsc,
340 .mask = CLOCKSOURCE_MASK(64),
341 .mult = 0, /* to be set */
342 .shift = 22,
343 .update_callback = tsc_update_callback,
344 .is_continuous = 1,
345};
346
347static int tsc_update_callback(void)
348{
349 int change = 0;
350
351 /* check to see if we should switch to the safe clocksource: */
352 if (clocksource_tsc.rating != 50 && check_tsc_unstable()) {
353 clocksource_tsc.rating = 50;
354 clocksource_reselect();
355 change = 1;
356 }
357
358 /* only update if tsc_khz has changed: */
359 if (current_tsc_khz != tsc_khz) {
360 current_tsc_khz = tsc_khz;
361 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
362 clocksource_tsc.shift);
363 change = 1;
364 }
365
366 return change;
367}
368
369static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
370{
371 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
372 d->ident);
373 mark_tsc_unstable();
374 return 0;
375}
376
377/* List of systems that have known TSC problems */
378static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
379 {
380 .callback = dmi_mark_tsc_unstable,
381 .ident = "IBM Thinkpad 380XD",
382 .matches = {
383 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
384 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
385 },
386 },
387 {}
388};
389
390#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */
391static struct timer_list verify_tsc_freq_timer;
392
393/* XXX - Probably should add locking */
394static void verify_tsc_freq(unsigned long unused)
395{
396 static u64 last_tsc;
397 static unsigned long last_jiffies;
398
399 u64 now_tsc, interval_tsc;
400 unsigned long now_jiffies, interval_jiffies;
401
402
403 if (check_tsc_unstable())
404 return;
405
406 rdtscll(now_tsc);
407 now_jiffies = jiffies;
408
409 if (!last_jiffies) {
410 goto out;
411 }
412
413 interval_jiffies = now_jiffies - last_jiffies;
414 interval_tsc = now_tsc - last_tsc;
415 interval_tsc *= HZ;
416 do_div(interval_tsc, cpu_khz*1000);
417
418 if (interval_tsc < (interval_jiffies * 3 / 4)) {
419 printk("TSC appears to be running slowly. "
420 "Marking it as unstable\n");
421 mark_tsc_unstable();
422 return;
423 }
424
425out:
426 last_tsc = now_tsc;
427 last_jiffies = now_jiffies;
428 /* set us up to go off on the next interval: */
429 mod_timer(&verify_tsc_freq_timer,
430 jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL));
431}
432
433/*
434 * Make an educated guess if the TSC is trustworthy and synchronized
435 * over all CPUs.
436 */
437static __init int unsynchronized_tsc(void)
438{
439 /*
440 * Intel systems are normally all synchronized.
441 * Exceptions must mark TSC as unstable:
442 */
443 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
444 return 0;
445
446 /* assume multi socket systems are not synchronized: */
447 return num_possible_cpus() > 1;
448}
449
450static int __init init_tsc_clocksource(void)
451{
452
453 if (cpu_has_tsc && tsc_khz && !tsc_disable) {
454 /* check blacklist */
455 dmi_check_system(bad_tsc_dmi_table);
456
457 if (unsynchronized_tsc()) /* mark unstable if unsynced */
458 mark_tsc_unstable();
459 current_tsc_khz = tsc_khz;
460 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
461 clocksource_tsc.shift);
462 /* lower the rating if we already know its unstable: */
463 if (check_tsc_unstable())
464 clocksource_tsc.rating = 50;
465
466 init_timer(&verify_tsc_freq_timer);
467 verify_tsc_freq_timer.function = verify_tsc_freq;
468 verify_tsc_freq_timer.expires =
469 jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL);
470 add_timer(&verify_tsc_freq_timer);
471
472 return clocksource_register(&clocksource_tsc);
473 }
474
475 return 0;
476}
477
478module_init(init_tsc_clocksource);
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 7512f39c9f25..2d4f1386e2b1 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -71,6 +71,15 @@ SECTIONS
71 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } 71 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) }
72 _edata = .; /* End of data section */ 72 _edata = .; /* End of data section */
73 73
74#ifdef CONFIG_STACK_UNWIND
75 . = ALIGN(4);
76 .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
77 __start_unwind = .;
78 *(.eh_frame)
79 __end_unwind = .;
80 }
81#endif
82
74 . = ALIGN(THREAD_SIZE); /* init_task */ 83 . = ALIGN(THREAD_SIZE); /* init_task */
75 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 84 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
76 *(.data.init_task) 85 *(.data.init_task)
diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S
index 3b62baa6a371..1a36d26e15eb 100644
--- a/arch/i386/kernel/vsyscall-sysenter.S
+++ b/arch/i386/kernel/vsyscall-sysenter.S
@@ -42,10 +42,10 @@ __kernel_vsyscall:
42 /* 7: align return point with nop's to make disassembly easier */ 42 /* 7: align return point with nop's to make disassembly easier */
43 .space 7,0x90 43 .space 7,0x90
44 44
45 /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */ 45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
46 jmp .Lenter_kernel 46 jmp .Lenter_kernel
47 /* 16: System call normal return point is here! */ 47 /* 16: System call normal return point is here! */
48 .globl SYSENTER_RETURN /* Symbol used by entry.S. */ 48 .globl SYSENTER_RETURN /* Symbol used by sysenter.c */
49SYSENTER_RETURN: 49SYSENTER_RETURN:
50 pop %ebp 50 pop %ebp
51.Lpop_ebp: 51.Lpop_ebp:
diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S
index 98699ca6e52d..e26975fc68b6 100644
--- a/arch/i386/kernel/vsyscall.lds.S
+++ b/arch/i386/kernel/vsyscall.lds.S
@@ -7,7 +7,7 @@
7 7
8SECTIONS 8SECTIONS
9{ 9{
10 . = VSYSCALL_BASE + SIZEOF_HEADERS; 10 . = VDSO_PRELINK + SIZEOF_HEADERS;
11 11
12 .hash : { *(.hash) } :text 12 .hash : { *(.hash) } :text
13 .dynsym : { *(.dynsym) } 13 .dynsym : { *(.dynsym) }
@@ -20,7 +20,7 @@ SECTIONS
20 For the layouts to match, we need to skip more than enough 20 For the layouts to match, we need to skip more than enough
21 space for the dynamic symbol table et al. If this amount 21 space for the dynamic symbol table et al. If this amount
22 is insufficient, ld -shared will barf. Just increase it here. */ 22 is insufficient, ld -shared will barf. Just increase it here. */
23 . = VSYSCALL_BASE + 0x400; 23 . = VDSO_PRELINK + 0x400;
24 24
25 .text : { *(.text) } :text =0x90909090 25 .text : { *(.text) } :text =0x90909090
26 .note : { *(.note.*) } :text :note 26 .note : { *(.note.*) } :text :note