author    Thomas Gleixner <tglx@linutronix.de>  2007-10-11 05:17:01 -0400
committer Thomas Gleixner <tglx@linutronix.de>  2007-10-11 05:17:01 -0400
commit    9a163ed8e0552fdcffe405d2ea7134819a81456e (patch)
tree      b322fd2afbb812ba7ddfd22f3734aaab007c2aa5 /arch/x86/kernel
parent    f7627e2513987bb5d4e8cb13c4e0a478352141ac (diff)

i386: move kernel

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- arch/x86/kernel/.gitignore | 1
-rw-r--r-- arch/x86/kernel/Makefile | 5
-rw-r--r-- arch/x86/kernel/Makefile_32 | 88
-rw-r--r-- arch/x86/kernel/alternative.c | 450
-rw-r--r-- arch/x86/kernel/apic_32.c | 1566
-rw-r--r-- arch/x86/kernel/apm_32.c | 2403
-rw-r--r-- arch/x86/kernel/asm-offsets.c | 5
-rw-r--r-- arch/x86/kernel/asm-offsets_32.c | 147
-rw-r--r-- arch/x86/kernel/bootflag.c | 98
-rw-r--r-- arch/x86/kernel/cpuid.c | 242
-rw-r--r-- arch/x86/kernel/crash_32.c | 137
-rw-r--r-- arch/x86/kernel/crash_dump_32.c | 74
-rw-r--r-- arch/x86/kernel/doublefault_32.c | 70
-rw-r--r-- arch/x86/kernel/e820_32.c | 944
-rw-r--r-- arch/x86/kernel/early_printk.c | 2
-rw-r--r-- arch/x86/kernel/efi_32.c | 712
-rw-r--r-- arch/x86/kernel/efi_stub_32.S | 122
-rw-r--r-- arch/x86/kernel/entry_32.S | 1112
-rw-r--r-- arch/x86/kernel/geode_32.c | 155
-rw-r--r-- arch/x86/kernel/head_32.S | 578
-rw-r--r-- arch/x86/kernel/hpet_32.c | 553
-rw-r--r-- arch/x86/kernel/i386_ksyms_32.c | 30
-rw-r--r-- arch/x86/kernel/i387_32.c | 546
-rw-r--r-- arch/x86/kernel/i8237.c | 72
-rw-r--r-- arch/x86/kernel/i8253_32.c | 206
-rw-r--r-- arch/x86/kernel/i8259_32.c | 420
-rw-r--r-- arch/x86/kernel/init_task_32.c | 46
-rw-r--r-- arch/x86/kernel/io_apic_32.c | 2847
-rw-r--r-- arch/x86/kernel/ioport_32.c | 153
-rw-r--r-- arch/x86/kernel/irq_32.c | 343
-rw-r--r-- arch/x86/kernel/kprobes_32.c | 751
-rw-r--r-- arch/x86/kernel/ldt_32.c | 250
-rw-r--r-- arch/x86/kernel/machine_kexec_32.c | 171
-rw-r--r-- arch/x86/kernel/mca_32.c | 470
-rw-r--r-- arch/x86/kernel/microcode.c | 850
-rw-r--r-- arch/x86/kernel/module_32.c | 152
-rw-r--r-- arch/x86/kernel/mpparse_32.c | 1132
-rw-r--r-- arch/x86/kernel/msr.c | 224
-rw-r--r-- arch/x86/kernel/nmi_32.c | 468
-rw-r--r-- arch/x86/kernel/numaq_32.c | 89
-rw-r--r-- arch/x86/kernel/paravirt_32.c | 392
-rw-r--r-- arch/x86/kernel/pci-dma_32.c | 177
-rw-r--r-- arch/x86/kernel/pcspeaker.c | 20
-rw-r--r-- arch/x86/kernel/process_32.c | 951
-rw-r--r-- arch/x86/kernel/ptrace_32.c | 723
-rw-r--r-- arch/x86/kernel/quirks.c | 49
-rw-r--r-- arch/x86/kernel/reboot_32.c | 413
-rw-r--r-- arch/x86/kernel/reboot_fixups_32.c | 68
-rw-r--r-- arch/x86/kernel/relocate_kernel_32.S | 252
-rw-r--r-- arch/x86/kernel/scx200_32.c | 131
-rw-r--r-- arch/x86/kernel/setup_32.c | 653
-rw-r--r-- arch/x86/kernel/sigframe_32.h | 21
-rw-r--r-- arch/x86/kernel/signal_32.c | 667
-rw-r--r-- arch/x86/kernel/smp_32.c | 707
-rw-r--r-- arch/x86/kernel/smpboot_32.c | 1322
-rw-r--r-- arch/x86/kernel/smpcommon_32.c | 81
-rw-r--r-- arch/x86/kernel/srat_32.c | 360
-rw-r--r-- arch/x86/kernel/summit_32.c | 180
-rw-r--r-- arch/x86/kernel/sys_i386_32.c | 265
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 326
-rw-r--r-- arch/x86/kernel/sysenter_32.c | 348
-rw-r--r-- arch/x86/kernel/time_32.c | 236
-rw-r--r-- arch/x86/kernel/topology.c | 77
-rw-r--r-- arch/x86/kernel/trampoline_32.S | 85
-rw-r--r-- arch/x86/kernel/traps_32.c | 1250
-rw-r--r-- arch/x86/kernel/tsc_32.c | 413
-rw-r--r-- arch/x86/kernel/tsc_sync.c | 1
-rw-r--r-- arch/x86/kernel/vm86_32.c | 843
-rw-r--r-- arch/x86/kernel/vmi_32.c | 981
-rw-r--r-- arch/x86/kernel/vmiclock_32.c | 320
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 5
-rw-r--r-- arch/x86/kernel/vmlinux_32.lds.S | 213
-rw-r--r-- arch/x86/kernel/vsyscall-int80_32.S | 53
-rw-r--r-- arch/x86/kernel/vsyscall-note_32.S | 45
-rw-r--r-- arch/x86/kernel/vsyscall-sigreturn_32.S | 143
-rw-r--r-- arch/x86/kernel/vsyscall-sysenter_32.S | 122
-rw-r--r-- arch/x86/kernel/vsyscall_32.S | 15
-rw-r--r-- arch/x86/kernel/vsyscall_32.lds.S | 67
78 files changed, 31659 insertions, 0 deletions
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
new file mode 100644
index 000000000000..40836ad9079c
--- /dev/null
+++ b/arch/x86/kernel/.gitignore
@@ -0,0 +1 @@
vsyscall.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
new file mode 100644
index 000000000000..577d08f4b8bb
--- /dev/null
+++ b/arch/x86/kernel/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/kernel/Makefile_32
3else
4include ${srctree}/arch/x86_64/kernel/Makefile_64
5endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
new file mode 100644
index 000000000000..5096f486d389
--- /dev/null
+++ b/arch/x86/kernel/Makefile_32
@@ -0,0 +1,88 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_32.o init_task_32.o vmlinux.lds
6
7obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
8 ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
9 pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
10 quirks.o i8237.o topology.o alternative.o i8253_32.o tsc_32.o
11
12obj-$(CONFIG_STACKTRACE) += stacktrace.o
13obj-y += ../../x86/kernel/cpu/
14obj-y += ../../x86/kernel/acpi/
15obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o
16obj-$(CONFIG_MCA) += mca_32.o
17obj-$(CONFIG_X86_MSR) += msr.o
18obj-$(CONFIG_X86_CPUID) += cpuid.o
19obj-$(CONFIG_MICROCODE) += microcode.o
20obj-$(CONFIG_APM) += apm_32.o
21obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
22obj-$(CONFIG_SMP) += smpcommon_32.o
23obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
24obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
25obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
26obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
27obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
28obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash_32.o
29obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o
30obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
31obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
32obj-$(CONFIG_KPROBES) += kprobes_32.o
33obj-$(CONFIG_MODULES) += module_32.o
34obj-y += sysenter_32.o vsyscall_32.o
35obj-$(CONFIG_ACPI_SRAT) += srat_32.o
36obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o
37obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
38obj-$(CONFIG_VM86) += vm86_32.o
39obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
40obj-$(CONFIG_HPET_TIMER) += hpet_32.o
41obj-$(CONFIG_K8_NB) += k8.o
42obj-$(CONFIG_MGEODE_LX) += geode_32.o
43
44obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
45obj-$(CONFIG_PARAVIRT) += paravirt_32.o
46obj-y += pcspeaker.o
47
48obj-$(CONFIG_SCx200) += scx200_32.o
49
50# vsyscall_32.o contains the vsyscall DSO images as __initdata.
51# We must build both images before we can assemble it.
52# Note: kbuild does not track this dependency due to usage of .incbin
53$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
54targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
55targets += vsyscall-note_32.o vsyscall_32.lds
56
57# The DSO images are built using a special linker script.
58quiet_cmd_syscall = SYSCALL $@
59 cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
60 -Wl,-T,$(filter-out FORCE,$^) -o $@
61
62export CPPFLAGS_vsyscall_32.lds += -P -C -U$(ARCH)
63
64vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
65 $(call ld-option, -Wl$(comma)--hash-style=sysv)
66SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags)
67SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
68
69$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
70$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
71 $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
72 $(call if_changed,syscall)
73
74# We also create a special relocatable object that should mirror the symbol
75# table and layout of the linked DSO. With ld -R we can then refer to
76# these symbols in the kernel code rather than hand-coded addresses.
77extra-y += vsyscall-syms.o
78$(obj)/built-in.o: $(obj)/vsyscall-syms.o
79$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
80
81SYSCFLAGS_vsyscall-syms.o = -r
82$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
83 $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
84 $(call if_changed,syscall)
85
86k8-y += ../../x86_64/kernel/k8.o
87stacktrace-y += ../../x86_64/kernel/stacktrace.o
88
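The special handling above exists because vsyscall_32.o pulls the finished DSO images into the kernel as __initdata via .incbin (so kbuild cannot see the dependency), and vsyscall-syms.o linked with -R lets kernel C code refer to symbols inside the DSO by name instead of hand-coded addresses. A rough sketch of the consuming side, assuming marker symbols named vsyscall_int80_start/_end and vsyscall_sysenter_start/_end (the real code lives in vsyscall_32.S and sysenter_32.c, not in this Makefile):

	/* Sketch only -- not part of this patch; needs <linux/string.h> for memcpy(). */
	extern const char vsyscall_int80_start[], vsyscall_int80_end[];
	extern const char vsyscall_sysenter_start[], vsyscall_sysenter_end[];

	static void install_vsyscall_page(void *syscall_page, int use_sysenter)
	{
		if (use_sysenter)
			memcpy(syscall_page, vsyscall_sysenter_start,
			       vsyscall_sysenter_end - vsyscall_sysenter_start);
		else
			memcpy(syscall_page, vsyscall_int80_start,
			       vsyscall_int80_end - vsyscall_int80_start);
	}
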
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
new file mode 100644
index 000000000000..bd72d94e713e
--- /dev/null
+++ b/arch/x86/kernel/alternative.c
@@ -0,0 +1,450 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/spinlock.h>
4#include <linux/list.h>
5#include <linux/kprobes.h>
6#include <linux/mm.h>
7#include <linux/vmalloc.h>
8#include <asm/alternative.h>
9#include <asm/sections.h>
10#include <asm/pgtable.h>
11#include <asm/mce.h>
12#include <asm/nmi.h>
13
14#define MAX_PATCH_LEN (255-1)
15
16#ifdef CONFIG_HOTPLUG_CPU
17static int smp_alt_once;
18
19static int __init bootonly(char *str)
20{
21 smp_alt_once = 1;
22 return 1;
23}
24__setup("smp-alt-boot", bootonly);
25#else
26#define smp_alt_once 1
27#endif
28
29static int debug_alternative;
30
31static int __init debug_alt(char *str)
32{
33 debug_alternative = 1;
34 return 1;
35}
36__setup("debug-alternative", debug_alt);
37
38static int noreplace_smp;
39
40static int __init setup_noreplace_smp(char *str)
41{
42 noreplace_smp = 1;
43 return 1;
44}
45__setup("noreplace-smp", setup_noreplace_smp);
46
47#ifdef CONFIG_PARAVIRT
48static int noreplace_paravirt = 0;
49
50static int __init setup_noreplace_paravirt(char *str)
51{
52 noreplace_paravirt = 1;
53 return 1;
54}
55__setup("noreplace-paravirt", setup_noreplace_paravirt);
56#endif
57
58#define DPRINTK(fmt, args...) if (debug_alternative) \
59 printk(KERN_DEBUG fmt, args)
60
61#ifdef GENERIC_NOP1
62/* Use inline assembly to define this because the nops are defined
63 as inline assembly strings in the include files and we cannot
64 get them easily into strings. */
65asm("\t.data\nintelnops: "
66 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
67 GENERIC_NOP7 GENERIC_NOP8);
68extern unsigned char intelnops[];
69static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
70 NULL,
71 intelnops,
72 intelnops + 1,
73 intelnops + 1 + 2,
74 intelnops + 1 + 2 + 3,
75 intelnops + 1 + 2 + 3 + 4,
76 intelnops + 1 + 2 + 3 + 4 + 5,
77 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
78 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
79};
80#endif
81
82#ifdef K8_NOP1
83asm("\t.data\nk8nops: "
84 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
85 K8_NOP7 K8_NOP8);
86extern unsigned char k8nops[];
87static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
88 NULL,
89 k8nops,
90 k8nops + 1,
91 k8nops + 1 + 2,
92 k8nops + 1 + 2 + 3,
93 k8nops + 1 + 2 + 3 + 4,
94 k8nops + 1 + 2 + 3 + 4 + 5,
95 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
96 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
97};
98#endif
99
100#ifdef K7_NOP1
101asm("\t.data\nk7nops: "
102 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
103 K7_NOP7 K7_NOP8);
104extern unsigned char k7nops[];
105static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
106 NULL,
107 k7nops,
108 k7nops + 1,
109 k7nops + 1 + 2,
110 k7nops + 1 + 2 + 3,
111 k7nops + 1 + 2 + 3 + 4,
112 k7nops + 1 + 2 + 3 + 4 + 5,
113 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
114 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
115};
116#endif
117
118#ifdef CONFIG_X86_64
119
120extern char __vsyscall_0;
121static inline unsigned char** find_nop_table(void)
122{
123 return k8_nops;
124}
125
126#else /* CONFIG_X86_64 */
127
128static struct nop {
129 int cpuid;
130 unsigned char **noptable;
131} noptypes[] = {
132 { X86_FEATURE_K8, k8_nops },
133 { X86_FEATURE_K7, k7_nops },
134 { -1, NULL }
135};
136
137static unsigned char** find_nop_table(void)
138{
139 unsigned char **noptable = intel_nops;
140 int i;
141
142 for (i = 0; noptypes[i].cpuid >= 0; i++) {
143 if (boot_cpu_has(noptypes[i].cpuid)) {
144 noptable = noptypes[i].noptable;
145 break;
146 }
147 }
148 return noptable;
149}
150
151#endif /* CONFIG_X86_64 */
152
153/* Use this to add nops to a buffer, then text_poke the whole buffer. */
154static void add_nops(void *insns, unsigned int len)
155{
156 unsigned char **noptable = find_nop_table();
157
158 while (len > 0) {
159 unsigned int noplen = len;
160 if (noplen > ASM_NOP_MAX)
161 noplen = ASM_NOP_MAX;
162 memcpy(insns, noptable[noplen], noplen);
163 insns += noplen;
164 len -= noplen;
165 }
166}
167
168extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
169extern u8 *__smp_locks[], *__smp_locks_end[];
170
171/* Replace instructions with better alternatives for this CPU type.
172 This runs before SMP is initialized to avoid SMP problems with
173 self modifying code. This implies that asymmetric systems where
174 APs have fewer capabilities than the boot processor are not handled.
175 Tough. Make sure you disable such features by hand. */
176
177void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
178{
179 struct alt_instr *a;
180 char insnbuf[MAX_PATCH_LEN];
181
182 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
183 for (a = start; a < end; a++) {
184 u8 *instr = a->instr;
185 BUG_ON(a->replacementlen > a->instrlen);
186 BUG_ON(a->instrlen > sizeof(insnbuf));
187 if (!boot_cpu_has(a->cpuid))
188 continue;
189#ifdef CONFIG_X86_64
190 /* vsyscall code is not mapped yet. resolve it manually. */
191 if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
192 instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
193 DPRINTK("%s: vsyscall fixup: %p => %p\n",
194 __FUNCTION__, a->instr, instr);
195 }
196#endif
197 memcpy(insnbuf, a->replacement, a->replacementlen);
198 add_nops(insnbuf + a->replacementlen,
199 a->instrlen - a->replacementlen);
200 text_poke(instr, insnbuf, a->instrlen);
201 }
202}
203
204#ifdef CONFIG_SMP
205
206static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
207{
208 u8 **ptr;
209
210 for (ptr = start; ptr < end; ptr++) {
211 if (*ptr < text)
212 continue;
213 if (*ptr > text_end)
214 continue;
215 text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
216 };
217}
218
219static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
220{
221 u8 **ptr;
222 char insn[1];
223
224 if (noreplace_smp)
225 return;
226
227 add_nops(insn, 1);
228 for (ptr = start; ptr < end; ptr++) {
229 if (*ptr < text)
230 continue;
231 if (*ptr > text_end)
232 continue;
233 text_poke(*ptr, insn, 1);
234 };
235}
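
/*
 * Illustration (not from the original patch): each pointer in the
 * __smp_locks table records the address of a 0xf0 LOCK prefix byte that
 * the LOCK_PREFIX macro emitted in front of an instruction such as
 * "lock; incl (%eax)".  Switching between SMP and UP code is therefore a
 * single-byte text_poke() at each recorded address: 0xf0 restores the
 * lock prefix, a one-byte NOP (0x90) removes it.
 */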
236
237struct smp_alt_module {
238 /* module owning these lock prefixes (NULL for the core kernel) */
239 struct module *mod;
240 char *name;
241
242 /* ptrs to lock prefixes */
243 u8 **locks;
244 u8 **locks_end;
245
246 /* .text segment, needed to avoid patching init code ;) */
247 u8 *text;
248 u8 *text_end;
249
250 struct list_head next;
251};
252static LIST_HEAD(smp_alt_modules);
253static DEFINE_SPINLOCK(smp_alt);
254
255void alternatives_smp_module_add(struct module *mod, char *name,
256 void *locks, void *locks_end,
257 void *text, void *text_end)
258{
259 struct smp_alt_module *smp;
260 unsigned long flags;
261
262 if (noreplace_smp)
263 return;
264
265 if (smp_alt_once) {
266 if (boot_cpu_has(X86_FEATURE_UP))
267 alternatives_smp_unlock(locks, locks_end,
268 text, text_end);
269 return;
270 }
271
272 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
273 if (NULL == smp)
274 return; /* we'll run the (safe but slow) SMP code then ... */
275
276 smp->mod = mod;
277 smp->name = name;
278 smp->locks = locks;
279 smp->locks_end = locks_end;
280 smp->text = text;
281 smp->text_end = text_end;
282 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
283 __FUNCTION__, smp->locks, smp->locks_end,
284 smp->text, smp->text_end, smp->name);
285
286 spin_lock_irqsave(&smp_alt, flags);
287 list_add_tail(&smp->next, &smp_alt_modules);
288 if (boot_cpu_has(X86_FEATURE_UP))
289 alternatives_smp_unlock(smp->locks, smp->locks_end,
290 smp->text, smp->text_end);
291 spin_unlock_irqrestore(&smp_alt, flags);
292}
293
294void alternatives_smp_module_del(struct module *mod)
295{
296 struct smp_alt_module *item;
297 unsigned long flags;
298
299 if (smp_alt_once || noreplace_smp)
300 return;
301
302 spin_lock_irqsave(&smp_alt, flags);
303 list_for_each_entry(item, &smp_alt_modules, next) {
304 if (mod != item->mod)
305 continue;
306 list_del(&item->next);
307 spin_unlock_irqrestore(&smp_alt, flags);
308 DPRINTK("%s: %s\n", __FUNCTION__, item->name);
309 kfree(item);
310 return;
311 }
312 spin_unlock_irqrestore(&smp_alt, flags);
313}
314
315void alternatives_smp_switch(int smp)
316{
317 struct smp_alt_module *mod;
318 unsigned long flags;
319
320#ifdef CONFIG_LOCKDEP
321 /*
322 * A not yet fixed binutils section handling bug prevents
323 * alternatives-replacement from working reliably, so turn
324 * it off:
325 */
326 printk("lockdep: not fixing up alternatives.\n");
327 return;
328#endif
329
330 if (noreplace_smp || smp_alt_once)
331 return;
332 BUG_ON(!smp && (num_online_cpus() > 1));
333
334 spin_lock_irqsave(&smp_alt, flags);
335 if (smp) {
336 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
337 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
338 clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
339 list_for_each_entry(mod, &smp_alt_modules, next)
340 alternatives_smp_lock(mod->locks, mod->locks_end,
341 mod->text, mod->text_end);
342 } else {
343 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
344 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
345 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
346 list_for_each_entry(mod, &smp_alt_modules, next)
347 alternatives_smp_unlock(mod->locks, mod->locks_end,
348 mod->text, mod->text_end);
349 }
350 spin_unlock_irqrestore(&smp_alt, flags);
351}
352
353#endif
354
355#ifdef CONFIG_PARAVIRT
356void apply_paravirt(struct paravirt_patch_site *start,
357 struct paravirt_patch_site *end)
358{
359 struct paravirt_patch_site *p;
360 char insnbuf[MAX_PATCH_LEN];
361
362 if (noreplace_paravirt)
363 return;
364
365 for (p = start; p < end; p++) {
366 unsigned int used;
367
368 BUG_ON(p->len > MAX_PATCH_LEN);
369 /* prep the buffer with the original instructions */
370 memcpy(insnbuf, p->instr, p->len);
371 used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
372 (unsigned long)p->instr, p->len);
373
374 BUG_ON(used > p->len);
375
376 /* Pad the rest with nops */
377 add_nops(insnbuf + used, p->len - used);
378 text_poke(p->instr, insnbuf, p->len);
379 }
380}
381extern struct paravirt_patch_site __start_parainstructions[],
382 __stop_parainstructions[];
383#endif /* CONFIG_PARAVIRT */
384
385void __init alternative_instructions(void)
386{
387 unsigned long flags;
388
389 /* The patching is not fully atomic, so try to avoid local interrupts
390 that might execute the code being patched.
391 Other CPUs are not running. */
392 stop_nmi();
393#ifdef CONFIG_X86_MCE
394 stop_mce();
395#endif
396
397 local_irq_save(flags);
398 apply_alternatives(__alt_instructions, __alt_instructions_end);
399
400 /* switch to patch-once-at-boottime-only mode and free the
401 * tables in case we know the number of CPUs will never ever
402 * change */
403#ifdef CONFIG_HOTPLUG_CPU
404 if (num_possible_cpus() < 2)
405 smp_alt_once = 1;
406#endif
407
408#ifdef CONFIG_SMP
409 if (smp_alt_once) {
410 if (1 == num_possible_cpus()) {
411 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
412 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
413 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
414 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
415 _text, _etext);
416 }
417 free_init_pages("SMP alternatives",
418 (unsigned long)__smp_locks,
419 (unsigned long)__smp_locks_end);
420 } else {
421 alternatives_smp_module_add(NULL, "core kernel",
422 __smp_locks, __smp_locks_end,
423 _text, _etext);
424 alternatives_smp_switch(0);
425 }
426#endif
427 apply_paravirt(__parainstructions, __parainstructions_end);
428 local_irq_restore(flags);
429
430 restart_nmi();
431#ifdef CONFIG_X86_MCE
432 restart_mce();
433#endif
434}
435
436/*
437 * Warning:
438 * When you use this code to patch more than one byte of an instruction
439 * you need to make sure that other CPUs cannot execute this code in parallel.
440 * Also no thread must be currently preempted in the middle of these instructions.
441 * And on the local CPU you need to be protected against NMI or MCE handlers
442 * seeing an inconsistent instruction while you patch.
443 */
444void __kprobes text_poke(void *addr, unsigned char *opcode, int len)
445{
446 memcpy(addr, opcode, len);
447 sync_core();
448 /* Could also do a CLFLUSH here to speed up CPU recovery; but
449 that causes hangs on some VIA CPUs. */
450}
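
The NOP tables above are laid out cumulatively: entry N of intel_nops/k8_nops/k7_nops points at a run of exactly N bytes because the 1..8-byte NOP sequences are concatenated in order, and apply_alternatives() uses that to pad the tail of each replacement out to the original instruction length before text_poke()ing it. A minimal user-space sketch of that padding logic (illustration only; it stands in single-byte 0x90 NOPs for the real multi-byte sequences):

	#include <stdio.h>
	#include <string.h>

	#define ASM_NOP_MAX 8

	/* Cumulative table: nops[n] points at a run of exactly n NOP bytes. */
	static unsigned char nopbytes[1 + 2 + 3 + 4 + 5 + 6 + 7 + 8];
	static unsigned char *nops[ASM_NOP_MAX + 1];

	static void init_nops(void)
	{
		unsigned char *p = nopbytes;
		int n;

		memset(nopbytes, 0x90, sizeof(nopbytes));
		for (n = 1; n <= ASM_NOP_MAX; n++) {
			nops[n] = p;
			p += n;
		}
	}

	/* Mirrors add_nops(): fill len bytes using at most ASM_NOP_MAX per chunk. */
	static void pad_with_nops(unsigned char *insns, unsigned int len)
	{
		while (len > 0) {
			unsigned int noplen = len > ASM_NOP_MAX ? ASM_NOP_MAX : len;
			memcpy(insns, nops[noplen], noplen);
			insns += noplen;
			len -= noplen;
		}
	}

	int main(void)
	{
		/* Pretend the original instruction sequence was 10 bytes and the
		 * replacement only 3: the remaining 7 bytes become NOPs. */
		unsigned char insnbuf[10];
		const unsigned char replacement[3] = { 0x31, 0xc0, 0xc3 }; /* xor %eax,%eax; ret */
		unsigned int i;

		init_nops();
		memcpy(insnbuf, replacement, sizeof(replacement));
		pad_with_nops(insnbuf + sizeof(replacement),
			      sizeof(insnbuf) - sizeof(replacement));

		for (i = 0; i < sizeof(insnbuf); i++)
			printf("%02x ", insnbuf[i]);
		printf("\n");
		return 0;
	}
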
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
new file mode 100644
index 000000000000..3d67ae18d762
--- /dev/null
+++ b/arch/x86/kernel/apic_32.c
@@ -0,0 +1,1566 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/cpu.h>
27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
30#include <linux/dmi.h>
31
32#include <asm/atomic.h>
33#include <asm/smp.h>
34#include <asm/mtrr.h>
35#include <asm/mpspec.h>
36#include <asm/desc.h>
37#include <asm/arch_hooks.h>
38#include <asm/hpet.h>
39#include <asm/i8253.h>
40#include <asm/nmi.h>
41
42#include <mach_apic.h>
43#include <mach_apicdef.h>
44#include <mach_ipi.h>
45
46#include "io_ports.h"
47
48/*
49 * Sanity check
50 */
51#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
52# error SPURIOUS_APIC_VECTOR definition error
53#endif
54
55/*
56 * Knob to control our willingness to enable the local APIC.
57 *
58 * -1=force-disable, +1=force-enable
59 */
60static int enable_local_apic __initdata = 0;
61
62/* Local APIC timer verification ok */
63static int local_apic_timer_verify_ok;
64/* Disable local APIC timer from the kernel commandline or via dmi quirk
65 or using CPU MSR check */
66int local_apic_timer_disabled;
67/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
70
71/*
72 * Debug level, exported for io_apic.c
73 */
74int apic_verbosity;
75
76static unsigned int calibration_result;
77
78static int lapic_next_event(unsigned long delta,
79 struct clock_event_device *evt);
80static void lapic_timer_setup(enum clock_event_mode mode,
81 struct clock_event_device *evt);
82static void lapic_timer_broadcast(cpumask_t mask);
83static void apic_pm_activate(void);
84
85/*
86 * The local apic timer can be used for any function which is CPU local.
87 */
88static struct clock_event_device lapic_clockevent = {
89 .name = "lapic",
90 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
91 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
92 .shift = 32,
93 .set_mode = lapic_timer_setup,
94 .set_next_event = lapic_next_event,
95 .broadcast = lapic_timer_broadcast,
96 .rating = 100,
97 .irq = -1,
98};
99static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
100
101/* Local APIC was disabled by the BIOS and enabled by the kernel */
102static int enabled_via_apicbase;
103
104/*
105 * Get the LAPIC version
106 */
107static inline int lapic_get_version(void)
108{
109 return GET_APIC_VERSION(apic_read(APIC_LVR));
110}
111
112/*
113 * Check, if the APIC is integrated or a separate chip
114 */
115static inline int lapic_is_integrated(void)
116{
117 return APIC_INTEGRATED(lapic_get_version());
118}
119
120/*
121 * Check, whether this is a modern or a first generation APIC
122 */
123static int modern_apic(void)
124{
125 /* AMD systems use old APIC versions, so check the CPU */
126 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
127 boot_cpu_data.x86 >= 0xf)
128 return 1;
129 return lapic_get_version() >= 0x14;
130}
131
132void apic_wait_icr_idle(void)
133{
134 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
135 cpu_relax();
136}
137
138unsigned long safe_apic_wait_icr_idle(void)
139{
140 unsigned long send_status;
141 int timeout;
142
143 timeout = 0;
144 do {
145 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
146 if (!send_status)
147 break;
148 udelay(100);
149 } while (timeout++ < 1000);
150
151 return send_status;
152}
153
154/**
155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
156 */
157void enable_NMI_through_LVT0 (void * dummy)
158{
159 unsigned int v = APIC_DM_NMI;
160
161 /* Level triggered for 82489DX */
162 if (!lapic_is_integrated())
163 v |= APIC_LVT_LEVEL_TRIGGER;
164 apic_write_around(APIC_LVT0, v);
165}
166
167/**
168 * get_physical_broadcast - Get number of physical broadcast IDs
169 */
170int get_physical_broadcast(void)
171{
172 return modern_apic() ? 0xff : 0xf;
173}
174
175/**
176 * lapic_get_maxlvt - get the maximum number of local vector table entries
177 */
178int lapic_get_maxlvt(void)
179{
180 unsigned int v = apic_read(APIC_LVR);
181
182 /* 82489DXs do not report # of LVT entries. */
183 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
184}
185
186/*
187 * Local APIC timer
188 */
189
190/* Clock divisor is set to 16 */
191#define APIC_DIVISOR 16
192
193/*
194 * This function sets up the local APIC timer, with a timeout of
195 * 'clocks' APIC bus clock. During calibration we actually call
196 * this function twice on the boot CPU, once with a bogus timeout
197 * value, second time for real. The other (noncalibrating) CPUs
198 * call this function only once, with the real, calibrated value.
199 *
200 * We do reads before writes even if unnecessary, to get around the
201 * P5 APIC double write bug.
202 */
203static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
204{
205 unsigned int lvtt_value, tmp_value;
206
207 lvtt_value = LOCAL_TIMER_VECTOR;
208 if (!oneshot)
209 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
210 if (!lapic_is_integrated())
211 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
212
213 if (!irqen)
214 lvtt_value |= APIC_LVT_MASKED;
215
216 apic_write_around(APIC_LVTT, lvtt_value);
217
218 /*
219 * Divide PICLK by 16
220 */
221 tmp_value = apic_read(APIC_TDCR);
222 apic_write_around(APIC_TDCR, (tmp_value
223 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
224 | APIC_TDR_DIV_16);
225
226 if (!oneshot)
227 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
228}
229
230/*
231 * Program the next event, relative to now
232 */
233static int lapic_next_event(unsigned long delta,
234 struct clock_event_device *evt)
235{
236 apic_write_around(APIC_TMICT, delta);
237 return 0;
238}
239
240/*
241 * Setup the lapic timer in periodic or oneshot mode
242 */
243static void lapic_timer_setup(enum clock_event_mode mode,
244 struct clock_event_device *evt)
245{
246 unsigned long flags;
247 unsigned int v;
248
249 /* Lapic used for broadcast ? */
250 if (!local_apic_timer_verify_ok)
251 return;
252
253 local_irq_save(flags);
254
255 switch (mode) {
256 case CLOCK_EVT_MODE_PERIODIC:
257 case CLOCK_EVT_MODE_ONESHOT:
258 __setup_APIC_LVTT(calibration_result,
259 mode != CLOCK_EVT_MODE_PERIODIC, 1);
260 break;
261 case CLOCK_EVT_MODE_UNUSED:
262 case CLOCK_EVT_MODE_SHUTDOWN:
263 v = apic_read(APIC_LVTT);
264 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
265 apic_write_around(APIC_LVTT, v);
266 break;
267 case CLOCK_EVT_MODE_RESUME:
268 /* Nothing to do here */
269 break;
270 }
271
272 local_irq_restore(flags);
273}
274
275/*
276 * Local APIC timer broadcast function
277 */
278static void lapic_timer_broadcast(cpumask_t mask)
279{
280#ifdef CONFIG_SMP
281 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
282#endif
283}
284
285/*
286 * Setup the local APIC timer for this CPU. Copy the initialized values
287 * of the boot CPU and register the clock event in the framework.
288 */
289static void __devinit setup_APIC_timer(void)
290{
291 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
292
293 memcpy(levt, &lapic_clockevent, sizeof(*levt));
294 levt->cpumask = cpumask_of_cpu(smp_processor_id());
295
296 clockevents_register_device(levt);
297}
298
299/*
300 * In this function we calibrate the APIC bus clocks to the external timer.
301 *
302 * We want to do the calibration only once since we want to have local timer
303 * irqs in sync. CPUs connected by the same APIC bus have the very same bus
304 * frequency.
305 *
306 * This was previously done by reading the PIT/HPET and waiting for a wrap
307 * around to find out, that a tick has elapsed. I have a box, where the PIT
308 * readout is broken, so it never gets out of the wait loop again. This was
309 * also reported by others.
310 *
311 * Monitoring the jiffies value is inaccurate and the clockevents
312 * infrastructure allows us to do a simple substitution of the interrupt
313 * handler.
314 *
315 * The calibration routine also uses the pm_timer when possible, as the PIT
316 * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
317 * back to normal later in the boot process).
318 */
319
320#define LAPIC_CAL_LOOPS (HZ/10)
321
322static __initdata int lapic_cal_loops = -1;
323static __initdata long lapic_cal_t1, lapic_cal_t2;
324static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
325static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
326static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
327
328/*
329 * Temporary interrupt handler.
330 */
331static void __init lapic_cal_handler(struct clock_event_device *dev)
332{
333 unsigned long long tsc = 0;
334 long tapic = apic_read(APIC_TMCCT);
335 unsigned long pm = acpi_pm_read_early();
336
337 if (cpu_has_tsc)
338 rdtscll(tsc);
339
340 switch (lapic_cal_loops++) {
341 case 0:
342 lapic_cal_t1 = tapic;
343 lapic_cal_tsc1 = tsc;
344 lapic_cal_pm1 = pm;
345 lapic_cal_j1 = jiffies;
346 break;
347
348 case LAPIC_CAL_LOOPS:
349 lapic_cal_t2 = tapic;
350 lapic_cal_tsc2 = tsc;
351 if (pm < lapic_cal_pm1)
352 pm += ACPI_PM_OVRRUN;
353 lapic_cal_pm2 = pm;
354 lapic_cal_j2 = jiffies;
355 break;
356 }
357}
358
359/*
360 * Setup the boot APIC
361 *
362 * Calibrate and verify the result.
363 */
364void __init setup_boot_APIC_clock(void)
365{
366 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
367 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
368 const long pm_thresh = pm_100ms/100;
369 void (*real_handler)(struct clock_event_device *dev);
370 unsigned long deltaj;
371 long delta, deltapm;
372 int pm_referenced = 0;
373
374 /*
375 * The local apic timer can be disabled via the kernel
376 * commandline or from the CPU detection code. Register the lapic
377 * timer as a dummy clock event source on SMP systems, so the
378 * broadcast mechanism is used. On UP systems simply ignore it.
379 */
380 if (local_apic_timer_disabled) {
381 /* No broadcast on UP ! */
382 if (num_possible_cpus() > 1)
383 setup_APIC_timer();
384 return;
385 }
386
387 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
388 "calibrating APIC timer ...\n");
389
390 local_irq_disable();
391
392 /* Replace the global interrupt handler */
393 real_handler = global_clock_event->event_handler;
394 global_clock_event->event_handler = lapic_cal_handler;
395
396 /*
397 * Setup the APIC counter to 1e9. There is no way the lapic
398 * can underflow in the 100ms detection time frame
399 */
400 __setup_APIC_LVTT(1000000000, 0, 0);
401
402 /* Let the interrupts run */
403 local_irq_enable();
404
405 while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
406 cpu_relax();
407
408 local_irq_disable();
409
410 /* Restore the real event handler */
411 global_clock_event->event_handler = real_handler;
412
413 /* Build delta t1-t2 as apic timer counts down */
414 delta = lapic_cal_t1 - lapic_cal_t2;
415 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
416
417 /* Check, if the PM timer is available */
418 deltapm = lapic_cal_pm2 - lapic_cal_pm1;
419 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
420
421 if (deltapm) {
422 unsigned long mult;
423 u64 res;
424
425 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
426
427 if (deltapm > (pm_100ms - pm_thresh) &&
428 deltapm < (pm_100ms + pm_thresh)) {
429 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
430 } else {
431 res = (((u64) deltapm) * mult) >> 22;
432 do_div(res, 1000000);
433 printk(KERN_WARNING "APIC calibration not consistent "
434 "with PM Timer: %ldms instead of 100ms\n",
435 (long)res);
436 /* Correct the lapic counter value */
437 res = (((u64) delta ) * pm_100ms);
438 do_div(res, deltapm);
439 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
440 "%lu (%ld)\n", (unsigned long) res, delta);
441 delta = (long) res;
442 }
443 pm_referenced = 1;
444 }
445
446 /* Calculate the scaled math multiplication factor */
447 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
448 lapic_clockevent.max_delta_ns =
449 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
450 lapic_clockevent.min_delta_ns =
451 clockevent_delta2ns(0xF, &lapic_clockevent);
452
453 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
454
455 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
456 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
457 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
458 calibration_result);
459
460 if (cpu_has_tsc) {
461 delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
462 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
463 "%ld.%04ld MHz.\n",
464 (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
465 (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
466 }
467
468 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
469 "%u.%04u MHz.\n",
470 calibration_result / (1000000 / HZ),
471 calibration_result % (1000000 / HZ));
472
473 local_apic_timer_verify_ok = 1;
474
475 /* We trust the pm timer based calibration */
476 if (!pm_referenced) {
477 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
478
479 /*
480 * Setup the apic timer manually
481 */
482 levt->event_handler = lapic_cal_handler;
483 lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
484 lapic_cal_loops = -1;
485
486 /* Let the interrupts run */
487 local_irq_enable();
488
489 while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
490 cpu_relax();
491
492 local_irq_disable();
493
494 /* Stop the lapic timer */
495 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
496
497 local_irq_enable();
498
499 /* Jiffies delta */
500 deltaj = lapic_cal_j2 - lapic_cal_j1;
501 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
502
503 /* Check, if the jiffies result is consistent */
504 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
505 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
506 else
507 local_apic_timer_verify_ok = 0;
508 } else
509 local_irq_enable();
510
511 if (!local_apic_timer_verify_ok) {
512 printk(KERN_WARNING
513 "APIC timer disabled due to verification failure.\n");
514 /* No broadcast on UP ! */
515 if (num_possible_cpus() == 1)
516 return;
517 } else {
518 /*
519 * If nmi_watchdog is set to IO_APIC, we need the
520 * PIT/HPET going. Otherwise register lapic as a dummy
521 * device.
522 */
523 if (nmi_watchdog != NMI_IO_APIC)
524 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
525 else
526 printk(KERN_WARNING "APIC timer registered as dummy,"
527 " due to nmi_watchdog=1!\n");
528 }
529
530 /* Setup the lapic or request the broadcast */
531 setup_APIC_timer();
532}
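
/*
 * Worked example for the calibration above (illustration, not from the
 * original patch): with PMTMR_TICKS_PER_SEC = 3579545 the PM timer should
 * advance by pm_100ms = 357954 ticks while lapic_cal_handler() counts
 * LAPIC_CAL_LOOPS = HZ/10 jiffies.  If the PIT runs slow and deltapm comes
 * back as ~823000 ticks (~230ms, the "factor 2.3" VAIO case mentioned
 * above), delta is rescaled as delta * pm_100ms / deltapm, so that
 * calibration_result = delta * APIC_DIVISOR / LAPIC_CAL_LOOPS still ends
 * up in bus clocks per tick no matter how long the calibration window
 * really was.
 */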
533
534void __devinit setup_secondary_APIC_clock(void)
535{
536 setup_APIC_timer();
537}
538
539/*
540 * The guts of the apic timer interrupt
541 */
542static void local_apic_timer_interrupt(void)
543{
544 int cpu = smp_processor_id();
545 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
546
547 /*
548 * Normally we should not be here till LAPIC has been initialized but
549 * in some cases like kdump, it's possible that there is a pending LAPIC
550 * timer interrupt from previous kernel's context and is delivered in
551 * new kernel the moment interrupts are enabled.
552 *
553 * Interrupts are enabled early and LAPIC is setup much later, hence
554 * it's possible that when we get here evt->event_handler is NULL.
555 * Check for event_handler being NULL and discard the interrupt as
556 * spurious.
557 */
558 if (!evt->event_handler) {
559 printk(KERN_WARNING
560 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
561 /* Switch it off */
562 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
563 return;
564 }
565
566 per_cpu(irq_stat, cpu).apic_timer_irqs++;
567
568 evt->event_handler(evt);
569}
570
571/*
572 * Local APIC timer interrupt. This is the most natural way for doing
573 * local interrupts, but local timer interrupts can be emulated by
574 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
575 *
576 * [ if a single-CPU system runs an SMP kernel then we call the local
577 * interrupt as well. Thus we cannot inline the local irq ... ]
578 */
579
580void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
581{
582 struct pt_regs *old_regs = set_irq_regs(regs);
583
584 /*
585 * NOTE! We'd better ACK the irq immediately,
586 * because timer handling can be slow.
587 */
588 ack_APIC_irq();
589 /*
590 * update_process_times() expects us to have done irq_enter().
591 * Besides, if we don't, timer interrupts ignore the global
592 * interrupt lock, which is the WrongThing (tm) to do.
593 */
594 irq_enter();
595 local_apic_timer_interrupt();
596 irq_exit();
597
598 set_irq_regs(old_regs);
599}
600
601int setup_profiling_timer(unsigned int multiplier)
602{
603 return -EINVAL;
604}
605
606/*
607 * Local APIC start and shutdown
608 */
609
610/**
611 * clear_local_APIC - shutdown the local APIC
612 *
613 * This is called, when a CPU is disabled and before rebooting, so the state of
614 * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
615 * leftovers during boot.
616 */
617void clear_local_APIC(void)
618{
619 int maxlvt = lapic_get_maxlvt();
620 unsigned long v;
621
622 /*
623 * Masking an LVT entry can trigger a local APIC error
624 * if the vector is zero. Mask LVTERR first to prevent this.
625 */
626 if (maxlvt >= 3) {
627 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
628 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
629 }
630 /*
631 * Careful: we have to set masks only first to deassert
632 * any level-triggered sources.
633 */
634 v = apic_read(APIC_LVTT);
635 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
636 v = apic_read(APIC_LVT0);
637 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
638 v = apic_read(APIC_LVT1);
639 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
640 if (maxlvt >= 4) {
641 v = apic_read(APIC_LVTPC);
642 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
643 }
644
645 /* lets not touch this if we didn't frob it */
646#ifdef CONFIG_X86_MCE_P4THERMAL
647 if (maxlvt >= 5) {
648 v = apic_read(APIC_LVTTHMR);
649 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
650 }
651#endif
652 /*
653 * Clean APIC state for other OSs:
654 */
655 apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
656 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
657 apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
658 if (maxlvt >= 3)
659 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
660 if (maxlvt >= 4)
661 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
662
663#ifdef CONFIG_X86_MCE_P4THERMAL
664 if (maxlvt >= 5)
665 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
666#endif
667 /* Integrated APIC (!82489DX) ? */
668 if (lapic_is_integrated()) {
669 if (maxlvt > 3)
670 /* Clear ESR due to Pentium errata 3AP and 11AP */
671 apic_write(APIC_ESR, 0);
672 apic_read(APIC_ESR);
673 }
674}
675
676/**
677 * disable_local_APIC - clear and disable the local APIC
678 */
679void disable_local_APIC(void)
680{
681 unsigned long value;
682
683 clear_local_APIC();
684
685 /*
686 * Disable APIC (implies clearing of registers
687 * for 82489DX!).
688 */
689 value = apic_read(APIC_SPIV);
690 value &= ~APIC_SPIV_APIC_ENABLED;
691 apic_write_around(APIC_SPIV, value);
692
693 /*
694 * When LAPIC was disabled by the BIOS and enabled by the kernel,
695 * restore the disabled state.
696 */
697 if (enabled_via_apicbase) {
698 unsigned int l, h;
699
700 rdmsr(MSR_IA32_APICBASE, l, h);
701 l &= ~MSR_IA32_APICBASE_ENABLE;
702 wrmsr(MSR_IA32_APICBASE, l, h);
703 }
704}
705
706/*
707 * If Linux enabled the LAPIC against the BIOS default disable it down before
708 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
709 * not power-off. Additionally clear all LVT entries before disable_local_APIC
710 * for the case where Linux didn't enable the LAPIC.
711 */
712void lapic_shutdown(void)
713{
714 unsigned long flags;
715
716 if (!cpu_has_apic)
717 return;
718
719 local_irq_save(flags);
720 clear_local_APIC();
721
722 if (enabled_via_apicbase)
723 disable_local_APIC();
724
725 local_irq_restore(flags);
726}
727
728/*
729 * This is to verify that we're looking at a real local APIC.
730 * Check these against your board if the CPUs aren't getting
731 * started for no apparent reason.
732 */
733int __init verify_local_APIC(void)
734{
735 unsigned int reg0, reg1;
736
737 /*
738 * The version register is read-only in a real APIC.
739 */
740 reg0 = apic_read(APIC_LVR);
741 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
742 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
743 reg1 = apic_read(APIC_LVR);
744 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
745
746 /*
747 * The two version reads above should print the same
748 * numbers. If the second one is different, then we
749 * poke at a non-APIC.
750 */
751 if (reg1 != reg0)
752 return 0;
753
754 /*
755 * Check if the version looks reasonable.
756 */
757 reg1 = GET_APIC_VERSION(reg0);
758 if (reg1 == 0x00 || reg1 == 0xff)
759 return 0;
760 reg1 = lapic_get_maxlvt();
761 if (reg1 < 0x02 || reg1 == 0xff)
762 return 0;
763
764 /*
765 * The ID register is read/write in a real APIC.
766 */
767 reg0 = apic_read(APIC_ID);
768 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
769
770 /*
771 * The next two are just to see if we have sane values.
772 * They're only really relevant if we're in Virtual Wire
774 * compatibility mode, but most boxes are these days.
774 */
775 reg0 = apic_read(APIC_LVT0);
776 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
777 reg1 = apic_read(APIC_LVT1);
778 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
779
780 return 1;
781}
782
783/**
784 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
785 */
786void __init sync_Arb_IDs(void)
787{
788 /*
789 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
790 * needed on AMD.
791 */
792 if (modern_apic())
793 return;
794 /*
795 * Wait for idle.
796 */
797 apic_wait_icr_idle();
798
799 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
800 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
801 | APIC_DM_INIT);
802}
803
804/*
805 * An initial setup of the virtual wire mode.
806 */
807void __init init_bsp_APIC(void)
808{
809 unsigned long value;
810
811 /*
812 * Don't do the setup now if we have a SMP BIOS as the
813 * through-I/O-APIC virtual wire mode might be active.
814 */
815 if (smp_found_config || !cpu_has_apic)
816 return;
817
818 /*
819 * Do not trust the local APIC being empty at bootup.
820 */
821 clear_local_APIC();
822
823 /*
824 * Enable APIC.
825 */
826 value = apic_read(APIC_SPIV);
827 value &= ~APIC_VECTOR_MASK;
828 value |= APIC_SPIV_APIC_ENABLED;
829
830 /* This bit is reserved on P4/Xeon and should be cleared */
831 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
832 (boot_cpu_data.x86 == 15))
833 value &= ~APIC_SPIV_FOCUS_DISABLED;
834 else
835 value |= APIC_SPIV_FOCUS_DISABLED;
836 value |= SPURIOUS_APIC_VECTOR;
837 apic_write_around(APIC_SPIV, value);
838
839 /*
840 * Set up the virtual wire mode.
841 */
842 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
843 value = APIC_DM_NMI;
844 if (!lapic_is_integrated()) /* 82489DX */
845 value |= APIC_LVT_LEVEL_TRIGGER;
846 apic_write_around(APIC_LVT1, value);
847}
848
849/**
850 * setup_local_APIC - setup the local APIC
851 */
852void __devinit setup_local_APIC(void)
853{
854 unsigned long oldvalue, value, maxlvt, integrated;
855 int i, j;
856
857 /* Pound the ESR really hard over the head with a big hammer - mbligh */
858 if (esr_disable) {
859 apic_write(APIC_ESR, 0);
860 apic_write(APIC_ESR, 0);
861 apic_write(APIC_ESR, 0);
862 apic_write(APIC_ESR, 0);
863 }
864
865 integrated = lapic_is_integrated();
866
867 /*
868 * Double-check whether this APIC is really registered.
869 */
870 if (!apic_id_registered())
871 BUG();
872
873 /*
874 * Intel recommends to set DFR, LDR and TPR before enabling
875 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
876 * document number 292116). So here it goes...
877 */
878 init_apic_ldr();
879
880 /*
881 * Set Task Priority to 'accept all'. We never change this
882 * later on.
883 */
884 value = apic_read(APIC_TASKPRI);
885 value &= ~APIC_TPRI_MASK;
886 apic_write_around(APIC_TASKPRI, value);
887
888 /*
889 * After a crash, we no longer service the interrupts and a pending
890 * interrupt from previous kernel might still have ISR bit set.
891 *
892 * Most probably by now CPU has serviced that pending interrupt and
893 * it might not have done the ack_APIC_irq() because it thought,
894 * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
895 * does not clear the ISR bit and cpu thinks it has already serviced
896 * the interrupt. Hence a vector might get locked. It was noticed
897 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
898 */
899 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
900 value = apic_read(APIC_ISR + i*0x10);
901 for (j = 31; j >= 0; j--) {
902 if (value & (1<<j))
903 ack_APIC_irq();
904 }
905 }
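	/*
	 * Illustration (not from the original patch): the in-service
	 * registers are eight 32-bit words spaced 0x10 apart, so the loop
	 * above reads APIC_ISR + i*0x10 for i = 7..0 and issues one EOI per
	 * set bit; bit j of word i corresponds to vector 32*i + j.
	 */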
906
907 /*
908 * Now that we are all set up, enable the APIC
909 */
910 value = apic_read(APIC_SPIV);
911 value &= ~APIC_VECTOR_MASK;
912 /*
913 * Enable APIC
914 */
915 value |= APIC_SPIV_APIC_ENABLED;
916
917 /*
918 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
919 * certain networking cards. If high frequency interrupts are
920 * happening on a particular IOAPIC pin, plus the IOAPIC routing
921 * entry is masked/unmasked at a high rate as well then sooner or
922 * later IOAPIC line gets 'stuck', no more interrupts are received
923 * from the device. If focus CPU is disabled then the hang goes
924 * away, oh well :-(
925 *
926 * [ This bug can be reproduced easily with a level-triggered
927 * PCI Ne2000 networking cards and PII/PIII processors, dual
928 * BX chipset. ]
929 */
930 /*
931 * Actually disabling the focus CPU check just makes the hang less
932 * frequent as it makes the interrupt distribution model be more
933 * like LRU than MRU (the short-term load is more even across CPUs).
934 * See also the comment in end_level_ioapic_irq(). --macro
935 */
936
937 /* Enable focus processor (bit==0) */
938 value &= ~APIC_SPIV_FOCUS_DISABLED;
939
940 /*
941 * Set spurious IRQ vector
942 */
943 value |= SPURIOUS_APIC_VECTOR;
944 apic_write_around(APIC_SPIV, value);
945
946 /*
947 * Set up LVT0, LVT1:
948 *
949 * set up through-local-APIC on the BP's LINT0. This is not
950 * strictly necessary in pure symmetric-IO mode, but sometimes
951 * we delegate interrupts to the 8259A.
952 */
953 /*
954 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
955 */
956 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
957 if (!smp_processor_id() && (pic_mode || !value)) {
958 value = APIC_DM_EXTINT;
959 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
960 smp_processor_id());
961 } else {
962 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
963 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
964 smp_processor_id());
965 }
966 apic_write_around(APIC_LVT0, value);
967
968 /*
969 * only the BP should see the LINT1 NMI signal, obviously.
970 */
971 if (!smp_processor_id())
972 value = APIC_DM_NMI;
973 else
974 value = APIC_DM_NMI | APIC_LVT_MASKED;
975 if (!integrated) /* 82489DX */
976 value |= APIC_LVT_LEVEL_TRIGGER;
977 apic_write_around(APIC_LVT1, value);
978
979 if (integrated && !esr_disable) { /* !82489DX */
980 maxlvt = lapic_get_maxlvt();
981 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
982 apic_write(APIC_ESR, 0);
983 oldvalue = apic_read(APIC_ESR);
984
985 /* enables sending errors */
986 value = ERROR_APIC_VECTOR;
987 apic_write_around(APIC_LVTERR, value);
988 /*
989 * spec says clear errors after enabling vector.
990 */
991 if (maxlvt > 3)
992 apic_write(APIC_ESR, 0);
993 value = apic_read(APIC_ESR);
994 if (value != oldvalue)
995 apic_printk(APIC_VERBOSE, "ESR value before enabling "
996 "vector: 0x%08lx after: 0x%08lx\n",
997 oldvalue, value);
998 } else {
999 if (esr_disable)
1000 /*
1001 * Something untraceable is creating bad interrupts on
1002 * secondary quads ... for the moment, just leave the
1003 * ESR disabled - we can't do anything useful with the
1004 * errors anyway - mbligh
1005 */
1006 printk(KERN_INFO "Leaving ESR disabled.\n");
1007 else
1008 printk(KERN_INFO "No ESR for 82489DX.\n");
1009 }
1010
1011 /* Disable the local apic timer */
1012 value = apic_read(APIC_LVTT);
1013 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1014 apic_write_around(APIC_LVTT, value);
1015
1016 setup_apic_nmi_watchdog(NULL);
1017 apic_pm_activate();
1018}
1019
1020/*
1021 * Detect and initialize APIC
1022 */
1023static int __init detect_init_APIC (void)
1024{
1025 u32 h, l, features;
1026
1027 /* Disabled by kernel option? */
1028 if (enable_local_apic < 0)
1029 return -1;
1030
1031 switch (boot_cpu_data.x86_vendor) {
1032 case X86_VENDOR_AMD:
1033 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
1034 (boot_cpu_data.x86 == 15))
1035 break;
1036 goto no_apic;
1037 case X86_VENDOR_INTEL:
1038 if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
1039 (boot_cpu_data.x86 == 5 && cpu_has_apic))
1040 break;
1041 goto no_apic;
1042 default:
1043 goto no_apic;
1044 }
1045
1046 if (!cpu_has_apic) {
1047 /*
1048 * Over-ride BIOS and try to enable the local APIC only if
1049 * "lapic" specified.
1050 */
1051 if (enable_local_apic <= 0) {
1052 printk(KERN_INFO "Local APIC disabled by BIOS -- "
1053 "you can enable it with \"lapic\"\n");
1054 return -1;
1055 }
1056 /*
1057 * Some BIOSes disable the local APIC in the APIC_BASE
1058 * MSR. This can only be done in software for Intel P6 or later
1059 * and AMD K7 (Model > 1) or later.
1060 */
1061 rdmsr(MSR_IA32_APICBASE, l, h);
1062 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1063 printk(KERN_INFO
1064 "Local APIC disabled by BIOS -- reenabling.\n");
1065 l &= ~MSR_IA32_APICBASE_BASE;
1066 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1067 wrmsr(MSR_IA32_APICBASE, l, h);
1068 enabled_via_apicbase = 1;
1069 }
1070 }
1071 /*
1072 * The APIC feature bit should now be enabled
1073 * in `cpuid'
1074 */
1075 features = cpuid_edx(1);
1076 if (!(features & (1 << X86_FEATURE_APIC))) {
1077 printk(KERN_WARNING "Could not enable APIC!\n");
1078 return -1;
1079 }
1080 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1081 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1082
1083 /* The BIOS may have set up the APIC at some other address */
1084 rdmsr(MSR_IA32_APICBASE, l, h);
1085 if (l & MSR_IA32_APICBASE_ENABLE)
1086 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1087
1088 if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
1089 nmi_watchdog = NMI_LOCAL_APIC;
1090
1091 printk(KERN_INFO "Found and enabled local APIC!\n");
1092
1093 apic_pm_activate();
1094
1095 return 0;
1096
1097no_apic:
1098 printk(KERN_INFO "No local APIC present or hardware disabled\n");
1099 return -1;
1100}
1101
1102/**
1103 * init_apic_mappings - initialize APIC mappings
1104 */
1105void __init init_apic_mappings(void)
1106{
1107 unsigned long apic_phys;
1108
1109 /*
1110 * If no local APIC can be found then set up a fake all
1111 * zeroes page to simulate the local APIC and another
1112 * one for the IO-APIC.
1113 */
1114 if (!smp_found_config && detect_init_APIC()) {
1115 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
1116 apic_phys = __pa(apic_phys);
1117 } else
1118 apic_phys = mp_lapic_addr;
1119
1120 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1121 printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
1122 apic_phys);
1123
1124 /*
1125 * Fetch the APIC ID of the BSP in case we have a
1126 * default configuration (or the MP table is broken).
1127 */
1128 if (boot_cpu_physical_apicid == -1U)
1129 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
1130
1131#ifdef CONFIG_X86_IO_APIC
1132 {
1133 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
1134 int i;
1135
1136 for (i = 0; i < nr_ioapics; i++) {
1137 if (smp_found_config) {
1138 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
1139 if (!ioapic_phys) {
1140 printk(KERN_ERR
1141 "WARNING: bogus zero IO-APIC "
1142 "address found in MPTABLE, "
1143 "disabling IO/APIC support!\n");
1144 smp_found_config = 0;
1145 skip_ioapic_setup = 1;
1146 goto fake_ioapic_page;
1147 }
1148 } else {
1149fake_ioapic_page:
1150 ioapic_phys = (unsigned long)
1151 alloc_bootmem_pages(PAGE_SIZE);
1152 ioapic_phys = __pa(ioapic_phys);
1153 }
1154 set_fixmap_nocache(idx, ioapic_phys);
1155 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
1156 __fix_to_virt(idx), ioapic_phys);
1157 idx++;
1158 }
1159 }
1160#endif
1161}
1162
1163/*
1164 * This initializes the IO-APIC and APIC hardware if this is
1165 * a UP kernel.
1166 */
1167int __init APIC_init_uniprocessor (void)
1168{
1169 if (enable_local_apic < 0)
1170 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1171
1172 if (!smp_found_config && !cpu_has_apic)
1173 return -1;
1174
1175 /*
1176 * Complain if the BIOS pretends there is one.
1177 */
1178 if (!cpu_has_apic &&
1179 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1180 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1181 boot_cpu_physical_apicid);
1182 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1183 return -1;
1184 }
1185
1186 verify_local_APIC();
1187
1188 connect_bsp_APIC();
1189
1190 /*
1191 * Hack: In case of kdump, after a crash, kernel might be booting
1192 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1193 * might be zero if read from MP tables. Get it from LAPIC.
1194 */
1195#ifdef CONFIG_CRASH_DUMP
1196 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
1197#endif
1198 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
1199
1200 setup_local_APIC();
1201
1202#ifdef CONFIG_X86_IO_APIC
1203 if (smp_found_config)
1204 if (!skip_ioapic_setup && nr_ioapics)
1205 setup_IO_APIC();
1206#endif
1207 setup_boot_clock();
1208
1209 return 0;
1210}
1211
1212/*
1213 * APIC command line parameters
1214 */
1215static int __init parse_lapic(char *arg)
1216{
1217 enable_local_apic = 1;
1218 return 0;
1219}
1220early_param("lapic", parse_lapic);
1221
1222static int __init parse_nolapic(char *arg)
1223{
1224 enable_local_apic = -1;
1225 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1226 return 0;
1227}
1228early_param("nolapic", parse_nolapic);
1229
1230static int __init parse_disable_lapic_timer(char *arg)
1231{
1232 local_apic_timer_disabled = 1;
1233 return 0;
1234}
1235early_param("nolapic_timer", parse_disable_lapic_timer);
1236
1237static int __init parse_lapic_timer_c2_ok(char *arg)
1238{
1239 local_apic_timer_c2_ok = 1;
1240 return 0;
1241}
1242early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1243
1244static int __init apic_set_verbosity(char *str)
1245{
1246 if (strcmp("debug", str) == 0)
1247 apic_verbosity = APIC_DEBUG;
1248 else if (strcmp("verbose", str) == 0)
1249 apic_verbosity = APIC_VERBOSE;
1250 return 1;
1251}
1252
1253__setup("apic=", apic_set_verbosity);
1254
1255
1256/*
1257 * Local APIC interrupts
1258 */
1259
1260/*
1261 * This interrupt should _never_ happen with our APIC/SMP architecture
1262 */
1263void smp_spurious_interrupt(struct pt_regs *regs)
1264{
1265 unsigned long v;
1266
1267 irq_enter();
1268 /*
1269 * Check if this really is a spurious interrupt and ACK it
1270 * if it is a vectored one. Just in case...
1271 * Spurious interrupts should not be ACKed.
1272 */
1273 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1274 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1275 ack_APIC_irq();
1276
1277 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1278 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
1279 "should never happen.\n", smp_processor_id());
1280 irq_exit();
1281}
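
/*
 * Illustration (not from the original patch): the ISR read above locates
 * the status word for the spurious vector.  Each ISR word covers 32
 * vectors and the words sit 0x10 bytes apart, so
 * (SPURIOUS_APIC_VECTOR & ~0x1f) >> 1 equals (vector / 32) * 0x10, the
 * byte offset of the word, and bit (vector & 0x1f) inside it is that
 * vector's in-service flag.
 */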
1282
1283/*
1284 * This interrupt should never happen with our APIC/SMP architecture
1285 */
1286void smp_error_interrupt(struct pt_regs *regs)
1287{
1288 unsigned long v, v1;
1289
1290 irq_enter();
1291 /* First tickle the hardware, only then report what went on. -- REW */
1292 v = apic_read(APIC_ESR);
1293 apic_write(APIC_ESR, 0);
1294 v1 = apic_read(APIC_ESR);
1295 ack_APIC_irq();
1296 atomic_inc(&irq_err_count);
1297
1298 /* Here is what the APIC error bits mean:
1299 0: Send CS error
1300 1: Receive CS error
1301 2: Send accept error
1302 3: Receive accept error
1303 4: Reserved
1304 5: Send illegal vector
1305 6: Received illegal vector
1306 7: Illegal register address
1307 */
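	/*
	 * Illustrative example (not in the original source): a printed value
	 * of 0x40 means bit 6 was set ("Received illegal vector"), while
	 * 0x04 would indicate a send accept error.
	 */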
1308 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
1309 smp_processor_id(), v , v1);
1310 irq_exit();
1311}
1312
1313/*
1314 * Initialize APIC interrupts
1315 */
1316void __init apic_intr_init(void)
1317{
1318#ifdef CONFIG_SMP
1319 smp_intr_init();
1320#endif
1321 /* self generated IPI for local APIC timer */
1322 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1323
1324 /* IPI vectors for APIC spurious and error interrupts */
1325 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1326 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1327
1328 /* thermal monitor LVT interrupt */
1329#ifdef CONFIG_X86_MCE_P4THERMAL
1330 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1331#endif
1332}
1333
1334/**
1335 * connect_bsp_APIC - attach the APIC to the interrupt system
1336 */
1337void __init connect_bsp_APIC(void)
1338{
1339 if (pic_mode) {
1340 /*
1341 * Do not trust the local APIC being empty at bootup.
1342 */
1343 clear_local_APIC();
1344 /*
1345 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1346 * local APIC to INT and NMI lines.
1347 */
1348 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1349 "enabling APIC mode.\n");
1350 outb(0x70, 0x22);
1351 outb(0x01, 0x23);
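		/*
		 * Explanatory note (added, not in the original source): the two
		 * outb()s above program the IMCR as described in the Intel MP
		 * specification: writing 0x70 to port 0x22 selects the IMCR,
		 * and writing 0x01 to port 0x23 routes INTR and NMI through
		 * the local APIC rather than directly to the BSP.
		 */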
1352 }
1353 enable_apic_mode();
1354}
1355
1356/**
1357 * disconnect_bsp_APIC - detach the APIC from the interrupt system
 1358 * @virt_wire_setup: indicates whether virtual wire mode is selected
1359 *
1360 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1361 * APIC is disabled.
1362 */
1363void disconnect_bsp_APIC(int virt_wire_setup)
1364{
1365 if (pic_mode) {
1366 /*
1367 * Put the board back into PIC mode (has an effect only on
1368 * certain older boards). Note that APIC interrupts, including
 1369 * IPIs, won't work beyond this point! The only exceptions are
1370 * INIT IPIs.
1371 */
1372 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1373 "entering PIC mode.\n");
1374 outb(0x70, 0x22);
1375 outb(0x00, 0x23);
1376 } else {
1377 /* Go back to Virtual Wire compatibility mode */
1378 unsigned long value;
1379
1380 /* For the spurious interrupt use vector F, and enable it */
1381 value = apic_read(APIC_SPIV);
1382 value &= ~APIC_VECTOR_MASK;
1383 value |= APIC_SPIV_APIC_ENABLED;
1384 value |= 0xf;
1385 apic_write_around(APIC_SPIV, value);
1386
1387 if (!virt_wire_setup) {
1388 /*
1389 * For LVT0 make it edge triggered, active high,
1390 * external and enabled
1391 */
1392 value = apic_read(APIC_LVT0);
1393 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1394 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1395 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
1396 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1397 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1398 apic_write_around(APIC_LVT0, value);
1399 } else {
1400 /* Disable LVT0 */
1401 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
1402 }
1403
1404 /*
1405 * For LVT1 make it edge triggered, active high, nmi and
1406 * enabled
1407 */
1408 value = apic_read(APIC_LVT1);
1409 value &= ~(
1410 APIC_MODE_MASK | APIC_SEND_PENDING |
1411 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1412 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1413 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1414 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1415 apic_write_around(APIC_LVT1, value);
1416 }
1417}
1418
1419/*
1420 * Power management
1421 */
1422#ifdef CONFIG_PM
1423
1424static struct {
1425 int active;
1426 /* r/w apic fields */
1427 unsigned int apic_id;
1428 unsigned int apic_taskpri;
1429 unsigned int apic_ldr;
1430 unsigned int apic_dfr;
1431 unsigned int apic_spiv;
1432 unsigned int apic_lvtt;
1433 unsigned int apic_lvtpc;
1434 unsigned int apic_lvt0;
1435 unsigned int apic_lvt1;
1436 unsigned int apic_lvterr;
1437 unsigned int apic_tmict;
1438 unsigned int apic_tdcr;
1439 unsigned int apic_thmr;
1440} apic_pm_state;
1441
1442static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1443{
1444 unsigned long flags;
1445 int maxlvt;
1446
1447 if (!apic_pm_state.active)
1448 return 0;
1449
1450 maxlvt = lapic_get_maxlvt();
1451
1452 apic_pm_state.apic_id = apic_read(APIC_ID);
1453 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1454 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1455 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
1456 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
1457 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
1458 if (maxlvt >= 4)
1459 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
1460 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
1461 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
1462 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1463 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1464 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1465#ifdef CONFIG_X86_MCE_P4THERMAL
1466 if (maxlvt >= 5)
1467 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1468#endif
1469
1470 local_irq_save(flags);
1471 disable_local_APIC();
1472 local_irq_restore(flags);
1473 return 0;
1474}
1475
1476static int lapic_resume(struct sys_device *dev)
1477{
1478 unsigned int l, h;
1479 unsigned long flags;
1480 int maxlvt;
1481
1482 if (!apic_pm_state.active)
1483 return 0;
1484
1485 maxlvt = lapic_get_maxlvt();
1486
1487 local_irq_save(flags);
1488
1489 /*
1490 * Make sure the APICBASE points to the right address
1491 *
1492 * FIXME! This will be wrong if we ever support suspend on
1493 * SMP! We'll need to do this as part of the CPU restore!
1494 */
1495 rdmsr(MSR_IA32_APICBASE, l, h);
1496 l &= ~MSR_IA32_APICBASE_BASE;
1497 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1498 wrmsr(MSR_IA32_APICBASE, l, h);
1499
1500 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1501 apic_write(APIC_ID, apic_pm_state.apic_id);
1502 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
1503 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
1504 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
1505 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1506 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1507 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1508#ifdef CONFIG_X86_MCE_P4THERMAL
1509 if (maxlvt >= 5)
1510 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1511#endif
1512 if (maxlvt >= 4)
1513 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
1514 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
1515 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
1516 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
1517 apic_write(APIC_ESR, 0);
1518 apic_read(APIC_ESR);
1519 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1520 apic_write(APIC_ESR, 0);
1521 apic_read(APIC_ESR);
1522 local_irq_restore(flags);
1523 return 0;
1524}
1525
1526/*
1527 * This device has no shutdown method - fully functioning local APICs
1528 * are needed on every CPU up until machine_halt/restart/poweroff.
1529 */
1530
1531static struct sysdev_class lapic_sysclass = {
1532 set_kset_name("lapic"),
1533 .resume = lapic_resume,
1534 .suspend = lapic_suspend,
1535};
1536
1537static struct sys_device device_lapic = {
1538 .id = 0,
1539 .cls = &lapic_sysclass,
1540};
1541
1542static void __devinit apic_pm_activate(void)
1543{
1544 apic_pm_state.active = 1;
1545}
1546
1547static int __init init_lapic_sysfs(void)
1548{
1549 int error;
1550
1551 if (!cpu_has_apic)
1552 return 0;
1553 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1554
1555 error = sysdev_class_register(&lapic_sysclass);
1556 if (!error)
1557 error = sysdev_register(&device_lapic);
1558 return error;
1559}
1560device_initcall(init_lapic_sysfs);
1561
1562#else /* CONFIG_PM */
1563
1564static void apic_pm_activate(void) { }
1565
1566#endif /* CONFIG_PM */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
new file mode 100644
index 000000000000..f02a8aca826b
--- /dev/null
+++ b/arch/x86/kernel/apm_32.c
@@ -0,0 +1,2403 @@
1/* -*- linux-c -*-
2 * APM BIOS driver for Linux
3 * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
4 *
5 * Initial development of this driver was funded by NEC Australia P/L
6 * and NEC Corporation
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the
10 * Free Software Foundation; either version 2, or (at your option) any
11 * later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * October 1995, Rik Faith (faith@cs.unc.edu):
19 * Minor enhancements and updates (to the patch set) for 1.3.x
20 * Documentation
21 * January 1996, Rik Faith (faith@cs.unc.edu):
22 * Make /proc/apm easy to format (bump driver version)
23 * March 1996, Rik Faith (faith@cs.unc.edu):
24 * Prohibit APM BIOS calls unless apm_enabled.
25 * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
26 * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
27 * Version 1.0 and 1.1
28 * May 1996, Version 1.2
29 * Feb 1998, Version 1.3
30 * Feb 1998, Version 1.4
31 * Aug 1998, Version 1.5
32 * Sep 1998, Version 1.6
33 * Nov 1998, Version 1.7
34 * Jan 1999, Version 1.8
35 * Jan 1999, Version 1.9
36 * Oct 1999, Version 1.10
37 * Nov 1999, Version 1.11
38 * Jan 2000, Version 1.12
39 * Feb 2000, Version 1.13
40 * Nov 2000, Version 1.14
41 * Oct 2001, Version 1.15
42 * Jan 2002, Version 1.16
43 * Oct 2002, Version 1.16ac
44 *
45 * History:
46 * 0.6b: first version in official kernel, Linux 1.3.46
47 * 0.7: changed /proc/apm format, Linux 1.3.58
48 * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
49 * 0.9: only call bios if bios is present, Linux 1.3.72
50 * 1.0: use fixed device number, consolidate /proc/apm into this file,
51 * Linux 1.3.85
52 * 1.1: support user-space standby and suspend, power off after system
53 * halted, Linux 1.3.98
54 * 1.2: When resetting RTC after resume, take care so that the time
55 * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth
56 * <jtoth@princeton.edu>); improve interaction between
57 * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
 58 * 1.2a: Simple change to stop mysterious bug reports with SMP; also added
59 * levels to the printk calls. APM is not defined for SMP machines.
 60 * The new replacement for it is, but Linux doesn't yet support this.
61 * Alan Cox Linux 2.1.55
62 * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
63 * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
64 * Dean Gaudet <dgaudet@arctic.org>.
65 * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call).
68 * Stephen Rothwell
 69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de>
73 * Stephen Rothwell
74 * 1.7: Modify driver's cached copy of the disabled/disengaged flags
75 * to reflect current state of APM BIOS.
76 * Chris Rankin <rankinc@bellsouth.net>
77 * Reset interrupt 0 timer to 100Hz after suspend
78 * Chad Miller <cmiller@surfsouth.com>
79 * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
80 * Richard Gooch <rgooch@atnf.csiro.au>
81 * Allow boot time disabling of APM
82 * Make boot messages far less verbose by default
83 * Make asm safer
84 * Stephen Rothwell
85 * 1.8: Add CONFIG_APM_RTC_IS_GMT
86 * Richard Gooch <rgooch@atnf.csiro.au>
87 * change APM_NOINTS to CONFIG_APM_ALLOW_INTS
88 * remove dependency on CONFIG_PROC_FS
89 * Stephen Rothwell
90 * 1.9: Fix small typo. <laslo@wodip.opole.pl>
91 * Try to cope with BIOS's that need to have all display
92 * devices blanked and not just the first one.
93 * Ross Paterson <ross@soi.city.ac.uk>
 94 * Fix segment limit setting; it has always been wrong as
95 * the segments needed to have byte granularity.
96 * Mark a few things __init.
97 * Add hack to allow power off of SMP systems by popular request.
98 * Use CONFIG_SMP instead of __SMP__
99 * Ignore BOUNCES for three seconds.
100 * Stephen Rothwell
101 * 1.10: Fix for Thinkpad return code.
102 * Merge 2.2 and 2.3 drivers.
103 * Remove APM dependencies in arch/i386/kernel/process.c
104 * Remove APM dependencies in drivers/char/sysrq.c
105 * Reset time across standby.
 106 * Allow more initialisation on SMP.
107 * Remove CONFIG_APM_POWER_OFF and make it boot time
108 * configurable (default on).
109 * Make debug only a boot time parameter (remove APM_DEBUG).
110 * Try to blank all devices on any error.
111 * 1.11: Remove APM dependencies in drivers/char/console.c
112 * Check nr_running to detect if we are idle (from
113 * Borislav Deianov <borislav@lix.polytechnique.fr>)
114 * Fix for bioses that don't zero the top part of the
115 * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
116 * (reported by Panos Katsaloulis <teras@writeme.com>).
117 * Real mode power off patch (Walter Hofmann
118 * <Walter.Hofmann@physik.stud.uni-erlangen.de>).
119 * 1.12: Remove CONFIG_SMP as the compiler will optimize
120 * the code away anyway (smp_num_cpus == 1 in UP)
121 * noted by Artur Skawina <skawina@geocities.com>.
122 * Make power off under SMP work again.
123 * Fix thinko with initial engaging of BIOS.
124 * Make sure power off only happens on CPU 0
125 * (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
126 * Do error notification to user mode if BIOS calls fail.
127 * Move entrypoint offset fix to ...boot/setup.S
128 * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
129 * Remove smp-power-off. SMP users must now specify
130 * "apm=power-off" on the kernel command line. Suggested
131 * by Jim Avera <jima@hal.com>, modified by Alan Cox
132 * <alan@lxorguk.ukuu.org.uk>.
133 * Register the /proc/apm entry even on SMP so that
134 * scripts that check for it before doing power off
135 * work (Jim Avera <jima@hal.com>).
136 * 1.13: Changes for new pm_ interfaces (Andy Henroid
137 * <andy_henroid@yahoo.com>).
138 * Modularize the code.
139 * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
140 * is now the way life works).
141 * Fix thinko in suspend() (wrong return).
142 * Notify drivers on critical suspend.
143 * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
144 * modified by sfr).
145 * Disable interrupts while we are suspended (Andy Henroid
146 * <andy_henroid@yahoo.com> fixed by sfr).
147 * Make power off work on SMP again (Tony Hoyle
148 * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
149 * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore
150 * interval is now configurable.
151 * 1.14: Make connection version persist across module unload/load.
152 * Enable and engage power management earlier.
153 * Disengage power management on module unload.
154 * Changed to use the sysrq-register hack for registering the
155 * power off function called by magic sysrq based upon discussions
156 * in irc://irc.openprojects.net/#kernelnewbies
157 * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
158 * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
159 * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
160 * Work around byte swap bug in one of the Vaio's BIOS's
161 * (Marc Boucher <marc@mbsi.ca>).
162 * Exposed the disable flag to dmi so that we can handle known
163 * broken APM (Alan Cox <alan@redhat.com>).
164 * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
165 * calling it - instead idle. (Alan Cox <alan@redhat.com>)
166 * If an APM idle fails log it and idle sensibly
167 * 1.15: Don't queue events to clients who open the device O_WRONLY.
168 * Don't expect replies from clients who open the device O_RDONLY.
169 * (Idea from Thomas Hood)
170 * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
171 * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
172 * Notify listeners of standby or suspend events before notifying
173 * drivers. Return EBUSY to ioctl() if suspend is rejected.
174 * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
175 * Ignore first resume after we generate our own resume event
176 * after a suspend (Thomas Hood)
177 * Daemonize now gets rid of our controlling terminal (sfr).
178 * CONFIG_APM_CPU_IDLE now just affects the default value of
179 * idle_threshold (sfr).
180 * Change name of kernel apm daemon (as it no longer idles) (sfr).
181 * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
182 * make _all_ APM calls on the CPU#0. Fix unsafe sign bug.
 183 * TODO: determine if it's "boot CPU" or "CPU0" we want to lock to.
184 *
185 * APM 1.1 Reference:
186 *
187 * Intel Corporation, Microsoft Corporation. Advanced Power Management
188 * (APM) BIOS Interface Specification, Revision 1.1, September 1993.
189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
190 *
191 * [This document is available free from Intel by calling 800.628.8686 (fax
192 * 916.356.6100) or 800.548.4725; or via anonymous ftp from
193 * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also
194 * available from Microsoft by calling 206.882.8080.]
195 *
196 * APM 1.2 Reference:
197 * Intel Corporation, Microsoft Corporation. Advanced Power Management
198 * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
199 *
200 * [This document is available from Microsoft at:
201 * http://www.microsoft.com/whdc/archive/amp_12.mspx]
202 */
203
204#include <linux/module.h>
205
206#include <linux/poll.h>
207#include <linux/types.h>
208#include <linux/stddef.h>
209#include <linux/timer.h>
210#include <linux/fcntl.h>
211#include <linux/slab.h>
212#include <linux/stat.h>
213#include <linux/proc_fs.h>
214#include <linux/seq_file.h>
215#include <linux/miscdevice.h>
216#include <linux/apm_bios.h>
217#include <linux/init.h>
218#include <linux/time.h>
219#include <linux/sched.h>
220#include <linux/pm.h>
221#include <linux/pm_legacy.h>
222#include <linux/capability.h>
223#include <linux/device.h>
224#include <linux/kernel.h>
225#include <linux/freezer.h>
226#include <linux/smp.h>
227#include <linux/dmi.h>
228#include <linux/suspend.h>
229#include <linux/kthread.h>
230
231#include <asm/system.h>
232#include <asm/uaccess.h>
233#include <asm/desc.h>
234#include <asm/i8253.h>
235#include <asm/paravirt.h>
236#include <asm/reboot.h>
237
238#include "io_ports.h"
239
240#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
241extern int (*console_blank_hook)(int);
242#endif
243
244/*
245 * The apm_bios device is one of the misc char devices.
246 * This is its minor number.
247 */
248#define APM_MINOR_DEV 134
249
250/*
251 * See Documentation/Config.help for the configuration options.
252 *
253 * Various options can be changed at boot time as follows:
254 * (We allow underscores for compatibility with the modules code)
255 * apm=on/off enable/disable APM
256 * [no-]allow[-_]ints allow interrupts during BIOS calls
257 * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call
258 * [no-]realmode[-_]power[-_]off switch to real mode before
259 * powering off
260 * [no-]debug log some debugging messages
261 * [no-]power[-_]off power off on shutdown
262 * [no-]smp Use apm even on an SMP box
263 * bounce[-_]interval=<n> number of ticks to ignore suspend
264 * bounces
265 * idle[-_]threshold=<n> System idle percentage above which to
266 * make APM BIOS idle calls. Set it to
267 * 100 to disable.
268 * idle[-_]period=<n> Period (in 1/100s of a second) over
269 * which the idle percentage is
270 * calculated.
271 */
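/*
 * Illustrative example (not part of the original source), assuming the
 * usual comma-separated "apm=" syntax: a boot line such as
 *
 *     apm=debug,power-off,idle-threshold=90
 *
 * would enable debug messages, power off on shutdown and raise the idle
 * threshold to 90%.
 */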
272
273/* KNOWN PROBLEM MACHINES:
274 *
275 * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
276 * [Confirmed by TI representative]
277 * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
278 * [Confirmed by BIOS disassembly]
279 * [This may work now ...]
280 * P: Toshiba 1950S: battery life information only gets updated after resume
281 * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
282 * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
283 * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
284 * Neale Banks <neale@lowendale.com.au> December 2000
285 *
286 * Legend: U = unusable with APM patches
287 * P = partially usable with APM patches
288 */
289
290/*
291 * Define as 1 to make the driver always call the APM BIOS busy
292 * routine even if the clock was not reported as slowed by the
293 * idle routine. Otherwise, define as 0.
294 */
295#define ALWAYS_CALL_BUSY 1
296
297/*
298 * Define to make the APM BIOS calls zero all data segment registers (so
299 * that an incorrect BIOS implementation will cause a kernel panic if it
300 * tries to write to arbitrary memory).
301 */
302#define APM_ZERO_SEGS
303
304#include "apm.h"
305
306/*
307 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
 308 * This was patched by Chad Miller <cmiller@surfsouth.com>, original code by
309 * David Chen <chen@ctpa04.mit.edu>
310 */
311#undef INIT_TIMER_AFTER_SUSPEND
312
313#ifdef INIT_TIMER_AFTER_SUSPEND
314#include <linux/timex.h>
315#include <asm/io.h>
316#include <linux/delay.h>
317#endif
318
319/*
320 * Need to poll the APM BIOS every second
321 */
322#define APM_CHECK_TIMEOUT (HZ)
323
324/*
325 * Ignore suspend events for this amount of time after a resume
326 */
327#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
328
329/*
330 * Maximum number of events stored
331 */
332#define APM_MAX_EVENTS 20
333
334/*
335 * The per-file APM data
336 */
337struct apm_user {
338 int magic;
339 struct apm_user * next;
340 unsigned int suser: 1;
341 unsigned int writer: 1;
342 unsigned int reader: 1;
343 unsigned int suspend_wait: 1;
344 int suspend_result;
345 int suspends_pending;
346 int standbys_pending;
347 int suspends_read;
348 int standbys_read;
349 int event_head;
350 int event_tail;
351 apm_event_t events[APM_MAX_EVENTS];
352};
353
354/*
355 * The magic number in apm_user
356 */
357#define APM_BIOS_MAGIC 0x4101
358
359/*
360 * idle percentage above which bios idle calls are done
361 */
362#ifdef CONFIG_APM_CPU_IDLE
363#define DEFAULT_IDLE_THRESHOLD 95
364#else
365#define DEFAULT_IDLE_THRESHOLD 100
366#endif
367#define DEFAULT_IDLE_PERIOD (100 / 3)
368
369/*
370 * Local variables
371 */
372static struct {
373 unsigned long offset;
374 unsigned short segment;
375} apm_bios_entry;
376static int clock_slowed;
377static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
378static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
379static int set_pm_idle;
380static int suspends_pending;
381static int standbys_pending;
382static int ignore_sys_suspend;
383static int ignore_normal_resume;
384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
385
386static int debug __read_mostly;
387static int smp __read_mostly;
388static int apm_disabled = -1;
389#ifdef CONFIG_SMP
390static int power_off;
391#else
392static int power_off = 1;
393#endif
394#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
395static int realmode_power_off = 1;
396#else
397static int realmode_power_off;
398#endif
399#ifdef CONFIG_APM_ALLOW_INTS
400static int allow_ints = 1;
401#else
402static int allow_ints;
403#endif
404static int broken_psr;
405
406static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
407static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
408static struct apm_user * user_list;
409static DEFINE_SPINLOCK(user_list_lock);
410static const struct desc_struct bad_bios_desc = { 0, 0x00409200 };
411
412static const char driver_version[] = "1.16ac"; /* no spaces */
413
414static struct task_struct *kapmd_task;
415
416/*
417 * APM event names taken from the APM 1.2 specification. These are
418 * the message codes that the BIOS uses to tell us about events
419 */
420static const char * const apm_event_name[] = {
421 "system standby",
422 "system suspend",
423 "normal resume",
424 "critical resume",
425 "low battery",
426 "power status change",
427 "update time",
428 "critical suspend",
429 "user standby",
430 "user suspend",
431 "system standby resume",
432 "capabilities change"
433};
434#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
435
436typedef struct lookup_t {
437 int key;
438 char * msg;
439} lookup_t;
440
441/*
442 * The BIOS returns a set of standard error codes in AX when the
443 * carry flag is set.
444 */
445
446static const lookup_t error_table[] = {
447/* N/A { APM_SUCCESS, "Operation succeeded" }, */
448 { APM_DISABLED, "Power management disabled" },
449 { APM_CONNECTED, "Real mode interface already connected" },
450 { APM_NOT_CONNECTED, "Interface not connected" },
451 { APM_16_CONNECTED, "16 bit interface already connected" },
452/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */
453 { APM_32_CONNECTED, "32 bit interface already connected" },
454 { APM_32_UNSUPPORTED, "32 bit interface not supported" },
455 { APM_BAD_DEVICE, "Unrecognized device ID" },
456 { APM_BAD_PARAM, "Parameter out of range" },
457 { APM_NOT_ENGAGED, "Interface not engaged" },
458 { APM_BAD_FUNCTION, "Function not supported" },
459 { APM_RESUME_DISABLED, "Resume timer disabled" },
460 { APM_BAD_STATE, "Unable to enter requested state" },
461/* N/A { APM_NO_EVENTS, "No events pending" }, */
462 { APM_NO_ERROR, "BIOS did not set a return code" },
463 { APM_NOT_PRESENT, "No APM present" }
464};
465#define ERROR_COUNT ARRAY_SIZE(error_table)
466
467/**
468 * apm_error - display an APM error
469 * @str: information string
470 * @err: APM BIOS return code
471 *
472 * Write a meaningful log entry to the kernel log in the event of
473 * an APM error.
474 */
475
476static void apm_error(char *str, int err)
477{
478 int i;
479
480 for (i = 0; i < ERROR_COUNT; i++)
481 if (error_table[i].key == err) break;
482 if (i < ERROR_COUNT)
483 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
484 else
485 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
486 str, err);
487}
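/*
 * Illustrative example (not in the original source): a failed display
 * blanking call reporting APM_NOT_ENGAGED would be logged by the function
 * above as
 *
 *     apm: set display: Interface not engaged
 */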
488
489/*
490 * Lock APM functionality to physical CPU 0
491 */
492
493#ifdef CONFIG_SMP
494
495static cpumask_t apm_save_cpus(void)
496{
497 cpumask_t x = current->cpus_allowed;
498 /* Some bioses don't like being called from CPU != 0 */
499 set_cpus_allowed(current, cpumask_of_cpu(0));
500 BUG_ON(smp_processor_id() != 0);
501 return x;
502}
503
504static inline void apm_restore_cpus(cpumask_t mask)
505{
506 set_cpus_allowed(current, mask);
507}
508
509#else
510
511/*
512 * No CPU lockdown needed on a uniprocessor
513 */
514
515#define apm_save_cpus() (current->cpus_allowed)
516#define apm_restore_cpus(x) (void)(x)
517
518#endif
519
520/*
521 * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
522 * apm_info.allow_ints, we are being really paranoid here! Not only
523 * are interrupts disabled, but all the segment registers (except SS)
 524 * are saved and zeroed; this means that if the BIOS tries to reference
 525 * any data without explicitly loading the segment registers, the kernel
 526 * will fault immediately rather than cause some unforeseen problems
527 * for the rest of the kernel. And it will be very obvious! :-) Doing
528 * this depends on CS referring to the same physical memory as DS so that
529 * DS can be zeroed before the call. Unfortunately, we can't do anything
530 * about the stack segment/pointer. Also, we tell the compiler that
531 * everything could change.
532 *
533 * Also, we KNOW that for the non error case of apm_bios_call, there
534 * is no useful data returned in the low order 8 bits of eax.
535 */
536
537static inline unsigned long __apm_irq_save(void)
538{
539 unsigned long flags;
540 local_save_flags(flags);
541 if (apm_info.allow_ints) {
542 if (irqs_disabled_flags(flags))
543 local_irq_enable();
544 } else
545 local_irq_disable();
546
547 return flags;
548}
549
550#define apm_irq_save(flags) \
551 do { flags = __apm_irq_save(); } while (0)
552
553static inline void apm_irq_restore(unsigned long flags)
554{
555 if (irqs_disabled_flags(flags))
556 local_irq_disable();
557 else if (irqs_disabled())
558 local_irq_enable();
559}
560
561#ifdef APM_ZERO_SEGS
562# define APM_DECL_SEGS \
563 unsigned int saved_fs; unsigned int saved_gs;
564# define APM_DO_SAVE_SEGS \
565 savesegment(fs, saved_fs); savesegment(gs, saved_gs)
566# define APM_DO_RESTORE_SEGS \
567 loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
568#else
569# define APM_DECL_SEGS
570# define APM_DO_SAVE_SEGS
571# define APM_DO_RESTORE_SEGS
572#endif
573
574/**
575 * apm_bios_call - Make an APM BIOS 32bit call
576 * @func: APM function to execute
577 * @ebx_in: EBX register for call entry
578 * @ecx_in: ECX register for call entry
579 * @eax: EAX register return
580 * @ebx: EBX register return
581 * @ecx: ECX register return
582 * @edx: EDX register return
583 * @esi: ESI register return
584 *
585 * Make an APM call using the 32bit protected mode interface. The
586 * caller is responsible for knowing if APM BIOS is configured and
587 * enabled. This call can disable interrupts for a long period of
588 * time on some laptops. The return value is in AH and the carry
589 * flag is loaded into AL. If there is an error, then the error
590 * code is returned in AH (bits 8-15 of eax) and this function
591 * returns non-zero.
592 */
593
594static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
595 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
596{
597 APM_DECL_SEGS
598 unsigned long flags;
599 cpumask_t cpus;
600 int cpu;
601 struct desc_struct save_desc_40;
602 struct desc_struct *gdt;
603
604 cpus = apm_save_cpus();
605
606 cpu = get_cpu();
607 gdt = get_cpu_gdt_table(cpu);
608 save_desc_40 = gdt[0x40 / 8];
609 gdt[0x40 / 8] = bad_bios_desc;
610
611 apm_irq_save(flags);
612 APM_DO_SAVE_SEGS;
613 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
614 APM_DO_RESTORE_SEGS;
615 apm_irq_restore(flags);
616 gdt[0x40 / 8] = save_desc_40;
617 put_cpu();
618 apm_restore_cpus(cpus);
619
620 return *eax & 0xff;
621}
622
623/**
624 * apm_bios_call_simple - make a simple APM BIOS 32bit call
625 * @func: APM function to invoke
626 * @ebx_in: EBX register value for BIOS call
627 * @ecx_in: ECX register value for BIOS call
628 * @eax: EAX register on return from the BIOS call
629 *
630 * Make a BIOS call that returns one value only, or just status.
631 * If there is an error, then the error code is returned in AH
632 * (bits 8-15 of eax) and this function returns non-zero. This is
633 * used for simpler BIOS operations. This call may hold interrupts
634 * off for a long time on some laptops.
635 */
636
637static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
638{
639 u8 error;
640 APM_DECL_SEGS
641 unsigned long flags;
642 cpumask_t cpus;
643 int cpu;
644 struct desc_struct save_desc_40;
645 struct desc_struct *gdt;
646
647 cpus = apm_save_cpus();
648
649 cpu = get_cpu();
650 gdt = get_cpu_gdt_table(cpu);
651 save_desc_40 = gdt[0x40 / 8];
652 gdt[0x40 / 8] = bad_bios_desc;
653
654 apm_irq_save(flags);
655 APM_DO_SAVE_SEGS;
656 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
657 APM_DO_RESTORE_SEGS;
658 apm_irq_restore(flags);
659 gdt[0x40 / 8] = save_desc_40;
660 put_cpu();
661 apm_restore_cpus(cpus);
662 return error;
663}
664
665/**
666 * apm_driver_version - APM driver version
667 * @val: loaded with the APM version on return
668 *
669 * Retrieve the APM version supported by the BIOS. This is only
670 * supported for APM 1.1 or higher. An error indicates APM 1.0 is
671 * probably present.
672 *
673 * On entry val should point to a value indicating the APM driver
674 * version with the high byte being the major and the low byte the
 675 * minor number, both in BCD.
676 *
677 * On return it will hold the BIOS revision supported in the
678 * same format.
679 */
680
681static int apm_driver_version(u_short *val)
682{
683 u32 eax;
684
685 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
686 return (eax >> 8) & 0xff;
687 *val = eax;
688 return APM_SUCCESS;
689}
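/*
 * Worked example (illustrative, not in the original source): an APM 1.2
 * aware driver passes *val in as 0x0102 (major 1, minor 2, both BCD); on
 * success it is overwritten with the BIOS-supported revision in the same
 * layout, e.g. 0x0101 for an APM 1.1 BIOS.
 */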
690
691/**
692 * apm_get_event - get an APM event from the BIOS
693 * @event: pointer to the event
694 * @info: point to the event information
695 *
 696 * The APM BIOS provides polled information for event
697 * reporting. The BIOS expects to be polled at least every second
698 * when events are pending. When a message is found the caller should
699 * poll until no more messages are present. However, this causes
700 * problems on some laptops where a suspend event notification is
701 * not cleared until it is acknowledged.
702 *
703 * Additional information is returned in the info pointer, providing
 704 * that APM 1.2 is in use. If no messages are pending the value 0x80
705 * is returned (No power management events pending).
706 */
707
708static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
709{
710 u32 eax;
711 u32 ebx;
712 u32 ecx;
713 u32 dummy;
714
715 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
716 &dummy, &dummy))
717 return (eax >> 8) & 0xff;
718 *event = ebx;
719 if (apm_info.connection_version < 0x0102)
720 *info = ~0; /* indicate info not valid */
721 else
722 *info = ecx;
723 return APM_SUCCESS;
724}
725
726/**
727 * set_power_state - set the power management state
728 * @what: which items to transition
729 * @state: state to transition to
730 *
731 * Request an APM change of state for one or more system devices. The
732 * processor state must be transitioned last of all. what holds the
733 * class of device in the upper byte and the device number (0xFF for
734 * all) for the object to be transitioned.
735 *
736 * The state holds the state to transition to, which may in fact
737 * be an acceptance of a BIOS requested state change.
738 */
739
740static int set_power_state(u_short what, u_short state)
741{
742 u32 eax;
743
744 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
745 return (eax >> 8) & 0xff;
746 return APM_SUCCESS;
747}
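/*
 * Illustrative example (not in the original source): following the
 * "0xFF for all" convention above, a what value of 0x01FF addresses all
 * display devices (class 0x01, device 0xFF); apm_console_blank() below
 * uses exactly this encoding.
 */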
748
749/**
750 * set_system_power_state - set system wide power state
751 * @state: which state to enter
752 *
753 * Transition the entire system into a new APM power state.
754 */
755
756static int set_system_power_state(u_short state)
757{
758 return set_power_state(APM_DEVICE_ALL, state);
759}
760
761/**
762 * apm_do_idle - perform power saving
763 *
764 * This function notifies the BIOS that the processor is (in the view
765 * of the OS) idle. It returns -1 in the event that the BIOS refuses
 766 * to handle the idle request. On success the function returns 1
767 * if the BIOS did clock slowing or 0 otherwise.
768 */
769
770static int apm_do_idle(void)
771{
772 u32 eax;
773 u8 ret = 0;
774 int idled = 0;
775 int polling;
776
777 polling = !!(current_thread_info()->status & TS_POLLING);
778 if (polling) {
779 current_thread_info()->status &= ~TS_POLLING;
780 /*
781 * TS_POLLING-cleared state must be visible before we
782 * test NEED_RESCHED:
783 */
784 smp_mb();
785 }
786 if (!need_resched()) {
787 idled = 1;
788 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
789 }
790 if (polling)
791 current_thread_info()->status |= TS_POLLING;
792
793 if (!idled)
794 return 0;
795
796 if (ret) {
797 static unsigned long t;
798
799 /* This always fails on some SMP boards running UP kernels.
800 * Only report the failure the first 5 times.
801 */
802 if (++t < 5)
803 {
804 printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
805 (eax >> 8) & 0xff);
806 t = jiffies;
807 }
808 return -1;
809 }
810 clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
811 return clock_slowed;
812}
813
814/**
815 * apm_do_busy - inform the BIOS the CPU is busy
816 *
817 * Request that the BIOS brings the CPU back to full performance.
818 */
819
820static void apm_do_busy(void)
821{
822 u32 dummy;
823
824 if (clock_slowed || ALWAYS_CALL_BUSY) {
825 (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
826 clock_slowed = 0;
827 }
828}
829
830/*
831 * If no process has really been interested in
832 * the CPU for some time, we want to call BIOS
833 * power management - we probably want
834 * to conserve power.
835 */
836#define IDLE_CALC_LIMIT (HZ * 100)
837#define IDLE_LEAKY_MAX 16
838
839static void (*original_pm_idle)(void) __read_mostly;
840
841/**
842 * apm_cpu_idle - cpu idling for APM capable Linux
843 *
844 * This is the idling function the kernel executes when APM is available. It
 845 * tries to do BIOS power management based on the average system idle time.
846 * Furthermore it calls the system default idle routine.
847 */
848
849static void apm_cpu_idle(void)
850{
851 static int use_apm_idle; /* = 0 */
852 static unsigned int last_jiffies; /* = 0 */
853 static unsigned int last_stime; /* = 0 */
854
855 int apm_idle_done = 0;
856 unsigned int jiffies_since_last_check = jiffies - last_jiffies;
857 unsigned int bucket;
858
859recalc:
860 if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
861 use_apm_idle = 0;
862 last_jiffies = jiffies;
863 last_stime = current->stime;
864 } else if (jiffies_since_last_check > idle_period) {
865 unsigned int idle_percentage;
866
867 idle_percentage = current->stime - last_stime;
868 idle_percentage *= 100;
869 idle_percentage /= jiffies_since_last_check;
870 use_apm_idle = (idle_percentage > idle_threshold);
871 if (apm_info.forbid_idle)
872 use_apm_idle = 0;
873 last_jiffies = jiffies;
874 last_stime = current->stime;
875 }
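	/*
	 * Worked example (illustrative, not in the original source): with
	 * the defaults defined earlier (idle_period = 33, idle_threshold =
	 * 95 when CONFIG_APM_CPU_IDLE is set), if current->stime grew by 48
	 * ticks over a 50-jiffy interval, idle_percentage = 48 * 100 / 50 =
	 * 96, which exceeds 95, so APM idle calls are used until the next
	 * recalculation.
	 */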
876
877 bucket = IDLE_LEAKY_MAX;
878
879 while (!need_resched()) {
880 if (use_apm_idle) {
881 unsigned int t;
882
883 t = jiffies;
884 switch (apm_do_idle()) {
885 case 0: apm_idle_done = 1;
886 if (t != jiffies) {
887 if (bucket) {
888 bucket = IDLE_LEAKY_MAX;
889 continue;
890 }
891 } else if (bucket) {
892 bucket--;
893 continue;
894 }
895 break;
896 case 1: apm_idle_done = 1;
897 break;
898 default: /* BIOS refused */
899 break;
900 }
901 }
902 if (original_pm_idle)
903 original_pm_idle();
904 else
905 default_idle();
906 jiffies_since_last_check = jiffies - last_jiffies;
907 if (jiffies_since_last_check > idle_period)
908 goto recalc;
909 }
910
911 if (apm_idle_done)
912 apm_do_busy();
913}
914
915/**
916 * apm_power_off - ask the BIOS to power off
917 *
918 * Handle the power off sequence. This is the one piece of code we
919 * will execute even on SMP machines. In order to deal with BIOS
920 * bugs we support real mode APM BIOS power off calls. We also make
921 * the SMP call on CPU0 as some systems will only honour this call
922 * on their first cpu.
923 */
924
925static void apm_power_off(void)
926{
927 unsigned char po_bios_call[] = {
928 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
929 0x8e, 0xd0, /* movw ax,ss */
930 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
931 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
932 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
933 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
934 0xcd, 0x15 /* int $0x15 */
935 };
936
937 /* Some bioses don't like being called from CPU != 0 */
938 if (apm_info.realmode_power_off)
939 {
940 (void)apm_save_cpus();
941 machine_real_restart(po_bios_call, sizeof(po_bios_call));
942 }
943 else
944 (void) set_system_power_state(APM_STATE_OFF);
945}
946
947#ifdef CONFIG_APM_DO_ENABLE
948
949/**
950 * apm_enable_power_management - enable BIOS APM power management
951 * @enable: enable yes/no
952 *
953 * Enable or disable the APM BIOS power services.
954 */
955
956static int apm_enable_power_management(int enable)
957{
958 u32 eax;
959
960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
961 return APM_NOT_ENGAGED;
962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
963 enable, &eax))
964 return (eax >> 8) & 0xff;
965 if (enable)
966 apm_info.bios.flags &= ~APM_BIOS_DISABLED;
967 else
968 apm_info.bios.flags |= APM_BIOS_DISABLED;
969 return APM_SUCCESS;
970}
971#endif
972
973/**
974 * apm_get_power_status - get current power state
975 * @status: returned status
976 * @bat: battery info
977 * @life: estimated life
978 *
979 * Obtain the current power status from the APM BIOS. We return a
980 * status which gives the rough battery status, and current power
 981 * source. The bat value returned gives an estimate as a percentage
 982 * of life and a status value for the battery. The estimated life,
 983 * if reported, is a lifetime in seconds/minutes at current power
984 * consumption.
985 */
986
987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
988{
989 u32 eax;
990 u32 ebx;
991 u32 ecx;
992 u32 edx;
993 u32 dummy;
994
995 if (apm_info.get_power_status_broken)
996 return APM_32_UNSUPPORTED;
997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
998 &eax, &ebx, &ecx, &edx, &dummy))
999 return (eax >> 8) & 0xff;
1000 *status = ebx;
1001 *bat = ecx;
1002 if (apm_info.get_power_status_swabinminutes) {
1003 *life = swab16((u16)edx);
1004 *life |= 0x8000;
1005 } else
1006 *life = edx;
1007 return APM_SUCCESS;
1008}
1009
1010#if 0
1011static int apm_get_battery_status(u_short which, u_short *status,
1012 u_short *bat, u_short *life, u_short *nbat)
1013{
1014 u32 eax;
1015 u32 ebx;
1016 u32 ecx;
1017 u32 edx;
1018 u32 esi;
1019
1020 if (apm_info.connection_version < 0x0102) {
1021 /* pretend we only have one battery. */
1022 if (which != 1)
1023 return APM_BAD_DEVICE;
1024 *nbat = 1;
1025 return apm_get_power_status(status, bat, life);
1026 }
1027
1028 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
1029 &ebx, &ecx, &edx, &esi))
1030 return (eax >> 8) & 0xff;
1031 *status = ebx;
1032 *bat = ecx;
1033 *life = edx;
1034 *nbat = esi;
1035 return APM_SUCCESS;
1036}
1037#endif
1038
1039/**
1040 * apm_engage_power_management - enable PM on a device
1041 * @device: identity of device
1042 * @enable: on/off
1043 *
 1044 * Activate or deactivate power management on either a specific device
1045 * or the entire system (%APM_DEVICE_ALL).
1046 */
1047
1048static int apm_engage_power_management(u_short device, int enable)
1049{
1050 u32 eax;
1051
1052 if ((enable == 0) && (device == APM_DEVICE_ALL)
1053 && (apm_info.bios.flags & APM_BIOS_DISABLED))
1054 return APM_DISABLED;
1055 if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax))
1056 return (eax >> 8) & 0xff;
1057 if (device == APM_DEVICE_ALL) {
1058 if (enable)
1059 apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
1060 else
1061 apm_info.bios.flags |= APM_BIOS_DISENGAGED;
1062 }
1063 return APM_SUCCESS;
1064}
1065
1066#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1067
1068/**
1069 * apm_console_blank - blank the display
1070 * @blank: on/off
1071 *
1072 * Attempt to blank the console, firstly by blanking just video device
1073 * zero, and if that fails (some BIOSes don't support it) then it blanks
1074 * all video devices. Typically the BIOS will do laptop backlight and
1075 * monitor powerdown for us.
1076 */
1077
1078static int apm_console_blank(int blank)
1079{
1080 int error = APM_NOT_ENGAGED; /* silence gcc */
1081 int i;
1082 u_short state;
1083 static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
1084
1085 state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
1086
1087 for (i = 0; i < ARRAY_SIZE(dev); i++) {
1088 error = set_power_state(dev[i], state);
1089
1090 if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
1091 return 1;
1092
1093 if (error == APM_NOT_ENGAGED)
1094 break;
1095 }
1096
1097 if (error == APM_NOT_ENGAGED) {
1098 static int tried;
1099 int eng_error;
1100 if (tried++ == 0) {
1101 eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
1102 if (eng_error) {
1103 apm_error("set display", error);
1104 apm_error("engage interface", eng_error);
1105 return 0;
1106 } else
1107 return apm_console_blank(blank);
1108 }
1109 }
1110 apm_error("set display", error);
1111 return 0;
1112}
1113#endif
1114
1115static int queue_empty(struct apm_user *as)
1116{
1117 return as->event_head == as->event_tail;
1118}
1119
1120static apm_event_t get_queued_event(struct apm_user *as)
1121{
1122 if (++as->event_tail >= APM_MAX_EVENTS)
1123 as->event_tail = 0;
1124 return as->events[as->event_tail];
1125}
1126
1127static void queue_event(apm_event_t event, struct apm_user *sender)
1128{
1129 struct apm_user * as;
1130
1131 spin_lock(&user_list_lock);
1132 if (user_list == NULL)
1133 goto out;
1134 for (as = user_list; as != NULL; as = as->next) {
1135 if ((as == sender) || (!as->reader))
1136 continue;
1137 if (++as->event_head >= APM_MAX_EVENTS)
1138 as->event_head = 0;
1139
1140 if (as->event_head == as->event_tail) {
1141 static int notified;
1142
1143 if (notified++ == 0)
1144 printk(KERN_ERR "apm: an event queue overflowed\n");
1145 if (++as->event_tail >= APM_MAX_EVENTS)
1146 as->event_tail = 0;
1147 }
1148 as->events[as->event_head] = event;
1149 if ((!as->suser) || (!as->writer))
1150 continue;
1151 switch (event) {
1152 case APM_SYS_SUSPEND:
1153 case APM_USER_SUSPEND:
1154 as->suspends_pending++;
1155 suspends_pending++;
1156 break;
1157
1158 case APM_SYS_STANDBY:
1159 case APM_USER_STANDBY:
1160 as->standbys_pending++;
1161 standbys_pending++;
1162 break;
1163 }
1164 }
1165 wake_up_interruptible(&apm_waitqueue);
1166out:
1167 spin_unlock(&user_list_lock);
1168}
1169
1170static void reinit_timer(void)
1171{
1172#ifdef INIT_TIMER_AFTER_SUSPEND
1173 unsigned long flags;
1174
1175 spin_lock_irqsave(&i8253_lock, flags);
1176 /* set the clock to HZ */
1177 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1178 udelay(10);
1179 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
1180 udelay(10);
1181 outb(LATCH >> 8, PIT_CH0); /* MSB */
1182 udelay(10);
1183 spin_unlock_irqrestore(&i8253_lock, flags);
1184#endif
1185}
1186
1187static int suspend(int vetoable)
1188{
1189 int err;
1190 struct apm_user *as;
1191
1192 if (pm_send_all(PM_SUSPEND, (void *)3)) {
1193 /* Vetoed */
1194 if (vetoable) {
1195 if (apm_info.connection_version > 0x100)
1196 set_system_power_state(APM_STATE_REJECT);
1197 err = -EBUSY;
1198 ignore_sys_suspend = 0;
1199 printk(KERN_WARNING "apm: suspend was vetoed.\n");
1200 goto out;
1201 }
1202 printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
1203 }
1204
1205 device_suspend(PMSG_SUSPEND);
1206 local_irq_disable();
1207 device_power_down(PMSG_SUSPEND);
1208
1209 local_irq_enable();
1210
1211 save_processor_state();
1212 err = set_system_power_state(APM_STATE_SUSPEND);
1213 ignore_normal_resume = 1;
1214 restore_processor_state();
1215
1216 local_irq_disable();
1217 reinit_timer();
1218
1219 if (err == APM_NO_ERROR)
1220 err = APM_SUCCESS;
1221 if (err != APM_SUCCESS)
1222 apm_error("suspend", err);
1223 err = (err == APM_SUCCESS) ? 0 : -EIO;
1224 device_power_up();
1225 local_irq_enable();
1226 device_resume();
1227 pm_send_all(PM_RESUME, (void *)0);
1228 queue_event(APM_NORMAL_RESUME, NULL);
1229 out:
1230 spin_lock(&user_list_lock);
1231 for (as = user_list; as != NULL; as = as->next) {
1232 as->suspend_wait = 0;
1233 as->suspend_result = err;
1234 }
1235 spin_unlock(&user_list_lock);
1236 wake_up_interruptible(&apm_suspend_waitqueue);
1237 return err;
1238}
1239
1240static void standby(void)
1241{
1242 int err;
1243
1244 local_irq_disable();
1245 device_power_down(PMSG_SUSPEND);
1246 local_irq_enable();
1247
1248 err = set_system_power_state(APM_STATE_STANDBY);
1249 if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
1250 apm_error("standby", err);
1251
1252 local_irq_disable();
1253 device_power_up();
1254 local_irq_enable();
1255}
1256
1257static apm_event_t get_event(void)
1258{
1259 int error;
1260 apm_event_t event = APM_NO_EVENTS; /* silence gcc */
1261 apm_eventinfo_t info;
1262
1263 static int notified;
1264
1265 /* we don't use the eventinfo */
1266 error = apm_get_event(&event, &info);
1267 if (error == APM_SUCCESS)
1268 return event;
1269
1270 if ((error != APM_NO_EVENTS) && (notified++ == 0))
1271 apm_error("get_event", error);
1272
1273 return 0;
1274}
1275
1276static void check_events(void)
1277{
1278 apm_event_t event;
1279 static unsigned long last_resume;
1280 static int ignore_bounce;
1281
1282 while ((event = get_event()) != 0) {
1283 if (debug) {
1284 if (event <= NR_APM_EVENT_NAME)
1285 printk(KERN_DEBUG "apm: received %s notify\n",
1286 apm_event_name[event - 1]);
1287 else
1288 printk(KERN_DEBUG "apm: received unknown "
1289 "event 0x%02x\n", event);
1290 }
1291 if (ignore_bounce
1292 && ((jiffies - last_resume) > bounce_interval))
1293 ignore_bounce = 0;
1294
1295 switch (event) {
1296 case APM_SYS_STANDBY:
1297 case APM_USER_STANDBY:
1298 queue_event(event, NULL);
1299 if (standbys_pending <= 0)
1300 standby();
1301 break;
1302
1303 case APM_USER_SUSPEND:
1304#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
1305 if (apm_info.connection_version > 0x100)
1306 set_system_power_state(APM_STATE_REJECT);
1307 break;
1308#endif
1309 case APM_SYS_SUSPEND:
1310 if (ignore_bounce) {
1311 if (apm_info.connection_version > 0x100)
1312 set_system_power_state(APM_STATE_REJECT);
1313 break;
1314 }
1315 /*
1316 * If we are already processing a SUSPEND,
1317 * then further SUSPEND events from the BIOS
1318 * will be ignored. We also return here to
1319 * cope with the fact that the Thinkpads keep
1320 * sending a SUSPEND event until something else
1321 * happens!
1322 */
1323 if (ignore_sys_suspend)
1324 return;
1325 ignore_sys_suspend = 1;
1326 queue_event(event, NULL);
1327 if (suspends_pending <= 0)
1328 (void) suspend(1);
1329 break;
1330
1331 case APM_NORMAL_RESUME:
1332 case APM_CRITICAL_RESUME:
1333 case APM_STANDBY_RESUME:
1334 ignore_sys_suspend = 0;
1335 last_resume = jiffies;
1336 ignore_bounce = 1;
1337 if ((event != APM_NORMAL_RESUME)
1338 || (ignore_normal_resume == 0)) {
1339 device_resume();
1340 pm_send_all(PM_RESUME, (void *)0);
1341 queue_event(event, NULL);
1342 }
1343 ignore_normal_resume = 0;
1344 break;
1345
1346 case APM_CAPABILITY_CHANGE:
1347 case APM_LOW_BATTERY:
1348 case APM_POWER_STATUS_CHANGE:
1349 queue_event(event, NULL);
1350 /* If needed, notify drivers here */
1351 break;
1352
1353 case APM_UPDATE_TIME:
1354 break;
1355
1356 case APM_CRITICAL_SUSPEND:
1357 /*
1358 * We are not allowed to reject a critical suspend.
1359 */
1360 (void) suspend(0);
1361 break;
1362 }
1363 }
1364}
1365
1366static void apm_event_handler(void)
1367{
1368 static int pending_count = 4;
1369 int err;
1370
1371 if ((standbys_pending > 0) || (suspends_pending > 0)) {
1372 if ((apm_info.connection_version > 0x100) &&
1373 (pending_count-- <= 0)) {
1374 pending_count = 4;
1375 if (debug)
1376 printk(KERN_DEBUG "apm: setting state busy\n");
1377 err = set_system_power_state(APM_STATE_BUSY);
1378 if (err)
1379 apm_error("busy", err);
1380 }
1381 } else
1382 pending_count = 4;
1383 check_events();
1384}
1385
1386/*
1387 * This is the APM thread main loop.
1388 */
1389
1390static void apm_mainloop(void)
1391{
1392 DECLARE_WAITQUEUE(wait, current);
1393
1394 add_wait_queue(&apm_waitqueue, &wait);
1395 set_current_state(TASK_INTERRUPTIBLE);
1396 for (;;) {
1397 schedule_timeout(APM_CHECK_TIMEOUT);
1398 if (kthread_should_stop())
1399 break;
1400 /*
1401 * Ok, check all events, check for idle (and mark us sleeping
1402 * so as not to count towards the load average)..
1403 */
1404 set_current_state(TASK_INTERRUPTIBLE);
1405 apm_event_handler();
1406 }
1407 remove_wait_queue(&apm_waitqueue, &wait);
1408}
1409
1410static int check_apm_user(struct apm_user *as, const char *func)
1411{
1412 if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
1413 printk(KERN_ERR "apm: %s passed bad filp\n", func);
1414 return 1;
1415 }
1416 return 0;
1417}
1418
1419static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
1420{
1421 struct apm_user * as;
1422 int i;
1423 apm_event_t event;
1424
1425 as = fp->private_data;
1426 if (check_apm_user(as, "read"))
1427 return -EIO;
1428 if ((int)count < sizeof(apm_event_t))
1429 return -EINVAL;
1430 if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
1431 return -EAGAIN;
1432 wait_event_interruptible(apm_waitqueue, !queue_empty(as));
1433 i = count;
1434 while ((i >= sizeof(event)) && !queue_empty(as)) {
1435 event = get_queued_event(as);
1436 if (copy_to_user(buf, &event, sizeof(event))) {
1437 if (i < count)
1438 break;
1439 return -EFAULT;
1440 }
1441 switch (event) {
1442 case APM_SYS_SUSPEND:
1443 case APM_USER_SUSPEND:
1444 as->suspends_read++;
1445 break;
1446
1447 case APM_SYS_STANDBY:
1448 case APM_USER_STANDBY:
1449 as->standbys_read++;
1450 break;
1451 }
1452 buf += sizeof(event);
1453 i -= sizeof(event);
1454 }
1455 if (i < count)
1456 return count - i;
1457 if (signal_pending(current))
1458 return -ERESTARTSYS;
1459 return 0;
1460}
1461
1462static unsigned int do_poll(struct file *fp, poll_table * wait)
1463{
1464 struct apm_user * as;
1465
1466 as = fp->private_data;
1467 if (check_apm_user(as, "poll"))
1468 return 0;
1469 poll_wait(fp, &apm_waitqueue, wait);
1470 if (!queue_empty(as))
1471 return POLLIN | POLLRDNORM;
1472 return 0;
1473}
1474
1475static int do_ioctl(struct inode * inode, struct file *filp,
1476 u_int cmd, u_long arg)
1477{
1478 struct apm_user * as;
1479
1480 as = filp->private_data;
1481 if (check_apm_user(as, "ioctl"))
1482 return -EIO;
1483 if ((!as->suser) || (!as->writer))
1484 return -EPERM;
1485 switch (cmd) {
1486 case APM_IOC_STANDBY:
1487 if (as->standbys_read > 0) {
1488 as->standbys_read--;
1489 as->standbys_pending--;
1490 standbys_pending--;
1491 } else
1492 queue_event(APM_USER_STANDBY, as);
1493 if (standbys_pending <= 0)
1494 standby();
1495 break;
1496 case APM_IOC_SUSPEND:
1497 if (as->suspends_read > 0) {
1498 as->suspends_read--;
1499 as->suspends_pending--;
1500 suspends_pending--;
1501 } else
1502 queue_event(APM_USER_SUSPEND, as);
1503 if (suspends_pending <= 0) {
1504 return suspend(1);
1505 } else {
1506 as->suspend_wait = 1;
1507 wait_event_interruptible(apm_suspend_waitqueue,
1508 as->suspend_wait == 0);
1509 return as->suspend_result;
1510 }
1511 break;
1512 default:
1513 return -EINVAL;
1514 }
1515 return 0;
1516}
1517
1518static int do_release(struct inode * inode, struct file * filp)
1519{
1520 struct apm_user * as;
1521
1522 as = filp->private_data;
1523 if (check_apm_user(as, "release"))
1524 return 0;
1525 filp->private_data = NULL;
1526 if (as->standbys_pending > 0) {
1527 standbys_pending -= as->standbys_pending;
1528 if (standbys_pending <= 0)
1529 standby();
1530 }
1531 if (as->suspends_pending > 0) {
1532 suspends_pending -= as->suspends_pending;
1533 if (suspends_pending <= 0)
1534 (void) suspend(1);
1535 }
1536 spin_lock(&user_list_lock);
1537 if (user_list == as)
1538 user_list = as->next;
1539 else {
1540 struct apm_user * as1;
1541
1542 for (as1 = user_list;
1543 (as1 != NULL) && (as1->next != as);
1544 as1 = as1->next)
1545 ;
1546 if (as1 == NULL)
1547 printk(KERN_ERR "apm: filp not in user list\n");
1548 else
1549 as1->next = as->next;
1550 }
1551 spin_unlock(&user_list_lock);
1552 kfree(as);
1553 return 0;
1554}
1555
1556static int do_open(struct inode * inode, struct file * filp)
1557{
1558 struct apm_user * as;
1559
1560 as = kmalloc(sizeof(*as), GFP_KERNEL);
1561 if (as == NULL) {
1562 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1563 sizeof(*as));
1564 return -ENOMEM;
1565 }
1566 as->magic = APM_BIOS_MAGIC;
1567 as->event_tail = as->event_head = 0;
1568 as->suspends_pending = as->standbys_pending = 0;
1569 as->suspends_read = as->standbys_read = 0;
1570 /*
1571 * XXX - this is a tiny bit broken, when we consider BSD
1572 * process accounting. If the device is opened by root, we
1573 * instantly flag that we used superuser privs. Who knows,
1574 * we might close the device immediately without doing a
1575 * privileged operation -- cevans
1576 */
1577 as->suser = capable(CAP_SYS_ADMIN);
1578 as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
1579 as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;
1580 spin_lock(&user_list_lock);
1581 as->next = user_list;
1582 user_list = as;
1583 spin_unlock(&user_list_lock);
1584 filp->private_data = as;
1585 return 0;
1586}
1587
1588static int proc_apm_show(struct seq_file *m, void *v)
1589{
1590 unsigned short bx;
1591 unsigned short cx;
1592 unsigned short dx;
1593 int error;
1594 unsigned short ac_line_status = 0xff;
1595 unsigned short battery_status = 0xff;
1596 unsigned short battery_flag = 0xff;
1597 int percentage = -1;
1598 int time_units = -1;
1599 char *units = "?";
1600
1601 if ((num_online_cpus() == 1) &&
1602 !(error = apm_get_power_status(&bx, &cx, &dx))) {
1603 ac_line_status = (bx >> 8) & 0xff;
1604 battery_status = bx & 0xff;
1605 if ((cx & 0xff) != 0xff)
1606 percentage = cx & 0xff;
1607
1608 if (apm_info.connection_version > 0x100) {
1609 battery_flag = (cx >> 8) & 0xff;
1610 if (dx != 0xffff) {
1611 units = (dx & 0x8000) ? "min" : "sec";
1612 time_units = dx & 0x7fff;
1613 }
1614 }
1615 }
1616 /* Arguments, with symbols from linux/apm_bios.h. Information is
1617 from the Get Power Status (0x0a) call unless otherwise noted.
1618
1619 0) Linux driver version (this will change if format changes)
1620 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2.
1621 2) APM flags from APM Installation Check (0x00):
1622 bit 0: APM_16_BIT_SUPPORT
1623 bit 1: APM_32_BIT_SUPPORT
1624 bit 2: APM_IDLE_SLOWS_CLOCK
1625 bit 3: APM_BIOS_DISABLED
1626 bit 4: APM_BIOS_DISENGAGED
1627 3) AC line status
1628 0x00: Off-line
1629 0x01: On-line
1630 0x02: On backup power (BIOS >= 1.1 only)
1631 0xff: Unknown
1632 4) Battery status
1633 0x00: High
1634 0x01: Low
1635 0x02: Critical
1636 0x03: Charging
1637 0x04: Selected battery not present (BIOS >= 1.2 only)
1638 0xff: Unknown
1639 5) Battery flag
1640 bit 0: High
1641 bit 1: Low
1642 bit 2: Critical
1643 bit 3: Charging
1644 bit 7: No system battery
1645 0xff: Unknown
1646 6) Remaining battery life (percentage of charge):
1647 0-100: valid
1648 -1: Unknown
1649 7) Remaining battery life (time units):
1650 Number of remaining minutes or seconds
1651 -1: Unknown
1652 8) min = minutes; sec = seconds */
1653
1654 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
1655 driver_version,
1656 (apm_info.bios.version >> 8) & 0xff,
1657 apm_info.bios.version & 0xff,
1658 apm_info.bios.flags,
1659 ac_line_status,
1660 battery_status,
1661 battery_flag,
1662 percentage,
1663 time_units,
1664 units);
1665 return 0;
1666}
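
Editor's note: the numbered list in the comment above fixes the field order that proc_apm_show() prints. As a purely illustrative sketch (assuming this driver is loaded so /proc/apm exists; not part of this patch), the line can be read back from userspace like this:

#include <stdio.h>

/* Minimal sketch: parse the fields documented above from /proc/apm,
 * in the order emitted by proc_apm_show().
 */
int main(void)
{
	char drv[16], units[8];
	int bios_major, bios_minor, percentage, time_units;
	unsigned int flags, ac, bat_status, bat_flag;
	FILE *f = fopen("/proc/apm", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%15s %d.%d %x %x %x %x %d%% %d %7s",
		   drv, &bios_major, &bios_minor, &flags, &ac,
		   &bat_status, &bat_flag, &percentage, &time_units,
		   units) == 10)
		printf("APM BIOS %d.%d, AC 0x%02x, battery 0x%02x, charge %d%%\n",
		       bios_major, bios_minor, ac, bat_status, percentage);
	fclose(f);
	return 0;
}
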
1667
1668static int proc_apm_open(struct inode *inode, struct file *file)
1669{
1670 return single_open(file, proc_apm_show, NULL);
1671}
1672
1673static const struct file_operations apm_file_ops = {
1674 .owner = THIS_MODULE,
1675 .open = proc_apm_open,
1676 .read = seq_read,
1677 .llseek = seq_lseek,
1678 .release = single_release,
1679};
1680
1681static int apm(void *unused)
1682{
1683 unsigned short bx;
1684 unsigned short cx;
1685 unsigned short dx;
1686 int error;
1687 char * power_stat;
1688 char * bat_stat;
1689
1690#ifdef CONFIG_SMP
1691 /* 2002/08/01 - WT
1692 * This is to avoid random crashes at boot time during initialization
1693 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
1694 * Some bioses don't like being called from CPU != 0.
1695 * Method suggested by Ingo Molnar.
1696 */
1697 set_cpus_allowed(current, cpumask_of_cpu(0));
1698 BUG_ON(smp_processor_id() != 0);
1699#endif
1700
1701 if (apm_info.connection_version == 0) {
1702 apm_info.connection_version = apm_info.bios.version;
1703 if (apm_info.connection_version > 0x100) {
1704 /*
1705 * We only support BIOSs up to version 1.2
1706 */
1707 if (apm_info.connection_version > 0x0102)
1708 apm_info.connection_version = 0x0102;
1709 error = apm_driver_version(&apm_info.connection_version);
1710 if (error != APM_SUCCESS) {
1711 apm_error("driver version", error);
1712 /* Fall back to an APM 1.0 connection. */
1713 apm_info.connection_version = 0x100;
1714 }
1715 }
1716 }
1717
1718 if (debug)
1719 printk(KERN_INFO "apm: Connection version %d.%d\n",
1720 (apm_info.connection_version >> 8) & 0xff,
1721 apm_info.connection_version & 0xff);
1722
1723#ifdef CONFIG_APM_DO_ENABLE
1724 if (apm_info.bios.flags & APM_BIOS_DISABLED) {
1725 /*
1726 * This call causes my NEC UltraLite Versa 33/C to hang if it
1727 * is booted with PM disabled but not in the docking station.
1728 * Unfortunate ...
1729 */
1730 error = apm_enable_power_management(1);
1731 if (error) {
1732 apm_error("enable power management", error);
1733 return -1;
1734 }
1735 }
1736#endif
1737
1738 if ((apm_info.bios.flags & APM_BIOS_DISENGAGED)
1739 && (apm_info.connection_version > 0x0100)) {
1740 error = apm_engage_power_management(APM_DEVICE_ALL, 1);
1741 if (error) {
1742 apm_error("engage power management", error);
1743 return -1;
1744 }
1745 }
1746
1747	if (debug && (num_online_cpus() == 1 || smp)) {
1748 error = apm_get_power_status(&bx, &cx, &dx);
1749 if (error)
1750 printk(KERN_INFO "apm: power status not available\n");
1751 else {
1752 switch ((bx >> 8) & 0xff) {
1753 case 0: power_stat = "off line"; break;
1754 case 1: power_stat = "on line"; break;
1755 case 2: power_stat = "on backup power"; break;
1756 default: power_stat = "unknown"; break;
1757 }
1758 switch (bx & 0xff) {
1759 case 0: bat_stat = "high"; break;
1760 case 1: bat_stat = "low"; break;
1761 case 2: bat_stat = "critical"; break;
1762 case 3: bat_stat = "charging"; break;
1763 default: bat_stat = "unknown"; break;
1764 }
1765 printk(KERN_INFO
1766 "apm: AC %s, battery status %s, battery life ",
1767 power_stat, bat_stat);
1768 if ((cx & 0xff) == 0xff)
1769 printk("unknown\n");
1770 else
1771 printk("%d%%\n", cx & 0xff);
1772 if (apm_info.connection_version > 0x100) {
1773 printk(KERN_INFO
1774 "apm: battery flag 0x%02x, battery life ",
1775 (cx >> 8) & 0xff);
1776 if (dx == 0xffff)
1777 printk("unknown\n");
1778 else
1779 printk("%d %s\n", dx & 0x7fff,
1780 (dx & 0x8000) ?
1781 "minutes" : "seconds");
1782 }
1783 }
1784 }
1785
1786 /* Install our power off handler.. */
1787 if (power_off)
1788 pm_power_off = apm_power_off;
1789
1790 if (num_online_cpus() == 1 || smp) {
1791#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1792 console_blank_hook = apm_console_blank;
1793#endif
1794 apm_mainloop();
1795#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1796 console_blank_hook = NULL;
1797#endif
1798 }
1799
1800 return 0;
1801}
1802
1803#ifndef MODULE
1804static int __init apm_setup(char *str)
1805{
1806 int invert;
1807
1808 while ((str != NULL) && (*str != '\0')) {
1809 if (strncmp(str, "off", 3) == 0)
1810 apm_disabled = 1;
1811 if (strncmp(str, "on", 2) == 0)
1812 apm_disabled = 0;
1813 if ((strncmp(str, "bounce-interval=", 16) == 0) ||
1814 (strncmp(str, "bounce_interval=", 16) == 0))
1815 bounce_interval = simple_strtol(str + 16, NULL, 0);
1816 if ((strncmp(str, "idle-threshold=", 15) == 0) ||
1817 (strncmp(str, "idle_threshold=", 15) == 0))
1818 idle_threshold = simple_strtol(str + 15, NULL, 0);
1819 if ((strncmp(str, "idle-period=", 12) == 0) ||
1820 (strncmp(str, "idle_period=", 12) == 0))
1821 idle_period = simple_strtol(str + 12, NULL, 0);
1822 invert = (strncmp(str, "no-", 3) == 0) ||
1823 (strncmp(str, "no_", 3) == 0);
1824 if (invert)
1825 str += 3;
1826 if (strncmp(str, "debug", 5) == 0)
1827 debug = !invert;
1828 if ((strncmp(str, "power-off", 9) == 0) ||
1829 (strncmp(str, "power_off", 9) == 0))
1830 power_off = !invert;
1831 if (strncmp(str, "smp", 3) == 0)
1832 {
1833 smp = !invert;
1834 idle_threshold = 100;
1835 }
1836 if ((strncmp(str, "allow-ints", 10) == 0) ||
1837 (strncmp(str, "allow_ints", 10) == 0))
1838 apm_info.allow_ints = !invert;
1839 if ((strncmp(str, "broken-psr", 10) == 0) ||
1840 (strncmp(str, "broken_psr", 10) == 0))
1841 apm_info.get_power_status_broken = !invert;
1842 if ((strncmp(str, "realmode-power-off", 18) == 0) ||
1843 (strncmp(str, "realmode_power_off", 18) == 0))
1844 apm_info.realmode_power_off = !invert;
1845 str = strchr(str, ',');
1846 if (str != NULL)
1847 str += strspn(str, ", \t");
1848 }
1849 return 1;
1850}
1851
1852__setup("apm=", apm_setup);
1853#endif
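
Editor's note: apm_setup() above parses a comma-separated list on the kernel command line, e.g. apm=debug,idle-threshold=90,no-power-off, where a "no-"/"no_" prefix inverts a boolean option. A hypothetical standalone sketch of the same tokenize-and-invert scheme (option names trimmed down for brevity; not part of this patch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Walk a comma-separated option string the way apm_setup() does:
 * an optional "no-"/"no_" prefix inverts boolean options, and
 * "name=value" options take a numeric argument.
 */
static void parse_apm_options(char *str)
{
	int debug = 0, power_off = 0, idle_threshold = 95;

	while (str && *str) {
		int invert = (strncmp(str, "no-", 3) == 0) ||
			     (strncmp(str, "no_", 3) == 0);

		if (invert)
			str += 3;
		if (strncmp(str, "debug", 5) == 0)
			debug = !invert;
		if (strncmp(str, "power-off", 9) == 0)
			power_off = !invert;
		if (strncmp(str, "idle-threshold=", 15) == 0)
			idle_threshold = strtol(str + 15, NULL, 0);
		str = strchr(str, ',');
		if (str)
			str += strspn(str, ", \t");
	}
	printf("debug=%d power_off=%d idle_threshold=%d\n",
	       debug, power_off, idle_threshold);
}

int main(void)
{
	char opts[] = "debug,idle-threshold=90,no-power-off";

	parse_apm_options(opts);
	return 0;
}
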
1854
1855static const struct file_operations apm_bios_fops = {
1856 .owner = THIS_MODULE,
1857 .read = do_read,
1858 .poll = do_poll,
1859 .ioctl = do_ioctl,
1860 .open = do_open,
1861 .release = do_release,
1862};
1863
1864static struct miscdevice apm_device = {
1865 APM_MINOR_DEV,
1866 "apm_bios",
1867 &apm_bios_fops
1868};
1869
1870
1871/* Simple "print if true" callback */
1872static int __init print_if_true(struct dmi_system_id *d)
1873{
1874 printk("%s\n", d->ident);
1875 return 0;
1876}
1877
1878/*
1879 * Some BIOSes enable the PS/2 mouse (touchpad) at resume, even if it was
1880 * disabled before the suspend. Linux used to get terribly confused by that.
1881 */
1882static int __init broken_ps2_resume(struct dmi_system_id *d)
1883{
1884 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident);
1885 return 0;
1886}
1887
1888/* Some bioses have a broken protected mode poweroff and need to use realmode */
1889static int __init set_realmode_power_off(struct dmi_system_id *d)
1890{
1891 if (apm_info.realmode_power_off == 0) {
1892 apm_info.realmode_power_off = 1;
1893 printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident);
1894 }
1895 return 0;
1896}
1897
1898/* Some laptops require interrupts to be enabled during APM calls */
1899static int __init set_apm_ints(struct dmi_system_id *d)
1900{
1901 if (apm_info.allow_ints == 0) {
1902 apm_info.allow_ints = 1;
1903 printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident);
1904 }
1905 return 0;
1906}
1907
1908/* Some APM bioses corrupt memory or just plain do not work */
1909static int __init apm_is_horked(struct dmi_system_id *d)
1910{
1911 if (apm_info.disabled == 0) {
1912 apm_info.disabled = 1;
1913 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
1914 }
1915 return 0;
1916}
1917
1918static int __init apm_is_horked_d850md(struct dmi_system_id *d)
1919{
1920 if (apm_info.disabled == 0) {
1921 apm_info.disabled = 1;
1922 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
1923		printk(KERN_INFO "This bug is fixed in BIOS P15, which is available for\n");
1924		printk(KERN_INFO "download from support.intel.com\n");
1925 }
1926 return 0;
1927}
1928
1929/* Some APM bioses hang on APM idle calls */
1930static int __init apm_likes_to_melt(struct dmi_system_id *d)
1931{
1932 if (apm_info.forbid_idle == 0) {
1933 apm_info.forbid_idle = 1;
1934 printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident);
1935 }
1936 return 0;
1937}
1938
1939/*
1940 * Check for clue-free BIOS implementations that use
1941 * the following QA technique:
1942 *
1943 * [ Write BIOS Code ]<------
1944 * | ^
1945 * < Does it Compile >----N--
1946 * |Y ^
1947 * < Does it Boot Win98 >-N--
1948 * |Y
1949 * [Ship It]
1950 *
1951 * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e)
1952 * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000)
1953 */
1954static int __init broken_apm_power(struct dmi_system_id *d)
1955{
1956 apm_info.get_power_status_broken = 1;
1957 printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n");
1958 return 0;
1959}
1960
1961/*
1962 * This BIOS swaps the APM minute reporting bytes over (many Sony laptops
1963 * have this problem).
1964 */
1965static int __init swab_apm_power_in_minutes(struct dmi_system_id *d)
1966{
1967 apm_info.get_power_status_swabinminutes = 1;
1968 printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n");
1969 return 0;
1970}
1971
1972static struct dmi_system_id __initdata apm_dmi_table[] = {
1973 {
1974 print_if_true,
1975 KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
1976 { DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
1977 DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), },
1978 },
1979 { /* Handle problems with APM on the C600 */
1980 broken_ps2_resume, "Dell Latitude C600",
1981 { DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
1982 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), },
1983 },
1984 { /* Allow interrupts during suspend on Dell Latitude laptops*/
1985 set_apm_ints, "Dell Latitude",
1986 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1987 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), }
1988 },
1989 { /* APM crashes */
1990 apm_is_horked, "Dell Inspiron 2500",
1991 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1992 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
1993 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
1994 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
1995 },
1996 { /* Allow interrupts during suspend on Dell Inspiron laptops*/
1997 set_apm_ints, "Dell Inspiron", {
1998 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1999 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), },
2000 },
2001 { /* Handle problems with APM on Inspiron 5000e */
2002 broken_apm_power, "Dell Inspiron 5000e",
2003 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2004 DMI_MATCH(DMI_BIOS_VERSION, "A04"),
2005 DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), },
2006 },
2007 { /* Handle problems with APM on Inspiron 2500 */
2008 broken_apm_power, "Dell Inspiron 2500",
2009 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2010 DMI_MATCH(DMI_BIOS_VERSION, "A12"),
2011 DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), },
2012 },
2013 { /* APM crashes */
2014 apm_is_horked, "Dell Dimension 4100",
2015 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2016 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
2017 DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."),
2018 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
2019 },
2020 { /* Allow interrupts during suspend on Compaq Laptops*/
2021 set_apm_ints, "Compaq 12XL125",
2022 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
2023 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
2024 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2025 DMI_MATCH(DMI_BIOS_VERSION,"4.06"), },
2026 },
2027 { /* Allow interrupts during APM or the clock goes slow */
2028 set_apm_ints, "ASUSTeK",
2029 { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
2030 DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), },
2031 },
2032 { /* APM blows on shutdown */
2033 apm_is_horked, "ABIT KX7-333[R]",
2034 { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"),
2035 DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), },
2036 },
2037 { /* APM crashes */
2038 apm_is_horked, "Trigem Delhi3",
2039 { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"),
2040 DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), },
2041 },
2042 { /* APM crashes */
2043 apm_is_horked, "Fujitsu-Siemens",
2044 { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"),
2045 DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), },
2046 },
2047 { /* APM crashes */
2048 apm_is_horked_d850md, "Intel D850MD",
2049 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2050 DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), },
2051 },
2052 { /* APM crashes */
2053 apm_is_horked, "Intel D810EMO",
2054 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2055 DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), },
2056 },
2057 { /* APM crashes */
2058 apm_is_horked, "Dell XPS-Z",
2059 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2060 DMI_MATCH(DMI_BIOS_VERSION, "A11"),
2061 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), },
2062 },
2063 { /* APM crashes */
2064 apm_is_horked, "Sharp PC-PJ/AX",
2065 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
2066 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
2067 DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"),
2068 DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), },
2069 },
2070 { /* APM crashes */
2071 apm_is_horked, "Dell Inspiron 2500",
2072 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2073 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
2074 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
2075 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
2076 },
2077 { /* APM idle hangs */
2078 apm_likes_to_melt, "Jabil AMD",
2079 { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
2080 DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), },
2081 },
2082 { /* APM idle hangs */
2083 apm_likes_to_melt, "AMI Bios",
2084 { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
2085 DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), },
2086 },
2087 { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */
2088 swab_apm_power_in_minutes, "Sony VAIO",
2089 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2090 DMI_MATCH(DMI_BIOS_VERSION, "R0206H"),
2091 DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), },
2092 },
2093 { /* Handle problems with APM on Sony Vaio PCG-N505VX */
2094 swab_apm_power_in_minutes, "Sony VAIO",
2095 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2096 DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"),
2097 DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), },
2098 },
2099 { /* Handle problems with APM on Sony Vaio PCG-XG29 */
2100 swab_apm_power_in_minutes, "Sony VAIO",
2101 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2102 DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"),
2103 DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), },
2104 },
2105 { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
2106 swab_apm_power_in_minutes, "Sony VAIO",
2107 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2108 DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"),
2109 DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), },
2110 },
2111 { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
2112 swab_apm_power_in_minutes, "Sony VAIO",
2113 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2114 DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"),
2115 DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), },
2116 },
2117 { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */
2118 swab_apm_power_in_minutes, "Sony VAIO",
2119 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2120 DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"),
2121 DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), },
2122 },
2123 { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
2124 swab_apm_power_in_minutes, "Sony VAIO",
2125 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2126 DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"),
2127 DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), },
2128 },
2129 { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
2130 swab_apm_power_in_minutes, "Sony VAIO",
2131 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2132 DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"),
2133 DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), },
2134 },
2135 { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */
2136 swab_apm_power_in_minutes, "Sony VAIO",
2137 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2138 DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"),
2139 DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), },
2140 },
2141 { /* Handle problems with APM on Sony Vaio PCG-F104K */
2142 swab_apm_power_in_minutes, "Sony VAIO",
2143 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2144 DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"),
2145 DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), },
2146 },
2147
2148 { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */
2149 swab_apm_power_in_minutes, "Sony VAIO",
2150 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2151 DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"),
2152 DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), },
2153 },
2154 { /* Handle problems with APM on Sony Vaio PCG-C1VE */
2155 swab_apm_power_in_minutes, "Sony VAIO",
2156 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2157 DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"),
2158 DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), },
2159 },
2160 { /* Handle problems with APM on Sony Vaio PCG-C1VE */
2161 swab_apm_power_in_minutes, "Sony VAIO",
2162 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2163 DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"),
2164 DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), },
2165 },
2166 { /* broken PM poweroff bios */
2167 set_realmode_power_off, "Award Software v4.60 PGMA",
2168 { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
2169 DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
2170 DMI_MATCH(DMI_BIOS_DATE, "134526184"), },
2171 },
2172
2173 /* Generic per vendor APM settings */
2174
2175 { /* Allow interrupts during suspend on IBM laptops */
2176 set_apm_ints, "IBM",
2177 { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), },
2178 },
2179
2180 { }
2181};
2182
2183/*
2184 * Just start the APM thread. We do NOT want to do APM BIOS
2185 * calls from anything but the APM thread, if for no other reason
2186 * than the fact that we don't trust the APM BIOS. This way,
2187 * most common APM BIOS problems that lead to protection errors
2188 * etc. will be at least somewhat contained...
2189 *
2190 * In short, if something bad happens, at least we have the choice
2191 * of just killing the apm thread.
2192 */
2193static int __init apm_init(void)
2194{
2195 struct proc_dir_entry *apm_proc;
2196 struct desc_struct *gdt;
2197 int err;
2198
2199 dmi_check_system(apm_dmi_table);
2200
2201 if (apm_info.bios.version == 0 || paravirt_enabled()) {
2202 printk(KERN_INFO "apm: BIOS not found.\n");
2203 return -ENODEV;
2204 }
2205 printk(KERN_INFO
2206 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
2207 ((apm_info.bios.version >> 8) & 0xff),
2208 (apm_info.bios.version & 0xff),
2209 apm_info.bios.flags,
2210 driver_version);
2211 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
2212 printk(KERN_INFO "apm: no 32 bit BIOS support\n");
2213 return -ENODEV;
2214 }
2215
2216 if (allow_ints)
2217 apm_info.allow_ints = 1;
2218 if (broken_psr)
2219 apm_info.get_power_status_broken = 1;
2220 if (realmode_power_off)
2221 apm_info.realmode_power_off = 1;
2222 /* User can override, but default is to trust DMI */
2223 if (apm_disabled != -1)
2224 apm_info.disabled = apm_disabled;
2225
2226 /*
2227 * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1
2228 * but is reportedly a 1.0 BIOS.
2229 */
2230 if (apm_info.bios.version == 0x001)
2231 apm_info.bios.version = 0x100;
2232
2233 /* BIOS < 1.2 doesn't set cseg_16_len */
2234 if (apm_info.bios.version < 0x102)
2235 apm_info.bios.cseg_16_len = 0; /* 64k */
2236
2237 if (debug) {
2238 printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x",
2239 apm_info.bios.cseg, apm_info.bios.offset,
2240 apm_info.bios.cseg_16, apm_info.bios.dseg);
2241 if (apm_info.bios.version > 0x100)
2242 printk(" cseg len %x, dseg len %x",
2243 apm_info.bios.cseg_len,
2244 apm_info.bios.dseg_len);
2245 if (apm_info.bios.version > 0x101)
2246 printk(" cseg16 len %x", apm_info.bios.cseg_16_len);
2247 printk("\n");
2248 }
2249
2250 if (apm_info.disabled) {
2251 printk(KERN_NOTICE "apm: disabled on user request.\n");
2252 return -ENODEV;
2253 }
2254 if ((num_online_cpus() > 1) && !power_off && !smp) {
2255 printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
2256 apm_info.disabled = 1;
2257 return -ENODEV;
2258 }
2259 if (PM_IS_ACTIVE()) {
2260 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2261 apm_info.disabled = 1;
2262 return -ENODEV;
2263 }
2264#ifdef CONFIG_PM_LEGACY
2265 pm_active = 1;
2266#endif
2267
2268 /*
2269 * Set up a segment that references the real mode segment 0x40
2270 * that extends up to the end of page zero (that we have reserved).
2271 * This is for buggy BIOS's that refer to (real mode) segment 0x40
2272 * even though they are called in protected mode.
2273 */
2274 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
2275 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
2276
2277 /*
2278 * Set up the long jump entry point to the APM BIOS, which is called
2279 * from inline assembly.
2280 */
2281 apm_bios_entry.offset = apm_info.bios.offset;
2282 apm_bios_entry.segment = APM_CS;
2283
2284 /*
2285 * The APM 1.1 BIOS is supposed to provide limit information that it
2286 * recognizes. Many machines do this correctly, but many others do
2287 * not restrict themselves to their claimed limit. When this happens,
2288 * they will cause a segmentation violation in the kernel at boot time.
2289 * Most BIOS's, however, will respect a 64k limit, so we use that.
2290 *
2291 * Note we only set APM segments on CPU zero, since we pin the APM
2292 * code to that CPU.
2293 */
2294 gdt = get_cpu_gdt_table(0);
2295 set_base(gdt[APM_CS >> 3],
2296 __va((unsigned long)apm_info.bios.cseg << 4));
2297 set_base(gdt[APM_CS_16 >> 3],
2298 __va((unsigned long)apm_info.bios.cseg_16 << 4));
2299 set_base(gdt[APM_DS >> 3],
2300 __va((unsigned long)apm_info.bios.dseg << 4));
2301
2302 apm_proc = create_proc_entry("apm", 0, NULL);
2303 if (apm_proc)
2304 apm_proc->proc_fops = &apm_file_ops;
2305
2306 kapmd_task = kthread_create(apm, NULL, "kapmd");
2307 if (IS_ERR(kapmd_task)) {
2308 printk(KERN_ERR "apm: disabled - Unable to start kernel "
2309 "thread.\n");
2310 err = PTR_ERR(kapmd_task);
2311 kapmd_task = NULL;
2312 remove_proc_entry("apm", NULL);
2313 return err;
2314 }
2315 wake_up_process(kapmd_task);
2316
2317	if (num_online_cpus() > 1 && !smp) {
2318 printk(KERN_NOTICE
2319 "apm: disabled - APM is not SMP safe (power off active).\n");
2320 return 0;
2321 }
2322
2323 /*
2324	 * Note we don't actually care if the misc device cannot be registered:
2325	 * this driver can do its job without it, even if userspace can't
2326	 * control it. Just log the error.
2327 */
2328 if (misc_register(&apm_device))
2329 printk(KERN_WARNING "apm: Could not register misc device.\n");
2330
2331 if (HZ != 100)
2332 idle_period = (idle_period * HZ) / 100;
2333 if (idle_threshold < 100) {
2334 original_pm_idle = pm_idle;
2335 pm_idle = apm_cpu_idle;
2336 set_pm_idle = 1;
2337 }
2338
2339 return 0;
2340}
2341
2342static void __exit apm_exit(void)
2343{
2344 int error;
2345
2346 if (set_pm_idle) {
2347 pm_idle = original_pm_idle;
2348 /*
2349 * We are about to unload the current idle thread pm callback
2350 * (pm_idle), Wait for all processors to update cached/local
2351 * copies of pm_idle before proceeding.
2352 */
2353 cpu_idle_wait();
2354 }
2355 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
2356 && (apm_info.connection_version > 0x0100)) {
2357 error = apm_engage_power_management(APM_DEVICE_ALL, 0);
2358 if (error)
2359 apm_error("disengage power management", error);
2360 }
2361 misc_deregister(&apm_device);
2362 remove_proc_entry("apm", NULL);
2363 if (power_off)
2364 pm_power_off = NULL;
2365 if (kapmd_task) {
2366 kthread_stop(kapmd_task);
2367 kapmd_task = NULL;
2368 }
2369#ifdef CONFIG_PM_LEGACY
2370 pm_active = 0;
2371#endif
2372}
2373
2374module_init(apm_init);
2375module_exit(apm_exit);
2376
2377MODULE_AUTHOR("Stephen Rothwell");
2378MODULE_DESCRIPTION("Advanced Power Management");
2379MODULE_LICENSE("GPL");
2380module_param(debug, bool, 0644);
2381MODULE_PARM_DESC(debug, "Enable debug mode");
2382module_param(power_off, bool, 0444);
2383MODULE_PARM_DESC(power_off, "Enable power off");
2384module_param(bounce_interval, int, 0444);
2385MODULE_PARM_DESC(bounce_interval,
2386 "Set the number of ticks to ignore suspend bounces");
2387module_param(allow_ints, bool, 0444);
2388MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
2389module_param(broken_psr, bool, 0444);
2390MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
2391module_param(realmode_power_off, bool, 0444);
2392MODULE_PARM_DESC(realmode_power_off,
2393 "Switch to real mode before powering off");
2394module_param(idle_threshold, int, 0444);
2395MODULE_PARM_DESC(idle_threshold,
2396 "System idle percentage above which to make APM BIOS idle calls");
2397module_param(idle_period, int, 0444);
2398MODULE_PARM_DESC(idle_period,
2399	"Period (in sec/100) over which to calculate the idle percentage");
2400module_param(smp, bool, 0444);
2401MODULE_PARM_DESC(smp,
2402 "Set this to enable APM use on an SMP platform. Use with caution on older systems");
2403MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
new file mode 100644
index 000000000000..cfa82c899f47
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets.c
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c"
3#else
4# include "asm-offsets_64.c"
5#endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
new file mode 100644
index 000000000000..8029742c0fc1
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -0,0 +1,147 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <asm/ucontext.h>
13#include "sigframe_32.h"
14#include <asm/pgtable.h>
15#include <asm/fixmap.h>
16#include <asm/processor.h>
17#include <asm/thread_info.h>
18#include <asm/elf.h>
19
20#include <xen/interface/xen.h>
21
22#ifdef CONFIG_LGUEST_GUEST
23#include <linux/lguest.h>
24#include "../../../drivers/lguest/lg.h"
25#endif
26
27#define DEFINE(sym, val) \
28 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
29
30#define BLANK() asm volatile("\n->" : : )
31
32#define OFFSET(sym, str, mem) \
33 DEFINE(sym, offsetof(struct str, mem));
34
35/* workaround for a warning with -Wmissing-prototypes */
36void foo(void);
37
38void foo(void)
39{
40 OFFSET(SIGCONTEXT_eax, sigcontext, eax);
41 OFFSET(SIGCONTEXT_ebx, sigcontext, ebx);
42 OFFSET(SIGCONTEXT_ecx, sigcontext, ecx);
43 OFFSET(SIGCONTEXT_edx, sigcontext, edx);
44 OFFSET(SIGCONTEXT_esi, sigcontext, esi);
45 OFFSET(SIGCONTEXT_edi, sigcontext, edi);
46 OFFSET(SIGCONTEXT_ebp, sigcontext, ebp);
47 OFFSET(SIGCONTEXT_esp, sigcontext, esp);
48 OFFSET(SIGCONTEXT_eip, sigcontext, eip);
49 BLANK();
50
51 OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
52 OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
53 OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
54 OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
55 OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
56 OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
57 OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
58 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
59 BLANK();
60
61 OFFSET(TI_task, thread_info, task);
62 OFFSET(TI_exec_domain, thread_info, exec_domain);
63 OFFSET(TI_flags, thread_info, flags);
64 OFFSET(TI_status, thread_info, status);
65 OFFSET(TI_preempt_count, thread_info, preempt_count);
66 OFFSET(TI_addr_limit, thread_info, addr_limit);
67 OFFSET(TI_restart_block, thread_info, restart_block);
68 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
69 OFFSET(TI_cpu, thread_info, cpu);
70 BLANK();
71
72 OFFSET(GDS_size, Xgt_desc_struct, size);
73 OFFSET(GDS_address, Xgt_desc_struct, address);
74 OFFSET(GDS_pad, Xgt_desc_struct, pad);
75 BLANK();
76
77 OFFSET(PT_EBX, pt_regs, ebx);
78 OFFSET(PT_ECX, pt_regs, ecx);
79 OFFSET(PT_EDX, pt_regs, edx);
80 OFFSET(PT_ESI, pt_regs, esi);
81 OFFSET(PT_EDI, pt_regs, edi);
82 OFFSET(PT_EBP, pt_regs, ebp);
83 OFFSET(PT_EAX, pt_regs, eax);
84 OFFSET(PT_DS, pt_regs, xds);
85 OFFSET(PT_ES, pt_regs, xes);
86 OFFSET(PT_FS, pt_regs, xfs);
87 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
88 OFFSET(PT_EIP, pt_regs, eip);
89 OFFSET(PT_CS, pt_regs, xcs);
90 OFFSET(PT_EFLAGS, pt_regs, eflags);
91 OFFSET(PT_OLDESP, pt_regs, esp);
92 OFFSET(PT_OLDSS, pt_regs, xss);
93 BLANK();
94
95 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
96 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
97 BLANK();
98
99 OFFSET(pbe_address, pbe, address);
100 OFFSET(pbe_orig_address, pbe, orig_address);
101 OFFSET(pbe_next, pbe, next);
102
103 /* Offset from the sysenter stack to tss.esp0 */
104 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
105 sizeof(struct tss_struct));
106
107 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
108 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
109 DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
110 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
111 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
112
113 DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
114
115 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
116
117#ifdef CONFIG_PARAVIRT
118 BLANK();
119 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
120 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
121 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
122 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
123 OFFSET(PARAVIRT_iret, paravirt_ops, iret);
124 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
125#endif
126
127#ifdef CONFIG_XEN
128 BLANK();
129 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
130 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
131#endif
132
133#ifdef CONFIG_LGUEST_GUEST
134 BLANK();
135 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
136 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
137 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
138 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
139 OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
140 OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
141 OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
142 OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
143 OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
144 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
145 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
146#endif
147}
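
Editor's note: the DEFINE()/OFFSET() macros above do not produce linkable code; the file is compiled with -S so each constant shows up as a text marker in the assembler output, which Kbuild then post-processes into asm-offsets.h. A minimal, hypothetical illustration of the same trick (build with "gcc -S" only; the emitted "->" marker is not valid assembler input, and the struct and symbol names here are made up):

#include <stddef.h>

struct point {
	int x;
	int y;
};

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

void foo(void)
{
	/* "gcc -S example.c" emits a line like
	 *     ->POINT_y $4 offsetof(struct point, y)
	 * which a sed script can turn into
	 *     #define POINT_y 4
	 */
	DEFINE(POINT_y, offsetof(struct point, y));
}
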
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
new file mode 100644
index 000000000000..0b9860530a6b
--- /dev/null
+++ b/arch/x86/kernel/bootflag.c
@@ -0,0 +1,98 @@
1/*
2 * Implement 'Simple Boot Flag Specification 2.0'
3 */
4
5
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/init.h>
9#include <linux/string.h>
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/acpi.h>
13#include <asm/io.h>
14
15#include <linux/mc146818rtc.h>
16
17
18#define SBF_RESERVED (0x78)
19#define SBF_PNPOS (1<<0)
20#define SBF_BOOTING (1<<1)
21#define SBF_DIAG (1<<2)
22#define SBF_PARITY (1<<7)
23
24
25int sbf_port __initdata = -1; /* set via acpi_boot_init() */
26
27
28static int __init parity(u8 v)
29{
30 int x = 0;
31 int i;
32
33 for(i=0;i<8;i++)
34 {
35 x^=(v&1);
36 v>>=1;
37 }
38 return x;
39}
40
41static void __init sbf_write(u8 v)
42{
43 unsigned long flags;
44 if(sbf_port != -1)
45 {
46 v &= ~SBF_PARITY;
47 if(!parity(v))
48 v|=SBF_PARITY;
49
50 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v);
51
52 spin_lock_irqsave(&rtc_lock, flags);
53 CMOS_WRITE(v, sbf_port);
54 spin_unlock_irqrestore(&rtc_lock, flags);
55 }
56}
57
58static u8 __init sbf_read(void)
59{
60 u8 v;
61 unsigned long flags;
62 if(sbf_port == -1)
63 return 0;
64 spin_lock_irqsave(&rtc_lock, flags);
65 v = CMOS_READ(sbf_port);
66 spin_unlock_irqrestore(&rtc_lock, flags);
67 return v;
68}
69
70static int __init sbf_value_valid(u8 v)
71{
72 if(v&SBF_RESERVED) /* Reserved bits */
73 return 0;
74 if(!parity(v))
75 return 0;
76 return 1;
77}
78
79static int __init sbf_init(void)
80{
81 u8 v;
82 if(sbf_port == -1)
83 return 0;
84 v = sbf_read();
85 if(!sbf_value_valid(v))
86 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v);
87
88 v &= ~SBF_RESERVED;
89 v &= ~SBF_BOOTING;
90 v &= ~SBF_DIAG;
91#if defined(CONFIG_ISAPNP)
92 v |= SBF_PNPOS;
93#endif
94 sbf_write(v);
95 return 0;
96}
97
98module_init(sbf_init);
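
Editor's note: the Simple Boot Flag byte written to CMOS above always carries odd parity: sbf_write() clears SBF_PARITY and sets it only when the remaining bits have even parity. A small hypothetical userspace sketch of that encoding step (illustrative only, reusing the bit names defined in the file above):

#include <stdio.h>

#define SBF_PNPOS   (1 << 0)
#define SBF_BOOTING (1 << 1)
#define SBF_PARITY  (1 << 7)

/* Return 1 if v has an odd number of set bits, mirroring parity() above. */
static int odd_parity(unsigned char v)
{
	int x = 0;

	while (v) {
		x ^= v & 1;
		v >>= 1;
	}
	return x;
}

/* Build a boot-flag byte whose overall parity is odd, as sbf_write() does. */
static unsigned char sbf_encode(unsigned char v)
{
	v &= (unsigned char)~SBF_PARITY;
	if (!odd_parity(v))
		v |= SBF_PARITY;
	return v;
}

int main(void)
{
	unsigned char v = sbf_encode(SBF_PNPOS | SBF_BOOTING);

	printf("0x%02x (parity %s)\n", v, odd_parity(v) ? "odd" : "even");
	return 0;
}
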
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
new file mode 100644
index 000000000000..5c2faa10e9fa
--- /dev/null
+++ b/arch/x86/kernel/cpuid.c
@@ -0,0 +1,242 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * cpuid.c
15 *
16 * x86 CPUID access device
17 *
18 * This device is accessed by lseek() to the appropriate CPUID level
19 * and then read in chunks of 16 bytes. A larger size means multiple
20 * reads of consecutive levels.
21 *
22 * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
25
26#include <linux/module.h>
27
28#include <linux/types.h>
29#include <linux/errno.h>
30#include <linux/fcntl.h>
31#include <linux/init.h>
32#include <linux/poll.h>
33#include <linux/smp.h>
34#include <linux/major.h>
35#include <linux/fs.h>
36#include <linux/smp_lock.h>
37#include <linux/device.h>
38#include <linux/cpu.h>
39#include <linux/notifier.h>
40
41#include <asm/processor.h>
42#include <asm/msr.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static struct class *cpuid_class;
47
48#ifdef CONFIG_SMP
49
50struct cpuid_command {
51 u32 reg;
52 u32 *data;
53};
54
55static void cpuid_smp_cpuid(void *cmd_block)
56{
57 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
58
59 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
60 &cmd->data[3]);
61}
62
63static inline void do_cpuid(int cpu, u32 reg, u32 * data)
64{
65 struct cpuid_command cmd;
66
67 preempt_disable();
68 if (cpu == smp_processor_id()) {
69 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
70 } else {
71 cmd.reg = reg;
72 cmd.data = data;
73
74 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
75 }
76 preempt_enable();
77}
78#else /* ! CONFIG_SMP */
79
80static inline void do_cpuid(int cpu, u32 reg, u32 * data)
81{
82 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
83}
84
85#endif /* ! CONFIG_SMP */
86
87static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
88{
89 loff_t ret;
90
91 lock_kernel();
92
93 switch (orig) {
94 case 0:
95 file->f_pos = offset;
96 ret = file->f_pos;
97 break;
98 case 1:
99 file->f_pos += offset;
100 ret = file->f_pos;
101 break;
102 default:
103 ret = -EINVAL;
104 }
105
106 unlock_kernel();
107 return ret;
108}
109
110static ssize_t cpuid_read(struct file *file, char __user *buf,
111 size_t count, loff_t * ppos)
112{
113 char __user *tmp = buf;
114 u32 data[4];
115 u32 reg = *ppos;
116 int cpu = iminor(file->f_path.dentry->d_inode);
117
118 if (count % 16)
119 return -EINVAL; /* Invalid chunk size */
120
121 for (; count; count -= 16) {
122 do_cpuid(cpu, reg, data);
123 if (copy_to_user(tmp, &data, 16))
124 return -EFAULT;
125 tmp += 16;
126 *ppos = reg++;
127 }
128
129 return tmp - buf;
130}
131
132static int cpuid_open(struct inode *inode, struct file *file)
133{
134 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
135 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
136
137 if (cpu >= NR_CPUS || !cpu_online(cpu))
138 return -ENXIO; /* No such CPU */
139 if (c->cpuid_level < 0)
140 return -EIO; /* CPUID not supported */
141
142 return 0;
143}
144
145/*
146 * File operations we support
147 */
148static const struct file_operations cpuid_fops = {
149 .owner = THIS_MODULE,
150 .llseek = cpuid_seek,
151 .read = cpuid_read,
152 .open = cpuid_open,
153};
154
155static int cpuid_device_create(int i)
156{
157 int err = 0;
158 struct device *dev;
159
160 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i);
161 if (IS_ERR(dev))
162 err = PTR_ERR(dev);
163 return err;
164}
165
166static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
167{
168 unsigned int cpu = (unsigned long)hcpu;
169
170 switch (action) {
171 case CPU_ONLINE:
172 case CPU_ONLINE_FROZEN:
173 cpuid_device_create(cpu);
174 break;
175 case CPU_DEAD:
176 case CPU_DEAD_FROZEN:
177 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
178 break;
179 }
180 return NOTIFY_OK;
181}
182
183static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
184{
185 .notifier_call = cpuid_class_cpu_callback,
186};
187
188static int __init cpuid_init(void)
189{
190 int i, err = 0;
191 i = 0;
192
193 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
194 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
195 CPUID_MAJOR);
196 err = -EBUSY;
197 goto out;
198 }
199 cpuid_class = class_create(THIS_MODULE, "cpuid");
200 if (IS_ERR(cpuid_class)) {
201 err = PTR_ERR(cpuid_class);
202 goto out_chrdev;
203 }
204 for_each_online_cpu(i) {
205 err = cpuid_device_create(i);
206 if (err != 0)
207 goto out_class;
208 }
209 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
210
211 err = 0;
212 goto out;
213
214out_class:
215 i = 0;
216 for_each_online_cpu(i) {
217 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i));
218 }
219 class_destroy(cpuid_class);
220out_chrdev:
221 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
222out:
223 return err;
224}
225
226static void __exit cpuid_exit(void)
227{
228 int cpu = 0;
229
230 for_each_online_cpu(cpu)
231 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
232 class_destroy(cpuid_class);
233 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
234 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
235}
236
237module_init(cpuid_init);
238module_exit(cpuid_exit);
239
240MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
241MODULE_DESCRIPTION("x86 generic CPUID driver");
242MODULE_LICENSE("GPL");
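
Editor's note: as described in the header comment of this file, the device is driven by seeking to a CPUID level and reading 16-byte records (eax, ebx, ecx, edx). A hypothetical userspace sketch (assumes the driver is loaded and /dev/cpu/0/cpuid exists via udev or MAKEDEV; not part of this patch):

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];	/* eax, ebx, ecx, edx for one CPUID level */
	char vendor[13];
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/cpuid");
		return 1;
	}
	/* Seek to CPUID level 0 and read one 16-byte record. */
	if (lseek(fd, 0, SEEK_SET) < 0 || read(fd, regs, 16) != 16) {
		perror("read");
		close(fd);
		return 1;
	}
	/* For level 0 the vendor string lives in ebx, edx, ecx. */
	memcpy(vendor, &regs[1], 4);
	memcpy(vendor + 4, &regs[3], 4);
	memcpy(vendor + 8, &regs[2], 4);
	vendor[12] = '\0';
	printf("max level %u, vendor \"%s\"\n", regs[0], vendor);
	close(fd);
	return 0;
}
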
diff --git a/arch/x86/kernel/crash_32.c b/arch/x86/kernel/crash_32.c
new file mode 100644
index 000000000000..53589d1b1a05
--- /dev/null
+++ b/arch/x86/kernel/crash_32.c
@@ -0,0 +1,137 @@
1/*
2 * Architecture specific (i386) functions for kexec based crash dumps.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 *
6 * Copyright (C) IBM Corporation, 2004. All rights reserved.
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/smp.h>
14#include <linux/reboot.h>
15#include <linux/kexec.h>
16#include <linux/delay.h>
17#include <linux/elf.h>
18#include <linux/elfcore.h>
19
20#include <asm/processor.h>
21#include <asm/hardirq.h>
22#include <asm/nmi.h>
23#include <asm/hw_irq.h>
24#include <asm/apic.h>
25#include <linux/kdebug.h>
26#include <asm/smp.h>
27
28#include <mach_ipi.h>
29
30
31/* This keeps track of which cpu is crashing. */
32static int crashing_cpu;
33
34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
35static atomic_t waiting_for_crash_ipi;
36
37static int crash_nmi_callback(struct notifier_block *self,
38 unsigned long val, void *data)
39{
40 struct pt_regs *regs;
41 struct pt_regs fixed_regs;
42 int cpu;
43
44 if (val != DIE_NMI_IPI)
45 return NOTIFY_OK;
46
47 regs = ((struct die_args *)data)->regs;
48 cpu = raw_smp_processor_id();
49
50	/* Don't do anything if this handler is invoked on the crashing cpu.
51	 * Otherwise, the system will completely hang. The crashing cpu can get
52	 * an NMI if the system was booted with the nmi_watchdog parameter.
53 */
54 if (cpu == crashing_cpu)
55 return NOTIFY_STOP;
56 local_irq_disable();
57
58 if (!user_mode_vm(regs)) {
59 crash_fixup_ss_esp(&fixed_regs, regs);
60 regs = &fixed_regs;
61 }
62 crash_save_cpu(regs, cpu);
63 disable_local_APIC();
64 atomic_dec(&waiting_for_crash_ipi);
65 /* Assume hlt works */
66 halt();
67 for (;;)
68 cpu_relax();
69
70 return 1;
71}
72
73static void smp_send_nmi_allbutself(void)
74{
75 cpumask_t mask = cpu_online_map;
76 cpu_clear(safe_smp_processor_id(), mask);
77 if (!cpus_empty(mask))
78 send_IPI_mask(mask, NMI_VECTOR);
79}
80
81static struct notifier_block crash_nmi_nb = {
82 .notifier_call = crash_nmi_callback,
83};
84
85static void nmi_shootdown_cpus(void)
86{
87 unsigned long msecs;
88
89 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
90 /* Would it be better to replace the trap vector here? */
91 if (register_die_notifier(&crash_nmi_nb))
92 return; /* return what? */
93 /* Ensure the new callback function is set before sending
94 * out the NMI
95 */
96 wmb();
97
98 smp_send_nmi_allbutself();
99
100 msecs = 1000; /* Wait at most a second for the other cpus to stop */
101 while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
102 mdelay(1);
103 msecs--;
104 }
105
106 /* Leave the nmi callback set */
107 disable_local_APIC();
108}
109#else
110static void nmi_shootdown_cpus(void)
111{
112 /* There are no cpus to shootdown */
113}
114#endif
115
116void machine_crash_shutdown(struct pt_regs *regs)
117{
118 /* This function is only called after the system
119 * has panicked or is otherwise in a critical state.
120 * The minimum amount of code to allow a kexec'd kernel
121 * to run successfully needs to happen here.
122 *
123 * In practice this means shooting down the other cpus in
124 * an SMP system.
125 */
126 /* The kernel is broken so disable interrupts */
127 local_irq_disable();
128
129	/* Make a note of the crashing cpu. Will be used in the NMI callback. */
130 crashing_cpu = safe_smp_processor_id();
131 nmi_shootdown_cpus();
132 lapic_shutdown();
133#if defined(CONFIG_X86_IO_APIC)
134 disable_IO_APIC();
135#endif
136 crash_save_cpu(regs, safe_smp_processor_id());
137}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
new file mode 100644
index 000000000000..3f532df488bc
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -0,0 +1,74 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/errno.h>
9#include <linux/highmem.h>
10#include <linux/crash_dump.h>
11
12#include <asm/uaccess.h>
13
14static void *kdump_buf_page;
15
16/**
17 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied
19 * @buf: target memory address for the copy; this can be in kernel address
20 * space or user address space (see @userbuf)
21 * @csize: number of bytes to copy
22 * @offset: offset in bytes into the page (based on pfn) to begin the copy
23 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
24 * otherwise @buf is in kernel address space, use memcpy().
25 *
26 * Copy a page from "oldmem". For this page, there is no pte mapped
27 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
28 *
29 * Calling copy_to_user() in atomic context is not desirable. Hence we first
30 * copy the data to a pre-allocated kernel page and then copy it to user
31 * space in non-atomic context.
32 */
33ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
34 size_t csize, unsigned long offset, int userbuf)
35{
36 void *vaddr;
37
38 if (!csize)
39 return 0;
40
41 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
42
43 if (!userbuf) {
44 memcpy(buf, (vaddr + offset), csize);
45 kunmap_atomic(vaddr, KM_PTE0);
46 } else {
47 if (!kdump_buf_page) {
48 printk(KERN_WARNING "Kdump: Kdump buffer page not"
49 " allocated\n");
50 return -EFAULT;
51 }
52 copy_page(kdump_buf_page, vaddr);
53 kunmap_atomic(vaddr, KM_PTE0);
54 if (copy_to_user(buf, (kdump_buf_page + offset), csize))
55 return -EFAULT;
56 }
57
58 return csize;
59}
60
61static int __init kdump_buf_page_init(void)
62{
63 int ret = 0;
64
65 kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
66 if (!kdump_buf_page) {
67 printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
68 " page\n");
69 ret = -ENOMEM;
70 }
71
72 return ret;
73}
74arch_initcall(kdump_buf_page_init);
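
Editor's note: copy_oldmem_page() above is the low-level read path used when the kexec'd capture kernel exposes the old kernel's memory through /proc/vmcore. A hypothetical sketch of a consumer that just checks the dump begins with an ELF header (only meaningful inside a capture kernel where /proc/vmcore exists; not part of this patch):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	unsigned char hdr[4];
	int fd = open("/proc/vmcore", O_RDONLY);

	if (fd < 0) {
		perror("open /proc/vmcore");
		return 1;
	}
	/* Each read of old memory is serviced page by page by copy_oldmem_page(). */
	if (read(fd, hdr, sizeof(hdr)) != sizeof(hdr)) {
		perror("read");
		close(fd);
		return 1;
	}
	printf("vmcore %s an ELF image\n",
	       memcmp(hdr, "\177ELF", 4) == 0 ? "looks like" : "is not");
	close(fd);
	return 0;
}
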
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
new file mode 100644
index 000000000000..40978af630e7
--- /dev/null
+++ b/arch/x86/kernel/doublefault_32.c
@@ -0,0 +1,70 @@
1#include <linux/mm.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/init_task.h>
5#include <linux/fs.h>
6
7#include <asm/uaccess.h>
8#include <asm/pgtable.h>
9#include <asm/processor.h>
10#include <asm/desc.h>
11
12#define DOUBLEFAULT_STACKSIZE (1024)
13static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
14#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
15
16#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
17
18static void doublefault_fn(void)
19{
20 struct Xgt_desc_struct gdt_desc = {0, 0};
21 unsigned long gdt, tss;
22
23 store_gdt(&gdt_desc);
24 gdt = gdt_desc.address;
25
26 printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
27
28 if (ptr_ok(gdt)) {
29 gdt += GDT_ENTRY_TSS << 3;
30 tss = *(u16 *)(gdt+2);
31 tss += *(u8 *)(gdt+4) << 16;
32 tss += *(u8 *)(gdt+7) << 24;
33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
34
35 if (ptr_ok(tss)) {
36 struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
37
38 printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp);
39
40 printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
41 t->eax, t->ebx, t->ecx, t->edx);
42 printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
43 t->esi, t->edi);
44 }
45 }
46
47 for (;;)
48 cpu_relax();
49}
50
51struct tss_struct doublefault_tss __cacheline_aligned = {
52 .x86_tss = {
53 .esp0 = STACK_START,
54 .ss0 = __KERNEL_DS,
55 .ldt = 0,
56 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
57
58 .eip = (unsigned long) doublefault_fn,
59 /* 0x2 bit is always set */
60 .eflags = X86_EFLAGS_SF | 0x2,
61 .esp = STACK_START,
62 .es = __USER_DS,
63 .cs = __KERNEL_CS,
64 .ss = __KERNEL_DS,
65 .ds = __USER_DS,
66 .fs = __KERNEL_PERCPU,
67
68 .__cr3 = __pa(swapper_pg_dir)
69 }
70};
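
Editor's note: doublefault_fn() above reconstructs the TSS base from the raw GDT entry; on i386 a segment descriptor stores base bits 0-15 in bytes 2-3, bits 16-23 in byte 4 and bits 24-31 in byte 7. A standalone sketch of that extraction (the descriptor bytes below are made-up example values, not taken from this patch):

#include <stdint.h>
#include <stdio.h>

/* Extract the 32-bit base address from an 8-byte i386 segment descriptor,
 * the same byte picking done in doublefault_fn() above.
 */
static uint32_t descriptor_base(const uint8_t d[8])
{
	uint32_t base;

	base  = (uint32_t)d[2] | ((uint32_t)d[3] << 8);	/* bits 0-15  */
	base |= (uint32_t)d[4] << 16;			/* bits 16-23 */
	base |= (uint32_t)d[7] << 24;			/* bits 24-31 */
	return base;
}

int main(void)
{
	/* Example descriptor with base 0xc0123000 scattered across bytes
	 * 2, 3, 4 and 7 (the other bytes are limit/flags, ignored here).
	 */
	const uint8_t desc[8] = { 0x67, 0x00, 0x00, 0x30, 0x12, 0x89, 0x00, 0xc0 };

	printf("base = 0x%08x\n", descriptor_base(desc));
	return 0;
}
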
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
new file mode 100644
index 000000000000..3c86b979a40a
--- /dev/null
+++ b/arch/x86/kernel/e820_32.c
@@ -0,0 +1,944 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h>
12#include <linux/uaccess.h>
13#include <linux/suspend.h>
14
15#include <asm/pgtable.h>
16#include <asm/page.h>
17#include <asm/e820.h>
18#include <asm/setup.h>
19
20#ifdef CONFIG_EFI
21int efi_enabled = 0;
22EXPORT_SYMBOL(efi_enabled);
23#endif
24
25struct e820map e820;
26struct change_member {
27 struct e820entry *pbios; /* pointer to original bios entry */
28 unsigned long long addr; /* address for this change point */
29};
30static struct change_member change_point_list[2*E820MAX] __initdata;
31static struct change_member *change_point[2*E820MAX] __initdata;
32static struct e820entry *overlap_list[E820MAX] __initdata;
33static struct e820entry new_bios[E820MAX] __initdata;
34/* For PCI or other memory-mapped resources */
35unsigned long pci_mem_start = 0x10000000;
36#ifdef CONFIG_PCI
37EXPORT_SYMBOL(pci_mem_start);
38#endif
39extern int user_defined_memmap;
40struct resource data_resource = {
41 .name = "Kernel data",
42 .start = 0,
43 .end = 0,
44 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
45};
46
47struct resource code_resource = {
48 .name = "Kernel code",
49 .start = 0,
50 .end = 0,
51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
52};
53
54static struct resource system_rom_resource = {
55 .name = "System ROM",
56 .start = 0xf0000,
57 .end = 0xfffff,
58 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
59};
60
61static struct resource extension_rom_resource = {
62 .name = "Extension ROM",
63 .start = 0xe0000,
64 .end = 0xeffff,
65 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
66};
67
68static struct resource adapter_rom_resources[] = { {
69 .name = "Adapter ROM",
70 .start = 0xc8000,
71 .end = 0,
72 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
73}, {
74 .name = "Adapter ROM",
75 .start = 0,
76 .end = 0,
77 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
78}, {
79 .name = "Adapter ROM",
80 .start = 0,
81 .end = 0,
82 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
83}, {
84 .name = "Adapter ROM",
85 .start = 0,
86 .end = 0,
87 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
88}, {
89 .name = "Adapter ROM",
90 .start = 0,
91 .end = 0,
92 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
93}, {
94 .name = "Adapter ROM",
95 .start = 0,
96 .end = 0,
97 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
98} };
99
100static struct resource video_rom_resource = {
101 .name = "Video ROM",
102 .start = 0xc0000,
103 .end = 0xc7fff,
104 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
105};
106
107static struct resource video_ram_resource = {
108 .name = "Video RAM area",
109 .start = 0xa0000,
110 .end = 0xbffff,
111 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
112};
113
114static struct resource standard_io_resources[] = { {
115 .name = "dma1",
116 .start = 0x0000,
117 .end = 0x001f,
118 .flags = IORESOURCE_BUSY | IORESOURCE_IO
119}, {
120 .name = "pic1",
121 .start = 0x0020,
122 .end = 0x0021,
123 .flags = IORESOURCE_BUSY | IORESOURCE_IO
124}, {
125 .name = "timer0",
126 .start = 0x0040,
127 .end = 0x0043,
128 .flags = IORESOURCE_BUSY | IORESOURCE_IO
129}, {
130 .name = "timer1",
131 .start = 0x0050,
132 .end = 0x0053,
133 .flags = IORESOURCE_BUSY | IORESOURCE_IO
134}, {
135 .name = "keyboard",
136 .start = 0x0060,
137 .end = 0x006f,
138 .flags = IORESOURCE_BUSY | IORESOURCE_IO
139}, {
140 .name = "dma page reg",
141 .start = 0x0080,
142 .end = 0x008f,
143 .flags = IORESOURCE_BUSY | IORESOURCE_IO
144}, {
145 .name = "pic2",
146 .start = 0x00a0,
147 .end = 0x00a1,
148 .flags = IORESOURCE_BUSY | IORESOURCE_IO
149}, {
150 .name = "dma2",
151 .start = 0x00c0,
152 .end = 0x00df,
153 .flags = IORESOURCE_BUSY | IORESOURCE_IO
154}, {
155 .name = "fpu",
156 .start = 0x00f0,
157 .end = 0x00ff,
158 .flags = IORESOURCE_BUSY | IORESOURCE_IO
159} };
160
161#define ROMSIGNATURE 0xaa55
162
163static int __init romsignature(const unsigned char *rom)
164{
165 const unsigned short * const ptr = (const unsigned short *)rom;
166 unsigned short sig;
167
168 return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
169}
170
171static int __init romchecksum(const unsigned char *rom, unsigned long length)
172{
173 unsigned char sum, c;
174
175 for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
176 sum += c;
177 return !length && !sum;
178}
179
180static void __init probe_roms(void)
181{
182 const unsigned char *rom;
183 unsigned long start, length, upper;
184 unsigned char c;
185 int i;
186
187 /* video rom */
188 upper = adapter_rom_resources[0].start;
189 for (start = video_rom_resource.start; start < upper; start += 2048) {
190 rom = isa_bus_to_virt(start);
191 if (!romsignature(rom))
192 continue;
193
194 video_rom_resource.start = start;
195
196 if (probe_kernel_address(rom + 2, c) != 0)
197 continue;
198
199 /* 0 < length <= 0x7f * 512, historically */
200 length = c * 512;
201
202 /* if checksum okay, trust length byte */
203 if (length && romchecksum(rom, length))
204 video_rom_resource.end = start + length - 1;
205
206 request_resource(&iomem_resource, &video_rom_resource);
207 break;
208 }
209
210 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
211 if (start < upper)
212 start = upper;
213
214 /* system rom */
215 request_resource(&iomem_resource, &system_rom_resource);
216 upper = system_rom_resource.start;
217
218 /* check for extension rom (ignore length byte!) */
219 rom = isa_bus_to_virt(extension_rom_resource.start);
220 if (romsignature(rom)) {
221 length = extension_rom_resource.end - extension_rom_resource.start + 1;
222 if (romchecksum(rom, length)) {
223 request_resource(&iomem_resource, &extension_rom_resource);
224 upper = extension_rom_resource.start;
225 }
226 }
227
228 /* check for adapter roms on 2k boundaries */
229 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
230 rom = isa_bus_to_virt(start);
231 if (!romsignature(rom))
232 continue;
233
234 if (probe_kernel_address(rom + 2, c) != 0)
235 continue;
236
237 /* 0 < length <= 0x7f * 512, historically */
238 length = c * 512;
239
240 /* but accept any length that fits if checksum okay */
241 if (!length || start + length > upper || !romchecksum(rom, length))
242 continue;
243
244 adapter_rom_resources[i].start = start;
245 adapter_rom_resources[i].end = start + length - 1;
246 request_resource(&iomem_resource, &adapter_rom_resources[i]);
247
248 start = adapter_rom_resources[i++].end & ~2047UL;
249 }
250}
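
Editor's note: probe_roms() above scans 2 KB boundaries for option ROMs: a 0xaa55 signature (bytes 0x55, 0xaa on x86), a length byte at offset 2 counted in 512-byte units, and a byte checksum over the whole image that must sum to zero mod 256. A hypothetical userspace sketch of that validation on an in-memory buffer (the sample "ROM" is fabricated for illustration):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Validate an option ROM image the way romsignature()/romchecksum() do. */
static int rom_valid(const uint8_t *rom, size_t avail)
{
	size_t length, i;
	uint8_t sum = 0;

	if (avail < 3 || rom[0] != 0x55 || rom[1] != 0xaa)
		return 0;
	length = (size_t)rom[2] * 512;
	if (!length || length > avail)
		return 0;
	for (i = 0; i < length; i++)
		sum += rom[i];
	return sum == 0;
}

int main(void)
{
	/* Tiny made-up 512-byte "ROM": signature, length 1, last byte patched
	 * so all bytes sum to zero mod 256.
	 */
	static uint8_t rom[512] = { 0x55, 0xaa, 0x01 };
	uint8_t sum = 0;
	size_t i;

	for (i = 0; i < sizeof(rom); i++)
		sum += rom[i];
	rom[511] = (uint8_t)(0x100 - sum);

	printf("rom_valid: %d\n", rom_valid(rom, sizeof(rom)));
	return 0;
}
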
251
252/*
253 * Request address space for all standard RAM and ROM resources
254 * and also for regions reported as reserved by the e820.
255 */
256static void __init
257legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
258{
259 int i;
260
261 probe_roms();
262 for (i = 0; i < e820.nr_map; i++) {
263 struct resource *res;
264#ifndef CONFIG_RESOURCES_64BIT
265 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
266 continue;
267#endif
268 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
269 switch (e820.map[i].type) {
270 case E820_RAM: res->name = "System RAM"; break;
271 case E820_ACPI: res->name = "ACPI Tables"; break;
272 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
273 default: res->name = "reserved";
274 }
275 res->start = e820.map[i].addr;
276 res->end = res->start + e820.map[i].size - 1;
277 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
278 if (request_resource(&iomem_resource, res)) {
279 kfree(res);
280 continue;
281 }
282 if (e820.map[i].type == E820_RAM) {
283 /*
284 * We don't know which RAM region contains kernel data,
285 * so we try it repeatedly and let the resource manager
286 * test it.
287 */
288 request_resource(res, code_resource);
289 request_resource(res, data_resource);
290#ifdef CONFIG_KEXEC
291 request_resource(res, &crashk_res);
292#endif
293 }
294 }
295}
296
297/*
298 * Request address space for all standard resources
299 *
300 * This is called just before pcibios_init(), which is also a
301 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
302 */
303static int __init request_standard_resources(void)
304{
305 int i;
306
307 printk("Setting up standard PCI resources\n");
308 if (efi_enabled)
309 efi_initialize_iomem_resources(&code_resource, &data_resource);
310 else
311 legacy_init_iomem_resources(&code_resource, &data_resource);
312
313 /* EFI systems may still have VGA */
314 request_resource(&iomem_resource, &video_ram_resource);
315
316 /* request I/O space for devices used on all i[345]86 PCs */
317 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
318 request_resource(&ioport_resource, &standard_io_resources[i]);
319 return 0;
320}
321
322subsys_initcall(request_standard_resources);
323
324#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
325/**
326 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
327 * correspond to e820 RAM areas and mark the corresponding pages as nosave for
328 * hibernation.
329 *
330 * This function requires the e820 map to be sorted and without any
331 * overlapping entries and assumes the first e820 area to be RAM.
332 */
333void __init e820_mark_nosave_regions(void)
334{
335 int i;
336 unsigned long pfn;
337
338 pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
339 for (i = 1; i < e820.nr_map; i++) {
340 struct e820entry *ei = &e820.map[i];
341
342 if (pfn < PFN_UP(ei->addr))
343 register_nosave_region(pfn, PFN_UP(ei->addr));
344
345 pfn = PFN_DOWN(ei->addr + ei->size);
346 if (ei->type != E820_RAM)
347 register_nosave_region(PFN_UP(ei->addr), pfn);
348
349 if (pfn >= max_low_pfn)
350 break;
351 }
352}
353#endif
354
355void __init add_memory_region(unsigned long long start,
356 unsigned long long size, int type)
357{
358 int x;
359
360 if (!efi_enabled) {
361 x = e820.nr_map;
362
363 if (x == E820MAX) {
364 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
365 return;
366 }
367
368 e820.map[x].addr = start;
369 e820.map[x].size = size;
370 e820.map[x].type = type;
371 e820.nr_map++;
372 }
373} /* add_memory_region */
374
375/*
376 * Sanitize the BIOS e820 map.
377 *
378 * Some e820 responses include overlapping entries. The following
379 * replaces the original e820 map with a new one, removing overlaps.
380 *
381 */
382int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
383{
384 struct change_member *change_tmp;
385 unsigned long current_type, last_type;
386 unsigned long long last_addr;
387 int chgidx, still_changing;
388 int overlap_entries;
389 int new_bios_entry;
390 int old_nr, new_nr, chg_nr;
391 int i;
392
393 /*
394 Visually we're performing the following (1,2,3,4 = memory types)...
395
396 Sample memory map (w/overlaps):
397 ____22__________________
398 ______________________4_
399 ____1111________________
400 _44_____________________
401 11111111________________
402 ____________________33__
403 ___________44___________
404 __________33333_________
405 ______________22________
406 ___________________2222_
407 _________111111111______
408 _____________________11_
409 _________________4______
410
411 Sanitized equivalent (no overlap):
412 1_______________________
413 _44_____________________
414 ___1____________________
415 ____22__________________
416 ______11________________
417 _________1______________
418 __________3_____________
419 ___________44___________
420 _____________33_________
421 _______________2________
422 ________________1_______
423 _________________4______
424 ___________________2____
425 ____________________33__
426 ______________________4_
427 */
428 /* if there's only one memory region, don't bother */
429 if (*pnr_map < 2) {
430 return -1;
431 }
432
433 old_nr = *pnr_map;
434
435 /* bail out if we find any unreasonable addresses in bios map */
436 for (i=0; i<old_nr; i++)
437 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
438 return -1;
439 }
440
441 /* create pointers for initial change-point information (for sorting) */
442 for (i=0; i < 2*old_nr; i++)
443 change_point[i] = &change_point_list[i];
444
445 /* record all known change-points (starting and ending addresses),
446 omitting those that are for empty memory regions */
447 chgidx = 0;
448 for (i=0; i < old_nr; i++) {
449 if (biosmap[i].size != 0) {
450 change_point[chgidx]->addr = biosmap[i].addr;
451 change_point[chgidx++]->pbios = &biosmap[i];
452 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
453 change_point[chgidx++]->pbios = &biosmap[i];
454 }
455 }
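	/*
	 * Each non-empty BIOS entry contributes two change-points: one at its
	 * start address and one just past its end.  For example, an entry
	 * covering 0x1000 - 0x2000 yields change-points at 0x1000 and 0x2000.
	 */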
456 chg_nr = chgidx; /* true number of change-points */
457
458 /* sort change-point list by memory addresses (low -> high) */
459 still_changing = 1;
460 while (still_changing) {
461 still_changing = 0;
462 for (i=1; i < chg_nr; i++) {
463 /* if <current_addr> > <last_addr>, swap */
464 /* or, if current=<start_addr> & last=<end_addr>, swap */
465 if ((change_point[i]->addr < change_point[i-1]->addr) ||
466 ((change_point[i]->addr == change_point[i-1]->addr) &&
467 (change_point[i]->addr == change_point[i]->pbios->addr) &&
468 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
469 )
470 {
471 change_tmp = change_point[i];
472 change_point[i] = change_point[i-1];
473 change_point[i-1] = change_tmp;
474 still_changing=1;
475 }
476 }
477 }
478
479 /* create a new bios memory map, removing overlaps */
480 overlap_entries=0; /* number of entries in the overlap table */
481 new_bios_entry=0; /* index for creating new bios map entries */
482 last_type = 0; /* start with undefined memory type */
483 last_addr = 0; /* start with 0 as last starting address */
484 /* loop through change-points, determining effect on the new bios map */
485 for (chgidx=0; chgidx < chg_nr; chgidx++)
486 {
487 /* keep track of all overlapping bios entries */
488 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
489 {
490 /* add map entry to overlap list (> 1 entry implies an overlap) */
491 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
492 }
493 else
494 {
495 /* remove entry from list (order independent, so swap with last) */
496 for (i=0; i<overlap_entries; i++)
497 {
498 if (overlap_list[i] == change_point[chgidx]->pbios)
499 overlap_list[i] = overlap_list[overlap_entries-1];
500 }
501 overlap_entries--;
502 }
503 /* if there are overlapping entries, decide which "type" to use */
504 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
505 current_type = 0;
506 for (i=0; i<overlap_entries; i++)
507 if (overlap_list[i]->type > current_type)
508 current_type = overlap_list[i]->type;
509 /* continue building up new bios map based on this information */
510 if (current_type != last_type) {
511 if (last_type != 0) {
512 new_bios[new_bios_entry].size =
513 change_point[chgidx]->addr - last_addr;
514 /* move forward only if the new size was non-zero */
515 if (new_bios[new_bios_entry].size != 0)
516 if (++new_bios_entry >= E820MAX)
517 break; /* no more space left for new bios entries */
518 }
519 if (current_type != 0) {
520 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
521 new_bios[new_bios_entry].type = current_type;
522 last_addr=change_point[chgidx]->addr;
523 }
524 last_type = current_type;
525 }
526 }
527 new_nr = new_bios_entry; /* retain count for new bios entries */
528
529 /* copy new bios mapping into original location */
530 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
531 *pnr_map = new_nr;
532
533 return 0;
534}
535
536/*
537 * Copy the BIOS e820 map into a safe place.
538 *
539 * Sanity-check it while we're at it..
540 *
541 * If we're lucky and live on a modern system, the setup code
542 * will have given us a memory map that we can use to properly
543 * set up memory. If we aren't, we'll fake a memory map.
544 *
545 * We check to see that the memory map contains at least 2 elements
546 * before we'll use it, because the detection code in setup.S may
547 * not be perfect and most every PC known to man has two memory
548 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
549 * thinkpad 560x, for example, does not cooperate with the memory
550 * detection code.)
551 */
552int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
553{
554 /* Only one memory region (or negative)? Ignore it */
555 if (nr_map < 2)
556 return -1;
557
558 do {
559 unsigned long long start = biosmap->addr;
560 unsigned long long size = biosmap->size;
561 unsigned long long end = start + size;
562 unsigned long type = biosmap->type;
563
564 /* Overflow in 64 bits? Ignore the memory map. */
565 if (start > end)
566 return -1;
567
568 /*
569 * Some BIOSes claim RAM in the 640k - 1M region.
570 * Not right. Fix it up.
571 */
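		/*
		 * For example, a claimed RAM range of 0x90000 - 0x200000 becomes
		 * RAM at 0x90000 - 0xA0000 plus RAM at 0x100000 - 0x200000,
		 * dropping the 0xA0000 - 0x100000 hole.
		 */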
572 if (type == E820_RAM) {
573 if (start < 0x100000ULL && end > 0xA0000ULL) {
574 if (start < 0xA0000ULL)
575 add_memory_region(start, 0xA0000ULL-start, type);
576 if (end <= 0x100000ULL)
577 continue;
578 start = 0x100000ULL;
579 size = end - start;
580 }
581 }
582 add_memory_region(start, size, type);
583 } while (biosmap++,--nr_map);
584 return 0;
585}
586
587/*
588 * Callback for efi_memmap_walk().
589 */
590static int __init
591efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
592{
593 unsigned long *max_pfn = arg, pfn;
594
595 if (start < end) {
596 pfn = PFN_UP(end -1);
597 if (pfn > *max_pfn)
598 *max_pfn = pfn;
599 }
600 return 0;
601}
602
603static int __init
604efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
605{
606 memory_present(0, PFN_UP(start), PFN_DOWN(end));
607 return 0;
608}
609
610/*
611 * Find the highest page frame number we have available
612 */
613void __init find_max_pfn(void)
614{
615 int i;
616
617 max_pfn = 0;
618 if (efi_enabled) {
619 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
620 efi_memmap_walk(efi_memory_present_wrapper, NULL);
621 return;
622 }
623
624 for (i = 0; i < e820.nr_map; i++) {
625 unsigned long start, end;
626 /* RAM? */
627 if (e820.map[i].type != E820_RAM)
628 continue;
629 start = PFN_UP(e820.map[i].addr);
630 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
631 if (start >= end)
632 continue;
633 if (end > max_pfn)
634 max_pfn = end;
635 memory_present(0, start, end);
636 }
637}
638
639/*
640 * Free all available memory for boot time allocation. Used
641 * as a callback function by efi_memmap_walk().
642 */
643
644static int __init
645free_available_memory(unsigned long start, unsigned long end, void *arg)
646{
647 /* check max_low_pfn */
648 if (start >= (max_low_pfn << PAGE_SHIFT))
649 return 0;
650 if (end >= (max_low_pfn << PAGE_SHIFT))
651 end = max_low_pfn << PAGE_SHIFT;
652 if (start < end)
653 free_bootmem(start, end - start);
654
655 return 0;
656}
657/*
658 * Register fully available low RAM pages with the bootmem allocator.
659 */
660void __init register_bootmem_low_pages(unsigned long max_low_pfn)
661{
662 int i;
663
664 if (efi_enabled) {
665 efi_memmap_walk(free_available_memory, NULL);
666 return;
667 }
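	/*
	 * PFN_UP() rounds an address up to the next page frame while PFN_DOWN()
	 * rounds it down; with 4 KB pages, PFN_UP(0x9fc00) is 0xa0 whereas
	 * PFN_DOWN(0x9fc00) is 0x9f.  Partial pages at either end of an e820
	 * entry are therefore never handed to the bootmem allocator.
	 */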
668 for (i = 0; i < e820.nr_map; i++) {
669 unsigned long curr_pfn, last_pfn, size;
670 /*
671 * Reserve usable low memory
672 */
673 if (e820.map[i].type != E820_RAM)
674 continue;
675 /*
676 * We are rounding up the start address of usable memory:
677 */
678 curr_pfn = PFN_UP(e820.map[i].addr);
679 if (curr_pfn >= max_low_pfn)
680 continue;
681 /*
682 * ... and at the end of the usable range downwards:
683 */
684 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
685
686 if (last_pfn > max_low_pfn)
687 last_pfn = max_low_pfn;
688
689 /*
690 * .. finally, did all the rounding and playing
691 * around just make the area go away?
692 */
693 if (last_pfn <= curr_pfn)
694 continue;
695
696 size = last_pfn - curr_pfn;
697 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
698 }
699}
700
701void __init e820_register_memory(void)
702{
703 unsigned long gapstart, gapsize, round;
704 unsigned long long last;
705 int i;
706
707 /*
708 * Search for the bigest gap in the low 32 bits of the e820
709 * memory space.
710 */
711 last = 0x100000000ull;
712 gapstart = 0x10000000;
713 gapsize = 0x400000;
714 i = e820.nr_map;
715 while (--i >= 0) {
716 unsigned long long start = e820.map[i].addr;
717 unsigned long long end = start + e820.map[i].size;
718
719 /*
720 * Since "last" is at most 4GB, we know we'll
721 * fit in 32 bits if this condition is true
722 */
723 if (last > end) {
724 unsigned long gap = last - end;
725
726 if (gap > gapsize) {
727 gapsize = gap;
728 gapstart = end;
729 }
730 }
731 if (start < last)
732 last = start;
733 }
734
735 /*
736 * See how much we want to round up: start off with
737 * rounding to the next 1MB area.
738 */
739 round = 0x100000;
740 while ((gapsize >> 4) > round)
741 round += round;
742 /* Fun with two's complement */
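	/*
	 * Illustrative example: a 32 MB gap makes round grow to 2 MB, so a gap
	 * starting at 0x7ff00000 gives pci_mem_start = 0x80000000.
	 */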
743 pci_mem_start = (gapstart + round) & -round;
744
745 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
746 pci_mem_start, gapstart, gapsize);
747}
748
749void __init print_memory_map(char *who)
750{
751 int i;
752
753 for (i = 0; i < e820.nr_map; i++) {
754 printk(" %s: %016Lx - %016Lx ", who,
755 e820.map[i].addr,
756 e820.map[i].addr + e820.map[i].size);
757 switch (e820.map[i].type) {
758 case E820_RAM: printk("(usable)\n");
759 break;
760 case E820_RESERVED:
761 printk("(reserved)\n");
762 break;
763 case E820_ACPI:
764 printk("(ACPI data)\n");
765 break;
766 case E820_NVS:
767 printk("(ACPI NVS)\n");
768 break;
769 default: printk("type %u\n", e820.map[i].type);
770 break;
771 }
772 }
773}
774
775static __init __always_inline void efi_limit_regions(unsigned long long size)
776{
777 unsigned long long current_addr = 0;
778 efi_memory_desc_t *md, *next_md;
779 void *p, *p1;
780 int i, j;
781
782 j = 0;
783 p1 = memmap.map;
784 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
785 md = p;
786 next_md = p1;
787 current_addr = md->phys_addr +
788 PFN_PHYS(md->num_pages);
789 if (is_available_memory(md)) {
790 if (md->phys_addr >= size) continue;
791 memcpy(next_md, md, memmap.desc_size);
792 if (current_addr >= size) {
793 next_md->num_pages -=
794 PFN_UP(current_addr-size);
795 }
796 p1 += memmap.desc_size;
797 next_md = p1;
798 j++;
799 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
800 EFI_MEMORY_RUNTIME) {
801 /* In order to make runtime services
802 * available we have to include runtime
803 * memory regions in memory map */
804 memcpy(next_md, md, memmap.desc_size);
805 p1 += memmap.desc_size;
806 next_md = p1;
807 j++;
808 }
809 }
810 memmap.nr_map = j;
811 memmap.map_end = memmap.map +
812 (memmap.nr_map * memmap.desc_size);
813}
814
815void __init limit_regions(unsigned long long size)
816{
817 unsigned long long current_addr;
818 int i;
819
820 print_memory_map("limit_regions start");
821 if (efi_enabled) {
822 efi_limit_regions(size);
823 return;
824 }
825 for (i = 0; i < e820.nr_map; i++) {
826 current_addr = e820.map[i].addr + e820.map[i].size;
827 if (current_addr < size)
828 continue;
829
830 if (e820.map[i].type != E820_RAM)
831 continue;
832
833 if (e820.map[i].addr >= size) {
834 /*
835 * This region starts past the end of the
836 * requested size, skip it completely.
837 */
838 e820.nr_map = i;
839 } else {
840 e820.nr_map = i + 1;
841 e820.map[i].size -= current_addr - size;
842 }
843 print_memory_map("limit_regions endfor");
844 return;
845 }
846 print_memory_map("limit_regions endfunc");
847}
848
849/*
850 * This function checks if any part of the range <start,end> is mapped
851 * with type.
852 */
853int
854e820_any_mapped(u64 start, u64 end, unsigned type)
855{
856 int i;
857 for (i = 0; i < e820.nr_map; i++) {
858 const struct e820entry *ei = &e820.map[i];
859 if (type && ei->type != type)
860 continue;
861 if (ei->addr >= end || ei->addr + ei->size <= start)
862 continue;
863 return 1;
864 }
865 return 0;
866}
867EXPORT_SYMBOL_GPL(e820_any_mapped);
868
869 /*
870 * This function checks if the entire range <start,end> is mapped with type.
871 *
872 * Note: this function only works correctly if the e820 table is sorted and
873 * non-overlapping, which is the case.
874 */
875int __init
876e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
877{
878 u64 start = s;
879 u64 end = e;
880 int i;
881 for (i = 0; i < e820.nr_map; i++) {
882 struct e820entry *ei = &e820.map[i];
883 if (type && ei->type != type)
884 continue;
885 /* does this e820 entry overlap (at least partly) with <start,end>? */
886 if (ei->addr >= end || ei->addr + ei->size <= start)
887 continue;
888 /* if the region is at the beginning of <start,end> we move
889 * start to the end of the region since it's ok until there
890 */
891 if (ei->addr <= start)
892 start = ei->addr + ei->size;
893 /* if start is now at or beyond end, we're done, full
894 * coverage */
895 if (start >= end)
896 return 1; /* we're done */
897 }
898 return 0;
899}
900
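/*
 * "memmap=exactmap" throws away the BIOS-provided map so that subsequent
 * memmap=size@addr (RAM), memmap=size#addr (ACPI data) and memmap=size$addr
 * (reserved) options can describe the machine exactly, while a bare
 * memmap=size merely trims the existing map.  An illustrative command line:
 *
 *	memmap=exactmap memmap=640K@0 memmap=1023M@1M
 */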
901static int __init parse_memmap(char *arg)
902{
903 if (!arg)
904 return -EINVAL;
905
906 if (strcmp(arg, "exactmap") == 0) {
907#ifdef CONFIG_CRASH_DUMP
908 /* If we are doing a crash dump, we
909 * still need to know the real mem
910 * size before original memory map is
911 * reset.
912 */
913 find_max_pfn();
914 saved_max_pfn = max_pfn;
915#endif
916 e820.nr_map = 0;
917 user_defined_memmap = 1;
918 } else {
919 /* If the user specifies memory size, we
920 * limit the BIOS-provided memory map to
921 * that size. exactmap can be used to specify
922 * the exact map. mem=number can be used to
923 * trim the existing memory map.
924 */
925 unsigned long long start_at, mem_size;
926
927 mem_size = memparse(arg, &arg);
928 if (*arg == '@') {
929 start_at = memparse(arg+1, &arg);
930 add_memory_region(start_at, mem_size, E820_RAM);
931 } else if (*arg == '#') {
932 start_at = memparse(arg+1, &arg);
933 add_memory_region(start_at, mem_size, E820_ACPI);
934 } else if (*arg == '$') {
935 start_at = memparse(arg+1, &arg);
936 add_memory_region(start_at, mem_size, E820_RESERVED);
937 } else {
938 limit_regions(mem_size);
939 user_defined_memmap = 1;
940 }
941 }
942 return 0;
943}
944early_param("memmap", parse_memmap);
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
new file mode 100644
index 000000000000..92f812ba275c
--- /dev/null
+++ b/arch/x86/kernel/early_printk.c
@@ -0,0 +1,2 @@
1
2#include "../../x86_64/kernel/early_printk.c"
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
new file mode 100644
index 000000000000..2452c6fbe992
--- /dev/null
+++ b/arch/x86/kernel/efi_32.c
@@ -0,0 +1,712 @@
1/*
2 * Extensible Firmware Interface
3 *
4 * Based on Extensible Firmware Interface Specification version 1.0
5 *
6 * Copyright (C) 1999 VA Linux Systems
7 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
8 * Copyright (C) 1999-2002 Hewlett-Packard Co.
9 * David Mosberger-Tang <davidm@hpl.hp.com>
10 * Stephane Eranian <eranian@hpl.hp.com>
11 *
12 * Not all EFI Runtime Services are implemented yet, as EFI only
13 * supports physical mode addressing on SoftSDV. This is to be fixed
14 * in a future version. --drummond 1999-07-20
15 *
16 * Implemented EFI runtime services and virtual mode calls. --davidm
17 *
18 * Goutham Rao: <goutham.rao@intel.com>
19 * Skip non-WB memory and ignore empty memory ranges.
20 */
21
22#include <linux/kernel.h>
23#include <linux/init.h>
24#include <linux/mm.h>
25#include <linux/types.h>
26#include <linux/time.h>
27#include <linux/spinlock.h>
28#include <linux/bootmem.h>
29#include <linux/ioport.h>
30#include <linux/module.h>
31#include <linux/efi.h>
32#include <linux/kexec.h>
33
34#include <asm/setup.h>
35#include <asm/io.h>
36#include <asm/page.h>
37#include <asm/pgtable.h>
38#include <asm/processor.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
41
42#define EFI_DEBUG 0
43#define PFX "EFI: "
44
45extern efi_status_t asmlinkage efi_call_phys(void *, ...);
46
47struct efi efi;
48EXPORT_SYMBOL(efi);
49static struct efi efi_phys;
50struct efi_memory_map memmap;
51
52/*
53 * We require an early boot_ioremap mapping mechanism initially
54 */
55extern void * boot_ioremap(unsigned long, unsigned long);
56
57/*
58 * To call an EFI runtime service in physical addressing mode we need a
59 * prelog/epilog around the invocation to disable interrupts, to claim the
60 * EFI runtime service handler exclusively and to duplicate the memory
61 * mapping in low address space, say 0 - 3G.
62 */
63
64static unsigned long efi_rt_eflags;
65static DEFINE_SPINLOCK(efi_rt_lock);
66static pgd_t efi_bak_pg_dir_pointer[2];
67
68static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
69{
70 unsigned long cr4;
71 unsigned long temp;
72 struct Xgt_desc_struct gdt_descr;
73
74 spin_lock(&efi_rt_lock);
75 local_irq_save(efi_rt_eflags);
76
77 /*
78 * If I don't have PSE, I should just duplicate two entries in page
79 * directory. If I have PSE, I just need to duplicate one entry in
80 * page directory.
81 */
82 cr4 = read_cr4();
83
84 if (cr4 & X86_CR4_PSE) {
85 efi_bak_pg_dir_pointer[0].pgd =
86 swapper_pg_dir[pgd_index(0)].pgd;
87 swapper_pg_dir[0].pgd =
88 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
89 } else {
90 efi_bak_pg_dir_pointer[0].pgd =
91 swapper_pg_dir[pgd_index(0)].pgd;
92 efi_bak_pg_dir_pointer[1].pgd =
93 swapper_pg_dir[pgd_index(0x400000)].pgd;
94 swapper_pg_dir[pgd_index(0)].pgd =
95 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
96 temp = PAGE_OFFSET + 0x400000;
97 swapper_pg_dir[pgd_index(0x400000)].pgd =
98 swapper_pg_dir[pgd_index(temp)].pgd;
99 }
100
101 /*
102 * Flush the TLB so that the duplicated low mappings take effect.
103 */
104 local_flush_tlb();
105
106 gdt_descr.address = __pa(get_cpu_gdt_table(0));
107 gdt_descr.size = GDT_SIZE - 1;
108 load_gdt(&gdt_descr);
109}
110
111static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
112{
113 unsigned long cr4;
114 struct Xgt_desc_struct gdt_descr;
115
116 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
117 gdt_descr.size = GDT_SIZE - 1;
118 load_gdt(&gdt_descr);
119
120 cr4 = read_cr4();
121
122 if (cr4 & X86_CR4_PSE) {
123 swapper_pg_dir[pgd_index(0)].pgd =
124 efi_bak_pg_dir_pointer[0].pgd;
125 } else {
126 swapper_pg_dir[pgd_index(0)].pgd =
127 efi_bak_pg_dir_pointer[0].pgd;
128 swapper_pg_dir[pgd_index(0x400000)].pgd =
129 efi_bak_pg_dir_pointer[1].pgd;
130 }
131
132 /*
133 * After the lock is released, the original page table is restored.
134 */
135 local_flush_tlb();
136
137 local_irq_restore(efi_rt_eflags);
138 spin_unlock(&efi_rt_lock);
139}
140
141static efi_status_t
142phys_efi_set_virtual_address_map(unsigned long memory_map_size,
143 unsigned long descriptor_size,
144 u32 descriptor_version,
145 efi_memory_desc_t *virtual_map)
146{
147 efi_status_t status;
148
149 efi_call_phys_prelog();
150 status = efi_call_phys(efi_phys.set_virtual_address_map,
151 memory_map_size, descriptor_size,
152 descriptor_version, virtual_map);
153 efi_call_phys_epilog();
154 return status;
155}
156
157static efi_status_t
158phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
159{
160 efi_status_t status;
161
162 efi_call_phys_prelog();
163 status = efi_call_phys(efi_phys.get_time, tm, tc);
164 efi_call_phys_epilog();
165 return status;
166}
167
168inline int efi_set_rtc_mmss(unsigned long nowtime)
169{
170 int real_seconds, real_minutes;
171 efi_status_t status;
172 efi_time_t eft;
173 efi_time_cap_t cap;
174
175 spin_lock(&efi_rt_lock);
176 status = efi.get_time(&eft, &cap);
177 spin_unlock(&efi_rt_lock);
178 if (status != EFI_SUCCESS)
179 panic("Ooops, efitime: can't read time!\n");
180 real_seconds = nowtime % 60;
181 real_minutes = nowtime / 60;
182
183 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
184 real_minutes += 30;
185 real_minutes %= 60;
186
187 eft.minute = real_minutes;
188 eft.second = real_seconds;
189
190 if (status != EFI_SUCCESS) {
191 printk("Ooops: efitime: can't read time!\n");
192 return -1;
193 }
194 return 0;
195}
196/*
197 * This is used during kernel init before runtime
198 * services have been remapped and also during suspend; therefore,
199 * it may need to be called in both physical and virtual modes.
200 */
201inline unsigned long efi_get_time(void)
202{
203 efi_status_t status;
204 efi_time_t eft;
205 efi_time_cap_t cap;
206
207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
215 if (status != EFI_SUCCESS)
216 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
217
218 return mktime(eft.year, eft.month, eft.day, eft.hour,
219 eft.minute, eft.second);
220}
221
222int is_available_memory(efi_memory_desc_t * md)
223{
224 if (!(md->attribute & EFI_MEMORY_WB))
225 return 0;
226
227 switch (md->type) {
228 case EFI_LOADER_CODE:
229 case EFI_LOADER_DATA:
230 case EFI_BOOT_SERVICES_CODE:
231 case EFI_BOOT_SERVICES_DATA:
232 case EFI_CONVENTIONAL_MEMORY:
233 return 1;
234 }
235 return 0;
236}
237
238/*
239 * We need to map the EFI memory map again after paging_init().
240 */
241void __init efi_map_memmap(void)
242{
243 memmap.map = NULL;
244
245 memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
246 (memmap.nr_map * memmap.desc_size));
247 if (memmap.map == NULL)
248 printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
249
250 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
251}
252
253#if EFI_DEBUG
254static void __init print_efi_memmap(void)
255{
256 efi_memory_desc_t *md;
257 void *p;
258 int i;
259
260 for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
261 md = p;
262 printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
263 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
264 i, md->type, md->attribute, md->phys_addr,
265 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
266 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
267 }
268}
269#endif /* EFI_DEBUG */
270
271/*
272 * Walks the EFI memory map and calls CALLBACK once for each EFI
273 * memory descriptor that has memory that is available for kernel use.
274 */
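/*
 * Adjacent available ranges are coalesced, and each reported range is trimmed
 * to whole pages (start rounded up, end rounded down) before the callback
 * sees it.
 */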
275void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
276{
277 int prev_valid = 0;
278 struct range {
279 unsigned long start;
280 unsigned long end;
281 } uninitialized_var(prev), curr;
282 efi_memory_desc_t *md;
283 unsigned long start, end;
284 void *p;
285
286 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
287 md = p;
288
289 if ((md->num_pages == 0) || (!is_available_memory(md)))
290 continue;
291
292 curr.start = md->phys_addr;
293 curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
294
295 if (!prev_valid) {
296 prev = curr;
297 prev_valid = 1;
298 } else {
299 if (curr.start < prev.start)
300 printk(KERN_INFO PFX "Unordered memory map\n");
301 if (prev.end == curr.start)
302 prev.end = curr.end;
303 else {
304 start =
305 (unsigned long) (PAGE_ALIGN(prev.start));
306 end = (unsigned long) (prev.end & PAGE_MASK);
307 if ((end > start)
308 && (*callback) (start, end, arg) < 0)
309 return;
310 prev = curr;
311 }
312 }
313 }
314 if (prev_valid) {
315 start = (unsigned long) PAGE_ALIGN(prev.start);
316 end = (unsigned long) (prev.end & PAGE_MASK);
317 if (end > start)
318 (*callback) (start, end, arg);
319 }
320}
321
322void __init efi_init(void)
323{
324 efi_config_table_t *config_tables;
325 efi_runtime_services_t *runtime;
326 efi_char16_t *c16;
327 char vendor[100] = "unknown";
328 unsigned long num_config_tables;
329 int i = 0;
330
331 memset(&efi, 0, sizeof(efi) );
332 memset(&efi_phys, 0, sizeof(efi_phys));
333
334 efi_phys.systab = EFI_SYSTAB;
335 memmap.phys_map = EFI_MEMMAP;
336 memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE;
337 memmap.desc_version = EFI_MEMDESC_VERSION;
338 memmap.desc_size = EFI_MEMDESC_SIZE;
339
340 efi.systab = (efi_system_table_t *)
341 boot_ioremap((unsigned long) efi_phys.systab,
342 sizeof(efi_system_table_t));
343 /*
344 * Verify the EFI Table
345 */
346 if (efi.systab == NULL)
347 printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
348 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
349 printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
350 if ((efi.systab->hdr.revision >> 16) == 0)
351 printk(KERN_ERR PFX "Warning: EFI system table version "
352 "%d.%02d, expected 1.00 or greater\n",
353 efi.systab->hdr.revision >> 16,
354 efi.systab->hdr.revision & 0xffff);
355
356 /*
357 * Grab some details from the system table
358 */
359 num_config_tables = efi.systab->nr_tables;
360 config_tables = (efi_config_table_t *)efi.systab->tables;
361 runtime = efi.systab->runtime;
362
363 /*
364 * Show what we know for posterity
365 */
366 c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
367 if (c16) {
368 for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
369 vendor[i] = *c16++;
370 vendor[i] = '\0';
371 } else
372 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
373
374 printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n",
375 efi.systab->hdr.revision >> 16,
376 efi.systab->hdr.revision & 0xffff, vendor);
377
378 /*
379 * Let's see what config tables the firmware passed to us.
380 */
381 config_tables = (efi_config_table_t *)
382 boot_ioremap((unsigned long) config_tables,
383 num_config_tables * sizeof(efi_config_table_t));
384
385 if (config_tables == NULL)
386 printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
387
388 efi.mps = EFI_INVALID_TABLE_ADDR;
389 efi.acpi = EFI_INVALID_TABLE_ADDR;
390 efi.acpi20 = EFI_INVALID_TABLE_ADDR;
391 efi.smbios = EFI_INVALID_TABLE_ADDR;
392 efi.sal_systab = EFI_INVALID_TABLE_ADDR;
393 efi.boot_info = EFI_INVALID_TABLE_ADDR;
394 efi.hcdp = EFI_INVALID_TABLE_ADDR;
395 efi.uga = EFI_INVALID_TABLE_ADDR;
396
397 for (i = 0; i < num_config_tables; i++) {
398 if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
399 efi.mps = config_tables[i].table;
400 printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
401 } else
402 if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
403 efi.acpi20 = config_tables[i].table;
404 printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
405 } else
406 if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
407 efi.acpi = config_tables[i].table;
408 printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
409 } else
410 if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
411 efi.smbios = config_tables[i].table;
412 printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
413 } else
414 if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
415 efi.hcdp = config_tables[i].table;
416 printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
417 } else
418 if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
419 efi.uga = config_tables[i].table;
420 printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
421 }
422 }
423 printk("\n");
424
425 /*
426 * Check out the runtime services table. We need to map
427 * the runtime services table so that we can grab the physical
428 * address of several of the EFI runtime functions, needed to
429 * set the firmware into virtual mode.
430 */
431
432 runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
433 runtime,
434 sizeof(efi_runtime_services_t));
435 if (runtime != NULL) {
436 /*
437 * We will only need *early* access to the following
438 * two EFI runtime services before set_virtual_address_map
439 * is invoked.
440 */
441 efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
442 efi_phys.set_virtual_address_map =
443 (efi_set_virtual_address_map_t *)
444 runtime->set_virtual_address_map;
445 } else
446 printk(KERN_ERR PFX "Could not map the runtime service table!\n");
447
448 /* Map the EFI memory map for use until paging_init() */
449 memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
450 if (memmap.map == NULL)
451 printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
452
453 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
454
455#if EFI_DEBUG
456 print_efi_memmap();
457#endif
458}
459
460static inline void __init check_range_for_systab(efi_memory_desc_t *md)
461{
462 if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
463 ((unsigned long)efi_phys.systab < md->phys_addr +
464 ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
465 unsigned long addr;
466
467 addr = md->virt_addr - md->phys_addr +
468 (unsigned long)efi_phys.systab;
469 efi.systab = (efi_system_table_t *)addr;
470 }
471}
472
473/*
474 * Wrap all the virtual calls in a way that forces the parameters on the stack.
475 */
476
477#define efi_call_virt(f, args...) \
478 ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
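/*
 * regparm(0) forces all arguments onto the stack, which is what the EFI
 * calling convention expects; efi_call_virt(get_time, tm, tc), for instance,
 * ends up calling efi.systab->runtime->get_time(tm, tc) through such a
 * pointer.
 */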
479
480static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
481{
482 return efi_call_virt(get_time, tm, tc);
483}
484
485static efi_status_t virt_efi_set_time (efi_time_t *tm)
486{
487 return efi_call_virt(set_time, tm);
488}
489
490static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
491 efi_bool_t *pending,
492 efi_time_t *tm)
493{
494 return efi_call_virt(get_wakeup_time, enabled, pending, tm);
495}
496
497static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
498 efi_time_t *tm)
499{
500 return efi_call_virt(set_wakeup_time, enabled, tm);
501}
502
503static efi_status_t virt_efi_get_variable (efi_char16_t *name,
504 efi_guid_t *vendor, u32 *attr,
505 unsigned long *data_size, void *data)
506{
507 return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
508}
509
510static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
511 efi_char16_t *name,
512 efi_guid_t *vendor)
513{
514 return efi_call_virt(get_next_variable, name_size, name, vendor);
515}
516
517static efi_status_t virt_efi_set_variable (efi_char16_t *name,
518 efi_guid_t *vendor,
519 unsigned long attr,
520 unsigned long data_size, void *data)
521{
522 return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
523}
524
525static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
526{
527 return efi_call_virt(get_next_high_mono_count, count);
528}
529
530static void virt_efi_reset_system (int reset_type, efi_status_t status,
531 unsigned long data_size,
532 efi_char16_t *data)
533{
534 efi_call_virt(reset_system, reset_type, status, data_size, data);
535}
536
537/*
538 * This function will switch the EFI runtime services to virtual mode.
539 * Essentially, look through the EFI memmap and map every region that
540 * has the runtime attribute bit set in its memory descriptor and update
541 * that memory descriptor with the virtual address obtained from ioremap().
542 * This enables the runtime services to be called without having to
543 * thunk back into physical mode for every invocation.
544 */
545
546void __init efi_enter_virtual_mode(void)
547{
548 efi_memory_desc_t *md;
549 efi_status_t status;
550 void *p;
551
552 efi.systab = NULL;
553
554 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
555 md = p;
556
557 if (!(md->attribute & EFI_MEMORY_RUNTIME))
558 continue;
559
560 md->virt_addr = (unsigned long)ioremap(md->phys_addr,
561 md->num_pages << EFI_PAGE_SHIFT);
562 if (!(unsigned long)md->virt_addr) {
563 printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
564 (unsigned long)md->phys_addr);
565 }
566 /* update the virtual address of the EFI system table */
567 check_range_for_systab(md);
568 }
569
570 BUG_ON(!efi.systab);
571
572 status = phys_efi_set_virtual_address_map(
573 memmap.desc_size * memmap.nr_map,
574 memmap.desc_size,
575 memmap.desc_version,
576 memmap.phys_map);
577
578 if (status != EFI_SUCCESS) {
579 printk (KERN_ALERT "You are screwed! "
580 "Unable to switch EFI into virtual mode "
581 "(status=%lx)\n", status);
582 panic("EFI call to SetVirtualAddressMap() failed!");
583 }
584
585 /*
586 * Now that EFI is in virtual mode, update the function
587 * pointers in the runtime service table to the new virtual addresses.
588 */
589
590 efi.get_time = virt_efi_get_time;
591 efi.set_time = virt_efi_set_time;
592 efi.get_wakeup_time = virt_efi_get_wakeup_time;
593 efi.set_wakeup_time = virt_efi_set_wakeup_time;
594 efi.get_variable = virt_efi_get_variable;
595 efi.get_next_variable = virt_efi_get_next_variable;
596 efi.set_variable = virt_efi_set_variable;
597 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
598 efi.reset_system = virt_efi_reset_system;
599}
600
601void __init
602efi_initialize_iomem_resources(struct resource *code_resource,
603 struct resource *data_resource)
604{
605 struct resource *res;
606 efi_memory_desc_t *md;
607 void *p;
608
609 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
610 md = p;
611
612 if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
613 0x100000000ULL)
614 continue;
615 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
616 switch (md->type) {
617 case EFI_RESERVED_TYPE:
618 res->name = "Reserved Memory";
619 break;
620 case EFI_LOADER_CODE:
621 res->name = "Loader Code";
622 break;
623 case EFI_LOADER_DATA:
624 res->name = "Loader Data";
625 break;
626 case EFI_BOOT_SERVICES_DATA:
627 res->name = "BootServices Data";
628 break;
629 case EFI_BOOT_SERVICES_CODE:
630 res->name = "BootServices Code";
631 break;
632 case EFI_RUNTIME_SERVICES_CODE:
633 res->name = "Runtime Service Code";
634 break;
635 case EFI_RUNTIME_SERVICES_DATA:
636 res->name = "Runtime Service Data";
637 break;
638 case EFI_CONVENTIONAL_MEMORY:
639 res->name = "Conventional Memory";
640 break;
641 case EFI_UNUSABLE_MEMORY:
642 res->name = "Unusable Memory";
643 break;
644 case EFI_ACPI_RECLAIM_MEMORY:
645 res->name = "ACPI Reclaim";
646 break;
647 case EFI_ACPI_MEMORY_NVS:
648 res->name = "ACPI NVS";
649 break;
650 case EFI_MEMORY_MAPPED_IO:
651 res->name = "Memory Mapped IO";
652 break;
653 case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
654 res->name = "Memory Mapped IO Port Space";
655 break;
656 default:
657 res->name = "Reserved";
658 break;
659 }
660 res->start = md->phys_addr;
661 res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
662 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
663 if (request_resource(&iomem_resource, res) < 0)
664 printk(KERN_ERR PFX "Failed to allocate res %s : "
665 "0x%llx-0x%llx\n", res->name,
666 (unsigned long long)res->start,
667 (unsigned long long)res->end);
668 /*
669 * We don't know which region contains kernel data so we try
670 * it repeatedly and let the resource manager test it.
671 */
672 if (md->type == EFI_CONVENTIONAL_MEMORY) {
673 request_resource(res, code_resource);
674 request_resource(res, data_resource);
675#ifdef CONFIG_KEXEC
676 request_resource(res, &crashk_res);
677#endif
678 }
679 }
680}
681
682/*
683 * Convenience functions to obtain memory types and attributes
684 */
685
686u32 efi_mem_type(unsigned long phys_addr)
687{
688 efi_memory_desc_t *md;
689 void *p;
690
691 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
692 md = p;
693 if ((md->phys_addr <= phys_addr) && (phys_addr <
694 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
695 return md->type;
696 }
697 return 0;
698}
699
700u64 efi_mem_attributes(unsigned long phys_addr)
701{
702 efi_memory_desc_t *md;
703 void *p;
704
705 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
706 md = p;
707 if ((md->phys_addr <= phys_addr) && (phys_addr <
708 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
709 return md->attribute;
710 }
711 return 0;
712}
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
new file mode 100644
index 000000000000..ef00bb77d7e4
--- /dev/null
+++ b/arch/x86/kernel/efi_stub_32.S
@@ -0,0 +1,122 @@
1/*
2 * EFI call stub for IA32.
3 *
4 * This stub allows us to make EFI calls in physical mode with interrupts
5 * turned off.
6 */
7
8#include <linux/linkage.h>
9#include <asm/page.h>
10
11/*
12 * efi_call_phys(void *, ...) is a function with variable parameters.
13 * All the callers of this function assure that all the parameters are 4-bytes.
14 */
15
16/*
17 * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
18 * So we'd better save all of them at the beginning of this function and restore
19 * at the end no matter how many we use, because we cannot assume that the
20 * EFI runtime service functions comply with the gcc calling convention.
21 */
22
23.text
24ENTRY(efi_call_phys)
25 /*
26 * 0. This function can only be called from within the Linux kernel, so CS
27 * has been set to 0x0010 and DS and SS to 0x0018. In EFI the values of
28 * these registers are the same, and the corresponding GDT entries are
29 * identical, so nothing needs to be done about the segment registers or
30 * the GDT apart from changing the GDT base register in prelog and epilog.
31 */
32
33 /*
34 * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
35 * To make the switch from virtual mode to flat mode go smoothly, the
36 * mapping of lower virtual memory has been created in prelog and
37 * epilog.
38 */
39 movl $1f, %edx
40 subl $__PAGE_OFFSET, %edx
41 jmp *%edx
421:
43
44 /*
45 * 2. Now on the top of stack is the return
46 * address in the caller of efi_call_phys(), then parameter 1,
47 * parameter 2, ..., param n. To make things easy, we save the return
48 * address of efi_call_phys in a global variable.
49 */
50 popl %edx
51 movl %edx, saved_return_addr
52 /* get the function pointer into ECX*/
53 popl %ecx
54 movl %ecx, efi_rt_function_ptr
55 movl $2f, %edx
56 subl $__PAGE_OFFSET, %edx
57 pushl %edx
58
59 /*
60 * 3. Clear PG bit in %CR0.
61 */
62 movl %cr0, %edx
63 andl $0x7fffffff, %edx
64 movl %edx, %cr0
65 jmp 1f
661:
67
68 /*
69 * 4. Adjust stack pointer.
70 */
71 subl $__PAGE_OFFSET, %esp
72
73 /*
74 * 5. Call the physical function.
75 */
76 jmp *%ecx
77
782:
79 /*
80 * 6. After the EFI runtime service returns, control comes back to the
81 * following instruction, so readjust the stack pointer first.
82 */
83 addl $__PAGE_OFFSET, %esp
84
85 /*
86 * 7. Restore PG bit
87 */
88 movl %cr0, %edx
89 orl $0x80000000, %edx
90 movl %edx, %cr0
91 jmp 1f
921:
93 /*
94 * 8. Now restore the virtual mode from flat mode by
95 * adding EIP with PAGE_OFFSET.
96 */
97 movl $1f, %edx
98 jmp *%edx
991:
100
101 /*
102 * 9. Balance the stack. And because EAX contains the return value,
103 * we'd better not clobber it.
104 */
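	/*
	 * Re-push the function pointer that was popped off in step 2 so the
	 * stack looks exactly as the caller left it; only EDX and ECX are
	 * used as scratch, so the return value in EAX survives.
	 */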
105 leal efi_rt_function_ptr, %edx
106 movl (%edx), %ecx
107 pushl %ecx
108
109 /*
110 * 10. Push the saved return address onto the stack and return.
111 */
112 leal saved_return_addr, %edx
113 movl (%edx), %ecx
114 pushl %ecx
115 ret
116.previous
117
118.data
119saved_return_addr:
120 .long 0
121efi_rt_function_ptr:
122 .long 0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
new file mode 100644
index 000000000000..290b7bc82da3
--- /dev/null
+++ b/arch/x86/kernel/entry_32.S
@@ -0,0 +1,1112 @@
1/*
2 * linux/arch/i386/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7/*
8 * entry.S contains the system-call and fault low-level handling routines.
9 * This also contains the timer-interrupt handler, as well as all interrupts
10 * and faults that can result in a task-switch.
11 *
12 * NOTE: This code handles signal-recognition, which happens every time
13 * after a timer-interrupt and after each system call.
14 *
15 * I changed all the .align's to 4 (16 byte alignment), as that's faster
16 * on a 486.
17 *
18 * Stack layout in 'syscall_exit':
19 * ptrace needs to have all regs on the stack.
20 * if the order here is changed, it needs to be
21 * updated in fork.c:copy_process, signal.c:do_signal,
22 * ptrace.c and ptrace.h
23 *
24 * 0(%esp) - %ebx
25 * 4(%esp) - %ecx
26 * 8(%esp) - %edx
27 * C(%esp) - %esi
28 * 10(%esp) - %edi
29 * 14(%esp) - %ebp
30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds
32 * 20(%esp) - %es
33 * 24(%esp) - %fs
34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %eip
36 * 30(%esp) - %cs
37 * 34(%esp) - %eflags
38 * 38(%esp) - %oldesp
39 * 3C(%esp) - %oldss
40 *
41 * "current" is in register %ebx during any slow entries.
42 */
43
44#include <linux/linkage.h>
45#include <asm/thread_info.h>
46#include <asm/irqflags.h>
47#include <asm/errno.h>
48#include <asm/segment.h>
49#include <asm/smp.h>
50#include <asm/page.h>
51#include <asm/desc.h>
52#include <asm/percpu.h>
53#include <asm/dwarf2.h>
54#include "irq_vectors.h"
55
56/*
57 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
65 * Allowing a register to be clobbered can shrink the paravirt replacement
66 * enough to patch inline, increasing performance.
67 */
68
69#define nr_syscalls ((syscall_table_size)/4)
70
71CF_MASK = 0x00000001
72TF_MASK = 0x00000100
73IF_MASK = 0x00000200
74DF_MASK = 0x00000400
75NT_MASK = 0x00004000
76VM_MASK = 0x00020000
77
78#ifdef CONFIG_PREEMPT
79#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
80#else
81#define preempt_stop(clobbers)
82#define resume_kernel restore_nocheck
83#endif
84
85.macro TRACE_IRQS_IRET
86#ifdef CONFIG_TRACE_IRQFLAGS
87 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
88 jz 1f
89 TRACE_IRQS_ON
901:
91#endif
92.endm
93
94#ifdef CONFIG_VM86
95#define resume_userspace_sig check_userspace
96#else
97#define resume_userspace_sig resume_userspace
98#endif
99
100#define SAVE_ALL \
101 cld; \
102 pushl %fs; \
103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET fs, 0;*/\
105 pushl %es; \
106 CFI_ADJUST_CFA_OFFSET 4;\
107 /*CFI_REL_OFFSET es, 0;*/\
108 pushl %ds; \
109 CFI_ADJUST_CFA_OFFSET 4;\
110 /*CFI_REL_OFFSET ds, 0;*/\
111 pushl %eax; \
112 CFI_ADJUST_CFA_OFFSET 4;\
113 CFI_REL_OFFSET eax, 0;\
114 pushl %ebp; \
115 CFI_ADJUST_CFA_OFFSET 4;\
116 CFI_REL_OFFSET ebp, 0;\
117 pushl %edi; \
118 CFI_ADJUST_CFA_OFFSET 4;\
119 CFI_REL_OFFSET edi, 0;\
120 pushl %esi; \
121 CFI_ADJUST_CFA_OFFSET 4;\
122 CFI_REL_OFFSET esi, 0;\
123 pushl %edx; \
124 CFI_ADJUST_CFA_OFFSET 4;\
125 CFI_REL_OFFSET edx, 0;\
126 pushl %ecx; \
127 CFI_ADJUST_CFA_OFFSET 4;\
128 CFI_REL_OFFSET ecx, 0;\
129 pushl %ebx; \
130 CFI_ADJUST_CFA_OFFSET 4;\
131 CFI_REL_OFFSET ebx, 0;\
132 movl $(__USER_DS), %edx; \
133 movl %edx, %ds; \
134 movl %edx, %es; \
135 movl $(__KERNEL_PERCPU), %edx; \
136 movl %edx, %fs
137
138#define RESTORE_INT_REGS \
139 popl %ebx; \
140 CFI_ADJUST_CFA_OFFSET -4;\
141 CFI_RESTORE ebx;\
142 popl %ecx; \
143 CFI_ADJUST_CFA_OFFSET -4;\
144 CFI_RESTORE ecx;\
145 popl %edx; \
146 CFI_ADJUST_CFA_OFFSET -4;\
147 CFI_RESTORE edx;\
148 popl %esi; \
149 CFI_ADJUST_CFA_OFFSET -4;\
150 CFI_RESTORE esi;\
151 popl %edi; \
152 CFI_ADJUST_CFA_OFFSET -4;\
153 CFI_RESTORE edi;\
154 popl %ebp; \
155 CFI_ADJUST_CFA_OFFSET -4;\
156 CFI_RESTORE ebp;\
157 popl %eax; \
158 CFI_ADJUST_CFA_OFFSET -4;\
159 CFI_RESTORE eax
160
161#define RESTORE_REGS \
162 RESTORE_INT_REGS; \
1631: popl %ds; \
164 CFI_ADJUST_CFA_OFFSET -4;\
165 /*CFI_RESTORE ds;*/\
1662: popl %es; \
167 CFI_ADJUST_CFA_OFFSET -4;\
168 /*CFI_RESTORE es;*/\
1693: popl %fs; \
170 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE fs;*/\
172.pushsection .fixup,"ax"; \
1734: movl $0,(%esp); \
174 jmp 1b; \
1755: movl $0,(%esp); \
176 jmp 2b; \
1776: movl $0,(%esp); \
178 jmp 3b; \
179.section __ex_table,"a";\
180 .align 4; \
181 .long 1b,4b; \
182 .long 2b,5b; \
183 .long 3b,6b; \
184.popsection
185
186#define RING0_INT_FRAME \
187 CFI_STARTPROC simple;\
188 CFI_SIGNAL_FRAME;\
189 CFI_DEF_CFA esp, 3*4;\
190 /*CFI_OFFSET cs, -2*4;*/\
191 CFI_OFFSET eip, -3*4
192
193#define RING0_EC_FRAME \
194 CFI_STARTPROC simple;\
195 CFI_SIGNAL_FRAME;\
196 CFI_DEF_CFA esp, 4*4;\
197 /*CFI_OFFSET cs, -2*4;*/\
198 CFI_OFFSET eip, -3*4
199
200#define RING0_PTREGS_FRAME \
201 CFI_STARTPROC simple;\
202 CFI_SIGNAL_FRAME;\
203 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
204 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
205 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
206 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
207 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
208 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
209 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
210 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
211 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
212 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
213 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
214 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
215
216ENTRY(ret_from_fork)
217 CFI_STARTPROC
218 pushl %eax
219 CFI_ADJUST_CFA_OFFSET 4
220 call schedule_tail
221 GET_THREAD_INFO(%ebp)
222 popl %eax
223 CFI_ADJUST_CFA_OFFSET -4
224 pushl $0x0202 # Reset kernel eflags
225 CFI_ADJUST_CFA_OFFSET 4
226 popfl
227 CFI_ADJUST_CFA_OFFSET -4
228 jmp syscall_exit
229 CFI_ENDPROC
230END(ret_from_fork)
231
232/*
233 * Return to user mode is not as complex as all this looks,
234 * but we want the default path for a system call return to
235 * go as quickly as possible which is why some of this is
236 * less clear than it otherwise should be.
237 */
238
239 # userspace resumption stub bypassing syscall exit tracing
240 ALIGN
241 RING0_PTREGS_FRAME
242ret_from_exception:
243 preempt_stop(CLBR_ANY)
244ret_from_intr:
245 GET_THREAD_INFO(%ebp)
246check_userspace:
247 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
248 movb PT_CS(%esp), %al
249 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
250 cmpl $USER_RPL, %eax
251 jb resume_kernel # not returning to v8086 or userspace
252
253ENTRY(resume_userspace)
254 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
255 # setting need_resched or sigpending
256 # between sampling and the iret
257 movl TI_flags(%ebp), %ecx
258 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
259 # int/exception return?
260 jne work_pending
261 jmp restore_all
262END(ret_from_exception)
263
264#ifdef CONFIG_PREEMPT
265ENTRY(resume_kernel)
266 DISABLE_INTERRUPTS(CLBR_ANY)
267 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
268 jnz restore_nocheck
269need_resched:
270 movl TI_flags(%ebp), %ecx # need_resched set ?
271 testb $_TIF_NEED_RESCHED, %cl
272 jz restore_all
273 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
274 jz restore_all
275 call preempt_schedule_irq
276 jmp need_resched
277END(resume_kernel)
278#endif
279 CFI_ENDPROC
280
281/* SYSENTER_RETURN points to after the "sysenter" instruction in
282 the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */
283
284 # sysenter call handler stub
285ENTRY(sysenter_entry)
286 CFI_STARTPROC simple
287 CFI_SIGNAL_FRAME
288 CFI_DEF_CFA esp, 0
289 CFI_REGISTER esp, ebp
290 movl TSS_sysenter_esp0(%esp),%esp
291sysenter_past_esp:
292 /*
293 * No need to follow this irqs on/off section: the syscall
294 * disabled irqs and here we enable it straight after entry:
295 */
296 ENABLE_INTERRUPTS(CLBR_NONE)
297 pushl $(__USER_DS)
298 CFI_ADJUST_CFA_OFFSET 4
299 /*CFI_REL_OFFSET ss, 0*/
300 pushl %ebp
301 CFI_ADJUST_CFA_OFFSET 4
302 CFI_REL_OFFSET esp, 0
303 pushfl
304 CFI_ADJUST_CFA_OFFSET 4
305 pushl $(__USER_CS)
306 CFI_ADJUST_CFA_OFFSET 4
307 /*CFI_REL_OFFSET cs, 0*/
308 /*
309 * Push current_thread_info()->sysenter_return to the stack.
310 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
311 * pushed above; +8 corresponds to copy_thread's esp0 setting.
312 */
313 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
314 CFI_ADJUST_CFA_OFFSET 4
315 CFI_REL_OFFSET eip, 0
316
317/*
318 * Load the potential sixth argument from user stack.
319 * Careful about security.
320 */
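	# %ebp must lie at least 4 bytes below __PAGE_OFFSET so that the
	# 4-byte load below cannot touch kernel memory.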
321 cmpl $__PAGE_OFFSET-3,%ebp
322 jae syscall_fault
3231: movl (%ebp),%ebp
324.section __ex_table,"a"
325 .align 4
326 .long 1b,syscall_fault
327.previous
328
329 pushl %eax
330 CFI_ADJUST_CFA_OFFSET 4
331 SAVE_ALL
332 GET_THREAD_INFO(%ebp)
333
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
336 jnz syscall_trace_entry
337 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys
339 call *sys_call_table(,%eax,4)
340 movl %eax,PT_EAX(%esp)
341 DISABLE_INTERRUPTS(CLBR_ANY)
342 TRACE_IRQS_OFF
343 movl TI_flags(%ebp), %ecx
344 testw $_TIF_ALLWORK_MASK, %cx
345 jne syscall_exit_work
346/* if something modifies registers it must also disable sysexit */
347 movl PT_EIP(%esp), %edx
348 movl PT_OLDESP(%esp), %ecx
349 xorl %ebp,%ebp
350 TRACE_IRQS_ON
3511: mov PT_FS(%esp), %fs
352 ENABLE_INTERRUPTS_SYSEXIT
353 CFI_ENDPROC
354.pushsection .fixup,"ax"
3552: movl $0,PT_FS(%esp)
356 jmp 1b
357.section __ex_table,"a"
358 .align 4
359 .long 1b,2b
360.popsection
361ENDPROC(sysenter_entry)
362
363 # system call handler stub
364ENTRY(system_call)
365 RING0_INT_FRAME # can't unwind into user space anyway
366 pushl %eax # save orig_eax
367 CFI_ADJUST_CFA_OFFSET 4
368 SAVE_ALL
369 GET_THREAD_INFO(%ebp)
370 # system call tracing in operation / emulation
371 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
372 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
373 jnz syscall_trace_entry
374 cmpl $(nr_syscalls), %eax
375 jae syscall_badsys
376syscall_call:
377 call *sys_call_table(,%eax,4)
378 movl %eax,PT_EAX(%esp) # store the return value
379syscall_exit:
380 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
381 # setting need_resched or sigpending
382 # between sampling and the iret
383 TRACE_IRQS_OFF
384 testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
385 jz no_singlestep
386 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
387no_singlestep:
388 movl TI_flags(%ebp), %ecx
389 testw $_TIF_ALLWORK_MASK, %cx # current->work
390 jne syscall_exit_work
391
392restore_all:
393 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
394 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
395 # are returning to the kernel.
396 # See comments in process.c:copy_thread() for details.
397 movb PT_OLDSS(%esp), %ah
398 movb PT_CS(%esp), %al
399 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
400 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
401 CFI_REMEMBER_STATE
402 je ldt_ss # returning to user-space with LDT SS
403restore_nocheck:
404 TRACE_IRQS_IRET
405restore_nocheck_notrace:
406 RESTORE_REGS
407 addl $4, %esp # skip orig_eax/error_code
408 CFI_ADJUST_CFA_OFFSET -4
4091: INTERRUPT_RETURN
410.section .fixup,"ax"
411iret_exc:
412 pushl $0 # no error code
413 pushl $do_iret_error
414 jmp error_code
415.previous
416.section __ex_table,"a"
417 .align 4
418 .long 1b,iret_exc
419.previous
420
421 CFI_RESTORE_STATE
422ldt_ss:
423 larl PT_OLDSS(%esp), %eax
424 jnz restore_nocheck
425 testl $0x00400000, %eax # returning to 32bit stack?
426 jnz restore_nocheck # all right, normal return
427
428#ifdef CONFIG_PARAVIRT
429 /*
430 * The kernel can't run on a non-flat stack if paravirt mode
431 * is active. Rather than try to fixup the high bits of
432 * ESP, bypass this code entirely. This may break DOSemu
433 * and/or Wine support in a paravirt VM, although the option
434 * is still available to implement the setting of the high
435 * 16-bits in the INTERRUPT_RETURN paravirt-op.
436 */
437 cmpl $0, paravirt_ops+PARAVIRT_enabled
438 jne restore_nocheck
439#endif
440
441 /* If returning to userspace with 16bit stack,
442 * try to fix the higher word of ESP, as the CPU
443 * won't restore it.
444 * This is an "official" bug of all the x86-compatible
445 * CPUs, which we can try to work around to make
446 * dosemu and wine happy. */
447 movl PT_OLDESP(%esp), %eax
448 movl %esp, %edx
449 call patch_espfix_desc
450 pushl $__ESPFIX_SS
451 CFI_ADJUST_CFA_OFFSET 4
452 pushl %eax
453 CFI_ADJUST_CFA_OFFSET 4
454 DISABLE_INTERRUPTS(CLBR_EAX)
455 TRACE_IRQS_OFF
456 lss (%esp), %esp
457 CFI_ADJUST_CFA_OFFSET -8
458 jmp restore_nocheck
459 CFI_ENDPROC
460ENDPROC(system_call)
461
462 # perform work that needs to be done immediately before resumption
463 ALIGN
464 RING0_PTREGS_FRAME # can't unwind into user space anyway
465work_pending:
466 testb $_TIF_NEED_RESCHED, %cl
467 jz work_notifysig
468work_resched:
469 call schedule
470 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
471 # setting need_resched or sigpending
472 # between sampling and the iret
473 TRACE_IRQS_OFF
474 movl TI_flags(%ebp), %ecx
475 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
476 # than syscall tracing?
477 jz restore_all
478 testb $_TIF_NEED_RESCHED, %cl
479 jnz work_resched
480
481work_notifysig: # deal with pending signals and
482 # notify-resume requests
483#ifdef CONFIG_VM86
484 testl $VM_MASK, PT_EFLAGS(%esp)
485 movl %esp, %eax
486 jne work_notifysig_v86 # returning to kernel-space or
487 # vm86-space
488 xorl %edx, %edx
489 call do_notify_resume
490 jmp resume_userspace_sig
491
492 ALIGN
493work_notifysig_v86:
494 pushl %ecx # save ti_flags for do_notify_resume
495 CFI_ADJUST_CFA_OFFSET 4
496 call save_v86_state # %eax contains pt_regs pointer
497 popl %ecx
498 CFI_ADJUST_CFA_OFFSET -4
499 movl %eax, %esp
500#else
501 movl %esp, %eax
502#endif
503 xorl %edx, %edx
504 call do_notify_resume
505 jmp resume_userspace_sig
506END(work_pending)
507
508	# perform syscall entry tracing
509 ALIGN
510syscall_trace_entry:
511 movl $-ENOSYS,PT_EAX(%esp)
512 movl %esp, %eax
513 xorl %edx,%edx
514 call do_syscall_trace
515 cmpl $0, %eax
516 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
517 # so must skip actual syscall
518 movl PT_ORIG_EAX(%esp), %eax
519 cmpl $(nr_syscalls), %eax
520 jnae syscall_call
521 jmp syscall_exit
522END(syscall_trace_entry)
523
524 # perform syscall exit tracing
525 ALIGN
526syscall_exit_work:
527 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
528 jz work_pending
529 TRACE_IRQS_ON
530 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
531 # schedule() instead
532 movl %esp, %eax
533 movl $1, %edx
534 call do_syscall_trace
535 jmp resume_userspace
536END(syscall_exit_work)
537 CFI_ENDPROC
538
539 RING0_INT_FRAME # can't unwind into user space anyway
540syscall_fault:
541 pushl %eax # save orig_eax
542 CFI_ADJUST_CFA_OFFSET 4
543 SAVE_ALL
544 GET_THREAD_INFO(%ebp)
545 movl $-EFAULT,PT_EAX(%esp)
546 jmp resume_userspace
547END(syscall_fault)
548
549syscall_badsys:
550 movl $-ENOSYS,PT_EAX(%esp)
551 jmp resume_userspace
552END(syscall_badsys)
553 CFI_ENDPROC
554
555#define FIXUP_ESPFIX_STACK \
556	/* since we are on the wrong stack, we can't do this in C code :( */ \
557 PER_CPU(gdt_page, %ebx); \
558 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
559 addl %esp, %eax; \
560 pushl $__KERNEL_DS; \
561 CFI_ADJUST_CFA_OFFSET 4; \
562 pushl %eax; \
563 CFI_ADJUST_CFA_OFFSET 4; \
564 lss (%esp), %esp; \
565 CFI_ADJUST_CFA_OFFSET -8;
566#define UNWIND_ESPFIX_STACK \
567 movl %ss, %eax; \
568 /* see if on espfix stack */ \
569 cmpw $__ESPFIX_SS, %ax; \
570 jne 27f; \
571 movl $__KERNEL_DS, %eax; \
572 movl %eax, %ds; \
573 movl %eax, %es; \
574 /* switch to normal stack */ \
575 FIXUP_ESPFIX_STACK; \
57627:;
577
578/*
579 * Build the entry stubs and pointer table with
580 * some assembler magic.
581 */
582.data
583ENTRY(interrupt)
584.text
585
586ENTRY(irq_entries_start)
587 RING0_INT_FRAME
588vector=0
589.rept NR_IRQS
590 ALIGN
591 .if vector
592 CFI_ADJUST_CFA_OFFSET -4
593 .endif
5941: pushl $~(vector)
595 CFI_ADJUST_CFA_OFFSET 4
596 jmp common_interrupt
597 .previous
598 .long 1b
599 .text
600vector=vector+1
601.endr
602END(irq_entries_start)
603
604.previous
605END(interrupt)
606.previous
607
608/*
609 * the CPU automatically disables interrupts when executing an IRQ vector,
610 * so IRQ-flags tracing has to follow that:
611 */
612 ALIGN
613common_interrupt:
614 SAVE_ALL
615 TRACE_IRQS_OFF
616 movl %esp,%eax
617 call do_IRQ
618 jmp ret_from_intr
619ENDPROC(common_interrupt)
620 CFI_ENDPROC
621
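The stubs built above push $~(vector), i.e. the one's complement of the vector number, into the orig_eax slot of pt_regs; keeping the stored value negative means it can never be mistaken for a valid syscall number. A sketch of the decode the C side is expected to perform (do_IRQ() in irq_32.c recovers the vector this way, as far as I recall):

static inline int irq_vector_from_orig_eax(long orig_eax)
{
	return (int)~orig_eax;	/* undo the "pushl $~(vector)" above */
}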
622#define BUILD_INTERRUPT(name, nr) \
623ENTRY(name) \
624 RING0_INT_FRAME; \
625 pushl $~(nr); \
626 CFI_ADJUST_CFA_OFFSET 4; \
627 SAVE_ALL; \
628 TRACE_IRQS_OFF \
629 movl %esp,%eax; \
630 call smp_##name; \
631 jmp ret_from_intr; \
632 CFI_ENDPROC; \
633ENDPROC(name)
634
635/* The include is where all of the SMP etc. interrupts come from */
636#include "entry_arch.h"
637
638KPROBE_ENTRY(page_fault)
639 RING0_EC_FRAME
640 pushl $do_page_fault
641 CFI_ADJUST_CFA_OFFSET 4
642 ALIGN
643error_code:
644 /* the function address is in %fs's slot on the stack */
645 pushl %es
646 CFI_ADJUST_CFA_OFFSET 4
647 /*CFI_REL_OFFSET es, 0*/
648 pushl %ds
649 CFI_ADJUST_CFA_OFFSET 4
650 /*CFI_REL_OFFSET ds, 0*/
651 pushl %eax
652 CFI_ADJUST_CFA_OFFSET 4
653 CFI_REL_OFFSET eax, 0
654 pushl %ebp
655 CFI_ADJUST_CFA_OFFSET 4
656 CFI_REL_OFFSET ebp, 0
657 pushl %edi
658 CFI_ADJUST_CFA_OFFSET 4
659 CFI_REL_OFFSET edi, 0
660 pushl %esi
661 CFI_ADJUST_CFA_OFFSET 4
662 CFI_REL_OFFSET esi, 0
663 pushl %edx
664 CFI_ADJUST_CFA_OFFSET 4
665 CFI_REL_OFFSET edx, 0
666 pushl %ecx
667 CFI_ADJUST_CFA_OFFSET 4
668 CFI_REL_OFFSET ecx, 0
669 pushl %ebx
670 CFI_ADJUST_CFA_OFFSET 4
671 CFI_REL_OFFSET ebx, 0
672 cld
673 pushl %fs
674 CFI_ADJUST_CFA_OFFSET 4
675 /*CFI_REL_OFFSET fs, 0*/
676 movl $(__KERNEL_PERCPU), %ecx
677 movl %ecx, %fs
678 UNWIND_ESPFIX_STACK
679 popl %ecx
680 CFI_ADJUST_CFA_OFFSET -4
681 /*CFI_REGISTER es, ecx*/
682 movl PT_FS(%esp), %edi # get the function address
683 movl PT_ORIG_EAX(%esp), %edx # get the error code
684 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
685 mov %ecx, PT_FS(%esp)
686 /*CFI_REL_OFFSET fs, ES*/
687 movl $(__USER_DS), %ecx
688 movl %ecx, %ds
689 movl %ecx, %es
690 movl %esp,%eax # pt_regs pointer
691 call *%edi
692 jmp ret_from_exception
693 CFI_ENDPROC
694KPROBE_END(page_fault)
695
696ENTRY(coprocessor_error)
697 RING0_INT_FRAME
698 pushl $0
699 CFI_ADJUST_CFA_OFFSET 4
700 pushl $do_coprocessor_error
701 CFI_ADJUST_CFA_OFFSET 4
702 jmp error_code
703 CFI_ENDPROC
704END(coprocessor_error)
705
706ENTRY(simd_coprocessor_error)
707 RING0_INT_FRAME
708 pushl $0
709 CFI_ADJUST_CFA_OFFSET 4
710 pushl $do_simd_coprocessor_error
711 CFI_ADJUST_CFA_OFFSET 4
712 jmp error_code
713 CFI_ENDPROC
714END(simd_coprocessor_error)
715
716ENTRY(device_not_available)
717 RING0_INT_FRAME
718 pushl $-1 # mark this as an int
719 CFI_ADJUST_CFA_OFFSET 4
720 SAVE_ALL
721 GET_CR0_INTO_EAX
722 testl $0x4, %eax # EM (math emulation bit)
723 jne device_not_available_emulate
724 preempt_stop(CLBR_ANY)
725 call math_state_restore
726 jmp ret_from_exception
727device_not_available_emulate:
728 pushl $0 # temporary storage for ORIG_EIP
729 CFI_ADJUST_CFA_OFFSET 4
730 call math_emulate
731 addl $4, %esp
732 CFI_ADJUST_CFA_OFFSET -4
733 jmp ret_from_exception
734 CFI_ENDPROC
735END(device_not_available)
736
737/*
738 * Debug traps and NMI can happen at the one SYSENTER instruction
739 * that sets up the real kernel stack. Check here, since we can't
740 * allow the wrong stack to be used.
741 *
742 * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
743 * already pushed 3 words if it hits on the sysenter instruction:
744 * eflags, cs and eip.
745 *
746 * We just load the right stack, and push the three (known) values
747 * by hand onto the new stack - while updating the return eip past
748 * the instruction that would have done it for sysenter.
749 */
750#define FIX_STACK(offset, ok, label) \
751 cmpw $__KERNEL_CS,4(%esp); \
752 jne ok; \
753label: \
754 movl TSS_sysenter_esp0+offset(%esp),%esp; \
755 CFI_DEF_CFA esp, 0; \
756 CFI_UNDEFINED eip; \
757 pushfl; \
758 CFI_ADJUST_CFA_OFFSET 4; \
759 pushl $__KERNEL_CS; \
760 CFI_ADJUST_CFA_OFFSET 4; \
761 pushl $sysenter_past_esp; \
762 CFI_ADJUST_CFA_OFFSET 4; \
763 CFI_REL_OFFSET eip, 0
764
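The three words referred to in the comment above, exactly as the CPU leaves them on the (wrong) sysenter stack when the NMI/debug handler fires; their 12 bytes are what the offset argument of FIX_STACK() accounts for (24 when a debug frame and an NMI frame have both piled up). A struct view, for illustration:

struct sysenter_hw_frame {
	unsigned long eip;	/*  (%esp) */
	unsigned long cs;	/* 4(%esp) */
	unsigned long eflags;	/* 8(%esp) */
};				/* sizeof == 12, hence "TSS_sysenter_esp0+12" */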
765KPROBE_ENTRY(debug)
766 RING0_INT_FRAME
767 cmpl $sysenter_entry,(%esp)
768 jne debug_stack_correct
769 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
770debug_stack_correct:
771 pushl $-1 # mark this as an int
772 CFI_ADJUST_CFA_OFFSET 4
773 SAVE_ALL
774 xorl %edx,%edx # error code 0
775 movl %esp,%eax # pt_regs pointer
776 call do_debug
777 jmp ret_from_exception
778 CFI_ENDPROC
779KPROBE_END(debug)
780
781/*
782 * NMI is doubly nasty. It can happen _while_ we're handling
783 * a debug fault, and the debug fault hasn't yet been able to
784 * clear up the stack. So we first check whether we got an
785 * NMI on the sysenter entry path, but after that we need to
786 * check whether we got an NMI on the debug path where the debug
787 * fault happened on the sysenter path.
788 */
789KPROBE_ENTRY(nmi)
790 RING0_INT_FRAME
791 pushl %eax
792 CFI_ADJUST_CFA_OFFSET 4
793 movl %ss, %eax
794 cmpw $__ESPFIX_SS, %ax
795 popl %eax
796 CFI_ADJUST_CFA_OFFSET -4
797 je nmi_espfix_stack
798 cmpl $sysenter_entry,(%esp)
799 je nmi_stack_fixup
800 pushl %eax
801 CFI_ADJUST_CFA_OFFSET 4
802 movl %esp,%eax
803 /* Do not access memory above the end of our stack page,
804 * it might not exist.
805 */
806 andl $(THREAD_SIZE-1),%eax
807 cmpl $(THREAD_SIZE-20),%eax
808 popl %eax
809 CFI_ADJUST_CFA_OFFSET -4
810 jae nmi_stack_correct
811 cmpl $sysenter_entry,12(%esp)
812 je nmi_debug_stack_check
813nmi_stack_correct:
814 /* We have a RING0_INT_FRAME here */
815 pushl %eax
816 CFI_ADJUST_CFA_OFFSET 4
817 SAVE_ALL
818 xorl %edx,%edx # zero error code
819 movl %esp,%eax # pt_regs pointer
820 call do_nmi
821 jmp restore_nocheck_notrace
822 CFI_ENDPROC
823
824nmi_stack_fixup:
825 RING0_INT_FRAME
826 FIX_STACK(12,nmi_stack_correct, 1)
827 jmp nmi_stack_correct
828
829nmi_debug_stack_check:
830 /* We have a RING0_INT_FRAME here */
831 cmpw $__KERNEL_CS,16(%esp)
832 jne nmi_stack_correct
833 cmpl $debug,(%esp)
834 jb nmi_stack_correct
835 cmpl $debug_esp_fix_insn,(%esp)
836 ja nmi_stack_correct
837 FIX_STACK(24,nmi_stack_correct, 1)
838 jmp nmi_stack_correct
839
840nmi_espfix_stack:
841 /* We have a RING0_INT_FRAME here.
842 *
843	 * Build the SS:ESP pair that we will lss back to the espfix stack.
844 */
845 pushl %ss
846 CFI_ADJUST_CFA_OFFSET 4
847 pushl %esp
848 CFI_ADJUST_CFA_OFFSET 4
849 addw $4, (%esp)
850 /* copy the iret frame of 12 bytes */
851 .rept 3
852 pushl 16(%esp)
853 CFI_ADJUST_CFA_OFFSET 4
854 .endr
855 pushl %eax
856 CFI_ADJUST_CFA_OFFSET 4
857 SAVE_ALL
858 FIXUP_ESPFIX_STACK # %eax == %esp
859 xorl %edx,%edx # zero error code
860 call do_nmi
861 RESTORE_REGS
862 lss 12+4(%esp), %esp # back to espfix stack
863 CFI_ADJUST_CFA_OFFSET -24
8641: INTERRUPT_RETURN
865 CFI_ENDPROC
866.section __ex_table,"a"
867 .align 4
868 .long 1b,iret_exc
869.previous
870KPROBE_END(nmi)
871
872#ifdef CONFIG_PARAVIRT
873ENTRY(native_iret)
8741: iret
875.section __ex_table,"a"
876 .align 4
877 .long 1b,iret_exc
878.previous
879END(native_iret)
880
881ENTRY(native_irq_enable_sysexit)
882 sti
883 sysexit
884END(native_irq_enable_sysexit)
885#endif
886
887KPROBE_ENTRY(int3)
888 RING0_INT_FRAME
889 pushl $-1 # mark this as an int
890 CFI_ADJUST_CFA_OFFSET 4
891 SAVE_ALL
892 xorl %edx,%edx # zero error code
893 movl %esp,%eax # pt_regs pointer
894 call do_int3
895 jmp ret_from_exception
896 CFI_ENDPROC
897KPROBE_END(int3)
898
899ENTRY(overflow)
900 RING0_INT_FRAME
901 pushl $0
902 CFI_ADJUST_CFA_OFFSET 4
903 pushl $do_overflow
904 CFI_ADJUST_CFA_OFFSET 4
905 jmp error_code
906 CFI_ENDPROC
907END(overflow)
908
909ENTRY(bounds)
910 RING0_INT_FRAME
911 pushl $0
912 CFI_ADJUST_CFA_OFFSET 4
913 pushl $do_bounds
914 CFI_ADJUST_CFA_OFFSET 4
915 jmp error_code
916 CFI_ENDPROC
917END(bounds)
918
919ENTRY(invalid_op)
920 RING0_INT_FRAME
921 pushl $0
922 CFI_ADJUST_CFA_OFFSET 4
923 pushl $do_invalid_op
924 CFI_ADJUST_CFA_OFFSET 4
925 jmp error_code
926 CFI_ENDPROC
927END(invalid_op)
928
929ENTRY(coprocessor_segment_overrun)
930 RING0_INT_FRAME
931 pushl $0
932 CFI_ADJUST_CFA_OFFSET 4
933 pushl $do_coprocessor_segment_overrun
934 CFI_ADJUST_CFA_OFFSET 4
935 jmp error_code
936 CFI_ENDPROC
937END(coprocessor_segment_overrun)
938
939ENTRY(invalid_TSS)
940 RING0_EC_FRAME
941 pushl $do_invalid_TSS
942 CFI_ADJUST_CFA_OFFSET 4
943 jmp error_code
944 CFI_ENDPROC
945END(invalid_TSS)
946
947ENTRY(segment_not_present)
948 RING0_EC_FRAME
949 pushl $do_segment_not_present
950 CFI_ADJUST_CFA_OFFSET 4
951 jmp error_code
952 CFI_ENDPROC
953END(segment_not_present)
954
955ENTRY(stack_segment)
956 RING0_EC_FRAME
957 pushl $do_stack_segment
958 CFI_ADJUST_CFA_OFFSET 4
959 jmp error_code
960 CFI_ENDPROC
961END(stack_segment)
962
963KPROBE_ENTRY(general_protection)
964 RING0_EC_FRAME
965 pushl $do_general_protection
966 CFI_ADJUST_CFA_OFFSET 4
967 jmp error_code
968 CFI_ENDPROC
969KPROBE_END(general_protection)
970
971ENTRY(alignment_check)
972 RING0_EC_FRAME
973 pushl $do_alignment_check
974 CFI_ADJUST_CFA_OFFSET 4
975 jmp error_code
976 CFI_ENDPROC
977END(alignment_check)
978
979ENTRY(divide_error)
980 RING0_INT_FRAME
981 pushl $0 # no error code
982 CFI_ADJUST_CFA_OFFSET 4
983 pushl $do_divide_error
984 CFI_ADJUST_CFA_OFFSET 4
985 jmp error_code
986 CFI_ENDPROC
987END(divide_error)
988
989#ifdef CONFIG_X86_MCE
990ENTRY(machine_check)
991 RING0_INT_FRAME
992 pushl $0
993 CFI_ADJUST_CFA_OFFSET 4
994 pushl machine_check_vector
995 CFI_ADJUST_CFA_OFFSET 4
996 jmp error_code
997 CFI_ENDPROC
998END(machine_check)
999#endif
1000
1001ENTRY(spurious_interrupt_bug)
1002 RING0_INT_FRAME
1003 pushl $0
1004 CFI_ADJUST_CFA_OFFSET 4
1005 pushl $do_spurious_interrupt_bug
1006 CFI_ADJUST_CFA_OFFSET 4
1007 jmp error_code
1008 CFI_ENDPROC
1009END(spurious_interrupt_bug)
1010
1011ENTRY(kernel_thread_helper)
1012 pushl $0 # fake return address for unwinder
1013 CFI_STARTPROC
1014 movl %edx,%eax
1015 push %edx
1016 CFI_ADJUST_CFA_OFFSET 4
1017 call *%ebx
1018 push %eax
1019 CFI_ADJUST_CFA_OFFSET 4
1020 call do_exit
1021 CFI_ENDPROC
1022ENDPROC(kernel_thread_helper)
1023
1024#ifdef CONFIG_XEN
1025ENTRY(xen_hypervisor_callback)
1026 CFI_STARTPROC
1027 pushl $0
1028 CFI_ADJUST_CFA_OFFSET 4
1029 SAVE_ALL
1030 TRACE_IRQS_OFF
1031
1032 /* Check to see if we got the event in the critical
1033 region in xen_iret_direct, after we've reenabled
1034 events and checked for pending events. This simulates
1035 iret instruction's behaviour where it delivers a
1036 pending interrupt when enabling interrupts. */
1037 movl PT_EIP(%esp),%eax
1038 cmpl $xen_iret_start_crit,%eax
1039 jb 1f
1040 cmpl $xen_iret_end_crit,%eax
1041 jae 1f
1042
1043 call xen_iret_crit_fixup
1044
10451: mov %esp, %eax
1046 call xen_evtchn_do_upcall
1047 jmp ret_from_intr
1048 CFI_ENDPROC
1049ENDPROC(xen_hypervisor_callback)
1050
1051# Hypervisor uses this for application faults while it executes.
1052# We get here for two reasons:
1053# 1. Fault while reloading DS, ES, FS or GS
1054# 2. Fault while executing IRET
1055# Category 1 we fix up by reattempting the load, and zeroing the segment
1056# register if the load fails.
1057# Category 2 we fix up by jumping to do_iret_error. We cannot use the
1058# normal Linux return path in this case because if we use the IRET hypercall
1059# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1060# We distinguish between categories by maintaining a status value in EAX.
1061ENTRY(xen_failsafe_callback)
1062 CFI_STARTPROC
1063 pushl %eax
1064 CFI_ADJUST_CFA_OFFSET 4
1065 movl $1,%eax
10661: mov 4(%esp),%ds
10672: mov 8(%esp),%es
10683: mov 12(%esp),%fs
10694: mov 16(%esp),%gs
1070 testl %eax,%eax
1071 popl %eax
1072 CFI_ADJUST_CFA_OFFSET -4
1073 lea 16(%esp),%esp
1074 CFI_ADJUST_CFA_OFFSET -16
1075 jz 5f
1076 addl $16,%esp
1077 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
10785: pushl $0 # EAX == 0 => Category 1 (Bad segment)
1079 CFI_ADJUST_CFA_OFFSET 4
1080 SAVE_ALL
1081 jmp ret_from_exception
1082 CFI_ENDPROC
1083
1084.section .fixup,"ax"
10856: xorl %eax,%eax
1086 movl %eax,4(%esp)
1087 jmp 1b
10887: xorl %eax,%eax
1089 movl %eax,8(%esp)
1090 jmp 2b
10918: xorl %eax,%eax
1092 movl %eax,12(%esp)
1093 jmp 3b
10949: xorl %eax,%eax
1095 movl %eax,16(%esp)
1096 jmp 4b
1097.previous
1098.section __ex_table,"a"
1099 .align 4
1100 .long 1b,6b
1101 .long 2b,7b
1102 .long 3b,8b
1103 .long 4b,9b
1104.previous
1105ENDPROC(xen_failsafe_callback)
1106
1107#endif /* CONFIG_XEN */
1108
1109.section .rodata,"a"
1110#include "syscall_table_32.S"
1111
1112syscall_table_size=(.-sys_call_table)
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
new file mode 100644
index 000000000000..41e8aec4c61d
--- /dev/null
+++ b/arch/x86/kernel/geode_32.c
@@ -0,0 +1,155 @@
1/*
2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public License
7 * as published by the Free Software Foundation.
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/ioport.h>
13#include <linux/io.h>
14#include <asm/msr.h>
15#include <asm/geode.h>
16
17static struct {
18 char *name;
19 u32 msr;
20 int size;
21 u32 base;
22} lbars[] = {
23 { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
24 { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
25 { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
26 { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
27};
28
29static void __init init_lbars(void)
30{
31 u32 lo, hi;
32 int i;
33
34 for (i = 0; i < ARRAY_SIZE(lbars); i++) {
35 rdmsr(lbars[i].msr, lo, hi);
36 if (hi & 0x01)
37 lbars[i].base = lo & 0x0000ffff;
38
39 if (lbars[i].base == 0)
40 printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
41 lbars[i].name);
42 }
43}
44
45int geode_get_dev_base(unsigned int dev)
46{
47 BUG_ON(dev >= ARRAY_SIZE(lbars));
48 return lbars[dev].base;
49}
50EXPORT_SYMBOL_GPL(geode_get_dev_base);
51
52/* === GPIO API === */
53
54void geode_gpio_set(unsigned int gpio, unsigned int reg)
55{
56 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
57
58 if (!base)
59 return;
60
61 if (gpio < 16)
62 outl(1 << gpio, base + reg);
63 else
64 outl(1 << (gpio - 16), base + 0x80 + reg);
65}
66EXPORT_SYMBOL_GPL(geode_gpio_set);
67
68void geode_gpio_clear(unsigned int gpio, unsigned int reg)
69{
70 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
71
72 if (!base)
73 return;
74
75 if (gpio < 16)
76 outl(1 << (gpio + 16), base + reg);
77 else
78 outl(1 << gpio, base + 0x80 + reg);
79}
80EXPORT_SYMBOL_GPL(geode_gpio_clear);
81
82int geode_gpio_isset(unsigned int gpio, unsigned int reg)
83{
84 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
85
86 if (!base)
87 return 0;
88
89 if (gpio < 16)
90 return (inl(base + reg) & (1 << gpio)) ? 1 : 0;
91 else
92 return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0;
93}
94EXPORT_SYMBOL_GPL(geode_gpio_isset);
95
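A hedged usage sketch for the helpers above: the reg argument is a register offset from <asm/geode.h>; GPIO_OUTPUT_ENABLE and GPIO_OUTPUT_VAL below are assumed to be among those offsets (check the header), while GEODE_DEV_GPIO is the same constant this file already uses.

static void example_gpio6_high(void)
{
	if (!geode_get_dev_base(GEODE_DEV_GPIO))
		return;				/* LBAR was never programmed */
	geode_gpio_set(6, GPIO_OUTPUT_ENABLE);	/* make GPIO 6 an output */
	geode_gpio_set(6, GPIO_OUTPUT_VAL);	/* and drive it high */
}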
96void geode_gpio_set_irq(unsigned int group, unsigned int irq)
97{
98 u32 lo, hi;
99
100 if (group > 7 || irq > 15)
101 return;
102
103 rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
104
105 lo &= ~(0xF << (group * 4));
106 lo |= (irq & 0xF) << (group * 4);
107
108 wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
109}
110EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
111
112void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
113{
114 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
115 u32 offset, shift, val;
116
117 if (gpio >= 24)
118 offset = GPIO_MAP_W;
119 else if (gpio >= 16)
120 offset = GPIO_MAP_Z;
121 else if (gpio >= 8)
122 offset = GPIO_MAP_Y;
123 else
124 offset = GPIO_MAP_X;
125
126 shift = (gpio % 8) * 4;
127
128 val = inl(base + offset);
129
130 /* Clear whatever was there before */
131 val &= ~(0xF << shift);
132
133 /* And set the new value */
134
135 val |= ((pair & 7) << shift);
136
137 /* Set the PME bit if this is a PME event */
138
139 if (pme)
140 val |= (1 << (shift + 3));
141
142 outl(val, base + offset);
143}
144EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
145
146static int __init geode_southbridge_init(void)
147{
148 if (!is_geode())
149 return -ENODEV;
150
151 init_lbars();
152 return 0;
153}
154
155postcore_initcall(geode_southbridge_init);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
new file mode 100644
index 000000000000..9150ca9b5f80
--- /dev/null
+++ b/arch/x86/kernel/head_32.S
@@ -0,0 +1,578 @@
1/*
2 * linux/arch/i386/kernel/head.S -- the 32-bit startup code.
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Enhanced CPU detection and feature setting code by Mike Jagdis
7 * and Martin Mares, November 1997.
8 */
9
10.text
11#include <linux/threads.h>
12#include <linux/linkage.h>
13#include <asm/segment.h>
14#include <asm/page.h>
15#include <asm/pgtable.h>
16#include <asm/desc.h>
17#include <asm/cache.h>
18#include <asm/thread_info.h>
19#include <asm/asm-offsets.h>
20#include <asm/setup.h>
21
22/*
23 * References to members of the new_cpu_data structure.
24 */
25
26#define X86 new_cpu_data+CPUINFO_x86
27#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
28#define X86_MODEL new_cpu_data+CPUINFO_x86_model
29#define X86_MASK new_cpu_data+CPUINFO_x86_mask
30#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
31#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
32#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
33#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
34
35/*
36 * This is how much memory *in addition to the memory covered up to
37 * and including _end* we need mapped initially.
38 * We need:
39 * - one bit for each possible page, but only in low memory, which means
40 * 2^32/4096/8 = 128K worst case (4G/4G split.)
41 * - enough space to map all low memory, which means
42 * (2^32/4096) / 1024 pages (worst case, non PAE)
43 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
44 * - a few pages for allocator use before the kernel pagetable has
45 * been set up
46 *
47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
48 * memory, which is currently unreclaimed.
49 *
50 * This should be a multiple of a page.
51 */
52LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
53
54#if PTRS_PER_PMD > 1
55PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
56#else
57PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
58#endif
59BOOTBITMAP_SIZE = LOW_PAGES / 8
60ALLOCATOR_SLOP = 4
61
62INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
63
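A worked example of the sizing above for the common non-PAE configuration (4 KiB pages, PTRS_PER_PGD == 1024), in plain C just to make the arithmetic concrete:

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12, page_size = 1UL << page_shift;
	unsigned long low_pages = 1UL << (32 - page_shift);	/* 1,048,576 pages */
	unsigned long page_table_size = low_pages / 1024;	/* 1024 page-table pages */
	unsigned long bootbitmap_size = low_pages / 8;		/* 131,072 bytes = 128 KiB */
	unsigned long allocator_slop = 4;
	unsigned long beyond_end = bootbitmap_size +
		(page_table_size + allocator_slop) * page_size;

	printf("INIT_MAP_BEYOND_END = %lu bytes (~%lu KiB)\n",
	       beyond_end, beyond_end >> 10);			/* ~4240 KiB */
	return 0;
}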
64/*
65 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
66 * %esi points to the real-mode code as a 32-bit pointer.
67 * CS and DS must be 4 GB flat segments, but we don't depend on
68 * any particular GDT layout, because we load our own as soon as we
69 * can.
70 */
71.section .text.head,"ax",@progbits
72ENTRY(startup_32)
73
74/*
75 * Set segments to known values.
76 */
77 cld
78 lgdt boot_gdt_descr - __PAGE_OFFSET
79 movl $(__BOOT_DS),%eax
80 movl %eax,%ds
81 movl %eax,%es
82 movl %eax,%fs
83 movl %eax,%gs
84
85/*
86 * Clear BSS first so that there are no surprises...
87 * No need to cld as DF is already clear from cld above...
88 */
89 xorl %eax,%eax
90 movl $__bss_start - __PAGE_OFFSET,%edi
91 movl $__bss_stop - __PAGE_OFFSET,%ecx
92 subl %edi,%ecx
93 shrl $2,%ecx
94 rep ; stosl
95/*
96 * Copy bootup parameters out of the way.
97 * Note: %esi still has the pointer to the real-mode data.
 98 * With kexec as the boot loader, the parameter segment might be loaded beyond
 99 * the kernel image and might not even be addressable by the early boot page
 100 * tables (the kexec-on-panic case). Hence copy out the parameters before
 101 * initializing the page tables.
102 */
103 movl $(boot_params - __PAGE_OFFSET),%edi
104 movl $(PARAM_SIZE/4),%ecx
105 cld
106 rep
107 movsl
108 movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
109 andl %esi,%esi
110 jnz 2f # New command line protocol
111 cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
112 jne 1f
113 movzwl OLD_CL_OFFSET,%esi
114 addl $(OLD_CL_BASE_ADDR),%esi
1152:
116 movl $(boot_command_line - __PAGE_OFFSET),%edi
117 movl $(COMMAND_LINE_SIZE/4),%ecx
118 rep
119 movsl
1201:
121
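The command-line discovery performed just above, restated as C control flow; the parameter names mirror the asm/setup.h symbols the assembly uses, and are taken as arguments here so no specific values are asserted:

static unsigned long pick_cmdline(unsigned long new_cl_pointer,
				  unsigned short magic_at_old_addr,
				  unsigned short old_cl_magic,
				  unsigned short word_at_old_cl_offset,
				  unsigned long old_cl_base_addr)
{
	if (new_cl_pointer)				/* new command line protocol */
		return new_cl_pointer;
	if (magic_at_old_addr != old_cl_magic)		/* no old-style command line */
		return 0;
	return old_cl_base_addr + word_at_old_cl_offset; /* old protocol */
}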
122/*
123 * Initialize page tables. This creates a PDE and a set of page
124 * tables, which are located immediately beyond _end. The variable
125 * init_pg_tables_end is set up to point to the first "safe" location.
126 * Mappings are created both at virtual address 0 (identity mapping)
127 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
128 *
129 * Warning: don't use %esi or the stack in this code. However, %esp
130 * can be used as a GPR if you really need it...
131 */
132page_pde_offset = (__PAGE_OFFSET >> 20);
133
134 movl $(pg0 - __PAGE_OFFSET), %edi
135 movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
136 movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
13710:
138 leal 0x007(%edi),%ecx /* Create PDE entry */
139 movl %ecx,(%edx) /* Store identity PDE entry */
140 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
141 addl $4,%edx
142 movl $1024, %ecx
14311:
144 stosl
145 addl $0x1000,%eax
146 loop 11b
147 /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
148 /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
149 leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
150 cmpl %ebp,%eax
151 jb 10b
152 movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
153
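The same loop in C, for the non-PAE case and the usual PAGE_OFFSET of 0xC0000000 (a sketch of the logic only; the real code runs with paging off, so every pointer here is actually a physical address):

static void build_boot_page_tables(unsigned long *pgd, unsigned long *pte,
				   unsigned long init_map_beyond_end)
{
	unsigned long idx = 0;				/* identity-mapping PDE slot */
	unsigned long kidx = 0xC0000000UL >> 22;	/* PAGE_OFFSET PDE slot */
	unsigned long entry = 0x007;			/* phys 0 | PRESENT+RW+USER */
	int i;

	do {
		unsigned long pde = (unsigned long)pte + 0x007;
		pgd[idx++] = pde;		/* identity entry */
		pgd[kidx++] = pde;		/* kernel (PAGE_OFFSET) entry */
		for (i = 0; i < 1024; i++) {	/* fill one page table */
			*pte++ = entry;
			entry += 0x1000;
		}
		/* keep going until we have covered INIT_MAP_BEYOND_END bytes
		 * past the page tables we are writing */
	} while (entry < (unsigned long)pte + init_map_beyond_end + 0x007);
}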
154 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
155 jmp 3f
156/*
157 * Non-boot CPU entry point; entered from trampoline.S
158 * We can't lgdt here, because lgdt itself uses a data segment, but
159 * we know the trampoline has already loaded the boot_gdt for us.
160 *
161 * If cpu hotplug is not supported then this code can go in init section
162 * which will be freed later
163 */
164
165#ifndef CONFIG_HOTPLUG_CPU
166.section .init.text,"ax",@progbits
167#endif
168
169 /* Do an early initialization of the fixmap area */
170 movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
171 movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
172 addl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
173 movl %eax, 4092(%edx)
174
175#ifdef CONFIG_SMP
176ENTRY(startup_32_smp)
177 cld
178 movl $(__BOOT_DS),%eax
179 movl %eax,%ds
180 movl %eax,%es
181 movl %eax,%fs
182 movl %eax,%gs
183
184/*
 185 * New page tables may be in 4 Mbyte (PSE) page mode and may
 186 * be using global pages.
187 *
188 * NOTE! If we are on a 486 we may have no cr4 at all!
189 * So we do not try to touch it unless we really have
190 * some bits in it to set. This won't work if the BSP
191 * implements cr4 but this AP does not -- very unlikely
192 * but be warned! The same applies to the pse feature
193 * if not equally supported. --macro
194 *
195 * NOTE! We have to correct for the fact that we're
196 * not yet offset PAGE_OFFSET..
197 */
198#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
199 movl cr4_bits,%edx
200 andl %edx,%edx
201 jz 6f
202 movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
203 orl %edx,%eax
204 movl %eax,%cr4
205
206 btl $5, %eax # check if PAE is enabled
207 jnc 6f
208
209 /* Check if extended functions are implemented */
210 movl $0x80000000, %eax
211 cpuid
212 cmpl $0x80000000, %eax
213 jbe 6f
214 mov $0x80000001, %eax
215 cpuid
216 /* Execute Disable bit supported? */
217 btl $20, %edx
218 jnc 6f
219
220 /* Setup EFER (Extended Feature Enable Register) */
221 movl $0xc0000080, %ecx
222 rdmsr
223
224 btsl $11, %eax
225 /* Make changes effective */
226 wrmsr
227
2286:
229 /* This is a secondary processor (AP) */
230 xorl %ebx,%ebx
231 incl %ebx
232
233#endif /* CONFIG_SMP */
2343:
235
236/*
237 * Enable paging
238 */
239 movl $swapper_pg_dir-__PAGE_OFFSET,%eax
240 movl %eax,%cr3 /* set the page table pointer.. */
241 movl %cr0,%eax
242 orl $0x80000000,%eax
243 movl %eax,%cr0 /* ..and set paging (PG) bit */
244 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
2451:
246 /* Set up the stack pointer */
247 lss stack_start,%esp
248
249/*
250 * Initialize eflags. Some BIOS's leave bits like NT set. This would
251 * confuse the debugger if this code is traced.
252 * XXX - best to initialize before switching to protected mode.
253 */
254 pushl $0
255 popfl
256
257#ifdef CONFIG_SMP
258 andl %ebx,%ebx
259 jz 1f /* Initial CPU cleans BSS */
260 jmp checkCPUtype
2611:
262#endif /* CONFIG_SMP */
263
264/*
265 * start system 32-bit setup. We need to re-do some of the things done
266 * in 16-bit mode for the "real" operations.
267 */
268 call setup_idt
269
270checkCPUtype:
271
272 movl $-1,X86_CPUID # -1 for no CPUID initially
273
274/* check if it is 486 or 386. */
275/*
276 * XXX - this does a lot of unnecessary setup. Alignment checks don't
277 * apply at our cpl of 0 and the stack ought to be aligned already, and
278 * we don't need to preserve eflags.
279 */
280
281 movb $3,X86 # at least 386
282 pushfl # push EFLAGS
283 popl %eax # get EFLAGS
284 movl %eax,%ecx # save original EFLAGS
285 xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
286 pushl %eax # copy to EFLAGS
287 popfl # set EFLAGS
288 pushfl # get new EFLAGS
289 popl %eax # put it in eax
290 xorl %ecx,%eax # change in flags
291 pushl %ecx # restore original EFLAGS
292 popfl
293 testl $0x40000,%eax # check if AC bit changed
294 je is386
295
296 movb $4,X86 # at least 486
297 testl $0x200000,%eax # check if ID bit changed
298 je is486
299
300 /* get vendor info */
301 xorl %eax,%eax # call CPUID with 0 -> return vendor ID
302 cpuid
303 movl %eax,X86_CPUID # save CPUID level
304 movl %ebx,X86_VENDOR_ID # lo 4 chars
305 movl %edx,X86_VENDOR_ID+4 # next 4 chars
306 movl %ecx,X86_VENDOR_ID+8 # last 4 chars
307
308 orl %eax,%eax # do we have processor info as well?
309 je is486
310
311 movl $1,%eax # Use the CPUID instruction to get CPU type
312 cpuid
313 movb %al,%cl # save reg for future use
314 andb $0x0f,%ah # mask processor family
315 movb %ah,X86
316 andb $0xf0,%al # mask model
317 shrb $4,%al
318 movb %al,X86_MODEL
319 andb $0x0f,%cl # mask mask revision
320 movb %cl,X86_MASK
321 movl %edx,X86_CAPABILITY
322
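The CPUID leaf-1 unpacking above, spelled out in C (these are the base family/model/stepping fields only, which is all this code reads):

struct cpu_sig { unsigned int family, model, stepping; };

static struct cpu_sig decode_cpuid_leaf1(unsigned int eax)
{
	struct cpu_sig s;
	s.family   = (eax >> 8) & 0x0f;	/* stored in X86 */
	s.model    = (eax >> 4) & 0x0f;	/* stored in X86_MODEL */
	s.stepping = eax & 0x0f;	/* stored in X86_MASK */
	return s;
}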
323is486: movl $0x50022,%ecx # set AM, WP, NE and MP
324 jmp 2f
325
326is386: movl $2,%ecx # set MP
3272: movl %cr0,%eax
328 andl $0x80000011,%eax # Save PG,PE,ET
329 orl %ecx,%eax
330 movl %eax,%cr0
331
332 call check_x87
333 lgdt early_gdt_descr
334 lidt idt_descr
335 ljmp $(__KERNEL_CS),$1f
3361: movl $(__KERNEL_DS),%eax # reload all the segment registers
337 movl %eax,%ss # after changing gdt.
338 movl %eax,%fs # gets reset once there's real percpu
339
340 movl $(__USER_DS),%eax # DS/ES contains default USER segment
341 movl %eax,%ds
342 movl %eax,%es
343
344 xorl %eax,%eax # Clear GS and LDT
345 movl %eax,%gs
346 lldt %ax
347
348 cld # gcc2 wants the direction flag cleared at all times
349 pushl $0 # fake return address for unwinder
350#ifdef CONFIG_SMP
351 movb ready, %cl
352 movb $1, ready
353 cmpb $0,%cl # the first CPU calls start_kernel
354 je 1f
355 movl $(__KERNEL_PERCPU), %eax
356 movl %eax,%fs # set this cpu's percpu
357 jmp initialize_secondary # all other CPUs call initialize_secondary
3581:
359#endif /* CONFIG_SMP */
360 jmp start_kernel
361
362/*
363 * We depend on ET to be correct. This checks for 287/387.
364 */
365check_x87:
366 movb $0,X86_HARD_MATH
367 clts
368 fninit
369 fstsw %ax
370 cmpb $0,%al
371 je 1f
372 movl %cr0,%eax /* no coprocessor: have to set bits */
373 xorl $4,%eax /* set EM */
374 movl %eax,%cr0
375 ret
376 ALIGN
3771: movb $1,X86_HARD_MATH
378 .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
379 ret
380
381/*
382 * setup_idt
383 *
 384 * sets up an IDT with 256 entries pointing to
385 * ignore_int, interrupt gates. It doesn't actually load
386 * idt - that can be done only after paging has been enabled
387 * and the kernel moved to PAGE_OFFSET. Interrupts
388 * are enabled elsewhere, when we can be relatively
389 * sure everything is ok.
390 *
391 * Warning: %esi is live across this function.
392 */
393setup_idt:
394 lea ignore_int,%edx
395 movl $(__KERNEL_CS << 16),%eax
396 movw %dx,%ax /* selector = 0x0010 = cs */
397 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
398
399 lea idt_table,%edi
400 mov $256,%ecx
401rp_sidt:
402 movl %eax,(%edi)
403 movl %edx,4(%edi)
404 addl $8,%edi
405 dec %ecx
406 jne rp_sidt
407
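The 8-byte interrupt-gate encoding assembled in %eax/%edx above, written as a small C helper (0x8E00 = present, DPL 0, 32-bit interrupt gate):

struct idt_gate { unsigned int low, high; };

static struct idt_gate make_intr_gate(unsigned int selector, unsigned long handler)
{
	struct idt_gate g;
	g.low  = (selector << 16) | (handler & 0xffff);	/* what ends up in %eax */
	g.high = (handler & 0xffff0000) | 0x8e00;	/* what ends up in %edx */
	return g;
}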
408.macro set_early_handler handler,trapno
409 lea \handler,%edx
410 movl $(__KERNEL_CS << 16),%eax
411 movw %dx,%ax
412 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
413 lea idt_table,%edi
414 movl %eax,8*\trapno(%edi)
415 movl %edx,8*\trapno+4(%edi)
416.endm
417
418 set_early_handler handler=early_divide_err,trapno=0
419 set_early_handler handler=early_illegal_opcode,trapno=6
420 set_early_handler handler=early_protection_fault,trapno=13
421 set_early_handler handler=early_page_fault,trapno=14
422
423 ret
424
425early_divide_err:
426 xor %edx,%edx
427 pushl $0 /* fake errcode */
428 jmp early_fault
429
430early_illegal_opcode:
431 movl $6,%edx
432 pushl $0 /* fake errcode */
433 jmp early_fault
434
435early_protection_fault:
436 movl $13,%edx
437 jmp early_fault
438
439early_page_fault:
440 movl $14,%edx
441 jmp early_fault
442
443early_fault:
444 cld
445#ifdef CONFIG_PRINTK
446 movl $(__KERNEL_DS),%eax
447 movl %eax,%ds
448 movl %eax,%es
449 cmpl $2,early_recursion_flag
450 je hlt_loop
451 incl early_recursion_flag
452 movl %cr2,%eax
453 pushl %eax
454 pushl %edx /* trapno */
455 pushl $fault_msg
456#ifdef CONFIG_EARLY_PRINTK
457 call early_printk
458#else
459 call printk
460#endif
461#endif
462hlt_loop:
463 hlt
464 jmp hlt_loop
465
466/* This is the default interrupt "handler" :-) */
467 ALIGN
468ignore_int:
469 cld
470#ifdef CONFIG_PRINTK
471 pushl %eax
472 pushl %ecx
473 pushl %edx
474 pushl %es
475 pushl %ds
476 movl $(__KERNEL_DS),%eax
477 movl %eax,%ds
478 movl %eax,%es
479 cmpl $2,early_recursion_flag
480 je hlt_loop
481 incl early_recursion_flag
482 pushl 16(%esp)
483 pushl 24(%esp)
484 pushl 32(%esp)
485 pushl 40(%esp)
486 pushl $int_msg
487#ifdef CONFIG_EARLY_PRINTK
488 call early_printk
489#else
490 call printk
491#endif
492 addl $(5*4),%esp
493 popl %ds
494 popl %es
495 popl %edx
496 popl %ecx
497 popl %eax
498#endif
499 iret
500
501.section .text
502/*
503 * Real beginning of normal "text" segment
504 */
505ENTRY(stext)
506ENTRY(_stext)
507
508/*
509 * BSS section
510 */
511.section ".bss.page_aligned","wa"
512 .align PAGE_SIZE_asm
513ENTRY(swapper_pg_dir)
514 .fill 1024,4,0
515ENTRY(swapper_pg_pmd)
516 .fill 1024,4,0
517ENTRY(empty_zero_page)
518 .fill 4096,1,0
519
520/*
521 * This starts the data section.
522 */
523.data
524ENTRY(stack_start)
525 .long init_thread_union+THREAD_SIZE
526 .long __BOOT_DS
527
528ready: .byte 0
529
530early_recursion_flag:
531 .long 0
532
533int_msg:
534 .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
535
536fault_msg:
537 .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
538 .asciz "Stack: %p %p %p %p %p %p %p %p\n"
539
540#include "../../x86/xen/xen-head.S"
541
542/*
543 * The IDT and GDT 'descriptors' are a strange 48-bit object
544 * only used by the lidt and lgdt instructions. They are not
545 * like usual segment descriptors - they consist of a 16-bit
546 * segment size, and 32-bit linear address value:
547 */
548
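The 48-bit pseudo-descriptor described above, as a packed C structure (a sketch; the size field holds the table size in bytes minus one, matching the GDT_ENTRIES*8-1 and IDT_ENTRIES*8-1 values below):

struct gdt_idt_ptr {
	unsigned short size;	/* 16-bit limit (bytes - 1) */
	unsigned long  address;	/* 32-bit linear address of the table */
} __attribute__((packed));	/* 6 bytes = 48 bits */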
549.globl boot_gdt_descr
550.globl idt_descr
551
552 ALIGN
553# early boot GDT descriptor (must use 1:1 address mapping)
554 .word 0 # 32 bit align gdt_desc.address
555boot_gdt_descr:
556 .word __BOOT_DS+7
557 .long boot_gdt - __PAGE_OFFSET
558
559 .word 0 # 32-bit align idt_desc.address
560idt_descr:
561 .word IDT_ENTRIES*8-1 # idt contains 256 entries
562 .long idt_table
563
564# boot GDT descriptor (later on used by CPU#0):
565 .word 0 # 32 bit align gdt_desc.address
566ENTRY(early_gdt_descr)
567 .word GDT_ENTRIES*8-1
568 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
569
570/*
571 * The boot_gdt must mirror the equivalent in setup.S and is
572 * used only for booting.
573 */
574 .align L1_CACHE_BYTES
575ENTRY(boot_gdt)
576 .fill GDT_ENTRY_BOOT_CS,8,0
577 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
578 .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
diff --git a/arch/x86/kernel/hpet_32.c b/arch/x86/kernel/hpet_32.c
new file mode 100644
index 000000000000..533d4932bc79
--- /dev/null
+++ b/arch/x86/kernel/hpet_32.c
@@ -0,0 +1,553 @@
1#include <linux/clocksource.h>
2#include <linux/clockchips.h>
3#include <linux/errno.h>
4#include <linux/hpet.h>
5#include <linux/init.h>
6#include <linux/sysdev.h>
7#include <linux/pm.h>
8#include <linux/delay.h>
9
10#include <asm/hpet.h>
11#include <asm/io.h>
12
13extern struct clock_event_device *global_clock_event;
14
15#define HPET_MASK CLOCKSOURCE_MASK(32)
16#define HPET_SHIFT 22
17
18/* FSEC = 10^-15 NSEC = 10^-9 */
19#define FSEC_PER_NSEC 1000000
20
21/*
22 * HPET address is set in acpi/boot.c, when an ACPI entry exists
23 */
24unsigned long hpet_address;
25static void __iomem * hpet_virt_address;
26
27static inline unsigned long hpet_readl(unsigned long a)
28{
29 return readl(hpet_virt_address + a);
30}
31
32static inline void hpet_writel(unsigned long d, unsigned long a)
33{
34 writel(d, hpet_virt_address + a);
35}
36
37/*
38 * HPET command line enable / disable
39 */
40static int boot_hpet_disable;
41
42static int __init hpet_setup(char* str)
43{
44 if (str) {
45 if (!strncmp("disable", str, 7))
46 boot_hpet_disable = 1;
47 }
48 return 1;
49}
50__setup("hpet=", hpet_setup);
51
52static inline int is_hpet_capable(void)
53{
54 return (!boot_hpet_disable && hpet_address);
55}
56
57/*
58 * HPET timer interrupt enable / disable
59 */
60static int hpet_legacy_int_enabled;
61
62/**
63 * is_hpet_enabled - check whether the hpet timer interrupt is enabled
64 */
65int is_hpet_enabled(void)
66{
67 return is_hpet_capable() && hpet_legacy_int_enabled;
68}
69
70/*
71 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
72 * timer 0 and timer 1 in case of RTC emulation.
73 */
74#ifdef CONFIG_HPET
75static void hpet_reserve_platform_timers(unsigned long id)
76{
77 struct hpet __iomem *hpet = hpet_virt_address;
78 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
79 unsigned int nrtimers, i;
80 struct hpet_data hd;
81
82 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
83
84 memset(&hd, 0, sizeof (hd));
85 hd.hd_phys_address = hpet_address;
86 hd.hd_address = hpet_virt_address;
87 hd.hd_nirqs = nrtimers;
88 hd.hd_flags = HPET_DATA_PLATFORM;
89 hpet_reserve_timer(&hd, 0);
90
91#ifdef CONFIG_HPET_EMULATE_RTC
92 hpet_reserve_timer(&hd, 1);
93#endif
94
95 hd.hd_irq[0] = HPET_LEGACY_8254;
96 hd.hd_irq[1] = HPET_LEGACY_RTC;
97
98 for (i = 2; i < nrtimers; timer++, i++)
99 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
100 Tn_INT_ROUTE_CNF_SHIFT;
101
102 hpet_alloc(&hd);
103
104}
105#else
106static void hpet_reserve_platform_timers(unsigned long id) { }
107#endif
108
109/*
110 * Common hpet info
111 */
112static unsigned long hpet_period;
113
114static void hpet_set_mode(enum clock_event_mode mode,
115 struct clock_event_device *evt);
116static int hpet_next_event(unsigned long delta,
117 struct clock_event_device *evt);
118
119/*
120 * The hpet clock event device
121 */
122static struct clock_event_device hpet_clockevent = {
123 .name = "hpet",
124 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
125 .set_mode = hpet_set_mode,
126 .set_next_event = hpet_next_event,
127 .shift = 32,
128 .irq = 0,
129};
130
131static void hpet_start_counter(void)
132{
133 unsigned long cfg = hpet_readl(HPET_CFG);
134
135 cfg &= ~HPET_CFG_ENABLE;
136 hpet_writel(cfg, HPET_CFG);
137 hpet_writel(0, HPET_COUNTER);
138 hpet_writel(0, HPET_COUNTER + 4);
139 cfg |= HPET_CFG_ENABLE;
140 hpet_writel(cfg, HPET_CFG);
141}
142
143static void hpet_enable_int(void)
144{
145 unsigned long cfg = hpet_readl(HPET_CFG);
146
147 cfg |= HPET_CFG_LEGACY;
148 hpet_writel(cfg, HPET_CFG);
149 hpet_legacy_int_enabled = 1;
150}
151
152static void hpet_set_mode(enum clock_event_mode mode,
153 struct clock_event_device *evt)
154{
155 unsigned long cfg, cmp, now;
156 uint64_t delta;
157
158 switch(mode) {
159 case CLOCK_EVT_MODE_PERIODIC:
160 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
161 delta >>= hpet_clockevent.shift;
162 now = hpet_readl(HPET_COUNTER);
163 cmp = now + (unsigned long) delta;
164 cfg = hpet_readl(HPET_T0_CFG);
165 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
166 HPET_TN_SETVAL | HPET_TN_32BIT;
167 hpet_writel(cfg, HPET_T0_CFG);
168 /*
169 * The first write after writing TN_SETVAL to the
170 * config register sets the counter value, the second
171 * write sets the period.
172 */
173 hpet_writel(cmp, HPET_T0_CMP);
174 udelay(1);
175 hpet_writel((unsigned long) delta, HPET_T0_CMP);
176 break;
177
178 case CLOCK_EVT_MODE_ONESHOT:
179 cfg = hpet_readl(HPET_T0_CFG);
180 cfg &= ~HPET_TN_PERIODIC;
181 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
182 hpet_writel(cfg, HPET_T0_CFG);
183 break;
184
185 case CLOCK_EVT_MODE_UNUSED:
186 case CLOCK_EVT_MODE_SHUTDOWN:
187 cfg = hpet_readl(HPET_T0_CFG);
188 cfg &= ~HPET_TN_ENABLE;
189 hpet_writel(cfg, HPET_T0_CFG);
190 break;
191
192 case CLOCK_EVT_MODE_RESUME:
193 hpet_enable_int();
194 break;
195 }
196}
197
198static int hpet_next_event(unsigned long delta,
199 struct clock_event_device *evt)
200{
201 unsigned long cnt;
202
203 cnt = hpet_readl(HPET_COUNTER);
204 cnt += delta;
205 hpet_writel(cnt, HPET_T0_CMP);
206
207 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0;
208}
209
210/*
211 * Clock source related code
212 */
213static cycle_t read_hpet(void)
214{
215 return (cycle_t)hpet_readl(HPET_COUNTER);
216}
217
218static struct clocksource clocksource_hpet = {
219 .name = "hpet",
220 .rating = 250,
221 .read = read_hpet,
222 .mask = HPET_MASK,
223 .shift = HPET_SHIFT,
224 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
225 .resume = hpet_start_counter,
226};
227
228/*
229 * Try to setup the HPET timer
230 */
231int __init hpet_enable(void)
232{
233 unsigned long id;
234 uint64_t hpet_freq;
235 u64 tmp, start, now;
236 cycle_t t1;
237
238 if (!is_hpet_capable())
239 return 0;
240
241 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
242
243 /*
244 * Read the period and check for a sane value:
245 */
246 hpet_period = hpet_readl(HPET_PERIOD);
247 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
248 goto out_nohpet;
249
250 /*
 251	 * The period is a femtosecond value. We need to calculate the
252 * scaled math multiplication factor for nanosecond to hpet tick
253 * conversion.
254 */
255 hpet_freq = 1000000000000000ULL;
256 do_div(hpet_freq, hpet_period);
257 hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
258 NSEC_PER_SEC, 32);
259 /* Calculate the min / max delta */
260 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
261 &hpet_clockevent);
262 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
263 &hpet_clockevent);
264
265 /*
266 * Read the HPET ID register to retrieve the IRQ routing
267 * information and the number of channels
268 */
269 id = hpet_readl(HPET_ID);
270
271#ifdef CONFIG_HPET_EMULATE_RTC
272 /*
273 * The legacy routing mode needs at least two channels, tick timer
274 * and the rtc emulation channel.
275 */
276 if (!(id & HPET_ID_NUMBER))
277 goto out_nohpet;
278#endif
279
280 /* Start the counter */
281 hpet_start_counter();
282
283 /* Verify whether hpet counter works */
284 t1 = read_hpet();
285 rdtscll(start);
286
287 /*
288 * We don't know the TSC frequency yet, but waiting for
289 * 200000 TSC cycles is safe:
290 * 4 GHz == 50us
291 * 1 GHz == 200us
292 */
293 do {
294 rep_nop();
295 rdtscll(now);
296 } while ((now - start) < 200000UL);
297
298 if (t1 == read_hpet()) {
299 printk(KERN_WARNING
300 "HPET counter not counting. HPET disabled\n");
301 goto out_nohpet;
302 }
303
304 /* Initialize and register HPET clocksource
305 *
 306	 * hpet period is in femtoseconds per cycle,
 307	 * so we need to convert this to ns/cyc units
 308	 * approximated by mult/2^shift
 
309 *
310 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
311 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
312 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
313 * (fsec/cyc << shift)/1000000 = mult
314 * (hpet_period << shift)/FSEC_PER_NSEC = mult
315 */
316 tmp = (u64)hpet_period << HPET_SHIFT;
317 do_div(tmp, FSEC_PER_NSEC);
318 clocksource_hpet.mult = (u32)tmp;
319
320 clocksource_register(&clocksource_hpet);
321
322 if (id & HPET_ID_LEGSUP) {
323 hpet_enable_int();
324 hpet_reserve_platform_timers(id);
325 /*
326 * Start hpet with the boot cpu mask and make it
327 * global after the IO_APIC has been initialized.
328 */
329 hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
330 clockevents_register_device(&hpet_clockevent);
331 global_clock_event = &hpet_clockevent;
332 return 1;
333 }
334 return 0;
335
336out_nohpet:
337 iounmap(hpet_virt_address);
338 hpet_virt_address = NULL;
339 boot_hpet_disable = 1;
340 return 0;
341}
342
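To make the mult/shift comment inside hpet_enable() concrete: with HPET_SHIFT == 22 and hpet_period in femtoseconds per tick, the factor is simply (period << 22) / 10^6. For a common 14.31818 MHz HPET (period of roughly 69841279 fs) this works out to about 2.93e8 (approximate figures, for illustration only). Converting a delta of d cycles to nanoseconds is then (d * mult) >> 22, the usual clocksource convention.

static unsigned int hpet_mult_example(unsigned long long period_fs)
{
	/* ns/cyc ~= mult / 2^HPET_SHIFT, so mult = (fs/cyc << shift) / (fs/ns) */
	return (unsigned int)((period_fs << 22) / 1000000ULL);
}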
343
344#ifdef CONFIG_HPET_EMULATE_RTC
345
 346/* HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET
347 * is enabled, we support RTC interrupt functionality in software.
348 * RTC has 3 kinds of interrupts:
349 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
350 * is updated
351 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
352 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
353 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
354 * (1) and (2) above are implemented using polling at a frequency of
355 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
356 * overhead. (DEFAULT_RTC_INT_FREQ)
357 * For (3), we use interrupts at 64Hz or user specified periodic
358 * frequency, whichever is higher.
359 */
360#include <linux/mc146818rtc.h>
361#include <linux/rtc.h>
362
363#define DEFAULT_RTC_INT_FREQ 64
364#define DEFAULT_RTC_SHIFT 6
365#define RTC_NUM_INTS 1
366
367static unsigned long hpet_rtc_flags;
368static unsigned long hpet_prev_update_sec;
369static struct rtc_time hpet_alarm_time;
370static unsigned long hpet_pie_count;
371static unsigned long hpet_t1_cmp;
372static unsigned long hpet_default_delta;
373static unsigned long hpet_pie_delta;
374static unsigned long hpet_pie_limit;
375
376/*
377 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
378 * is not supported by all HPET implementations for timer 1.
379 *
380 * hpet_rtc_timer_init() is called when the rtc is initialized.
381 */
382int hpet_rtc_timer_init(void)
383{
384 unsigned long cfg, cnt, delta, flags;
385
386 if (!is_hpet_enabled())
387 return 0;
388
389 if (!hpet_default_delta) {
390 uint64_t clc;
391
392 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
393 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
394 hpet_default_delta = (unsigned long) clc;
395 }
396
397 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
398 delta = hpet_default_delta;
399 else
400 delta = hpet_pie_delta;
401
402 local_irq_save(flags);
403
404 cnt = delta + hpet_readl(HPET_COUNTER);
405 hpet_writel(cnt, HPET_T1_CMP);
406 hpet_t1_cmp = cnt;
407
408 cfg = hpet_readl(HPET_T1_CFG);
409 cfg &= ~HPET_TN_PERIODIC;
410 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
411 hpet_writel(cfg, HPET_T1_CFG);
412
413 local_irq_restore(flags);
414
415 return 1;
416}
417
418/*
419 * The functions below are called from rtc driver.
420 * Return 0 if HPET is not being used.
421 * Otherwise do the necessary changes and return 1.
422 */
423int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
424{
425 if (!is_hpet_enabled())
426 return 0;
427
428 hpet_rtc_flags &= ~bit_mask;
429 return 1;
430}
431
432int hpet_set_rtc_irq_bit(unsigned long bit_mask)
433{
434 unsigned long oldbits = hpet_rtc_flags;
435
436 if (!is_hpet_enabled())
437 return 0;
438
439 hpet_rtc_flags |= bit_mask;
440
441 if (!oldbits)
442 hpet_rtc_timer_init();
443
444 return 1;
445}
446
447int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
448 unsigned char sec)
449{
450 if (!is_hpet_enabled())
451 return 0;
452
453 hpet_alarm_time.tm_hour = hrs;
454 hpet_alarm_time.tm_min = min;
455 hpet_alarm_time.tm_sec = sec;
456
457 return 1;
458}
459
460int hpet_set_periodic_freq(unsigned long freq)
461{
462 uint64_t clc;
463
464 if (!is_hpet_enabled())
465 return 0;
466
467 if (freq <= DEFAULT_RTC_INT_FREQ)
468 hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
469 else {
470 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
471 do_div(clc, freq);
472 clc >>= hpet_clockevent.shift;
473 hpet_pie_delta = (unsigned long) clc;
474 }
475 return 1;
476}
477
478int hpet_rtc_dropped_irq(void)
479{
480 return is_hpet_enabled();
481}
482
483static void hpet_rtc_timer_reinit(void)
484{
485 unsigned long cfg, delta;
486 int lost_ints = -1;
487
488 if (unlikely(!hpet_rtc_flags)) {
489 cfg = hpet_readl(HPET_T1_CFG);
490 cfg &= ~HPET_TN_ENABLE;
491 hpet_writel(cfg, HPET_T1_CFG);
492 return;
493 }
494
495 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
496 delta = hpet_default_delta;
497 else
498 delta = hpet_pie_delta;
499
500 /*
501 * Increment the comparator value until we are ahead of the
502 * current count.
503 */
504 do {
505 hpet_t1_cmp += delta;
506 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
507 lost_ints++;
508 } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
509
510 if (lost_ints) {
511 if (hpet_rtc_flags & RTC_PIE)
512 hpet_pie_count += lost_ints;
513 if (printk_ratelimit())
514 printk(KERN_WARNING "rtc: lost %d interrupts\n",
515 lost_ints);
516 }
517}
518
519irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
520{
521 struct rtc_time curr_time;
522 unsigned long rtc_int_flag = 0;
523
524 hpet_rtc_timer_reinit();
525
526 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
527 rtc_get_rtc_time(&curr_time);
528
529 if (hpet_rtc_flags & RTC_UIE &&
530 curr_time.tm_sec != hpet_prev_update_sec) {
531 rtc_int_flag = RTC_UF;
532 hpet_prev_update_sec = curr_time.tm_sec;
533 }
534
535 if (hpet_rtc_flags & RTC_PIE &&
536 ++hpet_pie_count >= hpet_pie_limit) {
537 rtc_int_flag |= RTC_PF;
538 hpet_pie_count = 0;
539 }
540
541 if (hpet_rtc_flags & RTC_PIE &&
542 (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
543 (curr_time.tm_min == hpet_alarm_time.tm_min) &&
544 (curr_time.tm_hour == hpet_alarm_time.tm_hour))
545 rtc_int_flag |= RTC_AF;
546
547 if (rtc_int_flag) {
548 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
549 rtc_interrupt(rtc_int_flag, dev_id);
550 }
551 return IRQ_HANDLED;
552}
553#endif
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
new file mode 100644
index 000000000000..e3d4b73bfdb0
--- /dev/null
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -0,0 +1,30 @@
1#include <linux/module.h>
2#include <asm/checksum.h>
3#include <asm/desc.h>
4
5EXPORT_SYMBOL(__down_failed);
6EXPORT_SYMBOL(__down_failed_interruptible);
7EXPORT_SYMBOL(__down_failed_trylock);
8EXPORT_SYMBOL(__up_wakeup);
9/* Networking helper routines. */
10EXPORT_SYMBOL(csum_partial_copy_generic);
11
12EXPORT_SYMBOL(__get_user_1);
13EXPORT_SYMBOL(__get_user_2);
14EXPORT_SYMBOL(__get_user_4);
15
16EXPORT_SYMBOL(__put_user_1);
17EXPORT_SYMBOL(__put_user_2);
18EXPORT_SYMBOL(__put_user_4);
19EXPORT_SYMBOL(__put_user_8);
20
21EXPORT_SYMBOL(strstr);
22
23#ifdef CONFIG_SMP
24extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
25extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
26EXPORT_SYMBOL(__write_lock_failed);
27EXPORT_SYMBOL(__read_lock_failed);
28#endif
29
30EXPORT_SYMBOL(csum_partial);
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c
new file mode 100644
index 000000000000..665847281ed2
--- /dev/null
+++ b/arch/x86/kernel/i387_32.c
@@ -0,0 +1,546 @@
1/*
2 * linux/arch/i386/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * General FPU state handling cleanups
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 */
10
11#include <linux/sched.h>
12#include <linux/module.h>
13#include <asm/processor.h>
14#include <asm/i387.h>
15#include <asm/math_emu.h>
16#include <asm/sigcontext.h>
17#include <asm/user.h>
18#include <asm/ptrace.h>
19#include <asm/uaccess.h>
20
21#ifdef CONFIG_MATH_EMULATION
22#define HAVE_HWFP (boot_cpu_data.hard_math)
23#else
24#define HAVE_HWFP 1
25#endif
26
27static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
28
29void mxcsr_feature_mask_init(void)
30{
31 unsigned long mask = 0;
32 clts();
33 if (cpu_has_fxsr) {
34 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
35 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
36 mask = current->thread.i387.fxsave.mxcsr_mask;
37 if (mask == 0) mask = 0x0000ffbf;
38 }
39 mxcsr_feature_mask &= mask;
40 stts();
41}
42
43/*
 44 * The _current_ task is using the FPU for the first time,
 45 * so initialize it, set the mxcsr to its default
 46 * reset value if we support XMM instructions, and then
 47 * remember that the current task has used the FPU.
48 */
49void init_fpu(struct task_struct *tsk)
50{
51 if (cpu_has_fxsr) {
52 memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
53 tsk->thread.i387.fxsave.cwd = 0x37f;
54 if (cpu_has_xmm)
55 tsk->thread.i387.fxsave.mxcsr = 0x1f80;
56 } else {
57 memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
58 tsk->thread.i387.fsave.cwd = 0xffff037fu;
59 tsk->thread.i387.fsave.swd = 0xffff0000u;
60 tsk->thread.i387.fsave.twd = 0xffffffffu;
61 tsk->thread.i387.fsave.fos = 0xffff0000u;
62 }
63 /* only the device not available exception or ptrace can call init_fpu */
64 set_stopped_child_used_math(tsk);
65}
66
67/*
68 * FPU lazy state save handling.
69 */
70
71void kernel_fpu_begin(void)
72{
73 struct thread_info *thread = current_thread_info();
74
75 preempt_disable();
76 if (thread->status & TS_USEDFPU) {
77 __save_init_fpu(thread->task);
78 return;
79 }
80 clts();
81}
82EXPORT_SYMBOL_GPL(kernel_fpu_begin);
83
84/*
85 * FPU tag word conversions.
86 */
87
88static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
89{
90 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
91
92 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
93 tmp = ~twd;
94 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
95 /* and move the valid bits to the lower byte. */
96 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
97 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
98 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
99 return tmp;
100}
101
102static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
103{
104 struct _fpxreg *st = NULL;
105 unsigned long tos = (fxsave->swd >> 11) & 7;
106 unsigned long twd = (unsigned long) fxsave->twd;
107 unsigned long tag;
108 unsigned long ret = 0xffff0000u;
109 int i;
110
111#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
112
113 for ( i = 0 ; i < 8 ; i++ ) {
114 if ( twd & 0x1 ) {
115 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
116
117 switch ( st->exponent & 0x7fff ) {
118 case 0x7fff:
119 tag = 2; /* Special */
120 break;
121 case 0x0000:
122 if ( !st->significand[0] &&
123 !st->significand[1] &&
124 !st->significand[2] &&
125 !st->significand[3] ) {
126 tag = 1; /* Zero */
127 } else {
128 tag = 2; /* Special */
129 }
130 break;
131 default:
132 if ( st->significand[3] & 0x8000 ) {
133 tag = 0; /* Valid */
134 } else {
135 tag = 2; /* Special */
136 }
137 break;
138 }
139 } else {
140 tag = 3; /* Empty */
141 }
142 ret |= (tag << (2 * i));
143 twd = twd >> 1;
144 }
145 return ret;
146}
147
148/*
149 * FPU state interaction.
150 */
151
152unsigned short get_fpu_cwd( struct task_struct *tsk )
153{
154 if ( cpu_has_fxsr ) {
155 return tsk->thread.i387.fxsave.cwd;
156 } else {
157 return (unsigned short)tsk->thread.i387.fsave.cwd;
158 }
159}
160
161unsigned short get_fpu_swd( struct task_struct *tsk )
162{
163 if ( cpu_has_fxsr ) {
164 return tsk->thread.i387.fxsave.swd;
165 } else {
166 return (unsigned short)tsk->thread.i387.fsave.swd;
167 }
168}
169
170#if 0
171unsigned short get_fpu_twd( struct task_struct *tsk )
172{
173 if ( cpu_has_fxsr ) {
174 return tsk->thread.i387.fxsave.twd;
175 } else {
176 return (unsigned short)tsk->thread.i387.fsave.twd;
177 }
178}
179#endif /* 0 */
180
181unsigned short get_fpu_mxcsr( struct task_struct *tsk )
182{
183 if ( cpu_has_xmm ) {
184 return tsk->thread.i387.fxsave.mxcsr;
185 } else {
186 return 0x1f80;
187 }
188}
189
190#if 0
191
192void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
193{
194 if ( cpu_has_fxsr ) {
195 tsk->thread.i387.fxsave.cwd = cwd;
196 } else {
197 tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
198 }
199}
200
201void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
202{
203 if ( cpu_has_fxsr ) {
204 tsk->thread.i387.fxsave.swd = swd;
205 } else {
206 tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
207 }
208}
209
210void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
211{
212 if ( cpu_has_fxsr ) {
213 tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
214 } else {
215 tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
216 }
217}
218
219#endif /* 0 */
220
221/*
222 * FXSR floating point environment conversions.
223 */
224
225static int convert_fxsr_to_user( struct _fpstate __user *buf,
226 struct i387_fxsave_struct *fxsave )
227{
228 unsigned long env[7];
229 struct _fpreg __user *to;
230 struct _fpxreg *from;
231 int i;
232
233 env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
234 env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
235 env[2] = twd_fxsr_to_i387(fxsave);
236 env[3] = fxsave->fip;
237 env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
238 env[5] = fxsave->foo;
239 env[6] = fxsave->fos;
240
241 if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
242 return 1;
243
244 to = &buf->_st[0];
245 from = (struct _fpxreg *) &fxsave->st_space[0];
246 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
247 unsigned long __user *t = (unsigned long __user *)to;
248 unsigned long *f = (unsigned long *)from;
249
250 if (__put_user(*f, t) ||
251 __put_user(*(f + 1), t + 1) ||
252 __put_user(from->exponent, &to->exponent))
253 return 1;
254 }
255 return 0;
256}
257
258static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
259 struct _fpstate __user *buf )
260{
261 unsigned long env[7];
262 struct _fpxreg *to;
263 struct _fpreg __user *from;
264 int i;
265
266 if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
267 return 1;
268
269 fxsave->cwd = (unsigned short)(env[0] & 0xffff);
270 fxsave->swd = (unsigned short)(env[1] & 0xffff);
271 fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
272 fxsave->fip = env[3];
273 fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
274 fxsave->fcs = (env[4] & 0xffff);
275 fxsave->foo = env[5];
276 fxsave->fos = env[6];
277
278 to = (struct _fpxreg *) &fxsave->st_space[0];
279 from = &buf->_st[0];
280 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
281 unsigned long *t = (unsigned long *)to;
282 unsigned long __user *f = (unsigned long __user *)from;
283
284 if (__get_user(*t, f) ||
285 __get_user(*(t + 1), f + 1) ||
286 __get_user(to->exponent, &from->exponent))
287 return 1;
288 }
289 return 0;
290}
291
292/*
293 * Signal frame handlers.
294 */
295
296static inline int save_i387_fsave( struct _fpstate __user *buf )
297{
298 struct task_struct *tsk = current;
299
300 unlazy_fpu( tsk );
301 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
302 if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
303 sizeof(struct i387_fsave_struct) ) )
304 return -1;
305 return 1;
306}
307
308static int save_i387_fxsave( struct _fpstate __user *buf )
309{
310 struct task_struct *tsk = current;
311 int err = 0;
312
313 unlazy_fpu( tsk );
314
315 if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
316 return -1;
317
318 err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
319 err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
320 if ( err )
321 return -1;
322
323 if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
324 sizeof(struct i387_fxsave_struct) ) )
325 return -1;
326 return 1;
327}
328
329int save_i387( struct _fpstate __user *buf )
330{
331 if ( !used_math() )
332 return 0;
333
334 /* This will cause a "finit" to be triggered by the next
335 * attempted FPU operation by the 'current' process.
336 */
337 clear_used_math();
338
339 if ( HAVE_HWFP ) {
340 if ( cpu_has_fxsr ) {
341 return save_i387_fxsave( buf );
342 } else {
343 return save_i387_fsave( buf );
344 }
345 } else {
346 return save_i387_soft( &current->thread.i387.soft, buf );
347 }
348}
349
350static inline int restore_i387_fsave( struct _fpstate __user *buf )
351{
352 struct task_struct *tsk = current;
353 clear_fpu( tsk );
354 return __copy_from_user( &tsk->thread.i387.fsave, buf,
355 sizeof(struct i387_fsave_struct) );
356}
357
358static int restore_i387_fxsave( struct _fpstate __user *buf )
359{
360 int err;
361 struct task_struct *tsk = current;
362 clear_fpu( tsk );
363 err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
364 sizeof(struct i387_fxsave_struct) );
365 /* mxcsr reserved bits must be masked to zero for security reasons */
366 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
367 return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
368}
369
370int restore_i387( struct _fpstate __user *buf )
371{
372 int err;
373
374 if ( HAVE_HWFP ) {
375 if ( cpu_has_fxsr ) {
376 err = restore_i387_fxsave( buf );
377 } else {
378 err = restore_i387_fsave( buf );
379 }
380 } else {
381 err = restore_i387_soft( &current->thread.i387.soft, buf );
382 }
383 set_used_math();
384 return err;
385}
386
387/*
388 * ptrace request handlers.
389 */
390
391static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
392 struct task_struct *tsk )
393{
394 return __copy_to_user( buf, &tsk->thread.i387.fsave,
395 sizeof(struct user_i387_struct) );
396}
397
398static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
399 struct task_struct *tsk )
400{
401 return convert_fxsr_to_user( (struct _fpstate __user *)buf,
402 &tsk->thread.i387.fxsave );
403}
404
405int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
406{
407 if ( HAVE_HWFP ) {
408 if ( cpu_has_fxsr ) {
409 return get_fpregs_fxsave( buf, tsk );
410 } else {
411 return get_fpregs_fsave( buf, tsk );
412 }
413 } else {
414 return save_i387_soft( &tsk->thread.i387.soft,
415 (struct _fpstate __user *)buf );
416 }
417}
418
419static inline int set_fpregs_fsave( struct task_struct *tsk,
420 struct user_i387_struct __user *buf )
421{
422 return __copy_from_user( &tsk->thread.i387.fsave, buf,
423 sizeof(struct user_i387_struct) );
424}
425
426static inline int set_fpregs_fxsave( struct task_struct *tsk,
427 struct user_i387_struct __user *buf )
428{
429 return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
430 (struct _fpstate __user *)buf );
431}
432
433int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
434{
435 if ( HAVE_HWFP ) {
436 if ( cpu_has_fxsr ) {
437 return set_fpregs_fxsave( tsk, buf );
438 } else {
439 return set_fpregs_fsave( tsk, buf );
440 }
441 } else {
442 return restore_i387_soft( &tsk->thread.i387.soft,
443 (struct _fpstate __user *)buf );
444 }
445}
446
447int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
448{
449 if ( cpu_has_fxsr ) {
450 if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
451 sizeof(struct user_fxsr_struct) ))
452 return -EFAULT;
453 return 0;
454 } else {
455 return -EIO;
456 }
457}
458
459int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
460{
461 int ret = 0;
462
463 if ( cpu_has_fxsr ) {
464 if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
465 sizeof(struct user_fxsr_struct) ))
466 ret = -EFAULT;
467 /* mxcsr reserved bits must be masked to zero for security reasons */
468 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
469 } else {
470 ret = -EIO;
471 }
472 return ret;
473}
474
475/*
476 * FPU state for core dumps.
477 */
478
479static inline void copy_fpu_fsave( struct task_struct *tsk,
480 struct user_i387_struct *fpu )
481{
482 memcpy( fpu, &tsk->thread.i387.fsave,
483 sizeof(struct user_i387_struct) );
484}
485
486static inline void copy_fpu_fxsave( struct task_struct *tsk,
487 struct user_i387_struct *fpu )
488{
489 unsigned short *to;
490 unsigned short *from;
491 int i;
492
493 memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
494
495 to = (unsigned short *)&fpu->st_space[0];
496 from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
497 for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
498 memcpy( to, from, 5 * sizeof(unsigned short) );
499 }
500}
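
The stride in the loop above comes from the two register layouts: the fxsave image stores each of the eight FP registers in a 16-byte slot (80 bits of data plus padding), while the legacy user_i387_struct packs the 80-bit registers back to back, 10 bytes apiece, so copying 5 of every 8 16-bit words per register converts one layout into the other. A simplified sketch of the two slot layouts (illustrative struct names, not from the kernel headers):

	struct fxsave_st_slot {			/* FXSAVE area: 16 bytes per register */
		unsigned short significand[4];	/* 64-bit significand */
		unsigned short exponent;	/* sign + 15-bit exponent */
		unsigned short reserved[3];	/* padding to 16 bytes */
	};

	struct fsave_st_slot {			/* legacy area: 10 bytes, packed */
		unsigned short significand[4];
		unsigned short exponent;
	};
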
501
502int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
503{
504 int fpvalid;
505 struct task_struct *tsk = current;
506
507 fpvalid = !!used_math();
508 if ( fpvalid ) {
509 unlazy_fpu( tsk );
510 if ( cpu_has_fxsr ) {
511 copy_fpu_fxsave( tsk, fpu );
512 } else {
513 copy_fpu_fsave( tsk, fpu );
514 }
515 }
516
517 return fpvalid;
518}
519EXPORT_SYMBOL(dump_fpu);
520
521int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
522{
523 int fpvalid = !!tsk_used_math(tsk);
524
525 if (fpvalid) {
526 if (tsk == current)
527 unlazy_fpu(tsk);
528 if (cpu_has_fxsr)
529 copy_fpu_fxsave(tsk, fpu);
530 else
531 copy_fpu_fsave(tsk, fpu);
532 }
533 return fpvalid;
534}
535
536int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
537{
538 int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
539
540 if (fpvalid) {
541 if (tsk == current)
542 unlazy_fpu(tsk);
543 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
544 }
545 return fpvalid;
546}
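
Both restore_i387_fxsave() and set_fpxregs() above clear reserved MXCSR bits before the value can reach FXRSTOR, because loading a reserved MXCSR bit raises a general-protection fault, and an unprivileged signal frame or ptrace caller could otherwise trigger that fault inside the kernel's FPU restore path. A hedged sketch of the sanitisation, assuming 0x0000ffbf as the default feature mask when the CPU does not report one:

	/* Illustrative only: keep just the MXCSR bits the CPU implements
	 * (feature mask value assumed, e.g. 0x0000ffbf). */
	static inline unsigned long sanitize_mxcsr(unsigned long mxcsr,
						   unsigned long feature_mask)
	{
		return mxcsr & feature_mask;
	}
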
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
new file mode 100644
index 000000000000..6f508e8d7c57
--- /dev/null
+++ b/arch/x86/kernel/i8237.c
@@ -0,0 +1,72 @@
1/*
2 * i8237.c: 8237A DMA controller suspend functions.
3 *
4 * Written by Pierre Ossman, 2005.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or (at
9 * your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/sysdev.h>
14
15#include <asm/dma.h>
16
17/*
18 * This module just handles suspend/resume issues with the
19 * 8237A DMA controller (used for ISA and LPC).
20 * Allocation is handled in kernel/dma.c and normal usage is
21 * in asm/dma.h.
22 */
23
24static int i8237A_resume(struct sys_device *dev)
25{
26 unsigned long flags;
27 int i;
28
29 flags = claim_dma_lock();
30
31 dma_outb(DMA1_RESET_REG, 0);
32 dma_outb(DMA2_RESET_REG, 0);
33
34 for (i = 0;i < 8;i++) {
35 set_dma_addr(i, 0x000000);
36 /* DMA count is a bit weird so this is not 0 */
37 set_dma_count(i, 1);
38 }
39
40 /* Enable cascade DMA or channel 0-3 won't work */
41 enable_dma(4);
42
43 release_dma_lock(flags);
44
45 return 0;
46}
47
48static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
49{
50 return 0;
51}
52
53static struct sysdev_class i8237_sysdev_class = {
54 set_kset_name("i8237"),
55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume,
57};
58
59static struct sys_device device_i8237A = {
60 .id = 0,
61 .cls = &i8237_sysdev_class,
62};
63
64static int __init i8237A_init_sysfs(void)
65{
66 int error = sysdev_class_register(&i8237_sysdev_class);
67 if (!error)
68 error = sysdev_register(&device_i8237A);
69 return error;
70}
71
72device_initcall(i8237A_init_sysfs);
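
The resume handler above only resets the two controllers and re-enables the cascade channel; per-channel state is the owning driver's responsibility. For context, a driver typically programs an ISA DMA channel under the same lock using the <asm/dma.h> helpers this file refers to. A hedged sketch (channel, address and count are made-up values; DMA_MODE_READ means device-to-memory):

	static void program_isa_dma(unsigned int chan, unsigned long bus_addr,
				    unsigned int count)
	{
		unsigned long flags = claim_dma_lock();

		disable_dma(chan);
		clear_dma_ff(chan);			/* reset the byte flip-flop */
		set_dma_mode(chan, DMA_MODE_READ);
		set_dma_addr(chan, bus_addr);
		set_dma_count(chan, count);
		enable_dma(chan);

		release_dma_lock(flags);
	}
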
diff --git a/arch/x86/kernel/i8253_32.c b/arch/x86/kernel/i8253_32.c
new file mode 100644
index 000000000000..6d839f2f1b1a
--- /dev/null
+++ b/arch/x86/kernel/i8253_32.c
@@ -0,0 +1,206 @@
1/*
2 * i8253.c 8253/PIT functions
3 *
4 */
5#include <linux/clockchips.h>
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <linux/jiffies.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11
12#include <asm/smp.h>
13#include <asm/delay.h>
14#include <asm/i8253.h>
15#include <asm/io.h>
16#include <asm/timer.h>
17
18DEFINE_SPINLOCK(i8253_lock);
19EXPORT_SYMBOL(i8253_lock);
20
21/*
22 * HPET replaces the PIT when enabled, so we need to know which of
23 * the two timers is used.
24 */
25struct clock_event_device *global_clock_event;
26
27/*
28 * Initialize the PIT timer.
29 *
30 * This is also called after resume to bring the PIT into operation again.
31 */
32static void init_pit_timer(enum clock_event_mode mode,
33 struct clock_event_device *evt)
34{
35 unsigned long flags;
36
37 spin_lock_irqsave(&i8253_lock, flags);
38
39 switch(mode) {
40 case CLOCK_EVT_MODE_PERIODIC:
41 /* binary, mode 2, LSB/MSB, ch 0 */
42 outb_p(0x34, PIT_MODE);
43 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
44 outb(LATCH >> 8 , PIT_CH0); /* MSB */
45 break;
46
47 case CLOCK_EVT_MODE_SHUTDOWN:
48 case CLOCK_EVT_MODE_UNUSED:
49 if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
50 evt->mode == CLOCK_EVT_MODE_ONESHOT) {
51 outb_p(0x30, PIT_MODE);
52 outb_p(0, PIT_CH0);
53 outb_p(0, PIT_CH0);
54 }
55 break;
56
57 case CLOCK_EVT_MODE_ONESHOT:
58 /* One shot setup */
59 outb_p(0x38, PIT_MODE);
60 break;
61
62 case CLOCK_EVT_MODE_RESUME:
63 /* Nothing to do here */
64 break;
65 }
66 spin_unlock_irqrestore(&i8253_lock, flags);
67}
68
69/*
70 * Program the next event in oneshot mode
71 *
72 * Delta is given in PIT ticks
73 */
74static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
75{
76 unsigned long flags;
77
78 spin_lock_irqsave(&i8253_lock, flags);
79 outb_p(delta & 0xff , PIT_CH0); /* LSB */
80 outb(delta >> 8 , PIT_CH0); /* MSB */
81 spin_unlock_irqrestore(&i8253_lock, flags);
82
83 return 0;
84}
85
86/*
87 * On UP the PIT can serve all of the possible timer functions. On SMP systems
88 * it can be solely used for the global tick.
89 *
90 * The profiling and update capabilities are switched off once the local apic is
91 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
92 * !using_apic_timer decisions in do_timer_interrupt_hook()
93 */
94struct clock_event_device pit_clockevent = {
95 .name = "pit",
96 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
97 .set_mode = init_pit_timer,
98 .set_next_event = pit_next_event,
99 .shift = 32,
100 .irq = 0,
101};
102
103/*
104 * Initialize the conversion factor and the min/max deltas of the clock event
105 * structure and register the clock event source with the framework.
106 */
107void __init setup_pit_timer(void)
108{
109 /*
110 * Start pit with the boot cpu mask and make it global after the
111 * IO_APIC has been initialized.
112 */
113 pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
114 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
115 pit_clockevent.max_delta_ns =
116 clockevent_delta2ns(0x7FFF, &pit_clockevent);
117 pit_clockevent.min_delta_ns =
118 clockevent_delta2ns(0xF, &pit_clockevent);
119 clockevents_register_device(&pit_clockevent);
120 global_clock_event = &pit_clockevent;
121}
122
123/*
124 * Since the PIT overflows every tick, it's not very useful
125 * to just read by itself. So use jiffies to emulate a free
126 * running counter:
127 */
128static cycle_t pit_read(void)
129{
130 unsigned long flags;
131 int count;
132 u32 jifs;
133 static int old_count;
134 static u32 old_jifs;
135
136 spin_lock_irqsave(&i8253_lock, flags);
137 /*
138 * Although our caller may have the read side of xtime_lock,
139 * this is now a seqlock, and we are cheating in this routine
140 * by having side effects on state that we cannot undo if
141 * there is a collision on the seqlock and our caller has to
142 * retry. (Namely, old_jifs and old_count.) So we must treat
143 * jiffies as volatile despite the lock. We read jiffies
144 * before latching the timer count to guarantee that although
145 * the jiffies value might be older than the count (that is,
146 * the counter may underflow between the last point where
147 * jiffies was incremented and the point where we latch the
148 * count), it cannot be newer.
149 */
150 jifs = jiffies;
151 outb_p(0x00, PIT_MODE); /* latch the count ASAP */
152 count = inb_p(PIT_CH0); /* read the latched count */
153 count |= inb_p(PIT_CH0) << 8;
154
155 /* VIA686a test code... reset the latch if count > max + 1 */
156 if (count > LATCH) {
157 outb_p(0x34, PIT_MODE);
158 outb_p(LATCH & 0xff, PIT_CH0);
159 outb(LATCH >> 8, PIT_CH0);
160 count = LATCH - 1;
161 }
162
163 /*
164 * It's possible for count to appear to go the wrong way for a
165 * couple of reasons:
166 *
167 * 1. The timer counter underflows, but we haven't handled the
168 * resulting interrupt and incremented jiffies yet.
169 * 2. Hardware problem with the timer, not giving us continuous time,
170 * the counter does small "jumps" upwards on some Pentium systems,
171 * (see c't 95/10 page 335 for Neptun bug.)
172 *
173 * Previous attempts to handle these cases intelligently were
174 * buggy, so we just do the simple thing now.
175 */
176 if (count > old_count && jifs == old_jifs) {
177 count = old_count;
178 }
179 old_count = count;
180 old_jifs = jifs;
181
182 spin_unlock_irqrestore(&i8253_lock, flags);
183
184 count = (LATCH - 1) - count;
185
186 return (cycle_t)(jifs * LATCH) + count;
187}
188
189static struct clocksource clocksource_pit = {
190 .name = "pit",
191 .rating = 110,
192 .read = pit_read,
193 .mask = CLOCKSOURCE_MASK(32),
194 .mult = 0,
195 .shift = 20,
196};
197
198static int __init init_pit_clocksource(void)
199{
200 if (num_possible_cpus() > 1) /* PIT does not scale! */
201 return 0;
202
203 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
204 return clocksource_register(&clocksource_pit);
205}
206arch_initcall(init_pit_clocksource);
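
Both registrations above rely on the same fixed-point trick: the conversion factor is pre-scaled by a power of two so that runtime conversions are a multiply and a shift rather than a divide. For the clock event device, div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32) yields a mult such that PIT ticks ≈ (ns * mult) >> 32; for the clocksource, clocksource_hz2mult(CLOCK_TICK_RATE, 20) yields a mult such that ns ≈ (cycles * mult) >> 20. A standalone sketch of the arithmetic (illustrative names; PIT_HZ is the usual 1193182 Hz PC value, assumed here):

	#define SKETCH_NSEC_PER_SEC	1000000000ULL
	#define SKETCH_PIT_HZ		1193182ULL

	/* clockevent direction: nanoseconds -> PIT ticks (shift = 32) */
	static inline unsigned long ns_to_pit_ticks(unsigned long long ns)
	{
		unsigned long long mult = (SKETCH_PIT_HZ << 32) / SKETCH_NSEC_PER_SEC;

		return (unsigned long)((ns * mult) >> 32);
	}

	/* clocksource direction: PIT cycles -> nanoseconds (shift = 20) */
	static inline unsigned long long pit_cycles_to_ns(unsigned long long cycles)
	{
		unsigned long long mult = (SKETCH_NSEC_PER_SEC << 20) / SKETCH_PIT_HZ;

		return (cycles * mult) >> 20;
	}
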
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
new file mode 100644
index 000000000000..0499cbe9871a
--- /dev/null
+++ b/arch/x86/kernel/i8259_32.c
@@ -0,0 +1,420 @@
1#include <linux/errno.h>
2#include <linux/signal.h>
3#include <linux/sched.h>
4#include <linux/ioport.h>
5#include <linux/interrupt.h>
6#include <linux/slab.h>
7#include <linux/random.h>
8#include <linux/init.h>
9#include <linux/kernel_stat.h>
10#include <linux/sysdev.h>
11#include <linux/bitops.h>
12
13#include <asm/8253pit.h>
14#include <asm/atomic.h>
15#include <asm/system.h>
16#include <asm/io.h>
17#include <asm/timer.h>
18#include <asm/pgtable.h>
19#include <asm/delay.h>
20#include <asm/desc.h>
21#include <asm/apic.h>
22#include <asm/arch_hooks.h>
23#include <asm/i8259.h>
24
25#include <io_ports.h>
26
27/*
28 * This is the 'legacy' 8259A Programmable Interrupt Controller,
29 * present in the majority of PC/AT boxes.
30 * plus some generic x86-specific things, if generic specifics make
31 * any sense at all.
32 * This file should become arch/i386/kernel/irq.c when the old irq.c
33 * moves to arch-independent land.
34 */
35
36static int i8259A_auto_eoi;
37DEFINE_SPINLOCK(i8259A_lock);
38static void mask_and_ack_8259A(unsigned int);
39
40static struct irq_chip i8259A_chip = {
41 .name = "XT-PIC",
42 .mask = disable_8259A_irq,
43 .disable = disable_8259A_irq,
44 .unmask = enable_8259A_irq,
45 .mask_ack = mask_and_ack_8259A,
46};
47
48/*
49 * 8259A PIC functions to handle ISA devices:
50 */
51
52/*
53 * This contains the irq mask for both 8259A irq controllers.
54 */
55unsigned int cached_irq_mask = 0xffff;
56
57/*
58 * Not all IRQs can be routed through the IO-APIC, e.g. on certain (older)
59 * boards the timer interrupt is not really connected to any IO-APIC pin,
60 * it's fed to the master 8259A's IR0 line only.
61 *
62 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
63 * This 'mixed mode' IRQ handling costs nothing because it's only used
64 * at IRQ setup time.
65 */
66unsigned long io_apic_irqs;
67
68void disable_8259A_irq(unsigned int irq)
69{
70 unsigned int mask = 1 << irq;
71 unsigned long flags;
72
73 spin_lock_irqsave(&i8259A_lock, flags);
74 cached_irq_mask |= mask;
75 if (irq & 8)
76 outb(cached_slave_mask, PIC_SLAVE_IMR);
77 else
78 outb(cached_master_mask, PIC_MASTER_IMR);
79 spin_unlock_irqrestore(&i8259A_lock, flags);
80}
81
82void enable_8259A_irq(unsigned int irq)
83{
84 unsigned int mask = ~(1 << irq);
85 unsigned long flags;
86
87 spin_lock_irqsave(&i8259A_lock, flags);
88 cached_irq_mask &= mask;
89 if (irq & 8)
90 outb(cached_slave_mask, PIC_SLAVE_IMR);
91 else
92 outb(cached_master_mask, PIC_MASTER_IMR);
93 spin_unlock_irqrestore(&i8259A_lock, flags);
94}
95
96int i8259A_irq_pending(unsigned int irq)
97{
98 unsigned int mask = 1<<irq;
99 unsigned long flags;
100 int ret;
101
102 spin_lock_irqsave(&i8259A_lock, flags);
103 if (irq < 8)
104 ret = inb(PIC_MASTER_CMD) & mask;
105 else
106 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
107 spin_unlock_irqrestore(&i8259A_lock, flags);
108
109 return ret;
110}
111
112void make_8259A_irq(unsigned int irq)
113{
114 disable_irq_nosync(irq);
115 io_apic_irqs &= ~(1<<irq);
116 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
117 "XT");
118 enable_irq(irq);
119}
120
121/*
122 * This function is assumed to be called rarely. Switching between
123 * 8259A registers is slow.
124 * This has to be protected by the irq controller spinlock
125 * before being called.
126 */
127static inline int i8259A_irq_real(unsigned int irq)
128{
129 int value;
130 int irqmask = 1<<irq;
131
132 if (irq < 8) {
133 outb(0x0B,PIC_MASTER_CMD); /* ISR register */
134 value = inb(PIC_MASTER_CMD) & irqmask;
135 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
136 return value;
137 }
138 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
139 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
140 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
141 return value;
142}
143
144/*
145 * Careful! The 8259A is a fragile beast, it pretty
146 * much _has_ to be done exactly like this (mask it
147 * first, _then_ send the EOI, and the order of EOI
148 * to the two 8259s is important)!
149 */
150static void mask_and_ack_8259A(unsigned int irq)
151{
152 unsigned int irqmask = 1 << irq;
153 unsigned long flags;
154
155 spin_lock_irqsave(&i8259A_lock, flags);
156 /*
157 * Lightweight spurious IRQ detection. We do not want
158 * to overdo spurious IRQ handling - it's usually a sign
159 * of hardware problems, so we only do the checks we can
160 * do without slowing down good hardware unnecessarily.
161 *
162 * Note that IRQ7 and IRQ15 (the two spurious IRQs
163 * usually resulting from the 8259A-1|2 PICs) occur
164 * even if the IRQ is masked in the 8259A. Thus we
165 * can check spurious 8259A IRQs without doing the
166 * quite slow i8259A_irq_real() call for every IRQ.
167 * This does not cover 100% of spurious interrupts,
168 * but should be enough to warn the user that there
169 * is something bad going on ...
170 */
171 if (cached_irq_mask & irqmask)
172 goto spurious_8259A_irq;
173 cached_irq_mask |= irqmask;
174
175handle_real_irq:
176 if (irq & 8) {
177 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
178 outb(cached_slave_mask, PIC_SLAVE_IMR);
179 outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
180 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
181 } else {
182 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
183 outb(cached_master_mask, PIC_MASTER_IMR);
184		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI' to master */
185 }
186 spin_unlock_irqrestore(&i8259A_lock, flags);
187 return;
188
189spurious_8259A_irq:
190 /*
191 * this is the slow path - should happen rarely.
192 */
193 if (i8259A_irq_real(irq))
194 /*
195 * oops, the IRQ _is_ in service according to the
196 * 8259A - not spurious, go handle it.
197 */
198 goto handle_real_irq;
199
200 {
201 static int spurious_irq_mask;
202 /*
203 * At this point we can be sure the IRQ is spurious,
204	 * let's ACK and report it. [once per IRQ]
205 */
206 if (!(spurious_irq_mask & irqmask)) {
207 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
208 spurious_irq_mask |= irqmask;
209 }
210 atomic_inc(&irq_err_count);
211 /*
212 * Theoretically we do not have to handle this IRQ,
213 * but in Linux this does not cause problems and is
214 * simpler for us.
215 */
216 goto handle_real_irq;
217 }
218}
219
220static char irq_trigger[2];
221/**
222 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
223 */
224static void restore_ELCR(char *trigger)
225{
226 outb(trigger[0], 0x4d0);
227 outb(trigger[1], 0x4d1);
228}
229
230static void save_ELCR(char *trigger)
231{
232 /* IRQ 0,1,2,8,13 are marked as reserved */
233 trigger[0] = inb(0x4d0) & 0xF8;
234 trigger[1] = inb(0x4d1) & 0xDE;
235}
236
237static int i8259A_resume(struct sys_device *dev)
238{
239 init_8259A(i8259A_auto_eoi);
240 restore_ELCR(irq_trigger);
241 return 0;
242}
243
244static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
245{
246 save_ELCR(irq_trigger);
247 return 0;
248}
249
250static int i8259A_shutdown(struct sys_device *dev)
251{
252 /* Put the i8259A into a quiescent state that
253 * the kernel initialization code can get it
254 * out of.
255 */
256 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
257	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
258 return 0;
259}
260
261static struct sysdev_class i8259_sysdev_class = {
262 set_kset_name("i8259"),
263 .suspend = i8259A_suspend,
264 .resume = i8259A_resume,
265 .shutdown = i8259A_shutdown,
266};
267
268static struct sys_device device_i8259A = {
269 .id = 0,
270 .cls = &i8259_sysdev_class,
271};
272
273static int __init i8259A_init_sysfs(void)
274{
275 int error = sysdev_class_register(&i8259_sysdev_class);
276 if (!error)
277 error = sysdev_register(&device_i8259A);
278 return error;
279}
280
281device_initcall(i8259A_init_sysfs);
282
283void init_8259A(int auto_eoi)
284{
285 unsigned long flags;
286
287 i8259A_auto_eoi = auto_eoi;
288
289 spin_lock_irqsave(&i8259A_lock, flags);
290
291 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
292 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
293
294 /*
295 * outb_p - this has to work on a wide range of PC hardware.
296 */
297 outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
298 outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
299 outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
300 if (auto_eoi) /* master does Auto EOI */
301 outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
302 else /* master expects normal EOI */
303 outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
304
305 outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
306 outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
307 outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
308 outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
309 if (auto_eoi)
310 /*
311 * In AEOI mode we just have to mask the interrupt
312 * when acking.
313 */
314 i8259A_chip.mask_ack = disable_8259A_irq;
315 else
316 i8259A_chip.mask_ack = mask_and_ack_8259A;
317
318 udelay(100); /* wait for 8259A to initialize */
319
320 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
321 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
322
323 spin_unlock_irqrestore(&i8259A_lock, flags);
324}
325
326/*
327 * Note that on a 486, we don't want to do a SIGFPE on an irq13
328 * as the irq is unreliable, and exception 16 works correctly
329 * (i.e. as explained in the intel literature). On a 386, you
330 * can't use exception 16 due to bad IBM design, so we have to
331 * rely on the less exact irq13.
332 *
333 * Careful: not only is IRQ13 unreliable, it also
334 * leads to races. IBM designers who came up with it should
335 * be shot.
336 */
337
338
339static irqreturn_t math_error_irq(int cpl, void *dev_id)
340{
341 extern void math_error(void __user *);
342 outb(0,0xF0);
343 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
344 return IRQ_NONE;
345 math_error((void __user *)get_irq_regs()->eip);
346 return IRQ_HANDLED;
347}
348
349/*
350 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
351 * so allow interrupt sharing.
352 */
353static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
354
355void __init init_ISA_irqs (void)
356{
357 int i;
358
359#ifdef CONFIG_X86_LOCAL_APIC
360 init_bsp_APIC();
361#endif
362 init_8259A(0);
363
364 for (i = 0; i < NR_IRQS; i++) {
365 irq_desc[i].status = IRQ_DISABLED;
366 irq_desc[i].action = NULL;
367 irq_desc[i].depth = 1;
368
369 if (i < 16) {
370 /*
371 * 16 old-style INTA-cycle interrupts:
372 */
373 set_irq_chip_and_handler_name(i, &i8259A_chip,
374 handle_level_irq, "XT");
375 } else {
376 /*
377 * 'high' PCI IRQs filled in on demand
378 */
379 irq_desc[i].chip = &no_irq_chip;
380 }
381 }
382}
383
384/* Overridden in paravirt.c */
385void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
386
387void __init native_init_IRQ(void)
388{
389 int i;
390
391 /* all the set up before the call gates are initialised */
392 pre_intr_init_hook();
393
394 /*
395 * Cover the whole vector space, no vector can escape
396 * us. (some of these will be overridden and become
397 * 'special' SMP interrupts)
398 */
399 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
400 int vector = FIRST_EXTERNAL_VECTOR + i;
401 if (i >= NR_IRQS)
402 break;
403 if (vector != SYSCALL_VECTOR)
404 set_intr_gate(vector, interrupt[i]);
405 }
406
407 /* setup after call gates are initialised (usually add in
408 * the architecture specific gates)
409 */
410 intr_init_hook();
411
412 /*
413 * External FPU? Set up irq13 if so, for
414 * original braindamaged IBM FERR coupling.
415 */
416 if (boot_cpu_data.hard_math && !cpu_has_fpu)
417 setup_irq(FPU_IRQ, &fpu_irq);
418
419 irq_ctx_init(smp_processor_id());
420}
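
Throughout this file a single 16-bit shadow, cached_irq_mask, tracks both interrupt mask registers: the low byte mirrors the master PIC's IMR (IRQ 0-7) and the high byte the slave's (IRQ 8-15); the cached_master_mask/cached_slave_mask names used above are byte views of that shadow, defined in <asm/i8259.h>. A small sketch of the mapping, for illustration only:

	/* Illustrative: masking IRQ 10 sets bit 10 of the shadow, which is
	 * bit 2 of the slave PIC's IMR. */
	static inline unsigned char master_imr_of(unsigned int cached_mask)
	{
		return cached_mask & 0xff;		/* IRQ 0-7 */
	}

	static inline unsigned char slave_imr_of(unsigned int cached_mask)
	{
		return (cached_mask >> 8) & 0xff;	/* IRQ 8-15 */
	}
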
diff --git a/arch/x86/kernel/init_task_32.c b/arch/x86/kernel/init_task_32.c
new file mode 100644
index 000000000000..d26fc063a760
--- /dev/null
+++ b/arch/x86/kernel/init_task_32.c
@@ -0,0 +1,46 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial thread structure.
23 *
24 * We need to make sure that this is THREAD_SIZE aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry..
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40
41/*
42 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
43 * no more per-task TSS's.
44 */
45DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
46
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
new file mode 100644
index 000000000000..e2f4a1c68547
--- /dev/null
+++ b/arch/x86/kernel/io_apic_32.c
@@ -0,0 +1,2847 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/mc146818rtc.h>
29#include <linux/compiler.h>
30#include <linux/acpi.h>
31#include <linux/module.h>
32#include <linux/sysdev.h>
33#include <linux/pci.h>
34#include <linux/msi.h>
35#include <linux/htirq.h>
36#include <linux/freezer.h>
37#include <linux/kthread.h>
38
39#include <asm/io.h>
40#include <asm/smp.h>
41#include <asm/desc.h>
42#include <asm/timer.h>
43#include <asm/i8259.h>
44#include <asm/nmi.h>
45#include <asm/msidef.h>
46#include <asm/hypertransport.h>
47
48#include <mach_apic.h>
49#include <mach_apicdef.h>
50
51#include "io_ports.h"
52
53int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count;
55
56/* Where, if anywhere, the i8259 is connected in external int mode */
57static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
58
59static DEFINE_SPINLOCK(ioapic_lock);
60static DEFINE_SPINLOCK(vector_lock);
61
62int timer_over_8254 __initdata = 1;
63
64/*
65 * Is the SiS APIC rmw bug present ?
66 * -1 = don't know, 0 = no, 1 = yes
67 */
68int sis_apic_bug = -1;
69
70/*
71 * # of IRQ routing registers
72 */
73int nr_ioapic_registers[MAX_IO_APICS];
74
75static int disable_timer_pin_1 __initdata;
76
77/*
78 * Rough estimate of how many shared IRQs there are; can
79 * be changed anytime.
80 */
81#define MAX_PLUS_SHARED_IRQS NR_IRQS
82#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
83
84/*
85 * This is performance-critical, we want to do it O(1)
86 *
87 * the indexing order of this array favors 1:1 mappings
88 * between pins and IRQs.
89 */
90
91static struct irq_pin_list {
92 int apic, pin, next;
93} irq_2_pin[PIN_MAP_SIZE];
94
95struct io_apic {
96 unsigned int index;
97 unsigned int unused[3];
98 unsigned int data;
99};
100
101static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
102{
103 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
104 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
105}
106
107static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
108{
109 struct io_apic __iomem *io_apic = io_apic_base(apic);
110 writel(reg, &io_apic->index);
111 return readl(&io_apic->data);
112}
113
114static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
115{
116 struct io_apic __iomem *io_apic = io_apic_base(apic);
117 writel(reg, &io_apic->index);
118 writel(value, &io_apic->data);
119}
120
121/*
122 * Re-write a value: to be used for read-modify-write
123 * cycles where the read already set up the index register.
124 *
125 * Older SiS APIC requires we rewrite the index register
126 */
127static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
128{
129 volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
130 if (sis_apic_bug)
131 writel(reg, &io_apic->index);
132 writel(value, &io_apic->data);
133}
134
135union entry_union {
136 struct { u32 w1, w2; };
137 struct IO_APIC_route_entry entry;
138};
139
140static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
141{
142 union entry_union eu;
143 unsigned long flags;
144 spin_lock_irqsave(&ioapic_lock, flags);
145 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
146 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
147 spin_unlock_irqrestore(&ioapic_lock, flags);
148 return eu.entry;
149}
150
151/*
152 * When we write a new IO APIC routing entry, we need to write the high
153 * word first! If the mask bit in the low word is clear, we will enable
154 * the interrupt, and we need to make sure the entry is fully populated
155 * before that happens.
156 */
157static void
158__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
159{
160 union entry_union eu;
161 eu.entry = e;
162 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
163 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
164}
165
166static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
167{
168 unsigned long flags;
169 spin_lock_irqsave(&ioapic_lock, flags);
170 __ioapic_write_entry(apic, pin, e);
171 spin_unlock_irqrestore(&ioapic_lock, flags);
172}
173
174/*
175 * When we mask an IO APIC routing entry, we need to write the low
176 * word first, in order to set the mask bit before we change the
177 * high bits!
178 */
179static void ioapic_mask_entry(int apic, int pin)
180{
181 unsigned long flags;
182 union entry_union eu = { .entry.mask = 1 };
183
184 spin_lock_irqsave(&ioapic_lock, flags);
185 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
186 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
187 spin_unlock_irqrestore(&ioapic_lock, flags);
188}
189
190/*
191 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
192 * shared ISA-space IRQs, so we have to support them. We are super
193 * fast in the common case, and fast for shared ISA-space IRQs.
194 */
195static void add_pin_to_irq(unsigned int irq, int apic, int pin)
196{
197 static int first_free_entry = NR_IRQS;
198 struct irq_pin_list *entry = irq_2_pin + irq;
199
200 while (entry->next)
201 entry = irq_2_pin + entry->next;
202
203 if (entry->pin != -1) {
204 entry->next = first_free_entry;
205 entry = irq_2_pin + entry->next;
206 if (++first_free_entry >= PIN_MAP_SIZE)
207 panic("io_apic.c: whoops");
208 }
209 entry->apic = apic;
210 entry->pin = pin;
211}
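
irq_2_pin is a linked list packed into a static array: the first NR_IRQS slots are list heads indexed by IRQ number, and additional pins for shared IRQs are chained through the next indices handed out from first_free_entry. A sketch of walking the chain for one IRQ, mirroring the loop in __modify_IO_APIC_irq() below (visit_pin() is a made-up callback, for illustration only):

	static void for_each_pin_of_irq(unsigned int irq,
					void (*visit_pin)(int apic, int pin))
	{
		struct irq_pin_list *entry = irq_2_pin + irq;

		for (;;) {
			int pin = entry->pin;

			if (pin == -1)		/* unused head entry */
				break;
			visit_pin(entry->apic, pin);
			if (!entry->next)
				break;
			entry = irq_2_pin + entry->next;
		}
	}
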
212
213/*
214 * Reroute an IRQ to a different pin.
215 */
216static void __init replace_pin_at_irq(unsigned int irq,
217 int oldapic, int oldpin,
218 int newapic, int newpin)
219{
220 struct irq_pin_list *entry = irq_2_pin + irq;
221
222 while (1) {
223 if (entry->apic == oldapic && entry->pin == oldpin) {
224 entry->apic = newapic;
225 entry->pin = newpin;
226 }
227 if (!entry->next)
228 break;
229 entry = irq_2_pin + entry->next;
230 }
231}
232
233static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
234{
235 struct irq_pin_list *entry = irq_2_pin + irq;
236 unsigned int pin, reg;
237
238 for (;;) {
239 pin = entry->pin;
240 if (pin == -1)
241 break;
242 reg = io_apic_read(entry->apic, 0x10 + pin*2);
243 reg &= ~disable;
244 reg |= enable;
245 io_apic_modify(entry->apic, 0x10 + pin*2, reg);
246 if (!entry->next)
247 break;
248 entry = irq_2_pin + entry->next;
249 }
250}
251
252/* mask = 1 */
253static void __mask_IO_APIC_irq (unsigned int irq)
254{
255 __modify_IO_APIC_irq(irq, 0x00010000, 0);
256}
257
258/* mask = 0 */
259static void __unmask_IO_APIC_irq (unsigned int irq)
260{
261 __modify_IO_APIC_irq(irq, 0, 0x00010000);
262}
263
264/* mask = 1, trigger = 0 */
265static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
266{
267 __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
268}
269
270/* mask = 0, trigger = 1 */
271static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
272{
273 __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
274}
275
276static void mask_IO_APIC_irq (unsigned int irq)
277{
278 unsigned long flags;
279
280 spin_lock_irqsave(&ioapic_lock, flags);
281 __mask_IO_APIC_irq(irq);
282 spin_unlock_irqrestore(&ioapic_lock, flags);
283}
284
285static void unmask_IO_APIC_irq (unsigned int irq)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&ioapic_lock, flags);
290 __unmask_IO_APIC_irq(irq);
291 spin_unlock_irqrestore(&ioapic_lock, flags);
292}
293
294static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
295{
296 struct IO_APIC_route_entry entry;
297
298 /* Check delivery_mode to be sure we're not clearing an SMI pin */
299 entry = ioapic_read_entry(apic, pin);
300 if (entry.delivery_mode == dest_SMI)
301 return;
302
303 /*
304 * Disable it in the IO-APIC irq-routing table:
305 */
306 ioapic_mask_entry(apic, pin);
307}
308
309static void clear_IO_APIC (void)
310{
311 int apic, pin;
312
313 for (apic = 0; apic < nr_ioapics; apic++)
314 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
315 clear_IO_APIC_pin(apic, pin);
316}
317
318#ifdef CONFIG_SMP
319static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
320{
321 unsigned long flags;
322 int pin;
323 struct irq_pin_list *entry = irq_2_pin + irq;
324 unsigned int apicid_value;
325 cpumask_t tmp;
326
327 cpus_and(tmp, cpumask, cpu_online_map);
328 if (cpus_empty(tmp))
329 tmp = TARGET_CPUS;
330
331 cpus_and(cpumask, tmp, CPU_MASK_ALL);
332
333 apicid_value = cpu_mask_to_apicid(cpumask);
334 /* Prepare to do the io_apic_write */
335 apicid_value = apicid_value << 24;
336 spin_lock_irqsave(&ioapic_lock, flags);
337 for (;;) {
338 pin = entry->pin;
339 if (pin == -1)
340 break;
341 io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
342 if (!entry->next)
343 break;
344 entry = irq_2_pin + entry->next;
345 }
346 irq_desc[irq].affinity = cpumask;
347 spin_unlock_irqrestore(&ioapic_lock, flags);
348}
349
350#if defined(CONFIG_IRQBALANCE)
351# include <asm/processor.h> /* kernel_thread() */
352# include <linux/kernel_stat.h> /* kstat */
353# include <linux/slab.h> /* kmalloc() */
354# include <linux/timer.h> /* time_after() */
355
356#define IRQBALANCE_CHECK_ARCH -999
357#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
358#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
359#define BALANCED_IRQ_MORE_DELTA (HZ/10)
360#define BALANCED_IRQ_LESS_DELTA (HZ)
361
362static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
363static int physical_balance __read_mostly;
364static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
365
366static struct irq_cpu_info {
367 unsigned long * last_irq;
368 unsigned long * irq_delta;
369 unsigned long irq;
370} irq_cpu_data[NR_CPUS];
371
372#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
373#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
374#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
375
376#define IDLE_ENOUGH(cpu,now) \
377 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
378
379#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
380
381#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
382
383static cpumask_t balance_irq_affinity[NR_IRQS] = {
384 [0 ... NR_IRQS-1] = CPU_MASK_ALL
385};
386
387void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
388{
389 balance_irq_affinity[irq] = mask;
390}
391
392static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
393 unsigned long now, int direction)
394{
395 int search_idle = 1;
396 int cpu = curr_cpu;
397
398 goto inside;
399
400 do {
401 if (unlikely(cpu == curr_cpu))
402 search_idle = 0;
403inside:
404 if (direction == 1) {
405 cpu++;
406 if (cpu >= NR_CPUS)
407 cpu = 0;
408 } else {
409 cpu--;
410 if (cpu == -1)
411 cpu = NR_CPUS-1;
412 }
413 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
414 (search_idle && !IDLE_ENOUGH(cpu,now)));
415
416 return cpu;
417}
418
419static inline void balance_irq(int cpu, int irq)
420{
421 unsigned long now = jiffies;
422 cpumask_t allowed_mask;
423 unsigned int new_cpu;
424
425 if (irqbalance_disabled)
426 return;
427
428 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
429 new_cpu = move(cpu, allowed_mask, now, 1);
430 if (cpu != new_cpu) {
431 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
432 }
433}
434
435static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
436{
437 int i, j;
438
439 for_each_online_cpu(i) {
440 for (j = 0; j < NR_IRQS; j++) {
441 if (!irq_desc[j].action)
442 continue;
443 /* Is it a significant load ? */
444 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
445 useful_load_threshold)
446 continue;
447 balance_irq(i, j);
448 }
449 }
450 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
451 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
452 return;
453}
454
455static void do_irq_balance(void)
456{
457 int i, j;
458 unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
459 unsigned long move_this_load = 0;
460 int max_loaded = 0, min_loaded = 0;
461 int load;
462 unsigned long useful_load_threshold = balanced_irq_interval + 10;
463 int selected_irq;
464 int tmp_loaded, first_attempt = 1;
465 unsigned long tmp_cpu_irq;
466 unsigned long imbalance = 0;
467 cpumask_t allowed_mask, target_cpu_mask, tmp;
468
469 for_each_possible_cpu(i) {
470 int package_index;
471 CPU_IRQ(i) = 0;
472 if (!cpu_online(i))
473 continue;
474 package_index = CPU_TO_PACKAGEINDEX(i);
475 for (j = 0; j < NR_IRQS; j++) {
476 unsigned long value_now, delta;
477 /* Is this an active IRQ or balancing disabled ? */
478 if (!irq_desc[j].action || irq_balancing_disabled(j))
479 continue;
480 if ( package_index == i )
481 IRQ_DELTA(package_index,j) = 0;
482 /* Determine the total count per processor per IRQ */
483 value_now = (unsigned long) kstat_cpu(i).irqs[j];
484
485 /* Determine the activity per processor per IRQ */
486 delta = value_now - LAST_CPU_IRQ(i,j);
487
488 /* Update last_cpu_irq[][] for the next time */
489 LAST_CPU_IRQ(i,j) = value_now;
490
491 /* Ignore IRQs whose rate is less than the clock */
492 if (delta < useful_load_threshold)
493 continue;
494 /* update the load for the processor or package total */
495 IRQ_DELTA(package_index,j) += delta;
496
497 /* Keep track of the higher numbered sibling as well */
498 if (i != package_index)
499 CPU_IRQ(i) += delta;
500 /*
501 * We have sibling A and sibling B in the package
502 *
503 * cpu_irq[A] = load for cpu A + load for cpu B
504 * cpu_irq[B] = load for cpu B
505 */
506 CPU_IRQ(package_index) += delta;
507 }
508 }
509 /* Find the least loaded processor package */
510 for_each_online_cpu(i) {
511 if (i != CPU_TO_PACKAGEINDEX(i))
512 continue;
513 if (min_cpu_irq > CPU_IRQ(i)) {
514 min_cpu_irq = CPU_IRQ(i);
515 min_loaded = i;
516 }
517 }
518 max_cpu_irq = ULONG_MAX;
519
520tryanothercpu:
521 /* Look for heaviest loaded processor.
522 * We may come back to get the next heaviest loaded processor.
523 * Skip processors with trivial loads.
524 */
525 tmp_cpu_irq = 0;
526 tmp_loaded = -1;
527 for_each_online_cpu(i) {
528 if (i != CPU_TO_PACKAGEINDEX(i))
529 continue;
530 if (max_cpu_irq <= CPU_IRQ(i))
531 continue;
532 if (tmp_cpu_irq < CPU_IRQ(i)) {
533 tmp_cpu_irq = CPU_IRQ(i);
534 tmp_loaded = i;
535 }
536 }
537
538 if (tmp_loaded == -1) {
539		/* With a small number of heavy interrupt sources, some of
540		 * the cpus end up loaded too much. We use Ingo's original
541		 * approach and rotate them around.
542 */
543 if (!first_attempt && imbalance >= useful_load_threshold) {
544 rotate_irqs_among_cpus(useful_load_threshold);
545 return;
546 }
547 goto not_worth_the_effort;
548 }
549
550 first_attempt = 0; /* heaviest search */
551 max_cpu_irq = tmp_cpu_irq; /* load */
552 max_loaded = tmp_loaded; /* processor */
553 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
554
555	/* If the imbalance is less than approx 10% of the max load,
556	 * we hit diminishing returns, so quit.
557 */
558 if (imbalance < (max_cpu_irq >> 3))
559 goto not_worth_the_effort;
560
561tryanotherirq:
562 /* if we select an IRQ to move that can't go where we want, then
563 * see if there is another one to try.
564 */
565 move_this_load = 0;
566 selected_irq = -1;
567 for (j = 0; j < NR_IRQS; j++) {
568 /* Is this an active IRQ? */
569 if (!irq_desc[j].action)
570 continue;
571 if (imbalance <= IRQ_DELTA(max_loaded,j))
572 continue;
573 /* Try to find the IRQ that is closest to the imbalance
574 * without going over.
575 */
576 if (move_this_load < IRQ_DELTA(max_loaded,j)) {
577 move_this_load = IRQ_DELTA(max_loaded,j);
578 selected_irq = j;
579 }
580 }
581 if (selected_irq == -1) {
582 goto tryanothercpu;
583 }
584
585 imbalance = move_this_load;
586
587	/* For the physical_balance case, we accumulated both load
588	 * values in one of the siblings' cpu_irq[], so that the same
589	 * code can be used for physical and logical processors
590	 * as much as possible.
591 *
592 * NOTE: the cpu_irq[] array holds the sum of the load for
593 * sibling A and sibling B in the slot for the lowest numbered
594 * sibling (A), _AND_ the load for sibling B in the slot for
595 * the higher numbered sibling.
596 *
597 * We seek the least loaded sibling by making the comparison
598 * (A+B)/2 vs B
599 */
600 load = CPU_IRQ(min_loaded) >> 1;
601 for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
602 if (load > CPU_IRQ(j)) {
603 /* This won't change cpu_sibling_map[min_loaded] */
604 load = CPU_IRQ(j);
605 min_loaded = j;
606 }
607 }
608
609 cpus_and(allowed_mask,
610 cpu_online_map,
611 balance_irq_affinity[selected_irq]);
612 target_cpu_mask = cpumask_of_cpu(min_loaded);
613 cpus_and(tmp, target_cpu_mask, allowed_mask);
614
615 if (!cpus_empty(tmp)) {
616 /* mark for change destination */
617 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
618
619 /* Since we made a change, come back sooner to
620 * check for more variation.
621 */
622 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
623 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
624 return;
625 }
626 goto tryanotherirq;
627
628not_worth_the_effort:
629 /*
630 * if we did not find an IRQ to move, then adjust the time interval
631 * upward
632 */
633 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
634 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
635 return;
636}
637
638static int balanced_irq(void *unused)
639{
640 int i;
641 unsigned long prev_balance_time = jiffies;
642 long time_remaining = balanced_irq_interval;
643
644 /* push everything to CPU 0 to give us a starting point. */
645 for (i = 0 ; i < NR_IRQS ; i++) {
646 irq_desc[i].pending_mask = cpumask_of_cpu(0);
647 set_pending_irq(i, cpumask_of_cpu(0));
648 }
649
650 set_freezable();
651 for ( ; ; ) {
652 time_remaining = schedule_timeout_interruptible(time_remaining);
653 try_to_freeze();
654 if (time_after(jiffies,
655 prev_balance_time+balanced_irq_interval)) {
656 preempt_disable();
657 do_irq_balance();
658 prev_balance_time = jiffies;
659 time_remaining = balanced_irq_interval;
660 preempt_enable();
661 }
662 }
663 return 0;
664}
665
666static int __init balanced_irq_init(void)
667{
668 int i;
669 struct cpuinfo_x86 *c;
670 cpumask_t tmp;
671
672 cpus_shift_right(tmp, cpu_online_map, 2);
673 c = &boot_cpu_data;
674	/* When not overridden on the command line, ask the subarchitecture. */
675 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
676 irqbalance_disabled = NO_BALANCE_IRQ;
677 if (irqbalance_disabled)
678 return 0;
679
680 /* disable irqbalance completely if there is only one processor online */
681 if (num_online_cpus() < 2) {
682 irqbalance_disabled = 1;
683 return 0;
684 }
685 /*
686 * Enable physical balance only if more than 1 physical processor
687 * is present
688 */
689 if (smp_num_siblings > 1 && !cpus_empty(tmp))
690 physical_balance = 1;
691
692 for_each_online_cpu(i) {
693 irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
694 irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
695 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
696 printk(KERN_ERR "balanced_irq_init: out of memory");
697 goto failed;
698 }
699 memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
700 memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
701 }
702
703 printk(KERN_INFO "Starting balanced_irq\n");
704 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
705 return 0;
706 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
707failed:
708 for_each_possible_cpu(i) {
709 kfree(irq_cpu_data[i].irq_delta);
710 irq_cpu_data[i].irq_delta = NULL;
711 kfree(irq_cpu_data[i].last_irq);
712 irq_cpu_data[i].last_irq = NULL;
713 }
714 return 0;
715}
716
717int __devinit irqbalance_disable(char *str)
718{
719 irqbalance_disabled = 1;
720 return 1;
721}
722
723__setup("noirqbalance", irqbalance_disable);
724
725late_initcall(balanced_irq_init);
726#endif /* CONFIG_IRQBALANCE */
727#endif /* CONFIG_SMP */
728
729#ifndef CONFIG_SMP
730void fastcall send_IPI_self(int vector)
731{
732 unsigned int cfg;
733
734 /*
735 * Wait for idle.
736 */
737 apic_wait_icr_idle();
738 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
739 /*
740 * Send the IPI. The write to APIC_ICR fires this off.
741 */
742 apic_write_around(APIC_ICR, cfg);
743}
744#endif /* !CONFIG_SMP */
745
746
747/*
748 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
749 * specific CPU-side IRQs.
750 */
751
752#define MAX_PIRQS 8
753static int pirq_entries [MAX_PIRQS];
754static int pirqs_enabled;
755int skip_ioapic_setup;
756
757static int __init ioapic_pirq_setup(char *str)
758{
759 int i, max;
760 int ints[MAX_PIRQS+1];
761
762 get_options(str, ARRAY_SIZE(ints), ints);
763
764 for (i = 0; i < MAX_PIRQS; i++)
765 pirq_entries[i] = -1;
766
767 pirqs_enabled = 1;
768 apic_printk(APIC_VERBOSE, KERN_INFO
769 "PIRQ redirection, working around broken MP-BIOS.\n");
770 max = MAX_PIRQS;
771 if (ints[0] < MAX_PIRQS)
772 max = ints[0];
773
774 for (i = 0; i < max; i++) {
775 apic_printk(APIC_VERBOSE, KERN_DEBUG
776 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
777 /*
778 * PIRQs are mapped upside down, usually.
779 */
780 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
781 }
782 return 1;
783}
784
785__setup("pirq=", ioapic_pirq_setup);
786
787/*
788 * Find the IRQ entry number of a certain pin.
789 */
790static int find_irq_entry(int apic, int pin, int type)
791{
792 int i;
793
794 for (i = 0; i < mp_irq_entries; i++)
795 if (mp_irqs[i].mpc_irqtype == type &&
796 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
797 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
798 mp_irqs[i].mpc_dstirq == pin)
799 return i;
800
801 return -1;
802}
803
804/*
805 * Find the pin to which IRQ[irq] (ISA) is connected
806 */
807static int __init find_isa_irq_pin(int irq, int type)
808{
809 int i;
810
811 for (i = 0; i < mp_irq_entries; i++) {
812 int lbus = mp_irqs[i].mpc_srcbus;
813
814 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
815 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
816 mp_bus_id_to_type[lbus] == MP_BUS_MCA
817 ) &&
818 (mp_irqs[i].mpc_irqtype == type) &&
819 (mp_irqs[i].mpc_srcbusirq == irq))
820
821 return mp_irqs[i].mpc_dstirq;
822 }
823 return -1;
824}
825
826static int __init find_isa_irq_apic(int irq, int type)
827{
828 int i;
829
830 for (i = 0; i < mp_irq_entries; i++) {
831 int lbus = mp_irqs[i].mpc_srcbus;
832
833 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
834 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
835 mp_bus_id_to_type[lbus] == MP_BUS_MCA
836 ) &&
837 (mp_irqs[i].mpc_irqtype == type) &&
838 (mp_irqs[i].mpc_srcbusirq == irq))
839 break;
840 }
841 if (i < mp_irq_entries) {
842 int apic;
843 for(apic = 0; apic < nr_ioapics; apic++) {
844 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
845 return apic;
846 }
847 }
848
849 return -1;
850}
851
852/*
853 * Find a specific PCI IRQ entry.
854 * Not an __init, possibly needed by modules
855 */
856static int pin_2_irq(int idx, int apic, int pin);
857
858int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
859{
860 int apic, i, best_guess = -1;
861
862 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
863 "slot:%d, pin:%d.\n", bus, slot, pin);
864 if (mp_bus_id_to_pci_bus[bus] == -1) {
865 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
866 return -1;
867 }
868 for (i = 0; i < mp_irq_entries; i++) {
869 int lbus = mp_irqs[i].mpc_srcbus;
870
871 for (apic = 0; apic < nr_ioapics; apic++)
872 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
873 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
874 break;
875
876 if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
877 !mp_irqs[i].mpc_irqtype &&
878 (bus == lbus) &&
879 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
880 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
881
882 if (!(apic || IO_APIC_IRQ(irq)))
883 continue;
884
885 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
886 return irq;
887 /*
888 * Use the first all-but-pin matching entry as a
889 * best-guess fuzzy result for broken mptables.
890 */
891 if (best_guess < 0)
892 best_guess = irq;
893 }
894 }
895 return best_guess;
896}
897EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
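
For PCI entries the MP-table srcbusirq field encodes the source device and interrupt pin in one byte: bits 2-6 carry the device/slot number and bits 0-1 the INTx pin, which is what the (>> 2) & 0x1f and & 3 expressions above unpack. A small decode sketch, illustration only:

	static inline int mp_srcbusirq_slot(unsigned char srcbusirq)
	{
		return (srcbusirq >> 2) & 0x1f;	/* PCI device/slot */
	}

	static inline int mp_srcbusirq_pin(unsigned char srcbusirq)
	{
		return srcbusirq & 3;		/* 0 = INTA# ... 3 = INTD# */
	}
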
898
899/*
900 * This function is currently only a helper for the i386 smp boot process, where
901 * we need to reprogram the ioredtbls to cater for the cpus which have come online,
902 * so the mask in all cases should simply be TARGET_CPUS.
903 */
904#ifdef CONFIG_SMP
905void __init setup_ioapic_dest(void)
906{
907 int pin, ioapic, irq, irq_entry;
908
909 if (skip_ioapic_setup == 1)
910 return;
911
912 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
913 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
914 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
915 if (irq_entry == -1)
916 continue;
917 irq = pin_2_irq(irq_entry, ioapic, pin);
918 set_ioapic_affinity_irq(irq, TARGET_CPUS);
919 }
920
921 }
922}
923#endif
924
925/*
926 * EISA Edge/Level control register, ELCR
927 */
928static int EISA_ELCR(unsigned int irq)
929{
930 if (irq < 16) {
931 unsigned int port = 0x4d0 + (irq >> 3);
932 return (inb(port) >> (irq & 7)) & 1;
933 }
934 apic_printk(APIC_VERBOSE, KERN_INFO
935 "Broken MPtable reports ISA irq %d\n", irq);
936 return 0;
937}
938
939/* EISA interrupts are always polarity zero and can be edge or level
940 * trigger depending on the ELCR value. If an interrupt is listed as
941 * EISA conforming in the MP table, that means its trigger type must
942 * be read in from the ELCR */
943
944#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
945#define default_EISA_polarity(idx) (0)
946
947/* ISA interrupts are always polarity zero edge triggered,
948 * when listed as conforming in the MP table. */
949
950#define default_ISA_trigger(idx) (0)
951#define default_ISA_polarity(idx) (0)
952
953/* PCI interrupts are always polarity one level triggered,
954 * when listed as conforming in the MP table. */
955
956#define default_PCI_trigger(idx) (1)
957#define default_PCI_polarity(idx) (1)
958
959/* MCA interrupts are always polarity zero level triggered,
960 * when listed as conforming in the MP table. */
961
962#define default_MCA_trigger(idx) (1)
963#define default_MCA_polarity(idx) (0)
964
965static int __init MPBIOS_polarity(int idx)
966{
967 int bus = mp_irqs[idx].mpc_srcbus;
968 int polarity;
969
970 /*
971 * Determine IRQ line polarity (high active or low active):
972 */
973 switch (mp_irqs[idx].mpc_irqflag & 3)
974 {
975 case 0: /* conforms, ie. bus-type dependent polarity */
976 {
977 switch (mp_bus_id_to_type[bus])
978 {
979 case MP_BUS_ISA: /* ISA pin */
980 {
981 polarity = default_ISA_polarity(idx);
982 break;
983 }
984 case MP_BUS_EISA: /* EISA pin */
985 {
986 polarity = default_EISA_polarity(idx);
987 break;
988 }
989 case MP_BUS_PCI: /* PCI pin */
990 {
991 polarity = default_PCI_polarity(idx);
992 break;
993 }
994 case MP_BUS_MCA: /* MCA pin */
995 {
996 polarity = default_MCA_polarity(idx);
997 break;
998 }
999 default:
1000 {
1001 printk(KERN_WARNING "broken BIOS!!\n");
1002 polarity = 1;
1003 break;
1004 }
1005 }
1006 break;
1007 }
1008 case 1: /* high active */
1009 {
1010 polarity = 0;
1011 break;
1012 }
1013 case 2: /* reserved */
1014 {
1015 printk(KERN_WARNING "broken BIOS!!\n");
1016 polarity = 1;
1017 break;
1018 }
1019 case 3: /* low active */
1020 {
1021 polarity = 1;
1022 break;
1023 }
1024 default: /* invalid */
1025 {
1026 printk(KERN_WARNING "broken BIOS!!\n");
1027 polarity = 1;
1028 break;
1029 }
1030 }
1031 return polarity;
1032}
1033
1034static int MPBIOS_trigger(int idx)
1035{
1036 int bus = mp_irqs[idx].mpc_srcbus;
1037 int trigger;
1038
1039 /*
1040 * Determine IRQ trigger mode (edge or level sensitive):
1041 */
1042 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
1043 {
1044 case 0: /* conforms, ie. bus-type dependent */
1045 {
1046 switch (mp_bus_id_to_type[bus])
1047 {
1048 case MP_BUS_ISA: /* ISA pin */
1049 {
1050 trigger = default_ISA_trigger(idx);
1051 break;
1052 }
1053 case MP_BUS_EISA: /* EISA pin */
1054 {
1055 trigger = default_EISA_trigger(idx);
1056 break;
1057 }
1058 case MP_BUS_PCI: /* PCI pin */
1059 {
1060 trigger = default_PCI_trigger(idx);
1061 break;
1062 }
1063 case MP_BUS_MCA: /* MCA pin */
1064 {
1065 trigger = default_MCA_trigger(idx);
1066 break;
1067 }
1068 default:
1069 {
1070 printk(KERN_WARNING "broken BIOS!!\n");
1071 trigger = 1;
1072 break;
1073 }
1074 }
1075 break;
1076 }
1077 case 1: /* edge */
1078 {
1079 trigger = 0;
1080 break;
1081 }
1082 case 2: /* reserved */
1083 {
1084 printk(KERN_WARNING "broken BIOS!!\n");
1085 trigger = 1;
1086 break;
1087 }
1088 case 3: /* level */
1089 {
1090 trigger = 1;
1091 break;
1092 }
1093 default: /* invalid */
1094 {
1095 printk(KERN_WARNING "broken BIOS!!\n");
1096 trigger = 0;
1097 break;
1098 }
1099 }
1100 return trigger;
1101}
1102
1103static inline int irq_polarity(int idx)
1104{
1105 return MPBIOS_polarity(idx);
1106}
1107
1108static inline int irq_trigger(int idx)
1109{
1110 return MPBIOS_trigger(idx);
1111}
1112
1113static int pin_2_irq(int idx, int apic, int pin)
1114{
1115 int irq, i;
1116 int bus = mp_irqs[idx].mpc_srcbus;
1117
1118 /*
1119 * Debugging check, we are in big trouble if this message pops up!
1120 */
1121 if (mp_irqs[idx].mpc_dstirq != pin)
1122 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1123
1124 switch (mp_bus_id_to_type[bus])
1125 {
1126 case MP_BUS_ISA: /* ISA pin */
1127 case MP_BUS_EISA:
1128 case MP_BUS_MCA:
1129 {
1130 irq = mp_irqs[idx].mpc_srcbusirq;
1131 break;
1132 }
1133 case MP_BUS_PCI: /* PCI pin */
1134 {
1135 /*
1136 * PCI IRQs are mapped in order
1137 */
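			/*
			 * Pins of earlier IO-APICs occupy the lower IRQ
			 * numbers: e.g. with two 24-pin IO-APICs, pin 5 of
			 * the second one (apic == 1) becomes IRQ 24 + 5 = 29.
			 */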
1138 i = irq = 0;
1139 while (i < apic)
1140 irq += nr_ioapic_registers[i++];
1141 irq += pin;
1142
1143 /*
1144 * For MPS mode, so far only needed by ES7000 platform
1145 */
1146 if (ioapic_renumber_irq)
1147 irq = ioapic_renumber_irq(apic, irq);
1148
1149 break;
1150 }
1151 default:
1152 {
1153 printk(KERN_ERR "unknown bus type %d.\n",bus);
1154 irq = 0;
1155 break;
1156 }
1157 }
1158
1159 /*
1160 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1161 */
1162 if ((pin >= 16) && (pin <= 23)) {
1163 if (pirq_entries[pin-16] != -1) {
1164 if (!pirq_entries[pin-16]) {
1165 apic_printk(APIC_VERBOSE, KERN_DEBUG
1166 "disabling PIRQ%d\n", pin-16);
1167 } else {
1168 irq = pirq_entries[pin-16];
1169 apic_printk(APIC_VERBOSE, KERN_DEBUG
1170 "using PIRQ%d -> IRQ %d\n",
1171 pin-16, irq);
1172 }
1173 }
1174 }
1175 return irq;
1176}
1177
1178static inline int IO_APIC_irq_trigger(int irq)
1179{
1180 int apic, idx, pin;
1181
1182 for (apic = 0; apic < nr_ioapics; apic++) {
1183 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1184 idx = find_irq_entry(apic,pin,mp_INT);
1185 if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
1186 return irq_trigger(idx);
1187 }
1188 }
1189 /*
1190 * nonexistent IRQs are edge default
1191 */
1192 return 0;
1193}
1194
1195/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1196static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1197
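/*
 * Vectors are handed out in steps of 8 so that successive allocations are
 * spread across interrupt priority levels (the level is vector >> 4, i.e.
 * 16 vectors per level). Once the [FIRST_DEVICE_VECTOR, FIRST_SYSTEM_VECTOR)
 * window has been walked, the starting offset is rotated to reuse the
 * remaining slots of each level; SYSCALL_VECTOR (0x80) is always skipped.
 */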
1198static int __assign_irq_vector(int irq)
1199{
1200 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
1201 int vector, offset, i;
1202
1203 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1204
1205 if (irq_vector[irq] > 0)
1206 return irq_vector[irq];
1207
1208 vector = current_vector;
1209 offset = current_offset;
1210next:
1211 vector += 8;
1212 if (vector >= FIRST_SYSTEM_VECTOR) {
1213 offset = (offset + 1) % 8;
1214 vector = FIRST_DEVICE_VECTOR + offset;
1215 }
1216 if (vector == current_vector)
1217 return -ENOSPC;
1218 if (vector == SYSCALL_VECTOR)
1219 goto next;
1220 for (i = 0; i < NR_IRQ_VECTORS; i++)
1221 if (irq_vector[i] == vector)
1222 goto next;
1223
1224 current_vector = vector;
1225 current_offset = offset;
1226 irq_vector[irq] = vector;
1227
1228 return vector;
1229}
1230
1231static int assign_irq_vector(int irq)
1232{
1233 unsigned long flags;
1234 int vector;
1235
1236 spin_lock_irqsave(&vector_lock, flags);
1237 vector = __assign_irq_vector(irq);
1238 spin_unlock_irqrestore(&vector_lock, flags);
1239
1240 return vector;
1241}
1242static struct irq_chip ioapic_chip;
1243
1244#define IOAPIC_AUTO -1
1245#define IOAPIC_EDGE 0
1246#define IOAPIC_LEVEL 1
1247
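/*
 * Level-triggered pins get the fasteoi flow (a single EOI is issued once the
 * handler has run), edge-triggered pins get the edge flow (acked up front so
 * a new edge arriving during handling is not lost).
 */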
1248static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1249{
1250 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1251 trigger == IOAPIC_LEVEL) {
1252 irq_desc[irq].status |= IRQ_LEVEL;
1253 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1254 handle_fasteoi_irq, "fasteoi");
1255 } else {
1256 irq_desc[irq].status &= ~IRQ_LEVEL;
1257 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1258 handle_edge_irq, "edge");
1259 }
1260 set_intr_gate(vector, interrupt[irq]);
1261}
1262
1263static void __init setup_IO_APIC_irqs(void)
1264{
1265 struct IO_APIC_route_entry entry;
1266 int apic, pin, idx, irq, first_notcon = 1, vector;
1267 unsigned long flags;
1268
1269 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1270
1271 for (apic = 0; apic < nr_ioapics; apic++) {
1272 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1273
1274 /*
1275 * add it to the IO-APIC irq-routing table:
1276 */
1277 memset(&entry,0,sizeof(entry));
1278
1279 entry.delivery_mode = INT_DELIVERY_MODE;
1280 entry.dest_mode = INT_DEST_MODE;
1281 entry.mask = 0; /* enable IRQ */
1282 entry.dest.logical.logical_dest =
1283 cpu_mask_to_apicid(TARGET_CPUS);
1284
1285 idx = find_irq_entry(apic,pin,mp_INT);
1286 if (idx == -1) {
1287 if (first_notcon) {
1288 apic_printk(APIC_VERBOSE, KERN_DEBUG
1289 " IO-APIC (apicid-pin) %d-%d",
1290 mp_ioapics[apic].mpc_apicid,
1291 pin);
1292 first_notcon = 0;
1293 } else
1294 apic_printk(APIC_VERBOSE, ", %d-%d",
1295 mp_ioapics[apic].mpc_apicid, pin);
1296 continue;
1297 }
1298
1299 entry.trigger = irq_trigger(idx);
1300 entry.polarity = irq_polarity(idx);
1301
1302 if (irq_trigger(idx)) {
1303 entry.trigger = 1;
1304 entry.mask = 1;
1305 }
1306
1307 irq = pin_2_irq(idx, apic, pin);
1308 /*
1309 * skip adding the timer int on secondary nodes, which causes
1310 * a small but painful rift in the time-space continuum
1311 */
1312 if (multi_timer_check(apic, irq))
1313 continue;
1314 else
1315 add_pin_to_irq(irq, apic, pin);
1316
1317 if (!apic && !IO_APIC_IRQ(irq))
1318 continue;
1319
1320 if (IO_APIC_IRQ(irq)) {
1321 vector = assign_irq_vector(irq);
1322 entry.vector = vector;
1323 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1324
1325 if (!apic && (irq < 16))
1326 disable_8259A_irq(irq);
1327 }
1328 spin_lock_irqsave(&ioapic_lock, flags);
1329 __ioapic_write_entry(apic, pin, entry);
1330 spin_unlock_irqrestore(&ioapic_lock, flags);
1331 }
1332 }
1333
1334 if (!first_notcon)
1335 apic_printk(APIC_VERBOSE, " not connected.\n");
1336}
1337
1338/*
1339 * Set up the 8259A-master output pin:
1340 */
1341static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
1342{
1343 struct IO_APIC_route_entry entry;
1344
1345 memset(&entry,0,sizeof(entry));
1346
1347 disable_8259A_irq(0);
1348
1349 /* mask LVT0 */
1350 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1351
1352 /*
1353 * We use logical delivery to get the timer IRQ
1354 * to the first CPU.
1355 */
1356 entry.dest_mode = INT_DEST_MODE;
1357 entry.mask = 0; /* unmask IRQ now */
1358 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1359 entry.delivery_mode = INT_DELIVERY_MODE;
1360 entry.polarity = 0;
1361 entry.trigger = 0;
1362 entry.vector = vector;
1363
1364 /*
1365 * The timer IRQ doesn't have to know that behind the
 1366 * scenes we have an 8259A-master in AEOI mode ...
1367 */
1368 irq_desc[0].chip = &ioapic_chip;
1369 set_irq_handler(0, handle_edge_irq);
1370
1371 /*
1372 * Add it to the IO-APIC irq-routing table:
1373 */
1374 ioapic_write_entry(apic, pin, entry);
1375
1376 enable_8259A_irq(0);
1377}
1378
1379void __init print_IO_APIC(void)
1380{
1381 int apic, i;
1382 union IO_APIC_reg_00 reg_00;
1383 union IO_APIC_reg_01 reg_01;
1384 union IO_APIC_reg_02 reg_02;
1385 union IO_APIC_reg_03 reg_03;
1386 unsigned long flags;
1387
1388 if (apic_verbosity == APIC_QUIET)
1389 return;
1390
1391 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1392 for (i = 0; i < nr_ioapics; i++)
1393 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1394 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
1395
1396 /*
1397 * We are a bit conservative about what we expect. We have to
1398 * know about every hardware change ASAP.
1399 */
1400 printk(KERN_INFO "testing the IO APIC.......................\n");
1401
1402 for (apic = 0; apic < nr_ioapics; apic++) {
1403
1404 spin_lock_irqsave(&ioapic_lock, flags);
1405 reg_00.raw = io_apic_read(apic, 0);
1406 reg_01.raw = io_apic_read(apic, 1);
1407 if (reg_01.bits.version >= 0x10)
1408 reg_02.raw = io_apic_read(apic, 2);
1409 if (reg_01.bits.version >= 0x20)
1410 reg_03.raw = io_apic_read(apic, 3);
1411 spin_unlock_irqrestore(&ioapic_lock, flags);
1412
1413 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
1414 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1415 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1416 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1417 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1418
1419 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1420 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1421
1422 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1423 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1424
1425 /*
1426 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1427 * but the value of reg_02 is read as the previous read register
1428 * value, so ignore it if reg_02 == reg_01.
1429 */
1430 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1431 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1432 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1433 }
1434
1435 /*
1436 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
1437 * or reg_03, but the value of reg_0[23] is read as the previous read
1438 * register value, so ignore it if reg_03 == reg_0[12].
1439 */
1440 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1441 reg_03.raw != reg_01.raw) {
1442 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1443 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1444 }
1445
1446 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1447
1448 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
1449 " Stat Dest Deli Vect: \n");
1450
1451 for (i = 0; i <= reg_01.bits.entries; i++) {
1452 struct IO_APIC_route_entry entry;
1453
1454 entry = ioapic_read_entry(apic, i);
1455
1456 printk(KERN_DEBUG " %02x %03X %02X ",
1457 i,
1458 entry.dest.logical.logical_dest,
1459 entry.dest.physical.physical_dest
1460 );
1461
1462 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
1463 entry.mask,
1464 entry.trigger,
1465 entry.irr,
1466 entry.polarity,
1467 entry.delivery_status,
1468 entry.dest_mode,
1469 entry.delivery_mode,
1470 entry.vector
1471 );
1472 }
1473 }
1474 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1475 for (i = 0; i < NR_IRQS; i++) {
1476 struct irq_pin_list *entry = irq_2_pin + i;
1477 if (entry->pin < 0)
1478 continue;
1479 printk(KERN_DEBUG "IRQ%d ", i);
1480 for (;;) {
1481 printk("-> %d:%d", entry->apic, entry->pin);
1482 if (!entry->next)
1483 break;
1484 entry = irq_2_pin + entry->next;
1485 }
1486 printk("\n");
1487 }
1488
1489 printk(KERN_INFO ".................................... done.\n");
1490
1491 return;
1492}
1493
1494#if 0
1495
1496static void print_APIC_bitfield (int base)
1497{
1498 unsigned int v;
1499 int i, j;
1500
1501 if (apic_verbosity == APIC_QUIET)
1502 return;
1503
1504 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1505 for (i = 0; i < 8; i++) {
1506 v = apic_read(base + i*0x10);
1507 for (j = 0; j < 32; j++) {
1508 if (v & (1<<j))
1509 printk("1");
1510 else
1511 printk("0");
1512 }
1513 printk("\n");
1514 }
1515}
1516
1517void /*__init*/ print_local_APIC(void * dummy)
1518{
1519 unsigned int v, ver, maxlvt;
1520
1521 if (apic_verbosity == APIC_QUIET)
1522 return;
1523
1524 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1525 smp_processor_id(), hard_smp_processor_id());
1526 v = apic_read(APIC_ID);
1527 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
1528 v = apic_read(APIC_LVR);
1529 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1530 ver = GET_APIC_VERSION(v);
1531 maxlvt = lapic_get_maxlvt();
1532
1533 v = apic_read(APIC_TASKPRI);
1534 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1535
1536 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1537 v = apic_read(APIC_ARBPRI);
1538 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1539 v & APIC_ARBPRI_MASK);
1540 v = apic_read(APIC_PROCPRI);
1541 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1542 }
1543
1544 v = apic_read(APIC_EOI);
1545 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1546 v = apic_read(APIC_RRR);
1547 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1548 v = apic_read(APIC_LDR);
1549 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1550 v = apic_read(APIC_DFR);
1551 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1552 v = apic_read(APIC_SPIV);
1553 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1554
1555 printk(KERN_DEBUG "... APIC ISR field:\n");
1556 print_APIC_bitfield(APIC_ISR);
1557 printk(KERN_DEBUG "... APIC TMR field:\n");
1558 print_APIC_bitfield(APIC_TMR);
1559 printk(KERN_DEBUG "... APIC IRR field:\n");
1560 print_APIC_bitfield(APIC_IRR);
1561
1562 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1563 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1564 apic_write(APIC_ESR, 0);
1565 v = apic_read(APIC_ESR);
1566 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1567 }
1568
1569 v = apic_read(APIC_ICR);
1570 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1571 v = apic_read(APIC_ICR2);
1572 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1573
1574 v = apic_read(APIC_LVTT);
1575 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1576
1577 if (maxlvt > 3) { /* PC is LVT#4. */
1578 v = apic_read(APIC_LVTPC);
1579 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1580 }
1581 v = apic_read(APIC_LVT0);
1582 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1583 v = apic_read(APIC_LVT1);
1584 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1585
1586 if (maxlvt > 2) { /* ERR is LVT#3. */
1587 v = apic_read(APIC_LVTERR);
1588 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1589 }
1590
1591 v = apic_read(APIC_TMICT);
1592 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1593 v = apic_read(APIC_TMCCT);
1594 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1595 v = apic_read(APIC_TDCR);
1596 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1597 printk("\n");
1598}
1599
1600void print_all_local_APICs (void)
1601{
1602 on_each_cpu(print_local_APIC, NULL, 1, 1);
1603}
1604
1605void /*__init*/ print_PIC(void)
1606{
1607 unsigned int v;
1608 unsigned long flags;
1609
1610 if (apic_verbosity == APIC_QUIET)
1611 return;
1612
1613 printk(KERN_DEBUG "\nprinting PIC contents\n");
1614
1615 spin_lock_irqsave(&i8259A_lock, flags);
1616
1617 v = inb(0xa1) << 8 | inb(0x21);
1618 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1619
1620 v = inb(0xa0) << 8 | inb(0x20);
1621 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1622
1623 outb(0x0b,0xa0);
1624 outb(0x0b,0x20);
1625 v = inb(0xa0) << 8 | inb(0x20);
1626 outb(0x0a,0xa0);
1627 outb(0x0a,0x20);
1628
1629 spin_unlock_irqrestore(&i8259A_lock, flags);
1630
1631 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1632
1633 v = inb(0x4d1) << 8 | inb(0x4d0);
1634 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1635}
1636
1637#endif /* 0 */
1638
1639static void __init enable_IO_APIC(void)
1640{
1641 union IO_APIC_reg_01 reg_01;
1642 int i8259_apic, i8259_pin;
1643 int i, apic;
1644 unsigned long flags;
1645
1646 for (i = 0; i < PIN_MAP_SIZE; i++) {
1647 irq_2_pin[i].pin = -1;
1648 irq_2_pin[i].next = 0;
1649 }
1650 if (!pirqs_enabled)
1651 for (i = 0; i < MAX_PIRQS; i++)
1652 pirq_entries[i] = -1;
1653
1654 /*
1655 * The number of IO-APIC IRQ registers (== #pins):
1656 */
1657 for (apic = 0; apic < nr_ioapics; apic++) {
1658 spin_lock_irqsave(&ioapic_lock, flags);
1659 reg_01.raw = io_apic_read(apic, 1);
1660 spin_unlock_irqrestore(&ioapic_lock, flags);
1661 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1662 }
1663 for(apic = 0; apic < nr_ioapics; apic++) {
1664 int pin;
1665 /* See if any of the pins is in ExtINT mode */
1666 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1667 struct IO_APIC_route_entry entry;
1668 entry = ioapic_read_entry(apic, pin);
1669
1670
1671 /* If the interrupt line is enabled and in ExtInt mode
1672 * I have found the pin where the i8259 is connected.
1673 */
1674 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1675 ioapic_i8259.apic = apic;
1676 ioapic_i8259.pin = pin;
1677 goto found_i8259;
1678 }
1679 }
1680 }
1681 found_i8259:
 1682	/* Look to see if the MP table has reported the ExtINT */
 1683	/* If we could not find the appropriate pin by looking at the ioapic,
 1684	 * the i8259 probably is not connected to the ioapic, but give the
 1685	 * mptable a chance anyway.
1686 */
1687 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1688 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
 1689	/* Trust the MP table if nothing is set up in the hardware */
1690 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1691 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1692 ioapic_i8259.pin = i8259_pin;
1693 ioapic_i8259.apic = i8259_apic;
1694 }
1695 /* Complain if the MP table and the hardware disagree */
1696 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1697 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1698 {
1699 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1700 }
1701
1702 /*
1703 * Do not trust the IO-APIC being empty at bootup
1704 */
1705 clear_IO_APIC();
1706}
1707
1708/*
1709 * Not an __init, needed by the reboot code
1710 */
1711void disable_IO_APIC(void)
1712{
1713 /*
1714 * Clear the IO-APIC before rebooting:
1715 */
1716 clear_IO_APIC();
1717
1718 /*
 1719	 * If the i8259 is routed through an IOAPIC,
 1720	 * put that IOAPIC in virtual wire mode
 1721	 * so that legacy interrupts can be delivered.
1722 */
1723 if (ioapic_i8259.pin != -1) {
1724 struct IO_APIC_route_entry entry;
1725
1726 memset(&entry, 0, sizeof(entry));
1727 entry.mask = 0; /* Enabled */
1728 entry.trigger = 0; /* Edge */
1729 entry.irr = 0;
1730 entry.polarity = 0; /* High */
1731 entry.delivery_status = 0;
1732 entry.dest_mode = 0; /* Physical */
1733 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1734 entry.vector = 0;
1735 entry.dest.physical.physical_dest =
1736 GET_APIC_ID(apic_read(APIC_ID));
1737
1738 /*
1739 * Add it to the IO-APIC irq-routing table:
1740 */
1741 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1742 }
1743 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1744}
1745
1746/*
1747 * function to set the IO-APIC physical IDs based on the
1748 * values stored in the MPC table.
1749 *
1750 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1751 */
1752
1753#ifndef CONFIG_X86_NUMAQ
1754static void __init setup_ioapic_ids_from_mpc(void)
1755{
1756 union IO_APIC_reg_00 reg_00;
1757 physid_mask_t phys_id_present_map;
1758 int apic;
1759 int i;
1760 unsigned char old_id;
1761 unsigned long flags;
1762
1763 /*
1764 * Don't check I/O APIC IDs for xAPIC systems. They have
1765 * no meaning without the serial APIC bus.
1766 */
1767 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1768 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1769 return;
1770 /*
1771 * This is broken; anything with a real cpu count has to
1772 * circumvent this idiocy regardless.
1773 */
1774 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1775
1776 /*
1777 * Set the IOAPIC ID to the value stored in the MPC table.
1778 */
1779 for (apic = 0; apic < nr_ioapics; apic++) {
1780
1781 /* Read the register 0 value */
1782 spin_lock_irqsave(&ioapic_lock, flags);
1783 reg_00.raw = io_apic_read(apic, 0);
1784 spin_unlock_irqrestore(&ioapic_lock, flags);
1785
1786 old_id = mp_ioapics[apic].mpc_apicid;
1787
1788 if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
1789 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1790 apic, mp_ioapics[apic].mpc_apicid);
1791 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1792 reg_00.bits.ID);
1793 mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
1794 }
1795
1796 /*
1797 * Sanity check, is the ID really free? Every APIC in a
1798 * system must have a unique ID or we get lots of nice
1799 * 'stuck on smp_invalidate_needed IPI wait' messages.
1800 */
1801 if (check_apicid_used(phys_id_present_map,
1802 mp_ioapics[apic].mpc_apicid)) {
1803 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1804 apic, mp_ioapics[apic].mpc_apicid);
1805 for (i = 0; i < get_physical_broadcast(); i++)
1806 if (!physid_isset(i, phys_id_present_map))
1807 break;
1808 if (i >= get_physical_broadcast())
1809 panic("Max APIC ID exceeded!\n");
1810 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1811 i);
1812 physid_set(i, phys_id_present_map);
1813 mp_ioapics[apic].mpc_apicid = i;
1814 } else {
1815 physid_mask_t tmp;
1816 tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
1817 apic_printk(APIC_VERBOSE, "Setting %d in the "
1818 "phys_id_present_map\n",
1819 mp_ioapics[apic].mpc_apicid);
1820 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1821 }
1822
1823
1824 /*
1825 * We need to adjust the IRQ routing table
1826 * if the ID changed.
1827 */
1828 if (old_id != mp_ioapics[apic].mpc_apicid)
1829 for (i = 0; i < mp_irq_entries; i++)
1830 if (mp_irqs[i].mpc_dstapic == old_id)
1831 mp_irqs[i].mpc_dstapic
1832 = mp_ioapics[apic].mpc_apicid;
1833
1834 /*
1835 * Read the right value from the MPC table and
1836 * write it into the ID register.
1837 */
1838 apic_printk(APIC_VERBOSE, KERN_INFO
1839 "...changing IO-APIC physical APIC ID to %d ...",
1840 mp_ioapics[apic].mpc_apicid);
1841
1842 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
1843 spin_lock_irqsave(&ioapic_lock, flags);
1844 io_apic_write(apic, 0, reg_00.raw);
1845 spin_unlock_irqrestore(&ioapic_lock, flags);
1846
1847 /*
1848 * Sanity check
1849 */
1850 spin_lock_irqsave(&ioapic_lock, flags);
1851 reg_00.raw = io_apic_read(apic, 0);
1852 spin_unlock_irqrestore(&ioapic_lock, flags);
1853 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
1854 printk("could not set ID!\n");
1855 else
1856 apic_printk(APIC_VERBOSE, " ok.\n");
1857 }
1858}
1859#else
1860static void __init setup_ioapic_ids_from_mpc(void) { }
1861#endif
1862
1863int no_timer_check __initdata;
1864
1865static int __init notimercheck(char *s)
1866{
1867 no_timer_check = 1;
1868 return 1;
1869}
1870__setup("no_timer_check", notimercheck);
1871
1872/*
 1873 * There is a nasty bug in some older SMP boards: their mptable lies
1874 * about the timer IRQ. We do the following to work around the situation:
1875 *
1876 * - timer IRQ defaults to IO-APIC IRQ
1877 * - if this function detects that timer IRQs are defunct, then we fall
1878 * back to ISA timer IRQs
1879 */
1880static int __init timer_irq_works(void)
1881{
1882 unsigned long t1 = jiffies;
1883
1884 if (no_timer_check)
1885 return 1;
1886
1887 local_irq_enable();
1888 /* Let ten ticks pass... */
1889 mdelay((10 * 1000) / HZ);
1890
1891 /*
1892 * Expect a few ticks at least, to be sure some possible
 1893	 * glue logic does not lock up after the first one or
 1894	 * two ticks in a non-ExtINT mode. Also the local APIC
1895 * might have cached one ExtINT interrupt. Finally, at
1896 * least one tick may be lost due to delays.
1897 */
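	/* mdelay() above waited roughly ten tick periods; only call the
	 * timer working if at least five ticks were actually observed. */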
1898 if (jiffies - t1 > 4)
1899 return 1;
1900
1901 return 0;
1902}
1903
1904/*
1905 * In the SMP+IOAPIC case it might happen that there are an unspecified
1906 * number of pending IRQ events unhandled. These cases are very rare,
1907 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
 1908 * better to do it this way, as then we do not have to be aware of
1909 * 'pending' interrupts in the IRQ path, except at this point.
1910 */
1911/*
 1912 * Edge triggered IRQs need to resend any interrupt
 1913 * that was delayed, but this is now handled in the device
1914 * independent code.
1915 */
1916
1917/*
1918 * Startup quirk:
1919 *
 1920 * Starting up an edge-triggered IO-APIC interrupt is
 1921 * nasty - we need to make sure that we get the edge.
 1922 * If it is already asserted for some reason, we need
 1923 * to return 1 to indicate that it was pending.
1924 *
1925 * This is not complete - we should be able to fake
1926 * an edge even if it isn't on the 8259A...
1927 *
1928 * (We do this for level-triggered IRQs too - it cannot hurt.)
1929 */
1930static unsigned int startup_ioapic_irq(unsigned int irq)
1931{
1932 int was_pending = 0;
1933 unsigned long flags;
1934
1935 spin_lock_irqsave(&ioapic_lock, flags);
1936 if (irq < 16) {
1937 disable_8259A_irq(irq);
1938 if (i8259A_irq_pending(irq))
1939 was_pending = 1;
1940 }
1941 __unmask_IO_APIC_irq(irq);
1942 spin_unlock_irqrestore(&ioapic_lock, flags);
1943
1944 return was_pending;
1945}
1946
1947static void ack_ioapic_irq(unsigned int irq)
1948{
1949 move_native_irq(irq);
1950 ack_APIC_irq();
1951}
1952
1953static void ack_ioapic_quirk_irq(unsigned int irq)
1954{
1955 unsigned long v;
1956 int i;
1957
1958 move_native_irq(irq);
1959/*
1960 * It appears there is an erratum which affects at least version 0x11
1961 * of I/O APIC (that's the 82093AA and cores integrated into various
1962 * chipsets). Under certain conditions a level-triggered interrupt is
 1963 * erroneously delivered as an edge-triggered one but the respective IRR
1964 * bit gets set nevertheless. As a result the I/O unit expects an EOI
1965 * message but it will never arrive and further interrupts are blocked
1966 * from the source. The exact reason is so far unknown, but the
1967 * phenomenon was observed when two consecutive interrupt requests
1968 * from a given source get delivered to the same CPU and the source is
1969 * temporarily disabled in between.
1970 *
1971 * A workaround is to simulate an EOI message manually. We achieve it
1972 * by setting the trigger mode to edge and then to level when the edge
1973 * trigger mode gets detected in the TMR of a local APIC for a
1974 * level-triggered interrupt. We mask the source for the time of the
1975 * operation to prevent an edge-triggered interrupt escaping meanwhile.
1976 * The idea is from Manfred Spraul. --macro
1977 */
1978 i = irq_vector[irq];
1979
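	/* Each TMR register covers 32 vectors: the register offset
	 * ((i & ~0x1f) >> 1) equals (i / 32) * 0x10, and the bit within it
	 * is (i & 0x1f). A set bit means the local APIC latched the
	 * interrupt as level triggered. */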
1980 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
1981
1982 ack_APIC_irq();
1983
1984 if (!(v & (1 << (i & 0x1f)))) {
1985 atomic_inc(&irq_mis_count);
1986 spin_lock(&ioapic_lock);
1987 __mask_and_edge_IO_APIC_irq(irq);
1988 __unmask_and_level_IO_APIC_irq(irq);
1989 spin_unlock(&ioapic_lock);
1990 }
1991}
1992
1993static int ioapic_retrigger_irq(unsigned int irq)
1994{
1995 send_IPI_self(irq_vector[irq]);
1996
1997 return 1;
1998}
1999
2000static struct irq_chip ioapic_chip __read_mostly = {
2001 .name = "IO-APIC",
2002 .startup = startup_ioapic_irq,
2003 .mask = mask_IO_APIC_irq,
2004 .unmask = unmask_IO_APIC_irq,
2005 .ack = ack_ioapic_irq,
2006 .eoi = ack_ioapic_quirk_irq,
2007#ifdef CONFIG_SMP
2008 .set_affinity = set_ioapic_affinity_irq,
2009#endif
2010 .retrigger = ioapic_retrigger_irq,
2011};
2012
2013
2014static inline void init_IO_APIC_traps(void)
2015{
2016 int irq;
2017
2018 /*
2019 * NOTE! The local APIC isn't very good at handling
2020 * multiple interrupts at the same interrupt level.
2021 * As the interrupt level is determined by taking the
2022 * vector number and shifting that right by 4, we
2023 * want to spread these out a bit so that they don't
2024 * all fall in the same interrupt level.
2025 *
2026 * Also, we've got to be careful not to trash gate
2027 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2028 */
2029 for (irq = 0; irq < NR_IRQS ; irq++) {
2030 int tmp = irq;
2031 if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
2032 /*
2033 * Hmm.. We don't have an entry for this,
2034 * so default to an old-fashioned 8259
2035 * interrupt if we can..
2036 */
2037 if (irq < 16)
2038 make_8259A_irq(irq);
2039 else
2040 /* Strange. Oh, well.. */
2041 irq_desc[irq].chip = &no_irq_chip;
2042 }
2043 }
2044}
2045
2046/*
2047 * The local APIC irq-chip implementation:
2048 */
2049
2050static void ack_apic(unsigned int irq)
2051{
2052 ack_APIC_irq();
2053}
2054
2055static void mask_lapic_irq (unsigned int irq)
2056{
2057 unsigned long v;
2058
2059 v = apic_read(APIC_LVT0);
2060 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
2061}
2062
2063static void unmask_lapic_irq (unsigned int irq)
2064{
2065 unsigned long v;
2066
2067 v = apic_read(APIC_LVT0);
2068 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2069}
2070
2071static struct irq_chip lapic_chip __read_mostly = {
2072 .name = "local-APIC-edge",
2073 .mask = mask_lapic_irq,
2074 .unmask = unmask_lapic_irq,
2075 .eoi = ack_apic,
2076};
2077
2078static void setup_nmi (void)
2079{
2080 /*
2081 * Dirty trick to enable the NMI watchdog ...
2082 * We put the 8259A master into AEOI mode and
2083 * unmask on all local APICs LVT0 as NMI.
2084 *
2085 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2086 * is from Maciej W. Rozycki - so we do not have to EOI from
2087 * the NMI handler or the timer interrupt.
2088 */
2089 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2090
2091 on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
2092
2093 apic_printk(APIC_VERBOSE, " done.\n");
2094}
2095
2096/*
 2097 * This looks a bit hackish but it's about the only way of sending
2098 * a few INTA cycles to 8259As and any associated glue logic. ICR does
2099 * not support the ExtINT mode, unfortunately. We need to send these
2100 * cycles as some i82489DX-based boards have glue logic that keeps the
2101 * 8259A interrupt line asserted until INTA. --macro
2102 */
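/*
 * The trick: temporarily route the RTC pin (ISA IRQ 8) as an ExtINT aimed at
 * this CPU, program the RTC for periodic interrupts, wait for a few of them
 * to generate the INTA cycles, then restore both the RTC registers and the
 * original IO-APIC routing entry.
 */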
2103static inline void unlock_ExtINT_logic(void)
2104{
2105 int apic, pin, i;
2106 struct IO_APIC_route_entry entry0, entry1;
2107 unsigned char save_control, save_freq_select;
2108
2109 pin = find_isa_irq_pin(8, mp_INT);
2110 if (pin == -1) {
2111 WARN_ON_ONCE(1);
2112 return;
2113 }
2114 apic = find_isa_irq_apic(8, mp_INT);
2115 if (apic == -1) {
2116 WARN_ON_ONCE(1);
2117 return;
2118 }
2119
2120 entry0 = ioapic_read_entry(apic, pin);
2121 clear_IO_APIC_pin(apic, pin);
2122
2123 memset(&entry1, 0, sizeof(entry1));
2124
2125 entry1.dest_mode = 0; /* physical delivery */
2126 entry1.mask = 0; /* unmask IRQ now */
2127 entry1.dest.physical.physical_dest = hard_smp_processor_id();
2128 entry1.delivery_mode = dest_ExtINT;
2129 entry1.polarity = entry0.polarity;
2130 entry1.trigger = 0;
2131 entry1.vector = 0;
2132
2133 ioapic_write_entry(apic, pin, entry1);
2134
2135 save_control = CMOS_READ(RTC_CONTROL);
2136 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2137 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
2138 RTC_FREQ_SELECT);
2139 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
2140
2141 i = 100;
2142 while (i-- > 0) {
2143 mdelay(10);
2144 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
2145 i -= 10;
2146 }
2147
2148 CMOS_WRITE(save_control, RTC_CONTROL);
2149 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2150 clear_IO_APIC_pin(apic, pin);
2151
2152 ioapic_write_entry(apic, pin, entry0);
2153}
2154
2155int timer_uses_ioapic_pin_0;
2156
2157/*
2158 * This code may look a bit paranoid, but it's supposed to cooperate with
2159 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2160 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2161 * fanatically on his truly buggy board.
2162 */
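/*
 * The fallback order below: IRQ0 through its own IO-APIC pin, then through
 * the IO-APIC pin the 8259 ExtINT output is wired to, then as a local APIC
 * "virtual wire" (LVT0 in fixed mode), and finally as a plain 8259 ExtINT.
 * If none of these produces timer ticks, we panic.
 */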
2163static inline void __init check_timer(void)
2164{
2165 int apic1, pin1, apic2, pin2;
2166 int vector;
2167
2168 /*
2169 * get/set the timer IRQ vector:
2170 */
2171 disable_8259A_irq(0);
2172 vector = assign_irq_vector(0);
2173 set_intr_gate(vector, interrupt[0]);
2174
2175 /*
2176 * Subtle, code in do_timer_interrupt() expects an AEOI
2177 * mode for the 8259A whenever interrupts are routed
2178 * through I/O APICs. Also IRQ0 has to be enabled in
2179 * the 8259A which implies the virtual wire has to be
2180 * disabled in the local APIC.
2181 */
2182 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2183 init_8259A(1);
2184 timer_ack = 1;
2185 if (timer_over_8254 > 0)
2186 enable_8259A_irq(0);
2187
2188 pin1 = find_isa_irq_pin(0, mp_INT);
2189 apic1 = find_isa_irq_apic(0, mp_INT);
2190 pin2 = ioapic_i8259.pin;
2191 apic2 = ioapic_i8259.apic;
2192
2193 if (pin1 == 0)
2194 timer_uses_ioapic_pin_0 = 1;
2195
2196 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
2197 vector, apic1, pin1, apic2, pin2);
2198
2199 if (pin1 != -1) {
2200 /*
2201 * Ok, does IRQ0 through the IOAPIC work?
2202 */
2203 unmask_IO_APIC_irq(0);
2204 if (timer_irq_works()) {
2205 if (nmi_watchdog == NMI_IO_APIC) {
2206 disable_8259A_irq(0);
2207 setup_nmi();
2208 enable_8259A_irq(0);
2209 }
2210 if (disable_timer_pin_1 > 0)
2211 clear_IO_APIC_pin(0, pin1);
2212 return;
2213 }
2214 clear_IO_APIC_pin(apic1, pin1);
2215 printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
2216 "IO-APIC\n");
2217 }
2218
2219 printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
2220 if (pin2 != -1) {
2221 printk("\n..... (found pin %d) ...", pin2);
2222 /*
2223 * legacy devices should be connected to IO APIC #0
2224 */
2225 setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
2226 if (timer_irq_works()) {
2227 printk("works.\n");
2228 if (pin1 != -1)
2229 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2230 else
2231 add_pin_to_irq(0, apic2, pin2);
2232 if (nmi_watchdog == NMI_IO_APIC) {
2233 setup_nmi();
2234 }
2235 return;
2236 }
2237 /*
2238 * Cleanup, just in case ...
2239 */
2240 clear_IO_APIC_pin(apic2, pin2);
2241 }
2242 printk(" failed.\n");
2243
2244 if (nmi_watchdog == NMI_IO_APIC) {
2245 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
2246 nmi_watchdog = 0;
2247 }
2248
2249 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2250
2251 disable_8259A_irq(0);
2252 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2253 "fasteoi");
2254 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2255 enable_8259A_irq(0);
2256
2257 if (timer_irq_works()) {
2258 printk(" works.\n");
2259 return;
2260 }
2261 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2262 printk(" failed.\n");
2263
2264 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
2265
2266 timer_ack = 0;
2267 init_8259A(0);
2268 make_8259A_irq(0);
2269 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
2270
2271 unlock_ExtINT_logic();
2272
2273 if (timer_irq_works()) {
2274 printk(" works.\n");
2275 return;
2276 }
2277 printk(" failed :(.\n");
2278 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2279 "report. Then try booting with the 'noapic' option");
2280}
2281
2282/*
2283 *
2284 * IRQ's that are handled by the PIC in the MPS IOAPIC case.
 2285 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
2286 * Linux doesn't really care, as it's not actually used
2287 * for any interrupt handling anyway.
2288 */
2289#define PIC_IRQS (1 << PIC_CASCADE_IR)
2290
2291void __init setup_IO_APIC(void)
2292{
2293 enable_IO_APIC();
2294
2295 if (acpi_ioapic)
2296 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
2297 else
2298 io_apic_irqs = ~PIC_IRQS;
2299
2300 printk("ENABLING IO-APIC IRQs\n");
2301
2302 /*
2303 * Set up IO-APIC IRQ routing.
2304 */
2305 if (!acpi_ioapic)
2306 setup_ioapic_ids_from_mpc();
2307 sync_Arb_IDs();
2308 setup_IO_APIC_irqs();
2309 init_IO_APIC_traps();
2310 check_timer();
2311 if (!acpi_ioapic)
2312 print_IO_APIC();
2313}
2314
2315static int __init setup_disable_8254_timer(char *s)
2316{
2317 timer_over_8254 = -1;
2318 return 1;
2319}
2320static int __init setup_enable_8254_timer(char *s)
2321{
2322 timer_over_8254 = 2;
2323 return 1;
2324}
2325
2326__setup("disable_8254_timer", setup_disable_8254_timer);
2327__setup("enable_8254_timer", setup_enable_8254_timer);
2328
2329/*
 2330 * Called after all the initialization is done. If we didn't find any
 2331 * APIC bugs then we can allow the modify fast path.
2332 */
2333
2334static int __init io_apic_bug_finalize(void)
2335{
2336 if(sis_apic_bug == -1)
2337 sis_apic_bug = 0;
2338 return 0;
2339}
2340
2341late_initcall(io_apic_bug_finalize);
2342
2343struct sysfs_ioapic_data {
2344 struct sys_device dev;
2345 struct IO_APIC_route_entry entry[0];
2346};
2347static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
2348
2349static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
2350{
2351 struct IO_APIC_route_entry *entry;
2352 struct sysfs_ioapic_data *data;
2353 int i;
2354
2355 data = container_of(dev, struct sysfs_ioapic_data, dev);
2356 entry = data->entry;
2357 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2358 entry[i] = ioapic_read_entry(dev->id, i);
2359
2360 return 0;
2361}
2362
2363static int ioapic_resume(struct sys_device *dev)
2364{
2365 struct IO_APIC_route_entry *entry;
2366 struct sysfs_ioapic_data *data;
2367 unsigned long flags;
2368 union IO_APIC_reg_00 reg_00;
2369 int i;
2370
2371 data = container_of(dev, struct sysfs_ioapic_data, dev);
2372 entry = data->entry;
2373
2374 spin_lock_irqsave(&ioapic_lock, flags);
2375 reg_00.raw = io_apic_read(dev->id, 0);
2376 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
2377 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
2378 io_apic_write(dev->id, 0, reg_00.raw);
2379 }
2380 spin_unlock_irqrestore(&ioapic_lock, flags);
2381 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2382 ioapic_write_entry(dev->id, i, entry[i]);
2383
2384 return 0;
2385}
2386
2387static struct sysdev_class ioapic_sysdev_class = {
2388 set_kset_name("ioapic"),
2389 .suspend = ioapic_suspend,
2390 .resume = ioapic_resume,
2391};
2392
2393static int __init ioapic_init_sysfs(void)
2394{
2395 struct sys_device * dev;
2396 int i, size, error = 0;
2397
2398 error = sysdev_class_register(&ioapic_sysdev_class);
2399 if (error)
2400 return error;
2401
2402 for (i = 0; i < nr_ioapics; i++ ) {
2403 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2404 * sizeof(struct IO_APIC_route_entry);
2405 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
2406 if (!mp_ioapic_data[i]) {
2407 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2408 continue;
2409 }
2410 memset(mp_ioapic_data[i], 0, size);
2411 dev = &mp_ioapic_data[i]->dev;
2412 dev->id = i;
2413 dev->cls = &ioapic_sysdev_class;
2414 error = sysdev_register(dev);
2415 if (error) {
2416 kfree(mp_ioapic_data[i]);
2417 mp_ioapic_data[i] = NULL;
2418 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2419 continue;
2420 }
2421 }
2422
2423 return 0;
2424}
2425
2426device_initcall(ioapic_init_sysfs);
2427
2428/*
2429 * Dynamic irq allocate and deallocation
2430 */
2431int create_irq(void)
2432{
2433 /* Allocate an unused irq */
2434 int irq, new, vector = 0;
2435 unsigned long flags;
2436
2437 irq = -ENOSPC;
2438 spin_lock_irqsave(&vector_lock, flags);
2439 for (new = (NR_IRQS - 1); new >= 0; new--) {
2440 if (platform_legacy_irq(new))
2441 continue;
2442 if (irq_vector[new] != 0)
2443 continue;
2444 vector = __assign_irq_vector(new);
2445 if (likely(vector > 0))
2446 irq = new;
2447 break;
2448 }
2449 spin_unlock_irqrestore(&vector_lock, flags);
2450
2451 if (irq >= 0) {
2452 set_intr_gate(vector, interrupt[irq]);
2453 dynamic_irq_init(irq);
2454 }
2455 return irq;
2456}
2457
2458void destroy_irq(unsigned int irq)
2459{
2460 unsigned long flags;
2461
2462 dynamic_irq_cleanup(irq);
2463
2464 spin_lock_irqsave(&vector_lock, flags);
2465 irq_vector[irq] = 0;
2466 spin_unlock_irqrestore(&vector_lock, flags);
2467}
2468
2469/*
 2470 * MSI message composition
2471 */
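/*
 * An MSI is just a posted memory write: the address (0xFEExxxxx) encodes the
 * destination APIC (ID, dest mode and redirection hint), while the data word
 * carries the vector and delivery mode; the message is always edge/asserted.
 */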
2472#ifdef CONFIG_PCI_MSI
2473static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2474{
2475 int vector;
2476 unsigned dest;
2477
2478 vector = assign_irq_vector(irq);
2479 if (vector >= 0) {
2480 dest = cpu_mask_to_apicid(TARGET_CPUS);
2481
2482 msg->address_hi = MSI_ADDR_BASE_HI;
2483 msg->address_lo =
2484 MSI_ADDR_BASE_LO |
2485 ((INT_DEST_MODE == 0) ?
2486 MSI_ADDR_DEST_MODE_PHYSICAL:
2487 MSI_ADDR_DEST_MODE_LOGICAL) |
2488 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2489 MSI_ADDR_REDIRECTION_CPU:
2490 MSI_ADDR_REDIRECTION_LOWPRI) |
2491 MSI_ADDR_DEST_ID(dest);
2492
2493 msg->data =
2494 MSI_DATA_TRIGGER_EDGE |
2495 MSI_DATA_LEVEL_ASSERT |
2496 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2497 MSI_DATA_DELIVERY_FIXED:
2498 MSI_DATA_DELIVERY_LOWPRI) |
2499 MSI_DATA_VECTOR(vector);
2500 }
2501 return vector;
2502}
2503
2504#ifdef CONFIG_SMP
2505static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2506{
2507 struct msi_msg msg;
2508 unsigned int dest;
2509 cpumask_t tmp;
2510 int vector;
2511
2512 cpus_and(tmp, mask, cpu_online_map);
2513 if (cpus_empty(tmp))
2514 tmp = TARGET_CPUS;
2515
2516 vector = assign_irq_vector(irq);
2517 if (vector < 0)
2518 return;
2519
2520 dest = cpu_mask_to_apicid(mask);
2521
2522 read_msi_msg(irq, &msg);
2523
2524 msg.data &= ~MSI_DATA_VECTOR_MASK;
2525 msg.data |= MSI_DATA_VECTOR(vector);
2526 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2527 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2528
2529 write_msi_msg(irq, &msg);
2530 irq_desc[irq].affinity = mask;
2531}
2532#endif /* CONFIG_SMP */
2533
2534/*
2535 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2536 * which implement the MSI or MSI-X Capability Structure.
2537 */
2538static struct irq_chip msi_chip = {
2539 .name = "PCI-MSI",
2540 .unmask = unmask_msi_irq,
2541 .mask = mask_msi_irq,
2542 .ack = ack_ioapic_irq,
2543#ifdef CONFIG_SMP
2544 .set_affinity = set_msi_irq_affinity,
2545#endif
2546 .retrigger = ioapic_retrigger_irq,
2547};
2548
2549int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2550{
2551 struct msi_msg msg;
2552 int irq, ret;
2553 irq = create_irq();
2554 if (irq < 0)
2555 return irq;
2556
2557 ret = msi_compose_msg(dev, irq, &msg);
2558 if (ret < 0) {
2559 destroy_irq(irq);
2560 return ret;
2561 }
2562
2563 set_irq_msi(irq, desc);
2564 write_msi_msg(irq, &msg);
2565
2566 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2567 "edge");
2568
2569 return 0;
2570}
2571
2572void arch_teardown_msi_irq(unsigned int irq)
2573{
2574 destroy_irq(irq);
2575}
2576
2577#endif /* CONFIG_PCI_MSI */
2578
2579/*
2580 * Hypertransport interrupt support
2581 */
2582#ifdef CONFIG_HT_IRQ
2583
2584#ifdef CONFIG_SMP
2585
2586static void target_ht_irq(unsigned int irq, unsigned int dest)
2587{
2588 struct ht_irq_msg msg;
2589 fetch_ht_irq_msg(irq, &msg);
2590
2591 msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2592 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2593
2594 msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2595 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2596
2597 write_ht_irq_msg(irq, &msg);
2598}
2599
2600static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2601{
2602 unsigned int dest;
2603 cpumask_t tmp;
2604
2605 cpus_and(tmp, mask, cpu_online_map);
2606 if (cpus_empty(tmp))
2607 tmp = TARGET_CPUS;
2608
2609 cpus_and(mask, tmp, CPU_MASK_ALL);
2610
2611 dest = cpu_mask_to_apicid(mask);
2612
2613 target_ht_irq(irq, dest);
2614 irq_desc[irq].affinity = mask;
2615}
2616#endif
2617
2618static struct irq_chip ht_irq_chip = {
2619 .name = "PCI-HT",
2620 .mask = mask_ht_irq,
2621 .unmask = unmask_ht_irq,
2622 .ack = ack_ioapic_irq,
2623#ifdef CONFIG_SMP
2624 .set_affinity = set_ht_irq_affinity,
2625#endif
2626 .retrigger = ioapic_retrigger_irq,
2627};
2628
2629int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2630{
2631 int vector;
2632
2633 vector = assign_irq_vector(irq);
2634 if (vector >= 0) {
2635 struct ht_irq_msg msg;
2636 unsigned dest;
2637 cpumask_t tmp;
2638
2639 cpus_clear(tmp);
2640 cpu_set(vector >> 8, tmp);
2641 dest = cpu_mask_to_apicid(tmp);
2642
2643 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2644
2645 msg.address_lo =
2646 HT_IRQ_LOW_BASE |
2647 HT_IRQ_LOW_DEST_ID(dest) |
2648 HT_IRQ_LOW_VECTOR(vector) |
2649 ((INT_DEST_MODE == 0) ?
2650 HT_IRQ_LOW_DM_PHYSICAL :
2651 HT_IRQ_LOW_DM_LOGICAL) |
2652 HT_IRQ_LOW_RQEOI_EDGE |
2653 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2654 HT_IRQ_LOW_MT_FIXED :
2655 HT_IRQ_LOW_MT_ARBITRATED) |
2656 HT_IRQ_LOW_IRQ_MASKED;
2657
2658 write_ht_irq_msg(irq, &msg);
2659
2660 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2661 handle_edge_irq, "edge");
2662 }
2663 return vector;
2664}
2665#endif /* CONFIG_HT_IRQ */
2666
2667/* --------------------------------------------------------------------------
2668 ACPI-based IOAPIC Configuration
2669 -------------------------------------------------------------------------- */
2670
2671#ifdef CONFIG_ACPI
2672
2673int __init io_apic_get_unique_id (int ioapic, int apic_id)
2674{
2675 union IO_APIC_reg_00 reg_00;
2676 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
2677 physid_mask_t tmp;
2678 unsigned long flags;
2679 int i = 0;
2680
2681 /*
2682 * The P4 platform supports up to 256 APIC IDs on two separate APIC
 2683 * buses (one for LAPICs, one for IOAPICs), whereas its predecessors only
 2684 * support up to 16 on one shared APIC bus.
2685 *
2686 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2687 * advantage of new APIC bus architecture.
2688 */
2689
2690 if (physids_empty(apic_id_map))
2691 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
2692
2693 spin_lock_irqsave(&ioapic_lock, flags);
2694 reg_00.raw = io_apic_read(ioapic, 0);
2695 spin_unlock_irqrestore(&ioapic_lock, flags);
2696
2697 if (apic_id >= get_physical_broadcast()) {
2698 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
2699 "%d\n", ioapic, apic_id, reg_00.bits.ID);
2700 apic_id = reg_00.bits.ID;
2701 }
2702
2703 /*
2704 * Every APIC in a system must have a unique ID or we get lots of nice
2705 * 'stuck on smp_invalidate_needed IPI wait' messages.
2706 */
2707 if (check_apicid_used(apic_id_map, apic_id)) {
2708
2709 for (i = 0; i < get_physical_broadcast(); i++) {
2710 if (!check_apicid_used(apic_id_map, i))
2711 break;
2712 }
2713
2714 if (i == get_physical_broadcast())
2715 panic("Max apic_id exceeded!\n");
2716
2717 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
2718 "trying %d\n", ioapic, apic_id, i);
2719
2720 apic_id = i;
2721 }
2722
2723 tmp = apicid_to_cpu_present(apic_id);
2724 physids_or(apic_id_map, apic_id_map, tmp);
2725
2726 if (reg_00.bits.ID != apic_id) {
2727 reg_00.bits.ID = apic_id;
2728
2729 spin_lock_irqsave(&ioapic_lock, flags);
2730 io_apic_write(ioapic, 0, reg_00.raw);
2731 reg_00.raw = io_apic_read(ioapic, 0);
2732 spin_unlock_irqrestore(&ioapic_lock, flags);
2733
2734 /* Sanity check */
2735 if (reg_00.bits.ID != apic_id) {
2736 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
2737 return -1;
2738 }
2739 }
2740
2741 apic_printk(APIC_VERBOSE, KERN_INFO
2742 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
2743
2744 return apic_id;
2745}
2746
2747
2748int __init io_apic_get_version (int ioapic)
2749{
2750 union IO_APIC_reg_01 reg_01;
2751 unsigned long flags;
2752
2753 spin_lock_irqsave(&ioapic_lock, flags);
2754 reg_01.raw = io_apic_read(ioapic, 1);
2755 spin_unlock_irqrestore(&ioapic_lock, flags);
2756
2757 return reg_01.bits.version;
2758}
2759
2760
2761int __init io_apic_get_redir_entries (int ioapic)
2762{
2763 union IO_APIC_reg_01 reg_01;
2764 unsigned long flags;
2765
2766 spin_lock_irqsave(&ioapic_lock, flags);
2767 reg_01.raw = io_apic_read(ioapic, 1);
2768 spin_unlock_irqrestore(&ioapic_lock, flags);
2769
2770 return reg_01.bits.entries;
2771}
2772
2773
2774int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
2775{
2776 struct IO_APIC_route_entry entry;
2777 unsigned long flags;
2778
2779 if (!IO_APIC_IRQ(irq)) {
2780 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2781 ioapic);
2782 return -EINVAL;
2783 }
2784
2785 /*
2786 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
2787 * Note that we mask (disable) IRQs now -- these get enabled when the
2788 * corresponding device driver registers for this IRQ.
2789 */
2790
2791 memset(&entry,0,sizeof(entry));
2792
2793 entry.delivery_mode = INT_DELIVERY_MODE;
2794 entry.dest_mode = INT_DEST_MODE;
2795 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2796 entry.trigger = edge_level;
2797 entry.polarity = active_high_low;
2798 entry.mask = 1;
2799
2800 /*
2801 * IRQs < 16 are already in the irq_2_pin[] map
2802 */
2803 if (irq >= 16)
2804 add_pin_to_irq(irq, ioapic, pin);
2805
2806 entry.vector = assign_irq_vector(irq);
2807
2808 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2809 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2810 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
2811 edge_level, active_high_low);
2812
2813 ioapic_register_intr(irq, entry.vector, edge_level);
2814
2815 if (!ioapic && (irq < 16))
2816 disable_8259A_irq(irq);
2817
2818 spin_lock_irqsave(&ioapic_lock, flags);
2819 __ioapic_write_entry(ioapic, pin, entry);
2820 spin_unlock_irqrestore(&ioapic_lock, flags);
2821
2822 return 0;
2823}
2824
2825#endif /* CONFIG_ACPI */
2826
2827static int __init parse_disable_timer_pin_1(char *arg)
2828{
2829 disable_timer_pin_1 = 1;
2830 return 0;
2831}
2832early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2833
2834static int __init parse_enable_timer_pin_1(char *arg)
2835{
2836 disable_timer_pin_1 = -1;
2837 return 0;
2838}
2839early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2840
2841static int __init parse_noapic(char *arg)
2842{
2843 /* disable IO-APIC */
2844 disable_ioapic_setup();
2845 return 0;
2846}
2847early_param("noapic", parse_noapic);
diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport_32.c
new file mode 100644
index 000000000000..3d310a946d76
--- /dev/null
+++ b/arch/x86/kernel/ioport_32.c
@@ -0,0 +1,153 @@
1/*
2 * linux/arch/i386/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/capability.h>
11#include <linux/errno.h>
12#include <linux/types.h>
13#include <linux/ioport.h>
14#include <linux/smp.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18#include <linux/syscalls.h>
19
20/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
21static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
22{
23 unsigned long mask;
24 unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
25 unsigned int low_index = base & (BITS_PER_LONG-1);
26 int length = low_index + extent;
27
28 if (low_index != 0) {
29 mask = (~0UL << low_index);
30 if (length < BITS_PER_LONG)
31 mask &= ~(~0UL << length);
32 if (new_value)
33 *bitmap_base++ |= mask;
34 else
35 *bitmap_base++ &= ~mask;
36 length -= BITS_PER_LONG;
37 }
38
39 mask = (new_value ? ~0UL : 0UL);
40 while (length >= BITS_PER_LONG) {
41 *bitmap_base++ = mask;
42 length -= BITS_PER_LONG;
43 }
44
45 if (length > 0) {
46 mask = ~(~0UL << length);
47 if (new_value)
48 *bitmap_base++ |= mask;
49 else
50 *bitmap_base++ &= ~mask;
51 }
52}
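/*
 * Example with 32-bit longs: set_bitmap(bitmap, 3, 70, 1) sets bits 3..72,
 * i.e. the top 29 bits of word 0, all of word 1 and the low 9 bits of
 * word 2, leaving every other bit untouched.
 */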
53
54
55/*
56 * this changes the io permissions bitmap in the current task.
57 */
58asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
59{
60 unsigned long i, max_long, bytes, bytes_updated;
61 struct thread_struct * t = &current->thread;
62 struct tss_struct * tss;
63 unsigned long *bitmap;
64
65 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
66 return -EINVAL;
67 if (turn_on && !capable(CAP_SYS_RAWIO))
68 return -EPERM;
69
70 /*
71 * If it's the first ioperm() call in this thread's lifetime, set the
  72	 * IO bitmap up. ioperm() is much less timing critical than clone(),
  73	 * which is why we delay this operation until now:
74 */
75 if (!t->io_bitmap_ptr) {
76 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
77 if (!bitmap)
78 return -ENOMEM;
79
80 memset(bitmap, 0xff, IO_BITMAP_BYTES);
81 t->io_bitmap_ptr = bitmap;
82 set_thread_flag(TIF_IO_BITMAP);
83 }
84
85 /*
86 * do it in the per-thread copy and in the TSS ...
87 *
88 * Disable preemption via get_cpu() - we must not switch away
89 * because the ->io_bitmap_max value must match the bitmap
90 * contents:
91 */
92 tss = &per_cpu(init_tss, get_cpu());
93
94 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
95
96 /*
97 * Search for a (possibly new) maximum. This is simple and stupid,
98 * to keep it obviously correct:
99 */
100 max_long = 0;
101 for (i = 0; i < IO_BITMAP_LONGS; i++)
102 if (t->io_bitmap_ptr[i] != ~0UL)
103 max_long = i;
104
105 bytes = (max_long + 1) * sizeof(long);
106 bytes_updated = max(bytes, t->io_bitmap_max);
107
108 t->io_bitmap_max = bytes;
109
110 /*
111 * Sets the lazy trigger so that the next I/O operation will
112 * reload the correct bitmap.
113 * Reset the owner so that a process switch will not set
114 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
115 */
116 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
117 tss->io_bitmap_owner = NULL;
118
119 put_cpu();
120
121 return 0;
122}
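/*
 * From user space (glibc's <sys/io.h>), a CAP_SYS_RAWIO process could, for
 * example, open up the classic parallel-port registers and poke them
 * directly:
 *
 *	if (ioperm(0x378, 3, 1) == 0)
 *		outb(0xff, 0x378);
 *
 * Any request with from + num > IO_BITMAP_BITS is rejected above.
 */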
123
124/*
125 * sys_iopl has to be used when you want to access the IO ports
126 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
127 * you'd need 8kB of bitmaps/process, which is a bit excessive.
128 *
129 * Here we just change the eflags value on the stack: we allow
130 * only the super-user to do it. This depends on the stack-layout
131 * on system-call entry - see also fork() and the signal handling
132 * code.
133 */
134
135asmlinkage long sys_iopl(unsigned long unused)
136{
137 volatile struct pt_regs * regs = (struct pt_regs *) &unused;
138 unsigned int level = regs->ebx;
139 unsigned int old = (regs->eflags >> 12) & 3;
140 struct thread_struct *t = &current->thread;
141
142 if (level > 3)
143 return -EINVAL;
144 /* Trying to gain more privileges? */
145 if (level > old) {
146 if (!capable(CAP_SYS_RAWIO))
147 return -EPERM;
148 }
149 t->iopl = level << 12;
150 regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
151 set_iopl_mask(t->iopl);
152 return 0;
153}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
new file mode 100644
index 000000000000..dd2b97fc00b2
--- /dev/null
+++ b/arch/x86/kernel/irq_32.c
@@ -0,0 +1,343 @@
1/*
2 * linux/arch/i386/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86-specific interrupt
7 * entry, irq-stacks and irq statistics code. All the remaining
8 * irq logic is done by the generic kernel/irq/ code and
9 * by the x86-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <linux/module.h>
14#include <linux/seq_file.h>
15#include <linux/interrupt.h>
16#include <linux/kernel_stat.h>
17#include <linux/notifier.h>
18#include <linux/cpu.h>
19#include <linux/delay.h>
20
21#include <asm/apic.h>
22#include <asm/uaccess.h>
23
24DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
25EXPORT_PER_CPU_SYMBOL(irq_stat);
26
27DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs);
29
30/*
31 * 'what should we do if we get a hw irq event on an illegal vector'.
32 * Each architecture has to answer this for itself.

33 */
34void ack_bad_irq(unsigned int irq)
35{
36 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
37
38#ifdef CONFIG_X86_LOCAL_APIC
39 /*
40 * Currently unexpected vectors happen only on SMP and APIC.
41 * We _must_ ack these because every local APIC has only N
42 * irq slots per priority level, and a 'hanging, unacked' IRQ
43 * holds up an irq slot - in excessive cases (when multiple
44 * unexpected vectors occur) that might lock up the APIC
45 * completely.
46 * But only ack when the APIC is enabled -AK
47 */
48 if (cpu_has_apic)
49 ack_APIC_irq();
50#endif
51}
52
53#ifdef CONFIG_4KSTACKS
54/*
55 * per-CPU IRQ handling contexts (thread information and stack)
56 */
57union irq_ctx {
58 struct thread_info tinfo;
59 u32 stack[THREAD_SIZE/sizeof(u32)];
60};
61
62static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
63static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
64#endif
65
66/*
67 * do_IRQ handles all normal device IRQ's (the special
68 * SMP cross-CPU interrupts have their own specific
69 * handlers).
70 */
71fastcall unsigned int do_IRQ(struct pt_regs *regs)
72{
73 struct pt_regs *old_regs;
74 /* high bit used in ret_from_ code */
75 int irq = ~regs->orig_eax;
76 struct irq_desc *desc = irq_desc + irq;
77#ifdef CONFIG_4KSTACKS
78 union irq_ctx *curctx, *irqctx;
79 u32 *isp;
80#endif
81
82 if (unlikely((unsigned)irq >= NR_IRQS)) {
83 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
84 __FUNCTION__, irq);
85 BUG();
86 }
87
88 old_regs = set_irq_regs(regs);
89 irq_enter();
90#ifdef CONFIG_DEBUG_STACKOVERFLOW
91 /* Debugging check for stack overflow: is there less than 1KB free? */
92 {
93 long esp;
94
95 __asm__ __volatile__("andl %%esp,%0" :
96 "=r" (esp) : "0" (THREAD_SIZE - 1));
97 if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
98 printk("do_IRQ: stack overflow: %ld\n",
99 esp - sizeof(struct thread_info));
100 dump_stack();
101 }
102 }
103#endif
104
105#ifdef CONFIG_4KSTACKS
106
107 curctx = (union irq_ctx *) current_thread_info();
108 irqctx = hardirq_ctx[smp_processor_id()];
109
110 /*
111 * this is where we switch to the IRQ stack. However, if we are
112 * already using the IRQ stack (because we interrupted a hardirq
113 * handler) we can't do that and just have to keep using the
114 * current stack (which is the irq stack already after all)
115 */
116 if (curctx != irqctx) {
117 int arg1, arg2, ebx;
118
119 /* build the stack frame on the IRQ stack */
120 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
121 irqctx->tinfo.task = curctx->tinfo.task;
122 irqctx->tinfo.previous_esp = current_stack_pointer;
123
124 /*
125 * Copy the softirq bits in preempt_count so that the
126 * softirq checks work in the hardirq context.
127 */
128 irqctx->tinfo.preempt_count =
129 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
130 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
131
132 asm volatile(
133 " xchgl %%ebx,%%esp \n"
134 " call *%%edi \n"
135 " movl %%ebx,%%esp \n"
136 : "=a" (arg1), "=d" (arg2), "=b" (ebx)
137 : "0" (irq), "1" (desc), "2" (isp),
138 "D" (desc->handle_irq)
139 : "memory", "cc"
140 );
141 } else
142#endif
143 desc->handle_irq(irq, desc);
144
145 irq_exit();
146 set_irq_regs(old_regs);
147 return 1;
148}
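To make the stack switch above concrete, here is an illustrative sketch (an assumption-laden aside, not code from this patch) of the CONFIG_4KSTACKS IRQ context layout: the thread_info sits at the low end of the THREAD_SIZE-sized block, and the block's high end is the initial stack pointer, which is exactly the isp value do_IRQ() computes before the xchgl:

	union irq_ctx *ctx = hardirq_ctx[smp_processor_id()];
	struct thread_info *ti = &ctx->tinfo;			/* low end of the block */
	u32 *initial_sp = (u32 *)((char *)ctx + sizeof(*ctx));	/* high end == isp */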
149
150#ifdef CONFIG_4KSTACKS
151
152static char softirq_stack[NR_CPUS * THREAD_SIZE]
153 __attribute__((__section__(".bss.page_aligned")));
154
155static char hardirq_stack[NR_CPUS * THREAD_SIZE]
156 __attribute__((__section__(".bss.page_aligned")));
157
158/*
159 * allocate per-cpu stacks for hardirq and for softirq processing
160 */
161void irq_ctx_init(int cpu)
162{
163 union irq_ctx *irqctx;
164
165 if (hardirq_ctx[cpu])
166 return;
167
168 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
169 irqctx->tinfo.task = NULL;
170 irqctx->tinfo.exec_domain = NULL;
171 irqctx->tinfo.cpu = cpu;
172 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
173 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
174
175 hardirq_ctx[cpu] = irqctx;
176
177 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
178 irqctx->tinfo.task = NULL;
179 irqctx->tinfo.exec_domain = NULL;
180 irqctx->tinfo.cpu = cpu;
181 irqctx->tinfo.preempt_count = 0;
182 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
183
184 softirq_ctx[cpu] = irqctx;
185
186 printk("CPU %u irqstacks, hard=%p soft=%p\n",
187 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
188}
189
190void irq_ctx_exit(int cpu)
191{
192 hardirq_ctx[cpu] = NULL;
193}
194
195extern asmlinkage void __do_softirq(void);
196
197asmlinkage void do_softirq(void)
198{
199 unsigned long flags;
200 struct thread_info *curctx;
201 union irq_ctx *irqctx;
202 u32 *isp;
203
204 if (in_interrupt())
205 return;
206
207 local_irq_save(flags);
208
209 if (local_softirq_pending()) {
210 curctx = current_thread_info();
211 irqctx = softirq_ctx[smp_processor_id()];
212 irqctx->tinfo.task = curctx->task;
213 irqctx->tinfo.previous_esp = current_stack_pointer;
214
215 /* build the stack frame on the softirq stack */
216 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
217
218 asm volatile(
219 " xchgl %%ebx,%%esp \n"
220 " call __do_softirq \n"
221 " movl %%ebx,%%esp \n"
222 : "=b"(isp)
223 : "0"(isp)
224 : "memory", "cc", "edx", "ecx", "eax"
225 );
226 /*
227		 * Shouldn't happen; we returned above if in_interrupt():
228 */
229 WARN_ON_ONCE(softirq_count());
230 }
231
232 local_irq_restore(flags);
233}
234
235EXPORT_SYMBOL(do_softirq);
236#endif
237
238/*
239 * Interrupt statistics:
240 */
241
242atomic_t irq_err_count;
243
244/*
245 * /proc/interrupts printing:
246 */
247
248int show_interrupts(struct seq_file *p, void *v)
249{
250 int i = *(loff_t *) v, j;
251 struct irqaction * action;
252 unsigned long flags;
253
254 if (i == 0) {
255 seq_printf(p, " ");
256 for_each_online_cpu(j)
257 seq_printf(p, "CPU%-8d",j);
258 seq_putc(p, '\n');
259 }
260
261 if (i < NR_IRQS) {
262 spin_lock_irqsave(&irq_desc[i].lock, flags);
263 action = irq_desc[i].action;
264 if (!action)
265 goto skip;
266 seq_printf(p, "%3d: ",i);
267#ifndef CONFIG_SMP
268 seq_printf(p, "%10u ", kstat_irqs(i));
269#else
270 for_each_online_cpu(j)
271 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
272#endif
273 seq_printf(p, " %8s", irq_desc[i].chip->name);
274 seq_printf(p, "-%-8s", irq_desc[i].name);
275 seq_printf(p, " %s", action->name);
276
277 for (action=action->next; action; action = action->next)
278 seq_printf(p, ", %s", action->name);
279
280 seq_putc(p, '\n');
281skip:
282 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
283 } else if (i == NR_IRQS) {
284 seq_printf(p, "NMI: ");
285 for_each_online_cpu(j)
286 seq_printf(p, "%10u ", nmi_count(j));
287 seq_putc(p, '\n');
288#ifdef CONFIG_X86_LOCAL_APIC
289 seq_printf(p, "LOC: ");
290 for_each_online_cpu(j)
291 seq_printf(p, "%10u ",
292 per_cpu(irq_stat,j).apic_timer_irqs);
293 seq_putc(p, '\n');
294#endif
295 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
296#if defined(CONFIG_X86_IO_APIC)
297 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
298#endif
299 }
300 return 0;
301}
302
303#ifdef CONFIG_HOTPLUG_CPU
304#include <mach_apic.h>
305
306void fixup_irqs(cpumask_t map)
307{
308 unsigned int irq;
309 static int warned;
310
311 for (irq = 0; irq < NR_IRQS; irq++) {
312 cpumask_t mask;
313 if (irq == 2)
314 continue;
315
316 cpus_and(mask, irq_desc[irq].affinity, map);
317 if (any_online_cpu(mask) == NR_CPUS) {
318 printk("Breaking affinity for irq %i\n", irq);
319 mask = map;
320 }
321 if (irq_desc[irq].chip->set_affinity)
322 irq_desc[irq].chip->set_affinity(irq, mask);
323 else if (irq_desc[irq].action && !(warned++))
324 printk("Cannot set affinity for irq %i\n", irq);
325 }
326
327#if 0
328 barrier();
329 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
330 [note the nop - the interrupt-enable boundary on x86 is two
331 instructions from sti] - to flush out pending hardirqs and
332 IPIs. After this point nothing is supposed to reach this CPU." */
333 __asm__ __volatile__("sti; nop; cli");
334 barrier();
335#else
336 /* That doesn't seem sufficient. Give it 1ms. */
337 local_irq_enable();
338 mdelay(1);
339 local_irq_disable();
340#endif
341}
342#endif
343
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
new file mode 100644
index 000000000000..448a50b1324c
--- /dev/null
+++ b/arch/x86/kernel/kprobes_32.c
@@ -0,0 +1,751 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/i386/kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation ( includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
27 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
28 * <prasanna@in.ibm.com> added function-return probes.
29 */
30
31#include <linux/kprobes.h>
32#include <linux/ptrace.h>
33#include <linux/preempt.h>
34#include <linux/kdebug.h>
35#include <asm/cacheflush.h>
36#include <asm/desc.h>
37#include <asm/uaccess.h>
38#include <asm/alternative.h>
39
40void jprobe_return_end(void);
41
42DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
43DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
44
45/* insert a jmp code */
46static __always_inline void set_jmp_op(void *from, void *to)
47{
48 struct __arch_jmp_op {
49 char op;
50 long raddr;
51 } __attribute__((packed)) *jop;
52 jop = (struct __arch_jmp_op *)from;
53 jop->raddr = (long)(to) - ((long)(from) + 5);
54 jop->op = RELATIVEJUMP_INSTRUCTION;
55}
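A brief note on the arithmetic above (background, not text from the patch): a near relative jmp on i386 is five bytes long - one opcode byte plus a 32-bit displacement - and the displacement is measured from the address of the following instruction, hence the "(long)(from) + 5" term. For example, a jmp placed at 0xc0100000 that targets 0xc0100020 gets raddr = 0x20 - 5 = 0x1b.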
56
57/*
58 * returns non-zero if opcodes can be boosted.
59 */
60static __always_inline int can_boost(kprobe_opcode_t *opcodes)
61{
62#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
63 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
64 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
65 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
66 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
67 << (row % 32))
68 /*
69 * Undefined/reserved opcodes, conditional jump, Opcode Extension
70 * Groups, and some special opcodes can not be boost.
71 */
72 static const unsigned long twobyte_is_boostable[256 / 32] = {
73 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
74 /* ------------------------------- */
75 W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
76 W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
77 W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
78 W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
79 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
80 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
81 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
82 W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
83 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
84 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
85 W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
86 W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
87 W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
88 W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
89 W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
90 W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
91 /* ------------------------------- */
92 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
93 };
94#undef W
95 kprobe_opcode_t opcode;
96 kprobe_opcode_t *orig_opcodes = opcodes;
97retry:
98 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
99 return 0;
100 opcode = *(opcodes++);
101
102 /* 2nd-byte opcode */
103 if (opcode == 0x0f) {
104 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
105 return 0;
106 return test_bit(*opcodes, twobyte_is_boostable);
107 }
108
109 switch (opcode & 0xf0) {
110 case 0x60:
111 if (0x63 < opcode && opcode < 0x67)
112 goto retry; /* prefixes */
113 /* can't boost Address-size override and bound */
114 return (opcode != 0x62 && opcode != 0x67);
115 case 0x70:
116 return 0; /* can't boost conditional jump */
117 case 0xc0:
118 /* can't boost software-interruptions */
119 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
120 case 0xd0:
121 /* can boost AA* and XLAT */
122 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
123 case 0xe0:
124 /* can boost in/out and absolute jmps */
125 return ((opcode & 0x04) || opcode == 0xea);
126 case 0xf0:
127 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
128 goto retry; /* lock/rep(ne) prefix */
129		/* clear and set flags can be boosted */
130 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
131 default:
132 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
133 goto retry; /* prefixes */
134 /* can't boost CS override and call */
135 return (opcode != 0x2e && opcode != 0x9a);
136 }
137}
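For context (a background note, not part of this file's text): a "boostable" instruction is one after which kprobes can plant a relative jmp back into the original instruction stream (see set_jmp_op() above and resume_execution() below), so later hits execute the copied instruction and jump straight back instead of taking the single-step debug trap a second time. As one illustrative lookup, test_bit(0x43, twobyte_is_boostable) is non-zero because the W(0x40, ...) row marks all the two-byte cmovcc opcodes (0x0f 0x40 - 0x0f 0x4f) as boostable.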
138
139/*
140 * returns non-zero if opcode modifies the interrupt flag.
141 */
142static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
143{
144 switch (opcode) {
145 case 0xfa: /* cli */
146 case 0xfb: /* sti */
147 case 0xcf: /* iret/iretd */
148 case 0x9d: /* popf/popfd */
149 return 1;
150 }
151 return 0;
152}
153
154int __kprobes arch_prepare_kprobe(struct kprobe *p)
155{
156 /* insn: must be on special executable page on i386. */
157 p->ainsn.insn = get_insn_slot();
158 if (!p->ainsn.insn)
159 return -ENOMEM;
160
161 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
162 p->opcode = *p->addr;
163 if (can_boost(p->addr)) {
164 p->ainsn.boostable = 0;
165 } else {
166 p->ainsn.boostable = -1;
167 }
168 return 0;
169}
170
171void __kprobes arch_arm_kprobe(struct kprobe *p)
172{
173 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
174}
175
176void __kprobes arch_disarm_kprobe(struct kprobe *p)
177{
178 text_poke(p->addr, &p->opcode, 1);
179}
180
181void __kprobes arch_remove_kprobe(struct kprobe *p)
182{
183 mutex_lock(&kprobe_mutex);
184 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
185 mutex_unlock(&kprobe_mutex);
186}
187
188static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
189{
190 kcb->prev_kprobe.kp = kprobe_running();
191 kcb->prev_kprobe.status = kcb->kprobe_status;
192 kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
193 kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
194}
195
196static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
197{
198 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
199 kcb->kprobe_status = kcb->prev_kprobe.status;
200 kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
201 kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
202}
203
204static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
205 struct kprobe_ctlblk *kcb)
206{
207 __get_cpu_var(current_kprobe) = p;
208 kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
209 = (regs->eflags & (TF_MASK | IF_MASK));
210 if (is_IF_modifier(p->opcode))
211 kcb->kprobe_saved_eflags &= ~IF_MASK;
212}
213
214static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
215{
216 regs->eflags |= TF_MASK;
217 regs->eflags &= ~IF_MASK;
218 /*single step inline if the instruction is an int3*/
219 if (p->opcode == BREAKPOINT_INSTRUCTION)
220 regs->eip = (unsigned long)p->addr;
221 else
222 regs->eip = (unsigned long)p->ainsn.insn;
223}
224
225/* Called with kretprobe_lock held */
226void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
227 struct pt_regs *regs)
228{
229 unsigned long *sara = (unsigned long *)&regs->esp;
230
231 ri->ret_addr = (kprobe_opcode_t *) *sara;
232
233 /* Replace the return addr with trampoline addr */
234 *sara = (unsigned long) &kretprobe_trampoline;
235}
236
237/*
238 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
239 * remain disabled throughout this function.
240 */
241static int __kprobes kprobe_handler(struct pt_regs *regs)
242{
243 struct kprobe *p;
244 int ret = 0;
245 kprobe_opcode_t *addr;
246 struct kprobe_ctlblk *kcb;
247
248 addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
249
250 /*
251 * We don't want to be preempted for the entire
252 * duration of kprobe processing
253 */
254 preempt_disable();
255 kcb = get_kprobe_ctlblk();
256
257 /* Check we're not actually recursing */
258 if (kprobe_running()) {
259 p = get_kprobe(addr);
260 if (p) {
261 if (kcb->kprobe_status == KPROBE_HIT_SS &&
262 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
263 regs->eflags &= ~TF_MASK;
264 regs->eflags |= kcb->kprobe_saved_eflags;
265 goto no_kprobe;
266 }
267 /* We have reentered the kprobe_handler(), since
268 * another probe was hit while within the handler.
269			 * Here we save the original kprobes variables and
270			 * just single-step the instruction of the new probe
271 * without calling any user handlers.
272 */
273 save_previous_kprobe(kcb);
274 set_current_kprobe(p, regs, kcb);
275 kprobes_inc_nmissed_count(p);
276 prepare_singlestep(p, regs);
277 kcb->kprobe_status = KPROBE_REENTER;
278 return 1;
279 } else {
280 if (*addr != BREAKPOINT_INSTRUCTION) {
281 /* The breakpoint instruction was removed by
282 * another cpu right after we hit, no further
283 * handling of this interrupt is appropriate
284 */
285 regs->eip -= sizeof(kprobe_opcode_t);
286 ret = 1;
287 goto no_kprobe;
288 }
289 p = __get_cpu_var(current_kprobe);
290 if (p->break_handler && p->break_handler(p, regs)) {
291 goto ss_probe;
292 }
293 }
294 goto no_kprobe;
295 }
296
297 p = get_kprobe(addr);
298 if (!p) {
299 if (*addr != BREAKPOINT_INSTRUCTION) {
300 /*
301 * The breakpoint instruction was removed right
302 * after we hit it. Another cpu has removed
303 * either a probepoint or a debugger breakpoint
304 * at this address. In either case, no further
305 * handling of this interrupt is appropriate.
306 * Back up over the (now missing) int3 and run
307 * the original instruction.
308 */
309 regs->eip -= sizeof(kprobe_opcode_t);
310 ret = 1;
311 }
312 /* Not one of ours: let kernel handle it */
313 goto no_kprobe;
314 }
315
316 set_current_kprobe(p, regs, kcb);
317 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
318
319 if (p->pre_handler && p->pre_handler(p, regs))
320 /* handler has already set things up, so skip ss setup */
321 return 1;
322
323ss_probe:
324#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
325 if (p->ainsn.boostable == 1 && !p->post_handler){
326 /* Boost up -- we can execute copied instructions directly */
327 reset_current_kprobe();
328 regs->eip = (unsigned long)p->ainsn.insn;
329 preempt_enable_no_resched();
330 return 1;
331 }
332#endif
333 prepare_singlestep(p, regs);
334 kcb->kprobe_status = KPROBE_HIT_SS;
335 return 1;
336
337no_kprobe:
338 preempt_enable_no_resched();
339 return ret;
340}
341
342/*
343 * For function-return probes, init_kprobes() establishes a probepoint
344 * here. When a retprobed function returns, this probe is hit and
345 * trampoline_handler() runs, calling the kretprobe's handler.
346 */
347 void __kprobes kretprobe_trampoline_holder(void)
348 {
349 asm volatile ( ".global kretprobe_trampoline\n"
350 "kretprobe_trampoline: \n"
351 " pushf\n"
352 /* skip cs, eip, orig_eax */
353 " subl $12, %esp\n"
354 " pushl %fs\n"
355 " pushl %ds\n"
356 " pushl %es\n"
357 " pushl %eax\n"
358 " pushl %ebp\n"
359 " pushl %edi\n"
360 " pushl %esi\n"
361 " pushl %edx\n"
362 " pushl %ecx\n"
363 " pushl %ebx\n"
364 " movl %esp, %eax\n"
365 " call trampoline_handler\n"
366 /* move eflags to cs */
367 " movl 52(%esp), %edx\n"
368 " movl %edx, 48(%esp)\n"
369 /* save true return address on eflags */
370 " movl %eax, 52(%esp)\n"
371 " popl %ebx\n"
372 " popl %ecx\n"
373 " popl %edx\n"
374 " popl %esi\n"
375 " popl %edi\n"
376 " popl %ebp\n"
377 " popl %eax\n"
378 /* skip eip, orig_eax, es, ds, fs */
379 " addl $20, %esp\n"
380 " popf\n"
381 " ret\n");
382}
383
384/*
385 * Called from kretprobe_trampoline
386 */
387fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
388{
389 struct kretprobe_instance *ri = NULL;
390 struct hlist_head *head, empty_rp;
391 struct hlist_node *node, *tmp;
392 unsigned long flags, orig_ret_address = 0;
393 unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
394
395 INIT_HLIST_HEAD(&empty_rp);
396 spin_lock_irqsave(&kretprobe_lock, flags);
397 head = kretprobe_inst_table_head(current);
398 /* fixup registers */
399 regs->xcs = __KERNEL_CS | get_kernel_rpl();
400 regs->eip = trampoline_address;
401 regs->orig_eax = 0xffffffff;
402
403 /*
404 * It is possible to have multiple instances associated with a given
405	 * task either because multiple functions in the call path
406	 * have a return probe installed on them, and/or more than one
407	 * return probe was registered for a target function.
408 *
409 * We can handle this because:
410 * - instances are always inserted at the head of the list
411 * - when multiple return probes are registered for the same
412 * function, the first instance's ret_addr will point to the
413 * real return address, and all the rest will point to
414 * kretprobe_trampoline
415 */
416 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
417 if (ri->task != current)
418 /* another task is sharing our hash bucket */
419 continue;
420
421 if (ri->rp && ri->rp->handler){
422 __get_cpu_var(current_kprobe) = &ri->rp->kp;
423 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
424 ri->rp->handler(ri, regs);
425 __get_cpu_var(current_kprobe) = NULL;
426 }
427
428 orig_ret_address = (unsigned long)ri->ret_addr;
429 recycle_rp_inst(ri, &empty_rp);
430
431 if (orig_ret_address != trampoline_address)
432 /*
433 * This is the real return address. Any other
434 * instances associated with this task are for
435 * other calls deeper on the call stack
436 */
437 break;
438 }
439
440 kretprobe_assert(ri, orig_ret_address, trampoline_address);
441 spin_unlock_irqrestore(&kretprobe_lock, flags);
442
443 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
444 hlist_del(&ri->hlist);
445 kfree(ri);
446 }
447 return (void*)orig_ret_address;
448}
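A usage sketch (illustrative, not part of this file; do_fork is only an example target): a module attaches a return probe through the generic kprobes API, and trampoline_handler() above is what eventually invokes the handler:

#include <linux/kprobes.h>

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* on i386 the probed function's return value is in %eax */
	printk(KERN_INFO "do_fork returned 0x%lx\n", regs->eax);
	return 0;
}

static struct kretprobe my_rp = {
	.handler	= my_ret_handler,
	.kp.symbol_name	= "do_fork",
	.maxactive	= 16,
};

/* in module init:  register_kretprobe(&my_rp);
 * in module exit:  unregister_kretprobe(&my_rp); */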
449
450/*
451 * Called after single-stepping. p->addr is the address of the
452 * instruction whose first byte has been replaced by the "int 3"
453 * instruction. To avoid the SMP problems that can occur when we
454 * temporarily put back the original opcode to single-step, we
455 * single-stepped a copy of the instruction. The address of this
456 * copy is p->ainsn.insn.
457 *
458 * This function prepares to return from the post-single-step
459 * interrupt. We have to fix up the stack as follows:
460 *
461 * 0) Except in the case of absolute or indirect jump or call instructions,
462 * the new eip is relative to the copied instruction. We need to make
463 * it relative to the original instruction.
464 *
465 * 1) If the single-stepped instruction was pushfl, then the TF and IF
466 * flags are set in the just-pushed eflags, and may need to be cleared.
467 *
468 * 2) If the single-stepped instruction was a call, the return address
469 * that is atop the stack is the address following the copied instruction.
470 * We need to make it the address following the original instruction.
471 *
472 * This function also checks instruction size for preparing direct execution.
473 */
474static void __kprobes resume_execution(struct kprobe *p,
475 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
476{
477 unsigned long *tos = (unsigned long *)&regs->esp;
478 unsigned long copy_eip = (unsigned long)p->ainsn.insn;
479 unsigned long orig_eip = (unsigned long)p->addr;
480
481 regs->eflags &= ~TF_MASK;
482 switch (p->ainsn.insn[0]) {
483 case 0x9c: /* pushfl */
484 *tos &= ~(TF_MASK | IF_MASK);
485 *tos |= kcb->kprobe_old_eflags;
486 break;
487 case 0xc2: /* iret/ret/lret */
488 case 0xc3:
489 case 0xca:
490 case 0xcb:
491 case 0xcf:
492 case 0xea: /* jmp absolute -- eip is correct */
493 /* eip is already adjusted, no more changes required */
494 p->ainsn.boostable = 1;
495 goto no_change;
496 case 0xe8: /* call relative - Fix return addr */
497 *tos = orig_eip + (*tos - copy_eip);
498 break;
499 case 0x9a: /* call absolute -- same as call absolute, indirect */
500 *tos = orig_eip + (*tos - copy_eip);
501 goto no_change;
502 case 0xff:
503 if ((p->ainsn.insn[1] & 0x30) == 0x10) {
504 /*
505 * call absolute, indirect
506 * Fix return addr; eip is correct.
507 * But this is not boostable
508 */
509 *tos = orig_eip + (*tos - copy_eip);
510 goto no_change;
511 } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
512 ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
513 /* eip is correct. And this is boostable */
514 p->ainsn.boostable = 1;
515 goto no_change;
516 }
517 default:
518 break;
519 }
520
521 if (p->ainsn.boostable == 0) {
522 if ((regs->eip > copy_eip) &&
523 (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
524 /*
525 * These instructions can be executed directly if it
526 * jumps back to correct address.
527 */
528 set_jmp_op((void *)regs->eip,
529 (void *)orig_eip + (regs->eip - copy_eip));
530 p->ainsn.boostable = 1;
531 } else {
532 p->ainsn.boostable = -1;
533 }
534 }
535
536 regs->eip = orig_eip + (regs->eip - copy_eip);
537
538no_change:
539 return;
540}
541
542/*
543 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
544 * remain disabled throughout this function.
545 */
546static int __kprobes post_kprobe_handler(struct pt_regs *regs)
547{
548 struct kprobe *cur = kprobe_running();
549 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
550
551 if (!cur)
552 return 0;
553
554 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
555 kcb->kprobe_status = KPROBE_HIT_SSDONE;
556 cur->post_handler(cur, regs, 0);
557 }
558
559 resume_execution(cur, regs, kcb);
560 regs->eflags |= kcb->kprobe_saved_eflags;
561
562	/* Restore the original saved kprobes variables and continue. */
563 if (kcb->kprobe_status == KPROBE_REENTER) {
564 restore_previous_kprobe(kcb);
565 goto out;
566 }
567 reset_current_kprobe();
568out:
569 preempt_enable_no_resched();
570
571 /*
572 * if somebody else is singlestepping across a probe point, eflags
573 * will have TF set, in which case, continue the remaining processing
574 * of do_debug, as if this is not a probe hit.
575 */
576 if (regs->eflags & TF_MASK)
577 return 0;
578
579 return 1;
580}
581
582static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
583{
584 struct kprobe *cur = kprobe_running();
585 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
586
587 switch(kcb->kprobe_status) {
588 case KPROBE_HIT_SS:
589 case KPROBE_REENTER:
590 /*
591 * We are here because the instruction being single
592 * stepped caused a page fault. We reset the current
593 * kprobe and the eip points back to the probe address
594 * and allow the page fault handler to continue as a
595 * normal page fault.
596 */
597 regs->eip = (unsigned long)cur->addr;
598 regs->eflags |= kcb->kprobe_old_eflags;
599 if (kcb->kprobe_status == KPROBE_REENTER)
600 restore_previous_kprobe(kcb);
601 else
602 reset_current_kprobe();
603 preempt_enable_no_resched();
604 break;
605 case KPROBE_HIT_ACTIVE:
606 case KPROBE_HIT_SSDONE:
607 /*
608		 * We increment the nmissed count for accounting;
609		 * we could also use the npre/npostfault counts to account for
610		 * these specific fault cases.
611 */
612 kprobes_inc_nmissed_count(cur);
613
614 /*
615 * We come here because instructions in the pre/post
616		 * handler caused the page fault; this could happen
617		 * if the handler tries to access user space via
618		 * copy_from_user(), get_user(), etc. Let the
619 * user-specified handler try to fix it first.
620 */
621 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
622 return 1;
623
624 /*
625 * In case the user-specified fault handler returned
626 * zero, try to fix up.
627 */
628 if (fixup_exception(regs))
629 return 1;
630
631 /*
632		 * fixup_exception() could not handle it;
633		 * let do_page_fault() fix it.
634 */
635 break;
636 default:
637 break;
638 }
639 return 0;
640}
641
642/*
643 * Wrapper routine for handling exceptions.
644 */
645int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
646 unsigned long val, void *data)
647{
648 struct die_args *args = (struct die_args *)data;
649 int ret = NOTIFY_DONE;
650
651 if (args->regs && user_mode_vm(args->regs))
652 return ret;
653
654 switch (val) {
655 case DIE_INT3:
656 if (kprobe_handler(args->regs))
657 ret = NOTIFY_STOP;
658 break;
659 case DIE_DEBUG:
660 if (post_kprobe_handler(args->regs))
661 ret = NOTIFY_STOP;
662 break;
663 case DIE_GPF:
664 case DIE_PAGE_FAULT:
665 /* kprobe_running() needs smp_processor_id() */
666 preempt_disable();
667 if (kprobe_running() &&
668 kprobe_fault_handler(args->regs, args->trapnr))
669 ret = NOTIFY_STOP;
670 preempt_enable();
671 break;
672 default:
673 break;
674 }
675 return ret;
676}
677
678int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
679{
680 struct jprobe *jp = container_of(p, struct jprobe, kp);
681 unsigned long addr;
682 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
683
684 kcb->jprobe_saved_regs = *regs;
685 kcb->jprobe_saved_esp = &regs->esp;
686 addr = (unsigned long)(kcb->jprobe_saved_esp);
687
688 /*
689 * TBD: As Linus pointed out, gcc assumes that the callee
690 * owns the argument space and could overwrite it, e.g.
691 * tailcall optimization. So, to be absolutely safe
692 * we also save and restore enough stack bytes to cover
693 * the argument area.
694 */
695 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
696 MIN_STACK_SIZE(addr));
697 regs->eflags &= ~IF_MASK;
698 regs->eip = (unsigned long)(jp->entry);
699 return 1;
700}
701
702void __kprobes jprobe_return(void)
703{
704 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
705
706 asm volatile (" xchgl %%ebx,%%esp \n"
707 " int3 \n"
708 " .globl jprobe_return_end \n"
709 " jprobe_return_end: \n"
710 " nop \n"::"b"
711 (kcb->jprobe_saved_esp):"memory");
712}
713
714int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
715{
716 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
717 u8 *addr = (u8 *) (regs->eip - 1);
718 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
719 struct jprobe *jp = container_of(p, struct jprobe, kp);
720
721 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
722 if (&regs->esp != kcb->jprobe_saved_esp) {
723 struct pt_regs *saved_regs =
724 container_of(kcb->jprobe_saved_esp,
725 struct pt_regs, esp);
726 printk("current esp %p does not match saved esp %p\n",
727 &regs->esp, kcb->jprobe_saved_esp);
728 printk("Saved registers for jprobe %p\n", jp);
729 show_registers(saved_regs);
730 printk("Current registers\n");
731 show_registers(regs);
732 BUG();
733 }
734 *regs = kcb->jprobe_saved_regs;
735 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
736 MIN_STACK_SIZE(stack_addr));
737 preempt_enable_no_resched();
738 return 1;
739 }
740 return 0;
741}
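Similarly, a jprobe handler (an illustrative sketch; do_fork and its prototype are only example assumptions here) shares the signature of the probed function and must end with jprobe_return(); the three routines above implement the register and stack juggling behind that call:

#include <linux/kprobes.h>
#include <linux/sched.h>

/* same prototype as the probed function (here: do_fork) */
static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
		     struct pt_regs *regs, unsigned long stack_size,
		     int __user *parent_tidptr, int __user *child_tidptr)
{
	printk(KERN_INFO "do_fork: clone_flags=0x%lx\n", clone_flags);
	jprobe_return();	/* mandatory: never returns to the caller */
	return 0;
}

static struct jprobe my_jprobe = {
	.entry		= JPROBE_ENTRY(jdo_fork),
	.kp.symbol_name	= "do_fork",
};

/* register_jprobe(&my_jprobe) arms it; unregister_jprobe(&my_jprobe) removes it. */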
742
743int __kprobes arch_trampoline_kprobe(struct kprobe *p)
744{
745 return 0;
746}
747
748int __init arch_init_kprobes(void)
749{
750 return 0;
751}
diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c
new file mode 100644
index 000000000000..e0b2d17f4f10
--- /dev/null
+++ b/arch/x86/kernel/ldt_32.c
@@ -0,0 +1,250 @@
1/*
2 * linux/arch/i386/kernel/ldt.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 */
7
8#include <linux/errno.h>
9#include <linux/sched.h>
10#include <linux/string.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <asm/ldt.h>
19#include <asm/desc.h>
20#include <asm/mmu_context.h>
21
22#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
23static void flush_ldt(void *null)
24{
25 if (current->active_mm)
26 load_LDT(&current->active_mm->context);
27}
28#endif
29
30static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
31{
32 void *oldldt;
33 void *newldt;
34 int oldsize;
35
36 if (mincount <= pc->size)
37 return 0;
38 oldsize = pc->size;
39 mincount = (mincount+511)&(~511);
40 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
41 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
42 else
43 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
44
45 if (!newldt)
46 return -ENOMEM;
47
48 if (oldsize)
49 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
50 oldldt = pc->ldt;
51 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
52 pc->ldt = newldt;
53 wmb();
54 pc->size = mincount;
55 wmb();
56
57 if (reload) {
58#ifdef CONFIG_SMP
59 cpumask_t mask;
60 preempt_disable();
61 load_LDT(pc);
62 mask = cpumask_of_cpu(smp_processor_id());
63 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
64 smp_call_function(flush_ldt, NULL, 1, 1);
65 preempt_enable();
66#else
67 load_LDT(pc);
68#endif
69 }
70 if (oldsize) {
71 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
72 vfree(oldldt);
73 else
74 kfree(oldldt);
75 }
76 return 0;
77}
78
79static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
80{
81 int err = alloc_ldt(new, old->size, 0);
82 if (err < 0)
83 return err;
84 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
85 return 0;
86}
87
88/*
89 * We do not have to muck with descriptors here; that is
90 * done in switch_mm() as needed.
91 */
92int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
93{
94 struct mm_struct * old_mm;
95 int retval = 0;
96
97 init_MUTEX(&mm->context.sem);
98 mm->context.size = 0;
99 old_mm = current->mm;
100 if (old_mm && old_mm->context.size > 0) {
101 down(&old_mm->context.sem);
102 retval = copy_ldt(&mm->context, &old_mm->context);
103 up(&old_mm->context.sem);
104 }
105 return retval;
106}
107
108/*
109 * No need to lock the MM as we are the last user
110 */
111void destroy_context(struct mm_struct *mm)
112{
113 if (mm->context.size) {
114 if (mm == current->active_mm)
115 clear_LDT();
116 if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
117 vfree(mm->context.ldt);
118 else
119 kfree(mm->context.ldt);
120 mm->context.size = 0;
121 }
122}
123
124static int read_ldt(void __user * ptr, unsigned long bytecount)
125{
126 int err;
127 unsigned long size;
128 struct mm_struct * mm = current->mm;
129
130 if (!mm->context.size)
131 return 0;
132 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
133 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
134
135 down(&mm->context.sem);
136 size = mm->context.size*LDT_ENTRY_SIZE;
137 if (size > bytecount)
138 size = bytecount;
139
140 err = 0;
141 if (copy_to_user(ptr, mm->context.ldt, size))
142 err = -EFAULT;
143 up(&mm->context.sem);
144 if (err < 0)
145 goto error_return;
146 if (size != bytecount) {
147 /* zero-fill the rest */
148 if (clear_user(ptr+size, bytecount-size) != 0) {
149 err = -EFAULT;
150 goto error_return;
151 }
152 }
153 return bytecount;
154error_return:
155 return err;
156}
157
158static int read_default_ldt(void __user * ptr, unsigned long bytecount)
159{
160 int err;
161 unsigned long size;
162
163 err = 0;
164 size = 5*sizeof(struct desc_struct);
165 if (size > bytecount)
166 size = bytecount;
167
168 err = size;
169 if (clear_user(ptr, size))
170 err = -EFAULT;
171
172 return err;
173}
174
175static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
176{
177 struct mm_struct * mm = current->mm;
178 __u32 entry_1, entry_2;
179 int error;
180 struct user_desc ldt_info;
181
182 error = -EINVAL;
183 if (bytecount != sizeof(ldt_info))
184 goto out;
185 error = -EFAULT;
186 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
187 goto out;
188
189 error = -EINVAL;
190 if (ldt_info.entry_number >= LDT_ENTRIES)
191 goto out;
192 if (ldt_info.contents == 3) {
193 if (oldmode)
194 goto out;
195 if (ldt_info.seg_not_present == 0)
196 goto out;
197 }
198
199 down(&mm->context.sem);
200 if (ldt_info.entry_number >= mm->context.size) {
201 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
202 if (error < 0)
203 goto out_unlock;
204 }
205
206 /* Allow LDTs to be cleared by the user. */
207 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
208 if (oldmode || LDT_empty(&ldt_info)) {
209 entry_1 = 0;
210 entry_2 = 0;
211 goto install;
212 }
213 }
214
215 entry_1 = LDT_entry_a(&ldt_info);
216 entry_2 = LDT_entry_b(&ldt_info);
217 if (oldmode)
218 entry_2 &= ~(1 << 20);
219
220 /* Install the new entry ... */
221install:
222 write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
223 error = 0;
224
225out_unlock:
226 up(&mm->context.sem);
227out:
228 return error;
229}
230
231asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
232{
233 int ret = -ENOSYS;
234
235 switch (func) {
236 case 0:
237 ret = read_ldt(ptr, bytecount);
238 break;
239 case 1:
240 ret = write_ldt(ptr, bytecount, 1);
241 break;
242 case 2:
243 ret = read_default_ldt(ptr, bytecount);
244 break;
245 case 0x11:
246 ret = write_ldt(ptr, bytecount, 0);
247 break;
248 }
249 return ret;
250}
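For reference, a minimal user-space sketch (assuming glibc and the i386 ABI; not part of this patch) that installs one LDT data segment through the path handled by write_ldt() above:

#include <asm/ldt.h>		/* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int install_ldt_entry(void *base, unsigned int pages)
{
	struct user_desc d;

	memset(&d, 0, sizeof(d));
	d.entry_number   = 0;
	d.base_addr      = (unsigned long) base;
	d.limit          = pages;
	d.seg_32bit      = 1;
	d.limit_in_pages = 1;
	d.contents       = 0;			/* plain data segment */

	/* func=1 hits write_ldt() above in oldmode; func=0x11 is the newer variant */
	return syscall(SYS_modify_ldt, 1, &d, sizeof(d));
}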
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
new file mode 100644
index 000000000000..91966bafb3dc
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -0,0 +1,171 @@
1/*
2 * machine_kexec.c - handle transition of Linux booting another kernel
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/kexec.h>
11#include <linux/delay.h>
12#include <linux/init.h>
13#include <asm/pgtable.h>
14#include <asm/pgalloc.h>
15#include <asm/tlbflush.h>
16#include <asm/mmu_context.h>
17#include <asm/io.h>
18#include <asm/apic.h>
19#include <asm/cpufeature.h>
20#include <asm/desc.h>
21#include <asm/system.h>
22
23#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
24static u32 kexec_pgd[1024] PAGE_ALIGNED;
25#ifdef CONFIG_X86_PAE
26static u32 kexec_pmd0[1024] PAGE_ALIGNED;
27static u32 kexec_pmd1[1024] PAGE_ALIGNED;
28#endif
29static u32 kexec_pte0[1024] PAGE_ALIGNED;
30static u32 kexec_pte1[1024] PAGE_ALIGNED;
31
32static void set_idt(void *newidt, __u16 limit)
33{
34 struct Xgt_desc_struct curidt;
35
36	/* ia32 supports unaligned loads & stores */
37 curidt.size = limit;
38 curidt.address = (unsigned long)newidt;
39
40 load_idt(&curidt);
41};
42
43
44static void set_gdt(void *newgdt, __u16 limit)
45{
46 struct Xgt_desc_struct curgdt;
47
48 /* ia32 supports unaligned loads & stores */
49 curgdt.size = limit;
50 curgdt.address = (unsigned long)newgdt;
51
52 load_gdt(&curgdt);
53};
54
55static void load_segments(void)
56{
57#define __STR(X) #X
58#define STR(X) __STR(X)
59
60 __asm__ __volatile__ (
61 "\tljmp $"STR(__KERNEL_CS)",$1f\n"
62 "\t1:\n"
63 "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
64 "\tmovl %%eax,%%ds\n"
65 "\tmovl %%eax,%%es\n"
66 "\tmovl %%eax,%%fs\n"
67 "\tmovl %%eax,%%gs\n"
68 "\tmovl %%eax,%%ss\n"
69 ::: "eax", "memory");
70#undef STR
71#undef __STR
72}
73
74/*
75 * An architecture hook called to validate the
76 * proposed image and prepare the control pages
77 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE
78 * have been allocated, but the segments have not yet
79 * been copied into the kernel.
80 *
81 * Do whatever setup is needed on the image and the
82 * reboot code buffer to allow us to avoid allocations
83 * later.
84 *
85 * Currently nothing is done.
86 */
87int machine_kexec_prepare(struct kimage *image)
88{
89 return 0;
90}
91
92/*
93 * Undo anything leftover by machine_kexec_prepare
94 * when an image is freed.
95 */
96void machine_kexec_cleanup(struct kimage *image)
97{
98}
99
100/*
101 * Do not allocate memory (or fail in any way) in machine_kexec().
102 * We are past the point of no return, committed to rebooting now.
103 */
104NORET_TYPE void machine_kexec(struct kimage *image)
105{
106 unsigned long page_list[PAGES_NR];
107 void *control_page;
108
109 /* Interrupts aren't acceptable while we reboot */
110 local_irq_disable();
111
112 control_page = page_address(image->control_code_page);
113 memcpy(control_page, relocate_kernel, PAGE_SIZE);
114
115 page_list[PA_CONTROL_PAGE] = __pa(control_page);
116 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
117 page_list[PA_PGD] = __pa(kexec_pgd);
118 page_list[VA_PGD] = (unsigned long)kexec_pgd;
119#ifdef CONFIG_X86_PAE
120 page_list[PA_PMD_0] = __pa(kexec_pmd0);
121 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
122 page_list[PA_PMD_1] = __pa(kexec_pmd1);
123 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
124#endif
125 page_list[PA_PTE_0] = __pa(kexec_pte0);
126 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
127 page_list[PA_PTE_1] = __pa(kexec_pte1);
128 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
129
130	/* The segment registers are funny things: they have both a
131	 * visible and an invisible part. Whenever the visible part is
132	 * set to a specific selector, the invisible part is loaded
133	 * from a table in memory. At no other time is the
134 * descriptor table in memory accessed.
135 *
136 * I take advantage of this here by force loading the
137 * segments, before I zap the gdt with an invalid value.
138 */
139 load_segments();
140 /* The gdt & idt are now invalid.
141 * If you want to load them you must set up your own idt & gdt.
142 */
143 set_gdt(phys_to_virt(0),0);
144 set_idt(phys_to_virt(0),0);
145
146 /* now call it */
147 relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
148 image->start, cpu_has_pae);
149}
150
151/* crashkernel=size@addr specifies the location to reserve for
152 * a crash kernel. By reserving this memory we guarantee
153 * that Linux never sets it up as a DMA target.
154 * Useful for holding code to do something appropriate
155 * after a kernel panic.
156 */
157static int __init parse_crashkernel(char *arg)
158{
159 unsigned long size, base;
160 size = memparse(arg, &arg);
161 if (*arg == '@') {
162 base = memparse(arg+1, &arg);
163 /* FIXME: Do I want a sanity check
164 * to validate the memory range?
165 */
166 crashk_res.start = base;
167 crashk_res.end = base + size - 1;
168 }
169 return 0;
170}
171early_param("crashkernel", parse_crashkernel);
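As an illustrative example of the option parsed above, booting with

    crashkernel=64M@16M

asks for 64 MB of memory to be reserved at physical address 16 MB for the crash (capture) kernel; memparse() accepts the usual K/M/G suffixes for both the size and the base.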
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
new file mode 100644
index 000000000000..b83672b89527
--- /dev/null
+++ b/arch/x86/kernel/mca_32.c
@@ -0,0 +1,470 @@
1/*
2 * linux/arch/i386/kernel/mca.c
3 * Written by Martin Kolinek, February 1996
4 *
5 * Changes:
6 *
7 * Chris Beauregard July 28th, 1996
8 * - Fixed up integrated SCSI detection
9 *
10 * Chris Beauregard August 3rd, 1996
11 * - Made mca_info local
12 * - Made integrated registers accessible through standard function calls
13 * - Added name field
14 * - More sanity checking
15 *
16 * Chris Beauregard August 9th, 1996
17 * - Rewrote /proc/mca
18 *
19 * Chris Beauregard January 7th, 1997
20 * - Added basic NMI-processing
21 * - Added more information to mca_info structure
22 *
23 * David Weinehall October 12th, 1998
24 * - Made a lot of cleaning up in the source
25 * - Added use of save_flags / restore_flags
26 * - Added the 'driver_loaded' flag in MCA_adapter
27 * - Added an alternative implementation of ZP Gu's mca_find_unused_adapter
28 *
29 * David Weinehall March 24th, 1999
30 * - Fixed the output of 'Driver Installed' in /proc/mca/pos
31 * - Made the Integrated Video & SCSI show up even if they have id 0000
32 *
33 * Alexander Viro November 9th, 1999
34 * - Switched to regular procfs methods
35 *
36 * Alfred Arnold & David Weinehall August 23rd, 2000
37 * - Added support for Planar POS-registers
38 */
39
40#include <linux/module.h>
41#include <linux/types.h>
42#include <linux/errno.h>
43#include <linux/kernel.h>
44#include <linux/mca.h>
45#include <linux/kprobes.h>
46#include <asm/system.h>
47#include <asm/io.h>
48#include <linux/proc_fs.h>
49#include <linux/mman.h>
50#include <linux/mm.h>
51#include <linux/pagemap.h>
52#include <linux/ioport.h>
53#include <asm/uaccess.h>
54#include <linux/init.h>
55#include <asm/arch_hooks.h>
56
57static unsigned char which_scsi = 0;
58
59int MCA_bus = 0;
60EXPORT_SYMBOL(MCA_bus);
61
62/*
63 * Motherboard register spinlock. Untested on SMP at the moment, but
64 * are there any MCA SMP boxes?
65 *
66 * Yes - Alan
67 */
68static DEFINE_SPINLOCK(mca_lock);
69
70/* Build the status info for the adapter */
71
72static void mca_configure_adapter_status(struct mca_device *mca_dev) {
73 mca_dev->status = MCA_ADAPTER_NONE;
74
75 mca_dev->pos_id = mca_dev->pos[0]
76 + (mca_dev->pos[1] << 8);
77
78 if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
79
80 /* id = 0x0000 usually indicates hardware failure,
81		 * however, ZP Gu <zpg@castle.net> reports that his 9556
82		 * has 0x0000 as id and everything still works. There
83		 * also seems to be an adapter with id = 0x0000; the
84 * NCR Parallel Bus Memory Card. Until this is confirmed,
85 * however, this code will stay.
86 */
87
88 mca_dev->status = MCA_ADAPTER_ERROR;
89
90 return;
91 } else if(mca_dev->pos_id != 0xffff) {
92
93 /* 0xffff usually indicates that there's no adapter,
94 * however, some integrated adapters may have 0xffff as
95 * their id and still be valid. Examples are on-board
96 * VGA of the 55sx, the integrated SCSI of the 56 & 57,
97 * and possibly also the 95 ULTIMEDIA.
98 */
99
100 mca_dev->status = MCA_ADAPTER_NORMAL;
101 }
102
103 if((mca_dev->pos_id == 0xffff ||
104 mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
105 int j;
106
107 for(j = 2; j < 8; j++) {
108 if(mca_dev->pos[j] != 0xff) {
109 mca_dev->status = MCA_ADAPTER_NORMAL;
110 break;
111 }
112 }
113 }
114
115 if(!(mca_dev->pos[2] & MCA_ENABLED)) {
116
117 /* enabled bit is in POS 2 */
118
119 mca_dev->status = MCA_ADAPTER_DISABLED;
120 }
121} /* mca_configure_adapter_status */
122
123/*--------------------------------------------------------------------*/
124
125static struct resource mca_standard_resources[] = {
126 { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
127 { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
128 { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
129 { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
130 { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
131 { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
132 { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
133};
134
135#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
136
137/**
138 * mca_read_and_store_pos - read the POS registers into a memory buffer
139 * @pos: a char pointer to 8 bytes, contains the POS register value on
140 * successful return
141 *
142 * Returns 1 if a card actually exists (i.e. the pos isn't
143 * all 0xff) or 0 otherwise
144 */
145static int mca_read_and_store_pos(unsigned char *pos) {
146 int j;
147 int found = 0;
148
149 for(j=0; j<8; j++) {
150 if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) {
151 /* 0xff all across means no device. 0x00 means
152 * something's broken, but a device is
153 * probably there. However, if you get 0x00
154 * from a motherboard register it won't matter
155 * what we find. For the record, on the
156 * 57SLC, the integrated SCSI adapter has
157 * 0xffff for the adapter ID, but nonzero for
158 * other registers. */
159
160 found = 1;
161 }
162 }
163 return found;
164}
165
166static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
167{
168 unsigned char byte;
169 unsigned long flags;
170
171 if(reg < 0 || reg >= 8)
172 return 0;
173
174 spin_lock_irqsave(&mca_lock, flags);
175 if(mca_dev->pos_register) {
176 /* Disable adapter setup, enable motherboard setup */
177
178 outb_p(0, MCA_ADAPTER_SETUP_REG);
179 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
180
181 byte = inb_p(MCA_POS_REG(reg));
182 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
183 } else {
184
185 /* Make sure motherboard setup is off */
186
187 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
188
189 /* Read the appropriate register */
190
191 outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
192 byte = inb_p(MCA_POS_REG(reg));
193 outb_p(0, MCA_ADAPTER_SETUP_REG);
194 }
195 spin_unlock_irqrestore(&mca_lock, flags);
196
197 mca_dev->pos[reg] = byte;
198
199 return byte;
200}
201
202static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
203 unsigned char byte)
204{
205 unsigned long flags;
206
207 if(reg < 0 || reg >= 8)
208 return;
209
210 spin_lock_irqsave(&mca_lock, flags);
211
212 /* Make sure motherboard setup is off */
213
214 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
215
216 /* Read in the appropriate register */
217
218 outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
219 outb_p(byte, MCA_POS_REG(reg));
220 outb_p(0, MCA_ADAPTER_SETUP_REG);
221
222 spin_unlock_irqrestore(&mca_lock, flags);
223
224 /* Update the global register list, while we have the byte */
225
226 mca_dev->pos[reg] = byte;
227
228}
229
230/* for the primary MCA bus, we have identity transforms */
231static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq)
232{
233 return irq;
234}
235
236static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port)
237{
238 return port;
239}
240
241static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem)
242{
243 return mem;
244}
245
246
247static int __init mca_init(void)
248{
249 unsigned int i, j;
250 struct mca_device *mca_dev;
251 unsigned char pos[8];
252 short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
253 struct mca_bus *bus;
254
255 /* WARNING: Be careful when making changes here. Putting an adapter
256 * and the motherboard simultaneously into setup mode may result in
257	 * damage to chips (according to The Indispensable PC Hardware Book
258 * by Hans-Peter Messmer). Also, we disable system interrupts (so
259 * that we are not disturbed in the middle of this).
260 */
261
262 /* Make sure the MCA bus is present */
263
264 if (mca_system_init()) {
265 printk(KERN_ERR "MCA bus system initialisation failed\n");
266 return -ENODEV;
267 }
268
269 if (!MCA_bus)
270 return -ENODEV;
271
272 printk(KERN_INFO "Micro Channel bus detected.\n");
273
274 /* All MCA systems have at least a primary bus */
275 bus = mca_attach_bus(MCA_PRIMARY_BUS);
276 if (!bus)
277 goto out_nomem;
278 bus->default_dma_mask = 0xffffffffLL;
279 bus->f.mca_write_pos = mca_pc_write_pos;
280 bus->f.mca_read_pos = mca_pc_read_pos;
281 bus->f.mca_transform_irq = mca_dummy_transform_irq;
282 bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
283 bus->f.mca_transform_memory = mca_dummy_transform_memory;
284
285 /* get the motherboard device */
286 mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
287 if(unlikely(!mca_dev))
288 goto out_nomem;
289
290 /*
291 * We do not expect many MCA interrupts during initialization,
292 * but let us be safe:
293 */
294 spin_lock_irq(&mca_lock);
295
296 /* Make sure adapter setup is off */
297
298 outb_p(0, MCA_ADAPTER_SETUP_REG);
299
300 /* Read motherboard POS registers */
301
302 mca_dev->pos_register = 0x7f;
303 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
304 mca_dev->name[0] = 0;
305 mca_read_and_store_pos(mca_dev->pos);
306 mca_configure_adapter_status(mca_dev);
307 /* fake POS and slot for a motherboard */
308 mca_dev->pos_id = MCA_MOTHERBOARD_POS;
309 mca_dev->slot = MCA_MOTHERBOARD;
310 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
311
312 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
313 if(unlikely(!mca_dev))
314 goto out_unlock_nomem;
315
316 /* Put motherboard into video setup mode, read integrated video
317 * POS registers, and turn motherboard setup off.
318 */
319
320 mca_dev->pos_register = 0xdf;
321 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
322 mca_dev->name[0] = 0;
323 mca_read_and_store_pos(mca_dev->pos);
324 mca_configure_adapter_status(mca_dev);
325 /* fake POS and slot for the integrated video */
326 mca_dev->pos_id = MCA_INTEGVIDEO_POS;
327 mca_dev->slot = MCA_INTEGVIDEO;
328 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
329
330 /* Put motherboard into scsi setup mode, read integrated scsi
331 * POS registers, and turn motherboard setup off.
332 *
333 * It seems there are two possible SCSI registers. Martin says that
334 * for the 56,57, 0xf7 is the one, but fails on the 76.
335 * Alfredo (apena@vnet.ibm.com) says
336 * 0xfd works on his machine. We'll try both of them. I figure it's
337 * a good bet that only one could be valid at a time. This could
338 * screw up though if one is used for something else on the other
339 * machine.
340 */
341
342 for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
343 outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
344 if(mca_read_and_store_pos(pos))
345 break;
346 }
347 if(which_scsi) {
348 /* found a scsi card */
349 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
350 if(unlikely(!mca_dev))
351 goto out_unlock_nomem;
352
353 for(j = 0; j < 8; j++)
354 mca_dev->pos[j] = pos[j];
355
356 mca_configure_adapter_status(mca_dev);
357 /* fake POS and slot for integrated SCSI controller */
358 mca_dev->pos_id = MCA_INTEGSCSI_POS;
359 mca_dev->slot = MCA_INTEGSCSI;
360 mca_dev->pos_register = which_scsi;
361 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
362 }
363
364 /* Turn off motherboard setup */
365
366 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
367
368 /* Now loop over MCA slots: put each adapter into setup mode, and
369 * read its POS registers. Then put adapter setup off.
370 */
371
372 for(i=0; i<MCA_MAX_SLOT_NR; i++) {
373 outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
374 if(!mca_read_and_store_pos(pos))
375 continue;
376
377 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
378 if(unlikely(!mca_dev))
379 goto out_unlock_nomem;
380
381 for(j=0; j<8; j++)
382 mca_dev->pos[j]=pos[j];
383
384 mca_dev->driver_loaded = 0;
385 mca_dev->slot = i;
386 mca_dev->pos_register = 0;
387 mca_configure_adapter_status(mca_dev);
388 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
389 }
390 outb_p(0, MCA_ADAPTER_SETUP_REG);
391
392 /* Enable interrupts and return memory start */
393 spin_unlock_irq(&mca_lock);
394
395 for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
396 request_resource(&ioport_resource, mca_standard_resources + i);
397
398 mca_do_proc_init();
399
400 return 0;
401
402 out_unlock_nomem:
403 spin_unlock_irq(&mca_lock);
404 out_nomem:
405 printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
406 return -ENOMEM;
407}
408
409subsys_initcall(mca_init);
410
411/*--------------------------------------------------------------------*/
412
413static __kprobes void
414mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
415{
416 int slot = mca_dev->slot;
417
418 if(slot == MCA_INTEGSCSI) {
419 printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
420 mca_dev->name);
421 } else if(slot == MCA_INTEGVIDEO) {
422 printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
423 mca_dev->name);
424 } else if(slot == MCA_MOTHERBOARD) {
425 printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
426 mca_dev->name);
427 }
428
429 /* More info available in POS 6 and 7? */
430
431 if(check_flag) {
432 unsigned char pos6, pos7;
433
434 pos6 = mca_device_read_pos(mca_dev, 6);
435 pos7 = mca_device_read_pos(mca_dev, 7);
436
437 printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
438 }
439
440} /* mca_handle_nmi_slot */
441
442/*--------------------------------------------------------------------*/
443
444static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
445{
446 struct mca_device *mca_dev = to_mca_device(dev);
447 unsigned char pos5;
448
449 pos5 = mca_device_read_pos(mca_dev, 5);
450
451 if(!(pos5 & 0x80)) {
452 /* Bit 7 of POS 5 is reset when this adapter has a hardware
453		 * error. Bit 6 is reset if there's error information
454 * available in POS 6 and 7.
455 */
456 mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
457 return 1;
458 }
459 return 0;
460}
461
462void __kprobes mca_handle_nmi(void)
463{
464 /* First try - scan the various adapters and see if a specific
465 * adapter was responsible for the error.
466 */
467 bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
468
469 mca_nmi_hook();
470} /* mca_handle_nmi */
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
new file mode 100644
index 000000000000..09cf78110358
--- /dev/null
+++ b/arch/x86/kernel/microcode.c
@@ -0,0 +1,850 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
12 * Order Number 245472 or free download from:
13 *
14 * http://developer.intel.com/design/pentium4/manuals/245472.htm
15 *
16 * For more information, go to http://www.urbanmyth.org/microcode
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 *
23 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
24 * Initial release.
25 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
26 * Added read() support + cleanups.
27 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
28 * Added 'device trimming' support. open(O_WRONLY) zeroes
29 * and frees the saved copy of applied microcode.
30 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
31 * Made to use devfs (/dev/cpu/microcode) + cleanups.
32 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
33 * Added misc device support (now uses both devfs and misc).
34 * Added MICROCODE_IOCFREE ioctl to clear memory.
35 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
36 * Messages for error cases (non Intel & no suitable microcode).
37 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
38 * Removed ->release(). Removed exclusive open and status bitmap.
39 * Added microcode_rwsem to serialize read()/write()/ioctl().
40 * Removed global kernel lock usage.
41 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
42 * Write 0 to 0x8B msr and then cpuid before reading revision,
43 * so that it works even if there were no update done by the
44 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
45 * to be 0 on my machine which is why it worked even when I
46 * disabled update by the BIOS)
47 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
48 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
49 * Tigran Aivazian <tigran@veritas.com>
50 * Intel Pentium 4 processor support and bugfixes.
51 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
52 * Bugfix for HT (Hyper-Threading) enabled processors
53 * whereby processor resources are shared by all logical processors
54 * in a single CPU package.
55 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
56 * Tigran Aivazian <tigran@veritas.com>,
57 * Serialize updates as required on HT processors due to speculative
58 * nature of implementation.
59 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
60 * Fix the panic when writing zero-length microcode chunk.
61 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
62 * Jun Nakajima <jun.nakajima@intel.com>
63 * Support for the microcode updates in the new format.
64 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
65 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
66 * because we no longer hold a copy of applied microcode
67 * in kernel memory.
68 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
69 * Fix sigmatch() macro to handle old CPUs with pf == 0.
70 * Thanks to Stuart Swales for pointing out this bug.
71 */
72
73//#define DEBUG /* pr_debug */
74#include <linux/capability.h>
75#include <linux/kernel.h>
76#include <linux/init.h>
77#include <linux/sched.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94
95MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
96MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
97MODULE_LICENSE("GPL");
98
99#define MICROCODE_VERSION "1.14a"
100
101#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
102#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
103#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
104#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
105#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
106#define DWSIZE (sizeof (u32))
107#define get_totalsize(mc) \
108 (((microcode_t *)mc)->hdr.totalsize ? \
109 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
110#define get_datasize(mc) \
111 (((microcode_t *)mc)->hdr.datasize ? \
112 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
113
114#define sigmatch(s1, s2, p1, p2) \
115 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
116
117#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
118
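/*
 * A brief illustration of the macros above (values are hypothetical,
 * for orientation only): a totalsize or datasize field of zero means
 * "use the Intel defaults" of 2000 data bytes plus the 48-byte header,
 * and sigmatch() accepts a pair when the signatures are equal and the
 * platform-flag masks either intersect or are both zero (old CPUs
 * report pf == 0):
 *
 *	get_totalsize(mc);			// 2048 if mc->hdr.totalsize == 0
 *	get_datasize(mc);			// 2000 if mc->hdr.datasize == 0
 *	sigmatch(0x6f6, 0x6f6, 0x20, 0x20);	// true: pf bits overlap
 *	sigmatch(0x6f6, 0x6f6, 0x00, 0x00);	// true: both pf == 0
 *	sigmatch(0x6f6, 0x6f6, 0x20, 0x01);	// false: disjoint pf masks
 */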
119/* serialize access to the physical write to MSR 0x79 */
120static DEFINE_SPINLOCK(microcode_update_lock);
121
122/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
123static DEFINE_MUTEX(microcode_mutex);
124
125static struct ucode_cpu_info {
126 int valid;
127 unsigned int sig;
128 unsigned int pf;
129 unsigned int rev;
130 microcode_t *mc;
131} ucode_cpu_info[NR_CPUS];
132
133static void collect_cpu_info(int cpu_num)
134{
135 struct cpuinfo_x86 *c = cpu_data + cpu_num;
136 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
137 unsigned int val[2];
138
139 /* We should bind the task to the CPU */
140 BUG_ON(raw_smp_processor_id() != cpu_num);
141 uci->pf = uci->rev = 0;
142 uci->mc = NULL;
143 uci->valid = 1;
144
145 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
146 cpu_has(c, X86_FEATURE_IA64)) {
147 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
148 "processor\n", cpu_num);
149 uci->valid = 0;
150 return;
151 }
152
153 uci->sig = cpuid_eax(0x00000001);
154
155 if ((c->x86_model >= 5) || (c->x86 > 6)) {
156 /* get processor flags from MSR 0x17 */
157 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
158 uci->pf = 1 << ((val[1] >> 18) & 7);
159 }
160
161 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
162 /* see notes above for revision 1.07. Apparent chip bug */
163 sync_core();
164 /* get the current revision from MSR 0x8B */
165 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
166 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
167 uci->sig, uci->pf, uci->rev);
168}
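/*
 * Note on the processor flags computed above: IA32_PLATFORM_ID (MSR
 * 0x17) carries a 3-bit platform ID in bits 52:50.  val[1] is the high
 * dword of the MSR, so ((val[1] >> 18) & 7) extracts those bits and
 * uci->pf becomes a one-hot mask (1 << platform_id) that is later
 * matched against the pf bitmask carried in each microcode header.
 */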
169
170static inline int microcode_update_match(int cpu_num,
171 microcode_header_t *mc_header, int sig, int pf)
172{
173 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
174
175 if (!sigmatch(sig, uci->sig, pf, uci->pf)
176 || mc_header->rev <= uci->rev)
177 return 0;
178 return 1;
179}
180
181static int microcode_sanity_check(void *mc)
182{
183 microcode_header_t *mc_header = mc;
184 struct extended_sigtable *ext_header = NULL;
185 struct extended_signature *ext_sig;
186 unsigned long total_size, data_size, ext_table_size;
187 int sum, orig_sum, ext_sigcount = 0, i;
188
189 total_size = get_totalsize(mc_header);
190 data_size = get_datasize(mc_header);
191 if (data_size + MC_HEADER_SIZE > total_size) {
192 printk(KERN_ERR "microcode: error! "
193 "Bad data size in microcode data file\n");
194 return -EINVAL;
195 }
196
197 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
198 printk(KERN_ERR "microcode: error! "
199 "Unknown microcode update format\n");
200 return -EINVAL;
201 }
202 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
203 if (ext_table_size) {
204 if ((ext_table_size < EXT_HEADER_SIZE)
205 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
206 printk(KERN_ERR "microcode: error! "
207 "Small exttable size in microcode data file\n");
208 return -EINVAL;
209 }
210 ext_header = mc + MC_HEADER_SIZE + data_size;
211 if (ext_table_size != exttable_size(ext_header)) {
212 printk(KERN_ERR "microcode: error! "
213 "Bad exttable size in microcode data file\n");
214 return -EFAULT;
215 }
216 ext_sigcount = ext_header->count;
217 }
218
219 /* check extended table checksum */
220 if (ext_table_size) {
221 int ext_table_sum = 0;
222 int *ext_tablep = (int *)ext_header;
223
224 i = ext_table_size / DWSIZE;
225 while (i--)
226 ext_table_sum += ext_tablep[i];
227 if (ext_table_sum) {
228 printk(KERN_WARNING "microcode: aborting, "
229 "bad extended signature table checksum\n");
230 return -EINVAL;
231 }
232 }
233
234 /* calculate the checksum */
235 orig_sum = 0;
236 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
237 while (i--)
238 orig_sum += ((int *)mc)[i];
239 if (orig_sum) {
240 printk(KERN_ERR "microcode: aborting, bad checksum\n");
241 return -EINVAL;
242 }
243 if (!ext_table_size)
244 return 0;
245 /* check extended signature checksum */
246 for (i = 0; i < ext_sigcount; i++) {
247 ext_sig = (struct extended_signature *)((void *)ext_header
248 + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
249 sum = orig_sum
250 - (mc_header->sig + mc_header->pf + mc_header->cksum)
251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
252 if (sum) {
253 printk(KERN_ERR "microcode: aborting, bad checksum\n");
254 return -EINVAL;
255 }
256 }
257 return 0;
258}
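/*
 * On the extended-signature check above: every 32-bit word of the
 * header + data region must sum to zero, and each extended signature
 * entry carries a cksum chosen so that substituting its (sig, pf,
 * cksum) triple for the primary header's triple keeps that sum at
 * zero.  Hence the per-entry test:
 *
 *	sum = orig_sum
 *		- (mc_header->sig + mc_header->pf + mc_header->cksum)
 *		+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
 *	// a matching table entry must leave sum == 0
 */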
259
260/*
261 * return 0 - no update found
262 * return 1 - found update
263 * return < 0 - error
264 */
265static int get_maching_microcode(void *mc, int cpu)
266{
267 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
268 microcode_header_t *mc_header = mc;
269 struct extended_sigtable *ext_header;
270 unsigned long total_size = get_totalsize(mc_header);
271 int ext_sigcount, i;
272 struct extended_signature *ext_sig;
273 void *new_mc;
274
275 if (microcode_update_match(cpu, mc_header,
276 mc_header->sig, mc_header->pf))
277 goto find;
278
279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
280 return 0;
281
282 ext_header = (struct extended_sigtable *)(mc +
283 get_datasize(mc_header) + MC_HEADER_SIZE);
284 ext_sigcount = ext_header->count;
285 ext_sig = (struct extended_signature *)((void *)ext_header
286 + EXT_HEADER_SIZE);
287 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf))
290 goto find;
291 ext_sig++;
292 }
293 return 0;
294find:
295 pr_debug("microcode: CPU %d found a matching microcode update with"
296 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
297 new_mc = vmalloc(total_size);
298 if (!new_mc) {
299 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
300 return -ENOMEM;
301 }
302
303 /* free previous update file */
304 vfree(uci->mc);
305
306 memcpy(new_mc, mc, total_size);
307 uci->mc = new_mc;
308 return 1;
309}
310
311static void apply_microcode(int cpu)
312{
313 unsigned long flags;
314 unsigned int val[2];
315 int cpu_num = raw_smp_processor_id();
316 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
317
318 /* We should bind the task to the CPU */
319 BUG_ON(cpu_num != cpu);
320
321 if (uci->mc == NULL)
322 return;
323
324 /* serialize access to the physical write to MSR 0x79 */
325 spin_lock_irqsave(&microcode_update_lock, flags);
326
327 /* write microcode via MSR 0x79 */
328 wrmsr(MSR_IA32_UCODE_WRITE,
329 (unsigned long) uci->mc->bits,
330 (unsigned long) uci->mc->bits >> 16 >> 16);
331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
332
333 /* see notes above for revision 1.07. Apparent chip bug */
334 sync_core();
335
336 /* get the current revision from MSR 0x8B */
337 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
338
339 spin_unlock_irqrestore(&microcode_update_lock, flags);
340 if (val[1] != uci->mc->hdr.rev) {
341		printk(KERN_ERR "microcode: CPU%d update from revision "
342 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
343 return;
344 }
345 pr_debug("microcode: CPU%d updated from revision "
346 "0x%x to 0x%x, date = %08x \n",
347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
348 uci->rev = val[1];
349}
350
351#ifdef CONFIG_MICROCODE_OLD_INTERFACE
352static void __user *user_buffer; /* user area microcode data buffer */
353static unsigned int user_buffer_size;	/* its size */
354
355static long get_next_ucode(void **mc, long offset)
356{
357 microcode_header_t mc_header;
358 unsigned long total_size;
359
360 /* No more data */
361 if (offset >= user_buffer_size)
362 return 0;
363 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
364 printk(KERN_ERR "microcode: error! Can not read user data\n");
365 return -EFAULT;
366 }
367 total_size = get_totalsize(&mc_header);
368 if (offset + total_size > user_buffer_size) {
369 printk(KERN_ERR "microcode: error! Bad total size in microcode "
370 "data file\n");
371 return -EINVAL;
372 }
373 *mc = vmalloc(total_size);
374 if (!*mc)
375 return -ENOMEM;
376 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
377 printk(KERN_ERR "microcode: error! Can not read user data\n");
378 vfree(*mc);
379 return -EFAULT;
380 }
381 return offset + total_size;
382}
383
384static int do_microcode_update (void)
385{
386 long cursor = 0;
387 int error = 0;
388 void *new_mc = NULL;
389 int cpu;
390 cpumask_t old;
391
392 old = current->cpus_allowed;
393
394 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
395 error = microcode_sanity_check(new_mc);
396 if (error)
397 goto out;
398 /*
399 * It's possible the data file has multiple matching ucode,
400		 * so keep searching until we find the latest version
401 */
402 for_each_online_cpu(cpu) {
403 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
404
405 if (!uci->valid)
406 continue;
407 set_cpus_allowed(current, cpumask_of_cpu(cpu));
408 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0)
410 goto out;
411 if (error == 1)
412 apply_microcode(cpu);
413 }
414 vfree(new_mc);
415 }
416out:
417 if (cursor > 0)
418 vfree(new_mc);
419 if (cursor < 0)
420 error = cursor;
421 set_cpus_allowed(current, old);
422 return error;
423}
424
425static int microcode_open (struct inode *unused1, struct file *unused2)
426{
427 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
428}
429
430static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
431{
432 ssize_t ret;
433
434 if ((len >> PAGE_SHIFT) > num_physpages) {
435 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
436 return -EINVAL;
437 }
438
439 lock_cpu_hotplug();
440 mutex_lock(&microcode_mutex);
441
442 user_buffer = (void __user *) buf;
443 user_buffer_size = (int) len;
444
445 ret = do_microcode_update();
446 if (!ret)
447 ret = (ssize_t)len;
448
449 mutex_unlock(&microcode_mutex);
450 unlock_cpu_hotplug();
451
452 return ret;
453}
454
455static const struct file_operations microcode_fops = {
456 .owner = THIS_MODULE,
457 .write = microcode_write,
458 .open = microcode_open,
459};
460
461static struct miscdevice microcode_dev = {
462 .minor = MICROCODE_MINOR,
463 .name = "microcode",
464 .fops = &microcode_fops,
465};
466
467static int __init microcode_dev_init (void)
468{
469 int error;
470
471 error = misc_register(&microcode_dev);
472 if (error) {
473 printk(KERN_ERR
474 "microcode: can't misc_register on minor=%d\n",
475 MICROCODE_MINOR);
476 return error;
477 }
478
479 return 0;
480}
481
482static void microcode_dev_exit (void)
483{
484 misc_deregister(&microcode_dev);
485}
486
487MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
488#else
489#define microcode_dev_init() 0
490#define microcode_dev_exit() do { } while(0)
491#endif
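/*
 * For orientation, the old interface above is driven entirely from
 * user space by writing a raw microcode image to the character device
 * (conventionally exposed as /dev/cpu/microcode).  A minimal,
 * hypothetical sketch -- real updaters such as microcode_ctl do the
 * same with proper error handling:
 *
 *	int fd = open("/dev/cpu/microcode", O_WRONLY);	// needs CAP_SYS_RAWIO
 *	if (fd >= 0) {
 *		write(fd, image, image_size);		// concatenated ucode blobs
 *		close(fd);
 *	}
 */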
492
493static long get_next_ucode_from_buffer(void **mc, void *buf,
494 unsigned long size, long offset)
495{
496 microcode_header_t *mc_header;
497 unsigned long total_size;
498
499 /* No more data */
500 if (offset >= size)
501 return 0;
502 mc_header = (microcode_header_t *)(buf + offset);
503 total_size = get_totalsize(mc_header);
504
505 if (offset + total_size > size) {
506 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
507 return -EINVAL;
508 }
509
510 *mc = vmalloc(total_size);
511 if (!*mc) {
512 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
513 return -ENOMEM;
514 }
515 memcpy(*mc, buf + offset, total_size);
516 return offset + total_size;
517}
518
519/* fake device for request_firmware */
520static struct platform_device *microcode_pdev;
521
522static int cpu_request_microcode(int cpu)
523{
524 char name[30];
525 struct cpuinfo_x86 *c = cpu_data + cpu;
526 const struct firmware *firmware;
527 void *buf;
528 unsigned long size;
529 long offset = 0;
530 int error;
531 void *mc;
532
533 /* We should bind the task to the CPU */
534 BUG_ON(cpu != raw_smp_processor_id());
535 sprintf(name,"intel-ucode/%02x-%02x-%02x",
536 c->x86, c->x86_model, c->x86_mask);
537 error = request_firmware(&firmware, name, &microcode_pdev->dev);
538 if (error) {
539 pr_debug("ucode data file %s load failed\n", name);
540 return error;
541 }
542 buf = (void *)firmware->data;
543 size = firmware->size;
544 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
545 > 0) {
546 error = microcode_sanity_check(mc);
547 if (error)
548 break;
549 error = get_maching_microcode(mc, cpu);
550 if (error < 0)
551 break;
552 /*
553 * It's possible the data file has multiple matching ucode,
554		 * so keep searching until we find the latest version
555 */
556 if (error == 1) {
557 apply_microcode(cpu);
558 error = 0;
559 }
560 vfree(mc);
561 }
562 if (offset > 0)
563 vfree(mc);
564 if (offset < 0)
565 error = offset;
566 release_firmware(firmware);
567
568 return error;
569}
570
571static int apply_microcode_check_cpu(int cpu)
572{
573 struct cpuinfo_x86 *c = cpu_data + cpu;
574 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
575 cpumask_t old;
576 unsigned int val[2];
577 int err = 0;
578
579 /* Check if the microcode is available */
580 if (!uci->mc)
581 return 0;
582
583 old = current->cpus_allowed;
584 set_cpus_allowed(current, cpumask_of_cpu(cpu));
585
586 /* Check if the microcode we have in memory matches the CPU */
587 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
588 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
589 err = -EINVAL;
590
591 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
592 /* get processor flags from MSR 0x17 */
593 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
594 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
595 err = -EINVAL;
596 }
597
598 if (!err) {
599 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
600 /* see notes above for revision 1.07. Apparent chip bug */
601 sync_core();
602 /* get the current revision from MSR 0x8B */
603 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
604 if (uci->rev != val[1])
605 err = -EINVAL;
606 }
607
608 if (!err)
609 apply_microcode(cpu);
610 else
611 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
612 " sig=0x%x, pf=0x%x, rev=0x%x\n",
613 cpu, uci->sig, uci->pf, uci->rev);
614
615 set_cpus_allowed(current, old);
616 return err;
617}
618
619static void microcode_init_cpu(int cpu, int resume)
620{
621 cpumask_t old;
622 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
623
624 old = current->cpus_allowed;
625
626 set_cpus_allowed(current, cpumask_of_cpu(cpu));
627 mutex_lock(&microcode_mutex);
628 collect_cpu_info(cpu);
629 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
630 cpu_request_microcode(cpu);
631 mutex_unlock(&microcode_mutex);
632 set_cpus_allowed(current, old);
633}
634
635static void microcode_fini_cpu(int cpu)
636{
637 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
638
639 mutex_lock(&microcode_mutex);
640 uci->valid = 0;
641 vfree(uci->mc);
642 uci->mc = NULL;
643 mutex_unlock(&microcode_mutex);
644}
645
646static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
647{
648 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
649 char *end;
650 unsigned long val = simple_strtoul(buf, &end, 0);
651 int err = 0;
652 int cpu = dev->id;
653
654 if (end == buf)
655 return -EINVAL;
656 if (val == 1) {
657 cpumask_t old;
658
659 old = current->cpus_allowed;
660
661 lock_cpu_hotplug();
662 set_cpus_allowed(current, cpumask_of_cpu(cpu));
663
664 mutex_lock(&microcode_mutex);
665 if (uci->valid)
666 err = cpu_request_microcode(cpu);
667 mutex_unlock(&microcode_mutex);
668 unlock_cpu_hotplug();
669 set_cpus_allowed(current, old);
670 }
671 if (err)
672 return err;
673 return sz;
674}
675
676static ssize_t version_show(struct sys_device *dev, char *buf)
677{
678 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
679
680 return sprintf(buf, "0x%x\n", uci->rev);
681}
682
683static ssize_t pf_show(struct sys_device *dev, char *buf)
684{
685 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
686
687 return sprintf(buf, "0x%x\n", uci->pf);
688}
689
690static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
691static SYSDEV_ATTR(version, 0400, version_show, NULL);
692static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
693
694static struct attribute *mc_default_attrs[] = {
695 &attr_reload.attr,
696 &attr_version.attr,
697 &attr_processor_flags.attr,
698 NULL
699};
700
701static struct attribute_group mc_attr_group = {
702 .attrs = mc_default_attrs,
703 .name = "microcode",
704};
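/*
 * With the group above registered against the per-CPU sysdev, the
 * attributes are expected to appear under the conventional sysdev
 * location (listed here for orientation only):
 *
 *	/sys/devices/system/cpu/cpuN/microcode/version
 *	/sys/devices/system/cpu/cpuN/microcode/processor_flags
 *	/sys/devices/system/cpu/cpuN/microcode/reload
 *
 * Writing "1" to the reload file re-requests the firmware image
 * intel-ucode/<family>-<model>-<stepping> via request_firmware() and
 * re-applies it to that CPU.
 */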
705
706static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
707{
708 int err, cpu = sys_dev->id;
709 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
710
711 if (!cpu_online(cpu))
712 return 0;
713
714 pr_debug("Microcode:CPU %d added\n", cpu);
715 memset(uci, 0, sizeof(*uci));
716
717 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
718 if (err)
719 return err;
720
721 microcode_init_cpu(cpu, resume);
722
723 return 0;
724}
725
726static int mc_sysdev_add(struct sys_device *sys_dev)
727{
728 return __mc_sysdev_add(sys_dev, 0);
729}
730
731static int mc_sysdev_remove(struct sys_device *sys_dev)
732{
733 int cpu = sys_dev->id;
734
735 if (!cpu_online(cpu))
736 return 0;
737
738 pr_debug("Microcode:CPU %d removed\n", cpu);
739 microcode_fini_cpu(cpu);
740 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
741 return 0;
742}
743
744static int mc_sysdev_resume(struct sys_device *dev)
745{
746 int cpu = dev->id;
747
748 if (!cpu_online(cpu))
749 return 0;
750 pr_debug("Microcode:CPU %d resumed\n", cpu);
751 /* only CPU 0 will apply ucode here */
752 apply_microcode(0);
753 return 0;
754}
755
756static struct sysdev_driver mc_sysdev_driver = {
757 .add = mc_sysdev_add,
758 .remove = mc_sysdev_remove,
759 .resume = mc_sysdev_resume,
760};
761
762static __cpuinit int
763mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
764{
765 unsigned int cpu = (unsigned long)hcpu;
766 struct sys_device *sys_dev;
767
768 sys_dev = get_cpu_sysdev(cpu);
769 switch (action) {
770 case CPU_UP_CANCELED_FROZEN:
771 /* The CPU refused to come up during a system resume */
772 microcode_fini_cpu(cpu);
773 break;
774 case CPU_ONLINE:
775 case CPU_DOWN_FAILED:
776 mc_sysdev_add(sys_dev);
777 break;
778 case CPU_ONLINE_FROZEN:
779 /* System-wide resume is in progress, try to apply microcode */
780 if (apply_microcode_check_cpu(cpu)) {
781 /* The application of microcode failed */
782 microcode_fini_cpu(cpu);
783 __mc_sysdev_add(sys_dev, 1);
784 break;
785 }
786 case CPU_DOWN_FAILED_FROZEN:
787 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
788 printk(KERN_ERR "Microcode: Failed to create the sysfs "
789 "group for CPU%d\n", cpu);
790 break;
791 case CPU_DOWN_PREPARE:
792 mc_sysdev_remove(sys_dev);
793 break;
794 case CPU_DOWN_PREPARE_FROZEN:
795 /* Suspend is in progress, only remove the interface */
796 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
797 break;
798 }
799 return NOTIFY_OK;
800}
801
802static struct notifier_block __cpuinitdata mc_cpu_notifier = {
803 .notifier_call = mc_cpu_callback,
804};
805
806static int __init microcode_init (void)
807{
808 int error;
809
810 error = microcode_dev_init();
811 if (error)
812 return error;
813 microcode_pdev = platform_device_register_simple("microcode", -1,
814 NULL, 0);
815 if (IS_ERR(microcode_pdev)) {
816 microcode_dev_exit();
817 return PTR_ERR(microcode_pdev);
818 }
819
820 lock_cpu_hotplug();
821 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
822 unlock_cpu_hotplug();
823 if (error) {
824 microcode_dev_exit();
825 platform_device_unregister(microcode_pdev);
826 return error;
827 }
828
829 register_hotcpu_notifier(&mc_cpu_notifier);
830
831 printk(KERN_INFO
832 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
833 return 0;
834}
835
836static void __exit microcode_exit (void)
837{
838 microcode_dev_exit();
839
840 unregister_hotcpu_notifier(&mc_cpu_notifier);
841
842 lock_cpu_hotplug();
843 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
844 unlock_cpu_hotplug();
845
846 platform_device_unregister(microcode_pdev);
847}
848
849module_init(microcode_init)
850module_exit(microcode_exit)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
new file mode 100644
index 000000000000..3db0a5442eb1
--- /dev/null
+++ b/arch/x86/kernel/module_32.c
@@ -0,0 +1,152 @@
1/* Kernel module help for i386.
2 Copyright (C) 2001 Rusty Russell.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/moduleloader.h>
19#include <linux/elf.h>
20#include <linux/vmalloc.h>
21#include <linux/fs.h>
22#include <linux/string.h>
23#include <linux/kernel.h>
24#include <linux/bug.h>
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(fmt...)
30#endif
31
32void *module_alloc(unsigned long size)
33{
34 if (size == 0)
35 return NULL;
36 return vmalloc_exec(size);
37}
38
39
40/* Free memory returned from module_alloc */
41void module_free(struct module *mod, void *module_region)
42{
43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */
46}
47
48/* We don't need anything special. */
49int module_frob_arch_sections(Elf_Ehdr *hdr,
50 Elf_Shdr *sechdrs,
51 char *secstrings,
52 struct module *mod)
53{
54 return 0;
55}
56
57int apply_relocate(Elf32_Shdr *sechdrs,
58 const char *strtab,
59 unsigned int symindex,
60 unsigned int relsec,
61 struct module *me)
62{
63 unsigned int i;
64 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
65 Elf32_Sym *sym;
66 uint32_t *location;
67
68 DEBUGP("Applying relocate section %u to %u\n", relsec,
69 sechdrs[relsec].sh_info);
70 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
71 /* This is where to make the change */
72 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
73 + rel[i].r_offset;
74 /* This is the symbol it is referring to. Note that all
75 undefined symbols have been resolved. */
76 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
77 + ELF32_R_SYM(rel[i].r_info);
78
79 switch (ELF32_R_TYPE(rel[i].r_info)) {
80 case R_386_32:
81 /* We add the value into the location given */
82 *location += sym->st_value;
83 break;
84 case R_386_PC32:
85			/* Add the value, subtract its position */
86 *location += sym->st_value - (uint32_t)location;
87 break;
88 default:
89 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
90 me->name, ELF32_R_TYPE(rel[i].r_info));
91 return -ENOEXEC;
92 }
93 }
94 return 0;
95}
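/*
 * The two relocation types handled above follow the usual i386 ELF
 * semantics for REL entries, where the addend A is the value already
 * stored at the patched location, S is the symbol value and P is the
 * address of the location itself:
 *
 *	R_386_32:	*location = S + A	// absolute reference
 *	R_386_PC32:	*location = S + A - P	// PC-relative (call/jmp)
 */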
96
97int apply_relocate_add(Elf32_Shdr *sechdrs,
98 const char *strtab,
99 unsigned int symindex,
100 unsigned int relsec,
101 struct module *me)
102{
103 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
104 me->name);
105 return -ENOEXEC;
106}
107
108int module_finalize(const Elf_Ehdr *hdr,
109 const Elf_Shdr *sechdrs,
110 struct module *me)
111{
112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122			locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
125 }
126
127 if (alt) {
128 /* patch .altinstructions */
129 void *aseg = (void *)alt->sh_addr;
130 apply_alternatives(aseg, aseg + alt->sh_size);
131 }
132 if (locks && text) {
133 void *lseg = (void *)locks->sh_addr;
134 void *tseg = (void *)text->sh_addr;
135 alternatives_smp_module_add(me, me->name,
136 lseg, lseg + locks->sh_size,
137 tseg, tseg + text->sh_size);
138 }
139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
146}
147
148void module_arch_cleanup(struct module *mod)
149{
150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
152}
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c
new file mode 100644
index 000000000000..13abb4ebfb79
--- /dev/null
+++ b/arch/x86/kernel/mpparse_32.c
@@ -0,0 +1,1132 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/init.h>
18#include <linux/acpi.h>
19#include <linux/delay.h>
20#include <linux/bootmem.h>
21#include <linux/kernel_stat.h>
22#include <linux/mc146818rtc.h>
23#include <linux/bitops.h>
24
25#include <asm/smp.h>
26#include <asm/acpi.h>
27#include <asm/mtrr.h>
28#include <asm/mpspec.h>
29#include <asm/io_apic.h>
30
31#include <mach_apic.h>
32#include <mach_apicdef.h>
33#include <mach_mpparse.h>
34#include <bios_ebda.h>
35
36/* Have we found an MP table */
37int smp_found_config;
38unsigned int __cpuinitdata maxcpus = NR_CPUS;
39
40/*
41 * Various Linux-internal data structures created from the
42 * MP-table.
43 */
44int apic_version [MAX_APICS];
45int mp_bus_id_to_type [MAX_MP_BUSSES];
46int mp_bus_id_to_node [MAX_MP_BUSSES];
47int mp_bus_id_to_local [MAX_MP_BUSSES];
48int quad_local_to_mp_bus_id [NR_CPUS/4][4];
49int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
50static int mp_current_pci_id;
51
52/* I/O APIC entries */
53struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
54
55/* MP IRQ source entries */
56struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
57
58/* # of MP IRQ source entries */
59int mp_irq_entries;
60
61int nr_ioapics;
62
63int pic_mode;
64unsigned long mp_lapic_addr;
65
66unsigned int def_to_bigsmp = 0;
67
68/* Processor that is doing the boot up */
69unsigned int boot_cpu_physical_apicid = -1U;
70/* Internal processor count */
71unsigned int __cpuinitdata num_processors;
72
73/* Bitmask of physically existing CPUs */
74physid_mask_t phys_cpu_present_map;
75
76u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
77
78/*
79 * Intel MP BIOS table parsing routines:
80 */
81
82
83/*
84 * Checksum an MP configuration block.
85 */
86
87static int __init mpf_checksum(unsigned char *mp, int len)
88{
89 int sum = 0;
90
91 while (len--)
92 sum += *mp++;
93
94 return sum & 0xFF;
95}
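/*
 * Per the MP specification, all bytes of the floating pointer
 * structure and of the configuration table must sum to zero modulo
 * 256, so a non-zero return from mpf_checksum() means the table is
 * corrupt (or is not an MP table at all).
 */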
96
97/*
98 * Have to match translation table entries to main table entries by counter
99 * hence the mpc_record variable .... can't see a less disgusting way of
100 * doing this ....
101 */
102
103static int mpc_record;
104static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
105
106static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
107{
108 int ver, apicid;
109 physid_mask_t phys_cpu;
110
111 if (!(m->mpc_cpuflag & CPU_ENABLED))
112 return;
113
114 apicid = mpc_apic_id(m, translation_table[mpc_record]);
115
116 if (m->mpc_featureflag&(1<<0))
117 Dprintk(" Floating point unit present.\n");
118 if (m->mpc_featureflag&(1<<7))
119		Dprintk("    Machine Check Exception supported.\n");
120 if (m->mpc_featureflag&(1<<8))
121 Dprintk(" 64 bit compare & exchange supported.\n");
122 if (m->mpc_featureflag&(1<<9))
123 Dprintk(" Internal APIC present.\n");
124 if (m->mpc_featureflag&(1<<11))
125 Dprintk(" SEP present.\n");
126 if (m->mpc_featureflag&(1<<12))
127 Dprintk(" MTRR present.\n");
128 if (m->mpc_featureflag&(1<<13))
129 Dprintk(" PGE present.\n");
130 if (m->mpc_featureflag&(1<<14))
131 Dprintk(" MCA present.\n");
132 if (m->mpc_featureflag&(1<<15))
133 Dprintk(" CMOV present.\n");
134 if (m->mpc_featureflag&(1<<16))
135 Dprintk(" PAT present.\n");
136 if (m->mpc_featureflag&(1<<17))
137 Dprintk(" PSE present.\n");
138 if (m->mpc_featureflag&(1<<18))
139 Dprintk(" PSN present.\n");
140 if (m->mpc_featureflag&(1<<19))
141 Dprintk(" Cache Line Flush Instruction present.\n");
142 /* 20 Reserved */
143 if (m->mpc_featureflag&(1<<21))
144 Dprintk(" Debug Trace and EMON Store present.\n");
145 if (m->mpc_featureflag&(1<<22))
146 Dprintk(" ACPI Thermal Throttle Registers present.\n");
147 if (m->mpc_featureflag&(1<<23))
148 Dprintk(" MMX present.\n");
149 if (m->mpc_featureflag&(1<<24))
150 Dprintk(" FXSR present.\n");
151 if (m->mpc_featureflag&(1<<25))
152 Dprintk(" XMM present.\n");
153 if (m->mpc_featureflag&(1<<26))
154 Dprintk(" Willamette New Instructions present.\n");
155 if (m->mpc_featureflag&(1<<27))
156 Dprintk(" Self Snoop present.\n");
157 if (m->mpc_featureflag&(1<<28))
158 Dprintk(" HT present.\n");
159 if (m->mpc_featureflag&(1<<29))
160 Dprintk(" Thermal Monitor present.\n");
161 /* 30, 31 Reserved */
162
163
164 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
165 Dprintk(" Bootup CPU\n");
166 boot_cpu_physical_apicid = m->mpc_apicid;
167 }
168
169 ver = m->mpc_apicver;
170
171 /*
172 * Validate version
173 */
174 if (ver == 0x0) {
175 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
176 "fixing up to 0x10. (tell your hw vendor)\n",
177 m->mpc_apicid);
178 ver = 0x10;
179 }
180 apic_version[m->mpc_apicid] = ver;
181
182 phys_cpu = apicid_to_cpu_present(apicid);
183 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
184
185 if (num_processors >= NR_CPUS) {
186 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
187 " Processor ignored.\n", NR_CPUS);
188 return;
189 }
190
191 if (num_processors >= maxcpus) {
192 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
193 " Processor ignored.\n", maxcpus);
194 return;
195 }
196
197 cpu_set(num_processors, cpu_possible_map);
198 num_processors++;
199
200 /*
201 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
202 * but we need to work other dependencies like SMP_SUSPEND etc
203 * before this can be done without some confusion.
204 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
205 * - Ashok Raj <ashok.raj@intel.com>
206 */
207 if (num_processors > 8) {
208 switch (boot_cpu_data.x86_vendor) {
209 case X86_VENDOR_INTEL:
210 if (!APIC_XAPIC(ver)) {
211 def_to_bigsmp = 0;
212 break;
213 }
214 /* If P4 and above fall through */
215 case X86_VENDOR_AMD:
216 def_to_bigsmp = 1;
217 }
218 }
219 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
220}
221
222static void __init MP_bus_info (struct mpc_config_bus *m)
223{
224 char str[7];
225
226 memcpy(str, m->mpc_bustype, 6);
227 str[6] = 0;
228
229 mpc_oem_bus_info(m, str, translation_table[mpc_record]);
230
231#if MAX_MP_BUSSES < 256
232 if (m->mpc_busid >= MAX_MP_BUSSES) {
233 printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
234 " is too large, max. supported is %d\n",
235 m->mpc_busid, str, MAX_MP_BUSSES - 1);
236 return;
237 }
238#endif
239
240 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
241 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
242 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
243 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
244 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
245 mpc_oem_pci_bus(m, translation_table[mpc_record]);
246 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
247 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
248 mp_current_pci_id++;
249 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
250 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
251 } else {
252 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
253 }
254}
255
256static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
257{
258 if (!(m->mpc_flags & MPC_APIC_USABLE))
259 return;
260
261 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
262 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
263 if (nr_ioapics >= MAX_IO_APICS) {
264 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
265 MAX_IO_APICS, nr_ioapics);
266		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
267 }
268 if (!m->mpc_apicaddr) {
269 printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
270 " found in MP table, skipping!\n");
271 return;
272 }
273 mp_ioapics[nr_ioapics] = *m;
274 nr_ioapics++;
275}
276
277static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
278{
279 mp_irqs [mp_irq_entries] = *m;
280 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
281 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
282 m->mpc_irqtype, m->mpc_irqflag & 3,
283 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
284 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
285 if (++mp_irq_entries == MAX_IRQ_SOURCES)
286 panic("Max # of irq sources exceeded!!\n");
287}
288
289static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
290{
291 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
292 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
293 m->mpc_irqtype, m->mpc_irqflag & 3,
294 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
295 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
296}
297
298#ifdef CONFIG_X86_NUMAQ
299static void __init MP_translation_info (struct mpc_config_translation *m)
300{
301 printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
302
303 if (mpc_record >= MAX_MPC_ENTRY)
304 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
305 else
306 translation_table[mpc_record] = m; /* stash this for later */
307 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
308 node_set_online(m->trans_quad);
309}
310
311/*
312 * Read/parse the MPC oem tables
313 */
314
315static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
316 unsigned short oemsize)
317{
318 int count = sizeof (*oemtable); /* the header size */
319 unsigned char *oemptr = ((unsigned char *)oemtable)+count;
320
321 mpc_record = 0;
322 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
323 if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
324 {
325 printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
326 oemtable->oem_signature[0],
327 oemtable->oem_signature[1],
328 oemtable->oem_signature[2],
329 oemtable->oem_signature[3]);
330 return;
331 }
332 if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
333 {
334 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
335 return;
336 }
337 while (count < oemtable->oem_length) {
338 switch (*oemptr) {
339 case MP_TRANSLATION:
340 {
341 struct mpc_config_translation *m=
342 (struct mpc_config_translation *)oemptr;
343 MP_translation_info(m);
344 oemptr += sizeof(*m);
345 count += sizeof(*m);
346 ++mpc_record;
347 break;
348 }
349 default:
350 {
351 printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
352 return;
353 }
354 }
355 }
356}
357
358static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
359 char *productid)
360{
361 if (strncmp(oem, "IBM NUMA", 8))
362 printk("Warning! May not be a NUMA-Q system!\n");
363 if (mpc->mpc_oemptr)
364 smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
365 mpc->mpc_oemsize);
366}
367#endif /* CONFIG_X86_NUMAQ */
368
369/*
370 * Read/parse the MPC
371 */
372
373static int __init smp_read_mpc(struct mp_config_table *mpc)
374{
375 char str[16];
376 char oem[10];
377 int count=sizeof(*mpc);
378 unsigned char *mpt=((unsigned char *)mpc)+count;
379
380 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
381 printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
382 *(u32 *)mpc->mpc_signature);
383 return 0;
384 }
385 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
386 printk(KERN_ERR "SMP mptable: checksum error!\n");
387 return 0;
388 }
389 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
390 printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
391 mpc->mpc_spec);
392 return 0;
393 }
394 if (!mpc->mpc_lapic) {
395 printk(KERN_ERR "SMP mptable: null local APIC address!\n");
396 return 0;
397 }
398 memcpy(oem,mpc->mpc_oem,8);
399 oem[8]=0;
400 printk(KERN_INFO "OEM ID: %s ",oem);
401
402 memcpy(str,mpc->mpc_productid,12);
403 str[12]=0;
404 printk("Product ID: %s ",str);
405
406 mps_oem_check(mpc, oem, str);
407
408 printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
409
410 /*
411 * Save the local APIC address (it might be non-default) -- but only
412 * if we're not using ACPI.
413 */
414 if (!acpi_lapic)
415 mp_lapic_addr = mpc->mpc_lapic;
416
417 /*
418 * Now process the configuration blocks.
419 */
420 mpc_record = 0;
421 while (count < mpc->mpc_length) {
422 switch(*mpt) {
423 case MP_PROCESSOR:
424 {
425 struct mpc_config_processor *m=
426 (struct mpc_config_processor *)mpt;
427 /* ACPI may have already provided this data */
428 if (!acpi_lapic)
429 MP_processor_info(m);
430 mpt += sizeof(*m);
431 count += sizeof(*m);
432 break;
433 }
434 case MP_BUS:
435 {
436 struct mpc_config_bus *m=
437 (struct mpc_config_bus *)mpt;
438 MP_bus_info(m);
439 mpt += sizeof(*m);
440 count += sizeof(*m);
441 break;
442 }
443 case MP_IOAPIC:
444 {
445 struct mpc_config_ioapic *m=
446 (struct mpc_config_ioapic *)mpt;
447 MP_ioapic_info(m);
448 mpt+=sizeof(*m);
449 count+=sizeof(*m);
450 break;
451 }
452 case MP_INTSRC:
453 {
454 struct mpc_config_intsrc *m=
455 (struct mpc_config_intsrc *)mpt;
456
457 MP_intsrc_info(m);
458 mpt+=sizeof(*m);
459 count+=sizeof(*m);
460 break;
461 }
462 case MP_LINTSRC:
463 {
464 struct mpc_config_lintsrc *m=
465 (struct mpc_config_lintsrc *)mpt;
466 MP_lintsrc_info(m);
467 mpt+=sizeof(*m);
468 count+=sizeof(*m);
469 break;
470 }
471 default:
472 {
473 count = mpc->mpc_length;
474 break;
475 }
476 }
477 ++mpc_record;
478 }
479 setup_apic_routing();
480 if (!num_processors)
481 printk(KERN_ERR "SMP mptable: no processors registered!\n");
482 return num_processors;
483}
484
485static int __init ELCR_trigger(unsigned int irq)
486{
487 unsigned int port;
488
489 port = 0x4d0 + (irq >> 3);
490 return (inb(port) >> (irq & 7)) & 1;
491}
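/*
 * ELCR_trigger() reads the chipset's Edge/Level Control Registers:
 * port 0x4d0 covers IRQ0-7 and port 0x4d1 covers IRQ8-15, one bit per
 * IRQ, with a set bit meaning "level triggered".  For example,
 * ELCR_trigger(9) tests bit 1 of port 0x4d1.
 */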
492
493static void __init construct_default_ioirq_mptable(int mpc_default_type)
494{
495 struct mpc_config_intsrc intsrc;
496 int i;
497 int ELCR_fallback = 0;
498
499 intsrc.mpc_type = MP_INTSRC;
500 intsrc.mpc_irqflag = 0; /* conforming */
501 intsrc.mpc_srcbus = 0;
502 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
503
504 intsrc.mpc_irqtype = mp_INT;
505
506 /*
507 * If true, we have an ISA/PCI system with no IRQ entries
508 * in the MP table. To prevent the PCI interrupts from being set up
509 * incorrectly, we try to use the ELCR. The sanity check to see if
510 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
511 * never be level sensitive, so we simply see if the ELCR agrees.
512 * If it does, we assume it's valid.
513 */
514 if (mpc_default_type == 5) {
515 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
516
517 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
518 printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
519 else {
520 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
521 ELCR_fallback = 1;
522 }
523 }
524
525 for (i = 0; i < 16; i++) {
526 switch (mpc_default_type) {
527 case 2:
528 if (i == 0 || i == 13)
529 continue; /* IRQ0 & IRQ13 not connected */
530 /* fall through */
531 default:
532 if (i == 2)
533 continue; /* IRQ2 is never connected */
534 }
535
536 if (ELCR_fallback) {
537 /*
538 * If the ELCR indicates a level-sensitive interrupt, we
539 * copy that information over to the MP table in the
540 * irqflag field (level sensitive, active high polarity).
541 */
542 if (ELCR_trigger(i))
543 intsrc.mpc_irqflag = 13;
544 else
545 intsrc.mpc_irqflag = 0;
546 }
547
548 intsrc.mpc_srcbusirq = i;
549 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
550 MP_intsrc_info(&intsrc);
551 }
552
553 intsrc.mpc_irqtype = mp_ExtINT;
554 intsrc.mpc_srcbusirq = 0;
555 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
556 MP_intsrc_info(&intsrc);
557}
558
559static inline void __init construct_default_ISA_mptable(int mpc_default_type)
560{
561 struct mpc_config_processor processor;
562 struct mpc_config_bus bus;
563 struct mpc_config_ioapic ioapic;
564 struct mpc_config_lintsrc lintsrc;
565 int linttypes[2] = { mp_ExtINT, mp_NMI };
566 int i;
567
568 /*
569 * local APIC has default address
570 */
571 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
572
573 /*
574 * 2 CPUs, numbered 0 & 1.
575 */
576 processor.mpc_type = MP_PROCESSOR;
577 /* Either an integrated APIC or a discrete 82489DX. */
578 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
579 processor.mpc_cpuflag = CPU_ENABLED;
580 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
581 (boot_cpu_data.x86_model << 4) |
582 boot_cpu_data.x86_mask;
583 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
584 processor.mpc_reserved[0] = 0;
585 processor.mpc_reserved[1] = 0;
586 for (i = 0; i < 2; i++) {
587 processor.mpc_apicid = i;
588 MP_processor_info(&processor);
589 }
590
591 bus.mpc_type = MP_BUS;
592 bus.mpc_busid = 0;
593 switch (mpc_default_type) {
594 default:
595 printk("???\n");
596 printk(KERN_ERR "Unknown standard configuration %d\n",
597 mpc_default_type);
598 /* fall through */
599 case 1:
600 case 5:
601 memcpy(bus.mpc_bustype, "ISA ", 6);
602 break;
603 case 2:
604 case 6:
605 case 3:
606 memcpy(bus.mpc_bustype, "EISA ", 6);
607 break;
608 case 4:
609 case 7:
610 memcpy(bus.mpc_bustype, "MCA ", 6);
611 }
612 MP_bus_info(&bus);
613 if (mpc_default_type > 4) {
614 bus.mpc_busid = 1;
615 memcpy(bus.mpc_bustype, "PCI ", 6);
616 MP_bus_info(&bus);
617 }
618
619 ioapic.mpc_type = MP_IOAPIC;
620 ioapic.mpc_apicid = 2;
621 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
622 ioapic.mpc_flags = MPC_APIC_USABLE;
623 ioapic.mpc_apicaddr = 0xFEC00000;
624 MP_ioapic_info(&ioapic);
625
626 /*
627 * We set up most of the low 16 IO-APIC pins according to MPS rules.
628 */
629 construct_default_ioirq_mptable(mpc_default_type);
630
631 lintsrc.mpc_type = MP_LINTSRC;
632 lintsrc.mpc_irqflag = 0; /* conforming */
633 lintsrc.mpc_srcbusid = 0;
634 lintsrc.mpc_srcbusirq = 0;
635 lintsrc.mpc_destapic = MP_APIC_ALL;
636 for (i = 0; i < 2; i++) {
637 lintsrc.mpc_irqtype = linttypes[i];
638 lintsrc.mpc_destapiclint = i;
639 MP_lintsrc_info(&lintsrc);
640 }
641}
642
643static struct intel_mp_floating *mpf_found;
644
645/*
646 * Scan the memory blocks for an SMP configuration block.
647 */
648void __init get_smp_config (void)
649{
650 struct intel_mp_floating *mpf = mpf_found;
651
652 /*
653 * ACPI supports both logical (e.g. Hyper-Threading) and physical
654 * processors, where MPS only supports physical.
655 */
656 if (acpi_lapic && acpi_ioapic) {
657 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
658 return;
659 }
660 else if (acpi_lapic)
661 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
662
663 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
664 if (mpf->mpf_feature2 & (1<<7)) {
665 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
666 pic_mode = 1;
667 } else {
668 printk(KERN_INFO " Virtual Wire compatibility mode.\n");
669 pic_mode = 0;
670 }
671
672 /*
673 * Now see if we need to read further.
674 */
675 if (mpf->mpf_feature1 != 0) {
676
677 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
678 construct_default_ISA_mptable(mpf->mpf_feature1);
679
680 } else if (mpf->mpf_physptr) {
681
682 /*
683 * Read the physical hardware table. Anything here will
684 * override the defaults.
685 */
686 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
687 smp_found_config = 0;
688 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
689 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
690 return;
691 }
692 /*
693 * If there are no explicit MP IRQ entries, then we are
694 * broken. We set up most of the low 16 IO-APIC pins to
695 * ISA defaults and hope it will work.
696 */
697 if (!mp_irq_entries) {
698 struct mpc_config_bus bus;
699
700 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
701
702 bus.mpc_type = MP_BUS;
703 bus.mpc_busid = 0;
704 memcpy(bus.mpc_bustype, "ISA ", 6);
705 MP_bus_info(&bus);
706
707 construct_default_ioirq_mptable(0);
708 }
709
710 } else
711 BUG();
712
713 printk(KERN_INFO "Processors: %d\n", num_processors);
714 /*
715 * Only use the first configuration found.
716 */
717}
718
719static int __init smp_scan_config (unsigned long base, unsigned long length)
720{
721 unsigned long *bp = phys_to_virt(base);
722 struct intel_mp_floating *mpf;
723
724 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
725 if (sizeof(*mpf) != 16)
726 printk("Error: MPF size\n");
727
728 while (length > 0) {
729 mpf = (struct intel_mp_floating *)bp;
730 if ((*bp == SMP_MAGIC_IDENT) &&
731 (mpf->mpf_length == 1) &&
732 !mpf_checksum((unsigned char *)bp, 16) &&
733 ((mpf->mpf_specification == 1)
734 || (mpf->mpf_specification == 4)) ) {
735
736 smp_found_config = 1;
737 printk(KERN_INFO "found SMP MP-table at %08lx\n",
738 virt_to_phys(mpf));
739 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
740 if (mpf->mpf_physptr) {
741 /*
742				 * We cannot access the MPC table to compute its
743				 * size yet, as only a few megabytes from the
744				 * bottom are mapped at this point.
745				 * The PC-9800's MPC table is placed at the very
746				 * end of physical memory, so simply reserving
747				 * PAGE_SIZE from mpf->mpf_physptr would trigger
748				 * a BUG() in reserve_bootmem.
749 */
750 unsigned long size = PAGE_SIZE;
751 unsigned long end = max_low_pfn * PAGE_SIZE;
752 if (mpf->mpf_physptr + size > end)
753 size = end - mpf->mpf_physptr;
754 reserve_bootmem(mpf->mpf_physptr, size);
755 }
756
757 mpf_found = mpf;
758 return 1;
759 }
760 bp += 4;
761 length -= 16;
762 }
763 return 0;
764}
765
766void __init find_smp_config (void)
767{
768 unsigned int address;
769
770 /*
771 * FIXME: Linux assumes you have 640K of base ram..
772 * this continues the error...
773 *
774 * 1) Scan the bottom 1K for a signature
775 * 2) Scan the top 1K of base RAM
776 * 3) Scan the 64K of bios
777 */
778 if (smp_scan_config(0x0,0x400) ||
779 smp_scan_config(639*0x400,0x400) ||
780 smp_scan_config(0xF0000,0x10000))
781 return;
782 /*
783 * If it is an SMP machine we should know now, unless the
784 * configuration is in an EISA/MCA bus machine with an
785 * extended bios data area.
786 *
787 * there is a real-mode segmented pointer pointing to the
788 * 4K EBDA area at 0x40E, calculate and scan it here.
789 *
790 * NOTE! There are Linux loaders that will corrupt the EBDA
791 * area, and as such this kind of SMP config may be less
792 * trustworthy, simply because the SMP table may have been
793 * stomped on during early boot. These loaders are buggy and
794 * should be fixed.
795 *
796 * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
797 */
798
799 address = get_bios_ebda();
800 if (address)
801 smp_scan_config(address, 0x400);
802}
803
804int es7000_plat;
805
806/* --------------------------------------------------------------------------
807 ACPI-based MP Configuration
808 -------------------------------------------------------------------------- */
809
810#ifdef CONFIG_ACPI
811
812void __init mp_register_lapic_address(u64 address)
813{
814 mp_lapic_addr = (unsigned long) address;
815
816 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
817
818 if (boot_cpu_physical_apicid == -1U)
819 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
820
821 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
822}
823
824void __cpuinit mp_register_lapic (u8 id, u8 enabled)
825{
826 struct mpc_config_processor processor;
827 int boot_cpu = 0;
828
829 if (MAX_APICS - id <= 0) {
830 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
831 id, MAX_APICS);
832 return;
833 }
834
835 if (id == boot_cpu_physical_apicid)
836 boot_cpu = 1;
837
838 processor.mpc_type = MP_PROCESSOR;
839 processor.mpc_apicid = id;
840 processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
841 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
842 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
843 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
844 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
845 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
846 processor.mpc_reserved[0] = 0;
847 processor.mpc_reserved[1] = 0;
848
849 MP_processor_info(&processor);
850}
851
852#ifdef CONFIG_X86_IO_APIC
853
854#define MP_ISA_BUS 0
855#define MP_MAX_IOAPIC_PIN 127
856
857static struct mp_ioapic_routing {
858 int apic_id;
859 int gsi_base;
860 int gsi_end;
861 u32 pin_programmed[4];
862} mp_ioapic_routing[MAX_IO_APICS];
863
864static int mp_find_ioapic (int gsi)
865{
866 int i = 0;
867
868 /* Find the IOAPIC that manages this GSI. */
869 for (i = 0; i < nr_ioapics; i++) {
870 if ((gsi >= mp_ioapic_routing[i].gsi_base)
871 && (gsi <= mp_ioapic_routing[i].gsi_end))
872 return i;
873 }
874
875 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
876
877 return -1;
878}
879
880void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
881{
882 int idx = 0;
883 int tmpid;
884
885 if (nr_ioapics >= MAX_IO_APICS) {
886 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
887 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
888 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
889 }
890 if (!address) {
891 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
892 " found in MADT table, skipping!\n");
893 return;
894 }
895
896 idx = nr_ioapics++;
897
898 mp_ioapics[idx].mpc_type = MP_IOAPIC;
899 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
900 mp_ioapics[idx].mpc_apicaddr = address;
901
902 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
903 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
904 && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
905 tmpid = io_apic_get_unique_id(idx, id);
906 else
907 tmpid = id;
908 if (tmpid == -1) {
909 nr_ioapics--;
910 return;
911 }
912 mp_ioapics[idx].mpc_apicid = tmpid;
913 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
914
915 /*
916 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
917 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
918 */
919 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
920 mp_ioapic_routing[idx].gsi_base = gsi_base;
921 mp_ioapic_routing[idx].gsi_end = gsi_base +
922 io_apic_get_redir_entries(idx);
923
924 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
925 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
926 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
927 mp_ioapic_routing[idx].gsi_base,
928 mp_ioapic_routing[idx].gsi_end);
929}
930
931void __init
932mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
933{
934 struct mpc_config_intsrc intsrc;
935 int ioapic = -1;
936 int pin = -1;
937
938 /*
939 * Convert 'gsi' to 'ioapic.pin'.
940 */
941 ioapic = mp_find_ioapic(gsi);
942 if (ioapic < 0)
943 return;
944 pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
945
946 /*
947 * TBD: This check is for faulty timer entries, where the override
948 * erroneously sets the trigger to level, resulting in a HUGE
949 * increase of timer interrupts!
950 */
951 if ((bus_irq == 0) && (trigger == 3))
952 trigger = 1;
953
954 intsrc.mpc_type = MP_INTSRC;
955 intsrc.mpc_irqtype = mp_INT;
956 intsrc.mpc_irqflag = (trigger << 2) | polarity;
957 intsrc.mpc_srcbus = MP_ISA_BUS;
958 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
959 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
960 intsrc.mpc_dstirq = pin; /* INTIN# */
961
962 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
963 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
964 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
965 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
966
967 mp_irqs[mp_irq_entries] = intsrc;
968 if (++mp_irq_entries == MAX_IRQ_SOURCES)
969 panic("Max # of irq sources exceeded!\n");
970}
971
972void __init mp_config_acpi_legacy_irqs (void)
973{
974 struct mpc_config_intsrc intsrc;
975 int i = 0;
976 int ioapic = -1;
977
978 /*
979 * Fabricate the legacy ISA bus (bus #31).
980 */
981 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
982 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
983
984 /*
985 * Older generations of ES7000 have no legacy identity mappings
986 */
987 if (es7000_plat == 1)
988 return;
989
990 /*
991 * Locate the IOAPIC that manages the ISA IRQs (0-15).
992 */
993 ioapic = mp_find_ioapic(0);
994 if (ioapic < 0)
995 return;
996
997 intsrc.mpc_type = MP_INTSRC;
998 intsrc.mpc_irqflag = 0; /* Conforming */
999 intsrc.mpc_srcbus = MP_ISA_BUS;
1000 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
1001
1002 /*
1003	 * Use the default configuration for IRQs 0-15, unless
1004	 * overridden by (MADT) interrupt source override entries.
1005 */
1006 for (i = 0; i < 16; i++) {
1007 int idx;
1008
1009 for (idx = 0; idx < mp_irq_entries; idx++) {
1010 struct mpc_config_intsrc *irq = mp_irqs + idx;
1011
1012 /* Do we already have a mapping for this ISA IRQ? */
1013 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
1014 break;
1015
1016			/* Do we already have a mapping for this IOAPIC pin? */
1017 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
1018 (irq->mpc_dstirq == i))
1019 break;
1020 }
1021
1022 if (idx != mp_irq_entries) {
1023 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
1024 continue; /* IRQ already used */
1025 }
1026
1027 intsrc.mpc_irqtype = mp_INT;
1028 intsrc.mpc_srcbusirq = i; /* Identity mapped */
1029 intsrc.mpc_dstirq = i;
1030
1031 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
1032 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
1033 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
1034 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
1035 intsrc.mpc_dstirq);
1036
1037 mp_irqs[mp_irq_entries] = intsrc;
1038 if (++mp_irq_entries == MAX_IRQ_SOURCES)
1039 panic("Max # of irq sources exceeded!\n");
1040 }
1041}
1042
1043#define MAX_GSI_NUM 4096
1044
1045int mp_register_gsi(u32 gsi, int triggering, int polarity)
1046{
1047 int ioapic = -1;
1048 int ioapic_pin = 0;
1049 int idx, bit = 0;
1050 static int pci_irq = 16;
1051 /*
1052	 * Mapping between Global System Interrupts, which
1053 * represent all possible interrupts, and IRQs
1054 * assigned to actual devices.
1055 */
1056 static int gsi_to_irq[MAX_GSI_NUM];
1057
1058 /* Don't set up the ACPI SCI because it's already set up */
1059 if (acpi_gbl_FADT.sci_interrupt == gsi)
1060 return gsi;
1061
1062 ioapic = mp_find_ioapic(gsi);
1063 if (ioapic < 0) {
1064 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
1065 return gsi;
1066 }
1067
1068 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
1069
1070 if (ioapic_renumber_irq)
1071 gsi = ioapic_renumber_irq(ioapic, gsi);
1072
1073 /*
1074 * Avoid pin reprogramming. PRTs typically include entries
1075 * with redundant pin->gsi mappings (but unique PCI devices);
1076 * we only program the IOAPIC on the first.
1077 */
1078 bit = ioapic_pin % 32;
1079 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
1080 if (idx > 3) {
1081 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1082 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
1083 ioapic_pin);
1084 return gsi;
1085 }
1086 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
1087 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
1088 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1089 return gsi_to_irq[gsi];
1090 }
1091
1092 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
1093
1094 if (triggering == ACPI_LEVEL_SENSITIVE) {
1095 /*
1096 * For PCI devices assign IRQs in order, avoiding gaps
1097 * due to unused I/O APIC pins.
1098 */
1099 int irq = gsi;
1100 if (gsi < MAX_GSI_NUM) {
1101 /*
1102 * Retain the VIA chipset work-around (gsi > 15), but
1103 * avoid a problem where the 8254 timer (IRQ0) is setup
1104 * via an override (so it's not on pin 0 of the ioapic),
1105 * and at the same time, the pin 0 interrupt is a PCI
1106 * type. The gsi > 15 test could cause these two pins
1107 * to be shared as IRQ0, and they are not shareable.
1108 * So test for this condition, and if necessary, avoid
1109 * the pin collision.
1110 */
1111 if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
1112 gsi = pci_irq++;
1113 /*
1114 * Don't assign IRQ used by ACPI SCI
1115 */
1116 if (gsi == acpi_gbl_FADT.sci_interrupt)
1117 gsi = pci_irq++;
1118 gsi_to_irq[irq] = gsi;
1119 } else {
1120 printk(KERN_ERR "GSI %u is too high\n", gsi);
1121 return gsi;
1122 }
1123 }
1124
1125 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1126 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1127 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1128 return gsi;
1129}
1130
1131#endif /* CONFIG_X86_IO_APIC */
1132#endif /* CONFIG_ACPI */
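
For reference, the pin_programmed bookkeeping used by mp_register_gsi() above packs one bit per I/O APIC pin into four 32-bit words (128 pins total). A minimal user-space sketch of that test-and-set logic, with a hypothetical claim_pin() helper that is not part of this patch:

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_IOAPIC_PIN 127

	/* One bit per pin: 128 pins fit in four 32-bit words, as in mp_ioapic_routing. */
	static uint32_t pin_programmed[4];

	/* Returns 1 if the pin was already programmed, 0 if this call claimed it. */
	static int claim_pin(int pin)
	{
		int idx, bit;

		if (pin < 0 || pin > MAX_IOAPIC_PIN)
			return -1;
		idx = pin / 32;
		bit = pin % 32;
		if (pin_programmed[idx] & (1u << bit))
			return 1;
		pin_programmed[idx] |= 1u << bit;
		return 0;
	}

	int main(void)
	{
		printf("pin 40, first claim:  %d\n", claim_pin(40));	/* 0: newly claimed */
		printf("pin 40, second claim: %d\n", claim_pin(40));	/* 1: already programmed */
		return 0;
	}

The kernel variant computes idx and bit the same way and, on a repeat hit, returns the IRQ it assigned the first time instead of reprogramming the pin.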
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
new file mode 100644
index 000000000000..0c1069b8d638
--- /dev/null
+++ b/arch/x86/kernel/msr.c
@@ -0,0 +1,224 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * msr.c
15 *
16 * x86 MSR access device
17 *
18 * This device is accessed by lseek() to the appropriate register number
19 * and then read/write in chunks of 8 bytes. A larger size means multiple
20 * reads or writes of the same register.
21 *
22 * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
25
26#include <linux/module.h>
27
28#include <linux/types.h>
29#include <linux/errno.h>
30#include <linux/fcntl.h>
31#include <linux/init.h>
32#include <linux/poll.h>
33#include <linux/smp.h>
34#include <linux/smp_lock.h>
35#include <linux/major.h>
36#include <linux/fs.h>
37#include <linux/device.h>
38#include <linux/cpu.h>
39#include <linux/notifier.h>
40
41#include <asm/processor.h>
42#include <asm/msr.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static struct class *msr_class;
47
48static loff_t msr_seek(struct file *file, loff_t offset, int orig)
49{
50 loff_t ret = -EINVAL;
51
52 lock_kernel();
53 switch (orig) {
54 case 0:
55 file->f_pos = offset;
56 ret = file->f_pos;
57 break;
58 case 1:
59 file->f_pos += offset;
60 ret = file->f_pos;
61 }
62 unlock_kernel();
63 return ret;
64}
65
66static ssize_t msr_read(struct file *file, char __user * buf,
67 size_t count, loff_t * ppos)
68{
69 u32 __user *tmp = (u32 __user *) buf;
70 u32 data[2];
71 u32 reg = *ppos;
72 int cpu = iminor(file->f_path.dentry->d_inode);
73 int err;
74
75 if (count % 8)
76 return -EINVAL; /* Invalid chunk size */
77
78 for (; count; count -= 8) {
79 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
80 if (err)
81 return -EIO;
82 if (copy_to_user(tmp, &data, 8))
83 return -EFAULT;
84 tmp += 2;
85 }
86
87 return ((char __user *)tmp) - buf;
88}
89
90static ssize_t msr_write(struct file *file, const char __user *buf,
91 size_t count, loff_t *ppos)
92{
93 const u32 __user *tmp = (const u32 __user *)buf;
94 u32 data[2];
95 u32 reg = *ppos;
96 int cpu = iminor(file->f_path.dentry->d_inode);
97 int err;
98
99 if (count % 8)
100 return -EINVAL; /* Invalid chunk size */
101
102 for (; count; count -= 8) {
103 if (copy_from_user(&data, tmp, 8))
104 return -EFAULT;
105 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
106 if (err)
107 return -EIO;
108 tmp += 2;
109 }
110
111 return ((char __user *)tmp) - buf;
112}
113
114static int msr_open(struct inode *inode, struct file *file)
115{
116 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
117 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
118
119 if (cpu >= NR_CPUS || !cpu_online(cpu))
120 return -ENXIO; /* No such CPU */
121 if (!cpu_has(c, X86_FEATURE_MSR))
122 return -EIO; /* MSR not supported */
123
124 return 0;
125}
126
127/*
128 * File operations we support
129 */
130static const struct file_operations msr_fops = {
131 .owner = THIS_MODULE,
132 .llseek = msr_seek,
133 .read = msr_read,
134 .write = msr_write,
135 .open = msr_open,
136};
137
138static int msr_device_create(int i)
139{
140 int err = 0;
141 struct device *dev;
142
143 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
144 if (IS_ERR(dev))
145 err = PTR_ERR(dev);
146 return err;
147}
148
149static int msr_class_cpu_callback(struct notifier_block *nfb,
150 unsigned long action, void *hcpu)
151{
152 unsigned int cpu = (unsigned long)hcpu;
153
154 switch (action) {
155 case CPU_ONLINE:
156 case CPU_ONLINE_FROZEN:
157 msr_device_create(cpu);
158 break;
159 case CPU_DEAD:
160 case CPU_DEAD_FROZEN:
161 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
162 break;
163 }
164 return NOTIFY_OK;
165}
166
167static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
168{
169 .notifier_call = msr_class_cpu_callback,
170};
171
172static int __init msr_init(void)
173{
174 int i, err = 0;
175 i = 0;
176
177 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
178 printk(KERN_ERR "msr: unable to get major %d for msr\n",
179 MSR_MAJOR);
180 err = -EBUSY;
181 goto out;
182 }
183 msr_class = class_create(THIS_MODULE, "msr");
184 if (IS_ERR(msr_class)) {
185 err = PTR_ERR(msr_class);
186 goto out_chrdev;
187 }
188 for_each_online_cpu(i) {
189 err = msr_device_create(i);
190 if (err != 0)
191 goto out_class;
192 }
193 register_hotcpu_notifier(&msr_class_cpu_notifier);
194
195 err = 0;
196 goto out;
197
198out_class:
199 i = 0;
200 for_each_online_cpu(i)
201 device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
202 class_destroy(msr_class);
203out_chrdev:
204 unregister_chrdev(MSR_MAJOR, "cpu/msr");
205out:
206 return err;
207}
208
209static void __exit msr_exit(void)
210{
211 int cpu = 0;
212 for_each_online_cpu(cpu)
213 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
214 class_destroy(msr_class);
215 unregister_chrdev(MSR_MAJOR, "cpu/msr");
216 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
217}
218
219module_init(msr_init);
220module_exit(msr_exit);
221
222MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
223MODULE_DESCRIPTION("x86 generic MSR driver");
224MODULE_LICENSE("GPL");
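
The header comment above describes the chardev protocol: seek to the MSR index, then transfer multiples of 8 bytes. A rough user-space sketch of reading MSR 0x10 (the time-stamp counter) on CPU 0 through this interface, assuming the device node exists and the caller has sufficient privileges:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		uint64_t val;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);

		if (fd < 0) {
			perror("open /dev/cpu/0/msr");
			return 1;
		}
		/* pread() folds the lseek() to the register number and the 8-byte read into one call. */
		if (pread(fd, &val, sizeof(val), 0x10) != sizeof(val)) {
			perror("pread MSR 0x10");
			close(fd);
			return 1;
		}
		printf("TSC (MSR 0x10) on CPU 0: %llu\n", (unsigned long long)val);
		close(fd);
		return 0;
	}

Reading more than 8 bytes in one call simply re-reads the same register, as the driver comment notes.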
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
new file mode 100644
index 000000000000..c7227e2180f8
--- /dev/null
+++ b/arch/x86/kernel/nmi_32.c
@@ -0,0 +1,468 @@
1/*
2 * linux/arch/i386/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
12 * Pavel Machek and
13 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
14 */
15
16#include <linux/delay.h>
17#include <linux/interrupt.h>
18#include <linux/module.h>
19#include <linux/nmi.h>
20#include <linux/sysdev.h>
21#include <linux/sysctl.h>
22#include <linux/percpu.h>
23#include <linux/kprobes.h>
24#include <linux/cpumask.h>
25#include <linux/kernel_stat.h>
26#include <linux/kdebug.h>
27
28#include <asm/smp.h>
29#include <asm/nmi.h>
30
31#include "mach_traps.h"
32
33int unknown_nmi_panic;
34int nmi_watchdog_enabled;
35
36static cpumask_t backtrace_mask = CPU_MASK_NONE;
37
38/* nmi_active:
39 * >0: the lapic NMI watchdog is active, but can be disabled
40 * <0: the lapic NMI watchdog has not been set up, and cannot
41 * be enabled
42 * 0: the lapic NMI watchdog is disabled, but can be enabled
43 */
44atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
45
46unsigned int nmi_watchdog = NMI_DEFAULT;
47static unsigned int nmi_hz = HZ;
48
49static DEFINE_PER_CPU(short, wd_enabled);
50
51/* local prototypes */
52static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
53
54static int endflag __initdata = 0;
55
56#ifdef CONFIG_SMP
57/* The performance counters used by NMI_LOCAL_APIC don't trigger when
58 * the CPU is idle. To make sure the NMI watchdog really ticks on all
59 * CPUs during the test make them busy.
60 */
61static __init void nmi_cpu_busy(void *data)
62{
63 local_irq_enable_in_hardirq();
64 /* Intentionally don't use cpu_relax here. This is
65 to make sure that the performance counter really ticks,
66 even if there is a simulator or similar that catches the
67 pause instruction. On a real HT machine this is fine because
68 all other CPUs are busy with "useless" delay loops and don't
69	    care if they get somewhat fewer cycles. */
70 while (endflag == 0)
71 mb();
72}
73#endif
74
75static int __init check_nmi_watchdog(void)
76{
77 unsigned int *prev_nmi_count;
78 int cpu;
79
80 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
81 return 0;
82
83 if (!atomic_read(&nmi_active))
84 return 0;
85
86 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
87 if (!prev_nmi_count)
88 return -1;
89
90 printk(KERN_INFO "Testing NMI watchdog ... ");
91
92 if (nmi_watchdog == NMI_LOCAL_APIC)
93 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
94
95 for_each_possible_cpu(cpu)
96 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
97 local_irq_enable();
98 mdelay((20*1000)/nmi_hz); // wait 20 ticks
99
100 for_each_possible_cpu(cpu) {
101#ifdef CONFIG_SMP
102 /* Check cpu_callin_map here because that is set
103 after the timer is started. */
104 if (!cpu_isset(cpu, cpu_callin_map))
105 continue;
106#endif
107 if (!per_cpu(wd_enabled, cpu))
108 continue;
109 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
110 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
111 cpu,
112 prev_nmi_count[cpu],
113 nmi_count(cpu));
114 per_cpu(wd_enabled, cpu) = 0;
115 atomic_dec(&nmi_active);
116 }
117 }
118 endflag = 1;
119 if (!atomic_read(&nmi_active)) {
120 kfree(prev_nmi_count);
121 atomic_set(&nmi_active, -1);
122 return -1;
123 }
124 printk("OK.\n");
125
126 /* now that we know it works we can reduce NMI frequency to
127 something more reasonable; makes a difference in some configs */
128 if (nmi_watchdog == NMI_LOCAL_APIC)
129 nmi_hz = lapic_adjust_nmi_hz(1);
130
131 kfree(prev_nmi_count);
132 return 0;
133}
134/* This needs to happen later in boot so counters are working */
135late_initcall(check_nmi_watchdog);
136
137static int __init setup_nmi_watchdog(char *str)
138{
139 int nmi;
140
141 get_option(&str, &nmi);
142
143 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
144 return 0;
145
146 nmi_watchdog = nmi;
147 return 1;
148}
149
150__setup("nmi_watchdog=", setup_nmi_watchdog);
151
152
153/* Suspend/resume support */
154
155#ifdef CONFIG_PM
156
157static int nmi_pm_active; /* nmi_active before suspend */
158
159static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
160{
161 /* only CPU0 goes here, other CPUs should be offline */
162 nmi_pm_active = atomic_read(&nmi_active);
163 stop_apic_nmi_watchdog(NULL);
164 BUG_ON(atomic_read(&nmi_active) != 0);
165 return 0;
166}
167
168static int lapic_nmi_resume(struct sys_device *dev)
169{
170 /* only CPU0 goes here, other CPUs should be offline */
171 if (nmi_pm_active > 0) {
172 setup_apic_nmi_watchdog(NULL);
173 touch_nmi_watchdog();
174 }
175 return 0;
176}
177
178
179static struct sysdev_class nmi_sysclass = {
180 set_kset_name("lapic_nmi"),
181 .resume = lapic_nmi_resume,
182 .suspend = lapic_nmi_suspend,
183};
184
185static struct sys_device device_lapic_nmi = {
186 .id = 0,
187 .cls = &nmi_sysclass,
188};
189
190static int __init init_lapic_nmi_sysfs(void)
191{
192 int error;
193
194 /* should really be a BUG_ON but b/c this is an
195 * init call, it just doesn't work. -dcz
196 */
197 if (nmi_watchdog != NMI_LOCAL_APIC)
198 return 0;
199
200 if (atomic_read(&nmi_active) < 0)
201 return 0;
202
203 error = sysdev_class_register(&nmi_sysclass);
204 if (!error)
205 error = sysdev_register(&device_lapic_nmi);
206 return error;
207}
208/* must come after the local APIC's device_initcall() */
209late_initcall(init_lapic_nmi_sysfs);
210
211#endif /* CONFIG_PM */
212
213static void __acpi_nmi_enable(void *__unused)
214{
215 apic_write_around(APIC_LVT0, APIC_DM_NMI);
216}
217
218/*
219 * Enable timer based NMIs on all CPUs:
220 */
221void acpi_nmi_enable(void)
222{
223 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
224 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
225}
226
227static void __acpi_nmi_disable(void *__unused)
228{
229 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
230}
231
232/*
233 * Disable timer based NMIs on all CPUs:
234 */
235void acpi_nmi_disable(void)
236{
237 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
238 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
239}
240
241void setup_apic_nmi_watchdog (void *unused)
242{
243 if (__get_cpu_var(wd_enabled))
244 return;
245
246 /* cheap hack to support suspend/resume */
247 /* if cpu0 is not active neither should the other cpus */
248 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
249 return;
250
251 switch (nmi_watchdog) {
252 case NMI_LOCAL_APIC:
253 __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
254 if (lapic_watchdog_init(nmi_hz) < 0) {
255 __get_cpu_var(wd_enabled) = 0;
256 return;
257 }
258 /* FALL THROUGH */
259 case NMI_IO_APIC:
260 __get_cpu_var(wd_enabled) = 1;
261 atomic_inc(&nmi_active);
262 }
263}
264
265void stop_apic_nmi_watchdog(void *unused)
266{
267 /* only support LOCAL and IO APICs for now */
268 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
269 (nmi_watchdog != NMI_IO_APIC))
270 return;
271 if (__get_cpu_var(wd_enabled) == 0)
272 return;
273 if (nmi_watchdog == NMI_LOCAL_APIC)
274 lapic_watchdog_stop();
275 __get_cpu_var(wd_enabled) = 0;
276 atomic_dec(&nmi_active);
277}
278
279/*
280 * the best way to detect whether a CPU has a 'hard lockup' problem
281 * is to check its local APIC timer IRQ counts. If they are not
282 * changing then that CPU has some problem.
283 *
284 * as these watchdog NMI IRQs are generated on every CPU, we only
285 * have to check the current processor.
286 *
287 * since NMIs don't listen to _any_ locks, we have to be extremely
288 * careful not to rely on unsafe variables. The printk might lock
289 * up though, so we have to break up any console locks first ...
290 * [when there will be more tty-related locks, break them up
291 * here too!]
292 */
293
294static unsigned int
295 last_irq_sums [NR_CPUS],
296 alert_counter [NR_CPUS];
297
298void touch_nmi_watchdog(void)
299{
300 if (nmi_watchdog > 0) {
301 unsigned cpu;
302
303 /*
304 * Just reset the alert counters, (other CPUs might be
305 * spinning on locks we hold):
306 */
307 for_each_present_cpu(cpu) {
308 if (alert_counter[cpu])
309 alert_counter[cpu] = 0;
310 }
311 }
312
313 /*
314 * Tickle the softlockup detector too:
315 */
316 touch_softlockup_watchdog();
317}
318EXPORT_SYMBOL(touch_nmi_watchdog);
319
320extern void die_nmi(struct pt_regs *, const char *msg);
321
322__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
323{
324
325 /*
326 * Since current_thread_info()-> is always on the stack, and we
327 * always switch the stack NMI-atomically, it's safe to use
328 * smp_processor_id().
329 */
330 unsigned int sum;
331 int touched = 0;
332 int cpu = smp_processor_id();
333 int rc=0;
334
335 /* check for other users first */
336 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
337 == NOTIFY_STOP) {
338 rc = 1;
339 touched = 1;
340 }
341
342 if (cpu_isset(cpu, backtrace_mask)) {
343 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
344
345 spin_lock(&lock);
346 printk("NMI backtrace for cpu %d\n", cpu);
347 dump_stack();
348 spin_unlock(&lock);
349 cpu_clear(cpu, backtrace_mask);
350 }
351
352 /*
353 * Take the local apic timer and PIT/HPET into account. We don't
354 * know which one is active, when we have highres/dyntick on
355 */
356 sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0];
357
358	/* if none of the timers is firing, this cpu isn't doing much */
359 if (!touched && last_irq_sums[cpu] == sum) {
360 /*
361 * Ayiee, looks like this CPU is stuck ...
362 * wait a few IRQs (5 seconds) before doing the oops ...
363 */
364 alert_counter[cpu]++;
365 if (alert_counter[cpu] == 5*nmi_hz)
366 /*
367 * die_nmi will return ONLY if NOTIFY_STOP happens..
368 */
369 die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
370 } else {
371 last_irq_sums[cpu] = sum;
372 alert_counter[cpu] = 0;
373 }
374 /* see if the nmi watchdog went off */
375 if (!__get_cpu_var(wd_enabled))
376 return rc;
377 switch (nmi_watchdog) {
378 case NMI_LOCAL_APIC:
379 rc |= lapic_wd_event(nmi_hz);
380 break;
381 case NMI_IO_APIC:
382 /* don't know how to accurately check for this.
383 * just assume it was a watchdog timer interrupt
384 * This matches the old behaviour.
385 */
386 rc = 1;
387 break;
388 }
389 return rc;
390}
391
392int do_nmi_callback(struct pt_regs * regs, int cpu)
393{
394#ifdef CONFIG_SYSCTL
395 if (unknown_nmi_panic)
396 return unknown_nmi_panic_callback(regs, cpu);
397#endif
398 return 0;
399}
400
401#ifdef CONFIG_SYSCTL
402
403static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
404{
405 unsigned char reason = get_nmi_reason();
406 char buf[64];
407
408 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
409 die_nmi(regs, buf);
410 return 0;
411}
412
413/*
414 * proc handler for /proc/sys/kernel/nmi
415 */
416int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
417 void __user *buffer, size_t *length, loff_t *ppos)
418{
419 int old_state;
420
421 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
422 old_state = nmi_watchdog_enabled;
423 proc_dointvec(table, write, file, buffer, length, ppos);
424 if (!!old_state == !!nmi_watchdog_enabled)
425 return 0;
426
427 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
428 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
429 return -EIO;
430 }
431
432 if (nmi_watchdog == NMI_DEFAULT) {
433 if (lapic_watchdog_ok())
434 nmi_watchdog = NMI_LOCAL_APIC;
435 else
436 nmi_watchdog = NMI_IO_APIC;
437 }
438
439 if (nmi_watchdog == NMI_LOCAL_APIC) {
440 if (nmi_watchdog_enabled)
441 enable_lapic_nmi_watchdog();
442 else
443 disable_lapic_nmi_watchdog();
444 } else {
445 printk( KERN_WARNING
446 "NMI watchdog doesn't know what hardware to touch\n");
447 return -EIO;
448 }
449 return 0;
450}
451
452#endif
453
454void __trigger_all_cpu_backtrace(void)
455{
456 int i;
457
458 backtrace_mask = cpu_online_map;
459 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
460 for (i = 0; i < 10 * 1000; i++) {
461 if (cpus_empty(backtrace_mask))
462 break;
463 mdelay(1);
464 }
465}
466
467EXPORT_SYMBOL(nmi_active);
468EXPORT_SYMBOL(nmi_watchdog);
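
The core of nmi_watchdog_tick() above is a simple heuristic: if the sum of local APIC timer interrupts and IRQ0 counts stops changing for roughly five seconds' worth of watchdog NMIs, the CPU is assumed to be locked up. A self-contained sketch of that counter logic (hypothetical names, nmi_hz fixed at 1 purely for illustration):

	#include <stdio.h>

	#define NMI_HZ 1	/* watchdog NMIs per second once calibrated */

	/* Per-CPU state, mirroring last_irq_sums[] and alert_counter[]. */
	struct wd_state {
		unsigned int last_sum;
		unsigned int alert;
	};

	/*
	 * Called once per watchdog NMI with the current timer-interrupt count.
	 * Returns 1 when the count has not moved for ~5 seconds' worth of NMIs.
	 */
	static int watchdog_tick(struct wd_state *s, unsigned int timer_irq_sum)
	{
		if (s->last_sum == timer_irq_sum) {
			if (++s->alert == 5 * NMI_HZ)
				return 1;	/* looks like a hard lockup */
		} else {
			s->last_sum = timer_irq_sum;
			s->alert = 0;
		}
		return 0;
	}

	int main(void)
	{
		struct wd_state s = { 0, 0 };
		unsigned int sum = 42;	/* timer interrupts have stopped advancing */
		int i;

		for (i = 0; i < 6; i++)
			printf("tick %d -> lockup=%d\n", i, watchdog_tick(&s, sum));
		return 0;
	}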
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
new file mode 100644
index 000000000000..9000d82c6dc0
--- /dev/null
+++ b/arch/x86/kernel/numaq_32.c
@@ -0,0 +1,89 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <gone@us.ibm.com>
24 */
25
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/module.h>
30#include <linux/nodemask.h>
31#include <asm/numaq.h>
32#include <asm/topology.h>
33#include <asm/processor.h>
34
35#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
36
37/*
38 * Function: smp_dump_qct()
39 *
40 * Description: gets memory layout from the quad config table. This
41 * function also updates node_online_map with the nodes (quads) present.
42 */
43static void __init smp_dump_qct(void)
44{
45 int node;
46 struct eachquadmem *eq;
47 struct sys_cfg_data *scd =
48 (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
49
50 nodes_clear(node_online_map);
51 for_each_node(node) {
52 if (scd->quads_present31_0 & (1 << node)) {
53 node_set_online(node);
54 eq = &scd->eq[node];
55 /* Convert to pages */
56 node_start_pfn[node] = MB_TO_PAGES(
57 eq->hi_shrd_mem_start - eq->priv_mem_size);
58 node_end_pfn[node] = MB_TO_PAGES(
59 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
60
61 memory_present(node,
62 node_start_pfn[node], node_end_pfn[node]);
63 node_remap_size[node] = node_memmap_size_bytes(node,
64 node_start_pfn[node],
65 node_end_pfn[node]);
66 }
67 }
68}
69
70/*
71 * Unlike Summit, we don't really care to let the NUMA-Q
72 * fall back to flat mode. Don't compile for NUMA-Q
73 * unless you really need it!
74 */
75int __init get_memcfg_numaq(void)
76{
77 smp_dump_qct();
78 return 1;
79}
80
81static int __init numaq_tsc_disable(void)
82{
83 if (num_online_nodes() > 1) {
84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
85 tsc_disable = 1;
86 }
87 return 0;
88}
89arch_initcall(numaq_tsc_disable);
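
The MB_TO_PAGES() conversion above turns a megabyte count into a page-frame count by shifting left by (20 - PAGE_SHIFT). A quick standalone check, assuming the usual 4 KB pages (PAGE_SHIFT == 12):

	#include <stdio.h>

	#define PAGE_SHIFT 12				/* 4 KB pages on i386 */
	#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))

	int main(void)
	{
		/* 64 MB of memory is 64 << 8 = 16384 four-kilobyte pages. */
		printf("64 MB = %lu pages\n", (unsigned long)MB_TO_PAGES(64UL));
		return 0;
	}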
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
new file mode 100644
index 000000000000..739cfb207dd7
--- /dev/null
+++ b/arch/x86/kernel/paravirt_32.c
@@ -0,0 +1,392 @@
1/* Paravirtualization interfaces
2 Copyright (C) 2006 Rusty Russell IBM Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <linux/errno.h>
19#include <linux/module.h>
20#include <linux/efi.h>
21#include <linux/bcd.h>
22#include <linux/highmem.h>
23
24#include <asm/bug.h>
25#include <asm/paravirt.h>
26#include <asm/desc.h>
27#include <asm/setup.h>
28#include <asm/arch_hooks.h>
29#include <asm/time.h>
30#include <asm/irq.h>
31#include <asm/delay.h>
32#include <asm/fixmap.h>
33#include <asm/apic.h>
34#include <asm/tlbflush.h>
35#include <asm/timer.h>
36
37/* nop stub */
38void _paravirt_nop(void)
39{
40}
41
42static void __init default_banner(void)
43{
44 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
45 paravirt_ops.name);
46}
47
48char *memory_setup(void)
49{
50 return paravirt_ops.memory_setup();
51}
52
53/* Simple instruction patching code. */
54#define DEF_NATIVE(name, code) \
55 extern const char start_##name[], end_##name[]; \
56 asm("start_" #name ": " code "; end_" #name ":")
57
58DEF_NATIVE(irq_disable, "cli");
59DEF_NATIVE(irq_enable, "sti");
60DEF_NATIVE(restore_fl, "push %eax; popf");
61DEF_NATIVE(save_fl, "pushf; pop %eax");
62DEF_NATIVE(iret, "iret");
63DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
64DEF_NATIVE(read_cr2, "mov %cr2, %eax");
65DEF_NATIVE(write_cr3, "mov %eax, %cr3");
66DEF_NATIVE(read_cr3, "mov %cr3, %eax");
67DEF_NATIVE(clts, "clts");
68DEF_NATIVE(read_tsc, "rdtsc");
69
70DEF_NATIVE(ud2a, "ud2a");
71
72static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
73 unsigned long addr, unsigned len)
74{
75 const unsigned char *start, *end;
76 unsigned ret;
77
78 switch(type) {
79#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
80 SITE(irq_disable);
81 SITE(irq_enable);
82 SITE(restore_fl);
83 SITE(save_fl);
84 SITE(iret);
85 SITE(irq_enable_sysexit);
86 SITE(read_cr2);
87 SITE(read_cr3);
88 SITE(write_cr3);
89 SITE(clts);
90 SITE(read_tsc);
91#undef SITE
92
93 patch_site:
94 ret = paravirt_patch_insns(ibuf, len, start, end);
95 break;
96
97 case PARAVIRT_PATCH(make_pgd):
98 case PARAVIRT_PATCH(make_pte):
99 case PARAVIRT_PATCH(pgd_val):
100 case PARAVIRT_PATCH(pte_val):
101#ifdef CONFIG_X86_PAE
102 case PARAVIRT_PATCH(make_pmd):
103 case PARAVIRT_PATCH(pmd_val):
104#endif
105 /* These functions end up returning exactly what
106 they're passed, in the same registers. */
107 ret = paravirt_patch_nop();
108 break;
109
110 default:
111 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
112 break;
113 }
114
115 return ret;
116}
117
118unsigned paravirt_patch_nop(void)
119{
120 return 0;
121}
122
123unsigned paravirt_patch_ignore(unsigned len)
124{
125 return len;
126}
127
128struct branch {
129 unsigned char opcode;
130 u32 delta;
131} __attribute__((packed));
132
133unsigned paravirt_patch_call(void *insnbuf,
134 const void *target, u16 tgt_clobbers,
135 unsigned long addr, u16 site_clobbers,
136 unsigned len)
137{
138 struct branch *b = insnbuf;
139 unsigned long delta = (unsigned long)target - (addr+5);
140
141 if (tgt_clobbers & ~site_clobbers)
142 return len; /* target would clobber too much for this site */
143 if (len < 5)
144 return len; /* call too long for patch site */
145
146 b->opcode = 0xe8; /* call */
147 b->delta = delta;
148 BUILD_BUG_ON(sizeof(*b) != 5);
149
150 return 5;
151}
152
153unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
154 unsigned long addr, unsigned len)
155{
156 struct branch *b = insnbuf;
157 unsigned long delta = (unsigned long)target - (addr+5);
158
159 if (len < 5)
160		return len;	/* jmp too long for patch site */
161
162 b->opcode = 0xe9; /* jmp */
163 b->delta = delta;
164
165 return 5;
166}
167
168unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
169 unsigned long addr, unsigned len)
170{
171 void *opfunc = *((void **)&paravirt_ops + type);
172 unsigned ret;
173
174 if (opfunc == NULL)
175 /* If there's no function, patch it with a ud2a (BUG) */
176 ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);
177 else if (opfunc == paravirt_nop)
178 /* If the operation is a nop, then nop the callsite */
179 ret = paravirt_patch_nop();
180 else if (type == PARAVIRT_PATCH(iret) ||
181 type == PARAVIRT_PATCH(irq_enable_sysexit))
182 /* If operation requires a jmp, then jmp */
183 ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len);
184 else
185 /* Otherwise call the function; assume target could
186 clobber any caller-save reg */
187 ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
188 addr, clobbers, len);
189
190 return ret;
191}
192
193unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
194 const char *start, const char *end)
195{
196 unsigned insn_len = end - start;
197
198 if (insn_len > len || start == NULL)
199 insn_len = len;
200 else
201 memcpy(insnbuf, start, insn_len);
202
203 return insn_len;
204}
205
206void init_IRQ(void)
207{
208 paravirt_ops.init_IRQ();
209}
210
211static void native_flush_tlb(void)
212{
213 __native_flush_tlb();
214}
215
216/*
217 * Global pages have to be flushed a bit differently. Not a real
218 * performance problem because this does not happen often.
219 */
220static void native_flush_tlb_global(void)
221{
222 __native_flush_tlb_global();
223}
224
225static void native_flush_tlb_single(unsigned long addr)
226{
227 __native_flush_tlb_single(addr);
228}
229
230/* These are in entry.S */
231extern void native_iret(void);
232extern void native_irq_enable_sysexit(void);
233
234static int __init print_banner(void)
235{
236 paravirt_ops.banner();
237 return 0;
238}
239core_initcall(print_banner);
240
241static struct resource reserve_ioports = {
242 .start = 0,
243 .end = IO_SPACE_LIMIT,
244 .name = "paravirt-ioport",
245 .flags = IORESOURCE_IO | IORESOURCE_BUSY,
246};
247
248static struct resource reserve_iomem = {
249 .start = 0,
250 .end = -1,
251 .name = "paravirt-iomem",
252 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
253};
254
255/*
256 * Reserve the whole legacy IO space to prevent any legacy drivers
257 * from wasting time probing for their hardware. This is a fairly
258 * brute-force approach to disabling all non-virtual drivers.
259 *
260 * Note that this must be called very early to have any effect.
261 */
262int paravirt_disable_iospace(void)
263{
264 int ret;
265
266 ret = request_resource(&ioport_resource, &reserve_ioports);
267 if (ret == 0) {
268 ret = request_resource(&iomem_resource, &reserve_iomem);
269 if (ret)
270 release_resource(&reserve_ioports);
271 }
272
273 return ret;
274}
275
276struct paravirt_ops paravirt_ops = {
277 .name = "bare hardware",
278 .paravirt_enabled = 0,
279 .kernel_rpl = 0,
280 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
281
282 .patch = native_patch,
283 .banner = default_banner,
284 .arch_setup = paravirt_nop,
285 .memory_setup = machine_specific_memory_setup,
286 .get_wallclock = native_get_wallclock,
287 .set_wallclock = native_set_wallclock,
288 .time_init = hpet_time_init,
289 .init_IRQ = native_init_IRQ,
290
291 .cpuid = native_cpuid,
292 .get_debugreg = native_get_debugreg,
293 .set_debugreg = native_set_debugreg,
294 .clts = native_clts,
295 .read_cr0 = native_read_cr0,
296 .write_cr0 = native_write_cr0,
297 .read_cr2 = native_read_cr2,
298 .write_cr2 = native_write_cr2,
299 .read_cr3 = native_read_cr3,
300 .write_cr3 = native_write_cr3,
301 .read_cr4 = native_read_cr4,
302 .read_cr4_safe = native_read_cr4_safe,
303 .write_cr4 = native_write_cr4,
304 .save_fl = native_save_fl,
305 .restore_fl = native_restore_fl,
306 .irq_disable = native_irq_disable,
307 .irq_enable = native_irq_enable,
308 .safe_halt = native_safe_halt,
309 .halt = native_halt,
310 .wbinvd = native_wbinvd,
311 .read_msr = native_read_msr_safe,
312 .write_msr = native_write_msr_safe,
313 .read_tsc = native_read_tsc,
314 .read_pmc = native_read_pmc,
315 .sched_clock = native_sched_clock,
316 .get_cpu_khz = native_calculate_cpu_khz,
317 .load_tr_desc = native_load_tr_desc,
318 .set_ldt = native_set_ldt,
319 .load_gdt = native_load_gdt,
320 .load_idt = native_load_idt,
321 .store_gdt = native_store_gdt,
322 .store_idt = native_store_idt,
323 .store_tr = native_store_tr,
324 .load_tls = native_load_tls,
325 .write_ldt_entry = write_dt_entry,
326 .write_gdt_entry = write_dt_entry,
327 .write_idt_entry = write_dt_entry,
328 .load_esp0 = native_load_esp0,
329
330 .set_iopl_mask = native_set_iopl_mask,
331 .io_delay = native_io_delay,
332
333#ifdef CONFIG_X86_LOCAL_APIC
334 .apic_write = native_apic_write,
335 .apic_write_atomic = native_apic_write_atomic,
336 .apic_read = native_apic_read,
337 .setup_boot_clock = setup_boot_APIC_clock,
338 .setup_secondary_clock = setup_secondary_APIC_clock,
339 .startup_ipi_hook = paravirt_nop,
340#endif
341 .set_lazy_mode = paravirt_nop,
342
343 .pagetable_setup_start = native_pagetable_setup_start,
344 .pagetable_setup_done = native_pagetable_setup_done,
345
346 .flush_tlb_user = native_flush_tlb,
347 .flush_tlb_kernel = native_flush_tlb_global,
348 .flush_tlb_single = native_flush_tlb_single,
349 .flush_tlb_others = native_flush_tlb_others,
350
351 .alloc_pt = paravirt_nop,
352 .alloc_pd = paravirt_nop,
353 .alloc_pd_clone = paravirt_nop,
354 .release_pt = paravirt_nop,
355 .release_pd = paravirt_nop,
356
357 .set_pte = native_set_pte,
358 .set_pte_at = native_set_pte_at,
359 .set_pmd = native_set_pmd,
360 .pte_update = paravirt_nop,
361 .pte_update_defer = paravirt_nop,
362
363#ifdef CONFIG_HIGHPTE
364 .kmap_atomic_pte = kmap_atomic,
365#endif
366
367#ifdef CONFIG_X86_PAE
368 .set_pte_atomic = native_set_pte_atomic,
369 .set_pte_present = native_set_pte_present,
370 .set_pud = native_set_pud,
371 .pte_clear = native_pte_clear,
372 .pmd_clear = native_pmd_clear,
373
374 .pmd_val = native_pmd_val,
375 .make_pmd = native_make_pmd,
376#endif
377
378 .pte_val = native_pte_val,
379 .pgd_val = native_pgd_val,
380
381 .make_pte = native_make_pte,
382 .make_pgd = native_make_pgd,
383
384 .irq_enable_sysexit = native_irq_enable_sysexit,
385 .iret = native_iret,
386
387 .dup_mmap = paravirt_nop,
388 .exit_mmap = paravirt_nop,
389 .activate_mm = paravirt_nop,
390};
391
392EXPORT_SYMBOL(paravirt_ops);
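
paravirt_patch_call() and paravirt_patch_jmp() above overwrite a patch site with a 5-byte direct branch: one opcode byte (0xe8 for call, 0xe9 for jmp) followed by a 32-bit displacement relative to the instruction after the branch. A standalone sketch of that encoding, with hypothetical addresses chosen purely for illustration:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Mirrors struct branch in the patch: one opcode byte plus a 32-bit displacement. */
	struct branch {
		unsigned char opcode;
		uint32_t delta;
	} __attribute__((packed));

	/* Encode a direct CALL rel32 at 'addr' targeting 'target', as paravirt_patch_call does. */
	static unsigned patch_call(void *insnbuf, uint32_t target, uint32_t addr)
	{
		struct branch b;

		b.opcode = 0xe8;		/* CALL rel32 */
		b.delta = target - (addr + 5);	/* displacement relative to the next instruction */
		memcpy(insnbuf, &b, sizeof(b));
		return 5;
	}

	int main(void)
	{
		unsigned char buf[5];
		unsigned i, len = patch_call(buf, 0xc0101000u, 0xc0100000u);

		for (i = 0; i < len; i++)
			printf("%02x ", buf[i]);
		printf("\n");	/* e8 fb 0f 00 00: call with displacement 0xffb */
		return 0;
	}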
diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c
new file mode 100644
index 000000000000..048f09b62553
--- /dev/null
+++ b/arch/x86/kernel/pci-dma_32.c
@@ -0,0 +1,177 @@
1/*
2 * Dynamic DMA mapping support.
3 *
4 * On i386 there is no hardware dynamic DMA address translation,
5 * so consistent alloc/free are merely page allocation/freeing.
6 * The rest of the dynamic DMA mapping interface is implemented
7 * in asm/pci.h.
8 */
9
10#include <linux/types.h>
11#include <linux/mm.h>
12#include <linux/string.h>
13#include <linux/pci.h>
14#include <linux/module.h>
15#include <linux/pci.h>
16#include <asm/io.h>
17
18struct dma_coherent_mem {
19 void *virt_base;
20 u32 device_base;
21 int size;
22 int flags;
23 unsigned long *bitmap;
24};
25
26void *dma_alloc_coherent(struct device *dev, size_t size,
27 dma_addr_t *dma_handle, gfp_t gfp)
28{
29 void *ret;
30 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
31 int order = get_order(size);
32 /* ignore region specifiers */
33 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
34
35 if (mem) {
36 int page = bitmap_find_free_region(mem->bitmap, mem->size,
37 order);
38 if (page >= 0) {
39 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
40 ret = mem->virt_base + (page << PAGE_SHIFT);
41 memset(ret, 0, size);
42 return ret;
43 }
44 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
45 return NULL;
46 }
47
48 if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
49 gfp |= GFP_DMA;
50
51 ret = (void *)__get_free_pages(gfp, order);
52
53 if (ret != NULL) {
54 memset(ret, 0, size);
55 *dma_handle = virt_to_phys(ret);
56 }
57 return ret;
58}
59EXPORT_SYMBOL(dma_alloc_coherent);
60
61void dma_free_coherent(struct device *dev, size_t size,
62 void *vaddr, dma_addr_t dma_handle)
63{
64 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
65 int order = get_order(size);
66
67 if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
68 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
69
70 bitmap_release_region(mem->bitmap, page, order);
71 } else
72 free_pages((unsigned long)vaddr, order);
73}
74EXPORT_SYMBOL(dma_free_coherent);
75
76int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
77 dma_addr_t device_addr, size_t size, int flags)
78{
79 void __iomem *mem_base = NULL;
80 int pages = size >> PAGE_SHIFT;
81 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
82
83 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
84 goto out;
85 if (!size)
86 goto out;
87 if (dev->dma_mem)
88 goto out;
89
90 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
91
92 mem_base = ioremap(bus_addr, size);
93 if (!mem_base)
94 goto out;
95
96 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
97 if (!dev->dma_mem)
98 goto out;
99 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
100 if (!dev->dma_mem->bitmap)
101 goto free1_out;
102
103 dev->dma_mem->virt_base = mem_base;
104 dev->dma_mem->device_base = device_addr;
105 dev->dma_mem->size = pages;
106 dev->dma_mem->flags = flags;
107
108 if (flags & DMA_MEMORY_MAP)
109 return DMA_MEMORY_MAP;
110
111 return DMA_MEMORY_IO;
112
113 free1_out:
114 kfree(dev->dma_mem);
115 out:
116 if (mem_base)
117 iounmap(mem_base);
118 return 0;
119}
120EXPORT_SYMBOL(dma_declare_coherent_memory);
121
122void dma_release_declared_memory(struct device *dev)
123{
124 struct dma_coherent_mem *mem = dev->dma_mem;
125
126 if(!mem)
127 return;
128 dev->dma_mem = NULL;
129 iounmap(mem->virt_base);
130 kfree(mem->bitmap);
131 kfree(mem);
132}
133EXPORT_SYMBOL(dma_release_declared_memory);
134
135void *dma_mark_declared_memory_occupied(struct device *dev,
136 dma_addr_t device_addr, size_t size)
137{
138 struct dma_coherent_mem *mem = dev->dma_mem;
139 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
140 int pos, err;
141
142 if (!mem)
143 return ERR_PTR(-EINVAL);
144
145 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
146 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
147 if (err != 0)
148 return ERR_PTR(err);
149 return mem->virt_base + (pos << PAGE_SHIFT);
150}
151EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
152
153#ifdef CONFIG_PCI
154/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
155
156int forbid_dac;
157EXPORT_SYMBOL(forbid_dac);
158
159static __devinit void via_no_dac(struct pci_dev *dev)
160{
161 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
162 printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
163 forbid_dac = 1;
164 }
165}
166DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
167
168static int check_iommu(char *s)
169{
170 if (!strcmp(s, "usedac")) {
171 forbid_dac = -1;
172 return 1;
173 }
174 return 0;
175}
176__setup("iommu=", check_iommu);
177#endif
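
dma_alloc_coherent() above hands every request to the page allocator rounded up to a power-of-two number of pages via get_order(). A standalone illustration of that rounding, re-implementing the order calculation for 4 KB pages purely for demonstration:

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)

	/* Smallest 'order' such that 2^order pages cover 'size' bytes (cf. get_order()). */
	static int order_for(unsigned long size)
	{
		int order = 0;
		unsigned long pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

		while ((1UL << order) < pages)
			order++;
		return order;
	}

	int main(void)
	{
		/* A 9000-byte descriptor ring needs 3 pages, so it gets a 4-page (order 2) block. */
		printf("9000 bytes -> order %d (%lu bytes allocated)\n",
		       order_for(9000), PAGE_SIZE << order_for(9000));
		return 0;
	}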
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
new file mode 100644
index 000000000000..bc1f2d3ea277
--- /dev/null
+++ b/arch/x86/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
1#include <linux/platform_device.h>
2#include <linux/errno.h>
3#include <linux/init.h>
4
5static __init int add_pcspkr(void)
6{
7 struct platform_device *pd;
8 int ret;
9
10 pd = platform_device_alloc("pcspkr", -1);
11 if (!pd)
12 return -ENOMEM;
13
14 ret = platform_device_add(pd);
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19}
20device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
new file mode 100644
index 000000000000..84664710b784
--- /dev/null
+++ b/arch/x86/kernel/process_32.c
@@ -0,0 +1,951 @@
1/*
2 * linux/arch/i386/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */
9
10/*
11 * This file handles the architecture-dependent parts of process handling..
12 */
13
14#include <stdarg.h>
15
16#include <linux/cpu.h>
17#include <linux/errno.h>
18#include <linux/sched.h>
19#include <linux/fs.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/elfcore.h>
23#include <linux/smp.h>
24#include <linux/stddef.h>
25#include <linux/slab.h>
26#include <linux/vmalloc.h>
27#include <linux/user.h>
28#include <linux/a.out.h>
29#include <linux/interrupt.h>
30#include <linux/utsname.h>
31#include <linux/delay.h>
32#include <linux/reboot.h>
33#include <linux/init.h>
34#include <linux/mc146818rtc.h>
35#include <linux/module.h>
36#include <linux/kallsyms.h>
37#include <linux/ptrace.h>
38#include <linux/random.h>
39#include <linux/personality.h>
40#include <linux/tick.h>
41#include <linux/percpu.h>
42
43#include <asm/uaccess.h>
44#include <asm/pgtable.h>
45#include <asm/system.h>
46#include <asm/io.h>
47#include <asm/ldt.h>
48#include <asm/processor.h>
49#include <asm/i387.h>
50#include <asm/desc.h>
51#include <asm/vm86.h>
52#ifdef CONFIG_MATH_EMULATION
53#include <asm/math_emu.h>
54#endif
55
56#include <linux/err.h>
57
58#include <asm/tlbflush.h>
59#include <asm/cpu.h>
60
61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
62
63static int hlt_counter;
64
65unsigned long boot_option_idle_override = 0;
66EXPORT_SYMBOL(boot_option_idle_override);
67
68DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
69EXPORT_PER_CPU_SYMBOL(current_task);
70
71DEFINE_PER_CPU(int, cpu_number);
72EXPORT_PER_CPU_SYMBOL(cpu_number);
73
74/*
75 * Return saved PC of a blocked thread.
76 */
77unsigned long thread_saved_pc(struct task_struct *tsk)
78{
79 return ((unsigned long *)tsk->thread.esp)[3];
80}
81
82/*
83 * Powermanagement idle function, if any..
84 */
85void (*pm_idle)(void);
86EXPORT_SYMBOL(pm_idle);
87static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
88
89void disable_hlt(void)
90{
91 hlt_counter++;
92}
93
94EXPORT_SYMBOL(disable_hlt);
95
96void enable_hlt(void)
97{
98 hlt_counter--;
99}
100
101EXPORT_SYMBOL(enable_hlt);
102
103/*
104 * We use this if we don't have any better
105 * idle routine..
106 */
107void default_idle(void)
108{
109 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
110 current_thread_info()->status &= ~TS_POLLING;
111 /*
112 * TS_POLLING-cleared state must be visible before we
113 * test NEED_RESCHED:
114 */
115 smp_mb();
116
117 local_irq_disable();
118 if (!need_resched())
119 safe_halt(); /* enables interrupts racelessly */
120 else
121 local_irq_enable();
122 current_thread_info()->status |= TS_POLLING;
123 } else {
124 /* loop is done by the caller */
125 cpu_relax();
126 }
127}
128#ifdef CONFIG_APM_MODULE
129EXPORT_SYMBOL(default_idle);
130#endif
131
132/*
133 * On SMP it's slightly faster (but much more power-consuming!)
134 * to poll the ->work.need_resched flag instead of waiting for the
135 * cross-CPU IPI to arrive. Use this option with caution.
136 */
137static void poll_idle (void)
138{
139 cpu_relax();
140}
141
142#ifdef CONFIG_HOTPLUG_CPU
143#include <asm/nmi.h>
144/* We don't actually take CPU down, just spin without interrupts. */
145static inline void play_dead(void)
146{
147 /* This must be done before dead CPU ack */
148 cpu_exit_clear();
149 wbinvd();
150 mb();
151 /* Ack it */
152 __get_cpu_var(cpu_state) = CPU_DEAD;
153
154 /*
155 * With physical CPU hotplug, we should halt the cpu
156 */
157 local_irq_disable();
158 while (1)
159 halt();
160}
161#else
162static inline void play_dead(void)
163{
164 BUG();
165}
166#endif /* CONFIG_HOTPLUG_CPU */
167
168/*
169 * The idle thread. There's no useful work to be
170 * done, so just try to conserve power and have a
171 * low exit latency (ie sit in a loop waiting for
172 * somebody to say that they'd like to reschedule)
173 */
174void cpu_idle(void)
175{
176 int cpu = smp_processor_id();
177
178 current_thread_info()->status |= TS_POLLING;
179
180 /* endless idle loop with no priority at all */
181 while (1) {
182 tick_nohz_stop_sched_tick();
183 while (!need_resched()) {
184 void (*idle)(void);
185
186 if (__get_cpu_var(cpu_idle_state))
187 __get_cpu_var(cpu_idle_state) = 0;
188
189 check_pgt_cache();
190 rmb();
191 idle = pm_idle;
192
193 if (!idle)
194 idle = default_idle;
195
196 if (cpu_is_offline(cpu))
197 play_dead();
198
199 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
200 idle();
201 }
202 tick_nohz_restart_sched_tick();
203 preempt_enable_no_resched();
204 schedule();
205 preempt_disable();
206 }
207}
208
209void cpu_idle_wait(void)
210{
211 unsigned int cpu, this_cpu = get_cpu();
212 cpumask_t map, tmp = current->cpus_allowed;
213
214 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
215 put_cpu();
216
217 cpus_clear(map);
218 for_each_online_cpu(cpu) {
219 per_cpu(cpu_idle_state, cpu) = 1;
220 cpu_set(cpu, map);
221 }
222
223 __get_cpu_var(cpu_idle_state) = 0;
224
225 wmb();
226 do {
227 ssleep(1);
228 for_each_online_cpu(cpu) {
229 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
230 cpu_clear(cpu, map);
231 }
232 cpus_and(map, map, cpu_online_map);
233 } while (!cpus_empty(map));
234
235 set_cpus_allowed(current, tmp);
236}
237EXPORT_SYMBOL_GPL(cpu_idle_wait);
238
239/*
240 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
241 * which can obviate the IPI needed to trigger a check of need_resched.
242 * We execute MONITOR against need_resched and enter optimized wait state
243 * through MWAIT. Whenever someone changes need_resched, we would be woken
244 * up from MWAIT (without an IPI).
245 *
246 * New with Core Duo processors, MWAIT can take some hints based on CPU
247 * capability.
248 */
249void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
250{
251 if (!need_resched()) {
252 __monitor((void *)&current_thread_info()->flags, 0, 0);
253 smp_mb();
254 if (!need_resched())
255 __mwait(eax, ecx);
256 }
257}
258
259/* Default MONITOR/MWAIT with no hints, used for default C1 state */
260static void mwait_idle(void)
261{
262 local_irq_enable();
263 mwait_idle_with_hints(0, 0);
264}
265
266void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
267{
268 if (cpu_has(c, X86_FEATURE_MWAIT)) {
269 printk("monitor/mwait feature present.\n");
270 /*
271 * Skip, if setup has overridden idle.
272		 * One CPU supports mwait => All CPUs support mwait
273 */
274 if (!pm_idle) {
275 printk("using mwait in idle threads.\n");
276 pm_idle = mwait_idle;
277 }
278 }
279}
280
281static int __init idle_setup(char *str)
282{
283 if (!strcmp(str, "poll")) {
284 printk("using polling idle threads.\n");
285 pm_idle = poll_idle;
286#ifdef CONFIG_X86_SMP
287 if (smp_num_siblings > 1)
288 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
289#endif
290 } else if (!strcmp(str, "mwait"))
291 force_mwait = 1;
292 else
293 return -1;
294
295 boot_option_idle_override = 1;
296 return 0;
297}
298early_param("idle", idle_setup);
299
300void show_regs(struct pt_regs * regs)
301{
302 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
303 unsigned long d0, d1, d2, d3, d6, d7;
304
305 printk("\n");
306 printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
307 printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
308 print_symbol("EIP is at %s\n", regs->eip);
309
310 if (user_mode_vm(regs))
311 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
312 printk(" EFLAGS: %08lx %s (%s %.*s)\n",
313 regs->eflags, print_tainted(), init_utsname()->release,
314 (int)strcspn(init_utsname()->version, " "),
315 init_utsname()->version);
316 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
317 regs->eax,regs->ebx,regs->ecx,regs->edx);
318 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
319 regs->esi, regs->edi, regs->ebp);
320 printk(" DS: %04x ES: %04x FS: %04x\n",
321 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
322
323 cr0 = read_cr0();
324 cr2 = read_cr2();
325 cr3 = read_cr3();
326 cr4 = read_cr4_safe();
327 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
328
329 get_debugreg(d0, 0);
330 get_debugreg(d1, 1);
331 get_debugreg(d2, 2);
332 get_debugreg(d3, 3);
333 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
334 d0, d1, d2, d3);
335 get_debugreg(d6, 6);
336 get_debugreg(d7, 7);
337 printk("DR6: %08lx DR7: %08lx\n", d6, d7);
338
339 show_trace(NULL, regs, &regs->esp);
340}
341
342/*
343 * This gets run with %ebx containing the
344 * function to call, and %edx containing
345 * the "args".
346 */
347extern void kernel_thread_helper(void);
348
349/*
350 * Create a kernel thread
351 */
352int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
353{
354 struct pt_regs regs;
355
356 memset(&regs, 0, sizeof(regs));
357
358 regs.ebx = (unsigned long) fn;
359 regs.edx = (unsigned long) arg;
360
361 regs.xds = __USER_DS;
362 regs.xes = __USER_DS;
363 regs.xfs = __KERNEL_PERCPU;
364 regs.orig_eax = -1;
365 regs.eip = (unsigned long) kernel_thread_helper;
366 regs.xcs = __KERNEL_CS | get_kernel_rpl();
367 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
368
369 /* Ok, create the new process.. */
370 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
371}
372EXPORT_SYMBOL(kernel_thread);
373
374/*
375 * Free current thread data structures etc..
376 */
377void exit_thread(void)
378{
379 /* The process may have allocated an io port bitmap... nuke it. */
380 if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
381 struct task_struct *tsk = current;
382 struct thread_struct *t = &tsk->thread;
383 int cpu = get_cpu();
384 struct tss_struct *tss = &per_cpu(init_tss, cpu);
385
386 kfree(t->io_bitmap_ptr);
387 t->io_bitmap_ptr = NULL;
388 clear_thread_flag(TIF_IO_BITMAP);
389 /*
390 * Careful, clear this in the TSS too:
391 */
392 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
393 t->io_bitmap_max = 0;
394 tss->io_bitmap_owner = NULL;
395 tss->io_bitmap_max = 0;
396 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
397 put_cpu();
398 }
399}
400
401void flush_thread(void)
402{
403 struct task_struct *tsk = current;
404
405 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
406 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
407 clear_tsk_thread_flag(tsk, TIF_DEBUG);
408 /*
409 * Forget coprocessor state..
410 */
411 clear_fpu(tsk);
412 clear_used_math();
413}
414
415void release_thread(struct task_struct *dead_task)
416{
417 BUG_ON(dead_task->mm);
418 release_vm86_irqs(dead_task);
419}
420
421/*
422 * This gets called before we allocate a new thread and copy
423 * the current task into it.
424 */
425void prepare_to_copy(struct task_struct *tsk)
426{
427 unlazy_fpu(tsk);
428}
429
430int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
431 unsigned long unused,
432 struct task_struct * p, struct pt_regs * regs)
433{
434 struct pt_regs * childregs;
435 struct task_struct *tsk;
436 int err;
437
438 childregs = task_pt_regs(p);
439 *childregs = *regs;
440 childregs->eax = 0;
441 childregs->esp = esp;
442
443 p->thread.esp = (unsigned long) childregs;
444 p->thread.esp0 = (unsigned long) (childregs+1);
445
446 p->thread.eip = (unsigned long) ret_from_fork;
447
448 savesegment(gs,p->thread.gs);
449
450 tsk = current;
451 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
452 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
453 IO_BITMAP_BYTES, GFP_KERNEL);
454 if (!p->thread.io_bitmap_ptr) {
455 p->thread.io_bitmap_max = 0;
456 return -ENOMEM;
457 }
458 set_tsk_thread_flag(p, TIF_IO_BITMAP);
459 }
460
461 /*
462 * Set a new TLS for the child thread?
463 */
464 if (clone_flags & CLONE_SETTLS) {
465 struct desc_struct *desc;
466 struct user_desc info;
467 int idx;
468
469 err = -EFAULT;
470 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
471 goto out;
472 err = -EINVAL;
473 if (LDT_empty(&info))
474 goto out;
475
476 idx = info.entry_number;
477 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
478 goto out;
479
480 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
481 desc->a = LDT_entry_a(&info);
482 desc->b = LDT_entry_b(&info);
483 }
484
485 err = 0;
486 out:
487 if (err && p->thread.io_bitmap_ptr) {
488 kfree(p->thread.io_bitmap_ptr);
489 p->thread.io_bitmap_max = 0;
490 }
491 return err;
492}
493
494/*
495 * fill in the user structure for a core dump..
496 */
497void dump_thread(struct pt_regs * regs, struct user * dump)
498{
499 int i;
500
501/* changed the size calculations - should hopefully work better. lbt */
502 dump->magic = CMAGIC;
503 dump->start_code = 0;
504 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
505 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
506 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
507 dump->u_dsize -= dump->u_tsize;
508 dump->u_ssize = 0;
509 for (i = 0; i < 8; i++)
510 dump->u_debugreg[i] = current->thread.debugreg[i];
511
512 if (dump->start_stack < TASK_SIZE)
513 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
514
515 dump->regs.ebx = regs->ebx;
516 dump->regs.ecx = regs->ecx;
517 dump->regs.edx = regs->edx;
518 dump->regs.esi = regs->esi;
519 dump->regs.edi = regs->edi;
520 dump->regs.ebp = regs->ebp;
521 dump->regs.eax = regs->eax;
522 dump->regs.ds = regs->xds;
523 dump->regs.es = regs->xes;
524 dump->regs.fs = regs->xfs;
525 savesegment(gs,dump->regs.gs);
526 dump->regs.orig_eax = regs->orig_eax;
527 dump->regs.eip = regs->eip;
528 dump->regs.cs = regs->xcs;
529 dump->regs.eflags = regs->eflags;
530 dump->regs.esp = regs->esp;
531 dump->regs.ss = regs->xss;
532
533 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
534}
535EXPORT_SYMBOL(dump_thread);
536
537/*
538 * Capture the user space registers if the task is not running (in user space)
539 */
540int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
541{
542 struct pt_regs ptregs = *task_pt_regs(tsk);
543 ptregs.xcs &= 0xffff;
544 ptregs.xds &= 0xffff;
545 ptregs.xes &= 0xffff;
546 ptregs.xss &= 0xffff;
547
548 elf_core_copy_regs(regs, &ptregs);
549
550 return 1;
551}
552
553#ifdef CONFIG_SECCOMP
554void hard_disable_TSC(void)
555{
556 write_cr4(read_cr4() | X86_CR4_TSD);
557}
558void disable_TSC(void)
559{
560 preempt_disable();
561 if (!test_and_set_thread_flag(TIF_NOTSC))
562 /*
563 * Must flip the CPU state synchronously with
564 * TIF_NOTSC in the current running context.
565 */
566 hard_disable_TSC();
567 preempt_enable();
568}
569void hard_enable_TSC(void)
570{
571 write_cr4(read_cr4() & ~X86_CR4_TSD);
572}
573#endif /* CONFIG_SECCOMP */
574
575static noinline void
576__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
577 struct tss_struct *tss)
578{
579 struct thread_struct *next;
580
581 next = &next_p->thread;
582
583 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
584 set_debugreg(next->debugreg[0], 0);
585 set_debugreg(next->debugreg[1], 1);
586 set_debugreg(next->debugreg[2], 2);
587 set_debugreg(next->debugreg[3], 3);
588 /* no 4 and 5 */
589 set_debugreg(next->debugreg[6], 6);
590 set_debugreg(next->debugreg[7], 7);
591 }
592
593#ifdef CONFIG_SECCOMP
594 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
595 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
596 /* prev and next are different */
597 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
598 hard_disable_TSC();
599 else
600 hard_enable_TSC();
601 }
602#endif
603
604 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
605 /*
606 * Disable the bitmap via an invalid offset. We still cache
607 * the previous bitmap owner and the IO bitmap contents:
608 */
609 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
610 return;
611 }
612
613 if (likely(next == tss->io_bitmap_owner)) {
614 /*
615 * Previous owner of the bitmap (hence the bitmap content)
616	 * matches the next task, we don't have to do anything but
617 * to set a valid offset in the TSS:
618 */
619 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
620 return;
621 }
622 /*
623	 * Lazy TSS I/O bitmap copy. We set an invalid offset here and
624	 * let the task take a GPF if an I/O instruction is performed.
625	 * The GPF handler verifies that the faulting task has a valid
626	 * I/O bitmap and, if true, does the real copy and restarts the
627	 * instruction. This saves us redundant copies when the newly
628	 * switched-in task does not perform any I/O during its
629	 * timeslice.
630 */
631 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
632}
633
634/*
635 * switch_to(x,y) should switch tasks from x to y.
636 *
637 * We fsave/fwait so that an exception goes off at the right time
638 * (as a call from the fsave or fwait in effect) rather than to
639 * the wrong process. Lazy FP saving no longer makes any sense
640 * with modern CPUs, and this simplifies a lot of things (SMP
641 * and UP become the same).
642 *
643 * NOTE! We used to use the x86 hardware context switching. The
644 * reason for not using it any more becomes apparent when you
645 * try to recover gracefully from saved state that is no longer
646 * valid (stale segment register values in particular). With the
647 * hardware task-switch, there is no way to fix up bad state in
648 * a reasonable manner.
649 *
650 * The fact that Intel documents the hardware task-switching to
651 * be slow is a fairly red herring - this code is not noticeably
652 * faster. However, there _is_ some room for improvement here,
653 * so the performance issues may eventually be a valid point.
654 * More important, however, is the fact that this allows us much
655 * more flexibility.
656 *
657 * The return value (in %eax) will be the "prev" task after
658 * the task-switch, and shows up in ret_from_fork in entry.S,
659 * for example.
660 */
661struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
662{
663 struct thread_struct *prev = &prev_p->thread,
664 *next = &next_p->thread;
665 int cpu = smp_processor_id();
666 struct tss_struct *tss = &per_cpu(init_tss, cpu);
667
668 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
669
670 __unlazy_fpu(prev_p);
671
672
673 /* we're going to use this soon, after a few expensive things */
674 if (next_p->fpu_counter > 5)
675 prefetch(&next->i387.fxsave);
676
677 /*
678 * Reload esp0.
679 */
680 load_esp0(tss, next);
681
682 /*
683 * Save away %gs. No need to save %fs, as it was saved on the
684 * stack on entry. No need to save %es and %ds, as those are
685 * always kernel segments while inside the kernel. Doing this
686 * before setting the new TLS descriptors avoids the situation
687 * where we temporarily have non-reloadable segments in %fs
688 * and %gs. This could be an issue if the NMI handler ever
689 * used %fs or %gs (it does not today), or if the kernel is
690 * running inside of a hypervisor layer.
691 */
692 savesegment(gs, prev->gs);
693
694 /*
695 * Load the per-thread Thread-Local Storage descriptor.
696 */
697 load_TLS(next, cpu);
698
699 /*
700 * Restore IOPL if needed. In normal use, the flags restore
701 * in the switch assembly will handle this. But if the kernel
702 * is running virtualized at a non-zero CPL, the popf will
703 * not restore flags, so it must be done in a separate step.
704 */
705 if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
706 set_iopl_mask(next->iopl);
707
708 /*
709 * Now maybe handle debug registers and/or IO bitmaps
710 */
711 if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
712 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
713 __switch_to_xtra(prev_p, next_p, tss);
714
715 /*
716 * Leave lazy mode, flushing any hypercalls made here.
717 * This must be done before restoring TLS segments so
718 * the GDT and LDT are properly updated, and must be
719 * done before math_state_restore, so the TS bit is up
720 * to date.
721 */
722 arch_leave_lazy_cpu_mode();
723
724	/* If the task has used the FPU in the last 5 timeslices, just do a
725	 * full restore of the math state immediately to avoid the trap; the
726	 * chances of needing the FPU soon are obviously high now
727 */
728 if (next_p->fpu_counter > 5)
729 math_state_restore();
730
731 /*
732 * Restore %gs if needed (which is common)
733 */
734 if (prev->gs | next->gs)
735 loadsegment(gs, next->gs);
736
737 x86_write_percpu(current_task, next_p);
738
739 return prev_p;
740}
741
742asmlinkage int sys_fork(struct pt_regs regs)
743{
744 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
745}
746
747asmlinkage int sys_clone(struct pt_regs regs)
748{
749 unsigned long clone_flags;
750 unsigned long newsp;
751 int __user *parent_tidptr, *child_tidptr;
752
753 clone_flags = regs.ebx;
754 newsp = regs.ecx;
755 parent_tidptr = (int __user *)regs.edx;
756 child_tidptr = (int __user *)regs.edi;
757 if (!newsp)
758 newsp = regs.esp;
759 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
760}
761
762/*
763 * This is trivial, and on the face of it looks like it
764 * could equally well be done in user mode.
765 *
766 * Not so, for quite unobvious reasons - register pressure.
767 * In user mode vfork() cannot have a stack frame, and if
768 * done by calling the "clone()" system call directly, you
769 * do not have enough call-clobbered registers to hold all
770 * the information you need.
771 */
772asmlinkage int sys_vfork(struct pt_regs regs)
773{
774 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
775}
776
777/*
778 * sys_execve() executes a new program.
779 */
780asmlinkage int sys_execve(struct pt_regs regs)
781{
782 int error;
783 char * filename;
784
785 filename = getname((char __user *) regs.ebx);
786 error = PTR_ERR(filename);
787 if (IS_ERR(filename))
788 goto out;
789 error = do_execve(filename,
790 (char __user * __user *) regs.ecx,
791 (char __user * __user *) regs.edx,
792 &regs);
793 if (error == 0) {
794 task_lock(current);
795 current->ptrace &= ~PT_DTRACE;
796 task_unlock(current);
797 /* Make sure we don't return using sysenter.. */
798 set_thread_flag(TIF_IRET);
799 }
800 putname(filename);
801out:
802 return error;
803}
804
805#define top_esp (THREAD_SIZE - sizeof(unsigned long))
806#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
807
808unsigned long get_wchan(struct task_struct *p)
809{
810 unsigned long ebp, esp, eip;
811 unsigned long stack_page;
812 int count = 0;
813 if (!p || p == current || p->state == TASK_RUNNING)
814 return 0;
815 stack_page = (unsigned long)task_stack_page(p);
816 esp = p->thread.esp;
817 if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
818 return 0;
819 /* include/asm-i386/system.h:switch_to() pushes ebp last. */
820 ebp = *(unsigned long *) esp;
821 do {
822 if (ebp < stack_page || ebp > top_ebp+stack_page)
823 return 0;
824 eip = *(unsigned long *) (ebp+4);
825 if (!in_sched_functions(eip))
826 return eip;
827 ebp = *(unsigned long *) ebp;
828 } while (count++ < 16);
829 return 0;
830}
831
832/*
833 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
834 */
835static int get_free_idx(void)
836{
837 struct thread_struct *t = &current->thread;
838 int idx;
839
840 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
841 if (desc_empty(t->tls_array + idx))
842 return idx + GDT_ENTRY_TLS_MIN;
843 return -ESRCH;
844}
845
846/*
847 * Set a given TLS descriptor:
848 */
849asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
850{
851 struct thread_struct *t = &current->thread;
852 struct user_desc info;
853 struct desc_struct *desc;
854 int cpu, idx;
855
856 if (copy_from_user(&info, u_info, sizeof(info)))
857 return -EFAULT;
858 idx = info.entry_number;
859
860 /*
861 * index -1 means the kernel should try to find and
862 * allocate an empty descriptor:
863 */
864 if (idx == -1) {
865 idx = get_free_idx();
866 if (idx < 0)
867 return idx;
868 if (put_user(idx, &u_info->entry_number))
869 return -EFAULT;
870 }
871
872 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
873 return -EINVAL;
874
875 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
876
877 /*
878 * We must not get preempted while modifying the TLS.
879 */
880 cpu = get_cpu();
881
882 if (LDT_empty(&info)) {
883 desc->a = 0;
884 desc->b = 0;
885 } else {
886 desc->a = LDT_entry_a(&info);
887 desc->b = LDT_entry_b(&info);
888 }
889 load_TLS(t, cpu);
890
891 put_cpu();
892
893 return 0;
894}
895
896/*
897 * Get the current Thread-Local Storage area:
898 */
899
900#define GET_BASE(desc) ( \
901 (((desc)->a >> 16) & 0x0000ffff) | \
902 (((desc)->b << 16) & 0x00ff0000) | \
903 ( (desc)->b & 0xff000000) )
904
905#define GET_LIMIT(desc) ( \
906 ((desc)->a & 0x0ffff) | \
907 ((desc)->b & 0xf0000) )
908
909#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
910#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
911#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
912#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
913#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
914#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
915
916asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
917{
918 struct user_desc info;
919 struct desc_struct *desc;
920 int idx;
921
922 if (get_user(idx, &u_info->entry_number))
923 return -EFAULT;
924 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
925 return -EINVAL;
926
927 memset(&info, 0, sizeof(info));
928
929 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
930
931 info.entry_number = idx;
932 info.base_addr = GET_BASE(desc);
933 info.limit = GET_LIMIT(desc);
934 info.seg_32bit = GET_32BIT(desc);
935 info.contents = GET_CONTENTS(desc);
936 info.read_exec_only = !GET_WRITABLE(desc);
937 info.limit_in_pages = GET_LIMIT_PAGES(desc);
938 info.seg_not_present = !GET_PRESENT(desc);
939 info.useable = GET_USEABLE(desc);
940
941 if (copy_to_user(u_info, &info, sizeof(info)))
942 return -EFAULT;
943 return 0;
944}
945
946unsigned long arch_align_stack(unsigned long sp)
947{
948 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
949 sp -= get_random_int() % 8192;
950 return sp & ~0xf;
951}
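Note: the GET_BASE/GET_LIMIT macros above reassemble the base and limit fields that the i386 descriptor format scatters across the two 32-bit words of a GDT entry. The following is a minimal standalone sketch of the same decoding; desc_words, desc_base and desc_limit are illustrative names, not kernel API.

struct desc_words { unsigned int a, b; };	/* low word, high word of a descriptor */

/* base 15..0 lives in a[31:16], base 23..16 in b[7:0], base 31..24 in b[31:24] */
static unsigned int desc_base(const struct desc_words *d)
{
	return ((d->a >> 16) & 0x0000ffff) |
	       ((d->b << 16) & 0x00ff0000) |
	       ( d->b        & 0xff000000);
}

/* 20-bit limit: limit 15..0 in a[15:0], limit 19..16 in b[19:16] */
static unsigned int desc_limit(const struct desc_words *d)
{
	return (d->a & 0x0ffff) | (d->b & 0xf0000);
}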
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
new file mode 100644
index 000000000000..7c1b92522e95
--- /dev/null
+++ b/arch/x86/kernel/ptrace_32.c
@@ -0,0 +1,723 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 */
7
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/mm.h>
11#include <linux/smp.h>
12#include <linux/errno.h>
13#include <linux/ptrace.h>
14#include <linux/user.h>
15#include <linux/security.h>
16#include <linux/audit.h>
17#include <linux/seccomp.h>
18#include <linux/signal.h>
19
20#include <asm/uaccess.h>
21#include <asm/pgtable.h>
22#include <asm/system.h>
23#include <asm/processor.h>
24#include <asm/i387.h>
25#include <asm/debugreg.h>
26#include <asm/ldt.h>
27#include <asm/desc.h>
28
29/*
30 * does not yet catch signals sent when the child dies.
31 * in exit.c or in signal.c.
32 */
33
34/*
35 * Determines which flags the user has access to [1 = access, 0 = no access].
36 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
37 * Also masks reserved bits (31-22, 15, 5, 3, 1).
38 */
39#define FLAG_MASK 0x00050dd5
40
41/* sets the trap flag. */
42#define TRAP_FLAG 0x100
43
44/*
45 * Offset of eflags on child stack..
46 */
47#define EFL_OFFSET offsetof(struct pt_regs, eflags)
48
49static inline struct pt_regs *get_child_regs(struct task_struct *task)
50{
51 void *stack_top = (void *)task->thread.esp0;
52 return stack_top - sizeof(struct pt_regs);
53}
54
55/*
56 * This routine will get a word off of the process's privileged stack.
57 * The offset is in bytes into the pt_regs structure on the stack.
58 * This routine assumes that all the privileged stacks are in our
59 * data space.
60 */
61static inline int get_stack_long(struct task_struct *task, int offset)
62{
63 unsigned char *stack;
64
65 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
66 stack += offset;
67 return (*((int *)stack));
68}
69
70/*
71 * This routine will put a word on the process's privileged stack.
72 * The offset is in bytes into the pt_regs structure on the stack.
73 * This routine assumes that all the privileged stacks are in our
74 * data space.
75 */
76static inline int put_stack_long(struct task_struct *task, int offset,
77 unsigned long data)
78{
79 unsigned char * stack;
80
81 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
82 stack += offset;
83 *(unsigned long *) stack = data;
84 return 0;
85}
86
87static int putreg(struct task_struct *child,
88 unsigned long regno, unsigned long value)
89{
90 switch (regno >> 2) {
91 case GS:
92 if (value && (value & 3) != 3)
93 return -EIO;
94 child->thread.gs = value;
95 return 0;
96 case DS:
97 case ES:
98 case FS:
99 if (value && (value & 3) != 3)
100 return -EIO;
101 value &= 0xffff;
102 break;
103 case SS:
104 case CS:
105 if ((value & 3) != 3)
106 return -EIO;
107 value &= 0xffff;
108 break;
109 case EFL:
110 value &= FLAG_MASK;
111 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
112 break;
113 }
114 if (regno > FS*4)
115 regno -= 1*4;
116 put_stack_long(child, regno, value);
117 return 0;
118}
119
120static unsigned long getreg(struct task_struct *child,
121 unsigned long regno)
122{
123 unsigned long retval = ~0UL;
124
125 switch (regno >> 2) {
126 case GS:
127 retval = child->thread.gs;
128 break;
129 case DS:
130 case ES:
131 case FS:
132 case SS:
133 case CS:
134 retval = 0xffff;
135 /* fall through */
136 default:
137 if (regno > FS*4)
138 regno -= 1*4;
139 retval &= get_stack_long(child, regno);
140 }
141 return retval;
142}
143
144#define LDT_SEGMENT 4
145
146static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
147{
148 unsigned long addr, seg;
149
150 addr = regs->eip;
151 seg = regs->xcs & 0xffff;
152 if (regs->eflags & VM_MASK) {
153 addr = (addr & 0xffff) + (seg << 4);
154 return addr;
155 }
156
157 /*
158 * We'll assume that the code segments in the GDT
159 * are all zero-based. That is largely true: the
160 * TLS segments are used for data, and the PNPBIOS
161 * and APM bios ones we just ignore here.
162 */
163 if (seg & LDT_SEGMENT) {
164 u32 *desc;
165 unsigned long base;
166
167 seg &= ~7UL;
168
169 down(&child->mm->context.sem);
170 if (unlikely((seg >> 3) >= child->mm->context.size))
171 addr = -1L; /* bogus selector, access would fault */
172 else {
173 desc = child->mm->context.ldt + seg;
174 base = ((desc[0] >> 16) |
175 ((desc[1] & 0xff) << 16) |
176 (desc[1] & 0xff000000));
177
178 /* 16-bit code segment? */
179 if (!((desc[1] >> 22) & 1))
180 addr &= 0xffff;
181 addr += base;
182 }
183 up(&child->mm->context.sem);
184 }
185 return addr;
186}
187
188static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
189{
190 int i, copied;
191 unsigned char opcode[15];
192 unsigned long addr = convert_eip_to_linear(child, regs);
193
194 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
195 for (i = 0; i < copied; i++) {
196 switch (opcode[i]) {
197 /* popf and iret */
198 case 0x9d: case 0xcf:
199 return 1;
200 /* opcode and address size prefixes */
201 case 0x66: case 0x67:
202 continue;
203 /* irrelevant prefixes (segment overrides and repeats) */
204 case 0x26: case 0x2e:
205 case 0x36: case 0x3e:
206 case 0x64: case 0x65:
207 case 0xf0: case 0xf2: case 0xf3:
208 continue;
209
210 /*
211 * pushf: NOTE! We should probably not let
212 * the user see the TF bit being set. But
213 * it's more pain than it's worth to avoid
214 * it, and a debugger could emulate this
215 * all in user space if it _really_ cares.
216 */
217 case 0x9c:
218 default:
219 return 0;
220 }
221 }
222 return 0;
223}
224
225static void set_singlestep(struct task_struct *child)
226{
227 struct pt_regs *regs = get_child_regs(child);
228
229 /*
230 * Always set TIF_SINGLESTEP - this guarantees that
231 * we single-step system calls etc.. This will also
232 * cause us to set TF when returning to user mode.
233 */
234 set_tsk_thread_flag(child, TIF_SINGLESTEP);
235
236 /*
237 * If TF was already set, don't do anything else
238 */
239 if (regs->eflags & TRAP_FLAG)
240 return;
241
242 /* Set TF on the kernel stack.. */
243 regs->eflags |= TRAP_FLAG;
244
245 /*
246 * ..but if TF is changed by the instruction we will trace,
247 * don't mark it as being "us" that set it, so that we
248 * won't clear it by hand later.
249 */
250 if (is_setting_trap_flag(child, regs))
251 return;
252
253 child->ptrace |= PT_DTRACE;
254}
255
256static void clear_singlestep(struct task_struct *child)
257{
258 /* Always clear TIF_SINGLESTEP... */
259 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
260
261 /* But touch TF only if it was set by us.. */
262 if (child->ptrace & PT_DTRACE) {
263 struct pt_regs *regs = get_child_regs(child);
264 regs->eflags &= ~TRAP_FLAG;
265 child->ptrace &= ~PT_DTRACE;
266 }
267}
268
269/*
270 * Called by kernel/ptrace.c when detaching..
271 *
272 * Make sure the single step bit is not set.
273 */
274void ptrace_disable(struct task_struct *child)
275{
276 clear_singlestep(child);
277 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
278}
279
280/*
281 * Perform get_thread_area on behalf of the traced child.
282 */
283static int
284ptrace_get_thread_area(struct task_struct *child,
285 int idx, struct user_desc __user *user_desc)
286{
287 struct user_desc info;
288 struct desc_struct *desc;
289
290/*
291 * Get the current Thread-Local Storage area:
292 */
293
294#define GET_BASE(desc) ( \
295 (((desc)->a >> 16) & 0x0000ffff) | \
296 (((desc)->b << 16) & 0x00ff0000) | \
297 ( (desc)->b & 0xff000000) )
298
299#define GET_LIMIT(desc) ( \
300 ((desc)->a & 0x0ffff) | \
301 ((desc)->b & 0xf0000) )
302
303#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
304#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
305#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
306#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
307#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
308#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
309
310 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
311 return -EINVAL;
312
313 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
314
315 info.entry_number = idx;
316 info.base_addr = GET_BASE(desc);
317 info.limit = GET_LIMIT(desc);
318 info.seg_32bit = GET_32BIT(desc);
319 info.contents = GET_CONTENTS(desc);
320 info.read_exec_only = !GET_WRITABLE(desc);
321 info.limit_in_pages = GET_LIMIT_PAGES(desc);
322 info.seg_not_present = !GET_PRESENT(desc);
323 info.useable = GET_USEABLE(desc);
324
325 if (copy_to_user(user_desc, &info, sizeof(info)))
326 return -EFAULT;
327
328 return 0;
329}
330
331/*
332 * Perform set_thread_area on behalf of the traced child.
333 */
334static int
335ptrace_set_thread_area(struct task_struct *child,
336 int idx, struct user_desc __user *user_desc)
337{
338 struct user_desc info;
339 struct desc_struct *desc;
340
341 if (copy_from_user(&info, user_desc, sizeof(info)))
342 return -EFAULT;
343
344 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
345 return -EINVAL;
346
347 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
348 if (LDT_empty(&info)) {
349 desc->a = 0;
350 desc->b = 0;
351 } else {
352 desc->a = LDT_entry_a(&info);
353 desc->b = LDT_entry_b(&info);
354 }
355
356 return 0;
357}
358
359long arch_ptrace(struct task_struct *child, long request, long addr, long data)
360{
361 struct user * dummy = NULL;
362 int i, ret;
363 unsigned long __user *datap = (unsigned long __user *)data;
364
365 switch (request) {
366 /* when I and D space are separate, these will need to be fixed. */
367 case PTRACE_PEEKTEXT: /* read word at location addr. */
368 case PTRACE_PEEKDATA:
369 ret = generic_ptrace_peekdata(child, addr, data);
370 break;
371
372 /* read the word at location addr in the USER area. */
373 case PTRACE_PEEKUSR: {
374 unsigned long tmp;
375
376 ret = -EIO;
377 if ((addr & 3) || addr < 0 ||
378 addr > sizeof(struct user) - 3)
379 break;
380
381 tmp = 0; /* Default return condition */
382 if(addr < FRAME_SIZE*sizeof(long))
383 tmp = getreg(child, addr);
384 if(addr >= (long) &dummy->u_debugreg[0] &&
385 addr <= (long) &dummy->u_debugreg[7]){
386 addr -= (long) &dummy->u_debugreg[0];
387 addr = addr >> 2;
388 tmp = child->thread.debugreg[addr];
389 }
390 ret = put_user(tmp, datap);
391 break;
392 }
393
394 /* when I and D space are separate, this will have to be fixed. */
395 case PTRACE_POKETEXT: /* write the word at location addr. */
396 case PTRACE_POKEDATA:
397 ret = generic_ptrace_pokedata(child, addr, data);
398 break;
399
400 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
401 ret = -EIO;
402 if ((addr & 3) || addr < 0 ||
403 addr > sizeof(struct user) - 3)
404 break;
405
406 if (addr < FRAME_SIZE*sizeof(long)) {
407 ret = putreg(child, addr, data);
408 break;
409 }
410 /* We need to be very careful here. We implicitly
411 want to modify a portion of the task_struct, and we
412 have to be selective about what portions we allow someone
413 to modify. */
414
415 ret = -EIO;
416 if(addr >= (long) &dummy->u_debugreg[0] &&
417 addr <= (long) &dummy->u_debugreg[7]){
418
419 if(addr == (long) &dummy->u_debugreg[4]) break;
420 if(addr == (long) &dummy->u_debugreg[5]) break;
421 if(addr < (long) &dummy->u_debugreg[4] &&
422 ((unsigned long) data) >= TASK_SIZE-3) break;
423
424			/* Sanity-check data. Take one nibble (half-byte) at a time with
425 * check = (val >> (16 + 4*i)) & 0xf. It contains the
426 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
427 * 2 and 3 are LENi. Given a list of invalid values,
428 * we do mask |= 1 << invalid_value, so that
429 * (mask >> check) & 1 is a correct test for invalid
430 * values.
431 *
432 * R/Wi contains the type of the breakpoint /
433 * watchpoint, LENi contains the length of the watched
434 * data in the watchpoint case.
435 *
436 * The invalid values are:
437			 * - LENi == binary 10 (undefined), so mask |= 0x0f00.
438			 * - R/Wi == binary 10 (break on I/O reads or writes), so
439 * mask |= 0x4444.
440 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
441 * 0x1110.
442 *
443 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
444 *
445 * See the Intel Manual "System Programming Guide",
446 * 15.2.4
447 *
448			 * Note that LENi == binary 10 is defined on x86_64 in long
449 * mode (i.e. even for 32-bit userspace software, but
450 * 64-bit kernel), so the x86_64 mask value is 0x5454.
451 * See the AMD manual no. 24593 (AMD64 System
452 * Programming)*/
453
454 if(addr == (long) &dummy->u_debugreg[7]) {
455 data &= ~DR_CONTROL_RESERVED;
456 for(i=0; i<4; i++)
457 if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
458 goto out_tsk;
459 if (data)
460 set_tsk_thread_flag(child, TIF_DEBUG);
461 else
462 clear_tsk_thread_flag(child, TIF_DEBUG);
463 }
464 addr -= (long) &dummy->u_debugreg;
465 addr = addr >> 2;
466 child->thread.debugreg[addr] = data;
467 ret = 0;
468 }
469 break;
470
471 case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
472 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
473 case PTRACE_CONT: /* restart after signal. */
474 ret = -EIO;
475 if (!valid_signal(data))
476 break;
477 if (request == PTRACE_SYSEMU) {
478 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
479 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
480 } else if (request == PTRACE_SYSCALL) {
481 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
482 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
483 } else {
484 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
485 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
486 }
487 child->exit_code = data;
488 /* make sure the single step bit is not set. */
489 clear_singlestep(child);
490 wake_up_process(child);
491 ret = 0;
492 break;
493
494/*
495 * Make the child exit. The best we can do is send it a SIGKILL;
496 * perhaps it should be recorded in the status that it wants to
497 * exit.
498 */
499 case PTRACE_KILL:
500 ret = 0;
501 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
502 break;
503 child->exit_code = SIGKILL;
504 /* make sure the single step bit is not set. */
505 clear_singlestep(child);
506 wake_up_process(child);
507 break;
508
509 case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
510 case PTRACE_SINGLESTEP: /* set the trap flag. */
511 ret = -EIO;
512 if (!valid_signal(data))
513 break;
514
515 if (request == PTRACE_SYSEMU_SINGLESTEP)
516 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
517 else
518 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
519
520 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
521 set_singlestep(child);
522 child->exit_code = data;
523 /* give it a chance to run. */
524 wake_up_process(child);
525 ret = 0;
526 break;
527
528 case PTRACE_DETACH:
529 /* detach a process that was attached. */
530 ret = ptrace_detach(child, data);
531 break;
532
533 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
534 if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
535 ret = -EIO;
536 break;
537 }
538 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
539 __put_user(getreg(child, i), datap);
540 datap++;
541 }
542 ret = 0;
543 break;
544 }
545
546 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
547 unsigned long tmp;
548 if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
549 ret = -EIO;
550 break;
551 }
552 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
553 __get_user(tmp, datap);
554 putreg(child, i, tmp);
555 datap++;
556 }
557 ret = 0;
558 break;
559 }
560
561 case PTRACE_GETFPREGS: { /* Get the child FPU state. */
562 if (!access_ok(VERIFY_WRITE, datap,
563 sizeof(struct user_i387_struct))) {
564 ret = -EIO;
565 break;
566 }
567 ret = 0;
568 if (!tsk_used_math(child))
569 init_fpu(child);
570 get_fpregs((struct user_i387_struct __user *)data, child);
571 break;
572 }
573
574 case PTRACE_SETFPREGS: { /* Set the child FPU state. */
575 if (!access_ok(VERIFY_READ, datap,
576 sizeof(struct user_i387_struct))) {
577 ret = -EIO;
578 break;
579 }
580 set_stopped_child_used_math(child);
581 set_fpregs(child, (struct user_i387_struct __user *)data);
582 ret = 0;
583 break;
584 }
585
586 case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
587 if (!access_ok(VERIFY_WRITE, datap,
588 sizeof(struct user_fxsr_struct))) {
589 ret = -EIO;
590 break;
591 }
592 if (!tsk_used_math(child))
593 init_fpu(child);
594 ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
595 break;
596 }
597
598 case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
599 if (!access_ok(VERIFY_READ, datap,
600 sizeof(struct user_fxsr_struct))) {
601 ret = -EIO;
602 break;
603 }
604 set_stopped_child_used_math(child);
605 ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
606 break;
607 }
608
609 case PTRACE_GET_THREAD_AREA:
610 ret = ptrace_get_thread_area(child, addr,
611 (struct user_desc __user *) data);
612 break;
613
614 case PTRACE_SET_THREAD_AREA:
615 ret = ptrace_set_thread_area(child, addr,
616 (struct user_desc __user *) data);
617 break;
618
619 default:
620 ret = ptrace_request(child, request, addr, data);
621 break;
622 }
623 out_tsk:
624 return ret;
625}
626
627void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
628{
629 struct siginfo info;
630
631 tsk->thread.trap_no = 1;
632 tsk->thread.error_code = error_code;
633
634 memset(&info, 0, sizeof(info));
635 info.si_signo = SIGTRAP;
636 info.si_code = TRAP_BRKPT;
637
638 /* User-mode eip? */
639 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
640
641 /* Send us the fakey SIGTRAP */
642 force_sig_info(SIGTRAP, &info, tsk);
643}
644
645/* notification of system call entry/exit
646 * - triggered by current->work.syscall_trace
647 */
648__attribute__((regparm(3)))
649int do_syscall_trace(struct pt_regs *regs, int entryexit)
650{
651 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
652 /*
653 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
654 * interception
655 */
656 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
657 int ret = 0;
658
659 /* do the secure computing check first */
660 if (!entryexit)
661 secure_computing(regs->orig_eax);
662
663 if (unlikely(current->audit_context)) {
664 if (entryexit)
665 audit_syscall_exit(AUDITSC_RESULT(regs->eax),
666 regs->eax);
667 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
668 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
669 * not used, entry.S will call us only on syscall exit, not
670 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
671 * calling send_sigtrap() on syscall entry.
672 *
673 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
674		 * is_singlestep is false, despite its name, so we will still do
675 * the correct thing.
676 */
677 else if (is_singlestep)
678 goto out;
679 }
680
681 if (!(current->ptrace & PT_PTRACED))
682 goto out;
683
684 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
685 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
686 * here. We have to check this and return */
687 if (is_sysemu && entryexit)
688 return 0;
689
690 /* Fake a debug trap */
691 if (is_singlestep)
692 send_sigtrap(current, regs, 0);
693
694 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
695 goto out;
696
697 /* the 0x80 provides a way for the tracing parent to distinguish
698 between a syscall stop and SIGTRAP delivery */
699 /* Note that the debugger could change the result of test_thread_flag!*/
700 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
701
702 /*
703 * this isn't the same as continuing with a signal, but it will do
704 * for normal use. strace only continues with a signal if the
705 * stopping signal is not SIGTRAP. -brl
706 */
707 if (current->exit_code) {
708 send_sig(current->exit_code, current, 1);
709 current->exit_code = 0;
710 }
711 ret = is_sysemu;
712out:
713 if (unlikely(current->audit_context) && !entryexit)
714 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
715 regs->ebx, regs->ecx, regs->edx, regs->esi);
716 if (ret == 0)
717 return 0;
718
719 regs->orig_eax = -1; /* force skip of syscall restarting */
720 if (unlikely(current->audit_context))
721 audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
722 return 1;
723}
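Note: the "0x5f54" test in the PTRACE_POKEUSR handler above encodes the per-breakpoint validity check derived in the long comment. Below is a standalone sketch of the same check; dr7_fields_valid is an illustrative name, not kernel API.

static int dr7_fields_valid(unsigned long data)
{
	int i;

	/* Each breakpoint i owns a 4-bit field starting at bit 16+4*i:
	 * R/Wi in bits 0-1, LENi in bits 2-3. 0x5f54 has a 1 at every
	 * invalid field value, so shifting by the field and testing bit
	 * 0 rejects exactly the invalid combinations. */
	for (i = 0; i < 4; i++)
		if ((0x5f54 >> ((data >> (16 + 4 * i)) & 0xf)) & 1)
			return 0;	/* invalid R/W-LEN combination */
	return 1;
}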
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
new file mode 100644
index 000000000000..6722469c2633
--- /dev/null
+++ b/arch/x86/kernel/quirks.c
@@ -0,0 +1,49 @@
1/*
2 * This file contains work-arounds for x86 and x86_64 platform bugs.
3 */
4#include <linux/pci.h>
5#include <linux/irq.h>
6
7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
8
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
10{
11 u8 config, rev;
12 u32 word;
13
14 /* BIOS may enable hardware IRQ balancing for
15	 * E7520/E7320/E7525 (revision ID 0x9 and below)
16 * based platforms.
17 * Disable SW irqbalance/affinity on those platforms.
18 */
19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
20 if (rev > 0x9)
21 return;
22
23 /* enable access to config space*/
24 pci_read_config_byte(dev, 0xf4, &config);
25 pci_write_config_byte(dev, 0xf4, config|0x2);
26
27 /* read xTPR register */
28 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
29
30 if (!(word & (1 << 13))) {
31 printk(KERN_INFO "Intel E7520/7320/7525 detected. "
32 "Disabling irq balancing and affinity\n");
33#ifdef CONFIG_IRQBALANCE
34 irqbalance_disable("");
35#endif
36 noirqdebug_setup("");
37#ifdef CONFIG_PROC_FS
38 no_irq_affinity = 1;
39#endif
40 }
41
42 /* put back the original value for config space*/
43 if (!(config & 0x2))
44 pci_write_config_byte(dev, 0xf4, config);
45}
46DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
49#endif
diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot_32.c
new file mode 100644
index 000000000000..0d796248866c
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.c
@@ -0,0 +1,413 @@
1/*
2 * linux/arch/i386/kernel/reboot.c
3 */
4
5#include <linux/mm.h>
6#include <linux/module.h>
7#include <linux/delay.h>
8#include <linux/init.h>
9#include <linux/interrupt.h>
10#include <linux/mc146818rtc.h>
11#include <linux/efi.h>
12#include <linux/dmi.h>
13#include <linux/ctype.h>
14#include <linux/pm.h>
15#include <linux/reboot.h>
16#include <asm/uaccess.h>
17#include <asm/apic.h>
18#include <asm/desc.h>
19#include "mach_reboot.h"
20#include <asm/reboot_fixups.h>
21#include <asm/reboot.h>
22
23/*
24 * Power off function, if any
25 */
26void (*pm_power_off)(void);
27EXPORT_SYMBOL(pm_power_off);
28
29static int reboot_mode;
30static int reboot_thru_bios;
31
32#ifdef CONFIG_SMP
33static int reboot_cpu = -1;
34#endif
35static int __init reboot_setup(char *str)
36{
37 while(1) {
38 switch (*str) {
39 case 'w': /* "warm" reboot (no memory testing etc) */
40 reboot_mode = 0x1234;
41 break;
42 case 'c': /* "cold" reboot (with memory testing etc) */
43 reboot_mode = 0x0;
44 break;
45 case 'b': /* "bios" reboot by jumping through the BIOS */
46 reboot_thru_bios = 1;
47 break;
48 case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
49 reboot_thru_bios = 0;
50 break;
51#ifdef CONFIG_SMP
52 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
53 if (isdigit(*(str+1))) {
54 reboot_cpu = (int) (*(str+1) - '0');
55 if (isdigit(*(str+2)))
56 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
57 }
58 /* we will leave sorting out the final value
59 when we are ready to reboot, since we might not
60 have set up boot_cpu_id or smp_num_cpu */
61 break;
62#endif
63 }
64 if((str = strchr(str,',')) != NULL)
65 str++;
66 else
67 break;
68 }
69 return 1;
70}
71
72__setup("reboot=", reboot_setup);
73
74/*
75 * Reboot options and system auto-detection code provided by
76 * Dell Inc. so their systems "just work". :-)
77 */
78
79/*
80 * Some machines require the "reboot=b" command line option; this quirk makes that automatic.
81 */
82static int __init set_bios_reboot(struct dmi_system_id *d)
83{
84 if (!reboot_thru_bios) {
85 reboot_thru_bios = 1;
86 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
87 }
88 return 0;
89}
90
91static struct dmi_system_id __initdata reboot_dmi_table[] = {
92 { /* Handle problems with rebooting on Dell E520's */
93 .callback = set_bios_reboot,
94 .ident = "Dell E520",
95 .matches = {
96 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
97 DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
98 },
99 },
100 { /* Handle problems with rebooting on Dell 1300's */
101 .callback = set_bios_reboot,
102 .ident = "Dell PowerEdge 1300",
103 .matches = {
104 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
105 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
106 },
107 },
108 { /* Handle problems with rebooting on Dell 300's */
109 .callback = set_bios_reboot,
110 .ident = "Dell PowerEdge 300",
111 .matches = {
112 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
113 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
114 },
115 },
116 { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
117 .callback = set_bios_reboot,
118 .ident = "Dell OptiPlex 745",
119 .matches = {
120 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
121 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
122 DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
123 },
124 },
125 { /* Handle problems with rebooting on Dell 2400's */
126 .callback = set_bios_reboot,
127 .ident = "Dell PowerEdge 2400",
128 .matches = {
129 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
130 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
131 },
132 },
133 { /* Handle problems with rebooting on HP laptops */
134 .callback = set_bios_reboot,
135 .ident = "HP Compaq Laptop",
136 .matches = {
137 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
138 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
139 },
140 },
141 { }
142};
143
144static int __init reboot_init(void)
145{
146 dmi_check_system(reboot_dmi_table);
147 return 0;
148}
149
150core_initcall(reboot_init);
151
152/* The following code and data reboot the machine by switching to real
153   mode and jumping to the BIOS reset entry point, as if the CPU had
154   really been reset.  The previous version asked the keyboard
155   controller to pulse the CPU reset line, which is more thorough, but
156   doesn't work with at least one type of 486 motherboard.  It is easy
157   to break this code; hence the copious comments. */
158
159static unsigned long long
160real_mode_gdt_entries [3] =
161{
162 0x0000000000000000ULL, /* Null descriptor */
163 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
164 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
165};
166
167static struct Xgt_desc_struct
168real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
169real_mode_idt = { 0x3ff, 0 },
170no_idt = { 0, 0 };
171
172
173/* This is 16-bit protected mode code to disable paging and the cache,
174 switch to real mode and jump to the BIOS reset code.
175
176 The instruction that switches to real mode by writing to CR0 must be
177   followed immediately by a far jump instruction, which sets CS to a
178 valid value for real mode, and flushes the prefetch queue to avoid
179 running instructions that have already been decoded in protected
180 mode.
181
182 Clears all the flags except ET, especially PG (paging), PE
183 (protected-mode enable) and TS (task switch for coprocessor state
184 save). Flushes the TLB after paging has been disabled. Sets CD and
185 NW, to disable the cache on a 486, and invalidates the cache. This
186 is more like the state of a 486 after reset. I don't know if
187 something else should be done for other chips.
188
189 More could be done here to set up the registers as if a CPU reset had
190 occurred; hopefully real BIOSs don't assume much. */
191
192static unsigned char real_mode_switch [] =
193{
194 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
195 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
196 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
197 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
198 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
199 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
200 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
201 0x74, 0x02, /* jz f */
202 0x0f, 0x09, /* wbinvd */
203 0x24, 0x10, /* f: andb $0x10,al */
204 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
205};
206static unsigned char jump_to_bios [] =
207{
208 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
209};
210
211/*
212 * Switch to real mode and then execute the code
213 * specified by the code and length parameters.
214 * We assume that length will always be less than 100!
215 */
216void machine_real_restart(unsigned char *code, int length)
217{
218 local_irq_disable();
219
220 /* Write zero to CMOS register number 0x0f, which the BIOS POST
221 routine will recognize as telling it to do a proper reboot. (Well
222 that's what this book in front of me says -- it may only apply to
223 the Phoenix BIOS though, it's not clear). At the same time,
224 disable NMIs by setting the top bit in the CMOS address register,
225 as we're about to do peculiar things to the CPU. I'm not sure if
226 `outb_p' is needed instead of just `outb'. Use it to be on the
227 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
228 */
229
230 spin_lock(&rtc_lock);
231 CMOS_WRITE(0x00, 0x8f);
232 spin_unlock(&rtc_lock);
233
234 /* Remap the kernel at virtual address zero, as well as offset zero
235 from the kernel segment. This assumes the kernel segment starts at
236 virtual address PAGE_OFFSET. */
237
238 memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
239 sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
240
241 /*
242 * Use `swapper_pg_dir' as our page directory.
243 */
244 load_cr3(swapper_pg_dir);
245
246 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
247 this on booting to tell it to "Bypass memory test (also warm
248 boot)". This seems like a fairly standard thing that gets set by
249 REBOOT.COM programs, and the previous reset routine did this
250 too. */
251
252 *((unsigned short *)0x472) = reboot_mode;
253
254 /* For the switch to real mode, copy some code to low memory. It has
255 to be in the first 64k because it is running in 16-bit mode, and it
256 has to have the same physical and virtual address, because it turns
257 off paging. Copy it near the end of the first page, out of the way
258 of BIOS variables. */
259
260 memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
261 real_mode_switch, sizeof (real_mode_switch));
262 memcpy ((void *) (0x1000 - 100), code, length);
263
264 /* Set up the IDT for real mode. */
265
266 load_idt(&real_mode_idt);
267
268 /* Set up a GDT from which we can load segment descriptors for real
269 mode. The GDT is not used in real mode; it is just needed here to
270 prepare the descriptors. */
271
272 load_gdt(&real_mode_gdt);
273
274 /* Load the data segment registers, and thus the descriptors ready for
275 real mode. The base address of each segment is 0x100, 16 times the
276 selector value being loaded here. This is so that the segment
277 registers don't have to be reloaded after switching to real mode:
278 the values are consistent for real mode operation already. */
279
280 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
281 "\tmovl %%eax,%%ds\n"
282 "\tmovl %%eax,%%es\n"
283 "\tmovl %%eax,%%fs\n"
284 "\tmovl %%eax,%%gs\n"
285 "\tmovl %%eax,%%ss" : : : "eax");
286
287 /* Jump to the 16-bit code that we copied earlier. It disables paging
288 and the cache, switches to real mode, and jumps to the BIOS reset
289 entry point. */
290
291 __asm__ __volatile__ ("ljmp $0x0008,%0"
292 :
293 : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
294}
295#ifdef CONFIG_APM_MODULE
296EXPORT_SYMBOL(machine_real_restart);
297#endif
298
299static void native_machine_shutdown(void)
300{
301#ifdef CONFIG_SMP
302 int reboot_cpu_id;
303
304 /* The boot cpu is always logical cpu 0 */
305 reboot_cpu_id = 0;
306
307 /* See if there has been given a command line override */
308 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
309 cpu_isset(reboot_cpu, cpu_online_map)) {
310 reboot_cpu_id = reboot_cpu;
311 }
312
313 /* Make certain the cpu I'm rebooting on is online */
314 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
315 reboot_cpu_id = smp_processor_id();
316 }
317
318 /* Make certain I only run on the appropriate processor */
319 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
320
321 /* O.K. Now that I'm on the appropriate processor, stop
322 * all of the others, and disable their local APICs.
323 */
324
325 smp_send_stop();
326#endif /* CONFIG_SMP */
327
328 lapic_shutdown();
329
330#ifdef CONFIG_X86_IO_APIC
331 disable_IO_APIC();
332#endif
333}
334
335void __attribute__((weak)) mach_reboot_fixups(void)
336{
337}
338
339static void native_machine_emergency_restart(void)
340{
341 if (!reboot_thru_bios) {
342 if (efi_enabled) {
343 efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
344 load_idt(&no_idt);
345 __asm__ __volatile__("int3");
346 }
347 /* rebooting needs to touch the page at absolute addr 0 */
348 *((unsigned short *)__va(0x472)) = reboot_mode;
349 for (;;) {
350 mach_reboot_fixups(); /* for board specific fixups */
351 mach_reboot();
352 /* That didn't work - force a triple fault.. */
353 load_idt(&no_idt);
354 __asm__ __volatile__("int3");
355 }
356 }
357 if (efi_enabled)
358 efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
359
360 machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
361}
362
363static void native_machine_restart(char * __unused)
364{
365 machine_shutdown();
366 machine_emergency_restart();
367}
368
369static void native_machine_halt(void)
370{
371}
372
373static void native_machine_power_off(void)
374{
375 if (pm_power_off) {
376 machine_shutdown();
377 pm_power_off();
378 }
379}
380
381
382struct machine_ops machine_ops = {
383 .power_off = native_machine_power_off,
384 .shutdown = native_machine_shutdown,
385 .emergency_restart = native_machine_emergency_restart,
386 .restart = native_machine_restart,
387 .halt = native_machine_halt,
388};
389
390void machine_power_off(void)
391{
392 machine_ops.power_off();
393}
394
395void machine_shutdown(void)
396{
397 machine_ops.shutdown();
398}
399
400void machine_emergency_restart(void)
401{
402 machine_ops.emergency_restart();
403}
404
405void machine_restart(char *cmd)
406{
407 machine_ops.restart(cmd);
408}
409
410void machine_halt(void)
411{
412 machine_ops.halt();
413}
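Note: reboot_setup() above walks a comma-separated option string; the 's' option may be followed by up to two digits naming the CPU that should perform the reset, so "reboot=w,s1" selects a warm reboot carried out on CPU 1. A minimal standalone sketch of that sub-parse, assuming str points just past the 's'; parse_reboot_cpu is an illustrative helper, not kernel API.

#include <ctype.h>

/* returns the selected CPU number, or -1 if no digits follow */
static int parse_reboot_cpu(const char *str)
{
	int cpu = -1;

	if (isdigit(*str)) {
		cpu = *str - '0';
		if (isdigit(str[1]))
			cpu = cpu * 10 + (str[1] - '0');
	}
	return cpu;
}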
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
new file mode 100644
index 000000000000..03e1cce58f49
--- /dev/null
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -0,0 +1,68 @@
1/*
2 * linux/arch/i386/kernel/reboot_fixups.c
3 *
4 * This is a good place to put board specific reboot fixups.
5 *
6 * List of supported fixups:
7 * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz>
8 * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org>
9 *
10 */
11
12#include <asm/delay.h>
13#include <linux/pci.h>
14#include <asm/reboot_fixups.h>
15#include <asm/msr.h>
16
17static void cs5530a_warm_reset(struct pci_dev *dev)
18{
19	/* writing 1 to the reset control register (0x44) causes the
20 cs5530a to perform a system warm reset */
21 pci_write_config_byte(dev, 0x44, 0x1);
22 udelay(50); /* shouldn't get here but be safe and spin-a-while */
23 return;
24}
25
26static void cs5536_warm_reset(struct pci_dev *dev)
27{
28 /*
29 * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET)
30 * writing 1 to the LSB of this MSR causes a hard reset.
31 */
32 wrmsrl(0x51400017, 1ULL);
33 udelay(50); /* shouldn't get here but be safe and spin a while */
34}
35
36struct device_fixup {
37 unsigned int vendor;
38 unsigned int device;
39 void (*reboot_fixup)(struct pci_dev *);
40};
41
42static struct device_fixup fixups_table[] = {
43{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
44{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
45};
46
47/*
48 * We see if any fixup is available for our current hardware. If there
49 * is a fixup, we call it and expect never to return from it. If we
50 * do return, we keep looking and then eventually fall back to the
51 * standard mach_reboot on return.
52 */
53void mach_reboot_fixups(void)
54{
55 struct device_fixup *cur;
56 struct pci_dev *dev;
57 int i;
58
59 for (i=0; i < ARRAY_SIZE(fixups_table); i++) {
60 cur = &(fixups_table[i]);
61 dev = pci_get_device(cur->vendor, cur->device, NULL);
62 if (!dev)
63 continue;
64
65 cur->reboot_fixup(dev);
66 }
67}
68
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
new file mode 100644
index 000000000000..f151d6fae462
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -0,0 +1,252 @@
1/*
2 * relocate_kernel.S - put the kernel image in place to boot
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/linkage.h>
10#include <asm/page.h>
11#include <asm/kexec.h>
12
13/*
14 * Must be relocatable PIC code callable as a C function
15 */
16
17#define PTR(x) (x << 2)
18#define PAGE_ALIGNED (1 << PAGE_SHIFT)
19#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
20#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
21
22 .text
23 .align PAGE_ALIGNED
24 .globl relocate_kernel
25relocate_kernel:
26 movl 8(%esp), %ebp /* list of pages */
27
28#ifdef CONFIG_X86_PAE
29 /* map the control page at its virtual address */
30
31 movl PTR(VA_PGD)(%ebp), %edi
32 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
33 andl $0xc0000000, %eax
34 shrl $27, %eax
35 addl %edi, %eax
36
37 movl PTR(PA_PMD_0)(%ebp), %edx
38 orl $PAE_PGD_ATTR, %edx
39 movl %edx, (%eax)
40
41 movl PTR(VA_PMD_0)(%ebp), %edi
42 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
43 andl $0x3fe00000, %eax
44 shrl $18, %eax
45 addl %edi, %eax
46
47 movl PTR(PA_PTE_0)(%ebp), %edx
48 orl $PAGE_ATTR, %edx
49 movl %edx, (%eax)
50
51 movl PTR(VA_PTE_0)(%ebp), %edi
52 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
53 andl $0x001ff000, %eax
54 shrl $9, %eax
55 addl %edi, %eax
56
57 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
58 orl $PAGE_ATTR, %edx
59 movl %edx, (%eax)
60
61 /* identity map the control page at its physical address */
62
63 movl PTR(VA_PGD)(%ebp), %edi
64 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
65 andl $0xc0000000, %eax
66 shrl $27, %eax
67 addl %edi, %eax
68
69 movl PTR(PA_PMD_1)(%ebp), %edx
70 orl $PAE_PGD_ATTR, %edx
71 movl %edx, (%eax)
72
73 movl PTR(VA_PMD_1)(%ebp), %edi
74 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
75 andl $0x3fe00000, %eax
76 shrl $18, %eax
77 addl %edi, %eax
78
79 movl PTR(PA_PTE_1)(%ebp), %edx
80 orl $PAGE_ATTR, %edx
81 movl %edx, (%eax)
82
83 movl PTR(VA_PTE_1)(%ebp), %edi
84 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
85 andl $0x001ff000, %eax
86 shrl $9, %eax
87 addl %edi, %eax
88
89 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
90 orl $PAGE_ATTR, %edx
91 movl %edx, (%eax)
92#else
93 /* map the control page at its virtual address */
94
95 movl PTR(VA_PGD)(%ebp), %edi
96 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
97 andl $0xffc00000, %eax
98 shrl $20, %eax
99 addl %edi, %eax
100
101 movl PTR(PA_PTE_0)(%ebp), %edx
102 orl $PAGE_ATTR, %edx
103 movl %edx, (%eax)
104
105 movl PTR(VA_PTE_0)(%ebp), %edi
106 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
107 andl $0x003ff000, %eax
108 shrl $10, %eax
109 addl %edi, %eax
110
111 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
112 orl $PAGE_ATTR, %edx
113 movl %edx, (%eax)
114
115 /* identity map the control page at its physical address */
116
117 movl PTR(VA_PGD)(%ebp), %edi
118 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
119 andl $0xffc00000, %eax
120 shrl $20, %eax
121 addl %edi, %eax
122
123 movl PTR(PA_PTE_1)(%ebp), %edx
124 orl $PAGE_ATTR, %edx
125 movl %edx, (%eax)
126
127 movl PTR(VA_PTE_1)(%ebp), %edi
128 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
129 andl $0x003ff000, %eax
130 shrl $10, %eax
131 addl %edi, %eax
132
133 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
134 orl $PAGE_ATTR, %edx
135 movl %edx, (%eax)
136#endif
137
138relocate_new_kernel:
139 /* read the arguments and say goodbye to the stack */
140 movl 4(%esp), %ebx /* page_list */
141 movl 8(%esp), %ebp /* list of pages */
142 movl 12(%esp), %edx /* start address */
143 movl 16(%esp), %ecx /* cpu_has_pae */
144
145 /* zero out flags, and disable interrupts */
146 pushl $0
147 popfl
148
149 /* get physical address of control page now */
150 /* this is impossible after page table switch */
151 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
152
153 /* switch to new set of page tables */
154 movl PTR(PA_PGD)(%ebp), %eax
155 movl %eax, %cr3
156
157 /* setup a new stack at the end of the physical control page */
158 lea 4096(%edi), %esp
159
160 /* jump to identity mapped page */
161 movl %edi, %eax
162 addl $(identity_mapped - relocate_kernel), %eax
163 pushl %eax
164 ret
165
166identity_mapped:
167 /* store the start address on the stack */
168 pushl %edx
169
170 /* Set cr0 to a known state:
171 * 31 0 == Paging disabled
172 * 18 0 == Alignment check disabled
173 * 16 0 == Write protect disabled
174 * 3 0 == No task switch
175 * 2 0 == Don't do FP software emulation.
176	 *  0  1 == Protected mode enabled
177 */
178 movl %cr0, %eax
179 andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
180 orl $(1<<0), %eax
181 movl %eax, %cr0
182
183 /* clear cr4 if applicable */
184 testl %ecx, %ecx
185 jz 1f
186 /* Set cr4 to a known state:
187 * Setting everything to zero seems safe.
188 */
189 movl %cr4, %eax
190 andl $0, %eax
191 movl %eax, %cr4
192
193 jmp 1f
1941:
195
196 /* Flush the TLB (needed?) */
197 xorl %eax, %eax
198 movl %eax, %cr3
199
200 /* Do the copies */
201 movl %ebx, %ecx
202 jmp 1f
203
2040: /* top, read another word from the indirection page */
205 movl (%ebx), %ecx
206 addl $4, %ebx
2071:
208 testl $0x1, %ecx /* is it a destination page */
209 jz 2f
210 movl %ecx, %edi
211 andl $0xfffff000, %edi
212 jmp 0b
2132:
214 testl $0x2, %ecx /* is it an indirection page */
215 jz 2f
216 movl %ecx, %ebx
217 andl $0xfffff000, %ebx
218 jmp 0b
2192:
220 testl $0x4, %ecx /* is it the done indicator */
221 jz 2f
222 jmp 3f
2232:
224 testl $0x8, %ecx /* is it the source indicator */
225 jz 0b /* Ignore it otherwise */
226 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi
228
229 movl $1024, %ecx
230 rep ; movsl
231 jmp 0b
232
2333:
234
235 /* To be certain of avoiding problems with self-modifying code
236 * I need to execute a serializing instruction here.
237	 * So I flush the TLB; it's handy and not processor dependent.
238 */
239 xorl %eax, %eax
240 movl %eax, %cr3
241
242 /* set all of the registers to known values */
243 /* leave %esp alone */
244
245 xorl %eax, %eax
246 xorl %ebx, %ebx
247 xorl %ecx, %ecx
248 xorl %edx, %edx
249 xorl %esi, %esi
250 xorl %edi, %edi
251 xorl %ebp, %ebp
252 ret
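Note: the copy loop in relocate_new_kernel above walks kexec's indirection list, where each word is a page address with flag bits in its low bits (0x1 destination, 0x2 indirection, 0x4 done, 0x8 source) and each source page is copied to the current destination page, which then advances. A C sketch of the same walk under those assumptions, starting from the first indirection page; walk_kexec_pages and copy_4k are illustrative names, not kernel API.

static void walk_kexec_pages(unsigned long *ind,
			     void (*copy_4k)(void *dst, const void *src))
{
	char *dst = 0;		/* current destination page */
	unsigned long entry;

	for (;;) {
		entry = *ind++;
		if (entry & 0x1) {			/* destination page */
			dst = (char *)(entry & ~0xfffUL);
		} else if (entry & 0x2) {		/* chain to next indirection page */
			ind = (unsigned long *)(entry & ~0xfffUL);
		} else if (entry & 0x4) {		/* done */
			break;
		} else if (entry & 0x8) {		/* source page: copy 4 KiB */
			copy_4k(dst, (const void *)(entry & ~0xfffUL));
			dst += 4096;
		}
	}
}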
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c
new file mode 100644
index 000000000000..c7d3df23f589
--- /dev/null
+++ b/arch/x86/kernel/scx200_32.c
@@ -0,0 +1,131 @@
1/* linux/arch/i386/kernel/scx200.c
2
3 Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
4
5 National Semiconductor SCx200 support. */
6
7#include <linux/module.h>
8#include <linux/errno.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/mutex.h>
12#include <linux/pci.h>
13
14#include <linux/scx200.h>
15#include <linux/scx200_gpio.h>
16
17/* Verify that the configuration block really is there */
18#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
19
20#define NAME "scx200"
21
22MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
23MODULE_DESCRIPTION("NatSemi SCx200 Driver");
24MODULE_LICENSE("GPL");
25
26unsigned scx200_gpio_base = 0;
27long scx200_gpio_shadow[2];
28
29unsigned scx200_cb_base = 0;
30
31static struct pci_device_id scx200_tbl[] = {
32 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
33 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
34 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) },
35 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) },
36 { },
37};
38MODULE_DEVICE_TABLE(pci,scx200_tbl);
39
40static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
41
42static struct pci_driver scx200_pci_driver = {
43 .name = "scx200",
44 .id_table = scx200_tbl,
45 .probe = scx200_probe,
46};
47
48static DEFINE_MUTEX(scx200_gpio_config_lock);
49
50static void __devinit scx200_init_shadow(void)
51{
52 int bank;
53
54 /* read the current values driven on the GPIO signals */
55 for (bank = 0; bank < 2; ++bank)
56 scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
57}
58
59static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
60{
61 unsigned base;
62
63 if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
64 pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
65 base = pci_resource_start(pdev, 0);
66 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
67
68 if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) {
69 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
70 return -EBUSY;
71 }
72
73 scx200_gpio_base = base;
74 scx200_init_shadow();
75
76 } else {
77 /* find the base of the Configuration Block */
78 if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
79 scx200_cb_base = SCx200_CB_BASE_FIXED;
80 } else {
81 pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
82 if (scx200_cb_probe(base)) {
83 scx200_cb_base = base;
84 } else {
85 printk(KERN_WARNING NAME ": Configuration Block not found\n");
86 return -ENODEV;
87 }
88 }
89 printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
90 }
91
92 return 0;
93}
94
95u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
96{
97 u32 config, new_config;
98
99 mutex_lock(&scx200_gpio_config_lock);
100
101 outl(index, scx200_gpio_base + 0x20);
102 config = inl(scx200_gpio_base + 0x24);
103
104 new_config = (config & mask) | bits;
105 outl(new_config, scx200_gpio_base + 0x24);
106
107 mutex_unlock(&scx200_gpio_config_lock);
108
109 return config;
110}
111
112static int __init scx200_init(void)
113{
114 printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
115
116 return pci_register_driver(&scx200_pci_driver);
117}
118
119static void __exit scx200_cleanup(void)
120{
121 pci_unregister_driver(&scx200_pci_driver);
122 release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
123}
124
125module_init(scx200_init);
126module_exit(scx200_cleanup);
127
128EXPORT_SYMBOL(scx200_gpio_base);
129EXPORT_SYMBOL(scx200_gpio_shadow);
130EXPORT_SYMBOL(scx200_gpio_configure);
131EXPORT_SYMBOL(scx200_cb_base);
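
A note on the update rule in scx200_gpio_configure() above: the mask argument selects which bits of the previous configuration survive, bits is then ORed in, and the old word is returned to the caller. A standalone illustration of that convention follows (hypothetical helper, not part of this patch):

    /* Mirrors the "(old & mask) | bits" rule used by scx200_gpio_configure()
     * above. Illustrative only. */
    #include <stdint.h>

    static uint32_t apply_gpio_config(uint32_t old, uint32_t mask, uint32_t bits)
    {
        return (old & mask) | bits;
    }

    /* Setting a bit keeps everything (mask of all ones) and ORs the bit in;
     * clearing a bit drops it via the mask and ORs in nothing. */
    static uint32_t example(void)
    {
        uint32_t cfg = 0x00000100;

        cfg = apply_gpio_config(cfg, 0xffffffffu, 1u << 5);  /* -> 0x00000120 */
        cfg = apply_gpio_config(cfg, ~(1u << 5), 0);         /* -> 0x00000100 */
        return cfg;
    }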
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
new file mode 100644
index 000000000000..d474cd639bcb
--- /dev/null
+++ b/arch/x86/kernel/setup_32.c
@@ -0,0 +1,653 @@
1/*
2 * linux/arch/i386/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7 *
8 * Memory region support
9 * David Parsons <orc@pell.chi.il.us>, July-August 1999
10 *
11 * Added E820 sanitization routine (removes overlapping memory regions);
12 * Brian Moyle <bmoyle@mvista.com>, February 2001
13 *
14 * Moved CPU detection code to cpu/${cpu}.c
15 * Patrick Mochel <mochel@osdl.org>, March 2002
16 *
17 * Provisions for empty E820 memory regions (reported by certain BIOSes).
18 * Alex Achenbach <xela@slit.de>, December 2002.
19 *
20 */
21
22/*
23 * This file handles the architecture-dependent parts of initialization
24 */
25
26#include <linux/sched.h>
27#include <linux/mm.h>
28#include <linux/mmzone.h>
29#include <linux/screen_info.h>
30#include <linux/ioport.h>
31#include <linux/acpi.h>
32#include <linux/apm_bios.h>
33#include <linux/initrd.h>
34#include <linux/bootmem.h>
35#include <linux/seq_file.h>
36#include <linux/console.h>
37#include <linux/mca.h>
38#include <linux/root_dev.h>
39#include <linux/highmem.h>
40#include <linux/module.h>
41#include <linux/efi.h>
42#include <linux/init.h>
43#include <linux/edd.h>
44#include <linux/nodemask.h>
45#include <linux/kexec.h>
46#include <linux/crash_dump.h>
47#include <linux/dmi.h>
48#include <linux/pfn.h>
49
50#include <video/edid.h>
51
52#include <asm/apic.h>
53#include <asm/e820.h>
54#include <asm/mpspec.h>
55#include <asm/mmzone.h>
56#include <asm/setup.h>
57#include <asm/arch_hooks.h>
58#include <asm/sections.h>
59#include <asm/io_apic.h>
60#include <asm/ist.h>
61#include <asm/io.h>
62#include <asm/vmi.h>
63#include <setup_arch.h>
64#include <bios_ebda.h>
65
66/* This value is set up by the early boot code to point to the value
67 immediately after the boot time page tables. It contains a *physical*
68 address, and must not be in the .bss segment! */
69unsigned long init_pg_tables_end __initdata = ~0UL;
70
71int disable_pse __devinitdata = 0;
72
73/*
74 * Machine setup..
75 */
76extern struct resource code_resource;
77extern struct resource data_resource;
78
79/* cpu data as detected by the assembly code in head.S */
80struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
81/* common cpu data for all cpus */
82struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
83EXPORT_SYMBOL(boot_cpu_data);
84
85unsigned long mmu_cr4_features;
86
87/* for MCA, but anyone else can use it if they want */
88unsigned int machine_id;
89#ifdef CONFIG_MCA
90EXPORT_SYMBOL(machine_id);
91#endif
92unsigned int machine_submodel_id;
93unsigned int BIOS_revision;
94unsigned int mca_pentium_flag;
95
96/* Boot loader ID as an integer, for the benefit of proc_dointvec */
97int bootloader_type;
98
99/* user-defined highmem size */
100static unsigned int highmem_pages = -1;
101
102/*
103 * Setup options
104 */
105struct screen_info screen_info;
106EXPORT_SYMBOL(screen_info);
107struct apm_info apm_info;
108EXPORT_SYMBOL(apm_info);
109struct edid_info edid_info;
110EXPORT_SYMBOL_GPL(edid_info);
111struct ist_info ist_info;
112#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
113 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
114EXPORT_SYMBOL(ist_info);
115#endif
116
117extern void early_cpu_init(void);
118extern int root_mountflags;
119
120unsigned long saved_videomode;
121
122#define RAMDISK_IMAGE_START_MASK 0x07FF
123#define RAMDISK_PROMPT_FLAG 0x8000
124#define RAMDISK_LOAD_FLAG 0x4000
125
126static char __initdata command_line[COMMAND_LINE_SIZE];
127
128struct boot_params __initdata boot_params;
129
130#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
131struct edd edd;
132#ifdef CONFIG_EDD_MODULE
133EXPORT_SYMBOL(edd);
134#endif
135/**
136 * copy_edd() - Copy the BIOS EDD information
137 * from boot_params into a safe place.
138 *
139 */
140static inline void copy_edd(void)
141{
142 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
143 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
144 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
145 edd.edd_info_nr = EDD_NR;
146}
147#else
148static inline void copy_edd(void)
149{
150}
151#endif
152
153int __initdata user_defined_memmap = 0;
154
155/*
156 * "mem=nopentium" disables the 4MB page tables.
157 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
158 * to <mem>, overriding the bios size.
159 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
160 * <start> to <start>+<mem>, overriding the bios size.
161 *
162 * HPA tells me bootloaders need to parse mem=, so no new
163 * option should be mem= [also see Documentation/i386/boot.txt]
164 */
165static int __init parse_mem(char *arg)
166{
167 if (!arg)
168 return -EINVAL;
169
170 if (strcmp(arg, "nopentium") == 0) {
171 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
172 disable_pse = 1;
173 } else {
174 /* If the user specifies memory size, we
175 * limit the BIOS-provided memory map to
176 * that size. exactmap can be used to specify
177 * the exact map. mem=number can be used to
178 * trim the existing memory map.
179 */
180 unsigned long long mem_size;
181
182 mem_size = memparse(arg, &arg);
183 limit_regions(mem_size);
184 user_defined_memmap = 1;
185 }
186 return 0;
187}
188early_param("mem", parse_mem);
189
190#ifdef CONFIG_PROC_VMCORE
191/* elfcorehdr= specifies the location of elf core header
192 * stored by the crashed kernel.
193 */
194static int __init parse_elfcorehdr(char *arg)
195{
196 if (!arg)
197 return -EINVAL;
198
199 elfcorehdr_addr = memparse(arg, &arg);
200 return 0;
201}
202early_param("elfcorehdr", parse_elfcorehdr);
203#endif /* CONFIG_PROC_VMCORE */
204
205/*
206 * highmem=size forces highmem to be exactly 'size' bytes.
207 * This works even on boxes that have no highmem otherwise.
208 * This also works to reduce highmem size on bigger boxes.
209 */
210static int __init parse_highmem(char *arg)
211{
212 if (!arg)
213 return -EINVAL;
214
215 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
216 return 0;
217}
218early_param("highmem", parse_highmem);
219
220/*
221 * vmalloc=size forces the vmalloc area to be exactly 'size'
222 * bytes. This can be used to increase (or decrease) the
223 * vmalloc area - the default is 128m.
224 */
225static int __init parse_vmalloc(char *arg)
226{
227 if (!arg)
228 return -EINVAL;
229
230 __VMALLOC_RESERVE = memparse(arg, &arg);
231 return 0;
232}
233early_param("vmalloc", parse_vmalloc);
234
235/*
236 * reservetop=size reserves a hole at the top of the kernel address space which
237 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
238 * so relocating the fixmap can be done before paging initialization.
239 */
240static int __init parse_reservetop(char *arg)
241{
242 unsigned long address;
243
244 if (!arg)
245 return -EINVAL;
246
247 address = memparse(arg, &arg);
248 reserve_top_address(address);
249 return 0;
250}
251early_param("reservetop", parse_reservetop);
252
253/*
254 * Determine low and high memory ranges:
255 */
256unsigned long __init find_max_low_pfn(void)
257{
258 unsigned long max_low_pfn;
259
260 max_low_pfn = max_pfn;
261 if (max_low_pfn > MAXMEM_PFN) {
262 if (highmem_pages == -1)
263 highmem_pages = max_pfn - MAXMEM_PFN;
264 if (highmem_pages + MAXMEM_PFN < max_pfn)
265 max_pfn = MAXMEM_PFN + highmem_pages;
266 if (highmem_pages + MAXMEM_PFN > max_pfn) {
267 printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
268 highmem_pages = 0;
269 }
270 max_low_pfn = MAXMEM_PFN;
271#ifndef CONFIG_HIGHMEM
272 /* Maximum memory usable is what is directly addressable */
273 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
274 MAXMEM>>20);
275 if (max_pfn > MAX_NONPAE_PFN)
276 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
277 else
278 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
279 max_pfn = MAXMEM_PFN;
280#else /* !CONFIG_HIGHMEM */
281#ifndef CONFIG_HIGHMEM64G
282 if (max_pfn > MAX_NONPAE_PFN) {
283 max_pfn = MAX_NONPAE_PFN;
284 printk(KERN_WARNING "Warning only 4GB will be used.\n");
285 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
286 }
287#endif /* !CONFIG_HIGHMEM64G */
288#endif /* !CONFIG_HIGHMEM */
289 } else {
290 if (highmem_pages == -1)
291 highmem_pages = 0;
292#ifdef CONFIG_HIGHMEM
293 if (highmem_pages >= max_pfn) {
294 printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
295 highmem_pages = 0;
296 }
297 if (highmem_pages) {
298 if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
299 printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
300 highmem_pages = 0;
301 }
302 max_low_pfn -= highmem_pages;
303 }
304#else
305 if (highmem_pages)
306 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
307#endif
308 }
309 return max_low_pfn;
310}
311
312/*
313 * workaround for Dell systems that neglect to reserve EBDA
314 */
315static void __init reserve_ebda_region(void)
316{
317 unsigned int addr;
318 addr = get_bios_ebda();
319 if (addr)
320 reserve_bootmem(addr, PAGE_SIZE);
321}
322
323#ifndef CONFIG_NEED_MULTIPLE_NODES
324void __init setup_bootmem_allocator(void);
325static unsigned long __init setup_memory(void)
326{
327 /*
328 * partially used pages are not usable - thus
329 * we are rounding upwards:
330 */
331 min_low_pfn = PFN_UP(init_pg_tables_end);
332
333 find_max_pfn();
334
335 max_low_pfn = find_max_low_pfn();
336
337#ifdef CONFIG_HIGHMEM
338 highstart_pfn = highend_pfn = max_pfn;
339 if (max_pfn > max_low_pfn) {
340 highstart_pfn = max_low_pfn;
341 }
342 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
343 pages_to_mb(highend_pfn - highstart_pfn));
344 num_physpages = highend_pfn;
345 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
346#else
347 num_physpages = max_low_pfn;
348 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
349#endif
350#ifdef CONFIG_FLATMEM
351 max_mapnr = num_physpages;
352#endif
353 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
354 pages_to_mb(max_low_pfn));
355
356 setup_bootmem_allocator();
357
358 return max_low_pfn;
359}
360
361void __init zone_sizes_init(void)
362{
363 unsigned long max_zone_pfns[MAX_NR_ZONES];
364 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
365 max_zone_pfns[ZONE_DMA] =
366 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
367 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
368#ifdef CONFIG_HIGHMEM
369 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
370 add_active_range(0, 0, highend_pfn);
371#else
372 add_active_range(0, 0, max_low_pfn);
373#endif
374
375 free_area_init_nodes(max_zone_pfns);
376}
377#else
378extern unsigned long __init setup_memory(void);
379extern void zone_sizes_init(void);
380#endif /* !CONFIG_NEED_MULTIPLE_NODES */
381
382void __init setup_bootmem_allocator(void)
383{
384 unsigned long bootmap_size;
385 /*
386 * Initialize the boot-time allocator (with low memory only):
387 */
388 bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
389
390 register_bootmem_low_pages(max_low_pfn);
391
392 /*
393 * Reserve the bootmem bitmap itself as well. We do this in two
394 * steps (first step was init_bootmem()) because this catches
395 * the (very unlikely) case of us accidentally initializing the
396 * bootmem allocator with an invalid RAM area.
397 */
398 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
399 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
400
401 /*
402 * reserve physical page 0 - it's a special BIOS page on many boxes,
403 * enabling clean reboots, SMP operation, laptop functions.
404 */
405 reserve_bootmem(0, PAGE_SIZE);
406
407 /* reserve EBDA region, it's a 4K region */
408 reserve_ebda_region();
409
410 /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
411	   PCI prefetch into it (errata #56). Usually the page is reserved anyway,
412 unless you have no PS/2 mouse plugged in. */
413 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
414 boot_cpu_data.x86 == 6)
415 reserve_bootmem(0xa0000 - 4096, 4096);
416
417#ifdef CONFIG_SMP
418 /*
419 * But first pinch a few for the stack/trampoline stuff
420 * FIXME: Don't need the extra page at 4K, but need to fix
421 * trampoline before removing it. (see the GDT stuff)
422 */
423 reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
424#endif
425#ifdef CONFIG_ACPI_SLEEP
426 /*
427 * Reserve low memory region for sleep support.
428 */
429 acpi_reserve_bootmem();
430#endif
431#ifdef CONFIG_X86_FIND_SMP_CONFIG
432 /*
433 * Find and reserve possible boot-time SMP configuration:
434 */
435 find_smp_config();
436#endif
437 numa_kva_reserve();
438#ifdef CONFIG_BLK_DEV_INITRD
439 if (LOADER_TYPE && INITRD_START) {
440 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
441 reserve_bootmem(INITRD_START, INITRD_SIZE);
442 initrd_start = INITRD_START + PAGE_OFFSET;
443 initrd_end = initrd_start+INITRD_SIZE;
444 }
445 else {
446 printk(KERN_ERR "initrd extends beyond end of memory "
447 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
448 INITRD_START + INITRD_SIZE,
449 max_low_pfn << PAGE_SHIFT);
450 initrd_start = 0;
451 }
452 }
453#endif
454#ifdef CONFIG_KEXEC
455 if (crashk_res.start != crashk_res.end)
456 reserve_bootmem(crashk_res.start,
457 crashk_res.end - crashk_res.start + 1);
458#endif
459}
460
461/*
462 * The node 0 pgdat is initialized before all of these because
463 * it's needed for bootmem. node>0 pgdats have their virtual
464 * space allocated before the pagetables are in place to access
465 * them, so they can't be cleared then.
466 *
467 * This should all compile down to nothing when NUMA is off.
468 */
469static void __init remapped_pgdat_init(void)
470{
471 int nid;
472
473 for_each_online_node(nid) {
474 if (nid != 0)
475 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
476 }
477}
478
479#ifdef CONFIG_MCA
480static void set_mca_bus(int x)
481{
482 MCA_bus = x;
483}
484#else
485static void set_mca_bus(int x) { }
486#endif
487
488/* Overridden in paravirt.c if CONFIG_PARAVIRT */
489char * __init __attribute__((weak)) memory_setup(void)
490{
491 return machine_specific_memory_setup();
492}
493
494/*
495 * Determine if we were loaded by an EFI loader. If so, then we have also been
496 * passed the efi memmap, systab, etc., so we should use these data structures
497 * for initialization. Note, the efi init code path is determined by the
498 * global efi_enabled. This allows the same kernel image to be used on existing
499 * systems (with a traditional BIOS) as well as on EFI systems.
500 */
501void __init setup_arch(char **cmdline_p)
502{
503 unsigned long max_low_pfn;
504
505 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
506 pre_setup_arch_hook();
507 early_cpu_init();
508
509 /*
510 * FIXME: This isn't an official loader_type right
511 * now but does currently work with elilo.
512 * If we were configured as an EFI kernel, check to make
513 * sure that we were loaded correctly from elilo and that
514 * the system table is valid. If not, then initialize normally.
515 */
516#ifdef CONFIG_EFI
517 if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
518 efi_enabled = 1;
519#endif
520
521 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
522 screen_info = SCREEN_INFO;
523 edid_info = EDID_INFO;
524 apm_info.bios = APM_BIOS_INFO;
525 ist_info = IST_INFO;
526 saved_videomode = VIDEO_MODE;
527 if( SYS_DESC_TABLE.length != 0 ) {
528 set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
529 machine_id = SYS_DESC_TABLE.table[0];
530 machine_submodel_id = SYS_DESC_TABLE.table[1];
531 BIOS_revision = SYS_DESC_TABLE.table[2];
532 }
533 bootloader_type = LOADER_TYPE;
534
535#ifdef CONFIG_BLK_DEV_RAM
536 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
537 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
538 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
539#endif
540 ARCH_SETUP
541 if (efi_enabled)
542 efi_init();
543 else {
544 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
545 print_memory_map(memory_setup());
546 }
547
548 copy_edd();
549
550 if (!MOUNT_ROOT_RDONLY)
551 root_mountflags &= ~MS_RDONLY;
552 init_mm.start_code = (unsigned long) _text;
553 init_mm.end_code = (unsigned long) _etext;
554 init_mm.end_data = (unsigned long) _edata;
555 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
556
557 code_resource.start = virt_to_phys(_text);
558 code_resource.end = virt_to_phys(_etext)-1;
559 data_resource.start = virt_to_phys(_etext);
560 data_resource.end = virt_to_phys(_edata)-1;
561
562 parse_early_param();
563
564 if (user_defined_memmap) {
565 printk(KERN_INFO "user-defined physical RAM map:\n");
566 print_memory_map("user");
567 }
568
569 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
570 *cmdline_p = command_line;
571
572 max_low_pfn = setup_memory();
573
574#ifdef CONFIG_VMI
575 /*
576 * Must be after max_low_pfn is determined, and before kernel
577 * pagetables are setup.
578 */
579 vmi_init();
580#endif
581
582 /*
583 * NOTE: before this point _nobody_ is allowed to allocate
584 * any memory using the bootmem allocator. Although the
585	 * allocator is now initialised, only the first 8Mb of the kernel
586 * virtual address space has been mapped. All allocations before
587 * paging_init() has completed must use the alloc_bootmem_low_pages()
588 * variant (which allocates DMA'able memory) and care must be taken
589 * not to exceed the 8Mb limit.
590 */
591
592#ifdef CONFIG_SMP
593 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
594#endif
595 paging_init();
596 remapped_pgdat_init();
597 sparse_init();
598 zone_sizes_init();
599
600 /*
601 * NOTE: at this point the bootmem allocator is fully available.
602 */
603
604 paravirt_post_allocator_init();
605
606 dmi_scan_machine();
607
608#ifdef CONFIG_X86_GENERICARCH
609 generic_apic_probe();
610#endif
611 if (efi_enabled)
612 efi_map_memmap();
613
614#ifdef CONFIG_ACPI
615 /*
616 * Parse the ACPI tables for possible boot-time SMP configuration.
617 */
618 acpi_boot_table_init();
619#endif
620
621#ifdef CONFIG_PCI
622#ifdef CONFIG_X86_IO_APIC
623 check_acpi_pci(); /* Checks more than just ACPI actually */
624#endif
625#endif
626
627#ifdef CONFIG_ACPI
628 acpi_boot_init();
629
630#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
631 if (def_to_bigsmp)
632 printk(KERN_WARNING "More than 8 CPUs detected and "
633 "CONFIG_X86_PC cannot handle it.\nUse "
634 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
635#endif
636#endif
637#ifdef CONFIG_X86_LOCAL_APIC
638 if (smp_found_config)
639 get_smp_config();
640#endif
641
642 e820_register_memory();
643 e820_mark_nosave_regions();
644
645#ifdef CONFIG_VT
646#if defined(CONFIG_VGA_CONSOLE)
647 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
648 conswitchp = &vga_con;
649#elif defined(CONFIG_DUMMY_CONSOLE)
650 conswitchp = &dummy_con;
651#endif
652#endif
653}
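
parse_mem(), parse_highmem(), parse_vmalloc() and parse_reservetop() above all lean on memparse() to turn strings such as "512M" or "0x1000000" into byte counts; parse_highmem() then shifts the result right by PAGE_SHIFT to get a page count. A simplified, standalone sketch of that suffix handling (the real memparse() also hands the end pointer back through its second argument):

    /* Simplified stand-in for the kernel's memparse(): a number with an
     * optional K/M/G suffix becomes a byte count. Illustrative only. */
    #include <stdlib.h>

    static unsigned long long parse_mem_size(const char *s)
    {
        char *end;
        unsigned long long val = strtoull(s, &end, 0);  /* base 0 also accepts 0x... */

        switch (*end) {
        case 'G': case 'g': val <<= 10; /* fall through */
        case 'M': case 'm': val <<= 10; /* fall through */
        case 'K': case 'k': val <<= 10; break;
        default: break;
        }
        return val;
    }

    /* e.g. parse_mem_size("512M") == 512ULL << 20, which is what "mem=512M"
     * feeds into limit_regions() in parse_mem() above. */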
diff --git a/arch/x86/kernel/sigframe_32.h b/arch/x86/kernel/sigframe_32.h
new file mode 100644
index 000000000000..0b2221711dad
--- /dev/null
+++ b/arch/x86/kernel/sigframe_32.h
@@ -0,0 +1,21 @@
1struct sigframe
2{
3 char __user *pretcode;
4 int sig;
5 struct sigcontext sc;
6 struct _fpstate fpstate;
7 unsigned long extramask[_NSIG_WORDS-1];
8 char retcode[8];
9};
10
11struct rt_sigframe
12{
13 char __user *pretcode;
14 int sig;
15 struct siginfo __user *pinfo;
16 void __user *puc;
17 struct siginfo info;
18 struct ucontext uc;
19 struct _fpstate fpstate;
20 char retcode[8];
21};
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
new file mode 100644
index 000000000000..c03570f7fe8e
--- /dev/null
+++ b/arch/x86/kernel/signal_32.c
@@ -0,0 +1,667 @@
1/*
2 * linux/arch/i386/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
7 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
8 */
9
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/kernel.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/wait.h>
17#include <linux/unistd.h>
18#include <linux/stddef.h>
19#include <linux/personality.h>
20#include <linux/suspend.h>
21#include <linux/ptrace.h>
22#include <linux/elf.h>
23#include <linux/binfmts.h>
24#include <asm/processor.h>
25#include <asm/ucontext.h>
26#include <asm/uaccess.h>
27#include <asm/i387.h>
28#include "sigframe_32.h"
29
30#define DEBUG_SIG 0
31
32#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
33
34/*
35 * Atomically swap in the new signal mask, and wait for a signal.
36 */
37asmlinkage int
38sys_sigsuspend(int history0, int history1, old_sigset_t mask)
39{
40 mask &= _BLOCKABLE;
41 spin_lock_irq(&current->sighand->siglock);
42 current->saved_sigmask = current->blocked;
43 siginitset(&current->blocked, mask);
44 recalc_sigpending();
45 spin_unlock_irq(&current->sighand->siglock);
46
47 current->state = TASK_INTERRUPTIBLE;
48 schedule();
49 set_thread_flag(TIF_RESTORE_SIGMASK);
50 return -ERESTARTNOHAND;
51}
52
53asmlinkage int
54sys_sigaction(int sig, const struct old_sigaction __user *act,
55 struct old_sigaction __user *oact)
56{
57 struct k_sigaction new_ka, old_ka;
58 int ret;
59
60 if (act) {
61 old_sigset_t mask;
62 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
63 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
64 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
65 return -EFAULT;
66 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
67 __get_user(mask, &act->sa_mask);
68 siginitset(&new_ka.sa.sa_mask, mask);
69 }
70
71 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
72
73 if (!ret && oact) {
74 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
75 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
76 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
77 return -EFAULT;
78 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
79 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
80 }
81
82 return ret;
83}
84
85asmlinkage int
86sys_sigaltstack(unsigned long ebx)
87{
88 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
89 struct pt_regs *regs = (struct pt_regs *)&ebx;
90 const stack_t __user *uss = (const stack_t __user *)ebx;
91 stack_t __user *uoss = (stack_t __user *)regs->ecx;
92
93 return do_sigaltstack(uss, uoss, regs->esp);
94}
95
96
97/*
98 * Do a signal return; undo the signal stack.
99 */
100
101static int
102restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
103{
104 unsigned int err = 0;
105
106 /* Always make any pending restarted system calls return -EINTR */
107 current_thread_info()->restart_block.fn = do_no_restart_syscall;
108
109#define COPY(x) err |= __get_user(regs->x, &sc->x)
110
111#define COPY_SEG(seg) \
112 { unsigned short tmp; \
113 err |= __get_user(tmp, &sc->seg); \
114 regs->x##seg = tmp; }
115
116#define COPY_SEG_STRICT(seg) \
117 { unsigned short tmp; \
118 err |= __get_user(tmp, &sc->seg); \
119 regs->x##seg = tmp|3; }
120
121#define GET_SEG(seg) \
122 { unsigned short tmp; \
123 err |= __get_user(tmp, &sc->seg); \
124 loadsegment(seg,tmp); }
125
126#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_RF | \
127 X86_EFLAGS_OF | X86_EFLAGS_DF | \
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130
131 GET_SEG(gs);
132 COPY_SEG(fs);
133 COPY_SEG(es);
134 COPY_SEG(ds);
135 COPY(edi);
136 COPY(esi);
137 COPY(ebp);
138 COPY(esp);
139 COPY(ebx);
140 COPY(edx);
141 COPY(ecx);
142 COPY(eip);
143 COPY_SEG_STRICT(cs);
144 COPY_SEG_STRICT(ss);
145
146 {
147 unsigned int tmpflags;
148 err |= __get_user(tmpflags, &sc->eflags);
149 regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
150 regs->orig_eax = -1; /* disable syscall checks */
151 }
152
153 {
154 struct _fpstate __user * buf;
155 err |= __get_user(buf, &sc->fpstate);
156 if (buf) {
157 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
158 goto badframe;
159 err |= restore_i387(buf);
160 } else {
161 struct task_struct *me = current;
162 if (used_math()) {
163 clear_fpu(me);
164 clear_used_math();
165 }
166 }
167 }
168
169 err |= __get_user(*peax, &sc->eax);
170 return err;
171
172badframe:
173 return 1;
174}
175
176asmlinkage int sys_sigreturn(unsigned long __unused)
177{
178 struct pt_regs *regs = (struct pt_regs *) &__unused;
179 struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
180 sigset_t set;
181 int eax;
182
183 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
184 goto badframe;
185 if (__get_user(set.sig[0], &frame->sc.oldmask)
186 || (_NSIG_WORDS > 1
187 && __copy_from_user(&set.sig[1], &frame->extramask,
188 sizeof(frame->extramask))))
189 goto badframe;
190
191 sigdelsetmask(&set, ~_BLOCKABLE);
192 spin_lock_irq(&current->sighand->siglock);
193 current->blocked = set;
194 recalc_sigpending();
195 spin_unlock_irq(&current->sighand->siglock);
196
197 if (restore_sigcontext(regs, &frame->sc, &eax))
198 goto badframe;
199 return eax;
200
201badframe:
202 if (show_unhandled_signals && printk_ratelimit())
203 printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
204 " esp:%lx oeax:%lx\n",
205 current->pid > 1 ? KERN_INFO : KERN_EMERG,
206 current->comm, current->pid, frame, regs->eip,
207 regs->esp, regs->orig_eax);
208
209 force_sig(SIGSEGV, current);
210 return 0;
211}
212
213asmlinkage int sys_rt_sigreturn(unsigned long __unused)
214{
215 struct pt_regs *regs = (struct pt_regs *) &__unused;
216 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
217 sigset_t set;
218 int eax;
219
220 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
221 goto badframe;
222 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
223 goto badframe;
224
225 sigdelsetmask(&set, ~_BLOCKABLE);
226 spin_lock_irq(&current->sighand->siglock);
227 current->blocked = set;
228 recalc_sigpending();
229 spin_unlock_irq(&current->sighand->siglock);
230
231 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
232 goto badframe;
233
234 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
235 goto badframe;
236
237 return eax;
238
239badframe:
240 force_sig(SIGSEGV, current);
241 return 0;
242}
243
244/*
245 * Set up a signal frame.
246 */
247
248static int
249setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
250 struct pt_regs *regs, unsigned long mask)
251{
252 int tmp, err = 0;
253
254 err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
255 savesegment(gs, tmp);
256 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
257
258 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
259 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
260 err |= __put_user(regs->edi, &sc->edi);
261 err |= __put_user(regs->esi, &sc->esi);
262 err |= __put_user(regs->ebp, &sc->ebp);
263 err |= __put_user(regs->esp, &sc->esp);
264 err |= __put_user(regs->ebx, &sc->ebx);
265 err |= __put_user(regs->edx, &sc->edx);
266 err |= __put_user(regs->ecx, &sc->ecx);
267 err |= __put_user(regs->eax, &sc->eax);
268 err |= __put_user(current->thread.trap_no, &sc->trapno);
269 err |= __put_user(current->thread.error_code, &sc->err);
270 err |= __put_user(regs->eip, &sc->eip);
271 err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
272 err |= __put_user(regs->eflags, &sc->eflags);
273 err |= __put_user(regs->esp, &sc->esp_at_signal);
274 err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
275
276 tmp = save_i387(fpstate);
277 if (tmp < 0)
278 err = 1;
279 else
280 err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
281
282 /* non-iBCS2 extensions.. */
283 err |= __put_user(mask, &sc->oldmask);
284 err |= __put_user(current->thread.cr2, &sc->cr2);
285
286 return err;
287}
288
289/*
290 * Determine which stack to use..
291 */
292static inline void __user *
293get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
294{
295 unsigned long esp;
296
297 /* Default to using normal stack */
298 esp = regs->esp;
299
300 /* This is the X/Open sanctioned signal stack switching. */
301 if (ka->sa.sa_flags & SA_ONSTACK) {
302 if (sas_ss_flags(esp) == 0)
303 esp = current->sas_ss_sp + current->sas_ss_size;
304 }
305
306 /* This is the legacy signal stack switching. */
307 else if ((regs->xss & 0xffff) != __USER_DS &&
308 !(ka->sa.sa_flags & SA_RESTORER) &&
309 ka->sa.sa_restorer) {
310 esp = (unsigned long) ka->sa.sa_restorer;
311 }
312
313 esp -= frame_size;
314 /* Align the stack pointer according to the i386 ABI,
315 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
316 esp = ((esp + 4) & -16ul) - 4;
317 return (void __user *) esp;
318}
319
320/* These symbols are defined with the addresses in the vsyscall page.
321 See vsyscall-sigreturn.S. */
322extern void __user __kernel_sigreturn;
323extern void __user __kernel_rt_sigreturn;
324
325static int setup_frame(int sig, struct k_sigaction *ka,
326 sigset_t *set, struct pt_regs * regs)
327{
328 void __user *restorer;
329 struct sigframe __user *frame;
330 int err = 0;
331 int usig;
332
333 frame = get_sigframe(ka, regs, sizeof(*frame));
334
335 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
336 goto give_sigsegv;
337
338 usig = current_thread_info()->exec_domain
339 && current_thread_info()->exec_domain->signal_invmap
340 && sig < 32
341 ? current_thread_info()->exec_domain->signal_invmap[sig]
342 : sig;
343
344 err = __put_user(usig, &frame->sig);
345 if (err)
346 goto give_sigsegv;
347
348 err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
349 if (err)
350 goto give_sigsegv;
351
352 if (_NSIG_WORDS > 1) {
353 err = __copy_to_user(&frame->extramask, &set->sig[1],
354 sizeof(frame->extramask));
355 if (err)
356 goto give_sigsegv;
357 }
358
359 if (current->binfmt->hasvdso)
360 restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
361 else
362 restorer = (void *)&frame->retcode;
363 if (ka->sa.sa_flags & SA_RESTORER)
364 restorer = ka->sa.sa_restorer;
365
366 /* Set up to return from userspace. */
367 err |= __put_user(restorer, &frame->pretcode);
368
369 /*
370 * This is popl %eax ; movl $,%eax ; int $0x80
371 *
372 * WE DO NOT USE IT ANY MORE! It's only left here for historical
373 * reasons and because gdb uses it as a signature to notice
374 * signal handler stack frames.
375 */
376 err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
377 err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
378 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
379
380 if (err)
381 goto give_sigsegv;
382
383 /* Set up registers for signal handler */
384 regs->esp = (unsigned long) frame;
385 regs->eip = (unsigned long) ka->sa.sa_handler;
386 regs->eax = (unsigned long) sig;
387 regs->edx = (unsigned long) 0;
388 regs->ecx = (unsigned long) 0;
389
390 set_fs(USER_DS);
391 regs->xds = __USER_DS;
392 regs->xes = __USER_DS;
393 regs->xss = __USER_DS;
394 regs->xcs = __USER_CS;
395
396 /*
397 * Clear TF when entering the signal handler, but
398 * notify any tracer that was single-stepping it.
399 * The tracer may want to single-step inside the
400 * handler too.
401 */
402 regs->eflags &= ~TF_MASK;
403 if (test_thread_flag(TIF_SINGLESTEP))
404 ptrace_notify(SIGTRAP);
405
406#if DEBUG_SIG
407 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
408 current->comm, current->pid, frame, regs->eip, frame->pretcode);
409#endif
410
411 return 0;
412
413give_sigsegv:
414 force_sigsegv(sig, current);
415 return -EFAULT;
416}
417
418static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
419 sigset_t *set, struct pt_regs * regs)
420{
421 void __user *restorer;
422 struct rt_sigframe __user *frame;
423 int err = 0;
424 int usig;
425
426 frame = get_sigframe(ka, regs, sizeof(*frame));
427
428 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
429 goto give_sigsegv;
430
431 usig = current_thread_info()->exec_domain
432 && current_thread_info()->exec_domain->signal_invmap
433 && sig < 32
434 ? current_thread_info()->exec_domain->signal_invmap[sig]
435 : sig;
436
437 err |= __put_user(usig, &frame->sig);
438 err |= __put_user(&frame->info, &frame->pinfo);
439 err |= __put_user(&frame->uc, &frame->puc);
440 err |= copy_siginfo_to_user(&frame->info, info);
441 if (err)
442 goto give_sigsegv;
443
444 /* Create the ucontext. */
445 err |= __put_user(0, &frame->uc.uc_flags);
446 err |= __put_user(0, &frame->uc.uc_link);
447 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
448 err |= __put_user(sas_ss_flags(regs->esp),
449 &frame->uc.uc_stack.ss_flags);
450 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
451 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
452 regs, set->sig[0]);
453 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
454 if (err)
455 goto give_sigsegv;
456
457 /* Set up to return from userspace. */
458 restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
459 if (ka->sa.sa_flags & SA_RESTORER)
460 restorer = ka->sa.sa_restorer;
461 err |= __put_user(restorer, &frame->pretcode);
462
463 /*
464 * This is movl $,%eax ; int $0x80
465 *
466 * WE DO NOT USE IT ANY MORE! It's only left here for historical
467 * reasons and because gdb uses it as a signature to notice
468 * signal handler stack frames.
469 */
470 err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
471 err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
472 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
473
474 if (err)
475 goto give_sigsegv;
476
477 /* Set up registers for signal handler */
478 regs->esp = (unsigned long) frame;
479 regs->eip = (unsigned long) ka->sa.sa_handler;
480 regs->eax = (unsigned long) usig;
481 regs->edx = (unsigned long) &frame->info;
482 regs->ecx = (unsigned long) &frame->uc;
483
484 set_fs(USER_DS);
485 regs->xds = __USER_DS;
486 regs->xes = __USER_DS;
487 regs->xss = __USER_DS;
488 regs->xcs = __USER_CS;
489
490 /*
491 * Clear TF when entering the signal handler, but
492 * notify any tracer that was single-stepping it.
493 * The tracer may want to single-step inside the
494 * handler too.
495 */
496 regs->eflags &= ~TF_MASK;
497 if (test_thread_flag(TIF_SINGLESTEP))
498 ptrace_notify(SIGTRAP);
499
500#if DEBUG_SIG
501 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
502 current->comm, current->pid, frame, regs->eip, frame->pretcode);
503#endif
504
505 return 0;
506
507give_sigsegv:
508 force_sigsegv(sig, current);
509 return -EFAULT;
510}
511
512/*
513 * OK, we're invoking a handler
514 */
515
516static int
517handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
518 sigset_t *oldset, struct pt_regs * regs)
519{
520 int ret;
521
522 /* Are we from a system call? */
523 if (regs->orig_eax >= 0) {
524 /* If so, check system call restarting.. */
525 switch (regs->eax) {
526 case -ERESTART_RESTARTBLOCK:
527 case -ERESTARTNOHAND:
528 regs->eax = -EINTR;
529 break;
530
531 case -ERESTARTSYS:
532 if (!(ka->sa.sa_flags & SA_RESTART)) {
533 regs->eax = -EINTR;
534 break;
535 }
536 /* fallthrough */
537 case -ERESTARTNOINTR:
538 regs->eax = regs->orig_eax;
539 regs->eip -= 2;
540 }
541 }
542
543 /*
544 * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
545 * that register information in the sigcontext is correct.
546 */
547 if (unlikely(regs->eflags & TF_MASK)
548 && likely(current->ptrace & PT_DTRACE)) {
549 current->ptrace &= ~PT_DTRACE;
550 regs->eflags &= ~TF_MASK;
551 }
552
553 /* Set up the stack frame */
554 if (ka->sa.sa_flags & SA_SIGINFO)
555 ret = setup_rt_frame(sig, ka, info, oldset, regs);
556 else
557 ret = setup_frame(sig, ka, oldset, regs);
558
559 if (ret == 0) {
560 spin_lock_irq(&current->sighand->siglock);
561 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
562 if (!(ka->sa.sa_flags & SA_NODEFER))
563 sigaddset(&current->blocked,sig);
564 recalc_sigpending();
565 spin_unlock_irq(&current->sighand->siglock);
566 }
567
568 return ret;
569}
570
571/*
572 * Note that 'init' is a special process: it doesn't get signals it doesn't
573 * want to handle. Thus you cannot kill init even with a SIGKILL even by
574 * mistake.
575 */
576static void fastcall do_signal(struct pt_regs *regs)
577{
578 siginfo_t info;
579 int signr;
580 struct k_sigaction ka;
581 sigset_t *oldset;
582
583 /*
584 * We want the common case to go fast, which
585 * is why we may in certain cases get here from
586 * kernel mode. Just return without doing anything
587	 * if so. vm86 regs are switched out by assembly code
588 * before reaching here, so testing against kernel
589 * CS suffices.
590 */
591 if (!user_mode(regs))
592 return;
593
594 if (test_thread_flag(TIF_RESTORE_SIGMASK))
595 oldset = &current->saved_sigmask;
596 else
597 oldset = &current->blocked;
598
599 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
600 if (signr > 0) {
601 /* Reenable any watchpoints before delivering the
602 * signal to user space. The processor register will
603 * have been cleared if the watchpoint triggered
604 * inside the kernel.
605 */
606 if (unlikely(current->thread.debugreg[7]))
607 set_debugreg(current->thread.debugreg[7], 7);
608
609 /* Whee! Actually deliver the signal. */
610 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
611 /* a signal was successfully delivered; the saved
612 * sigmask will have been stored in the signal frame,
613 * and will be restored by sigreturn, so we can simply
614 * clear the TIF_RESTORE_SIGMASK flag */
615 if (test_thread_flag(TIF_RESTORE_SIGMASK))
616 clear_thread_flag(TIF_RESTORE_SIGMASK);
617 }
618
619 return;
620 }
621
622 /* Did we come from a system call? */
623 if (regs->orig_eax >= 0) {
624 /* Restart the system call - no handlers present */
625 switch (regs->eax) {
626 case -ERESTARTNOHAND:
627 case -ERESTARTSYS:
628 case -ERESTARTNOINTR:
629 regs->eax = regs->orig_eax;
630 regs->eip -= 2;
631 break;
632
633 case -ERESTART_RESTARTBLOCK:
634 regs->eax = __NR_restart_syscall;
635 regs->eip -= 2;
636 break;
637 }
638 }
639
640 /* if there's no signal to deliver, we just put the saved sigmask
641 * back */
642 if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
643 clear_thread_flag(TIF_RESTORE_SIGMASK);
644 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
645 }
646}
647
648/*
649 * notification of userspace execution resumption
650 * - triggered by the TIF_WORK_MASK flags
651 */
652__attribute__((regparm(3)))
653void do_notify_resume(struct pt_regs *regs, void *_unused,
654 __u32 thread_info_flags)
655{
656 /* Pending single-step? */
657 if (thread_info_flags & _TIF_SINGLESTEP) {
658 regs->eflags |= TF_MASK;
659 clear_thread_flag(TIF_SINGLESTEP);
660 }
661
662 /* deal with pending signal delivery */
663 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
664 do_signal(regs);
665
666 clear_thread_flag(TIF_IRET);
667}
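
get_sigframe() above rounds the frame with esp = ((esp + 4) & -16ul) - 4 so that, as its comment states, (sp + 4) & 15 == 0 at handler entry. A small standalone check of that identity (illustrative only, not part of the patch):

    #include <assert.h>

    /* The alignment rule from get_sigframe() above: after the adjustment,
     * (esp + 4) is a multiple of 16 and esp never rises above its old value. */
    static unsigned long align_sigframe(unsigned long esp)
    {
        return ((esp + 4) & -16ul) - 4;
    }

    int main(void)
    {
        for (unsigned long esp = 0x1000; esp < 0x1040; esp++) {
            unsigned long a = align_sigframe(esp);

            assert(((a + 4) & 15) == 0);       /* i386 ABI entry alignment */
            assert(a <= esp && esp - a < 16);  /* rounded down by at most 15 bytes */
        }
        return 0;
    }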
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
new file mode 100644
index 000000000000..2d35d8502029
--- /dev/null
+++ b/arch/x86/kernel/smp_32.c
@@ -0,0 +1,707 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 *
7 * This code is released under the GNU General Public License version 2 or
8 * later.
9 */
10
11#include <linux/init.h>
12
13#include <linux/mm.h>
14#include <linux/delay.h>
15#include <linux/spinlock.h>
16#include <linux/kernel_stat.h>
17#include <linux/mc146818rtc.h>
18#include <linux/cache.h>
19#include <linux/interrupt.h>
20#include <linux/cpu.h>
21#include <linux/module.h>
22
23#include <asm/mtrr.h>
24#include <asm/tlbflush.h>
25#include <asm/mmu_context.h>
26#include <mach_apic.h>
27
28/*
29 * Some notes on x86 processor bugs affecting SMP operation:
30 *
31 * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
32 * The Linux implications for SMP are handled as follows:
33 *
34 * Pentium III / [Xeon]
35 * None of the E1AP-E3AP errata are visible to the user.
36 *
37 * E1AP. see PII A1AP
38 * E2AP. see PII A2AP
39 * E3AP. see PII A3AP
40 *
41 * Pentium II / [Xeon]
42 * None of the A1AP-A3AP errata are visible to the user.
43 *
44 * A1AP. see PPro 1AP
45 * A2AP. see PPro 2AP
46 * A3AP. see PPro 7AP
47 *
48 * Pentium Pro
49 * None of 1AP-9AP errata are visible to the normal user,
50 * except occasional delivery of 'spurious interrupt' as trap #15.
51 * This is very rare and a non-problem.
52 *
53 * 1AP. Linux maps APIC as non-cacheable
54 * 2AP. worked around in hardware
55 * 3AP. fixed in C0 and above steppings microcode update.
56 * Linux does not use excessive STARTUP_IPIs.
57 * 4AP. worked around in hardware
58 * 5AP. symmetric IO mode (normal Linux operation) not affected.
59 * 'noapic' mode has vector 0xf filled out properly.
60 * 6AP. 'noapic' mode might be affected - fixed in later steppings
61 * 7AP. We do not assume writes to the LVT deasserting IRQs
62 * 8AP. We do not enable low power mode (deep sleep) during MP bootup
63 * 9AP. We do not use mixed mode
64 *
65 * Pentium
66 * There is a marginal case where REP MOVS on 100MHz SMP
67 * machines with B stepping processors can fail. XXX should provide
68 * an L1cache=Writethrough or L1cache=off option.
69 *
70 * B stepping CPUs may hang. There are hardware workarounds
 71 * for this. We warn about it in case your board doesn't have the
 72 * workarounds. Basically that's so I can tell anyone with a B stepping
73 * CPU and SMP problems "tough".
74 *
75 * Specific items [From Pentium Processor Specification Update]
76 *
77 * 1AP. Linux doesn't use remote read
78 * 2AP. Linux doesn't trust APIC errors
79 * 3AP. We work around this
80 * 4AP. Linux never generated 3 interrupts of the same priority
81 * to cause a lost local interrupt.
82 * 5AP. Remote read is never used
83 * 6AP. not affected - worked around in hardware
84 * 7AP. not affected - worked around in hardware
85 * 8AP. worked around in hardware - we get explicit CS errors if not
86 * 9AP. only 'noapic' mode affected. Might generate spurious
87 * interrupts, we log only the first one and count the
88 * rest silently.
89 * 10AP. not affected - worked around in hardware
90 * 11AP. Linux reads the APIC between writes to avoid this, as per
91 * the documentation. Make sure you preserve this as it affects
92 * the C stepping chips too.
93 * 12AP. not affected - worked around in hardware
94 * 13AP. not affected - worked around in hardware
95 * 14AP. we always deassert INIT during bootup
96 * 15AP. not affected - worked around in hardware
97 * 16AP. not affected - worked around in hardware
98 * 17AP. not affected - worked around in hardware
99 * 18AP. not affected - worked around in hardware
100 * 19AP. not affected - worked around in BIOS
101 *
102 * If this sounds worrying believe me these bugs are either ___RARE___,
103 * or are signal timing bugs worked around in hardware and there's
104 * about nothing of note with C stepping upwards.
105 */
106
107DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
108
109/*
110 * the following functions deal with sending IPIs between CPUs.
111 *
112 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
113 */
114
115static inline int __prepare_ICR (unsigned int shortcut, int vector)
116{
117 unsigned int icr = shortcut | APIC_DEST_LOGICAL;
118
119 switch (vector) {
120 default:
121 icr |= APIC_DM_FIXED | vector;
122 break;
123 case NMI_VECTOR:
124 icr |= APIC_DM_NMI;
125 break;
126 }
127 return icr;
128}
129
130static inline int __prepare_ICR2 (unsigned int mask)
131{
132 return SET_APIC_DEST_FIELD(mask);
133}
134
135void __send_IPI_shortcut(unsigned int shortcut, int vector)
136{
137 /*
138 * Subtle. In the case of the 'never do double writes' workaround
139	 * we have to lock out interrupts to be safe. As we don't care
 140	 * about the value read, we use an atomic rmw access to avoid costly
141 * cli/sti. Otherwise we use an even cheaper single atomic write
142 * to the APIC.
143 */
144 unsigned int cfg;
145
146 /*
147 * Wait for idle.
148 */
149 apic_wait_icr_idle();
150
151 /*
152 * No need to touch the target chip field
153 */
154 cfg = __prepare_ICR(shortcut, vector);
155
156 /*
157 * Send the IPI. The write to APIC_ICR fires this off.
158 */
159 apic_write_around(APIC_ICR, cfg);
160}
161
162void fastcall send_IPI_self(int vector)
163{
164 __send_IPI_shortcut(APIC_DEST_SELF, vector);
165}
166
167/*
168 * This is used to send an IPI with no shorthand notation (the destination is
169 * specified in bits 56 to 63 of the ICR).
170 */
171static inline void __send_IPI_dest_field(unsigned long mask, int vector)
172{
173 unsigned long cfg;
174
175 /*
176 * Wait for idle.
177 */
178 if (unlikely(vector == NMI_VECTOR))
179 safe_apic_wait_icr_idle();
180 else
181 apic_wait_icr_idle();
182
183 /*
184 * prepare target chip field
185 */
186 cfg = __prepare_ICR2(mask);
187 apic_write_around(APIC_ICR2, cfg);
188
189 /*
190 * program the ICR
191 */
192 cfg = __prepare_ICR(0, vector);
193
194 /*
195 * Send the IPI. The write to APIC_ICR fires this off.
196 */
197 apic_write_around(APIC_ICR, cfg);
198}
199
200/*
201 * This is only used on smaller machines.
202 */
203void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
204{
205 unsigned long mask = cpus_addr(cpumask)[0];
206 unsigned long flags;
207
208 local_irq_save(flags);
209 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
210 __send_IPI_dest_field(mask, vector);
211 local_irq_restore(flags);
212}
213
214void send_IPI_mask_sequence(cpumask_t mask, int vector)
215{
216 unsigned long flags;
217 unsigned int query_cpu;
218
219 /*
220 * Hack. The clustered APIC addressing mode doesn't allow us to send
221	 * to an arbitrary mask, so I do a unicast to each CPU instead. This
222 * should be modified to do 1 message per cluster ID - mbligh
223 */
224
225 local_irq_save(flags);
226 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
227 if (cpu_isset(query_cpu, mask)) {
228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
229 vector);
230 }
231 }
232 local_irq_restore(flags);
233}
234
235#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
236
237/*
238 * Smarter SMP flushing macros.
239 * c/o Linus Torvalds.
240 *
241 * These mean you can really definitely utterly forget about
242	 * writing to user space from interrupts. (It's not allowed anyway).
243 *
244 * Optimizations Manfred Spraul <manfred@colorfullife.com>
245 */
246
247static cpumask_t flush_cpumask;
248static struct mm_struct * flush_mm;
249static unsigned long flush_va;
250static DEFINE_SPINLOCK(tlbstate_lock);
251
252/*
253	 * We cannot call mmdrop() because we are in interrupt context;
 254	 * instead we update mm->cpu_vm_mask.
255 *
256 * We need to reload %cr3 since the page tables may be going
257	 * away from under us.
258 */
259void leave_mm(unsigned long cpu)
260{
261 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
262 BUG();
263 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
264 load_cr3(swapper_pg_dir);
265}
266
267/*
268 *
269 * The flush IPI assumes that a thread switch happens in this order:
270 * [cpu0: the cpu that switches]
271 * 1) switch_mm() either 1a) or 1b)
272 * 1a) thread switch to a different mm
273 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
274 * Stop ipi delivery for the old mm. This is not synchronized with
275	 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
 276	 * for the wrong mm, and in the worst case we perform a superfluous
277 * tlb flush.
278 * 1a2) set cpu_tlbstate to TLBSTATE_OK
279 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
280 * was in lazy tlb mode.
281 * 1a3) update cpu_tlbstate[].active_mm
282 * Now cpu0 accepts tlb flushes for the new mm.
283 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
284 * Now the other cpus will send tlb flush ipis.
285 * 1a4) change cr3.
286 * 1b) thread switch without mm change
287 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
288 * flush ipis.
289 * 1b1) set cpu_tlbstate to TLBSTATE_OK
290 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
291 * Atomically set the bit [other cpus will start sending flush ipis],
292 * and test the bit.
293 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
294 * 2) switch %%esp, ie current
295 *
296 * The interrupt must handle 2 special cases:
297 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
298 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
299 * runs in kernel space, the cpu could load tlb entries for user space
300 * pages.
301 *
302 * The good news is that cpu_tlbstate is local to each cpu, no
303 * write/read ordering problems.
304 */
305
306/*
307 * TLB flush IPI:
308 *
309 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
310 * 2) Leave the mm if we are in the lazy tlb mode.
311 */
312
313fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
314{
315 unsigned long cpu;
316
317 cpu = get_cpu();
318
319 if (!cpu_isset(cpu, flush_cpumask))
320 goto out;
321 /*
322 * This was a BUG() but until someone can quote me the
323 * line from the intel manual that guarantees an IPI to
324 * multiple CPUs is retried _only_ on the erroring CPUs
325	 * it's staying as a return
326 *
327 * BUG();
328 */
329
330 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
331 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
332 if (flush_va == TLB_FLUSH_ALL)
333 local_flush_tlb();
334 else
335 __flush_tlb_one(flush_va);
336 } else
337 leave_mm(cpu);
338 }
339 ack_APIC_irq();
340 smp_mb__before_clear_bit();
341 cpu_clear(cpu, flush_cpumask);
342 smp_mb__after_clear_bit();
343out:
344 put_cpu_no_resched();
345}
346
347void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
348 unsigned long va)
349{
350 cpumask_t cpumask = *cpumaskp;
351
352 /*
353 * A couple of (to be removed) sanity checks:
354 *
355 * - current CPU must not be in mask
356 * - mask must exist :)
357 */
358 BUG_ON(cpus_empty(cpumask));
359 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
360 BUG_ON(!mm);
361
362#ifdef CONFIG_HOTPLUG_CPU
363 /* If a CPU which we ran on has gone down, OK. */
364 cpus_and(cpumask, cpumask, cpu_online_map);
365 if (unlikely(cpus_empty(cpumask)))
366 return;
367#endif
368
369 /*
370	 * I'm not happy about this global shared spinlock in the
371 * MM hot path, but we'll see how contended it is.
372 * AK: x86-64 has a faster method that could be ported.
373 */
374 spin_lock(&tlbstate_lock);
375
376 flush_mm = mm;
377 flush_va = va;
378 cpus_or(flush_cpumask, cpumask, flush_cpumask);
379 /*
380 * We have to send the IPI only to
381 * CPUs affected.
382 */
383 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
384
385 while (!cpus_empty(flush_cpumask))
386 /* nothing. lockup detection does not belong here */
387 cpu_relax();
388
389 flush_mm = NULL;
390 flush_va = 0;
391 spin_unlock(&tlbstate_lock);
392}
393
394void flush_tlb_current_task(void)
395{
396 struct mm_struct *mm = current->mm;
397 cpumask_t cpu_mask;
398
399 preempt_disable();
400 cpu_mask = mm->cpu_vm_mask;
401 cpu_clear(smp_processor_id(), cpu_mask);
402
403 local_flush_tlb();
404 if (!cpus_empty(cpu_mask))
405 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
406 preempt_enable();
407}
408
409void flush_tlb_mm (struct mm_struct * mm)
410{
411 cpumask_t cpu_mask;
412
413 preempt_disable();
414 cpu_mask = mm->cpu_vm_mask;
415 cpu_clear(smp_processor_id(), cpu_mask);
416
417 if (current->active_mm == mm) {
418 if (current->mm)
419 local_flush_tlb();
420 else
421 leave_mm(smp_processor_id());
422 }
423 if (!cpus_empty(cpu_mask))
424 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
425
426 preempt_enable();
427}
428
429void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
430{
431 struct mm_struct *mm = vma->vm_mm;
432 cpumask_t cpu_mask;
433
434 preempt_disable();
435 cpu_mask = mm->cpu_vm_mask;
436 cpu_clear(smp_processor_id(), cpu_mask);
437
438 if (current->active_mm == mm) {
439 if(current->mm)
440 __flush_tlb_one(va);
441 else
442 leave_mm(smp_processor_id());
443 }
444
445 if (!cpus_empty(cpu_mask))
446 flush_tlb_others(cpu_mask, mm, va);
447
448 preempt_enable();
449}
450EXPORT_SYMBOL(flush_tlb_page);
451
452static void do_flush_tlb_all(void* info)
453{
454 unsigned long cpu = smp_processor_id();
455
456 __flush_tlb_all();
457 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
458 leave_mm(cpu);
459}
460
461void flush_tlb_all(void)
462{
463 on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
464}
465
466/*
467 * this function sends a 'reschedule' IPI to another CPU.
468 * it goes straight through and wastes no time serializing
469 * anything. Worst case is that we lose a reschedule ...
470 */
471static void native_smp_send_reschedule(int cpu)
472{
473 WARN_ON(cpu_is_offline(cpu));
474 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
475}
476
477/*
478 * Structure and data for smp_call_function(). This is designed to minimise
479 * static memory requirements. It also looks cleaner.
480 */
481static DEFINE_SPINLOCK(call_lock);
482
483struct call_data_struct {
484 void (*func) (void *info);
485 void *info;
486 atomic_t started;
487 atomic_t finished;
488 int wait;
489};
490
491void lock_ipi_call_lock(void)
492{
493 spin_lock_irq(&call_lock);
494}
495
496void unlock_ipi_call_lock(void)
497{
498 spin_unlock_irq(&call_lock);
499}
500
501static struct call_data_struct *call_data;
502
503static void __smp_call_function(void (*func) (void *info), void *info,
504 int nonatomic, int wait)
505{
506 struct call_data_struct data;
507 int cpus = num_online_cpus() - 1;
508
509 if (!cpus)
510 return;
511
512 data.func = func;
513 data.info = info;
514 atomic_set(&data.started, 0);
515 data.wait = wait;
516 if (wait)
517 atomic_set(&data.finished, 0);
518
519 call_data = &data;
520 mb();
521
522 /* Send a message to all other CPUs and wait for them to respond */
523 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
524
525 /* Wait for response */
526 while (atomic_read(&data.started) != cpus)
527 cpu_relax();
528
529 if (wait)
530 while (atomic_read(&data.finished) != cpus)
531 cpu_relax();
532}
533
534
535/**
536 * smp_call_function_mask(): Run a function on a set of other CPUs.
537 * @mask: The set of cpus to run on. Must not include the current cpu.
538 * @func: The function to run. This must be fast and non-blocking.
539 * @info: An arbitrary pointer to pass to the function.
540 * @wait: If true, wait (atomically) until function has completed on other CPUs.
541 *
542 * Returns 0 on success, else a negative status code.
543 *
544 * If @wait is true, then returns once @func has returned; otherwise
545 * it returns just before the target cpu calls @func.
546 *
547 * You must not call this function with disabled interrupts or from a
548 * hardware interrupt handler or from a bottom half handler.
549 */
550static int
551native_smp_call_function_mask(cpumask_t mask,
552 void (*func)(void *), void *info,
553 int wait)
554{
555 struct call_data_struct data;
556 cpumask_t allbutself;
557 int cpus;
558
559 /* Can deadlock when called with interrupts disabled */
560 WARN_ON(irqs_disabled());
561
562 /* Holding any lock stops cpus from going down. */
563 spin_lock(&call_lock);
564
565 allbutself = cpu_online_map;
566 cpu_clear(smp_processor_id(), allbutself);
567
568 cpus_and(mask, mask, allbutself);
569 cpus = cpus_weight(mask);
570
571 if (!cpus) {
572 spin_unlock(&call_lock);
573 return 0;
574 }
575
576 data.func = func;
577 data.info = info;
578 atomic_set(&data.started, 0);
579 data.wait = wait;
580 if (wait)
581 atomic_set(&data.finished, 0);
582
583 call_data = &data;
584 mb();
585
586 /* Send a message to other CPUs */
587 if (cpus_equal(mask, allbutself))
588 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
589 else
590 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
591
592 /* Wait for response */
593 while (atomic_read(&data.started) != cpus)
594 cpu_relax();
595
596 if (wait)
597 while (atomic_read(&data.finished) != cpus)
598 cpu_relax();
599 spin_unlock(&call_lock);
600
601 return 0;
602}
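As a usage illustration only (the helpers below are hypothetical and not part of this patch), a caller obeying the rules in the comment block above might use smp_call_function_mask() like this, passing wait=1 so the on-stack info pointer stays valid until every target CPU has run the callback:

static void example_count_call(void *info)
{
	atomic_inc((atomic_t *)info);	/* runs on each target CPU */
}

static int example_run_on_others(void)
{
	atomic_t hits = ATOMIC_INIT(0);
	cpumask_t mask;
	int ret;

	mask = cpu_online_map;
	cpu_clear(get_cpu(), mask);	/* exclude ourselves, preemption off */
	/* wait == 1: 'hits' must stay in scope until all targets return */
	ret = smp_call_function_mask(mask, example_count_call, &hits, 1);
	put_cpu();
	return ret;
}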
603
604static void stop_this_cpu (void * dummy)
605{
606 local_irq_disable();
607 /*
608 * Remove this CPU:
609 */
610 cpu_clear(smp_processor_id(), cpu_online_map);
611 disable_local_APIC();
612 if (cpu_data[smp_processor_id()].hlt_works_ok)
613 for(;;) halt();
614 for (;;);
615}
616
617/*
618 * this function calls the 'stop' function on all other CPUs in the system.
619 */
620
621static void native_smp_send_stop(void)
622{
623 /* Don't deadlock on the call lock in panic */
624 int nolock = !spin_trylock(&call_lock);
625 unsigned long flags;
626
627 local_irq_save(flags);
628 __smp_call_function(stop_this_cpu, NULL, 0, 0);
629 if (!nolock)
630 spin_unlock(&call_lock);
631 disable_local_APIC();
632 local_irq_restore(flags);
633}
634
635/*
636 * Reschedule call back. Nothing to do,
637 * all the work is done automatically when
638 * we return from the interrupt.
639 */
640fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
641{
642 ack_APIC_irq();
643}
644
645fastcall void smp_call_function_interrupt(struct pt_regs *regs)
646{
647 void (*func) (void *info) = call_data->func;
648 void *info = call_data->info;
649 int wait = call_data->wait;
650
651 ack_APIC_irq();
652 /*
653 * Notify initiating CPU that I've grabbed the data and am
654 * about to execute the function
655 */
656 mb();
657 atomic_inc(&call_data->started);
658 /*
659 * At this point the info structure may be out of scope unless wait==1
660 */
661 irq_enter();
662 (*func)(info);
663 irq_exit();
664
665 if (wait) {
666 mb();
667 atomic_inc(&call_data->finished);
668 }
669}
670
671static int convert_apicid_to_cpu(int apic_id)
672{
673 int i;
674
675 for (i = 0; i < NR_CPUS; i++) {
676 if (x86_cpu_to_apicid[i] == apic_id)
677 return i;
678 }
679 return -1;
680}
681
682int safe_smp_processor_id(void)
683{
684 int apicid, cpuid;
685
686 if (!boot_cpu_has(X86_FEATURE_APIC))
687 return 0;
688
689 apicid = hard_smp_processor_id();
690 if (apicid == BAD_APICID)
691 return 0;
692
693 cpuid = convert_apicid_to_cpu(apicid);
694
695 return cpuid >= 0 ? cpuid : 0;
696}
697
698struct smp_ops smp_ops = {
699 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
700 .smp_prepare_cpus = native_smp_prepare_cpus,
701 .cpu_up = native_cpu_up,
702 .smp_cpus_done = native_smp_cpus_done,
703
704 .smp_send_stop = native_smp_send_stop,
705 .smp_send_reschedule = native_smp_send_reschedule,
706 .smp_call_function_mask = native_smp_call_function_mask,
707};
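The smp_ops table is what lets paravirtualized guests (VMI here, Xen later) substitute their own IPI and CPU bring-up primitives. The generic entry points presumably dispatch through it along the following lines; this is only a hedged sketch of the wrapper shape, the actual inlines live in the i386 <asm/smp.h> header rather than in this file.

static inline void smp_send_reschedule(int cpu)
{
	smp_ops.smp_send_reschedule(cpu);
}

static inline int __cpu_up(unsigned int cpu)
{
	return smp_ops.cpu_up(cpu);
}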
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
new file mode 100644
index 000000000000..e4f61d1c6248
--- /dev/null
+++ b/arch/x86/kernel/smpboot_32.c
@@ -0,0 +1,1322 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 *
7 * Much of the core SMP work is based on previous work by Thomas Radke, to
8 * whom a great many thanks are extended.
9 *
10 * Thanks to Intel for making available several different Pentium,
11 * Pentium Pro and Pentium-II/Xeon MP machines.
12 * Original development of Linux SMP code supported by Caldera.
13 *
14 * This code is released under the GNU General Public License version 2 or
15 * later.
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIPS report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Martin J. Bligh : Added support for multi-quad systems
33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34 *	Rusty Russell	:	Hacked into shape for new "hotplug" boot process. */
35
36#include <linux/module.h>
37#include <linux/init.h>
38#include <linux/kernel.h>
39
40#include <linux/mm.h>
41#include <linux/sched.h>
42#include <linux/kernel_stat.h>
43#include <linux/bootmem.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/percpu.h>
47#include <linux/nmi.h>
48
49#include <linux/delay.h>
50#include <linux/mc146818rtc.h>
51#include <asm/tlbflush.h>
52#include <asm/desc.h>
53#include <asm/arch_hooks.h>
54#include <asm/nmi.h>
55
56#include <mach_apic.h>
57#include <mach_wakecpu.h>
58#include <smpboot_hooks.h>
59#include <asm/vmi.h>
60#include <asm/mtrr.h>
61
62/* Set if we find a B stepping CPU */
63static int __devinitdata smp_b_stepping;
64
65/* Number of siblings per CPU package */
66int smp_num_siblings = 1;
67EXPORT_SYMBOL(smp_num_siblings);
68
69/* Last level cache ID of each logical CPU */
70int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
71
72/* representing HT siblings of each logical CPU */
73cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
74EXPORT_SYMBOL(cpu_sibling_map);
75
76/* representing HT and core siblings of each logical CPU */
77cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
78EXPORT_SYMBOL(cpu_core_map);
79
80/* bitmap of online cpus */
81cpumask_t cpu_online_map __read_mostly;
82EXPORT_SYMBOL(cpu_online_map);
83
84cpumask_t cpu_callin_map;
85cpumask_t cpu_callout_map;
86EXPORT_SYMBOL(cpu_callout_map);
87cpumask_t cpu_possible_map;
88EXPORT_SYMBOL(cpu_possible_map);
89static cpumask_t smp_commenced_mask;
90
91/* Per CPU bogomips and other parameters */
92struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
93EXPORT_SYMBOL(cpu_data);
94
95u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
96 { [0 ... NR_CPUS-1] = 0xff };
97EXPORT_SYMBOL(x86_cpu_to_apicid);
98
99u8 apicid_2_node[MAX_APICID];
100
101/*
102 * Trampoline 80x86 program as an array.
103 */
104
105extern unsigned char trampoline_data [];
106extern unsigned char trampoline_end [];
107static unsigned char *trampoline_base;
108static int trampoline_exec;
109
110static void map_cpu_to_logical_apicid(void);
111
112/* State of each CPU. */
113DEFINE_PER_CPU(int, cpu_state) = { 0 };
114
115/*
116 * Currently trivial. Write the real->protected mode
117 * bootstrap into the page concerned. The caller
118 * has made sure it's suitably aligned.
119 */
120
121static unsigned long __devinit setup_trampoline(void)
122{
123 memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
124 return virt_to_phys(trampoline_base);
125}
126
127/*
128 * We are called very early to get the low memory for the
129 * SMP bootup trampoline page.
130 */
131void __init smp_alloc_memory(void)
132{
133 trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
134 /*
135 * Has to be in very low memory so we can execute
136 * real-mode AP code.
137 */
138 if (__pa(trampoline_base) >= 0x9F000)
139 BUG();
140 /*
141 * Make the SMP trampoline executable:
142 */
143 trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
144}
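The low-memory constraint checked above exists because of the STARTUP IPI used later in this file: its vector field selects a 4 KiB real-mode page, so the trampoline's physical address must be page aligned and below 1 MiB (the 0x9F000 check keeps it inside conventional memory). An illustrative, hypothetical sanity check, not part of this commit:

static void example_check_trampoline(unsigned long start_eip)
{
	BUG_ON(start_eip & ~PAGE_MASK);	/* must be page aligned */
	BUG_ON(start_eip >= 0x100000);	/* must sit below 1 MiB */
	/*
	 * This is the value that wakeup_secondary_cpu() later programs
	 * into the ICR as the STARTUP vector.
	 */
	printk(KERN_DEBUG "STARTUP vector: 0x%02lx\n", start_eip >> 12);
}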
145
146/*
147 * The bootstrap kernel entry code has set these up. Save them for
148 * a given CPU.
149 */
150
151void __cpuinit smp_store_cpu_info(int id)
152{
153 struct cpuinfo_x86 *c = cpu_data + id;
154
155 *c = boot_cpu_data;
156 if (id!=0)
157 identify_secondary_cpu(c);
158 /*
159 * Mask B, Pentium, but not Pentium MMX
160 */
161 if (c->x86_vendor == X86_VENDOR_INTEL &&
162 c->x86 == 5 &&
163 c->x86_mask >= 1 && c->x86_mask <= 4 &&
164 c->x86_model <= 3)
165 /*
166 * Remember we have B step Pentia with bugs
167 */
168 smp_b_stepping = 1;
169
170 /*
171 * Certain Athlons might work (for various values of 'work') in SMP
172 * but they are not certified as MP capable.
173 */
174 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
175
176 if (num_possible_cpus() == 1)
177 goto valid_k7;
178
179 /* Athlon 660/661 is valid. */
180 if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
181 goto valid_k7;
182
183 /* Duron 670 is valid */
184 if ((c->x86_model==7) && (c->x86_mask==0))
185 goto valid_k7;
186
187 /*
188 * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
189	 * It's worth noting that the A5 stepping (662) of some Athlon XPs
190 * have the MP bit set.
191 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
192 */
193 if (((c->x86_model==6) && (c->x86_mask>=2)) ||
194 ((c->x86_model==7) && (c->x86_mask>=1)) ||
195 (c->x86_model> 7))
196 if (cpu_has_mp)
197 goto valid_k7;
198
199 /* If we get here, it's not a certified SMP capable AMD system. */
200 add_taint(TAINT_UNSAFE_SMP);
201 }
202
203valid_k7:
204 ;
205}
206
207extern void calibrate_delay(void);
208
209static atomic_t init_deasserted;
210
211static void __cpuinit smp_callin(void)
212{
213 int cpuid, phys_id;
214 unsigned long timeout;
215
216 /*
217 * If waken up by an INIT in an 82489DX configuration
218 * we may get here before an INIT-deassert IPI reaches
219 * our local APIC. We have to wait for the IPI or we'll
220 * lock up on an APIC access.
221 */
222 wait_for_init_deassert(&init_deasserted);
223
224 /*
225 * (This works even if the APIC is not enabled.)
226 */
227 phys_id = GET_APIC_ID(apic_read(APIC_ID));
228 cpuid = smp_processor_id();
229 if (cpu_isset(cpuid, cpu_callin_map)) {
230 printk("huh, phys CPU#%d, CPU#%d already present??\n",
231 phys_id, cpuid);
232 BUG();
233 }
234 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
235
236 /*
237 * STARTUP IPIs are fragile beasts as they might sometimes
238 * trigger some glue motherboard logic. Complete APIC bus
239	 * silence for 1 second; this overestimates the time the
240	 * boot CPU spends sending the up to 2 STARTUP IPIs
241 * by a factor of two. This should be enough.
242 */
243
244 /*
245 * Waiting 2s total for startup (udelay is not yet working)
246 */
247 timeout = jiffies + 2*HZ;
248 while (time_before(jiffies, timeout)) {
249 /*
250		 * Has the boot CPU finished its STARTUP sequence?
251 */
252 if (cpu_isset(cpuid, cpu_callout_map))
253 break;
254 rep_nop();
255 }
256
257 if (!time_before(jiffies, timeout)) {
258 printk("BUG: CPU%d started up but did not get a callout!\n",
259 cpuid);
260 BUG();
261 }
262
263 /*
264 * the boot CPU has finished the init stage and is spinning
265 * on callin_map until we finish. We are free to set up this
266 * CPU, first the APIC. (this is probably redundant on most
267 * boards)
268 */
269
270 Dprintk("CALLIN, before setup_local_APIC().\n");
271 smp_callin_clear_local_apic();
272 setup_local_APIC();
273 map_cpu_to_logical_apicid();
274
275 /*
276 * Get our bogomips.
277 */
278 calibrate_delay();
279 Dprintk("Stack at about %p\n",&cpuid);
280
281 /*
282 * Save our processor parameters
283 */
284 smp_store_cpu_info(cpuid);
285
286 /*
287 * Allow the master to continue.
288 */
289 cpu_set(cpuid, cpu_callin_map);
290}
291
292static int cpucount;
293
294/* maps the cpu to the sched domain representing multi-core */
295cpumask_t cpu_coregroup_map(int cpu)
296{
297 struct cpuinfo_x86 *c = cpu_data + cpu;
298 /*
299 * For perf, we return last level cache shared map.
300 * And for power savings, we return cpu_core_map
301 */
302 if (sched_mc_power_savings || sched_smt_power_savings)
303 return cpu_core_map[cpu];
304 else
305 return c->llc_shared_map;
306}
307
308/* representing cpus for which sibling maps can be computed */
309static cpumask_t cpu_sibling_setup_map;
310
311void __cpuinit set_cpu_sibling_map(int cpu)
312{
313 int i;
314 struct cpuinfo_x86 *c = cpu_data;
315
316 cpu_set(cpu, cpu_sibling_setup_map);
317
318 if (smp_num_siblings > 1) {
319 for_each_cpu_mask(i, cpu_sibling_setup_map) {
320 if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
321 c[cpu].cpu_core_id == c[i].cpu_core_id) {
322 cpu_set(i, cpu_sibling_map[cpu]);
323 cpu_set(cpu, cpu_sibling_map[i]);
324 cpu_set(i, cpu_core_map[cpu]);
325 cpu_set(cpu, cpu_core_map[i]);
326 cpu_set(i, c[cpu].llc_shared_map);
327 cpu_set(cpu, c[i].llc_shared_map);
328 }
329 }
330 } else {
331 cpu_set(cpu, cpu_sibling_map[cpu]);
332 }
333
334 cpu_set(cpu, c[cpu].llc_shared_map);
335
336 if (current_cpu_data.x86_max_cores == 1) {
337 cpu_core_map[cpu] = cpu_sibling_map[cpu];
338 c[cpu].booted_cores = 1;
339 return;
340 }
341
342 for_each_cpu_mask(i, cpu_sibling_setup_map) {
343 if (cpu_llc_id[cpu] != BAD_APICID &&
344 cpu_llc_id[cpu] == cpu_llc_id[i]) {
345 cpu_set(i, c[cpu].llc_shared_map);
346 cpu_set(cpu, c[i].llc_shared_map);
347 }
348 if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
349 cpu_set(i, cpu_core_map[cpu]);
350 cpu_set(cpu, cpu_core_map[i]);
351 /*
352			 * Does this new cpu bring up a new core?
353 */
354 if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
355 /*
356 * for each core in package, increment
357 * the booted_cores for this new cpu
358 */
359 if (first_cpu(cpu_sibling_map[i]) == i)
360 c[cpu].booted_cores++;
361 /*
362 * increment the core count for all
363 * the other cpus in this package
364 */
365 if (i != cpu)
366 c[i].booted_cores++;
367 } else if (i != cpu && !c[cpu].booted_cores)
368 c[cpu].booted_cores = c[i].booted_cores;
369 }
370 }
371}
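For illustration only (a hypothetical debugging helper, not part of this commit), the maps built above can be walked with the usual cpumask iterators: cpu_sibling_map holds the HT threads of a CPU, and cpu_core_map all threads in the same physical package.

static void example_dump_topology(int cpu)
{
	int i;

	for_each_cpu_mask(i, cpu_sibling_map[cpu])
		printk(KERN_DEBUG "cpu%d: HT sibling cpu%d\n", cpu, i);
	for_each_cpu_mask(i, cpu_core_map[cpu])
		printk(KERN_DEBUG "cpu%d: core sibling cpu%d\n", cpu, i);
}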
372
373/*
374 * Activate a secondary processor.
375 */
376static void __cpuinit start_secondary(void *unused)
377{
378 /*
379	 * Don't put *anything* before cpu_init(); SMP booting is so
380	 * fragile that we want to limit the things done here to the
381	 * bare minimum.
382 */
383#ifdef CONFIG_VMI
384 vmi_bringup();
385#endif
386 cpu_init();
387 preempt_disable();
388 smp_callin();
389 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
390 rep_nop();
391 /*
392 * Check TSC synchronization with the BP:
393 */
394 check_tsc_sync_target();
395
396 setup_secondary_clock();
397 if (nmi_watchdog == NMI_IO_APIC) {
398 disable_8259A_irq(0);
399 enable_NMI_through_LVT0(NULL);
400 enable_8259A_irq(0);
401 }
402 /*
403 * low-memory mappings have been cleared, flush them from
404 * the local TLBs too.
405 */
406 local_flush_tlb();
407
408 /* This must be done before setting cpu_online_map */
409 set_cpu_sibling_map(raw_smp_processor_id());
410 wmb();
411
412 /*
413	 * We need to hold call_lock so that there is no inconsistency
414	 * between the time smp_call_function() determines the number of
415	 * IPI recipients and the time when it determines which cpus
416	 * receive the IPI. Holding this lock also keeps this cpu from
417	 * being included in a currently in-progress
418	 * smp_call_function().
419 */
420 lock_ipi_call_lock();
421 cpu_set(smp_processor_id(), cpu_online_map);
422 unlock_ipi_call_lock();
423 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
424
425 /* We can take interrupts now: we're officially "up". */
426 local_irq_enable();
427
428 wmb();
429 cpu_idle();
430}
431
432/*
433 * Everything has been set up for the secondary
434 * CPUs - they just need to reload everything
435 * from the task structure
436 * This function must not return.
437 */
438void __devinit initialize_secondary(void)
439{
440 /*
441 * We don't actually need to load the full TSS,
442 * basically just the stack pointer and the eip.
443 */
444
445 asm volatile(
446 "movl %0,%%esp\n\t"
447 "jmp *%1"
448 :
449 :"m" (current->thread.esp),"m" (current->thread.eip));
450}
451
452/* Static state in head.S used to set up a CPU */
453extern struct {
454 void * esp;
455 unsigned short ss;
456} stack_start;
457
458#ifdef CONFIG_NUMA
459
460/* which logical CPUs are on which nodes */
461cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
462 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
463EXPORT_SYMBOL(node_2_cpu_mask);
464/* which node each logical CPU is on */
465int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
466EXPORT_SYMBOL(cpu_2_node);
467
468/* set up a mapping between cpu and node. */
469static inline void map_cpu_to_node(int cpu, int node)
470{
471 printk("Mapping cpu %d to node %d\n", cpu, node);
472 cpu_set(cpu, node_2_cpu_mask[node]);
473 cpu_2_node[cpu] = node;
474}
475
476/* undo a mapping between cpu and node. */
477static inline void unmap_cpu_to_node(int cpu)
478{
479 int node;
480
481 printk("Unmapping cpu %d from all nodes\n", cpu);
482 for (node = 0; node < MAX_NUMNODES; node ++)
483 cpu_clear(cpu, node_2_cpu_mask[node]);
484 cpu_2_node[cpu] = 0;
485}
486#else /* !CONFIG_NUMA */
487
488#define map_cpu_to_node(cpu, node) ({})
489#define unmap_cpu_to_node(cpu) ({})
490
491#endif /* CONFIG_NUMA */
492
493u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
494
495static void map_cpu_to_logical_apicid(void)
496{
497 int cpu = smp_processor_id();
498 int apicid = logical_smp_processor_id();
499 int node = apicid_to_node(apicid);
500
501 if (!node_online(node))
502 node = first_online_node;
503
504 cpu_2_logical_apicid[cpu] = apicid;
505 map_cpu_to_node(cpu, node);
506}
507
508static void unmap_cpu_to_logical_apicid(int cpu)
509{
510 cpu_2_logical_apicid[cpu] = BAD_APICID;
511 unmap_cpu_to_node(cpu);
512}
513
514static inline void __inquire_remote_apic(int apicid)
515{
516 int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
517 char *names[] = { "ID", "VERSION", "SPIV" };
518 int timeout;
519 unsigned long status;
520
521 printk("Inquiring remote APIC #%d...\n", apicid);
522
523 for (i = 0; i < ARRAY_SIZE(regs); i++) {
524 printk("... APIC #%d %s: ", apicid, names[i]);
525
526 /*
527 * Wait for idle.
528 */
529 status = safe_apic_wait_icr_idle();
530 if (status)
531 printk("a previous APIC delivery may have failed\n");
532
533 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
534 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
535
536 timeout = 0;
537 do {
538 udelay(100);
539 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
540 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
541
542 switch (status) {
543 case APIC_ICR_RR_VALID:
544 status = apic_read(APIC_RRR);
545 printk("%lx\n", status);
546 break;
547 default:
548 printk("failed\n");
549 }
550 }
551}
552
553#ifdef WAKE_SECONDARY_VIA_NMI
554/*
555 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
556 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
557 * won't, so remember to clear down the APIC, etc. later.
558 */
559static int __devinit
560wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
561{
562 unsigned long send_status, accept_status = 0;
563 int maxlvt;
564
565 /* Target chip */
566 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
567
568 /* Boot on the stack */
569 /* Kick the second */
570 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
571
572 Dprintk("Waiting for send to finish...\n");
573 send_status = safe_apic_wait_icr_idle();
574
575 /*
576 * Give the other CPU some time to accept the IPI.
577 */
578 udelay(200);
579 /*
580 * Due to the Pentium erratum 3AP.
581 */
582 maxlvt = lapic_get_maxlvt();
583 if (maxlvt > 3) {
584 apic_read_around(APIC_SPIV);
585 apic_write(APIC_ESR, 0);
586 }
587 accept_status = (apic_read(APIC_ESR) & 0xEF);
588 Dprintk("NMI sent.\n");
589
590 if (send_status)
591 printk("APIC never delivered???\n");
592 if (accept_status)
593 printk("APIC delivery error (%lx).\n", accept_status);
594
595 return (send_status | accept_status);
596}
597#endif /* WAKE_SECONDARY_VIA_NMI */
598
599#ifdef WAKE_SECONDARY_VIA_INIT
600static int __devinit
601wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
602{
603 unsigned long send_status, accept_status = 0;
604 int maxlvt, num_starts, j;
605
606 /*
607 * Be paranoid about clearing APIC errors.
608 */
609 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
610 apic_read_around(APIC_SPIV);
611 apic_write(APIC_ESR, 0);
612 apic_read(APIC_ESR);
613 }
614
615 Dprintk("Asserting INIT.\n");
616
617 /*
618 * Turn INIT on target chip
619 */
620 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
621
622 /*
623 * Send IPI
624 */
625 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
626 | APIC_DM_INIT);
627
628 Dprintk("Waiting for send to finish...\n");
629 send_status = safe_apic_wait_icr_idle();
630
631 mdelay(10);
632
633 Dprintk("Deasserting INIT.\n");
634
635 /* Target chip */
636 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
637
638 /* Send IPI */
639 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
640
641 Dprintk("Waiting for send to finish...\n");
642 send_status = safe_apic_wait_icr_idle();
643
644 atomic_set(&init_deasserted, 1);
645
646 /*
647 * Should we send STARTUP IPIs ?
648 *
649 * Determine this based on the APIC version.
650 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
651 */
652 if (APIC_INTEGRATED(apic_version[phys_apicid]))
653 num_starts = 2;
654 else
655 num_starts = 0;
656
657 /*
658 * Paravirt / VMI wants a startup IPI hook here to set up the
659 * target processor state.
660 */
661 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
662 (unsigned long) stack_start.esp);
663
664 /*
665 * Run STARTUP IPI loop.
666 */
667 Dprintk("#startup loops: %d.\n", num_starts);
668
669 maxlvt = lapic_get_maxlvt();
670
671 for (j = 1; j <= num_starts; j++) {
672 Dprintk("Sending STARTUP #%d.\n",j);
673 apic_read_around(APIC_SPIV);
674 apic_write(APIC_ESR, 0);
675 apic_read(APIC_ESR);
676 Dprintk("After apic_write.\n");
677
678 /*
679 * STARTUP IPI
680 */
681
682 /* Target chip */
683 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
684
685 /* Boot on the stack */
686 /* Kick the second */
687 apic_write_around(APIC_ICR, APIC_DM_STARTUP
688 | (start_eip >> 12));
689
690 /*
691 * Give the other CPU some time to accept the IPI.
692 */
693 udelay(300);
694
695 Dprintk("Startup point 1.\n");
696
697 Dprintk("Waiting for send to finish...\n");
698 send_status = safe_apic_wait_icr_idle();
699
700 /*
701 * Give the other CPU some time to accept the IPI.
702 */
703 udelay(200);
704 /*
705 * Due to the Pentium erratum 3AP.
706 */
707 if (maxlvt > 3) {
708 apic_read_around(APIC_SPIV);
709 apic_write(APIC_ESR, 0);
710 }
711 accept_status = (apic_read(APIC_ESR) & 0xEF);
712 if (send_status || accept_status)
713 break;
714 }
715 Dprintk("After Startup.\n");
716
717 if (send_status)
718 printk("APIC never delivered???\n");
719 if (accept_status)
720 printk("APIC delivery error (%lx).\n", accept_status);
721
722 return (send_status | accept_status);
723}
724#endif /* WAKE_SECONDARY_VIA_INIT */
725
726extern cpumask_t cpu_initialized;
727static inline int alloc_cpu_id(void)
728{
729 cpumask_t tmp_map;
730 int cpu;
731 cpus_complement(tmp_map, cpu_present_map);
732 cpu = first_cpu(tmp_map);
733 if (cpu >= NR_CPUS)
734 return -ENODEV;
735 return cpu;
736}
737
738#ifdef CONFIG_HOTPLUG_CPU
739static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
740static inline struct task_struct * alloc_idle_task(int cpu)
741{
742 struct task_struct *idle;
743
744 if ((idle = cpu_idle_tasks[cpu]) != NULL) {
745		/* initialize thread_struct. We really want to avoid destroying the
746		 * idle thread
747 */
748 idle->thread.esp = (unsigned long)task_pt_regs(idle);
749 init_idle(idle, cpu);
750 return idle;
751 }
752 idle = fork_idle(cpu);
753
754 if (!IS_ERR(idle))
755 cpu_idle_tasks[cpu] = idle;
756 return idle;
757}
758#else
759#define alloc_idle_task(cpu) fork_idle(cpu)
760#endif
761
762static int __cpuinit do_boot_cpu(int apicid, int cpu)
763/*
764 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
765 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
766 * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
767 */
768{
769 struct task_struct *idle;
770 unsigned long boot_error;
771 int timeout;
772 unsigned long start_eip;
773 unsigned short nmi_high = 0, nmi_low = 0;
774
775 /*
776 * Save current MTRR state in case it was changed since early boot
777 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
778 */
779 mtrr_save_state();
780
781 /*
782	 * We can't use kernel_thread since we must avoid
783	 * rescheduling the child.
784 */
785 idle = alloc_idle_task(cpu);
786 if (IS_ERR(idle))
787 panic("failed fork for CPU %d", cpu);
788
789 init_gdt(cpu);
790 per_cpu(current_task, cpu) = idle;
791 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
792
793 idle->thread.eip = (unsigned long) start_secondary;
794 /* start_eip had better be page-aligned! */
795 start_eip = setup_trampoline();
796
797 ++cpucount;
798 alternatives_smp_switch(1);
799
800 /* So we see what's up */
801 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
802 /* Stack for startup_32 can be just as for start_secondary onwards */
803 stack_start.esp = (void *) idle->thread.esp;
804
805 irq_ctx_init(cpu);
806
807 x86_cpu_to_apicid[cpu] = apicid;
808 /*
809 * This grunge runs the startup process for
810 * the targeted processor.
811 */
812
813 atomic_set(&init_deasserted, 0);
814
815 Dprintk("Setting warm reset code and vector.\n");
816
817 store_NMI_vector(&nmi_high, &nmi_low);
818
819 smpboot_setup_warm_reset_vector(start_eip);
820
821 /*
822 * Starting actual IPI sequence...
823 */
824 boot_error = wakeup_secondary_cpu(apicid, start_eip);
825
826 if (!boot_error) {
827 /*
828 * allow APs to start initializing.
829 */
830 Dprintk("Before Callout %d.\n", cpu);
831 cpu_set(cpu, cpu_callout_map);
832 Dprintk("After Callout %d.\n", cpu);
833
834 /*
835 * Wait 5s total for a response
836 */
837 for (timeout = 0; timeout < 50000; timeout++) {
838 if (cpu_isset(cpu, cpu_callin_map))
839 break; /* It has booted */
840 udelay(100);
841 }
842
843 if (cpu_isset(cpu, cpu_callin_map)) {
844 /* number CPUs logically, starting from 1 (BSP is 0) */
845 Dprintk("OK.\n");
846 printk("CPU%d: ", cpu);
847 print_cpu_info(&cpu_data[cpu]);
848 Dprintk("CPU has booted.\n");
849 } else {
850 boot_error= 1;
851 if (*((volatile unsigned char *)trampoline_base)
852 == 0xA5)
853 /* trampoline started but...? */
854 printk("Stuck ??\n");
855 else
856 /* trampoline code not run */
857 printk("Not responding.\n");
858 inquire_remote_apic(apicid);
859 }
860 }
861
862 if (boot_error) {
863 /* Try to put things back the way they were before ... */
864 unmap_cpu_to_logical_apicid(cpu);
865 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
866 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
867 cpucount--;
868 } else {
869 x86_cpu_to_apicid[cpu] = apicid;
870 cpu_set(cpu, cpu_present_map);
871 }
872
873 /* mark "stuck" area as not stuck */
874 *((volatile unsigned long *)trampoline_base) = 0;
875
876 return boot_error;
877}
878
879#ifdef CONFIG_HOTPLUG_CPU
880void cpu_exit_clear(void)
881{
882 int cpu = raw_smp_processor_id();
883
884 idle_task_exit();
885
886 cpucount --;
887 cpu_uninit();
888 irq_ctx_exit(cpu);
889
890 cpu_clear(cpu, cpu_callout_map);
891 cpu_clear(cpu, cpu_callin_map);
892
893 cpu_clear(cpu, smp_commenced_mask);
894 unmap_cpu_to_logical_apicid(cpu);
895}
896
897struct warm_boot_cpu_info {
898 struct completion *complete;
899 struct work_struct task;
900 int apicid;
901 int cpu;
902};
903
904static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
905{
906 struct warm_boot_cpu_info *info =
907 container_of(work, struct warm_boot_cpu_info, task);
908 do_boot_cpu(info->apicid, info->cpu);
909 complete(info->complete);
910}
911
912static int __cpuinit __smp_prepare_cpu(int cpu)
913{
914 DECLARE_COMPLETION_ONSTACK(done);
915 struct warm_boot_cpu_info info;
916 int apicid, ret;
917
918 apicid = x86_cpu_to_apicid[cpu];
919 if (apicid == BAD_APICID) {
920 ret = -ENODEV;
921 goto exit;
922 }
923
924 info.complete = &done;
925 info.apicid = apicid;
926 info.cpu = cpu;
927 INIT_WORK(&info.task, do_warm_boot_cpu);
928
929 /* init low mem mapping */
930 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
931 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
932 flush_tlb_all();
933 schedule_work(&info.task);
934 wait_for_completion(&done);
935
936 zap_low_mappings();
937 ret = 0;
938exit:
939 return ret;
940}
941#endif
942
943/*
944 * Cycle through the processors sending APIC IPIs to boot each.
945 */
946
947static int boot_cpu_logical_apicid;
948/* Where the IO area was mapped on multiquad, always 0 otherwise */
949void *xquad_portio;
950#ifdef CONFIG_X86_NUMAQ
951EXPORT_SYMBOL(xquad_portio);
952#endif
953
954static void __init smp_boot_cpus(unsigned int max_cpus)
955{
956 int apicid, cpu, bit, kicked;
957 unsigned long bogosum = 0;
958
959 /*
960 * Setup boot CPU information
961 */
962 smp_store_cpu_info(0); /* Final full version of the data */
963 printk("CPU%d: ", 0);
964 print_cpu_info(&cpu_data[0]);
965
966 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
967 boot_cpu_logical_apicid = logical_smp_processor_id();
968 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
969
970 current_thread_info()->cpu = 0;
971
972 set_cpu_sibling_map(0);
973
974 /*
975 * If we couldn't find an SMP configuration at boot time,
976 * get out of here now!
977 */
978 if (!smp_found_config && !acpi_lapic) {
979 printk(KERN_NOTICE "SMP motherboard not detected.\n");
980 smpboot_clear_io_apic_irqs();
981 phys_cpu_present_map = physid_mask_of_physid(0);
982 if (APIC_init_uniprocessor())
983 printk(KERN_NOTICE "Local APIC not detected."
984 " Using dummy APIC emulation.\n");
985 map_cpu_to_logical_apicid();
986 cpu_set(0, cpu_sibling_map[0]);
987 cpu_set(0, cpu_core_map[0]);
988 return;
989 }
990
991 /*
992 * Should not be necessary because the MP table should list the boot
993 * CPU too, but we do it for the sake of robustness anyway.
994	 * It makes no sense to do this check in clustered apic mode, so skip it.
995 */
996 if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
997 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
998 boot_cpu_physical_apicid);
999 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1000 }
1001
1002 /*
1003 * If we couldn't find a local APIC, then get out of here now!
1004 */
1005 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
1006 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1007 boot_cpu_physical_apicid);
1008 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
1009 smpboot_clear_io_apic_irqs();
1010 phys_cpu_present_map = physid_mask_of_physid(0);
1011 cpu_set(0, cpu_sibling_map[0]);
1012 cpu_set(0, cpu_core_map[0]);
1013 return;
1014 }
1015
1016 verify_local_APIC();
1017
1018 /*
1019 * If SMP should be disabled, then really disable it!
1020 */
1021 if (!max_cpus) {
1022 smp_found_config = 0;
1023 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
1024 smpboot_clear_io_apic_irqs();
1025 phys_cpu_present_map = physid_mask_of_physid(0);
1026 cpu_set(0, cpu_sibling_map[0]);
1027 cpu_set(0, cpu_core_map[0]);
1028 return;
1029 }
1030
1031 connect_bsp_APIC();
1032 setup_local_APIC();
1033 map_cpu_to_logical_apicid();
1034
1035
1036 setup_portio_remap();
1037
1038 /*
1039 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
1040 *
1041	 * In clustered apic mode, phys_cpu_present_map is constructed thus:
1042 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
1043 * clustered apic ID.
1044 */
1045 Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
1046
1047 kicked = 1;
1048 for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
1049 apicid = cpu_present_to_apicid(bit);
1050 /*
1051 * Don't even attempt to start the boot CPU!
1052 */
1053 if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
1054 continue;
1055
1056 if (!check_apicid_present(bit))
1057 continue;
1058 if (max_cpus <= cpucount+1)
1059 continue;
1060
1061 if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
1062 printk("CPU #%d not responding - cannot use it.\n",
1063 apicid);
1064 else
1065 ++kicked;
1066 }
1067
1068 /*
1069 * Cleanup possible dangling ends...
1070 */
1071 smpboot_restore_warm_reset_vector();
1072
1073 /*
1074 * Allow the user to impress friends.
1075 */
1076 Dprintk("Before bogomips.\n");
1077 for (cpu = 0; cpu < NR_CPUS; cpu++)
1078 if (cpu_isset(cpu, cpu_callout_map))
1079 bogosum += cpu_data[cpu].loops_per_jiffy;
1080 printk(KERN_INFO
1081 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
1082 cpucount+1,
1083 bogosum/(500000/HZ),
1084 (bogosum/(5000/HZ))%100);
1085
1086 Dprintk("Before bogocount - setting activated=1.\n");
1087
1088 if (smp_b_stepping)
1089 printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
1090
1091 /*
1092	 * Don't taint if we are running an SMP kernel on a single non-MP
1093 * approved Athlon
1094 */
1095 if (tainted & TAINT_UNSAFE_SMP) {
1096 if (cpucount)
1097 printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
1098 else
1099 tainted &= ~TAINT_UNSAFE_SMP;
1100 }
1101
1102 Dprintk("Boot done.\n");
1103
1104 /*
1105 * construct cpu_sibling_map[], so that we can tell sibling CPUs
1106 * efficiently.
1107 */
1108 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1109 cpus_clear(cpu_sibling_map[cpu]);
1110 cpus_clear(cpu_core_map[cpu]);
1111 }
1112
1113 cpu_set(0, cpu_sibling_map[0]);
1114 cpu_set(0, cpu_core_map[0]);
1115
1116 smpboot_setup_io_apic();
1117
1118 setup_boot_clock();
1119}
1120
1121/* These are wrappers to interface to the new boot process. Someone
1122 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
1123void __init native_smp_prepare_cpus(unsigned int max_cpus)
1124{
1125 smp_commenced_mask = cpumask_of_cpu(0);
1126 cpu_callin_map = cpumask_of_cpu(0);
1127 mb();
1128 smp_boot_cpus(max_cpus);
1129}
1130
1131void __init native_smp_prepare_boot_cpu(void)
1132{
1133 unsigned int cpu = smp_processor_id();
1134
1135 init_gdt(cpu);
1136 switch_to_new_gdt();
1137
1138 cpu_set(cpu, cpu_online_map);
1139 cpu_set(cpu, cpu_callout_map);
1140 cpu_set(cpu, cpu_present_map);
1141 cpu_set(cpu, cpu_possible_map);
1142 __get_cpu_var(cpu_state) = CPU_ONLINE;
1143}
1144
1145#ifdef CONFIG_HOTPLUG_CPU
1146void remove_siblinginfo(int cpu)
1147{
1148 int sibling;
1149 struct cpuinfo_x86 *c = cpu_data;
1150
1151 for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
1152 cpu_clear(cpu, cpu_core_map[sibling]);
1153 /*
1154 * last thread sibling in this cpu core going down
1155 */
1156 if (cpus_weight(cpu_sibling_map[cpu]) == 1)
1157 c[sibling].booted_cores--;
1158 }
1159
1160 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
1161 cpu_clear(cpu, cpu_sibling_map[sibling]);
1162 cpus_clear(cpu_sibling_map[cpu]);
1163 cpus_clear(cpu_core_map[cpu]);
1164 c[cpu].phys_proc_id = 0;
1165 c[cpu].cpu_core_id = 0;
1166 cpu_clear(cpu, cpu_sibling_setup_map);
1167}
1168
1169int __cpu_disable(void)
1170{
1171 cpumask_t map = cpu_online_map;
1172 int cpu = smp_processor_id();
1173
1174 /*
1175 * Perhaps use cpufreq to drop frequency, but that could go
1176 * into generic code.
1177 *
1178 * We won't take down the boot processor on i386 due to some
1179 * interrupts only being able to be serviced by the BSP.
1180 * Especially so if we're not using an IOAPIC -zwane
1181 */
1182 if (cpu == 0)
1183 return -EBUSY;
1184 if (nmi_watchdog == NMI_LOCAL_APIC)
1185 stop_apic_nmi_watchdog(NULL);
1186 clear_local_APIC();
1187 /* Allow any queued timer interrupts to get serviced */
1188 local_irq_enable();
1189 mdelay(1);
1190 local_irq_disable();
1191
1192 remove_siblinginfo(cpu);
1193
1194 cpu_clear(cpu, map);
1195 fixup_irqs(map);
1196 /* It's now safe to remove this processor from the online map */
1197 cpu_clear(cpu, cpu_online_map);
1198 return 0;
1199}
1200
1201void __cpu_die(unsigned int cpu)
1202{
1203 /* We don't do anything here: idle task is faking death itself. */
1204 unsigned int i;
1205
1206 for (i = 0; i < 10; i++) {
1207 /* They ack this in play_dead by setting CPU_DEAD */
1208 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1209 printk ("CPU %d is now offline\n", cpu);
1210 if (1 == num_online_cpus())
1211 alternatives_smp_switch(0);
1212 return;
1213 }
1214 msleep(100);
1215 }
1216 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1217}
1218#else /* ... !CONFIG_HOTPLUG_CPU */
1219int __cpu_disable(void)
1220{
1221 return -ENOSYS;
1222}
1223
1224void __cpu_die(unsigned int cpu)
1225{
1226 /* We said "no" in __cpu_disable */
1227 BUG();
1228}
1229#endif /* CONFIG_HOTPLUG_CPU */
1230
1231int __cpuinit native_cpu_up(unsigned int cpu)
1232{
1233 unsigned long flags;
1234#ifdef CONFIG_HOTPLUG_CPU
1235 int ret = 0;
1236
1237 /*
1238 * We do warm boot only on cpus that had booted earlier
1239 * Otherwise cold boot is all handled from smp_boot_cpus().
1240	 * cpu_callin_map is set during the AP kickstart process. It's reset
1241	 * when a cpu is taken offline in cpu_exit_clear().
1242 */
1243 if (!cpu_isset(cpu, cpu_callin_map))
1244 ret = __smp_prepare_cpu(cpu);
1245
1246 if (ret)
1247 return -EIO;
1248#endif
1249
1250 /* In case one didn't come up */
1251 if (!cpu_isset(cpu, cpu_callin_map)) {
1252 printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
1253 return -EIO;
1254 }
1255
1256 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
1257 /* Unleash the CPU! */
1258 cpu_set(cpu, smp_commenced_mask);
1259
1260 /*
1261 * Check TSC synchronization with the AP (keep irqs disabled
1262 * while doing so):
1263 */
1264 local_irq_save(flags);
1265 check_tsc_sync_source(cpu);
1266 local_irq_restore(flags);
1267
1268 while (!cpu_isset(cpu, cpu_online_map)) {
1269 cpu_relax();
1270 touch_nmi_watchdog();
1271 }
1272
1273 return 0;
1274}
1275
1276void __init native_smp_cpus_done(unsigned int max_cpus)
1277{
1278#ifdef CONFIG_X86_IO_APIC
1279 setup_ioapic_dest();
1280#endif
1281 zap_low_mappings();
1282#ifndef CONFIG_HOTPLUG_CPU
1283 /*
1284 * Disable executability of the SMP trampoline:
1285 */
1286 set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
1287#endif
1288}
1289
1290void __init smp_intr_init(void)
1291{
1292 /*
1293 * IRQ0 must be given a fixed assignment and initialized,
1294 * because it's used before the IO-APIC is set up.
1295 */
1296 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1297
1298 /*
1299 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1300 * IPI, driven by wakeup.
1301 */
1302 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1303
1304 /* IPI for invalidation */
1305 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1306
1307 /* IPI for generic function call */
1308 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1309}
1310
1311/*
1312 * If the BIOS enumerates physical processors before logical,
1313 * maxcpus=N at enumeration-time can be used to disable HT.
1314 */
1315static int __init parse_maxcpus(char *arg)
1316{
1317 extern unsigned int maxcpus;
1318
1319 maxcpus = simple_strtoul(arg, NULL, 0);
1320 return 0;
1321}
1322early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
new file mode 100644
index 000000000000..bbfe85a0f699
--- /dev/null
+++ b/arch/x86/kernel/smpcommon_32.c
@@ -0,0 +1,81 @@
1/*
2 * SMP stuff which is common to all sub-architectures.
3 */
4#include <linux/module.h>
5#include <asm/smp.h>
6
7DEFINE_PER_CPU(unsigned long, this_cpu_off);
8EXPORT_PER_CPU_SYMBOL(this_cpu_off);
9
10/* Initialize the CPU's GDT. This is either the boot CPU doing it for
11 itself (still using the master per-cpu area), or a CPU doing it for a
12 secondary which will soon come up. */
13__cpuinit void init_gdt(int cpu)
14{
15 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
16
17 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
18 (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
19 __per_cpu_offset[cpu], 0xFFFFF,
20 0x80 | DESCTYPE_S | 0x2, 0x8);
21
22 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
23 per_cpu(cpu_number, cpu) = cpu;
24}
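To illustrate what init_gdt() above achieves (a hypothetical helper, not part of the patch): once this_cpu_off and cpu_number have been filled in, other boot-time code can address that CPU's per-cpu data by explicit CPU number even before the CPU is running.

static void example_peek_percpu(int cpu)
{
	unsigned long off = per_cpu(this_cpu_off, cpu);

	printk(KERN_DEBUG "cpu%d: per-cpu offset %#lx, cpu_number %d\n",
	       cpu, off, per_cpu(cpu_number, cpu));
}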
25
26
27/**
28 * smp_call_function(): Run a function on all other CPUs.
29 * @func: The function to run. This must be fast and non-blocking.
30 * @info: An arbitrary pointer to pass to the function.
31 * @nonatomic: Unused.
32 * @wait: If true, wait (atomically) until function has completed on other CPUs.
33 *
34 * Returns 0 on success, else a negative status code.
35 *
36 * If @wait is true, then returns once @func has returned; otherwise
37 * it returns just before the target cpu calls @func.
38 *
39 * You must not call this function with disabled interrupts or from a
40 * hardware interrupt handler or from a bottom half handler.
41 */
42int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
43 int wait)
44{
45 return smp_call_function_mask(cpu_online_map, func, info, wait);
46}
47EXPORT_SYMBOL(smp_call_function);
48
49/**
50 * smp_call_function_single - Run a function on a specific CPU
51 * @cpu: The target CPU. Cannot be the calling CPU.
52 * @func: The function to run. This must be fast and non-blocking.
53 * @info: An arbitrary pointer to pass to the function.
54 * @nonatomic: Unused.
55 * @wait: If true, wait until function has completed on other CPUs.
56 *
57 * Returns 0 on success, else a negative status code.
58 *
59 * If @wait is true, then returns once @func has returned; otherwise
60 * it returns just before the target cpu calls @func.
61 */
62int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
63 int nonatomic, int wait)
64{
65 /* prevent preemption and reschedule on another processor */
66 int ret;
67 int me = get_cpu();
68 if (cpu == me) {
69 local_irq_disable();
70 func(info);
71 local_irq_enable();
72 put_cpu();
73 return 0;
74 }
75
76 ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
77
78 put_cpu();
79 return ret;
80}
81EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
new file mode 100644
index 000000000000..2a8713ec0f9a
--- /dev/null
+++ b/arch/x86/kernel/srat_32.c
@@ -0,0 +1,360 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34
35/*
36 * proximity macros and definitions
37 */
38#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
39#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
40#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
41#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
42/* bitmap length; _PXM is at most 255 */
43#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
44static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
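As a small illustration of the bitmap macros above (hypothetical code, not in this commit): each _PXM value gets one bit, packed eight per byte, so proximity domain 9 lands in byte 1, bit 1.

static void example_pxm_bitmap(void)
{
	BMAP_SET(pxm_bitmap, 9);		/* mark domain 9 as seen */
	if (BMAP_TEST(pxm_bitmap, 9))
		printk(KERN_DEBUG "PXM 9 present (byte %d, bit %d)\n",
		       NODE_ARRAY_INDEX(9), NODE_ARRAY_OFFSET(9));
}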
45
46#define MAX_CHUNKS_PER_NODE 3
47#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
48struct node_memory_chunk_s {
49 unsigned long start_pfn;
50 unsigned long end_pfn;
51 u8 pxm; // proximity domain of node
52 u8 nid; // which cnode contains this chunk?
53 u8 bank; // which mem bank on this node
54};
55static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
56
57static int num_memory_chunks; /* total number of memory chunks */
58static u8 __initdata apicid_to_pxm[MAX_APICID];
59
60extern void * boot_ioremap(unsigned long, unsigned long);
61
62/* Identify CPU proximity domains */
63static void __init parse_cpu_affinity_structure(char *p)
64{
65 struct acpi_srat_cpu_affinity *cpu_affinity =
66 (struct acpi_srat_cpu_affinity *) p;
67
68 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
69 return; /* empty entry */
70
71 /* mark this node as "seen" in node bitmap */
72 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
73
74 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
75
76 printk("CPU 0x%02X in proximity domain 0x%02X\n",
77 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
78}
79
80/*
81 * Identify memory proximity domains and hot-remove capabilities.
82 * Fill node memory chunk list structure.
83 */
84static void __init parse_memory_affinity_structure (char *sratp)
85{
86 unsigned long long paddr, size;
87 unsigned long start_pfn, end_pfn;
88 u8 pxm;
89 struct node_memory_chunk_s *p, *q, *pend;
90 struct acpi_srat_mem_affinity *memory_affinity =
91 (struct acpi_srat_mem_affinity *) sratp;
92
93 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
94 return; /* empty entry */
95
96 pxm = memory_affinity->proximity_domain & 0xff;
97
98 /* mark this node as "seen" in node bitmap */
99 BMAP_SET(pxm_bitmap, pxm);
100
101 /* calculate info for memory chunk structure */
102 paddr = memory_affinity->base_address;
103 size = memory_affinity->length;
104
105 start_pfn = paddr >> PAGE_SHIFT;
106 end_pfn = (paddr + size) >> PAGE_SHIFT;
107
108
109 if (num_memory_chunks >= MAXCHUNKS) {
110 printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
111 size/(1024*1024), paddr);
112 return;
113 }
114
115 /* Insertion sort based on base address */
116 pend = &node_memory_chunk[num_memory_chunks];
117 for (p = &node_memory_chunk[0]; p < pend; p++) {
118 if (start_pfn < p->start_pfn)
119 break;
120 }
121 if (p < pend) {
122 for (q = pend; q >= p; q--)
123 *(q + 1) = *q;
124 }
125 p->start_pfn = start_pfn;
126 p->end_pfn = end_pfn;
127 p->pxm = pxm;
128
129 num_memory_chunks++;
130
131 printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
132 start_pfn, end_pfn,
133 memory_affinity->memory_type,
134 pxm,
135 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
136 "enabled and removable" : "enabled" ) );
137}
138
139/*
140 * The SRAT table always lists ascending addresses, so we can always
141 * assume that the first "start" address that you see is the real
142 * start of the node, and that the current "end" address is after
143 * the previous one.
144 */
145static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
146{
147 /*
148 * Only add present memory as told by the e820.
149 * There is no guarantee from the SRAT that the memory it
150 * enumerates is present at boot time because it represents
151 * *possible* memory hotplug areas the same as normal RAM.
152 */
153 if (memory_chunk->start_pfn >= max_pfn) {
154 printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
155 memory_chunk->start_pfn, memory_chunk->end_pfn);
156 return;
157 }
158 if (memory_chunk->nid != nid)
159 return;
160
161 if (!node_has_online_mem(nid))
162 node_start_pfn[nid] = memory_chunk->start_pfn;
163
164 if (node_start_pfn[nid] > memory_chunk->start_pfn)
165 node_start_pfn[nid] = memory_chunk->start_pfn;
166
167 if (node_end_pfn[nid] < memory_chunk->end_pfn)
168 node_end_pfn[nid] = memory_chunk->end_pfn;
169}
170
171/* Parse the ACPI Static Resource Affinity Table */
172static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
173{
174 u8 *start, *end, *p;
175 int i, j, nid;
176
177 start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
178 p = start;
179 end = (u8 *)sratp + sratp->header.length;
180
181 memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
182 memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
183
184 num_memory_chunks = 0;
185 while (p < end) {
186 switch (*p) {
187 case ACPI_SRAT_TYPE_CPU_AFFINITY:
188 parse_cpu_affinity_structure(p);
189 break;
190 case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
191 parse_memory_affinity_structure(p);
192 break;
193 default:
194 printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
195 break;
196 }
197 p += p[1];
198 if (p[1] == 0) {
199 printk("acpi20_parse_srat: Entry length value is zero;"
200 " can't parse any further!\n");
201 break;
202 }
203 }
204
205 if (num_memory_chunks == 0) {
206		printk("could not find any ACPI SRAT memory areas.\n");
207 goto out_fail;
208 }
209
210 /* Calculate total number of nodes in system from PXM bitmap and create
211 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
212 * to specify the range of _PXM values.)
213 */
214 /*
215 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
216 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
217 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
218 * approaches MAX_PXM_DOMAINS for i386.
219 */
220 nodes_clear(node_online_map);
221 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
222 if (BMAP_TEST(pxm_bitmap, i)) {
223 int nid = acpi_map_pxm_to_node(i);
224 node_set_online(nid);
225 }
226 }
227 BUG_ON(num_online_nodes() == 0);
228
229 /* set cnode id in memory chunk structure */
230 for (i = 0; i < num_memory_chunks; i++)
231 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
232
233 printk("pxm bitmap: ");
234 for (i = 0; i < sizeof(pxm_bitmap); i++) {
235 printk("%02X ", pxm_bitmap[i]);
236 }
237 printk("\n");
238 printk("Number of logical nodes in system = %d\n", num_online_nodes());
239 printk("Number of memory chunks in system = %d\n", num_memory_chunks);
240
241 for (i = 0; i < MAX_APICID; i++)
242 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
243
244 for (j = 0; j < num_memory_chunks; j++){
245 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
246 printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
247 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
248 node_read_chunk(chunk->nid, chunk);
249 add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
250 }
251
252 for_each_online_node(nid) {
253 unsigned long start = node_start_pfn[nid];
254 unsigned long end = node_end_pfn[nid];
255
256 memory_present(nid, start, end);
257 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
258 }
259 return 1;
260out_fail:
261 return 0;
262}
263
264struct acpi_static_rsdt {
265 struct acpi_table_rsdt table;
266 u32 padding[7]; /* Allow for 7 more table entries */
267};
268
269int __init get_memcfg_from_srat(void)
270{
271 struct acpi_table_header *header = NULL;
272 struct acpi_table_rsdp *rsdp = NULL;
273 struct acpi_table_rsdt *rsdt = NULL;
274 acpi_native_uint rsdp_address = 0;
275 struct acpi_static_rsdt saved_rsdt;
276 int tables = 0;
277 int i = 0;
278
279 rsdp_address = acpi_find_rsdp();
280 if (!rsdp_address) {
281 printk("%s: System description tables not found\n",
282 __FUNCTION__);
283 goto out_err;
284 }
285
286 printk("%s: assigning address to rsdp\n", __FUNCTION__);
287 rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
288 if (!rsdp) {
289 printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
290 goto out_err;
291 }
292
293 printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
294 rsdp->oem_id);
295
296 if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
297 printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
298 goto out_err;
299 }
300
301 rsdt = (struct acpi_table_rsdt *)
302 boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
303
304 if (!rsdt) {
305 printk(KERN_WARNING
306 "%s: ACPI: Invalid root system description tables (RSDT)\n",
307 __FUNCTION__);
308 goto out_err;
309 }
310
311 header = &rsdt->header;
312
313 if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
314 printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
315 goto out_err;
316 }
317
318 /*
319 * The number of tables is computed by taking the
320	 * size of all entries (total size of the RSDT minus the
321	 * header size) divided by the size of each entry
322 * (4-byte table pointers).
323 */
324 tables = (header->length - sizeof(struct acpi_table_header)) / 4;
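	/*
	 * Worked example (illustrative, not part of this commit): an RSDT
	 * whose header.length is 36 + 8*4 = 68 bytes (the fixed ACPI table
	 * header is 36 bytes) describes (68 - 36) / 4 = 8 entries, which is
	 * exactly what acpi_static_rsdt above can hold: the one entry in
	 * acpi_table_rsdt plus the seven padding slots.
	 */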
325
326 if (!tables)
327 goto out_err;
328
329 memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
330
331 if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
332 printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
333 saved_rsdt.table.header.length);
334 goto out_err;
335 }
336
337 printk("Begin SRAT table scan....\n");
338
339 for (i = 0; i < tables; i++) {
340 /* Map in header, then map in full table length. */
341 header = (struct acpi_table_header *)
342 boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
343 if (!header)
344 break;
345 header = (struct acpi_table_header *)
346 boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
347 if (!header)
348 break;
349
350 if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
351 continue;
352
353 /* we've found the srat table. don't need to look at any more tables */
354 return acpi20_parse_srat((struct acpi_table_srat *)header);
355 }
356out_err:
357 remove_all_active_ranges();
358 printk("failed to get NUMA memory information from SRAT table\n");
359 return 0;
360}
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
new file mode 100644
index 000000000000..d0e01a3acf35
--- /dev/null
+++ b/arch/x86/kernel/summit_32.c
@@ -0,0 +1,180 @@
1/*
2 * arch/i386/kernel/summit.c - IBM Summit-Specific Code
3 *
4 * Written By: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (c) 2003 IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 *
27 */
28
29#include <linux/mm.h>
30#include <linux/init.h>
31#include <asm/io.h>
32#include <asm/mach-summit/mach_mpparse.h>
33
34static struct rio_table_hdr *rio_table_hdr __initdata;
35static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
36static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
37
38static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
39{
40 int twister = 0, node = 0;
41 int i, bus, num_buses;
42
43 for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
44 if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){
45 twister = rio_devs[i]->owner_id;
46 break;
47 }
48 }
49 if (i == rio_table_hdr->num_rio_dev){
50 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
51 return last_bus;
52 }
53
54 for(i = 0; i < rio_table_hdr->num_scal_dev; i++){
55 if (scal_devs[i]->node_id == twister){
56 node = scal_devs[i]->node_id;
57 break;
58 }
59 }
60 if (i == rio_table_hdr->num_scal_dev){
61 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
62 return last_bus;
63 }
64
65 switch (rio_devs[wpeg_num]->type){
66 case CompatWPEG:
67		/* The Compatibility Winnipeg controls the 2 legacy buses,
68 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
69 * a PCI-PCI bridge card is used in either slot: total 5 buses.
70 */
71 num_buses = 5;
72 break;
73 case AltWPEG:
74 /* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
75 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
76 * the "extra" buses for each of those slots: total 7 buses.
77 */
78 num_buses = 7;
79 break;
80 case LookOutAWPEG:
81 case LookOutBWPEG:
82 /* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
83 * & the "extra" buses for each of those slots: total 9 buses.
84 */
85 num_buses = 9;
86 break;
87 default:
88 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
89 return last_bus;
90 }
91
92 for(bus = last_bus; bus < last_bus + num_buses; bus++)
93 mp_bus_id_to_node[bus] = node;
94 return bus;
95}
96
97static int __init build_detail_arrays(void)
98{
99 unsigned long ptr;
100 int i, scal_detail_size, rio_detail_size;
101
102 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
103 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
104 return 0;
105 }
106
107 switch (rio_table_hdr->version){
108 default:
109 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
110 return 0;
111 case 2:
112 scal_detail_size = 11;
113 rio_detail_size = 13;
114 break;
115 case 3:
116 scal_detail_size = 12;
117 rio_detail_size = 15;
118 break;
119 }
120
121 ptr = (unsigned long)rio_table_hdr + 3;
122 for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
123 scal_devs[i] = (struct scal_detail *)ptr;
124
125 for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
126 rio_devs[i] = (struct rio_detail *)ptr;
127
128 return 1;
129}
130
131void __init setup_summit(void)
132{
133 unsigned long ptr;
134 unsigned short offset;
135 int i, next_wpeg, next_bus = 0;
136
137 /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
138 ptr = *(unsigned short *)phys_to_virt(0x40Eul);
139 ptr = (unsigned long)phys_to_virt(ptr << 4);
140
141 rio_table_hdr = NULL;
142 offset = 0x180;
143 while (offset){
144 /* The block id is stored in the 2nd word */
145 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
146 /* set the pointer past the offset & block id */
147 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
148 break;
149 }
150 /* The next offset is stored in the 1st word. 0 means no more */
151 offset = *((unsigned short *)(ptr + offset));
152 }
153 if (!rio_table_hdr){
154 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
155 return;
156 }
157
158 if (!build_detail_arrays())
159 return;
160
161 /* The first Winnipeg we're looking for has an index of 0 */
162 next_wpeg = 0;
163 do {
164 for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
165 if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){
166 /* It's the Winnipeg we're looking for! */
167 next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
168 next_wpeg++;
169 break;
170 }
171 }
172 /*
173 * If we go through all Rio devices and don't find one with
174 * the next index, it means we've found all the Winnipegs,
175 * and thus all the PCI buses.
176 */
177 if (i == rio_table_hdr->num_rio_dev)
178 next_wpeg = 0;
179 } while (next_wpeg != 0);
180}
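
For readers unfamiliar with the EBDA walk in setup_summit() above, this is a stand-alone user-space sketch of the same chain traversal; the buffer contents are fabricated, only the layout (first word = offset of the next block, second word = block id, 0x4752 = the bytes "RG" read as a little-endian word) mirrors the code.

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	unsigned char ebda[0x200] = { 0 };
	uint16_t off = 0x180, next, id, found = 0;

	/* fake chain: a block at 0x180 pointing to a block at 0x1c0 with id "RG" */
	ebda[0x180] = 0xc0; ebda[0x181] = 0x01;	/* next offset: 0x01c0 */
	ebda[0x182] = 0x00; ebda[0x183] = 0x00;	/* some other block id */
	ebda[0x1c2] = 0x52; ebda[0x1c3] = 0x47;	/* id 0x4752 ("RG"); next word left 0 ends the chain */

	while (off) {
		memcpy(&next, ebda + off, 2);		/* 1st word: offset of the next block */
		memcpy(&id, ebda + off + 2, 2);		/* 2nd word: block id */
		if (id == 0x4752) {
			found = off + 4;		/* header starts past offset + id, as above */
			break;
		}
		off = next;
	}
	assert(found == 0x1c4);
	return 0;
}
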
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
new file mode 100644
index 000000000000..42147304de88
--- /dev/null
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -0,0 +1,265 @@
1/*
2 * linux/arch/i386/kernel/sys_i386.c
3 *
4 * This file contains various random system calls that
5 * have a non-standard calling sequence on the Linux/i386
6 * platform.
7 */
8
9#include <linux/errno.h>
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/fs.h>
13#include <linux/smp.h>
14#include <linux/sem.h>
15#include <linux/msg.h>
16#include <linux/shm.h>
17#include <linux/stat.h>
18#include <linux/syscalls.h>
19#include <linux/mman.h>
20#include <linux/file.h>
21#include <linux/utsname.h>
22
23#include <asm/uaccess.h>
24#include <asm/unistd.h>
25#include <asm/ipc.h>
26
27/*
28 * sys_pipe() is the normal C calling standard for creating
29 * a pipe. It's not the way Unix traditionally does this, though.
30 */
31asmlinkage int sys_pipe(unsigned long __user * fildes)
32{
33 int fd[2];
34 int error;
35
36 error = do_pipe(fd);
37 if (!error) {
38 if (copy_to_user(fildes, fd, 2*sizeof(int)))
39 error = -EFAULT;
40 }
41 return error;
42}
43
44asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
45 unsigned long prot, unsigned long flags,
46 unsigned long fd, unsigned long pgoff)
47{
48 int error = -EBADF;
49 struct file *file = NULL;
50 struct mm_struct *mm = current->mm;
51
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58
59 down_write(&mm->mmap_sem);
60 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
61 up_write(&mm->mmap_sem);
62
63 if (file)
64 fput(file);
65out:
66 return error;
67}
68
69/*
70 * Perform the select(nd, in, out, ex, tv) and mmap() system
71 * calls. Linux/i386 didn't use to be able to handle more than
72 * 4 system call parameters, so these system calls used a memory
73 * block for parameter passing..
74 */
75
76struct mmap_arg_struct {
77 unsigned long addr;
78 unsigned long len;
79 unsigned long prot;
80 unsigned long flags;
81 unsigned long fd;
82 unsigned long offset;
83};
84
85asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
86{
87 struct mmap_arg_struct a;
88 int err = -EFAULT;
89
90 if (copy_from_user(&a, arg, sizeof(a)))
91 goto out;
92
93 err = -EINVAL;
94 if (a.offset & ~PAGE_MASK)
95 goto out;
96
97 err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
98 a.fd, a.offset >> PAGE_SHIFT);
99out:
100 return err;
101}
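
A hedged illustration of the legacy convention old_mmap() handles above: all six arguments travel in one user-space struct whose address is the single syscall argument, and the byte offset must be converted to a page offset before sys_mmap2() can take over. The struct mirrors mmap_arg_struct; the numeric constants are the usual i386 values.

#include <assert.h>

struct demo_mmap_arg {
	unsigned long addr, len, prot, flags, fd, offset;
};

int main(void)
{
	struct demo_mmap_arg a = {
		.addr = 0, .len = 8192,
		.prot = 0x3,		/* PROT_READ | PROT_WRITE */
		.flags = 0x22,		/* MAP_PRIVATE | MAP_ANONYMOUS */
		.fd = -1UL,
		.offset = 2 * 4096,	/* byte offset; must be page aligned */
	};

	/* the conversion old_mmap() performs before calling sys_mmap2() */
	assert((a.offset >> 12) == 2);	/* PAGE_SHIFT is 12 on i386 */
	return 0;
}
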
102
103
104struct sel_arg_struct {
105 unsigned long n;
106 fd_set __user *inp, *outp, *exp;
107 struct timeval __user *tvp;
108};
109
110asmlinkage int old_select(struct sel_arg_struct __user *arg)
111{
112 struct sel_arg_struct a;
113
114 if (copy_from_user(&a, arg, sizeof(a)))
115 return -EFAULT;
116 /* sys_select() does the appropriate kernel locking */
117 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
118}
119
120/*
121 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
122 *
123 * This is really horribly ugly.
124 */
125asmlinkage int sys_ipc (uint call, int first, int second,
126 int third, void __user *ptr, long fifth)
127{
128 int version, ret;
129
130 version = call >> 16; /* hack for backward compatibility */
131 call &= 0xffff;
132
133 switch (call) {
134 case SEMOP:
135 return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
136 case SEMTIMEDOP:
137 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
138 (const struct timespec __user *)fifth);
139
140 case SEMGET:
141 return sys_semget (first, second, third);
142 case SEMCTL: {
143 union semun fourth;
144 if (!ptr)
145 return -EINVAL;
146 if (get_user(fourth.__pad, (void __user * __user *) ptr))
147 return -EFAULT;
148 return sys_semctl (first, second, third, fourth);
149 }
150
151 case MSGSND:
152 return sys_msgsnd (first, (struct msgbuf __user *) ptr,
153 second, third);
154 case MSGRCV:
155 switch (version) {
156 case 0: {
157 struct ipc_kludge tmp;
158 if (!ptr)
159 return -EINVAL;
160
161 if (copy_from_user(&tmp,
162 (struct ipc_kludge __user *) ptr,
163 sizeof (tmp)))
164 return -EFAULT;
165 return sys_msgrcv (first, tmp.msgp, second,
166 tmp.msgtyp, third);
167 }
168 default:
169 return sys_msgrcv (first,
170 (struct msgbuf __user *) ptr,
171 second, fifth, third);
172 }
173 case MSGGET:
174 return sys_msgget ((key_t) first, second);
175 case MSGCTL:
176 return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
177
178 case SHMAT:
179 switch (version) {
180 default: {
181 ulong raddr;
182 ret = do_shmat (first, (char __user *) ptr, second, &raddr);
183 if (ret)
184 return ret;
185 return put_user (raddr, (ulong __user *) third);
186 }
187 case 1: /* iBCS2 emulator entry point */
188 if (!segment_eq(get_fs(), get_ds()))
189 return -EINVAL;
190 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
191 return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
192 }
193 case SHMDT:
194 return sys_shmdt ((char __user *)ptr);
195 case SHMGET:
196 return sys_shmget (first, second, third);
197 case SHMCTL:
198 return sys_shmctl (first, second,
199 (struct shmid_ds __user *) ptr);
200 default:
201 return -ENOSYS;
202 }
203}
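
A minimal sketch of the call/version packing that sys_ipc() decodes above: the low 16 bits select the IPC operation, the high 16 bits carry the "version" used for the old structure layouts (the MSGRCV/SHMAT sub-cases). The operation number 21 (SHMAT) is the historical value; the rest is illustrative.

#include <assert.h>

int main(void)
{
	unsigned int packed = (1u << 16) | 21u;	/* version 1, SHMAT (op 21) */

	assert((packed >> 16) == 1);		/* "version = call >> 16" above */
	assert((packed & 0xffff) == 21);	/* "call &= 0xffff" above */
	return 0;
}
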
204
205/*
206 * Old cruft
207 */
208asmlinkage int sys_uname(struct old_utsname __user * name)
209{
210 int err;
211 if (!name)
212 return -EFAULT;
213 down_read(&uts_sem);
214 err = copy_to_user(name, utsname(), sizeof (*name));
215 up_read(&uts_sem);
216 return err?-EFAULT:0;
217}
218
219asmlinkage int sys_olduname(struct oldold_utsname __user * name)
220{
221 int error;
222
223 if (!name)
224 return -EFAULT;
225 if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
226 return -EFAULT;
227
228 down_read(&uts_sem);
229
230 error = __copy_to_user(&name->sysname, &utsname()->sysname,
231 __OLD_UTS_LEN);
232 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
233 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
234 __OLD_UTS_LEN);
235 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
236 error |= __copy_to_user(&name->release, &utsname()->release,
237 __OLD_UTS_LEN);
238 error |= __put_user(0, name->release + __OLD_UTS_LEN);
239 error |= __copy_to_user(&name->version, &utsname()->version,
240 __OLD_UTS_LEN);
241 error |= __put_user(0, name->version + __OLD_UTS_LEN);
242 error |= __copy_to_user(&name->machine, &utsname()->machine,
243 __OLD_UTS_LEN);
244 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
245
246 up_read(&uts_sem);
247
248 error = error ? -EFAULT : 0;
249
250 return error;
251}
252
253
254/*
255 * Do a system call from kernel instead of calling sys_execve so we
256 * end up with proper pt_regs.
257 */
258int kernel_execve(const char *filename, char *const argv[], char *const envp[])
259{
260 long __res;
261 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
262 : "=a" (__res)
263 : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
264 return __res;
265}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
new file mode 100644
index 000000000000..8344c70adf61
--- /dev/null
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -0,0 +1,326 @@
1ENTRY(sys_call_table)
2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
3 .long sys_exit
4 .long sys_fork
5 .long sys_read
6 .long sys_write
7 .long sys_open /* 5 */
8 .long sys_close
9 .long sys_waitpid
10 .long sys_creat
11 .long sys_link
12 .long sys_unlink /* 10 */
13 .long sys_execve
14 .long sys_chdir
15 .long sys_time
16 .long sys_mknod
17 .long sys_chmod /* 15 */
18 .long sys_lchown16
19 .long sys_ni_syscall /* old break syscall holder */
20 .long sys_stat
21 .long sys_lseek
22 .long sys_getpid /* 20 */
23 .long sys_mount
24 .long sys_oldumount
25 .long sys_setuid16
26 .long sys_getuid16
27 .long sys_stime /* 25 */
28 .long sys_ptrace
29 .long sys_alarm
30 .long sys_fstat
31 .long sys_pause
32 .long sys_utime /* 30 */
33 .long sys_ni_syscall /* old stty syscall holder */
34 .long sys_ni_syscall /* old gtty syscall holder */
35 .long sys_access
36 .long sys_nice
37 .long sys_ni_syscall /* 35 - old ftime syscall holder */
38 .long sys_sync
39 .long sys_kill
40 .long sys_rename
41 .long sys_mkdir
42 .long sys_rmdir /* 40 */
43 .long sys_dup
44 .long sys_pipe
45 .long sys_times
46 .long sys_ni_syscall /* old prof syscall holder */
47 .long sys_brk /* 45 */
48 .long sys_setgid16
49 .long sys_getgid16
50 .long sys_signal
51 .long sys_geteuid16
52 .long sys_getegid16 /* 50 */
53 .long sys_acct
54 .long sys_umount /* recycled never used phys() */
55 .long sys_ni_syscall /* old lock syscall holder */
56 .long sys_ioctl
57 .long sys_fcntl /* 55 */
58 .long sys_ni_syscall /* old mpx syscall holder */
59 .long sys_setpgid
60 .long sys_ni_syscall /* old ulimit syscall holder */
61 .long sys_olduname
62 .long sys_umask /* 60 */
63 .long sys_chroot
64 .long sys_ustat
65 .long sys_dup2
66 .long sys_getppid
67 .long sys_getpgrp /* 65 */
68 .long sys_setsid
69 .long sys_sigaction
70 .long sys_sgetmask
71 .long sys_ssetmask
72 .long sys_setreuid16 /* 70 */
73 .long sys_setregid16
74 .long sys_sigsuspend
75 .long sys_sigpending
76 .long sys_sethostname
77 .long sys_setrlimit /* 75 */
78 .long sys_old_getrlimit
79 .long sys_getrusage
80 .long sys_gettimeofday
81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16
84 .long old_select
85 .long sys_symlink
86 .long sys_lstat
87 .long sys_readlink /* 85 */
88 .long sys_uselib
89 .long sys_swapon
90 .long sys_reboot
91 .long old_readdir
92 .long old_mmap /* 90 */
93 .long sys_munmap
94 .long sys_truncate
95 .long sys_ftruncate
96 .long sys_fchmod
97 .long sys_fchown16 /* 95 */
98 .long sys_getpriority
99 .long sys_setpriority
100 .long sys_ni_syscall /* old profil syscall holder */
101 .long sys_statfs
102 .long sys_fstatfs /* 100 */
103 .long sys_ioperm
104 .long sys_socketcall
105 .long sys_syslog
106 .long sys_setitimer
107 .long sys_getitimer /* 105 */
108 .long sys_newstat
109 .long sys_newlstat
110 .long sys_newfstat
111 .long sys_uname
112 .long sys_iopl /* 110 */
113 .long sys_vhangup
114 .long sys_ni_syscall /* old "idle" system call */
115 .long sys_vm86old
116 .long sys_wait4
117 .long sys_swapoff /* 115 */
118 .long sys_sysinfo
119 .long sys_ipc
120 .long sys_fsync
121 .long sys_sigreturn
122 .long sys_clone /* 120 */
123 .long sys_setdomainname
124 .long sys_newuname
125 .long sys_modify_ldt
126 .long sys_adjtimex
127 .long sys_mprotect /* 125 */
128 .long sys_sigprocmask
129 .long sys_ni_syscall /* old "create_module" */
130 .long sys_init_module
131 .long sys_delete_module
132 .long sys_ni_syscall /* 130: old "get_kernel_syms" */
133 .long sys_quotactl
134 .long sys_getpgid
135 .long sys_fchdir
136 .long sys_bdflush
137 .long sys_sysfs /* 135 */
138 .long sys_personality
139 .long sys_ni_syscall /* reserved for afs_syscall */
140 .long sys_setfsuid16
141 .long sys_setfsgid16
142 .long sys_llseek /* 140 */
143 .long sys_getdents
144 .long sys_select
145 .long sys_flock
146 .long sys_msync
147 .long sys_readv /* 145 */
148 .long sys_writev
149 .long sys_getsid
150 .long sys_fdatasync
151 .long sys_sysctl
152 .long sys_mlock /* 150 */
153 .long sys_munlock
154 .long sys_mlockall
155 .long sys_munlockall
156 .long sys_sched_setparam
157 .long sys_sched_getparam /* 155 */
158 .long sys_sched_setscheduler
159 .long sys_sched_getscheduler
160 .long sys_sched_yield
161 .long sys_sched_get_priority_max
162 .long sys_sched_get_priority_min /* 160 */
163 .long sys_sched_rr_get_interval
164 .long sys_nanosleep
165 .long sys_mremap
166 .long sys_setresuid16
167 .long sys_getresuid16 /* 165 */
168 .long sys_vm86
169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll
171 .long sys_nfsservctl
172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16
174 .long sys_prctl
175 .long sys_rt_sigreturn
176 .long sys_rt_sigaction
177 .long sys_rt_sigprocmask /* 175 */
178 .long sys_rt_sigpending
179 .long sys_rt_sigtimedwait
180 .long sys_rt_sigqueueinfo
181 .long sys_rt_sigsuspend
182 .long sys_pread64 /* 180 */
183 .long sys_pwrite64
184 .long sys_chown16
185 .long sys_getcwd
186 .long sys_capget
187 .long sys_capset /* 185 */
188 .long sys_sigaltstack
189 .long sys_sendfile
190 .long sys_ni_syscall /* reserved for streams1 */
191 .long sys_ni_syscall /* reserved for streams2 */
192 .long sys_vfork /* 190 */
193 .long sys_getrlimit
194 .long sys_mmap2
195 .long sys_truncate64
196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */
198 .long sys_lstat64
199 .long sys_fstat64
200 .long sys_lchown
201 .long sys_getuid
202 .long sys_getgid /* 200 */
203 .long sys_geteuid
204 .long sys_getegid
205 .long sys_setreuid
206 .long sys_setregid
207 .long sys_getgroups /* 205 */
208 .long sys_setgroups
209 .long sys_fchown
210 .long sys_setresuid
211 .long sys_getresuid
212 .long sys_setresgid /* 210 */
213 .long sys_getresgid
214 .long sys_chown
215 .long sys_setuid
216 .long sys_setgid
217 .long sys_setfsuid /* 215 */
218 .long sys_setfsgid
219 .long sys_pivot_root
220 .long sys_mincore
221 .long sys_madvise
222 .long sys_getdents64 /* 220 */
223 .long sys_fcntl64
224 .long sys_ni_syscall /* reserved for TUX */
225 .long sys_ni_syscall
226 .long sys_gettid
227 .long sys_readahead /* 225 */
228 .long sys_setxattr
229 .long sys_lsetxattr
230 .long sys_fsetxattr
231 .long sys_getxattr
232 .long sys_lgetxattr /* 230 */
233 .long sys_fgetxattr
234 .long sys_listxattr
235 .long sys_llistxattr
236 .long sys_flistxattr
237 .long sys_removexattr /* 235 */
238 .long sys_lremovexattr
239 .long sys_fremovexattr
240 .long sys_tkill
241 .long sys_sendfile64
242 .long sys_futex /* 240 */
243 .long sys_sched_setaffinity
244 .long sys_sched_getaffinity
245 .long sys_set_thread_area
246 .long sys_get_thread_area
247 .long sys_io_setup /* 245 */
248 .long sys_io_destroy
249 .long sys_io_getevents
250 .long sys_io_submit
251 .long sys_io_cancel
252 .long sys_fadvise64 /* 250 */
253 .long sys_ni_syscall
254 .long sys_exit_group
255 .long sys_lookup_dcookie
256 .long sys_epoll_create
257 .long sys_epoll_ctl /* 255 */
258 .long sys_epoll_wait
259 .long sys_remap_file_pages
260 .long sys_set_tid_address
261 .long sys_timer_create
262 .long sys_timer_settime /* 260 */
263 .long sys_timer_gettime
264 .long sys_timer_getoverrun
265 .long sys_timer_delete
266 .long sys_clock_settime
267 .long sys_clock_gettime /* 265 */
268 .long sys_clock_getres
269 .long sys_clock_nanosleep
270 .long sys_statfs64
271 .long sys_fstatfs64
272 .long sys_tgkill /* 270 */
273 .long sys_utimes
274 .long sys_fadvise64_64
275 .long sys_ni_syscall /* sys_vserver */
276 .long sys_mbind
277 .long sys_get_mempolicy
278 .long sys_set_mempolicy
279 .long sys_mq_open
280 .long sys_mq_unlink
281 .long sys_mq_timedsend
282 .long sys_mq_timedreceive /* 280 */
283 .long sys_mq_notify
284 .long sys_mq_getsetattr
285 .long sys_kexec_load
286 .long sys_waitid
287 .long sys_ni_syscall /* 285 */ /* available */
288 .long sys_add_key
289 .long sys_request_key
290 .long sys_keyctl
291 .long sys_ioprio_set
292 .long sys_ioprio_get /* 290 */
293 .long sys_inotify_init
294 .long sys_inotify_add_watch
295 .long sys_inotify_rm_watch
296 .long sys_migrate_pages
297 .long sys_openat /* 295 */
298 .long sys_mkdirat
299 .long sys_mknodat
300 .long sys_fchownat
301 .long sys_futimesat
302 .long sys_fstatat64 /* 300 */
303 .long sys_unlinkat
304 .long sys_renameat
305 .long sys_linkat
306 .long sys_symlinkat
307 .long sys_readlinkat /* 305 */
308 .long sys_fchmodat
309 .long sys_faccessat
310 .long sys_pselect6
311 .long sys_ppoll
312 .long sys_unshare /* 310 */
313 .long sys_set_robust_list
314 .long sys_get_robust_list
315 .long sys_splice
316 .long sys_sync_file_range
317 .long sys_tee /* 315 */
318 .long sys_vmsplice
319 .long sys_move_pages
320 .long sys_getcpu
321 .long sys_epoll_pwait
322 .long sys_utimensat /* 320 */
323 .long sys_signalfd
324 .long sys_timerfd
325 .long sys_eventfd
326 .long sys_fallocate
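
For orientation, this table is consumed by the system-call entry path, which indexes it with the syscall number from %eax (roughly "call *sys_call_table(,%eax,4)" in entry_32.S), with sys_ni_syscall plugging retired or reserved slots. A small C analogue of that dispatch, with made-up handlers purely for illustration:

#include <stdio.h>

typedef long (*syscall_fn)(long, long, long);

static long demo_exit(long a, long b, long c)	{ return 0; }
static long demo_read(long a, long b, long c)	{ return 0; }
static long demo_ni(long a, long b, long c)	{ return -38; /* -ENOSYS */ }

static syscall_fn demo_table[] = {
	demo_ni,	/* 0 */
	demo_exit,	/* 1 */
	demo_ni,	/* 2: hole, like sys_ni_syscall above */
	demo_read,	/* 3 */
};

static long demo_dispatch(unsigned nr, long a, long b, long c)
{
	/* out-of-range numbers fail the same way the nr_syscalls check does */
	if (nr >= sizeof(demo_table) / sizeof(demo_table[0]))
		return -38;	/* -ENOSYS */
	return demo_table[nr](a, b, c);
}

int main(void)
{
	printf("%ld\n", demo_dispatch(3, 0, 0, 0));
	return 0;
}
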
diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/kernel/sysenter_32.c
new file mode 100644
index 000000000000..4eb2e408764f
--- /dev/null
+++ b/arch/x86/kernel/sysenter_32.c
@@ -0,0 +1,348 @@
1/*
2 * linux/arch/i386/kernel/sysenter.c
3 *
4 * (C) Copyright 2002 Linus Torvalds
5 * Portions based on the vdso-randomization code from exec-shield:
6 * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
7 *
8 * This file contains the needed initializations to support sysenter.
9 */
10
11#include <linux/init.h>
12#include <linux/smp.h>
13#include <linux/thread_info.h>
14#include <linux/sched.h>
15#include <linux/gfp.h>
16#include <linux/string.h>
17#include <linux/elf.h>
18#include <linux/mm.h>
19#include <linux/err.h>
20#include <linux/module.h>
21
22#include <asm/cpufeature.h>
23#include <asm/msr.h>
24#include <asm/pgtable.h>
25#include <asm/unistd.h>
26#include <asm/elf.h>
27#include <asm/tlbflush.h>
28
29enum {
30 VDSO_DISABLED = 0,
31 VDSO_ENABLED = 1,
32 VDSO_COMPAT = 2,
33};
34
35#ifdef CONFIG_COMPAT_VDSO
36#define VDSO_DEFAULT VDSO_COMPAT
37#else
38#define VDSO_DEFAULT VDSO_ENABLED
39#endif
40
41/*
42 * Should the kernel map a VDSO page into processes and pass its
43 * address down to glibc upon exec()?
44 */
45unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
46
47EXPORT_SYMBOL_GPL(vdso_enabled);
48
49static int __init vdso_setup(char *s)
50{
51 vdso_enabled = simple_strtoul(s, NULL, 0);
52
53 return 1;
54}
55
56__setup("vdso=", vdso_setup);
57
58extern asmlinkage void sysenter_entry(void);
59
60static __init void reloc_symtab(Elf32_Ehdr *ehdr,
61 unsigned offset, unsigned size)
62{
63 Elf32_Sym *sym = (void *)ehdr + offset;
64 unsigned nsym = size / sizeof(*sym);
65 unsigned i;
66
67 for(i = 0; i < nsym; i++, sym++) {
68 if (sym->st_shndx == SHN_UNDEF ||
69 sym->st_shndx == SHN_ABS)
70 continue; /* skip */
71
72 if (sym->st_shndx > SHN_LORESERVE) {
73 printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
74 sym->st_shndx);
75 continue;
76 }
77
78 switch(ELF_ST_TYPE(sym->st_info)) {
79 case STT_OBJECT:
80 case STT_FUNC:
81 case STT_SECTION:
82 case STT_FILE:
83 sym->st_value += VDSO_HIGH_BASE;
84 }
85 }
86}
87
88static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
89{
90 Elf32_Dyn *dyn = (void *)ehdr + offset;
91
92 for(; dyn->d_tag != DT_NULL; dyn++)
93 switch(dyn->d_tag) {
94 case DT_PLTGOT:
95 case DT_HASH:
96 case DT_STRTAB:
97 case DT_SYMTAB:
98 case DT_RELA:
99 case DT_INIT:
100 case DT_FINI:
101 case DT_REL:
102 case DT_DEBUG:
103 case DT_JMPREL:
104 case DT_VERSYM:
105 case DT_VERDEF:
106 case DT_VERNEED:
107 case DT_ADDRRNGLO ... DT_ADDRRNGHI:
108 /* definitely pointers needing relocation */
109 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
110 break;
111
112 case DT_ENCODING ... OLD_DT_LOOS-1:
113 case DT_LOOS ... DT_HIOS-1:
114 /* Tags above DT_ENCODING are pointers if
115 they're even */
116 if (dyn->d_tag >= DT_ENCODING &&
117 (dyn->d_tag & 1) == 0)
118 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
119 break;
120
121 case DT_VERDEFNUM:
122 case DT_VERNEEDNUM:
123 case DT_FLAGS_1:
124 case DT_RELACOUNT:
125 case DT_RELCOUNT:
126 case DT_VALRNGLO ... DT_VALRNGHI:
127 /* definitely not pointers */
128 break;
129
130 case OLD_DT_LOOS ... DT_LOOS-1:
131 case DT_HIOS ... DT_VALRNGLO-1:
132 default:
133 if (dyn->d_tag > DT_ENCODING)
134 printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
135 dyn->d_tag);
136 break;
137 }
138}
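
The "even tags are pointers" rule used in the DT_ENCODING branch above comes from the ELF gABI; a small stand-alone sketch of the same test (the helper name is ours):

#include <assert.h>
#include <elf.h>
#include <stdbool.h>

static bool dyn_tag_is_pointer(Elf32_Sword tag)
{
	/* At or above DT_ENCODING, even tags use d_ptr and odd tags use
	 * d_val, which is why only even tags get rebased above. */
	return tag >= DT_ENCODING && (tag & 1) == 0;
}

int main(void)
{
	assert(dyn_tag_is_pointer(DT_PREINIT_ARRAY));	/* 32: even => pointer */
	assert(!dyn_tag_is_pointer(DT_PREINIT_ARRAYSZ));/* 33: odd  => value   */
	return 0;
}
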
139
140static __init void relocate_vdso(Elf32_Ehdr *ehdr)
141{
142 Elf32_Phdr *phdr;
143 Elf32_Shdr *shdr;
144 int i;
145
146 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
147 !elf_check_arch(ehdr) ||
148 ehdr->e_type != ET_DYN);
149
150 ehdr->e_entry += VDSO_HIGH_BASE;
151
152 /* rebase phdrs */
153 phdr = (void *)ehdr + ehdr->e_phoff;
154 for (i = 0; i < ehdr->e_phnum; i++) {
155 phdr[i].p_vaddr += VDSO_HIGH_BASE;
156
157 /* relocate dynamic stuff */
158 if (phdr[i].p_type == PT_DYNAMIC)
159 reloc_dyn(ehdr, phdr[i].p_offset);
160 }
161
162 /* rebase sections */
163 shdr = (void *)ehdr + ehdr->e_shoff;
164 for(i = 0; i < ehdr->e_shnum; i++) {
165 if (!(shdr[i].sh_flags & SHF_ALLOC))
166 continue;
167
168 shdr[i].sh_addr += VDSO_HIGH_BASE;
169
170 if (shdr[i].sh_type == SHT_SYMTAB ||
171 shdr[i].sh_type == SHT_DYNSYM)
172 reloc_symtab(ehdr, shdr[i].sh_offset,
173 shdr[i].sh_size);
174 }
175}
176
177void enable_sep_cpu(void)
178{
179 int cpu = get_cpu();
180 struct tss_struct *tss = &per_cpu(init_tss, cpu);
181
182 if (!boot_cpu_has(X86_FEATURE_SEP)) {
183 put_cpu();
184 return;
185 }
186
187 tss->x86_tss.ss1 = __KERNEL_CS;
188 tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
189 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
190 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
191 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
192 put_cpu();
193}
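
The three wrmsr() calls above fully describe the SYSENTER entry point: the CPU loads CS from IA32_SYSENTER_CS (0x174, with SS becoming CS+8), ESP from IA32_SYSENTER_ESP (0x175) and EIP from IA32_SYSENTER_EIP (0x176). A hedged user-space way to peek at those registers is the msr driver, assuming /dev/cpu/0/msr exists and the program runs as root:

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0)
		return 1;
	for (unsigned reg = 0x174; reg <= 0x176; reg++) {
		/* the msr driver uses the file offset as the MSR number */
		if (pread(fd, &val, sizeof(val), reg) == sizeof(val))
			printf("MSR %#x = %#llx\n", reg, (unsigned long long)val);
	}
	close(fd);
	return 0;
}
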
194
195static struct vm_area_struct gate_vma;
196
197static int __init gate_vma_init(void)
198{
199 gate_vma.vm_mm = NULL;
200 gate_vma.vm_start = FIXADDR_USER_START;
201 gate_vma.vm_end = FIXADDR_USER_END;
202 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
203 gate_vma.vm_page_prot = __P101;
204 /*
205 * Make sure the vDSO gets into every core dump.
206 * Dumping its contents makes post-mortem fully interpretable later
207 * without matching up the same kernel and hardware config to see
208 * what PC values meant.
209 */
210 gate_vma.vm_flags |= VM_ALWAYSDUMP;
211 return 0;
212}
213
214/*
215 * These symbols are defined by vsyscall.o to mark the bounds
216 * of the ELF DSO images included therein.
217 */
218extern const char vsyscall_int80_start, vsyscall_int80_end;
219extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
220static struct page *syscall_pages[1];
221
222static void map_compat_vdso(int map)
223{
224 static int vdso_mapped;
225
226 if (map == vdso_mapped)
227 return;
228
229 vdso_mapped = map;
230
231 __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
232 map ? PAGE_READONLY_EXEC : PAGE_NONE);
233
234 /* flush stray tlbs */
235 flush_tlb_all();
236}
237
238int __init sysenter_setup(void)
239{
240 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
241 const void *vsyscall;
242 size_t vsyscall_len;
243
244 syscall_pages[0] = virt_to_page(syscall_page);
245
246 gate_vma_init();
247
248 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
249
250 if (!boot_cpu_has(X86_FEATURE_SEP)) {
251 vsyscall = &vsyscall_int80_start;
252 vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
253 } else {
254 vsyscall = &vsyscall_sysenter_start;
255 vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
256 }
257
258 memcpy(syscall_page, vsyscall, vsyscall_len);
259 relocate_vdso(syscall_page);
260
261 return 0;
262}
263
264/* Defined in vsyscall-sysenter.S */
265extern void SYSENTER_RETURN;
266
267/* Setup a VMA at program startup for the vsyscall page */
268int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
269{
270 struct mm_struct *mm = current->mm;
271 unsigned long addr;
272 int ret = 0;
273 bool compat;
274
275 down_write(&mm->mmap_sem);
276
277 /* Test compat mode once here, in case someone
278 changes it via sysctl */
279 compat = (vdso_enabled == VDSO_COMPAT);
280
281 map_compat_vdso(compat);
282
283 if (compat)
284 addr = VDSO_HIGH_BASE;
285 else {
286 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
287 if (IS_ERR_VALUE(addr)) {
288 ret = addr;
289 goto up_fail;
290 }
291
292 /*
293 * MAYWRITE to allow gdb to COW and set breakpoints
294 *
295 * Make sure the vDSO gets into every core dump.
296 * Dumping its contents makes post-mortem fully
297 * interpretable later without matching up the same
298 * kernel and hardware config to see what PC values
299 * meant.
300 */
301 ret = install_special_mapping(mm, addr, PAGE_SIZE,
302 VM_READ|VM_EXEC|
303 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
304 VM_ALWAYSDUMP,
305 syscall_pages);
306
307 if (ret)
308 goto up_fail;
309 }
310
311 current->mm->context.vdso = (void *)addr;
312 current_thread_info()->sysenter_return =
313 (void *)VDSO_SYM(&SYSENTER_RETURN);
314
315 up_fail:
316 up_write(&mm->mmap_sem);
317
318 return ret;
319}
320
321const char *arch_vma_name(struct vm_area_struct *vma)
322{
323 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
324 return "[vdso]";
325 return NULL;
326}
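
arch_vma_name() above is what makes the "[vdso]" tag appear in /proc/<pid>/maps; a quick user-space check (illustrative only):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "[vdso]"))
			fputs(line, stdout);	/* start-end perms offset dev inode [vdso] */
	fclose(f);
	return 0;
}
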
327
328struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
329{
330 struct mm_struct *mm = tsk->mm;
331
332 /* Check to see if this task was created in compat vdso mode */
333 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
334 return &gate_vma;
335 return NULL;
336}
337
338int in_gate_area(struct task_struct *task, unsigned long addr)
339{
340 const struct vm_area_struct *vma = get_gate_vma(task);
341
342 return vma && addr >= vma->vm_start && addr < vma->vm_end;
343}
344
345int in_gate_area_no_task(unsigned long addr)
346{
347 return 0;
348}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
new file mode 100644
index 000000000000..19a6c678d02e
--- /dev/null
+++ b/arch/x86/kernel/time_32.c
@@ -0,0 +1,236 @@
1/*
2 * linux/arch/i386/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1996-05-03 Ingo Molnar
14 * fixed time warps in do_[slow|fast]_gettimeoffset()
15 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
16 * "A Kernel Model for Precision Timekeeping" by Dave Mills
17 * 1998-09-05 (Various)
18 * More robust do_fast_gettimeoffset() algorithm implemented
19 * (works with APM, Cyrix 6x86MX and Centaur C6),
20 * monotonic gettimeofday() with fast_get_timeoffset(),
21 * drift-proof precision TSC calibration on boot
22 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25 * 1998-12-16 Andrea Arcangeli
26 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27	 *	because it was not accounting for lost_ticks.
28 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
29 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30 * serialize accesses to xtime/lost_ticks).
31 */
32
33#include <linux/errno.h>
34#include <linux/sched.h>
35#include <linux/kernel.h>
36#include <linux/param.h>
37#include <linux/string.h>
38#include <linux/mm.h>
39#include <linux/interrupt.h>
40#include <linux/time.h>
41#include <linux/delay.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44#include <linux/module.h>
45#include <linux/sysdev.h>
46#include <linux/bcd.h>
47#include <linux/efi.h>
48#include <linux/mca.h>
49
50#include <asm/io.h>
51#include <asm/smp.h>
52#include <asm/irq.h>
53#include <asm/msr.h>
54#include <asm/delay.h>
55#include <asm/mpspec.h>
56#include <asm/uaccess.h>
57#include <asm/processor.h>
58#include <asm/timer.h>
59#include <asm/time.h>
60
61#include "mach_time.h"
62
63#include <linux/timex.h>
64
65#include <asm/hpet.h>
66
67#include <asm/arch_hooks.h>
68
69#include "io_ports.h"
70
71#include <asm/i8259.h>
72
73#include "do_timer.h"
74
75unsigned int cpu_khz; /* Detected as we calibrate the TSC */
76EXPORT_SYMBOL(cpu_khz);
77
78DEFINE_SPINLOCK(rtc_lock);
79EXPORT_SYMBOL(rtc_lock);
80
81/*
82 * This is a special lock that is owned by the CPU and holds the index
83 * register we are working with. It is required for NMI access to the
84 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
85 */
86volatile unsigned long cmos_lock = 0;
87EXPORT_SYMBOL(cmos_lock);
88
89/* Routines for accessing the CMOS RAM/RTC. */
90unsigned char rtc_cmos_read(unsigned char addr)
91{
92 unsigned char val;
93 lock_cmos_prefix(addr);
94 outb_p(addr, RTC_PORT(0));
95 val = inb_p(RTC_PORT(1));
96 lock_cmos_suffix(addr);
97 return val;
98}
99EXPORT_SYMBOL(rtc_cmos_read);
100
101void rtc_cmos_write(unsigned char val, unsigned char addr)
102{
103 lock_cmos_prefix(addr);
104 outb_p(addr, RTC_PORT(0));
105 outb_p(val, RTC_PORT(1));
106 lock_cmos_suffix(addr);
107}
108EXPORT_SYMBOL(rtc_cmos_write);
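
A hedged user-space analogue of rtc_cmos_read() above, assuming x86 and root privileges: RTC_PORT(0) is the CMOS index port 0x70 and RTC_PORT(1) the data port 0x71; register 0 holds the RTC seconds byte, BCD-encoded by default.

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	unsigned char sec;

	if (ioperm(0x70, 2, 1))		/* grant access to ports 0x70-0x71 */
		return 1;
	outb(0x00, 0x70);		/* select CMOS register 0: seconds */
	sec = inb(0x71);
	printf("RTC seconds (BCD): %02x\n", sec);
	return 0;
}
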
109
110static int set_rtc_mmss(unsigned long nowtime)
111{
112 int retval;
113 unsigned long flags;
114
115 /* gets recalled with irq locally disabled */
116 /* XXX - does irqsave resolve this? -johnstul */
117 spin_lock_irqsave(&rtc_lock, flags);
118 retval = set_wallclock(nowtime);
119 spin_unlock_irqrestore(&rtc_lock, flags);
120
121 return retval;
122}
123
124
125int timer_ack;
126
127unsigned long profile_pc(struct pt_regs *regs)
128{
129 unsigned long pc = instruction_pointer(regs);
130
131#ifdef CONFIG_SMP
132 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
133 in_lock_functions(pc)) {
134#ifdef CONFIG_FRAME_POINTER
135 return *(unsigned long *)(regs->ebp + 4);
136#else
137 unsigned long *sp = (unsigned long *)&regs->esp;
138
139 /* Return address is either directly at stack pointer
140 or above a saved eflags. Eflags has bits 22-31 zero,
141 kernel addresses don't. */
142 if (sp[0] >> 22)
143 return sp[0];
144 if (sp[1] >> 22)
145 return sp[1];
146#endif
147 }
148#endif
149 return pc;
150}
151EXPORT_SYMBOL(profile_pc);
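
The heuristic in the non-frame-pointer branch above relies on two facts: a saved EFLAGS word always has bits 22-31 clear, while an i386 kernel text address (at or above PAGE_OFFSET, 0xc0000000 with the default 3G/1G split) never does, so "value >> 22" distinguishes the two. A tiny stand-alone check of that claim:

#include <assert.h>

#define PAGE_OFFSET	0xc0000000UL	/* default 3G/1G split; an assumption here */

int main(void)
{
	unsigned long eflags = 0x00000246UL;		/* a typical saved EFLAGS */
	unsigned long kaddr  = PAGE_OFFSET + 0x123456UL;

	assert((eflags >> 22) == 0);	/* treated as "not a return address" */
	assert((kaddr  >> 22) != 0);	/* treated as a kernel return address */
	return 0;
}
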
152
153/*
154 * This is the same as the above, except we _also_ save the current
155 * Time Stamp Counter value at the time of the timer interrupt, so that
156 * we later on can estimate the time of day more exactly.
157 */
158irqreturn_t timer_interrupt(int irq, void *dev_id)
159{
160#ifdef CONFIG_X86_IO_APIC
161 if (timer_ack) {
162 /*
163 * Subtle, when I/O APICs are used we have to ack timer IRQ
164 * manually to reset the IRR bit for do_slow_gettimeoffset().
165 * This will also deassert NMI lines for the watchdog if run
166 * on an 82489DX-based system.
167 */
168 spin_lock(&i8259A_lock);
169 outb(0x0c, PIC_MASTER_OCW3);
170 /* Ack the IRQ; AEOI will end it automatically. */
171 inb(PIC_MASTER_POLL);
172 spin_unlock(&i8259A_lock);
173 }
174#endif
175
176 do_timer_interrupt_hook();
177
178 if (MCA_bus) {
179 /* The PS/2 uses level-triggered interrupts. You can't
180 turn them off, nor would you want to (any attempt to
181 enable edge-triggered interrupts usually gets intercepted by a
182 special hardware circuit). Hence we have to acknowledge
183 the timer interrupt. Through some incredibly stupid
184 design idea, the reset for IRQ 0 is done by setting the
185 high bit of the PPI port B (0x61). Note that some PS/2s,
186 notably the 55SX, work fine if this is removed. */
187
188 u8 irq_v = inb_p( 0x61 ); /* read the current state */
189 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
190 }
191
192 return IRQ_HANDLED;
193}
194
195/* not static: needed by APM */
196unsigned long read_persistent_clock(void)
197{
198 unsigned long retval;
199 unsigned long flags;
200
201 spin_lock_irqsave(&rtc_lock, flags);
202
203 retval = get_wallclock();
204
205 spin_unlock_irqrestore(&rtc_lock, flags);
206
207 return retval;
208}
209
210int update_persistent_clock(struct timespec now)
211{
212 return set_rtc_mmss(now.tv_sec);
213}
214
215extern void (*late_time_init)(void);
216/* Duplicate of time_init() below, with hpet_enable part added */
217void __init hpet_time_init(void)
218{
219 if (!hpet_enable())
220 setup_pit_timer();
221 time_init_hook();
222}
223
224/*
225 * This is called directly from init code; we must delay timer setup in the
226 * HPET case as we can't make the decision to turn on HPET this early in the
227 * boot process.
228 *
229 * The chosen time_init function will usually be hpet_time_init, above, but
230 * in the case of virtual hardware, an alternative function may be substituted.
231 */
232void __init time_init(void)
233{
234 tsc_init();
235 late_time_init = choose_time_init();
236}
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
new file mode 100644
index 000000000000..45782356a618
--- /dev/null
+++ b/arch/x86/kernel/topology.c
@@ -0,0 +1,77 @@
1/*
2 * arch/i386/kernel/topology.c - Populate sysfs with topology information
3 *
4 * Written by: Matthew Dobson, IBM Corporation
5 * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
6 *
7 * Copyright (C) 2002, IBM Corp.
8 *
9 * All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
19 * NON INFRINGEMENT. See the GNU General Public License for more
20 * details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 * Send feedback to <colpatch@us.ibm.com>
27 */
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/nodemask.h>
31#include <linux/mmzone.h>
32#include <asm/cpu.h>
33
34static struct i386_cpu cpu_devices[NR_CPUS];
35
36int arch_register_cpu(int num)
37{
38 /*
39 * CPU0 cannot be offlined due to several
40	 * restrictions and assumptions in the kernel. This basically
41	 * doesn't add a control file, so one cannot attempt to offline
42	 * the BSP.
43	 *
44	 * Also, certain PCI quirks require not enabling hotplug control
45	 * for all CPUs.
46 */
47 if (num && enable_cpu_hotplug)
48 cpu_devices[num].cpu.hotpluggable = 1;
49
50 return register_cpu(&cpu_devices[num].cpu, num);
51}
52
53#ifdef CONFIG_HOTPLUG_CPU
54int enable_cpu_hotplug = 1;
55
56void arch_unregister_cpu(int num) {
57 return unregister_cpu(&cpu_devices[num].cpu);
58}
59EXPORT_SYMBOL(arch_register_cpu);
60EXPORT_SYMBOL(arch_unregister_cpu);
61#endif /*CONFIG_HOTPLUG_CPU*/
62
63static int __init topology_init(void)
64{
65 int i;
66
67#ifdef CONFIG_NUMA
68 for_each_online_node(i)
69 register_one_node(i);
70#endif /* CONFIG_NUMA */
71
72 for_each_present_cpu(i)
73 arch_register_cpu(i);
74 return 0;
75}
76
77subsys_initcall(topology_init);
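
The register_cpu() calls made here are what populate /sys/devices/system/cpu/cpuN; a small user-space enumeration of the result (illustrative, requires sysfs to be mounted):

#include <dirent.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	DIR *d = opendir("/sys/devices/system/cpu");
	struct dirent *e;

	if (!d)
		return 1;
	while ((e = readdir(d)) != NULL)
		if (!strncmp(e->d_name, "cpu", 3) &&
		    e->d_name[3] >= '0' && e->d_name[3] <= '9')
			printf("%s\n", e->d_name);	/* cpu0, cpu1, ... */
	closedir(d);
	return 0;
}
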
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
new file mode 100644
index 000000000000..f62815f8d06a
--- /dev/null
+++ b/arch/x86/kernel/trampoline_32.S
@@ -0,0 +1,85 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 *
7 * This is only used for booting secondary CPUs in an SMP machine
8 *
9 * Entry: CS:IP point to the start of our code; we are
10 * in real mode with no stack, but we have the rest of the
11 * trampoline page to make our stack out of. Everything else
12 * is a mystery.
13 *
14 * In fact we don't actually need a stack so we don't
15 * set one up.
16 *
17 * We jump into the boot/compressed/head.S code. So you'd
18 * better be running a compressed kernel image or you
19 * won't get very far.
20 *
21 * On entry to trampoline_data, the processor is in real mode
22 * with 16-bit addressing and 16-bit data. CS has some value
23 * and IP is zero. Thus, data addresses need to be absolute
24 * (no relocation) and are taken with regard to r_base.
25 *
26 * If you work on this file, check the object module with
27 * objdump --reloc to make sure there are no relocation
28 * entries except for:
29 *
30 * TYPE VALUE
31 * R_386_32 startup_32_smp
32 * R_386_32 boot_gdt
33 */
34
35#include <linux/linkage.h>
36#include <asm/segment.h>
37#include <asm/page.h>
38
39.data
40
41/* We can free up trampoline after bootup if cpu hotplug is not supported. */
42#ifndef CONFIG_HOTPLUG_CPU
43.section ".init.data","aw",@progbits
44#endif
45
46.code16
47
48ENTRY(trampoline_data)
49r_base = .
50	wbinvd			# Needed for NUMA-Q; should be harmless for others
51 mov %cs, %ax # Code and data in the same place
52 mov %ax, %ds
53
54 cli # We should be safe anyway
55
56 movl $0xA5A5A5A5, trampoline_data - r_base
57				# write marker so the master knows we're running
58
59	/* With the GDT in a non-default location the kernel can be beyond 16MB,
60	 * and lgdt will not be able to load the address, because the default
61	 * operand size in real mode is 16 bit. Use lgdtl instead to force the
62	 * operand size to 32 bit.
63 */
64
65 lidtl boot_idt_descr - r_base # load idt with 0, 0
66 lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
67
68 xor %ax, %ax
69 inc %ax # protected mode (PE) bit
70 lmsw %ax # into protected mode
71 # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
72 ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
73
74 # These need to be in the same 64K segment as the above;
75 # hence we don't use the boot_gdt_descr defined in head.S
76boot_gdt_descr:
77 .word __BOOT_DS + 7 # gdt limit
78 .long boot_gdt - __PAGE_OFFSET # gdt base
79
80boot_idt_descr:
81 .word 0 # idt limit = 0
82 .long 0 # idt base = 0L
83
84.globl trampoline_end
85trampoline_end:
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
new file mode 100644
index 000000000000..47b0bef335bd
--- /dev/null
+++ b/arch/x86/kernel/traps_32.c
@@ -0,0 +1,1250 @@
1/*
2 * linux/arch/i386/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */
9
10/*
11 * 'Traps.c' handles hardware traps and faults after we have saved some
12 * state in 'asm.s'.
13 */
14#include <linux/sched.h>
15#include <linux/kernel.h>
16#include <linux/string.h>
17#include <linux/errno.h>
18#include <linux/timer.h>
19#include <linux/mm.h>
20#include <linux/init.h>
21#include <linux/delay.h>
22#include <linux/spinlock.h>
23#include <linux/interrupt.h>
24#include <linux/highmem.h>
25#include <linux/kallsyms.h>
26#include <linux/ptrace.h>
27#include <linux/utsname.h>
28#include <linux/kprobes.h>
29#include <linux/kexec.h>
30#include <linux/unwind.h>
31#include <linux/uaccess.h>
32#include <linux/nmi.h>
33#include <linux/bug.h>
34
35#ifdef CONFIG_EISA
36#include <linux/ioport.h>
37#include <linux/eisa.h>
38#endif
39
40#ifdef CONFIG_MCA
41#include <linux/mca.h>
42#endif
43
44#if defined(CONFIG_EDAC)
45#include <linux/edac.h>
46#endif
47
48#include <asm/processor.h>
49#include <asm/system.h>
50#include <asm/io.h>
51#include <asm/atomic.h>
52#include <asm/debugreg.h>
53#include <asm/desc.h>
54#include <asm/i387.h>
55#include <asm/nmi.h>
56#include <asm/unwind.h>
57#include <asm/smp.h>
58#include <asm/arch_hooks.h>
59#include <linux/kdebug.h>
60#include <asm/stacktrace.h>
61
62#include <linux/module.h>
63
64#include "mach_traps.h"
65
66int panic_on_unrecovered_nmi;
67
68asmlinkage int system_call(void);
69
70/* Do we ignore FPU interrupts ? */
71char ignore_fpu_irq = 0;
72
73/*
74 * The IDT has to be page-aligned to simplify the Pentium
75 * F0 0F bug workaround. We have a special link segment
76 * for this.
77 */
78struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
79
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int kstack_depth_to_print = 24;
101static unsigned int code_bytes = 64;
102
103static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
104{
105 return p > (void *)tinfo &&
106 p <= (void *)tinfo + THREAD_SIZE - size;
107}
108
109/* The form of the top of the frame on the stack */
110struct stack_frame {
111 struct stack_frame *next_frame;
112 unsigned long return_address;
113};
114
115static inline unsigned long print_context_stack(struct thread_info *tinfo,
116 unsigned long *stack, unsigned long ebp,
117 struct stacktrace_ops *ops, void *data)
118{
119#ifdef CONFIG_FRAME_POINTER
120 struct stack_frame *frame = (struct stack_frame *)ebp;
121 while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
122 struct stack_frame *next;
123 unsigned long addr;
124
125 addr = frame->return_address;
126 ops->address(data, addr);
127 /*
128 * break out of recursive entries (such as
129 * end_of_stack_stop_unwind_function). Also,
130 * we can never allow a frame pointer to
131 * move downwards!
132 */
133 next = frame->next_frame;
134 if (next <= frame)
135 break;
136 frame = next;
137 }
138#else
139 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
140 unsigned long addr;
141
142 addr = *stack++;
143 if (__kernel_text_address(addr))
144 ops->address(data, addr);
145 }
146#endif
147 return ebp;
148}
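
A hedged user-space sketch of the same frame-pointer walk done in the CONFIG_FRAME_POINTER branch above, assuming the usual x86 frame layout (saved frame pointer, then return address) and a build without -fomit-frame-pointer; in the kernel the walk is bounded by valid_stack_ptr(), here it simply stops at the outermost frame or when the chain would move downwards.

#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static void dump_frames(void)
{
	struct stack_frame *frame = __builtin_frame_address(0);

	while (frame) {
		struct stack_frame *next = frame->next_frame;

		printf("  [<%p>]\n", (void *)frame->return_address);
		/* same sanity check as above: never let the frame pointer move downwards */
		if (!next || next <= frame)
			break;
		frame = next;
	}
}

int main(void)
{
	dump_frames();
	return 0;
}
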
149
150#define MSG(msg) ops->warning(data, msg)
151
152void dump_trace(struct task_struct *task, struct pt_regs *regs,
153 unsigned long *stack,
154 struct stacktrace_ops *ops, void *data)
155{
156 unsigned long ebp = 0;
157
158 if (!task)
159 task = current;
160
161 if (!stack) {
162 unsigned long dummy;
163 stack = &dummy;
164 if (task != current)
165 stack = (unsigned long *)task->thread.esp;
166 }
167
168#ifdef CONFIG_FRAME_POINTER
169 if (!ebp) {
170 if (task == current) {
171 /* Grab ebp right from our regs */
172 asm ("movl %%ebp, %0" : "=r" (ebp) : );
173 } else {
174 /* ebp is the last reg pushed by switch_to */
175 ebp = *(unsigned long *) task->thread.esp;
176 }
177 }
178#endif
179
180 while (1) {
181 struct thread_info *context;
182 context = (struct thread_info *)
183 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
184 ebp = print_context_stack(context, stack, ebp, ops, data);
185 /* Should be after the line below, but somewhere
186 in early boot context comes out corrupted and we
187 can't reference it -AK */
188 if (ops->stack(data, "IRQ") < 0)
189 break;
190 stack = (unsigned long*)context->previous_esp;
191 if (!stack)
192 break;
193 touch_nmi_watchdog();
194 }
195}
196EXPORT_SYMBOL(dump_trace);
197
198static void
199print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
200{
201 printk(data);
202 print_symbol(msg, symbol);
203 printk("\n");
204}
205
206static void print_trace_warning(void *data, char *msg)
207{
208 printk("%s%s\n", (char *)data, msg);
209}
210
211static int print_trace_stack(void *data, char *name)
212{
213 return 0;
214}
215
216/*
217 * Print one address/symbol entry per line.
218 */
219static void print_trace_address(void *data, unsigned long addr)
220{
221 printk("%s [<%08lx>] ", (char *)data, addr);
222 print_symbol("%s\n", addr);
223 touch_nmi_watchdog();
224}
225
226static struct stacktrace_ops print_trace_ops = {
227 .warning = print_trace_warning,
228 .warning_symbol = print_trace_warning_symbol,
229 .stack = print_trace_stack,
230 .address = print_trace_address,
231};
232
233static void
234show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
235 unsigned long * stack, char *log_lvl)
236{
237 dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
238 printk("%s =======================\n", log_lvl);
239}
240
241void show_trace(struct task_struct *task, struct pt_regs *regs,
242 unsigned long * stack)
243{
244 show_trace_log_lvl(task, regs, stack, "");
245}
246
247static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
248 unsigned long *esp, char *log_lvl)
249{
250 unsigned long *stack;
251 int i;
252
253 if (esp == NULL) {
254 if (task)
255 esp = (unsigned long*)task->thread.esp;
256 else
257 esp = (unsigned long *)&esp;
258 }
259
260 stack = esp;
261 for(i = 0; i < kstack_depth_to_print; i++) {
262 if (kstack_end(stack))
263 break;
264 if (i && ((i % 8) == 0))
265 printk("\n%s ", log_lvl);
266 printk("%08lx ", *stack++);
267 }
268 printk("\n%sCall Trace:\n", log_lvl);
269 show_trace_log_lvl(task, regs, esp, log_lvl);
270}
271
272void show_stack(struct task_struct *task, unsigned long *esp)
273{
274 printk(" ");
275 show_stack_log_lvl(task, NULL, esp, "");
276}
277
278/*
279 * The architecture-independent dump_stack generator
280 */
281void dump_stack(void)
282{
283 unsigned long stack;
284
285 show_trace(current, NULL, &stack);
286}
287
288EXPORT_SYMBOL(dump_stack);
289
290void show_registers(struct pt_regs *regs)
291{
292 int i;
293 int in_kernel = 1;
294 unsigned long esp;
295 unsigned short ss, gs;
296
297 esp = (unsigned long) (&regs->esp);
298 savesegment(ss, ss);
299 savesegment(gs, gs);
300 if (user_mode_vm(regs)) {
301 in_kernel = 0;
302 esp = regs->esp;
303 ss = regs->xss & 0xffff;
304 }
305 print_modules();
306 printk(KERN_EMERG "CPU: %d\n"
307 KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
308 KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
309 smp_processor_id(), 0xffff & regs->xcs, regs->eip,
310 print_tainted(), regs->eflags, init_utsname()->release,
311 (int)strcspn(init_utsname()->version, " "),
312 init_utsname()->version);
313 print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
314 printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
315 regs->eax, regs->ebx, regs->ecx, regs->edx);
316 printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
317 regs->esi, regs->edi, regs->ebp, esp);
318 printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
319 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
320 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
321 TASK_COMM_LEN, current->comm, current->pid,
322 current_thread_info(), current, task_thread_info(current));
323 /*
324 * When in-kernel, we also print out the stack and code at the
325 * time of the fault..
326 */
327 if (in_kernel) {
328 u8 *eip;
329 unsigned int code_prologue = code_bytes * 43 / 64;
330 unsigned int code_len = code_bytes;
331 unsigned char c;
332
333 printk("\n" KERN_EMERG "Stack: ");
334 show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
335
336 printk(KERN_EMERG "Code: ");
337
338 eip = (u8 *)regs->eip - code_prologue;
339 if (eip < (u8 *)PAGE_OFFSET ||
340 probe_kernel_address(eip, c)) {
341 /* try starting at EIP */
342 eip = (u8 *)regs->eip;
343 code_len = code_len - code_prologue + 1;
344 }
345 for (i = 0; i < code_len; i++, eip++) {
346 if (eip < (u8 *)PAGE_OFFSET ||
347 probe_kernel_address(eip, c)) {
348 printk(" Bad EIP value.");
349 break;
350 }
351 if (eip == (u8 *)regs->eip)
352 printk("<%02x> ", c);
353 else
354 printk("%02x ", c);
355 }
356 }
357 printk("\n");
358}
359
360int is_valid_bugaddr(unsigned long eip)
361{
362 unsigned short ud2;
363
364 if (eip < PAGE_OFFSET)
365 return 0;
366 if (probe_kernel_address((unsigned short *)eip, ud2))
367 return 0;
368
369 return ud2 == 0x0b0f;
370}
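
The constant 0x0b0f compared against above is simply the two-byte ud2 opcode (0x0f 0x0b) read as a little-endian 16-bit word:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	const unsigned char ud2[2] = { 0x0f, 0x0b };	/* the ud2 instruction bytes */
	uint16_t v;

	memcpy(&v, ud2, sizeof(v));	/* what probe_kernel_address() does, in spirit */
	assert(v == 0x0b0f);		/* little-endian, as on x86 */
	return 0;
}
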
371
372/*
373 * This is gone through when something in the kernel has done something bad and
374 * is about to be terminated.
375 */
376void die(const char * str, struct pt_regs * regs, long err)
377{
378 static struct {
379 spinlock_t lock;
380 u32 lock_owner;
381 int lock_owner_depth;
382 } die = {
383 .lock = __SPIN_LOCK_UNLOCKED(die.lock),
384 .lock_owner = -1,
385 .lock_owner_depth = 0
386 };
387 static int die_counter;
388 unsigned long flags;
389
390 oops_enter();
391
392 if (die.lock_owner != raw_smp_processor_id()) {
393 console_verbose();
394 spin_lock_irqsave(&die.lock, flags);
395 die.lock_owner = smp_processor_id();
396 die.lock_owner_depth = 0;
397 bust_spinlocks(1);
398 }
399 else
400 local_save_flags(flags);
401
402 if (++die.lock_owner_depth < 3) {
403 int nl = 0;
404 unsigned long esp;
405 unsigned short ss;
406
407 report_bug(regs->eip, regs);
408
409 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
410#ifdef CONFIG_PREEMPT
411 printk(KERN_EMERG "PREEMPT ");
412 nl = 1;
413#endif
414#ifdef CONFIG_SMP
415 if (!nl)
416 printk(KERN_EMERG);
417 printk("SMP ");
418 nl = 1;
419#endif
420#ifdef CONFIG_DEBUG_PAGEALLOC
421 if (!nl)
422 printk(KERN_EMERG);
423 printk("DEBUG_PAGEALLOC");
424 nl = 1;
425#endif
426 if (nl)
427 printk("\n");
428 if (notify_die(DIE_OOPS, str, regs, err,
429 current->thread.trap_no, SIGSEGV) !=
430 NOTIFY_STOP) {
431 show_registers(regs);
432 /* Executive summary in case the oops scrolled away */
433 esp = (unsigned long) (&regs->esp);
434 savesegment(ss, ss);
435 if (user_mode(regs)) {
436 esp = regs->esp;
437 ss = regs->xss & 0xffff;
438 }
439 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
440 print_symbol("%s", regs->eip);
441 printk(" SS:ESP %04x:%08lx\n", ss, esp);
442 }
443 else
444 regs = NULL;
445 } else
446 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
447
448 bust_spinlocks(0);
449 die.lock_owner = -1;
450 add_taint(TAINT_DIE);
451 spin_unlock_irqrestore(&die.lock, flags);
452
453 if (!regs)
454 return;
455
456 if (kexec_should_crash(current))
457 crash_kexec(regs);
458
459 if (in_interrupt())
460 panic("Fatal exception in interrupt");
461
462 if (panic_on_oops)
463 panic("Fatal exception");
464
465 oops_exit();
466 do_exit(SIGSEGV);
467}
468
469static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
470{
471 if (!user_mode_vm(regs))
472 die(str, regs, err);
473}
474
475static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
476 struct pt_regs * regs, long error_code,
477 siginfo_t *info)
478{
479 struct task_struct *tsk = current;
480
481 if (regs->eflags & VM_MASK) {
482 if (vm86)
483 goto vm86_trap;
484 goto trap_signal;
485 }
486
487 if (!user_mode(regs))
488 goto kernel_trap;
489
490 trap_signal: {
491 /*
492 * We want error_code and trap_no set for userspace faults and
493 * kernelspace faults which result in die(), but not
494 * kernelspace faults which are fixed up. die() gives the
495 * process no chance to handle the signal and notice the
496 * kernel fault information, so that won't result in polluting
497 * the information about previously queued, but not yet
498 * delivered, faults. See also do_general_protection below.
499 */
500 tsk->thread.error_code = error_code;
501 tsk->thread.trap_no = trapnr;
502
503 if (info)
504 force_sig_info(signr, info, tsk);
505 else
506 force_sig(signr, tsk);
507 return;
508 }
509
510 kernel_trap: {
511 if (!fixup_exception(regs)) {
512 tsk->thread.error_code = error_code;
513 tsk->thread.trap_no = trapnr;
514 die(str, regs, error_code);
515 }
516 return;
517 }
518
519 vm86_trap: {
520 int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
521 if (ret) goto trap_signal;
522 return;
523 }
524}
525
526#define DO_ERROR(trapnr, signr, str, name) \
527fastcall void do_##name(struct pt_regs * regs, long error_code) \
528{ \
529 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
530 == NOTIFY_STOP) \
531 return; \
532 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
533}
534
535#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
536fastcall void do_##name(struct pt_regs * regs, long error_code) \
537{ \
538 siginfo_t info; \
539 if (irq) \
540 local_irq_enable(); \
541 info.si_signo = signr; \
542 info.si_errno = 0; \
543 info.si_code = sicode; \
544 info.si_addr = (void __user *)siaddr; \
545 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
546 == NOTIFY_STOP) \
547 return; \
548 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
549}
550
551#define DO_VM86_ERROR(trapnr, signr, str, name) \
552fastcall void do_##name(struct pt_regs * regs, long error_code) \
553{ \
554 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
555 == NOTIFY_STOP) \
556 return; \
557 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
558}
559
560#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
561fastcall void do_##name(struct pt_regs * regs, long error_code) \
562{ \
563 siginfo_t info; \
564 info.si_signo = signr; \
565 info.si_errno = 0; \
566 info.si_code = sicode; \
567 info.si_addr = (void __user *)siaddr; \
568 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
569 == NOTIFY_STOP) \
570 return; \
571 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
572}
573
574DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
575#ifndef CONFIG_KPROBES
576DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
577#endif
578DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
579DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
580DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
581DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
582DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
583DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
584DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
585DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
586DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
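
For readability, the DO_ERROR() macro above expands mechanically; e.g. DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) generates roughly the following handler (a sketch of the expansion, not an extra definition):

fastcall void do_invalid_TSS(struct pt_regs * regs, long error_code)
{
	if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code, 10, SIGSEGV)
						== NOTIFY_STOP)
		return;
	do_trap(10, SIGSEGV, "invalid TSS", 0, regs, error_code, NULL);
}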
587
588fastcall void __kprobes do_general_protection(struct pt_regs * regs,
589 long error_code)
590{
591 int cpu = get_cpu();
592 struct tss_struct *tss = &per_cpu(init_tss, cpu);
593 struct thread_struct *thread = &current->thread;
594
595 /*
596 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
597 * invalid offset set (the LAZY one) and the faulting thread has
598 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
 599	 * and we set the offset field correctly. Then we let the CPU
600 * restart the faulting instruction.
601 */
602 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
603 thread->io_bitmap_ptr) {
604 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
605 thread->io_bitmap_max);
606 /*
 607		 * If the previously set map extended to higher ports than
 608		 * the current one, pad the extra space with 0xff (no access).
609 */
610 if (thread->io_bitmap_max < tss->io_bitmap_max)
611 memset((char *) tss->io_bitmap +
612 thread->io_bitmap_max, 0xff,
613 tss->io_bitmap_max - thread->io_bitmap_max);
614 tss->io_bitmap_max = thread->io_bitmap_max;
615 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
616 tss->io_bitmap_owner = thread;
617 put_cpu();
618 return;
619 }
620 put_cpu();
621
622 if (regs->eflags & VM_MASK)
623 goto gp_in_vm86;
624
625 if (!user_mode(regs))
626 goto gp_in_kernel;
627
628 current->thread.error_code = error_code;
629 current->thread.trap_no = 13;
630 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
631 printk_ratelimit())
632 printk(KERN_INFO
633 "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
634 current->comm, current->pid,
635 regs->eip, regs->esp, error_code);
636
637 force_sig(SIGSEGV, current);
638 return;
639
640gp_in_vm86:
641 local_irq_enable();
642 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
643 return;
644
645gp_in_kernel:
646 if (!fixup_exception(regs)) {
647 current->thread.error_code = error_code;
648 current->thread.trap_no = 13;
649 if (notify_die(DIE_GPF, "general protection fault", regs,
650 error_code, 13, SIGSEGV) == NOTIFY_STOP)
651 return;
652 die("general protection fault", regs, error_code);
653 }
654}
655
656static __kprobes void
657mem_parity_error(unsigned char reason, struct pt_regs * regs)
658{
659 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
660 "CPU %d.\n", reason, smp_processor_id());
661 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
662
663#if defined(CONFIG_EDAC)
664 if(edac_handler_set()) {
665 edac_atomic_assert_error();
666 return;
667 }
668#endif
669
670 if (panic_on_unrecovered_nmi)
671 panic("NMI: Not continuing");
672
673 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
674
675 /* Clear and disable the memory parity error line. */
676 clear_mem_error(reason);
677}
678
679static __kprobes void
680io_check_error(unsigned char reason, struct pt_regs * regs)
681{
682 unsigned long i;
683
684 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
685 show_registers(regs);
686
687 /* Re-enable the IOCK line, wait for a few seconds */
688 reason = (reason & 0xf) | 8;
689 outb(reason, 0x61);
690 i = 2000;
691 while (--i) udelay(1000);
692 reason &= ~8;
693 outb(reason, 0x61);
694}
695
696static __kprobes void
697unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
698{
699#ifdef CONFIG_MCA
700 /* Might actually be able to figure out what the guilty party
701 * is. */
702 if( MCA_bus ) {
703 mca_handle_nmi();
704 return;
705 }
706#endif
707 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
708 "CPU %d.\n", reason, smp_processor_id());
709 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
710 if (panic_on_unrecovered_nmi)
711 panic("NMI: Not continuing");
712
713 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
714}
715
716static DEFINE_SPINLOCK(nmi_print_lock);
717
718void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
719{
720 if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
721 NOTIFY_STOP)
722 return;
723
724 spin_lock(&nmi_print_lock);
725 /*
 726	* We are in trouble anyway, let's at least try
727 * to get a message out.
728 */
729 bust_spinlocks(1);
730 printk(KERN_EMERG "%s", msg);
731 printk(" on CPU%d, eip %08lx, registers:\n",
732 smp_processor_id(), regs->eip);
733 show_registers(regs);
734 console_silent();
735 spin_unlock(&nmi_print_lock);
736 bust_spinlocks(0);
737
 738	/* If we are in the kernel we are probably nested up pretty badly
 739	 * and might as well get out now while we still can.
740 */
741 if (!user_mode_vm(regs)) {
742 current->thread.trap_no = 2;
743 crash_kexec(regs);
744 }
745
746 do_exit(SIGSEGV);
747}
748
749static __kprobes void default_do_nmi(struct pt_regs * regs)
750{
751 unsigned char reason = 0;
752
753 /* Only the BSP gets external NMIs from the system. */
754 if (!smp_processor_id())
755 reason = get_nmi_reason();
756
757 if (!(reason & 0xc0)) {
758 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
759 == NOTIFY_STOP)
760 return;
761#ifdef CONFIG_X86_LOCAL_APIC
762 /*
763 * Ok, so this is none of the documented NMI sources,
764 * so it must be the NMI watchdog.
765 */
766 if (nmi_watchdog_tick(regs, reason))
767 return;
768 if (!do_nmi_callback(regs, smp_processor_id()))
769#endif
770 unknown_nmi_error(reason, regs);
771
772 return;
773 }
774 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
775 return;
776 if (reason & 0x80)
777 mem_parity_error(reason, regs);
778 if (reason & 0x40)
779 io_check_error(reason, regs);
780 /*
781 * Reassert NMI in case it became active meanwhile
782 * as it's edge-triggered.
783 */
784 reassert_nmi();
785}
786
787static int ignore_nmis;
788
789fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
790{
791 int cpu;
792
793 nmi_enter();
794
795 cpu = smp_processor_id();
796
797 ++nmi_count(cpu);
798
799 if (!ignore_nmis)
800 default_do_nmi(regs);
801
802 nmi_exit();
803}
804
805void stop_nmi(void)
806{
807 acpi_nmi_disable();
808 ignore_nmis++;
809}
810
811void restart_nmi(void)
812{
813 ignore_nmis--;
814 acpi_nmi_enable();
815}
816
817#ifdef CONFIG_KPROBES
818fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
819{
820 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
821 == NOTIFY_STOP)
822 return;
823 /* This is an interrupt gate, because kprobes wants interrupts
824 disabled. Normal trap handlers don't. */
825 restore_interrupts(regs);
826 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
827}
828#endif
829
830/*
831 * Our handling of the processor debug registers is non-trivial.
832 * We do not clear them on entry and exit from the kernel. Therefore
833 * it is possible to get a watchpoint trap here from inside the kernel.
834 * However, the code in ./ptrace.c has ensured that the user can
835 * only set watchpoints on userspace addresses. Therefore the in-kernel
836 * watchpoint trap can only occur in code which is reading/writing
837 * from user space. Such code must not hold kernel locks (since it
838 * can equally take a page fault), therefore it is safe to call
839 * force_sig_info even though that claims and releases locks.
840 *
841 * Code in ./signal.c ensures that the debug control register
842 * is restored before we deliver any signal, and therefore that
843 * user code runs with the correct debug control register even though
844 * we clear it here.
845 *
846 * Being careful here means that we don't have to be as careful in a
847 * lot of more complicated places (task switching can be a bit lazy
848 * about restoring all the debug state, and ptrace doesn't have to
849 * find every occurrence of the TF bit that could be saved away even
850 * by user code)
851 */
852fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
853{
854 unsigned int condition;
855 struct task_struct *tsk = current;
856
857 get_debugreg(condition, 6);
858
859 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
860 SIGTRAP) == NOTIFY_STOP)
861 return;
862 /* It's safe to allow irq's after DR6 has been saved */
863 if (regs->eflags & X86_EFLAGS_IF)
864 local_irq_enable();
865
866 /* Mask out spurious debug traps due to lazy DR7 setting */
867 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
868 if (!tsk->thread.debugreg[7])
869 goto clear_dr7;
870 }
871
872 if (regs->eflags & VM_MASK)
873 goto debug_vm86;
874
875 /* Save debug status register where ptrace can see it */
876 tsk->thread.debugreg[6] = condition;
877
878 /*
879 * Single-stepping through TF: make sure we ignore any events in
880 * kernel space (but re-enable TF when returning to user mode).
881 */
882 if (condition & DR_STEP) {
883 /*
884 * We already checked v86 mode above, so we can
885 * check for kernel mode by just checking the CPL
886 * of CS.
887 */
888 if (!user_mode(regs))
889 goto clear_TF_reenable;
890 }
891
892 /* Ok, finally something we can handle */
893 send_sigtrap(tsk, regs, error_code);
894
895 /* Disable additional traps. They'll be re-enabled when
896 * the signal is delivered.
897 */
898clear_dr7:
899 set_debugreg(0, 7);
900 return;
901
902debug_vm86:
903 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
904 return;
905
906clear_TF_reenable:
907 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
908 regs->eflags &= ~TF_MASK;
909 return;
910}
911
912/*
913 * Note that we play around with the 'TS' bit in an attempt to get
914 * the correct behaviour even in the presence of the asynchronous
915 * IRQ13 behaviour
916 */
917void math_error(void __user *eip)
918{
919 struct task_struct * task;
920 siginfo_t info;
921 unsigned short cwd, swd;
922
923 /*
924 * Save the info for the exception handler and clear the error.
925 */
926 task = current;
927 save_init_fpu(task);
928 task->thread.trap_no = 16;
929 task->thread.error_code = 0;
930 info.si_signo = SIGFPE;
931 info.si_errno = 0;
932 info.si_code = __SI_FAULT;
933 info.si_addr = eip;
934 /*
935 * (~cwd & swd) will mask out exceptions that are not set to unmasked
936 * status. 0x3f is the exception bits in these regs, 0x200 is the
937 * C1 reg you need in case of a stack fault, 0x040 is the stack
938 * fault bit. We should only be taking one exception at a time,
939 * so if this combination doesn't produce any single exception,
 940	 * then we have a bad program that isn't synchronizing its FPU usage
941 * and it will suffer the consequences since we won't be able to
942 * fully reproduce the context of the exception
943 */
944 cwd = get_fpu_cwd(task);
945 swd = get_fpu_swd(task);
946 switch (swd & ~cwd & 0x3f) {
947 case 0x000: /* No unmasked exception */
948 return;
949 default: /* Multiple exceptions */
950 break;
951 case 0x001: /* Invalid Op */
952 /*
953 * swd & 0x240 == 0x040: Stack Underflow
954 * swd & 0x240 == 0x240: Stack Overflow
955 * User must clear the SF bit (0x40) if set
956 */
957 info.si_code = FPE_FLTINV;
958 break;
959 case 0x002: /* Denormalize */
960 case 0x010: /* Underflow */
961 info.si_code = FPE_FLTUND;
962 break;
963 case 0x004: /* Zero Divide */
964 info.si_code = FPE_FLTDIV;
965 break;
966 case 0x008: /* Overflow */
967 info.si_code = FPE_FLTOVF;
968 break;
969 case 0x020: /* Precision */
970 info.si_code = FPE_FLTRES;
971 break;
972 }
973 force_sig_info(SIGFPE, &info, task);
974}
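
As a hedged illustration of the (swd & ~cwd & 0x3f) decode performed above, a minimal user-space sketch; the control/status word values are assumed (zero-divide flagged and unmasked):

#include <stdio.h>

int main(void)
{
	unsigned short cwd = 0x037b;	/* assumed: ZE mask bit (0x04) clear */
	unsigned short swd = 0x0004;	/* assumed: ZE set in the status word */
	unsigned short pending = swd & ~cwd & 0x3f;

	printf("pending=%#x\n", pending);	/* 0x4 -> FPE_FLTDIV */
	return 0;
}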
975
976fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
977{
978 ignore_fpu_irq = 1;
979 math_error((void __user *)regs->eip);
980}
981
982static void simd_math_error(void __user *eip)
983{
984 struct task_struct * task;
985 siginfo_t info;
986 unsigned short mxcsr;
987
988 /*
989 * Save the info for the exception handler and clear the error.
990 */
991 task = current;
992 save_init_fpu(task);
993 task->thread.trap_no = 19;
994 task->thread.error_code = 0;
995 info.si_signo = SIGFPE;
996 info.si_errno = 0;
997 info.si_code = __SI_FAULT;
998 info.si_addr = eip;
999 /*
1000 * The SIMD FPU exceptions are handled a little differently, as there
1001 * is only a single status/control register. Thus, to determine which
1002 * unmasked exception was caught we must mask the exception mask bits
1003 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1004 */
1005 mxcsr = get_fpu_mxcsr(task);
1006 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1007 case 0x000:
1008 default:
1009 break;
1010 case 0x001: /* Invalid Op */
1011 info.si_code = FPE_FLTINV;
1012 break;
1013 case 0x002: /* Denormalize */
1014 case 0x010: /* Underflow */
1015 info.si_code = FPE_FLTUND;
1016 break;
1017 case 0x004: /* Zero Divide */
1018 info.si_code = FPE_FLTDIV;
1019 break;
1020 case 0x008: /* Overflow */
1021 info.si_code = FPE_FLTOVF;
1022 break;
1023 case 0x020: /* Precision */
1024 info.si_code = FPE_FLTRES;
1025 break;
1026 }
1027 force_sig_info(SIGFPE, &info, task);
1028}
1029
1030fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
1031 long error_code)
1032{
1033 if (cpu_has_xmm) {
1034 /* Handle SIMD FPU exceptions on PIII+ processors. */
1035 ignore_fpu_irq = 1;
1036 simd_math_error((void __user *)regs->eip);
1037 } else {
1038 /*
1039 * Handle strange cache flush from user space exception
1040 * in all other cases. This is undocumented behaviour.
1041 */
1042 if (regs->eflags & VM_MASK) {
1043 handle_vm86_fault((struct kernel_vm86_regs *)regs,
1044 error_code);
1045 return;
1046 }
1047 current->thread.trap_no = 19;
1048 current->thread.error_code = error_code;
1049 die_if_kernel("cache flush denied", regs, error_code);
1050 force_sig(SIGSEGV, current);
1051 }
1052}
1053
1054fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1055 long error_code)
1056{
1057#if 0
1058 /* No need to warn about this any longer. */
1059 printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
1060#endif
1061}
1062
1063fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1064 unsigned long kesp)
1065{
1066 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
1067 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1068 unsigned long new_kesp = kesp - base;
1069 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
1070 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
1071 /* Set up base for espfix segment */
1072 desc &= 0x00f0ff0000000000ULL;
1073 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
1074 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
1075 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
1076 (lim_pages & 0xffff);
1077 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
1078 return new_kesp;
1079}
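
A minimal user-space sketch of the descriptor packing done in patch_espfix_desc() above; base = 0xc1000000, lim_pages = 1, and the preserved access byte/flags (0x92, 0xc) are all assumed values. It only round-trips the fields through the standard GDT bit layout:

#include <stdio.h>

int main(void)
{
	unsigned long long base = 0xc1000000ULL;	/* assumed espfix base */
	unsigned long long lim_pages = 1ULL;		/* assumed limit, in pages */
	/* assumed pre-existing entry: access byte 0x92, flags nibble 0xc */
	unsigned long long desc = 0x00c0920000000000ULL & 0x00f0ff0000000000ULL;

	desc |= ((base << 16) & 0x000000ffffff0000ULL) |
		((base << 32) & 0xff00000000000000ULL) |
		((lim_pages << 32) & 0x000f000000000000ULL) |
		(lim_pages & 0xffff);

	/* Recover base and limit from the standard descriptor layout. */
	unsigned long long rbase = ((desc >> 16) & 0xffffffULL) |
				   ((desc >> 32) & 0xff000000ULL);
	unsigned long long rlim  = (desc & 0xffffULL) |
				   ((desc >> 32) & 0xf0000ULL);

	printf("base=%#llx limit=%#llx\n", rbase, rlim);	/* 0xc1000000, 0x1 */
	return 0;
}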
1080
1081/*
1082 * 'math_state_restore()' saves the current math information in the
1083 * old math state array, and gets the new ones from the current task
1084 *
1085 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1086 * Don't touch unless you *really* know how it works.
1087 *
1088 * Must be called with kernel preemption disabled (in this case,
1089 * local interrupts are disabled at the call-site in entry.S).
1090 */
1091asmlinkage void math_state_restore(void)
1092{
1093 struct thread_info *thread = current_thread_info();
1094 struct task_struct *tsk = thread->task;
1095
1096 clts(); /* Allow maths ops (or we recurse) */
1097 if (!tsk_used_math(tsk))
1098 init_fpu(tsk);
1099 restore_fpu(tsk);
1100 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1101 tsk->fpu_counter++;
1102}
1103EXPORT_SYMBOL_GPL(math_state_restore);
1104
1105#ifndef CONFIG_MATH_EMULATION
1106
1107asmlinkage void math_emulate(long arg)
1108{
1109 printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
1110 printk(KERN_EMERG "killing %s.\n",current->comm);
1111 force_sig(SIGFPE,current);
1112 schedule();
1113}
1114
1115#endif /* CONFIG_MATH_EMULATION */
1116
1117#ifdef CONFIG_X86_F00F_BUG
1118void __init trap_init_f00f_bug(void)
1119{
1120 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
1121
1122 /*
1123 * Update the IDT descriptor and reload the IDT so that
1124 * it uses the read-only mapped virtual address.
1125 */
1126 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
1127 load_idt(&idt_descr);
1128}
1129#endif
1130
1131/*
1132 * This needs to use 'idt_table' rather than 'idt', and
1133 * thus use the _nonmapped_ version of the IDT, as the
1134 * Pentium F0 0F bugfix can have resulted in the mapped
1135 * IDT being write-protected.
1136 */
1137void set_intr_gate(unsigned int n, void *addr)
1138{
1139 _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
1140}
1141
1142/*
1143 * This routine sets up an interrupt gate at descriptor privilege level 3.
1144 */
1145static inline void set_system_intr_gate(unsigned int n, void *addr)
1146{
1147 _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
1148}
1149
1150static void __init set_trap_gate(unsigned int n, void *addr)
1151{
1152 _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
1153}
1154
1155static void __init set_system_gate(unsigned int n, void *addr)
1156{
1157 _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
1158}
1159
1160static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
1161{
1162 _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
1163}
1164
1165
1166void __init trap_init(void)
1167{
1168#ifdef CONFIG_EISA
1169 void __iomem *p = ioremap(0x0FFFD9, 4);
1170 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
1171 EISA_bus = 1;
1172 }
1173 iounmap(p);
1174#endif
1175
1176#ifdef CONFIG_X86_LOCAL_APIC
1177 init_apic_mappings();
1178#endif
1179
1180 set_trap_gate(0,&divide_error);
1181 set_intr_gate(1,&debug);
1182 set_intr_gate(2,&nmi);
1183 set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
1184 set_system_gate(4,&overflow);
1185 set_trap_gate(5,&bounds);
1186 set_trap_gate(6,&invalid_op);
1187 set_trap_gate(7,&device_not_available);
1188 set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
1189 set_trap_gate(9,&coprocessor_segment_overrun);
1190 set_trap_gate(10,&invalid_TSS);
1191 set_trap_gate(11,&segment_not_present);
1192 set_trap_gate(12,&stack_segment);
1193 set_trap_gate(13,&general_protection);
1194 set_intr_gate(14,&page_fault);
1195 set_trap_gate(15,&spurious_interrupt_bug);
1196 set_trap_gate(16,&coprocessor_error);
1197 set_trap_gate(17,&alignment_check);
1198#ifdef CONFIG_X86_MCE
1199 set_trap_gate(18,&machine_check);
1200#endif
1201 set_trap_gate(19,&simd_coprocessor_error);
1202
1203 if (cpu_has_fxsr) {
1204 /*
1205 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
1206 * Generates a compile-time "error: zero width for bit-field" if
1207 * the alignment is wrong.
1208 */
1209 struct fxsrAlignAssert {
1210 int _:!(offsetof(struct task_struct,
1211 thread.i387.fxsave) & 15);
1212 };
1213
1214 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1215 set_in_cr4(X86_CR4_OSFXSR);
1216 printk("done.\n");
1217 }
1218 if (cpu_has_xmm) {
1219 printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
1220 "support... ");
1221 set_in_cr4(X86_CR4_OSXMMEXCPT);
1222 printk("done.\n");
1223 }
1224
1225 set_system_gate(SYSCALL_VECTOR,&system_call);
1226
1227 /*
1228 * Should be a barrier for any external CPU state.
1229 */
1230 cpu_init();
1231
1232 trap_init_hook();
1233}
1234
1235static int __init kstack_setup(char *s)
1236{
1237 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1238 return 1;
1239}
1240__setup("kstack=", kstack_setup);
1241
1242static int __init code_bytes_setup(char *s)
1243{
1244 code_bytes = simple_strtoul(s, NULL, 0);
1245 if (code_bytes > 8192)
1246 code_bytes = 8192;
1247
1248 return 1;
1249}
1250__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
new file mode 100644
index 000000000000..a39280b4dd3a
--- /dev/null
+++ b/arch/x86/kernel/tsc_32.c
@@ -0,0 +1,413 @@
1/*
2 * This code largely moved from arch/i386/kernel/timer/timer_tsc.c
3 * which was originally moved from arch/i386/kernel/time.c.
4 * See comments there for proper credits.
5 */
6
7#include <linux/sched.h>
8#include <linux/clocksource.h>
9#include <linux/workqueue.h>
10#include <linux/cpufreq.h>
11#include <linux/jiffies.h>
12#include <linux/init.h>
13#include <linux/dmi.h>
14
15#include <asm/delay.h>
16#include <asm/tsc.h>
17#include <asm/io.h>
18#include <asm/timer.h>
19
20#include "mach_timer.h"
21
22static int tsc_enabled;
23
24/*
25 * On some systems the TSC frequency does not
26 * change with the cpu frequency. So we need
27 * an extra value to store the TSC freq
28 */
29unsigned int tsc_khz;
30EXPORT_SYMBOL_GPL(tsc_khz);
31
32int tsc_disable;
33
34#ifdef CONFIG_X86_TSC
35static int __init tsc_setup(char *str)
36{
37 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
38 "cannot disable TSC.\n");
39 return 1;
40}
41#else
42/*
43 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
44 * in cpu/common.c
45 */
46static int __init tsc_setup(char *str)
47{
48 tsc_disable = 1;
49
50 return 1;
51}
52#endif
53
54__setup("notsc", tsc_setup);
55
56/*
57 * code to mark and check if the TSC is unstable
58 * due to cpufreq or due to unsynced TSCs
59 */
60static int tsc_unstable;
61
62int check_tsc_unstable(void)
63{
64 return tsc_unstable;
65}
66EXPORT_SYMBOL_GPL(check_tsc_unstable);
67
  68/* Accelerators for sched_clock()
69 * convert from cycles(64bits) => nanoseconds (64bits)
70 * basic equation:
71 * ns = cycles / (freq / ns_per_sec)
72 * ns = cycles * (ns_per_sec / freq)
73 * ns = cycles * (10^9 / (cpu_khz * 10^3))
74 * ns = cycles * (10^6 / cpu_khz)
75 *
76 * Then we use scaling math (suggested by george@mvista.com) to get:
77 * ns = cycles * (10^6 * SC / cpu_khz) / SC
78 * ns = cycles * cyc2ns_scale / SC
79 *
80 * And since SC is a constant power of two, we can convert the div
81 * into a shift.
82 *
  83 * We can use a khz divisor instead of mhz to keep better precision, since
84 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
85 * (mathieu.desnoyers@polymtl.ca)
86 *
87 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
88 */
89unsigned long cyc2ns_scale __read_mostly;
90
91#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
92
93static inline void set_cyc2ns_scale(unsigned long cpu_khz)
94{
95 cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
96}
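
As a hedged, user-space illustration of the fixed-point conversion described in the comment above (cpu_khz = 2000000, i.e. an assumed 2 GHz clock; the cycles_2_ns() helper used by native_sched_clock() below applies the same scale and shift):

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10	/* 2^10, matching the kernel constant */

int main(void)
{
	unsigned long cpu_khz = 2000000UL;		/* assumed 2 GHz CPU */
	unsigned long cyc2ns_scale = (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;
	unsigned long long cycles = 2000000000ULL;	/* one second of cycles */
	unsigned long long ns = (cycles * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;

	printf("scale=%lu, %llu cycles -> %llu ns\n", cyc2ns_scale, cycles, ns);
	return 0;
}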
97
98/*
99 * Scheduler clock - returns current time in nanosec units.
100 */
101unsigned long long native_sched_clock(void)
102{
103 unsigned long long this_offset;
104
105 /*
106 * Fall back to jiffies if there's no TSC available:
107 * ( But note that we still use it if the TSC is marked
108 * unstable. We do this because unlike Time Of Day,
109 * the scheduler clock tolerates small errors and it's
110 * very important for it to be as fast as the platform
 111 *   can achieve it. )
112 */
113 if (unlikely(!tsc_enabled && !tsc_unstable))
114 /* No locking but a rare wrong value is not a big deal: */
115 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
116
117 /* read the Time Stamp Counter: */
118 rdtscll(this_offset);
119
120 /* return the value in ns */
121 return cycles_2_ns(this_offset);
122}
123
124/* We need to define a real function for sched_clock, to override the
125 weak default version */
126#ifdef CONFIG_PARAVIRT
127unsigned long long sched_clock(void)
128{
129 return paravirt_sched_clock();
130}
131#else
132unsigned long long sched_clock(void)
133 __attribute__((alias("native_sched_clock")));
134#endif
135
136unsigned long native_calculate_cpu_khz(void)
137{
138 unsigned long long start, end;
139 unsigned long count;
140 u64 delta64;
141 int i;
142 unsigned long flags;
143
144 local_irq_save(flags);
145
146 /* run 3 times to ensure the cache is warm */
147 for (i = 0; i < 3; i++) {
148 mach_prepare_counter();
149 rdtscll(start);
150 mach_countup(&count);
151 rdtscll(end);
152 }
153 /*
154 * Error: ECTCNEVERSET
155 * The CTC wasn't reliable: we got a hit on the very first read,
156 * or the CPU was so fast/slow that the quotient wouldn't fit in
157 * 32 bits..
158 */
159 if (count <= 1)
160 goto err;
161
162 delta64 = end - start;
163
164 /* cpu freq too fast: */
165 if (delta64 > (1ULL<<32))
166 goto err;
167
168 /* cpu freq too slow: */
169 if (delta64 <= CALIBRATE_TIME_MSEC)
170 goto err;
171
172 delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
173 do_div(delta64,CALIBRATE_TIME_MSEC);
174
175 local_irq_restore(flags);
176 return (unsigned long)delta64;
177err:
178 local_irq_restore(flags);
179 return 0;
180}
181
182int recalibrate_cpu_khz(void)
183{
184#ifndef CONFIG_SMP
185 unsigned long cpu_khz_old = cpu_khz;
186
187 if (cpu_has_tsc) {
188 cpu_khz = calculate_cpu_khz();
189 tsc_khz = cpu_khz;
190 cpu_data[0].loops_per_jiffy =
191 cpufreq_scale(cpu_data[0].loops_per_jiffy,
192 cpu_khz_old, cpu_khz);
193 return 0;
194 } else
195 return -ENODEV;
196#else
197 return -ENODEV;
198#endif
199}
200
201EXPORT_SYMBOL(recalibrate_cpu_khz);
202
203#ifdef CONFIG_CPU_FREQ
204
205/*
206 * if the CPU frequency is scaled, TSC-based delays will need a different
207 * loops_per_jiffy value to function properly.
208 */
209static unsigned int ref_freq = 0;
210static unsigned long loops_per_jiffy_ref = 0;
211static unsigned long cpu_khz_ref = 0;
212
213static int
214time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
215{
216 struct cpufreq_freqs *freq = data;
217
218 if (!ref_freq) {
219 if (!freq->old){
220 ref_freq = freq->new;
221 return 0;
222 }
223 ref_freq = freq->old;
224 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
225 cpu_khz_ref = cpu_khz;
226 }
227
228 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
229 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
230 (val == CPUFREQ_RESUMECHANGE)) {
231 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
232 cpu_data[freq->cpu].loops_per_jiffy =
233 cpufreq_scale(loops_per_jiffy_ref,
234 ref_freq, freq->new);
235
236 if (cpu_khz) {
237
238 if (num_online_cpus() == 1)
239 cpu_khz = cpufreq_scale(cpu_khz_ref,
240 ref_freq, freq->new);
241 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
242 tsc_khz = cpu_khz;
243 set_cyc2ns_scale(cpu_khz);
244 /*
245 * TSC based sched_clock turns
246 * to junk w/ cpufreq
247 */
248 mark_tsc_unstable("cpufreq changes");
249 }
250 }
251 }
252
253 return 0;
254}
255
256static struct notifier_block time_cpufreq_notifier_block = {
257 .notifier_call = time_cpufreq_notifier
258};
259
260static int __init cpufreq_tsc(void)
261{
262 return cpufreq_register_notifier(&time_cpufreq_notifier_block,
263 CPUFREQ_TRANSITION_NOTIFIER);
264}
265core_initcall(cpufreq_tsc);
266
267#endif
268
269/* clock source code */
270
271static unsigned long current_tsc_khz = 0;
272
273static cycle_t read_tsc(void)
274{
275 cycle_t ret;
276
277 rdtscll(ret);
278
279 return ret;
280}
281
282static struct clocksource clocksource_tsc = {
283 .name = "tsc",
284 .rating = 300,
285 .read = read_tsc,
286 .mask = CLOCKSOURCE_MASK(64),
287 .mult = 0, /* to be set */
288 .shift = 22,
289 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
290 CLOCK_SOURCE_MUST_VERIFY,
291};
292
293void mark_tsc_unstable(char *reason)
294{
295 if (!tsc_unstable) {
296 tsc_unstable = 1;
297 tsc_enabled = 0;
298 printk("Marking TSC unstable due to: %s.\n", reason);
299 /* Can be called before registration */
300 if (clocksource_tsc.mult)
301 clocksource_change_rating(&clocksource_tsc, 0);
302 else
303 clocksource_tsc.rating = 0;
304 }
305}
306EXPORT_SYMBOL_GPL(mark_tsc_unstable);
307
308static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
309{
310 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
311 d->ident);
312 tsc_unstable = 1;
313 return 0;
314}
315
316/* List of systems that have known TSC problems */
317static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
318 {
319 .callback = dmi_mark_tsc_unstable,
320 .ident = "IBM Thinkpad 380XD",
321 .matches = {
322 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
323 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
324 },
325 },
326 {}
327};
328
329/*
330 * Make an educated guess if the TSC is trustworthy and synchronized
331 * over all CPUs.
332 */
333__cpuinit int unsynchronized_tsc(void)
334{
335 if (!cpu_has_tsc || tsc_unstable)
336 return 1;
337 /*
338 * Intel systems are normally all synchronized.
339 * Exceptions must mark TSC as unstable:
340 */
341 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
342 /* assume multi socket systems are not synchronized: */
343 if (num_possible_cpus() > 1)
344 tsc_unstable = 1;
345 }
346 return tsc_unstable;
347}
348
349/*
 350 * Geode_LX - the OLPC CPU possibly has a very reliable TSC
351 */
352#ifdef CONFIG_MGEODE_LX
353/* RTSC counts during suspend */
354#define RTSC_SUSP 0x100
355
356static void __init check_geode_tsc_reliable(void)
357{
358 unsigned long val;
359
360 rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
361 if ((val & RTSC_SUSP))
362 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
363}
364#else
365static inline void check_geode_tsc_reliable(void) { }
366#endif
367
368
369void __init tsc_init(void)
370{
371 if (!cpu_has_tsc || tsc_disable)
372 goto out_no_tsc;
373
374 cpu_khz = calculate_cpu_khz();
375 tsc_khz = cpu_khz;
376
377 if (!cpu_khz)
378 goto out_no_tsc;
379
380 printk("Detected %lu.%03lu MHz processor.\n",
381 (unsigned long)cpu_khz / 1000,
382 (unsigned long)cpu_khz % 1000);
383
384 set_cyc2ns_scale(cpu_khz);
385 use_tsc_delay();
386
387 /* Check and install the TSC clocksource */
388 dmi_check_system(bad_tsc_dmi_table);
389
390 unsynchronized_tsc();
391 check_geode_tsc_reliable();
392 current_tsc_khz = tsc_khz;
393 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
394 clocksource_tsc.shift);
 395	/* lower the rating if we already know it's unstable: */
396 if (check_tsc_unstable()) {
397 clocksource_tsc.rating = 0;
398 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
399 } else
400 tsc_enabled = 1;
401
402 clocksource_register(&clocksource_tsc);
403
404 return;
405
406out_no_tsc:
407 /*
 408	 * Set the tsc_disable flag if there's no TSC support; this
409 * makes it a fast flag for the kernel to see whether it
410 * should be using the TSC.
411 */
412 tsc_disable = 1;
413}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
new file mode 100644
index 000000000000..12424629af87
--- /dev/null
+++ b/arch/x86/kernel/tsc_sync.c
@@ -0,0 +1 @@
#include "../../x86_64/kernel/tsc_sync.c"
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
new file mode 100644
index 000000000000..f2dcd1d27c0a
--- /dev/null
+++ b/arch/x86/kernel/vm86_32.c
@@ -0,0 +1,843 @@
1/*
2 * linux/kernel/vm86.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
7 * stack - Manfred Spraul <manfred@colorfullife.com>
8 *
9 * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle
10 * them correctly. Now the emulation will be in a
11 * consistent state after stackfaults - Kasper Dupont
12 * <kasperd@daimi.au.dk>
13 *
14 * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
15 * <kasperd@daimi.au.dk>
16 *
17 * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
18 * caused by Kasper Dupont's changes - Stas Sergeev
19 *
20 * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
21 * Kasper Dupont <kasperd@daimi.au.dk>
22 *
23 * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
24 * Kasper Dupont <kasperd@daimi.au.dk>
25 *
26 * 9 apr 2002 - Changed stack access macros to jump to a label
27 * instead of returning to userspace. This simplifies
  28 *                  do_int, and is needed by handle_vm86_fault. Kasper
29 * Dupont <kasperd@daimi.au.dk>
30 *
31 */
32
33#include <linux/capability.h>
34#include <linux/errno.h>
35#include <linux/interrupt.h>
36#include <linux/sched.h>
37#include <linux/kernel.h>
38#include <linux/signal.h>
39#include <linux/string.h>
40#include <linux/mm.h>
41#include <linux/smp.h>
42#include <linux/highmem.h>
43#include <linux/ptrace.h>
44#include <linux/audit.h>
45#include <linux/stddef.h>
46
47#include <asm/uaccess.h>
48#include <asm/io.h>
49#include <asm/tlbflush.h>
50#include <asm/irq.h>
51
52/*
53 * Known problems:
54 *
55 * Interrupt handling is not guaranteed:
56 * - a real x86 will disable all interrupts for one instruction
57 * after a "mov ss,xx" to make stack handling atomic even without
58 * the 'lss' instruction. We can't guarantee this in v86 mode,
59 * as the next instruction might result in a page fault or similar.
60 * - a real x86 will have interrupts disabled for one instruction
61 * past the 'sti' that enables them. We don't bother with all the
62 * details yet.
63 *
64 * Let's hope these problems do not actually matter for anything.
65 */
66
67
68#define KVM86 ((struct kernel_vm86_struct *)regs)
69#define VMPI KVM86->vm86plus
70
71
72/*
73 * 8- and 16-bit register defines..
74 */
75#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
76#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
77#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
78#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
79
80/*
81 * virtual flags (16 and 32-bit versions)
82 */
83#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
84#define VEFLAGS (current->thread.v86flags)
85
86#define set_flags(X,new,mask) \
87((X) = ((X) & ~(mask)) | ((new) & (mask)))
88
89#define SAFE_MASK (0xDD5)
90#define RETURN_MASK (0xDFF)
91
92/* convert kernel_vm86_regs to vm86_regs */
93static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
94 const struct kernel_vm86_regs *regs)
95{
96 int ret = 0;
97
98 /* kernel_vm86_regs is missing xgs, so copy everything up to
  99	   (but not including) orig_eax, and then the rest, including orig_eax. */
100 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
101 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
102 sizeof(struct kernel_vm86_regs) -
103 offsetof(struct kernel_vm86_regs, pt.orig_eax));
104
105 return ret;
106}
107
108/* convert vm86_regs to kernel_vm86_regs */
109static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
110 const struct vm86_regs __user *user,
111 unsigned extra)
112{
113 int ret = 0;
114
115 /* copy eax-xfs inclusive */
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
117 /* copy orig_eax-__gsh+extra */
118 ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
119 sizeof(struct kernel_vm86_regs) -
120 offsetof(struct kernel_vm86_regs, pt.orig_eax) +
121 extra);
122 return ret;
123}
124
125struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
126struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
127{
128 struct tss_struct *tss;
129 struct pt_regs *ret;
130 unsigned long tmp;
131
132 /*
133 * This gets called from entry.S with interrupts disabled, but
134 * from process context. Enable interrupts here, before trying
135 * to access user space.
136 */
137 local_irq_enable();
138
139 if (!current->thread.vm86_info) {
140 printk("no vm86_info: BAD\n");
141 do_exit(SIGSEGV);
142 }
143 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
145 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
146 if (tmp) {
147 printk("vm86: could not access userspace vm86_info\n");
148 do_exit(SIGSEGV);
149 }
150
151 tss = &per_cpu(init_tss, get_cpu());
152 current->thread.esp0 = current->thread.saved_esp0;
153 current->thread.sysenter_cs = __KERNEL_CS;
154 load_esp0(tss, &current->thread);
155 current->thread.saved_esp0 = 0;
156 put_cpu();
157
158 ret = KVM86->regs32;
159
160 ret->xfs = current->thread.saved_fs;
161 loadsegment(gs, current->thread.saved_gs);
162
163 return ret;
164}
165
166static void mark_screen_rdonly(struct mm_struct *mm)
167{
168 pgd_t *pgd;
169 pud_t *pud;
170 pmd_t *pmd;
171 pte_t *pte;
172 spinlock_t *ptl;
173 int i;
174
175 pgd = pgd_offset(mm, 0xA0000);
176 if (pgd_none_or_clear_bad(pgd))
177 goto out;
178 pud = pud_offset(pgd, 0xA0000);
179 if (pud_none_or_clear_bad(pud))
180 goto out;
181 pmd = pmd_offset(pud, 0xA0000);
182 if (pmd_none_or_clear_bad(pmd))
183 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
185 for (i = 0; i < 32; i++) {
186 if (pte_present(*pte))
187 set_pte(pte, pte_wrprotect(*pte));
188 pte++;
189 }
190 pte_unmap_unlock(pte, ptl);
191out:
192 flush_tlb();
193}
194
195
196
197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199
200asmlinkage int sys_vm86old(struct pt_regs regs)
201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx;
203 struct kernel_vm86_struct info; /* declare this _on top_,
 204					 * this avoids wasting stack space.
205 * This remains on the stack until we
206 * return to 32 bit user space.
207 */
208 struct task_struct *tsk;
209 int tmp, ret = -EPERM;
210
211 tsk = current;
212 if (tsk->thread.saved_esp0)
213 goto out;
214 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
215 offsetof(struct kernel_vm86_struct, vm86plus) -
216 sizeof(info.regs));
217 ret = -EFAULT;
218 if (tmp)
219 goto out;
220 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
221 info.regs32 = &regs;
222 tsk->thread.vm86_info = v86;
223 do_sys_vm86(&info, tsk);
224 ret = 0; /* we never return here */
225out:
226 return ret;
227}
228
229
230asmlinkage int sys_vm86(struct pt_regs regs)
231{
232 struct kernel_vm86_struct info; /* declare this _on top_,
 233					 * this avoids wasting stack space.
234 * This remains on the stack until we
235 * return to 32 bit user space.
236 */
237 struct task_struct *tsk;
238 int tmp, ret;
239 struct vm86plus_struct __user *v86;
240
241 tsk = current;
242 switch (regs.ebx) {
243 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx);
248 goto out;
249 case VM86_PLUS_INSTALL_CHECK:
250 /* NOTE: on old vm86 stuff this will return the error
251 from access_ok(), because the subfunction is
 252		   interpreted as an (invalid) address to vm86_struct.
253 So the installation check works.
254 */
255 ret = 0;
256 goto out;
257 }
258
259 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
260 ret = -EPERM;
261 if (tsk->thread.saved_esp0)
262 goto out;
263 v86 = (struct vm86plus_struct __user *)regs.ecx;
264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
265 offsetof(struct kernel_vm86_struct, regs32) -
266 sizeof(info.regs));
267 ret = -EFAULT;
268 if (tmp)
269 goto out;
270 info.regs32 = &regs;
271 info.vm86plus.is_vm86pus = 1;
272 tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
273 do_sys_vm86(&info, tsk);
274 ret = 0; /* we never return here */
275out:
276 return ret;
277}
278
279
280static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
281{
282 struct tss_struct *tss;
283/*
284 * make sure the vm86() system call doesn't try to do anything silly
285 */
286 info->regs.pt.xds = 0;
287 info->regs.pt.xes = 0;
288 info->regs.pt.xfs = 0;
289
290/* we are clearing gs later just before "jmp resume_userspace",
291 * because it is not saved/restored.
292 */
293
294/*
295 * The eflags register is also special: we cannot trust that the user
296 * has set it up safely, so this makes sure interrupt etc flags are
297 * inherited from protected mode.
298 */
299 VEFLAGS = info->regs.pt.eflags;
300 info->regs.pt.eflags &= SAFE_MASK;
301 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
302 info->regs.pt.eflags |= VM_MASK;
303
304 switch (info->cpu_type) {
305 case CPU_286:
306 tsk->thread.v86mask = 0;
307 break;
308 case CPU_386:
309 tsk->thread.v86mask = NT_MASK | IOPL_MASK;
310 break;
311 case CPU_486:
312 tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK;
313 break;
314 default:
315 tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK;
316 break;
317 }
318
319/*
320 * Save old state, set default return value (%eax) to 0
321 */
322 info->regs32->eax = 0;
323 tsk->thread.saved_esp0 = tsk->thread.esp0;
324 tsk->thread.saved_fs = info->regs32->xfs;
325 savesegment(gs, tsk->thread.saved_gs);
326
327 tss = &per_cpu(init_tss, get_cpu());
328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
329 if (cpu_has_sep)
330 tsk->thread.sysenter_cs = 0;
331 load_esp0(tss, &tsk->thread);
332 put_cpu();
333
334 tsk->thread.screen_bitmap = info->screen_bitmap;
335 if (info->flags & VM86_SCREEN_BITMAP)
336 mark_screen_rdonly(tsk->mm);
337
338 /*call audit_syscall_exit since we do not exit via the normal paths */
339 if (unlikely(current->audit_context))
340 audit_syscall_exit(AUDITSC_RESULT(0), 0);
341
342 __asm__ __volatile__(
343 "movl %0,%%esp\n\t"
344 "movl %1,%%ebp\n\t"
345 "mov %2, %%gs\n\t"
346 "jmp resume_userspace"
347 : /* no outputs */
348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
349 /* we never return here */
350}
351
352static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
353{
354 struct pt_regs * regs32;
355
356 regs32 = save_v86_state(regs16);
357 regs32->eax = retval;
358 __asm__ __volatile__("movl %0,%%esp\n\t"
359 "movl %1,%%ebp\n\t"
360 "jmp resume_userspace"
361 : : "r" (regs32), "r" (current_thread_info()));
362}
363
364static inline void set_IF(struct kernel_vm86_regs * regs)
365{
366 VEFLAGS |= VIF_MASK;
367 if (VEFLAGS & VIP_MASK)
368 return_to_32bit(regs, VM86_STI);
369}
370
371static inline void clear_IF(struct kernel_vm86_regs * regs)
372{
373 VEFLAGS &= ~VIF_MASK;
374}
375
376static inline void clear_TF(struct kernel_vm86_regs * regs)
377{
378 regs->pt.eflags &= ~TF_MASK;
379}
380
381static inline void clear_AC(struct kernel_vm86_regs * regs)
382{
383 regs->pt.eflags &= ~AC_MASK;
384}
385
386/* It is correct to call set_IF(regs) from the set_vflags_*
387 * functions. However someone forgot to call clear_IF(regs)
388 * in the opposite case.
389 * After the command sequence CLI PUSHF STI POPF you should
 390 * end up with interrupts disabled, but you ended up with
391 * interrupts enabled.
392 * ( I was testing my own changes, but the only bug I
393 * could find was in a function I had not changed. )
394 * [KD]
395 */
396
397static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
398{
399 set_flags(VEFLAGS, eflags, current->thread.v86mask);
400 set_flags(regs->pt.eflags, eflags, SAFE_MASK);
401 if (eflags & IF_MASK)
402 set_IF(regs);
403 else
404 clear_IF(regs);
405}
406
407static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
408{
409 set_flags(VFLAGS, flags, current->thread.v86mask);
410 set_flags(regs->pt.eflags, flags, SAFE_MASK);
411 if (flags & IF_MASK)
412 set_IF(regs);
413 else
414 clear_IF(regs);
415}
416
417static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
418{
419 unsigned long flags = regs->pt.eflags & RETURN_MASK;
420
421 if (VEFLAGS & VIF_MASK)
422 flags |= IF_MASK;
423 flags |= IOPL_MASK;
424 return flags | (VEFLAGS & current->thread.v86mask);
425}
426
427static inline int is_revectored(int nr, struct revectored_struct * bitmap)
428{
429 __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
430 :"=r" (nr)
431 :"m" (*bitmap),"r" (nr));
432 return nr;
433}
434
435#define val_byte(val, n) (((__u8 *)&val)[n])
436
437#define pushb(base, ptr, val, err_label) \
438 do { \
439 __u8 __val = val; \
440 ptr--; \
441 if (put_user(__val, base + ptr) < 0) \
442 goto err_label; \
443 } while(0)
444
445#define pushw(base, ptr, val, err_label) \
446 do { \
447 __u16 __val = val; \
448 ptr--; \
449 if (put_user(val_byte(__val, 1), base + ptr) < 0) \
450 goto err_label; \
451 ptr--; \
452 if (put_user(val_byte(__val, 0), base + ptr) < 0) \
453 goto err_label; \
454 } while(0)
455
456#define pushl(base, ptr, val, err_label) \
457 do { \
458 __u32 __val = val; \
459 ptr--; \
460 if (put_user(val_byte(__val, 3), base + ptr) < 0) \
461 goto err_label; \
462 ptr--; \
463 if (put_user(val_byte(__val, 2), base + ptr) < 0) \
464 goto err_label; \
465 ptr--; \
466 if (put_user(val_byte(__val, 1), base + ptr) < 0) \
467 goto err_label; \
468 ptr--; \
469 if (put_user(val_byte(__val, 0), base + ptr) < 0) \
470 goto err_label; \
471 } while(0)
472
473#define popb(base, ptr, err_label) \
474 ({ \
475 __u8 __res; \
476 if (get_user(__res, base + ptr) < 0) \
477 goto err_label; \
478 ptr++; \
479 __res; \
480 })
481
482#define popw(base, ptr, err_label) \
483 ({ \
484 __u16 __res; \
485 if (get_user(val_byte(__res, 0), base + ptr) < 0) \
486 goto err_label; \
487 ptr++; \
488 if (get_user(val_byte(__res, 1), base + ptr) < 0) \
489 goto err_label; \
490 ptr++; \
491 __res; \
492 })
493
494#define popl(base, ptr, err_label) \
495 ({ \
496 __u32 __res; \
497 if (get_user(val_byte(__res, 0), base + ptr) < 0) \
498 goto err_label; \
499 ptr++; \
500 if (get_user(val_byte(__res, 1), base + ptr) < 0) \
501 goto err_label; \
502 ptr++; \
503 if (get_user(val_byte(__res, 2), base + ptr) < 0) \
504 goto err_label; \
505 ptr++; \
506 if (get_user(val_byte(__res, 3), base + ptr) < 0) \
507 goto err_label; \
508 ptr++; \
509 __res; \
510 })
511
512/* There are so many possible reasons for this function to return
513 * VM86_INTx, so adding another doesn't bother me. We can expect
514 * userspace programs to be able to handle it. (Getting a problem
515 * in userspace is always better than an Oops anyway.) [KD]
516 */
517static void do_int(struct kernel_vm86_regs *regs, int i,
518 unsigned char __user * ssp, unsigned short sp)
519{
520 unsigned long __user *intr_ptr;
521 unsigned long segoffs;
522
523 if (regs->pt.xcs == BIOSSEG)
524 goto cannot_handle;
525 if (is_revectored(i, &KVM86->int_revectored))
526 goto cannot_handle;
527 if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored))
528 goto cannot_handle;
529 intr_ptr = (unsigned long __user *) (i << 2);
530 if (get_user(segoffs, intr_ptr))
531 goto cannot_handle;
532 if ((segoffs >> 16) == BIOSSEG)
533 goto cannot_handle;
534 pushw(ssp, sp, get_vflags(regs), cannot_handle);
535 pushw(ssp, sp, regs->pt.xcs, cannot_handle);
536 pushw(ssp, sp, IP(regs), cannot_handle);
537 regs->pt.xcs = segoffs >> 16;
538 SP(regs) -= 6;
539 IP(regs) = segoffs & 0xffff;
540 clear_TF(regs);
541 clear_IF(regs);
542 clear_AC(regs);
543 return;
544
545cannot_handle:
546 return_to_32bit(regs, VM86_INTx + (i << 8));
547}
548
549int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno)
550{
551 if (VMPI.is_vm86pus) {
 552		if (trapno == 3 || trapno == 1)
553 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
554 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
555 return 0;
556 }
 557	if (trapno != 1)
 558		return 1; /* let this be handled by the calling routine */
559 if (current->ptrace & PT_PTRACED) {
560 unsigned long flags;
561 spin_lock_irqsave(&current->sighand->siglock, flags);
562 sigdelset(&current->blocked, SIGTRAP);
563 recalc_sigpending();
564 spin_unlock_irqrestore(&current->sighand->siglock, flags);
565 }
566 send_sig(SIGTRAP, current, 1);
567 current->thread.trap_no = trapno;
568 current->thread.error_code = error_code;
569 return 0;
570}
571
572void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
573{
574 unsigned char opcode;
575 unsigned char __user *csp;
576 unsigned char __user *ssp;
577 unsigned short ip, sp, orig_flags;
578 int data32, pref_done;
579
580#define CHECK_IF_IN_TRAP \
581 if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
582 newflags |= TF_MASK
583#define VM86_FAULT_RETURN do { \
584 if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \
585 return_to_32bit(regs, VM86_PICRETURN); \
586 if (orig_flags & TF_MASK) \
587 handle_vm86_trap(regs, 0, 1); \
588 return; } while (0)
589
590 orig_flags = *(unsigned short *)&regs->pt.eflags;
591
592 csp = (unsigned char __user *) (regs->pt.xcs << 4);
593 ssp = (unsigned char __user *) (regs->pt.xss << 4);
594 sp = SP(regs);
595 ip = IP(regs);
596
597 data32 = 0;
598 pref_done = 0;
599 do {
600 switch (opcode = popb(csp, ip, simulate_sigsegv)) {
601 case 0x66: /* 32-bit data */ data32=1; break;
602 case 0x67: /* 32-bit address */ break;
603 case 0x2e: /* CS */ break;
604 case 0x3e: /* DS */ break;
605 case 0x26: /* ES */ break;
606 case 0x36: /* SS */ break;
607 case 0x65: /* GS */ break;
608 case 0x64: /* FS */ break;
609 case 0xf2: /* repnz */ break;
610 case 0xf3: /* rep */ break;
611 default: pref_done = 1;
612 }
613 } while (!pref_done);
614
615 switch (opcode) {
616
617 /* pushf */
618 case 0x9c:
619 if (data32) {
620 pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
621 SP(regs) -= 4;
622 } else {
623 pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
624 SP(regs) -= 2;
625 }
626 IP(regs) = ip;
627 VM86_FAULT_RETURN;
628
629 /* popf */
630 case 0x9d:
631 {
632 unsigned long newflags;
633 if (data32) {
634 newflags=popl(ssp, sp, simulate_sigsegv);
635 SP(regs) += 4;
636 } else {
637 newflags = popw(ssp, sp, simulate_sigsegv);
638 SP(regs) += 2;
639 }
640 IP(regs) = ip;
641 CHECK_IF_IN_TRAP;
642 if (data32) {
643 set_vflags_long(newflags, regs);
644 } else {
645 set_vflags_short(newflags, regs);
646 }
647 VM86_FAULT_RETURN;
648 }
649
650 /* int xx */
651 case 0xcd: {
652 int intno=popb(csp, ip, simulate_sigsegv);
653 IP(regs) = ip;
654 if (VMPI.vm86dbg_active) {
655 if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] )
656 return_to_32bit(regs, VM86_INTx + (intno << 8));
657 }
658 do_int(regs, intno, ssp, sp);
659 return;
660 }
661
662 /* iret */
663 case 0xcf:
664 {
665 unsigned long newip;
666 unsigned long newcs;
667 unsigned long newflags;
668 if (data32) {
669 newip=popl(ssp, sp, simulate_sigsegv);
670 newcs=popl(ssp, sp, simulate_sigsegv);
671 newflags=popl(ssp, sp, simulate_sigsegv);
672 SP(regs) += 12;
673 } else {
674 newip = popw(ssp, sp, simulate_sigsegv);
675 newcs = popw(ssp, sp, simulate_sigsegv);
676 newflags = popw(ssp, sp, simulate_sigsegv);
677 SP(regs) += 6;
678 }
679 IP(regs) = newip;
680 regs->pt.xcs = newcs;
681 CHECK_IF_IN_TRAP;
682 if (data32) {
683 set_vflags_long(newflags, regs);
684 } else {
685 set_vflags_short(newflags, regs);
686 }
687 VM86_FAULT_RETURN;
688 }
689
690 /* cli */
691 case 0xfa:
692 IP(regs) = ip;
693 clear_IF(regs);
694 VM86_FAULT_RETURN;
695
696 /* sti */
697 /*
698 * Damn. This is incorrect: the 'sti' instruction should actually
699 * enable interrupts after the /next/ instruction. Not good.
700 *
701 * Probably needs some horsing around with the TF flag. Aiee..
702 */
703 case 0xfb:
704 IP(regs) = ip;
705 set_IF(regs);
706 VM86_FAULT_RETURN;
707
708 default:
709 return_to_32bit(regs, VM86_UNKNOWN);
710 }
711
712 return;
713
714simulate_sigsegv:
715 /* FIXME: After a long discussion with Stas we finally
 716	 * agreed that this is wrong. Here we should
717 * really send a SIGSEGV to the user program.
718 * But how do we create the correct context? We
719 * are inside a general protection fault handler
 720	 * and have just returned from a page fault handler.
721 * The correct context for the signal handler
722 * should be a mixture of the two, but how do we
723 * get the information? [KD]
724 */
725 return_to_32bit(regs, VM86_UNKNOWN);
726}
727
728/* ---------------- vm86 special IRQ passing stuff ----------------- */
729
730#define VM86_IRQNAME "vm86irq"
731
732static struct vm86_irqs {
733 struct task_struct *tsk;
734 int sig;
735} vm86_irqs[16];
736
737static DEFINE_SPINLOCK(irqbits_lock);
738static int irqbits;
739
740#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \
741 | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \
742 | (1 << SIGUNUSED) )
743
744static irqreturn_t irq_handler(int intno, void *dev_id)
745{
746 int irq_bit;
747 unsigned long flags;
748
749 spin_lock_irqsave(&irqbits_lock, flags);
750 irq_bit = 1 << intno;
751 if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk)
752 goto out;
753 irqbits |= irq_bit;
754 if (vm86_irqs[intno].sig)
755 send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
756 /*
757 * IRQ will be re-enabled when user asks for the irq (whether
758 * polling or as a result of the signal)
759 */
760 disable_irq_nosync(intno);
761 spin_unlock_irqrestore(&irqbits_lock, flags);
762 return IRQ_HANDLED;
763
764out:
765 spin_unlock_irqrestore(&irqbits_lock, flags);
766 return IRQ_NONE;
767}
768
769static inline void free_vm86_irq(int irqnumber)
770{
771 unsigned long flags;
772
773 free_irq(irqnumber, NULL);
774 vm86_irqs[irqnumber].tsk = NULL;
775
776 spin_lock_irqsave(&irqbits_lock, flags);
777 irqbits &= ~(1 << irqnumber);
778 spin_unlock_irqrestore(&irqbits_lock, flags);
779}
780
781void release_vm86_irqs(struct task_struct *task)
782{
783 int i;
784 for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++)
785 if (vm86_irqs[i].tsk == task)
786 free_vm86_irq(i);
787}
788
789static inline int get_and_reset_irq(int irqnumber)
790{
791 int bit;
792 unsigned long flags;
793 int ret = 0;
794
795 if (invalid_vm86_irq(irqnumber)) return 0;
796 if (vm86_irqs[irqnumber].tsk != current) return 0;
797 spin_lock_irqsave(&irqbits_lock, flags);
798 bit = irqbits & (1 << irqnumber);
799 irqbits &= ~bit;
800 if (bit) {
801 enable_irq(irqnumber);
802 ret = 1;
803 }
804
805 spin_unlock_irqrestore(&irqbits_lock, flags);
806 return ret;
807}
808
809
810static int do_vm86_irq_handling(int subfunction, int irqnumber)
811{
812 int ret;
813 switch (subfunction) {
814 case VM86_GET_AND_RESET_IRQ: {
815 return get_and_reset_irq(irqnumber);
816 }
817 case VM86_GET_IRQ_BITS: {
818 return irqbits;
819 }
820 case VM86_REQUEST_IRQ: {
821 int sig = irqnumber >> 8;
822 int irq = irqnumber & 255;
823 if (!capable(CAP_SYS_ADMIN)) return -EPERM;
824 if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM;
825 if (invalid_vm86_irq(irq)) return -EPERM;
826 if (vm86_irqs[irq].tsk) return -EPERM;
827 ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
828 if (ret) return ret;
829 vm86_irqs[irq].sig = sig;
830 vm86_irqs[irq].tsk = current;
831 return irq;
832 }
833 case VM86_FREE_IRQ: {
834 if (invalid_vm86_irq(irqnumber)) return -EPERM;
835 if (!vm86_irqs[irqnumber].tsk) return 0;
836 if (vm86_irqs[irqnumber].tsk != current) return -EPERM;
837 free_vm86_irq(irqnumber);
838 return 0;
839 }
840 }
841 return -EINVAL;
842}
843
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
new file mode 100644
index 000000000000..18673e0f193b
--- /dev/null
+++ b/arch/x86/kernel/vmi_32.c
@@ -0,0 +1,981 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/bootmem.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <asm/vmi.h>
32#include <asm/io.h>
33#include <asm/fixmap.h>
34#include <asm/apicdef.h>
35#include <asm/apic.h>
36#include <asm/processor.h>
37#include <asm/timer.h>
38#include <asm/vmi_time.h>
39#include <asm/kmap_types.h>
40
41/* Convenient for calling VMI functions indirectly in the ROM */
42typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
43typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
44
45#define call_vrom_func(rom,func) \
46 (((VROMFUNC *)(rom->func))())
47
48#define call_vrom_long_func(rom,func,arg) \
49 (((VROMLONGFUNC *)(rom->func)) (arg))
50
51static struct vrom_header *vmi_rom;
52static int disable_pge;
53static int disable_pse;
54static int disable_sep;
55static int disable_tsc;
56static int disable_mtrr;
57static int disable_noidle;
58static int disable_vmi_timer;
59
60/* Cached VMI operations */
61static struct {
62 void (*cpuid)(void /* non-c */);
63 void (*_set_ldt)(u32 selector);
64 void (*set_tr)(u32 selector);
65 void (*set_kernel_stack)(u32 selector, u32 esp0);
66 void (*allocate_page)(u32, u32, u32, u32, u32);
67 void (*release_page)(u32, u32);
68 void (*set_pte)(pte_t, pte_t *, unsigned);
69 void (*update_pte)(pte_t *, unsigned);
70 void (*set_linear_mapping)(int, void *, u32, u32);
71 void (*_flush_tlb)(int);
72 void (*set_initial_ap_state)(int, int);
73 void (*halt)(void);
74 void (*set_lazy_mode)(int mode);
75} vmi_ops;
76
77/* Cached VMI operations */
78struct vmi_timer_ops vmi_timer_ops;
79
80/*
81 * VMI patching routines.
82 */
83#define MNEM_CALL 0xe8
84#define MNEM_JMP 0xe9
85#define MNEM_RET 0xc3
86
87#define IRQ_PATCH_INT_MASK 0
88#define IRQ_PATCH_DISABLE 5
89
90static inline void patch_offset(void *insnbuf,
91 unsigned long eip, unsigned long dest)
92{
93 *(unsigned long *)(insnbuf+1) = dest-eip-5;
94}
95
96static unsigned patch_internal(int call, unsigned len, void *insnbuf,
97 unsigned long eip)
98{
99 u64 reloc;
100 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
101 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
102 switch(rel->type) {
103 case VMI_RELOCATION_CALL_REL:
104 BUG_ON(len < 5);
105 *(char *)insnbuf = MNEM_CALL;
106 patch_offset(insnbuf, eip, (unsigned long)rel->eip);
107 return 5;
108
109 case VMI_RELOCATION_JUMP_REL:
110 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_JMP;
112 patch_offset(insnbuf, eip, (unsigned long)rel->eip);
113 return 5;
114
115 case VMI_RELOCATION_NOP:
116 /* obliterate the whole thing */
117 return 0;
118
119 case VMI_RELOCATION_NONE:
120 /* leave native code in place */
121 break;
122
123 default:
124 BUG();
125 }
126 return len;
127}
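The displacement written by patch_offset() is the ordinary x86 rel32 used by the 5-byte CALL (0xe8) and JMP (0xe9) opcodes: the target minus the address of the instruction that follows the patched one, hence the dest - eip - 5. A stand-alone sketch of that encoding (buffer and addresses are hypothetical):

	/* Encode "call target" at address eip into a 5-byte buffer. */
	#include <stdint.h>
	#include <string.h>

	static void encode_call_rel32(uint8_t *buf, uint32_t eip, uint32_t target)
	{
		int32_t rel = (int32_t)(target - (eip + 5));	/* relative to next insn */

		buf[0] = 0xe8;				/* MNEM_CALL */
		memcpy(buf + 1, &rel, sizeof(rel));	/* little-endian rel32 */
	}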
128
129/*
130 * Apply patch if appropriate, return length of new instruction
131 * sequence. The callee does nop padding for us.
132 */
133static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
134 unsigned long eip, unsigned len)
135{
136 switch (type) {
137 case PARAVIRT_PATCH(irq_disable):
138 return patch_internal(VMI_CALL_DisableInterrupts, len,
139 insns, eip);
140 case PARAVIRT_PATCH(irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len,
142 insns, eip);
143 case PARAVIRT_PATCH(restore_fl):
144 return patch_internal(VMI_CALL_SetInterruptMask, len,
145 insns, eip);
146 case PARAVIRT_PATCH(save_fl):
147 return patch_internal(VMI_CALL_GetInterruptMask, len,
148 insns, eip);
149 case PARAVIRT_PATCH(iret):
150 return patch_internal(VMI_CALL_IRET, len, insns, eip);
151 case PARAVIRT_PATCH(irq_enable_sysexit):
152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
153 default:
154 break;
155 }
156 return len;
157}
158
159/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
160static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
161 unsigned int *ecx, unsigned int *edx)
162{
163 int override = 0;
164 if (*eax == 1)
165 override = 1;
166 asm volatile ("call *%6"
167 : "=a" (*eax),
168 "=b" (*ebx),
169 "=c" (*ecx),
170 "=d" (*edx)
171 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
172 if (override) {
173 if (disable_pse)
174 *edx &= ~X86_FEATURE_PSE;
175 if (disable_pge)
176 *edx &= ~X86_FEATURE_PGE;
177 if (disable_sep)
178 *edx &= ~X86_FEATURE_SEP;
179 if (disable_tsc)
180 *edx &= ~X86_FEATURE_TSC;
181 if (disable_mtrr)
182 *edx &= ~X86_FEATURE_MTRR;
183 }
184}
185
186static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
187{
188 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
189 write_gdt_entry(gdt, nr, new->a, new->b);
190}
191
192static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
193{
194 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
195 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
196 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
197 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
198}
199
200static void vmi_set_ldt(const void *addr, unsigned entries)
201{
202 unsigned cpu = smp_processor_id();
203 u32 low, high;
204
205 pack_descriptor(&low, &high, (unsigned long)addr,
206 entries * sizeof(struct desc_struct) - 1,
207 DESCTYPE_LDT, 0);
208 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
209 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
210}
211
212static void vmi_set_tr(void)
213{
214 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
215}
216
217static void vmi_load_esp0(struct tss_struct *tss,
218 struct thread_struct *thread)
219{
220 tss->x86_tss.esp0 = thread->esp0;
221
222 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
223 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
224 tss->x86_tss.ss1 = thread->sysenter_cs;
225 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
226 }
227 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
228}
229
230static void vmi_flush_tlb_user(void)
231{
232 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
233}
234
235static void vmi_flush_tlb_kernel(void)
236{
237 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
238}
239
240/* Stub to do nothing at all; used for delays and unimplemented calls */
241static void vmi_nop(void)
242{
243}
244
245#ifdef CONFIG_DEBUG_PAGE_TYPE
246
247#ifdef CONFIG_X86_PAE
248#define MAX_BOOT_PTS (2048+4+1)
249#else
250#define MAX_BOOT_PTS (1024+1)
251#endif
252
253/*
254 * During boot, mem_map is not yet available in paging_init, so stash
255 * all the boot page allocations here.
256 */
257static struct {
258 u32 pfn;
259 int type;
260} boot_page_allocations[MAX_BOOT_PTS];
261static int num_boot_page_allocations;
262static int boot_allocations_applied;
263
264void vmi_apply_boot_page_allocations(void)
265{
266 int i;
267 BUG_ON(!mem_map);
268 for (i = 0; i < num_boot_page_allocations; i++) {
269 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
270		page->type = boot_page_allocations[i].type &
271				~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
273 }
274 boot_allocations_applied = 1;
275}
276
277static void record_page_type(u32 pfn, int type)
278{
279 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
280 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
281 boot_page_allocations[num_boot_page_allocations].type = type;
282 num_boot_page_allocations++;
283}
284
285static void check_zeroed_page(u32 pfn, int type, struct page *page)
286{
287 u32 *ptr;
288 int i;
289 int limit = PAGE_SIZE / sizeof(int);
290
291 if (page_address(page))
292 ptr = (u32 *)page_address(page);
293 else
294 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
295 /*
296 * When cloning the root in non-PAE mode, only the userspace
297 * pdes need to be zeroed.
298 */
299 if (type & VMI_PAGE_CLONE)
300 limit = USER_PTRS_PER_PGD;
301 for (i = 0; i < limit; i++)
302 BUG_ON(ptr[i]);
303}
304
305/*
306 * We stash the page type into struct page so we can verify the page
307 * types are used properly.
308 */
309static void vmi_set_page_type(u32 pfn, int type)
310{
311 /* PAE can have multiple roots per page - don't track */
312 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
313 return;
314
315 if (boot_allocations_applied) {
316 struct page *page = pfn_to_page(pfn);
317 if (type != VMI_PAGE_NORMAL)
318 BUG_ON(page->type);
319 else
320 BUG_ON(page->type == VMI_PAGE_NORMAL);
321 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
322 if (type & VMI_PAGE_ZEROED)
323 check_zeroed_page(pfn, type, page);
324 } else {
325 record_page_type(pfn, type);
326 }
327}
328
329static void vmi_check_page_type(u32 pfn, int type)
330{
331 /* PAE can have multiple roots per page - skip checks */
332 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
333 return;
334
335 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
336 if (boot_allocations_applied) {
337 struct page *page = pfn_to_page(pfn);
338 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
339 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
340 BUG_ON((type & page->type) == 0);
341 }
342}
343#else
344#define vmi_set_page_type(p,t) do { } while (0)
345#define vmi_check_page_type(p,t) do { } while (0)
346#endif
347
348#ifdef CONFIG_HIGHPTE
349static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
350{
351 void *va = kmap_atomic(page, type);
352
353 /*
354 * Internally, the VMI ROM must map virtual addresses to physical
355 * addresses for processing MMU updates. By the time MMU updates
356 * are issued, this information is typically already lost.
357 * Fortunately, the VMI provides a cache of mapping slots for active
358 * page tables.
359 *
360 * We use slot zero for the linear mapping of physical memory, and
361 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
362 *
363 * args: SLOT VA COUNT PFN
364 */
365 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
366 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
367
368 return va;
369}
370#endif
371
372static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
373{
374 vmi_set_page_type(pfn, VMI_PAGE_L1);
375 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
376}
377
378static void vmi_allocate_pd(u32 pfn)
379{
380 /*
381 * This call comes in very early, before mem_map is setup.
382 * It is called only for swapper_pg_dir, which already has
383 * data on it.
384 */
385 vmi_set_page_type(pfn, VMI_PAGE_L2);
386 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
387}
388
389static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
390{
391 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
392 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
393 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
394}
395
396static void vmi_release_pt(u32 pfn)
397{
398 vmi_ops.release_page(pfn, VMI_PAGE_L1);
399 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
400}
401
402static void vmi_release_pd(u32 pfn)
403{
404 vmi_ops.release_page(pfn, VMI_PAGE_L2);
405 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
406}
407
408/*
409 * Helper macros for MMU update flags. We can defer updates until a flush
410 * or page invalidation only if the update is to the current address space
411 * (otherwise, there is no flush). We must check against init_mm, since
412 * this could be a kernel update, which usually passes init_mm, although
413 * sometimes this check can be skipped if we know the particular function
414 * is only called on user mode PTEs. We could change the kernel to pass
415 * current->active_mm here, but in particular, I was unsure if changing
416 * mm/highmem.c to do this would still be correct on other architectures.
417 */
418#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
419 (!mustbeuser && (mm) == &init_mm))
420#define vmi_flags_addr(mm, addr, level, user) \
421 ((level) | (is_current_as(mm, user) ? \
422 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
423#define vmi_flags_addr_defer(mm, addr, level, user) \
424 ((level) | (is_current_as(mm, user) ? \
425 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
426
427static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
428{
429 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
430 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
431}
432
433static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
434{
435 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
436 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
437}
438
439static void vmi_set_pte(pte_t *ptep, pte_t pte)
440{
441 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
442 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
443 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
444}
445
446static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
447{
448 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
449 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
450}
451
452static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
453{
454#ifdef CONFIG_X86_PAE
455 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
456 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
457#else
458 const pte_t pte = { pmdval.pud.pgd.pgd };
459 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
460#endif
461 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
462}
463
464#ifdef CONFIG_X86_PAE
465
466static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
467{
468 /*
469 * XXX This is called from set_pmd_pte, but at both PT
470 * and PD layers so the VMI_PAGE_PT flag is wrong. But
471 * it is only called for large page mapping changes,
472	 * the Xen backend doesn't support large pages, and the
473 * ESX backend doesn't depend on the flag.
474 */
475 set_64bit((unsigned long long *)ptep,pte_val(pteval));
476 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
477}
478
479static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
480{
481 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
482 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
483}
484
485static void vmi_set_pud(pud_t *pudp, pud_t pudval)
486{
487 /* Um, eww */
488 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
489 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
490 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
491}
492
493static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
494{
495 const pte_t pte = { 0 };
496 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
497 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
498}
499
500static void vmi_pmd_clear(pmd_t *pmd)
501{
502 const pte_t pte = { 0 };
503 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
504 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
505}
506#endif
507
508#ifdef CONFIG_SMP
509static void __devinit
510vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
511 unsigned long start_esp)
512{
513 struct vmi_ap_state ap;
514
515 /* Default everything to zero. This is fine for most GPRs. */
516 memset(&ap, 0, sizeof(struct vmi_ap_state));
517
518 ap.gdtr_limit = GDT_SIZE - 1;
519 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
520
521 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
522 ap.idtr_base = (unsigned long) idt_table;
523
524 ap.ldtr = 0;
525
526 ap.cs = __KERNEL_CS;
527 ap.eip = (unsigned long) start_eip;
528 ap.ss = __KERNEL_DS;
529 ap.esp = (unsigned long) start_esp;
530
531 ap.ds = __USER_DS;
532 ap.es = __USER_DS;
533 ap.fs = __KERNEL_PERCPU;
534 ap.gs = 0;
535
536 ap.eflags = 0;
537
538#ifdef CONFIG_X86_PAE
539 /* efer should match BSP efer. */
540 if (cpu_has_nx) {
541 unsigned l, h;
542 rdmsr(MSR_EFER, l, h);
543 ap.efer = (unsigned long long) h << 32 | l;
544 }
545#endif
546
547 ap.cr3 = __pa(swapper_pg_dir);
548 /* Protected mode, paging, AM, WP, NE, MP. */
549 ap.cr0 = 0x80050023;
550 ap.cr4 = mmu_cr4_features;
551 vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
552}
553#endif
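The cr0 value 0x80050023 programmed for secondary CPUs above is simply PG | AM | WP | NE | MP | PE from the architectural CR0 bit layout; a quick stand-alone check (the bit definitions below are spelled out here only for illustration):

	#include <assert.h>

	#define CR0_PE	0x00000001	/* protection enable */
	#define CR0_MP	0x00000002	/* monitor coprocessor */
	#define CR0_NE	0x00000020	/* numeric error */
	#define CR0_WP	0x00010000	/* write protect */
	#define CR0_AM	0x00040000	/* alignment mask */
	#define CR0_PG	0x80000000	/* paging */

	int main(void)
	{
		assert((CR0_PG | CR0_AM | CR0_WP | CR0_NE | CR0_MP | CR0_PE) == 0x80050023);
		return 0;
	}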
554
555static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
556{
557 static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
558
559 if (!vmi_ops.set_lazy_mode)
560 return;
561
562 /* Modes should never nest or overlap */
563 BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE ||
564 mode == PARAVIRT_LAZY_FLUSH));
565
566 if (mode == PARAVIRT_LAZY_FLUSH) {
567 vmi_ops.set_lazy_mode(0);
568 vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
569 } else {
570 vmi_ops.set_lazy_mode(mode);
571 __get_cpu_var(lazy_mode) = mode;
572 }
573}
574
575static inline int __init check_vmi_rom(struct vrom_header *rom)
576{
577 struct pci_header *pci;
578 struct pnp_header *pnp;
579 const char *manufacturer = "UNKNOWN";
580 const char *product = "UNKNOWN";
581 const char *license = "unspecified";
582
583 if (rom->rom_signature != 0xaa55)
584 return 0;
585 if (rom->vrom_signature != VMI_SIGNATURE)
586 return 0;
587 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
588 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
589 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
590 rom->api_version_maj,
591 rom->api_version_min);
592 return 0;
593 }
594
595 /*
596 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
597 * the PCI header and device type to make sure this is really a
598 * VMI device.
599 */
600 if (!rom->pci_header_offs) {
601 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
602 return 0;
603 }
604
605 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
606 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
607 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
608		/* Allow it to run anyway, but warn */
609 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
610 }
611
612 if (rom->pnp_header_offs) {
613 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
614 if (pnp->manufacturer_offset)
615 manufacturer = (const char *)rom+pnp->manufacturer_offset;
616 if (pnp->product_offset)
617 product = (const char *)rom+pnp->product_offset;
618 }
619
620 if (rom->license_offs)
621 license = (char *)rom+rom->license_offs;
622
623 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
624 manufacturer, product,
625 rom->api_version_maj, rom->api_version_min,
626 pci->rom_version_maj, pci->rom_version_min);
627
628 /* Don't allow BSD/MIT here for now because we don't want to end up
629	   with any binary-only shim layers */
630 if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
631 printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
632 license);
633 return 0;
634 }
635
636 return 1;
637}
638
639/*
640 * Probe for the VMI option ROM
641 */
642static inline int __init probe_vmi_rom(void)
643{
644 unsigned long base;
645
646 /* VMI ROM is in option ROM area, check signature */
647 for (base = 0xC0000; base < 0xE0000; base += 2048) {
648 struct vrom_header *romstart;
649 romstart = (struct vrom_header *)isa_bus_to_virt(base);
650 if (check_vmi_rom(romstart)) {
651 vmi_rom = romstart;
652 return 1;
653 }
654 }
655 return 0;
656}
657
658/*
659 * VMI setup common to all processors
660 */
661void vmi_bringup(void)
662{
663 /* We must establish the lowmem mapping for MMU ops to work */
664 if (vmi_ops.set_linear_mapping)
665 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
666}
667
668/*
669 * Return a pointer to a VMI function or NULL if unimplemented
670 */
671static void *vmi_get_function(int vmicall)
672{
673 u64 reloc;
674 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
675 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
676 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
677 if (rel->type == VMI_RELOCATION_CALL_REL)
678 return (void *)rel->eip;
679 else
680 return NULL;
681}
682
683/*
684 * Helper macro for making the VMI paravirt-ops fill code readable.
685 * For unimplemented operations, fall back to default, unless nop
686 * is returned by the ROM.
687 */
688#define para_fill(opname, vmicall) \
689do { \
690 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
691 VMI_CALL_##vmicall); \
692 if (rel->type == VMI_RELOCATION_CALL_REL) \
693 paravirt_ops.opname = (void *)rel->eip; \
694 else if (rel->type == VMI_RELOCATION_NOP) \
695 paravirt_ops.opname = (void *)vmi_nop; \
696 else if (rel->type != VMI_RELOCATION_NONE) \
697 printk(KERN_WARNING "VMI: Unknown relocation " \
698 "type %d for " #vmicall"\n",\
699 rel->type); \
700} while (0)
701
702/*
703 * Helper macro for making the VMI paravirt-ops fill code readable.
704 * For cached operations which do not match the VMI ROM ABI and must
705 * go through a translation stub. Ignore NOPs, since it is not clear
706 * that a NOP VMI function corresponds to a NOP paravirt-op when the
707 * functions are not in 1-1 correspondence.
708 */
709#define para_wrap(opname, wrapper, cache, vmicall) \
710do { \
711 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
712 VMI_CALL_##vmicall); \
713 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
714 if (rel->type == VMI_RELOCATION_CALL_REL) { \
715 paravirt_ops.opname = wrapper; \
716 vmi_ops.cache = (void *)rel->eip; \
717 } \
718} while (0)
719
720/*
721 * Activate the VMI interface and switch into paravirtualized mode
722 */
723static inline int __init activate_vmi(void)
724{
725 short kernel_cs;
726 u64 reloc;
727 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
728
729 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
730 printk(KERN_ERR "VMI ROM failed to initialize!");
731 return 0;
732 }
733 savesegment(cs, kernel_cs);
734
735 paravirt_ops.paravirt_enabled = 1;
736 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
737
738 paravirt_ops.patch = vmi_patch;
739 paravirt_ops.name = "vmi";
740
741 /*
742 * Many of these operations are ABI compatible with VMI.
743 * This means we can fill in the paravirt-ops with direct
744 * pointers into the VMI ROM. If the calling convention for
745 * these operations changes, this code needs to be updated.
746 *
747 * Exceptions
748 * CPUID paravirt-op uses pointers, not the native ISA
749 * halt has no VMI equivalent; all VMI halts are "safe"
750 * no MSR support yet - just trap and emulate. VMI uses the
751 * same ABI as the native ISA, but Linux wants exceptions
752 * from bogus MSR read / write handled
753 * rdpmc is not yet used in Linux
754 */
755
756 /* CPUID is special, so very special it gets wrapped like a present */
757 para_wrap(cpuid, vmi_cpuid, cpuid, CPUID);
758
759 para_fill(clts, CLTS);
760 para_fill(get_debugreg, GetDR);
761 para_fill(set_debugreg, SetDR);
762 para_fill(read_cr0, GetCR0);
763 para_fill(read_cr2, GetCR2);
764 para_fill(read_cr3, GetCR3);
765 para_fill(read_cr4, GetCR4);
766 para_fill(write_cr0, SetCR0);
767 para_fill(write_cr2, SetCR2);
768 para_fill(write_cr3, SetCR3);
769 para_fill(write_cr4, SetCR4);
770 para_fill(save_fl, GetInterruptMask);
771 para_fill(restore_fl, SetInterruptMask);
772 para_fill(irq_disable, DisableInterrupts);
773 para_fill(irq_enable, EnableInterrupts);
774
775 para_fill(wbinvd, WBINVD);
776 para_fill(read_tsc, RDTSC);
777
778 /* The following we emulate with trap and emulate for now */
779 /* paravirt_ops.read_msr = vmi_rdmsr */
780 /* paravirt_ops.write_msr = vmi_wrmsr */
781 /* paravirt_ops.rdpmc = vmi_rdpmc */
782
783 /* TR interface doesn't pass TR value, wrap */
784 para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR);
785
786 /* LDT is special, too */
787 para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
788
789 para_fill(load_gdt, SetGDT);
790 para_fill(load_idt, SetIDT);
791 para_fill(store_gdt, GetGDT);
792 para_fill(store_idt, GetIDT);
793 para_fill(store_tr, GetTR);
794 paravirt_ops.load_tls = vmi_load_tls;
795 para_fill(write_ldt_entry, WriteLDTEntry);
796 para_fill(write_gdt_entry, WriteGDTEntry);
797 para_fill(write_idt_entry, WriteIDTEntry);
798 para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
799 para_fill(set_iopl_mask, SetIOPLMask);
800 para_fill(io_delay, IODelay);
801 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
802
803 /* user and kernel flush are just handled with different flags to FlushTLB */
804 para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
805 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
806 para_fill(flush_tlb_single, InvalPage);
807
808 /*
809 * Until a standard flag format can be agreed on, we need to
810 * implement these as wrappers in Linux. Get the VMI ROM
811 * function pointers for the two backend calls.
812 */
813#ifdef CONFIG_X86_PAE
814 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
815 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
816#else
817 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
818 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
819#endif
820
821 if (vmi_ops.set_pte) {
822 paravirt_ops.set_pte = vmi_set_pte;
823 paravirt_ops.set_pte_at = vmi_set_pte_at;
824 paravirt_ops.set_pmd = vmi_set_pmd;
825#ifdef CONFIG_X86_PAE
826 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
827 paravirt_ops.set_pte_present = vmi_set_pte_present;
828 paravirt_ops.set_pud = vmi_set_pud;
829 paravirt_ops.pte_clear = vmi_pte_clear;
830 paravirt_ops.pmd_clear = vmi_pmd_clear;
831#endif
832 }
833
834 if (vmi_ops.update_pte) {
835 paravirt_ops.pte_update = vmi_update_pte;
836 paravirt_ops.pte_update_defer = vmi_update_pte_defer;
837 }
838
839 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
840 if (vmi_ops.allocate_page) {
841 paravirt_ops.alloc_pt = vmi_allocate_pt;
842 paravirt_ops.alloc_pd = vmi_allocate_pd;
843 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
844 }
845
846 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
847 if (vmi_ops.release_page) {
848 paravirt_ops.release_pt = vmi_release_pt;
849 paravirt_ops.release_pd = vmi_release_pd;
850 }
851
852 /* Set linear is needed in all cases */
853 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
854#ifdef CONFIG_HIGHPTE
855 if (vmi_ops.set_linear_mapping)
856 paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
857#endif
858
859 /*
860 * These MUST always be patched. Don't support indirect jumps
861 * through these operations, as the VMI interface may use either
862 * a jump or a call to get to these operations, depending on
863 * the backend. They are performance critical anyway, so requiring
864 * a patch is not a big problem.
865 */
866 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
867 paravirt_ops.iret = (void *)0xbadbab0;
868
869#ifdef CONFIG_SMP
870 para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
871#endif
872
873#ifdef CONFIG_X86_LOCAL_APIC
874 para_fill(apic_read, APICRead);
875 para_fill(apic_write, APICWrite);
876 para_fill(apic_write_atomic, APICWrite);
877#endif
878
879 /*
880 * Check for VMI timer functionality by probing for a cycle frequency method
881 */
882 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
883 if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
884 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
885 vmi_timer_ops.get_cycle_counter =
886 vmi_get_function(VMI_CALL_GetCycleCounter);
887 vmi_timer_ops.get_wallclock =
888 vmi_get_function(VMI_CALL_GetWallclockTime);
889 vmi_timer_ops.wallclock_updated =
890 vmi_get_function(VMI_CALL_WallclockUpdated);
891 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
892 vmi_timer_ops.cancel_alarm =
893 vmi_get_function(VMI_CALL_CancelAlarm);
894 paravirt_ops.time_init = vmi_time_init;
895 paravirt_ops.get_wallclock = vmi_get_wallclock;
896 paravirt_ops.set_wallclock = vmi_set_wallclock;
897#ifdef CONFIG_X86_LOCAL_APIC
898 paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
899 paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
900#endif
901 paravirt_ops.sched_clock = vmi_sched_clock;
902 paravirt_ops.get_cpu_khz = vmi_cpu_khz;
903
904 /* We have true wallclock functions; disable CMOS clock sync */
905 no_sync_cmos_clock = 1;
906 } else {
907 disable_noidle = 1;
908 disable_vmi_timer = 1;
909 }
910
911 para_fill(safe_halt, Halt);
912
913 /*
914 * Alternative instruction rewriting doesn't happen soon enough
915	 * to convert VMI_IRET to a call instead of a jump, so we have
916	 * to do this before IRQs get re-enabled. Fortunately, it is
917 * idempotent.
918 */
919 apply_paravirt(__parainstructions, __parainstructions_end);
920
921 vmi_bringup();
922
923 return 1;
924}
925
926#undef para_fill
927
928void __init vmi_init(void)
929{
930 unsigned long flags;
931
932 if (!vmi_rom)
933 probe_vmi_rom();
934 else
935 check_vmi_rom(vmi_rom);
936
937	/* In case probing for or validating the ROM failed, bail out */
938 if (!vmi_rom)
939 return;
940
941 reserve_top_address(-vmi_rom->virtual_top);
942
943 local_irq_save(flags);
944 activate_vmi();
945
946#ifdef CONFIG_X86_IO_APIC
947 /* This is virtual hardware; timer routing is wired correctly */
948 no_timer_check = 1;
949#endif
950 local_irq_restore(flags & X86_EFLAGS_IF);
951}
952
953static int __init parse_vmi(char *arg)
954{
955 if (!arg)
956 return -EINVAL;
957
958 if (!strcmp(arg, "disable_pge")) {
959 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
960 disable_pge = 1;
961 } else if (!strcmp(arg, "disable_pse")) {
962 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
963 disable_pse = 1;
964 } else if (!strcmp(arg, "disable_sep")) {
965 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
966 disable_sep = 1;
967 } else if (!strcmp(arg, "disable_tsc")) {
968 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
969 disable_tsc = 1;
970 } else if (!strcmp(arg, "disable_mtrr")) {
971 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
972 disable_mtrr = 1;
973 } else if (!strcmp(arg, "disable_timer")) {
974 disable_vmi_timer = 1;
975 disable_noidle = 1;
976 } else if (!strcmp(arg, "disable_noidle"))
977 disable_noidle = 1;
978 return 0;
979}
980
981early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
new file mode 100644
index 000000000000..b1b5ab08b26e
--- /dev/null
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -0,0 +1,320 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/arch_hooks.h>
32#include <asm/apicdef.h>
33#include <asm/apic.h>
34#include <asm/timer.h>
35#include <asm/i8253.h>
36
37#include <irq_vectors.h>
38#include "io_ports.h"
39
40#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
41#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
42
43static DEFINE_PER_CPU(struct clock_event_device, local_events);
44
45static inline u32 vmi_counter(u32 flags)
46{
47 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
48 * cycle counter. */
49 return flags & VMI_ALARM_COUNTER_MASK;
50}
51
52/* paravirt_ops.get_wallclock = vmi_get_wallclock */
53unsigned long vmi_get_wallclock(void)
54{
55 unsigned long long wallclock;
56 wallclock = vmi_timer_ops.get_wallclock(); // nsec
57 (void)do_div(wallclock, 1000000000); // sec
58
59 return wallclock;
60}
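The wallclock the hypervisor reports is in nanoseconds; do_div() here just truncates it to whole seconds. A trivial stand-alone equivalent, shown only to make the unit conversion explicit:

	#include <stdint.h>

	static unsigned long wallclock_ns_to_secs(uint64_t ns)
	{
		return (unsigned long)(ns / 1000000000ULL);	/* nsec -> sec */
	}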
61
62/* paravirt_ops.set_wallclock = vmi_set_wallclock */
63int vmi_set_wallclock(unsigned long now)
64{
65 return 0;
66}
67
68/* paravirt_ops.sched_clock = vmi_sched_clock */
69unsigned long long vmi_sched_clock(void)
70{
71 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
72}
73
74/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
75unsigned long vmi_cpu_khz(void)
76{
77 unsigned long long khz;
78 khz = vmi_timer_ops.get_cycle_frequency();
79 (void)do_div(khz, 1000);
80 return khz;
81}
82
83static inline unsigned int vmi_get_timer_vector(void)
84{
85#ifdef CONFIG_X86_IO_APIC
86 return FIRST_DEVICE_VECTOR;
87#else
88 return FIRST_EXTERNAL_VECTOR;
89#endif
90}
91
92/** vmi clockchip */
93#ifdef CONFIG_X86_LOCAL_APIC
94static unsigned int startup_timer_irq(unsigned int irq)
95{
96 unsigned long val = apic_read(APIC_LVTT);
97 apic_write(APIC_LVTT, vmi_get_timer_vector());
98
99 return (val & APIC_SEND_PENDING);
100}
101
102static void mask_timer_irq(unsigned int irq)
103{
104 unsigned long val = apic_read(APIC_LVTT);
105 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
106}
107
108static void unmask_timer_irq(unsigned int irq)
109{
110 unsigned long val = apic_read(APIC_LVTT);
111 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
112}
113
114static void ack_timer_irq(unsigned int irq)
115{
116 ack_APIC_irq();
117}
118
119static struct irq_chip vmi_chip __read_mostly = {
120 .name = "VMI-LOCAL",
121 .startup = startup_timer_irq,
122 .mask = mask_timer_irq,
123 .unmask = unmask_timer_irq,
124 .ack = ack_timer_irq
125};
126#endif
127
128/** vmi clockevent */
129#define VMI_ALARM_WIRED_IRQ0 0x00000000
130#define VMI_ALARM_WIRED_LVTT 0x00010000
131static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
132
133static inline int vmi_get_alarm_wiring(void)
134{
135 return vmi_wiring;
136}
137
138static void vmi_timer_set_mode(enum clock_event_mode mode,
139 struct clock_event_device *evt)
140{
141 cycle_t now, cycles_per_hz;
142 BUG_ON(!irqs_disabled());
143
144 switch (mode) {
145 case CLOCK_EVT_MODE_ONESHOT:
146 case CLOCK_EVT_MODE_RESUME:
147 break;
148 case CLOCK_EVT_MODE_PERIODIC:
149 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
150 (void)do_div(cycles_per_hz, HZ);
151 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
152 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
153 break;
154 case CLOCK_EVT_MODE_UNUSED:
155 case CLOCK_EVT_MODE_SHUTDOWN:
156 switch (evt->mode) {
157 case CLOCK_EVT_MODE_ONESHOT:
158 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
159 break;
160 case CLOCK_EVT_MODE_PERIODIC:
161 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
162 break;
163 default:
164 break;
165 }
166 break;
167 default:
168 break;
169 }
170}
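In the periodic case the alarm period is the counter frequency divided by HZ. As a worked example with assumed numbers (a 1 GHz cycle counter and HZ = 250, neither taken from this patch), that gives 4,000,000 cycles per tick:

	#include <stdint.h>

	int main(void)
	{
		uint64_t cycle_freq = 1000000000ULL;	/* assumed get_cycle_frequency() */
		unsigned int hz = 250;			/* assumed kernel HZ */
		uint64_t cycles_per_tick = cycle_freq / hz;

		return cycles_per_tick == 4000000 ? 0 : 1;
	}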
171
172static int vmi_timer_next_event(unsigned long delta,
173 struct clock_event_device *evt)
174{
175	/* Unfortunately, the set_next_event interface only passes a relative
176	 * expiry, but we want an absolute expiry. It'd be better if we
177	 * were passed an absolute expiry, since a bunch of time may
178 * have been stolen between the time the delta is computed and
179 * when we set the alarm below. */
180 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
181
182 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
183 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
184 return 0;
185}
186
187static struct clock_event_device vmi_clockevent = {
188 .name = "vmi-timer",
189 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
190 .shift = 22,
191 .set_mode = vmi_timer_set_mode,
192 .set_next_event = vmi_timer_next_event,
193 .rating = 1000,
194 .irq = 0,
195};
196
197static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
198{
199 struct clock_event_device *evt = &__get_cpu_var(local_events);
200 evt->event_handler(evt);
201 return IRQ_HANDLED;
202}
203
204static struct irqaction vmi_clock_action = {
205 .name = "vmi-timer",
206 .handler = vmi_timer_interrupt,
207 .flags = IRQF_DISABLED | IRQF_NOBALANCING,
208 .mask = CPU_MASK_ALL,
209};
210
211static void __devinit vmi_time_init_clockevent(void)
212{
213 cycle_t cycles_per_msec;
214 struct clock_event_device *evt;
215
216 int cpu = smp_processor_id();
217 evt = &__get_cpu_var(local_events);
218
219 /* Use cycles_per_msec since div_sc params are 32-bits. */
220 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
221 (void)do_div(cycles_per_msec, 1000);
222
223 memcpy(evt, &vmi_clockevent, sizeof(*evt));
224 /* Must pick .shift such that .mult fits in 32-bits. Choosing
225	 * .shift to be 22 allows up to 2^(32-22) cycles per nanosecond
226	 * before .mult overflows. */
227 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
228 /* Upper bound is clockevent's use of ulong for cycle deltas. */
229 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
230 evt->min_delta_ns = clockevent_delta2ns(1, evt);
231 evt->cpumask = cpumask_of_cpu(cpu);
232
233 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
234 evt->name, evt->mult, evt->shift);
235 clockevents_register_device(evt);
236}
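The .mult/.shift pair converts a clockevent delta from nanoseconds to cycles as cycles = (ns * mult) >> shift, and div_sc(cycles_per_msec, NSEC_PER_MSEC, shift) computes mult = (cycles_per_msec << shift) / 10^6. A small sketch of that arithmetic with an assumed 1 GHz counter (so mult comes out to exactly 1 << 22):

	#include <stdint.h>

	int main(void)
	{
		uint64_t cycles_per_msec = 1000000;	/* assumed 1 GHz counter */
		unsigned int shift = 22;
		uint32_t mult = (uint32_t)((cycles_per_msec << shift) / 1000000);
		uint64_t cycles_for_1ms = (1000000ULL * mult) >> shift;	/* back to 1e6 */

		return (mult == (1u << 22) && cycles_for_1ms == 1000000) ? 0 : 1;
	}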
237
238void __init vmi_time_init(void)
239{
240	/* Disable PIT: BIOSes start PIT CH0 with an 18.2 Hz periodic tick. */
241 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
242
243 vmi_time_init_clockevent();
244 setup_irq(0, &vmi_clock_action);
245}
246
247#ifdef CONFIG_X86_LOCAL_APIC
248void __devinit vmi_time_bsp_init(void)
249{
250 /*
251 * On APIC systems, we want local timers to fire on each cpu. We do
252 * this by programming LVTT to deliver timer events to the IRQ handler
253 * for IRQ-0, since we can't re-use the APIC local timer handler
254 * without interfering with that code.
255 */
256 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
257 local_irq_disable();
258#ifdef CONFIG_X86_SMP
259 /*
260 * XXX handle_percpu_irq only defined for SMP; we need to switch over
261 * to using it, since this is a local interrupt, which each CPU must
262 * handle individually without locking out or dropping simultaneous
263 * local timers on other CPUs. We also don't want to trigger the
264 * quirk workaround code for interrupts which gets invoked from
265 * handle_percpu_irq via eoi, so we use our own IRQ chip.
266 */
267 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
268#else
269 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
270#endif
271 vmi_wiring = VMI_ALARM_WIRED_LVTT;
272 apic_write(APIC_LVTT, vmi_get_timer_vector());
273 local_irq_enable();
274 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
275}
276
277void __devinit vmi_time_ap_init(void)
278{
279 vmi_time_init_clockevent();
280 apic_write(APIC_LVTT, vmi_get_timer_vector());
281}
282#endif
283
284/** vmi clocksource */
285
286static cycle_t read_real_cycles(void)
287{
288 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
289}
290
291static struct clocksource clocksource_vmi = {
292 .name = "vmi-timer",
293 .rating = 450,
294 .read = read_real_cycles,
295 .mask = CLOCKSOURCE_MASK(64),
296 .mult = 0, /* to be set */
297 .shift = 22,
298 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
299};
300
301static int __init init_vmi_clocksource(void)
302{
303 cycle_t cycles_per_msec;
304
305 if (!vmi_timer_ops.get_cycle_frequency)
306 return 0;
307 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
308 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
309 (void)do_div(cycles_per_msec, 1000);
310
311 /* Note that clocksource.{mult, shift} converts in the opposite direction
312	 * from clockevents. */
313 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
314 clocksource_vmi.shift);
315
316 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
317 return clocksource_register(&clocksource_vmi);
318
319}
320module_init(init_vmi_clocksource);
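The clocksource runs the conversion the other way, cycles to nanoseconds: ns = (cycles * mult) >> shift, where clocksource_khz2mult(khz, shift) computes mult as roughly (10^6 << shift) / khz. A sketch of the arithmetic with the same assumed 1 GHz counter (khz = 1,000,000):

	#include <stdint.h>

	int main(void)
	{
		uint64_t khz = 1000000;			/* assumed 1 GHz counter */
		unsigned int shift = 22;
		uint32_t mult = (uint32_t)((1000000ULL << shift) / khz);	/* ~khz2mult */
		uint64_t ns = (1000ULL * mult) >> shift;	/* 1000 cycles -> 1000 ns */

		return ns == 1000 ? 0 : 1;
	}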
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..849ee611f013
--- /dev/null
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "vmlinux_32.lds.S"
3#else
4# include "vmlinux_64.lds.S"
5#endif
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
new file mode 100644
index 000000000000..7d72cce00529
--- /dev/null
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -0,0 +1,213 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value should remain constant even if the kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
9 */
10
17#define LOAD_OFFSET __PAGE_OFFSET
18
19#include <asm-generic/vmlinux.lds.h>
20#include <asm/thread_info.h>
21#include <asm/page.h>
22#include <asm/cache.h>
23#include <asm/boot.h>
24
25OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
26OUTPUT_ARCH(i386)
27ENTRY(phys_startup_32)
28jiffies = jiffies_64;
29
30PHDRS {
31 text PT_LOAD FLAGS(5); /* R_E */
32 data PT_LOAD FLAGS(7); /* RWE */
33 note PT_NOTE FLAGS(0); /* ___ */
34}
35SECTIONS
36{
37 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
38 phys_startup_32 = startup_32 - LOAD_OFFSET;
39
40 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
41 _text = .; /* Text and read-only data */
42 *(.text.head)
43 } :text = 0x9090
44
45 /* read-only */
46 .text : AT(ADDR(.text) - LOAD_OFFSET) {
47 TEXT_TEXT
48 SCHED_TEXT
49 LOCK_TEXT
50 KPROBES_TEXT
51 *(.fixup)
52 *(.gnu.warning)
53 _etext = .; /* End of text section */
54 } :text = 0x9090
55
56 . = ALIGN(16); /* Exception table */
57 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
58 __start___ex_table = .;
59 *(__ex_table)
60 __stop___ex_table = .;
61 }
62
63 NOTES :text :note
64
65 BUG_TABLE :text
66
67 . = ALIGN(4);
68 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
69 __tracedata_start = .;
70 *(.tracedata)
71 __tracedata_end = .;
72 }
73
74 RODATA
75
76 /* writeable */
77 . = ALIGN(4096);
78 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
79 DATA_DATA
80 CONSTRUCTORS
81 } :data
82
83 . = ALIGN(4096);
84 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
85 __nosave_begin = .;
86 *(.data.nosave)
87 . = ALIGN(4096);
88 __nosave_end = .;
89 }
90
91 . = ALIGN(4096);
92 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
93 *(.data.page_aligned)
94 *(.data.idt)
95 }
96
97 . = ALIGN(32);
98 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
99 *(.data.cacheline_aligned)
100 }
101
102 /* rarely changed data like cpu maps */
103 . = ALIGN(32);
104 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
105 *(.data.read_mostly)
106 _edata = .; /* End of data section */
107 }
108
109 . = ALIGN(THREAD_SIZE); /* init_task */
110 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
111 *(.data.init_task)
112 }
113
114 /* might get freed after init */
115 . = ALIGN(4096);
116 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
117 __smp_locks = .;
118 *(.smp_locks)
119 __smp_locks_end = .;
120 }
121 /* will be freed after init
122 * Following ALIGN() is required to make sure no other data falls on the
123 * same page where __smp_alt_end is pointing as that page might be freed
124 * after boot. Always make sure that ALIGN() directive is present after
125 * the section which contains __smp_alt_end.
126 */
127 . = ALIGN(4096);
128
129 /* will be freed after init */
130 . = ALIGN(4096); /* Init code and data */
131 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
132 __init_begin = .;
133 _sinittext = .;
134 *(.init.text)
135 _einittext = .;
136 }
137 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
138 . = ALIGN(16);
139 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
140 __setup_start = .;
141 *(.init.setup)
142 __setup_end = .;
143 }
144 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
145 __initcall_start = .;
146 INITCALLS
147 __initcall_end = .;
148 }
149 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
150 __con_initcall_start = .;
151 *(.con_initcall.init)
152 __con_initcall_end = .;
153 }
154 SECURITY_INIT
155 . = ALIGN(4);
156 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
157 __alt_instructions = .;
158 *(.altinstructions)
159 __alt_instructions_end = .;
160 }
161 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
162 *(.altinstr_replacement)
163 }
164 . = ALIGN(4);
165 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
166 __parainstructions = .;
167 *(.parainstructions)
168 __parainstructions_end = .;
169 }
170  /* .exit.text is discarded at runtime, not link time, to deal with references
171 from .altinstructions and .eh_frame */
172 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
173 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
174#if defined(CONFIG_BLK_DEV_INITRD)
175 . = ALIGN(4096);
176 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
177 __initramfs_start = .;
178 *(.init.ramfs)
179 __initramfs_end = .;
180 }
181#endif
182 . = ALIGN(4096);
183 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
184 __per_cpu_start = .;
185 *(.data.percpu)
186 *(.data.percpu.shared_aligned)
187 __per_cpu_end = .;
188 }
189 . = ALIGN(4096);
190 /* freed after init ends here */
191
192 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
193 __init_end = .;
194 __bss_start = .; /* BSS */
195 *(.bss.page_aligned)
196 *(.bss)
197 . = ALIGN(4);
198 __bss_stop = .;
199 _end = . ;
200 /* This is where the kernel creates the early boot page tables */
201 . = ALIGN(4096);
202 pg0 = . ;
203 }
204
205 /* Sections to be discarded */
206 /DISCARD/ : {
207 *(.exitcall.exit)
208 }
209
210 STABS_DEBUG
211
212 DWARF_DEBUG
213}
diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/kernel/vsyscall-int80_32.S
new file mode 100644
index 000000000000..103cab6aa7c0
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-int80_32.S
@@ -0,0 +1,53 @@
1/*
2 * Code for the vsyscall page. This version uses the old int $0x80 method.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10 .text
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 int $0x80
16 ret
17.LEND_vsyscall:
18 .size __kernel_vsyscall,.-.LSTART_vsyscall
19 .previous
20
21 .section .eh_frame,"a",@progbits
22.LSTARTFRAMEDLSI:
23 .long .LENDCIEDLSI-.LSTARTCIEDLSI
24.LSTARTCIEDLSI:
25 .long 0 /* CIE ID */
26 .byte 1 /* Version number */
27 .string "zR" /* NUL-terminated augmentation string */
28 .uleb128 1 /* Code alignment factor */
29 .sleb128 -4 /* Data alignment factor */
30 .byte 8 /* Return address register column */
31 .uleb128 1 /* Augmentation value length */
32 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
33 .byte 0x0c /* DW_CFA_def_cfa */
34 .uleb128 4
35 .uleb128 4
36 .byte 0x88 /* DW_CFA_offset, column 0x8 */
37 .uleb128 1
38 .align 4
39.LENDCIEDLSI:
40 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
41.LSTARTFDEDLSI:
42 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
43 .long .LSTART_vsyscall-. /* PC-relative start address */
44 .long .LEND_vsyscall-.LSTART_vsyscall
45 .uleb128 0
46 .align 4
47.LENDFDEDLSI:
48 .previous
49
50/*
51 * Get the common code for the sigreturn entry points.
52 */
53#include "vsyscall-sigreturn_32.S"
diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/kernel/vsyscall-note_32.S
new file mode 100644
index 000000000000..fcf376a37f79
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-note_32.S
@@ -0,0 +1,45 @@
1/*
2 * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
3 * Here we can supply some information useful to userland.
4 */
5
6#include <linux/version.h>
7#include <linux/elfnote.h>
8
9/* Ideally this would use UTS_NAME, but using a quoted string here
10 doesn't work. Remember to change this when changing the
11 kernel's name. */
12ELFNOTE_START(Linux, 0, "a")
13 .long LINUX_VERSION_CODE
14ELFNOTE_END
15
16#ifdef CONFIG_XEN
17/*
18 * Add a special note telling glibc's dynamic linker a fake hardware
19 * flavor that it will use to choose the search path for libraries in the
20 * same way it uses real hardware capabilities like "mmx".
21 * We supply "nosegneg" as the fake capability, to indicate that we
22 * do not like negative offsets in instructions using segment overrides,
23 * since we implement those inefficiently. This makes it possible to
24 * install libraries optimized to avoid those access patterns in someplace
25 * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/ file
26 * corresponding to the bits here is needed to make ldconfig work right.
27 * It should contain:
28 * hwcap 1 nosegneg
29 * to match the mapping of bit to name that we give here.
30 *
31 * At runtime, the fake hardware feature will be considered to be present
32 * if its bit is set in the mask word. So, we start with the mask 0, and
33 * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
34 */
35
36#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
37
38 .globl VDSO_NOTE_MASK
39ELFNOTE_START(GNU, 2, "a")
40 .long 1 /* ncaps */
41VDSO_NOTE_MASK:
42 .long 0 /* mask */
43 .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
44ELFNOTE_END
45#endif
diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/kernel/vsyscall-sigreturn_32.S
new file mode 100644
index 000000000000..a92262f41659
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-sigreturn_32.S
@@ -0,0 +1,143 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * So far this code is the same for both int80 and sysenter versions.
4 * This file is #include'd by vsyscall-*.S to define them after the
5 * vsyscall entry point. The kernel assumes that the addresses of these
6 * routines are constant for all vsyscall implementations.
7 */
8
9#include <asm/unistd.h>
10#include <asm/asm-offsets.h>
11
12
13/* XXX
14 Should these be named "_sigtramp" or something?
15*/
16
17 .text
18 .org __kernel_vsyscall+32,0x90
19 .globl __kernel_sigreturn
20 .type __kernel_sigreturn,@function
21__kernel_sigreturn:
22.LSTART_sigreturn:
23 popl %eax /* XXX does this mean it needs unwind info? */
24 movl $__NR_sigreturn, %eax
25 int $0x80
26.LEND_sigreturn:
27 .size __kernel_sigreturn,.-.LSTART_sigreturn
28
29 .balign 32
30 .globl __kernel_rt_sigreturn
31 .type __kernel_rt_sigreturn,@function
32__kernel_rt_sigreturn:
33.LSTART_rt_sigreturn:
34 movl $__NR_rt_sigreturn, %eax
35 int $0x80
36.LEND_rt_sigreturn:
37 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
38 .balign 32
39 .previous
40
41 .section .eh_frame,"a",@progbits
42.LSTARTFRAMEDLSI1:
43 .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
44.LSTARTCIEDLSI1:
45 .long 0 /* CIE ID */
46 .byte 1 /* Version number */
47 .string "zRS" /* NUL-terminated augmentation string */
48 .uleb128 1 /* Code alignment factor */
49 .sleb128 -4 /* Data alignment factor */
50 .byte 8 /* Return address register column */
51 .uleb128 1 /* Augmentation value length */
52 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
53 .byte 0 /* DW_CFA_nop */
54 .align 4
55.LENDCIEDLSI1:
56 .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
57.LSTARTFDEDLSI1:
58 .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
59 /* HACK: The dwarf2 unwind routines will subtract 1 from the
60 return address to get an address in the middle of the
61 presumed call instruction. Since we didn't get here via
62 a call, we need to include the nop before the real start
63 to make up for it. */
64 .long .LSTART_sigreturn-1-. /* PC-relative start address */
65 .long .LEND_sigreturn-.LSTART_sigreturn+1
66 .uleb128 0 /* Augmentation */
67 /* What follows are the instructions for the table generation.
68 We record the locations of each register saved. This is
69 complicated by the fact that the "CFA" is always assumed to
70 be the value of the stack pointer in the caller. This means
71 that we must define the CFA of this body of code to be the
72 saved value of the stack pointer in the sigcontext. Which
73 also means that there is no fixed relation to the other
74 saved registers, which means that we must use DW_CFA_expression
75 to compute their addresses. It also means that when we
76 adjust the stack with the popl, we have to do it all over again. */
77
78#define do_cfa_expr(offset) \
79 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
80 .uleb128 1f-0f; /* length */ \
810: .byte 0x74; /* DW_OP_breg4 */ \
82 .sleb128 offset; /* offset */ \
83 .byte 0x06; /* DW_OP_deref */ \
841:
85
86#define do_expr(regno, offset) \
87 .byte 0x10; /* DW_CFA_expression */ \
88 .uleb128 regno; /* regno */ \
89 .uleb128 1f-0f; /* length */ \
900: .byte 0x74; /* DW_OP_breg4 */ \
91 .sleb128 offset; /* offset */ \
921:
93
94 do_cfa_expr(SIGCONTEXT_esp+4)
95 do_expr(0, SIGCONTEXT_eax+4)
96 do_expr(1, SIGCONTEXT_ecx+4)
97 do_expr(2, SIGCONTEXT_edx+4)
98 do_expr(3, SIGCONTEXT_ebx+4)
99 do_expr(5, SIGCONTEXT_ebp+4)
100 do_expr(6, SIGCONTEXT_esi+4)
101 do_expr(7, SIGCONTEXT_edi+4)
102 do_expr(8, SIGCONTEXT_eip+4)
103
104 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
105
106 do_cfa_expr(SIGCONTEXT_esp)
107 do_expr(0, SIGCONTEXT_eax)
108 do_expr(1, SIGCONTEXT_ecx)
109 do_expr(2, SIGCONTEXT_edx)
110 do_expr(3, SIGCONTEXT_ebx)
111 do_expr(5, SIGCONTEXT_ebp)
112 do_expr(6, SIGCONTEXT_esi)
113 do_expr(7, SIGCONTEXT_edi)
114 do_expr(8, SIGCONTEXT_eip)
115
116 .align 4
117.LENDFDEDLSI1:
118
119 .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
120.LSTARTFDEDLSI2:
121 .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
122 /* HACK: See above wrt unwind library assumptions. */
123 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
124 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
125 .uleb128 0 /* Augmentation */
126 /* What follows are the instructions for the table generation.
127 We record the locations of each register saved. This is
128 slightly less complicated than the above, since we don't
129 modify the stack pointer in the process. */
130
131 do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
132 do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
133 do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
134 do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
135 do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
136 do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
137 do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
138 do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
139 do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
140
141 .align 4
142.LENDFDEDLSI2:
143 .previous
diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/kernel/vsyscall-sysenter_32.S
new file mode 100644
index 000000000000..ed879bf42995
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-sysenter_32.S
@@ -0,0 +1,122 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10/*
11 * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
12 * %ecx itself for arg2. The pushing is because the sysexit instruction
13 * (found in entry.S) requires that we clobber %ecx with the desired %esp.
14 * User code might expect that %ecx is unclobbered though, as it would be
15 * for returning via the iret instruction, so we must push and pop.
16 *
17 * The caller puts arg3 in %edx, which the sysexit instruction requires
18 * for %eip. Thus, exactly as for arg2, we must push and pop.
19 *
20 * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
21 * instruction clobbers %esp, the user's %esp won't even survive entry
22 * into the kernel. We store %esp in %ebp. Code in entry.S must fetch
23 * arg6 from the stack.
24 *
25 * You cannot use this vsyscall for the clone() syscall because the
26 * three dwords on the parent stack do not get copied to the child.
27 */
28 .text
29 .globl __kernel_vsyscall
30 .type __kernel_vsyscall,@function
31__kernel_vsyscall:
32.LSTART_vsyscall:
33 push %ecx
34.Lpush_ecx:
35 push %edx
36.Lpush_edx:
37 push %ebp
38.Lenter_kernel:
39 movl %esp,%ebp
40 sysenter
41
42 /* 7: align return point with nop's to make disassembly easier */
43 .space 7,0x90
44
45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
46 jmp .Lenter_kernel
47 /* 16: System call normal return point is here! */
48 .globl SYSENTER_RETURN /* Symbol used by sysenter.c */
49SYSENTER_RETURN:
50 pop %ebp
51.Lpop_ebp:
52 pop %edx
53.Lpop_edx:
54 pop %ecx
55.Lpop_ecx:
56 ret
57.LEND_vsyscall:
58 .size __kernel_vsyscall,.-.LSTART_vsyscall
59 .previous
60
61 .section .eh_frame,"a",@progbits
62.LSTARTFRAMEDLSI:
63 .long .LENDCIEDLSI-.LSTARTCIEDLSI
64.LSTARTCIEDLSI:
65 .long 0 /* CIE ID */
66 .byte 1 /* Version number */
67 .string "zR" /* NUL-terminated augmentation string */
68 .uleb128 1 /* Code alignment factor */
69 .sleb128 -4 /* Data alignment factor */
70 .byte 8 /* Return address register column */
71 .uleb128 1 /* Augmentation value length */
72 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
73 .byte 0x0c /* DW_CFA_def_cfa */
74 .uleb128 4
75 .uleb128 4
76 .byte 0x88 /* DW_CFA_offset, column 0x8 */
77 .uleb128 1
78 .align 4
79.LENDCIEDLSI:
80 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
81.LSTARTFDEDLSI:
82 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
83 .long .LSTART_vsyscall-. /* PC-relative start address */
84 .long .LEND_vsyscall-.LSTART_vsyscall
85 .uleb128 0
86 /* What follows are the instructions for the table generation.
87 We have to record all changes of the stack pointer. */
88 .byte 0x04 /* DW_CFA_advance_loc4 */
89 .long .Lpush_ecx-.LSTART_vsyscall
90 .byte 0x0e /* DW_CFA_def_cfa_offset */
91 .byte 0x08 /* RA at offset 8 now */
92 .byte 0x04 /* DW_CFA_advance_loc4 */
93 .long .Lpush_edx-.Lpush_ecx
94 .byte 0x0e /* DW_CFA_def_cfa_offset */
95 .byte 0x0c /* RA at offset 12 now */
96 .byte 0x04 /* DW_CFA_advance_loc4 */
97 .long .Lenter_kernel-.Lpush_edx
98 .byte 0x0e /* DW_CFA_def_cfa_offset */
99 .byte 0x10 /* RA at offset 16 now */
100 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
101 /* Finally the epilogue. */
102 .byte 0x04 /* DW_CFA_advance_loc4 */
103 .long .Lpop_ebp-.Lenter_kernel
104 .byte 0x0e /* DW_CFA_def_cfa_offset */
105 .byte 0x0c /* RA at offset 12 now */
106 .byte 0xc5 /* DW_CFA_restore %ebp */
107 .byte 0x04 /* DW_CFA_advance_loc4 */
108 .long .Lpop_edx-.Lpop_ebp
109 .byte 0x0e /* DW_CFA_def_cfa_offset */
110 .byte 0x08 /* RA at offset 8 now */
111 .byte 0x04 /* DW_CFA_advance_loc4 */
112 .long .Lpop_ecx-.Lpop_edx
113 .byte 0x0e /* DW_CFA_def_cfa_offset */
114 .byte 0x04 /* RA at offset 4 now */
115 .align 4
116.LENDFDEDLSI:
117 .previous
118
119/*
120 * Get the common code for the sigreturn entry points.
121 */
122#include "vsyscall-sigreturn_32.S"
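
The calling-convention comment at the top of this file describes the ABI user space sees: the syscall number goes in %eax, arguments in %ebx/%ecx/%edx/%esi/%edi/%ebp exactly as for int $0x80, and %ecx/%edx survive the call because of the push/pop pairs in __kernel_vsyscall. A minimal user-space sketch of using that entry point follows; it is not part of this patch, assumes glibc's getauxval() and a 32-bit build (-m32), and uses getpid (syscall 20 on 32-bit x86) because it takes no arguments.

/* Sketch only: call __kernel_vsyscall via the AT_SYSINFO auxv entry. */
#include <elf.h>
#include <sys/auxv.h>
#include <stdio.h>

int main(void)
{
	unsigned long entry = getauxval(AT_SYSINFO);	/* __kernel_vsyscall */
	long ret = 20;					/* __NR_getpid on i386 */

	if (!entry)
		return 1;	/* kernel did not advertise a vsyscall entry */

	/* %eax carries the syscall number in and the result out; %ecx and
	   %edx are preserved by the stub's push/pop, as described above. */
	asm volatile("call *%1" : "+a" (ret) : "r" (entry) : "memory");
	printf("getpid() via __kernel_vsyscall: %ld\n", ret);
	return 0;
}
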
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S
new file mode 100644
index 000000000000..a5ab3dc4fd25
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_32.S
@@ -0,0 +1,15 @@
1#include <linux/init.h>
2
3__INITDATA
4
5 .globl vsyscall_int80_start, vsyscall_int80_end
6vsyscall_int80_start:
7 .incbin "arch/x86/kernel/vsyscall-int80_32.so"
8vsyscall_int80_end:
9
10 .globl vsyscall_sysenter_start, vsyscall_sysenter_end
11vsyscall_sysenter_start:
12 .incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
13vsyscall_sysenter_end:
14
15__FINIT
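
vsyscall_32.S does nothing more than embed the two prelinked DSO images in the kernel's init data with .incbin and export start/end symbols for each. At boot, setup code elsewhere copies one of the two blobs into the page mapped at the fixed vsyscall address, picking the sysenter flavour only when the CPU advertises SEP. The C sketch below shows the idea; install_vsyscall_page() is a made-up name, and the body is reconstructed from the general shape of the sysenter setup code rather than copied from this patch, so treat the details as illustrative.

/* Sketch: choose and install one of the embedded vsyscall images. */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/string.h>
#include <asm/cpufeature.h>

extern const char vsyscall_int80_start, vsyscall_int80_end;
extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;

static int __init install_vsyscall_page(void)
{
	void *page = (void *)get_zeroed_page(GFP_ATOMIC);

	if (!page)
		return -ENOMEM;

	if (boot_cpu_has(X86_FEATURE_SEP))
		memcpy(page, &vsyscall_sysenter_start,
		       &vsyscall_sysenter_end - &vsyscall_sysenter_start);
	else
		memcpy(page, &vsyscall_int80_start,
		       &vsyscall_int80_end - &vsyscall_int80_start);

	/* ...the page is then mapped at the fixed vsyscall address... */
	return 0;
}
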
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S
new file mode 100644
index 000000000000..4a8b0ed9b8fb
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_32.lds.S
@@ -0,0 +1,67 @@
1/*
2 * Linker script for the vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, with a single read-only
4 * segment that fits in one page. This script controls its layout.
5 */
6#include <asm/asm-offsets.h>
7
8SECTIONS
9{
10 . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
11
12 .hash : { *(.hash) } :text
13 .gnu.hash : { *(.gnu.hash) }
14 .dynsym : { *(.dynsym) }
15 .dynstr : { *(.dynstr) }
16 .gnu.version : { *(.gnu.version) }
17 .gnu.version_d : { *(.gnu.version_d) }
18 .gnu.version_r : { *(.gnu.version_r) }
19
20 /* This linker script is used both with -r and with -shared.
21 For the layouts to match, we need to skip more than enough
22 space for the dynamic symbol table et al. If this amount
23 is insufficient, ld -shared will barf. Just increase it here. */
24 . = VDSO_PRELINK_asm + 0x400;
25
26 .text : { *(.text) } :text =0x90909090
27 .note : { *(.note.*) } :text :note
28 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
29 .eh_frame : { KEEP (*(.eh_frame)) } :text
30 .dynamic : { *(.dynamic) } :text :dynamic
31 .useless : {
32 *(.got.plt) *(.got)
33 *(.data .data.* .gnu.linkonce.d.*)
34 *(.dynbss)
35 *(.bss .bss.* .gnu.linkonce.b.*)
36 } :text
37}
38
39/*
40 * We must supply the ELF program headers explicitly to get just one
41 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
42 */
43PHDRS
44{
45 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
46 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
47 note PT_NOTE FLAGS(4); /* PF_R */
48 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
49}
50
51/*
52 * This controls what symbols we export from the DSO.
53 */
54VERSION
55{
56 LINUX_2.5 {
57 global:
58 __kernel_vsyscall;
59 __kernel_sigreturn;
60 __kernel_rt_sigreturn;
61
62 local: *;
63 };
64}
65
66/* The ELF entry point can be used to set the AT_SYSINFO value. */
67ENTRY(__kernel_vsyscall);
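
Two things in this script matter to the kernel side: the PHDRS block forces everything into a single read-only, executable PT_LOAD segment so the whole DSO fits in the one vsyscall page, and ENTRY(__kernel_vsyscall) records the entry point in the ELF header, which is the value that ends up in the AT_SYSINFO auxv entry for new processes. The small sketch below uses made-up names (vdso_entry, image) and only illustrates how that value can be pulled back out of the prelinked image's ELF header.

/* Sketch: read the DSO's entry point (== __kernel_vsyscall) from its header. */
#include <elf.h>

static unsigned long vdso_entry(const void *image)
{
	const Elf32_Ehdr *ehdr = image;

	return ehdr->e_entry;	/* set by ENTRY(__kernel_vsyscall) above */
}
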